diff --git a/extern/re2/kokoro/bazel.sh b/extern/re2/.github/bazel.sh old mode 100644 new mode 100755 similarity index 97% rename from extern/re2/kokoro/bazel.sh rename to extern/re2/.github/bazel.sh index 95aee2e55e..fbe92e65d1 --- a/extern/re2/kokoro/bazel.sh +++ b/extern/re2/.github/bazel.sh @@ -1,8 +1,6 @@ #!/bin/bash set -eux -cd git/re2 - bazel clean bazel build --compilation_mode=dbg -- //:all bazel test --compilation_mode=dbg --test_output=errors -- //:all \ diff --git a/extern/re2/.github/cmake.sh b/extern/re2/.github/cmake.sh new file mode 100755 index 0000000000..7cc3db4860 --- /dev/null +++ b/extern/re2/.github/cmake.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -eux + +cmake -D CMAKE_BUILD_TYPE=Debug +cmake --build . --config Debug --clean-first +ctest -C Debug --output-on-failure -E 'dfa|exhaustive|random' + +cmake -D CMAKE_BUILD_TYPE=Release +cmake --build . --config Release --clean-first +ctest -C Release --output-on-failure -E 'dfa|exhaustive|random' + +exit 0 diff --git a/extern/re2/.github/workflows/ci-bazel.yml b/extern/re2/.github/workflows/ci-bazel.yml new file mode 100644 index 0000000000..534da03cb4 --- /dev/null +++ b/extern/re2/.github/workflows/ci-bazel.yml @@ -0,0 +1,17 @@ +name: CI (Bazel) +on: + push: + branches: [master] +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest, ubuntu-latest, windows-latest] + env: + BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v2 + - run: .github/bazel.sh + shell: bash diff --git a/extern/re2/.github/workflows/ci-cmake.yml b/extern/re2/.github/workflows/ci-cmake.yml new file mode 100644 index 0000000000..568312e4ba --- /dev/null +++ b/extern/re2/.github/workflows/ci-cmake.yml @@ -0,0 +1,15 @@ +name: CI (CMake) +on: + push: + branches: [master] +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest, ubuntu-latest, windows-latest] + steps: + - uses: actions/checkout@v2 + - run: .github/cmake.sh + shell: bash diff --git a/extern/re2/.github/workflows/ci.yml b/extern/re2/.github/workflows/ci.yml new file mode 100644 index 0000000000..bfc4c97c5b --- /dev/null +++ b/extern/re2/.github/workflows/ci.yml @@ -0,0 +1,51 @@ +name: CI +on: + push: + branches: [master] +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest, ubuntu-latest] + env: + CC: clang + CXX: clang++ + steps: + - uses: actions/checkout@v2 + - run: make && make test + shell: bash + build-clang: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + tag: [9, 10, 11] + env: + CC: clang-${{ matrix.tag }} + CXX: clang++-${{ matrix.tag }} + steps: + - uses: actions/checkout@v2 + - name: Install Clang ${{ matrix.tag }} + run: | + wget https://apt.llvm.org/llvm.sh + chmod +x ./llvm.sh + sudo ./llvm.sh ${{ matrix.tag }} + shell: bash + - run: make && make test + shell: bash + build-gcc: + runs-on: ubuntu-latest + container: gcc:${{ matrix.tag }} + strategy: + fail-fast: false + matrix: + tag: [4, 5, 6, 7, 8, 9, 10] + env: + CC: gcc + CXX: g++ + steps: + - uses: actions/checkout@v2 + - run: make && make test + shell: bash diff --git a/extern/re2/.travis.yml b/extern/re2/.travis.yml deleted file mode 100644 index f89c96d825..0000000000 --- a/extern/re2/.travis.yml +++ /dev/null @@ -1,179 +0,0 @@ -language: cpp -sudo: false -dist: trusty -script: - - make - - make test -matrix: - include: - - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-4.8 - env: - - MATRIX_EVAL="CC=gcc-4.8 CXX=g++-4.8" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-4.9 - env: - - MATRIX_EVAL="CC=gcc-4.9 CXX=g++-4.9" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-5 - env: - - MATRIX_EVAL="CC=gcc-5 CXX=g++-5" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-6 - env: - - MATRIX_EVAL="CC=gcc-6 CXX=g++-6" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-7 - env: - - MATRIX_EVAL="CC=gcc-7 CXX=g++-7" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-8 - env: - - MATRIX_EVAL="CC=gcc-8 CXX=g++-8" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-9 - env: - - MATRIX_EVAL="CC=gcc-9 CXX=g++-9" - - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - - llvm-toolchain-precise-3.5 - packages: - - clang-3.5 - env: - - MATRIX_EVAL="CC=clang-3.5 CXX=clang++-3.5" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - - llvm-toolchain-precise-3.6 - packages: - - clang-3.6 - env: - - MATRIX_EVAL="CC=clang-3.6 CXX=clang++-3.6" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - - llvm-toolchain-precise-3.7 - packages: - - clang-3.7 - env: - - MATRIX_EVAL="CC=clang-3.7 CXX=clang++-3.7" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - - llvm-toolchain-precise-3.8 - packages: - - clang-3.8 - env: - - MATRIX_EVAL="CC=clang-3.8 CXX=clang++-3.8" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - - llvm-toolchain-precise-3.9 - packages: - - clang-3.9 - env: - - MATRIX_EVAL="CC=clang-3.9 CXX=clang++-3.9" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - - llvm-toolchain-trusty-4.0 - packages: - - clang-4.0 - env: - - MATRIX_EVAL="CC=clang-4.0 CXX=clang++-4.0" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - - llvm-toolchain-trusty-5.0 - packages: - - clang-5.0 - env: - - MATRIX_EVAL="CC=clang-5.0 CXX=clang++-5.0" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - - sourceline: 'deb https://apt.llvm.org/trusty/ llvm-toolchain-trusty-6.0 main' - key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key' - packages: - - clang-6.0 - env: - - MATRIX_EVAL="CC=clang-6.0 CXX=clang++-6.0" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - - sourceline: 'deb https://apt.llvm.org/trusty/ llvm-toolchain-trusty-7 main' - key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key' - packages: - - clang-7 - env: - - MATRIX_EVAL="CC=clang-7 CXX=clang++-7" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - - sourceline: 'deb https://apt.llvm.org/trusty/ llvm-toolchain-trusty-8 main' - key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key' - packages: - - clang-8 - env: - - MATRIX_EVAL="CC=clang-8 CXX=clang++-8" - -before_install: - - eval "${MATRIX_EVAL}" diff --git a/extern/re2/BUILD b/extern/re2/BUILD index 30ce32094f..3dc27d5c73 100644 --- a/extern/re2/BUILD +++ b/extern/re2/BUILD @@ -9,19 +9,21 @@ licenses(["notice"]) exports_files(["LICENSE"]) config_setting( - name = "darwin", + name = "macos", values = {"cpu": "darwin"}, ) +config_setting( + name = "wasm", + values = {"cpu": "wasm32"}, +) + config_setting( name = "windows", values = {"cpu": "x64_windows"}, ) -config_setting( - name = "windows_msvc", - values = {"cpu": "x64_windows_msvc"}, -) +load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") cc_library( name = "re2", @@ -36,6 +38,7 @@ cc_library( "re2/onepass.cc", "re2/parse.cc", "re2/perl_groups.cc", + "re2/pod_array.h", "re2/prefilter.cc", "re2/prefilter.h", "re2/prefilter_tree.cc", @@ -47,6 +50,8 @@ cc_library( "re2/regexp.h", "re2/set.cc", "re2/simplify.cc", + "re2/sparse_array.h", + "re2/sparse_set.h", "re2/stringpiece.cc", "re2/tostring.cc", "re2/unicode_casefold.cc", @@ -54,14 +59,10 @@ cc_library( "re2/unicode_groups.cc", "re2/unicode_groups.h", "re2/walker-inl.h", - "util/flags.h", "util/logging.h", "util/mix.h", "util/mutex.h", - "util/pod_array.h", "util/rune.cc", - "util/sparse_array.h", - "util/sparse_set.h", "util/strutil.cc", "util/strutil.h", "util/utf.h", @@ -74,17 +75,17 @@ cc_library( "re2/stringpiece.h", ], copts = select({ + ":wasm": [], ":windows": [], - ":windows_msvc": [], "//conditions:default": ["-pthread"], }), linkopts = select({ - # Darwin doesn't need `-pthread' when linking and it appears that + # macOS doesn't need `-pthread' when linking and it appears that # older versions of Clang will warn about the unused command line # argument, so just don't pass it. - ":darwin": [], + ":macos": [], + ":wasm": [], ":windows": [], - ":windows_msvc": [], "//conditions:default": ["-pthread"], }), visibility = ["//visibility:public"], @@ -109,6 +110,8 @@ cc_library( "re2/testing/string_generator.h", "re2/testing/tester.h", "util/benchmark.h", + "util/flags.h", + "util/malloc_counter.h", "util/pcre.h", "util/test.h", ], @@ -122,106 +125,144 @@ cc_library( deps = [":testing"], ) -load(":re2_test.bzl", "re2_test") - -re2_test( - "charclass_test", +cc_test( + name = "charclass_test", size = "small", + srcs = ["re2/testing/charclass_test.cc"], + deps = [":test"], ) -re2_test( - "compile_test", +cc_test( + name = "compile_test", size = "small", + srcs = ["re2/testing/compile_test.cc"], + deps = [":test"], ) -re2_test( - "filtered_re2_test", +cc_test( + name = "filtered_re2_test", size = "small", + srcs = ["re2/testing/filtered_re2_test.cc"], + deps = [":test"], ) -re2_test( - "mimics_pcre_test", +cc_test( + name = "mimics_pcre_test", size = "small", + srcs = ["re2/testing/mimics_pcre_test.cc"], + deps = [":test"], ) -re2_test( - "parse_test", +cc_test( + name = "parse_test", size = "small", + srcs = ["re2/testing/parse_test.cc"], + deps = [":test"], ) -re2_test( - "possible_match_test", +cc_test( + name = "possible_match_test", size = "small", + srcs = ["re2/testing/possible_match_test.cc"], + deps = [":test"], ) -re2_test( - "re2_arg_test", +cc_test( + name = "re2_arg_test", size = "small", + srcs = ["re2/testing/re2_arg_test.cc"], + deps = [":test"], ) -re2_test( - "re2_test", +cc_test( + name = "re2_test", size = "small", + srcs = ["re2/testing/re2_test.cc"], + deps = [":test"], ) -re2_test( - "regexp_test", +cc_test( + name = "regexp_test", size = "small", + srcs = ["re2/testing/regexp_test.cc"], + deps = [":test"], ) -re2_test( - "required_prefix_test", +cc_test( + name = "required_prefix_test", size = "small", + srcs = ["re2/testing/required_prefix_test.cc"], + deps = [":test"], ) -re2_test( - "search_test", +cc_test( + name = "search_test", size = "small", + srcs = ["re2/testing/search_test.cc"], + deps = [":test"], ) -re2_test( - "set_test", +cc_test( + name = "set_test", size = "small", + srcs = ["re2/testing/set_test.cc"], + deps = [":test"], ) -re2_test( - "simplify_test", +cc_test( + name = "simplify_test", size = "small", + srcs = ["re2/testing/simplify_test.cc"], + deps = [":test"], ) -re2_test( - "string_generator_test", +cc_test( + name = "string_generator_test", size = "small", + srcs = ["re2/testing/string_generator_test.cc"], + deps = [":test"], ) -re2_test( - "dfa_test", +cc_test( + name = "dfa_test", size = "large", + srcs = ["re2/testing/dfa_test.cc"], + deps = [":test"], ) -re2_test( - "exhaustive1_test", +cc_test( + name = "exhaustive1_test", size = "large", + srcs = ["re2/testing/exhaustive1_test.cc"], + deps = [":test"], ) -re2_test( - "exhaustive2_test", +cc_test( + name = "exhaustive2_test", size = "large", + srcs = ["re2/testing/exhaustive2_test.cc"], + deps = [":test"], ) -re2_test( - "exhaustive3_test", +cc_test( + name = "exhaustive3_test", size = "large", + srcs = ["re2/testing/exhaustive3_test.cc"], + deps = [":test"], ) -re2_test( - "exhaustive_test", +cc_test( + name = "exhaustive_test", size = "large", + srcs = ["re2/testing/exhaustive_test.cc"], + deps = [":test"], ) -re2_test( - "random_test", +cc_test( + name = "random_test", size = "large", + srcs = ["re2/testing/random_test.cc"], + deps = [":test"], ) cc_library( diff --git a/extern/re2/CMakeLists.txt b/extern/re2/CMakeLists.txt index 4c9240bfd5..f23bfa6012 100644 --- a/extern/re2/CMakeLists.txt +++ b/extern/re2/CMakeLists.txt @@ -2,8 +2,8 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. -# Old enough to support Ubuntu Trusty. -cmake_minimum_required(VERSION 2.8.12) +# Old enough to support Ubuntu Xenial. +cmake_minimum_required(VERSION 3.5.1) if(POLICY CMP0048) cmake_policy(SET CMP0048 NEW) @@ -11,6 +11,12 @@ endif() project(RE2 CXX) include(CTest) +include(GNUInstallDirs) + +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_STANDARD_REQUIRED ON) +endif() option(BUILD_SHARED_LIBS "build shared libraries" OFF) option(USEPCRE "use PCRE in tests and benchmarks" OFF) @@ -19,6 +25,10 @@ option(USEPCRE "use PCRE in tests and benchmarks" OFF) # so we provide an option similar to BUILD_TESTING, but just for RE2. option(RE2_BUILD_TESTING "enable testing for RE2" ON) +# ABI version +# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html +set(SONAME 9) + set(EXTRA_TARGET_LINK_LIBRARIES) if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") @@ -27,7 +37,6 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") endif() if(BUILD_SHARED_LIBS) # See http://www.kitware.com/blog/home/post/939 for details. - cmake_minimum_required(VERSION 3.4) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() # CMake defaults to /W3, but some users like /W4 (or /Wall) and /WX, @@ -38,11 +47,6 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") add_compile_options(/utf-8) # allow multi-processor compilation add_compile_options(/MP) -elseif(CYGWIN OR MINGW) - # See https://stackoverflow.com/questions/38139631 for details. - add_compile_options(-std=gnu++11) -elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") - add_compile_options(-std=c++11) endif() if(WIN32) @@ -58,8 +62,6 @@ if(USEPCRE) list(APPEND EXTRA_TARGET_LINK_LIBRARIES pcre) endif() -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - set(RE2_SOURCES re2/bitstate.cc re2/compile.cc @@ -86,6 +88,8 @@ set(RE2_SOURCES ) add_library(re2 ${RE2_SOURCES}) +target_include_directories(re2 PUBLIC $) +set_target_properties(re2 PROPERTIES SOVERSION ${SONAME} VERSION ${SONAME}.0.0) add_library(re2::re2 ALIAS re2) if(RE2_BUILD_TESTING) @@ -101,6 +105,7 @@ if(RE2_BUILD_TESTING) ) add_library(testing STATIC ${TESTING_SOURCES}) + target_link_libraries(testing PUBLIC re2) set(TEST_TARGETS charclass_test @@ -132,13 +137,13 @@ if(RE2_BUILD_TESTING) foreach(target ${TEST_TARGETS}) add_executable(${target} re2/testing/${target}.cc util/test.cc) - target_link_libraries(${target} testing re2 ${EXTRA_TARGET_LINK_LIBRARIES}) + target_link_libraries(${target} testing ${EXTRA_TARGET_LINK_LIBRARIES}) add_test(NAME ${target} COMMAND ${target}) endforeach(target) foreach(target ${BENCHMARK_TARGETS}) add_executable(${target} re2/testing/${target}.cc util/benchmark.cc) - target_link_libraries(${target} testing re2 ${EXTRA_TARGET_LINK_LIBRARIES}) + target_link_libraries(${target} testing ${EXTRA_TARGET_LINK_LIBRARIES}) endforeach(target) endif() @@ -149,6 +154,12 @@ set(RE2_HEADERS re2/stringpiece.h ) -install(FILES ${RE2_HEADERS} DESTINATION include/re2) -install(TARGETS re2 EXPORT re2Config ARCHIVE DESTINATION lib LIBRARY DESTINATION lib RUNTIME DESTINATION bin INCLUDES DESTINATION include) -install(EXPORT re2Config DESTINATION lib/cmake/re2 NAMESPACE re2::) +install(FILES ${RE2_HEADERS} + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/re2) +install(TARGETS re2 EXPORT re2Config + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +install(EXPORT re2Config + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2 NAMESPACE re2::) diff --git a/extern/re2/Makefile b/extern/re2/Makefile index f001f0640c..ba5e4f6e52 100644 --- a/extern/re2/Makefile +++ b/extern/re2/Makefile @@ -44,7 +44,7 @@ endif # ABI version # http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html -SONAME=0 +SONAME=9 # To rebuild the Tables generated by Perl and Python scripts (requires Internet # access for Unicode data), uncomment the following line: @@ -55,7 +55,7 @@ ifeq ($(shell uname),Darwin) SOEXT=dylib SOEXTVER=$(SONAME).$(SOEXT) SOEXTVER00=$(SONAME).0.0.$(SOEXT) -MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(RE2_LDFLAGS) $(LDFLAGS) +MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-compatibility_version,$(SONAME),-current_version,$(SONAME).0.0,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(RE2_LDFLAGS) $(LDFLAGS) else ifeq ($(shell uname),SunOS) SOEXT=so SOEXTVER=$(SOEXT).$(SONAME) @@ -68,6 +68,7 @@ SOEXTVER00=$(SOEXT).$(SONAME).0.0 MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(RE2_LDFLAGS) $(LDFLAGS) endif +.PHONY: all all: obj/libre2.a obj/so/libre2.$(SOEXT) INSTALL_HFILES=\ @@ -80,24 +81,25 @@ HFILES=\ util/benchmark.h\ util/flags.h\ util/logging.h\ + util/malloc_counter.h\ util/mix.h\ util/mutex.h\ util/pcre.h\ - util/pod_array.h\ - util/sparse_array.h\ - util/sparse_set.h\ util/strutil.h\ util/test.h\ util/utf.h\ util/util.h\ re2/bitmap256.h\ re2/filtered_re2.h\ + re2/pod_array.h\ re2/prefilter.h\ re2/prefilter_tree.h\ re2/prog.h\ re2/re2.h\ re2/regexp.h\ re2/set.h\ + re2/sparse_array.h\ + re2/sparse_set.h\ re2/stringpiece.h\ re2/testing/exhaustive_tester.h\ re2/testing/regexp_generator.h\ @@ -175,117 +177,156 @@ DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES)) DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS)) DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS)) +.PRECIOUS: obj/%.o obj/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) $(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc +.PRECIOUS: obj/dbg/%.o obj/dbg/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) $(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) $*.cc +.PRECIOUS: obj/so/%.o obj/so/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) $(CXX) -c -o $@ -fPIC $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc +.PRECIOUS: obj/libre2.a obj/libre2.a: $(OFILES) @mkdir -p obj $(AR) $(ARFLAGS) obj/libre2.a $(OFILES) +.PRECIOUS: obj/dbg/libre2.a obj/dbg/libre2.a: $(DOFILES) @mkdir -p obj/dbg $(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES) -obj/so/libre2.$(SOEXT): $(SOFILES) +.PRECIOUS: obj/so/libre2.$(SOEXT) +obj/so/libre2.$(SOEXT): $(SOFILES) libre2.symbols libre2.symbols.darwin @mkdir -p obj/so $(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES) ln -sf libre2.$(SOEXTVER) $@ +.PRECIOUS: obj/dbg/test/% obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o @mkdir -p obj/dbg/test $(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) +.PRECIOUS: obj/test/% obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o @mkdir -p obj/test $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) # Test the shared lib, falling back to the static lib for private symbols +.PRECIOUS: obj/so/test/% obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o @mkdir -p obj/so/test $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o -Lobj/so -lre2 obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) +# Filter out dump.o because testing::TempDir() isn't available for it. obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o @mkdir -p obj/test - $(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) + $(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(filter-out obj/re2/testing/dump.o, $(TESTOFILES)) obj/util/benchmark.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) # re2_fuzzer is a target for fuzzers like libFuzzer and AFL. This fake fuzzing # is simply a way to check that the target builds and then to run it against a # fixed set of inputs. To perform real fuzzing, refer to the documentation for # libFuzzer (llvm.org/docs/LibFuzzer.html) and AFL (lcamtuf.coredump.cx/afl/). +obj/test/re2_fuzzer: CXXFLAGS:=-I./re2/fuzzing/compiler-rt/include $(CXXFLAGS) obj/test/re2_fuzzer: obj/libre2.a obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o @mkdir -p obj/test $(CXX) -o $@ obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) ifdef REBUILD_TABLES +.PRECIOUS: re2/perl_groups.cc re2/perl_groups.cc: re2/make_perl_groups.pl perl $< > $@ +.PRECIOUS: re2/unicode_%.cc re2/unicode_%.cc: re2/make_unicode_%.py python $< > $@ - -.PRECIOUS: re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc endif +.PHONY: distclean distclean: clean rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc +.PHONY: clean clean: rm -rf obj rm -f re2/*.pyc +.PHONY: testofiles testofiles: $(TESTOFILES) +.PHONY: test test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test +.PHONY: debug-test debug-test: $(DTESTS) @./runtests $(DTESTS) +.PHONY: static-test static-test: $(TESTS) @./runtests $(TESTS) +.PHONY: shared-test shared-test: $(STESTS) @./runtests -shared-library-path obj/so $(STESTS) +.PHONY: debug-bigtest debug-bigtest: $(DTESTS) $(DBIGTESTS) @./runtests $(DTESTS) $(DBIGTESTS) +.PHONY: static-bigtest static-bigtest: $(TESTS) $(BIGTESTS) @./runtests $(TESTS) $(BIGTESTS) +.PHONY: shared-bigtest shared-bigtest: $(STESTS) $(SBIGTESTS) @./runtests -shared-library-path obj/so $(STESTS) $(SBIGTESTS) +.PHONY: benchmark benchmark: obj/test/regexp_benchmark +.PHONY: fuzz fuzz: obj/test/re2_fuzzer -install: obj/libre2.a obj/so/libre2.$(SOEXT) - mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig - $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2 +.PHONY: install +install: static-install shared-install + +.PHONY: static +static: obj/libre2.a + +.PHONY: static-install +static-install: obj/libre2.a common-install $(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a + +.PHONY: shared +shared: obj/so/libre2.$(SOEXT) + +.PHONY: shared-install +shared-install: obj/so/libre2.$(SOEXT) common-install $(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00) ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER) ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT) - $(INSTALL_DATA) re2.pc $(DESTDIR)$(libdir)/pkgconfig/re2.pc - $(SED_INPLACE) -e "s#@prefix@#${prefix}#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc - $(SED_INPLACE) -e "s#@exec_prefix@#${exec_prefix}#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc - $(SED_INPLACE) -e "s#@includedir@#${includedir}#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc - $(SED_INPLACE) -e "s#@libdir@#${libdir}#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc +.PHONY: common-install +common-install: + mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig + $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2 + $(INSTALL_DATA) re2.pc $(DESTDIR)$(libdir)/pkgconfig/re2.pc + $(SED_INPLACE) -e "s#@includedir@#$(includedir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc + $(SED_INPLACE) -e "s#@libdir@#$(libdir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc + +.PHONY: testinstall testinstall: static-testinstall shared-testinstall @echo @echo Install tests passed. @echo +.PHONY: static-testinstall static-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDFLAGS) static-testinstall: @@ -300,6 +341,7 @@ else obj/testinstall endif +.PHONY: shared-testinstall shared-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDFLAGS) shared-testinstall: @@ -312,19 +354,14 @@ else LD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(LD_LIBRARY_PATH)" obj/testinstall endif +.PHONY: benchlog benchlog: obj/test/regexp_benchmark (echo '==BENCHMARK==' `hostname` `date`; \ (uname -a; $(CXX) --version; git rev-parse --short HEAD; file obj/test/regexp_benchmark) | sed 's/^/# /'; \ echo; \ ./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//') -# Keep gmake from deleting intermediate files it creates. -# This makes repeated builds faster and preserves debug info on OS X. - -.PRECIOUS: obj/%.o obj/dbg/%.o obj/so/%.o obj/libre2.a \ - obj/dbg/libre2.a obj/so/libre2.a \ - obj/test/% obj/so/test/% obj/dbg/test/% - +.PHONY: log log: $(MAKE) clean $(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \ @@ -340,6 +377,3 @@ log: echo '#' RE2 basic search tests built by make $@ >re2-search.txt echo '#' $$(date) >>re2-search.txt obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt - -x: x.cc obj/libre2.a - g++ -I. -o x x.cc obj/libre2.a diff --git a/extern/re2/README b/extern/re2/README index d1ef431b2b..0e1142c6d9 100644 --- a/extern/re2/README +++ b/extern/re2/README @@ -27,12 +27,15 @@ under the BSD-style license found in the LICENSE file. RE2's native language is C++. +The Python wrapper is at https://github.com/google/re2/tree/abseil/python +and on PyPI (https://pypi.org/project/google-re2/). + A C wrapper is at https://github.com/marcomaggi/cre2/. An Erlang wrapper is at https://github.com/dukesoferl/re2/ and on Hex (hex.pm). An Inferno wrapper is at https://github.com/powerman/inferno-re2/. A Node.js wrapper is at https://github.com/uhop/node-re2/ and on NPM (npmjs.com). An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM (opam.ocaml.org). A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN (cpan.org). -A Python wrapper is at https://github.com/facebook/pyre2/ and on PyPI (pypi.org). An R wrapper is at https://github.com/qinwf/re2r/ and on CRAN (cran.r-project.org). A Ruby wrapper is at https://github.com/mudge/re2/ and on RubyGems (rubygems.org). +A WebAssembly wrapper is at https://github.com/google/re2-wasm/ and on NPM (npmjs.com). diff --git a/extern/re2/WORKSPACE b/extern/re2/WORKSPACE index de481fe836..484abfe2a4 100644 --- a/extern/re2/WORKSPACE +++ b/extern/re2/WORKSPACE @@ -3,4 +3,13 @@ # license that can be found in the LICENSE file. # Bazel (http://bazel.io/) WORKSPACE file for RE2. + workspace(name = "com_googlesource_code_re2") + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +http_archive( + name = "rules_cc", + strip_prefix = "rules_cc-master", + urls = ["https://github.com/bazelbuild/rules_cc/archive/master.zip"], +) diff --git a/extern/re2/benchlog/benchplot.py b/extern/re2/benchlog/benchplot.py old mode 100644 new mode 100755 diff --git a/extern/re2/benchlog/mktable b/extern/re2/benchlog/mktable old mode 100644 new mode 100755 diff --git a/extern/re2/doc/mksyntaxgo b/extern/re2/doc/mksyntaxgo old mode 100644 new mode 100755 index caad9b60b0..d30d281460 --- a/extern/re2/doc/mksyntaxgo +++ b/extern/re2/doc/mksyntaxgo @@ -15,7 +15,7 @@ sam -d $out <<'!' ,s/\n\n\n+/\n\n/g ,x/(^.* .*\n)+/ | awk -F' ' '{printf(" %-14s %s\n", $1, $2)}' 1,2c -// Copyright 2012 The Go Authors. All rights reserved. +// Copyright 2012 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. @@ -33,6 +33,7 @@ Parts of the syntax can be disabled by passing alternate flags to Parse. . $a +Unicode character classes are those in unicode.Categories and unicode.Scripts. */ package syntax . diff --git a/extern/re2/doc/mksyntaxhtml b/extern/re2/doc/mksyntaxhtml old mode 100644 new mode 100755 diff --git a/extern/re2/doc/mksyntaxwiki b/extern/re2/doc/mksyntaxwiki old mode 100644 new mode 100755 diff --git a/extern/re2/doc/syntax.html b/extern/re2/doc/syntax.html index aa08b1108b..47541e5c2e 100644 --- a/extern/re2/doc/syntax.html +++ b/extern/re2/doc/syntax.html @@ -47,6 +47,10 @@ x{-n}(≡ x{n}?) VIM x=(≡ x?) VIM +Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n} +reject forms that create a minimum or maximum repetition count above 1000. +Unlimited repetitions are not subject to this restriction. + Possessive repetitions: x*+zero or more x, possessive x++one or more x, possessive @@ -56,10 +60,10 @@ x{n}+exactly n x, possessive Grouping: -(re)numbered capturing group -(?P<name>re)named & numbered capturing group -(?<name>re)named & numbered capturing group -(?'name're)named & numbered capturing group +(re)numbered capturing group (submatch) +(?P<name>re)named & numbered capturing group (submatch) +(?<name>re)named & numbered capturing group (submatch) +(?'name're)named & numbered capturing group (submatch) (?:re)non-capturing group (?flags)set flags within current group; non-capturing (?flags:re)set flags during re; non-capturing @@ -80,8 +84,8 @@ ^at beginning of text or line (m=true) $at end of text (like \z not \Z) or line (m=true) \Aat beginning of text -\bat word boundary (\w on one side and \W, \A, or \z on the other) -\Bnot a word boundary +\bat ASCII word boundary (\w on one side and \W, \A, or \z on the other) +\Bnot at ASCII word boundary \Gat beginning of subtext being searched PCRE \Gat end of last match PERL \Zat end of text, or before newline at end of text @@ -166,7 +170,7 @@ [\p{Name}]named Unicode property inside character class (≡ \p{Name}) [^\p{Name}]named Unicode property inside negated character class (≡ \P{Name}) -Perl character classes: +Perl character classes (all ASCII-only): \ddigits (≡ [0-9]) \Dnot digits (≡ [^0-9]) \swhitespace (≡ [\t\n\f\r ]) @@ -237,105 +241,162 @@ Zsspace separator Unicode character class names--scripts: -ArabicArabic -ArmenianArmenian -BalineseBalinese -BamumBamum -BatakBatak -BengaliBengali -BopomofoBopomofo -BrahmiBrahmi -BrailleBraille -BugineseBuginese -BuhidBuhid -Canadian_AboriginalCanadian Aboriginal -CarianCarian -ChakmaChakma -ChamCham -CherokeeCherokee -Commoncharacters not specific to one script -CopticCoptic -CuneiformCuneiform -CypriotCypriot -CyrillicCyrillic -DeseretDeseret -DevanagariDevanagari -Egyptian_HieroglyphsEgyptian Hieroglyphs -EthiopicEthiopic -GeorgianGeorgian -GlagoliticGlagolitic -GothicGothic -GreekGreek -GujaratiGujarati -GurmukhiGurmukhi -HanHan -HangulHangul -HanunooHanunoo -HebrewHebrew -HiraganaHiragana -Imperial_AramaicImperial Aramaic -Inheritedinherit script from previous character -Inscriptional_PahlaviInscriptional Pahlavi -Inscriptional_ParthianInscriptional Parthian -JavaneseJavanese -KaithiKaithi -KannadaKannada -KatakanaKatakana -Kayah_LiKayah Li -KharoshthiKharoshthi -KhmerKhmer -LaoLao -LatinLatin -LepchaLepcha -LimbuLimbu -Linear_BLinear B -LycianLycian -LydianLydian -MalayalamMalayalam -MandaicMandaic -Meetei_MayekMeetei Mayek -Meroitic_CursiveMeroitic Cursive -Meroitic_HieroglyphsMeroitic Hieroglyphs -MiaoMiao -MongolianMongolian -MyanmarMyanmar -New_Tai_LueNew Tai Lue (aka Simplified Tai Lue) -NkoNko -OghamOgham -Ol_ChikiOl Chiki -Old_ItalicOld Italic -Old_PersianOld Persian -Old_South_ArabianOld South Arabian -Old_TurkicOld Turkic -OriyaOriya -OsmanyaOsmanya -Phags_Pa'Phags Pa -PhoenicianPhoenician -RejangRejang -RunicRunic -SaurashtraSaurashtra -SharadaSharada -ShavianShavian -SinhalaSinhala -Sora_SompengSora Sompeng -SundaneseSundanese -Syloti_NagriSyloti Nagri -SyriacSyriac -TagalogTagalog -TagbanwaTagbanwa -Tai_LeTai Le -Tai_ThamTai Tham -Tai_VietTai Viet -TakriTakri -TamilTamil -TeluguTelugu -ThaanaThaana -ThaiThai -TibetanTibetan -TifinaghTifinagh -UgariticUgaritic -VaiVai -YiYi +Adlam +Ahom +Anatolian_Hieroglyphs +Arabic +Armenian +Avestan +Balinese +Bamum +Bassa_Vah +Batak +Bengali +Bhaiksuki +Bopomofo +Brahmi +Braille +Buginese +Buhid +Canadian_Aboriginal +Carian +Caucasian_Albanian +Chakma +Cham +Cherokee +Chorasmian +Common +Coptic +Cuneiform +Cypriot +Cyrillic +Deseret +Devanagari +Dives_Akuru +Dogra +Duployan +Egyptian_Hieroglyphs +Elbasan +Elymaic +Ethiopic +Georgian +Glagolitic +Gothic +Grantha +Greek +Gujarati +Gunjala_Gondi +Gurmukhi +Han +Hangul +Hanifi_Rohingya +Hanunoo +Hatran +Hebrew +Hiragana +Imperial_Aramaic +Inherited +Inscriptional_Pahlavi +Inscriptional_Parthian +Javanese +Kaithi +Kannada +Katakana +Kayah_Li +Kharoshthi +Khitan_Small_Script +Khmer +Khojki +Khudawadi +Lao +Latin +Lepcha +Limbu +Linear_A +Linear_B +Lisu +Lycian +Lydian +Mahajani +Makasar +Malayalam +Mandaic +Manichaean +Marchen +Masaram_Gondi +Medefaidrin +Meetei_Mayek +Mende_Kikakui +Meroitic_Cursive +Meroitic_Hieroglyphs +Miao +Modi +Mongolian +Mro +Multani +Myanmar +Nabataean +Nandinagari +New_Tai_Lue +Newa +Nko +Nushu +Nyiakeng_Puachue_Hmong +Ogham +Ol_Chiki +Old_Hungarian +Old_Italic +Old_North_Arabian +Old_Permic +Old_Persian +Old_Sogdian +Old_South_Arabian +Old_Turkic +Oriya +Osage +Osmanya +Pahawh_Hmong +Palmyrene +Pau_Cin_Hau +Phags_Pa +Phoenician +Psalter_Pahlavi +Rejang +Runic +Samaritan +Saurashtra +Sharada +Shavian +Siddham +SignWriting +Sinhala +Sogdian +Sora_Sompeng +Soyombo +Sundanese +Syloti_Nagri +Syriac +Tagalog +Tagbanwa +Tai_Le +Tai_Tham +Tai_Viet +Takri +Tamil +Tangut +Telugu +Thaana +Thai +Tibetan +Tifinagh +Tirhuta +Ugaritic +Vai +Wancho +Warang_Citi +Yezidi +Yi +Zanabazar_Square Vim character classes: \iidentifier character VIM diff --git a/extern/re2/doc/syntax.txt b/extern/re2/doc/syntax.txt index cb04bbf05e..ce87866054 100644 --- a/extern/re2/doc/syntax.txt +++ b/extern/re2/doc/syntax.txt @@ -253,6 +253,7 @@ Caucasian_Albanian Chakma Cham Cherokee +Chorasmian Common Coptic Cuneiform @@ -260,6 +261,7 @@ Cypriot Cyrillic Deseret Devanagari +Dives_Akuru Dogra Duployan Egyptian_Hieroglyphs @@ -291,6 +293,7 @@ Kannada Katakana Kayah_Li Kharoshthi +Khitan_Small_Script Khmer Khojki Khudawadi @@ -380,6 +383,7 @@ Ugaritic Vai Wancho Warang_Citi +Yezidi Yi Zanabazar_Square diff --git a/extern/re2/kokoro/cmake.sh b/extern/re2/kokoro/cmake.sh deleted file mode 100644 index 999fbfed02..0000000000 --- a/extern/re2/kokoro/cmake.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -set -eux - -cd git/re2 - -case "${KOKORO_JOB_NAME}" in - */windows-*) - CMAKE_G_A_FLAGS=('-G' 'Visual Studio 14 2015' '-A' 'x64') - ;; - *) - CMAKE_G_A_FLAGS=() - # Work around a bug in older versions of bash. :/ - set +u - ;; -esac - -cmake -D CMAKE_BUILD_TYPE=Debug "${CMAKE_G_A_FLAGS[@]}" . -cmake --build . --config Debug --clean-first -ctest -C Debug --output-on-failure -E 'dfa|exhaustive|random' - -cmake -D CMAKE_BUILD_TYPE=Release "${CMAKE_G_A_FLAGS[@]}" . -cmake --build . --config Release --clean-first -ctest -C Release --output-on-failure -E 'dfa|exhaustive|random' - -exit 0 diff --git a/extern/re2/kokoro/macos-bazel.cfg b/extern/re2/kokoro/macos-bazel.cfg deleted file mode 100644 index 7901981405..0000000000 --- a/extern/re2/kokoro/macos-bazel.cfg +++ /dev/null @@ -1 +0,0 @@ -build_file: "re2/kokoro/macos-bazel.sh" diff --git a/extern/re2/kokoro/macos-bazel.sh b/extern/re2/kokoro/macos-bazel.sh deleted file mode 100644 index e43c852112..0000000000 --- a/extern/re2/kokoro/macos-bazel.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -set -eux -bash git/re2/kokoro/bazel.sh -exit $? diff --git a/extern/re2/kokoro/macos-cmake.cfg b/extern/re2/kokoro/macos-cmake.cfg deleted file mode 100644 index 5c459e7cd4..0000000000 --- a/extern/re2/kokoro/macos-cmake.cfg +++ /dev/null @@ -1 +0,0 @@ -build_file: "re2/kokoro/macos-cmake.sh" diff --git a/extern/re2/kokoro/macos-cmake.sh b/extern/re2/kokoro/macos-cmake.sh deleted file mode 100644 index ef4b7dcda1..0000000000 --- a/extern/re2/kokoro/macos-cmake.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -set -eux -bash git/re2/kokoro/cmake.sh -exit $? diff --git a/extern/re2/kokoro/ubuntu-bazel.cfg b/extern/re2/kokoro/ubuntu-bazel.cfg deleted file mode 100644 index 884d14f4be..0000000000 --- a/extern/re2/kokoro/ubuntu-bazel.cfg +++ /dev/null @@ -1 +0,0 @@ -build_file: "re2/kokoro/ubuntu-bazel.sh" diff --git a/extern/re2/kokoro/ubuntu-bazel.sh b/extern/re2/kokoro/ubuntu-bazel.sh deleted file mode 100644 index e43c852112..0000000000 --- a/extern/re2/kokoro/ubuntu-bazel.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -set -eux -bash git/re2/kokoro/bazel.sh -exit $? diff --git a/extern/re2/kokoro/windows-bazel.bat b/extern/re2/kokoro/windows-bazel.bat deleted file mode 100644 index 283f8d213b..0000000000 --- a/extern/re2/kokoro/windows-bazel.bat +++ /dev/null @@ -1,2 +0,0 @@ -bash git/re2/kokoro/bazel.sh -EXIT /B %ERRORLEVEL% diff --git a/extern/re2/kokoro/windows-bazel.cfg b/extern/re2/kokoro/windows-bazel.cfg deleted file mode 100644 index 18b1ed735c..0000000000 --- a/extern/re2/kokoro/windows-bazel.cfg +++ /dev/null @@ -1 +0,0 @@ -build_file: "re2/kokoro/windows-bazel.bat" diff --git a/extern/re2/kokoro/windows-cmake.bat b/extern/re2/kokoro/windows-cmake.bat deleted file mode 100644 index 77a4db9d30..0000000000 --- a/extern/re2/kokoro/windows-cmake.bat +++ /dev/null @@ -1,2 +0,0 @@ -bash git/re2/kokoro/cmake.sh -EXIT /B %ERRORLEVEL% diff --git a/extern/re2/kokoro/windows-cmake.cfg b/extern/re2/kokoro/windows-cmake.cfg deleted file mode 100644 index 4453eb6e09..0000000000 --- a/extern/re2/kokoro/windows-cmake.cfg +++ /dev/null @@ -1 +0,0 @@ -build_file: "re2/kokoro/windows-cmake.bat" diff --git a/extern/re2/lib/git/commit-msg.hook b/extern/re2/lib/git/commit-msg.hook old mode 100644 new mode 100755 diff --git a/extern/re2/libre2.symbols b/extern/re2/libre2.symbols index 8308b64892..93b71b4862 100644 --- a/extern/re2/libre2.symbols +++ b/extern/re2/libre2.symbols @@ -11,6 +11,9 @@ # re2::FilteredRE2* _ZN3re211FilteredRE2*; _ZNK3re211FilteredRE2*; + # re2::re2_internal* + _ZN3re212re2_internal*; + _ZNK3re212re2_internal*; local: *; }; diff --git a/extern/re2/libre2.symbols.darwin b/extern/re2/libre2.symbols.darwin index 31e8c52209..41ac96f93b 100644 --- a/extern/re2/libre2.symbols.darwin +++ b/extern/re2/libre2.symbols.darwin @@ -10,3 +10,6 @@ __ZN3re2ls* # re2::FilteredRE2* __ZN3re211FilteredRE2* __ZNK3re211FilteredRE2* +# re2::re2_internal* +__ZN3re212re2_internal* +__ZNK3re212re2_internal* diff --git a/extern/re2/re2.pc b/extern/re2/re2.pc index d66cf5199c..50fd637d4e 100644 --- a/extern/re2/re2.pc +++ b/extern/re2/re2.pc @@ -1,5 +1,3 @@ -prefix=@prefix@ -exec_prefix=@exec_prefix@ includedir=@includedir@ libdir=@libdir@ diff --git a/extern/re2/re2/bitmap256.h b/extern/re2/re2/bitmap256.h index f649b4ccca..4899379e4d 100644 --- a/extern/re2/re2/bitmap256.h +++ b/extern/re2/re2/bitmap256.h @@ -32,7 +32,7 @@ class Bitmap256 { DCHECK_GE(c, 0); DCHECK_LE(c, 255); - return (words_[c / 64] & (1ULL << (c % 64))) != 0; + return (words_[c / 64] & (uint64_t{1} << (c % 64))) != 0; } // Sets the bit with index c. @@ -40,7 +40,7 @@ class Bitmap256 { DCHECK_GE(c, 0); DCHECK_LE(c, 255); - words_[c / 64] |= (1ULL << (c % 64)); + words_[c / 64] |= (uint64_t{1} << (c % 64)); } // Finds the next non-zero bit with index >= c. @@ -51,7 +51,6 @@ class Bitmap256 { // Finds the least significant non-zero bit in n. static int FindLSBSet(uint64_t n) { DCHECK_NE(n, 0); - #if defined(__GNUC__) return __builtin_ctzll(n); #elif defined(_MSC_VER) && defined(_M_X64) @@ -89,7 +88,7 @@ int Bitmap256::FindNextSetBit(int c) const { // Check the word that contains the bit. Mask out any lower bits. int i = c / 64; - uint64_t word = words_[i] & (~0ULL << (c % 64)); + uint64_t word = words_[i] & (~uint64_t{0} << (c % 64)); if (word != 0) return (i * 64) + FindLSBSet(word); diff --git a/extern/re2/re2/bitstate.cc b/extern/re2/re2/bitstate.cc index 6f045b19dd..320d1eea15 100644 --- a/extern/re2/re2/bitstate.cc +++ b/extern/re2/re2/bitstate.cc @@ -7,7 +7,7 @@ // Prog::SearchBitState is a regular expression search with submatch // tracking for small regular expressions and texts. Similarly to // testing/backtrack.cc, it allocates a bitmap with (count of -// lists) * (length of prog) bits to make sure it never explores the +// lists) * (length of text) bits to make sure it never explores the // same (instruction list, character position) multiple times. This // limits the search to run in time linear in the length of the text. // @@ -24,7 +24,7 @@ #include #include "util/logging.h" -#include "util/pod_array.h" +#include "re2/pod_array.h" #include "re2/prog.h" #include "re2/regexp.h" @@ -63,11 +63,14 @@ class BitState { int nsubmatch_; // # of submatches to fill in // Search state - static const int VisitedBits = 32; - PODArray visited_; // bitmap: (list ID, char*) pairs visited + static constexpr int kVisitedBits = 64; + PODArray visited_; // bitmap: (list ID, char*) pairs visited PODArray cap_; // capture registers PODArray job_; // stack of text positions to explore int njob_; // stack size + + BitState(const BitState&) = delete; + BitState& operator=(const BitState&) = delete; }; BitState::BitState(Prog* prog) @@ -86,10 +89,10 @@ BitState::BitState(Prog* prog) // we don't repeat the visit. bool BitState::ShouldVisit(int id, const char* p) { int n = prog_->list_heads()[id] * static_cast(text_.size()+1) + - static_cast(p-text_.begin()); - if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1)))) + static_cast(p-text_.data()); + if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1)))) return false; - visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1)); + visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1)); return true; } @@ -134,7 +137,7 @@ void BitState::Push(int id, const char* p) { // Return whether it succeeded. bool BitState::TrySearch(int id0, const char* p0) { bool matched = false; - const char* end = text_.end(); + const char* end = text_.data() + text_.size(); njob_ = 0; // Push() no longer checks ShouldVisit(), // so we must perform the check ourselves. @@ -251,7 +254,7 @@ bool BitState::TrySearch(int id0, const char* p0) { matched = true; cap_[1] = p; if (submatch_[0].data() == NULL || - (longest_ && p > submatch_[0].end())) { + (longest_ && p > submatch_[0].data() + submatch_[0].size())) { for (int i = 0; i < nsubmatch_; i++) submatch_[i] = StringPiece(cap_[2 * i], @@ -288,7 +291,7 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, // Search parameters. text_ = text; context_ = context; - if (context_.begin() == NULL) + if (context_.data() == NULL) context_ = text; if (prog_->anchor_start() && context_.begin() != text.begin()) return false; @@ -304,8 +307,8 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, // Allocate scratch space. int nvisited = prog_->list_count() * static_cast(text.size()+1); - nvisited = (nvisited + VisitedBits-1) / VisitedBits; - visited_ = PODArray(nvisited); + nvisited = (nvisited + kVisitedBits-1) / kVisitedBits; + visited_ = PODArray(nvisited); memset(visited_.data(), 0, nvisited*sizeof visited_[0]); int ncap = 2*nsubmatch; @@ -319,8 +322,8 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, // Anchored search must start at text.begin(). if (anchored_) { - cap_[0] = text.begin(); - return TrySearch(prog_->start(), text.begin()); + cap_[0] = text.data(); + return TrySearch(prog_->start(), text.data()); } // Unanchored search, starting from each possible text position. @@ -329,18 +332,22 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, // This looks like it's quadratic in the size of the text, // but we are not clearing visited_ between calls to TrySearch, // so no work is duplicated and it ends up still being linear. - for (const char* p = text.begin(); p <= text.end(); p++) { - // Try to use memchr to find the first byte quickly. - int fb = prog_->first_byte(); - if (fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) { - p = reinterpret_cast(memchr(p, fb, text.end() - p)); + const char* etext = text.data() + text.size(); + for (const char* p = text.data(); p <= etext; p++) { + // Try to use prefix accel (e.g. memchr) to skip ahead. + if (p < etext && prog_->can_prefix_accel()) { + p = reinterpret_cast(prog_->PrefixAccel(p, etext - p)); if (p == NULL) - p = text.end(); + p = etext; } cap_[0] = p; if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. return true; + // Avoid invoking undefined behavior (arithmetic on a null pointer) + // by simply not continuing the loop. + if (p == NULL) + break; } return false; } diff --git a/extern/re2/re2/compile.cc b/extern/re2/re2/compile.cc index 7457b228ac..7a9de07281 100644 --- a/extern/re2/re2/compile.cc +++ b/extern/re2/re2/compile.cc @@ -14,8 +14,8 @@ #include #include "util/logging.h" -#include "util/pod_array.h" #include "util/utf.h" +#include "re2/pod_array.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" @@ -30,91 +30,57 @@ namespace re2 { // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. // // Because the out and out1 fields in Inst are no longer pointers, -// we can't use pointers directly here either. Instead, p refers -// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1). -// p == 0 represents the NULL list. This is okay because instruction #0 +// we can't use pointers directly here either. Instead, head refers +// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1). +// head == 0 represents the NULL list. This is okay because instruction #0 // is always the fail instruction, which never appears on a list. - struct PatchList { - uint32_t p; - // Returns patch list containing just p. - static PatchList Mk(uint32_t p); + static PatchList Mk(uint32_t p) { + return {p, p}; + } - // Patches all the entries on l to have value v. + // Patches all the entries on l to have value p. // Caller must not ever use patch list again. - static void Patch(Prog::Inst *inst0, PatchList l, uint32_t v); - - // Deref returns the next pointer pointed at by p. - static PatchList Deref(Prog::Inst *inst0, PatchList l); - - // Appends two patch lists and returns result. - static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); -}; - -static PatchList nullPatchList = { 0 }; - -// Returns patch list containing just p. -PatchList PatchList::Mk(uint32_t p) { - PatchList l; - l.p = p; - return l; -} - -// Returns the next pointer pointed at by l. -PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { - Prog::Inst* ip = &inst0[l.p>>1]; - if (l.p&1) - l.p = ip->out1(); - else - l.p = ip->out(); - return l; -} - -// Patches all the entries on l to have value v. -void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32_t val) { - while (l.p != 0) { - Prog::Inst* ip = &inst0[l.p>>1]; - if (l.p&1) { - l.p = ip->out1(); - ip->out1_ = val; - } else { - l.p = ip->out(); - ip->set_out(val); + static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) { + while (l.head != 0) { + Prog::Inst* ip = &inst0[l.head>>1]; + if (l.head&1) { + l.head = ip->out1(); + ip->out1_ = p; + } else { + l.head = ip->out(); + ip->set_out(p); + } } } -} -// Appends two patch lists and returns result. -PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { - if (l1.p == 0) - return l2; - if (l2.p == 0) - return l1; - - PatchList l = l1; - for (;;) { - PatchList next = PatchList::Deref(inst0, l); - if (next.p == 0) - break; - l = next; + // Appends two patch lists and returns result. + static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { + if (l1.head == 0) + return l2; + if (l2.head == 0) + return l1; + Prog::Inst* ip = &inst0[l1.tail>>1]; + if (l1.tail&1) + ip->out1_ = l2.head; + else + ip->set_out(l2.head); + return {l1.head, l2.tail}; } - Prog::Inst* ip = &inst0[l.p>>1]; - if (l.p&1) - ip->out1_ = l2.p; - else - ip->set_out(l2.p); + uint32_t head; + uint32_t tail; // for constant-time append +}; - return l1; -} +static const PatchList kNullPatchList = {0, 0}; // Compiled program fragment. struct Frag { uint32_t begin; PatchList end; - Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector + Frag() : begin(0) { end.head = 0; } // needed so Frag can go in vector Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {} }; @@ -212,8 +178,8 @@ class Compiler : public Regexp::Walker { int AddSuffixRecursive(int root, int id); // Finds the trie node for the given suffix. Returns a Frag in order to - // distinguish between pointing at the root node directly (end.p == 0) - // and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively). + // distinguish between pointing at the root node directly (end.head == 0) + // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively). Frag FindByteRange(int root, int id); // Compares two ByteRanges and returns true iff they are equal. @@ -225,8 +191,8 @@ class Compiler : public Regexp::Walker { // Single rune. Frag Literal(Rune r, bool foldcase); - void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor); - Prog* Finish(); + void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor); + Prog* Finish(Regexp* re); // Returns .* where dot = any byte Frag DotStar(); @@ -298,7 +264,7 @@ int Compiler::AllocInst(int n) { // Returns an unmatchable fragment. Frag Compiler::NoMatch() { - return Frag(0, nullPatchList); + return Frag(0, kNullPatchList); } // Is a an unmatchable fragment? @@ -314,7 +280,7 @@ Frag Compiler::Cat(Frag a, Frag b) { // Elide no-op. Prog::Inst* begin = &inst_[a.begin]; if (begin->opcode() == kInstNop && - a.end.p == (a.begin << 1) && + a.end.head == (a.begin << 1) && begin->out() == 0) { // in case refs to a somewhere PatchList::Patch(inst_.data(), a.end, b.begin); @@ -419,7 +385,7 @@ Frag Compiler::Match(int32_t match_id) { if (id < 0) return NoMatch(); inst_[id].InitMatch(match_id); - return Frag(id, nullPatchList); + return Frag(id, kNullPatchList); } // Returns a fragment matching a particular empty-width op (like ^ or $) @@ -467,7 +433,7 @@ static int MaxRune(int len) { void Compiler::BeginRange() { rune_cache_.clear(); rune_range_.begin = 0; - rune_range_.end = nullPatchList; + rune_range_.end = kNullPatchList; } int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, @@ -548,9 +514,9 @@ int Compiler::AddSuffixRecursive(int root, int id) { } int br; - if (f.end.p == 0) + if (f.end.head == 0) br = root; - else if (f.end.p&1) + else if (f.end.head&1) br = inst_[f.begin].out1(); else br = inst_[f.begin].out(); @@ -566,9 +532,9 @@ int Compiler::AddSuffixRecursive(int root, int id) { // Ensure that the parent points to the clone, not to the original. // Note that this could leave the head unreachable except via the cache. br = byterange; - if (f.end.p == 0) + if (f.end.head == 0) root = br; - else if (f.end.p&1) + else if (f.end.head&1) inst_[f.begin].out1_ = br; else inst_[f.begin].set_out(br); @@ -601,7 +567,7 @@ bool Compiler::ByteRangeEqual(int id1, int id2) { Frag Compiler::FindByteRange(int root, int id) { if (inst_[root].opcode() == kInstByteRange) { if (ByteRangeEqual(root, id)) - return Frag(root, nullPatchList); + return Frag(root, kNullPatchList); else return NoMatch(); } @@ -662,48 +628,43 @@ void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { static_cast(hi), foldcase, 0)); } -// Table describing how to make a UTF-8 matching machine -// for the rune range 80-10FFFF (Runeself-Runemax). -// This range happens frequently enough (for example /./ and /[^a-z]/) -// and the rune_cache_ map is slow enough that this is worth -// special handling. Makes compilation of a small expression -// with a dot in it about 10% faster. -// The * in the comments below mark whole sequences. -static struct ByteRangeProg { - int next; - int lo; - int hi; -} prog_80_10ffff[] = { - // Two-byte - { -1, 0x80, 0xBF, }, // 0: 80-BF - { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF* - - // Three-byte - { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF - { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF* - { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF - { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF* - - // Four-byte - { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF - { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF* - { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF - { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF* - { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF - { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF* -}; - void Compiler::Add_80_10ffff() { - int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning - for (size_t i = 0; i < arraysize(prog_80_10ffff); i++) { - const ByteRangeProg& p = prog_80_10ffff[i]; - int next = 0; - if (p.next >= 0) - next = inst[p.next]; - inst[i] = UncachedRuneByteSuffix(static_cast(p.lo), - static_cast(p.hi), false, next); - if ((p.lo & 0xC0) != 0x80) - AddSuffix(inst[i]); + // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough + // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by + // permitting overlong encodings in E0 and F0 sequences and code points + // over 10FFFF in F4 sequences, the size of the bytecode and the number + // of equivalence classes are reduced significantly. + int id; + if (reversed_) { + // Prefix factoring matters, but we don't have to handle it here + // because the rune range trie logic takes care of that already. + id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + AddSuffix(id); + + id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + AddSuffix(id); + + id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); + AddSuffix(id); + } else { + // Suffix factoring matters - and we do have to handle it here. + int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0); + id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1); + AddSuffix(id); + + int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1); + id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2); + AddSuffix(id); + + int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2); + id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3); + AddSuffix(id); } } @@ -711,9 +672,8 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { if (lo > hi) return; - // Pick off 80-10FFFF as a common special case - // that can bypass the slow rune_cache_. - if (lo == 0x80 && hi == 0x10ffff && !reversed_) { + // Pick off 80-10FFFF as a common special case. + if (lo == 0x80 && hi == 0x10ffff) { Add_80_10ffff(); return; } @@ -1095,8 +1055,6 @@ static bool IsAnchorEnd(Regexp** pre, int depth) { void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor) { - prog_->set_flags(flags); - if (flags & Regexp::Latin1) encoding_ = kEncodingLatin1; max_mem_ = max_mem; @@ -1117,14 +1075,11 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, // on the program.) if (m >= 1<<24) m = 1<<24; - // Inst imposes its own limit (currently bigger than 2^24 but be safe). if (m > Prog::Inst::kMaxInst) m = Prog::Inst::kMaxInst; - max_ninst_ = static_cast(m); } - anchor_ = anchor; } @@ -1178,10 +1133,10 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { c.prog_->set_start_unanchored(all.begin); // Hand ownership of prog_ to caller. - return c.Finish(); + return c.Finish(re); } -Prog* Compiler::Finish() { +Prog* Compiler::Finish(Regexp* re) { if (failed_) return NULL; @@ -1198,6 +1153,17 @@ Prog* Compiler::Finish() { prog_->Flatten(); prog_->ComputeByteMap(); + if (!prog_->reversed()) { + std::string prefix; + bool prefix_foldcase; + if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase) && + !prefix_foldcase) { + prog_->prefix_size_ = prefix.size(); + prog_->prefix_front_ = prefix.front(); + prog_->prefix_back_ = prefix.back(); + } + } + // Record remaining memory for DFA. if (max_mem_ <= 0) { prog_->set_dfa_mem(1<<20); @@ -1254,7 +1220,7 @@ Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { c.prog_->set_start(all.begin); c.prog_->set_start_unanchored(all.begin); - Prog* prog = c.Finish(); + Prog* prog = c.Finish(re); if (prog == NULL) return NULL; diff --git a/extern/re2/re2/dfa.cc b/extern/re2/re2/dfa.cc index 40880f9e2b..f292ff10b1 100644 --- a/extern/re2/re2/dfa.cc +++ b/extern/re2/re2/dfa.cc @@ -39,10 +39,11 @@ #include "util/logging.h" #include "util/mix.h" #include "util/mutex.h" -#include "util/pod_array.h" -#include "util/sparse_set.h" #include "util/strutil.h" +#include "re2/pod_array.h" #include "re2/prog.h" +#include "re2/re2.h" +#include "re2/sparse_set.h" #include "re2/stringpiece.h" // Silence "zero-sized array in struct/union" warning for DFA::State::next_. @@ -52,17 +53,6 @@ namespace re2 { -#if !defined(__linux__) /* only Linux seems to have memrchr */ -static void* memrchr(const void* s, int c, size_t n) { - const unsigned char* p = (const unsigned char*)s; - for (p += n; n > 0; n--) - if (*--p == c) - return (void*)p; - - return NULL; -} -#endif - // Controls whether the DFA should bail out early if the NFA would be faster. static bool dfa_should_bail_when_slow = true; @@ -177,11 +167,8 @@ class DFA { typedef std::unordered_set StateSet; private: - // Special "first_byte" values for a state. (Values >= 0 denote actual bytes.) - enum { - kFbUnknown = -1, // No analysis has been performed. - kFbNone = -2, // The first-byte trick cannot be used. - }; + // Make it easier to swap in a scalable reader-writer mutex. + using CacheMutex = Mutex; enum { // Indices into start_ for unanchored searches. @@ -249,25 +236,26 @@ class DFA { struct SearchParams { SearchParams(const StringPiece& text, const StringPiece& context, RWLocker* cache_lock) - : text(text), context(context), + : text(text), + context(context), anchored(false), + can_prefix_accel(false), want_earliest_match(false), run_forward(false), start(NULL), - first_byte(kFbUnknown), cache_lock(cache_lock), failed(false), ep(NULL), - matches(NULL) { } + matches(NULL) {} StringPiece text; StringPiece context; bool anchored; + bool can_prefix_accel; bool want_earliest_match; bool run_forward; State* start; - int first_byte; - RWLocker *cache_lock; + RWLocker* cache_lock; bool failed; // "out" parameter: whether search gave up const char* ep; // "out" parameter: end pointer for match SparseSet* matches; @@ -278,15 +266,13 @@ class DFA { }; // Before each search, the parameters to Search are analyzed by - // AnalyzeSearch to determine the state in which to start and the - // "first_byte" for that state, if any. + // AnalyzeSearch to determine the state in which to start. struct StartInfo { - StartInfo() : start(NULL), first_byte(kFbUnknown) {} - State* start; - std::atomic first_byte; + StartInfo() : start(NULL) {} + std::atomic start; }; - // Fills in params->start and params->first_byte using + // Fills in params->start and params->can_prefix_accel using // the other search parameters. Returns true on success, // false on failure. // cache_mutex_.r <= L < mutex_ @@ -297,10 +283,10 @@ class DFA { // The generic search loop, inlined to create specialized versions. // cache_mutex_.r <= L < mutex_ // Might unlock and relock cache_mutex_ via params->cache_lock. - inline bool InlinedSearchLoop(SearchParams* params, - bool have_first_byte, - bool want_earliest_match, - bool run_forward); + template + inline bool InlinedSearchLoop(SearchParams* params); // The specialized versions of InlinedSearchLoop. The three letters // at the ends of the name denote the true/false values used as the @@ -322,13 +308,6 @@ class DFA { // Might unlock and relock cache_mutex_ via params->cache_lock. bool FastSearchLoop(SearchParams* params); - // For debugging, a slow search loop that calls InlinedSearchLoop - // directly -- because the booleans passed are not constants, the - // loop is not specialized like the SearchFFF etc. versions, so it - // runs much more slowly. Useful only for debugging. - // cache_mutex_.r <= L < mutex_ - // Might unlock and relock cache_mutex_ via params->cache_lock. - bool SlowSearchLoop(SearchParams* params); // Looks up bytes in bytemap_ but handles case c == kByteEndText too. int ByteMap(int c) { @@ -355,11 +334,14 @@ class DFA { // while holding cache_mutex_ for writing, to avoid interrupting other // readers. Any State* pointers are only valid while cache_mutex_ // is held. - Mutex cache_mutex_; + CacheMutex cache_mutex_; int64_t mem_budget_; // Total memory budget for all States. int64_t state_budget_; // Amount of memory remaining for new States. StateSet state_cache_; // All States computed so far. StartInfo start_[kMaxStart]; + + DFA(const DFA&) = delete; + DFA& operator=(const DFA&) = delete; }; // Shorthand for casting to uint8_t*. @@ -442,7 +424,7 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem) q1_(NULL), mem_budget_(max_mem) { if (ExtraDebug) - fprintf(stderr, "\nkind %d\n%s\n", (int)kind_, prog_->DumpUnanchored().c_str()); + fprintf(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored().c_str()); int nmark = 0; if (kind_ == Prog::kLongestMatch) nmark = prog_->size(); @@ -613,7 +595,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // Only ByteRange, EmptyWidth, and Match instructions are useful to keep: // those are the only operators with any effect in // RunWorkqOnEmptyString or RunWorkqOnByte. - int* inst = new int[q->size()]; + PODArray inst(q->size()); int n = 0; uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions bool sawmatch = false; // whether queue contains guaranteed kInstMatch @@ -643,7 +625,6 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { (it == q->begin() && ip->greedy(prog_))) && (kind_ != Prog::kLongestMatch || !sawmark) && (flag & kFlagMatch)) { - delete[] inst; if (ExtraDebug) fprintf(stderr, " -> FullMatchState\n"); return FullMatchState; @@ -690,7 +671,6 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // the execution loop can stop early. This is only okay // if the state is *not* a matching state. if (n == 0 && flag == 0) { - delete[] inst; if (ExtraDebug) fprintf(stderr, " -> DeadState\n"); return DeadState; @@ -700,7 +680,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // unordered state sets separated by Marks. Sort each set // to canonicalize, to reduce the number of distinct sets stored. if (kind_ == Prog::kLongestMatch) { - int* ip = inst; + int* ip = inst.data(); int* ep = ip + n; while (ip < ep) { int* markp = ip; @@ -717,7 +697,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // we have an unordered set of states (i.e. we don't have Marks) // and sorting will reduce the number of distinct sets stored. if (kind_ == Prog::kManyMatch) { - int* ip = inst; + int* ip = inst.data(); int* ep = ip + n; std::sort(ip, ep); } @@ -736,8 +716,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // Save the needed empty-width flags in the top bits for use later. flag |= needflags << kFlagNeedShift; - State* state = CachedState(inst, n, flag); - delete[] inst; + State* state = CachedState(inst.data(), n, flag); return state; } @@ -971,8 +950,21 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, break; case kInstByteRange: // can follow if c is in range - if (ip->Matches(c)) - AddToQueue(newq, ip->out(), flag); + if (!ip->Matches(c)) + break; + AddToQueue(newq, ip->out(), flag); + if (ip->hint() != 0) { + // We have a hint, but we must cancel out the + // increment that will occur after the break. + i += ip->hint() - 1; + } else { + // We have no hint, so we must find the end + // of the current list and then skip to it. + Prog::Inst* ip0 = ip; + while (!ip->last()) + ++ip; + i += ip - ip0; + } break; case kInstMatch: @@ -989,8 +981,8 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, } if (ExtraDebug) - fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n", DumpWorkq(oldq).c_str(), - c, flag, DumpWorkq(newq).c_str(), *ismatch); + fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n", + DumpWorkq(oldq).c_str(), c, flag, DumpWorkq(newq).c_str(), *ismatch); } // Processes input byte c in state, returning new state. @@ -1117,7 +1109,7 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { class DFA::RWLocker { public: - explicit RWLocker(Mutex* mu); + explicit RWLocker(CacheMutex* mu); ~RWLocker(); // If the lock is only held for reading right now, @@ -1127,19 +1119,19 @@ class DFA::RWLocker { void LockForWriting(); private: - Mutex* mu_; + CacheMutex* mu_; bool writing_; RWLocker(const RWLocker&) = delete; RWLocker& operator=(const RWLocker&) = delete; }; -DFA::RWLocker::RWLocker(Mutex* mu) : mu_(mu), writing_(false) { +DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) { mu_->ReaderLock(); } -// This function is marked as NO_THREAD_SAFETY_ANALYSIS because the annotations -// does not support lock upgrade. +// This function is marked as NO_THREAD_SAFETY_ANALYSIS because +// the annotations don't support lock upgrade. void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS { if (!writing_) { mu_->ReaderUnlock(); @@ -1171,11 +1163,14 @@ void DFA::ResetCache(RWLocker* cache_lock) { // Re-acquire the cache_mutex_ for writing (exclusive use). cache_lock->LockForWriting(); + hooks::GetDFAStateCacheResetHook()({ + state_budget_, + state_cache_.size(), + }); + // Clear the cache, reset the memory budget. - for (int i = 0; i < kMaxStart; i++) { - start_[i].start = NULL; - start_[i].first_byte.store(kFbUnknown, std::memory_order_relaxed); - } + for (int i = 0; i < kMaxStart; i++) + start_[i].start.store(NULL, std::memory_order_relaxed); ClearCache(); mem_budget_ = state_budget_; } @@ -1290,8 +1285,7 @@ DFA::State* DFA::StateSaver::Restore() { // situation, the DFA can do better than executing the simple loop. // Instead, it can call memchr to search very quickly for the byte c. // Whether the start state has this property is determined during a -// pre-compilation pass, and if so, the byte b is passed to the search -// loop as the "first_byte" argument, along with a boolean "have_first_byte". +// pre-compilation pass and the "can_prefix_accel" argument is set. // // Fourth, the desired behavior is to search for the leftmost-best match // (approximately, the same one that Perl would find), which is not @@ -1323,15 +1317,16 @@ DFA::State* DFA::StateSaver::Restore() { // The bools are equal to the same-named variables in params, but // making them function arguments lets the inliner specialize // this function to each combination (see two paragraphs above). -inline bool DFA::InlinedSearchLoop(SearchParams* params, - bool have_first_byte, - bool want_earliest_match, - bool run_forward) { +template +inline bool DFA::InlinedSearchLoop(SearchParams* params) { State* start = params->start; - const uint8_t* bp = BytePtr(params->text.begin()); // start of text - const uint8_t* p = bp; // text scanning point - const uint8_t* ep = BytePtr(params->text.end()); // end of text - const uint8_t* resetp = NULL; // p at last cache reset + const uint8_t* bp = BytePtr(params->text.data()); // start of text + const uint8_t* p = bp; // text scanning point + const uint8_t* ep = BytePtr(params->text.data() + + params->text.size()); // end of text + const uint8_t* resetp = NULL; // p at last cache reset if (!run_forward) { using std::swap; swap(p, ep); @@ -1366,25 +1361,16 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, while (p != ep) { if (ExtraDebug) - fprintf(stderr, "@%td: %s\n", - p - bp, DumpState(s).c_str()); + fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str()); - if (have_first_byte && s == start) { - // In start state, only way out is to find first_byte, - // so use optimized assembly in memchr to skip ahead. - // If first_byte isn't found, we can skip to the end - // of the string. - if (run_forward) { - if ((p = BytePtr(memchr(p, params->first_byte, ep - p))) == NULL) { - p = ep; - break; - } - } else { - if ((p = BytePtr(memrchr(ep, params->first_byte, p - ep))) == NULL) { - p = ep; - break; - } - p++; + if (can_prefix_accel && s == start) { + // In start state, only way out is to find the prefix, + // so we use prefix accel (e.g. memchr) to skip ahead. + // If not found, we can skip to the end of the string. + p = BytePtr(prog_->PrefixAccel(p, ep - p)); + if (p == NULL) { + p = ep; + break; } } @@ -1475,8 +1461,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, else lastmatch = p + 1; if (ExtraDebug) - fprintf(stderr, "match @%td! [%s]\n", - lastmatch - bp, DumpState(s).c_str()); + fprintf(stderr, "match @%td! [%s]\n", lastmatch - bp, DumpState(s).c_str()); if (params->matches != NULL && kind_ == Prog::kManyMatch) { for (int i = s->ninst_ - 1; i >= 0; i--) { int id = s->inst_[i]; @@ -1560,36 +1545,28 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, // Inline specializations of the general loop. bool DFA::SearchFFF(SearchParams* params) { - return InlinedSearchLoop(params, 0, 0, 0); + return InlinedSearchLoop(params); } bool DFA::SearchFFT(SearchParams* params) { - return InlinedSearchLoop(params, 0, 0, 1); + return InlinedSearchLoop(params); } bool DFA::SearchFTF(SearchParams* params) { - return InlinedSearchLoop(params, 0, 1, 0); + return InlinedSearchLoop(params); } bool DFA::SearchFTT(SearchParams* params) { - return InlinedSearchLoop(params, 0, 1, 1); + return InlinedSearchLoop(params); } bool DFA::SearchTFF(SearchParams* params) { - return InlinedSearchLoop(params, 1, 0, 0); + return InlinedSearchLoop(params); } bool DFA::SearchTFT(SearchParams* params) { - return InlinedSearchLoop(params, 1, 0, 1); + return InlinedSearchLoop(params); } bool DFA::SearchTTF(SearchParams* params) { - return InlinedSearchLoop(params, 1, 1, 0); + return InlinedSearchLoop(params); } bool DFA::SearchTTT(SearchParams* params) { - return InlinedSearchLoop(params, 1, 1, 1); -} - -// For debugging, calls the general code directly. -bool DFA::SlowSearchLoop(SearchParams* params) { - return InlinedSearchLoop(params, - params->first_byte >= 0, - params->want_earliest_match, - params->run_forward); + return InlinedSearchLoop(params); } // For performance, calls the appropriate specialized version @@ -1608,8 +1585,7 @@ bool DFA::FastSearchLoop(SearchParams* params) { &DFA::SearchTTT, }; - bool have_first_byte = params->first_byte >= 0; - int index = 4 * have_first_byte + + int index = 4 * params->can_prefix_accel + 2 * params->want_earliest_match + 1 * params->run_forward; return (this->*Searches[index])(params); @@ -1701,13 +1677,22 @@ bool DFA::AnalyzeSearch(SearchParams* params) { } } - if (ExtraDebug) - fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s first_byte=%d\n", - params->anchored, params->run_forward, flags, - DumpState(info->start).c_str(), info->first_byte.load()); + params->start = info->start.load(std::memory_order_acquire); - params->start = info->start; - params->first_byte = info->first_byte.load(std::memory_order_acquire); + // Even if we could prefix accel, we cannot do so when anchored and, + // less obviously, we cannot do so when we are going to need flags. + // This trick works only when there is a single byte that leads to a + // different state! + if (prog_->can_prefix_accel() && + !params->anchored && + params->start > SpecialStateMax && + params->start->flag_ >> kFlagNeedShift == 0) + params->can_prefix_accel = true; + + if (ExtraDebug) + fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n", + params->anchored, params->run_forward, flags, + DumpState(params->start).c_str(), params->can_prefix_accel); return true; } @@ -1716,47 +1701,25 @@ bool DFA::AnalyzeSearch(SearchParams* params) { bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint32_t flags) { // Quick check. - int fb = info->first_byte.load(std::memory_order_acquire); - if (fb != kFbUnknown) + State* start = info->start.load(std::memory_order_acquire); + if (start != NULL) return true; MutexLock l(&mutex_); - fb = info->first_byte.load(std::memory_order_relaxed); - if (fb != kFbUnknown) + start = info->start.load(std::memory_order_relaxed); + if (start != NULL) return true; q0_->clear(); AddToQueue(q0_, params->anchored ? prog_->start() : prog_->start_unanchored(), flags); - info->start = WorkqToCachedState(q0_, NULL, flags); - if (info->start == NULL) + start = WorkqToCachedState(q0_, NULL, flags); + if (start == NULL) return false; - if (info->start == DeadState) { - // Synchronize with "quick check" above. - info->first_byte.store(kFbNone, std::memory_order_release); - return true; - } - - if (info->start == FullMatchState) { - // Synchronize with "quick check" above. - info->first_byte.store(kFbNone, std::memory_order_release); // will be ignored - return true; - } - - // Even if we have a first_byte, we cannot use it when anchored and, - // less obviously, we cannot use it when we are going to need flags. - // This trick works only when there is a single byte that leads to a - // different state! - int first_byte = prog_->first_byte(); - if (first_byte == -1 || - params->anchored || - info->start->flag_ >> kFlagNeedShift != 0) - first_byte = kFbNone; - // Synchronize with "quick check" above. - info->first_byte.store(first_byte, std::memory_order_release); + info->start.store(start, std::memory_order_release); return true; } @@ -1779,8 +1742,7 @@ bool DFA::Search(const StringPiece& text, if (ExtraDebug) { fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str()); fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n", - std::string(text).c_str(), anchored, want_earliest_match, - run_forward, kind_); + std::string(text).c_str(), anchored, want_earliest_match, run_forward, kind_); } RWLocker l(&cache_mutex_); @@ -1798,9 +1760,9 @@ bool DFA::Search(const StringPiece& text, return false; if (params.start == FullMatchState) { if (run_forward == want_earliest_match) - *epp = text.begin(); + *epp = text.data(); else - *epp = text.end(); + *epp = text.data() + text.size(); return true; } if (ExtraDebug) @@ -1863,15 +1825,15 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, *failed = false; StringPiece context = const_context; - if (context.begin() == NULL) + if (context.data() == NULL) context = text; - bool carat = anchor_start(); + bool caret = anchor_start(); bool dollar = anchor_end(); if (reversed_) { using std::swap; - swap(carat, dollar); + swap(caret, dollar); } - if (carat && context.begin() != text.begin()) + if (caret && context.begin() != text.begin()) return false; if (dollar && context.end() != text.end()) return false; @@ -1906,11 +1868,15 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, bool matched = dfa->Search(text, context, anchored, want_earliest_match, !reversed_, failed, &ep, matches); - if (*failed) + if (*failed) { + hooks::GetDFASearchFailureHook()({ + // Nothing yet... + }); return false; + } if (!matched) return false; - if (endmatch && ep != (reversed_ ? text.begin() : text.end())) + if (endmatch && ep != (reversed_ ? text.data() : text.data() + text.size())) return false; // If caller cares, record the boundary of the match. @@ -1918,10 +1884,11 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, // as the beginning. if (match0) { if (reversed_) - *match0 = StringPiece(ep, static_cast(text.end() - ep)); + *match0 = + StringPiece(ep, static_cast(text.data() + text.size() - ep)); else *match0 = - StringPiece(text.begin(), static_cast(ep - text.begin())); + StringPiece(text.data(), static_cast(ep - text.data())); } return true; } diff --git a/extern/re2/re2/filtered_re2.cc b/extern/re2/re2/filtered_re2.cc index e5d8de5ce6..5df97456e2 100644 --- a/extern/re2/re2/filtered_re2.cc +++ b/extern/re2/re2/filtered_re2.cc @@ -6,6 +6,7 @@ #include #include +#include #include "util/util.h" #include "util/logging.h" @@ -27,7 +28,22 @@ FilteredRE2::FilteredRE2(int min_atom_len) FilteredRE2::~FilteredRE2() { for (size_t i = 0; i < re2_vec_.size(); i++) delete re2_vec_[i]; - delete prefilter_tree_; +} + +FilteredRE2::FilteredRE2(FilteredRE2&& other) + : re2_vec_(std::move(other.re2_vec_)), + compiled_(other.compiled_), + prefilter_tree_(std::move(other.prefilter_tree_)) { + other.re2_vec_.clear(); + other.re2_vec_.shrink_to_fit(); + other.compiled_ = false; + other.prefilter_tree_.reset(new PrefilterTree()); +} + +FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) { + this->~FilteredRE2(); + (void) new (this) FilteredRE2(std::move(other)); + return *this; } RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, @@ -38,7 +54,7 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, if (!re->ok()) { if (options.log_errors()) { LOG(ERROR) << "Couldn't compile regular expression, skipping: " - << re << " due to error " << re->error(); + << pattern << " due to error " << re->error(); } delete re; } else { diff --git a/extern/re2/re2/filtered_re2.h b/extern/re2/re2/filtered_re2.h index 4118accc87..dd618c70e8 100644 --- a/extern/re2/re2/filtered_re2.h +++ b/extern/re2/re2/filtered_re2.h @@ -10,17 +10,18 @@ // number of regexps that need to be actually searched. // // By design, it does not include a string matching engine. This is to -// allow the user of the class to use their favorite string match +// allow the user of the class to use their favorite string matching // engine. The overall flow is: Add all the regexps using Add, then -// Compile the FilteredRE2. The compile returns strings that need to -// be matched. Note that all returned strings are lowercase. For -// applying regexps to a search text, the caller does the string -// matching using the strings returned. When doing the string match, -// note that the caller has to do that on lower cased version of the -// search text. Then call FirstMatch or AllMatches with a vector of -// indices of strings that were found in the text to get the actual -// regexp matches. +// Compile the FilteredRE2. Compile returns strings that need to be +// matched. Note that the returned strings are lowercased and distinct. +// For applying regexps to a search text, the caller does the string +// matching using the returned strings. When doing the string match, +// note that the caller has to do that in a case-insensitive way or +// on a lowercased version of the search text. Then call FirstMatch +// or AllMatches with a vector of indices of strings that were found +// in the text to get the actual regexp matches. +#include #include #include @@ -36,18 +37,25 @@ class FilteredRE2 { explicit FilteredRE2(int min_atom_len); ~FilteredRE2(); + // Not copyable. + FilteredRE2(const FilteredRE2&) = delete; + FilteredRE2& operator=(const FilteredRE2&) = delete; + // Movable. + FilteredRE2(FilteredRE2&& other); + FilteredRE2& operator=(FilteredRE2&& other); + // Uses RE2 constructor to create a RE2 object (re). Returns // re->error_code(). If error_code is other than NoError, then re is // deleted and not added to re2_vec_. RE2::ErrorCode Add(const StringPiece& pattern, const RE2::Options& options, - int *id); + int* id); // Prepares the regexps added by Add for filtering. Returns a set // of strings that the caller should check for in candidate texts. - // The returned strings are lowercased. When doing string matching, - // the search text should be lowercased first to find matching - // strings from the set of strings returned by Compile. Call after + // The returned strings are lowercased and distinct. When doing + // string matching, it should be performed in a case-insensitive + // way or the search text should be lowercased first. Call after // all Add calls are done. void Compile(std::vector* strings_to_match); @@ -98,10 +106,7 @@ class FilteredRE2 { bool compiled_; // An AND-OR tree of string atoms used for filtering regexps. - PrefilterTree* prefilter_tree_; - - FilteredRE2(const FilteredRE2&) = delete; - FilteredRE2& operator=(const FilteredRE2&) = delete; + std::unique_ptr prefilter_tree_; }; } // namespace re2 diff --git a/extern/re2/re2/fuzzing/compiler-rt/LICENSE b/extern/re2/re2/fuzzing/compiler-rt/LICENSE new file mode 100644 index 0000000000..f9dc50615d --- /dev/null +++ b/extern/re2/re2/fuzzing/compiler-rt/LICENSE @@ -0,0 +1,219 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +--- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. + diff --git a/extern/re2/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h b/extern/re2/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h new file mode 100644 index 0000000000..3e069eba69 --- /dev/null +++ b/extern/re2/re2/fuzzing/compiler-rt/include/fuzzer/FuzzedDataProvider.h @@ -0,0 +1,305 @@ +//===- FuzzedDataProvider.h - Utility header for fuzz targets ---*- C++ -* ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// A single header library providing an utility class to break up an array of +// bytes. Whenever run on the same input, provides the same output, as long as +// its methods are called in the same order, with the same arguments. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ +#define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// In addition to the comments below, the API is also briefly documented at +// https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#fuzzed-data-provider +class FuzzedDataProvider { + public: + // |data| is an array of length |size| that the FuzzedDataProvider wraps to + // provide more granular access. |data| must outlive the FuzzedDataProvider. + FuzzedDataProvider(const uint8_t *data, size_t size) + : data_ptr_(data), remaining_bytes_(size) {} + ~FuzzedDataProvider() = default; + + // Returns a std::vector containing |num_bytes| of input data. If fewer than + // |num_bytes| of data remain, returns a shorter std::vector containing all + // of the data that's left. Can be used with any byte sized type, such as + // char, unsigned char, uint8_t, etc. + template std::vector ConsumeBytes(size_t num_bytes) { + num_bytes = std::min(num_bytes, remaining_bytes_); + return ConsumeBytes(num_bytes, num_bytes); + } + + // Similar to |ConsumeBytes|, but also appends the terminator value at the end + // of the resulting vector. Useful, when a mutable null-terminated C-string is + // needed, for example. But that is a rare case. Better avoid it, if possible, + // and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods. + template + std::vector ConsumeBytesWithTerminator(size_t num_bytes, + T terminator = 0) { + num_bytes = std::min(num_bytes, remaining_bytes_); + std::vector result = ConsumeBytes(num_bytes + 1, num_bytes); + result.back() = terminator; + return result; + } + + // Returns a std::string containing |num_bytes| of input data. Using this and + // |.c_str()| on the resulting string is the best way to get an immutable + // null-terminated C string. If fewer than |num_bytes| of data remain, returns + // a shorter std::string containing all of the data that's left. + std::string ConsumeBytesAsString(size_t num_bytes) { + static_assert(sizeof(std::string::value_type) == sizeof(uint8_t), + "ConsumeBytesAsString cannot convert the data to a string."); + + num_bytes = std::min(num_bytes, remaining_bytes_); + std::string result( + reinterpret_cast(data_ptr_), + num_bytes); + Advance(num_bytes); + return result; + } + + // Returns a number in the range [min, max] by consuming bytes from the + // input data. The value might not be uniformly distributed in the given + // range. If there's no input data left, always returns |min|. |min| must + // be less than or equal to |max|. + template T ConsumeIntegralInRange(T min, T max) { + static_assert(std::is_integral::value, "An integral type is required."); + static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type."); + + if (min > max) + abort(); + + // Use the biggest type possible to hold the range and the result. + uint64_t range = static_cast(max) - min; + uint64_t result = 0; + size_t offset = 0; + + while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 && + remaining_bytes_ != 0) { + // Pull bytes off the end of the seed data. Experimentally, this seems to + // allow the fuzzer to more easily explore the input space. This makes + // sense, since it works by modifying inputs that caused new code to run, + // and this data is often used to encode length of data read by + // |ConsumeBytes|. Separating out read lengths makes it easier modify the + // contents of the data that is actually read. + --remaining_bytes_; + result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_]; + offset += CHAR_BIT; + } + + // Avoid division by 0, in case |range + 1| results in overflow. + if (range != std::numeric_limits::max()) + result = result % (range + 1); + + return static_cast(min + result); + } + + // Returns a std::string of length from 0 to |max_length|. When it runs out of + // input data, returns what remains of the input. Designed to be more stable + // with respect to a fuzzer inserting characters than just picking a random + // length and then consuming that many bytes with |ConsumeBytes|. + std::string ConsumeRandomLengthString(size_t max_length) { + // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\" + // followed by anything else to the end of the string. As a result of this + // logic, a fuzzer can insert characters into the string, and the string + // will be lengthened to include those new characters, resulting in a more + // stable fuzzer than picking the length of a string independently from + // picking its contents. + std::string result; + + // Reserve the anticipated capaticity to prevent several reallocations. + result.reserve(std::min(max_length, remaining_bytes_)); + for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) { + char next = ConvertUnsignedToSigned(data_ptr_[0]); + Advance(1); + if (next == '\\' && remaining_bytes_ != 0) { + next = ConvertUnsignedToSigned(data_ptr_[0]); + Advance(1); + if (next != '\\') + break; + } + result += next; + } + + result.shrink_to_fit(); + return result; + } + + // Returns a std::vector containing all remaining bytes of the input data. + template std::vector ConsumeRemainingBytes() { + return ConsumeBytes(remaining_bytes_); + } + + // Returns a std::string containing all remaining bytes of the input data. + // Prefer using |ConsumeRemainingBytes| unless you actually need a std::string + // object. + std::string ConsumeRemainingBytesAsString() { + return ConsumeBytesAsString(remaining_bytes_); + } + + // Returns a number in the range [Type's min, Type's max]. The value might + // not be uniformly distributed in the given range. If there's no input data + // left, always returns |min|. + template T ConsumeIntegral() { + return ConsumeIntegralInRange(std::numeric_limits::min(), + std::numeric_limits::max()); + } + + // Reads one byte and returns a bool, or false when no data remains. + bool ConsumeBool() { return 1 & ConsumeIntegral(); } + + // Returns a copy of the value selected from the given fixed-size |array|. + template + T PickValueInArray(const T (&array)[size]) { + static_assert(size > 0, "The array must be non empty."); + return array[ConsumeIntegralInRange(0, size - 1)]; + } + + template + T PickValueInArray(std::initializer_list list) { + // TODO(Dor1s): switch to static_assert once C++14 is allowed. + if (!list.size()) + abort(); + + return *(list.begin() + ConsumeIntegralInRange(0, list.size() - 1)); + } + + // Returns an enum value. The enum must start at 0 and be contiguous. It must + // also contain |kMaxValue| aliased to its largest (inclusive) value. Such as: + // enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue }; + template T ConsumeEnum() { + static_assert(std::is_enum::value, "|T| must be an enum type."); + return static_cast(ConsumeIntegralInRange( + 0, static_cast(T::kMaxValue))); + } + + // Returns a floating point number in the range [0.0, 1.0]. If there's no + // input data left, always returns 0. + template T ConsumeProbability() { + static_assert(std::is_floating_point::value, + "A floating point type is required."); + + // Use different integral types for different floating point types in order + // to provide better density of the resulting values. + using IntegralType = + typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t, + uint64_t>::type; + + T result = static_cast(ConsumeIntegral()); + result /= static_cast(std::numeric_limits::max()); + return result; + } + + // Returns a floating point value in the range [Type's lowest, Type's max] by + // consuming bytes from the input data. If there's no input data left, always + // returns approximately 0. + template T ConsumeFloatingPoint() { + return ConsumeFloatingPointInRange(std::numeric_limits::lowest(), + std::numeric_limits::max()); + } + + // Returns a floating point value in the given range by consuming bytes from + // the input data. If there's no input data left, returns |min|. Note that + // |min| must be less than or equal to |max|. + template T ConsumeFloatingPointInRange(T min, T max) { + if (min > max) + abort(); + + T range = .0; + T result = min; + constexpr T zero(.0); + if (max > zero && min < zero && max > min + std::numeric_limits::max()) { + // The diff |max - min| would overflow the given floating point type. Use + // the half of the diff as the range and consume a bool to decide whether + // the result is in the first of the second part of the diff. + range = (max / 2.0) - (min / 2.0); + if (ConsumeBool()) { + result += range; + } + } else { + range = max - min; + } + + return result + range * ConsumeProbability(); + } + + // Reports the remaining bytes available for fuzzed input. + size_t remaining_bytes() { return remaining_bytes_; } + + private: + FuzzedDataProvider(const FuzzedDataProvider &) = delete; + FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete; + + void Advance(size_t num_bytes) { + if (num_bytes > remaining_bytes_) + abort(); + + data_ptr_ += num_bytes; + remaining_bytes_ -= num_bytes; + } + + template + std::vector ConsumeBytes(size_t size, size_t num_bytes_to_consume) { + static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type."); + + // The point of using the size-based constructor below is to increase the + // odds of having a vector object with capacity being equal to the length. + // That part is always implementation specific, but at least both libc++ and + // libstdc++ allocate the requested number of bytes in that constructor, + // which seems to be a natural choice for other implementations as well. + // To increase the odds even more, we also call |shrink_to_fit| below. + std::vector result(size); + if (size == 0) { + if (num_bytes_to_consume != 0) + abort(); + return result; + } + + std::memcpy(result.data(), data_ptr_, num_bytes_to_consume); + Advance(num_bytes_to_consume); + + // Even though |shrink_to_fit| is also implementation specific, we expect it + // to provide an additional assurance in case vector's constructor allocated + // a buffer which is larger than the actual amount of data we put inside it. + result.shrink_to_fit(); + return result; + } + + template TS ConvertUnsignedToSigned(TU value) { + static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types."); + static_assert(!std::numeric_limits::is_signed, + "Source type must be unsigned."); + + // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream. + if (std::numeric_limits::is_modulo) + return static_cast(value); + + // Avoid using implementation-defined unsigned to signer conversions. + // To learn more, see https://stackoverflow.com/questions/13150449. + if (value <= std::numeric_limits::max()) { + return static_cast(value); + } else { + constexpr auto TS_min = std::numeric_limits::min(); + return TS_min + static_cast(value - TS_min); + } + } + + const uint8_t *data_ptr_; + size_t remaining_bytes_; +}; + +#endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ diff --git a/extern/re2/re2/fuzzing/re2_fuzzer.cc b/extern/re2/re2/fuzzing/re2_fuzzer.cc index 061c418e18..8306f887f5 100644 --- a/extern/re2/re2/fuzzing/re2_fuzzer.cc +++ b/extern/re2/re2/fuzzing/re2_fuzzer.cc @@ -2,12 +2,13 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#include #include #include -#include #include #include #include +#include #include "re2/prefilter.h" #include "re2/re2.h" @@ -17,7 +18,38 @@ using re2::StringPiece; // NOT static, NOT signed. uint8_t dummy = 0; -void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) { +void TestOneInput(StringPiece pattern, const RE2::Options& options, + StringPiece text) { + // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W. + // Otherwise, we will waste time on inputs that have long runs of various + // character classes. The fuzzer has shown itself to be easily capable of + // generating such patterns that fall within the other limits, but result + // in timeouts nonetheless. The marginal cost is high - even more so when + // counted repetition is involved - whereas the marginal benefit is zero. + // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain. + int char_class = 0; + int backslash_p = 0; // very expensive, so handle specially + for (size_t i = 0; i < pattern.size(); i++) { + if (pattern[i] == '.') + char_class++; + if (pattern[i] != '\\') + continue; + i++; + if (i >= pattern.size()) + break; + if (pattern[i] == 'p' || pattern[i] == 'P' || + pattern[i] == 'd' || pattern[i] == 'D' || + pattern[i] == 's' || pattern[i] == 'S' || + pattern[i] == 'w' || pattern[i] == 'W') + char_class++; + if (pattern[i] == 'p' || pattern[i] == 'P') + backslash_p++; + } + if (char_class > 9) + return; + if (backslash_p > 1) + return; + RE2 re(pattern, options); if (!re.ok()) return; @@ -55,7 +87,7 @@ void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) { // Don't waste time fuzzing high-fanout programs. // They can cause bug reports due to fuzzer timeouts. - std::map histogram; + std::vector histogram; int fanout = re.ProgramFanout(&histogram); if (fanout > 9) return; @@ -102,72 +134,38 @@ void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) { // Entry point for libFuzzer. extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { - if (size == 0 || size > 999) + // An input larger than 4 KiB probably isn't interesting. (This limit + // allows for fdp.ConsumeRandomLengthString()'s backslash behaviour.) + if (size == 0 || size > 4096) return 0; - // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W. - // Otherwise, we will waste time on inputs that have long runs of various - // character classes. The fuzzer has shown itself to be easily capable of - // generating such patterns that fall within the other limits, but result - // in timeouts nonetheless. The marginal cost is high - even more so when - // counted repetition is involved - whereas the marginal benefit is zero. - // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain. - int char_class = 0; - int backslash_p = 0; // very expensive, so handle specially - for (size_t i = 0; i < size; i++) { - if (data[i] == '.') - char_class++; - if (data[i] != '\\') - continue; - i++; - if (i >= size) - break; - if (data[i] == 'p' || data[i] == 'P' || - data[i] == 'd' || data[i] == 'D' || - data[i] == 's' || data[i] == 'S' || - data[i] == 'w' || data[i] == 'W') - char_class++; - if (data[i] == 'p' || data[i] == 'P') - backslash_p++; - } - if (char_class > 9) - return 0; - if (backslash_p > 1) - return 0; - - // The one-at-a-time hash by Bob Jenkins. - uint32_t hash = 0; - for (size_t i = 0; i < size; i++) { - hash += data[i]; - hash += (hash << 10); - hash ^= (hash >> 6); - } - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + FuzzedDataProvider fdp(data, size); + // The convention here is that fdp.ConsumeBool() returning false sets + // the default value whereas returning true sets the alternate value: + // most options default to false and so can be set directly; encoding + // defaults to UTF-8; case_sensitive defaults to true. We do NOT want + // to log errors. max_mem is 64 MiB because we can afford to use more + // RAM in exchange for (hopefully) faster fuzzing. RE2::Options options; + options.set_encoding(fdp.ConsumeBool() ? RE2::Options::EncodingLatin1 + : RE2::Options::EncodingUTF8); + options.set_posix_syntax(fdp.ConsumeBool()); + options.set_longest_match(fdp.ConsumeBool()); options.set_log_errors(false); options.set_max_mem(64 << 20); - options.set_encoding(hash & 1 ? RE2::Options::EncodingLatin1 - : RE2::Options::EncodingUTF8); - options.set_posix_syntax(hash & 2); - options.set_longest_match(hash & 4); - options.set_literal(hash & 8); - options.set_never_nl(hash & 16); - options.set_dot_nl(hash & 32); - options.set_never_capture(hash & 64); - options.set_case_sensitive(hash & 128); - options.set_perl_classes(hash & 256); - options.set_word_boundary(hash & 512); - options.set_one_line(hash & 1024); + options.set_literal(fdp.ConsumeBool()); + options.set_never_nl(fdp.ConsumeBool()); + options.set_dot_nl(fdp.ConsumeBool()); + options.set_never_capture(fdp.ConsumeBool()); + options.set_case_sensitive(!fdp.ConsumeBool()); + options.set_perl_classes(fdp.ConsumeBool()); + options.set_word_boundary(fdp.ConsumeBool()); + options.set_one_line(fdp.ConsumeBool()); - const char* ptr = reinterpret_cast(data); - int len = static_cast(size); - - StringPiece pattern(ptr, len); - StringPiece text(ptr, len); - Test(pattern, options, text); + std::string pattern = fdp.ConsumeRandomLengthString(999); + std::string text = fdp.ConsumeRandomLengthString(999); + TestOneInput(pattern, options, text); return 0; } diff --git a/extern/re2/re2/make_perl_groups.pl b/extern/re2/re2/make_perl_groups.pl old mode 100644 new mode 100755 index d9fcdafaaf..ed0d509dc3 --- a/extern/re2/re2/make_perl_groups.pl +++ b/extern/re2/re2/make_perl_groups.pl @@ -76,7 +76,7 @@ sub PrintClass($$@) { } else { $negname =~ y/a-z/A-Z/; } - return "{ \"$escname\", +1, code$cnum, $n }", "{ \"$negname\", -1, code$cnum, $n }"; + return "{ \"$escname\", +1, code$cnum, $n, 0, 0 }", "{ \"$negname\", -1, code$cnum, $n, 0, 0 }"; } my $cnum = 0; diff --git a/extern/re2/re2/make_unicode_casefold.py b/extern/re2/re2/make_unicode_casefold.py old mode 100644 new mode 100755 diff --git a/extern/re2/re2/make_unicode_groups.py b/extern/re2/re2/make_unicode_groups.py old mode 100644 new mode 100755 diff --git a/extern/re2/re2/mimics_pcre.cc b/extern/re2/re2/mimics_pcre.cc index ad197bef55..b1d6a51228 100644 --- a/extern/re2/re2/mimics_pcre.cc +++ b/extern/re2/re2/mimics_pcre.cc @@ -38,14 +38,21 @@ static bool CanBeEmptyString(Regexp *re); class PCREWalker : public Regexp::Walker { public: PCREWalker() {} - bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args, - int nchild_args); - bool ShortVisit(Regexp* re, bool a) { - // Should never be called: we use Walk not WalkExponential. - LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; + virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args); + + virtual bool ShortVisit(Regexp* re, bool a) { + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + LOG(DFATAL) << "PCREWalker::ShortVisit called"; +#endif return a; } + + private: + PCREWalker(const PCREWalker&) = delete; + PCREWalker& operator=(const PCREWalker&) = delete; }; // Called after visiting each of re's children and accumulating @@ -114,13 +121,16 @@ bool Regexp::MimicsPCRE() { class EmptyStringWalker : public Regexp::Walker { public: - EmptyStringWalker() { } - bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, - bool* child_args, int nchild_args); + EmptyStringWalker() {} - bool ShortVisit(Regexp* re, bool a) { - // Should never be called: we use Walk not WalkExponential. + virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args); + + virtual bool ShortVisit(Regexp* re, bool a) { + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; +#endif return a; } diff --git a/extern/re2/re2/nfa.cc b/extern/re2/re2/nfa.cc index 7bb4fafae6..e858451c59 100644 --- a/extern/re2/re2/nfa.cc +++ b/extern/re2/re2/nfa.cc @@ -27,17 +27,18 @@ #include #include #include +#include #include #include #include +#include "util/logging.h" +#include "util/strutil.h" +#include "re2/pod_array.h" #include "re2/prog.h" #include "re2/regexp.h" -#include "util/logging.h" -#include "util/pod_array.h" -#include "util/sparse_array.h" -#include "util/sparse_set.h" -#include "util/strutil.h" +#include "re2/sparse_array.h" +#include "re2/sparse_set.h" namespace re2 { @@ -107,18 +108,21 @@ class NFA { // Returns text version of capture information, for debugging. std::string FormatCapture(const char** capture); - inline void CopyCapture(const char** dst, const char** src); + void CopyCapture(const char** dst, const char** src) { + memmove(dst, src, ncapture_*sizeof src[0]); + } Prog* prog_; // underlying program int start_; // start instruction in program int ncapture_; // number of submatches to track bool longest_; // whether searching for longest match bool endmatch_; // whether match must end at text.end() - const char* btext_; // beginning of text being matched (for FormatSubmatch) - const char* etext_; // end of text being matched (for endmatch_) + const char* btext_; // beginning of text (for FormatSubmatch) + const char* etext_; // end of text (for endmatch_) Threadq q0_, q1_; // pre-allocated for Search. PODArray stack_; // pre-allocated for AddToThreadq - Thread* free_threads_; // free list + std::deque arena_; // thread arena + Thread* freelist_; // thread freelist const char** match_; // best match so far bool matched_; // any match so far? @@ -141,31 +145,30 @@ NFA::NFA(Prog* prog) { prog_->inst_count(kInstEmptyWidth) + prog_->inst_count(kInstNop) + 1; // + 1 for start inst stack_ = PODArray(nstack); - free_threads_ = NULL; + freelist_ = NULL; match_ = NULL; matched_ = false; } NFA::~NFA() { delete[] match_; - Thread* next; - for (Thread* t = free_threads_; t; t = next) { - next = t->next; - delete[] t->capture; - delete t; - } + for (const Thread& t : arena_) + delete[] t.capture; } NFA::Thread* NFA::AllocThread() { - Thread* t = free_threads_; - if (t == NULL) { - t = new Thread; + Thread* t = freelist_; + if (t != NULL) { + freelist_ = t->next; t->ref = 1; - t->capture = new const char*[ncapture_]; + // We don't need to touch t->capture because + // the caller will immediately overwrite it. return t; } - free_threads_ = t->next; + arena_.emplace_back(); + t = &arena_.back(); t->ref = 1; + t->capture = new const char*[ncapture_]; return t; } @@ -176,21 +179,13 @@ NFA::Thread* NFA::Incref(Thread* t) { } void NFA::Decref(Thread* t) { - if (t == NULL) - return; + DCHECK(t != NULL); t->ref--; if (t->ref > 0) return; DCHECK_EQ(t->ref, 0); - t->next = free_threads_; - free_threads_ = t; -} - -void NFA::CopyCapture(const char** dst, const char** src) { - for (int i = 0; i < ncapture_; i+=2) { - dst[i] = src[i]; - dst[i+1] = src[i+1]; - } + t->next = freelist_; + freelist_ = t; } // Follows all empty arrows from id0 and enqueues all the states reached. @@ -372,8 +367,10 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, matched_ = true; Decref(t); - for (++i; i != runq->end(); ++i) - Decref(i->value()); + for (++i; i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } runq->clear(); if (ip->greedy(prog_)) return ip->out1(); @@ -382,10 +379,15 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, break; case kInstMatch: { - // Avoid invoking undefined behavior when p happens - // to be null - and p-1 would be meaningless anyway. - if (p == NULL) + // Avoid invoking undefined behavior (arithmetic on a null pointer) + // by storing p instead of p-1. (What would the latter even mean?!) + // This complements the special case in NFA::Search(). + if (p == NULL) { + CopyCapture(match_, t->capture); + match_[1] = p; + matched_ = true; break; + } if (endmatch_ && p-1 != etext_) break; @@ -411,8 +413,10 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, // worse than the one we just found: don't run the // rest of the current Threadq. Decref(t); - for (++i; i != runq->end(); ++i) - Decref(i->value()); + for (++i; i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } runq->clear(); return 0; } @@ -431,12 +435,12 @@ std::string NFA::FormatCapture(const char** capture) { if (capture[i] == NULL) s += "(?,?)"; else if (capture[i+1] == NULL) - s += StringPrintf("(%d,?)", - (int)(capture[i] - btext_)); + s += StringPrintf("(%td,?)", + capture[i] - btext_); else - s += StringPrintf("(%d,%d)", - (int)(capture[i] - btext_), - (int)(capture[i+1] - btext_)); + s += StringPrintf("(%td,%td)", + capture[i] - btext_, + capture[i+1] - btext_); } return s; } @@ -448,7 +452,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, return false; StringPiece context = const_context; - if (context.begin() == NULL) + if (context.data() == NULL) context = text; // Sanity check: make sure that text lies within context. @@ -465,7 +469,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, if (prog_->anchor_end()) { longest = true; endmatch_ = true; - etext_ = text.end(); } if (nsubmatch < 0) { @@ -485,32 +488,33 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, } match_ = new const char*[ncapture_]; + memset(match_, 0, ncapture_*sizeof match_[0]); matched_ = false; // For debugging prints. - btext_ = context.begin(); + btext_ = context.data(); + // For convenience. + etext_ = text.data() + text.size(); if (ExtraDebug) fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n", - std::string(text).c_str(), std::string(context).c_str(), anchored, - longest); + std::string(text).c_str(), std::string(context).c_str(), anchored, longest); // Set up search. Threadq* runq = &q0_; Threadq* nextq = &q1_; runq->clear(); nextq->clear(); - memset(&match_[0], 0, ncapture_*sizeof match_[0]); // Loop over the text, stepping the machine. - for (const char* p = text.begin();; p++) { + for (const char* p = text.data();; p++) { if (ExtraDebug) { int c = 0; - if (p == context.begin()) + if (p == btext_) c = '^'; - else if (p > text.end()) + else if (p > etext_) c = '$'; - else if (p < text.end()) + else if (p < etext_) c = p[0] & 0xFF; fprintf(stderr, "%c:", c); @@ -524,14 +528,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, } // This is a no-op the first time around the loop because runq is empty. - int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, context, p); + int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p); DCHECK_EQ(runq->size(), 0); using std::swap; swap(nextq, runq); nextq->clear(); if (id != 0) { // We're done: full match ahead. - p = text.end(); + p = etext_; for (;;) { Prog::Inst* ip = prog_->inst(id); switch (ip->opcode()) { @@ -559,30 +563,28 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, break; } - if (p > text.end()) + if (p > etext_) break; // Start a new thread if there have not been any matches. // (No point in starting a new thread if there have been // matches, since it would be to the right of the match // we already found.) - if (!matched_ && (!anchored || p == text.begin())) { - // If there's a required first byte for an unanchored search - // and we're not in the middle of any possible matches, - // use memchr to search for the byte quickly. - int fb = prog_->first_byte(); + if (!matched_ && (!anchored || p == text.data())) { + // Try to use prefix accel (e.g. memchr) to skip ahead. + // The search must be unanchored and there must be zero + // possible matches already. if (!anchored && runq->size() == 0 && - fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) { - p = reinterpret_cast(memchr(p, fb, text.end() - p)); - if (p == NULL) { - p = text.end(); - } + p < etext_ && prog_->can_prefix_accel()) { + p = reinterpret_cast(prog_->PrefixAccel(p, etext_ - p)); + if (p == NULL) + p = etext_; } Thread* t = AllocThread(); CopyCapture(t->capture, match_); t->capture[0] = p; - AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, context, p, + AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p, t); Decref(t); } @@ -593,10 +595,24 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, fprintf(stderr, "dead\n"); break; } + + // Avoid invoking undefined behavior (arithmetic on a null pointer) + // by simply not continuing the loop. + // This complements the special case in NFA::Step(). + if (p == NULL) { + (void) Step(runq, nextq, -1, context, p); + DCHECK_EQ(runq->size(), 0); + using std::swap; + swap(nextq, runq); + nextq->clear(); + break; + } } - for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) - Decref(i->value()); + for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } if (matched_) { for (int i = 0; i < nsubmatch; i++) @@ -605,73 +621,13 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, static_cast(match_[2 * i + 1] - match_[2 * i])); if (ExtraDebug) fprintf(stderr, "match (%td,%td)\n", - match_[0] - btext_, match_[1] - btext_); + match_[0] - btext_, + match_[1] - btext_); return true; } return false; } -// Computes whether all successful matches have a common first byte, -// and if so, returns that byte. If not, returns -1. -int Prog::ComputeFirstByte() { - int b = -1; - SparseSet q(size()); - q.insert(start()); - for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) { - int id = *it; - Prog::Inst* ip = inst(id); - switch (ip->opcode()) { - default: - LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte"; - break; - - case kInstMatch: - // The empty string matches: no first byte. - return -1; - - case kInstByteRange: - if (!ip->last()) - q.insert(id+1); - - // Must match only a single byte - if (ip->lo() != ip->hi()) - return -1; - if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z') - return -1; - // If we haven't seen any bytes yet, record it; - // otherwise must match the one we saw before. - if (b == -1) - b = ip->lo(); - else if (b != ip->lo()) - return -1; - break; - - case kInstNop: - case kInstCapture: - case kInstEmptyWidth: - if (!ip->last()) - q.insert(id+1); - - // Continue on. - // Ignore ip->empty() flags for kInstEmptyWidth - // in order to be as conservative as possible - // (assume all possible empty-width flags are true). - if (ip->out()) - q.insert(ip->out()); - break; - - case kInstAltMatch: - DCHECK(!ip->last()); - q.insert(id+1); - break; - - case kInstFail: - break; - } - } - return b; -} - bool Prog::SearchNFA(const StringPiece& text, const StringPiece& context, Anchor anchor, MatchKind kind, diff --git a/extern/re2/re2/onepass.cc b/extern/re2/re2/onepass.cc index d615893274..66a62d94b0 100644 --- a/extern/re2/re2/onepass.cc +++ b/extern/re2/re2/onepass.cc @@ -59,11 +59,11 @@ #include "util/util.h" #include "util/logging.h" -#include "util/pod_array.h" -#include "util/sparse_set.h" #include "util/strutil.h" #include "util/utf.h" +#include "re2/pod_array.h" #include "re2/prog.h" +#include "re2/sparse_set.h" #include "re2/stringpiece.h" // Silence "zero-sized array in struct/union" warning for OneState::action. @@ -235,7 +235,7 @@ bool Prog::SearchOnePass(const StringPiece& text, matchcap[i] = NULL; StringPiece context = const_context; - if (context.begin() == NULL) + if (context.data() == NULL) context = text; if (anchor_start() && context.begin() != text.begin()) return false; @@ -249,8 +249,8 @@ bool Prog::SearchOnePass(const StringPiece& text, // start() is always mapped to the zeroth OneState. OneState* state = IndexToNode(nodes, statesize, 0); uint8_t* bytemap = bytemap_; - const char* bp = text.begin(); - const char* ep = text.end(); + const char* bp = text.data(); + const char* ep = text.data() + text.size(); const char* p; bool matched = false; matchcap[0] = bp; @@ -550,7 +550,7 @@ bool Prog::IsOnePass() { if (!AddQ(&workq, ip->out())) { if (ExtraDebug) LOG(ERROR) << StringPrintf( - "Not OnePass: multiple paths %d -> %d\n", *it, ip->out()); + "Not OnePass: multiple paths %d -> %d", *it, ip->out()); goto fail; } id = ip->out(); @@ -561,7 +561,7 @@ bool Prog::IsOnePass() { // (3) is violated if (ExtraDebug) LOG(ERROR) << StringPrintf( - "Not OnePass: multiple matches from %d\n", *it); + "Not OnePass: multiple matches from %d", *it); goto fail; } matched = true; diff --git a/extern/re2/re2/parse.cc b/extern/re2/re2/parse.cc index 93b922a146..3bba6137f4 100644 --- a/extern/re2/re2/parse.cc +++ b/extern/re2/re2/parse.cc @@ -27,9 +27,9 @@ #include "util/util.h" #include "util/logging.h" -#include "util/pod_array.h" #include "util/strutil.h" #include "util/utf.h" +#include "re2/pod_array.h" #include "re2/regexp.h" #include "re2/stringpiece.h" #include "re2/unicode_casefold.h" @@ -93,7 +93,7 @@ class Regexp::ParseState { bool PushSimpleOp(RegexpOp op); // Pushes a ^ onto the stack. - bool PushCarat(); + bool PushCaret(); // Pushes a \b (word == true) or \B (word == false) onto the stack. bool PushWordBoundary(bool word); @@ -423,7 +423,7 @@ bool Regexp::ParseState::PushLiteral(Rune r) { } // Pushes a ^ onto the stack. -bool Regexp::ParseState::PushCarat() { +bool Regexp::ParseState::PushCaret() { if (flags_ & OneLine) { return PushSimpleOp(kRegexpBeginText); } @@ -556,9 +556,10 @@ int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, } int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) { - // This should never be called, since we use Walk and not - // WalkExponential. + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "RepetitionWalker::ShortVisit called"; +#endif return 0; } @@ -684,7 +685,7 @@ bool Regexp::ParseState::DoRightParen() { if ((r1 = stacktop_) == NULL || (r2 = r1->down_) == NULL || r2->op() != kLeftParen) { - status_->set_code(kRegexpMissingParen); + status_->set_code(kRegexpUnexpectedParen); status_->set_error_arg(whole_regexp_); return false; } @@ -1323,14 +1324,14 @@ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { // Parses a decimal integer, storing it in *np. // Sets *s to span the remainder of the string. static bool ParseInteger(StringPiece* s, int* np) { - if (s->size() == 0 || !isdigit((*s)[0] & 0xFF)) + if (s->empty() || !isdigit((*s)[0] & 0xFF)) return false; // Disallow leading zeros. if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF)) return false; int n = 0; int c; - while (s->size() > 0 && isdigit(c = (*s)[0] & 0xFF)) { + while (!s->empty() && isdigit(c = (*s)[0] & 0xFF)) { // Avoid overflow. if (n >= 100000000) return false; @@ -1352,16 +1353,16 @@ static bool ParseInteger(StringPiece* s, int* np) { // s must NOT be edited unless MaybeParseRepetition returns true. static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { StringPiece s = *sp; - if (s.size() == 0 || s[0] != '{') + if (s.empty() || s[0] != '{') return false; s.remove_prefix(1); // '{' if (!ParseInteger(&s, lo)) return false; - if (s.size() == 0) + if (s.empty()) return false; if (s[0] == ',') { s.remove_prefix(1); // ',' - if (s.size() == 0) + if (s.empty()) return false; if (s[0] == '}') { // {2,} means at least 2 @@ -1375,7 +1376,7 @@ static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { // {2} means exactly two *hi = *lo; } - if (s.size() == 0 || s[0] != '}') + if (s.empty() || s[0] != '}') return false; s.remove_prefix(1); // '}' *sp = s; @@ -1416,7 +1417,7 @@ static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) { StringPiece t = s; Rune r; - while (t.size() > 0) { + while (!t.empty()) { if (StringPieceToRune(&r, &t, status) < 0) return false; } @@ -1447,14 +1448,14 @@ static int UnHex(int c) { // Sets *rp to the named character. static bool ParseEscape(StringPiece* s, Rune* rp, RegexpStatus* status, int rune_max) { - const char* begin = s->begin(); - if (s->size() < 1 || (*s)[0] != '\\') { + const char* begin = s->data(); + if (s->empty() || (*s)[0] != '\\') { // Should not happen - caller always checks. status->set_code(kRegexpInternalError); status->set_error_arg(StringPiece()); return false; } - if (s->size() < 2) { + if (s->size() == 1) { status->set_code(kRegexpTrailingBackslash); status->set_error_arg(StringPiece()); return false; @@ -1485,16 +1486,16 @@ static bool ParseEscape(StringPiece* s, Rune* rp, case '6': case '7': // Single non-zero octal digit is a backreference; not supported. - if (s->size() == 0 || (*s)[0] < '0' || (*s)[0] > '7') + if (s->empty() || (*s)[0] < '0' || (*s)[0] > '7') goto BadEscape; FALLTHROUGH_INTENDED; case '0': // consume up to three octal digits; already have one. code = c - '0'; - if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') { + if (!s->empty() && '0' <= (c = (*s)[0]) && c <= '7') { code = code * 8 + c - '0'; s->remove_prefix(1); // digit - if (s->size() > 0) { + if (!s->empty()) { c = (*s)[0]; if ('0' <= c && c <= '7') { code = code * 8 + c - '0'; @@ -1509,7 +1510,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, // Hexadecimal escapes case 'x': - if (s->size() == 0) + if (s->empty()) goto BadEscape; if (StringPieceToRune(&c, s, status) < 0) return false; @@ -1529,7 +1530,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, code = code * 16 + UnHex(c); if (code > rune_max) goto BadEscape; - if (s->size() == 0) + if (s->empty()) goto BadEscape; if (StringPieceToRune(&c, s, status) < 0) return false; @@ -1540,7 +1541,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, return true; } // Easy case: two hex digits. - if (s->size() == 0) + if (s->empty()) goto BadEscape; if (StringPieceToRune(&c1, s, status) < 0) return false; @@ -1590,7 +1591,7 @@ BadEscape: // Unrecognized escape sequence. status->set_code(kRegexpBadEscape); status->set_error_arg( - StringPiece(begin, static_cast(s->begin() - begin))); + StringPiece(begin, static_cast(s->data() - begin))); return false; } @@ -1710,7 +1711,7 @@ const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_fl return NULL; // Could use StringPieceToRune, but there aren't // any non-ASCII Perl group names. - StringPiece name(s->begin(), 2); + StringPiece name(s->data(), 2); const UGroup *g = LookupPerlGroup(name); if (g == NULL) return NULL; @@ -1750,8 +1751,8 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, return kParseError; if (c != '{') { // Name is the bit of string we just skipped over for c. - const char* p = seq.begin() + 2; - name = StringPiece(p, static_cast(s->begin() - p)); + const char* p = seq.data() + 2; + name = StringPiece(p, static_cast(s->data() - p)); } else { // Name is in braces. Look for closing } size_t end = s->find('}', 0); @@ -1762,16 +1763,16 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, status->set_error_arg(seq); return kParseError; } - name = StringPiece(s->begin(), end); // without '}' + name = StringPiece(s->data(), end); // without '}' s->remove_prefix(end + 1); // with '}' if (!IsValidUTF8(name, status)) return kParseError; } // Chop seq where s now begins. - seq = StringPiece(seq.begin(), static_cast(s->begin() - seq.begin())); + seq = StringPiece(seq.data(), static_cast(s->data() - seq.data())); - if (name.size() > 0 && name[0] == '^') { + if (!name.empty() && name[0] == '^') { sign = -sign; name.remove_prefix(1); // '^' } @@ -1801,14 +1802,13 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, // Convert the UnicodeSet to a URange32 and UGroup that we can add. int nr = uset.getRangeCount(); - URange32* r = new URange32[nr]; + PODArray r(nr); for (int i = 0; i < nr; i++) { r[i].lo = uset.getRangeStart(i); r[i].hi = uset.getRangeEnd(i); } - UGroup g = {"", +1, 0, 0, r, nr}; + UGroup g = {"", +1, 0, 0, r.data(), nr}; AddUGroup(cc, &g, sign, parse_flags); - delete[] r; #endif return kParseOk; @@ -1858,7 +1858,7 @@ static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, const StringPiece& whole_class, RegexpStatus* status) { - if (s->size() == 0) { + if (s->empty()) { status->set_code(kRegexpMissingBracket); status->set_error_arg(whole_class); return false; @@ -1866,7 +1866,7 @@ bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, // Allow regular escape sequences even though // many need not be escaped in this context. - if (s->size() >= 1 && (*s)[0] == '\\') + if ((*s)[0] == '\\') return ParseEscape(s, rp, status, rune_max_); // Otherwise take the next rune. @@ -1908,7 +1908,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, Regexp** out_re, RegexpStatus* status) { StringPiece whole_class = *s; - if (s->size() == 0 || (*s)[0] != '[') { + if (s->empty() || (*s)[0] != '[') { // Caller checked this. status->set_code(kRegexpInternalError); status->set_error_arg(StringPiece()); @@ -1918,7 +1918,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); re->ccb_ = new CharClassBuilder; s->remove_prefix(1); // '[' - if (s->size() > 0 && (*s)[0] == '^') { + if (!s->empty() && (*s)[0] == '^') { s->remove_prefix(1); // '^' negated = true; if (!(flags_ & ClassNL) || (flags_ & NeverNL)) { @@ -1928,7 +1928,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, } } bool first = true; // ] is okay as first char in class - while (s->size() > 0 && ((*s)[0] != ']' || first)) { + while (!s->empty() && ((*s)[0] != ']' || first)) { // - is only okay unescaped as first or last in class. // Except that Perl allows - anywhere. if ((*s)[0] == '-' && !first && !(flags_&PerlX) && @@ -1996,7 +1996,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, // in the flags. re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL); } - if (s->size() == 0) { + if (s->empty()) { status->set_code(kRegexpMissingBracket); status->set_error_arg(whole_class); re->Decref(); @@ -2016,7 +2016,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, // Python rejects names starting with digits. // We don't enforce either of those. static bool IsValidCaptureName(const StringPiece& name) { - if (name.size() == 0) + if (name.empty()) return false; for (size_t i = 0; i < name.size(); i++) { int c = name[i]; @@ -2074,8 +2074,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { } // t is "P...", t[end] == '>' - StringPiece capture(t.begin()-2, end+3); // "(?P" - StringPiece name(t.begin()+2, end-2); // "name" + StringPiece capture(t.data()-2, end+3); // "(?P" + StringPiece name(t.data()+2, end-2); // "name" if (!IsValidUTF8(name, status_)) return false; if (!IsValidCaptureName(name)) { @@ -2089,7 +2089,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { return false; } - s->remove_prefix(static_cast(capture.end() - s->begin())); + s->remove_prefix( + static_cast(capture.data() + capture.size() - s->data())); return true; } @@ -2098,7 +2099,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { int nflags = flags_; Rune c; for (bool done = false; !done; ) { - if (t.size() == 0) + if (t.empty()) goto BadPerlOp; if (StringPieceToRune(&c, &t, status_) < 0) return false; @@ -2173,7 +2174,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { BadPerlOp: status_->set_code(kRegexpBadPerlOp); status_->set_error_arg( - StringPiece(s->begin(), static_cast(t.begin() - s->begin()))); + StringPiece(s->data(), static_cast(t.data() - s->data()))); return false; } @@ -2216,7 +2217,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, if (global_flags & Literal) { // Special parse loop for literal string. - while (t.size() > 0) { + while (!t.empty()) { Rune r; if (StringPieceToRune(&r, &t, status) < 0) return NULL; @@ -2227,7 +2228,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, } StringPiece lastunary = StringPiece(); - while (t.size() > 0) { + while (!t.empty()) { StringPiece isunary = StringPiece(); switch (t[0]) { default: { @@ -2270,7 +2271,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, break; case '^': // Beginning of line. - if (!ps.PushCarat()) + if (!ps.PushCaret()) return NULL; t.remove_prefix(1); // '^' break; @@ -2311,18 +2312,18 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, bool nongreedy = false; t.remove_prefix(1); // '*' or '+' or '?' if (ps.flags() & PerlX) { - if (t.size() > 0 && t[0] == '?') { + if (!t.empty() && t[0] == '?') { nongreedy = true; t.remove_prefix(1); // '?' } - if (lastunary.size() > 0) { + if (!lastunary.empty()) { // In Perl it is not allowed to stack repetition operators: // a** is a syntax error, not a double-star. // (and a++ means something else entirely, which we don't support!) status->set_code(kRegexpRepeatOp); status->set_error_arg(StringPiece( - lastunary.begin(), - static_cast(t.begin() - lastunary.begin()))); + lastunary.data(), + static_cast(t.data() - lastunary.data()))); return NULL; } } @@ -2346,16 +2347,16 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, } bool nongreedy = false; if (ps.flags() & PerlX) { - if (t.size() > 0 && t[0] == '?') { + if (!t.empty() && t[0] == '?') { nongreedy = true; t.remove_prefix(1); // '?' } - if (lastunary.size() > 0) { + if (!lastunary.empty()) { // Not allowed to stack repetition operators. status->set_code(kRegexpRepeatOp); status->set_error_arg(StringPiece( - lastunary.begin(), - static_cast(t.begin() - lastunary.begin()))); + lastunary.data(), + static_cast(t.data() - lastunary.data()))); return NULL; } } @@ -2404,7 +2405,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, if (t[1] == 'Q') { // \Q ... \E: the ... is always literals t.remove_prefix(2); // '\\', 'Q' - while (t.size() > 0) { + while (!t.empty()) { if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') { t.remove_prefix(2); // '\\', 'E' break; diff --git a/extern/re2/re2/perl_groups.cc b/extern/re2/re2/perl_groups.cc index 422b3882d4..4687444581 100644 --- a/extern/re2/re2/perl_groups.cc +++ b/extern/re2/re2/perl_groups.cc @@ -20,12 +20,12 @@ static const URange16 code3[] = { /* \w */ { 0x61, 0x7a }, }; const UGroup perl_groups[] = { - { "\\d", +1, code1, 1 }, - { "\\D", -1, code1, 1 }, - { "\\s", +1, code2, 3 }, - { "\\S", -1, code2, 3 }, - { "\\w", +1, code3, 4 }, - { "\\W", -1, code3, 4 }, + { "\\d", +1, code1, 1, 0, 0 }, + { "\\D", -1, code1, 1, 0, 0 }, + { "\\s", +1, code2, 3, 0, 0 }, + { "\\S", -1, code2, 3, 0, 0 }, + { "\\w", +1, code3, 4, 0, 0 }, + { "\\W", -1, code3, 4, 0, 0 }, }; const int num_perl_groups = 6; static const URange16 code4[] = { /* [:alnum:] */ @@ -85,34 +85,34 @@ static const URange16 code17[] = { /* [:xdigit:] */ { 0x61, 0x66 }, }; const UGroup posix_groups[] = { - { "[:alnum:]", +1, code4, 3 }, - { "[:^alnum:]", -1, code4, 3 }, - { "[:alpha:]", +1, code5, 2 }, - { "[:^alpha:]", -1, code5, 2 }, - { "[:ascii:]", +1, code6, 1 }, - { "[:^ascii:]", -1, code6, 1 }, - { "[:blank:]", +1, code7, 2 }, - { "[:^blank:]", -1, code7, 2 }, - { "[:cntrl:]", +1, code8, 2 }, - { "[:^cntrl:]", -1, code8, 2 }, - { "[:digit:]", +1, code9, 1 }, - { "[:^digit:]", -1, code9, 1 }, - { "[:graph:]", +1, code10, 1 }, - { "[:^graph:]", -1, code10, 1 }, - { "[:lower:]", +1, code11, 1 }, - { "[:^lower:]", -1, code11, 1 }, - { "[:print:]", +1, code12, 1 }, - { "[:^print:]", -1, code12, 1 }, - { "[:punct:]", +1, code13, 4 }, - { "[:^punct:]", -1, code13, 4 }, - { "[:space:]", +1, code14, 2 }, - { "[:^space:]", -1, code14, 2 }, - { "[:upper:]", +1, code15, 1 }, - { "[:^upper:]", -1, code15, 1 }, - { "[:word:]", +1, code16, 4 }, - { "[:^word:]", -1, code16, 4 }, - { "[:xdigit:]", +1, code17, 3 }, - { "[:^xdigit:]", -1, code17, 3 }, + { "[:alnum:]", +1, code4, 3, 0, 0 }, + { "[:^alnum:]", -1, code4, 3, 0, 0 }, + { "[:alpha:]", +1, code5, 2, 0, 0 }, + { "[:^alpha:]", -1, code5, 2, 0, 0 }, + { "[:ascii:]", +1, code6, 1, 0, 0 }, + { "[:^ascii:]", -1, code6, 1, 0, 0 }, + { "[:blank:]", +1, code7, 2, 0, 0 }, + { "[:^blank:]", -1, code7, 2, 0, 0 }, + { "[:cntrl:]", +1, code8, 2, 0, 0 }, + { "[:^cntrl:]", -1, code8, 2, 0, 0 }, + { "[:digit:]", +1, code9, 1, 0, 0 }, + { "[:^digit:]", -1, code9, 1, 0, 0 }, + { "[:graph:]", +1, code10, 1, 0, 0 }, + { "[:^graph:]", -1, code10, 1, 0, 0 }, + { "[:lower:]", +1, code11, 1, 0, 0 }, + { "[:^lower:]", -1, code11, 1, 0, 0 }, + { "[:print:]", +1, code12, 1, 0, 0 }, + { "[:^print:]", -1, code12, 1, 0, 0 }, + { "[:punct:]", +1, code13, 4, 0, 0 }, + { "[:^punct:]", -1, code13, 4, 0, 0 }, + { "[:space:]", +1, code14, 2, 0, 0 }, + { "[:^space:]", -1, code14, 2, 0, 0 }, + { "[:upper:]", +1, code15, 1, 0, 0 }, + { "[:^upper:]", -1, code15, 1, 0, 0 }, + { "[:word:]", +1, code16, 4, 0, 0 }, + { "[:^word:]", -1, code16, 4, 0, 0 }, + { "[:xdigit:]", +1, code17, 3, 0, 0 }, + { "[:^xdigit:]", -1, code17, 3, 0, 0 }, }; const int num_posix_groups = 28; diff --git a/extern/re2/util/pod_array.h b/extern/re2/re2/pod_array.h similarity index 84% rename from extern/re2/util/pod_array.h rename to extern/re2/re2/pod_array.h index eaf492d0ed..f234e976f4 100644 --- a/extern/re2/util/pod_array.h +++ b/extern/re2/re2/pod_array.h @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef UTIL_POD_ARRAY_H_ -#define UTIL_POD_ARRAY_H_ +#ifndef RE2_POD_ARRAY_H_ +#define RE2_POD_ARRAY_H_ #include #include @@ -13,7 +13,7 @@ namespace re2 { template class PODArray { public: - static_assert(std::is_pod::value, + static_assert(std::is_trivial::value && std::is_standard_layout::value, "T must be POD"); PODArray() @@ -52,4 +52,4 @@ class PODArray { } // namespace re2 -#endif // UTIL_POD_ARRAY_H_ +#endif // RE2_POD_ARRAY_H_ diff --git a/extern/re2/re2/prefilter.cc b/extern/re2/re2/prefilter.cc index f61d54b8f8..a47b3120fb 100644 --- a/extern/re2/re2/prefilter.cc +++ b/extern/re2/re2/prefilter.cc @@ -648,14 +648,15 @@ Prefilter* Prefilter::FromRegexp(Regexp* re) { return NULL; Regexp* simple = re->Simplify(); - Prefilter::Info *info = BuildInfo(simple); + if (simple == NULL) + return NULL; + Prefilter::Info* info = BuildInfo(simple); simple->Decref(); if (info == NULL) return NULL; Prefilter* m = info->TakeMatch(); - delete info; return m; } diff --git a/extern/re2/re2/prefilter_tree.cc b/extern/re2/re2/prefilter_tree.cc index 187e2ec552..fdf4e083c9 100644 --- a/extern/re2/re2/prefilter_tree.cc +++ b/extern/re2/re2/prefilter_tree.cc @@ -107,7 +107,7 @@ void PrefilterTree::Compile(std::vector* atom_vec) { Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) { std::string node_string = NodeString(node); - std::map::iterator iter = nodes->find(node_string); + NodeMap::iterator iter = nodes->find(node_string); if (iter == nodes->end()) return NULL; return (*iter).second; @@ -377,7 +377,7 @@ void PrefilterTree::PrintDebugInfo(NodeMap* nodes) { LOG(ERROR) << it->first; } LOG(ERROR) << "Map:"; - for (std::map::const_iterator iter = nodes->begin(); + for (NodeMap::const_iterator iter = nodes->begin(); iter != nodes->end(); ++iter) LOG(ERROR) << "NodeId: " << (*iter).second->unique_id() << " Str: " << (*iter).first; diff --git a/extern/re2/re2/prefilter_tree.h b/extern/re2/re2/prefilter_tree.h index b2e2d749b3..5d73074d97 100644 --- a/extern/re2/re2/prefilter_tree.h +++ b/extern/re2/re2/prefilter_tree.h @@ -21,8 +21,8 @@ #include #include "util/util.h" -#include "util/sparse_array.h" #include "re2/prefilter.h" +#include "re2/sparse_array.h" namespace re2 { diff --git a/extern/re2/re2/prog.cc b/extern/re2/re2/prog.cc index 5155943cb1..ac9c085240 100644 --- a/extern/re2/re2/prog.cc +++ b/extern/re2/re2/prog.cc @@ -7,6 +7,12 @@ #include "re2/prog.h" +#if defined(__AVX2__) +#include +#ifdef _MSC_VER +#include +#endif +#endif #include #include #include @@ -109,8 +115,9 @@ Prog::Prog() start_unanchored_(0), size_(0), bytemap_range_(0), - first_byte_(-1), - flags_(0), + prefix_size_(0), + prefix_front_(-1), + prefix_back_(-1), list_count_(0), dfa_mem_(0), dfa_first_(NULL), @@ -185,14 +192,31 @@ std::string Prog::DumpByteMap() { return map; } -int Prog::first_byte() { - std::call_once(first_byte_once_, [](Prog* prog) { - prog->first_byte_ = prog->ComputeFirstByte(); - }, this); - return first_byte_; -} +// Is ip a guaranteed match at end of text, perhaps after some capturing? +static bool IsMatch(Prog* prog, Prog::Inst* ip) { + for (;;) { + switch (ip->opcode()) { + default: + LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); + return false; -static bool IsMatch(Prog*, Prog::Inst*); + case kInstAlt: + case kInstAltMatch: + case kInstByteRange: + case kInstFail: + case kInstEmptyWidth: + return false; + + case kInstCapture: + case kInstNop: + ip = prog->inst(ip->out()); + break; + + case kInstMatch: + return true; + } + } +} // Peep-hole optimizer. void Prog::Optimize() { @@ -258,54 +282,28 @@ void Prog::Optimize() { } } -// Is ip a guaranteed match at end of text, perhaps after some capturing? -static bool IsMatch(Prog* prog, Prog::Inst* ip) { - for (;;) { - switch (ip->opcode()) { - default: - LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); - return false; - - case kInstAlt: - case kInstAltMatch: - case kInstByteRange: - case kInstFail: - case kInstEmptyWidth: - return false; - - case kInstCapture: - case kInstNop: - ip = prog->inst(ip->out()); - break; - - case kInstMatch: - return true; - } - } -} - uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) { int flags = 0; // ^ and \A - if (p == text.begin()) + if (p == text.data()) flags |= kEmptyBeginText | kEmptyBeginLine; else if (p[-1] == '\n') flags |= kEmptyBeginLine; // $ and \z - if (p == text.end()) + if (p == text.data() + text.size()) flags |= kEmptyEndText | kEmptyEndLine; - else if (p < text.end() && p[0] == '\n') + else if (p < text.data() + text.size() && p[0] == '\n') flags |= kEmptyEndLine; // \b and \B - if (p == text.begin() && p == text.end()) { + if (p == text.data() && p == text.data() + text.size()) { // no word boundary here - } else if (p == text.begin()) { + } else if (p == text.data()) { if (IsWordChar(p[0])) flags |= kEmptyWordBoundary; - } else if (p == text.end()) { + } else if (p == text.data() + text.size()) { if (IsWordChar(p[-1])) flags |= kEmptyWordBoundary; } else { @@ -918,4 +916,73 @@ void Prog::ComputeHints(std::vector* flat, int begin, int end) { } } +#if defined(__AVX2__) +// Finds the least significant non-zero bit in n. +static int FindLSBSet(uint32_t n) { + DCHECK_NE(n, 0); +#if defined(__GNUC__) + return __builtin_ctz(n); +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) + unsigned long c; + _BitScanForward(&c, n); + return static_cast(c); +#else + int c = 31; + for (int shift = 1 << 4; shift != 0; shift >>= 1) { + uint32_t word = n << shift; + if (word != 0) { + n = word; + c -= shift; + } + } + return c; +#endif +} +#endif + +const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) { + DCHECK_GE(prefix_size_, 2); + if (size < prefix_size_) + return NULL; + // Don't bother searching the last prefix_size_-1 bytes for prefix_front_. + // This also means that probing for prefix_back_ doesn't go out of bounds. + size -= prefix_size_-1; + +#if defined(__AVX2__) + // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time. + if (size >= sizeof(__m256i)) { + const __m256i* fp = reinterpret_cast( + reinterpret_cast(data)); + const __m256i* bp = reinterpret_cast( + reinterpret_cast(data) + prefix_size_-1); + const __m256i* endfp = fp + size/sizeof(__m256i); + const __m256i f_set1 = _mm256_set1_epi8(prefix_front_); + const __m256i b_set1 = _mm256_set1_epi8(prefix_back_); + while (fp != endfp) { + const __m256i f_loadu = _mm256_loadu_si256(fp++); + const __m256i b_loadu = _mm256_loadu_si256(bp++); + const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu); + const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu); + const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq); + if (fb_testz == 0) { // ZF: 1 means zero, 0 means non-zero. + const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq); + const int fb_movemask = _mm256_movemask_epi8(fb_and); + const int fb_ctz = FindLSBSet(fb_movemask); + return reinterpret_cast(fp-1) + fb_ctz; + } + } + data = fp; + size = size%sizeof(__m256i); + } +#endif + + const char* p0 = reinterpret_cast(data); + for (const char* p = p0;; p++) { + DCHECK_GE(size, static_cast(p-p0)); + p = reinterpret_cast(memchr(p, prefix_front_, size - (p-p0))); + if (p == NULL || p[prefix_size_-1] == prefix_back_) + return p; + } +} + } // namespace re2 diff --git a/extern/re2/re2/prog.h b/extern/re2/re2/prog.h index bacc411797..e9ce682d99 100644 --- a/extern/re2/re2/prog.h +++ b/extern/re2/re2/prog.h @@ -18,10 +18,10 @@ #include "util/util.h" #include "util/logging.h" -#include "util/pod_array.h" -#include "util/sparse_array.h" -#include "util/sparse_set.h" +#include "re2/pod_array.h" #include "re2/re2.h" +#include "re2/sparse_array.h" +#include "re2/sparse_set.h" namespace re2 { @@ -198,8 +198,8 @@ class Prog { Inst *inst(int id) { return &inst_[id]; } int start() { return start_; } - int start_unanchored() { return start_unanchored_; } void set_start(int start) { start_ = start; } + int start_unanchored() { return start_unanchored_; } void set_start_unanchored(int start) { start_unanchored_ = start; } int size() { return size_; } bool reversed() { return reversed_; } @@ -207,19 +207,27 @@ class Prog { int list_count() { return list_count_; } int inst_count(InstOp op) { return inst_count_[op]; } uint16_t* list_heads() { return list_heads_.data(); } - void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } int64_t dfa_mem() { return dfa_mem_; } - int flags() { return flags_; } - void set_flags(int flags) { flags_ = flags; } + void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } bool anchor_start() { return anchor_start_; } void set_anchor_start(bool b) { anchor_start_ = b; } bool anchor_end() { return anchor_end_; } void set_anchor_end(bool b) { anchor_end_ = b; } int bytemap_range() { return bytemap_range_; } const uint8_t* bytemap() { return bytemap_; } + bool can_prefix_accel() { return prefix_size_ != 0; } - // Lazily computed. - int first_byte(); + // Accelerates to the first likely occurrence of the prefix. + // Returns a pointer to the first byte or NULL if not found. + const void* PrefixAccel(const void* data, size_t size) { + DCHECK_GE(prefix_size_, 1); + return prefix_size_ == 1 ? memchr(data, prefix_front_, size) + : PrefixAccel_FrontAndBack(data, size); + } + + // An implementation of prefix accel that looks for prefix_front_ and + // prefix_back_ to return fewer false positives than memchr(3) alone. + const void* PrefixAccel_FrontAndBack(const void* data, size_t size); // Returns string representation of program for debugging. std::string Dump(); @@ -297,10 +305,6 @@ class Prog { // Compute bytemap. void ComputeByteMap(); - // Computes whether all matches must begin with the same first - // byte, and if so, returns that byte. If not, returns -1. - int ComputeFirstByte(); - // Run peep-hole optimizer on program. void Optimize(); @@ -402,8 +406,9 @@ class Prog { int start_unanchored_; // unanchored entry point for program int size_; // number of instructions int bytemap_range_; // bytemap_[x] < bytemap_range_ - int first_byte_; // required first byte for match, or -1 if none - int flags_; // regexp parse flags + size_t prefix_size_; // size of prefix (0 if no prefix) + int prefix_front_; // first byte of prefix (-1 if no prefix) + int prefix_back_; // last byte of prefix (-1 if no prefix) int list_count_; // count of lists (see above) int inst_count_[kNumInst]; // count of instructions by opcode @@ -419,7 +424,6 @@ class Prog { uint8_t bytemap_[256]; // map from input bytes to byte classes - std::once_flag first_byte_once_; std::once_flag dfa_first_once_; std::once_flag dfa_longest_once_; diff --git a/extern/re2/re2/re2.cc b/extern/re2/re2/re2.cc index a4b499258e..85ba1f4ecd 100644 --- a/extern/re2/re2/re2.cc +++ b/extern/re2/re2/re2.cc @@ -12,10 +12,14 @@ #include #include #include +#ifdef _MSC_VER +#include +#endif #include #include #include #include +#include #include #include #include @@ -24,11 +28,11 @@ #include "util/util.h" #include "util/logging.h" -#include "util/sparse_array.h" #include "util/strutil.h" #include "util/utf.h" #include "re2/prog.h" #include "re2/regexp.h" +#include "re2/sparse_array.h" namespace re2 { @@ -79,6 +83,8 @@ static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { return RE2::ErrorMissingBracket; case re2::kRegexpMissingParen: return RE2::ErrorMissingParen; + case re2::kRegexpUnexpectedParen: + return RE2::ErrorUnexpectedParen; case re2::kRegexpTrailingBackslash: return RE2::ErrorTrailingBackslash; case re2::kRegexpRepeatArgument: @@ -172,15 +178,20 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { empty_group_names = new std::map; }); - pattern_ = std::string(pattern); + pattern_.assign(pattern.data(), pattern.size()); options_.Copy(options); entire_regexp_ = NULL; + error_ = empty_string; + error_code_ = NoError; + error_arg_.clear(); + prefix_.clear(); + prefix_foldcase_ = false; suffix_regexp_ = NULL; prog_ = NULL; num_captures_ = -1; + is_one_pass_ = false; + rprog_ = NULL; - error_ = empty_string; - error_code_ = NoError; named_groups_ = NULL; group_names_ = NULL; @@ -239,9 +250,11 @@ re2::Prog* RE2::ReverseProg() const { if (re->rprog_ == NULL) { if (re->options_.log_errors()) LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'"; - re->error_ = - new std::string("pattern too large - reverse compile failed"); - re->error_code_ = RE2::ErrorPatternTooLarge; + // We no longer touch error_ and error_code_ because failing to compile + // the reverse Prog is not a showstopper: falling back to NFA execution + // is fine. More importantly, an RE2 object is supposed to be logically + // immutable: whatever ok() would have returned after Init() completed, + // it should continue to return that no matter what ReverseProg() does. } }, this); return rprog_; @@ -277,28 +290,54 @@ int RE2::ReverseProgramSize() const { return prog->size(); } -static int Fanout(Prog* prog, std::map* histogram) { - SparseArray fanout(prog->size()); - prog->Fanout(&fanout); - histogram->clear(); - for (SparseArray::iterator i = fanout.begin(); i != fanout.end(); ++i) { - // TODO(junyer): Optimise this? - int bucket = 0; - while (1 << bucket < i->value()) { - bucket++; +// Finds the most significant non-zero bit in n. +static int FindMSBSet(uint32_t n) { + DCHECK_NE(n, 0); +#if defined(__GNUC__) + return 31 ^ __builtin_clz(n); +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) + unsigned long c; + _BitScanReverse(&c, n); + return static_cast(c); +#else + int c = 0; + for (int shift = 1 << 4; shift != 0; shift >>= 1) { + uint32_t word = n >> shift; + if (word != 0) { + n = word; + c += shift; } - (*histogram)[bucket]++; } - return histogram->rbegin()->first; + return c; +#endif } -int RE2::ProgramFanout(std::map* histogram) const { +static int Fanout(Prog* prog, std::vector* histogram) { + SparseArray fanout(prog->size()); + prog->Fanout(&fanout); + int data[32] = {}; + int size = 0; + for (SparseArray::iterator i = fanout.begin(); i != fanout.end(); ++i) { + if (i->value() == 0) + continue; + uint32_t value = i->value(); + int bucket = FindMSBSet(value); + bucket += value & (value-1) ? 1 : 0; + ++data[bucket]; + size = std::max(size, bucket+1); + } + if (histogram != NULL) + histogram->assign(data, data+size); + return size-1; +} + +int RE2::ProgramFanout(std::vector* histogram) const { if (prog_ == NULL) return -1; return Fanout(prog_, histogram); } -int RE2::ReverseProgramFanout(std::map* histogram) const { +int RE2::ReverseProgramFanout(std::vector* histogram) const { if (prog_ == NULL) return -1; Prog* prog = ReverseProg(); @@ -368,6 +407,8 @@ bool RE2::Replace(std::string* str, const StringPiece& rewrite) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; if (nvec > static_cast(arraysize(vec))) return false; if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) @@ -377,8 +418,8 @@ bool RE2::Replace(std::string* str, if (!re.Rewrite(&s, rewrite, vec, nvec)) return false; - assert(vec[0].begin() >= str->data()); - assert(vec[0].end() <= str->data()+str->size()); + assert(vec[0].data() >= str->data()); + assert(vec[0].data() + vec[0].size() <= str->data() + str->size()); str->replace(vec[0].data() - str->data(), vec[0].size(), s); return true; } @@ -388,6 +429,8 @@ int RE2::GlobalReplace(std::string* str, const StringPiece& rewrite) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; if (nvec > static_cast(arraysize(vec))) return false; @@ -406,9 +449,9 @@ int RE2::GlobalReplace(std::string* str, if (!re.Match(*str, static_cast(p - str->data()), str->size(), UNANCHORED, vec, nvec)) break; - if (p < vec[0].begin()) - out.append(p, vec[0].begin() - p); - if (vec[0].begin() == lastend && vec[0].size() == 0) { + if (p < vec[0].data()) + out.append(p, vec[0].data() - p); + if (vec[0].data() == lastend && vec[0].empty()) { // Disallow empty match at end of last match: skip ahead. // // fullrune() takes int, not ptrdiff_t. However, it just looks @@ -439,7 +482,7 @@ int RE2::GlobalReplace(std::string* str, continue; } re.Rewrite(&out, rewrite, vec, nvec); - p = vec[0].end(); + p = vec[0].data() + vec[0].size(); lastend = p; count++; } @@ -460,9 +503,10 @@ bool RE2::Extract(const StringPiece& text, std::string* out) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; if (nvec > static_cast(arraysize(vec))) return false; - if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) return false; @@ -610,6 +654,8 @@ bool RE2::Match(const StringPiece& text, // If the regexp is anchored explicitly, must not be in middle of text. if (prog_->anchor_start() && startpos != 0) return false; + if (prog_->anchor_end() && endpos != text.size()) + return false; // If the regexp is anchored explicitly, update re_anchor // so that we can potentially fall into a faster case below. @@ -643,7 +689,6 @@ bool RE2::Match(const StringPiece& text, Prog::MatchKind kind = Prog::kFirstMatch; if (options_.longest_match()) kind = Prog::kLongestMatch; - bool skipped_test = false; bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture); @@ -655,38 +700,82 @@ bool RE2::Match(const StringPiece& text, bool can_bit_state = prog_->CanBitState(); size_t bit_state_text_max = kMaxBitStateBitmapSize / prog_->list_count(); +#ifdef RE2_HAVE_THREAD_LOCAL + hooks::context = this; +#endif bool dfa_failed = false; + bool skipped_test = false; switch (re_anchor) { default: + LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor; + return false; + case UNANCHORED: { + if (prog_->anchor_end()) { + // This is a very special case: we don't need the forward DFA because + // we already know where the match must end! Instead, the reverse DFA + // can say whether there is a match and (optionally) where it starts. + Prog* prog = ReverseProg(); + if (prog == NULL) { + // Fall back to NFA below. + skipped_test = true; + break; + } + if (!prog->SearchDFA(subtext, text, Prog::kAnchored, + Prog::kLongestMatch, matchp, &dfa_failed, NULL)) { + if (dfa_failed) { + if (options_.log_errors()) + LOG(ERROR) << "DFA out of memory: " + << "pattern length " << pattern_.size() << ", " + << "program size " << prog->size() << ", " + << "list count " << prog->list_count() << ", " + << "bytemap range " << prog->bytemap_range(); + // Fall back to NFA below. + skipped_test = true; + break; + } + return false; + } + if (matchp == NULL) // Matched. Don't care where. + return true; + break; + } + if (!prog_->SearchDFA(subtext, text, anchor, kind, matchp, &dfa_failed, NULL)) { if (dfa_failed) { if (options_.log_errors()) - LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", " - << "bytemap range " << prog_->bytemap_range() << ", " - << "list count " << prog_->list_count(); + LOG(ERROR) << "DFA out of memory: " + << "pattern length " << pattern_.size() << ", " + << "program size " << prog_->size() << ", " + << "list count " << prog_->list_count() << ", " + << "bytemap range " << prog_->bytemap_range(); // Fall back to NFA below. skipped_test = true; break; } return false; } - if (matchp == NULL) // Matched. Don't care where + if (matchp == NULL) // Matched. Don't care where. return true; - // SearchDFA set match[0].end() but didn't know where the - // match started. Run the regexp backward from match[0].end() + // SearchDFA set match.end() but didn't know where the + // match started. Run the regexp backward from match.end() // to find the longest possible match -- that's where it started. Prog* prog = ReverseProg(); - if (prog == NULL) - return false; + if (prog == NULL) { + // Fall back to NFA below. + skipped_test = true; + break; + } if (!prog->SearchDFA(match, text, Prog::kAnchored, Prog::kLongestMatch, &match, &dfa_failed, NULL)) { if (dfa_failed) { if (options_.log_errors()) - LOG(ERROR) << "DFA out of memory: size " << prog->size() << ", " - << "bytemap range " << prog->bytemap_range() << ", " - << "list count " << prog->list_count(); + LOG(ERROR) << "DFA out of memory: " + << "pattern length " << pattern_.size() << ", " + << "program size " << prog->size() << ", " + << "list count " << prog->list_count() << ", " + << "bytemap range " << prog->bytemap_range(); // Fall back to NFA below. skipped_test = true; break; @@ -724,9 +813,11 @@ bool RE2::Match(const StringPiece& text, &match, &dfa_failed, NULL)) { if (dfa_failed) { if (options_.log_errors()) - LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", " - << "bytemap range " << prog_->bytemap_range() << ", " - << "list count " << prog_->list_count(); + LOG(ERROR) << "DFA out of memory: " + << "pattern length " << pattern_.size() << ", " + << "program size " << prog_->size() << ", " + << "list count " << prog_->list_count() << ", " + << "bytemap range " << prog_->bytemap_range(); // Fall back to NFA below. skipped_test = true; break; @@ -928,13 +1019,13 @@ bool RE2::Rewrite(std::string* out, int n = (c - '0'); if (n >= veclen) { if (options_.log_errors()) { - LOG(ERROR) << "requested group " << n - << " in regexp " << rewrite.data(); + LOG(ERROR) << "invalid substitution \\" << n + << " from " << veclen << " groups"; } return false; } StringPiece snip = vec[n]; - if (snip.size() > 0) + if (!snip.empty()) out->append(snip.data(), snip.size()); } else if (c == '\\') { out->push_back('\\'); @@ -949,41 +1040,49 @@ bool RE2::Rewrite(std::string* out, /***** Parsers for various types *****/ -bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) { +namespace re2_internal { + +template <> +bool Parse(const char* str, size_t n, void* dest) { // We fail if somebody asked us to store into a non-NULL void* pointer return (dest == NULL); } -bool RE2::Arg::parse_string(const char* str, size_t n, void* dest) { +template <> +bool Parse(const char* str, size_t n, std::string* dest) { if (dest == NULL) return true; - reinterpret_cast(dest)->assign(str, n); + dest->assign(str, n); return true; } -bool RE2::Arg::parse_stringpiece(const char* str, size_t n, void* dest) { +template <> +bool Parse(const char* str, size_t n, StringPiece* dest) { if (dest == NULL) return true; - *(reinterpret_cast(dest)) = StringPiece(str, n); + *dest = StringPiece(str, n); return true; } -bool RE2::Arg::parse_char(const char* str, size_t n, void* dest) { +template <> +bool Parse(const char* str, size_t n, char* dest) { if (n != 1) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = str[0]; + *dest = str[0]; return true; } -bool RE2::Arg::parse_schar(const char* str, size_t n, void* dest) { +template <> +bool Parse(const char* str, size_t n, signed char* dest) { if (n != 1) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = str[0]; + *dest = str[0]; return true; } -bool RE2::Arg::parse_uchar(const char* str, size_t n, void* dest) { +template <> +bool Parse(const char* str, size_t n, unsigned char* dest) { if (n != 1) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = str[0]; + *dest = str[0]; return true; } @@ -1047,10 +1146,40 @@ static const char* TerminateNumber(char* buf, size_t nbuf, const char* str, return buf; } -bool RE2::Arg::parse_long_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, float* dest) { + if (n == 0) return false; + static const int kMaxLength = 200; + char buf[kMaxLength+1]; + str = TerminateNumber(buf, sizeof buf, str, &n, true); + char* end; + errno = 0; + float r = strtof(str, &end); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *dest = r; + return true; +} + +template <> +bool Parse(const char* str, size_t n, double* dest) { + if (n == 0) return false; + static const int kMaxLength = 200; + char buf[kMaxLength+1]; + str = TerminateNumber(buf, sizeof buf, str, &n, true); + char* end; + errno = 0; + double r = strtod(str, &end); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *dest = r; + return true; +} + +template <> +bool Parse(const char* str, size_t n, long* dest, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); @@ -1060,14 +1189,12 @@ bool RE2::Arg::parse_long_radix(const char* str, if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *dest = r; return true; } -bool RE2::Arg::parse_ulong_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, unsigned long* dest, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); @@ -1083,62 +1210,52 @@ bool RE2::Arg::parse_ulong_radix(const char* str, if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *dest = r; return true; } -bool RE2::Arg::parse_short_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, short* dest, int radix) { long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((short)r != r) return false; // Out of range + if (!Parse(str, n, &r, radix)) return false; // Could not parse + if ((short)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = (short)r; + *dest = (short)r; return true; } -bool RE2::Arg::parse_ushort_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, unsigned short* dest, int radix) { unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((unsigned short)r != r) return false; // Out of range + if (!Parse(str, n, &r, radix)) return false; // Could not parse + if ((unsigned short)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = (unsigned short)r; + *dest = (unsigned short)r; return true; } -bool RE2::Arg::parse_int_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, int* dest, int radix) { long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((int)r != r) return false; // Out of range + if (!Parse(str, n, &r, radix)) return false; // Could not parse + if ((int)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = (int)r; + *dest = (int)r; return true; } -bool RE2::Arg::parse_uint_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, unsigned int* dest, int radix) { unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((unsigned int)r != r) return false; // Out of range + if (!Parse(str, n, &r, radix)) return false; // Could not parse + if ((unsigned int)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = (unsigned int)r; + *dest = (unsigned int)r; return true; } -bool RE2::Arg::parse_longlong_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, long long* dest, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); @@ -1148,14 +1265,12 @@ bool RE2::Arg::parse_longlong_radix(const char* str, if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *dest = r; return true; } -bool RE2::Arg::parse_ulonglong_radix(const char* str, - size_t n, - void* dest, - int radix) { +template <> +bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, sizeof buf, str, &n, false); @@ -1170,67 +1285,47 @@ bool RE2::Arg::parse_ulonglong_radix(const char* str, if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *dest = r; return true; } -static bool parse_double_float(const char* str, size_t n, bool isfloat, - void* dest) { - if (n == 0) return false; - static const int kMaxLength = 200; - char buf[kMaxLength+1]; - str = TerminateNumber(buf, sizeof buf, str, &n, true); - char* end; - errno = 0; - double r; - if (isfloat) { - r = strtof(str, &end); - } else { - r = strtod(str, &end); - } - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; - if (isfloat) { - *(reinterpret_cast(dest)) = (float)r; - } else { - *(reinterpret_cast(dest)) = r; - } - return true; -} +} // namespace re2_internal -bool RE2::Arg::parse_double(const char* str, size_t n, void* dest) { - return parse_double_float(str, n, false, dest); -} +namespace hooks { -bool RE2::Arg::parse_float(const char* str, size_t n, void* dest) { - return parse_double_float(str, n, true, dest); -} +#ifdef RE2_HAVE_THREAD_LOCAL +thread_local const RE2* context = NULL; +#endif -#define DEFINE_INTEGER_PARSER(name) \ - bool RE2::Arg::parse_##name(const char* str, size_t n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 10); \ - } \ - bool RE2::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 16); \ - } \ - bool RE2::Arg::parse_##name##_octal(const char* str, size_t n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 8); \ - } \ - bool RE2::Arg::parse_##name##_cradix(const char* str, size_t n, \ - void* dest) { \ - return parse_##name##_radix(str, n, dest, 0); \ - } +template +union Hook { + void Store(T* cb) { cb_.store(cb, std::memory_order_release); } + T* Load() const { return cb_.load(std::memory_order_acquire); } -DEFINE_INTEGER_PARSER(short); -DEFINE_INTEGER_PARSER(ushort); -DEFINE_INTEGER_PARSER(int); -DEFINE_INTEGER_PARSER(uint); -DEFINE_INTEGER_PARSER(long); -DEFINE_INTEGER_PARSER(ulong); -DEFINE_INTEGER_PARSER(longlong); -DEFINE_INTEGER_PARSER(ulonglong); +#if !defined(__clang__) && defined(_MSC_VER) + // Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent, + // this is a gross hack to make std::atomic constant-initialized on MSVC. + static_assert(ATOMIC_POINTER_LOCK_FREE == 2, + "std::atomic must be always lock-free"); + T* cb_for_constinit_; +#endif -#undef DEFINE_INTEGER_PARSER + std::atomic cb_; +}; + +template +static void DoNothing(const T&) {} + +#define DEFINE_HOOK(type, name) \ + static Hook name##_hook = {{&DoNothing}}; \ + void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \ + type##Callback* Get##type##Hook() { return name##_hook.Load(); } + +DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset) +DEFINE_HOOK(DFASearchFailure, dfa_search_failure) + +#undef DEFINE_HOOK + +} // namespace hooks } // namespace re2 diff --git a/extern/re2/re2/re2.h b/extern/re2/re2/re2.h index c39589d6d6..09c1fbeb23 100644 --- a/extern/re2/re2/re2.h +++ b/extern/re2/re2/re2.h @@ -30,6 +30,19 @@ // "(?i)hello" -- (?i) turns on case-insensitive matching // "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible // +// The double backslashes are needed when writing C++ string literals. +// However, they should NOT be used when writing C++11 raw string literals: +// +// R"(hello (\w+) world)" -- \w matches a "word" character +// R"(version (\d+))" -- \d matches a digit +// R"(hello\s+world)" -- \s matches any whitespace character +// R"(\b(\w+)\b)" -- \b matches non-empty string at word boundary +// R"((?i)hello)" -- (?i) turns on case-insensitive matching +// R"(/\*(.*?)\*/)" -- .*? matches . minimum no. of times possible +// +// When using UTF-8 encoding, case-insensitive matching will perform +// simple case folding, not full case folding. +// // ----------------------------------------------------------------------- // MATCHING INTERFACE: // @@ -195,6 +208,12 @@ #include #include #include +#include +#include + +#if defined(__APPLE__) +#include +#endif #include "re2/stringpiece.h" @@ -229,6 +248,7 @@ class RE2 { ErrorBadCharRange, // bad character class range ErrorMissingBracket, // missing closing ] ErrorMissingParen, // missing closing ) + ErrorUnexpectedParen, // unexpected closing ) ErrorTrailingBackslash, // trailing \ at end of regexp ErrorRepeatArgument, // repeat argument missing, e.g. "*" ErrorRepeatSize, // bad repetition argument @@ -287,11 +307,11 @@ class RE2 { int ProgramSize() const; int ReverseProgramSize() const; - // EXPERIMENTAL! SUBJECT TO CHANGE! - // Outputs the program fanout as a histogram bucketed by powers of 2. + // If histogram is not null, outputs the program fanout + // as a histogram bucketed by powers of 2. // Returns the number of the largest non-empty bucket. - int ProgramFanout(std::map* histogram) const; - int ReverseProgramFanout(std::map* histogram) const; + int ProgramFanout(std::vector* histogram) const; + int ReverseProgramFanout(std::vector* histogram) const; // Returns the underlying Regexp; not for general use. // Returns entire_regexp_ so that callers don't need @@ -349,12 +369,12 @@ class RE2 { // (void*)NULL (the corresponding matched sub-pattern is not copied) // // Returns true iff all of the following conditions are satisfied: - // a. "text" matches "re" exactly - // b. The number of matched sub-patterns is >= number of supplied pointers + // a. "text" matches "re" fully - from the beginning to the end of "text". + // b. The number of matched sub-patterns is >= number of supplied pointers. // c. The "i"th argument has a suitable type for holding the // string captured as the "i"th sub-pattern. If you pass in // NULL for the "i"th argument, or pass fewer arguments than - // number of sub-patterns, "i"th captured sub-pattern is + // number of sub-patterns, the "i"th captured sub-pattern is // ignored. // // CAVEAT: An optional sub-pattern that does not exist in the @@ -368,8 +388,17 @@ class RE2 { return Apply(FullMatchN, text, re, Arg(std::forward(a))...); } - // Exactly like FullMatch(), except that "re" is allowed to match - // a substring of "text". + // Like FullMatch(), except that "re" is allowed to match a substring + // of "text". + // + // Returns true iff all of the following conditions are satisfied: + // a. "text" matches "re" partially - for some substring of "text". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. template static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) { return Apply(PartialMatchN, text, re, Arg(std::forward(a))...); @@ -378,7 +407,16 @@ class RE2 { // Like FullMatch() and PartialMatch(), except that "re" has to match // a prefix of the text, and "input" is advanced past the matched // text. Note: "input" is modified iff this routine returns true - // and "re" matched a non-empty substring of "text". + // and "re" matched a non-empty substring of "input". + // + // Returns true iff all of the following conditions are satisfied: + // a. "input" matches "re" partially - for some prefix of "input". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. template static bool Consume(StringPiece* input, const RE2& re, A&&... a) { return Apply(ConsumeN, input, re, Arg(std::forward(a))...); @@ -388,6 +426,15 @@ class RE2 { // the text. That is, "re" need not start its match at the beginning // of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds // the next word in "s" and stores it in "word". + // + // Returns true iff all of the following conditions are satisfied: + // a. "input" matches "re" partially - for some substring of "input". + // b. The number of matched sub-patterns is >= number of supplied pointers. + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, the "i"th captured sub-pattern is + // ignored. template static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) { return Apply(FindAndConsumeN, input, re, Arg(std::forward(a))...); @@ -443,7 +490,7 @@ class RE2 { // Escapes all potentially meaningful regexp characters in // 'unquoted'. The returned string, used as a regular expression, - // will exactly match the original string. For example, + // will match exactly the original string. For example, // 1.5-2.0? // may become: // 1\.5\-2\.0\? @@ -626,17 +673,6 @@ class RE2 { Encoding encoding() const { return encoding_; } void set_encoding(Encoding encoding) { encoding_ = encoding; } - // Legacy interface to encoding. - // TODO(rsc): Remove once clients have been converted. - bool utf8() const { return encoding_ == EncodingUTF8; } - void set_utf8(bool b) { - if (b) { - encoding_ = EncodingUTF8; - } else { - encoding_ = EncodingLatin1; - } - } - bool posix_syntax() const { return posix_syntax_; } void set_posix_syntax(bool b) { posix_syntax_ = b; } @@ -699,32 +735,12 @@ class RE2 { const Options& options() const { return options_; } // Argument converters; see below. - static inline Arg CRadix(short* x); - static inline Arg CRadix(unsigned short* x); - static inline Arg CRadix(int* x); - static inline Arg CRadix(unsigned int* x); - static inline Arg CRadix(long* x); - static inline Arg CRadix(unsigned long* x); - static inline Arg CRadix(long long* x); - static inline Arg CRadix(unsigned long long* x); - - static inline Arg Hex(short* x); - static inline Arg Hex(unsigned short* x); - static inline Arg Hex(int* x); - static inline Arg Hex(unsigned int* x); - static inline Arg Hex(long* x); - static inline Arg Hex(unsigned long* x); - static inline Arg Hex(long long* x); - static inline Arg Hex(unsigned long long* x); - - static inline Arg Octal(short* x); - static inline Arg Octal(unsigned short* x); - static inline Arg Octal(int* x); - static inline Arg Octal(unsigned int* x); - static inline Arg Octal(long* x); - static inline Arg Octal(unsigned long* x); - static inline Arg Octal(long long* x); - static inline Arg Octal(unsigned long long* x); + template + static Arg CRadix(T* ptr); + template + static Arg Hex(T* ptr); + template + static Arg Octal(T* ptr); private: void Init(const StringPiece& pattern, const Options& options); @@ -737,29 +753,26 @@ class RE2 { re2::Prog* ReverseProg() const; - std::string pattern_; // string regular expression - Options options_; // option flags - std::string prefix_; // required prefix (before regexp_) - bool prefix_foldcase_; // prefix is ASCII case-insensitive - re2::Regexp* entire_regexp_; // parsed regular expression - re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed - re2::Prog* prog_; // compiled program for regexp - int num_captures_; // Number of capturing groups - bool is_one_pass_; // can use prog_->SearchOnePass? - - mutable re2::Prog* rprog_; // reverse program for regexp - mutable const std::string* error_; // Error indicator - // (or points to empty string) - mutable ErrorCode error_code_; // Error code - mutable std::string error_arg_; // Fragment of regexp showing error + std::string pattern_; // string regular expression + Options options_; // option flags + re2::Regexp* entire_regexp_; // parsed regular expression + const std::string* error_; // error indicator (or points to empty string) + ErrorCode error_code_; // error code + std::string error_arg_; // fragment of regexp showing error + std::string prefix_; // required prefix (before suffix_regexp_) + bool prefix_foldcase_; // prefix_ is ASCII case-insensitive + re2::Regexp* suffix_regexp_; // parsed regular expression, prefix_ removed + re2::Prog* prog_; // compiled program for regexp + int num_captures_; // number of capturing groups + bool is_one_pass_; // can use prog_->SearchOnePass? + // Reverse Prog for DFA execution only + mutable re2::Prog* rprog_; // Map from capture names to indices mutable const std::map* named_groups_; - // Map from capture indices to names mutable const std::map* group_names_; - // Onces for lazy computations. mutable std::once_flag rprog_once_; mutable std::once_flag named_groups_once_; mutable std::once_flag group_names_once_; @@ -770,137 +783,134 @@ class RE2 { /***** Implementation details *****/ -// Hex/Octal/Binary? +namespace re2_internal { -// Special class for parsing into objects that define a ParseFrom() method -template -class _RE2_MatchObject { - public: - static inline bool Parse(const char* str, size_t n, void* dest) { - if (dest == NULL) return true; - T* object = reinterpret_cast(dest); - return object->ParseFrom(str, n); - } -}; +// Types for which the 3-ary Parse() function template has specializations. +template struct Parse3ary : public std::false_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; +template <> struct Parse3ary : public std::true_type {}; + +template +bool Parse(const char* str, size_t n, T* dest); + +// Types for which the 4-ary Parse() function template has specializations. +template struct Parse4ary : public std::false_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; +template <> struct Parse4ary : public std::true_type {}; + +template +bool Parse(const char* str, size_t n, T* dest, int radix); + +} // namespace re2_internal class RE2::Arg { - public: - // Empty constructor so we can declare arrays of RE2::Arg - Arg(); + private: + template + using CanParse3ary = typename std::enable_if< + re2_internal::Parse3ary::value, + int>::type; - // Constructor specially designed for NULL arguments - Arg(void*); - Arg(std::nullptr_t); + template + using CanParse4ary = typename std::enable_if< + re2_internal::Parse4ary::value, + int>::type; + +#if !defined(_MSC_VER) + template + using CanParseFrom = typename std::enable_if< + std::is_member_function_pointer< + decltype(static_cast( + &T::ParseFrom))>::value, + int>::type; +#endif + + public: + Arg() : Arg(nullptr) {} + Arg(std::nullptr_t ptr) : arg_(ptr), parser_(DoNothing) {} + + template = 0> + Arg(T* ptr) : arg_(ptr), parser_(DoParse3ary) {} + + template = 0> + Arg(T* ptr) : arg_(ptr), parser_(DoParse4ary) {} + +#if !defined(_MSC_VER) + template = 0> + Arg(T* ptr) : arg_(ptr), parser_(DoParseFrom) {} +#endif typedef bool (*Parser)(const char* str, size_t n, void* dest); -// Type-specific parsers -#define MAKE_PARSER(type, name) \ - Arg(type* p) : arg_(p), parser_(name) {} \ - Arg(type* p, Parser parser) : arg_(p), parser_(parser) {} + template + Arg(T* ptr, Parser parser) : arg_(ptr), parser_(parser) {} - MAKE_PARSER(char, parse_char) - MAKE_PARSER(signed char, parse_schar) - MAKE_PARSER(unsigned char, parse_uchar) - MAKE_PARSER(float, parse_float) - MAKE_PARSER(double, parse_double) - MAKE_PARSER(std::string, parse_string) - MAKE_PARSER(StringPiece, parse_stringpiece) - - MAKE_PARSER(short, parse_short) - MAKE_PARSER(unsigned short, parse_ushort) - MAKE_PARSER(int, parse_int) - MAKE_PARSER(unsigned int, parse_uint) - MAKE_PARSER(long, parse_long) - MAKE_PARSER(unsigned long, parse_ulong) - MAKE_PARSER(long long, parse_longlong) - MAKE_PARSER(unsigned long long, parse_ulonglong) - -#undef MAKE_PARSER - - // Generic constructor templates - template Arg(T* p) - : arg_(p), parser_(_RE2_MatchObject::Parse) { } - template Arg(T* p, Parser parser) - : arg_(p), parser_(parser) { } - - // Parse the data - bool Parse(const char* str, size_t n) const; - - private: - void* arg_; - Parser parser_; - - static bool parse_null (const char* str, size_t n, void* dest); - static bool parse_char (const char* str, size_t n, void* dest); - static bool parse_schar (const char* str, size_t n, void* dest); - static bool parse_uchar (const char* str, size_t n, void* dest); - static bool parse_float (const char* str, size_t n, void* dest); - static bool parse_double (const char* str, size_t n, void* dest); - static bool parse_string (const char* str, size_t n, void* dest); - static bool parse_stringpiece (const char* str, size_t n, void* dest); - -#define DECLARE_INTEGER_PARSER(name) \ - private: \ - static bool parse_##name(const char* str, size_t n, void* dest); \ - static bool parse_##name##_radix(const char* str, size_t n, void* dest, \ - int radix); \ - \ - public: \ - static bool parse_##name##_hex(const char* str, size_t n, void* dest); \ - static bool parse_##name##_octal(const char* str, size_t n, void* dest); \ - static bool parse_##name##_cradix(const char* str, size_t n, void* dest); - - DECLARE_INTEGER_PARSER(short) - DECLARE_INTEGER_PARSER(ushort) - DECLARE_INTEGER_PARSER(int) - DECLARE_INTEGER_PARSER(uint) - DECLARE_INTEGER_PARSER(long) - DECLARE_INTEGER_PARSER(ulong) - DECLARE_INTEGER_PARSER(longlong) - DECLARE_INTEGER_PARSER(ulonglong) - -#undef DECLARE_INTEGER_PARSER - -}; - -inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { } -inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } -inline RE2::Arg::Arg(std::nullptr_t p) : arg_(p), parser_(parse_null) { } - -inline bool RE2::Arg::Parse(const char* str, size_t n) const { - return (*parser_)(str, n, arg_); -} - -// This part of the parser, appropriate only for ints, deals with bases -#define MAKE_INTEGER_PARSER(type, name) \ - inline RE2::Arg RE2::Hex(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex); \ - } \ - inline RE2::Arg RE2::Octal(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal); \ - } \ - inline RE2::Arg RE2::CRadix(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \ + bool Parse(const char* str, size_t n) const { + return (*parser_)(str, n, arg_); } -MAKE_INTEGER_PARSER(short, short) -MAKE_INTEGER_PARSER(unsigned short, ushort) -MAKE_INTEGER_PARSER(int, int) -MAKE_INTEGER_PARSER(unsigned int, uint) -MAKE_INTEGER_PARSER(long, long) -MAKE_INTEGER_PARSER(unsigned long, ulong) -MAKE_INTEGER_PARSER(long long, longlong) -MAKE_INTEGER_PARSER(unsigned long long, ulonglong) + private: + static bool DoNothing(const char* /*str*/, size_t /*n*/, void* /*dest*/) { + return true; + } -#undef MAKE_INTEGER_PARSER + template + static bool DoParse3ary(const char* str, size_t n, void* dest) { + return re2_internal::Parse(str, n, reinterpret_cast(dest)); + } + + template + static bool DoParse4ary(const char* str, size_t n, void* dest) { + return re2_internal::Parse(str, n, reinterpret_cast(dest), 10); + } + +#if !defined(_MSC_VER) + template + static bool DoParseFrom(const char* str, size_t n, void* dest) { + if (dest == NULL) return true; + return reinterpret_cast(dest)->ParseFrom(str, n); + } +#endif + + void* arg_; + Parser parser_; +}; + +template +inline RE2::Arg RE2::CRadix(T* ptr) { + return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { + return re2_internal::Parse(str, n, reinterpret_cast(dest), 0); + }); +} + +template +inline RE2::Arg RE2::Hex(T* ptr) { + return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { + return re2_internal::Parse(str, n, reinterpret_cast(dest), 16); + }); +} + +template +inline RE2::Arg RE2::Octal(T* ptr) { + return RE2::Arg(ptr, [](const char* str, size_t n, void* dest) -> bool { + return re2_internal::Parse(str, n, reinterpret_cast(dest), 8); + }); +} #ifndef SWIG - // Silence warnings about missing initializers for members of LazyRE2. -// Note that we test for Clang first because it defines __GNUC__ as well. -#if defined(__clang__) -#elif defined(__GNUC__) && __GNUC__ >= 6 +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #endif @@ -949,7 +959,52 @@ class LazyRE2 { void operator=(const LazyRE2&); // disallowed }; -#endif // SWIG +#endif + +namespace hooks { + +// Most platforms support thread_local. Older versions of iOS don't support +// thread_local, but for the sake of brevity, we lump together all versions +// of Apple platforms that aren't macOS. If an iOS application really needs +// the context pointee someday, we can get more specific then... +#define RE2_HAVE_THREAD_LOCAL +#if defined(__APPLE__) && !TARGET_OS_OSX +#undef RE2_HAVE_THREAD_LOCAL +#endif + +// A hook must not make any assumptions regarding the lifetime of the context +// pointee beyond the current invocation of the hook. Pointers and references +// obtained via the context pointee should be considered invalidated when the +// hook returns. Hence, any data about the context pointee (e.g. its pattern) +// would have to be copied in order for it to be kept for an indefinite time. +// +// A hook must not use RE2 for matching. Control flow reentering RE2::Match() +// could result in infinite mutual recursion. To discourage that possibility, +// RE2 will not maintain the context pointer correctly when used in that way. +#ifdef RE2_HAVE_THREAD_LOCAL +extern thread_local const RE2* context; +#endif + +struct DFAStateCacheReset { + int64_t state_budget; + size_t state_cache_size; +}; + +struct DFASearchFailure { + // Nothing yet... +}; + +#define DECLARE_HOOK(type) \ + using type##Callback = void(const type&); \ + void Set##type##Hook(type##Callback* cb); \ + type##Callback* Get##type##Hook(); + +DECLARE_HOOK(DFAStateCacheReset) +DECLARE_HOOK(DFASearchFailure) + +#undef DECLARE_HOOK + +} // namespace hooks } // namespace re2 diff --git a/extern/re2/re2/regexp.cc b/extern/re2/re2/regexp.cc index 7995ffceb3..1a384181b8 100644 --- a/extern/re2/re2/regexp.cc +++ b/extern/re2/re2/regexp.cc @@ -20,6 +20,7 @@ #include "util/logging.h" #include "util/mutex.h" #include "util/utf.h" +#include "re2/pod_array.h" #include "re2/stringpiece.h" #include "re2/walker-inl.h" @@ -243,16 +244,15 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, return new Regexp(kRegexpEmptyMatch, flags); } - Regexp** subcopy = NULL; + PODArray subcopy; if (op == kRegexpAlternate && can_factor) { // Going to edit sub; make a copy so we don't step on caller. - subcopy = new Regexp*[nsub]; - memmove(subcopy, sub, nsub * sizeof sub[0]); - sub = subcopy; + subcopy = PODArray(nsub); + memmove(subcopy.data(), sub, nsub * sizeof sub[0]); + sub = subcopy.data(); nsub = FactorAlternation(sub, nsub, flags); if (nsub == 1) { Regexp* re = sub[0]; - delete[] subcopy; return re; } } @@ -269,7 +269,6 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, nsub - (nbigsub-1)*kMaxNsub, flags, false); - delete[] subcopy; return re; } @@ -278,8 +277,6 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, Regexp** subs = re->sub(); for (int i = 0; i < nsub; i++) subs[i] = sub[i]; - - delete[] subcopy; return re; } @@ -501,6 +498,7 @@ static const char *kErrorStrings[] = { "invalid character class range", "missing ]", "missing )", + "unexpected )", "trailing \\", "no argument for repetition operator", "invalid repetition size", @@ -544,9 +542,12 @@ class NumCapturesWalker : public Regexp::Walker { ncapture_++; return ignored; } + virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { - // Should never be called: we use Walk not WalkExponential. + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "NumCapturesWalker::ShortVisit called"; +#endif return ignored; } @@ -575,7 +576,7 @@ class NamedCapturesWalker : public Regexp::Walker { return m; } - Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { + virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { if (re->op() == kRegexpCapture && re->name() != NULL) { // Allocate map once we find a name. if (map_ == NULL) @@ -591,8 +592,10 @@ class NamedCapturesWalker : public Regexp::Walker { } virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { - // Should never be called: we use Walk not WalkExponential. + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called"; +#endif return ignored; } @@ -621,7 +624,7 @@ class CaptureNamesWalker : public Regexp::Walker { return m; } - Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { + virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { if (re->op() == kRegexpCapture && re->name() != NULL) { // Allocate map once we find a name. if (map_ == NULL) @@ -633,8 +636,10 @@ class CaptureNamesWalker : public Regexp::Walker { } virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { - // Should never be called: we use Walk not WalkExponential. + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called"; +#endif return ignored; } @@ -651,78 +656,89 @@ std::map* Regexp::CaptureNames() { return w.TakeMap(); } +void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes, + std::string* bytes) { + if (latin1) { + bytes->resize(nrunes); + for (int i = 0; i < nrunes; i++) + (*bytes)[i] = static_cast(runes[i]); + } else { + bytes->resize(nrunes * UTFmax); // worst case + char* p = &(*bytes)[0]; + for (int i = 0; i < nrunes; i++) + p += runetochar(p, &runes[i]); + bytes->resize(p - &(*bytes)[0]); + bytes->shrink_to_fit(); + } +} + // Determines whether regexp matches must be anchored // with a fixed string prefix. If so, returns the prefix and // the regexp that remains after the prefix. The prefix might // be ASCII case-insensitive. bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase, Regexp** suffix) { + prefix->clear(); + *foldcase = false; + *suffix = NULL; + // No need for a walker: the regexp must be of the form // 1. some number of ^ anchors // 2. a literal char or string // 3. the rest - prefix->clear(); - *foldcase = false; - *suffix = NULL; if (op_ != kRegexpConcat) return false; - - // Some number of anchors, then a literal or concatenation. int i = 0; - Regexp** sub = this->sub(); - while (i < nsub_ && sub[i]->op_ == kRegexpBeginText) + while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText) i++; if (i == 0 || i >= nsub_) return false; - - Regexp* re = sub[i]; - switch (re->op_) { - default: - return false; - - case kRegexpLiteralString: - // Convert to string in proper encoding. - if (re->parse_flags() & Latin1) { - prefix->resize(re->nrunes_); - for (int j = 0; j < re->nrunes_; j++) - (*prefix)[j] = static_cast(re->runes_[j]); - } else { - // Convert to UTF-8 in place. - // Assume worst-case space and then trim. - prefix->resize(re->nrunes_ * UTFmax); - char *p = &(*prefix)[0]; - for (int j = 0; j < re->nrunes_; j++) { - Rune r = re->runes_[j]; - if (r < Runeself) - *p++ = static_cast(r); - else - p += runetochar(p, &r); - } - prefix->resize(p - &(*prefix)[0]); - } - break; - - case kRegexpLiteral: - if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { - prefix->append(1, static_cast(re->rune_)); - } else { - char buf[UTFmax]; - prefix->append(buf, runetochar(buf, &re->rune_)); - } - break; - } - *foldcase = (sub[i]->parse_flags() & FoldCase) != 0; + Regexp* re = sub()[i]; + if (re->op_ != kRegexpLiteral && + re->op_ != kRegexpLiteralString) + return false; i++; - - // The rest. if (i < nsub_) { for (int j = i; j < nsub_; j++) - sub[j]->Incref(); - re = Concat(sub + i, nsub_ - i, parse_flags()); + sub()[j]->Incref(); + *suffix = Concat(sub() + i, nsub_ - i, parse_flags()); } else { - re = new Regexp(kRegexpEmptyMatch, parse_flags()); + *suffix = new Regexp(kRegexpEmptyMatch, parse_flags()); } - *suffix = re; + + bool latin1 = (re->parse_flags() & Latin1) != 0; + Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; + int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; + ConvertRunesToBytes(latin1, runes, nrunes, prefix); + *foldcase = (re->parse_flags() & FoldCase) != 0; + return true; +} + +// Determines whether regexp matches must be unanchored +// with a fixed string prefix. If so, returns the prefix. +// The prefix might be ASCII case-insensitive. +bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) { + prefix->clear(); + *foldcase = false; + + // No need for a walker: the regexp must either begin with or be + // a literal char or string. We "see through" capturing groups, + // but make no effort to glue multiple prefix fragments together. + Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this; + while (re->op_ == kRegexpCapture) { + re = re->sub()[0]; + if (re->op_ == kRegexpConcat && re->nsub_ > 0) + re = re->sub()[0]; + } + if (re->op_ != kRegexpLiteral && + re->op_ != kRegexpLiteralString) + return false; + + bool latin1 = (re->parse_flags() & Latin1) != 0; + Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; + int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; + ConvertRunesToBytes(latin1, runes, nrunes, prefix); + *foldcase = (re->parse_flags() & FoldCase) != 0; return true; } @@ -903,7 +919,7 @@ void CharClassBuilder::Negate() { // The ranges are allocated in the same block as the header, // necessitating a special allocator and Delete method. -CharClass* CharClass::New(int maxranges) { +CharClass* CharClass::New(size_t maxranges) { CharClass* cc; uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; cc = reinterpret_cast(data); @@ -920,7 +936,7 @@ void CharClass::Delete() { } CharClass* CharClass::Negate() { - CharClass* cc = CharClass::New(nranges_+1); + CharClass* cc = CharClass::New(static_cast(nranges_+1)); cc->folds_ascii_ = folds_ascii_; cc->nrunes_ = Runemax + 1 - nrunes_; int n = 0; @@ -957,7 +973,7 @@ bool CharClass::Contains(Rune r) { } CharClass* CharClassBuilder::GetCharClass() { - CharClass* cc = CharClass::New(static_cast(ranges_.size())); + CharClass* cc = CharClass::New(ranges_.size()); int n = 0; for (iterator it = begin(); it != end(); ++it) cc->ranges_[n++] = *it; diff --git a/extern/re2/re2/regexp.h b/extern/re2/re2/regexp.h index a5d85c8128..61882b5798 100644 --- a/extern/re2/re2/regexp.h +++ b/extern/re2/re2/regexp.h @@ -86,6 +86,7 @@ // form accessible to clients, so that client code can analyze the // parsed regular expressions. +#include #include #include #include @@ -177,6 +178,7 @@ enum RegexpStatusCode { kRegexpBadCharRange, // bad character class range kRegexpMissingBracket, // missing closing ] kRegexpMissingParen, // missing closing ) + kRegexpUnexpectedParen, // unexpected closing ) kRegexpTrailingBackslash, // at end of regexp kRegexpRepeatArgument, // repeat argument missing, e.g. "*" kRegexpRepeatSize, // bad repetition argument @@ -258,7 +260,7 @@ class CharClass { private: CharClass(); // not implemented ~CharClass(); // not implemented - static CharClass* New(int maxranges); + static CharClass* New(size_t maxranges); friend class CharClassBuilder; @@ -440,6 +442,13 @@ class Regexp { bool RequiredPrefix(std::string* prefix, bool* foldcase, Regexp** suffix); + // Whether every match of this regexp must be unanchored and + // begin with a non-empty fixed string (perhaps after ASCII + // case-folding). If so, returns the prefix. + // Callers should expect *prefix and *foldcase to be "zeroed" + // regardless of the return value. + bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase); + private: // Constructor allocates vectors as appropriate for operator. explicit Regexp(RegexpOp op, ParseFlags parse_flags); diff --git a/extern/re2/re2/set.cc b/extern/re2/re2/set.cc index d4c34ad8f1..c3924bfd98 100644 --- a/extern/re2/re2/set.cc +++ b/extern/re2/re2/set.cc @@ -7,30 +7,49 @@ #include #include #include +#include #include "util/util.h" #include "util/logging.h" -#include "util/pod_array.h" -#include "re2/stringpiece.h" +#include "re2/pod_array.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" +#include "re2/stringpiece.h" namespace re2 { -RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) { - options_.Copy(options); +RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) + : options_(options), + anchor_(anchor), + compiled_(false), + size_(0) { options_.set_never_capture(true); // might unblock some optimisations - anchor_ = anchor; - prog_ = NULL; - compiled_ = false; - size_ = 0; } RE2::Set::~Set() { for (size_t i = 0; i < elem_.size(); i++) elem_[i].second->Decref(); - delete prog_; +} + +RE2::Set::Set(Set&& other) + : options_(other.options_), + anchor_(other.anchor_), + elem_(std::move(other.elem_)), + compiled_(other.compiled_), + size_(other.size_), + prog_(std::move(other.prog_)) { + other.elem_.clear(); + other.elem_.shrink_to_fit(); + other.compiled_ = false; + other.size_ = 0; + other.prog_.reset(); +} + +RE2::Set& RE2::Set::operator=(Set&& other) { + this->~Set(); + (void) new (this) Set(std::move(other)); + return *this; } int RE2::Set::Add(const StringPiece& pattern, std::string* error) { @@ -97,9 +116,9 @@ bool RE2::Set::Compile() { options_.ParseFlags()); re2::Regexp* re = re2::Regexp::Alternate(sub.data(), size_, pf); - prog_ = Prog::CompileSet(re, anchor_, options_.max_mem()); + prog_.reset(Prog::CompileSet(re, anchor_, options_.max_mem())); re->Decref(); - return prog_ != NULL; + return prog_ != nullptr; } bool RE2::Set::Match(const StringPiece& text, std::vector* v) const { @@ -124,9 +143,10 @@ bool RE2::Set::Match(const StringPiece& text, std::vector* v, NULL, &dfa_failed, matches.get()); if (dfa_failed) { if (options_.log_errors()) - LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", " - << "bytemap range " << prog_->bytemap_range() << ", " - << "list count " << prog_->list_count(); + LOG(ERROR) << "DFA out of memory: " + << "program size " << prog_->size() << ", " + << "list count " << prog_->list_count() << ", " + << "bytemap range " << prog_->bytemap_range(); if (error_info != NULL) error_info->kind = kOutOfMemory; return false; diff --git a/extern/re2/re2/set.h b/extern/re2/re2/set.h index 59733fd94c..8d64f30ccd 100644 --- a/extern/re2/re2/set.h +++ b/extern/re2/re2/set.h @@ -5,6 +5,7 @@ #ifndef RE2_SET_H_ #define RE2_SET_H_ +#include #include #include #include @@ -36,6 +37,13 @@ class RE2::Set { Set(const RE2::Options& options, RE2::Anchor anchor); ~Set(); + // Not copyable. + Set(const Set&) = delete; + Set& operator=(const Set&) = delete; + // Movable. + Set(Set&& other); + Set& operator=(Set&& other); + // Adds pattern to the set using the options passed to the constructor. // Returns the index that will identify the regexp in the output of Match(), // or -1 if the regexp cannot be parsed. @@ -67,12 +75,9 @@ class RE2::Set { RE2::Options options_; RE2::Anchor anchor_; std::vector elem_; - re2::Prog* prog_; bool compiled_; int size_; - - Set(const Set&) = delete; - Set& operator=(const Set&) = delete; + std::unique_ptr prog_; }; } // namespace re2 diff --git a/extern/re2/re2/simplify.cc b/extern/re2/re2/simplify.cc index 8939678619..663d5fcd45 100644 --- a/extern/re2/re2/simplify.cc +++ b/extern/re2/re2/simplify.cc @@ -10,8 +10,8 @@ #include "util/util.h" #include "util/logging.h" -#include "util/pod_array.h" #include "util/utf.h" +#include "re2/pod_array.h" #include "re2/regexp.h" #include "re2/walker-inl.h" @@ -28,8 +28,6 @@ bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags, Regexp* sre = re->Simplify(); re->Decref(); if (sre == NULL) { - // Should not happen, since Simplify never fails. - LOG(ERROR) << "Simplify failed on " << src; if (status) { status->set_code(kRegexpInternalError); status->set_error_arg(src); @@ -180,10 +178,20 @@ Regexp* Regexp::Simplify() { CoalesceWalker cw; Regexp* cre = cw.Walk(this, NULL); if (cre == NULL) - return cre; + return NULL; + if (cw.stopped_early()) { + cre->Decref(); + return NULL; + } SimplifyWalker sw; Regexp* sre = sw.Walk(cre, NULL); cre->Decref(); + if (sre == NULL) + return NULL; + if (sw.stopped_early()) { + sre->Decref(); + return NULL; + } return sre; } @@ -212,9 +220,10 @@ Regexp* CoalesceWalker::Copy(Regexp* re) { } Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { - // This should never be called, since we use Walk and not - // WalkExponential. + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "CoalesceWalker::ShortVisit called"; +#endif return re->Incref(); } @@ -437,9 +446,10 @@ Regexp* SimplifyWalker::Copy(Regexp* re) { } Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { - // This should never be called, since we use Walk and not - // WalkExponential. + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "SimplifyWalker::ShortVisit called"; +#endif return re->Incref(); } diff --git a/extern/re2/util/sparse_array.h b/extern/re2/re2/sparse_array.h similarity index 99% rename from extern/re2/util/sparse_array.h rename to extern/re2/re2/sparse_array.h index c81c9f355f..09ffe086b7 100644 --- a/extern/re2/util/sparse_array.h +++ b/extern/re2/re2/sparse_array.h @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef UTIL_SPARSE_ARRAY_H_ -#define UTIL_SPARSE_ARRAY_H_ +#ifndef RE2_SPARSE_ARRAY_H_ +#define RE2_SPARSE_ARRAY_H_ // DESCRIPTION // @@ -102,7 +102,7 @@ #include #include -#include "util/pod_array.h" +#include "re2/pod_array.h" namespace re2 { @@ -389,4 +389,4 @@ template bool SparseArray::less(const IndexValue& a, } // namespace re2 -#endif // UTIL_SPARSE_ARRAY_H_ +#endif // RE2_SPARSE_ARRAY_H_ diff --git a/extern/re2/util/sparse_set.h b/extern/re2/re2/sparse_set.h similarity index 98% rename from extern/re2/util/sparse_set.h rename to extern/re2/re2/sparse_set.h index 0d5ad51149..06ed88d81b 100644 --- a/extern/re2/util/sparse_set.h +++ b/extern/re2/re2/sparse_set.h @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef UTIL_SPARSE_SET_H_ -#define UTIL_SPARSE_SET_H_ +#ifndef RE2_SPARSE_SET_H_ +#define RE2_SPARSE_SET_H_ // DESCRIPTION // @@ -61,7 +61,7 @@ #include #include -#include "util/pod_array.h" +#include "re2/pod_array.h" namespace re2 { @@ -261,4 +261,4 @@ typedef SparseSetT SparseSet; } // namespace re2 -#endif // UTIL_SPARSE_SET_H_ +#endif // RE2_SPARSE_SET_H_ diff --git a/extern/re2/re2/testing/backtrack.cc b/extern/re2/re2/testing/backtrack.cc index ae9fd82bc7..216d259802 100644 --- a/extern/re2/re2/testing/backtrack.cc +++ b/extern/re2/re2/testing/backtrack.cc @@ -29,6 +29,7 @@ #include "util/util.h" #include "util/logging.h" +#include "re2/pod_array.h" #include "re2/prog.h" #include "re2/regexp.h" @@ -53,7 +54,6 @@ namespace re2 { class Backtracker { public: explicit Backtracker(Prog* prog); - ~Backtracker(); bool Search(const StringPiece& text, const StringPiece& context, bool anchored, bool longest, @@ -79,9 +79,11 @@ class Backtracker { int nsubmatch_; // # of submatches to fill in // Search state - const char* cap_[64]; // capture registers - uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked - size_t nvisited_; // # of words in bitmap + const char* cap_[64]; // capture registers + PODArray visited_; // bitmap: (Inst*, char*) pairs visited + + Backtracker(const Backtracker&) = delete; + Backtracker& operator=(const Backtracker&) = delete; }; Backtracker::Backtracker(Prog* prog) @@ -90,13 +92,7 @@ Backtracker::Backtracker(Prog* prog) longest_(false), endmatch_(false), submatch_(NULL), - nsubmatch_(0), - visited_(NULL), - nvisited_(0) { -} - -Backtracker::~Backtracker() { - delete[] visited_; + nsubmatch_(0) { } // Runs a backtracking search. @@ -105,7 +101,7 @@ bool Backtracker::Search(const StringPiece& text, const StringPiece& context, StringPiece* submatch, int nsubmatch) { text_ = text; context_ = context; - if (context_.begin() == NULL) + if (context_.data() == NULL) context_ = text; if (prog_->anchor_start() && text.begin() > context_.begin()) return false; @@ -130,24 +126,28 @@ bool Backtracker::Search(const StringPiece& text, const StringPiece& context, // Allocate new visited_ bitmap -- size is proportional // to text, so have to reallocate on each call to Search. - delete[] visited_; - nvisited_ = (prog_->size()*(text.size()+1) + 31)/32; - visited_ = new uint32_t[nvisited_]; - memset(visited_, 0, nvisited_*sizeof visited_[0]); + int nvisited = prog_->size() * static_cast(text.size()+1); + nvisited = (nvisited + 31) / 32; + visited_ = PODArray(nvisited); + memset(visited_.data(), 0, nvisited*sizeof visited_[0]); // Anchored search must start at text.begin(). if (anchored_) { - cap_[0] = text.begin(); - return Visit(prog_->start(), text.begin()); + cap_[0] = text.data(); + return Visit(prog_->start(), text.data()); } // Unanchored search, starting from each possible text position. // Notice that we have to try the empty string at the end of // the text, so the loop condition is p <= text.end(), not p < text.end(). - for (const char* p = text.begin(); p <= text.end(); p++) { + for (const char* p = text.data(); p <= text.data() + text.size(); p++) { cap_[0] = p; if (Visit(prog_->start(), p)) // Match must be leftmost; done. return true; + // Avoid invoking undefined behavior (arithmetic on a null pointer) + // by simply not continuing the loop. + if (p == NULL) + break; } return false; } @@ -158,9 +158,10 @@ bool Backtracker::Visit(int id, const char* p) { // Check bitmap. If we've already explored from here, // either it didn't match or it did but we're hoping for a better match. // Either way, don't go down that road again. - CHECK(p <= text_.end()); - size_t n = id*(text_.size()+1) + (p - text_.begin()); - CHECK_LT(n/32, nvisited_); + CHECK(p <= text_.data() + text_.size()); + int n = id * static_cast(text_.size()+1) + + static_cast(p-text_.data()); + CHECK_LT(n/32, visited_.size()); if (visited_[n/32] & (1 << (n&31))) return false; visited_[n/32] |= 1 << (n&31); @@ -182,7 +183,7 @@ bool Backtracker::Try(int id, const char* p) { // Pick out byte at current position. If at end of string, // have to explore in hope of finishing a match. Use impossible byte -1. int c = -1; - if (p < text_.end()) + if (p < text_.data() + text_.size()) c = *p & 0xFF; Prog::Inst* ip = prog_->inst(id); @@ -224,11 +225,12 @@ bool Backtracker::Try(int id, const char* p) { case kInstMatch: // We found a match. If it's the best so far, record the // parameters in the caller's submatch_ array. - if (endmatch_ && p != context_.end()) + if (endmatch_ && p != context_.data() + context_.size()) return false; cap_[1] = p; - if (submatch_[0].data() == NULL || // First match so far ... - (longest_ && p > submatch_[0].end())) { // ... or better match + if (submatch_[0].data() == NULL || + (longest_ && p > submatch_[0].data() + submatch_[0].size())) { + // First match so far - or better match. for (int i = 0; i < nsubmatch_; i++) submatch_[i] = StringPiece( cap_[2 * i], static_cast(cap_[2 * i + 1] - cap_[2 * i])); diff --git a/extern/re2/re2/testing/charclass_test.cc b/extern/re2/re2/testing/charclass_test.cc index a2837a69a3..9c2a32f6a8 100644 --- a/extern/re2/re2/testing/charclass_test.cc +++ b/extern/re2/re2/testing/charclass_test.cc @@ -85,7 +85,7 @@ static CCTest tests[] = { { {-1} } }, }; -template +template static void Broke(const char *desc, const CCTest* t, CharClass* cc) { if (t == NULL) { printf("\t%s:", desc); @@ -136,7 +136,7 @@ void Delete(CharClassBuilder* cc) { delete cc; } -template +template bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) { typename CharClass::iterator it = cc->begin(); int size = 0; diff --git a/extern/re2/re2/testing/compile_test.cc b/extern/re2/re2/testing/compile_test.cc index 6b77cf97b9..2096e2f079 100644 --- a/extern/re2/re2/testing/compile_test.cc +++ b/extern/re2/re2/testing/compile_test.cc @@ -147,10 +147,19 @@ static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags, Regexp* re = Regexp::Parse(pattern, flags, NULL); EXPECT_TRUE(re != NULL); - Prog* prog = re->CompileToProg(0); - EXPECT_TRUE(prog != NULL); - *bytemap = prog->DumpByteMap(); - delete prog; + { + Prog* prog = re->CompileToProg(0); + EXPECT_TRUE(prog != NULL); + *bytemap = prog->DumpByteMap(); + delete prog; + } + + { + Prog* prog = re->CompileToReverseProg(0); + EXPECT_TRUE(prog != NULL); + EXPECT_EQ(*bytemap, prog->DumpByteMap()); + delete prog; + } re->Decref(); } @@ -213,16 +222,11 @@ TEST(TestCompile, UTF8Ranges) { EXPECT_EQ("[00-09] -> 0\n" "[0a-0a] -> 1\n" "[0b-7f] -> 0\n" - "[80-8f] -> 2\n" - "[90-9f] -> 3\n" - "[a0-bf] -> 4\n" + "[80-bf] -> 2\n" "[c0-c1] -> 1\n" - "[c2-df] -> 5\n" - "[e0-e0] -> 6\n" - "[e1-ef] -> 7\n" - "[f0-f0] -> 8\n" - "[f1-f3] -> 9\n" - "[f4-f4] -> 10\n" + "[c2-df] -> 3\n" + "[e0-ef] -> 4\n" + "[f0-f4] -> 5\n" "[f5-ff] -> 1\n", bytemap); } @@ -232,7 +236,7 @@ TEST(TestCompile, InsufficientMemory) { "^(?P[^\\s]+)\\s+(?P[^\\s]+)\\s+(?P.+)$", Regexp::LikePerl, NULL); EXPECT_TRUE(re != NULL); - Prog* prog = re->CompileToProg(920); + Prog* prog = re->CompileToProg(850); // If the memory budget has been exhausted, compilation should fail // and return NULL instead of trying to do anything with NoMatch(). EXPECT_TRUE(prog == NULL); @@ -299,20 +303,22 @@ TEST(TestCompile, Bug26705922) { "8. byte [f0-f0] 0 -> 7\n", reverse); - Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse); - EXPECT_EQ("3. byte [80-bf] 0 -> 4\n" - "4+ byte [c2-df] 0 -> 7\n" - "5+ byte [a0-bf] 1 -> 8\n" - "6. byte [80-bf] 0 -> 9\n" + Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse); + EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n" + "4+ byte [e0-ef] 0 -> 8\n" + "5. byte [f0-f4] 0 -> 9\n" + "6. byte [80-bf] 0 -> 7\n" "7. match! 0\n" - "8. byte [e0-e0] 0 -> 7\n" - "9+ byte [e1-ef] 0 -> 7\n" - "10+ byte [90-bf] 1 -> 13\n" - "11+ byte [80-bf] 1 -> 14\n" - "12. byte [80-8f] 0 -> 15\n" - "13. byte [f0-f0] 0 -> 7\n" - "14. byte [f1-f3] 0 -> 7\n" - "15. byte [f4-f4] 0 -> 7\n", + "8. byte [80-bf] 0 -> 6\n" + "9. byte [80-bf] 0 -> 8\n", + forward); + EXPECT_EQ("3. byte [80-bf] 0 -> 4\n" + "4+ byte [c2-df] 0 -> 6\n" + "5. byte [80-bf] 0 -> 7\n" + "6. match! 0\n" + "7+ byte [e0-ef] 0 -> 6\n" + "8. byte [80-bf] 0 -> 9\n" + "9. byte [f0-f4] 0 -> 6\n", reverse); } diff --git a/extern/re2/re2/testing/dfa_test.cc b/extern/re2/re2/testing/dfa_test.cc index fb3cc14547..9e15a41ed8 100644 --- a/extern/re2/re2/testing/dfa_test.cc +++ b/extern/re2/re2/testing/dfa_test.cc @@ -8,7 +8,9 @@ #include #include "util/test.h" +#include "util/flags.h" #include "util/logging.h" +#include "util/malloc_counter.h" #include "util/strutil.h" #include "re2/prog.h" #include "re2/re2.h" @@ -18,12 +20,26 @@ static const bool UsingMallocCounter = false; -DEFINE_int32(size, 8, "log2(number of DFA nodes)"); -DEFINE_int32(repeat, 2, "Repetition count."); -DEFINE_int32(threads, 4, "number of threads"); +DEFINE_FLAG(int, size, 8, "log2(number of DFA nodes)"); +DEFINE_FLAG(int, repeat, 2, "Repetition count."); +DEFINE_FLAG(int, threads, 4, "number of threads"); namespace re2 { +static int state_cache_resets = 0; +static int search_failures = 0; + +struct SetHooks { + SetHooks() { + hooks::SetDFAStateCacheResetHook([](const hooks::DFAStateCacheReset&) { + ++state_cache_resets; + }); + hooks::SetDFASearchFailureHook([](const hooks::DFASearchFailure&) { + ++search_failures; + }); + } +} set_hooks; + // Check that multithreaded access to DFA class works. // Helper function: builds entire DFA for prog. @@ -34,7 +50,7 @@ static void DoBuild(Prog* prog) { TEST(Multithreaded, BuildEntireDFA) { // Create regexp with 2^FLAGS_size states in DFA. std::string s = "a"; - for (int i = 0; i < FLAGS_size; i++) + for (int i = 0; i < GetFlag(FLAGS_size); i++) s += "[ab]"; s += "b"; Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL); @@ -52,14 +68,14 @@ TEST(Multithreaded, BuildEntireDFA) { } // Build the DFA simultaneously in a bunch of threads. - for (int i = 0; i < FLAGS_repeat; i++) { + for (int i = 0; i < GetFlag(FLAGS_repeat); i++) { Prog* prog = re->CompileToProg(0); ASSERT_TRUE(prog != NULL); std::vector threads; - for (int j = 0; j < FLAGS_threads; j++) + for (int j = 0; j < GetFlag(FLAGS_threads); j++) threads.emplace_back(DoBuild, prog); - for (int j = 0; j < FLAGS_threads; j++) + for (int j = 0; j < GetFlag(FLAGS_threads); j++) threads[j].join(); // One more compile, to make sure everything is okay. @@ -106,44 +122,6 @@ TEST(SingleThreaded, BuildEntireDFA) { re->Decref(); } -// Generates and returns a string over binary alphabet {0,1} that contains -// all possible binary sequences of length n as subsequences. The obvious -// brute force method would generate a string of length n * 2^n, but this -// generates a string of length n + 2^n - 1 called a De Bruijn cycle. -// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17. -// Such a string is useful for testing a DFA. If you have a DFA -// where distinct last n bytes implies distinct states, then running on a -// DeBruijn string causes the DFA to need to create a new state at every -// position in the input, never reusing any states until it gets to the -// end of the string. This is the worst possible case for DFA execution. -static std::string DeBruijnString(int n) { - CHECK_LT(n, static_cast(8*sizeof(int))); - CHECK_GT(n, 0); - - std::vector did(size_t{1}<CompileToProg(1< threads; - for (int j = 0; j < FLAGS_threads; j++) + for (int j = 0; j < GetFlag(FLAGS_threads); j++) threads.emplace_back(DoSearch, prog, match, no_match); - for (int j = 0; j < FLAGS_threads; j++) + for (int j = 0; j < GetFlag(FLAGS_threads); j++) threads[j].join(); delete prog; @@ -276,6 +260,8 @@ TEST(Multithreaded, SearchDFA) { // Reset to original behaviour. Prog::TEST_dfa_should_bail_when_slow(true); + ASSERT_GT(state_cache_resets, 0); + ASSERT_EQ(search_failures, 0); } struct ReverseTest { diff --git a/extern/re2/re2/testing/dump.cc b/extern/re2/re2/testing/dump.cc index 1df8ddde6e..cad0910a0a 100644 --- a/extern/re2/re2/testing/dump.cc +++ b/extern/re2/re2/testing/dump.cc @@ -25,9 +25,6 @@ #include "re2/stringpiece.h" #include "re2/regexp.h" -// Cause a link error if this file is used outside of testing. -DECLARE_string(test_tmpdir); - namespace re2 { static const char* kOpcodeNames[] = { @@ -154,14 +151,11 @@ static void DumpRegexpAppending(Regexp* re, std::string* s) { } std::string Regexp::Dump() { + // Make sure that we are being called from a unit test. + // Should cause a link error if used outside of testing. + CHECK(!::testing::TempDir().empty()); + std::string s; - - // Make sure being called from a unit test. - if (FLAGS_test_tmpdir.empty()) { - LOG(ERROR) << "Cannot use except for testing."; - return s; - } - DumpRegexpAppending(this, &s); return s; } diff --git a/extern/re2/re2/testing/exhaustive1_test.cc b/extern/re2/re2/testing/exhaustive1_test.cc index 9ead27e646..eef2dae398 100644 --- a/extern/re2/re2/testing/exhaustive1_test.cc +++ b/extern/re2/re2/testing/exhaustive1_test.cc @@ -10,8 +10,6 @@ #include "util/test.h" #include "re2/testing/exhaustive_tester.h" -DECLARE_string(regexp_engines); - namespace re2 { // Test simple repetition operators @@ -34,11 +32,8 @@ TEST(Repetition, Capturing) { "%s* %s+ %s? %s*? %s+? %s??"); ExhaustiveTest(3, 2, Split(" ", "a (a) b"), ops, 7, Explode("ab"), "(?:%s)", ""); - - // This would be a great test, but it runs forever when PCRE is enabled. - if (FLAGS_regexp_engines.find("PCRE") == std::string::npos) - ExhaustiveTest(3, 2, Split(" ", "a (a)"), ops, - 50, Explode("a"), "(?:%s)", ""); + ExhaustiveTest(3, 2, Split(" ", "a (a)"), ops, + 50, Explode("a"), "(?:%s)", ""); } } // namespace re2 diff --git a/extern/re2/re2/testing/exhaustive2_test.cc b/extern/re2/re2/testing/exhaustive2_test.cc index ce4235b0e5..ae89ecec27 100644 --- a/extern/re2/re2/testing/exhaustive2_test.cc +++ b/extern/re2/re2/testing/exhaustive2_test.cc @@ -10,7 +10,6 @@ #include #include "util/test.h" -#include "re2/re2.h" #include "re2/testing/exhaustive_tester.h" namespace re2 { diff --git a/extern/re2/re2/testing/exhaustive_tester.cc b/extern/re2/re2/testing/exhaustive_tester.cc index 47950ba711..bdac381a89 100644 --- a/extern/re2/re2/testing/exhaustive_tester.cc +++ b/extern/re2/re2/testing/exhaustive_tester.cc @@ -14,6 +14,7 @@ #include #include "util/test.h" +#include "util/flags.h" #include "util/logging.h" #include "util/strutil.h" #include "re2/testing/exhaustive_tester.h" @@ -24,11 +25,11 @@ #define LOGGING 0 #endif -DEFINE_bool(show_regexps, false, "show regexps during testing"); +DEFINE_FLAG(bool, show_regexps, false, "show regexps during testing"); -DEFINE_int32(max_bad_regexp_inputs, 1, - "Stop testing a regular expression after finding this many " - "strings that break it."); +DEFINE_FLAG(int, max_bad_regexp_inputs, 1, + "Stop testing a regular expression after finding this many " + "strings that break it."); namespace re2 { @@ -62,11 +63,12 @@ static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anc for (int i = 0; i < n; i++) { if (i > 0) printf(" "); - if (m[i].begin() == NULL) + if (m[i].data() == NULL) printf("-"); else printf("%td-%td", - m[i].begin() - input.begin(), m[i].end() - input.begin()); + m[i].begin() - input.begin(), + m[i].end() - input.begin()); } } @@ -76,10 +78,11 @@ static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anc void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) { regexps_++; std::string regexp = const_regexp; - if (!topwrapper_.empty()) + if (!topwrapper_.empty()) { regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str()); + } - if (FLAGS_show_regexps) { + if (GetFlag(FLAGS_show_regexps)) { printf("\r%s", regexp.c_str()); fflush(stdout); } @@ -134,7 +137,7 @@ void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) { tests_++; if (!tester.TestInput(strgen_.Next())) { failures_++; - if (++bad_inputs >= FLAGS_max_bad_regexp_inputs) + if (++bad_inputs >= GetFlag(FLAGS_max_bad_regexp_inputs)) break; } } diff --git a/extern/re2/re2/testing/filtered_re2_test.cc b/extern/re2/re2/testing/filtered_re2_test.cc index deef2f87d6..c788fdadc4 100644 --- a/extern/re2/re2/testing/filtered_re2_test.cc +++ b/extern/re2/re2/testing/filtered_re2_test.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include "util/test.h" #include "util/logging.h" @@ -291,4 +292,49 @@ TEST(FilteredRE2Test, EmptyStringInStringSetBug) { "EmptyStringInStringSetBug", &v)); } +TEST(FilteredRE2Test, MoveSemantics) { + FilterTestVars v1; + int id; + v1.f.Add("foo\\d+", v1.opts, &id); + EXPECT_EQ(0, id); + v1.f.Compile(&v1.atoms); + EXPECT_EQ(1, v1.atoms.size()); + EXPECT_EQ("foo", v1.atoms[0]); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); + + // The moved-to object should do what the moved-from object did. + FilterTestVars v2; + v2.f = std::move(v1.f); + v2.f.AllMatches("abc foo1 xyz", {0}, &v2.matches); + EXPECT_EQ(1, v2.matches.size()); + EXPECT_EQ(0, v2.matches[0]); + v2.f.AllMatches("abc bar2 xyz", {0}, &v2.matches); + EXPECT_EQ(0, v2.matches.size()); + + // The moved-from object should have been reset and be reusable. + v1.f.Add("bar\\d+", v1.opts, &id); + EXPECT_EQ(0, id); + v1.f.Compile(&v1.atoms); + EXPECT_EQ(1, v1.atoms.size()); + EXPECT_EQ("bar", v1.atoms[0]); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + + // Verify that "overwriting" works and also doesn't leak memory. + // (The latter will need a leak detector such as LeakSanitizer.) + v1.f = std::move(v2.f); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); +} + } // namespace re2 diff --git a/extern/re2/re2/testing/null_walker.cc b/extern/re2/re2/testing/null_walker.cc index 77fa72389e..2bdea02789 100644 --- a/extern/re2/re2/testing/null_walker.cc +++ b/extern/re2/re2/testing/null_walker.cc @@ -13,13 +13,16 @@ namespace re2 { class NullWalker : public Regexp::Walker { public: - NullWalker() { } - bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, - bool* child_args, int nchild_args); + NullWalker() {} - bool ShortVisit(Regexp* re, bool a) { - // Should never be called: we use Walk not WalkExponential. + virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args); + + virtual bool ShortVisit(Regexp* re, bool a) { + // Should never be called: we use Walk(), not WalkExponential(). +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION LOG(DFATAL) << "NullWalker::ShortVisit called"; +#endif return a; } diff --git a/extern/re2/re2/testing/random_test.cc b/extern/re2/re2/testing/random_test.cc index c0b1fe5b35..44712eb988 100644 --- a/extern/re2/re2/testing/random_test.cc +++ b/extern/re2/re2/testing/random_test.cc @@ -9,12 +9,13 @@ #include #include "util/test.h" +#include "util/flags.h" #include "re2/testing/exhaustive_tester.h" -DEFINE_int32(regexpseed, 404, "Random regexp seed."); -DEFINE_int32(regexpcount, 100, "How many random regexps to generate."); -DEFINE_int32(stringseed, 200, "Random string seed."); -DEFINE_int32(stringcount, 100, "How many random strings to generate."); +DEFINE_FLAG(int, regexpseed, 404, "Random regexp seed."); +DEFINE_FLAG(int, regexpcount, 100, "How many random regexps to generate."); +DEFINE_FLAG(int, stringseed, 200, "Random string seed."); +DEFINE_FLAG(int, stringcount, 100, "How many random strings to generate."); namespace re2 { @@ -37,8 +38,10 @@ static void RandomTest(int maxatoms, int maxops, ExhaustiveTester t(maxatoms, maxops, alphabet, ops, maxstrlen, stralphabet, wrapper, ""); - t.RandomStrings(FLAGS_stringseed, FLAGS_stringcount); - t.GenerateRandom(FLAGS_regexpseed, FLAGS_regexpcount); + t.RandomStrings(GetFlag(FLAGS_stringseed), + GetFlag(FLAGS_stringcount)); + t.GenerateRandom(GetFlag(FLAGS_regexpseed), + GetFlag(FLAGS_regexpcount)); printf("%d regexps, %d tests, %d failures [%d/%d str]\n", t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size()); EXPECT_EQ(0, t.failures()); @@ -96,4 +99,3 @@ TEST(Random, Complicated) { } } // namespace re2 - diff --git a/extern/re2/re2/testing/re2_arg_test.cc b/extern/re2/re2/testing/re2_arg_test.cc index 7a38de7c2f..f62e17cf47 100644 --- a/extern/re2/re2/testing/re2_arg_test.cc +++ b/extern/re2/re2/testing/re2_arg_test.cc @@ -11,6 +11,7 @@ #include #include "util/test.h" +#include "util/logging.h" #include "re2/re2.h" namespace re2 { @@ -132,4 +133,28 @@ TEST(RE2ArgTest, Uint64Test) { PARSE_FOR_TYPE(uint64_t, 5); } +TEST(RE2ArgTest, ParseFromTest) { +#if !defined(_MSC_VER) + struct { + bool ParseFrom(const char* str, size_t n) { + LOG(INFO) << "str = " << str << ", n = " << n; + return true; + } + } obj1; + RE2::Arg arg1(&obj1); + EXPECT_TRUE(arg1.Parse("one", 3)); + + struct { + bool ParseFrom(const char* str, size_t n) { + LOG(INFO) << "str = " << str << ", n = " << n; + return false; + } + // Ensure that RE2::Arg works even with overloaded ParseFrom(). + void ParseFrom(const char* str) {} + } obj2; + RE2::Arg arg2(&obj2); + EXPECT_FALSE(arg2.Parse("two", 3)); +#endif +} + } // namespace re2 diff --git a/extern/re2/re2/testing/re2_test.cc b/extern/re2/re2/testing/re2_test.cc index 2f4b90cddd..41fccf68eb 100644 --- a/extern/re2/re2/testing/re2_test.cc +++ b/extern/re2/re2/testing/re2_test.cc @@ -12,6 +12,7 @@ #include #include #include +#include #if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__) #include #include /* for sysconf */ @@ -223,6 +224,15 @@ TEST(RE2, Extract) { ASSERT_EQ(s, "'foo'"); } +TEST(RE2, MaxSubmatchTooLarge) { + std::string s; + ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s)); + s = "foo"; + ASSERT_FALSE(RE2::Replace(&s, "f(o+)", "\\1\\2")); + s = "foo"; + ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2")); +} + TEST(RE2, Consume) { RE2 r("\\s*(\\w+)"); // matches a word, possibly proceeded by whitespace std::string word; @@ -473,28 +483,27 @@ TEST(ProgramFanout, BigProgram) { RE2 re100("(?:(?:(?:(?:(?:.)?){100})*)+)"); RE2 re1000("(?:(?:(?:(?:(?:.)?){1000})*)+)"); - std::map histogram; + std::vector histogram; // 3 is the largest non-empty bucket and has 1 element. ASSERT_EQ(3, re1.ProgramFanout(&histogram)); ASSERT_EQ(1, histogram[3]); - // 7 is the largest non-empty bucket and has 10 elements. - ASSERT_EQ(7, re10.ProgramFanout(&histogram)); - ASSERT_EQ(10, histogram[7]); + // 6 is the largest non-empty bucket and has 10 elements. + ASSERT_EQ(6, re10.ProgramFanout(&histogram)); + ASSERT_EQ(10, histogram[6]); - // 10 is the largest non-empty bucket and has 100 elements. - ASSERT_EQ(10, re100.ProgramFanout(&histogram)); - ASSERT_EQ(100, histogram[10]); + // 9 is the largest non-empty bucket and has 100 elements. + ASSERT_EQ(9, re100.ProgramFanout(&histogram)); + ASSERT_EQ(100, histogram[9]); // 13 is the largest non-empty bucket and has 1000 elements. ASSERT_EQ(13, re1000.ProgramFanout(&histogram)); ASSERT_EQ(1000, histogram[13]); - // 2 is the largest non-empty bucket and has 3 elements. - // This differs from the others due to how reverse `.' works. + // 2 is the largest non-empty bucket and has 1 element. ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram)); - ASSERT_EQ(3, histogram[2]); + ASSERT_EQ(1, histogram[2]); // 5 is the largest non-empty bucket and has 10 elements. ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram)); @@ -1232,11 +1241,10 @@ TEST(RE2, DeepRecursion) { // Suggested by Josh Hyman. Failed when SearchOnePass was // not implementing case-folding. TEST(CaseInsensitive, MatchAndConsume) { - std::string result; std::string text = "A fish named *Wanda*"; StringPiece sp(text); - - EXPECT_TRUE(RE2::PartialMatch(sp, "(?i)([wand]{5})", &result)); + StringPiece result; + EXPECT_TRUE(RE2::PartialMatch(text, "(?i)([wand]{5})", &result)); EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result)); } @@ -1269,38 +1277,43 @@ TEST(RE2, CL8622304) { EXPECT_EQ(val, "1,0x2F,030,4,5"); } - // Check that RE2 returns correct regexp pieces on error. // In particular, make sure it returns whole runes // and that it always reports invalid UTF-8. // Also check that Perl error flag piece is big enough. static struct ErrorTest { const char *regexp; - const char *error; + RE2::ErrorCode error_code; + const char *error_arg; } error_tests[] = { - { "ab\\αcd", "\\α" }, - { "ef\\x☺01", "\\x☺0" }, - { "gh\\x1☺01", "\\x1☺" }, - { "ij\\x1", "\\x1" }, - { "kl\\x", "\\x" }, - { "uv\\x{0000☺}", "\\x{0000☺" }, - { "wx\\p{ABC", "\\p{ABC" }, - { "yz(?smiUX:abc)", "(?smiUX" }, // used to return (?s but the error is X - { "aa(?sm☺i", "(?sm☺" }, - { "bb[abc", "[abc" }, + { "ab\\αcd", RE2::ErrorBadEscape, "\\α" }, + { "ef\\x☺01", RE2::ErrorBadEscape, "\\x☺0" }, + { "gh\\x1☺01", RE2::ErrorBadEscape, "\\x1☺" }, + { "ij\\x1", RE2::ErrorBadEscape, "\\x1" }, + { "kl\\x", RE2::ErrorBadEscape, "\\x" }, + { "uv\\x{0000☺}", RE2::ErrorBadEscape, "\\x{0000☺" }, + { "wx\\p{ABC", RE2::ErrorBadCharRange, "\\p{ABC" }, + // used to return (?s but the error is X + { "yz(?smiUX:abc)", RE2::ErrorBadPerlOp, "(?smiUX" }, + { "aa(?sm☺i", RE2::ErrorBadPerlOp, "(?sm☺" }, + { "bb[abc", RE2::ErrorMissingBracket, "[abc" }, + { "abc(def", RE2::ErrorMissingParen, "abc(def" }, + { "abc)def", RE2::ErrorUnexpectedParen, "abc)def" }, - { "mn\\x1\377", "" }, // no argument string returned for invalid UTF-8 - { "op\377qr", "" }, - { "st\\x{00000\377", "" }, - { "zz\\p{\377}", "" }, - { "zz\\x{00\377}", "" }, - { "zz(?Pabc)", "" }, + // no argument string returned for invalid UTF-8 + { "mn\\x1\377", RE2::ErrorBadUTF8, "" }, + { "op\377qr", RE2::ErrorBadUTF8, "" }, + { "st\\x{00000\377", RE2::ErrorBadUTF8, "" }, + { "zz\\p{\377}", RE2::ErrorBadUTF8, "" }, + { "zz\\x{00\377}", RE2::ErrorBadUTF8, "" }, + { "zz(?Pabc)", RE2::ErrorBadUTF8, "" }, }; -TEST(RE2, ErrorArgs) { +TEST(RE2, ErrorCodeAndArg) { for (size_t i = 0; i < arraysize(error_tests); i++) { RE2 re(error_tests[i].regexp, RE2::Quiet); EXPECT_FALSE(re.ok()); - EXPECT_EQ(re.error_arg(), error_tests[i].error) << re.error(); + EXPECT_EQ(re.error_code(), error_tests[i].error_code) << re.error(); + EXPECT_EQ(re.error_arg(), error_tests[i].error_arg) << re.error(); } } diff --git a/extern/re2/re2/testing/regexp_benchmark.cc b/extern/re2/re2/testing/regexp_benchmark.cc index 968fb86a64..acf6e885b9 100644 --- a/extern/re2/re2/testing/regexp_benchmark.cc +++ b/extern/re2/re2/testing/regexp_benchmark.cc @@ -8,16 +8,21 @@ #include #include #include +#include +#include #include +#include "util/benchmark.h" #include "util/test.h" +#include "util/flags.h" #include "util/logging.h" +#include "util/malloc_counter.h" #include "util/strutil.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" +#include "util/mutex.h" #include "util/pcre.h" -#include "util/benchmark.h" namespace re2 { void Test(); @@ -56,19 +61,22 @@ void MemoryUsage() { CHECK(re); // Can't pass mc.HeapGrowth() and mc.PeakHeapGrowth() to LOG(INFO) directly, // because LOG(INFO) might do a big allocation before they get evaluated. - fprintf(stderr, "Regexp: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + fprintf(stderr, "Regexp: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); mc.Reset(); Prog* prog = re->CompileToProg(0); CHECK(prog); CHECK(prog->IsOnePass()); CHECK(prog->CanBitState()); - fprintf(stderr, "Prog: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + fprintf(stderr, "Prog: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); mc.Reset(); StringPiece sp[4]; CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); - fprintf(stderr, "Search: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + fprintf(stderr, "Search: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); delete prog; re->Decref(); } @@ -77,18 +85,22 @@ void MemoryUsage() { MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); PCRE re(regexp, PCRE::UTF8); - fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); PCRE::FullMatch(text, re); - fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); } { MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); PCRE* re = new PCRE(regexp, PCRE::UTF8); - fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); PCRE::FullMatch(text, *re); - fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); delete re; } @@ -96,42 +108,43 @@ void MemoryUsage() { MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); RE2 re(regexp); - fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); RE2::FullMatch(text, re); - fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n", + mc.HeapGrowth(), mc.PeakHeapGrowth()); } fprintf(stderr, "sizeof: PCRE=%zd RE2=%zd Prog=%zd Inst=%zd\n", sizeof(PCRE), sizeof(RE2), sizeof(Prog), sizeof(Prog::Inst)); } +int NumCPUs() { + return static_cast(std::thread::hardware_concurrency()); +} + // Regular expression implementation wrappers. // Defined at bottom of file, but they are repetitive // and not interesting. -typedef void SearchImpl(int iters, const char* regexp, const StringPiece& text, - Prog::Anchor anchor, bool expect_match); +typedef void SearchImpl(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match); -SearchImpl SearchDFA, SearchNFA, SearchOnePass, SearchBitState, - SearchPCRE, SearchRE2, - SearchCachedDFA, SearchCachedNFA, SearchCachedOnePass, SearchCachedBitState, - SearchCachedPCRE, SearchCachedRE2; +SearchImpl SearchDFA, SearchNFA, SearchOnePass, SearchBitState, SearchPCRE, + SearchRE2, SearchCachedDFA, SearchCachedNFA, SearchCachedOnePass, + SearchCachedBitState, SearchCachedPCRE, SearchCachedRE2; -typedef void ParseImpl(int iters, const char* regexp, const StringPiece& text); +typedef void ParseImpl(benchmark::State& state, const char* regexp, + const StringPiece& text); -ParseImpl Parse1NFA, Parse1OnePass, Parse1BitState, - Parse1PCRE, Parse1RE2, - Parse1Backtrack, - Parse1CachedNFA, Parse1CachedOnePass, Parse1CachedBitState, - Parse1CachedPCRE, Parse1CachedRE2, - Parse1CachedBacktrack; +ParseImpl Parse1NFA, Parse1OnePass, Parse1BitState, Parse1PCRE, Parse1RE2, + Parse1Backtrack, Parse1CachedNFA, Parse1CachedOnePass, Parse1CachedBitState, + Parse1CachedPCRE, Parse1CachedRE2, Parse1CachedBacktrack; -ParseImpl Parse3NFA, Parse3OnePass, Parse3BitState, - Parse3PCRE, Parse3RE2, - Parse3Backtrack, - Parse3CachedNFA, Parse3CachedOnePass, Parse3CachedBitState, - Parse3CachedPCRE, Parse3CachedRE2, - Parse3CachedBacktrack; +ParseImpl Parse3NFA, Parse3OnePass, Parse3BitState, Parse3PCRE, Parse3RE2, + Parse3Backtrack, Parse3CachedNFA, Parse3CachedOnePass, Parse3CachedBitState, + Parse3CachedPCRE, Parse3CachedRE2, Parse3CachedBacktrack; ParseImpl SearchParse2CachedPCRE, SearchParse2CachedRE2; @@ -141,29 +154,31 @@ ParseImpl SearchParse1CachedPCRE, SearchParse1CachedRE2; // Generate random text that won't contain the search string, // to test worst-case search behavior. -void MakeText(std::string* text, int nbytes) { - srand(1); - text->resize(nbytes); - for (int i = 0; i < nbytes; i++) { - // Generate a one-byte rune that isn't a control character (e.g. '\n'). - // Clipping to 0x20 introduces some bias, but we don't need uniformity. - int byte = rand() & 0x7F; - if (byte < 0x20) - byte = 0x20; - (*text)[i] = byte; - } +std::string RandomText(int64_t nbytes) { + static const std::string* const text = []() { + std::string* text = new std::string; + srand(1); + text->resize(16<<20); + for (int64_t i = 0; i < 16<<20; i++) { + // Generate a one-byte rune that isn't a control character (e.g. '\n'). + // Clipping to 0x20 introduces some bias, but we don't need uniformity. + int byte = rand() & 0x7F; + if (byte < 0x20) + byte = 0x20; + (*text)[i] = byte; + } + return text; + }(); + CHECK_LE(nbytes, 16<<20); + return text->substr(0, nbytes); } // Makes text of size nbytes, then calls run to search // the text for regexp iters times. -void Search(int iters, int nbytes, const char* regexp, SearchImpl* search) { - StopBenchmarkTiming(); - std::string s; - MakeText(&s, nbytes); - BenchmarkMemoryUsage(); - StartBenchmarkTiming(); - search(iters, regexp, s, Prog::kUnanchored, false); - SetBenchmarkBytesProcessed(static_cast(iters)*nbytes); +void Search(benchmark::State& state, const char* regexp, SearchImpl* search) { + std::string s = RandomText(state.range(0)); + search(state, regexp, s, Prog::kUnanchored, false); + state.SetBytesProcessed(state.iterations() * state.range(0)); } // These two are easy because they start with an A, @@ -189,10 +204,10 @@ void Search(int iters, int nbytes, const char* regexp, SearchImpl* search) { #define PARENS "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" \ "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$" -void Search_Easy0_CachedDFA(int i, int n) { Search(i, n, EASY0, SearchCachedDFA); } -void Search_Easy0_CachedNFA(int i, int n) { Search(i, n, EASY0, SearchCachedNFA); } -void Search_Easy0_CachedPCRE(int i, int n) { Search(i, n, EASY0, SearchCachedPCRE); } -void Search_Easy0_CachedRE2(int i, int n) { Search(i, n, EASY0, SearchCachedRE2); } +void Search_Easy0_CachedDFA(benchmark::State& state) { Search(state, EASY0, SearchCachedDFA); } +void Search_Easy0_CachedNFA(benchmark::State& state) { Search(state, EASY0, SearchCachedNFA); } +void Search_Easy0_CachedPCRE(benchmark::State& state) { Search(state, EASY0, SearchCachedPCRE); } +void Search_Easy0_CachedRE2(benchmark::State& state) { Search(state, EASY0, SearchCachedRE2); } BENCHMARK_RANGE(Search_Easy0_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Easy0_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); @@ -201,10 +216,10 @@ BENCHMARK_RANGE(Search_Easy0_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs() #endif BENCHMARK_RANGE(Search_Easy0_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); -void Search_Easy1_CachedDFA(int i, int n) { Search(i, n, EASY1, SearchCachedDFA); } -void Search_Easy1_CachedNFA(int i, int n) { Search(i, n, EASY1, SearchCachedNFA); } -void Search_Easy1_CachedPCRE(int i, int n) { Search(i, n, EASY1, SearchCachedPCRE); } -void Search_Easy1_CachedRE2(int i, int n) { Search(i, n, EASY1, SearchCachedRE2); } +void Search_Easy1_CachedDFA(benchmark::State& state) { Search(state, EASY1, SearchCachedDFA); } +void Search_Easy1_CachedNFA(benchmark::State& state) { Search(state, EASY1, SearchCachedNFA); } +void Search_Easy1_CachedPCRE(benchmark::State& state) { Search(state, EASY1, SearchCachedPCRE); } +void Search_Easy1_CachedRE2(benchmark::State& state) { Search(state, EASY1, SearchCachedRE2); } BENCHMARK_RANGE(Search_Easy1_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Easy1_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); @@ -213,10 +228,10 @@ BENCHMARK_RANGE(Search_Easy1_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs() #endif BENCHMARK_RANGE(Search_Easy1_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); -void Search_Medium_CachedDFA(int i, int n) { Search(i, n, MEDIUM, SearchCachedDFA); } -void Search_Medium_CachedNFA(int i, int n) { Search(i, n, MEDIUM, SearchCachedNFA); } -void Search_Medium_CachedPCRE(int i, int n) { Search(i, n, MEDIUM, SearchCachedPCRE); } -void Search_Medium_CachedRE2(int i, int n) { Search(i, n, MEDIUM, SearchCachedRE2); } +void Search_Medium_CachedDFA(benchmark::State& state) { Search(state, MEDIUM, SearchCachedDFA); } +void Search_Medium_CachedNFA(benchmark::State& state) { Search(state, MEDIUM, SearchCachedNFA); } +void Search_Medium_CachedPCRE(benchmark::State& state) { Search(state, MEDIUM, SearchCachedPCRE); } +void Search_Medium_CachedRE2(benchmark::State& state) { Search(state, MEDIUM, SearchCachedRE2); } BENCHMARK_RANGE(Search_Medium_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Medium_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); @@ -225,10 +240,10 @@ BENCHMARK_RANGE(Search_Medium_CachedPCRE, 8, 256<<10)->ThreadRange(1, NumCPUs #endif BENCHMARK_RANGE(Search_Medium_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); -void Search_Hard_CachedDFA(int i, int n) { Search(i, n, HARD, SearchCachedDFA); } -void Search_Hard_CachedNFA(int i, int n) { Search(i, n, HARD, SearchCachedNFA); } -void Search_Hard_CachedPCRE(int i, int n) { Search(i, n, HARD, SearchCachedPCRE); } -void Search_Hard_CachedRE2(int i, int n) { Search(i, n, HARD, SearchCachedRE2); } +void Search_Hard_CachedDFA(benchmark::State& state) { Search(state, HARD, SearchCachedDFA); } +void Search_Hard_CachedNFA(benchmark::State& state) { Search(state, HARD, SearchCachedNFA); } +void Search_Hard_CachedPCRE(benchmark::State& state) { Search(state, HARD, SearchCachedPCRE); } +void Search_Hard_CachedRE2(benchmark::State& state) { Search(state, HARD, SearchCachedRE2); } BENCHMARK_RANGE(Search_Hard_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Hard_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); @@ -237,10 +252,10 @@ BENCHMARK_RANGE(Search_Hard_CachedPCRE, 8, 4<<10)->ThreadRange(1, NumCPUs()); #endif BENCHMARK_RANGE(Search_Hard_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); -void Search_Fanout_CachedDFA(int i, int n) { Search(i, n, FANOUT, SearchCachedDFA); } -void Search_Fanout_CachedNFA(int i, int n) { Search(i, n, FANOUT, SearchCachedNFA); } -void Search_Fanout_CachedPCRE(int i, int n) { Search(i, n, FANOUT, SearchCachedPCRE); } -void Search_Fanout_CachedRE2(int i, int n) { Search(i, n, FANOUT, SearchCachedRE2); } +void Search_Fanout_CachedDFA(benchmark::State& state) { Search(state, FANOUT, SearchCachedDFA); } +void Search_Fanout_CachedNFA(benchmark::State& state) { Search(state, FANOUT, SearchCachedNFA); } +void Search_Fanout_CachedPCRE(benchmark::State& state) { Search(state, FANOUT, SearchCachedPCRE); } +void Search_Fanout_CachedRE2(benchmark::State& state) { Search(state, FANOUT, SearchCachedRE2); } BENCHMARK_RANGE(Search_Fanout_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Fanout_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); @@ -249,10 +264,10 @@ BENCHMARK_RANGE(Search_Fanout_CachedPCRE, 8, 4<<10)->ThreadRange(1, NumCPUs() #endif BENCHMARK_RANGE(Search_Fanout_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); -void Search_Parens_CachedDFA(int i, int n) { Search(i, n, PARENS, SearchCachedDFA); } -void Search_Parens_CachedNFA(int i, int n) { Search(i, n, PARENS, SearchCachedNFA); } -void Search_Parens_CachedPCRE(int i, int n) { Search(i, n, PARENS, SearchCachedPCRE); } -void Search_Parens_CachedRE2(int i, int n) { Search(i, n, PARENS, SearchCachedRE2); } +void Search_Parens_CachedDFA(benchmark::State& state) { Search(state, PARENS, SearchCachedDFA); } +void Search_Parens_CachedNFA(benchmark::State& state) { Search(state, PARENS, SearchCachedNFA); } +void Search_Parens_CachedPCRE(benchmark::State& state) { Search(state, PARENS, SearchCachedPCRE); } +void Search_Parens_CachedRE2(benchmark::State& state) { Search(state, PARENS, SearchCachedRE2); } BENCHMARK_RANGE(Search_Parens_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Parens_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); @@ -261,24 +276,20 @@ BENCHMARK_RANGE(Search_Parens_CachedPCRE, 8, 8)->ThreadRange(1, NumCPUs()); #endif BENCHMARK_RANGE(Search_Parens_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); -void SearchBigFixed(int iters, int nbytes, SearchImpl* search) { - StopBenchmarkTiming(); +void SearchBigFixed(benchmark::State& state, SearchImpl* search) { std::string s; - s.append(nbytes/2, 'x'); + s.append(state.range(0)/2, 'x'); std::string regexp = "^" + s + ".*$"; - std::string t; - MakeText(&t, nbytes/2); + std::string t = RandomText(state.range(0)/2); s += t; - BenchmarkMemoryUsage(); - StartBenchmarkTiming(); - search(iters, regexp.c_str(), s, Prog::kUnanchored, true); - SetBenchmarkBytesProcessed(static_cast(iters)*nbytes); + search(state, regexp.c_str(), s, Prog::kUnanchored, true); + state.SetBytesProcessed(state.iterations() * state.range(0)); } -void Search_BigFixed_CachedDFA(int i, int n) { SearchBigFixed(i, n, SearchCachedDFA); } -void Search_BigFixed_CachedNFA(int i, int n) { SearchBigFixed(i, n, SearchCachedNFA); } -void Search_BigFixed_CachedPCRE(int i, int n) { SearchBigFixed(i, n, SearchCachedPCRE); } -void Search_BigFixed_CachedRE2(int i, int n) { SearchBigFixed(i, n, SearchCachedRE2); } +void Search_BigFixed_CachedDFA(benchmark::State& state) { SearchBigFixed(state, SearchCachedDFA); } +void Search_BigFixed_CachedNFA(benchmark::State& state) { SearchBigFixed(state, SearchCachedNFA); } +void Search_BigFixed_CachedPCRE(benchmark::State& state) { SearchBigFixed(state, SearchCachedPCRE); } +void Search_BigFixed_CachedRE2(benchmark::State& state) { SearchBigFixed(state, SearchCachedRE2); } BENCHMARK_RANGE(Search_BigFixed_CachedDFA, 8, 1<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_BigFixed_CachedNFA, 8, 32<<10)->ThreadRange(1, NumCPUs()); @@ -289,43 +300,37 @@ BENCHMARK_RANGE(Search_BigFixed_CachedRE2, 8, 1<<20)->ThreadRange(1, NumCPUs // Benchmark: FindAndConsume -void FindAndConsume(int iters, int nbytes) { - StopBenchmarkTiming(); - std::string s; - MakeText(&s, nbytes); +void FindAndConsume(benchmark::State& state) { + std::string s = RandomText(state.range(0)); s.append("Hello World"); - StartBenchmarkTiming(); RE2 re("((Hello World))"); - for (int i = 0; i < iters; i++) { + for (auto _ : state) { StringPiece t = s; StringPiece u; CHECK(RE2::FindAndConsume(&t, re, &u)); CHECK_EQ(u, "Hello World"); } - SetBenchmarkBytesProcessed(static_cast(iters)*nbytes); + state.SetBytesProcessed(state.iterations() * state.range(0)); } BENCHMARK_RANGE(FindAndConsume, 8, 16<<20)->ThreadRange(1, NumCPUs()); // Benchmark: successful anchored search. -void SearchSuccess(int iters, int nbytes, const char* regexp, SearchImpl* search) { - StopBenchmarkTiming(); - std::string s; - MakeText(&s, nbytes); - BenchmarkMemoryUsage(); - StartBenchmarkTiming(); - search(iters, regexp, s, Prog::kAnchored, true); - SetBenchmarkBytesProcessed(static_cast(iters)*nbytes); +void SearchSuccess(benchmark::State& state, const char* regexp, + SearchImpl* search) { + std::string s = RandomText(state.range(0)); + search(state, regexp, s, Prog::kAnchored, true); + state.SetBytesProcessed(state.iterations() * state.range(0)); } // Unambiguous search (RE2 can use OnePass). -void Search_Success_DFA(int i, int n) { SearchSuccess(i, n, ".*$", SearchDFA); } -void Search_Success_NFA(int i, int n) { SearchSuccess(i, n, ".*$", SearchNFA); } -void Search_Success_PCRE(int i, int n) { SearchSuccess(i, n, ".*$", SearchPCRE); } -void Search_Success_RE2(int i, int n) { SearchSuccess(i, n, ".*$", SearchRE2); } -void Search_Success_OnePass(int i, int n) { SearchSuccess(i, n, ".*$", SearchOnePass); } +void Search_Success_DFA(benchmark::State& state) { SearchSuccess(state, ".*$", SearchDFA); } +void Search_Success_NFA(benchmark::State& state) { SearchSuccess(state, ".*$", SearchNFA); } +void Search_Success_PCRE(benchmark::State& state) { SearchSuccess(state, ".*$", SearchPCRE); } +void Search_Success_RE2(benchmark::State& state) { SearchSuccess(state, ".*$", SearchRE2); } +void Search_Success_OnePass(benchmark::State& state) { SearchSuccess(state, ".*$", SearchOnePass); } BENCHMARK_RANGE(Search_Success_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Success_NFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); @@ -335,11 +340,11 @@ BENCHMARK_RANGE(Search_Success_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Success_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Success_OnePass, 8, 2<<20)->ThreadRange(1, NumCPUs()); -void Search_Success_CachedDFA(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedDFA); } -void Search_Success_CachedNFA(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedNFA); } -void Search_Success_CachedPCRE(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedPCRE); } -void Search_Success_CachedRE2(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedRE2); } -void Search_Success_CachedOnePass(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedOnePass); } +void Search_Success_CachedDFA(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedDFA); } +void Search_Success_CachedNFA(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedNFA); } +void Search_Success_CachedPCRE(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedPCRE); } +void Search_Success_CachedRE2(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedRE2); } +void Search_Success_CachedOnePass(benchmark::State& state) { SearchSuccess(state, ".*$", SearchCachedOnePass); } BENCHMARK_RANGE(Search_Success_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Success_CachedNFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); @@ -352,11 +357,11 @@ BENCHMARK_RANGE(Search_Success_CachedOnePass, 8, 2<<20)->ThreadRange(1, NumCPUs( // Ambiguous search (RE2 cannot use OnePass). // Used to be ".*.$", but that is coalesced to ".+$" these days. -void Search_Success1_DFA(int i, int n) { SearchSuccess(i, n, ".*\\C$", SearchDFA); } -void Search_Success1_NFA(int i, int n) { SearchSuccess(i, n, ".*\\C$", SearchNFA); } -void Search_Success1_PCRE(int i, int n) { SearchSuccess(i, n, ".*\\C$", SearchPCRE); } -void Search_Success1_RE2(int i, int n) { SearchSuccess(i, n, ".*\\C$", SearchRE2); } -void Search_Success1_BitState(int i, int n) { SearchSuccess(i, n, ".*\\C$", SearchBitState); } +void Search_Success1_DFA(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchDFA); } +void Search_Success1_NFA(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchNFA); } +void Search_Success1_PCRE(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchPCRE); } +void Search_Success1_RE2(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchRE2); } +void Search_Success1_BitState(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchBitState); } BENCHMARK_RANGE(Search_Success1_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Success1_NFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); @@ -366,11 +371,11 @@ BENCHMARK_RANGE(Search_Success1_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Success1_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Success1_BitState, 8, 2<<20)->ThreadRange(1, NumCPUs()); -void Search_Success1_CachedDFA(int i, int n) { SearchSuccess(i, n, ".*\\C$", SearchCachedDFA); } -void Search_Success1_CachedNFA(int i, int n) { SearchSuccess(i, n, ".*\\C$", SearchCachedNFA); } -void Search_Success1_CachedPCRE(int i, int n) { SearchSuccess(i, n, ".*\\C$", SearchCachedPCRE); } -void Search_Success1_CachedRE2(int i, int n) { SearchSuccess(i, n, ".*\\C$", SearchCachedRE2); } -void Search_Success1_CachedBitState(int i, int n) { SearchSuccess(i, n, ".*\\C$", SearchCachedBitState); } +void Search_Success1_CachedDFA(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedDFA); } +void Search_Success1_CachedNFA(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedNFA); } +void Search_Success1_CachedPCRE(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedPCRE); } +void Search_Success1_CachedRE2(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedRE2); } +void Search_Success1_CachedBitState(benchmark::State& state) { SearchSuccess(state, ".*\\C$", SearchCachedBitState); } BENCHMARK_RANGE(Search_Success1_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_Success1_CachedNFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); @@ -383,22 +388,18 @@ BENCHMARK_RANGE(Search_Success1_CachedBitState, 8, 2<<20)->ThreadRange(1, NumCPU // Benchmark: AltMatch optimisation (just to verify that it works) // Note that OnePass doesn't implement it! -void SearchAltMatch(int iters, int nbytes, SearchImpl* search) { - StopBenchmarkTiming(); - std::string s; - MakeText(&s, nbytes); - BenchmarkMemoryUsage(); - StartBenchmarkTiming(); - search(iters, "\\C*", s, Prog::kAnchored, true); - SetBenchmarkBytesProcessed(static_cast(iters)*nbytes); +void SearchAltMatch(benchmark::State& state, SearchImpl* search) { + std::string s = RandomText(state.range(0)); + search(state, "\\C*", s, Prog::kAnchored, true); + state.SetBytesProcessed(state.iterations() * state.range(0)); } -void Search_AltMatch_DFA(int i, int n) { SearchAltMatch(i, n, SearchDFA); } -void Search_AltMatch_NFA(int i, int n) { SearchAltMatch(i, n, SearchNFA); } -void Search_AltMatch_OnePass(int i, int n) { SearchAltMatch(i, n, SearchOnePass); } -void Search_AltMatch_BitState(int i, int n) { SearchAltMatch(i, n, SearchBitState); } -void Search_AltMatch_PCRE(int i, int n) { SearchAltMatch(i, n, SearchPCRE); } -void Search_AltMatch_RE2(int i, int n) { SearchAltMatch(i, n, SearchRE2); } +void Search_AltMatch_DFA(benchmark::State& state) { SearchAltMatch(state, SearchDFA); } +void Search_AltMatch_NFA(benchmark::State& state) { SearchAltMatch(state, SearchNFA); } +void Search_AltMatch_OnePass(benchmark::State& state) { SearchAltMatch(state, SearchOnePass); } +void Search_AltMatch_BitState(benchmark::State& state) { SearchAltMatch(state, SearchBitState); } +void Search_AltMatch_PCRE(benchmark::State& state) { SearchAltMatch(state, SearchPCRE); } +void Search_AltMatch_RE2(benchmark::State& state) { SearchAltMatch(state, SearchRE2); } BENCHMARK_RANGE(Search_AltMatch_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_AltMatch_NFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); @@ -409,12 +410,12 @@ BENCHMARK_RANGE(Search_AltMatch_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); #endif BENCHMARK_RANGE(Search_AltMatch_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); -void Search_AltMatch_CachedDFA(int i, int n) { SearchAltMatch(i, n, SearchCachedDFA); } -void Search_AltMatch_CachedNFA(int i, int n) { SearchAltMatch(i, n, SearchCachedNFA); } -void Search_AltMatch_CachedOnePass(int i, int n) { SearchAltMatch(i, n, SearchCachedOnePass); } -void Search_AltMatch_CachedBitState(int i, int n) { SearchAltMatch(i, n, SearchCachedBitState); } -void Search_AltMatch_CachedPCRE(int i, int n) { SearchAltMatch(i, n, SearchCachedPCRE); } -void Search_AltMatch_CachedRE2(int i, int n) { SearchAltMatch(i, n, SearchCachedRE2); } +void Search_AltMatch_CachedDFA(benchmark::State& state) { SearchAltMatch(state, SearchCachedDFA); } +void Search_AltMatch_CachedNFA(benchmark::State& state) { SearchAltMatch(state, SearchCachedNFA); } +void Search_AltMatch_CachedOnePass(benchmark::State& state) { SearchAltMatch(state, SearchCachedOnePass); } +void Search_AltMatch_CachedBitState(benchmark::State& state) { SearchAltMatch(state, SearchCachedBitState); } +void Search_AltMatch_CachedPCRE(benchmark::State& state) { SearchAltMatch(state, SearchCachedPCRE); } +void Search_AltMatch_CachedRE2(benchmark::State& state) { SearchAltMatch(state, SearchCachedRE2); } BENCHMARK_RANGE(Search_AltMatch_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); BENCHMARK_RANGE(Search_AltMatch_CachedNFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); @@ -427,19 +428,18 @@ BENCHMARK_RANGE(Search_AltMatch_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCP // Benchmark: use regexp to find phone number. -void SearchDigits(int iters, SearchImpl* search) { +void SearchDigits(benchmark::State& state, SearchImpl* search) { StringPiece s("650-253-0001"); - BenchmarkMemoryUsage(); - search(iters, "([0-9]+)-([0-9]+)-([0-9]+)", s, Prog::kAnchored, true); - SetBenchmarkItemsProcessed(iters); + search(state, "([0-9]+)-([0-9]+)-([0-9]+)", s, Prog::kAnchored, true); + state.SetItemsProcessed(state.iterations()); } -void Search_Digits_DFA(int i) { SearchDigits(i, SearchDFA); } -void Search_Digits_NFA(int i) { SearchDigits(i, SearchNFA); } -void Search_Digits_OnePass(int i) { SearchDigits(i, SearchOnePass); } -void Search_Digits_PCRE(int i) { SearchDigits(i, SearchPCRE); } -void Search_Digits_RE2(int i) { SearchDigits(i, SearchRE2); } -void Search_Digits_BitState(int i) { SearchDigits(i, SearchBitState); } +void Search_Digits_DFA(benchmark::State& state) { SearchDigits(state, SearchDFA); } +void Search_Digits_NFA(benchmark::State& state) { SearchDigits(state, SearchNFA); } +void Search_Digits_OnePass(benchmark::State& state) { SearchDigits(state, SearchOnePass); } +void Search_Digits_PCRE(benchmark::State& state) { SearchDigits(state, SearchPCRE); } +void Search_Digits_RE2(benchmark::State& state) { SearchDigits(state, SearchRE2); } +void Search_Digits_BitState(benchmark::State& state) { SearchDigits(state, SearchBitState); } BENCHMARK(Search_Digits_DFA)->ThreadRange(1, NumCPUs()); BENCHMARK(Search_Digits_NFA)->ThreadRange(1, NumCPUs()); @@ -452,19 +452,19 @@ BENCHMARK(Search_Digits_BitState)->ThreadRange(1, NumCPUs()); // Benchmark: use regexp to parse digit fields in phone number. -void Parse3Digits(int iters, - void (*parse3)(int, const char*, const StringPiece&)) { - BenchmarkMemoryUsage(); - parse3(iters, "([0-9]+)-([0-9]+)-([0-9]+)", "650-253-0001"); - SetBenchmarkItemsProcessed(iters); +void Parse3Digits(benchmark::State& state, + void (*parse3)(benchmark::State&, const char*, + const StringPiece&)) { + parse3(state, "([0-9]+)-([0-9]+)-([0-9]+)", "650-253-0001"); + state.SetItemsProcessed(state.iterations()); } -void Parse_Digits_NFA(int i) { Parse3Digits(i, Parse3NFA); } -void Parse_Digits_OnePass(int i) { Parse3Digits(i, Parse3OnePass); } -void Parse_Digits_PCRE(int i) { Parse3Digits(i, Parse3PCRE); } -void Parse_Digits_RE2(int i) { Parse3Digits(i, Parse3RE2); } -void Parse_Digits_Backtrack(int i) { Parse3Digits(i, Parse3Backtrack); } -void Parse_Digits_BitState(int i) { Parse3Digits(i, Parse3BitState); } +void Parse_Digits_NFA(benchmark::State& state) { Parse3Digits(state, Parse3NFA); } +void Parse_Digits_OnePass(benchmark::State& state) { Parse3Digits(state, Parse3OnePass); } +void Parse_Digits_PCRE(benchmark::State& state) { Parse3Digits(state, Parse3PCRE); } +void Parse_Digits_RE2(benchmark::State& state) { Parse3Digits(state, Parse3RE2); } +void Parse_Digits_Backtrack(benchmark::State& state) { Parse3Digits(state, Parse3Backtrack); } +void Parse_Digits_BitState(benchmark::State& state) { Parse3Digits(state, Parse3BitState); } BENCHMARK(Parse_Digits_NFA)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_Digits_OnePass)->ThreadRange(1, NumCPUs()); @@ -475,12 +475,12 @@ BENCHMARK(Parse_Digits_RE2)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_Digits_Backtrack)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_Digits_BitState)->ThreadRange(1, NumCPUs()); -void Parse_CachedDigits_NFA(int i) { Parse3Digits(i, Parse3CachedNFA); } -void Parse_CachedDigits_OnePass(int i) { Parse3Digits(i, Parse3CachedOnePass); } -void Parse_CachedDigits_PCRE(int i) { Parse3Digits(i, Parse3CachedPCRE); } -void Parse_CachedDigits_RE2(int i) { Parse3Digits(i, Parse3CachedRE2); } -void Parse_CachedDigits_Backtrack(int i) { Parse3Digits(i, Parse3CachedBacktrack); } -void Parse_CachedDigits_BitState(int i) { Parse3Digits(i, Parse3CachedBitState); } +void Parse_CachedDigits_NFA(benchmark::State& state) { Parse3Digits(state, Parse3CachedNFA); } +void Parse_CachedDigits_OnePass(benchmark::State& state) { Parse3Digits(state, Parse3CachedOnePass); } +void Parse_CachedDigits_PCRE(benchmark::State& state) { Parse3Digits(state, Parse3CachedPCRE); } +void Parse_CachedDigits_RE2(benchmark::State& state) { Parse3Digits(state, Parse3CachedRE2); } +void Parse_CachedDigits_Backtrack(benchmark::State& state) { Parse3Digits(state, Parse3CachedBacktrack); } +void Parse_CachedDigits_BitState(benchmark::State& state) { Parse3Digits(state, Parse3CachedBitState); } BENCHMARK(Parse_CachedDigits_NFA)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_CachedDigits_OnePass)->ThreadRange(1, NumCPUs()); @@ -491,19 +491,19 @@ BENCHMARK(Parse_CachedDigits_Backtrack)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_CachedDigits_RE2)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_CachedDigits_BitState)->ThreadRange(1, NumCPUs()); -void Parse3DigitDs(int iters, - void (*parse3)(int, const char*, const StringPiece&)) { - BenchmarkMemoryUsage(); - parse3(iters, "(\\d+)-(\\d+)-(\\d+)", "650-253-0001"); - SetBenchmarkItemsProcessed(iters); +void Parse3DigitDs(benchmark::State& state, + void (*parse3)(benchmark::State&, const char*, + const StringPiece&)) { + parse3(state, "(\\d+)-(\\d+)-(\\d+)", "650-253-0001"); + state.SetItemsProcessed(state.iterations()); } -void Parse_DigitDs_NFA(int i) { Parse3DigitDs(i, Parse3NFA); } -void Parse_DigitDs_OnePass(int i) { Parse3DigitDs(i, Parse3OnePass); } -void Parse_DigitDs_PCRE(int i) { Parse3DigitDs(i, Parse3PCRE); } -void Parse_DigitDs_RE2(int i) { Parse3DigitDs(i, Parse3RE2); } -void Parse_DigitDs_Backtrack(int i) { Parse3DigitDs(i, Parse3CachedBacktrack); } -void Parse_DigitDs_BitState(int i) { Parse3DigitDs(i, Parse3CachedBitState); } +void Parse_DigitDs_NFA(benchmark::State& state) { Parse3DigitDs(state, Parse3NFA); } +void Parse_DigitDs_OnePass(benchmark::State& state) { Parse3DigitDs(state, Parse3OnePass); } +void Parse_DigitDs_PCRE(benchmark::State& state) { Parse3DigitDs(state, Parse3PCRE); } +void Parse_DigitDs_RE2(benchmark::State& state) { Parse3DigitDs(state, Parse3RE2); } +void Parse_DigitDs_Backtrack(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedBacktrack); } +void Parse_DigitDs_BitState(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedBitState); } BENCHMARK(Parse_DigitDs_NFA)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_DigitDs_OnePass)->ThreadRange(1, NumCPUs()); @@ -514,12 +514,12 @@ BENCHMARK(Parse_DigitDs_RE2)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_DigitDs_Backtrack)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_DigitDs_BitState)->ThreadRange(1, NumCPUs()); -void Parse_CachedDigitDs_NFA(int i) { Parse3DigitDs(i, Parse3CachedNFA); } -void Parse_CachedDigitDs_OnePass(int i) { Parse3DigitDs(i, Parse3CachedOnePass); } -void Parse_CachedDigitDs_PCRE(int i) { Parse3DigitDs(i, Parse3CachedPCRE); } -void Parse_CachedDigitDs_RE2(int i) { Parse3DigitDs(i, Parse3CachedRE2); } -void Parse_CachedDigitDs_Backtrack(int i) { Parse3DigitDs(i, Parse3CachedBacktrack); } -void Parse_CachedDigitDs_BitState(int i) { Parse3DigitDs(i, Parse3CachedBitState); } +void Parse_CachedDigitDs_NFA(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedNFA); } +void Parse_CachedDigitDs_OnePass(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedOnePass); } +void Parse_CachedDigitDs_PCRE(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedPCRE); } +void Parse_CachedDigitDs_RE2(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedRE2); } +void Parse_CachedDigitDs_Backtrack(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedBacktrack); } +void Parse_CachedDigitDs_BitState(benchmark::State& state) { Parse3DigitDs(state, Parse3CachedBitState); } BENCHMARK(Parse_CachedDigitDs_NFA)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_CachedDigitDs_OnePass)->ThreadRange(1, NumCPUs()); @@ -532,18 +532,18 @@ BENCHMARK(Parse_CachedDigitDs_BitState)->ThreadRange(1, NumCPUs()); // Benchmark: splitting off leading number field. -void Parse1Split(int iters, - void (*parse1)(int, const char*, const StringPiece&)) { - BenchmarkMemoryUsage(); - parse1(iters, "[0-9]+-(.*)", "650-253-0001"); - SetBenchmarkItemsProcessed(iters); +void Parse1Split(benchmark::State& state, + void (*parse1)(benchmark::State&, const char*, + const StringPiece&)) { + parse1(state, "[0-9]+-(.*)", "650-253-0001"); + state.SetItemsProcessed(state.iterations()); } -void Parse_Split_NFA(int i) { Parse1Split(i, Parse1NFA); } -void Parse_Split_OnePass(int i) { Parse1Split(i, Parse1OnePass); } -void Parse_Split_PCRE(int i) { Parse1Split(i, Parse1PCRE); } -void Parse_Split_RE2(int i) { Parse1Split(i, Parse1RE2); } -void Parse_Split_BitState(int i) { Parse1Split(i, Parse1BitState); } +void Parse_Split_NFA(benchmark::State& state) { Parse1Split(state, Parse1NFA); } +void Parse_Split_OnePass(benchmark::State& state) { Parse1Split(state, Parse1OnePass); } +void Parse_Split_PCRE(benchmark::State& state) { Parse1Split(state, Parse1PCRE); } +void Parse_Split_RE2(benchmark::State& state) { Parse1Split(state, Parse1RE2); } +void Parse_Split_BitState(benchmark::State& state) { Parse1Split(state, Parse1BitState); } BENCHMARK(Parse_Split_NFA)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_Split_OnePass)->ThreadRange(1, NumCPUs()); @@ -553,11 +553,11 @@ BENCHMARK(Parse_Split_PCRE)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_Split_RE2)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_Split_BitState)->ThreadRange(1, NumCPUs()); -void Parse_CachedSplit_NFA(int i) { Parse1Split(i, Parse1CachedNFA); } -void Parse_CachedSplit_OnePass(int i) { Parse1Split(i, Parse1CachedOnePass); } -void Parse_CachedSplit_PCRE(int i) { Parse1Split(i, Parse1CachedPCRE); } -void Parse_CachedSplit_RE2(int i) { Parse1Split(i, Parse1CachedRE2); } -void Parse_CachedSplit_BitState(int i) { Parse1Split(i, Parse1CachedBitState); } +void Parse_CachedSplit_NFA(benchmark::State& state) { Parse1Split(state, Parse1CachedNFA); } +void Parse_CachedSplit_OnePass(benchmark::State& state) { Parse1Split(state, Parse1CachedOnePass); } +void Parse_CachedSplit_PCRE(benchmark::State& state) { Parse1Split(state, Parse1CachedPCRE); } +void Parse_CachedSplit_RE2(benchmark::State& state) { Parse1Split(state, Parse1CachedRE2); } +void Parse_CachedSplit_BitState(benchmark::State& state) { Parse1Split(state, Parse1CachedBitState); } BENCHMARK(Parse_CachedSplit_NFA)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_CachedSplit_OnePass)->ThreadRange(1, NumCPUs()); @@ -569,17 +569,17 @@ BENCHMARK(Parse_CachedSplit_BitState)->ThreadRange(1, NumCPUs()); // Benchmark: splitting off leading number field but harder (ambiguous regexp). -void Parse1SplitHard(int iters, - void (*run)(int, const char*, const StringPiece&)) { - BenchmarkMemoryUsage(); - run(iters, "[0-9]+.(.*)", "650-253-0001"); - SetBenchmarkItemsProcessed(iters); +void Parse1SplitHard(benchmark::State& state, + void (*run)(benchmark::State&, const char*, + const StringPiece&)) { + run(state, "[0-9]+.(.*)", "650-253-0001"); + state.SetItemsProcessed(state.iterations()); } -void Parse_SplitHard_NFA(int i) { Parse1SplitHard(i, Parse1NFA); } -void Parse_SplitHard_PCRE(int i) { Parse1SplitHard(i, Parse1PCRE); } -void Parse_SplitHard_RE2(int i) { Parse1SplitHard(i, Parse1RE2); } -void Parse_SplitHard_BitState(int i) { Parse1SplitHard(i, Parse1BitState); } +void Parse_SplitHard_NFA(benchmark::State& state) { Parse1SplitHard(state, Parse1NFA); } +void Parse_SplitHard_PCRE(benchmark::State& state) { Parse1SplitHard(state, Parse1PCRE); } +void Parse_SplitHard_RE2(benchmark::State& state) { Parse1SplitHard(state, Parse1RE2); } +void Parse_SplitHard_BitState(benchmark::State& state) { Parse1SplitHard(state, Parse1BitState); } #ifdef USEPCRE BENCHMARK(Parse_SplitHard_PCRE)->ThreadRange(1, NumCPUs()); @@ -588,11 +588,11 @@ BENCHMARK(Parse_SplitHard_RE2)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_SplitHard_BitState)->ThreadRange(1, NumCPUs()); BENCHMARK(Parse_SplitHard_NFA)->ThreadRange(1, NumCPUs()); -void Parse_CachedSplitHard_NFA(int i) { Parse1SplitHard(i, Parse1CachedNFA); } -void Parse_CachedSplitHard_PCRE(int i) { Parse1SplitHard(i, Parse1CachedPCRE); } -void Parse_CachedSplitHard_RE2(int i) { Parse1SplitHard(i, Parse1CachedRE2); } -void Parse_CachedSplitHard_BitState(int i) { Parse1SplitHard(i, Parse1CachedBitState); } -void Parse_CachedSplitHard_Backtrack(int i) { Parse1SplitHard(i, Parse1CachedBacktrack); } +void Parse_CachedSplitHard_NFA(benchmark::State& state) { Parse1SplitHard(state, Parse1CachedNFA); } +void Parse_CachedSplitHard_PCRE(benchmark::State& state) { Parse1SplitHard(state, Parse1CachedPCRE); } +void Parse_CachedSplitHard_RE2(benchmark::State& state) { Parse1SplitHard(state, Parse1CachedRE2); } +void Parse_CachedSplitHard_BitState(benchmark::State& state) { Parse1SplitHard(state, Parse1CachedBitState); } +void Parse_CachedSplitHard_Backtrack(benchmark::State& state) { Parse1SplitHard(state, Parse1CachedBacktrack); } #ifdef USEPCRE BENCHMARK(Parse_CachedSplitHard_PCRE)->ThreadRange(1, NumCPUs()); @@ -604,18 +604,18 @@ BENCHMARK(Parse_CachedSplitHard_Backtrack)->ThreadRange(1, NumCPUs()); // Benchmark: Parse1SplitHard, big text, small match. -void Parse1SplitBig1(int iters, - void (*run)(int, const char*, const StringPiece&)) { +void Parse1SplitBig1(benchmark::State& state, + void (*run)(benchmark::State&, const char*, + const StringPiece&)) { std::string s; s.append(100000, 'x'); s.append("650-253-0001"); - BenchmarkMemoryUsage(); - run(iters, "[0-9]+.(.*)", s); - SetBenchmarkItemsProcessed(iters); + run(state, "[0-9]+.(.*)", s); + state.SetItemsProcessed(state.iterations()); } -void Parse_CachedSplitBig1_PCRE(int i) { Parse1SplitBig1(i, SearchParse1CachedPCRE); } -void Parse_CachedSplitBig1_RE2(int i) { Parse1SplitBig1(i, SearchParse1CachedRE2); } +void Parse_CachedSplitBig1_PCRE(benchmark::State& state) { Parse1SplitBig1(state, SearchParse1CachedPCRE); } +void Parse_CachedSplitBig1_RE2(benchmark::State& state) { Parse1SplitBig1(state, SearchParse1CachedRE2); } #ifdef USEPCRE BENCHMARK(Parse_CachedSplitBig1_PCRE)->ThreadRange(1, NumCPUs()); @@ -624,18 +624,18 @@ BENCHMARK(Parse_CachedSplitBig1_RE2)->ThreadRange(1, NumCPUs()); // Benchmark: Parse1SplitHard, big text, big match. -void Parse1SplitBig2(int iters, - void (*run)(int, const char*, const StringPiece&)) { +void Parse1SplitBig2(benchmark::State& state, + void (*run)(benchmark::State&, const char*, + const StringPiece&)) { std::string s; s.append("650-253-"); s.append(100000, '0'); - BenchmarkMemoryUsage(); - run(iters, "[0-9]+.(.*)", s); - SetBenchmarkItemsProcessed(iters); + run(state, "[0-9]+.(.*)", s); + state.SetItemsProcessed(state.iterations()); } -void Parse_CachedSplitBig2_PCRE(int i) { Parse1SplitBig2(i, SearchParse1CachedPCRE); } -void Parse_CachedSplitBig2_RE2(int i) { Parse1SplitBig2(i, SearchParse1CachedRE2); } +void Parse_CachedSplitBig2_PCRE(benchmark::State& state) { Parse1SplitBig2(state, SearchParse1CachedPCRE); } +void Parse_CachedSplitBig2_RE2(benchmark::State& state) { Parse1SplitBig2(state, SearchParse1CachedRE2); } #ifdef USEPCRE BENCHMARK(Parse_CachedSplitBig2_PCRE)->ThreadRange(1, NumCPUs()); @@ -645,16 +645,16 @@ BENCHMARK(Parse_CachedSplitBig2_RE2)->ThreadRange(1, NumCPUs()); // Benchmark: measure time required to parse (but not execute) // a simple regular expression. -void ParseRegexp(int iters, const std::string& regexp) { - for (int i = 0; i < iters; i++) { +void ParseRegexp(benchmark::State& state, const std::string& regexp) { + for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); re->Decref(); } } -void SimplifyRegexp(int iters, const std::string& regexp) { - for (int i = 0; i < iters; i++) { +void SimplifyRegexp(benchmark::State& state, const std::string& regexp) { + for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Regexp* sre = re->Simplify(); @@ -664,17 +664,17 @@ void SimplifyRegexp(int iters, const std::string& regexp) { } } -void NullWalkRegexp(int iters, const std::string& regexp) { +void NullWalkRegexp(benchmark::State& state, const std::string& regexp) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); - for (int i = 0; i < iters; i++) { + for (auto _ : state) { re->NullWalk(); } re->Decref(); } -void SimplifyCompileRegexp(int iters, const std::string& regexp) { - for (int i = 0; i < iters; i++) { +void SimplifyCompileRegexp(benchmark::State& state, const std::string& regexp) { + for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Regexp* sre = re->Simplify(); @@ -687,8 +687,8 @@ void SimplifyCompileRegexp(int iters, const std::string& regexp) { } } -void CompileRegexp(int iters, const std::string& regexp) { - for (int i = 0; i < iters; i++) { +void CompileRegexp(benchmark::State& state, const std::string& regexp) { + for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); @@ -698,10 +698,10 @@ void CompileRegexp(int iters, const std::string& regexp) { } } -void CompileToProg(int iters, const std::string& regexp) { +void CompileToProg(benchmark::State& state, const std::string& regexp) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); - for (int i = 0; i < iters; i++) { + for (auto _ : state) { Prog* prog = re->CompileToProg(0); CHECK(prog); delete prog; @@ -709,53 +709,54 @@ void CompileToProg(int iters, const std::string& regexp) { re->Decref(); } -void CompileByteMap(int iters, const std::string& regexp) { +void CompileByteMap(benchmark::State& state, const std::string& regexp) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); CHECK(prog); - for (int i = 0; i < iters; i++) { + for (auto _ : state) { prog->ComputeByteMap(); } delete prog; re->Decref(); } -void CompilePCRE(int iters, const std::string& regexp) { - for (int i = 0; i < iters; i++) { +void CompilePCRE(benchmark::State& state, const std::string& regexp) { + for (auto _ : state) { PCRE re(regexp, PCRE::UTF8); CHECK_EQ(re.error(), ""); } } -void CompileRE2(int iters, const std::string& regexp) { - for (int i = 0; i < iters; i++) { +void CompileRE2(benchmark::State& state, const std::string& regexp) { + for (auto _ : state) { RE2 re(regexp); CHECK_EQ(re.error(), ""); } } -void RunBuild(int iters, const std::string& regexp, - void (*run)(int, const std::string&)) { - run(iters, regexp); - SetBenchmarkItemsProcessed(iters); +void RunBuild(benchmark::State& state, const std::string& regexp, + void (*run)(benchmark::State&, const std::string&)) { + run(state, regexp); + state.SetItemsProcessed(state.iterations()); } } // namespace re2 -DEFINE_string(compile_regexp, "(.*)-(\\d+)-of-(\\d+)", "regexp for compile benchmarks"); +DEFINE_FLAG(std::string, compile_regexp, "(.*)-(\\d+)-of-(\\d+)", + "regexp for compile benchmarks"); namespace re2 { -void BM_PCRE_Compile(int i) { RunBuild(i, FLAGS_compile_regexp, CompilePCRE); } -void BM_Regexp_Parse(int i) { RunBuild(i, FLAGS_compile_regexp, ParseRegexp); } -void BM_Regexp_Simplify(int i) { RunBuild(i, FLAGS_compile_regexp, SimplifyRegexp); } -void BM_CompileToProg(int i) { RunBuild(i, FLAGS_compile_regexp, CompileToProg); } -void BM_CompileByteMap(int i) { RunBuild(i, FLAGS_compile_regexp, CompileByteMap); } -void BM_Regexp_Compile(int i) { RunBuild(i, FLAGS_compile_regexp, CompileRegexp); } -void BM_Regexp_SimplifyCompile(int i) { RunBuild(i, FLAGS_compile_regexp, SimplifyCompileRegexp); } -void BM_Regexp_NullWalk(int i) { RunBuild(i, FLAGS_compile_regexp, NullWalkRegexp); } -void BM_RE2_Compile(int i) { RunBuild(i, FLAGS_compile_regexp, CompileRE2); } +void BM_PCRE_Compile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompilePCRE); } +void BM_Regexp_Parse(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), ParseRegexp); } +void BM_Regexp_Simplify(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), SimplifyRegexp); } +void BM_CompileToProg(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileToProg); } +void BM_CompileByteMap(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileByteMap); } +void BM_Regexp_Compile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileRegexp); } +void BM_Regexp_SimplifyCompile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), SimplifyCompileRegexp); } +void BM_Regexp_NullWalk(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), NullWalkRegexp); } +void BM_RE2_Compile(benchmark::State& state) { RunBuild(state, GetFlag(FLAGS_compile_regexp), CompileRE2); } #ifdef USEPCRE BENCHMARK(BM_PCRE_Compile)->ThreadRange(1, NumCPUs()); @@ -771,22 +772,19 @@ BENCHMARK(BM_RE2_Compile)->ThreadRange(1, NumCPUs()); // Makes text of size nbytes, then calls run to search // the text for regexp iters times. -void SearchPhone(int iters, int nbytes, ParseImpl* search) { - StopBenchmarkTiming(); - std::string s; - MakeText(&s, nbytes); +void SearchPhone(benchmark::State& state, ParseImpl* search) { + std::string s = RandomText(state.range(0)); s.append("(650) 253-0001"); - BenchmarkMemoryUsage(); - StartBenchmarkTiming(); - search(iters, "(\\d{3}-|\\(\\d{3}\\)\\s+)(\\d{3}-\\d{4})", s); - SetBenchmarkBytesProcessed(static_cast(iters)*nbytes); + search(state, "(\\d{3}-|\\(\\d{3}\\)\\s+)(\\d{3}-\\d{4})", s); + state.SetBytesProcessed(state.iterations() * state.range(0)); } -void SearchPhone_CachedPCRE(int i, int n) { - SearchPhone(i, n, SearchParse2CachedPCRE); +void SearchPhone_CachedPCRE(benchmark::State& state) { + SearchPhone(state, SearchParse2CachedPCRE); } -void SearchPhone_CachedRE2(int i, int n) { - SearchPhone(i, n, SearchParse2CachedRE2); + +void SearchPhone_CachedRE2(benchmark::State& state) { + SearchPhone(state, SearchParse2CachedRE2); } #ifdef USEPCRE @@ -796,40 +794,6 @@ BENCHMARK_RANGE(SearchPhone_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); /* TODO(rsc): Make this work again. - -// Generates and returns a string over binary alphabet {0,1} that contains -// all possible binary sequences of length n as subsequences. The obvious -// brute force method would generate a string of length n * 2^n, but this -// generates a string of length n + 2^n - 1 called a De Bruijn cycle. -// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17. -static std::string DeBruijnString(int n) { - CHECK_LT(n, 8*sizeof(int)); - CHECK_GT(n, 0); - - std::vector did(1<CompileToProg(0); @@ -898,9 +863,10 @@ void SearchDFA(int iters, const char* regexp, const StringPiece& text, } } -void SearchNFA(int iters, const char* regexp, const StringPiece& text, - Prog::Anchor anchor, bool expect_match) { - for (int i = 0; i < iters; i++) { +void SearchNFA(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); @@ -913,9 +879,10 @@ void SearchNFA(int iters, const char* regexp, const StringPiece& text, } } -void SearchOnePass(int iters, const char* regexp, const StringPiece& text, - Prog::Anchor anchor, bool expect_match) { - for (int i = 0; i < iters; i++) { +void SearchOnePass(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); @@ -928,9 +895,10 @@ void SearchOnePass(int iters, const char* regexp, const StringPiece& text, } } -void SearchBitState(int iters, const char* regexp, const StringPiece& text, - Prog::Anchor anchor, bool expect_match) { - for (int i = 0; i < iters; i++) { +void SearchBitState(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); @@ -943,9 +911,10 @@ void SearchBitState(int iters, const char* regexp, const StringPiece& text, } } -void SearchPCRE(int iters, const char* regexp, const StringPiece& text, - Prog::Anchor anchor, bool expect_match) { - for (int i = 0; i < iters; i++) { +void SearchPCRE(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + for (auto _ : state) { PCRE re(regexp, PCRE::UTF8); CHECK_EQ(re.error(), ""); if (anchor == Prog::kAnchored) @@ -955,9 +924,10 @@ void SearchPCRE(int iters, const char* regexp, const StringPiece& text, } } -void SearchRE2(int iters, const char* regexp, const StringPiece& text, - Prog::Anchor anchor, bool expect_match) { - for (int i = 0; i < iters; i++) { +void SearchRE2(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + for (auto _ : state) { RE2 re(regexp); CHECK_EQ(re.error(), ""); if (anchor == Prog::kAnchored) @@ -971,71 +941,99 @@ void SearchRE2(int iters, const char* regexp, const StringPiece& text, // regexp parsing and compiling once. This lets us measure // search time without the per-regexp overhead. -void SearchCachedDFA(int iters, const char* regexp, const StringPiece& text, - Prog::Anchor anchor, bool expect_match) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(1LL<<31); - CHECK(prog); - for (int i = 0; i < iters; i++) { +Prog* GetCachedProg(const char* regexp) { + static auto& mutex = *new Mutex; + MutexLock lock(&mutex); + static auto& cache = *new std::unordered_map; + Prog* prog = cache[regexp]; + if (prog == NULL) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + prog = re->CompileToProg(int64_t{1}<<31); // mostly for the DFA + CHECK(prog); + cache[regexp] = prog; + re->Decref(); + } + return prog; +} + +PCRE* GetCachedPCRE(const char* regexp) { + static auto& mutex = *new Mutex; + MutexLock lock(&mutex); + static auto& cache = *new std::unordered_map; + PCRE* re = cache[regexp]; + if (re == NULL) { + re = new PCRE(regexp, PCRE::UTF8); + CHECK_EQ(re->error(), ""); + cache[regexp] = re; + } + return re; +} + +RE2* GetCachedRE2(const char* regexp) { + static auto& mutex = *new Mutex; + MutexLock lock(&mutex); + static auto& cache = *new std::unordered_map; + RE2* re = cache[regexp]; + if (re == NULL) { + re = new RE2(regexp); + CHECK_EQ(re->error(), ""); + cache[regexp] = re; + } + return re; +} + +void SearchCachedDFA(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + Prog* prog = GetCachedProg(regexp); + for (auto _ : state) { bool failed = false; CHECK_EQ(prog->SearchDFA(text, StringPiece(), anchor, Prog::kFirstMatch, NULL, &failed, NULL), expect_match); CHECK(!failed); } - delete prog; - re->Decref(); } -void SearchCachedNFA(int iters, const char* regexp, const StringPiece& text, - Prog::Anchor anchor, bool expect_match) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); - for (int i = 0; i < iters; i++) { +void SearchCachedNFA(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + Prog* prog = GetCachedProg(regexp); + for (auto _ : state) { CHECK_EQ(prog->SearchNFA(text, StringPiece(), anchor, Prog::kFirstMatch, NULL, 0), expect_match); } - delete prog; - re->Decref(); } -void SearchCachedOnePass(int iters, const char* regexp, const StringPiece& text, - Prog::Anchor anchor, bool expect_match) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); +void SearchCachedOnePass(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + Prog* prog = GetCachedProg(regexp); CHECK(prog->IsOnePass()); - for (int i = 0; i < iters; i++) + for (auto _ : state) { CHECK_EQ(prog->SearchOnePass(text, text, anchor, Prog::kFirstMatch, NULL, 0), expect_match); - delete prog; - re->Decref(); + } } -void SearchCachedBitState(int iters, const char* regexp, const StringPiece& text, - Prog::Anchor anchor, bool expect_match) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); +void SearchCachedBitState(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + Prog* prog = GetCachedProg(regexp); CHECK(prog->CanBitState()); - for (int i = 0; i < iters; i++) + for (auto _ : state) { CHECK_EQ(prog->SearchBitState(text, text, anchor, Prog::kFirstMatch, NULL, 0), expect_match); - delete prog; - re->Decref(); + } } -void SearchCachedPCRE(int iters, const char* regexp, const StringPiece& text, - Prog::Anchor anchor, bool expect_match) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); - for (int i = 0; i < iters; i++) { +void SearchCachedPCRE(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + PCRE& re = *GetCachedPCRE(regexp); + for (auto _ : state) { if (anchor == Prog::kAnchored) CHECK_EQ(PCRE::FullMatch(text, re), expect_match); else @@ -1043,11 +1041,11 @@ void SearchCachedPCRE(int iters, const char* regexp, const StringPiece& text, } } -void SearchCachedRE2(int iters, const char* regexp, const StringPiece& text, - Prog::Anchor anchor, bool expect_match) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); - for (int i = 0; i < iters; i++) { +void SearchCachedRE2(benchmark::State& state, const char* regexp, + const StringPiece& text, Prog::Anchor anchor, + bool expect_match) { + RE2& re = *GetCachedRE2(regexp); + for (auto _ : state) { if (anchor == Prog::kAnchored) CHECK_EQ(RE2::FullMatch(text, re), expect_match); else @@ -1055,12 +1053,12 @@ void SearchCachedRE2(int iters, const char* regexp, const StringPiece& text, } } - // Runs implementation to full match regexp against text, // extracting three submatches. Expects match always. -void Parse3NFA(int iters, const char* regexp, const StringPiece& text) { - for (int i = 0; i < iters; i++) { +void Parse3NFA(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); @@ -1073,8 +1071,9 @@ void Parse3NFA(int iters, const char* regexp, const StringPiece& text) { } } -void Parse3OnePass(int iters, const char* regexp, const StringPiece& text) { - for (int i = 0; i < iters; i++) { +void Parse3OnePass(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); @@ -1087,8 +1086,9 @@ void Parse3OnePass(int iters, const char* regexp, const StringPiece& text) { } } -void Parse3BitState(int iters, const char* regexp, const StringPiece& text) { - for (int i = 0; i < iters; i++) { +void Parse3BitState(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); @@ -1101,8 +1101,9 @@ void Parse3BitState(int iters, const char* regexp, const StringPiece& text) { } } -void Parse3Backtrack(int iters, const char* regexp, const StringPiece& text) { - for (int i = 0; i < iters; i++) { +void Parse3Backtrack(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); @@ -1114,8 +1115,9 @@ void Parse3Backtrack(int iters, const char* regexp, const StringPiece& text) { } } -void Parse3PCRE(int iters, const char* regexp, const StringPiece& text) { - for (int i = 0; i < iters; i++) { +void Parse3PCRE(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { PCRE re(regexp, PCRE::UTF8); CHECK_EQ(re.error(), ""); StringPiece sp1, sp2, sp3; @@ -1123,8 +1125,9 @@ void Parse3PCRE(int iters, const char* regexp, const StringPiece& text) { } } -void Parse3RE2(int iters, const char* regexp, const StringPiece& text) { - for (int i = 0; i < iters; i++) { +void Parse3RE2(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { RE2 re(regexp); CHECK_EQ(re.error(), ""); StringPiece sp1, sp2, sp3; @@ -1132,82 +1135,69 @@ void Parse3RE2(int iters, const char* regexp, const StringPiece& text) { } } -void Parse3CachedNFA(int iters, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); +void Parse3CachedNFA(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); StringPiece sp[4]; // 4 because sp[0] is whole match. - for (int i = 0; i < iters; i++) { + for (auto _ : state) { CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored, Prog::kFullMatch, sp, 4)); } - delete prog; - re->Decref(); } -void Parse3CachedOnePass(int iters, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); +void Parse3CachedOnePass(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); CHECK(prog->IsOnePass()); StringPiece sp[4]; // 4 because sp[0] is whole match. - for (int i = 0; i < iters; i++) + for (auto _ : state) { CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); - delete prog; - re->Decref(); + } } -void Parse3CachedBitState(int iters, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); +void Parse3CachedBitState(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); CHECK(prog->CanBitState()); StringPiece sp[4]; // 4 because sp[0] is whole match. - for (int i = 0; i < iters; i++) + for (auto _ : state) { CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); - delete prog; - re->Decref(); + } } -void Parse3CachedBacktrack(int iters, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); +void Parse3CachedBacktrack(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); StringPiece sp[4]; // 4 because sp[0] is whole match. - for (int i = 0; i < iters; i++) + for (auto _ : state) { CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); - delete prog; - re->Decref(); + } } -void Parse3CachedPCRE(int iters, const char* regexp, const StringPiece& text) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); +void Parse3CachedPCRE(benchmark::State& state, const char* regexp, + const StringPiece& text) { + PCRE& re = *GetCachedPCRE(regexp); StringPiece sp1, sp2, sp3; - for (int i = 0; i < iters; i++) { + for (auto _ : state) { CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3)); } } -void Parse3CachedRE2(int iters, const char* regexp, const StringPiece& text) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); +void Parse3CachedRE2(benchmark::State& state, const char* regexp, + const StringPiece& text) { + RE2& re = *GetCachedRE2(regexp); StringPiece sp1, sp2, sp3; - for (int i = 0; i < iters; i++) { + for (auto _ : state) { CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3)); } } - // Runs implementation to full match regexp against text, // extracting three submatches. Expects match always. -void Parse1NFA(int iters, const char* regexp, const StringPiece& text) { - for (int i = 0; i < iters; i++) { +void Parse1NFA(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); @@ -1220,8 +1210,9 @@ void Parse1NFA(int iters, const char* regexp, const StringPiece& text) { } } -void Parse1OnePass(int iters, const char* regexp, const StringPiece& text) { - for (int i = 0; i < iters; i++) { +void Parse1OnePass(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); @@ -1234,8 +1225,9 @@ void Parse1OnePass(int iters, const char* regexp, const StringPiece& text) { } } -void Parse1BitState(int iters, const char* regexp, const StringPiece& text) { - for (int i = 0; i < iters; i++) { +void Parse1BitState(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); @@ -1248,8 +1240,9 @@ void Parse1BitState(int iters, const char* regexp, const StringPiece& text) { } } -void Parse1PCRE(int iters, const char* regexp, const StringPiece& text) { - for (int i = 0; i < iters; i++) { +void Parse1PCRE(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { PCRE re(regexp, PCRE::UTF8); CHECK_EQ(re.error(), ""); StringPiece sp1; @@ -1257,8 +1250,9 @@ void Parse1PCRE(int iters, const char* regexp, const StringPiece& text) { } } -void Parse1RE2(int iters, const char* regexp, const StringPiece& text) { - for (int i = 0; i < iters; i++) { +void Parse1RE2(benchmark::State& state, const char* regexp, + const StringPiece& text) { + for (auto _ : state) { RE2 re(regexp); CHECK_EQ(re.error(), ""); StringPiece sp1; @@ -1266,126 +1260,109 @@ void Parse1RE2(int iters, const char* regexp, const StringPiece& text) { } } -void Parse1CachedNFA(int iters, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); +void Parse1CachedNFA(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); StringPiece sp[2]; // 2 because sp[0] is whole match. - for (int i = 0; i < iters; i++) { + for (auto _ : state) { CHECK(prog->SearchNFA(text, StringPiece(), Prog::kAnchored, Prog::kFullMatch, sp, 2)); } - delete prog; - re->Decref(); } -void Parse1CachedOnePass(int iters, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); +void Parse1CachedOnePass(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); CHECK(prog->IsOnePass()); StringPiece sp[2]; // 2 because sp[0] is whole match. - for (int i = 0; i < iters; i++) + for (auto _ : state) { CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); - delete prog; - re->Decref(); + } } -void Parse1CachedBitState(int iters, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); +void Parse1CachedBitState(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); CHECK(prog->CanBitState()); StringPiece sp[2]; // 2 because sp[0] is whole match. - for (int i = 0; i < iters; i++) + for (auto _ : state) { CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); - delete prog; - re->Decref(); + } } -void Parse1CachedBacktrack(int iters, const char* regexp, const StringPiece& text) { - Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); - CHECK(re); - Prog* prog = re->CompileToProg(0); - CHECK(prog); +void Parse1CachedBacktrack(benchmark::State& state, const char* regexp, + const StringPiece& text) { + Prog* prog = GetCachedProg(regexp); StringPiece sp[2]; // 2 because sp[0] is whole match. - for (int i = 0; i < iters; i++) + for (auto _ : state) { CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); - delete prog; - re->Decref(); + } } -void Parse1CachedPCRE(int iters, const char* regexp, const StringPiece& text) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); +void Parse1CachedPCRE(benchmark::State& state, const char* regexp, + const StringPiece& text) { + PCRE& re = *GetCachedPCRE(regexp); StringPiece sp1; - for (int i = 0; i < iters; i++) { + for (auto _ : state) { CHECK(PCRE::FullMatch(text, re, &sp1)); } } -void Parse1CachedRE2(int iters, const char* regexp, const StringPiece& text) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); +void Parse1CachedRE2(benchmark::State& state, const char* regexp, + const StringPiece& text) { + RE2& re = *GetCachedRE2(regexp); StringPiece sp1; - for (int i = 0; i < iters; i++) { + for (auto _ : state) { CHECK(RE2::FullMatch(text, re, &sp1)); } } -void SearchParse2CachedPCRE(int iters, const char* regexp, +void SearchParse2CachedPCRE(benchmark::State& state, const char* regexp, const StringPiece& text) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); - for (int i = 0; i < iters; i++) { + PCRE& re = *GetCachedPCRE(regexp); + for (auto _ : state) { StringPiece sp1, sp2; CHECK(PCRE::PartialMatch(text, re, &sp1, &sp2)); } } -void SearchParse2CachedRE2(int iters, const char* regexp, +void SearchParse2CachedRE2(benchmark::State& state, const char* regexp, const StringPiece& text) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); - for (int i = 0; i < iters; i++) { + RE2& re = *GetCachedRE2(regexp); + for (auto _ : state) { StringPiece sp1, sp2; CHECK(RE2::PartialMatch(text, re, &sp1, &sp2)); } } -void SearchParse1CachedPCRE(int iters, const char* regexp, +void SearchParse1CachedPCRE(benchmark::State& state, const char* regexp, const StringPiece& text) { - PCRE re(regexp, PCRE::UTF8); - CHECK_EQ(re.error(), ""); - for (int i = 0; i < iters; i++) { + PCRE& re = *GetCachedPCRE(regexp); + for (auto _ : state) { StringPiece sp1; CHECK(PCRE::PartialMatch(text, re, &sp1)); } } -void SearchParse1CachedRE2(int iters, const char* regexp, +void SearchParse1CachedRE2(benchmark::State& state, const char* regexp, const StringPiece& text) { - RE2 re(regexp); - CHECK_EQ(re.error(), ""); - for (int i = 0; i < iters; i++) { + RE2& re = *GetCachedRE2(regexp); + for (auto _ : state) { StringPiece sp1; CHECK(RE2::PartialMatch(text, re, &sp1)); } } -void EmptyPartialMatchPCRE(int n) { +void EmptyPartialMatchPCRE(benchmark::State& state) { PCRE re(""); - for (int i = 0; i < n; i++) { + for (auto _ : state) { PCRE::PartialMatch("", re); } } -void EmptyPartialMatchRE2(int n) { +void EmptyPartialMatchRE2(benchmark::State& state) { RE2 re(""); - for (int i = 0; i < n; i++) { + for (auto _ : state) { RE2::PartialMatch("", re); } } @@ -1394,16 +1371,16 @@ BENCHMARK(EmptyPartialMatchPCRE)->ThreadRange(1, NumCPUs()); #endif BENCHMARK(EmptyPartialMatchRE2)->ThreadRange(1, NumCPUs()); -void SimplePartialMatchPCRE(int n) { +void SimplePartialMatchPCRE(benchmark::State& state) { PCRE re("abcdefg"); - for (int i = 0; i < n; i++) { + for (auto _ : state) { PCRE::PartialMatch("abcdefg", re); } } -void SimplePartialMatchRE2(int n) { +void SimplePartialMatchRE2(benchmark::State& state) { RE2 re("abcdefg"); - for (int i = 0; i < n; i++) { + for (auto _ : state) { RE2::PartialMatch("abcdefg", re); } } @@ -1416,18 +1393,18 @@ static std::string http_text = "GET /asdfhjasdhfasdlfhasdflkjasdfkljasdhflaskdjhf" "alksdjfhasdlkfhasdlkjfhasdljkfhadsjklf HTTP/1.1"; -void HTTPPartialMatchPCRE(int n) { +void HTTPPartialMatchPCRE(benchmark::State& state) { StringPiece a; PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); - for (int i = 0; i < n; i++) { + for (auto _ : state) { PCRE::PartialMatch(http_text, re, &a); } } -void HTTPPartialMatchRE2(int n) { +void HTTPPartialMatchRE2(benchmark::State& state) { StringPiece a; RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); - for (int i = 0; i < n; i++) { + for (auto _ : state) { RE2::PartialMatch(http_text, re, &a); } } @@ -1440,18 +1417,18 @@ BENCHMARK(HTTPPartialMatchRE2)->ThreadRange(1, NumCPUs()); static std::string smallhttp_text = "GET /abc HTTP/1.1"; -void SmallHTTPPartialMatchPCRE(int n) { +void SmallHTTPPartialMatchPCRE(benchmark::State& state) { StringPiece a; PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); - for (int i = 0; i < n; i++) { + for (auto _ : state) { PCRE::PartialMatch(smallhttp_text, re, &a); } } -void SmallHTTPPartialMatchRE2(int n) { +void SmallHTTPPartialMatchRE2(benchmark::State& state) { StringPiece a; RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); - for (int i = 0; i < n; i++) { + for (auto _ : state) { RE2::PartialMatch(smallhttp_text, re, &a); } } @@ -1461,18 +1438,18 @@ BENCHMARK(SmallHTTPPartialMatchPCRE)->ThreadRange(1, NumCPUs()); #endif BENCHMARK(SmallHTTPPartialMatchRE2)->ThreadRange(1, NumCPUs()); -void DotMatchPCRE(int n) { +void DotMatchPCRE(benchmark::State& state) { StringPiece a; PCRE re("(?-s)^(.+)"); - for (int i = 0; i < n; i++) { + for (auto _ : state) { PCRE::PartialMatch(http_text, re, &a); } } -void DotMatchRE2(int n) { +void DotMatchRE2(benchmark::State& state) { StringPiece a; RE2 re("(?-s)^(.+)"); - for (int i = 0; i < n; i++) { + for (auto _ : state) { RE2::PartialMatch(http_text, re, &a); } } @@ -1482,18 +1459,18 @@ BENCHMARK(DotMatchPCRE)->ThreadRange(1, NumCPUs()); #endif BENCHMARK(DotMatchRE2)->ThreadRange(1, NumCPUs()); -void ASCIIMatchPCRE(int n) { +void ASCIIMatchPCRE(benchmark::State& state) { StringPiece a; PCRE re("(?-s)^([ -~]+)"); - for (int i = 0; i < n; i++) { + for (auto _ : state) { PCRE::PartialMatch(http_text, re, &a); } } -void ASCIIMatchRE2(int n) { +void ASCIIMatchRE2(benchmark::State& state) { StringPiece a; RE2 re("(?-s)^([ -~]+)"); - for (int i = 0; i < n; i++) { + for (auto _ : state) { RE2::PartialMatch(http_text, re, &a); } } @@ -1503,40 +1480,34 @@ BENCHMARK(ASCIIMatchPCRE)->ThreadRange(1, NumCPUs()); #endif BENCHMARK(ASCIIMatchRE2)->ThreadRange(1, NumCPUs()); -void FullMatchPCRE(int iter, int n, const char *regexp) { - StopBenchmarkTiming(); - std::string s; - MakeText(&s, n); +void FullMatchPCRE(benchmark::State& state, const char *regexp) { + std::string s = RandomText(state.range(0)); s += "ABCDEFGHIJ"; - BenchmarkMemoryUsage(); PCRE re(regexp); - StartBenchmarkTiming(); - for (int i = 0; i < iter; i++) + for (auto _ : state) { CHECK(PCRE::FullMatch(s, re)); - SetBenchmarkBytesProcessed(static_cast(iter)*n); + } + state.SetBytesProcessed(state.iterations() * state.range(0)); } -void FullMatchRE2(int iter, int n, const char *regexp) { - StopBenchmarkTiming(); - std::string s; - MakeText(&s, n); +void FullMatchRE2(benchmark::State& state, const char *regexp) { + std::string s = RandomText(state.range(0)); s += "ABCDEFGHIJ"; - BenchmarkMemoryUsage(); RE2 re(regexp, RE2::Latin1); - StartBenchmarkTiming(); - for (int i = 0; i < iter; i++) + for (auto _ : state) { CHECK(RE2::FullMatch(s, re)); - SetBenchmarkBytesProcessed(static_cast(iter)*n); + } + state.SetBytesProcessed(state.iterations() * state.range(0)); } -void FullMatch_DotStar_CachedPCRE(int i, int n) { FullMatchPCRE(i, n, "(?s).*"); } -void FullMatch_DotStar_CachedRE2(int i, int n) { FullMatchRE2(i, n, "(?s).*"); } +void FullMatch_DotStar_CachedPCRE(benchmark::State& state) { FullMatchPCRE(state, "(?s).*"); } +void FullMatch_DotStar_CachedRE2(benchmark::State& state) { FullMatchRE2(state, "(?s).*"); } -void FullMatch_DotStarDollar_CachedPCRE(int i, int n) { FullMatchPCRE(i, n, "(?s).*$"); } -void FullMatch_DotStarDollar_CachedRE2(int i, int n) { FullMatchRE2(i, n, "(?s).*$"); } +void FullMatch_DotStarDollar_CachedPCRE(benchmark::State& state) { FullMatchPCRE(state, "(?s).*$"); } +void FullMatch_DotStarDollar_CachedRE2(benchmark::State& state) { FullMatchRE2(state, "(?s).*$"); } -void FullMatch_DotStarCapture_CachedPCRE(int i, int n) { FullMatchPCRE(i, n, "(?s)((.*)()()($))"); } -void FullMatch_DotStarCapture_CachedRE2(int i, int n) { FullMatchRE2(i, n, "(?s)((.*)()()($))"); } +void FullMatch_DotStarCapture_CachedPCRE(benchmark::State& state) { FullMatchPCRE(state, "(?s)((.*)()()($))"); } +void FullMatch_DotStarCapture_CachedRE2(benchmark::State& state) { FullMatchRE2(state, "(?s)((.*)()()($))"); } #ifdef USEPCRE BENCHMARK_RANGE(FullMatch_DotStar_CachedPCRE, 8, 2<<20); @@ -1553,29 +1524,27 @@ BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedPCRE, 8, 2<<20); #endif BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedRE2, 8, 2<<20); -void PossibleMatchRangeCommon(int iter, const char* regexp) { - StopBenchmarkTiming(); +void PossibleMatchRangeCommon(benchmark::State& state, const char* regexp) { RE2 re(regexp); - StartBenchmarkTiming(); std::string min; std::string max; const int kMaxLen = 16; - for (int i = 0; i < iter; i++) { + for (auto _ : state) { CHECK(re.PossibleMatchRange(&min, &max, kMaxLen)); } } -void PossibleMatchRange_Trivial(int i) { - PossibleMatchRangeCommon(i, ".*"); +void PossibleMatchRange_Trivial(benchmark::State& state) { + PossibleMatchRangeCommon(state, ".*"); } -void PossibleMatchRange_Complex(int i) { - PossibleMatchRangeCommon(i, "^abc[def]?[gh]{1,2}.*"); +void PossibleMatchRange_Complex(benchmark::State& state) { + PossibleMatchRangeCommon(state, "^abc[def]?[gh]{1,2}.*"); } -void PossibleMatchRange_Prefix(int i) { - PossibleMatchRangeCommon(i, "^some_random_prefix.*"); +void PossibleMatchRange_Prefix(benchmark::State& state) { + PossibleMatchRangeCommon(state, "^some_random_prefix.*"); } -void PossibleMatchRange_NoProg(int i) { - PossibleMatchRangeCommon(i, "^some_random_string$"); +void PossibleMatchRange_NoProg(benchmark::State& state) { + PossibleMatchRangeCommon(state, "^some_random_string$"); } BENCHMARK(PossibleMatchRange_Trivial); diff --git a/extern/re2/re2/testing/regexp_generator.cc b/extern/re2/re2/testing/regexp_generator.cc index 1e4d3da990..3eeda25e3e 100644 --- a/extern/re2/re2/testing/regexp_generator.cc +++ b/extern/re2/re2/testing/regexp_generator.cc @@ -241,7 +241,7 @@ void RegexpGenerator::RunPostfix(const std::vector& post) { std::vector Explode(const StringPiece& s) { std::vector v; - for (const char *q = s.begin(); q < s.end(); ) { + for (const char *q = s.data(); q < s.data() + s.size(); ) { const char* p = q; Rune r; q += chartorune(&r, q); @@ -256,11 +256,11 @@ std::vector Explode(const StringPiece& s) { std::vector Split(const StringPiece& sep, const StringPiece& s) { std::vector v; - if (sep.size() == 0) + if (sep.empty()) return Explode(s); - const char *p = s.begin(); - for (const char *q = s.begin(); q + sep.size() <= s.end(); q++) { + const char *p = s.data(); + for (const char *q = s.data(); q + sep.size() <= s.data() + s.size(); q++) { if (StringPiece(q, sep.size()) == sep) { v.push_back(std::string(p, q - p)); p = q + sep.size(); @@ -268,8 +268,8 @@ std::vector Split(const StringPiece& sep, const StringPiece& s) { continue; } } - if (p < s.end()) - v.push_back(std::string(p, s.end() - p)); + if (p < s.data() + s.size()) + v.push_back(std::string(p, s.data() + s.size() - p)); return v; } diff --git a/extern/re2/re2/testing/required_prefix_test.cc b/extern/re2/re2/testing/required_prefix_test.cc index 54600456a8..023d24249e 100644 --- a/extern/re2/re2/testing/required_prefix_test.cc +++ b/extern/re2/re2/testing/required_prefix_test.cc @@ -6,6 +6,7 @@ #include "util/test.h" #include "util/logging.h" +#include "re2/prog.h" #include "re2/regexp.h" namespace re2 { @@ -19,15 +20,18 @@ struct PrefixTest { }; static PrefixTest tests[] = { - // If the regexp is missing a ^, there's no required prefix. - { "abc", false }, + // Empty cases. { "", false }, { "(?m)^", false }, + { "(?-m)^", false }, + + // If the regexp has no ^, there's no required prefix. + { "abc", false }, // If the regexp immediately goes into // something not a literal match, there's no required prefix. - { "^(abc)", false }, { "^a*", false }, + { "^(abc)", false }, // Otherwise, it should work. { "^abc$", true, "abc", false, "(?-m:$)" }, @@ -53,15 +57,15 @@ TEST(RequiredPrefix, SimpleTests) { bool f; Regexp* s; ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf") + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8") << " " << re->Dump(); if (t.return_value) { ASSERT_EQ(p, std::string(t.prefix)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf"); + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); ASSERT_EQ(f, t.foldcase) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf"); + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); ASSERT_EQ(s->ToString(), std::string(t.suffix)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf"); + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); s->Decref(); } re->Decref(); @@ -69,4 +73,81 @@ TEST(RequiredPrefix, SimpleTests) { } } +static PrefixTest for_accel_tests[] = { + // Empty cases. + { "", false }, + { "(?m)^", false }, + { "(?-m)^", false }, + + // If the regexp has a ^, there's no required prefix. + { "^abc", false }, + + // If the regexp immediately goes into + // something not a literal match, there's no required prefix. + { "a*", false }, + + // Unlike RequiredPrefix(), RequiredPrefixForAccel() can "see through" + // capturing groups, but doesn't try to glue prefix fragments together. + { "(a?)def", false }, + { "(ab?)def", true, "a", false }, + { "(abc?)def", true, "ab", false }, + { "(()a)def", false }, + { "((a)b)def", true, "a", false }, + { "((ab)c)def", true, "ab", false }, + + // Otherwise, it should work. + { "abc$", true, "abc", false }, + { "abc", true, "abc", false }, + { "(?i)abc", true, "abc", true }, + { "abcd*", true, "abc", false }, + { "[Aa][Bb]cd*", true, "ab", true }, + { "ab[Cc]d*", true, "ab", false }, + { "☺abc", true, "☺abc", false }, +}; + +TEST(RequiredPrefixForAccel, SimpleTests) { + for (size_t i = 0; i < arraysize(for_accel_tests); i++) { + const PrefixTest& t = for_accel_tests[i]; + for (size_t j = 0; j < 2; j++) { + Regexp::ParseFlags flags = Regexp::LikePerl; + if (j == 0) + flags = flags | Regexp::Latin1; + Regexp* re = Regexp::Parse(t.regexp, flags, NULL); + ASSERT_TRUE(re != NULL) << " " << t.regexp; + + std::string p; + bool f; + ASSERT_EQ(t.return_value, re->RequiredPrefixForAccel(&p, &f)) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8") + << " " << re->Dump(); + if (t.return_value) { + ASSERT_EQ(p, std::string(t.prefix)) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); + ASSERT_EQ(f, t.foldcase) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); + } + re->Decref(); + } + } +} + +TEST(PrefixAccel, BasicTest) { + Regexp* re = Regexp::Parse("abc\\d+", Regexp::LikePerl, NULL); + ASSERT_TRUE(re != NULL); + Prog* prog = re->CompileToProg(0); + ASSERT_TRUE(prog != NULL); + for (int i = 0; i < 100; i++) { + std::string text(i, 'a'); + const char* p = reinterpret_cast( + prog->PrefixAccel(text.data(), text.size())); + EXPECT_TRUE(p == NULL); + text.append("abc"); + p = reinterpret_cast( + prog->PrefixAccel(text.data(), text.size())); + EXPECT_EQ(i, p-text.data()); + } + delete prog; + re->Decref(); +} + } // namespace re2 diff --git a/extern/re2/re2/testing/set_test.cc b/extern/re2/re2/testing/set_test.cc index 61d1cf295f..5a760c4b5e 100644 --- a/extern/re2/re2/testing/set_test.cc +++ b/extern/re2/re2/testing/set_test.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include "util/test.h" #include "util/logging.h" @@ -201,4 +202,29 @@ TEST(Set, Prefix) { ASSERT_EQ(v[0], 0); } +TEST(Set, MoveSemantics) { + RE2::Set s1(RE2::DefaultOptions, RE2::UNANCHORED); + ASSERT_EQ(s1.Add("foo\\d+", NULL), 0); + ASSERT_EQ(s1.Compile(), true); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false); + + // The moved-to object should do what the moved-from object did. + RE2::Set s2 = std::move(s1); + ASSERT_EQ(s2.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s2.Match("abc bar2 xyz", NULL), false); + + // The moved-from object should have been reset and be reusable. + ASSERT_EQ(s1.Add("bar\\d+", NULL), 0); + ASSERT_EQ(s1.Compile(), true); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), false); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), true); + + // Verify that "overwriting" works and also doesn't leak memory. + // (The latter will need a leak detector such as LeakSanitizer.) + s1 = std::move(s2); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false); +} + } // namespace re2 diff --git a/extern/re2/re2/testing/string_generator.cc b/extern/re2/re2/testing/string_generator.cc index 030cc457e1..44837fe90e 100644 --- a/extern/re2/re2/testing/string_generator.cc +++ b/extern/re2/re2/testing/string_generator.cc @@ -111,4 +111,31 @@ void StringGenerator::GenerateNULL() { hasnext_ = true; } +std::string DeBruijnString(int n) { + CHECK_GE(n, 1); + CHECK_LE(n, 29); + const size_t size = size_t{1} << static_cast(n); + const size_t mask = size - 1; + std::vector did(size, false); + std::string s; + s.reserve(static_cast(n) + size); + for (size_t i = 0; i < static_cast(n - 1); i++) + s += '0'; + size_t bits = 0; + for (size_t i = 0; i < size; i++) { + bits <<= 1; + bits &= mask; + if (!did[bits | 1]) { + bits |= 1; + s += '1'; + } else { + s += '0'; + } + CHECK(!did[bits]); + did[bits] = true; + } + CHECK_EQ(s.size(), static_cast(n - 1) + size); + return s; +} + } // namespace re2 diff --git a/extern/re2/re2/testing/string_generator.h b/extern/re2/re2/testing/string_generator.h index 6184176523..73fbb51451 100644 --- a/extern/re2/re2/testing/string_generator.h +++ b/extern/re2/re2/testing/string_generator.h @@ -58,6 +58,19 @@ class StringGenerator { StringGenerator& operator=(const StringGenerator&) = delete; }; +// Generates and returns a string over binary alphabet {0,1} that contains +// all possible binary sequences of length n as subsequences. The obvious +// brute force method would generate a string of length n * 2^n, but this +// generates a string of length n-1 + 2^n called a De Bruijn cycle. +// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17. +// +// Such a string is useful for testing a DFA. If you have a DFA +// where distinct last n bytes implies distinct states, then running on a +// DeBruijn string causes the DFA to need to create a new state at every +// position in the input, never reusing any states until it gets to the +// end of the string. This is the worst possible case for DFA execution. +std::string DeBruijnString(int n); + } // namespace re2 #endif // RE2_TESTING_STRING_GENERATOR_H_ diff --git a/extern/re2/re2/testing/tester.cc b/extern/re2/re2/testing/tester.cc index d676d9a74f..d2ec4fb9ea 100644 --- a/extern/re2/re2/testing/tester.cc +++ b/extern/re2/re2/testing/tester.cc @@ -18,14 +18,15 @@ #include "re2/re2.h" #include "re2/regexp.h" -DEFINE_bool(dump_prog, false, "dump regexp program"); -DEFINE_bool(log_okay, false, "log successful runs"); -DEFINE_bool(dump_rprog, false, "dump reversed regexp program"); +DEFINE_FLAG(bool, dump_prog, false, "dump regexp program"); +DEFINE_FLAG(bool, log_okay, false, "log successful runs"); +DEFINE_FLAG(bool, dump_rprog, false, "dump reversed regexp program"); -DEFINE_int32(max_regexp_failures, 100, - "maximum number of regexp test failures (-1 = unlimited)"); +DEFINE_FLAG(int, max_regexp_failures, 100, + "maximum number of regexp test failures (-1 = unlimited)"); -DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test"); +DEFINE_FLAG(std::string, regexp_engines, "", + "pattern to select regexp engines to test"); namespace re2 { @@ -62,11 +63,11 @@ static uint32_t Engines() { if (did_parse) return cached_engines; - if (FLAGS_regexp_engines.empty()) { + if (GetFlag(FLAGS_regexp_engines).empty()) { cached_engines = ~0; } else { for (Engine i = static_cast(0); i < kEngineMax; i++) - if (FLAGS_regexp_engines.find(EngineName(i)) != std::string::npos) + if (GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos) cached_engines |= 1<= 0x80) bytes. @@ -198,7 +214,7 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, error_ = true; return; } - if (FLAGS_dump_prog) { + if (GetFlag(FLAGS_dump_prog)) { LOG(INFO) << "Prog for " << " regexp " << CEscape(regexp_str_) @@ -216,7 +232,7 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, error_ = true; return; } - if (FLAGS_dump_rprog) + if (GetFlag(FLAGS_dump_rprog)) LOG(INFO) << rprog_->Dump(); } @@ -290,9 +306,6 @@ void TestInstance::RunSearch(Engine type, const StringPiece& orig_context, Prog::Anchor anchor, Result* result) { - // Result is not trivial, so we cannot freely clear it with memset(3), - // but zeroing objects like so is safe and expedient for our purposes. - memset(reinterpret_cast(result), 0, sizeof *result); if (regexp_ == NULL) { result->skipped = true; return; @@ -476,7 +489,7 @@ void TestInstance::RunSearch(Engine type, } if (!result->matched) - memset(result->submatch, 0, sizeof result->submatch); + result->ClearSubmatch(); } // Checks whether r is okay given that correct is the right answer. @@ -489,7 +502,7 @@ static bool ResultOkay(const Result& r, const Result& correct) { return false; if (r.have_submatch || r.have_submatch0) { for (int i = 0; i < kMaxSubmatch; i++) { - if (correct.submatch[i].begin() != r.submatch[i].begin() || + if (correct.submatch[i].data() != r.submatch[i].data() || correct.submatch[i].size() != r.submatch[i].size()) return false; if (!r.have_submatch) @@ -528,7 +541,7 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, Result r; RunSearch(i, text, context, anchor, &r); if (ResultOkay(r, correct)) { - if (FLAGS_log_okay) + if (GetFlag(FLAGS_log_okay)) LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor); continue; } @@ -555,8 +568,8 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, } } for (int i = 0; i < 1+num_captures_; i++) { - if (r.submatch[i].begin() != correct.submatch[i].begin() || - r.submatch[i].end() != correct.submatch[i].end()) { + if (r.submatch[i].data() != correct.submatch[i].data() || + r.submatch[i].size() != correct.submatch[i].size()) { LOG(INFO) << StringPrintf(" $%d: should be %s is %s", i, @@ -571,7 +584,10 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, } if (!all_okay) { - if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0) + // This will be initialised once (after flags have been initialised) + // and that is desirable because we want to enforce a global limit. + static int max_regexp_failures = GetFlag(FLAGS_max_regexp_failures); + if (max_regexp_failures > 0 && --max_regexp_failures == 0) LOG(QFATAL) << "Too many regexp failures."; } @@ -640,7 +656,7 @@ static Prog::Anchor anchors[] = { bool Tester::TestInput(const StringPiece& text) { bool okay = TestInputInContext(text, text); - if (text.size() > 0) { + if (!text.empty()) { StringPiece sp; sp = text; sp.remove_prefix(1); diff --git a/extern/re2/re2/unicode.py b/extern/re2/re2/unicode.py index 56ca8119c6..e0f33ef661 100644 --- a/extern/re2/re2/unicode.py +++ b/extern/re2/re2/unicode.py @@ -13,7 +13,7 @@ import re from six.moves import urllib # Directory or URL where Unicode tables reside. -_UNICODE_DIR = "https://www.unicode.org/Public/12.1.0/ucd" +_UNICODE_DIR = "https://www.unicode.org/Public/13.0.0/ucd" # Largest valid Unicode code value. _RUNE_MAX = 0x10FFFF diff --git a/extern/re2/re2/unicode_casefold.cc b/extern/re2/re2/unicode_casefold.cc index 4ea2533eda..8424107814 100644 --- a/extern/re2/re2/unicode_casefold.cc +++ b/extern/re2/re2/unicode_casefold.cc @@ -7,7 +7,7 @@ namespace re2 { -// 1381 groups, 2792 pairs, 356 ranges +// 1384 groups, 2798 pairs, 358 ranges const CaseFold unicode_casefold[] = { { 65, 90, 32 }, { 97, 106, -32 }, @@ -349,6 +349,8 @@ const CaseFold unicode_casefold[] = { { 42948, 42948, -48 }, { 42949, 42949, -42307 }, { 42950, 42950, -35384 }, + { 42951, 42954, OddEven }, + { 42997, 42998, OddEven }, { 43859, 43859, -928 }, { 43888, 43967, -38864 }, { 65313, 65338, 32 }, @@ -366,9 +368,9 @@ const CaseFold unicode_casefold[] = { { 125184, 125217, 34 }, { 125218, 125251, -34 }, }; -const int num_unicode_casefold = 356; +const int num_unicode_casefold = 358; -// 1381 groups, 1411 pairs, 198 ranges +// 1384 groups, 1414 pairs, 200 ranges const CaseFold unicode_tolower[] = { { 65, 90, 32 }, { 181, 181, 775 }, @@ -560,6 +562,8 @@ const CaseFold unicode_tolower[] = { { 42948, 42948, -48 }, { 42949, 42949, -42307 }, { 42950, 42950, -35384 }, + { 42951, 42953, OddEvenSkip }, + { 42997, 42997, OddEven }, { 43888, 43967, -38864 }, { 65313, 65338, 32 }, { 66560, 66599, 40 }, @@ -569,7 +573,7 @@ const CaseFold unicode_tolower[] = { { 93760, 93791, 32 }, { 125184, 125217, 34 }, }; -const int num_unicode_tolower = 198; +const int num_unicode_tolower = 200; diff --git a/extern/re2/re2/unicode_groups.cc b/extern/re2/re2/unicode_groups.cc index 63e611658c..7b7a3c6a56 100644 --- a/extern/re2/re2/unicode_groups.cc +++ b/extern/re2/re2/unicode_groups.cc @@ -125,7 +125,7 @@ static const URange16 L_range16[] = { { 2112, 2136 }, { 2144, 2154 }, { 2208, 2228 }, - { 2230, 2237 }, + { 2230, 2247 }, { 2308, 2361 }, { 2365, 2365 }, { 2384, 2384 }, @@ -201,7 +201,7 @@ static const URange16 L_range16[] = { { 3294, 3294 }, { 3296, 3297 }, { 3313, 3314 }, - { 3333, 3340 }, + { 3332, 3340 }, { 3342, 3344 }, { 3346, 3386 }, { 3389, 3389 }, @@ -372,10 +372,10 @@ static const URange16 L_range16[] = { { 12540, 12543 }, { 12549, 12591 }, { 12593, 12686 }, - { 12704, 12730 }, + { 12704, 12735 }, { 12784, 12799 }, - { 13312, 19893 }, - { 19968, 40943 }, + { 13312, 19903 }, + { 19968, 40956 }, { 40960, 42124 }, { 42192, 42237 }, { 42240, 42508 }, @@ -387,8 +387,8 @@ static const URange16 L_range16[] = { { 42775, 42783 }, { 42786, 42888 }, { 42891, 42943 }, - { 42946, 42950 }, - { 42999, 43009 }, + { 42946, 42954 }, + { 42997, 43009 }, { 43011, 43013 }, { 43015, 43018 }, { 43020, 43042 }, @@ -425,7 +425,7 @@ static const URange16 L_range16[] = { { 43808, 43814 }, { 43816, 43822 }, { 43824, 43866 }, - { 43868, 43879 }, + { 43868, 43881 }, { 43888, 44002 }, { 44032, 55203 }, { 55216, 55238 }, @@ -511,15 +511,19 @@ static const URange32 L_range32[] = { { 68736, 68786 }, { 68800, 68850 }, { 68864, 68899 }, + { 69248, 69289 }, + { 69296, 69297 }, { 69376, 69404 }, { 69415, 69415 }, { 69424, 69445 }, + { 69552, 69572 }, { 69600, 69622 }, { 69635, 69687 }, { 69763, 69807 }, { 69840, 69864 }, { 69891, 69926 }, { 69956, 69956 }, + { 69959, 69959 }, { 69968, 70002 }, { 70006, 70006 }, { 70019, 70066 }, @@ -545,7 +549,7 @@ static const URange32 L_range32[] = { { 70493, 70497 }, { 70656, 70708 }, { 70727, 70730 }, - { 70751, 70751 }, + { 70751, 70753 }, { 70784, 70831 }, { 70852, 70853 }, { 70855, 70855 }, @@ -558,7 +562,13 @@ static const URange32 L_range32[] = { { 71424, 71450 }, { 71680, 71723 }, { 71840, 71903 }, - { 71935, 71935 }, + { 71935, 71942 }, + { 71945, 71945 }, + { 71948, 71955 }, + { 71957, 71958 }, + { 71960, 71983 }, + { 71999, 71999 }, + { 72001, 72001 }, { 72096, 72103 }, { 72106, 72144 }, { 72161, 72161 }, @@ -583,6 +593,7 @@ static const URange32 L_range32[] = { { 73066, 73097 }, { 73112, 73112 }, { 73440, 73458 }, + { 73648, 73648 }, { 73728, 74649 }, { 74880, 75075 }, { 77824, 78894 }, @@ -601,7 +612,8 @@ static const URange32 L_range32[] = { { 94176, 94177 }, { 94179, 94179 }, { 94208, 100343 }, - { 100352, 101106 }, + { 100352, 101589 }, + { 101632, 101640 }, { 110592, 110878 }, { 110928, 110930 }, { 110948, 110951 }, @@ -680,12 +692,13 @@ static const URange32 L_range32[] = { { 126625, 126627 }, { 126629, 126633 }, { 126635, 126651 }, - { 131072, 173782 }, + { 131072, 173789 }, { 173824, 177972 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, { 194560, 195101 }, + { 196608, 201546 }, }; static const URange16 Ll_range16[] = { { 97, 122 }, @@ -1289,9 +1302,12 @@ static const URange16 Ll_range16[] = { { 42941, 42941 }, { 42943, 42943 }, { 42947, 42947 }, + { 42952, 42952 }, + { 42954, 42954 }, + { 42998, 42998 }, { 43002, 43002 }, { 43824, 43866 }, - { 43872, 43879 }, + { 43872, 43880 }, { 43888, 43967 }, { 64256, 64262 }, { 64275, 64279 }, @@ -1386,6 +1402,7 @@ static const URange16 Lm_range16[] = { { 43741, 43741 }, { 43763, 43764 }, { 43868, 43871 }, + { 43881, 43881 }, { 65392, 65392 }, { 65438, 65439 }, }; @@ -1422,7 +1439,7 @@ static const URange16 Lo_range16[] = { { 2112, 2136 }, { 2144, 2154 }, { 2208, 2228 }, - { 2230, 2237 }, + { 2230, 2247 }, { 2308, 2361 }, { 2365, 2365 }, { 2384, 2384 }, @@ -1498,7 +1515,7 @@ static const URange16 Lo_range16[] = { { 3294, 3294 }, { 3296, 3297 }, { 3313, 3314 }, - { 3333, 3340 }, + { 3332, 3340 }, { 3342, 3344 }, { 3346, 3386 }, { 3389, 3389 }, @@ -1611,10 +1628,10 @@ static const URange16 Lo_range16[] = { { 12543, 12543 }, { 12549, 12591 }, { 12593, 12686 }, - { 12704, 12730 }, + { 12704, 12735 }, { 12784, 12799 }, - { 13312, 19893 }, - { 19968, 40943 }, + { 13312, 19903 }, + { 19968, 40956 }, { 40960, 40980 }, { 40982, 42124 }, { 42192, 42231 }, @@ -1740,15 +1757,19 @@ static const URange32 Lo_range32[] = { { 68480, 68497 }, { 68608, 68680 }, { 68864, 68899 }, + { 69248, 69289 }, + { 69296, 69297 }, { 69376, 69404 }, { 69415, 69415 }, { 69424, 69445 }, + { 69552, 69572 }, { 69600, 69622 }, { 69635, 69687 }, { 69763, 69807 }, { 69840, 69864 }, { 69891, 69926 }, { 69956, 69956 }, + { 69959, 69959 }, { 69968, 70002 }, { 70006, 70006 }, { 70019, 70066 }, @@ -1774,7 +1795,7 @@ static const URange32 Lo_range32[] = { { 70493, 70497 }, { 70656, 70708 }, { 70727, 70730 }, - { 70751, 70751 }, + { 70751, 70753 }, { 70784, 70831 }, { 70852, 70853 }, { 70855, 70855 }, @@ -1786,7 +1807,13 @@ static const URange32 Lo_range32[] = { { 71352, 71352 }, { 71424, 71450 }, { 71680, 71723 }, - { 71935, 71935 }, + { 71935, 71942 }, + { 71945, 71945 }, + { 71948, 71955 }, + { 71957, 71958 }, + { 71960, 71983 }, + { 71999, 71999 }, + { 72001, 72001 }, { 72096, 72103 }, { 72106, 72144 }, { 72161, 72161 }, @@ -1811,6 +1838,7 @@ static const URange32 Lo_range32[] = { { 73066, 73097 }, { 73112, 73112 }, { 73440, 73458 }, + { 73648, 73648 }, { 73728, 74649 }, { 74880, 75075 }, { 77824, 78894 }, @@ -1824,7 +1852,8 @@ static const URange32 Lo_range32[] = { { 93952, 94026 }, { 94032, 94032 }, { 94208, 100343 }, - { 100352, 101106 }, + { 100352, 101589 }, + { 101632, 101640 }, { 110592, 110878 }, { 110928, 110930 }, { 110948, 110951 }, @@ -1870,12 +1899,13 @@ static const URange32 Lo_range32[] = { { 126625, 126627 }, { 126629, 126633 }, { 126635, 126651 }, - { 131072, 173782 }, + { 131072, 173789 }, { 173824, 177972 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, { 194560, 195101 }, + { 196608, 201546 }, }; static const URange16 Lt_range16[] = { { 453, 453 }, @@ -2487,7 +2517,9 @@ static const URange16 Lu_range16[] = { { 42940, 42940 }, { 42942, 42942 }, { 42946, 42946 }, - { 42948, 42950 }, + { 42948, 42951 }, + { 42953, 42953 }, + { 42997, 42997 }, { 65313, 65338 }, }; static const URange32 Lu_range32[] = { @@ -2588,7 +2620,7 @@ static const URange16 M_range16[] = { { 2878, 2884 }, { 2887, 2888 }, { 2891, 2893 }, - { 2902, 2903 }, + { 2901, 2903 }, { 2914, 2915 }, { 2946, 2946 }, { 3006, 3010 }, @@ -2615,7 +2647,7 @@ static const URange16 M_range16[] = { { 3402, 3405 }, { 3415, 3415 }, { 3426, 3427 }, - { 3458, 3459 }, + { 3457, 3459 }, { 3530, 3530 }, { 3535, 3540 }, { 3542, 3542 }, @@ -2662,7 +2694,7 @@ static const URange16 M_range16[] = { { 6741, 6750 }, { 6752, 6780 }, { 6783, 6783 }, - { 6832, 6846 }, + { 6832, 6848 }, { 6912, 6916 }, { 6964, 6980 }, { 7019, 7027 }, @@ -2691,6 +2723,7 @@ static const URange16 M_range16[] = { { 43014, 43014 }, { 43019, 43019 }, { 43043, 43047 }, + { 43052, 43052 }, { 43136, 43137 }, { 43188, 43205 }, { 43232, 43249 }, @@ -2728,6 +2761,7 @@ static const URange32 M_range32[] = { { 68159, 68159 }, { 68325, 68326 }, { 68900, 68903 }, + { 69291, 69292 }, { 69446, 69456 }, { 69632, 69634 }, { 69688, 69702 }, @@ -2740,6 +2774,7 @@ static const URange32 M_range32[] = { { 70016, 70018 }, { 70067, 70080 }, { 70089, 70092 }, + { 70094, 70095 }, { 70188, 70199 }, { 70206, 70206 }, { 70367, 70378 }, @@ -2762,6 +2797,11 @@ static const URange32 M_range32[] = { { 71339, 71351 }, { 71453, 71467 }, { 71724, 71738 }, + { 71984, 71989 }, + { 71991, 71992 }, + { 71995, 71998 }, + { 72000, 72000 }, + { 72002, 72003 }, { 72145, 72151 }, { 72154, 72160 }, { 72164, 72164 }, @@ -2789,6 +2829,8 @@ static const URange32 M_range32[] = { { 94031, 94031 }, { 94033, 94087 }, { 94095, 94098 }, + { 94180, 94180 }, + { 94192, 94193 }, { 113821, 113822 }, { 119141, 119145 }, { 119149, 119154 }, @@ -2935,6 +2977,7 @@ static const URange32 Mc_range32[] = { { 70018, 70018 }, { 70067, 70069 }, { 70079, 70080 }, + { 70094, 70094 }, { 70188, 70190 }, { 70194, 70195 }, { 70197, 70197 }, @@ -2966,6 +3009,11 @@ static const URange32 Mc_range32[] = { { 71462, 71462 }, { 71724, 71726 }, { 71736, 71736 }, + { 71984, 71989 }, + { 71991, 71992 }, + { 71997, 71997 }, + { 72000, 72000 }, + { 72002, 72002 }, { 72145, 72147 }, { 72156, 72159 }, { 72164, 72164 }, @@ -2982,6 +3030,7 @@ static const URange32 Mc_range32[] = { { 73110, 73110 }, { 73461, 73462 }, { 94033, 94087 }, + { 94192, 94193 }, { 119141, 119142 }, { 119149, 119154 }, }; @@ -3051,7 +3100,7 @@ static const URange16 Mn_range16[] = { { 2879, 2879 }, { 2881, 2884 }, { 2893, 2893 }, - { 2902, 2902 }, + { 2901, 2902 }, { 2914, 2915 }, { 2946, 2946 }, { 3008, 3008 }, @@ -3074,6 +3123,7 @@ static const URange16 Mn_range16[] = { { 3393, 3396 }, { 3405, 3405 }, { 3426, 3427 }, + { 3457, 3457 }, { 3530, 3530 }, { 3538, 3540 }, { 3542, 3542 }, @@ -3131,6 +3181,7 @@ static const URange16 Mn_range16[] = { { 6771, 6780 }, { 6783, 6783 }, { 6832, 6845 }, + { 6847, 6848 }, { 6912, 6915 }, { 6964, 6964 }, { 6966, 6970 }, @@ -3171,6 +3222,7 @@ static const URange16 Mn_range16[] = { { 43014, 43014 }, { 43019, 43019 }, { 43045, 43046 }, + { 43052, 43052 }, { 43204, 43205 }, { 43232, 43249 }, { 43263, 43263 }, @@ -3212,6 +3264,7 @@ static const URange32 Mn_range32[] = { { 68159, 68159 }, { 68325, 68326 }, { 68900, 68903 }, + { 69291, 69292 }, { 69446, 69456 }, { 69633, 69633 }, { 69688, 69702 }, @@ -3225,6 +3278,7 @@ static const URange32 Mn_range32[] = { { 70016, 70017 }, { 70070, 70078 }, { 70089, 70092 }, + { 70095, 70095 }, { 70191, 70193 }, { 70196, 70196 }, { 70198, 70199 }, @@ -3260,6 +3314,9 @@ static const URange32 Mn_range32[] = { { 71463, 71467 }, { 71727, 71735 }, { 71737, 71738 }, + { 71995, 71996 }, + { 71998, 71998 }, + { 72003, 72003 }, { 72148, 72151 }, { 72154, 72155 }, { 72160, 72160 }, @@ -3291,6 +3348,7 @@ static const URange32 Mn_range32[] = { { 92976, 92982 }, { 94031, 94031 }, { 94095, 94098 }, + { 94180, 94180 }, { 113821, 113822 }, { 119143, 119145 }, { 119163, 119170 }, @@ -3413,6 +3471,7 @@ static const URange32 N_range32[] = { { 69216, 69246 }, { 69405, 69414 }, { 69457, 69460 }, + { 69573, 69579 }, { 69714, 69743 }, { 69872, 69881 }, { 69942, 69951 }, @@ -3425,6 +3484,7 @@ static const URange32 N_range32[] = { { 71360, 71369 }, { 71472, 71483 }, { 71904, 71922 }, + { 72016, 72025 }, { 72784, 72812 }, { 73040, 73049 }, { 73120, 73129 }, @@ -3447,6 +3507,7 @@ static const URange32 N_range32[] = { { 126209, 126253 }, { 126255, 126269 }, { 127232, 127244 }, + { 130032, 130041 }, }; static const URange16 Nd_range16[] = { { 48, 57 }, @@ -3501,6 +3562,7 @@ static const URange32 Nd_range32[] = { { 71360, 71369 }, { 71472, 71481 }, { 71904, 71913 }, + { 72016, 72025 }, { 72784, 72793 }, { 73040, 73049 }, { 73120, 73129 }, @@ -3510,6 +3572,7 @@ static const URange32 Nd_range32[] = { { 123200, 123209 }, { 123632, 123641 }, { 125264, 125273 }, + { 130032, 130041 }, }; static const URange16 Nl_range16[] = { { 5870, 5872 }, @@ -3583,6 +3646,7 @@ static const URange32 No_range32[] = { { 69216, 69246 }, { 69405, 69414 }, { 69457, 69460 }, + { 69573, 69579 }, { 69714, 69733 }, { 70113, 70132 }, { 71482, 71483 }, @@ -3692,6 +3756,7 @@ static const URange16 P_range16[] = { { 11632, 11632 }, { 11776, 11822 }, { 11824, 11855 }, + { 11858, 11858 }, { 12289, 12291 }, { 12296, 12305 }, { 12308, 12319 }, @@ -3747,6 +3812,7 @@ static const URange32 P_range32[] = { { 68336, 68342 }, { 68409, 68415 }, { 68505, 68508 }, + { 69293, 69293 }, { 69461, 69465 }, { 69703, 69709 }, { 69819, 69820 }, @@ -3760,7 +3826,7 @@ static const URange32 P_range32[] = { { 70200, 70205 }, { 70313, 70313 }, { 70731, 70735 }, - { 70747, 70747 }, + { 70746, 70747 }, { 70749, 70749 }, { 70854, 70854 }, { 71105, 71127 }, @@ -3768,6 +3834,7 @@ static const URange32 P_range32[] = { { 71264, 71276 }, { 71484, 71486 }, { 71739, 71739 }, + { 72004, 72006 }, { 72162, 72162 }, { 72255, 72262 }, { 72346, 72348 }, @@ -3814,6 +3881,9 @@ static const URange16 Pd_range16[] = { { 65123, 65123 }, { 65293, 65293 }, }; +static const URange32 Pd_range32[] = { + { 69293, 69293 }, +}; static const URange16 Pe_range16[] = { { 41, 41 }, { 93, 93 }, @@ -4002,6 +4072,7 @@ static const URange16 Po_range16[] = { { 11836, 11839 }, { 11841, 11841 }, { 11843, 11855 }, + { 11858, 11858 }, { 12289, 12291 }, { 12349, 12349 }, { 12539, 12539 }, @@ -4069,7 +4140,7 @@ static const URange32 Po_range32[] = { { 70200, 70205 }, { 70313, 70313 }, { 70731, 70735 }, - { 70747, 70747 }, + { 70746, 70747 }, { 70749, 70749 }, { 70854, 70854 }, { 71105, 71127 }, @@ -4077,6 +4148,7 @@ static const URange32 Po_range32[] = { { 71264, 71276 }, { 71484, 71486 }, { 71739, 71739 }, + { 72004, 72006 }, { 72162, 72162 }, { 72255, 72262 }, { 72346, 72348 }, @@ -4274,8 +4346,9 @@ static const URange16 S_range16[] = { { 10716, 10747 }, { 10750, 11123 }, { 11126, 11157 }, - { 11160, 11263 }, + { 11159, 11263 }, { 11493, 11498 }, + { 11856, 11857 }, { 11904, 11929 }, { 11931, 12019 }, { 12032, 12245 }, @@ -4304,6 +4377,7 @@ static const URange16 S_range16[] = { { 43062, 43065 }, { 43639, 43641 }, { 43867, 43867 }, + { 43882, 43883 }, { 64297, 64297 }, { 64434, 64449 }, { 65020, 65021 }, @@ -4325,7 +4399,7 @@ static const URange32 S_range32[] = { { 65847, 65855 }, { 65913, 65929 }, { 65932, 65934 }, - { 65936, 65947 }, + { 65936, 65948 }, { 65952, 65952 }, { 66000, 66044 }, { 67703, 67704 }, @@ -4372,16 +4446,15 @@ static const URange32 S_range32[] = { { 127153, 127167 }, { 127169, 127183 }, { 127185, 127221 }, - { 127248, 127340 }, - { 127344, 127404 }, + { 127245, 127405 }, { 127462, 127490 }, { 127504, 127547 }, { 127552, 127560 }, { 127568, 127569 }, { 127584, 127589 }, - { 127744, 128725 }, + { 127744, 128727 }, { 128736, 128748 }, - { 128752, 128762 }, + { 128752, 128764 }, { 128768, 128883 }, { 128896, 128984 }, { 128992, 129003 }, @@ -4390,18 +4463,20 @@ static const URange32 S_range32[] = { { 129104, 129113 }, { 129120, 129159 }, { 129168, 129197 }, - { 129280, 129291 }, - { 129293, 129393 }, - { 129395, 129398 }, - { 129402, 129442 }, - { 129445, 129450 }, - { 129454, 129482 }, + { 129200, 129201 }, + { 129280, 129400 }, + { 129402, 129483 }, { 129485, 129619 }, { 129632, 129645 }, - { 129648, 129651 }, + { 129648, 129652 }, { 129656, 129658 }, - { 129664, 129666 }, - { 129680, 129685 }, + { 129664, 129670 }, + { 129680, 129704 }, + { 129712, 129718 }, + { 129728, 129730 }, + { 129744, 129750 }, + { 129792, 129938 }, + { 129940, 129994 }, }; static const URange16 Sc_range16[] = { { 36, 36 }, @@ -4453,6 +4528,7 @@ static const URange16 Sk_range16[] = { { 42784, 42785 }, { 42889, 42890 }, { 43867, 43867 }, + { 43882, 43883 }, { 64434, 64449 }, { 65342, 65342 }, { 65344, 65344 }, @@ -4610,8 +4686,9 @@ static const URange16 So_range16[] = { { 11077, 11078 }, { 11085, 11123 }, { 11126, 11157 }, - { 11160, 11263 }, + { 11159, 11263 }, { 11493, 11498 }, + { 11856, 11857 }, { 11904, 11929 }, { 11931, 12019 }, { 12032, 12245 }, @@ -4646,7 +4723,7 @@ static const URange32 So_range32[] = { { 65847, 65855 }, { 65913, 65929 }, { 65932, 65934 }, - { 65936, 65947 }, + { 65936, 65948 }, { 65952, 65952 }, { 66000, 66044 }, { 67703, 67704 }, @@ -4681,17 +4758,16 @@ static const URange32 So_range32[] = { { 127153, 127167 }, { 127169, 127183 }, { 127185, 127221 }, - { 127248, 127340 }, - { 127344, 127404 }, + { 127245, 127405 }, { 127462, 127490 }, { 127504, 127547 }, { 127552, 127560 }, { 127568, 127569 }, { 127584, 127589 }, { 127744, 127994 }, - { 128000, 128725 }, + { 128000, 128727 }, { 128736, 128748 }, - { 128752, 128762 }, + { 128752, 128764 }, { 128768, 128883 }, { 128896, 128984 }, { 128992, 129003 }, @@ -4700,18 +4776,20 @@ static const URange32 So_range32[] = { { 129104, 129113 }, { 129120, 129159 }, { 129168, 129197 }, - { 129280, 129291 }, - { 129293, 129393 }, - { 129395, 129398 }, - { 129402, 129442 }, - { 129445, 129450 }, - { 129454, 129482 }, + { 129200, 129201 }, + { 129280, 129400 }, + { 129402, 129483 }, { 129485, 129619 }, { 129632, 129645 }, - { 129648, 129651 }, + { 129648, 129652 }, { 129656, 129658 }, - { 129664, 129666 }, - { 129680, 129685 }, + { 129664, 129670 }, + { 129680, 129704 }, + { 129712, 129718 }, + { 129728, 129730 }, + { 129744, 129750 }, + { 129792, 129938 }, + { 129940, 129994 }, }; static const URange16 Z_range16[] = { { 32, 32 }, @@ -4764,7 +4842,7 @@ static const URange16 Arabic_range16[] = { { 1758, 1791 }, { 1872, 1919 }, { 2208, 2228 }, - { 2230, 2237 }, + { 2230, 2247 }, { 2259, 2273 }, { 2275, 2303 }, { 64336, 64449 }, @@ -4814,8 +4892,7 @@ static const URange32 Arabic_range32[] = { }; static const URange16 Armenian_range16[] = { { 1329, 1366 }, - { 1369, 1416 }, - { 1418, 1418 }, + { 1369, 1418 }, { 1421, 1423 }, { 64275, 64279 }, }; @@ -4866,7 +4943,7 @@ static const URange32 Bhaiksuki_range32[] = { static const URange16 Bopomofo_range16[] = { { 746, 747 }, { 12549, 12591 }, - { 12704, 12730 }, + { 12704, 12735 }, }; static const URange32 Brahmi_range32[] = { { 69632, 69709 }, @@ -4896,7 +4973,7 @@ static const URange32 Caucasian_Albanian_range32[] = { }; static const URange32 Chakma_range32[] = { { 69888, 69940 }, - { 69942, 69958 }, + { 69942, 69959 }, }; static const URange16 Cham_range16[] = { { 43520, 43574 }, @@ -4909,6 +4986,9 @@ static const URange16 Cherokee_range16[] = { { 5112, 5117 }, { 43888, 43967 }, }; +static const URange32 Chorasmian_range32[] = { + { 69552, 69579 }, +}; static const URange16 Common_range16[] = { { 0, 64 }, { 91, 96 }, @@ -4924,7 +5004,6 @@ static const URange16 Common_range16[] = { { 894, 894 }, { 901, 901 }, { 903, 903 }, - { 1417, 1417 }, { 1541, 1541 }, { 1548, 1548 }, { 1563, 1563 }, @@ -4963,8 +5042,8 @@ static const URange16 Common_range16[] = { { 9312, 10239 }, { 10496, 11123 }, { 11126, 11157 }, - { 11160, 11263 }, - { 11776, 11855 }, + { 11159, 11263 }, + { 11776, 11858 }, { 12272, 12283 }, { 12288, 12292 }, { 12294, 12294 }, @@ -4987,6 +5066,7 @@ static const URange16 Common_range16[] = { { 43310, 43310 }, { 43471, 43471 }, { 43867, 43867 }, + { 43882, 43883 }, { 64830, 64831 }, { 65040, 65049 }, { 65072, 65106 }, @@ -5006,7 +5086,7 @@ static const URange32 Common_range32[] = { { 65792, 65794 }, { 65799, 65843 }, { 65847, 65855 }, - { 65936, 65947 }, + { 65936, 65948 }, { 66000, 66044 }, { 66273, 66299 }, { 94178, 94179 }, @@ -5050,18 +5130,16 @@ static const URange32 Common_range32[] = { { 127153, 127167 }, { 127169, 127183 }, { 127185, 127221 }, - { 127232, 127244 }, - { 127248, 127340 }, - { 127344, 127404 }, + { 127232, 127405 }, { 127462, 127487 }, { 127489, 127490 }, { 127504, 127547 }, { 127552, 127560 }, { 127568, 127569 }, { 127584, 127589 }, - { 127744, 128725 }, + { 127744, 128727 }, { 128736, 128748 }, - { 128752, 128762 }, + { 128752, 128764 }, { 128768, 128883 }, { 128896, 128984 }, { 128992, 129003 }, @@ -5070,18 +5148,21 @@ static const URange32 Common_range32[] = { { 129104, 129113 }, { 129120, 129159 }, { 129168, 129197 }, - { 129280, 129291 }, - { 129293, 129393 }, - { 129395, 129398 }, - { 129402, 129442 }, - { 129445, 129450 }, - { 129454, 129482 }, + { 129200, 129201 }, + { 129280, 129400 }, + { 129402, 129483 }, { 129485, 129619 }, { 129632, 129645 }, - { 129648, 129651 }, + { 129648, 129652 }, { 129656, 129658 }, - { 129664, 129666 }, - { 129680, 129685 }, + { 129664, 129670 }, + { 129680, 129704 }, + { 129712, 129718 }, + { 129728, 129730 }, + { 129744, 129750 }, + { 129792, 129938 }, + { 129940, 129994 }, + { 130032, 130041 }, { 917505, 917505 }, { 917536, 917631 }, }; @@ -5123,6 +5204,16 @@ static const URange16 Devanagari_range16[] = { { 2406, 2431 }, { 43232, 43263 }, }; +static const URange32 Dives_Akuru_range32[] = { + { 71936, 71942 }, + { 71945, 71945 }, + { 71948, 71955 }, + { 71957, 71958 }, + { 71960, 71989 }, + { 71991, 71992 }, + { 71995, 72006 }, + { 72016, 72025 }, +}; static const URange32 Dogra_range32[] = { { 71680, 71739 }, }; @@ -5310,18 +5401,20 @@ static const URange16 Han_range16[] = { { 12295, 12295 }, { 12321, 12329 }, { 12344, 12347 }, - { 13312, 19893 }, - { 19968, 40943 }, + { 13312, 19903 }, + { 19968, 40956 }, { 63744, 64109 }, { 64112, 64217 }, }; static const URange32 Han_range32[] = { - { 131072, 173782 }, + { 94192, 94193 }, + { 131072, 173789 }, { 173824, 177972 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, { 194560, 195101 }, + { 196608, 201546 }, }; static const URange16 Hangul_range16[] = { { 4352, 4607 }, @@ -5381,7 +5474,7 @@ static const URange16 Inherited_range16[] = { { 1611, 1621 }, { 1648, 1648 }, { 2385, 2388 }, - { 6832, 6846 }, + { 6832, 6848 }, { 7376, 7378 }, { 7380, 7392 }, { 7394, 7400 }, @@ -5466,6 +5559,10 @@ static const URange32 Kharoshthi_range32[] = { { 68159, 68168 }, { 68176, 68184 }, }; +static const URange32 Khitan_Small_Script_range32[] = { + { 94180, 94180 }, + { 101120, 101589 }, +}; static const URange16 Khmer_range16[] = { { 6016, 6109 }, { 6112, 6121 }, @@ -5518,11 +5615,11 @@ static const URange16 Latin_range16[] = { { 11360, 11391 }, { 42786, 42887 }, { 42891, 42943 }, - { 42946, 42950 }, - { 42999, 43007 }, + { 42946, 42954 }, + { 42997, 43007 }, { 43824, 43866 }, { 43868, 43876 }, - { 43878, 43879 }, + { 43878, 43881 }, { 64256, 64262 }, { 65313, 65338 }, { 65345, 65370 }, @@ -5556,6 +5653,9 @@ static const URange32 Linear_B_range32[] = { static const URange16 Lisu_range16[] = { { 42192, 42239 }, }; +static const URange32 Lisu_range32[] = { + { 73648, 73648 }, +}; static const URange32 Lycian_range32[] = { { 66176, 66204 }, }; @@ -5570,8 +5670,7 @@ static const URange32 Makasar_range32[] = { { 73440, 73464 }, }; static const URange16 Malayalam_range16[] = { - { 3328, 3331 }, - { 3333, 3340 }, + { 3328, 3340 }, { 3342, 3344 }, { 3346, 3396 }, { 3398, 3400 }, @@ -5674,9 +5773,8 @@ static const URange16 New_Tai_Lue_range16[] = { { 6622, 6623 }, }; static const URange32 Newa_range32[] = { - { 70656, 70745 }, - { 70747, 70747 }, - { 70749, 70751 }, + { 70656, 70747 }, + { 70749, 70753 }, }; static const URange16 Nko_range16[] = { { 1984, 2042 }, @@ -5737,7 +5835,7 @@ static const URange16 Oriya_range16[] = { { 2876, 2884 }, { 2887, 2888 }, { 2891, 2893 }, - { 2902, 2903 }, + { 2901, 2903 }, { 2908, 2909 }, { 2911, 2915 }, { 2918, 2935 }, @@ -5792,8 +5890,7 @@ static const URange16 Saurashtra_range16[] = { { 43214, 43225 }, }; static const URange32 Sharada_range32[] = { - { 70016, 70093 }, - { 70096, 70111 }, + { 70016, 70111 }, }; static const URange32 Shavian_range32[] = { { 66640, 66687 }, @@ -5808,7 +5905,7 @@ static const URange32 SignWriting_range32[] = { { 121505, 121519 }, }; static const URange16 Sinhala_range16[] = { - { 3458, 3459 }, + { 3457, 3459 }, { 3461, 3478 }, { 3482, 3505 }, { 3507, 3515 }, @@ -5839,7 +5936,7 @@ static const URange16 Sundanese_range16[] = { { 7360, 7367 }, }; static const URange16 Syloti_Nagri_range16[] = { - { 43008, 43051 }, + { 43008, 43052 }, }; static const URange16 Syriac_range16[] = { { 1792, 1805 }, @@ -5900,7 +5997,8 @@ static const URange32 Tamil_range32[] = { static const URange32 Tangut_range32[] = { { 94176, 94176 }, { 94208, 100343 }, - { 100352, 101106 }, + { 100352, 101119 }, + { 101632, 101640 }, }; static const URange16 Telugu_range16[] = { { 3072, 3084 }, @@ -5956,6 +6054,11 @@ static const URange32 Warang_Citi_range32[] = { { 71840, 71922 }, { 71935, 71935 }, }; +static const URange32 Yezidi_range32[] = { + { 69248, 69289 }, + { 69291, 69293 }, + { 69296, 69297 }, +}; static const URange16 Yi_range16[] = { { 40960, 42124 }, { 42128, 42182 }, @@ -5963,13 +6066,13 @@ static const URange16 Yi_range16[] = { static const URange32 Zanabazar_Square_range32[] = { { 72192, 72263 }, }; -// 3987 16-bit ranges, 1525 32-bit ranges +// 4001 16-bit ranges, 1602 32-bit ranges const UGroup unicode_groups[] = { { "Adlam", +1, 0, 0, Adlam_range32, 3 }, { "Ahom", +1, 0, 0, Ahom_range32, 3 }, { "Anatolian_Hieroglyphs", +1, 0, 0, Anatolian_Hieroglyphs_range32, 1 }, { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 35 }, - { "Armenian", +1, Armenian_range16, 5, 0, 0 }, + { "Armenian", +1, Armenian_range16, 4, 0, 0 }, { "Avestan", +1, 0, 0, Avestan_range32, 2 }, { "Balinese", +1, Balinese_range16, 2, 0, 0 }, { "Bamum", +1, Bamum_range16, 1, Bamum_range32, 1 }, @@ -5991,8 +6094,9 @@ const UGroup unicode_groups[] = { { "Chakma", +1, 0, 0, Chakma_range32, 2 }, { "Cham", +1, Cham_range16, 4, 0, 0 }, { "Cherokee", +1, Cherokee_range16, 3, 0, 0 }, + { "Chorasmian", +1, 0, 0, Chorasmian_range32, 1 }, { "Co", +1, Co_range16, 1, Co_range32, 2 }, - { "Common", +1, Common_range16, 91, Common_range32, 81 }, + { "Common", +1, Common_range16, 91, Common_range32, 82 }, { "Coptic", +1, Coptic_range16, 3, 0, 0 }, { "Cs", +1, Cs_range16, 1, 0, 0 }, { "Cuneiform", +1, 0, 0, Cuneiform_range32, 4 }, @@ -6000,6 +6104,7 @@ const UGroup unicode_groups[] = { { "Cyrillic", +1, Cyrillic_range16, 8, 0, 0 }, { "Deseret", +1, 0, 0, Deseret_range32, 1 }, { "Devanagari", +1, Devanagari_range16, 4, 0, 0 }, + { "Dives_Akuru", +1, 0, 0, Dives_Akuru_range32, 8 }, { "Dogra", +1, 0, 0, Dogra_range32, 1 }, { "Duployan", +1, 0, 0, Duployan_range32, 5 }, { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 2 }, @@ -6014,7 +6119,7 @@ const UGroup unicode_groups[] = { { "Gujarati", +1, Gujarati_range16, 14, 0, 0 }, { "Gunjala_Gondi", +1, 0, 0, Gunjala_Gondi_range32, 6 }, { "Gurmukhi", +1, Gurmukhi_range16, 16, 0, 0 }, - { "Han", +1, Han_range16, 11, Han_range32, 6 }, + { "Han", +1, Han_range16, 11, Han_range32, 8 }, { "Hangul", +1, Hangul_range16, 14, 0, 0 }, { "Hanifi_Rohingya", +1, 0, 0, Hanifi_Rohingya_range32, 2 }, { "Hanunoo", +1, Hanunoo_range16, 1, 0, 0 }, @@ -6031,33 +6136,34 @@ const UGroup unicode_groups[] = { { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 2 }, { "Kayah_Li", +1, Kayah_Li_range16, 2, 0, 0 }, { "Kharoshthi", +1, 0, 0, Kharoshthi_range32, 8 }, + { "Khitan_Small_Script", +1, 0, 0, Khitan_Small_Script_range32, 2 }, { "Khmer", +1, Khmer_range16, 4, 0, 0 }, { "Khojki", +1, 0, 0, Khojki_range32, 2 }, { "Khudawadi", +1, 0, 0, Khudawadi_range32, 2 }, - { "L", +1, L_range16, 380, L_range32, 229 }, + { "L", +1, L_range16, 380, L_range32, 242 }, { "Lao", +1, Lao_range16, 11, 0, 0 }, { "Latin", +1, Latin_range16, 32, 0, 0 }, { "Lepcha", +1, Lepcha_range16, 3, 0, 0 }, { "Limbu", +1, Limbu_range16, 5, 0, 0 }, { "Linear_A", +1, 0, 0, Linear_A_range32, 3 }, { "Linear_B", +1, 0, 0, Linear_B_range32, 7 }, - { "Lisu", +1, Lisu_range16, 1, 0, 0 }, - { "Ll", +1, Ll_range16, 608, Ll_range32, 34 }, - { "Lm", +1, Lm_range16, 54, Lm_range32, 6 }, - { "Lo", +1, Lo_range16, 290, Lo_range32, 186 }, + { "Lisu", +1, Lisu_range16, 1, Lisu_range32, 1 }, + { "Ll", +1, Ll_range16, 611, Ll_range32, 34 }, + { "Lm", +1, Lm_range16, 55, Lm_range32, 6 }, + { "Lo", +1, Lo_range16, 290, Lo_range32, 199 }, { "Lt", +1, Lt_range16, 10, 0, 0 }, - { "Lu", +1, Lu_range16, 599, Lu_range32, 37 }, + { "Lu", +1, Lu_range16, 601, Lu_range32, 37 }, { "Lycian", +1, 0, 0, Lycian_range32, 1 }, { "Lydian", +1, 0, 0, Lydian_range32, 2 }, - { "M", +1, M_range16, 186, M_range32, 94 }, + { "M", +1, M_range16, 187, M_range32, 103 }, { "Mahajani", +1, 0, 0, Mahajani_range32, 1 }, { "Makasar", +1, 0, 0, Makasar_range32, 1 }, - { "Malayalam", +1, Malayalam_range16, 8, 0, 0 }, + { "Malayalam", +1, Malayalam_range16, 7, 0, 0 }, { "Mandaic", +1, Mandaic_range16, 2, 0, 0 }, { "Manichaean", +1, 0, 0, Manichaean_range32, 2 }, { "Marchen", +1, 0, 0, Marchen_range32, 3 }, { "Masaram_Gondi", +1, 0, 0, Masaram_Gondi_range32, 7 }, - { "Mc", +1, Mc_range16, 109, Mc_range32, 59 }, + { "Mc", +1, Mc_range16, 109, Mc_range32, 66 }, { "Me", +1, Me_range16, 5, 0, 0 }, { "Medefaidrin", +1, 0, 0, Medefaidrin_range32, 1 }, { "Meetei_Mayek", +1, Meetei_Mayek_range16, 3, 0, 0 }, @@ -6065,21 +6171,21 @@ const UGroup unicode_groups[] = { { "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 3 }, { "Meroitic_Hieroglyphs", +1, 0, 0, Meroitic_Hieroglyphs_range32, 1 }, { "Miao", +1, 0, 0, Miao_range32, 3 }, - { "Mn", +1, Mn_range16, 207, Mn_range32, 111 }, + { "Mn", +1, Mn_range16, 210, Mn_range32, 117 }, { "Modi", +1, 0, 0, Modi_range32, 2 }, { "Mongolian", +1, Mongolian_range16, 6, Mongolian_range32, 1 }, { "Mro", +1, 0, 0, Mro_range32, 3 }, { "Multani", +1, 0, 0, Multani_range32, 5 }, { "Myanmar", +1, Myanmar_range16, 3, 0, 0 }, - { "N", +1, N_range16, 67, N_range32, 63 }, + { "N", +1, N_range16, 67, N_range32, 66 }, { "Nabataean", +1, 0, 0, Nabataean_range32, 2 }, { "Nandinagari", +1, 0, 0, Nandinagari_range32, 3 }, - { "Nd", +1, Nd_range16, 37, Nd_range32, 22 }, + { "Nd", +1, Nd_range16, 37, Nd_range32, 24 }, { "New_Tai_Lue", +1, New_Tai_Lue_range16, 4, 0, 0 }, - { "Newa", +1, 0, 0, Newa_range32, 3 }, + { "Newa", +1, 0, 0, Newa_range32, 2 }, { "Nko", +1, Nko_range16, 2, 0, 0 }, { "Nl", +1, Nl_range16, 7, Nl_range32, 5 }, - { "No", +1, No_range16, 29, No_range32, 41 }, + { "No", +1, No_range16, 29, No_range32, 42 }, { "Nushu", +1, 0, 0, Nushu_range32, 2 }, { "Nyiakeng_Puachue_Hmong", +1, 0, 0, Nyiakeng_Puachue_Hmong_range32, 4 }, { "Ogham", +1, Ogham_range16, 1, 0, 0 }, @@ -6095,34 +6201,34 @@ const UGroup unicode_groups[] = { { "Oriya", +1, Oriya_range16, 14, 0, 0 }, { "Osage", +1, 0, 0, Osage_range32, 2 }, { "Osmanya", +1, 0, 0, Osmanya_range32, 2 }, - { "P", +1, P_range16, 131, P_range32, 51 }, + { "P", +1, P_range16, 132, P_range32, 53 }, { "Pahawh_Hmong", +1, 0, 0, Pahawh_Hmong_range32, 5 }, { "Palmyrene", +1, 0, 0, Palmyrene_range32, 1 }, { "Pau_Cin_Hau", +1, 0, 0, Pau_Cin_Hau_range32, 1 }, { "Pc", +1, Pc_range16, 6, 0, 0 }, - { "Pd", +1, Pd_range16, 17, 0, 0 }, + { "Pd", +1, Pd_range16, 17, Pd_range32, 1 }, { "Pe", +1, Pe_range16, 72, 0, 0 }, { "Pf", +1, Pf_range16, 10, 0, 0 }, { "Phags_Pa", +1, Phags_Pa_range16, 1, 0, 0 }, { "Phoenician", +1, 0, 0, Phoenician_range32, 2 }, { "Pi", +1, Pi_range16, 11, 0, 0 }, - { "Po", +1, Po_range16, 128, Po_range32, 51 }, + { "Po", +1, Po_range16, 129, Po_range32, 52 }, { "Ps", +1, Ps_range16, 75, 0, 0 }, { "Psalter_Pahlavi", +1, 0, 0, Psalter_Pahlavi_range32, 3 }, { "Rejang", +1, Rejang_range16, 2, 0, 0 }, { "Runic", +1, Runic_range16, 2, 0, 0 }, - { "S", +1, S_range16, 146, S_range32, 80 }, + { "S", +1, S_range16, 148, S_range32, 81 }, { "Samaritan", +1, Samaritan_range16, 2, 0, 0 }, { "Saurashtra", +1, Saurashtra_range16, 2, 0, 0 }, { "Sc", +1, Sc_range16, 18, Sc_range32, 3 }, - { "Sharada", +1, 0, 0, Sharada_range32, 2 }, + { "Sharada", +1, 0, 0, Sharada_range32, 1 }, { "Shavian", +1, 0, 0, Shavian_range32, 1 }, { "Siddham", +1, 0, 0, Siddham_range32, 2 }, { "SignWriting", +1, 0, 0, SignWriting_range32, 3 }, { "Sinhala", +1, Sinhala_range16, 12, Sinhala_range32, 1 }, - { "Sk", +1, Sk_range16, 28, Sk_range32, 1 }, + { "Sk", +1, Sk_range16, 29, Sk_range32, 1 }, { "Sm", +1, Sm_range16, 53, Sm_range32, 11 }, - { "So", +1, So_range16, 111, So_range32, 69 }, + { "So", +1, So_range16, 112, So_range32, 70 }, { "Sogdian", +1, 0, 0, Sogdian_range32, 1 }, { "Sora_Sompeng", +1, 0, 0, Sora_Sompeng_range32, 2 }, { "Soyombo", +1, 0, 0, Soyombo_range32, 1 }, @@ -6136,7 +6242,7 @@ const UGroup unicode_groups[] = { { "Tai_Viet", +1, Tai_Viet_range16, 2, 0, 0 }, { "Takri", +1, 0, 0, Takri_range32, 2 }, { "Tamil", +1, Tamil_range16, 16, Tamil_range32, 2 }, - { "Tangut", +1, 0, 0, Tangut_range32, 3 }, + { "Tangut", +1, 0, 0, Tangut_range32, 4 }, { "Telugu", +1, Telugu_range16, 12, 0, 0 }, { "Thaana", +1, Thaana_range16, 1, 0, 0 }, { "Thai", +1, Thai_range16, 2, 0, 0 }, @@ -6147,6 +6253,7 @@ const UGroup unicode_groups[] = { { "Vai", +1, Vai_range16, 1, 0, 0 }, { "Wancho", +1, 0, 0, Wancho_range32, 2 }, { "Warang_Citi", +1, 0, 0, Warang_Citi_range32, 2 }, + { "Yezidi", +1, 0, 0, Yezidi_range32, 3 }, { "Yi", +1, Yi_range16, 2, 0, 0 }, { "Z", +1, Z_range16, 8, 0, 0 }, { "Zanabazar_Square", +1, 0, 0, Zanabazar_Square_range32, 1 }, @@ -6154,7 +6261,7 @@ const UGroup unicode_groups[] = { { "Zp", +1, Zp_range16, 1, 0, 0 }, { "Zs", +1, Zs_range16, 7, 0, 0 }, }; -const int num_unicode_groups = 188; +const int num_unicode_groups = 192; } // namespace re2 diff --git a/extern/re2/re2/walker-inl.h b/extern/re2/re2/walker-inl.h index 032b8ac7db..8e0f94667c 100644 --- a/extern/re2/re2/walker-inl.h +++ b/extern/re2/re2/walker-inl.h @@ -89,7 +89,7 @@ template class Regexp::Walker { private: // Walk state for the entire traversal. - std::stack >* stack_; + std::stack> stack_; bool stopped_early_; int max_visits_; @@ -119,7 +119,7 @@ template T Regexp::Walker::Copy(T arg) { // State about a single level in the traversal. template struct WalkState { - WalkState(Regexp* re, T parent) + WalkState(Regexp* re, T parent) : re(re), n(-1), parent_arg(parent), @@ -134,24 +134,22 @@ template struct WalkState { }; template Regexp::Walker::Walker() { - stack_ = new std::stack >; stopped_early_ = false; } template Regexp::Walker::~Walker() { Reset(); - delete stack_; } // Clears the stack. Should never be necessary, since // Walk always enters and exits with an empty stack. // Logs DFATAL if stack is not already clear. template void Regexp::Walker::Reset() { - if (stack_ && stack_->size() > 0) { + if (!stack_.empty()) { LOG(DFATAL) << "Stack not empty."; - while (stack_->size() > 0) { - delete stack_->top().child_args; - stack_->pop(); + while (!stack_.empty()) { + delete[] stack_.top().child_args; + stack_.pop(); } } } @@ -165,12 +163,12 @@ template T Regexp::Walker::WalkInternal(Regexp* re, T top_arg, return top_arg; } - stack_->push(WalkState(re, top_arg)); + stack_.push(WalkState(re, top_arg)); WalkState* s; for (;;) { T t; - s = &stack_->top(); + s = &stack_.top(); Regexp* re = s->re; switch (s->n) { case -1: { @@ -201,7 +199,7 @@ template T Regexp::Walker::WalkInternal(Regexp* re, T top_arg, s->child_args[s->n] = Copy(s->child_args[s->n - 1]); s->n++; } else { - stack_->push(WalkState(sub[s->n], s->pre_arg)); + stack_.push(WalkState(sub[s->n], s->pre_arg)); } continue; } @@ -214,12 +212,12 @@ template T Regexp::Walker::WalkInternal(Regexp* re, T top_arg, } } - // We've finished stack_->top(). + // We've finished stack_.top(). // Update next guy down. - stack_->pop(); - if (stack_->size() == 0) + stack_.pop(); + if (stack_.empty()) return t; - s = &stack_->top(); + s = &stack_.top(); if (s->child_args != NULL) s->child_args[s->n] = t; else diff --git a/extern/re2/re2_test.bzl b/extern/re2/re2_test.bzl deleted file mode 100644 index c0eb654196..0000000000 --- a/extern/re2/re2_test.bzl +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright 2009 The RE2 Authors. All Rights Reserved. -# Use of this source code is governed by a BSD-style -# license that can be found in the LICENSE file. - -# Defines a Bazel macro that instantiates a native cc_test rule for an RE2 test. -def re2_test(name, deps=[], size="medium"): - native.cc_test( - name=name, - srcs=["re2/testing/%s.cc" % (name)], - deps=[":test"] + deps, - size=size, - ) diff --git a/extern/re2/runtests b/extern/re2/runtests old mode 100644 new mode 100755 diff --git a/extern/re2/testinstall.cc b/extern/re2/testinstall.cc index 47db4e68cc..19cc9003bf 100644 --- a/extern/re2/testinstall.cc +++ b/extern/re2/testinstall.cc @@ -2,23 +2,26 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include -#include #include +#include +#include -int main(void) { - re2::FilteredRE2 f; - int id; - f.Add("a.*b.*c", RE2::DefaultOptions, &id); - std::vector v; - f.Compile(&v); - std::vector ids; - f.FirstMatch("abbccc", ids); +int main() { + re2::FilteredRE2 f; + int id; + f.Add("a.*b.*c", RE2::DefaultOptions, &id); + std::vector v; + f.Compile(&v); + std::vector ids; + f.FirstMatch("abbccc", ids); - if(RE2::FullMatch("axbyc", "a.*b.*c")) { - printf("PASS\n"); - return 0; - } - printf("FAIL\n"); - return 2; + int n; + if (RE2::FullMatch("axbyc", "a.*b.*c") && + RE2::PartialMatch("foo123bar", "(\\d+)", &n) && n == 123) { + printf("PASS\n"); + return 0; + } + + printf("FAIL\n"); + return 2; } diff --git a/extern/re2/util/benchmark.cc b/extern/re2/util/benchmark.cc index 144f550171..e39c3349ab 100644 --- a/extern/re2/util/benchmark.cc +++ b/extern/re2/util/benchmark.cc @@ -7,155 +7,125 @@ #include #include #include -#include -#include "util/util.h" -#include "util/flags.h" #include "util/benchmark.h" +#include "util/flags.h" #include "re2/re2.h" -DEFINE_string(test_tmpdir, "/var/tmp", "temp directory"); - #ifdef _WIN32 #define snprintf _snprintf #endif -using testing::Benchmark; +using ::testing::Benchmark; static Benchmark* benchmarks[10000]; static int nbenchmarks; void Benchmark::Register() { - benchmarks[nbenchmarks] = this; - if(lo < 1) - lo = 1; - if(hi < lo) - hi = lo; - nbenchmarks++; + lo_ = std::max(1, lo_); + hi_ = std::max(lo_, hi_); + benchmarks[nbenchmarks++] = this; } static int64_t nsec() { - return std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()).count(); + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); } -static int64_t bytes; -static int64_t ns; static int64_t t0; +static int64_t ns; +static int64_t bytes; static int64_t items; -void SetBenchmarkBytesProcessed(int64_t x) { - bytes = x; +void StartBenchmarkTiming() { + if (t0 == 0) { + t0 = nsec(); + } } void StopBenchmarkTiming() { - if(t0 != 0) - ns += nsec() - t0; - t0 = 0; + if (t0 != 0) { + ns += nsec() - t0; + t0 = 0; + } } -void StartBenchmarkTiming() { - if(t0 == 0) - t0 = nsec(); -} +void SetBenchmarkBytesProcessed(int64_t b) { bytes = b; } -void SetBenchmarkItemsProcessed(int n) { - items = n; -} +void SetBenchmarkItemsProcessed(int64_t i) { items = i; } -void BenchmarkMemoryUsage() { - // TODO(rsc): Implement. -} - -int NumCPUs() { - return static_cast(std::thread::hardware_concurrency()); -} - -static void runN(Benchmark *b, int n, int siz) { - bytes = 0; - items = 0; - ns = 0; - t0 = nsec(); - if(b->fn) - b->fn(n); - else if(b->fnr) - b->fnr(n, siz); - else { - fprintf(stderr, "%s: missing function\n", b->name); - abort(); - } - if(t0 != 0) - ns += nsec() - t0; +static void RunFunc(Benchmark* b, int iters, int arg) { + t0 = nsec(); + ns = 0; + bytes = 0; + items = 0; + b->func()(iters, arg); + StopBenchmarkTiming(); } static int round(int n) { - int base = 1; - - while(base*10 < n) - base *= 10; - if(n < 2*base) - return 2*base; - if(n < 5*base) - return 5*base; - return 10*base; + int base = 1; + while (base * 10 < n) base *= 10; + if (n < 2 * base) return 2 * base; + if (n < 5 * base) return 5 * base; + return 10 * base; } -void RunBench(Benchmark* b, int nthread, int siz) { - int n, last; +static void RunBench(Benchmark* b, int arg) { + int iters, last; - // TODO(rsc): Threaded benchmarks. - if(nthread != 1) - return; - - // run once in case it's expensive - n = 1; - runN(b, n, siz); - while(ns < (int)1e9 && n < (int)1e9) { - last = n; - if(ns/n == 0) - n = (int)1e9; - else - n = (int)1e9 / static_cast(ns/n); - - n = std::max(last+1, std::min(n+n/2, 100*last)); - n = round(n); - runN(b, n, siz); - } - - char mb[100]; - char suf[100]; - mb[0] = '\0'; - suf[0] = '\0'; - if(ns > 0 && bytes > 0) - snprintf(mb, sizeof mb, "\t%7.2f MB/s", ((double)bytes/1e6)/((double)ns/1e9)); - if(b->fnr || b->lo != b->hi) { - if(siz >= (1<<20)) - snprintf(suf, sizeof suf, "/%dM", siz/(1<<20)); - else if(siz >= (1<<10)) - snprintf(suf, sizeof suf, "/%dK", siz/(1<<10)); - else - snprintf(suf, sizeof suf, "/%d", siz); - } - printf("%s%s\t%8lld\t%10lld ns/op%s\n", b->name, suf, (long long)n, (long long)ns/n, mb); - fflush(stdout); + // Run once just in case it's expensive. + iters = 1; + RunFunc(b, iters, arg); + while (ns < (int)1e9 && iters < (int)1e9) { + last = iters; + if (ns / iters == 0) { + iters = (int)1e9; + } else { + iters = (int)1e9 / static_cast(ns / iters); + } + iters = std::max(last + 1, std::min(iters + iters / 2, 100 * last)); + iters = round(iters); + RunFunc(b, iters, arg); + } + + char mb[100]; + char suf[100]; + mb[0] = '\0'; + suf[0] = '\0'; + if (ns > 0 && bytes > 0) + snprintf(mb, sizeof mb, "\t%7.2f MB/s", + ((double)bytes / 1e6) / ((double)ns / 1e9)); + if (b->has_arg()) { + if (arg >= (1 << 20)) { + snprintf(suf, sizeof suf, "/%dM", arg / (1 << 20)); + } else if (arg >= (1 << 10)) { + snprintf(suf, sizeof suf, "/%dK", arg / (1 << 10)); + } else { + snprintf(suf, sizeof suf, "/%d", arg); + } + } + printf("%s%s\t%8d\t%10lld ns/op%s\n", b->name(), suf, iters, + (long long)ns / iters, mb); + fflush(stdout); } -static int match(const char* name, int argc, const char** argv) { - if(argc == 1) - return 1; - for(int i = 1; i < argc; i++) - if(RE2::PartialMatch(name, argv[i])) - return 1; - return 0; +static bool WantBench(const char* name, int argc, const char** argv) { + if (argc == 1) return true; + for (int i = 1; i < argc; i++) { + if (RE2::PartialMatch(name, argv[i])) + return true; + } + return false; } int main(int argc, const char** argv) { - for(int i = 0; i < nbenchmarks; i++) { - Benchmark* b = benchmarks[i]; - if(match(b->name, argc, argv)) - for(int j = b->threadlo; j <= b->threadhi; j++) - for(int k = std::max(b->lo, 1); k <= std::max(b->hi, 1); k<<=1) - RunBench(b, j, k); - } + for (int i = 0; i < nbenchmarks; i++) { + Benchmark* b = benchmarks[i]; + if (!WantBench(b->name(), argc, argv)) + continue; + for (int arg = b->lo(); arg <= b->hi(); arg <<= 1) + RunBench(b, arg); + } } - diff --git a/extern/re2/util/benchmark.h b/extern/re2/util/benchmark.h index fba30b9cba..d97b49e17f 100644 --- a/extern/re2/util/benchmark.h +++ b/extern/re2/util/benchmark.h @@ -6,38 +6,151 @@ #define UTIL_BENCHMARK_H_ #include +#include + +#include "util/logging.h" +#include "util/util.h" + +// Globals for the old benchmark API. +void StartBenchmarkTiming(); +void StopBenchmarkTiming(); +void SetBenchmarkBytesProcessed(int64_t b); +void SetBenchmarkItemsProcessed(int64_t i); + +namespace benchmark { + +// The new benchmark API implemented as a layer over the old benchmark API. +// (Please refer to https://github.com/google/benchmark for documentation.) +class State { + private: + class Iterator { + public: + // Benchmark code looks like this: + // + // for (auto _ : state) { + // // ... + // } + // + // We try to avoid compiler warnings about such variables being unused. + struct ATTRIBUTE_UNUSED Value {}; + + explicit Iterator(int64_t iters) : iters_(iters) {} + + bool operator!=(const Iterator& that) const { + if (iters_ != that.iters_) { + return true; + } else { + // We are about to stop the loop, so stop timing. + StopBenchmarkTiming(); + return false; + } + } + + Value operator*() const { + return Value(); + } + + Iterator& operator++() { + --iters_; + return *this; + } + + private: + int64_t iters_; + }; + + public: + explicit State(int64_t iters) + : iters_(iters), arg_(0), has_arg_(false) {} + + State(int64_t iters, int64_t arg) + : iters_(iters), arg_(arg), has_arg_(true) {} + + Iterator begin() { + // We are about to start the loop, so start timing. + StartBenchmarkTiming(); + return Iterator(iters_); + } + + Iterator end() { + return Iterator(0); + } + + void SetBytesProcessed(int64_t b) { SetBenchmarkBytesProcessed(b); } + void SetItemsProcessed(int64_t i) { SetBenchmarkItemsProcessed(i); } + int64_t iterations() const { return iters_; } + // Pretend to support multiple arguments. + int64_t range(int pos) const { CHECK(has_arg_); return arg_; } + + private: + int64_t iters_; + int64_t arg_; + bool has_arg_; + + State(const State&) = delete; + State& operator=(const State&) = delete; +}; + +} // namespace benchmark namespace testing { -struct Benchmark { - const char* name; - void (*fn)(int); - void (*fnr)(int, int); - int lo; - int hi; - int threadlo; - int threadhi; +class Benchmark { + public: + Benchmark(const char* name, void (*func)(benchmark::State&)) + : name_(name), + func_([func](int iters, int arg) { + benchmark::State state(iters); + func(state); + }), + lo_(0), + hi_(0), + has_arg_(false) { + Register(); + } + + Benchmark(const char* name, void (*func)(benchmark::State&), int lo, int hi) + : name_(name), + func_([func](int iters, int arg) { + benchmark::State state(iters, arg); + func(state); + }), + lo_(lo), + hi_(hi), + has_arg_(true) { + Register(); + } + + // Pretend to support multiple threads. + Benchmark* ThreadRange(int lo, int hi) { return this; } + + const char* name() const { return name_; } + const std::function& func() const { return func_; } + int lo() const { return lo_; } + int hi() const { return hi_; } + bool has_arg() const { return has_arg_; } + + private: void Register(); - Benchmark(const char* name, void (*f)(int)) { Clear(name); fn = f; Register(); } - Benchmark(const char* name, void (*f)(int, int), int l, int h) { Clear(name); fnr = f; lo = l; hi = h; Register(); } - void Clear(const char* n) { name = n; fn = 0; fnr = 0; lo = 0; hi = 0; threadlo = 0; threadhi = 0; } - Benchmark* ThreadRange(int lo, int hi) { threadlo = lo; threadhi = hi; return this; } + + const char* name_; + std::function func_; + int lo_; + int hi_; + bool has_arg_; + + Benchmark(const Benchmark&) = delete; + Benchmark& operator=(const Benchmark&) = delete; }; + } // namespace testing -void SetBenchmarkBytesProcessed(int64_t); -void StopBenchmarkTiming(); -void StartBenchmarkTiming(); -void BenchmarkMemoryUsage(); -void SetBenchmarkItemsProcessed(int); +#define BENCHMARK(f) \ + ::testing::Benchmark* _benchmark_##f = \ + (new ::testing::Benchmark(#f, f)) -int NumCPUs(); - -#define BENCHMARK(f) \ - ::testing::Benchmark* _benchmark_##f = (new ::testing::Benchmark(#f, f)) - -#define BENCHMARK_RANGE(f, lo, hi) \ - ::testing::Benchmark* _benchmark_##f = \ - (new ::testing::Benchmark(#f, f, lo, hi)) +#define BENCHMARK_RANGE(f, lo, hi) \ + ::testing::Benchmark* _benchmark_##f = \ + (new ::testing::Benchmark(#f, f, lo, hi)) #endif // UTIL_BENCHMARK_H_ diff --git a/extern/re2/util/flags.h b/extern/re2/util/flags.h index e0f1f420bc..3386b729d4 100644 --- a/extern/re2/util/flags.h +++ b/extern/re2/util/flags.h @@ -10,20 +10,17 @@ // If you want to do that, see // https://gflags.github.io/gflags/ -#include - -#define DEFINE_flag(type, name, deflt, desc) \ +#define DEFINE_FLAG(type, name, deflt, desc) \ namespace re2 { type FLAGS_##name = deflt; } -#define DECLARE_flag(type, name) \ +#define DECLARE_FLAG(type, name) \ namespace re2 { extern type FLAGS_##name; } -#define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc) -#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32_t, name, deflt, desc) -#define DEFINE_string(name, deflt, desc) DEFINE_flag(std::string, name, deflt, desc) - -#define DECLARE_bool(name) DECLARE_flag(bool, name) -#define DECLARE_int32(name) DECLARE_flag(int32_t, name) -#define DECLARE_string(name) DECLARE_flag(std::string, name) +namespace re2 { +template +T GetFlag(const T& flag) { + return flag; +} +} // namespace re2 #endif // UTIL_FLAGS_H_ diff --git a/extern/re2/util/malloc_counter.h b/extern/re2/util/malloc_counter.h new file mode 100644 index 0000000000..81b564ff98 --- /dev/null +++ b/extern/re2/util/malloc_counter.h @@ -0,0 +1,19 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_MALLOC_COUNTER_H_ +#define UTIL_MALLOC_COUNTER_H_ + +namespace testing { +class MallocCounter { + public: + MallocCounter(int x) {} + static const int THIS_THREAD_ONLY = 0; + long long HeapGrowth() { return 0; } + long long PeakHeapGrowth() { return 0; } + void Reset() {} +}; +} // namespace testing + +#endif // UTIL_MALLOC_COUNTER_H_ diff --git a/extern/re2/util/mutex.h b/extern/re2/util/mutex.h index 9c49158048..e1587d5168 100644 --- a/extern/re2/util/mutex.h +++ b/extern/re2/util/mutex.h @@ -10,7 +10,13 @@ * You should assume the locks are *not* re-entrant. */ -#if !defined(_WIN32) +#ifdef _WIN32 +// Requires Windows Vista or Windows Server 2008 at minimum. +#include +#if defined(WINVER) && WINVER >= 0x0600 +#define MUTEX_IS_WIN32_SRWLOCK +#endif +#else #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif @@ -20,7 +26,9 @@ #endif #endif -#if defined(MUTEX_IS_PTHREAD_RWLOCK) +#if defined(MUTEX_IS_WIN32_SRWLOCK) +typedef SRWLOCK MutexType; +#elif defined(MUTEX_IS_PTHREAD_RWLOCK) #include #include typedef pthread_rwlock_t MutexType; @@ -56,7 +64,16 @@ class Mutex { Mutex& operator=(const Mutex&) = delete; }; -#if defined(MUTEX_IS_PTHREAD_RWLOCK) +#if defined(MUTEX_IS_WIN32_SRWLOCK) + +Mutex::Mutex() { InitializeSRWLock(&mutex_); } +Mutex::~Mutex() { } +void Mutex::Lock() { AcquireSRWLockExclusive(&mutex_); } +void Mutex::Unlock() { ReleaseSRWLockExclusive(&mutex_); } +void Mutex::ReaderLock() { AcquireSRWLockShared(&mutex_); } +void Mutex::ReaderUnlock() { ReleaseSRWLockShared(&mutex_); } + +#elif defined(MUTEX_IS_PTHREAD_RWLOCK) #define SAFE_PTHREAD(fncall) \ do { \ diff --git a/extern/re2/util/pcre.cc b/extern/re2/util/pcre.cc index 5983c9f526..b68985144f 100644 --- a/extern/re2/util/pcre.cc +++ b/extern/re2/util/pcre.cc @@ -22,9 +22,7 @@ #include "util/strutil.h" // Silence warnings about the wacky formatting in the operator() functions. -// Note that we test for Clang first because it defines __GNUC__ as well. -#if defined(__clang__) -#elif defined(__GNUC__) && __GNUC__ >= 6 +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 #pragma GCC diagnostic ignored "-Wmisleading-indentation" #endif @@ -35,9 +33,10 @@ // not exceed main thread stacks. Note that other threads // often have smaller stacks, and therefore tightening // regexp_stack_limit may frequently be necessary. -DEFINE_int32(regexp_stack_limit, 256<<10, "default PCRE stack limit (bytes)"); -DEFINE_int32(regexp_match_limit, 1000000, - "default PCRE match limit (function calls)"); +DEFINE_FLAG(int, regexp_stack_limit, 256 << 10, + "default PCRE stack limit (bytes)"); +DEFINE_FLAG(int, regexp_match_limit, 1000000, + "default PCRE match limit (function calls)"); #ifndef USEPCRE @@ -523,12 +522,12 @@ int PCRE::TryMatch(const StringPiece& text, int match_limit = match_limit_; if (match_limit <= 0) { - match_limit = FLAGS_regexp_match_limit; + match_limit = GetFlag(FLAGS_regexp_match_limit); } int stack_limit = stack_limit_; if (stack_limit <= 0) { - stack_limit = FLAGS_regexp_stack_limit; + stack_limit = GetFlag(FLAGS_regexp_stack_limit); } pcre_extra extra = { 0 }; @@ -977,32 +976,7 @@ static bool parse_double_float(const char* str, size_t n, bool isfloat, } else { r = strtod(buf, &end); } - if (end != buf + n) { -#ifdef _WIN32 - // Microsoft's strtod() doesn't handle inf and nan, so we have to - // handle it explicitly. Speed is not important here because this - // code is only called in unit tests. - bool pos = true; - const char* i = buf; - if ('-' == *i) { - pos = false; - ++i; - } else if ('+' == *i) { - ++i; - } - if (0 == _stricmp(i, "inf") || 0 == _stricmp(i, "infinity")) { - r = std::numeric_limits::infinity(); - if (!pos) - r = -r; - } else if (0 == _stricmp(i, "nan")) { - r = std::numeric_limits::quiet_NaN(); - } else { - return false; - } -#else - return false; // Leftover junk -#endif - } + if (end != buf + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; if (isfloat) { diff --git a/extern/re2/util/pcre.h b/extern/re2/util/pcre.h index 644dce68c2..896b0bdf89 100644 --- a/extern/re2/util/pcre.h +++ b/extern/re2/util/pcre.h @@ -555,7 +555,7 @@ class PCRE_Options { // Hex/Octal/Binary? // Special class for parsing into objects that define a ParseFrom() method -template +template class _PCRE_MatchObject { public: static inline bool Parse(const char* str, size_t n, void* dest) { @@ -600,9 +600,9 @@ class PCRE::Arg { #undef MAKE_PARSER // Generic constructor - template Arg(T*, Parser parser); + template Arg(T*, Parser parser); // Generic constructor template - template Arg(T* p) + template Arg(T* p) : arg_(p), parser_(_PCRE_MatchObject::Parse) { } diff --git a/extern/re2/util/test.cc b/extern/re2/util/test.cc index 29c8b41420..028616b359 100644 --- a/extern/re2/util/test.cc +++ b/extern/re2/util/test.cc @@ -3,10 +3,13 @@ // license that can be found in the LICENSE file. #include +#include #include "util/test.h" -DEFINE_string(test_tmpdir, "/var/tmp", "temp directory"); +namespace testing { +std::string TempDir() { return "/tmp/"; } +} // namespace testing struct Test { void (*fn)(void); diff --git a/extern/re2/util/test.h b/extern/re2/util/test.h index 5242e94a9c..54e6f8fbbb 100644 --- a/extern/re2/util/test.h +++ b/extern/re2/util/test.h @@ -6,9 +6,12 @@ #define UTIL_TEST_H_ #include "util/util.h" -#include "util/flags.h" #include "util/logging.h" +namespace testing { +std::string TempDir(); +} // namespace testing + #define TEST(x, y) \ void x##y(void); \ TestRegisterer r##x##y(x##y, # x "." # y); \ @@ -44,15 +47,4 @@ class TestRegisterer { #define EXPECT_GT CHECK_GT #define EXPECT_GE CHECK_GE -namespace testing { -class MallocCounter { - public: - MallocCounter(int x) {} - static const int THIS_THREAD_ONLY = 0; - long long HeapGrowth() { return 0; } - long long PeakHeapGrowth() { return 0; } - void Reset() {} -}; -} // namespace testing - #endif // UTIL_TEST_H_ diff --git a/extern/re2/util/util.h b/extern/re2/util/util.h index 8f3e0d0fe7..56e46c1a33 100644 --- a/extern/re2/util/util.h +++ b/extern/re2/util/util.h @@ -17,6 +17,14 @@ #endif #endif +#ifndef ATTRIBUTE_UNUSED +#if defined(__GNUC__) +#define ATTRIBUTE_UNUSED __attribute__((unused)) +#else +#define ATTRIBUTE_UNUSED +#endif +#endif + #ifndef FALLTHROUGH_INTENDED #if defined(__clang__) #define FALLTHROUGH_INTENDED [[clang::fallthrough]] diff --git a/src/common/SimilarToRegex.cpp b/src/common/SimilarToRegex.cpp index 3a62b245e3..0b6e533018 100644 --- a/src/common/SimilarToRegex.cpp +++ b/src/common/SimilarToRegex.cpp @@ -108,7 +108,7 @@ namespace options.set_log_errors(false); options.set_dot_nl(true); options.set_case_sensitive(!(flags & COMP_FLAG_CASE_INSENSITIVE)); - options.set_utf8(!(flags & COMP_FLAG_LATIN)); + options.set_encoding(flags & COMP_FLAG_LATIN ? RE2::Options::EncodingLatin1 : RE2::Options::EncodingUTF8); re2::StringPiece sp((const char*) re2PatternStr.c_str(), re2PatternStr.length()); regexp = FB_NEW_POOL(pool) RE2(sp, options); @@ -759,7 +759,7 @@ namespace options.set_log_errors(false); options.set_dot_nl(true); options.set_case_sensitive(!(flags & COMP_FLAG_CASE_INSENSITIVE)); - options.set_utf8(!(flags & COMP_FLAG_LATIN)); + options.set_encoding(flags & COMP_FLAG_LATIN ? RE2::Options::EncodingLatin1 : RE2::Options::EncodingUTF8); re2::StringPiece sp((const char*) finalRe2Pattern.c_str(), finalRe2Pattern.length()); regexp = FB_NEW_POOL(pool) RE2(sp, options);