8
0
mirror of https://github.com/FirebirdSQL/firebird.git synced 2025-01-22 16:43:03 +01:00

Update re2 to version 2021-04-01.

This commit is contained in:
Adriano dos Santos Fernandes 2021-05-26 10:07:49 -03:00
parent d9d8cc36d4
commit 2e35cc66e7
103 changed files with 3877 additions and 2714 deletions

2
extern/re2/kokoro/bazel.sh → extern/re2/.github/bazel.sh vendored Normal file → Executable file
View File

@ -1,8 +1,6 @@
#!/bin/bash
set -eux
cd git/re2
bazel clean
bazel build --compilation_mode=dbg -- //:all
bazel test --compilation_mode=dbg --test_output=errors -- //:all \

12
extern/re2/.github/cmake.sh vendored Executable file
View File

@ -0,0 +1,12 @@
#!/bin/bash
set -eux
cmake -D CMAKE_BUILD_TYPE=Debug
cmake --build . --config Debug --clean-first
ctest -C Debug --output-on-failure -E 'dfa|exhaustive|random'
cmake -D CMAKE_BUILD_TYPE=Release
cmake --build . --config Release --clean-first
ctest -C Release --output-on-failure -E 'dfa|exhaustive|random'
exit 0

View File

@ -0,0 +1,17 @@
name: CI (Bazel)
on:
push:
branches: [master]
jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [macos-latest, ubuntu-latest, windows-latest]
env:
BAZELISK_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
steps:
- uses: actions/checkout@v2
- run: .github/bazel.sh
shell: bash

View File

@ -0,0 +1,15 @@
name: CI (CMake)
on:
push:
branches: [master]
jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [macos-latest, ubuntu-latest, windows-latest]
steps:
- uses: actions/checkout@v2
- run: .github/cmake.sh
shell: bash

51
extern/re2/.github/workflows/ci.yml vendored Normal file
View File

@ -0,0 +1,51 @@
name: CI
on:
push:
branches: [master]
jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [macos-latest, ubuntu-latest]
env:
CC: clang
CXX: clang++
steps:
- uses: actions/checkout@v2
- run: make && make test
shell: bash
build-clang:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
tag: [9, 10, 11]
env:
CC: clang-${{ matrix.tag }}
CXX: clang++-${{ matrix.tag }}
steps:
- uses: actions/checkout@v2
- name: Install Clang ${{ matrix.tag }}
run: |
wget https://apt.llvm.org/llvm.sh
chmod +x ./llvm.sh
sudo ./llvm.sh ${{ matrix.tag }}
shell: bash
- run: make && make test
shell: bash
build-gcc:
runs-on: ubuntu-latest
container: gcc:${{ matrix.tag }}
strategy:
fail-fast: false
matrix:
tag: [4, 5, 6, 7, 8, 9, 10]
env:
CC: gcc
CXX: g++
steps:
- uses: actions/checkout@v2
- run: make && make test
shell: bash

View File

@ -3,4 +3,3 @@
core
obj/
benchlog.*
builds/

179
extern/re2/.travis.yml vendored
View File

@ -1,179 +0,0 @@
language: cpp
sudo: false
dist: trusty
script:
- make
- make test
matrix:
include:
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.8
env:
- MATRIX_EVAL="CC=gcc-4.8 CXX=g++-4.8"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.9
env:
- MATRIX_EVAL="CC=gcc-4.9 CXX=g++-4.9"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-5
env:
- MATRIX_EVAL="CC=gcc-5 CXX=g++-5"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-6
env:
- MATRIX_EVAL="CC=gcc-6 CXX=g++-6"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-7
env:
- MATRIX_EVAL="CC=gcc-7 CXX=g++-7"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-8
env:
- MATRIX_EVAL="CC=gcc-8 CXX=g++-8"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-9
env:
- MATRIX_EVAL="CC=gcc-9 CXX=g++-9"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-precise-3.5
packages:
- clang-3.5
env:
- MATRIX_EVAL="CC=clang-3.5 CXX=clang++-3.5"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-precise-3.6
packages:
- clang-3.6
env:
- MATRIX_EVAL="CC=clang-3.6 CXX=clang++-3.6"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-precise-3.7
packages:
- clang-3.7
env:
- MATRIX_EVAL="CC=clang-3.7 CXX=clang++-3.7"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-precise-3.8
packages:
- clang-3.8
env:
- MATRIX_EVAL="CC=clang-3.8 CXX=clang++-3.8"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-precise-3.9
packages:
- clang-3.9
env:
- MATRIX_EVAL="CC=clang-3.9 CXX=clang++-3.9"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-trusty-4.0
packages:
- clang-4.0
env:
- MATRIX_EVAL="CC=clang-4.0 CXX=clang++-4.0"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-trusty-5.0
packages:
- clang-5.0
env:
- MATRIX_EVAL="CC=clang-5.0 CXX=clang++-5.0"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- sourceline: 'deb https://apt.llvm.org/trusty/ llvm-toolchain-trusty-6.0 main'
key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'
packages:
- clang-6.0
env:
- MATRIX_EVAL="CC=clang-6.0 CXX=clang++-6.0"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- sourceline: 'deb https://apt.llvm.org/trusty/ llvm-toolchain-trusty-7 main'
key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'
packages:
- clang-7
env:
- MATRIX_EVAL="CC=clang-7 CXX=clang++-7"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- sourceline: 'deb https://apt.llvm.org/trusty/ llvm-toolchain-trusty-8 main'
key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'
packages:
- clang-8
env:
- MATRIX_EVAL="CC=clang-8 CXX=clang++-8"
before_install:
- eval "${MATRIX_EVAL}"

151
extern/re2/BUILD vendored
View File

@ -9,19 +9,21 @@ licenses(["notice"])
exports_files(["LICENSE"])
config_setting(
name = "darwin",
name = "macos",
values = {"cpu": "darwin"},
)
config_setting(
name = "wasm",
values = {"cpu": "wasm32"},
)
config_setting(
name = "windows",
values = {"cpu": "x64_windows"},
)
config_setting(
name = "windows_msvc",
values = {"cpu": "x64_windows_msvc"},
)
load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test")
cc_library(
name = "re2",
@ -36,6 +38,7 @@ cc_library(
"re2/onepass.cc",
"re2/parse.cc",
"re2/perl_groups.cc",
"re2/pod_array.h",
"re2/prefilter.cc",
"re2/prefilter.h",
"re2/prefilter_tree.cc",
@ -47,6 +50,8 @@ cc_library(
"re2/regexp.h",
"re2/set.cc",
"re2/simplify.cc",
"re2/sparse_array.h",
"re2/sparse_set.h",
"re2/stringpiece.cc",
"re2/tostring.cc",
"re2/unicode_casefold.cc",
@ -54,14 +59,10 @@ cc_library(
"re2/unicode_groups.cc",
"re2/unicode_groups.h",
"re2/walker-inl.h",
"util/flags.h",
"util/logging.h",
"util/mix.h",
"util/mutex.h",
"util/pod_array.h",
"util/rune.cc",
"util/sparse_array.h",
"util/sparse_set.h",
"util/strutil.cc",
"util/strutil.h",
"util/utf.h",
@ -74,17 +75,17 @@ cc_library(
"re2/stringpiece.h",
],
copts = select({
":wasm": [],
":windows": [],
":windows_msvc": [],
"//conditions:default": ["-pthread"],
}),
linkopts = select({
# Darwin doesn't need `-pthread' when linking and it appears that
# macOS doesn't need `-pthread' when linking and it appears that
# older versions of Clang will warn about the unused command line
# argument, so just don't pass it.
":darwin": [],
":macos": [],
":wasm": [],
":windows": [],
":windows_msvc": [],
"//conditions:default": ["-pthread"],
}),
visibility = ["//visibility:public"],
@ -109,6 +110,8 @@ cc_library(
"re2/testing/string_generator.h",
"re2/testing/tester.h",
"util/benchmark.h",
"util/flags.h",
"util/malloc_counter.h",
"util/pcre.h",
"util/test.h",
],
@ -122,106 +125,144 @@ cc_library(
deps = [":testing"],
)
load(":re2_test.bzl", "re2_test")
re2_test(
"charclass_test",
cc_test(
name = "charclass_test",
size = "small",
srcs = ["re2/testing/charclass_test.cc"],
deps = [":test"],
)
re2_test(
"compile_test",
cc_test(
name = "compile_test",
size = "small",
srcs = ["re2/testing/compile_test.cc"],
deps = [":test"],
)
re2_test(
"filtered_re2_test",
cc_test(
name = "filtered_re2_test",
size = "small",
srcs = ["re2/testing/filtered_re2_test.cc"],
deps = [":test"],
)
re2_test(
"mimics_pcre_test",
cc_test(
name = "mimics_pcre_test",
size = "small",
srcs = ["re2/testing/mimics_pcre_test.cc"],
deps = [":test"],
)
re2_test(
"parse_test",
cc_test(
name = "parse_test",
size = "small",
srcs = ["re2/testing/parse_test.cc"],
deps = [":test"],
)
re2_test(
"possible_match_test",
cc_test(
name = "possible_match_test",
size = "small",
srcs = ["re2/testing/possible_match_test.cc"],
deps = [":test"],
)
re2_test(
"re2_arg_test",
cc_test(
name = "re2_arg_test",
size = "small",
srcs = ["re2/testing/re2_arg_test.cc"],
deps = [":test"],
)
re2_test(
"re2_test",
cc_test(
name = "re2_test",
size = "small",
srcs = ["re2/testing/re2_test.cc"],
deps = [":test"],
)
re2_test(
"regexp_test",
cc_test(
name = "regexp_test",
size = "small",
srcs = ["re2/testing/regexp_test.cc"],
deps = [":test"],
)
re2_test(
"required_prefix_test",
cc_test(
name = "required_prefix_test",
size = "small",
srcs = ["re2/testing/required_prefix_test.cc"],
deps = [":test"],
)
re2_test(
"search_test",
cc_test(
name = "search_test",
size = "small",
srcs = ["re2/testing/search_test.cc"],
deps = [":test"],
)
re2_test(
"set_test",
cc_test(
name = "set_test",
size = "small",
srcs = ["re2/testing/set_test.cc"],
deps = [":test"],
)
re2_test(
"simplify_test",
cc_test(
name = "simplify_test",
size = "small",
srcs = ["re2/testing/simplify_test.cc"],
deps = [":test"],
)
re2_test(
"string_generator_test",
cc_test(
name = "string_generator_test",
size = "small",
srcs = ["re2/testing/string_generator_test.cc"],
deps = [":test"],
)
re2_test(
"dfa_test",
cc_test(
name = "dfa_test",
size = "large",
srcs = ["re2/testing/dfa_test.cc"],
deps = [":test"],
)
re2_test(
"exhaustive1_test",
cc_test(
name = "exhaustive1_test",
size = "large",
srcs = ["re2/testing/exhaustive1_test.cc"],
deps = [":test"],
)
re2_test(
"exhaustive2_test",
cc_test(
name = "exhaustive2_test",
size = "large",
srcs = ["re2/testing/exhaustive2_test.cc"],
deps = [":test"],
)
re2_test(
"exhaustive3_test",
cc_test(
name = "exhaustive3_test",
size = "large",
srcs = ["re2/testing/exhaustive3_test.cc"],
deps = [":test"],
)
re2_test(
"exhaustive_test",
cc_test(
name = "exhaustive_test",
size = "large",
srcs = ["re2/testing/exhaustive_test.cc"],
deps = [":test"],
)
re2_test(
"random_test",
cc_test(
name = "random_test",
size = "large",
srcs = ["re2/testing/random_test.cc"],
deps = [":test"],
)
cc_library(

View File

@ -2,8 +2,8 @@
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Old enough to support Ubuntu Trusty.
cmake_minimum_required(VERSION 2.8.12)
# Old enough to support Ubuntu Xenial.
cmake_minimum_required(VERSION 3.5.1)
if(POLICY CMP0048)
cmake_policy(SET CMP0048 NEW)
@ -11,6 +11,12 @@ endif()
project(RE2 CXX)
include(CTest)
include(GNUInstallDirs)
if(NOT CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
endif()
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
option(USEPCRE "use PCRE in tests and benchmarks" OFF)
@ -19,6 +25,10 @@ option(USEPCRE "use PCRE in tests and benchmarks" OFF)
# so we provide an option similar to BUILD_TESTING, but just for RE2.
option(RE2_BUILD_TESTING "enable testing for RE2" ON)
# ABI version
# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
set(SONAME 9)
set(EXTRA_TARGET_LINK_LIBRARIES)
if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
@ -27,7 +37,6 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
endif()
if(BUILD_SHARED_LIBS)
# See http://www.kitware.com/blog/home/post/939 for details.
cmake_minimum_required(VERSION 3.4)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
# CMake defaults to /W3, but some users like /W4 (or /Wall) and /WX,
@ -36,13 +45,6 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
# Without a byte order mark (BOM), Visual Studio assumes that the source
# file is encoded using the current user code page, so we specify UTF-8.
add_compile_options(/utf-8)
# allow multi-processor compilation
add_compile_options(/MP)
elseif(CYGWIN OR MINGW)
# See https://stackoverflow.com/questions/38139631 for details.
add_compile_options(-std=gnu++11)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
add_compile_options(-std=c++11)
endif()
if(WIN32)
@ -58,8 +60,6 @@ if(USEPCRE)
list(APPEND EXTRA_TARGET_LINK_LIBRARIES pcre)
endif()
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
set(RE2_SOURCES
re2/bitstate.cc
re2/compile.cc
@ -86,6 +86,8 @@ set(RE2_SOURCES
)
add_library(re2 ${RE2_SOURCES})
target_include_directories(re2 PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
set_target_properties(re2 PROPERTIES SOVERSION ${SONAME} VERSION ${SONAME}.0.0)
add_library(re2::re2 ALIAS re2)
if(RE2_BUILD_TESTING)
@ -101,6 +103,7 @@ if(RE2_BUILD_TESTING)
)
add_library(testing STATIC ${TESTING_SOURCES})
target_link_libraries(testing PUBLIC re2)
set(TEST_TARGETS
charclass_test
@ -132,13 +135,13 @@ if(RE2_BUILD_TESTING)
foreach(target ${TEST_TARGETS})
add_executable(${target} re2/testing/${target}.cc util/test.cc)
target_link_libraries(${target} testing re2 ${EXTRA_TARGET_LINK_LIBRARIES})
target_link_libraries(${target} testing ${EXTRA_TARGET_LINK_LIBRARIES})
add_test(NAME ${target} COMMAND ${target})
endforeach(target)
foreach(target ${BENCHMARK_TARGETS})
add_executable(${target} re2/testing/${target}.cc util/benchmark.cc)
target_link_libraries(${target} testing re2 ${EXTRA_TARGET_LINK_LIBRARIES})
target_link_libraries(${target} testing ${EXTRA_TARGET_LINK_LIBRARIES})
endforeach(target)
endif()
@ -149,6 +152,12 @@ set(RE2_HEADERS
re2/stringpiece.h
)
install(FILES ${RE2_HEADERS} DESTINATION include/re2)
install(TARGETS re2 EXPORT re2Config ARCHIVE DESTINATION lib LIBRARY DESTINATION lib RUNTIME DESTINATION bin INCLUDES DESTINATION include)
install(EXPORT re2Config DESTINATION lib/cmake/re2 NAMESPACE re2::)
install(FILES ${RE2_HEADERS}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/re2)
install(TARGETS re2 EXPORT re2Config
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
install(EXPORT re2Config
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2 NAMESPACE re2::)

88
extern/re2/Makefile vendored
View File

@ -44,7 +44,7 @@ endif
# ABI version
# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
SONAME=0
SONAME=9
# To rebuild the Tables generated by Perl and Python scripts (requires Internet
# access for Unicode data), uncomment the following line:
@ -55,7 +55,7 @@ ifeq ($(shell uname),Darwin)
SOEXT=dylib
SOEXTVER=$(SONAME).$(SOEXT)
SOEXTVER00=$(SONAME).0.0.$(SOEXT)
MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(RE2_LDFLAGS) $(LDFLAGS)
MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-compatibility_version,$(SONAME),-current_version,$(SONAME).0.0,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin $(RE2_LDFLAGS) $(LDFLAGS)
else ifeq ($(shell uname),SunOS)
SOEXT=so
SOEXTVER=$(SOEXT).$(SONAME)
@ -68,6 +68,7 @@ SOEXTVER00=$(SOEXT).$(SONAME).0.0
MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(RE2_LDFLAGS) $(LDFLAGS)
endif
.PHONY: all
all: obj/libre2.a obj/so/libre2.$(SOEXT)
INSTALL_HFILES=\
@ -80,24 +81,25 @@ HFILES=\
util/benchmark.h\
util/flags.h\
util/logging.h\
util/malloc_counter.h\
util/mix.h\
util/mutex.h\
util/pcre.h\
util/pod_array.h\
util/sparse_array.h\
util/sparse_set.h\
util/strutil.h\
util/test.h\
util/utf.h\
util/util.h\
re2/bitmap256.h\
re2/filtered_re2.h\
re2/pod_array.h\
re2/prefilter.h\
re2/prefilter_tree.h\
re2/prog.h\
re2/re2.h\
re2/regexp.h\
re2/set.h\
re2/sparse_array.h\
re2/sparse_set.h\
re2/stringpiece.h\
re2/testing/exhaustive_tester.h\
re2/testing/regexp_generator.h\
@ -175,117 +177,156 @@ DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES))
DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS))
DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS))
.PRECIOUS: obj/%.o
obj/%.o: %.cc $(HFILES)
@mkdir -p $$(dirname $@)
$(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc
.PRECIOUS: obj/dbg/%.o
obj/dbg/%.o: %.cc $(HFILES)
@mkdir -p $$(dirname $@)
$(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) $*.cc
.PRECIOUS: obj/so/%.o
obj/so/%.o: %.cc $(HFILES)
@mkdir -p $$(dirname $@)
$(CXX) -c -o $@ -fPIC $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc
.PRECIOUS: obj/libre2.a
obj/libre2.a: $(OFILES)
@mkdir -p obj
$(AR) $(ARFLAGS) obj/libre2.a $(OFILES)
.PRECIOUS: obj/dbg/libre2.a
obj/dbg/libre2.a: $(DOFILES)
@mkdir -p obj/dbg
$(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES)
obj/so/libre2.$(SOEXT): $(SOFILES)
.PRECIOUS: obj/so/libre2.$(SOEXT)
obj/so/libre2.$(SOEXT): $(SOFILES) libre2.symbols libre2.symbols.darwin
@mkdir -p obj/so
$(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES)
ln -sf libre2.$(SOEXTVER) $@
.PRECIOUS: obj/dbg/test/%
obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o
@mkdir -p obj/dbg/test
$(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
.PRECIOUS: obj/test/%
obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
@mkdir -p obj/test
$(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
# Test the shared lib, falling back to the static lib for private symbols
.PRECIOUS: obj/so/test/%
obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
@mkdir -p obj/so/test
$(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o -Lobj/so -lre2 obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
# Filter out dump.o because testing::TempDir() isn't available for it.
obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o
@mkdir -p obj/test
$(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
$(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(filter-out obj/re2/testing/dump.o, $(TESTOFILES)) obj/util/benchmark.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
# re2_fuzzer is a target for fuzzers like libFuzzer and AFL. This fake fuzzing
# is simply a way to check that the target builds and then to run it against a
# fixed set of inputs. To perform real fuzzing, refer to the documentation for
# libFuzzer (llvm.org/docs/LibFuzzer.html) and AFL (lcamtuf.coredump.cx/afl/).
obj/test/re2_fuzzer: CXXFLAGS:=-I./re2/fuzzing/compiler-rt/include $(CXXFLAGS)
obj/test/re2_fuzzer: obj/libre2.a obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o
@mkdir -p obj/test
$(CXX) -o $@ obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
ifdef REBUILD_TABLES
.PRECIOUS: re2/perl_groups.cc
re2/perl_groups.cc: re2/make_perl_groups.pl
perl $< > $@
.PRECIOUS: re2/unicode_%.cc
re2/unicode_%.cc: re2/make_unicode_%.py
python $< > $@
.PRECIOUS: re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc
endif
.PHONY: distclean
distclean: clean
rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc
.PHONY: clean
clean:
rm -rf obj
rm -f re2/*.pyc
.PHONY: testofiles
testofiles: $(TESTOFILES)
.PHONY: test
test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test
.PHONY: debug-test
debug-test: $(DTESTS)
@./runtests $(DTESTS)
.PHONY: static-test
static-test: $(TESTS)
@./runtests $(TESTS)
.PHONY: shared-test
shared-test: $(STESTS)
@./runtests -shared-library-path obj/so $(STESTS)
.PHONY: debug-bigtest
debug-bigtest: $(DTESTS) $(DBIGTESTS)
@./runtests $(DTESTS) $(DBIGTESTS)
.PHONY: static-bigtest
static-bigtest: $(TESTS) $(BIGTESTS)
@./runtests $(TESTS) $(BIGTESTS)
.PHONY: shared-bigtest
shared-bigtest: $(STESTS) $(SBIGTESTS)
@./runtests -shared-library-path obj/so $(STESTS) $(SBIGTESTS)
.PHONY: benchmark
benchmark: obj/test/regexp_benchmark
.PHONY: fuzz
fuzz: obj/test/re2_fuzzer
install: obj/libre2.a obj/so/libre2.$(SOEXT)
mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig
$(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
.PHONY: install
install: static-install shared-install
.PHONY: static
static: obj/libre2.a
.PHONY: static-install
static-install: obj/libre2.a common-install
$(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a
.PHONY: shared
shared: obj/so/libre2.$(SOEXT)
.PHONY: shared-install
shared-install: obj/so/libre2.$(SOEXT) common-install
$(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00)
ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER)
ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT)
$(INSTALL_DATA) re2.pc $(DESTDIR)$(libdir)/pkgconfig/re2.pc
$(SED_INPLACE) -e "s#@prefix@#${prefix}#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
$(SED_INPLACE) -e "s#@exec_prefix@#${exec_prefix}#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
$(SED_INPLACE) -e "s#@includedir@#${includedir}#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
$(SED_INPLACE) -e "s#@libdir@#${libdir}#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
.PHONY: common-install
common-install:
mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig
$(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
$(INSTALL_DATA) re2.pc $(DESTDIR)$(libdir)/pkgconfig/re2.pc
$(SED_INPLACE) -e "s#@includedir@#$(includedir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
$(SED_INPLACE) -e "s#@libdir@#$(libdir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc
.PHONY: testinstall
testinstall: static-testinstall shared-testinstall
@echo
@echo Install tests passed.
@echo
.PHONY: static-testinstall
static-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS)
static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDFLAGS)
static-testinstall:
@ -300,6 +341,7 @@ else
obj/testinstall
endif
.PHONY: shared-testinstall
shared-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS)
shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDFLAGS)
shared-testinstall:
@ -312,19 +354,14 @@ else
LD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(LD_LIBRARY_PATH)" obj/testinstall
endif
.PHONY: benchlog
benchlog: obj/test/regexp_benchmark
(echo '==BENCHMARK==' `hostname` `date`; \
(uname -a; $(CXX) --version; git rev-parse --short HEAD; file obj/test/regexp_benchmark) | sed 's/^/# /'; \
echo; \
./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//')
# Keep gmake from deleting intermediate files it creates.
# This makes repeated builds faster and preserves debug info on OS X.
.PRECIOUS: obj/%.o obj/dbg/%.o obj/so/%.o obj/libre2.a \
obj/dbg/libre2.a obj/so/libre2.a \
obj/test/% obj/so/test/% obj/dbg/test/%
.PHONY: log
log:
$(MAKE) clean
$(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \
@ -340,6 +377,3 @@ log:
echo '#' RE2 basic search tests built by make $@ >re2-search.txt
echo '#' $$(date) >>re2-search.txt
obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt
x: x.cc obj/libre2.a
g++ -I. -o x x.cc obj/libre2.a

5
extern/re2/README vendored
View File

@ -27,12 +27,15 @@ under the BSD-style license found in the LICENSE file.
RE2's native language is C++.
The Python wrapper is at https://github.com/google/re2/tree/abseil/python
and on PyPI (https://pypi.org/project/google-re2/).
A C wrapper is at https://github.com/marcomaggi/cre2/.
An Erlang wrapper is at https://github.com/dukesoferl/re2/ and on Hex (hex.pm).
An Inferno wrapper is at https://github.com/powerman/inferno-re2/.
A Node.js wrapper is at https://github.com/uhop/node-re2/ and on NPM (npmjs.com).
An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM (opam.ocaml.org).
A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN (cpan.org).
A Python wrapper is at https://github.com/facebook/pyre2/ and on PyPI (pypi.org).
An R wrapper is at https://github.com/qinwf/re2r/ and on CRAN (cran.r-project.org).
A Ruby wrapper is at https://github.com/mudge/re2/ and on RubyGems (rubygems.org).
A WebAssembly wrapper is at https://github.com/google/re2-wasm/ and on NPM (npmjs.com).

View File

@ -3,4 +3,13 @@
# license that can be found in the LICENSE file.
# Bazel (http://bazel.io/) WORKSPACE file for RE2.
workspace(name = "com_googlesource_code_re2")
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
http_archive(
name = "rules_cc",
strip_prefix = "rules_cc-master",
urls = ["https://github.com/bazelbuild/rules_cc/archive/master.zip"],
)

0
extern/re2/benchlog/benchplot.py vendored Normal file → Executable file
View File

0
extern/re2/benchlog/mktable vendored Normal file → Executable file
View File

3
extern/re2/doc/mksyntaxgo vendored Normal file → Executable file
View File

@ -15,7 +15,7 @@ sam -d $out <<'!'
,s/\n\n\n+/\n\n/g
,x/(^.* .*\n)+/ | awk -F' ' '{printf(" %-14s %s\n", $1, $2)}'
1,2c
// Copyright 2012 The Go Authors. All rights reserved.
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
@ -33,6 +33,7 @@ Parts of the syntax can be disabled by passing alternate flags to Parse.
.
$a
Unicode character classes are those in unicode.Categories and unicode.Scripts.
*/
package syntax
.

0
extern/re2/doc/mksyntaxhtml vendored Normal file → Executable file
View File

0
extern/re2/doc/mksyntaxwiki vendored Normal file → Executable file
View File

View File

@ -47,6 +47,10 @@
<tr><td><code><font color=#808080>x{-n}</font></code></td><td>(≡ <code>x{n}?</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>x=</font></code></td><td>(≡ <code>x?</code>) <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2>Implementation restriction: The counting forms <code>x{n,m}</code>, <code>x{n,}</code>, and <code>x{n}</code></td></tr>
<tr><td colspan=2>reject forms that create a minimum or maximum repetition count above 1000.</td></tr>
<tr><td colspan=2>Unlimited repetitions are not subject to this restriction.</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Possessive repetitions:</b></td></tr>
<tr><td><code><font color=#808080>x*+</font></code></td><td>zero or more <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x++</font></code></td><td>one or more <code>x</code>, possessive </td></tr>
@ -56,10 +60,10 @@
<tr><td><code><font color=#808080>x{n}+</font></code></td><td>exactly <code>n</code> <code>x</code>, possessive </td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Grouping:</b></td></tr>
<tr><td><code>(re)</code></td><td>numbered capturing group</td></tr>
<tr><td><code>(?P&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group</td></tr>
<tr><td><code><font color=#808080>(?&lt;name&gt;re)</font></code></td><td>named &amp; numbered capturing group </td></tr>
<tr><td><code><font color=#808080>(?'name're)</font></code></td><td>named &amp; numbered capturing group </td></tr>
<tr><td><code>(re)</code></td><td>numbered capturing group (submatch)</td></tr>
<tr><td><code>(?P&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group (submatch)</td></tr>
<tr><td><code><font color=#808080>(?&lt;name&gt;re)</font></code></td><td>named &amp; numbered capturing group (submatch) </td></tr>
<tr><td><code><font color=#808080>(?'name're)</font></code></td><td>named &amp; numbered capturing group (submatch) </td></tr>
<tr><td><code>(?:re)</code></td><td>non-capturing group</td></tr>
<tr><td><code>(?flags)</code></td><td>set flags within current group; non-capturing</td></tr>
<tr><td><code>(?flags:re)</code></td><td>set flags during re; non-capturing</td></tr>
@ -80,8 +84,8 @@
<tr><td><code>^</code></td><td>at beginning of text or line (<code>m</code>=true)</td></tr>
<tr><td><code>$</code></td><td>at end of text (like <code>\z</code> not <code>\Z</code>) or line (<code>m</code>=true)</td></tr>
<tr><td><code>\A</code></td><td>at beginning of text</td></tr>
<tr><td><code>\b</code></td><td>at word boundary (<code>\w</code> on one side and <code>\W</code>, <code>\A</code>, or <code>\z</code> on the other)</td></tr>
<tr><td><code>\B</code></td><td>not a word boundary</td></tr>
<tr><td><code>\b</code></td><td>at ASCII word boundary (<code>\w</code> on one side and <code>\W</code>, <code>\A</code>, or <code>\z</code> on the other)</td></tr>
<tr><td><code>\B</code></td><td>not at ASCII word boundary</td></tr>
<tr><td><code><font color=#808080>\G</font></code></td><td>at beginning of subtext being searched <font size=-2>PCRE</font></td></tr>
<tr><td><code><font color=#808080>\G</font></code></td><td>at end of last match <font size=-2>PERL</font></td></tr>
<tr><td><code><font color=#808080>\Z</font></code></td><td>at end of text, or before newline at end of text </td></tr>
@ -166,7 +170,7 @@
<tr><td><code>[\p{Name}]</code></td><td>named Unicode property inside character class (≡ <code>\p{Name}</code>)</td></tr>
<tr><td><code>[^\p{Name}]</code></td><td>named Unicode property inside negated character class (≡ <code>\P{Name}</code>)</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Perl character classes:</b></td></tr>
<tr><td colspan=2><b>Perl character classes (all ASCII-only):</b></td></tr>
<tr><td><code>\d</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr>
<tr><td><code>\D</code></td><td>not digits (≡ <code>[^0-9]</code>)</td></tr>
<tr><td><code>\s</code></td><td>whitespace (≡ <code>[\t\n\f\r ]</code>)</td></tr>
@ -237,105 +241,162 @@
<tr><td><code>Zs</code></td><td>space separator</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Unicode character class names--scripts:</b></td></tr>
<tr><td><code>Arabic</code></td><td>Arabic</td></tr>
<tr><td><code>Armenian</code></td><td>Armenian</td></tr>
<tr><td><code>Balinese</code></td><td>Balinese</td></tr>
<tr><td><code>Bamum</code></td><td>Bamum</td></tr>
<tr><td><code>Batak</code></td><td>Batak</td></tr>
<tr><td><code>Bengali</code></td><td>Bengali</td></tr>
<tr><td><code>Bopomofo</code></td><td>Bopomofo</td></tr>
<tr><td><code>Brahmi</code></td><td>Brahmi</td></tr>
<tr><td><code>Braille</code></td><td>Braille</td></tr>
<tr><td><code>Buginese</code></td><td>Buginese</td></tr>
<tr><td><code>Buhid</code></td><td>Buhid</td></tr>
<tr><td><code>Canadian_Aboriginal</code></td><td>Canadian Aboriginal</td></tr>
<tr><td><code>Carian</code></td><td>Carian</td></tr>
<tr><td><code>Chakma</code></td><td>Chakma</td></tr>
<tr><td><code>Cham</code></td><td>Cham</td></tr>
<tr><td><code>Cherokee</code></td><td>Cherokee</td></tr>
<tr><td><code>Common</code></td><td>characters not specific to one script</td></tr>
<tr><td><code>Coptic</code></td><td>Coptic</td></tr>
<tr><td><code>Cuneiform</code></td><td>Cuneiform</td></tr>
<tr><td><code>Cypriot</code></td><td>Cypriot</td></tr>
<tr><td><code>Cyrillic</code></td><td>Cyrillic</td></tr>
<tr><td><code>Deseret</code></td><td>Deseret</td></tr>
<tr><td><code>Devanagari</code></td><td>Devanagari</td></tr>
<tr><td><code>Egyptian_Hieroglyphs</code></td><td>Egyptian Hieroglyphs</td></tr>
<tr><td><code>Ethiopic</code></td><td>Ethiopic</td></tr>
<tr><td><code>Georgian</code></td><td>Georgian</td></tr>
<tr><td><code>Glagolitic</code></td><td>Glagolitic</td></tr>
<tr><td><code>Gothic</code></td><td>Gothic</td></tr>
<tr><td><code>Greek</code></td><td>Greek</td></tr>
<tr><td><code>Gujarati</code></td><td>Gujarati</td></tr>
<tr><td><code>Gurmukhi</code></td><td>Gurmukhi</td></tr>
<tr><td><code>Han</code></td><td>Han</td></tr>
<tr><td><code>Hangul</code></td><td>Hangul</td></tr>
<tr><td><code>Hanunoo</code></td><td>Hanunoo</td></tr>
<tr><td><code>Hebrew</code></td><td>Hebrew</td></tr>
<tr><td><code>Hiragana</code></td><td>Hiragana</td></tr>
<tr><td><code>Imperial_Aramaic</code></td><td>Imperial Aramaic</td></tr>
<tr><td><code>Inherited</code></td><td>inherit script from previous character</td></tr>
<tr><td><code>Inscriptional_Pahlavi</code></td><td>Inscriptional Pahlavi</td></tr>
<tr><td><code>Inscriptional_Parthian</code></td><td>Inscriptional Parthian</td></tr>
<tr><td><code>Javanese</code></td><td>Javanese</td></tr>
<tr><td><code>Kaithi</code></td><td>Kaithi</td></tr>
<tr><td><code>Kannada</code></td><td>Kannada</td></tr>
<tr><td><code>Katakana</code></td><td>Katakana</td></tr>
<tr><td><code>Kayah_Li</code></td><td>Kayah Li</td></tr>
<tr><td><code>Kharoshthi</code></td><td>Kharoshthi</td></tr>
<tr><td><code>Khmer</code></td><td>Khmer</td></tr>
<tr><td><code>Lao</code></td><td>Lao</td></tr>
<tr><td><code>Latin</code></td><td>Latin</td></tr>
<tr><td><code>Lepcha</code></td><td>Lepcha</td></tr>
<tr><td><code>Limbu</code></td><td>Limbu</td></tr>
<tr><td><code>Linear_B</code></td><td>Linear B</td></tr>
<tr><td><code>Lycian</code></td><td>Lycian</td></tr>
<tr><td><code>Lydian</code></td><td>Lydian</td></tr>
<tr><td><code>Malayalam</code></td><td>Malayalam</td></tr>
<tr><td><code>Mandaic</code></td><td>Mandaic</td></tr>
<tr><td><code>Meetei_Mayek</code></td><td>Meetei Mayek</td></tr>
<tr><td><code>Meroitic_Cursive</code></td><td>Meroitic Cursive</td></tr>
<tr><td><code>Meroitic_Hieroglyphs</code></td><td>Meroitic Hieroglyphs</td></tr>
<tr><td><code>Miao</code></td><td>Miao</td></tr>
<tr><td><code>Mongolian</code></td><td>Mongolian</td></tr>
<tr><td><code>Myanmar</code></td><td>Myanmar</td></tr>
<tr><td><code>New_Tai_Lue</code></td><td>New Tai Lue (aka Simplified Tai Lue)</td></tr>
<tr><td><code>Nko</code></td><td>Nko</td></tr>
<tr><td><code>Ogham</code></td><td>Ogham</td></tr>
<tr><td><code>Ol_Chiki</code></td><td>Ol Chiki</td></tr>
<tr><td><code>Old_Italic</code></td><td>Old Italic</td></tr>
<tr><td><code>Old_Persian</code></td><td>Old Persian</td></tr>
<tr><td><code>Old_South_Arabian</code></td><td>Old South Arabian</td></tr>
<tr><td><code>Old_Turkic</code></td><td>Old Turkic</td></tr>
<tr><td><code>Oriya</code></td><td>Oriya</td></tr>
<tr><td><code>Osmanya</code></td><td>Osmanya</td></tr>
<tr><td><code>Phags_Pa</code></td><td>'Phags Pa</td></tr>
<tr><td><code>Phoenician</code></td><td>Phoenician</td></tr>
<tr><td><code>Rejang</code></td><td>Rejang</td></tr>
<tr><td><code>Runic</code></td><td>Runic</td></tr>
<tr><td><code>Saurashtra</code></td><td>Saurashtra</td></tr>
<tr><td><code>Sharada</code></td><td>Sharada</td></tr>
<tr><td><code>Shavian</code></td><td>Shavian</td></tr>
<tr><td><code>Sinhala</code></td><td>Sinhala</td></tr>
<tr><td><code>Sora_Sompeng</code></td><td>Sora Sompeng</td></tr>
<tr><td><code>Sundanese</code></td><td>Sundanese</td></tr>
<tr><td><code>Syloti_Nagri</code></td><td>Syloti Nagri</td></tr>
<tr><td><code>Syriac</code></td><td>Syriac</td></tr>
<tr><td><code>Tagalog</code></td><td>Tagalog</td></tr>
<tr><td><code>Tagbanwa</code></td><td>Tagbanwa</td></tr>
<tr><td><code>Tai_Le</code></td><td>Tai Le</td></tr>
<tr><td><code>Tai_Tham</code></td><td>Tai Tham</td></tr>
<tr><td><code>Tai_Viet</code></td><td>Tai Viet</td></tr>
<tr><td><code>Takri</code></td><td>Takri</td></tr>
<tr><td><code>Tamil</code></td><td>Tamil</td></tr>
<tr><td><code>Telugu</code></td><td>Telugu</td></tr>
<tr><td><code>Thaana</code></td><td>Thaana</td></tr>
<tr><td><code>Thai</code></td><td>Thai</td></tr>
<tr><td><code>Tibetan</code></td><td>Tibetan</td></tr>
<tr><td><code>Tifinagh</code></td><td>Tifinagh</td></tr>
<tr><td><code>Ugaritic</code></td><td>Ugaritic</td></tr>
<tr><td><code>Vai</code></td><td>Vai</td></tr>
<tr><td><code>Yi</code></td><td>Yi</td></tr>
<tr><td colspan=2>Adlam</td></tr>
<tr><td colspan=2>Ahom</td></tr>
<tr><td colspan=2>Anatolian_Hieroglyphs</td></tr>
<tr><td colspan=2>Arabic</td></tr>
<tr><td colspan=2>Armenian</td></tr>
<tr><td colspan=2>Avestan</td></tr>
<tr><td colspan=2>Balinese</td></tr>
<tr><td colspan=2>Bamum</td></tr>
<tr><td colspan=2>Bassa_Vah</td></tr>
<tr><td colspan=2>Batak</td></tr>
<tr><td colspan=2>Bengali</td></tr>
<tr><td colspan=2>Bhaiksuki</td></tr>
<tr><td colspan=2>Bopomofo</td></tr>
<tr><td colspan=2>Brahmi</td></tr>
<tr><td colspan=2>Braille</td></tr>
<tr><td colspan=2>Buginese</td></tr>
<tr><td colspan=2>Buhid</td></tr>
<tr><td colspan=2>Canadian_Aboriginal</td></tr>
<tr><td colspan=2>Carian</td></tr>
<tr><td colspan=2>Caucasian_Albanian</td></tr>
<tr><td colspan=2>Chakma</td></tr>
<tr><td colspan=2>Cham</td></tr>
<tr><td colspan=2>Cherokee</td></tr>
<tr><td colspan=2>Chorasmian</td></tr>
<tr><td colspan=2>Common</td></tr>
<tr><td colspan=2>Coptic</td></tr>
<tr><td colspan=2>Cuneiform</td></tr>
<tr><td colspan=2>Cypriot</td></tr>
<tr><td colspan=2>Cyrillic</td></tr>
<tr><td colspan=2>Deseret</td></tr>
<tr><td colspan=2>Devanagari</td></tr>
<tr><td colspan=2>Dives_Akuru</td></tr>
<tr><td colspan=2>Dogra</td></tr>
<tr><td colspan=2>Duployan</td></tr>
<tr><td colspan=2>Egyptian_Hieroglyphs</td></tr>
<tr><td colspan=2>Elbasan</td></tr>
<tr><td colspan=2>Elymaic</td></tr>
<tr><td colspan=2>Ethiopic</td></tr>
<tr><td colspan=2>Georgian</td></tr>
<tr><td colspan=2>Glagolitic</td></tr>
<tr><td colspan=2>Gothic</td></tr>
<tr><td colspan=2>Grantha</td></tr>
<tr><td colspan=2>Greek</td></tr>
<tr><td colspan=2>Gujarati</td></tr>
<tr><td colspan=2>Gunjala_Gondi</td></tr>
<tr><td colspan=2>Gurmukhi</td></tr>
<tr><td colspan=2>Han</td></tr>
<tr><td colspan=2>Hangul</td></tr>
<tr><td colspan=2>Hanifi_Rohingya</td></tr>
<tr><td colspan=2>Hanunoo</td></tr>
<tr><td colspan=2>Hatran</td></tr>
<tr><td colspan=2>Hebrew</td></tr>
<tr><td colspan=2>Hiragana</td></tr>
<tr><td colspan=2>Imperial_Aramaic</td></tr>
<tr><td colspan=2>Inherited</td></tr>
<tr><td colspan=2>Inscriptional_Pahlavi</td></tr>
<tr><td colspan=2>Inscriptional_Parthian</td></tr>
<tr><td colspan=2>Javanese</td></tr>
<tr><td colspan=2>Kaithi</td></tr>
<tr><td colspan=2>Kannada</td></tr>
<tr><td colspan=2>Katakana</td></tr>
<tr><td colspan=2>Kayah_Li</td></tr>
<tr><td colspan=2>Kharoshthi</td></tr>
<tr><td colspan=2>Khitan_Small_Script</td></tr>
<tr><td colspan=2>Khmer</td></tr>
<tr><td colspan=2>Khojki</td></tr>
<tr><td colspan=2>Khudawadi</td></tr>
<tr><td colspan=2>Lao</td></tr>
<tr><td colspan=2>Latin</td></tr>
<tr><td colspan=2>Lepcha</td></tr>
<tr><td colspan=2>Limbu</td></tr>
<tr><td colspan=2>Linear_A</td></tr>
<tr><td colspan=2>Linear_B</td></tr>
<tr><td colspan=2>Lisu</td></tr>
<tr><td colspan=2>Lycian</td></tr>
<tr><td colspan=2>Lydian</td></tr>
<tr><td colspan=2>Mahajani</td></tr>
<tr><td colspan=2>Makasar</td></tr>
<tr><td colspan=2>Malayalam</td></tr>
<tr><td colspan=2>Mandaic</td></tr>
<tr><td colspan=2>Manichaean</td></tr>
<tr><td colspan=2>Marchen</td></tr>
<tr><td colspan=2>Masaram_Gondi</td></tr>
<tr><td colspan=2>Medefaidrin</td></tr>
<tr><td colspan=2>Meetei_Mayek</td></tr>
<tr><td colspan=2>Mende_Kikakui</td></tr>
<tr><td colspan=2>Meroitic_Cursive</td></tr>
<tr><td colspan=2>Meroitic_Hieroglyphs</td></tr>
<tr><td colspan=2>Miao</td></tr>
<tr><td colspan=2>Modi</td></tr>
<tr><td colspan=2>Mongolian</td></tr>
<tr><td colspan=2>Mro</td></tr>
<tr><td colspan=2>Multani</td></tr>
<tr><td colspan=2>Myanmar</td></tr>
<tr><td colspan=2>Nabataean</td></tr>
<tr><td colspan=2>Nandinagari</td></tr>
<tr><td colspan=2>New_Tai_Lue</td></tr>
<tr><td colspan=2>Newa</td></tr>
<tr><td colspan=2>Nko</td></tr>
<tr><td colspan=2>Nushu</td></tr>
<tr><td colspan=2>Nyiakeng_Puachue_Hmong</td></tr>
<tr><td colspan=2>Ogham</td></tr>
<tr><td colspan=2>Ol_Chiki</td></tr>
<tr><td colspan=2>Old_Hungarian</td></tr>
<tr><td colspan=2>Old_Italic</td></tr>
<tr><td colspan=2>Old_North_Arabian</td></tr>
<tr><td colspan=2>Old_Permic</td></tr>
<tr><td colspan=2>Old_Persian</td></tr>
<tr><td colspan=2>Old_Sogdian</td></tr>
<tr><td colspan=2>Old_South_Arabian</td></tr>
<tr><td colspan=2>Old_Turkic</td></tr>
<tr><td colspan=2>Oriya</td></tr>
<tr><td colspan=2>Osage</td></tr>
<tr><td colspan=2>Osmanya</td></tr>
<tr><td colspan=2>Pahawh_Hmong</td></tr>
<tr><td colspan=2>Palmyrene</td></tr>
<tr><td colspan=2>Pau_Cin_Hau</td></tr>
<tr><td colspan=2>Phags_Pa</td></tr>
<tr><td colspan=2>Phoenician</td></tr>
<tr><td colspan=2>Psalter_Pahlavi</td></tr>
<tr><td colspan=2>Rejang</td></tr>
<tr><td colspan=2>Runic</td></tr>
<tr><td colspan=2>Samaritan</td></tr>
<tr><td colspan=2>Saurashtra</td></tr>
<tr><td colspan=2>Sharada</td></tr>
<tr><td colspan=2>Shavian</td></tr>
<tr><td colspan=2>Siddham</td></tr>
<tr><td colspan=2>SignWriting</td></tr>
<tr><td colspan=2>Sinhala</td></tr>
<tr><td colspan=2>Sogdian</td></tr>
<tr><td colspan=2>Sora_Sompeng</td></tr>
<tr><td colspan=2>Soyombo</td></tr>
<tr><td colspan=2>Sundanese</td></tr>
<tr><td colspan=2>Syloti_Nagri</td></tr>
<tr><td colspan=2>Syriac</td></tr>
<tr><td colspan=2>Tagalog</td></tr>
<tr><td colspan=2>Tagbanwa</td></tr>
<tr><td colspan=2>Tai_Le</td></tr>
<tr><td colspan=2>Tai_Tham</td></tr>
<tr><td colspan=2>Tai_Viet</td></tr>
<tr><td colspan=2>Takri</td></tr>
<tr><td colspan=2>Tamil</td></tr>
<tr><td colspan=2>Tangut</td></tr>
<tr><td colspan=2>Telugu</td></tr>
<tr><td colspan=2>Thaana</td></tr>
<tr><td colspan=2>Thai</td></tr>
<tr><td colspan=2>Tibetan</td></tr>
<tr><td colspan=2>Tifinagh</td></tr>
<tr><td colspan=2>Tirhuta</td></tr>
<tr><td colspan=2>Ugaritic</td></tr>
<tr><td colspan=2>Vai</td></tr>
<tr><td colspan=2>Wancho</td></tr>
<tr><td colspan=2>Warang_Citi</td></tr>
<tr><td colspan=2>Yezidi</td></tr>
<tr><td colspan=2>Yi</td></tr>
<tr><td colspan=2>Zanabazar_Square</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Vim character classes:</b></td></tr>
<tr><td><code><font color=#808080>\i</font></code></td><td>identifier character <font size=-2>VIM</font></td></tr>

View File

@ -253,6 +253,7 @@ Caucasian_Albanian
Chakma
Cham
Cherokee
Chorasmian
Common
Coptic
Cuneiform
@ -260,6 +261,7 @@ Cypriot
Cyrillic
Deseret
Devanagari
Dives_Akuru
Dogra
Duployan
Egyptian_Hieroglyphs
@ -291,6 +293,7 @@ Kannada
Katakana
Kayah_Li
Kharoshthi
Khitan_Small_Script
Khmer
Khojki
Khudawadi
@ -380,6 +383,7 @@ Ugaritic
Vai
Wancho
Warang_Citi
Yezidi
Yi
Zanabazar_Square

View File

@ -1,25 +0,0 @@
#!/bin/bash
set -eux
cd git/re2
case "${KOKORO_JOB_NAME}" in
*/windows-*)
CMAKE_G_A_FLAGS=('-G' 'Visual Studio 14 2015' '-A' 'x64')
;;
*)
CMAKE_G_A_FLAGS=()
# Work around a bug in older versions of bash. :/
set +u
;;
esac
cmake -D CMAKE_BUILD_TYPE=Debug "${CMAKE_G_A_FLAGS[@]}" .
cmake --build . --config Debug --clean-first
ctest -C Debug --output-on-failure -E 'dfa|exhaustive|random'
cmake -D CMAKE_BUILD_TYPE=Release "${CMAKE_G_A_FLAGS[@]}" .
cmake --build . --config Release --clean-first
ctest -C Release --output-on-failure -E 'dfa|exhaustive|random'
exit 0

View File

@ -1 +0,0 @@
build_file: "re2/kokoro/macos-bazel.sh"

View File

@ -1,4 +0,0 @@
#!/bin/bash
set -eux
bash git/re2/kokoro/bazel.sh
exit $?

View File

@ -1 +0,0 @@
build_file: "re2/kokoro/macos-cmake.sh"

View File

@ -1,4 +0,0 @@
#!/bin/bash
set -eux
bash git/re2/kokoro/cmake.sh
exit $?

View File

@ -1 +0,0 @@
build_file: "re2/kokoro/ubuntu-bazel.sh"

View File

@ -1,4 +0,0 @@
#!/bin/bash
set -eux
bash git/re2/kokoro/bazel.sh
exit $?

View File

@ -1,2 +0,0 @@
bash git/re2/kokoro/bazel.sh
EXIT /B %ERRORLEVEL%

View File

@ -1 +0,0 @@
build_file: "re2/kokoro/windows-bazel.bat"

View File

@ -1,2 +0,0 @@
bash git/re2/kokoro/cmake.sh
EXIT /B %ERRORLEVEL%

View File

@ -1 +0,0 @@
build_file: "re2/kokoro/windows-cmake.bat"

0
extern/re2/lib/git/commit-msg.hook vendored Normal file → Executable file
View File

View File

@ -11,6 +11,9 @@
# re2::FilteredRE2*
_ZN3re211FilteredRE2*;
_ZNK3re211FilteredRE2*;
# re2::re2_internal*
_ZN3re212re2_internal*;
_ZNK3re212re2_internal*;
local:
*;
};

View File

@ -10,3 +10,6 @@ __ZN3re2ls*
# re2::FilteredRE2*
__ZN3re211FilteredRE2*
__ZNK3re211FilteredRE2*
# re2::re2_internal*
__ZN3re212re2_internal*
__ZNK3re212re2_internal*

2
extern/re2/re2.pc vendored
View File

@ -1,5 +1,3 @@
prefix=@prefix@
exec_prefix=@exec_prefix@
includedir=@includedir@
libdir=@libdir@

View File

@ -32,7 +32,7 @@ class Bitmap256 {
DCHECK_GE(c, 0);
DCHECK_LE(c, 255);
return (words_[c / 64] & (1ULL << (c % 64))) != 0;
return (words_[c / 64] & (uint64_t{1} << (c % 64))) != 0;
}
// Sets the bit with index c.
@ -40,7 +40,7 @@ class Bitmap256 {
DCHECK_GE(c, 0);
DCHECK_LE(c, 255);
words_[c / 64] |= (1ULL << (c % 64));
words_[c / 64] |= (uint64_t{1} << (c % 64));
}
// Finds the next non-zero bit with index >= c.
@ -51,7 +51,6 @@ class Bitmap256 {
// Finds the least significant non-zero bit in n.
static int FindLSBSet(uint64_t n) {
DCHECK_NE(n, 0);
#if defined(__GNUC__)
return __builtin_ctzll(n);
#elif defined(_MSC_VER) && defined(_M_X64)
@ -89,7 +88,7 @@ int Bitmap256::FindNextSetBit(int c) const {
// Check the word that contains the bit. Mask out any lower bits.
int i = c / 64;
uint64_t word = words_[i] & (~0ULL << (c % 64));
uint64_t word = words_[i] & (~uint64_t{0} << (c % 64));
if (word != 0)
return (i * 64) + FindLSBSet(word);

View File

@ -7,7 +7,7 @@
// Prog::SearchBitState is a regular expression search with submatch
// tracking for small regular expressions and texts. Similarly to
// testing/backtrack.cc, it allocates a bitmap with (count of
// lists) * (length of prog) bits to make sure it never explores the
// lists) * (length of text) bits to make sure it never explores the
// same (instruction list, character position) multiple times. This
// limits the search to run in time linear in the length of the text.
//
@ -24,7 +24,7 @@
#include <utility>
#include "util/logging.h"
#include "util/pod_array.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/regexp.h"
@ -63,11 +63,14 @@ class BitState {
int nsubmatch_; // # of submatches to fill in
// Search state
static const int VisitedBits = 32;
PODArray<uint32_t> visited_; // bitmap: (list ID, char*) pairs visited
static constexpr int kVisitedBits = 64;
PODArray<uint64_t> visited_; // bitmap: (list ID, char*) pairs visited
PODArray<const char*> cap_; // capture registers
PODArray<Job> job_; // stack of text positions to explore
int njob_; // stack size
BitState(const BitState&) = delete;
BitState& operator=(const BitState&) = delete;
};
BitState::BitState(Prog* prog)
@ -86,10 +89,10 @@ BitState::BitState(Prog* prog)
// we don't repeat the visit.
bool BitState::ShouldVisit(int id, const char* p) {
int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
static_cast<int>(p-text_.begin());
if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
static_cast<int>(p-text_.data());
if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1))))
return false;
visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1));
return true;
}
@ -134,7 +137,7 @@ void BitState::Push(int id, const char* p) {
// Return whether it succeeded.
bool BitState::TrySearch(int id0, const char* p0) {
bool matched = false;
const char* end = text_.end();
const char* end = text_.data() + text_.size();
njob_ = 0;
// Push() no longer checks ShouldVisit(),
// so we must perform the check ourselves.
@ -251,7 +254,7 @@ bool BitState::TrySearch(int id0, const char* p0) {
matched = true;
cap_[1] = p;
if (submatch_[0].data() == NULL ||
(longest_ && p > submatch_[0].end())) {
(longest_ && p > submatch_[0].data() + submatch_[0].size())) {
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] =
StringPiece(cap_[2 * i],
@ -288,7 +291,7 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
// Search parameters.
text_ = text;
context_ = context;
if (context_.begin() == NULL)
if (context_.data() == NULL)
context_ = text;
if (prog_->anchor_start() && context_.begin() != text.begin())
return false;
@ -304,8 +307,8 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
// Allocate scratch space.
int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
nvisited = (nvisited + VisitedBits-1) / VisitedBits;
visited_ = PODArray<uint32_t>(nvisited);
nvisited = (nvisited + kVisitedBits-1) / kVisitedBits;
visited_ = PODArray<uint64_t>(nvisited);
memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
int ncap = 2*nsubmatch;
@ -319,8 +322,8 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
// Anchored search must start at text.begin().
if (anchored_) {
cap_[0] = text.begin();
return TrySearch(prog_->start(), text.begin());
cap_[0] = text.data();
return TrySearch(prog_->start(), text.data());
}
// Unanchored search, starting from each possible text position.
@ -329,18 +332,22 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
// This looks like it's quadratic in the size of the text,
// but we are not clearing visited_ between calls to TrySearch,
// so no work is duplicated and it ends up still being linear.
for (const char* p = text.begin(); p <= text.end(); p++) {
// Try to use memchr to find the first byte quickly.
int fb = prog_->first_byte();
if (fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
const char* etext = text.data() + text.size();
for (const char* p = text.data(); p <= etext; p++) {
// Try to use prefix accel (e.g. memchr) to skip ahead.
if (p < etext && prog_->can_prefix_accel()) {
p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p));
if (p == NULL)
p = text.end();
p = etext;
}
cap_[0] = p;
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
return true;
// Avoid invoking undefined behavior (arithmetic on a null pointer)
// by simply not continuing the loop.
if (p == NULL)
break;
}
return false;
}

View File

@ -14,8 +14,8 @@
#include <utility>
#include "util/logging.h"
#include "util/pod_array.h"
#include "util/utf.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
@ -30,91 +30,57 @@ namespace re2 {
// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
//
// Because the out and out1 fields in Inst are no longer pointers,
// we can't use pointers directly here either. Instead, p refers
// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
// p == 0 represents the NULL list. This is okay because instruction #0
// we can't use pointers directly here either. Instead, head refers
// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
// head == 0 represents the NULL list. This is okay because instruction #0
// is always the fail instruction, which never appears on a list.
struct PatchList {
uint32_t p;
// Returns patch list containing just p.
static PatchList Mk(uint32_t p);
static PatchList Mk(uint32_t p) {
return {p, p};
}
// Patches all the entries on l to have value v.
// Patches all the entries on l to have value p.
// Caller must not ever use patch list again.
static void Patch(Prog::Inst *inst0, PatchList l, uint32_t v);
// Deref returns the next pointer pointed at by p.
static PatchList Deref(Prog::Inst *inst0, PatchList l);
// Appends two patch lists and returns result.
static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
};
static PatchList nullPatchList = { 0 };
// Returns patch list containing just p.
PatchList PatchList::Mk(uint32_t p) {
PatchList l;
l.p = p;
return l;
}
// Returns the next pointer pointed at by l.
PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
Prog::Inst* ip = &inst0[l.p>>1];
if (l.p&1)
l.p = ip->out1();
else
l.p = ip->out();
return l;
}
// Patches all the entries on l to have value v.
void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32_t val) {
while (l.p != 0) {
Prog::Inst* ip = &inst0[l.p>>1];
if (l.p&1) {
l.p = ip->out1();
ip->out1_ = val;
} else {
l.p = ip->out();
ip->set_out(val);
static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) {
while (l.head != 0) {
Prog::Inst* ip = &inst0[l.head>>1];
if (l.head&1) {
l.head = ip->out1();
ip->out1_ = p;
} else {
l.head = ip->out();
ip->set_out(p);
}
}
}
}
// Appends two patch lists and returns result.
PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
if (l1.p == 0)
return l2;
if (l2.p == 0)
return l1;
PatchList l = l1;
for (;;) {
PatchList next = PatchList::Deref(inst0, l);
if (next.p == 0)
break;
l = next;
// Appends two patch lists and returns result.
static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
if (l1.head == 0)
return l2;
if (l2.head == 0)
return l1;
Prog::Inst* ip = &inst0[l1.tail>>1];
if (l1.tail&1)
ip->out1_ = l2.head;
else
ip->set_out(l2.head);
return {l1.head, l2.tail};
}
Prog::Inst* ip = &inst0[l.p>>1];
if (l.p&1)
ip->out1_ = l2.p;
else
ip->set_out(l2.p);
uint32_t head;
uint32_t tail; // for constant-time append
};
return l1;
}
static const PatchList kNullPatchList = {0, 0};
// Compiled program fragment.
struct Frag {
uint32_t begin;
PatchList end;
Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector
Frag() : begin(0) { end.head = 0; } // needed so Frag can go in vector
Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {}
};
@ -212,8 +178,8 @@ class Compiler : public Regexp::Walker<Frag> {
int AddSuffixRecursive(int root, int id);
// Finds the trie node for the given suffix. Returns a Frag in order to
// distinguish between pointing at the root node directly (end.p == 0)
// and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively).
// distinguish between pointing at the root node directly (end.head == 0)
// and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively).
Frag FindByteRange(int root, int id);
// Compares two ByteRanges and returns true iff they are equal.
@ -225,8 +191,8 @@ class Compiler : public Regexp::Walker<Frag> {
// Single rune.
Frag Literal(Rune r, bool foldcase);
void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor);
Prog* Finish();
void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor);
Prog* Finish(Regexp* re);
// Returns .* where dot = any byte
Frag DotStar();
@ -298,7 +264,7 @@ int Compiler::AllocInst(int n) {
// Returns an unmatchable fragment.
Frag Compiler::NoMatch() {
return Frag(0, nullPatchList);
return Frag(0, kNullPatchList);
}
// Is a an unmatchable fragment?
@ -314,7 +280,7 @@ Frag Compiler::Cat(Frag a, Frag b) {
// Elide no-op.
Prog::Inst* begin = &inst_[a.begin];
if (begin->opcode() == kInstNop &&
a.end.p == (a.begin << 1) &&
a.end.head == (a.begin << 1) &&
begin->out() == 0) {
// in case refs to a somewhere
PatchList::Patch(inst_.data(), a.end, b.begin);
@ -419,7 +385,7 @@ Frag Compiler::Match(int32_t match_id) {
if (id < 0)
return NoMatch();
inst_[id].InitMatch(match_id);
return Frag(id, nullPatchList);
return Frag(id, kNullPatchList);
}
// Returns a fragment matching a particular empty-width op (like ^ or $)
@ -467,7 +433,7 @@ static int MaxRune(int len) {
void Compiler::BeginRange() {
rune_cache_.clear();
rune_range_.begin = 0;
rune_range_.end = nullPatchList;
rune_range_.end = kNullPatchList;
}
int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
@ -548,9 +514,9 @@ int Compiler::AddSuffixRecursive(int root, int id) {
}
int br;
if (f.end.p == 0)
if (f.end.head == 0)
br = root;
else if (f.end.p&1)
else if (f.end.head&1)
br = inst_[f.begin].out1();
else
br = inst_[f.begin].out();
@ -566,9 +532,9 @@ int Compiler::AddSuffixRecursive(int root, int id) {
// Ensure that the parent points to the clone, not to the original.
// Note that this could leave the head unreachable except via the cache.
br = byterange;
if (f.end.p == 0)
if (f.end.head == 0)
root = br;
else if (f.end.p&1)
else if (f.end.head&1)
inst_[f.begin].out1_ = br;
else
inst_[f.begin].set_out(br);
@ -601,7 +567,7 @@ bool Compiler::ByteRangeEqual(int id1, int id2) {
Frag Compiler::FindByteRange(int root, int id) {
if (inst_[root].opcode() == kInstByteRange) {
if (ByteRangeEqual(root, id))
return Frag(root, nullPatchList);
return Frag(root, kNullPatchList);
else
return NoMatch();
}
@ -662,48 +628,43 @@ void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
static_cast<uint8_t>(hi), foldcase, 0));
}
// Table describing how to make a UTF-8 matching machine
// for the rune range 80-10FFFF (Runeself-Runemax).
// This range happens frequently enough (for example /./ and /[^a-z]/)
// and the rune_cache_ map is slow enough that this is worth
// special handling. Makes compilation of a small expression
// with a dot in it about 10% faster.
// The * in the comments below mark whole sequences.
static struct ByteRangeProg {
int next;
int lo;
int hi;
} prog_80_10ffff[] = {
// Two-byte
{ -1, 0x80, 0xBF, }, // 0: 80-BF
{ 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF*
// Three-byte
{ 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF
{ 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF*
{ 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF
{ 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF*
// Four-byte
{ 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF
{ 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF*
{ 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF
{ 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF*
{ 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF
{ 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF*
};
void Compiler::Add_80_10ffff() {
int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning
for (size_t i = 0; i < arraysize(prog_80_10ffff); i++) {
const ByteRangeProg& p = prog_80_10ffff[i];
int next = 0;
if (p.next >= 0)
next = inst[p.next];
inst[i] = UncachedRuneByteSuffix(static_cast<uint8_t>(p.lo),
static_cast<uint8_t>(p.hi), false, next);
if ((p.lo & 0xC0) != 0x80)
AddSuffix(inst[i]);
// The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
// (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
// permitting overlong encodings in E0 and F0 sequences and code points
// over 10FFFF in F4 sequences, the size of the bytecode and the number
// of equivalence classes are reduced significantly.
int id;
if (reversed_) {
// Prefix factoring matters, but we don't have to handle it here
// because the rune range trie logic takes care of that already.
id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0);
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
AddSuffix(id);
id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0);
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
AddSuffix(id);
id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0);
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
AddSuffix(id);
} else {
// Suffix factoring matters - and we do have to handle it here.
int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0);
id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1);
AddSuffix(id);
int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1);
id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2);
AddSuffix(id);
int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2);
id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3);
AddSuffix(id);
}
}
@ -711,9 +672,8 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
if (lo > hi)
return;
// Pick off 80-10FFFF as a common special case
// that can bypass the slow rune_cache_.
if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
// Pick off 80-10FFFF as a common special case.
if (lo == 0x80 && hi == 0x10ffff) {
Add_80_10ffff();
return;
}
@ -1095,8 +1055,6 @@ static bool IsAnchorEnd(Regexp** pre, int depth) {
void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
RE2::Anchor anchor) {
prog_->set_flags(flags);
if (flags & Regexp::Latin1)
encoding_ = kEncodingLatin1;
max_mem_ = max_mem;
@ -1117,14 +1075,11 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
// on the program.)
if (m >= 1<<24)
m = 1<<24;
// Inst imposes its own limit (currently bigger than 2^24 but be safe).
if (m > Prog::Inst::kMaxInst)
m = Prog::Inst::kMaxInst;
max_ninst_ = static_cast<int>(m);
}
anchor_ = anchor;
}
@ -1178,10 +1133,10 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
c.prog_->set_start_unanchored(all.begin);
// Hand ownership of prog_ to caller.
return c.Finish();
return c.Finish(re);
}
Prog* Compiler::Finish() {
Prog* Compiler::Finish(Regexp* re) {
if (failed_)
return NULL;
@ -1198,6 +1153,17 @@ Prog* Compiler::Finish() {
prog_->Flatten();
prog_->ComputeByteMap();
if (!prog_->reversed()) {
std::string prefix;
bool prefix_foldcase;
if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase) &&
!prefix_foldcase) {
prog_->prefix_size_ = prefix.size();
prog_->prefix_front_ = prefix.front();
prog_->prefix_back_ = prefix.back();
}
}
// Record remaining memory for DFA.
if (max_mem_ <= 0) {
prog_->set_dfa_mem(1<<20);
@ -1254,7 +1220,7 @@ Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
c.prog_->set_start(all.begin);
c.prog_->set_start_unanchored(all.begin);
Prog* prog = c.Finish();
Prog* prog = c.Finish(re);
if (prog == NULL)
return NULL;

281
extern/re2/re2/dfa.cc vendored
View File

@ -39,10 +39,11 @@
#include "util/logging.h"
#include "util/mix.h"
#include "util/mutex.h"
#include "util/pod_array.h"
#include "util/sparse_set.h"
#include "util/strutil.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/sparse_set.h"
#include "re2/stringpiece.h"
// Silence "zero-sized array in struct/union" warning for DFA::State::next_.
@ -52,17 +53,6 @@
namespace re2 {
#if !defined(__linux__) /* only Linux seems to have memrchr */
static void* memrchr(const void* s, int c, size_t n) {
const unsigned char* p = (const unsigned char*)s;
for (p += n; n > 0; n--)
if (*--p == c)
return (void*)p;
return NULL;
}
#endif
// Controls whether the DFA should bail out early if the NFA would be faster.
static bool dfa_should_bail_when_slow = true;
@ -177,11 +167,8 @@ class DFA {
typedef std::unordered_set<State*, StateHash, StateEqual> StateSet;
private:
// Special "first_byte" values for a state. (Values >= 0 denote actual bytes.)
enum {
kFbUnknown = -1, // No analysis has been performed.
kFbNone = -2, // The first-byte trick cannot be used.
};
// Make it easier to swap in a scalable reader-writer mutex.
using CacheMutex = Mutex;
enum {
// Indices into start_ for unanchored searches.
@ -249,25 +236,26 @@ class DFA {
struct SearchParams {
SearchParams(const StringPiece& text, const StringPiece& context,
RWLocker* cache_lock)
: text(text), context(context),
: text(text),
context(context),
anchored(false),
can_prefix_accel(false),
want_earliest_match(false),
run_forward(false),
start(NULL),
first_byte(kFbUnknown),
cache_lock(cache_lock),
failed(false),
ep(NULL),
matches(NULL) { }
matches(NULL) {}
StringPiece text;
StringPiece context;
bool anchored;
bool can_prefix_accel;
bool want_earliest_match;
bool run_forward;
State* start;
int first_byte;
RWLocker *cache_lock;
RWLocker* cache_lock;
bool failed; // "out" parameter: whether search gave up
const char* ep; // "out" parameter: end pointer for match
SparseSet* matches;
@ -278,15 +266,13 @@ class DFA {
};
// Before each search, the parameters to Search are analyzed by
// AnalyzeSearch to determine the state in which to start and the
// "first_byte" for that state, if any.
// AnalyzeSearch to determine the state in which to start.
struct StartInfo {
StartInfo() : start(NULL), first_byte(kFbUnknown) {}
State* start;
std::atomic<int> first_byte;
StartInfo() : start(NULL) {}
std::atomic<State*> start;
};
// Fills in params->start and params->first_byte using
// Fills in params->start and params->can_prefix_accel using
// the other search parameters. Returns true on success,
// false on failure.
// cache_mutex_.r <= L < mutex_
@ -297,10 +283,10 @@ class DFA {
// The generic search loop, inlined to create specialized versions.
// cache_mutex_.r <= L < mutex_
// Might unlock and relock cache_mutex_ via params->cache_lock.
inline bool InlinedSearchLoop(SearchParams* params,
bool have_first_byte,
bool want_earliest_match,
bool run_forward);
template <bool can_prefix_accel,
bool want_earliest_match,
bool run_forward>
inline bool InlinedSearchLoop(SearchParams* params);
// The specialized versions of InlinedSearchLoop. The three letters
// at the ends of the name denote the true/false values used as the
@ -322,13 +308,6 @@ class DFA {
// Might unlock and relock cache_mutex_ via params->cache_lock.
bool FastSearchLoop(SearchParams* params);
// For debugging, a slow search loop that calls InlinedSearchLoop
// directly -- because the booleans passed are not constants, the
// loop is not specialized like the SearchFFF etc. versions, so it
// runs much more slowly. Useful only for debugging.
// cache_mutex_.r <= L < mutex_
// Might unlock and relock cache_mutex_ via params->cache_lock.
bool SlowSearchLoop(SearchParams* params);
// Looks up bytes in bytemap_ but handles case c == kByteEndText too.
int ByteMap(int c) {
@ -355,11 +334,14 @@ class DFA {
// while holding cache_mutex_ for writing, to avoid interrupting other
// readers. Any State* pointers are only valid while cache_mutex_
// is held.
Mutex cache_mutex_;
CacheMutex cache_mutex_;
int64_t mem_budget_; // Total memory budget for all States.
int64_t state_budget_; // Amount of memory remaining for new States.
StateSet state_cache_; // All States computed so far.
StartInfo start_[kMaxStart];
DFA(const DFA&) = delete;
DFA& operator=(const DFA&) = delete;
};
// Shorthand for casting to uint8_t*.
@ -442,7 +424,7 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem)
q1_(NULL),
mem_budget_(max_mem) {
if (ExtraDebug)
fprintf(stderr, "\nkind %d\n%s\n", (int)kind_, prog_->DumpUnanchored().c_str());
fprintf(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored().c_str());
int nmark = 0;
if (kind_ == Prog::kLongestMatch)
nmark = prog_->size();
@ -613,7 +595,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
// Only ByteRange, EmptyWidth, and Match instructions are useful to keep:
// those are the only operators with any effect in
// RunWorkqOnEmptyString or RunWorkqOnByte.
int* inst = new int[q->size()];
PODArray<int> inst(q->size());
int n = 0;
uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions
bool sawmatch = false; // whether queue contains guaranteed kInstMatch
@ -643,7 +625,6 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
(it == q->begin() && ip->greedy(prog_))) &&
(kind_ != Prog::kLongestMatch || !sawmark) &&
(flag & kFlagMatch)) {
delete[] inst;
if (ExtraDebug)
fprintf(stderr, " -> FullMatchState\n");
return FullMatchState;
@ -690,7 +671,6 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
// the execution loop can stop early. This is only okay
// if the state is *not* a matching state.
if (n == 0 && flag == 0) {
delete[] inst;
if (ExtraDebug)
fprintf(stderr, " -> DeadState\n");
return DeadState;
@ -700,7 +680,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
// unordered state sets separated by Marks. Sort each set
// to canonicalize, to reduce the number of distinct sets stored.
if (kind_ == Prog::kLongestMatch) {
int* ip = inst;
int* ip = inst.data();
int* ep = ip + n;
while (ip < ep) {
int* markp = ip;
@ -717,7 +697,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
// we have an unordered set of states (i.e. we don't have Marks)
// and sorting will reduce the number of distinct sets stored.
if (kind_ == Prog::kManyMatch) {
int* ip = inst;
int* ip = inst.data();
int* ep = ip + n;
std::sort(ip, ep);
}
@ -736,8 +716,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
// Save the needed empty-width flags in the top bits for use later.
flag |= needflags << kFlagNeedShift;
State* state = CachedState(inst, n, flag);
delete[] inst;
State* state = CachedState(inst.data(), n, flag);
return state;
}
@ -971,8 +950,21 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
break;
case kInstByteRange: // can follow if c is in range
if (ip->Matches(c))
AddToQueue(newq, ip->out(), flag);
if (!ip->Matches(c))
break;
AddToQueue(newq, ip->out(), flag);
if (ip->hint() != 0) {
// We have a hint, but we must cancel out the
// increment that will occur after the break.
i += ip->hint() - 1;
} else {
// We have no hint, so we must find the end
// of the current list and then skip to it.
Prog::Inst* ip0 = ip;
while (!ip->last())
++ip;
i += ip - ip0;
}
break;
case kInstMatch:
@ -989,8 +981,8 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
}
if (ExtraDebug)
fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n", DumpWorkq(oldq).c_str(),
c, flag, DumpWorkq(newq).c_str(), *ismatch);
fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n",
DumpWorkq(oldq).c_str(), c, flag, DumpWorkq(newq).c_str(), *ismatch);
}
// Processes input byte c in state, returning new state.
@ -1117,7 +1109,7 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) {
class DFA::RWLocker {
public:
explicit RWLocker(Mutex* mu);
explicit RWLocker(CacheMutex* mu);
~RWLocker();
// If the lock is only held for reading right now,
@ -1127,19 +1119,19 @@ class DFA::RWLocker {
void LockForWriting();
private:
Mutex* mu_;
CacheMutex* mu_;
bool writing_;
RWLocker(const RWLocker&) = delete;
RWLocker& operator=(const RWLocker&) = delete;
};
DFA::RWLocker::RWLocker(Mutex* mu) : mu_(mu), writing_(false) {
DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) {
mu_->ReaderLock();
}
// This function is marked as NO_THREAD_SAFETY_ANALYSIS because the annotations
// does not support lock upgrade.
// This function is marked as NO_THREAD_SAFETY_ANALYSIS because
// the annotations don't support lock upgrade.
void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS {
if (!writing_) {
mu_->ReaderUnlock();
@ -1171,11 +1163,14 @@ void DFA::ResetCache(RWLocker* cache_lock) {
// Re-acquire the cache_mutex_ for writing (exclusive use).
cache_lock->LockForWriting();
hooks::GetDFAStateCacheResetHook()({
state_budget_,
state_cache_.size(),
});
// Clear the cache, reset the memory budget.
for (int i = 0; i < kMaxStart; i++) {
start_[i].start = NULL;
start_[i].first_byte.store(kFbUnknown, std::memory_order_relaxed);
}
for (int i = 0; i < kMaxStart; i++)
start_[i].start.store(NULL, std::memory_order_relaxed);
ClearCache();
mem_budget_ = state_budget_;
}
@ -1290,8 +1285,7 @@ DFA::State* DFA::StateSaver::Restore() {
// situation, the DFA can do better than executing the simple loop.
// Instead, it can call memchr to search very quickly for the byte c.
// Whether the start state has this property is determined during a
// pre-compilation pass, and if so, the byte b is passed to the search
// loop as the "first_byte" argument, along with a boolean "have_first_byte".
// pre-compilation pass and the "can_prefix_accel" argument is set.
//
// Fourth, the desired behavior is to search for the leftmost-best match
// (approximately, the same one that Perl would find), which is not
@ -1323,15 +1317,16 @@ DFA::State* DFA::StateSaver::Restore() {
// The bools are equal to the same-named variables in params, but
// making them function arguments lets the inliner specialize
// this function to each combination (see two paragraphs above).
inline bool DFA::InlinedSearchLoop(SearchParams* params,
bool have_first_byte,
bool want_earliest_match,
bool run_forward) {
template <bool can_prefix_accel,
bool want_earliest_match,
bool run_forward>
inline bool DFA::InlinedSearchLoop(SearchParams* params) {
State* start = params->start;
const uint8_t* bp = BytePtr(params->text.begin()); // start of text
const uint8_t* p = bp; // text scanning point
const uint8_t* ep = BytePtr(params->text.end()); // end of text
const uint8_t* resetp = NULL; // p at last cache reset
const uint8_t* bp = BytePtr(params->text.data()); // start of text
const uint8_t* p = bp; // text scanning point
const uint8_t* ep = BytePtr(params->text.data() +
params->text.size()); // end of text
const uint8_t* resetp = NULL; // p at last cache reset
if (!run_forward) {
using std::swap;
swap(p, ep);
@ -1366,25 +1361,16 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params,
while (p != ep) {
if (ExtraDebug)
fprintf(stderr, "@%td: %s\n",
p - bp, DumpState(s).c_str());
fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str());
if (have_first_byte && s == start) {
// In start state, only way out is to find first_byte,
// so use optimized assembly in memchr to skip ahead.
// If first_byte isn't found, we can skip to the end
// of the string.
if (run_forward) {
if ((p = BytePtr(memchr(p, params->first_byte, ep - p))) == NULL) {
p = ep;
break;
}
} else {
if ((p = BytePtr(memrchr(ep, params->first_byte, p - ep))) == NULL) {
p = ep;
break;
}
p++;
if (can_prefix_accel && s == start) {
// In start state, only way out is to find the prefix,
// so we use prefix accel (e.g. memchr) to skip ahead.
// If not found, we can skip to the end of the string.
p = BytePtr(prog_->PrefixAccel(p, ep - p));
if (p == NULL) {
p = ep;
break;
}
}
@ -1475,8 +1461,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params,
else
lastmatch = p + 1;
if (ExtraDebug)
fprintf(stderr, "match @%td! [%s]\n",
lastmatch - bp, DumpState(s).c_str());
fprintf(stderr, "match @%td! [%s]\n", lastmatch - bp, DumpState(s).c_str());
if (params->matches != NULL && kind_ == Prog::kManyMatch) {
for (int i = s->ninst_ - 1; i >= 0; i--) {
int id = s->inst_[i];
@ -1560,36 +1545,28 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params,
// Inline specializations of the general loop.
bool DFA::SearchFFF(SearchParams* params) {
return InlinedSearchLoop(params, 0, 0, 0);
return InlinedSearchLoop<false, false, false>(params);
}
bool DFA::SearchFFT(SearchParams* params) {
return InlinedSearchLoop(params, 0, 0, 1);
return InlinedSearchLoop<false, false, true>(params);
}
bool DFA::SearchFTF(SearchParams* params) {
return InlinedSearchLoop(params, 0, 1, 0);
return InlinedSearchLoop<false, true, false>(params);
}
bool DFA::SearchFTT(SearchParams* params) {
return InlinedSearchLoop(params, 0, 1, 1);
return InlinedSearchLoop<false, true, true>(params);
}
bool DFA::SearchTFF(SearchParams* params) {
return InlinedSearchLoop(params, 1, 0, 0);
return InlinedSearchLoop<true, false, false>(params);
}
bool DFA::SearchTFT(SearchParams* params) {
return InlinedSearchLoop(params, 1, 0, 1);
return InlinedSearchLoop<true, false, true>(params);
}
bool DFA::SearchTTF(SearchParams* params) {
return InlinedSearchLoop(params, 1, 1, 0);
return InlinedSearchLoop<true, true, false>(params);
}
bool DFA::SearchTTT(SearchParams* params) {
return InlinedSearchLoop(params, 1, 1, 1);
}
// For debugging, calls the general code directly.
bool DFA::SlowSearchLoop(SearchParams* params) {
return InlinedSearchLoop(params,
params->first_byte >= 0,
params->want_earliest_match,
params->run_forward);
return InlinedSearchLoop<true, true, true>(params);
}
// For performance, calls the appropriate specialized version
@ -1608,8 +1585,7 @@ bool DFA::FastSearchLoop(SearchParams* params) {
&DFA::SearchTTT,
};
bool have_first_byte = params->first_byte >= 0;
int index = 4 * have_first_byte +
int index = 4 * params->can_prefix_accel +
2 * params->want_earliest_match +
1 * params->run_forward;
return (this->*Searches[index])(params);
@ -1701,13 +1677,22 @@ bool DFA::AnalyzeSearch(SearchParams* params) {
}
}
if (ExtraDebug)
fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s first_byte=%d\n",
params->anchored, params->run_forward, flags,
DumpState(info->start).c_str(), info->first_byte.load());
params->start = info->start.load(std::memory_order_acquire);
params->start = info->start;
params->first_byte = info->first_byte.load(std::memory_order_acquire);
// Even if we could prefix accel, we cannot do so when anchored and,
// less obviously, we cannot do so when we are going to need flags.
// This trick works only when there is a single byte that leads to a
// different state!
if (prog_->can_prefix_accel() &&
!params->anchored &&
params->start > SpecialStateMax &&
params->start->flag_ >> kFlagNeedShift == 0)
params->can_prefix_accel = true;
if (ExtraDebug)
fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n",
params->anchored, params->run_forward, flags,
DumpState(params->start).c_str(), params->can_prefix_accel);
return true;
}
@ -1716,47 +1701,25 @@ bool DFA::AnalyzeSearch(SearchParams* params) {
bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
uint32_t flags) {
// Quick check.
int fb = info->first_byte.load(std::memory_order_acquire);
if (fb != kFbUnknown)
State* start = info->start.load(std::memory_order_acquire);
if (start != NULL)
return true;
MutexLock l(&mutex_);
fb = info->first_byte.load(std::memory_order_relaxed);
if (fb != kFbUnknown)
start = info->start.load(std::memory_order_relaxed);
if (start != NULL)
return true;
q0_->clear();
AddToQueue(q0_,
params->anchored ? prog_->start() : prog_->start_unanchored(),
flags);
info->start = WorkqToCachedState(q0_, NULL, flags);
if (info->start == NULL)
start = WorkqToCachedState(q0_, NULL, flags);
if (start == NULL)
return false;
if (info->start == DeadState) {
// Synchronize with "quick check" above.
info->first_byte.store(kFbNone, std::memory_order_release);
return true;
}
if (info->start == FullMatchState) {
// Synchronize with "quick check" above.
info->first_byte.store(kFbNone, std::memory_order_release); // will be ignored
return true;
}
// Even if we have a first_byte, we cannot use it when anchored and,
// less obviously, we cannot use it when we are going to need flags.
// This trick works only when there is a single byte that leads to a
// different state!
int first_byte = prog_->first_byte();
if (first_byte == -1 ||
params->anchored ||
info->start->flag_ >> kFlagNeedShift != 0)
first_byte = kFbNone;
// Synchronize with "quick check" above.
info->first_byte.store(first_byte, std::memory_order_release);
info->start.store(start, std::memory_order_release);
return true;
}
@ -1779,8 +1742,7 @@ bool DFA::Search(const StringPiece& text,
if (ExtraDebug) {
fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str());
fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n",
std::string(text).c_str(), anchored, want_earliest_match,
run_forward, kind_);
std::string(text).c_str(), anchored, want_earliest_match, run_forward, kind_);
}
RWLocker l(&cache_mutex_);
@ -1798,9 +1760,9 @@ bool DFA::Search(const StringPiece& text,
return false;
if (params.start == FullMatchState) {
if (run_forward == want_earliest_match)
*epp = text.begin();
*epp = text.data();
else
*epp = text.end();
*epp = text.data() + text.size();
return true;
}
if (ExtraDebug)
@ -1863,15 +1825,15 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
*failed = false;
StringPiece context = const_context;
if (context.begin() == NULL)
if (context.data() == NULL)
context = text;
bool carat = anchor_start();
bool caret = anchor_start();
bool dollar = anchor_end();
if (reversed_) {
using std::swap;
swap(carat, dollar);
swap(caret, dollar);
}
if (carat && context.begin() != text.begin())
if (caret && context.begin() != text.begin())
return false;
if (dollar && context.end() != text.end())
return false;
@ -1906,11 +1868,15 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
bool matched = dfa->Search(text, context, anchored,
want_earliest_match, !reversed_,
failed, &ep, matches);
if (*failed)
if (*failed) {
hooks::GetDFASearchFailureHook()({
// Nothing yet...
});
return false;
}
if (!matched)
return false;
if (endmatch && ep != (reversed_ ? text.begin() : text.end()))
if (endmatch && ep != (reversed_ ? text.data() : text.data() + text.size()))
return false;
// If caller cares, record the boundary of the match.
@ -1918,10 +1884,11 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
// as the beginning.
if (match0) {
if (reversed_)
*match0 = StringPiece(ep, static_cast<size_t>(text.end() - ep));
*match0 =
StringPiece(ep, static_cast<size_t>(text.data() + text.size() - ep));
else
*match0 =
StringPiece(text.begin(), static_cast<size_t>(ep - text.begin()));
StringPiece(text.data(), static_cast<size_t>(ep - text.data()));
}
return true;
}

View File

@ -6,6 +6,7 @@
#include <stddef.h>
#include <string>
#include <utility>
#include "util/util.h"
#include "util/logging.h"
@ -27,7 +28,22 @@ FilteredRE2::FilteredRE2(int min_atom_len)
FilteredRE2::~FilteredRE2() {
for (size_t i = 0; i < re2_vec_.size(); i++)
delete re2_vec_[i];
delete prefilter_tree_;
}
// Move constructor.
// Takes over other's vector of regexps and its prefilter tree, and copies
// other's compiled_ flag. Afterwards other is reset to an empty state:
// no regexps, compiled_ == false, and a freshly allocated PrefilterTree.
FilteredRE2::FilteredRE2(FilteredRE2&& other)
: re2_vec_(std::move(other.re2_vec_)),
compiled_(other.compiled_),
prefilter_tree_(std::move(other.prefilter_tree_)) {
// Explicitly empty other's vector so that other's destructor, which
// deletes every element of re2_vec_, cannot touch the transferred regexps.
other.re2_vec_.clear();
other.re2_vec_.shrink_to_fit();
other.compiled_ = false;
// Give other a new tree so that prefilter_tree_ is non-null again.
other.prefilter_tree_.reset(new PrefilterTree());
}
// Move assignment.
// Destroys the current contents of *this and then move-constructs a new
// FilteredRE2 in the same storage from other, so the result is exactly what
// the move constructor would have produced. Returns *this.
FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) {
  // Self-move must be a no-op: without this guard the destructor would run
  // on *this and the placement new below would then move-construct from the
  // very object that was just destroyed, which is undefined behaviour.
  if (this != &other) {
    this->~FilteredRE2();
    (void) new (this) FilteredRE2(std::move(other));
  }
  return *this;
}
RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
@ -38,7 +54,7 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
if (!re->ok()) {
if (options.log_errors()) {
LOG(ERROR) << "Couldn't compile regular expression, skipping: "
<< re << " due to error " << re->error();
<< pattern << " due to error " << re->error();
}
delete re;
} else {

View File

@ -10,17 +10,18 @@
// number of regexps that need to be actually searched.
//
// By design, it does not include a string matching engine. This is to
// allow the user of the class to use their favorite string match
// allow the user of the class to use their favorite string matching
// engine. The overall flow is: Add all the regexps using Add, then
// Compile the FilteredRE2. The compile returns strings that need to
// be matched. Note that all returned strings are lowercase. For
// applying regexps to a search text, the caller does the string
// matching using the strings returned. When doing the string match,
// note that the caller has to do that on lower cased version of the
// search text. Then call FirstMatch or AllMatches with a vector of
// indices of strings that were found in the text to get the actual
// regexp matches.
// Compile the FilteredRE2. Compile returns strings that need to be
// matched. Note that the returned strings are lowercased and distinct.
// For applying regexps to a search text, the caller does the string
// matching using the returned strings. When doing the string match,
// note that the caller has to do that in a case-insensitive way or
// on a lowercased version of the search text. Then call FirstMatch
// or AllMatches with a vector of indices of strings that were found
// in the text to get the actual regexp matches.
#include <memory>
#include <string>
#include <vector>
@ -36,18 +37,25 @@ class FilteredRE2 {
explicit FilteredRE2(int min_atom_len);
~FilteredRE2();
// Not copyable.
FilteredRE2(const FilteredRE2&) = delete;
FilteredRE2& operator=(const FilteredRE2&) = delete;
// Movable.
FilteredRE2(FilteredRE2&& other);
FilteredRE2& operator=(FilteredRE2&& other);
// Uses RE2 constructor to create a RE2 object (re). Returns
// re->error_code(). If error_code is other than NoError, then re is
// deleted and not added to re2_vec_.
RE2::ErrorCode Add(const StringPiece& pattern,
const RE2::Options& options,
int *id);
int* id);
// Prepares the regexps added by Add for filtering. Returns a set
// of strings that the caller should check for in candidate texts.
// The returned strings are lowercased. When doing string matching,
// the search text should be lowercased first to find matching
// strings from the set of strings returned by Compile. Call after
// The returned strings are lowercased and distinct. When doing
// string matching, it should be performed in a case-insensitive
// way or the search text should be lowercased first. Call after
// all Add calls are done.
void Compile(std::vector<std::string>* strings_to_match);
@ -98,10 +106,7 @@ class FilteredRE2 {
bool compiled_;
// An AND-OR tree of string atoms used for filtering regexps.
PrefilterTree* prefilter_tree_;
FilteredRE2(const FilteredRE2&) = delete;
FilteredRE2& operator=(const FilteredRE2&) = delete;
std::unique_ptr<PrefilterTree> prefilter_tree_;
};
} // namespace re2

View File

@ -0,0 +1,219 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--- LLVM Exceptions to the Apache 2.0 License ----
As an exception, if, as a result of your compiling your source code, portions
of this Software are embedded into an Object form of such source code, you
may redistribute such embedded portions in such Object form without complying
with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
In addition, if you combine or link compiled forms of this Software with
software that is licensed under the GPLv2 ("Combined Software") and if a
court of competent jurisdiction determines that the patent provision (Section
3), the indemnity provision (Section 9) or other Section of the License
conflicts with the conditions of the GPLv2, you may retroactively and
prospectively choose to deem waived or otherwise exclude such Section(s) of
the License, but only in their entirety and only with respect to the Combined
Software.

View File

@ -0,0 +1,305 @@
//===- FuzzedDataProvider.h - Utility header for fuzz targets ---*- C++ -* ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// A single header library providing a utility class to break up an array of
// bytes. Whenever run on the same input, provides the same output, as long as
// its methods are called in the same order, with the same arguments.
//===----------------------------------------------------------------------===//
#ifndef LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
#define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
#include <algorithm>
#include <climits>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <initializer_list>
#include <limits>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
// In addition to the comments below, the API is also briefly documented at
// https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#fuzzed-data-provider
// Splits a fuzzer-supplied byte array into typed values (integers, floats,
// strings, byte vectors). Given the same input and the same sequence of calls
// with the same arguments, it always produces the same outputs.
//
// Bulk data (bytes, strings) is consumed from the FRONT of the input, whereas
// integral values are consumed from the BACK of the input.
class FuzzedDataProvider {
public:
  // |data| is an array of length |size| that the FuzzedDataProvider wraps to
  // provide more granular access. |data| must outlive the FuzzedDataProvider.
  FuzzedDataProvider(const uint8_t *data, size_t size)
      : data_ptr_(data), remaining_bytes_(size) {}
  ~FuzzedDataProvider() = default;

  // Returns a std::vector containing |num_bytes| of input data. If fewer than
  // |num_bytes| of data remain, returns a shorter std::vector containing all
  // of the data that's left. Can be used with any byte sized type, such as
  // char, unsigned char, uint8_t, etc.
  template <typename T> std::vector<T> ConsumeBytes(size_t num_bytes) {
    num_bytes = std::min(num_bytes, remaining_bytes_);
    return ConsumeBytes<T>(num_bytes, num_bytes);
  }

  // Similar to |ConsumeBytes|, but also appends the terminator value at the
  // end of the resulting vector. Useful when a mutable null-terminated
  // C-string is needed, for example. But that is a rare case. Better avoid it,
  // if possible, and prefer |ConsumeBytes| or |ConsumeBytesAsString|.
  // The result always holds at least one element (the terminator).
  template <typename T>
  std::vector<T> ConsumeBytesWithTerminator(size_t num_bytes,
                                            T terminator = 0) {
    num_bytes = std::min(num_bytes, remaining_bytes_);
    // Allocate one extra slot for the terminator but only consume |num_bytes|.
    std::vector<T> result = ConsumeBytes<T>(num_bytes + 1, num_bytes);
    result.back() = terminator;
    return result;
  }

  // Returns a std::string containing |num_bytes| of input data. Using this and
  // |.c_str()| on the resulting string is the best way to get an immutable
  // null-terminated C string. If fewer than |num_bytes| of data remain,
  // returns a shorter std::string containing all of the data that's left.
  std::string ConsumeBytesAsString(size_t num_bytes) {
    static_assert(sizeof(std::string::value_type) == sizeof(uint8_t),
                  "ConsumeBytesAsString cannot convert the data to a string.");
    num_bytes = std::min(num_bytes, remaining_bytes_);
    std::string result(
        reinterpret_cast<const std::string::value_type *>(data_ptr_),
        num_bytes);
    Advance(num_bytes);
    return result;
  }

  // Returns a number in the range [min, max] by consuming bytes from the
  // input data. The value might not be uniformly distributed in the given
  // range. If there's no input data left, always returns |min|. |min| must
  // be less than or equal to |max|; otherwise the process is aborted.
  template <typename T> T ConsumeIntegralInRange(T min, T max) {
    static_assert(std::is_integral<T>::value, "An integral type is required.");
    static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type.");
    if (min > max)
      std::abort();

    // Use the biggest type possible to hold the range and the result. The
    // subtraction is done on unsigned values, so it wraps around to the
    // correct width of the range even when |min| is negative.
    const uint64_t range =
        static_cast<uint64_t>(max) - static_cast<uint64_t>(min);
    uint64_t result = 0;
    size_t offset = 0;
    while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 &&
           remaining_bytes_ != 0) {
      // Pull bytes off the end of the seed data. Experimentally, this seems to
      // allow the fuzzer to more easily explore the input space. This makes
      // sense, since it works by modifying inputs that caused new code to run,
      // and this data is often used to encode length of data read by
      // |ConsumeBytes|. Separating out read lengths makes it easier to modify
      // the contents of the data that is actually read.
      --remaining_bytes_;
      result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_];
      offset += CHAR_BIT;
    }

    // Avoid division by 0, in case |range + 1| results in overflow.
    if (range != std::numeric_limits<uint64_t>::max())
      result = result % (range + 1);

    // Unsigned (modular) addition, then narrow back to |T|.
    return static_cast<T>(static_cast<uint64_t>(min) + result);
  }

  // Returns a std::string of length from 0 to |max_length|. When it runs out
  // of input data, returns what remains of the input. Designed to be more
  // stable with respect to a fuzzer inserting characters than just picking a
  // random length and then consuming that many bytes with |ConsumeBytes|.
  std::string ConsumeRandomLengthString(size_t max_length) {
    // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps
    // "\" followed by anything else to the end of the string. As a result of
    // this logic, a fuzzer can insert characters into the string, and the
    // string will be lengthened to include those new characters, resulting in
    // a more stable fuzzer than picking the length of a string independently
    // from picking its contents.
    std::string result;

    // Reserve the anticipated capacity to prevent several reallocations.
    result.reserve(std::min(max_length, remaining_bytes_));
    for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) {
      char next = ConvertUnsignedToSigned<char>(data_ptr_[0]);
      Advance(1);
      if (next == '\\' && remaining_bytes_ != 0) {
        next = ConvertUnsignedToSigned<char>(data_ptr_[0]);
        Advance(1);
        // A backslash followed by anything but a backslash ends the string;
        // both bytes are consumed and neither is appended.
        if (next != '\\')
          break;
      }
      result += next;
    }

    result.shrink_to_fit();
    return result;
  }

  // Returns a std::vector containing all remaining bytes of the input data.
  template <typename T> std::vector<T> ConsumeRemainingBytes() {
    return ConsumeBytes<T>(remaining_bytes_);
  }

  // Returns a std::string containing all remaining bytes of the input data.
  // Prefer using |ConsumeRemainingBytes| unless you actually need a
  // std::string object.
  std::string ConsumeRemainingBytesAsString() {
    return ConsumeBytesAsString(remaining_bytes_);
  }

  // Returns a number in the range [Type's min, Type's max]. The value might
  // not be uniformly distributed in the given range. If there's no input data
  // left, always returns the type's minimum value.
  template <typename T> T ConsumeIntegral() {
    return ConsumeIntegralInRange<T>(std::numeric_limits<T>::min(),
                                     std::numeric_limits<T>::max());
  }

  // Reads one byte and returns a bool, or false when no data remains.
  bool ConsumeBool() { return 1 & ConsumeIntegral<uint8_t>(); }

  // Returns a copy of the value selected from the given fixed-size |array|.
  template <typename T, size_t size>
  T PickValueInArray(const T (&array)[size]) {
    static_assert(size > 0, "The array must be non empty.");
    return array[ConsumeIntegralInRange<size_t>(0, size - 1)];
  }

  // Returns a copy of a value selected from |list|. Aborts if |list| is empty.
  template <typename T>
  T PickValueInArray(std::initializer_list<const T> list) {
    // TODO(Dor1s): switch to static_assert once C++14 is allowed.
    if (!list.size())
      std::abort();

    return *(list.begin() + ConsumeIntegralInRange<size_t>(0, list.size() - 1));
  }

  // Returns an enum value. The enum must start at 0 and be contiguous. It must
  // also contain |kMaxValue| aliased to its largest (inclusive) value. Such as:
  // enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue };
  template <typename T> T ConsumeEnum() {
    static_assert(std::is_enum<T>::value, "|T| must be an enum type.");
    return static_cast<T>(ConsumeIntegralInRange<uint32_t>(
        0, static_cast<uint32_t>(T::kMaxValue)));
  }

  // Returns a floating point number in the range [0.0, 1.0]. If there's no
  // input data left, always returns 0.
  template <typename T> T ConsumeProbability() {
    static_assert(std::is_floating_point<T>::value,
                  "A floating point type is required.");

    // Use different integral types for different floating point types in order
    // to provide better density of the resulting values.
    using IntegralType =
        typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t,
                                  uint64_t>::type;

    T result = static_cast<T>(ConsumeIntegral<IntegralType>());
    result /= static_cast<T>(std::numeric_limits<IntegralType>::max());
    return result;
  }

  // Returns a floating point value in the range [Type's lowest, Type's max] by
  // consuming bytes from the input data. If there's no input data left, always
  // returns approximately 0.
  template <typename T> T ConsumeFloatingPoint() {
    return ConsumeFloatingPointInRange<T>(std::numeric_limits<T>::lowest(),
                                          std::numeric_limits<T>::max());
  }

  // Returns a floating point value in the given range by consuming bytes from
  // the input data. If there's no input data left, returns |min|. Note that
  // |min| must be less than or equal to |max|; otherwise the process aborts.
  template <typename T> T ConsumeFloatingPointInRange(T min, T max) {
    if (min > max)
      std::abort();

    T range = .0;
    T result = min;
    constexpr T zero(.0);
    if (max > zero && min < zero && max > min + std::numeric_limits<T>::max()) {
      // The diff |max - min| would overflow the given floating point type. Use
      // the half of the diff as the range and consume a bool to decide whether
      // the result is in the first or the second part of the diff.
      range = (max / 2.0) - (min / 2.0);
      if (ConsumeBool()) {
        result += range;
      }
    } else {
      range = max - min;
    }

    return result + range * ConsumeProbability<T>();
  }

  // Reports the remaining bytes available for fuzzed input.
  size_t remaining_bytes() const { return remaining_bytes_; }

private:
  FuzzedDataProvider(const FuzzedDataProvider &) = delete;
  FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete;

  // Drops |num_bytes| from the front of the remaining input. Aborts if fewer
  // than |num_bytes| remain.
  void Advance(size_t num_bytes) {
    if (num_bytes > remaining_bytes_)
      std::abort();

    data_ptr_ += num_bytes;
    remaining_bytes_ -= num_bytes;
  }

  // Returns a vector of |size| value-initialized elements whose first
  // |num_bytes_to_consume| elements are copied from the front of the input.
  // Callers ensure |num_bytes_to_consume| <= |size|.
  template <typename T>
  std::vector<T> ConsumeBytes(size_t size, size_t num_bytes_to_consume) {
    static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type.");

    // The point of using the size-based constructor below is to increase the
    // odds of having a vector object with capacity being equal to the length.
    // That part is always implementation specific, but at least both libc++
    // and libstdc++ allocate the requested number of bytes in that
    // constructor, which seems to be a natural choice for other
    // implementations as well. To increase the odds even more, we also call
    // |shrink_to_fit| below.
    std::vector<T> result(size);
    if (size == 0) {
      if (num_bytes_to_consume != 0)
        std::abort();
      return result;
    }

    // Skip the copy when nothing is consumed: |data_ptr_| may legitimately be
    // null for an empty input, and passing a null pointer to memcpy is
    // undefined behavior even for a zero length.
    if (num_bytes_to_consume != 0) {
      std::memcpy(result.data(), data_ptr_, num_bytes_to_consume);
      Advance(num_bytes_to_consume);
    }

    // Even though |shrink_to_fit| is also implementation specific, we expect
    // it to provide an additional assurance in case vector's constructor
    // allocated a buffer which is larger than the actual amount of data we put
    // inside it.
    result.shrink_to_fit();
    return result;
  }

  // Converts an unsigned value to the same-width type |TS|, preserving the
  // bit pattern in two's complement terms, without relying on out-of-range
  // unsigned-to-signed conversions.
  template <typename TS, typename TU> TS ConvertUnsignedToSigned(TU value) {
    static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types.");
    static_assert(!std::numeric_limits<TU>::is_signed,
                  "Source type must be unsigned.");

    // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream.
    if (std::numeric_limits<TS>::is_modulo)
      return static_cast<TS>(value);

    // Avoid using implementation-defined unsigned to signed conversions.
    // To learn more, see https://stackoverflow.com/questions/13150449.
    if (value <= static_cast<TU>(std::numeric_limits<TS>::max()))
      return static_cast<TS>(value);

    // |value| is above TS's max, i.e. it maps to a negative number. Compute
    // its distance from the unsigned image of TS's min (well-defined modular
    // conversion); that distance fits in TS, so every cast below is in range.
    constexpr TS ts_min = std::numeric_limits<TS>::min();
    return static_cast<TS>(
        ts_min + static_cast<TS>(value - static_cast<TU>(ts_min)));
  }

  const uint8_t *data_ptr_;  // Front of the not-yet-consumed input.
  size_t remaining_bytes_;   // Number of not-yet-consumed bytes.
};
#endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_

View File

@ -2,12 +2,13 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <fuzzer/FuzzedDataProvider.h>
#include <stddef.h>
#include <stdint.h>
#include <map>
#include <memory>
#include <queue>
#include <string>
#include <vector>
#include "re2/prefilter.h"
#include "re2/re2.h"
@ -17,7 +18,38 @@ using re2::StringPiece;
// NOT static, NOT signed.
uint8_t dummy = 0;
void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) {
void TestOneInput(StringPiece pattern, const RE2::Options& options,
StringPiece text) {
// Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
// Otherwise, we will waste time on inputs that have long runs of various
// character classes. The fuzzer has shown itself to be easily capable of
// generating such patterns that fall within the other limits, but result
// in timeouts nonetheless. The marginal cost is high - even more so when
// counted repetition is involved - whereas the marginal benefit is zero.
// TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain.
int char_class = 0;
int backslash_p = 0; // very expensive, so handle specially
for (size_t i = 0; i < pattern.size(); i++) {
if (pattern[i] == '.')
char_class++;
if (pattern[i] != '\\')
continue;
i++;
if (i >= pattern.size())
break;
if (pattern[i] == 'p' || pattern[i] == 'P' ||
pattern[i] == 'd' || pattern[i] == 'D' ||
pattern[i] == 's' || pattern[i] == 'S' ||
pattern[i] == 'w' || pattern[i] == 'W')
char_class++;
if (pattern[i] == 'p' || pattern[i] == 'P')
backslash_p++;
}
if (char_class > 9)
return;
if (backslash_p > 1)
return;
RE2 re(pattern, options);
if (!re.ok())
return;
@ -55,7 +87,7 @@ void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) {
// Don't waste time fuzzing high-fanout programs.
// They can cause bug reports due to fuzzer timeouts.
std::map<int, int> histogram;
std::vector<int> histogram;
int fanout = re.ProgramFanout(&histogram);
if (fanout > 9)
return;
@ -102,72 +134,38 @@ void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) {
// Entry point for libFuzzer.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
if (size == 0 || size > 999)
// An input larger than 4 KiB probably isn't interesting. (This limit
// allows for fdp.ConsumeRandomLengthString()'s backslash behaviour.)
if (size == 0 || size > 4096)
return 0;
// Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
// Otherwise, we will waste time on inputs that have long runs of various
// character classes. The fuzzer has shown itself to be easily capable of
// generating such patterns that fall within the other limits, but result
// in timeouts nonetheless. The marginal cost is high - even more so when
// counted repetition is involved - whereas the marginal benefit is zero.
// TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain.
int char_class = 0;
int backslash_p = 0; // very expensive, so handle specially
for (size_t i = 0; i < size; i++) {
if (data[i] == '.')
char_class++;
if (data[i] != '\\')
continue;
i++;
if (i >= size)
break;
if (data[i] == 'p' || data[i] == 'P' ||
data[i] == 'd' || data[i] == 'D' ||
data[i] == 's' || data[i] == 'S' ||
data[i] == 'w' || data[i] == 'W')
char_class++;
if (data[i] == 'p' || data[i] == 'P')
backslash_p++;
}
if (char_class > 9)
return 0;
if (backslash_p > 1)
return 0;
// The one-at-a-time hash by Bob Jenkins.
uint32_t hash = 0;
for (size_t i = 0; i < size; i++) {
hash += data[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
FuzzedDataProvider fdp(data, size);
// The convention here is that fdp.ConsumeBool() returning false sets
// the default value whereas returning true sets the alternate value:
// most options default to false and so can be set directly; encoding
// defaults to UTF-8; case_sensitive defaults to true. We do NOT want
// to log errors. max_mem is 64 MiB because we can afford to use more
// RAM in exchange for (hopefully) faster fuzzing.
RE2::Options options;
options.set_encoding(fdp.ConsumeBool() ? RE2::Options::EncodingLatin1
: RE2::Options::EncodingUTF8);
options.set_posix_syntax(fdp.ConsumeBool());
options.set_longest_match(fdp.ConsumeBool());
options.set_log_errors(false);
options.set_max_mem(64 << 20);
options.set_encoding(hash & 1 ? RE2::Options::EncodingLatin1
: RE2::Options::EncodingUTF8);
options.set_posix_syntax(hash & 2);
options.set_longest_match(hash & 4);
options.set_literal(hash & 8);
options.set_never_nl(hash & 16);
options.set_dot_nl(hash & 32);
options.set_never_capture(hash & 64);
options.set_case_sensitive(hash & 128);
options.set_perl_classes(hash & 256);
options.set_word_boundary(hash & 512);
options.set_one_line(hash & 1024);
options.set_literal(fdp.ConsumeBool());
options.set_never_nl(fdp.ConsumeBool());
options.set_dot_nl(fdp.ConsumeBool());
options.set_never_capture(fdp.ConsumeBool());
options.set_case_sensitive(!fdp.ConsumeBool());
options.set_perl_classes(fdp.ConsumeBool());
options.set_word_boundary(fdp.ConsumeBool());
options.set_one_line(fdp.ConsumeBool());
const char* ptr = reinterpret_cast<const char*>(data);
int len = static_cast<int>(size);
StringPiece pattern(ptr, len);
StringPiece text(ptr, len);
Test(pattern, options, text);
std::string pattern = fdp.ConsumeRandomLengthString(999);
std::string text = fdp.ConsumeRandomLengthString(999);
TestOneInput(pattern, options, text);
return 0;
}

2
extern/re2/re2/make_perl_groups.pl vendored Normal file → Executable file
View File

@ -76,7 +76,7 @@ sub PrintClass($$@) {
} else {
$negname =~ y/a-z/A-Z/;
}
return "{ \"$escname\", +1, code$cnum, $n }", "{ \"$negname\", -1, code$cnum, $n }";
return "{ \"$escname\", +1, code$cnum, $n, 0, 0 }", "{ \"$negname\", -1, code$cnum, $n, 0, 0 }";
}
my $cnum = 0;

0
extern/re2/re2/make_unicode_casefold.py vendored Normal file → Executable file
View File

0
extern/re2/re2/make_unicode_groups.py vendored Normal file → Executable file
View File

View File

@ -38,14 +38,21 @@ static bool CanBeEmptyString(Regexp *re);
class PCREWalker : public Regexp::Walker<bool> {
public:
PCREWalker() {}
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
int nchild_args);
bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args);
virtual bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk(), not WalkExponential().
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
LOG(DFATAL) << "PCREWalker::ShortVisit called";
#endif
return a;
}
private:
PCREWalker(const PCREWalker&) = delete;
PCREWalker& operator=(const PCREWalker&) = delete;
};
// Called after visiting each of re's children and accumulating
@ -114,13 +121,16 @@ bool Regexp::MimicsPCRE() {
class EmptyStringWalker : public Regexp::Walker<bool> {
public:
EmptyStringWalker() { }
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args);
EmptyStringWalker() {}
bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk not WalkExponential.
virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args);
virtual bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk(), not WalkExponential().
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
#endif
return a;
}

220
extern/re2/re2/nfa.cc vendored
View File

@ -27,17 +27,18 @@
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include <deque>
#include <string>
#include <utility>
#include <vector>
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/regexp.h"
#include "util/logging.h"
#include "util/pod_array.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
#include "util/strutil.h"
#include "re2/sparse_array.h"
#include "re2/sparse_set.h"
namespace re2 {
@ -107,18 +108,21 @@ class NFA {
// Returns text version of capture information, for debugging.
std::string FormatCapture(const char** capture);
inline void CopyCapture(const char** dst, const char** src);
void CopyCapture(const char** dst, const char** src) {
memmove(dst, src, ncapture_*sizeof src[0]);
}
Prog* prog_; // underlying program
int start_; // start instruction in program
int ncapture_; // number of submatches to track
bool longest_; // whether searching for longest match
bool endmatch_; // whether match must end at text.end()
const char* btext_; // beginning of text being matched (for FormatSubmatch)
const char* etext_; // end of text being matched (for endmatch_)
const char* btext_; // beginning of text (for FormatSubmatch)
const char* etext_; // end of text (for endmatch_)
Threadq q0_, q1_; // pre-allocated for Search.
PODArray<AddState> stack_; // pre-allocated for AddToThreadq
Thread* free_threads_; // free list
std::deque<Thread> arena_; // thread arena
Thread* freelist_; // thread freelist
const char** match_; // best match so far
bool matched_; // any match so far?
@ -141,31 +145,30 @@ NFA::NFA(Prog* prog) {
prog_->inst_count(kInstEmptyWidth) +
prog_->inst_count(kInstNop) + 1; // + 1 for start inst
stack_ = PODArray<AddState>(nstack);
free_threads_ = NULL;
freelist_ = NULL;
match_ = NULL;
matched_ = false;
}
NFA::~NFA() {
delete[] match_;
Thread* next;
for (Thread* t = free_threads_; t; t = next) {
next = t->next;
delete[] t->capture;
delete t;
}
for (const Thread& t : arena_)
delete[] t.capture;
}
NFA::Thread* NFA::AllocThread() {
Thread* t = free_threads_;
if (t == NULL) {
t = new Thread;
Thread* t = freelist_;
if (t != NULL) {
freelist_ = t->next;
t->ref = 1;
t->capture = new const char*[ncapture_];
// We don't need to touch t->capture because
// the caller will immediately overwrite it.
return t;
}
free_threads_ = t->next;
arena_.emplace_back();
t = &arena_.back();
t->ref = 1;
t->capture = new const char*[ncapture_];
return t;
}
@ -176,21 +179,13 @@ NFA::Thread* NFA::Incref(Thread* t) {
}
void NFA::Decref(Thread* t) {
if (t == NULL)
return;
DCHECK(t != NULL);
t->ref--;
if (t->ref > 0)
return;
DCHECK_EQ(t->ref, 0);
t->next = free_threads_;
free_threads_ = t;
}
void NFA::CopyCapture(const char** dst, const char** src) {
for (int i = 0; i < ncapture_; i+=2) {
dst[i] = src[i];
dst[i+1] = src[i+1];
}
t->next = freelist_;
freelist_ = t;
}
// Follows all empty arrows from id0 and enqueues all the states reached.
@ -372,8 +367,10 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
matched_ = true;
Decref(t);
for (++i; i != runq->end(); ++i)
Decref(i->value());
for (++i; i != runq->end(); ++i) {
if (i->value() != NULL)
Decref(i->value());
}
runq->clear();
if (ip->greedy(prog_))
return ip->out1();
@ -382,10 +379,15 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
break;
case kInstMatch: {
// Avoid invoking undefined behavior when p happens
// to be null - and p-1 would be meaningless anyway.
if (p == NULL)
// Avoid invoking undefined behavior (arithmetic on a null pointer)
// by storing p instead of p-1. (What would the latter even mean?!)
// This complements the special case in NFA::Search().
if (p == NULL) {
CopyCapture(match_, t->capture);
match_[1] = p;
matched_ = true;
break;
}
if (endmatch_ && p-1 != etext_)
break;
@ -411,8 +413,10 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
// worse than the one we just found: don't run the
// rest of the current Threadq.
Decref(t);
for (++i; i != runq->end(); ++i)
Decref(i->value());
for (++i; i != runq->end(); ++i) {
if (i->value() != NULL)
Decref(i->value());
}
runq->clear();
return 0;
}
@ -431,12 +435,12 @@ std::string NFA::FormatCapture(const char** capture) {
if (capture[i] == NULL)
s += "(?,?)";
else if (capture[i+1] == NULL)
s += StringPrintf("(%d,?)",
(int)(capture[i] - btext_));
s += StringPrintf("(%td,?)",
capture[i] - btext_);
else
s += StringPrintf("(%d,%d)",
(int)(capture[i] - btext_),
(int)(capture[i+1] - btext_));
s += StringPrintf("(%td,%td)",
capture[i] - btext_,
capture[i+1] - btext_);
}
return s;
}
@ -448,7 +452,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
return false;
StringPiece context = const_context;
if (context.begin() == NULL)
if (context.data() == NULL)
context = text;
// Sanity check: make sure that text lies within context.
@ -465,7 +469,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
if (prog_->anchor_end()) {
longest = true;
endmatch_ = true;
etext_ = text.end();
}
if (nsubmatch < 0) {
@ -485,32 +488,33 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
}
match_ = new const char*[ncapture_];
memset(match_, 0, ncapture_*sizeof match_[0]);
matched_ = false;
// For debugging prints.
btext_ = context.begin();
btext_ = context.data();
// For convenience.
etext_ = text.data() + text.size();
if (ExtraDebug)
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
std::string(text).c_str(), std::string(context).c_str(), anchored,
longest);
std::string(text).c_str(), std::string(context).c_str(), anchored, longest);
// Set up search.
Threadq* runq = &q0_;
Threadq* nextq = &q1_;
runq->clear();
nextq->clear();
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
// Loop over the text, stepping the machine.
for (const char* p = text.begin();; p++) {
for (const char* p = text.data();; p++) {
if (ExtraDebug) {
int c = 0;
if (p == context.begin())
if (p == btext_)
c = '^';
else if (p > text.end())
else if (p > etext_)
c = '$';
else if (p < text.end())
else if (p < etext_)
c = p[0] & 0xFF;
fprintf(stderr, "%c:", c);
@ -524,14 +528,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
}
// This is a no-op the first time around the loop because runq is empty.
int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, context, p);
int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p);
DCHECK_EQ(runq->size(), 0);
using std::swap;
swap(nextq, runq);
nextq->clear();
if (id != 0) {
// We're done: full match ahead.
p = text.end();
p = etext_;
for (;;) {
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
@ -559,30 +563,28 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
break;
}
if (p > text.end())
if (p > etext_)
break;
// Start a new thread if there have not been any matches.
// (No point in starting a new thread if there have been
// matches, since it would be to the right of the match
// we already found.)
if (!matched_ && (!anchored || p == text.begin())) {
// If there's a required first byte for an unanchored search
// and we're not in the middle of any possible matches,
// use memchr to search for the byte quickly.
int fb = prog_->first_byte();
if (!matched_ && (!anchored || p == text.data())) {
// Try to use prefix accel (e.g. memchr) to skip ahead.
// The search must be unanchored and there must be zero
// possible matches already.
if (!anchored && runq->size() == 0 &&
fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
if (p == NULL) {
p = text.end();
}
p < etext_ && prog_->can_prefix_accel()) {
p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p));
if (p == NULL)
p = etext_;
}
Thread* t = AllocThread();
CopyCapture(t->capture, match_);
t->capture[0] = p;
AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, context, p,
AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p,
t);
Decref(t);
}
@ -593,10 +595,24 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
fprintf(stderr, "dead\n");
break;
}
// Avoid invoking undefined behavior (arithmetic on a null pointer)
// by simply not continuing the loop.
// This complements the special case in NFA::Step().
if (p == NULL) {
(void) Step(runq, nextq, -1, context, p);
DCHECK_EQ(runq->size(), 0);
using std::swap;
swap(nextq, runq);
nextq->clear();
break;
}
}
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
Decref(i->value());
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
if (i->value() != NULL)
Decref(i->value());
}
if (matched_) {
for (int i = 0; i < nsubmatch; i++)
@ -605,73 +621,13 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
if (ExtraDebug)
fprintf(stderr, "match (%td,%td)\n",
match_[0] - btext_, match_[1] - btext_);
match_[0] - btext_,
match_[1] - btext_);
return true;
}
return false;
}
// Computes whether all successful matches have a common first byte,
// and if so, returns that byte. If not, returns -1.
int Prog::ComputeFirstByte() {
int b = -1;
SparseSet q(size());
q.insert(start());
for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
int id = *it;
Prog::Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
break;
case kInstMatch:
// The empty string matches: no first byte.
return -1;
case kInstByteRange:
if (!ip->last())
q.insert(id+1);
// Must match only a single byte
if (ip->lo() != ip->hi())
return -1;
if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
return -1;
// If we haven't seen any bytes yet, record it;
// otherwise must match the one we saw before.
if (b == -1)
b = ip->lo();
else if (b != ip->lo())
return -1;
break;
case kInstNop:
case kInstCapture:
case kInstEmptyWidth:
if (!ip->last())
q.insert(id+1);
// Continue on.
// Ignore ip->empty() flags for kInstEmptyWidth
// in order to be as conservative as possible
// (assume all possible empty-width flags are true).
if (ip->out())
q.insert(ip->out());
break;
case kInstAltMatch:
DCHECK(!ip->last());
q.insert(id+1);
break;
case kInstFail:
break;
}
}
return b;
}
bool
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,

View File

@ -59,11 +59,11 @@
#include "util/util.h"
#include "util/logging.h"
#include "util/pod_array.h"
#include "util/sparse_set.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/sparse_set.h"
#include "re2/stringpiece.h"
// Silence "zero-sized array in struct/union" warning for OneState::action.
@ -235,7 +235,7 @@ bool Prog::SearchOnePass(const StringPiece& text,
matchcap[i] = NULL;
StringPiece context = const_context;
if (context.begin() == NULL)
if (context.data() == NULL)
context = text;
if (anchor_start() && context.begin() != text.begin())
return false;
@ -249,8 +249,8 @@ bool Prog::SearchOnePass(const StringPiece& text,
// start() is always mapped to the zeroth OneState.
OneState* state = IndexToNode(nodes, statesize, 0);
uint8_t* bytemap = bytemap_;
const char* bp = text.begin();
const char* ep = text.end();
const char* bp = text.data();
const char* ep = text.data() + text.size();
const char* p;
bool matched = false;
matchcap[0] = bp;
@ -550,7 +550,7 @@ bool Prog::IsOnePass() {
if (!AddQ(&workq, ip->out())) {
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: multiple paths %d -> %d\n", *it, ip->out());
"Not OnePass: multiple paths %d -> %d", *it, ip->out());
goto fail;
}
id = ip->out();
@ -561,7 +561,7 @@ bool Prog::IsOnePass() {
// (3) is violated
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: multiple matches from %d\n", *it);
"Not OnePass: multiple matches from %d", *it);
goto fail;
}
matched = true;

View File

@ -27,9 +27,9 @@
#include "util/util.h"
#include "util/logging.h"
#include "util/pod_array.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/pod_array.h"
#include "re2/regexp.h"
#include "re2/stringpiece.h"
#include "re2/unicode_casefold.h"
@ -93,7 +93,7 @@ class Regexp::ParseState {
bool PushSimpleOp(RegexpOp op);
// Pushes a ^ onto the stack.
bool PushCarat();
bool PushCaret();
// Pushes a \b (word == true) or \B (word == false) onto the stack.
bool PushWordBoundary(bool word);
@ -423,7 +423,7 @@ bool Regexp::ParseState::PushLiteral(Rune r) {
}
// Pushes a ^ onto the stack.
bool Regexp::ParseState::PushCarat() {
bool Regexp::ParseState::PushCaret() {
if (flags_ & OneLine) {
return PushSimpleOp(kRegexpBeginText);
}
@ -556,9 +556,10 @@ int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
}
// Short-circuit visitor hook for RepetitionWalker.
// Should never be called: we use Walk(), not WalkExponential().
// If it is reached anyway, the call is reported through LOG(DFATAL) (the
// report is compiled out when FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION is
// defined) and 0 is returned.
int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
LOG(DFATAL) << "RepetitionWalker::ShortVisit called";
#endif
return 0;
}
@ -684,7 +685,7 @@ bool Regexp::ParseState::DoRightParen() {
if ((r1 = stacktop_) == NULL ||
(r2 = r1->down_) == NULL ||
r2->op() != kLeftParen) {
status_->set_code(kRegexpMissingParen);
status_->set_code(kRegexpUnexpectedParen);
status_->set_error_arg(whole_regexp_);
return false;
}
@ -1323,14 +1324,14 @@ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
// Parses a decimal integer, storing it in *np.
// Sets *s to span the remainder of the string.
static bool ParseInteger(StringPiece* s, int* np) {
if (s->size() == 0 || !isdigit((*s)[0] & 0xFF))
if (s->empty() || !isdigit((*s)[0] & 0xFF))
return false;
// Disallow leading zeros.
if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF))
return false;
int n = 0;
int c;
while (s->size() > 0 && isdigit(c = (*s)[0] & 0xFF)) {
while (!s->empty() && isdigit(c = (*s)[0] & 0xFF)) {
// Avoid overflow.
if (n >= 100000000)
return false;
@ -1352,16 +1353,16 @@ static bool ParseInteger(StringPiece* s, int* np) {
// s must NOT be edited unless MaybeParseRepetition returns true.
static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) {
StringPiece s = *sp;
if (s.size() == 0 || s[0] != '{')
if (s.empty() || s[0] != '{')
return false;
s.remove_prefix(1); // '{'
if (!ParseInteger(&s, lo))
return false;
if (s.size() == 0)
if (s.empty())
return false;
if (s[0] == ',') {
s.remove_prefix(1); // ','
if (s.size() == 0)
if (s.empty())
return false;
if (s[0] == '}') {
// {2,} means at least 2
@ -1375,7 +1376,7 @@ static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) {
// {2} means exactly two
*hi = *lo;
}
if (s.size() == 0 || s[0] != '}')
if (s.empty() || s[0] != '}')
return false;
s.remove_prefix(1); // '}'
*sp = s;
@ -1416,7 +1417,7 @@ static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) {
static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) {
StringPiece t = s;
Rune r;
while (t.size() > 0) {
while (!t.empty()) {
if (StringPieceToRune(&r, &t, status) < 0)
return false;
}
@ -1447,14 +1448,14 @@ static int UnHex(int c) {
// Sets *rp to the named character.
static bool ParseEscape(StringPiece* s, Rune* rp,
RegexpStatus* status, int rune_max) {
const char* begin = s->begin();
if (s->size() < 1 || (*s)[0] != '\\') {
const char* begin = s->data();
if (s->empty() || (*s)[0] != '\\') {
// Should not happen - caller always checks.
status->set_code(kRegexpInternalError);
status->set_error_arg(StringPiece());
return false;
}
if (s->size() < 2) {
if (s->size() == 1) {
status->set_code(kRegexpTrailingBackslash);
status->set_error_arg(StringPiece());
return false;
@ -1485,16 +1486,16 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
case '6':
case '7':
// Single non-zero octal digit is a backreference; not supported.
if (s->size() == 0 || (*s)[0] < '0' || (*s)[0] > '7')
if (s->empty() || (*s)[0] < '0' || (*s)[0] > '7')
goto BadEscape;
FALLTHROUGH_INTENDED;
case '0':
// consume up to three octal digits; already have one.
code = c - '0';
if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') {
if (!s->empty() && '0' <= (c = (*s)[0]) && c <= '7') {
code = code * 8 + c - '0';
s->remove_prefix(1); // digit
if (s->size() > 0) {
if (!s->empty()) {
c = (*s)[0];
if ('0' <= c && c <= '7') {
code = code * 8 + c - '0';
@ -1509,7 +1510,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
// Hexadecimal escapes
case 'x':
if (s->size() == 0)
if (s->empty())
goto BadEscape;
if (StringPieceToRune(&c, s, status) < 0)
return false;
@ -1529,7 +1530,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
code = code * 16 + UnHex(c);
if (code > rune_max)
goto BadEscape;
if (s->size() == 0)
if (s->empty())
goto BadEscape;
if (StringPieceToRune(&c, s, status) < 0)
return false;
@ -1540,7 +1541,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp,
return true;
}
// Easy case: two hex digits.
if (s->size() == 0)
if (s->empty())
goto BadEscape;
if (StringPieceToRune(&c1, s, status) < 0)
return false;
@ -1590,7 +1591,7 @@ BadEscape:
// Unrecognized escape sequence.
status->set_code(kRegexpBadEscape);
status->set_error_arg(
StringPiece(begin, static_cast<size_t>(s->begin() - begin)));
StringPiece(begin, static_cast<size_t>(s->data() - begin)));
return false;
}
@ -1710,7 +1711,7 @@ const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_fl
return NULL;
// Could use StringPieceToRune, but there aren't
// any non-ASCII Perl group names.
StringPiece name(s->begin(), 2);
StringPiece name(s->data(), 2);
const UGroup *g = LookupPerlGroup(name);
if (g == NULL)
return NULL;
@ -1750,8 +1751,8 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
return kParseError;
if (c != '{') {
// Name is the bit of string we just skipped over for c.
const char* p = seq.begin() + 2;
name = StringPiece(p, static_cast<size_t>(s->begin() - p));
const char* p = seq.data() + 2;
name = StringPiece(p, static_cast<size_t>(s->data() - p));
} else {
// Name is in braces. Look for closing }
size_t end = s->find('}', 0);
@ -1762,16 +1763,16 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
status->set_error_arg(seq);
return kParseError;
}
name = StringPiece(s->begin(), end); // without '}'
name = StringPiece(s->data(), end); // without '}'
s->remove_prefix(end + 1); // with '}'
if (!IsValidUTF8(name, status))
return kParseError;
}
// Chop seq where s now begins.
seq = StringPiece(seq.begin(), static_cast<size_t>(s->begin() - seq.begin()));
seq = StringPiece(seq.data(), static_cast<size_t>(s->data() - seq.data()));
if (name.size() > 0 && name[0] == '^') {
if (!name.empty() && name[0] == '^') {
sign = -sign;
name.remove_prefix(1); // '^'
}
@ -1801,14 +1802,13 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,
// Convert the UnicodeSet to a URange32 and UGroup that we can add.
int nr = uset.getRangeCount();
URange32* r = new URange32[nr];
PODArray<URange32> r(nr);
for (int i = 0; i < nr; i++) {
r[i].lo = uset.getRangeStart(i);
r[i].hi = uset.getRangeEnd(i);
}
UGroup g = {"", +1, 0, 0, r, nr};
UGroup g = {"", +1, 0, 0, r.data(), nr};
AddUGroup(cc, &g, sign, parse_flags);
delete[] r;
#endif
return kParseOk;
@ -1858,7 +1858,7 @@ static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags,
bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp,
const StringPiece& whole_class,
RegexpStatus* status) {
if (s->size() == 0) {
if (s->empty()) {
status->set_code(kRegexpMissingBracket);
status->set_error_arg(whole_class);
return false;
@ -1866,7 +1866,7 @@ bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp,
// Allow regular escape sequences even though
// many need not be escaped in this context.
if (s->size() >= 1 && (*s)[0] == '\\')
if ((*s)[0] == '\\')
return ParseEscape(s, rp, status, rune_max_);
// Otherwise take the next rune.
@ -1908,7 +1908,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s,
Regexp** out_re,
RegexpStatus* status) {
StringPiece whole_class = *s;
if (s->size() == 0 || (*s)[0] != '[') {
if (s->empty() || (*s)[0] != '[') {
// Caller checked this.
status->set_code(kRegexpInternalError);
status->set_error_arg(StringPiece());
@ -1918,7 +1918,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s,
Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
re->ccb_ = new CharClassBuilder;
s->remove_prefix(1); // '['
if (s->size() > 0 && (*s)[0] == '^') {
if (!s->empty() && (*s)[0] == '^') {
s->remove_prefix(1); // '^'
negated = true;
if (!(flags_ & ClassNL) || (flags_ & NeverNL)) {
@ -1928,7 +1928,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s,
}
}
bool first = true; // ] is okay as first char in class
while (s->size() > 0 && ((*s)[0] != ']' || first)) {
while (!s->empty() && ((*s)[0] != ']' || first)) {
// - is only okay unescaped as first or last in class.
// Except that Perl allows - anywhere.
if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&
@ -1996,7 +1996,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s,
// in the flags.
re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL);
}
if (s->size() == 0) {
if (s->empty()) {
status->set_code(kRegexpMissingBracket);
status->set_error_arg(whole_class);
re->Decref();
@ -2016,7 +2016,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s,
// Python rejects names starting with digits.
// We don't enforce either of those.
static bool IsValidCaptureName(const StringPiece& name) {
if (name.size() == 0)
if (name.empty())
return false;
for (size_t i = 0; i < name.size(); i++) {
int c = name[i];
@ -2074,8 +2074,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {
}
// t is "P<name>...", t[end] == '>'
StringPiece capture(t.begin()-2, end+3); // "(?P<name>"
StringPiece name(t.begin()+2, end-2); // "name"
StringPiece capture(t.data()-2, end+3); // "(?P<name>"
StringPiece name(t.data()+2, end-2); // "name"
if (!IsValidUTF8(name, status_))
return false;
if (!IsValidCaptureName(name)) {
@ -2089,7 +2089,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {
return false;
}
s->remove_prefix(static_cast<size_t>(capture.end() - s->begin()));
s->remove_prefix(
static_cast<size_t>(capture.data() + capture.size() - s->data()));
return true;
}
@ -2098,7 +2099,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {
int nflags = flags_;
Rune c;
for (bool done = false; !done; ) {
if (t.size() == 0)
if (t.empty())
goto BadPerlOp;
if (StringPieceToRune(&c, &t, status_) < 0)
return false;
@ -2173,7 +2174,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {
BadPerlOp:
status_->set_code(kRegexpBadPerlOp);
status_->set_error_arg(
StringPiece(s->begin(), static_cast<size_t>(t.begin() - s->begin())));
StringPiece(s->data(), static_cast<size_t>(t.data() - s->data())));
return false;
}
@ -2216,7 +2217,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
if (global_flags & Literal) {
// Special parse loop for literal string.
while (t.size() > 0) {
while (!t.empty()) {
Rune r;
if (StringPieceToRune(&r, &t, status) < 0)
return NULL;
@ -2227,7 +2228,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
}
StringPiece lastunary = StringPiece();
while (t.size() > 0) {
while (!t.empty()) {
StringPiece isunary = StringPiece();
switch (t[0]) {
default: {
@ -2270,7 +2271,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
break;
case '^': // Beginning of line.
if (!ps.PushCarat())
if (!ps.PushCaret())
return NULL;
t.remove_prefix(1); // '^'
break;
@ -2311,18 +2312,18 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
bool nongreedy = false;
t.remove_prefix(1); // '*' or '+' or '?'
if (ps.flags() & PerlX) {
if (t.size() > 0 && t[0] == '?') {
if (!t.empty() && t[0] == '?') {
nongreedy = true;
t.remove_prefix(1); // '?'
}
if (lastunary.size() > 0) {
if (!lastunary.empty()) {
// In Perl it is not allowed to stack repetition operators:
// a** is a syntax error, not a double-star.
// (and a++ means something else entirely, which we don't support!)
status->set_code(kRegexpRepeatOp);
status->set_error_arg(StringPiece(
lastunary.begin(),
static_cast<size_t>(t.begin() - lastunary.begin())));
lastunary.data(),
static_cast<size_t>(t.data() - lastunary.data())));
return NULL;
}
}
@ -2346,16 +2347,16 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
}
bool nongreedy = false;
if (ps.flags() & PerlX) {
if (t.size() > 0 && t[0] == '?') {
if (!t.empty() && t[0] == '?') {
nongreedy = true;
t.remove_prefix(1); // '?'
}
if (lastunary.size() > 0) {
if (!lastunary.empty()) {
// Not allowed to stack repetition operators.
status->set_code(kRegexpRepeatOp);
status->set_error_arg(StringPiece(
lastunary.begin(),
static_cast<size_t>(t.begin() - lastunary.begin())));
lastunary.data(),
static_cast<size_t>(t.data() - lastunary.data())));
return NULL;
}
}
@ -2404,7 +2405,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,
if (t[1] == 'Q') { // \Q ... \E: the ... is always literals
t.remove_prefix(2); // '\\', 'Q'
while (t.size() > 0) {
while (!t.empty()) {
if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') {
t.remove_prefix(2); // '\\', 'E'
break;

View File

@ -20,12 +20,12 @@ static const URange16 code3[] = { /* \w */
{ 0x61, 0x7a },
};
const UGroup perl_groups[] = {
{ "\\d", +1, code1, 1 },
{ "\\D", -1, code1, 1 },
{ "\\s", +1, code2, 3 },
{ "\\S", -1, code2, 3 },
{ "\\w", +1, code3, 4 },
{ "\\W", -1, code3, 4 },
{ "\\d", +1, code1, 1, 0, 0 },
{ "\\D", -1, code1, 1, 0, 0 },
{ "\\s", +1, code2, 3, 0, 0 },
{ "\\S", -1, code2, 3, 0, 0 },
{ "\\w", +1, code3, 4, 0, 0 },
{ "\\W", -1, code3, 4, 0, 0 },
};
const int num_perl_groups = 6;
static const URange16 code4[] = { /* [:alnum:] */
@ -85,34 +85,34 @@ static const URange16 code17[] = { /* [:xdigit:] */
{ 0x61, 0x66 },
};
const UGroup posix_groups[] = {
{ "[:alnum:]", +1, code4, 3 },
{ "[:^alnum:]", -1, code4, 3 },
{ "[:alpha:]", +1, code5, 2 },
{ "[:^alpha:]", -1, code5, 2 },
{ "[:ascii:]", +1, code6, 1 },
{ "[:^ascii:]", -1, code6, 1 },
{ "[:blank:]", +1, code7, 2 },
{ "[:^blank:]", -1, code7, 2 },
{ "[:cntrl:]", +1, code8, 2 },
{ "[:^cntrl:]", -1, code8, 2 },
{ "[:digit:]", +1, code9, 1 },
{ "[:^digit:]", -1, code9, 1 },
{ "[:graph:]", +1, code10, 1 },
{ "[:^graph:]", -1, code10, 1 },
{ "[:lower:]", +1, code11, 1 },
{ "[:^lower:]", -1, code11, 1 },
{ "[:print:]", +1, code12, 1 },
{ "[:^print:]", -1, code12, 1 },
{ "[:punct:]", +1, code13, 4 },
{ "[:^punct:]", -1, code13, 4 },
{ "[:space:]", +1, code14, 2 },
{ "[:^space:]", -1, code14, 2 },
{ "[:upper:]", +1, code15, 1 },
{ "[:^upper:]", -1, code15, 1 },
{ "[:word:]", +1, code16, 4 },
{ "[:^word:]", -1, code16, 4 },
{ "[:xdigit:]", +1, code17, 3 },
{ "[:^xdigit:]", -1, code17, 3 },
{ "[:alnum:]", +1, code4, 3, 0, 0 },
{ "[:^alnum:]", -1, code4, 3, 0, 0 },
{ "[:alpha:]", +1, code5, 2, 0, 0 },
{ "[:^alpha:]", -1, code5, 2, 0, 0 },
{ "[:ascii:]", +1, code6, 1, 0, 0 },
{ "[:^ascii:]", -1, code6, 1, 0, 0 },
{ "[:blank:]", +1, code7, 2, 0, 0 },
{ "[:^blank:]", -1, code7, 2, 0, 0 },
{ "[:cntrl:]", +1, code8, 2, 0, 0 },
{ "[:^cntrl:]", -1, code8, 2, 0, 0 },
{ "[:digit:]", +1, code9, 1, 0, 0 },
{ "[:^digit:]", -1, code9, 1, 0, 0 },
{ "[:graph:]", +1, code10, 1, 0, 0 },
{ "[:^graph:]", -1, code10, 1, 0, 0 },
{ "[:lower:]", +1, code11, 1, 0, 0 },
{ "[:^lower:]", -1, code11, 1, 0, 0 },
{ "[:print:]", +1, code12, 1, 0, 0 },
{ "[:^print:]", -1, code12, 1, 0, 0 },
{ "[:punct:]", +1, code13, 4, 0, 0 },
{ "[:^punct:]", -1, code13, 4, 0, 0 },
{ "[:space:]", +1, code14, 2, 0, 0 },
{ "[:^space:]", -1, code14, 2, 0, 0 },
{ "[:upper:]", +1, code15, 1, 0, 0 },
{ "[:^upper:]", -1, code15, 1, 0, 0 },
{ "[:word:]", +1, code16, 4, 0, 0 },
{ "[:^word:]", -1, code16, 4, 0, 0 },
{ "[:xdigit:]", +1, code17, 3, 0, 0 },
{ "[:^xdigit:]", -1, code17, 3, 0, 0 },
};
const int num_posix_groups = 28;

View File

@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_POD_ARRAY_H_
#define UTIL_POD_ARRAY_H_
#ifndef RE2_POD_ARRAY_H_
#define RE2_POD_ARRAY_H_
#include <memory>
#include <type_traits>
@ -13,7 +13,7 @@ namespace re2 {
template <typename T>
class PODArray {
public:
static_assert(std::is_pod<T>::value,
static_assert(std::is_trivial<T>::value && std::is_standard_layout<T>::value,
"T must be POD");
PODArray()
@ -52,4 +52,4 @@ class PODArray {
} // namespace re2
#endif // UTIL_POD_ARRAY_H_
#endif // RE2_POD_ARRAY_H_

View File

@ -648,14 +648,15 @@ Prefilter* Prefilter::FromRegexp(Regexp* re) {
return NULL;
Regexp* simple = re->Simplify();
Prefilter::Info *info = BuildInfo(simple);
if (simple == NULL)
return NULL;
Prefilter::Info* info = BuildInfo(simple);
simple->Decref();
if (info == NULL)
return NULL;
Prefilter* m = info->TakeMatch();
delete info;
return m;
}

View File

@ -107,7 +107,7 @@ void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
std::string node_string = NodeString(node);
std::map<std::string, Prefilter*>::iterator iter = nodes->find(node_string);
NodeMap::iterator iter = nodes->find(node_string);
if (iter == nodes->end())
return NULL;
return (*iter).second;
@ -377,7 +377,7 @@ void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
LOG(ERROR) << it->first;
}
LOG(ERROR) << "Map:";
for (std::map<std::string, Prefilter*>::const_iterator iter = nodes->begin();
for (NodeMap::const_iterator iter = nodes->begin();
iter != nodes->end(); ++iter)
LOG(ERROR) << "NodeId: " << (*iter).second->unique_id()
<< " Str: " << (*iter).first;

View File

@ -21,8 +21,8 @@
#include <vector>
#include "util/util.h"
#include "util/sparse_array.h"
#include "re2/prefilter.h"
#include "re2/sparse_array.h"
namespace re2 {

149
extern/re2/re2/prog.cc vendored
View File

@ -7,6 +7,12 @@
#include "re2/prog.h"
#if defined(__AVX2__)
#include <immintrin.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#endif
#include <stdint.h>
#include <string.h>
#include <algorithm>
@ -109,8 +115,9 @@ Prog::Prog()
start_unanchored_(0),
size_(0),
bytemap_range_(0),
first_byte_(-1),
flags_(0),
prefix_size_(0),
prefix_front_(-1),
prefix_back_(-1),
list_count_(0),
dfa_mem_(0),
dfa_first_(NULL),
@ -185,14 +192,31 @@ std::string Prog::DumpByteMap() {
return map;
}
int Prog::first_byte() {
std::call_once(first_byte_once_, [](Prog* prog) {
prog->first_byte_ = prog->ComputeFirstByte();
}, this);
return first_byte_;
}
// Is ip a guaranteed match at end of text, perhaps after some capturing?
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
for (;;) {
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
return false;
static bool IsMatch(Prog*, Prog::Inst*);
case kInstAlt:
case kInstAltMatch:
case kInstByteRange:
case kInstFail:
case kInstEmptyWidth:
return false;
case kInstCapture:
case kInstNop:
ip = prog->inst(ip->out());
break;
case kInstMatch:
return true;
}
}
}
// Peep-hole optimizer.
void Prog::Optimize() {
@ -258,54 +282,28 @@ void Prog::Optimize() {
}
}
// Is ip a guaranteed match at end of text, perhaps after some capturing?
//
// Starting at ip, follows out() through kInstCapture and kInstNop
// instructions. Returns true if that chain reaches kInstMatch, and false as
// soon as any other opcode is encountered.
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
for (;;) {
switch (ip->opcode()) {
default:
// Opcode not listed below: report it and answer "not a match".
LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
return false;
// None of these opcodes is treated as a guaranteed match.
case kInstAlt:
case kInstAltMatch:
case kInstByteRange:
case kInstFail:
case kInstEmptyWidth:
return false;
// Captures and no-ops are stepped over; keep following out().
case kInstCapture:
case kInstNop:
ip = prog->inst(ip->out());
break;
case kInstMatch:
return true;
}
}
}
uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
int flags = 0;
// ^ and \A
if (p == text.begin())
if (p == text.data())
flags |= kEmptyBeginText | kEmptyBeginLine;
else if (p[-1] == '\n')
flags |= kEmptyBeginLine;
// $ and \z
if (p == text.end())
if (p == text.data() + text.size())
flags |= kEmptyEndText | kEmptyEndLine;
else if (p < text.end() && p[0] == '\n')
else if (p < text.data() + text.size() && p[0] == '\n')
flags |= kEmptyEndLine;
// \b and \B
if (p == text.begin() && p == text.end()) {
if (p == text.data() && p == text.data() + text.size()) {
// no word boundary here
} else if (p == text.begin()) {
} else if (p == text.data()) {
if (IsWordChar(p[0]))
flags |= kEmptyWordBoundary;
} else if (p == text.end()) {
} else if (p == text.data() + text.size()) {
if (IsWordChar(p[-1]))
flags |= kEmptyWordBoundary;
} else {
@ -918,4 +916,73 @@ void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {
}
}
#if defined(__AVX2__)
// Returns the zero-based index of the lowest bit that is set in n.
// n must be non-zero (checked with DCHECK_NE).
static int FindLSBSet(uint32_t n) {
  DCHECK_NE(n, 0);
#if defined(__GNUC__)
  return __builtin_ctz(n);
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
  unsigned long index;
  _BitScanForward(&index, n);
  return static_cast<int>(index);
#else
  // Portable fallback: halve the step each round (16, 8, 4, 2, 1). Whenever
  // shifting left by the step still leaves a non-zero word, the lowest set
  // bit sits at least that many positions below the top, so keep the shifted
  // word and lower the index accordingly.
  int index = 31;
  int step = 1 << 4;
  while (step != 0) {
    const uint32_t shifted = n << step;
    if (shifted != 0) {
      n = shifted;
      index -= step;
    }
    step >>= 1;
  }
  return index;
#endif
}
#endif
// Scans data[0, size) for a candidate prefix position: a byte equal to
// prefix_front_ whose companion byte prefix_size_-1 positions later equals
// prefix_back_. Returns a pointer to the front byte of the first such
// candidate, or NULL if there is none (including when size is smaller than
// prefix_size_). Only the first and last bytes of the prefix are compared
// here; the bytes in between are not examined.
const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) {
DCHECK_GE(prefix_size_, 2);
if (size < prefix_size_)
return NULL;
// Don't bother searching the last prefix_size_-1 bytes for prefix_front_.
// This also means that probing for prefix_back_ doesn't go out of bounds.
size -= prefix_size_-1;
#if defined(__AVX2__)
// Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time.
if (size >= sizeof(__m256i)) {
// fp walks the candidate front bytes; bp walks the bytes prefix_size_-1
// further along, so lane i of both loads describes the same candidate.
const __m256i* fp = reinterpret_cast<const __m256i*>(
reinterpret_cast<const char*>(data));
const __m256i* bp = reinterpret_cast<const __m256i*>(
reinterpret_cast<const char*>(data) + prefix_size_-1);
// Only whole 32-byte chunks are handled in this loop.
const __m256i* endfp = fp + size/sizeof(__m256i);
const __m256i f_set1 = _mm256_set1_epi8(prefix_front_);
const __m256i b_set1 = _mm256_set1_epi8(prefix_back_);
while (fp != endfp) {
const __m256i f_loadu = _mm256_loadu_si256(fp++);
const __m256i b_loadu = _mm256_loadu_si256(bp++);
const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu);
const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu);
const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq);
if (fb_testz == 0) { // ZF: 1 means zero, 0 means non-zero.
// At least one lane matched both front and back: the lowest set bit of
// the combined mask is the byte offset of the first such lane within
// the chunk just loaded (fp was already advanced, hence fp-1).
const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq);
const int fb_movemask = _mm256_movemask_epi8(fb_and);
const int fb_ctz = FindLSBSet(fb_movemask);
return reinterpret_cast<const char*>(fp-1) + fb_ctz;
}
}
// Hand the leftover tail (fewer than 32 bytes) to the scalar loop below.
data = fp;
size = size%sizeof(__m256i);
}
#endif
const char* p0 = reinterpret_cast<const char*>(data);
// Scalar search: memchr finds the next prefix_front_ in the remaining
// window; the loop ends when none is left (p == NULL) or when the back byte
// also matches. On a front-only hit, p++ resumes just past that byte.
for (const char* p = p0;; p++) {
DCHECK_GE(size, static_cast<size_t>(p-p0));
p = reinterpret_cast<const char*>(memchr(p, prefix_front_, size - (p-p0)));
if (p == NULL || p[prefix_size_-1] == prefix_back_)
return p;
}
}
} // namespace re2

36
extern/re2/re2/prog.h vendored
View File

@ -18,10 +18,10 @@
#include "util/util.h"
#include "util/logging.h"
#include "util/pod_array.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
#include "re2/pod_array.h"
#include "re2/re2.h"
#include "re2/sparse_array.h"
#include "re2/sparse_set.h"
namespace re2 {
@ -198,8 +198,8 @@ class Prog {
Inst *inst(int id) { return &inst_[id]; }
int start() { return start_; }
int start_unanchored() { return start_unanchored_; }
void set_start(int start) { start_ = start; }
int start_unanchored() { return start_unanchored_; }
void set_start_unanchored(int start) { start_unanchored_ = start; }
int size() { return size_; }
bool reversed() { return reversed_; }
@ -207,19 +207,27 @@ class Prog {
int list_count() { return list_count_; }
int inst_count(InstOp op) { return inst_count_[op]; }
uint16_t* list_heads() { return list_heads_.data(); }
void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
int64_t dfa_mem() { return dfa_mem_; }
int flags() { return flags_; }
void set_flags(int flags) { flags_ = flags; }
void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
bool anchor_start() { return anchor_start_; }
void set_anchor_start(bool b) { anchor_start_ = b; }
bool anchor_end() { return anchor_end_; }
void set_anchor_end(bool b) { anchor_end_ = b; }
int bytemap_range() { return bytemap_range_; }
const uint8_t* bytemap() { return bytemap_; }
bool can_prefix_accel() { return prefix_size_ != 0; }
// Lazily computed.
int first_byte();
// Skips ahead to the first position in data[0, size) where the prefix may
// start. Returns a pointer to that byte, or NULL when there is no such
// position. A one-byte prefix is located with memchr(3) alone; longer
// prefixes go through PrefixAccel_FrontAndBack().
const void* PrefixAccel(const void* data, size_t size) {
  DCHECK_GE(prefix_size_, 1);
  if (prefix_size_ == 1)
    return memchr(data, prefix_front_, size);
  return PrefixAccel_FrontAndBack(data, size);
}
// An implementation of prefix accel that looks for prefix_front_ and
// prefix_back_ to return fewer false positives than memchr(3) alone.
const void* PrefixAccel_FrontAndBack(const void* data, size_t size);
// Returns string representation of program for debugging.
std::string Dump();
@ -297,10 +305,6 @@ class Prog {
// Compute bytemap.
void ComputeByteMap();
// Computes whether all matches must begin with the same first
// byte, and if so, returns that byte. If not, returns -1.
int ComputeFirstByte();
// Run peep-hole optimizer on program.
void Optimize();
@ -402,8 +406,9 @@ class Prog {
int start_unanchored_; // unanchored entry point for program
int size_; // number of instructions
int bytemap_range_; // bytemap_[x] < bytemap_range_
int first_byte_; // required first byte for match, or -1 if none
int flags_; // regexp parse flags
size_t prefix_size_; // size of prefix (0 if no prefix)
int prefix_front_; // first byte of prefix (-1 if no prefix)
int prefix_back_; // last byte of prefix (-1 if no prefix)
int list_count_; // count of lists (see above)
int inst_count_[kNumInst]; // count of instructions by opcode
@ -419,7 +424,6 @@ class Prog {
uint8_t bytemap_[256]; // map from input bytes to byte classes
std::once_flag first_byte_once_;
std::once_flag dfa_first_once_;
std::once_flag dfa_longest_once_;

409
extern/re2/re2/re2.cc vendored
View File

@ -12,10 +12,14 @@
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <atomic>
#include <iterator>
#include <mutex>
#include <string>
@ -24,11 +28,11 @@
#include "util/util.h"
#include "util/logging.h"
#include "util/sparse_array.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/prog.h"
#include "re2/regexp.h"
#include "re2/sparse_array.h"
namespace re2 {
@ -79,6 +83,8 @@ static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
return RE2::ErrorMissingBracket;
case re2::kRegexpMissingParen:
return RE2::ErrorMissingParen;
case re2::kRegexpUnexpectedParen:
return RE2::ErrorUnexpectedParen;
case re2::kRegexpTrailingBackslash:
return RE2::ErrorTrailingBackslash;
case re2::kRegexpRepeatArgument:
@ -172,15 +178,20 @@ void RE2::Init(const StringPiece& pattern, const Options& options) {
empty_group_names = new std::map<int, std::string>;
});
pattern_ = std::string(pattern);
pattern_.assign(pattern.data(), pattern.size());
options_.Copy(options);
entire_regexp_ = NULL;
error_ = empty_string;
error_code_ = NoError;
error_arg_.clear();
prefix_.clear();
prefix_foldcase_ = false;
suffix_regexp_ = NULL;
prog_ = NULL;
num_captures_ = -1;
is_one_pass_ = false;
rprog_ = NULL;
error_ = empty_string;
error_code_ = NoError;
named_groups_ = NULL;
group_names_ = NULL;
@ -239,9 +250,11 @@ re2::Prog* RE2::ReverseProg() const {
if (re->rprog_ == NULL) {
if (re->options_.log_errors())
LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
re->error_ =
new std::string("pattern too large - reverse compile failed");
re->error_code_ = RE2::ErrorPatternTooLarge;
// We no longer touch error_ and error_code_ because failing to compile
// the reverse Prog is not a showstopper: falling back to NFA execution
// is fine. More importantly, an RE2 object is supposed to be logically
// immutable: whatever ok() would have returned after Init() completed,
// it should continue to return that no matter what ReverseProg() does.
}
}, this);
return rprog_;
@ -277,28 +290,54 @@ int RE2::ReverseProgramSize() const {
return prog->size();
}
static int Fanout(Prog* prog, std::map<int, int>* histogram) {
SparseArray<int> fanout(prog->size());
prog->Fanout(&fanout);
histogram->clear();
for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
// TODO(junyer): Optimise this?
int bucket = 0;
while (1 << bucket < i->value()) {
bucket++;
// Finds the most significant non-zero bit in n.
static int FindMSBSet(uint32_t n) {
DCHECK_NE(n, 0);
#if defined(__GNUC__)
return 31 ^ __builtin_clz(n);
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
unsigned long c;
_BitScanReverse(&c, n);
return static_cast<int>(c);
#else
int c = 0;
for (int shift = 1 << 4; shift != 0; shift >>= 1) {
uint32_t word = n >> shift;
if (word != 0) {
n = word;
c += shift;
}
(*histogram)[bucket]++;
}
return histogram->rbegin()->first;
return c;
#endif
}
int RE2::ProgramFanout(std::map<int, int>* histogram) const {
// Buckets the per-instruction fanout values of prog by ceil(log2(value)).
// Zero fanout values are ignored. If histogram is non-NULL it receives the
// bucket counts, from bucket 0 up to the largest non-empty bucket. Returns
// the index of the largest non-empty bucket, or -1 if every value was zero.
static int Fanout(Prog* prog, std::vector<int>* histogram) {
  SparseArray<int> fanout(prog->size());
  prog->Fanout(&fanout);

  int counts[32] = {};
  int used = 0;  // One past the highest bucket touched so far.
  for (SparseArray<int>::iterator it = fanout.begin(); it != fanout.end();
       ++it) {
    const uint32_t value = it->value();
    if (value == 0)
      continue;
    // Exact powers of two land in the bucket of their top bit; everything
    // else rounds up to the next bucket.
    const bool exact_power = (value & (value-1)) == 0;
    const int bucket = FindMSBSet(value) + (exact_power ? 0 : 1);
    counts[bucket]++;
    used = std::max(used, bucket+1);
  }

  if (histogram != NULL)
    histogram->assign(counts, counts+used);
  return used-1;
}
// Fills histogram with the fanout histogram of the forward program (see
// Fanout()) and returns its result, or -1 when there is no compiled program.
int RE2::ProgramFanout(std::vector<int>* histogram) const {
  return prog_ != NULL ? Fanout(prog_, histogram) : -1;
}
int RE2::ReverseProgramFanout(std::map<int, int>* histogram) const {
int RE2::ReverseProgramFanout(std::vector<int>* histogram) const {
if (prog_ == NULL)
return -1;
Prog* prog = ReverseProg();
@ -368,6 +407,8 @@ bool RE2::Replace(std::string* str,
const StringPiece& rewrite) {
StringPiece vec[kVecSize];
int nvec = 1 + MaxSubmatch(rewrite);
if (nvec > 1 + re.NumberOfCapturingGroups())
return false;
if (nvec > static_cast<int>(arraysize(vec)))
return false;
if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
@ -377,8 +418,8 @@ bool RE2::Replace(std::string* str,
if (!re.Rewrite(&s, rewrite, vec, nvec))
return false;
assert(vec[0].begin() >= str->data());
assert(vec[0].end() <= str->data()+str->size());
assert(vec[0].data() >= str->data());
assert(vec[0].data() + vec[0].size() <= str->data() + str->size());
str->replace(vec[0].data() - str->data(), vec[0].size(), s);
return true;
}
@ -388,6 +429,8 @@ int RE2::GlobalReplace(std::string* str,
const StringPiece& rewrite) {
StringPiece vec[kVecSize];
int nvec = 1 + MaxSubmatch(rewrite);
if (nvec > 1 + re.NumberOfCapturingGroups())
return false;
if (nvec > static_cast<int>(arraysize(vec)))
return false;
@ -406,9 +449,9 @@ int RE2::GlobalReplace(std::string* str,
if (!re.Match(*str, static_cast<size_t>(p - str->data()),
str->size(), UNANCHORED, vec, nvec))
break;
if (p < vec[0].begin())
out.append(p, vec[0].begin() - p);
if (vec[0].begin() == lastend && vec[0].size() == 0) {
if (p < vec[0].data())
out.append(p, vec[0].data() - p);
if (vec[0].data() == lastend && vec[0].empty()) {
// Disallow empty match at end of last match: skip ahead.
//
// fullrune() takes int, not ptrdiff_t. However, it just looks
@ -439,7 +482,7 @@ int RE2::GlobalReplace(std::string* str,
continue;
}
re.Rewrite(&out, rewrite, vec, nvec);
p = vec[0].end();
p = vec[0].data() + vec[0].size();
lastend = p;
count++;
}
@ -460,9 +503,10 @@ bool RE2::Extract(const StringPiece& text,
std::string* out) {
StringPiece vec[kVecSize];
int nvec = 1 + MaxSubmatch(rewrite);
if (nvec > 1 + re.NumberOfCapturingGroups())
return false;
if (nvec > static_cast<int>(arraysize(vec)))
return false;
if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
return false;
@ -610,6 +654,8 @@ bool RE2::Match(const StringPiece& text,
// If the regexp is anchored explicitly, must not be in middle of text.
if (prog_->anchor_start() && startpos != 0)
return false;
if (prog_->anchor_end() && endpos != text.size())
return false;
// If the regexp is anchored explicitly, update re_anchor
// so that we can potentially fall into a faster case below.
@ -643,7 +689,6 @@ bool RE2::Match(const StringPiece& text,
Prog::MatchKind kind = Prog::kFirstMatch;
if (options_.longest_match())
kind = Prog::kLongestMatch;
bool skipped_test = false;
bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
@ -655,38 +700,82 @@ bool RE2::Match(const StringPiece& text,
bool can_bit_state = prog_->CanBitState();
size_t bit_state_text_max = kMaxBitStateBitmapSize / prog_->list_count();
#ifdef RE2_HAVE_THREAD_LOCAL
hooks::context = this;
#endif
bool dfa_failed = false;
bool skipped_test = false;
switch (re_anchor) {
default:
LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor;
return false;
case UNANCHORED: {
if (prog_->anchor_end()) {
// This is a very special case: we don't need the forward DFA because
// we already know where the match must end! Instead, the reverse DFA
// can say whether there is a match and (optionally) where it starts.
Prog* prog = ReverseProg();
if (prog == NULL) {
// Fall back to NFA below.
skipped_test = true;
break;
}
if (!prog->SearchDFA(subtext, text, Prog::kAnchored,
Prog::kLongestMatch, matchp, &dfa_failed, NULL)) {
if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: "
<< "pattern length " << pattern_.size() << ", "
<< "program size " << prog->size() << ", "
<< "list count " << prog->list_count() << ", "
<< "bytemap range " << prog->bytemap_range();
// Fall back to NFA below.
skipped_test = true;
break;
}
return false;
}
if (matchp == NULL) // Matched. Don't care where.
return true;
break;
}
if (!prog_->SearchDFA(subtext, text, anchor, kind,
matchp, &dfa_failed, NULL)) {
if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
<< "bytemap range " << prog_->bytemap_range() << ", "
<< "list count " << prog_->list_count();
LOG(ERROR) << "DFA out of memory: "
<< "pattern length " << pattern_.size() << ", "
<< "program size " << prog_->size() << ", "
<< "list count " << prog_->list_count() << ", "
<< "bytemap range " << prog_->bytemap_range();
// Fall back to NFA below.
skipped_test = true;
break;
}
return false;
}
if (matchp == NULL) // Matched. Don't care where
if (matchp == NULL) // Matched. Don't care where.
return true;
// SearchDFA set match[0].end() but didn't know where the
// match started. Run the regexp backward from match[0].end()
// SearchDFA set match.end() but didn't know where the
// match started. Run the regexp backward from match.end()
// to find the longest possible match -- that's where it started.
Prog* prog = ReverseProg();
if (prog == NULL)
return false;
if (prog == NULL) {
// Fall back to NFA below.
skipped_test = true;
break;
}
if (!prog->SearchDFA(match, text, Prog::kAnchored,
Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: size " << prog->size() << ", "
<< "bytemap range " << prog->bytemap_range() << ", "
<< "list count " << prog->list_count();
LOG(ERROR) << "DFA out of memory: "
<< "pattern length " << pattern_.size() << ", "
<< "program size " << prog->size() << ", "
<< "list count " << prog->list_count() << ", "
<< "bytemap range " << prog->bytemap_range();
// Fall back to NFA below.
skipped_test = true;
break;
@ -724,9 +813,11 @@ bool RE2::Match(const StringPiece& text,
&match, &dfa_failed, NULL)) {
if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
<< "bytemap range " << prog_->bytemap_range() << ", "
<< "list count " << prog_->list_count();
LOG(ERROR) << "DFA out of memory: "
<< "pattern length " << pattern_.size() << ", "
<< "program size " << prog_->size() << ", "
<< "list count " << prog_->list_count() << ", "
<< "bytemap range " << prog_->bytemap_range();
// Fall back to NFA below.
skipped_test = true;
break;
@ -928,13 +1019,13 @@ bool RE2::Rewrite(std::string* out,
int n = (c - '0');
if (n >= veclen) {
if (options_.log_errors()) {
LOG(ERROR) << "requested group " << n
<< " in regexp " << rewrite.data();
LOG(ERROR) << "invalid substitution \\" << n
<< " from " << veclen << " groups";
}
return false;
}
StringPiece snip = vec[n];
if (snip.size() > 0)
if (!snip.empty())
out->append(snip.data(), snip.size());
} else if (c == '\\') {
out->push_back('\\');
@ -949,41 +1040,49 @@ bool RE2::Rewrite(std::string* out,
/***** Parsers for various types *****/
bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) {
namespace re2_internal {
template <>
bool Parse(const char* str, size_t n, void* dest) {
// We fail if somebody asked us to store into a non-NULL void* pointer
return (dest == NULL);
}
bool RE2::Arg::parse_string(const char* str, size_t n, void* dest) {
template <>
bool Parse(const char* str, size_t n, std::string* dest) {
if (dest == NULL) return true;
reinterpret_cast<std::string*>(dest)->assign(str, n);
dest->assign(str, n);
return true;
}
bool RE2::Arg::parse_stringpiece(const char* str, size_t n, void* dest) {
template <>
bool Parse(const char* str, size_t n, StringPiece* dest) {
if (dest == NULL) return true;
*(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n);
*dest = StringPiece(str, n);
return true;
}
bool RE2::Arg::parse_char(const char* str, size_t n, void* dest) {
template <>
bool Parse(const char* str, size_t n, char* dest) {
if (n != 1) return false;
if (dest == NULL) return true;
*(reinterpret_cast<char*>(dest)) = str[0];
*dest = str[0];
return true;
}
bool RE2::Arg::parse_schar(const char* str, size_t n, void* dest) {
template <>
bool Parse(const char* str, size_t n, signed char* dest) {
if (n != 1) return false;
if (dest == NULL) return true;
*(reinterpret_cast<signed char*>(dest)) = str[0];
*dest = str[0];
return true;
}
bool RE2::Arg::parse_uchar(const char* str, size_t n, void* dest) {
template <>
bool Parse(const char* str, size_t n, unsigned char* dest) {
if (n != 1) return false;
if (dest == NULL) return true;
*(reinterpret_cast<unsigned char*>(dest)) = str[0];
*dest = str[0];
return true;
}
@ -1047,10 +1146,40 @@ static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
return buf;
}
bool RE2::Arg::parse_long_radix(const char* str,
size_t n,
void* dest,
int radix) {
// Parses the n bytes starting at str as a float and, when dest is non-NULL,
// stores the parsed value in *dest. Returns false for empty input, when
// strtof() stops before consuming all n bytes, or when strtof() sets errno.
// A NULL dest only validates the text.
template <>
bool Parse(const char* str, size_t n, float* dest) {
  if (n == 0) return false;
  static const int kMaxLength = 200;
  char scratch[kMaxLength+1];
  // TerminateNumber() may return a different pointer and update n; presumably
  // it yields a NUL-terminated copy in scratch so strtof() can be used safely
  // (its body is not visible here -- confirm).
  str = TerminateNumber(scratch, sizeof scratch, str, &n, true);
  errno = 0;
  char* stop;
  const float value = strtof(str, &stop);
  // Reject leftover junk after the number as well as any conversion error.
  if (stop != str + n || errno != 0) return false;
  if (dest != NULL) *dest = value;
  return true;
}
// Parses the n bytes starting at str as a double and, when dest is non-NULL,
// stores the parsed value in *dest. Returns false for empty input, when
// strtod() stops before consuming all n bytes, or when strtod() sets errno.
// A NULL dest only validates the text.
template <>
bool Parse(const char* str, size_t n, double* dest) {
  if (n == 0) return false;
  static const int kMaxLength = 200;
  char scratch[kMaxLength+1];
  // TerminateNumber() may return a different pointer and update n; presumably
  // it yields a NUL-terminated copy in scratch so strtod() can be used safely
  // (its body is not visible here -- confirm).
  str = TerminateNumber(scratch, sizeof scratch, str, &n, true);
  errno = 0;
  char* stop;
  const double value = strtod(str, &stop);
  // Reject leftover junk after the number as well as any conversion error.
  if (stop != str + n || errno != 0) return false;
  if (dest != NULL) *dest = value;
  return true;
}
template <>
bool Parse(const char* str, size_t n, long* dest, int radix) {
if (n == 0) return false;
char buf[kMaxNumberLength+1];
str = TerminateNumber(buf, sizeof buf, str, &n, false);
@ -1060,14 +1189,12 @@ bool RE2::Arg::parse_long_radix(const char* str,
if (end != str + n) return false; // Leftover junk
if (errno) return false;
if (dest == NULL) return true;
*(reinterpret_cast<long*>(dest)) = r;
*dest = r;
return true;
}
bool RE2::Arg::parse_ulong_radix(const char* str,
size_t n,
void* dest,
int radix) {
template <>
bool Parse(const char* str, size_t n, unsigned long* dest, int radix) {
if (n == 0) return false;
char buf[kMaxNumberLength+1];
str = TerminateNumber(buf, sizeof buf, str, &n, false);
@ -1083,62 +1210,52 @@ bool RE2::Arg::parse_ulong_radix(const char* str,
if (end != str + n) return false; // Leftover junk
if (errno) return false;
if (dest == NULL) return true;
*(reinterpret_cast<unsigned long*>(dest)) = r;
*dest = r;
return true;
}
bool RE2::Arg::parse_short_radix(const char* str,
size_t n,
void* dest,
int radix) {
template <>
bool Parse(const char* str, size_t n, short* dest, int radix) {
long r;
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
if ((short)r != r) return false; // Out of range
if (!Parse(str, n, &r, radix)) return false; // Could not parse
if ((short)r != r) return false; // Out of range
if (dest == NULL) return true;
*(reinterpret_cast<short*>(dest)) = (short)r;
*dest = (short)r;
return true;
}
bool RE2::Arg::parse_ushort_radix(const char* str,
size_t n,
void* dest,
int radix) {
template <>
bool Parse(const char* str, size_t n, unsigned short* dest, int radix) {
unsigned long r;
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
if ((unsigned short)r != r) return false; // Out of range
if (!Parse(str, n, &r, radix)) return false; // Could not parse
if ((unsigned short)r != r) return false; // Out of range
if (dest == NULL) return true;
*(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
*dest = (unsigned short)r;
return true;
}
bool RE2::Arg::parse_int_radix(const char* str,
size_t n,
void* dest,
int radix) {
template <>
bool Parse(const char* str, size_t n, int* dest, int radix) {
long r;
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
if ((int)r != r) return false; // Out of range
if (!Parse(str, n, &r, radix)) return false; // Could not parse
if ((int)r != r) return false; // Out of range
if (dest == NULL) return true;
*(reinterpret_cast<int*>(dest)) = (int)r;
*dest = (int)r;
return true;
}
bool RE2::Arg::parse_uint_radix(const char* str,
size_t n,
void* dest,
int radix) {
template <>
bool Parse(const char* str, size_t n, unsigned int* dest, int radix) {
unsigned long r;
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
if ((unsigned int)r != r) return false; // Out of range
if (!Parse(str, n, &r, radix)) return false; // Could not parse
if ((unsigned int)r != r) return false; // Out of range
if (dest == NULL) return true;
*(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
*dest = (unsigned int)r;
return true;
}
bool RE2::Arg::parse_longlong_radix(const char* str,
size_t n,
void* dest,
int radix) {
template <>
bool Parse(const char* str, size_t n, long long* dest, int radix) {
if (n == 0) return false;
char buf[kMaxNumberLength+1];
str = TerminateNumber(buf, sizeof buf, str, &n, false);
@ -1148,14 +1265,12 @@ bool RE2::Arg::parse_longlong_radix(const char* str,
if (end != str + n) return false; // Leftover junk
if (errno) return false;
if (dest == NULL) return true;
*(reinterpret_cast<long long*>(dest)) = r;
*dest = r;
return true;
}
bool RE2::Arg::parse_ulonglong_radix(const char* str,
size_t n,
void* dest,
int radix) {
template <>
bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) {
if (n == 0) return false;
char buf[kMaxNumberLength+1];
str = TerminateNumber(buf, sizeof buf, str, &n, false);
@ -1170,67 +1285,47 @@ bool RE2::Arg::parse_ulonglong_radix(const char* str,
if (end != str + n) return false; // Leftover junk
if (errno) return false;
if (dest == NULL) return true;
*(reinterpret_cast<unsigned long long*>(dest)) = r;
*dest = r;
return true;
}
static bool parse_double_float(const char* str, size_t n, bool isfloat,
void* dest) {
if (n == 0) return false;
static const int kMaxLength = 200;
char buf[kMaxLength+1];
str = TerminateNumber(buf, sizeof buf, str, &n, true);
char* end;
errno = 0;
double r;
if (isfloat) {
r = strtof(str, &end);
} else {
r = strtod(str, &end);
}
if (end != str + n) return false; // Leftover junk
if (errno) return false;
if (dest == NULL) return true;
if (isfloat) {
*(reinterpret_cast<float*>(dest)) = (float)r;
} else {
*(reinterpret_cast<double*>(dest)) = r;
}
return true;
}
} // namespace re2_internal
bool RE2::Arg::parse_double(const char* str, size_t n, void* dest) {
return parse_double_float(str, n, false, dest);
}
namespace hooks {
bool RE2::Arg::parse_float(const char* str, size_t n, void* dest) {
return parse_double_float(str, n, true, dest);
}
#ifdef RE2_HAVE_THREAD_LOCAL
thread_local const RE2* context = NULL;
#endif
#define DEFINE_INTEGER_PARSER(name) \
bool RE2::Arg::parse_##name(const char* str, size_t n, void* dest) { \
return parse_##name##_radix(str, n, dest, 10); \
} \
bool RE2::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
return parse_##name##_radix(str, n, dest, 16); \
} \
bool RE2::Arg::parse_##name##_octal(const char* str, size_t n, void* dest) { \
return parse_##name##_radix(str, n, dest, 8); \
} \
bool RE2::Arg::parse_##name##_cradix(const char* str, size_t n, \
void* dest) { \
return parse_##name##_radix(str, n, dest, 0); \
}
template <typename T>
union Hook {
void Store(T* cb) { cb_.store(cb, std::memory_order_release); }
T* Load() const { return cb_.load(std::memory_order_acquire); }
DEFINE_INTEGER_PARSER(short);
DEFINE_INTEGER_PARSER(ushort);
DEFINE_INTEGER_PARSER(int);
DEFINE_INTEGER_PARSER(uint);
DEFINE_INTEGER_PARSER(long);
DEFINE_INTEGER_PARSER(ulong);
DEFINE_INTEGER_PARSER(longlong);
DEFINE_INTEGER_PARSER(ulonglong);
#if !defined(__clang__) && defined(_MSC_VER)
// Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent,
// this is a gross hack to make std::atomic<T*> constant-initialized on MSVC.
static_assert(ATOMIC_POINTER_LOCK_FREE == 2,
"std::atomic<T*> must be always lock-free");
T* cb_for_constinit_;
#endif
#undef DEFINE_INTEGER_PARSER
std::atomic<T*> cb_;
};
// No-op callback: accepts a const reference to any hook payload type T and
// ignores it.
template <typename T>
static void DoNothing(const T&) {}
// For a hook payload `type`, DEFINE_HOOK(type, name) defines:
//   - a file-local Hook<type##Callback> named name##_hook, initialized with
//     &DoNothing<type>;
//   - Set<type>Hook(cb), which passes cb to name##_hook.Store();
//   - Get<type>Hook(), which returns name##_hook.Load().
// NOTE: keep comments out of the macro body; the lines are joined by
// backslash continuations.
#define DEFINE_HOOK(type, name) \
static Hook<type##Callback> name##_hook = {{&DoNothing<type>}}; \
void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \
type##Callback* Get##type##Hook() { return name##_hook.Load(); }
// Instantiate the accessors for each hook payload type.
DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset)
DEFINE_HOOK(DFASearchFailure, dfa_search_failure)
#undef DEFINE_HOOK
} // namespace hooks
} // namespace re2

421
extern/re2/re2/re2.h vendored
View File

@ -30,6 +30,19 @@
// "(?i)hello" -- (?i) turns on case-insensitive matching
// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
//
// The double backslashes are needed when writing C++ string literals.
// However, they should NOT be used when writing C++11 raw string literals:
//
// R"(hello (\w+) world)" -- \w matches a "word" character
// R"(version (\d+))" -- \d matches a digit
// R"(hello\s+world)" -- \s matches any whitespace character
// R"(\b(\w+)\b)" -- \b matches non-empty string at word boundary
// R"((?i)hello)" -- (?i) turns on case-insensitive matching
// R"(/\*(.*?)\*/)" -- .*? matches . minimum no. of times possible
//
// When using UTF-8 encoding, case-insensitive matching will perform
// simple case folding, not full case folding.
//
// -----------------------------------------------------------------------
// MATCHING INTERFACE:
//
@ -195,6 +208,12 @@
#include <map>
#include <mutex>
#include <string>
#include <type_traits>
#include <vector>
#if defined(__APPLE__)
#include <TargetConditionals.h>
#endif
#include "re2/stringpiece.h"
@ -229,6 +248,7 @@ class RE2 {
ErrorBadCharRange, // bad character class range
ErrorMissingBracket, // missing closing ]
ErrorMissingParen, // missing closing )
ErrorUnexpectedParen, // unexpected closing )
ErrorTrailingBackslash, // trailing \ at end of regexp
ErrorRepeatArgument, // repeat argument missing, e.g. "*"
ErrorRepeatSize, // bad repetition argument
@ -287,11 +307,11 @@ class RE2 {
int ProgramSize() const;
int ReverseProgramSize() const;
// EXPERIMENTAL! SUBJECT TO CHANGE!
// Outputs the program fanout as a histogram bucketed by powers of 2.
// If histogram is not null, outputs the program fanout
// as a histogram bucketed by powers of 2.
// Returns the number of the largest non-empty bucket.
int ProgramFanout(std::map<int, int>* histogram) const;
int ReverseProgramFanout(std::map<int, int>* histogram) const;
int ProgramFanout(std::vector<int>* histogram) const;
int ReverseProgramFanout(std::vector<int>* histogram) const;
// Returns the underlying Regexp; not for general use.
// Returns entire_regexp_ so that callers don't need
@ -349,12 +369,12 @@ class RE2 {
// (void*)NULL (the corresponding matched sub-pattern is not copied)
//
// Returns true iff all of the following conditions are satisfied:
// a. "text" matches "re" exactly
// b. The number of matched sub-patterns is >= number of supplied pointers
// a. "text" matches "re" fully - from the beginning to the end of "text".
// b. The number of matched sub-patterns is >= number of supplied pointers.
// c. The "i"th argument has a suitable type for holding the
// string captured as the "i"th sub-pattern. If you pass in
// NULL for the "i"th argument, or pass fewer arguments than
// number of sub-patterns, "i"th captured sub-pattern is
// number of sub-patterns, the "i"th captured sub-pattern is
// ignored.
//
// CAVEAT: An optional sub-pattern that does not exist in the
@ -368,8 +388,17 @@ class RE2 {
return Apply(FullMatchN, text, re, Arg(std::forward<A>(a))...);
}
// Exactly like FullMatch(), except that "re" is allowed to match
// a substring of "text".
// Like FullMatch(), except that "re" is allowed to match a substring
// of "text".
//
// Returns true iff all of the following conditions are satisfied:
// a. "text" matches "re" partially - for some substring of "text".
// b. The number of matched sub-patterns is >= number of supplied pointers.
// c. The "i"th argument has a suitable type for holding the
// string captured as the "i"th sub-pattern. If you pass in
// NULL for the "i"th argument, or pass fewer arguments than
// number of sub-patterns, the "i"th captured sub-pattern is
// ignored.
template <typename... A>
static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) {
return Apply(PartialMatchN, text, re, Arg(std::forward<A>(a))...);
@ -378,7 +407,16 @@ class RE2 {
// Like FullMatch() and PartialMatch(), except that "re" has to match
// a prefix of the text, and "input" is advanced past the matched
// text. Note: "input" is modified iff this routine returns true
// and "re" matched a non-empty substring of "text".
// and "re" matched a non-empty substring of "input".
//
// Returns true iff all of the following conditions are satisfied:
// a. "input" matches "re" partially - for some prefix of "input".
// b. The number of matched sub-patterns is >= number of supplied pointers.
// c. The "i"th argument has a suitable type for holding the
// string captured as the "i"th sub-pattern. If you pass in
// NULL for the "i"th argument, or pass fewer arguments than
// number of sub-patterns, the "i"th captured sub-pattern is
// ignored.
template <typename... A>
static bool Consume(StringPiece* input, const RE2& re, A&&... a) {
return Apply(ConsumeN, input, re, Arg(std::forward<A>(a))...);
@ -388,6 +426,15 @@ class RE2 {
// the text. That is, "re" need not start its match at the beginning
// of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds
// the next word in "s" and stores it in "word".
//
// Returns true iff all of the following conditions are satisfied:
// a. "input" matches "re" partially - for some substring of "input".
// b. The number of matched sub-patterns is >= number of supplied pointers.
// c. The "i"th argument has a suitable type for holding the
// string captured as the "i"th sub-pattern. If you pass in
// NULL for the "i"th argument, or pass fewer arguments than
// number of sub-patterns, the "i"th captured sub-pattern is
// ignored.
template <typename... A>
static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) {
return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...);
@ -443,7 +490,7 @@ class RE2 {
// Escapes all potentially meaningful regexp characters in
// 'unquoted'. The returned string, used as a regular expression,
// will exactly match the original string. For example,
// will match exactly the original string. For example,
// 1.5-2.0?
// may become:
// 1\.5\-2\.0\?
@ -626,17 +673,6 @@ class RE2 {
Encoding encoding() const { return encoding_; }
void set_encoding(Encoding encoding) { encoding_ = encoding; }
// Legacy interface to encoding.
// TODO(rsc): Remove once clients have been converted.
bool utf8() const { return encoding_ == EncodingUTF8; }
void set_utf8(bool b) {
if (b) {
encoding_ = EncodingUTF8;
} else {
encoding_ = EncodingLatin1;
}
}
bool posix_syntax() const { return posix_syntax_; }
void set_posix_syntax(bool b) { posix_syntax_ = b; }
@ -699,32 +735,12 @@ class RE2 {
const Options& options() const { return options_; }
// Argument converters; see below.
static inline Arg CRadix(short* x);
static inline Arg CRadix(unsigned short* x);
static inline Arg CRadix(int* x);
static inline Arg CRadix(unsigned int* x);
static inline Arg CRadix(long* x);
static inline Arg CRadix(unsigned long* x);
static inline Arg CRadix(long long* x);
static inline Arg CRadix(unsigned long long* x);
static inline Arg Hex(short* x);
static inline Arg Hex(unsigned short* x);
static inline Arg Hex(int* x);
static inline Arg Hex(unsigned int* x);
static inline Arg Hex(long* x);
static inline Arg Hex(unsigned long* x);
static inline Arg Hex(long long* x);
static inline Arg Hex(unsigned long long* x);
static inline Arg Octal(short* x);
static inline Arg Octal(unsigned short* x);
static inline Arg Octal(int* x);
static inline Arg Octal(unsigned int* x);
static inline Arg Octal(long* x);
static inline Arg Octal(unsigned long* x);
static inline Arg Octal(long long* x);
static inline Arg Octal(unsigned long long* x);
template <typename T>
static Arg CRadix(T* ptr);
template <typename T>
static Arg Hex(T* ptr);
template <typename T>
static Arg Octal(T* ptr);
private:
void Init(const StringPiece& pattern, const Options& options);
@ -737,29 +753,26 @@ class RE2 {
re2::Prog* ReverseProg() const;
std::string pattern_; // string regular expression
Options options_; // option flags
std::string prefix_; // required prefix (before regexp_)
bool prefix_foldcase_; // prefix is ASCII case-insensitive
re2::Regexp* entire_regexp_; // parsed regular expression
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
re2::Prog* prog_; // compiled program for regexp
int num_captures_; // Number of capturing groups
bool is_one_pass_; // can use prog_->SearchOnePass?
mutable re2::Prog* rprog_; // reverse program for regexp
mutable const std::string* error_; // Error indicator
// (or points to empty string)
mutable ErrorCode error_code_; // Error code
mutable std::string error_arg_; // Fragment of regexp showing error
std::string pattern_; // string regular expression
Options options_; // option flags
re2::Regexp* entire_regexp_; // parsed regular expression
const std::string* error_; // error indicator (or points to empty string)
ErrorCode error_code_; // error code
std::string error_arg_; // fragment of regexp showing error
std::string prefix_; // required prefix (before suffix_regexp_)
bool prefix_foldcase_; // prefix_ is ASCII case-insensitive
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix_ removed
re2::Prog* prog_; // compiled program for regexp
int num_captures_; // number of capturing groups
bool is_one_pass_; // can use prog_->SearchOnePass?
// Reverse Prog for DFA execution only
mutable re2::Prog* rprog_;
// Map from capture names to indices
mutable const std::map<std::string, int>* named_groups_;
// Map from capture indices to names
mutable const std::map<int, std::string>* group_names_;
// Onces for lazy computations.
mutable std::once_flag rprog_once_;
mutable std::once_flag named_groups_once_;
mutable std::once_flag group_names_once_;
@ -770,137 +783,134 @@ class RE2 {
/***** Implementation details *****/
// Hex/Octal/Binary?
namespace re2_internal {
// Special class for parsing into objects that define a ParseFrom() method
template <class T>
class _RE2_MatchObject {
public:
static inline bool Parse(const char* str, size_t n, void* dest) {
if (dest == NULL) return true;
T* object = reinterpret_cast<T*>(dest);
return object->ParseFrom(str, n);
}
};
// Types for which the 3-ary Parse() function template has specializations.
template <typename T> struct Parse3ary : public std::false_type {};
template <> struct Parse3ary<void> : public std::true_type {};
template <> struct Parse3ary<std::string> : public std::true_type {};
template <> struct Parse3ary<StringPiece> : public std::true_type {};
template <> struct Parse3ary<char> : public std::true_type {};
template <> struct Parse3ary<signed char> : public std::true_type {};
template <> struct Parse3ary<unsigned char> : public std::true_type {};
template <> struct Parse3ary<float> : public std::true_type {};
template <> struct Parse3ary<double> : public std::true_type {};
template <typename T>
bool Parse(const char* str, size_t n, T* dest);
// Types for which the 4-ary Parse() function template has specializations.
template <typename T> struct Parse4ary : public std::false_type {};
template <> struct Parse4ary<long> : public std::true_type {};
template <> struct Parse4ary<unsigned long> : public std::true_type {};
template <> struct Parse4ary<short> : public std::true_type {};
template <> struct Parse4ary<unsigned short> : public std::true_type {};
template <> struct Parse4ary<int> : public std::true_type {};
template <> struct Parse4ary<unsigned int> : public std::true_type {};
template <> struct Parse4ary<long long> : public std::true_type {};
template <> struct Parse4ary<unsigned long long> : public std::true_type {};
template <typename T>
bool Parse(const char* str, size_t n, T* dest, int radix);
} // namespace re2_internal
class RE2::Arg {
public:
// Empty constructor so we can declare arrays of RE2::Arg
Arg();
private:
template <typename T>
using CanParse3ary = typename std::enable_if<
re2_internal::Parse3ary<T>::value,
int>::type;
// Constructor specially designed for NULL arguments
Arg(void*);
Arg(std::nullptr_t);
template <typename T>
using CanParse4ary = typename std::enable_if<
re2_internal::Parse4ary<T>::value,
int>::type;
#if !defined(_MSC_VER)
template <typename T>
using CanParseFrom = typename std::enable_if<
std::is_member_function_pointer<
decltype(static_cast<bool (T::*)(const char*, size_t)>(
&T::ParseFrom))>::value,
int>::type;
#endif
public:
Arg() : Arg(nullptr) {}
Arg(std::nullptr_t ptr) : arg_(ptr), parser_(DoNothing) {}
template <typename T, CanParse3ary<T> = 0>
Arg(T* ptr) : arg_(ptr), parser_(DoParse3ary<T>) {}
template <typename T, CanParse4ary<T> = 0>
Arg(T* ptr) : arg_(ptr), parser_(DoParse4ary<T>) {}
#if !defined(_MSC_VER)
template <typename T, CanParseFrom<T> = 0>
Arg(T* ptr) : arg_(ptr), parser_(DoParseFrom<T>) {}
#endif
typedef bool (*Parser)(const char* str, size_t n, void* dest);
// Type-specific parsers
#define MAKE_PARSER(type, name) \
Arg(type* p) : arg_(p), parser_(name) {} \
Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
template <typename T>
Arg(T* ptr, Parser parser) : arg_(ptr), parser_(parser) {}
MAKE_PARSER(char, parse_char)
MAKE_PARSER(signed char, parse_schar)
MAKE_PARSER(unsigned char, parse_uchar)
MAKE_PARSER(float, parse_float)
MAKE_PARSER(double, parse_double)
MAKE_PARSER(std::string, parse_string)
MAKE_PARSER(StringPiece, parse_stringpiece)
MAKE_PARSER(short, parse_short)
MAKE_PARSER(unsigned short, parse_ushort)
MAKE_PARSER(int, parse_int)
MAKE_PARSER(unsigned int, parse_uint)
MAKE_PARSER(long, parse_long)
MAKE_PARSER(unsigned long, parse_ulong)
MAKE_PARSER(long long, parse_longlong)
MAKE_PARSER(unsigned long long, parse_ulonglong)
#undef MAKE_PARSER
// Generic constructor templates
template <class T> Arg(T* p)
: arg_(p), parser_(_RE2_MatchObject<T>::Parse) { }
template <class T> Arg(T* p, Parser parser)
: arg_(p), parser_(parser) { }
// Parse the data
bool Parse(const char* str, size_t n) const;
private:
void* arg_;
Parser parser_;
static bool parse_null (const char* str, size_t n, void* dest);
static bool parse_char (const char* str, size_t n, void* dest);
static bool parse_schar (const char* str, size_t n, void* dest);
static bool parse_uchar (const char* str, size_t n, void* dest);
static bool parse_float (const char* str, size_t n, void* dest);
static bool parse_double (const char* str, size_t n, void* dest);
static bool parse_string (const char* str, size_t n, void* dest);
static bool parse_stringpiece (const char* str, size_t n, void* dest);
#define DECLARE_INTEGER_PARSER(name) \
private: \
static bool parse_##name(const char* str, size_t n, void* dest); \
static bool parse_##name##_radix(const char* str, size_t n, void* dest, \
int radix); \
\
public: \
static bool parse_##name##_hex(const char* str, size_t n, void* dest); \
static bool parse_##name##_octal(const char* str, size_t n, void* dest); \
static bool parse_##name##_cradix(const char* str, size_t n, void* dest);
DECLARE_INTEGER_PARSER(short)
DECLARE_INTEGER_PARSER(ushort)
DECLARE_INTEGER_PARSER(int)
DECLARE_INTEGER_PARSER(uint)
DECLARE_INTEGER_PARSER(long)
DECLARE_INTEGER_PARSER(ulong)
DECLARE_INTEGER_PARSER(longlong)
DECLARE_INTEGER_PARSER(ulonglong)
#undef DECLARE_INTEGER_PARSER
};
inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
inline RE2::Arg::Arg(std::nullptr_t p) : arg_(p), parser_(parse_null) { }
inline bool RE2::Arg::Parse(const char* str, size_t n) const {
return (*parser_)(str, n, arg_);
}
// This part of the parser, appropriate only for ints, deals with bases
#define MAKE_INTEGER_PARSER(type, name) \
inline RE2::Arg RE2::Hex(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex); \
} \
inline RE2::Arg RE2::Octal(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal); \
} \
inline RE2::Arg RE2::CRadix(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \
bool Parse(const char* str, size_t n) const {
return (*parser_)(str, n, arg_);
}
MAKE_INTEGER_PARSER(short, short)
MAKE_INTEGER_PARSER(unsigned short, ushort)
MAKE_INTEGER_PARSER(int, int)
MAKE_INTEGER_PARSER(unsigned int, uint)
MAKE_INTEGER_PARSER(long, long)
MAKE_INTEGER_PARSER(unsigned long, ulong)
MAKE_INTEGER_PARSER(long long, longlong)
MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
private:
// Parser that ignores all of its arguments and always reports success.
static bool DoNothing(const char* /*str*/, size_t /*n*/, void* /*dest*/) {
return true;
}
#undef MAKE_INTEGER_PARSER
// Type-erased adapter with the Parser signature: recovers the T* hidden
// behind dest and forwards to the 3-ary re2_internal::Parse().
template <typename T>
static bool DoParse3ary(const char* str, size_t n, void* dest) {
  T* const typed_dest = reinterpret_cast<T*>(dest);
  return re2_internal::Parse(str, n, typed_dest);
}
// Type-erased adapter with the Parser signature: recovers the T* hidden
// behind dest and forwards to the 4-ary re2_internal::Parse() with a fixed
// radix of 10.
template <typename T>
static bool DoParse4ary(const char* str, size_t n, void* dest) {
  T* const typed_dest = reinterpret_cast<T*>(dest);
  return re2_internal::Parse(str, n, typed_dest, 10);
}
#if !defined(_MSC_VER)
// Type-erased adapter with the Parser signature for types that provide
// ParseFrom(const char*, size_t). A NULL dest counts as success without
// calling ParseFrom(); otherwise the result of ParseFrom(str, n) is returned.
template <typename T>
static bool DoParseFrom(const char* str, size_t n, void* dest) {
  if (dest != NULL) {
    T* const object = reinterpret_cast<T*>(dest);
    return object->ParseFrom(str, n);
  }
  return true;
}
#endif
void* arg_;
Parser parser_;
};
// Builds an Arg for ptr whose parser calls the 4-ary re2_internal::Parse()
// with radix 0.
// NOTE(review): radix 0 presumably selects C-style prefix detection
// ("0x" hex, leading "0" octal, otherwise decimal) -- confirm against the
// integer Parse() specializations.
template <typename T>
inline RE2::Arg RE2::CRadix(T* ptr) {
  RE2::Arg::Parser parser = [](const char* str, size_t n, void* dest) -> bool {
    return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 0);
  };
  return RE2::Arg(ptr, parser);
}
// Builds an Arg for ptr whose parser calls the 4-ary re2_internal::Parse()
// with radix 16, i.e. the captured text is read as a hexadecimal integer.
template <typename T>
inline RE2::Arg RE2::Hex(T* ptr) {
  RE2::Arg::Parser parser = [](const char* str, size_t n, void* dest) -> bool {
    return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 16);
  };
  return RE2::Arg(ptr, parser);
}
// Builds an Arg for ptr whose parser calls the 4-ary re2_internal::Parse()
// with radix 8, i.e. the captured text is read as an octal integer.
template <typename T>
inline RE2::Arg RE2::Octal(T* ptr) {
  RE2::Arg::Parser parser = [](const char* str, size_t n, void* dest) -> bool {
    return re2_internal::Parse(str, n, reinterpret_cast<T*>(dest), 8);
  };
  return RE2::Arg(ptr, parser);
}
#ifndef SWIG
// Silence warnings about missing initializers for members of LazyRE2.
// Note that we test for Clang first because it defines __GNUC__ as well.
#if defined(__clang__)
#elif defined(__GNUC__) && __GNUC__ >= 6
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
#endif
@ -949,7 +959,52 @@ class LazyRE2 {
void operator=(const LazyRE2&); // disallowed
};
#endif // SWIG
#endif
// Declarations for user-installable callbacks ("hooks"), one per event type
// declared below, plus an optional thread-local pointer to the RE2 object
// on whose behalf a hook is being invoked.
namespace hooks {
// Most platforms support thread_local. Older versions of iOS don't support
// thread_local, but for the sake of brevity, we lump together all versions
// of Apple platforms that aren't macOS. If an iOS application really needs
// the context pointee someday, we can get more specific then...
// NOTE(review): TARGET_OS_OSX is not defined in this block; presumably it
// comes from <TargetConditionals.h> included earlier in the header - confirm.
#define RE2_HAVE_THREAD_LOCAL
#if defined(__APPLE__) && !TARGET_OS_OSX
#undef RE2_HAVE_THREAD_LOCAL
#endif
// A hook must not make any assumptions regarding the lifetime of the context
// pointee beyond the current invocation of the hook. Pointers and references
// obtained via the context pointee should be considered invalidated when the
// hook returns. Hence, any data about the context pointee (e.g. its pattern)
// would have to be copied in order for it to be kept for an indefinite time.
//
// A hook must not use RE2 for matching. Control flow reentering RE2::Match()
// could result in infinite mutual recursion. To discourage that possibility,
// RE2 will not maintain the context pointer correctly when used in that way.
// Only declared when RE2_HAVE_THREAD_LOCAL is defined; this is a declaration
// (extern), so the definition lives in another translation unit.
#ifdef RE2_HAVE_THREAD_LOCAL
extern thread_local const RE2* context;
#endif
// Argument type of the DFAStateCacheReset hook.
// NOTE(review): the field meanings are not visible here; presumably the
// DFA's state budget and state cache size at the time of the reset -
// confirm against the code that invokes the hook.
struct DFAStateCacheReset {
int64_t state_budget;
size_t state_cache_size;
};
// Argument type of the DFASearchFailure hook. Carries no data at present.
struct DFASearchFailure {
// Nothing yet...
};
// For an event type T, DECLARE_HOOK(T) declares:
//   TCallback    - the function type void(const T&);
//   SetTHook(cb) - takes a TCallback* (installs it, going by the name);
//   GetTHook()   - returns a TCallback*.
// The macro is a local convenience and is undefined right after its uses.
#define DECLARE_HOOK(type) \
using type##Callback = void(const type&); \
void Set##type##Hook(type##Callback* cb); \
type##Callback* Get##type##Hook();
DECLARE_HOOK(DFAStateCacheReset)
DECLARE_HOOK(DFASearchFailure)
#undef DECLARE_HOOK
} // namespace hooks
} // namespace re2

View File

@ -20,6 +20,7 @@
#include "util/logging.h"
#include "util/mutex.h"
#include "util/utf.h"
#include "re2/pod_array.h"
#include "re2/stringpiece.h"
#include "re2/walker-inl.h"
@ -243,16 +244,15 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
return new Regexp(kRegexpEmptyMatch, flags);
}
Regexp** subcopy = NULL;
PODArray<Regexp*> subcopy;
if (op == kRegexpAlternate && can_factor) {
// Going to edit sub; make a copy so we don't step on caller.
subcopy = new Regexp*[nsub];
memmove(subcopy, sub, nsub * sizeof sub[0]);
sub = subcopy;
subcopy = PODArray<Regexp*>(nsub);
memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
sub = subcopy.data();
nsub = FactorAlternation(sub, nsub, flags);
if (nsub == 1) {
Regexp* re = sub[0];
delete[] subcopy;
return re;
}
}
@ -269,7 +269,6 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
nsub - (nbigsub-1)*kMaxNsub, flags,
false);
delete[] subcopy;
return re;
}
@ -278,8 +277,6 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
Regexp** subs = re->sub();
for (int i = 0; i < nsub; i++)
subs[i] = sub[i];
delete[] subcopy;
return re;
}
@ -501,6 +498,7 @@ static const char *kErrorStrings[] = {
"invalid character class range",
"missing ]",
"missing )",
"unexpected )",
"trailing \\",
"no argument for repetition operator",
"invalid repetition size",
@ -544,9 +542,12 @@ class NumCapturesWalker : public Regexp::Walker<Ignored> {
ncapture_++;
return ignored;
}
// Should never be called: we use Walk(), not WalkExponential().
// If it is reached anyway, this logs at DFATAL severity (the log statement
// is compiled out when FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION is defined)
// and hands back the incoming value unchanged.
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
#endif
return ignored;
}
@ -575,7 +576,7 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
return m;
}
Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
if (re->op() == kRegexpCapture && re->name() != NULL) {
// Allocate map once we find a name.
if (map_ == NULL)
@ -591,8 +592,10 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
}
// Should never be called: we use Walk(), not WalkExponential().
// If it is reached anyway, this logs at DFATAL severity (the log statement
// is compiled out when FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION is defined)
// and hands back the incoming value unchanged.
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
#endif
return ignored;
}
@ -621,7 +624,7 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
return m;
}
Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
if (re->op() == kRegexpCapture && re->name() != NULL) {
// Allocate map once we find a name.
if (map_ == NULL)
@ -633,8 +636,10 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
}
// Should never be called: we use Walk(), not WalkExponential().
// If it is reached anyway, this logs at DFATAL severity (the log statement
// is compiled out when FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION is defined)
// and hands back the incoming value unchanged.
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
#endif
return ignored;
}
@ -651,78 +656,89 @@ std::map<int, std::string>* Regexp::CaptureNames() {
return w.TakeMap();
}
// Encodes runes[0..nrunes) into *bytes, replacing its previous contents.
// With latin1 set, each rune is narrowed to exactly one char; otherwise
// each rune is written out via runetochar().
void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
                         std::string* bytes) {
  if (!latin1) {
    // Size the buffer for the worst case (UTFmax bytes per rune), encode
    // in place, then trim to the number of bytes actually produced.
    bytes->resize(nrunes * UTFmax);
    char* const first = &(*bytes)[0];
    char* out = first;
    for (int k = 0; k < nrunes; k++)
      out += runetochar(out, &runes[k]);
    bytes->resize(out - first);
    bytes->shrink_to_fit();
    return;
  }

  // Latin-1: exactly one output byte per rune.
  bytes->resize(nrunes);
  for (int k = 0; k < nrunes; k++)
    (*bytes)[k] = static_cast<char>(runes[k]);
}
// Determines whether regexp matches must be anchored
// with a fixed string prefix. If so, returns the prefix and
// the regexp that remains after the prefix. The prefix might
// be ASCII case-insensitive.
bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
Regexp** suffix) {
prefix->clear();
*foldcase = false;
*suffix = NULL;
// No need for a walker: the regexp must be of the form
// 1. some number of ^ anchors
// 2. a literal char or string
// 3. the rest
prefix->clear();
*foldcase = false;
*suffix = NULL;
if (op_ != kRegexpConcat)
return false;
// Some number of anchors, then a literal or concatenation.
int i = 0;
Regexp** sub = this->sub();
while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
i++;
if (i == 0 || i >= nsub_)
return false;
Regexp* re = sub[i];
switch (re->op_) {
default:
return false;
case kRegexpLiteralString:
// Convert to string in proper encoding.
if (re->parse_flags() & Latin1) {
prefix->resize(re->nrunes_);
for (int j = 0; j < re->nrunes_; j++)
(*prefix)[j] = static_cast<char>(re->runes_[j]);
} else {
// Convert to UTF-8 in place.
// Assume worst-case space and then trim.
prefix->resize(re->nrunes_ * UTFmax);
char *p = &(*prefix)[0];
for (int j = 0; j < re->nrunes_; j++) {
Rune r = re->runes_[j];
if (r < Runeself)
*p++ = static_cast<char>(r);
else
p += runetochar(p, &r);
}
prefix->resize(p - &(*prefix)[0]);
}
break;
case kRegexpLiteral:
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
prefix->append(1, static_cast<char>(re->rune_));
} else {
char buf[UTFmax];
prefix->append(buf, runetochar(buf, &re->rune_));
}
break;
}
*foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
Regexp* re = sub()[i];
if (re->op_ != kRegexpLiteral &&
re->op_ != kRegexpLiteralString)
return false;
i++;
// The rest.
if (i < nsub_) {
for (int j = i; j < nsub_; j++)
sub[j]->Incref();
re = Concat(sub + i, nsub_ - i, parse_flags());
sub()[j]->Incref();
*suffix = Concat(sub() + i, nsub_ - i, parse_flags());
} else {
re = new Regexp(kRegexpEmptyMatch, parse_flags());
*suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
}
*suffix = re;
bool latin1 = (re->parse_flags() & Latin1) != 0;
Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
ConvertRunesToBytes(latin1, runes, nrunes, prefix);
*foldcase = (re->parse_flags() & FoldCase) != 0;
return true;
}
// Determines whether regexp matches must be unanchored
// with a fixed string prefix. If so, returns the prefix.
// The prefix might be ASCII case-insensitive.
bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
prefix->clear();
*foldcase = false;
// No need for a walker: the regexp must either begin with or be
// a literal char or string. We "see through" capturing groups,
// but make no effort to glue multiple prefix fragments together.
Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this;
while (re->op_ == kRegexpCapture) {
re = re->sub()[0];
if (re->op_ == kRegexpConcat && re->nsub_ > 0)
re = re->sub()[0];
}
if (re->op_ != kRegexpLiteral &&
re->op_ != kRegexpLiteralString)
return false;
bool latin1 = (re->parse_flags() & Latin1) != 0;
Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
ConvertRunesToBytes(latin1, runes, nrunes, prefix);
*foldcase = (re->parse_flags() & FoldCase) != 0;
return true;
}
@ -903,7 +919,7 @@ void CharClassBuilder::Negate() {
// The ranges are allocated in the same block as the header,
// necessitating a special allocator and Delete method.
CharClass* CharClass::New(int maxranges) {
CharClass* CharClass::New(size_t maxranges) {
CharClass* cc;
uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
cc = reinterpret_cast<CharClass*>(data);
@ -920,7 +936,7 @@ void CharClass::Delete() {
}
CharClass* CharClass::Negate() {
CharClass* cc = CharClass::New(nranges_+1);
CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1));
cc->folds_ascii_ = folds_ascii_;
cc->nrunes_ = Runemax + 1 - nrunes_;
int n = 0;
@ -957,7 +973,7 @@ bool CharClass::Contains(Rune r) {
}
CharClass* CharClassBuilder::GetCharClass() {
CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));
CharClass* cc = CharClass::New(ranges_.size());
int n = 0;
for (iterator it = begin(); it != end(); ++it)
cc->ranges_[n++] = *it;

View File

@ -86,6 +86,7 @@
// form accessible to clients, so that client code can analyze the
// parsed regular expressions.
#include <stddef.h>
#include <stdint.h>
#include <map>
#include <set>
@ -177,6 +178,7 @@ enum RegexpStatusCode {
kRegexpBadCharRange, // bad character class range
kRegexpMissingBracket, // missing closing ]
kRegexpMissingParen, // missing closing )
kRegexpUnexpectedParen, // unexpected closing )
kRegexpTrailingBackslash, // at end of regexp
kRegexpRepeatArgument, // repeat argument missing, e.g. "*"
kRegexpRepeatSize, // bad repetition argument
@ -258,7 +260,7 @@ class CharClass {
private:
CharClass(); // not implemented
~CharClass(); // not implemented
static CharClass* New(int maxranges);
static CharClass* New(size_t maxranges);
friend class CharClassBuilder;
@ -440,6 +442,13 @@ class Regexp {
bool RequiredPrefix(std::string* prefix, bool* foldcase,
Regexp** suffix);
// Whether every match of this regexp must be unanchored and
// begin with a non-empty fixed string (perhaps after ASCII
// case-folding). If so, returns the prefix.
// Callers should expect *prefix and *foldcase to be "zeroed"
// regardless of the return value.
bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase);
private:
// Constructor allocates vectors as appropriate for operator.
explicit Regexp(RegexpOp op, ParseFlags parse_flags);

48
extern/re2/re2/set.cc vendored
View File

@ -7,30 +7,49 @@
#include <stddef.h>
#include <algorithm>
#include <memory>
#include <utility>
#include "util/util.h"
#include "util/logging.h"
#include "util/pod_array.h"
#include "re2/stringpiece.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/stringpiece.h"
namespace re2 {
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) {
options_.Copy(options);
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor)
: options_(options),
anchor_(anchor),
compiled_(false),
size_(0) {
options_.set_never_capture(true); // might unblock some optimisations
anchor_ = anchor;
prog_ = NULL;
compiled_ = false;
size_ = 0;
}
RE2::Set::~Set() {
for (size_t i = 0; i < elem_.size(); i++)
elem_[i].second->Decref();
delete prog_;
}
// Move constructor. options_ and anchor_ are copied from other, while the
// element vector and the compiled program are moved out of it.
RE2::Set::Set(Set&& other)
: options_(other.options_),
anchor_(other.anchor_),
elem_(std::move(other.elem_)),
compiled_(other.compiled_),
size_(other.size_),
prog_(std::move(other.prog_)) {
// Put the moved-from set back into an explicit empty, uncompiled state
// (no elements, compiled_ == false, size_ == 0, no program) rather than
// relying on whatever the member moves happened to leave behind.
other.elem_.clear();
other.elem_.shrink_to_fit();
other.compiled_ = false;
other.size_ = 0;
other.prog_.reset();
}
// Move assignment. Destroys the current contents in place and then
// move-constructs from other into the same storage, so the moved-from
// set ends up in the state the move constructor leaves it in.
RE2::Set& RE2::Set::operator=(Set&& other) {
  // Self-move must be a no-op: otherwise the destructor call below would
  // tear down *this and the placement new would then "move" from the
  // already destroyed object, which is undefined behaviour.
  if (this == &other)
    return *this;
  this->~Set();
  (void) new (this) Set(std::move(other));
  return *this;
}
int RE2::Set::Add(const StringPiece& pattern, std::string* error) {
@ -97,9 +116,9 @@ bool RE2::Set::Compile() {
options_.ParseFlags());
re2::Regexp* re = re2::Regexp::Alternate(sub.data(), size_, pf);
prog_ = Prog::CompileSet(re, anchor_, options_.max_mem());
prog_.reset(Prog::CompileSet(re, anchor_, options_.max_mem()));
re->Decref();
return prog_ != NULL;
return prog_ != nullptr;
}
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const {
@ -124,9 +143,10 @@ bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v,
NULL, &dfa_failed, matches.get());
if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
<< "bytemap range " << prog_->bytemap_range() << ", "
<< "list count " << prog_->list_count();
LOG(ERROR) << "DFA out of memory: "
<< "program size " << prog_->size() << ", "
<< "list count " << prog_->list_count() << ", "
<< "bytemap range " << prog_->bytemap_range();
if (error_info != NULL)
error_info->kind = kOutOfMemory;
return false;

13
extern/re2/re2/set.h vendored
View File

@ -5,6 +5,7 @@
#ifndef RE2_SET_H_
#define RE2_SET_H_
#include <memory>
#include <string>
#include <utility>
#include <vector>
@ -36,6 +37,13 @@ class RE2::Set {
Set(const RE2::Options& options, RE2::Anchor anchor);
~Set();
// Not copyable.
Set(const Set&) = delete;
Set& operator=(const Set&) = delete;
// Movable.
Set(Set&& other);
Set& operator=(Set&& other);
// Adds pattern to the set using the options passed to the constructor.
// Returns the index that will identify the regexp in the output of Match(),
// or -1 if the regexp cannot be parsed.
@ -67,12 +75,9 @@ class RE2::Set {
RE2::Options options_;
RE2::Anchor anchor_;
std::vector<Elem> elem_;
re2::Prog* prog_;
bool compiled_;
int size_;
Set(const Set&) = delete;
Set& operator=(const Set&) = delete;
std::unique_ptr<re2::Prog> prog_;
};
} // namespace re2

View File

@ -10,8 +10,8 @@
#include "util/util.h"
#include "util/logging.h"
#include "util/pod_array.h"
#include "util/utf.h"
#include "re2/pod_array.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
@ -28,8 +28,6 @@ bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
Regexp* sre = re->Simplify();
re->Decref();
if (sre == NULL) {
// Should not happen, since Simplify never fails.
LOG(ERROR) << "Simplify failed on " << src;
if (status) {
status->set_code(kRegexpInternalError);
status->set_error_arg(src);
@ -180,10 +178,20 @@ Regexp* Regexp::Simplify() {
CoalesceWalker cw;
Regexp* cre = cw.Walk(this, NULL);
if (cre == NULL)
return cre;
return NULL;
if (cw.stopped_early()) {
cre->Decref();
return NULL;
}
SimplifyWalker sw;
Regexp* sre = sw.Walk(cre, NULL);
cre->Decref();
if (sre == NULL)
return NULL;
if (sw.stopped_early()) {
sre->Decref();
return NULL;
}
return sre;
}
@ -212,9 +220,10 @@ Regexp* CoalesceWalker::Copy(Regexp* re) {
}
// Should never be called: we use Walk(), not WalkExponential().
// If it is reached anyway, this logs at DFATAL severity (the log statement
// is compiled out when FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION is defined)
// and returns the result of re->Incref().
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
#endif
return re->Incref();
}
@ -437,9 +446,10 @@ Regexp* SimplifyWalker::Copy(Regexp* re) {
}
// Should never be called: we use Walk(), not WalkExponential().
// If it is reached anyway, this logs at DFATAL severity (the log statement
// is compiled out when FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION is defined)
// and returns the result of re->Incref().
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
#endif
return re->Incref();
}

View File

@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_SPARSE_ARRAY_H_
#define UTIL_SPARSE_ARRAY_H_
#ifndef RE2_SPARSE_ARRAY_H_
#define RE2_SPARSE_ARRAY_H_
// DESCRIPTION
//
@ -102,7 +102,7 @@
#include <memory>
#include <utility>
#include "util/pod_array.h"
#include "re2/pod_array.h"
namespace re2 {
@ -389,4 +389,4 @@ template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
} // namespace re2
#endif // UTIL_SPARSE_ARRAY_H_
#endif // RE2_SPARSE_ARRAY_H_

View File

@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_SPARSE_SET_H_
#define UTIL_SPARSE_SET_H_
#ifndef RE2_SPARSE_SET_H_
#define RE2_SPARSE_SET_H_
// DESCRIPTION
//
@ -61,7 +61,7 @@
#include <memory>
#include <utility>
#include "util/pod_array.h"
#include "re2/pod_array.h"
namespace re2 {
@ -261,4 +261,4 @@ typedef SparseSetT<void> SparseSet;
} // namespace re2
#endif // UTIL_SPARSE_SET_H_
#endif // RE2_SPARSE_SET_H_

View File

@ -29,6 +29,7 @@
#include "util/util.h"
#include "util/logging.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/regexp.h"
@ -53,7 +54,6 @@ namespace re2 {
class Backtracker {
public:
explicit Backtracker(Prog* prog);
~Backtracker();
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
@ -79,9 +79,11 @@ class Backtracker {
int nsubmatch_; // # of submatches to fill in
// Search state
const char* cap_[64]; // capture registers
uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked
size_t nvisited_; // # of words in bitmap
const char* cap_[64]; // capture registers
PODArray<uint32_t> visited_; // bitmap: (Inst*, char*) pairs visited
Backtracker(const Backtracker&) = delete;
Backtracker& operator=(const Backtracker&) = delete;
};
Backtracker::Backtracker(Prog* prog)
@ -90,13 +92,7 @@ Backtracker::Backtracker(Prog* prog)
longest_(false),
endmatch_(false),
submatch_(NULL),
nsubmatch_(0),
visited_(NULL),
nvisited_(0) {
}
Backtracker::~Backtracker() {
delete[] visited_;
nsubmatch_(0) {
}
// Runs a backtracking search.
@ -105,7 +101,7 @@ bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
StringPiece* submatch, int nsubmatch) {
text_ = text;
context_ = context;
if (context_.begin() == NULL)
if (context_.data() == NULL)
context_ = text;
if (prog_->anchor_start() && text.begin() > context_.begin())
return false;
@ -130,24 +126,28 @@ bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
// Allocate new visited_ bitmap -- size is proportional
// to text, so have to reallocate on each call to Search.
delete[] visited_;
nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
visited_ = new uint32_t[nvisited_];
memset(visited_, 0, nvisited_*sizeof visited_[0]);
int nvisited = prog_->size() * static_cast<int>(text.size()+1);
nvisited = (nvisited + 31) / 32;
visited_ = PODArray<uint32_t>(nvisited);
memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
// Anchored search must start at text.begin().
if (anchored_) {
cap_[0] = text.begin();
return Visit(prog_->start(), text.begin());
cap_[0] = text.data();
return Visit(prog_->start(), text.data());
}
// Unanchored search, starting from each possible text position.
// Notice that we have to try the empty string at the end of
// the text, so the loop condition is p <= text.end(), not p < text.end().
for (const char* p = text.begin(); p <= text.end(); p++) {
for (const char* p = text.data(); p <= text.data() + text.size(); p++) {
cap_[0] = p;
if (Visit(prog_->start(), p)) // Match must be leftmost; done.
return true;
// Avoid invoking undefined behavior (arithmetic on a null pointer)
// by simply not continuing the loop.
if (p == NULL)
break;
}
return false;
}
@ -158,9 +158,10 @@ bool Backtracker::Visit(int id, const char* p) {
// Check bitmap. If we've already explored from here,
// either it didn't match or it did but we're hoping for a better match.
// Either way, don't go down that road again.
CHECK(p <= text_.end());
size_t n = id*(text_.size()+1) + (p - text_.begin());
CHECK_LT(n/32, nvisited_);
CHECK(p <= text_.data() + text_.size());
int n = id * static_cast<int>(text_.size()+1) +
static_cast<int>(p-text_.data());
CHECK_LT(n/32, visited_.size());
if (visited_[n/32] & (1 << (n&31)))
return false;
visited_[n/32] |= 1 << (n&31);
@ -182,7 +183,7 @@ bool Backtracker::Try(int id, const char* p) {
// Pick out byte at current position. If at end of string,
// have to explore in hope of finishing a match. Use impossible byte -1.
int c = -1;
if (p < text_.end())
if (p < text_.data() + text_.size())
c = *p & 0xFF;
Prog::Inst* ip = prog_->inst(id);
@ -224,11 +225,12 @@ bool Backtracker::Try(int id, const char* p) {
case kInstMatch:
// We found a match. If it's the best so far, record the
// parameters in the caller's submatch_ array.
if (endmatch_ && p != context_.end())
if (endmatch_ && p != context_.data() + context_.size())
return false;
cap_[1] = p;
if (submatch_[0].data() == NULL || // First match so far ...
(longest_ && p > submatch_[0].end())) { // ... or better match
if (submatch_[0].data() == NULL ||
(longest_ && p > submatch_[0].data() + submatch_[0].size())) {
// First match so far - or better match.
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] = StringPiece(
cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));

View File

@ -85,7 +85,7 @@ static CCTest tests[] = {
{ {-1} } },
};
template<class CharClass>
template <typename CharClass>
static void Broke(const char *desc, const CCTest* t, CharClass* cc) {
if (t == NULL) {
printf("\t%s:", desc);
@ -136,7 +136,7 @@ void Delete(CharClassBuilder* cc) {
delete cc;
}
template<class CharClass>
template <typename CharClass>
bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
typename CharClass::iterator it = cc->begin();
int size = 0;

View File

@ -147,10 +147,19 @@ static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags,
Regexp* re = Regexp::Parse(pattern, flags, NULL);
EXPECT_TRUE(re != NULL);
Prog* prog = re->CompileToProg(0);
EXPECT_TRUE(prog != NULL);
*bytemap = prog->DumpByteMap();
delete prog;
{
Prog* prog = re->CompileToProg(0);
EXPECT_TRUE(prog != NULL);
*bytemap = prog->DumpByteMap();
delete prog;
}
{
Prog* prog = re->CompileToReverseProg(0);
EXPECT_TRUE(prog != NULL);
EXPECT_EQ(*bytemap, prog->DumpByteMap());
delete prog;
}
re->Decref();
}
@ -213,16 +222,11 @@ TEST(TestCompile, UTF8Ranges) {
EXPECT_EQ("[00-09] -> 0\n"
"[0a-0a] -> 1\n"
"[0b-7f] -> 0\n"
"[80-8f] -> 2\n"
"[90-9f] -> 3\n"
"[a0-bf] -> 4\n"
"[80-bf] -> 2\n"
"[c0-c1] -> 1\n"
"[c2-df] -> 5\n"
"[e0-e0] -> 6\n"
"[e1-ef] -> 7\n"
"[f0-f0] -> 8\n"
"[f1-f3] -> 9\n"
"[f4-f4] -> 10\n"
"[c2-df] -> 3\n"
"[e0-ef] -> 4\n"
"[f0-f4] -> 5\n"
"[f5-ff] -> 1\n",
bytemap);
}
@ -232,7 +236,7 @@ TEST(TestCompile, InsufficientMemory) {
"^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
Regexp::LikePerl, NULL);
EXPECT_TRUE(re != NULL);
Prog* prog = re->CompileToProg(920);
Prog* prog = re->CompileToProg(850);
// If the memory budget has been exhausted, compilation should fail
// and return NULL instead of trying to do anything with NoMatch().
EXPECT_TRUE(prog == NULL);
@ -299,20 +303,22 @@ TEST(TestCompile, Bug26705922) {
"8. byte [f0-f0] 0 -> 7\n",
reverse);
Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse);
EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
"4+ byte [c2-df] 0 -> 7\n"
"5+ byte [a0-bf] 1 -> 8\n"
"6. byte [80-bf] 0 -> 9\n"
Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse);
EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n"
"4+ byte [e0-ef] 0 -> 8\n"
"5. byte [f0-f4] 0 -> 9\n"
"6. byte [80-bf] 0 -> 7\n"
"7. match! 0\n"
"8. byte [e0-e0] 0 -> 7\n"
"9+ byte [e1-ef] 0 -> 7\n"
"10+ byte [90-bf] 1 -> 13\n"
"11+ byte [80-bf] 1 -> 14\n"
"12. byte [80-8f] 0 -> 15\n"
"13. byte [f0-f0] 0 -> 7\n"
"14. byte [f1-f3] 0 -> 7\n"
"15. byte [f4-f4] 0 -> 7\n",
"8. byte [80-bf] 0 -> 6\n"
"9. byte [80-bf] 0 -> 8\n",
forward);
EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
"4+ byte [c2-df] 0 -> 6\n"
"5. byte [80-bf] 0 -> 7\n"
"6. match! 0\n"
"7+ byte [e0-ef] 0 -> 6\n"
"8. byte [80-bf] 0 -> 9\n"
"9. byte [f0-f4] 0 -> 6\n",
reverse);
}

View File

@ -8,7 +8,9 @@
#include <vector>
#include "util/test.h"
#include "util/flags.h"
#include "util/logging.h"
#include "util/malloc_counter.h"
#include "util/strutil.h"
#include "re2/prog.h"
#include "re2/re2.h"
@ -18,12 +20,26 @@
static const bool UsingMallocCounter = false;
DEFINE_int32(size, 8, "log2(number of DFA nodes)");
DEFINE_int32(repeat, 2, "Repetition count.");
DEFINE_int32(threads, 4, "number of threads");
DEFINE_FLAG(int, size, 8, "log2(number of DFA nodes)");
DEFINE_FLAG(int, repeat, 2, "Repetition count.");
DEFINE_FLAG(int, threads, 4, "number of threads");
namespace re2 {
static int state_cache_resets = 0;
static int search_failures = 0;
struct SetHooks {
SetHooks() {
hooks::SetDFAStateCacheResetHook([](const hooks::DFAStateCacheReset&) {
++state_cache_resets;
});
hooks::SetDFASearchFailureHook([](const hooks::DFASearchFailure&) {
++search_failures;
});
}
} set_hooks;
// Check that multithreaded access to DFA class works.
// Helper function: builds entire DFA for prog.
@ -34,7 +50,7 @@ static void DoBuild(Prog* prog) {
TEST(Multithreaded, BuildEntireDFA) {
// Create regexp with 2^FLAGS_size states in DFA.
std::string s = "a";
for (int i = 0; i < FLAGS_size; i++)
for (int i = 0; i < GetFlag(FLAGS_size); i++)
s += "[ab]";
s += "b";
Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL);
@ -52,14 +68,14 @@ TEST(Multithreaded, BuildEntireDFA) {
}
// Build the DFA simultaneously in a bunch of threads.
for (int i = 0; i < FLAGS_repeat; i++) {
for (int i = 0; i < GetFlag(FLAGS_repeat); i++) {
Prog* prog = re->CompileToProg(0);
ASSERT_TRUE(prog != NULL);
std::vector<std::thread> threads;
for (int j = 0; j < FLAGS_threads; j++)
for (int j = 0; j < GetFlag(FLAGS_threads); j++)
threads.emplace_back(DoBuild, prog);
for (int j = 0; j < FLAGS_threads; j++)
for (int j = 0; j < GetFlag(FLAGS_threads); j++)
threads[j].join();
// One more compile, to make sure everything is okay.
@ -106,44 +122,6 @@ TEST(SingleThreaded, BuildEntireDFA) {
re->Decref();
}
// Returns a string over the binary alphabet {0,1} in which every possible
// n-bit sequence occurs as a substring. Brute force would need n * 2^n
// characters; this builds a De Bruijn cycle instead, which needs only
// n + 2^n - 1. See Knuth, The Art of Computer Programming, Vol 2,
// Exercise 3.2.2 #17.
//
// Such a string is the worst case for DFA execution: for a DFA in which
// distinct last-n-bytes imply distinct states, every input position forces
// a brand new state, and no state is reused until the end of the string.
static std::string DeBruijnString(int n) {
  CHECK_LT(n, static_cast<int>(8*sizeof(int)));
  CHECK_GT(n, 0);

  const int nwindows = 1 << n;   // number of distinct n-bit windows
  const int mask = nwindows - 1;

  // seen[w] records whether the n-bit window w has already been emitted.
  std::vector<bool> seen(size_t{1} << n, false);

  // Prime the output with n-1 zeros so the first appended bit completes
  // the first full window.
  std::string out(static_cast<size_t>(n - 1), '0');

  int window = 0;
  for (int step = 0; step < nwindows; step++) {
    window = (window << 1) & mask;
    // Prefer appending a 1 whenever that yields a window not seen so far.
    const bool append_one = !seen[window | 1];
    if (append_one)
      window |= 1;
    out.append(append_one ? "1" : "0");
    CHECK(!seen[window]);
    seen[window] = true;
  }
  return out;
}
// Test that the DFA gets the right result even if it runs
// out of memory during a search. The regular expression
// 0[01]{n}$ matches a binary string of 0s and 1s only if
@ -166,6 +144,8 @@ TEST(SingleThreaded, SearchDFA) {
// if it can't get a good cache hit rate.)
// Tell the DFA to trudge along instead.
Prog::TEST_dfa_should_bail_when_slow(false);
state_cache_resets = 0;
search_failures = 0;
// Choice of n is mostly arbitrary, except that:
// * making n too big makes the test run for too long.
@ -215,6 +195,8 @@ TEST(SingleThreaded, SearchDFA) {
// Reset to original behaviour.
Prog::TEST_dfa_should_bail_when_slow(true);
ASSERT_GT(state_cache_resets, 0);
ASSERT_EQ(search_failures, 0);
}
// Helper function: searches for match, which should match,
@ -237,6 +219,8 @@ static void DoSearch(Prog* prog, const StringPiece& match,
TEST(Multithreaded, SearchDFA) {
Prog::TEST_dfa_should_bail_when_slow(false);
state_cache_resets = 0;
search_failures = 0;
// Same as single-threaded test above.
const int n = 18;
@ -259,14 +243,14 @@ TEST(Multithreaded, SearchDFA) {
// Run the search simultaneously in a bunch of threads.
// Reuse same flags for Multithreaded.BuildDFA above.
for (int i = 0; i < FLAGS_repeat; i++) {
for (int i = 0; i < GetFlag(FLAGS_repeat); i++) {
Prog* prog = re->CompileToProg(1<<n);
ASSERT_TRUE(prog != NULL);
std::vector<std::thread> threads;
for (int j = 0; j < FLAGS_threads; j++)
for (int j = 0; j < GetFlag(FLAGS_threads); j++)
threads.emplace_back(DoSearch, prog, match, no_match);
for (int j = 0; j < FLAGS_threads; j++)
for (int j = 0; j < GetFlag(FLAGS_threads); j++)
threads[j].join();
delete prog;
@ -276,6 +260,8 @@ TEST(Multithreaded, SearchDFA) {
// Reset to original behaviour.
Prog::TEST_dfa_should_bail_when_slow(true);
ASSERT_GT(state_cache_resets, 0);
ASSERT_EQ(search_failures, 0);
}
struct ReverseTest {

View File

@ -25,9 +25,6 @@
#include "re2/stringpiece.h"
#include "re2/regexp.h"
// Cause a link error if this file is used outside of testing.
DECLARE_string(test_tmpdir);
namespace re2 {
static const char* kOpcodeNames[] = {
@ -154,14 +151,11 @@ static void DumpRegexpAppending(Regexp* re, std::string* s) {
}
std::string Regexp::Dump() {
// Make sure that we are being called from a unit test.
// Should cause a link error if used outside of testing.
CHECK(!::testing::TempDir().empty());
std::string s;
// Make sure being called from a unit test.
if (FLAGS_test_tmpdir.empty()) {
LOG(ERROR) << "Cannot use except for testing.";
return s;
}
DumpRegexpAppending(this, &s);
return s;
}

View File

@ -10,8 +10,6 @@
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
DECLARE_string(regexp_engines);
namespace re2 {
// Test simple repetition operators
@ -34,11 +32,8 @@ TEST(Repetition, Capturing) {
"%s* %s+ %s? %s*? %s+? %s??");
ExhaustiveTest(3, 2, Split(" ", "a (a) b"), ops,
7, Explode("ab"), "(?:%s)", "");
// This would be a great test, but it runs forever when PCRE is enabled.
if (FLAGS_regexp_engines.find("PCRE") == std::string::npos)
ExhaustiveTest(3, 2, Split(" ", "a (a)"), ops,
50, Explode("a"), "(?:%s)", "");
ExhaustiveTest(3, 2, Split(" ", "a (a)"), ops,
50, Explode("a"), "(?:%s)", "");
}
} // namespace re2

View File

@ -10,7 +10,6 @@
#include <vector>
#include "util/test.h"
#include "re2/re2.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {

View File

@ -14,6 +14,7 @@
#include <stdio.h>
#include "util/test.h"
#include "util/flags.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/testing/exhaustive_tester.h"
@ -24,11 +25,11 @@
#define LOGGING 0
#endif
DEFINE_bool(show_regexps, false, "show regexps during testing");
DEFINE_FLAG(bool, show_regexps, false, "show regexps during testing");
DEFINE_int32(max_bad_regexp_inputs, 1,
"Stop testing a regular expression after finding this many "
"strings that break it.");
DEFINE_FLAG(int, max_bad_regexp_inputs, 1,
"Stop testing a regular expression after finding this many "
"strings that break it.");
namespace re2 {
@ -62,11 +63,12 @@ static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anc
for (int i = 0; i < n; i++) {
if (i > 0)
printf(" ");
if (m[i].begin() == NULL)
if (m[i].data() == NULL)
printf("-");
else
printf("%td-%td",
m[i].begin() - input.begin(), m[i].end() - input.begin());
m[i].begin() - input.begin(),
m[i].end() - input.begin());
}
}
@ -76,10 +78,11 @@ static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anc
void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) {
regexps_++;
std::string regexp = const_regexp;
if (!topwrapper_.empty())
if (!topwrapper_.empty()) {
regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str());
}
if (FLAGS_show_regexps) {
if (GetFlag(FLAGS_show_regexps)) {
printf("\r%s", regexp.c_str());
fflush(stdout);
}
@ -134,7 +137,7 @@ void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) {
tests_++;
if (!tester.TestInput(strgen_.Next())) {
failures_++;
if (++bad_inputs >= FLAGS_max_bad_regexp_inputs)
if (++bad_inputs >= GetFlag(FLAGS_max_bad_regexp_inputs))
break;
}
}

View File

@ -7,6 +7,7 @@
#include <memory>
#include <string>
#include <vector>
#include <utility>
#include "util/test.h"
#include "util/logging.h"
@ -291,4 +292,49 @@ TEST(FilteredRE2Test, EmptyStringInStringSetBug) {
"EmptyStringInStringSetBug", &v));
}
// Exercises move assignment of the FilteredRE2 member `f` of FilterTestVars.
// Three things are checked, in order:
//   1. the moved-to filter gives the same answers as the original did;
//   2. the moved-from filter can be populated, compiled and queried again;
//   3. move-assigning onto an already-populated filter replaces its contents.
TEST(FilteredRE2Test, MoveSemantics) {
FilterTestVars v1;
int id;
// Baseline: one regexp, which the test expects to compile to the single
// atom "foo".
v1.f.Add("foo\\d+", v1.opts, &id);
EXPECT_EQ(0, id);
v1.f.Compile(&v1.atoms);
EXPECT_EQ(1, v1.atoms.size());
EXPECT_EQ("foo", v1.atoms[0]);
v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches);
EXPECT_EQ(1, v1.matches.size());
EXPECT_EQ(0, v1.matches[0]);
v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches);
EXPECT_EQ(0, v1.matches.size());
// The moved-to object should do what the moved-from object did.
FilterTestVars v2;
v2.f = std::move(v1.f);
v2.f.AllMatches("abc foo1 xyz", {0}, &v2.matches);
EXPECT_EQ(1, v2.matches.size());
EXPECT_EQ(0, v2.matches[0]);
v2.f.AllMatches("abc bar2 xyz", {0}, &v2.matches);
EXPECT_EQ(0, v2.matches.size());
// The moved-from object should have been reset and be reusable.
// (The new regexp is expected to get id 0 again, i.e. ids restart.)
v1.f.Add("bar\\d+", v1.opts, &id);
EXPECT_EQ(0, id);
v1.f.Compile(&v1.atoms);
EXPECT_EQ(1, v1.atoms.size());
EXPECT_EQ("bar", v1.atoms[0]);
v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches);
EXPECT_EQ(0, v1.matches.size());
v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches);
EXPECT_EQ(1, v1.matches.size());
EXPECT_EQ(0, v1.matches[0]);
// Verify that "overwriting" works and also doesn't leak memory.
// (The latter will need a leak detector such as LeakSanitizer.)
// After this assignment v1.f must behave like the "foo" filter again.
v1.f = std::move(v2.f);
v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches);
EXPECT_EQ(1, v1.matches.size());
EXPECT_EQ(0, v1.matches[0]);
v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches);
EXPECT_EQ(0, v1.matches.size());
}
} // namespace re2

View File

@ -13,13 +13,16 @@ namespace re2 {
class NullWalker : public Regexp::Walker<bool> {
public:
NullWalker() { }
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args);
NullWalker() {}
bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk not WalkExponential.
virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args);
virtual bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk(), not WalkExponential().
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
LOG(DFATAL) << "NullWalker::ShortVisit called";
#endif
return a;
}

View File

@ -9,12 +9,13 @@
#include <vector>
#include "util/test.h"
#include "util/flags.h"
#include "re2/testing/exhaustive_tester.h"
DEFINE_int32(regexpseed, 404, "Random regexp seed.");
DEFINE_int32(regexpcount, 100, "How many random regexps to generate.");
DEFINE_int32(stringseed, 200, "Random string seed.");
DEFINE_int32(stringcount, 100, "How many random strings to generate.");
DEFINE_FLAG(int, regexpseed, 404, "Random regexp seed.");
DEFINE_FLAG(int, regexpcount, 100, "How many random regexps to generate.");
DEFINE_FLAG(int, stringseed, 200, "Random string seed.");
DEFINE_FLAG(int, stringcount, 100, "How many random strings to generate.");
namespace re2 {
@ -37,8 +38,10 @@ static void RandomTest(int maxatoms, int maxops,
ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
maxstrlen, stralphabet, wrapper, "");
t.RandomStrings(FLAGS_stringseed, FLAGS_stringcount);
t.GenerateRandom(FLAGS_regexpseed, FLAGS_regexpcount);
t.RandomStrings(GetFlag(FLAGS_stringseed),
GetFlag(FLAGS_stringcount));
t.GenerateRandom(GetFlag(FLAGS_regexpseed),
GetFlag(FLAGS_regexpcount));
printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
EXPECT_EQ(0, t.failures());
@ -96,4 +99,3 @@ TEST(Random, Complicated) {
}
} // namespace re2

View File

@ -11,6 +11,7 @@
#include <string.h>
#include "util/test.h"
#include "util/logging.h"
#include "re2/re2.h"
namespace re2 {
@ -132,4 +133,28 @@ TEST(RE2ArgTest, Uint64Test) {
PARSE_FOR_TYPE(uint64_t, 5);
}
// Checks that RE2::Arg can wrap a pointer to an object of a user-defined
// (here: unnamed local) type that provides ParseFrom(const char*, size_t),
// and that Arg::Parse() reports the value that ParseFrom() returned.
TEST(RE2ArgTest, ParseFromTest) {
// NOTE(review): the whole body is compiled out under MSVC; the reason is
// not visible in this file -- confirm before re-enabling.
#if !defined(_MSC_VER)
// Case 1: ParseFrom() succeeds, so Parse() is expected to return true.
struct {
bool ParseFrom(const char* str, size_t n) {
LOG(INFO) << "str = " << str << ", n = " << n;
return true;
}
} obj1;
RE2::Arg arg1(&obj1);
EXPECT_TRUE(arg1.Parse("one", 3));
// Case 2: ParseFrom() fails, so Parse() is expected to return false.
struct {
bool ParseFrom(const char* str, size_t n) {
LOG(INFO) << "str = " << str << ", n = " << n;
return false;
}
// Ensure that RE2::Arg works even with overloaded ParseFrom().
void ParseFrom(const char* str) {}
} obj2;
RE2::Arg arg2(&obj2);
EXPECT_FALSE(arg2.Parse("two", 3));
#endif
}
} // namespace re2

View File

@ -12,6 +12,7 @@
#include <map>
#include <string>
#include <utility>
#include <vector>
#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
#include <sys/mman.h>
#include <unistd.h> /* for sysconf */
@ -223,6 +224,15 @@ TEST(RE2, Extract) {
ASSERT_EQ(s, "'foo'");
}
// The rewrite string "\\1\\2" refers to group 2, but the pattern "f(o+)"
// has only one capturing group. Extract(), Replace() and GlobalReplace()
// are each expected to report failure for such a rewrite.
TEST(RE2, MaxSubmatchTooLarge) {
std::string s;
ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s));
// Replace() and GlobalReplace() take the subject string by pointer, so it
// is reset to a known value before each call.
s = "foo";
ASSERT_FALSE(RE2::Replace(&s, "f(o+)", "\\1\\2"));
s = "foo";
ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2"));
}
TEST(RE2, Consume) {
RE2 r("\\s*(\\w+)"); // matches a word, possibly proceeded by whitespace
std::string word;
@ -473,28 +483,27 @@ TEST(ProgramFanout, BigProgram) {
RE2 re100("(?:(?:(?:(?:(?:.)?){100})*)+)");
RE2 re1000("(?:(?:(?:(?:(?:.)?){1000})*)+)");
std::map<int, int> histogram;
std::vector<int> histogram;
// 3 is the largest non-empty bucket and has 1 element.
ASSERT_EQ(3, re1.ProgramFanout(&histogram));
ASSERT_EQ(1, histogram[3]);
// 7 is the largest non-empty bucket and has 10 elements.
ASSERT_EQ(7, re10.ProgramFanout(&histogram));
ASSERT_EQ(10, histogram[7]);
// 6 is the largest non-empty bucket and has 10 elements.
ASSERT_EQ(6, re10.ProgramFanout(&histogram));
ASSERT_EQ(10, histogram[6]);
// 10 is the largest non-empty bucket and has 100 elements.
ASSERT_EQ(10, re100.ProgramFanout(&histogram));
ASSERT_EQ(100, histogram[10]);
// 9 is the largest non-empty bucket and has 100 elements.
ASSERT_EQ(9, re100.ProgramFanout(&histogram));
ASSERT_EQ(100, histogram[9]);
// 13 is the largest non-empty bucket and has 1000 elements.
ASSERT_EQ(13, re1000.ProgramFanout(&histogram));
ASSERT_EQ(1000, histogram[13]);
// 2 is the largest non-empty bucket and has 3 elements.
// This differs from the others due to how reverse `.' works.
// 2 is the largest non-empty bucket and has 1 element.
ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram));
ASSERT_EQ(3, histogram[2]);
ASSERT_EQ(1, histogram[2]);
// 5 is the largest non-empty bucket and has 10 elements.
ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram));
@ -1232,11 +1241,10 @@ TEST(RE2, DeepRecursion) {
// Suggested by Josh Hyman. Failed when SearchOnePass was
// not implementing case-folding.
TEST(CaseInsensitive, MatchAndConsume) {
std::string result;
std::string text = "A fish named *Wanda*";
StringPiece sp(text);
EXPECT_TRUE(RE2::PartialMatch(sp, "(?i)([wand]{5})", &result));
StringPiece result;
EXPECT_TRUE(RE2::PartialMatch(text, "(?i)([wand]{5})", &result));
EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result));
}
@ -1269,38 +1277,43 @@ TEST(RE2, CL8622304) {
EXPECT_EQ(val, "1,0x2F,030,4,5");
}
// Check that RE2 returns correct regexp pieces on error.
// In particular, make sure it returns whole runes
// and that it always reports invalid UTF-8.
// Also check that Perl error flag piece is big enough.
static struct ErrorTest {
const char *regexp;
const char *error;
RE2::ErrorCode error_code;
const char *error_arg;
} error_tests[] = {
{ "ab\\αcd", "\\α" },
{ "ef\\x☺01", "\\x☺0" },
{ "gh\\x1☺01", "\\x1☺" },
{ "ij\\x1", "\\x1" },
{ "kl\\x", "\\x" },
{ "uv\\x{0000☺}", "\\x{0000☺" },
{ "wx\\p{ABC", "\\p{ABC" },
{ "yz(?smiUX:abc)", "(?smiUX" }, // used to return (?s but the error is X
{ "aa(?sm☺i", "(?sm☺" },
{ "bb[abc", "[abc" },
{ "ab\\αcd", RE2::ErrorBadEscape, "\\α" },
{ "ef\\x☺01", RE2::ErrorBadEscape, "\\x☺0" },
{ "gh\\x1☺01", RE2::ErrorBadEscape, "\\x1☺" },
{ "ij\\x1", RE2::ErrorBadEscape, "\\x1" },
{ "kl\\x", RE2::ErrorBadEscape, "\\x" },
{ "uv\\x{0000☺}", RE2::ErrorBadEscape, "\\x{0000☺" },
{ "wx\\p{ABC", RE2::ErrorBadCharRange, "\\p{ABC" },
// used to return (?s but the error is X
{ "yz(?smiUX:abc)", RE2::ErrorBadPerlOp, "(?smiUX" },
{ "aa(?sm☺i", RE2::ErrorBadPerlOp, "(?sm☺" },
{ "bb[abc", RE2::ErrorMissingBracket, "[abc" },
{ "abc(def", RE2::ErrorMissingParen, "abc(def" },
{ "abc)def", RE2::ErrorUnexpectedParen, "abc)def" },
{ "mn\\x1\377", "" }, // no argument string returned for invalid UTF-8
{ "op\377qr", "" },
{ "st\\x{00000\377", "" },
{ "zz\\p{\377}", "" },
{ "zz\\x{00\377}", "" },
{ "zz(?P<name\377>abc)", "" },
// no argument string returned for invalid UTF-8
{ "mn\\x1\377", RE2::ErrorBadUTF8, "" },
{ "op\377qr", RE2::ErrorBadUTF8, "" },
{ "st\\x{00000\377", RE2::ErrorBadUTF8, "" },
{ "zz\\p{\377}", RE2::ErrorBadUTF8, "" },
{ "zz\\x{00\377}", RE2::ErrorBadUTF8, "" },
{ "zz(?P<name\377>abc)", RE2::ErrorBadUTF8, "" },
};
TEST(RE2, ErrorArgs) {
TEST(RE2, ErrorCodeAndArg) {
for (size_t i = 0; i < arraysize(error_tests); i++) {
RE2 re(error_tests[i].regexp, RE2::Quiet);
EXPECT_FALSE(re.ok());
EXPECT_EQ(re.error_arg(), error_tests[i].error) << re.error();
EXPECT_EQ(re.error_code(), error_tests[i].error_code) << re.error();
EXPECT_EQ(re.error_arg(), error_tests[i].error_arg) << re.error();
}
}

File diff suppressed because it is too large Load Diff

View File

@ -241,7 +241,7 @@ void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) {
std::vector<std::string> Explode(const StringPiece& s) {
std::vector<std::string> v;
for (const char *q = s.begin(); q < s.end(); ) {
for (const char *q = s.data(); q < s.data() + s.size(); ) {
const char* p = q;
Rune r;
q += chartorune(&r, q);
@ -256,11 +256,11 @@ std::vector<std::string> Explode(const StringPiece& s) {
std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s) {
std::vector<std::string> v;
if (sep.size() == 0)
if (sep.empty())
return Explode(s);
const char *p = s.begin();
for (const char *q = s.begin(); q + sep.size() <= s.end(); q++) {
const char *p = s.data();
for (const char *q = s.data(); q + sep.size() <= s.data() + s.size(); q++) {
if (StringPiece(q, sep.size()) == sep) {
v.push_back(std::string(p, q - p));
p = q + sep.size();
@ -268,8 +268,8 @@ std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s) {
continue;
}
}
if (p < s.end())
v.push_back(std::string(p, s.end() - p));
if (p < s.data() + s.size())
v.push_back(std::string(p, s.data() + s.size() - p));
return v;
}

View File

@ -6,6 +6,7 @@
#include "util/test.h"
#include "util/logging.h"
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
@ -19,15 +20,18 @@ struct PrefixTest {
};
static PrefixTest tests[] = {
// If the regexp is missing a ^, there's no required prefix.
{ "abc", false },
// Empty cases.
{ "", false },
{ "(?m)^", false },
{ "(?-m)^", false },
// If the regexp has no ^, there's no required prefix.
{ "abc", false },
// If the regexp immediately goes into
// something not a literal match, there's no required prefix.
{ "^(abc)", false },
{ "^a*", false },
{ "^(abc)", false },
// Otherwise, it should work.
{ "^abc$", true, "abc", false, "(?-m:$)" },
@ -53,15 +57,15 @@ TEST(RequiredPrefix, SimpleTests) {
bool f;
Regexp* s;
ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s))
<< " " << t.regexp << " " << (j == 0 ? "latin1" : "utf")
<< " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8")
<< " " << re->Dump();
if (t.return_value) {
ASSERT_EQ(p, std::string(t.prefix))
<< " " << t.regexp << " " << (j == 0 ? "latin1" : "utf");
<< " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
ASSERT_EQ(f, t.foldcase)
<< " " << t.regexp << " " << (j == 0 ? "latin1" : "utf");
<< " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
ASSERT_EQ(s->ToString(), std::string(t.suffix))
<< " " << t.regexp << " " << (j == 0 ? "latin1" : "utf");
<< " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
s->Decref();
}
re->Decref();
@ -69,4 +73,81 @@ TEST(RequiredPrefix, SimpleTests) {
}
}
// Test cases for Regexp::RequiredPrefixForAccel().
// As used by the test below, each entry supplies the regexp, the expected
// return value and, when that is true, the expected prefix and the expected
// foldcase flag. Entries that expect false omit the remaining fields.
// NOTE(review): field order is taken from how the entries are written here;
// the PrefixTest declaration itself is outside this excerpt.
static PrefixTest for_accel_tests[] = {
// Empty cases.
{ "", false },
{ "(?m)^", false },
{ "(?-m)^", false },
// If the regexp has a ^, there's no required prefix.
{ "^abc", false },
// If the regexp immediately goes into
// something not a literal match, there's no required prefix.
{ "a*", false },
// Unlike RequiredPrefix(), RequiredPrefixForAccel() can "see through"
// capturing groups, but doesn't try to glue prefix fragments together.
{ "(a?)def", false },
{ "(ab?)def", true, "a", false },
{ "(abc?)def", true, "ab", false },
{ "(()a)def", false },
{ "((a)b)def", true, "a", false },
{ "((ab)c)def", true, "ab", false },
// Otherwise, it should work.
{ "abc$", true, "abc", false },
{ "abc", true, "abc", false },
{ "(?i)abc", true, "abc", true },
{ "abcd*", true, "abc", false },
{ "[Aa][Bb]cd*", true, "ab", true },
{ "ab[Cc]d*", true, "ab", false },
{ "☺abc", true, "☺abc", false },
};
// Runs every entry of for_accel_tests through
// Regexp::RequiredPrefixForAccel() and compares the result with the table.
TEST(RequiredPrefixForAccel, SimpleTests) {
for (size_t i = 0; i < arraysize(for_accel_tests); i++) {
const PrefixTest& t = for_accel_tests[i];
// Each case is parsed twice: j == 0 adds the Latin1 flag to LikePerl,
// j == 1 uses LikePerl alone (labelled "utf8" in failure messages).
for (size_t j = 0; j < 2; j++) {
Regexp::ParseFlags flags = Regexp::LikePerl;
if (j == 0)
flags = flags | Regexp::Latin1;
Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
ASSERT_TRUE(re != NULL) << " " << t.regexp;
std::string p;
bool f;
ASSERT_EQ(t.return_value, re->RequiredPrefixForAccel(&p, &f))
<< " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8")
<< " " << re->Dump();
// The out-parameters are only compared when a prefix is expected.
if (t.return_value) {
ASSERT_EQ(p, std::string(t.prefix))
<< " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
ASSERT_EQ(f, t.foldcase)
<< " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
}
// Release the reference obtained from Regexp::Parse().
re->Decref();
}
}
}
// Checks Prog::PrefixAccel() on the program compiled from "abc\\d+".
// For each i in [0, 100) the text is first i copies of 'a', for which NULL
// is expected, and then the same text with "abc" appended, for which the
// returned pointer is expected to be at offset i from the start of the text.
TEST(PrefixAccel, BasicTest) {
Regexp* re = Regexp::Parse("abc\\d+", Regexp::LikePerl, NULL);
ASSERT_TRUE(re != NULL);
Prog* prog = re->CompileToProg(0);
ASSERT_TRUE(prog != NULL);
for (int i = 0; i < 100; i++) {
// Only 'a' characters: the text cannot contain "abc".
std::string text(i, 'a');
const char* p = reinterpret_cast<const char*>(
prog->PrefixAccel(text.data(), text.size()));
EXPECT_TRUE(p == NULL);
// Now "abc" begins at offset i (the last i 'a's precede it).
text.append("abc");
p = reinterpret_cast<const char*>(
prog->PrefixAccel(text.data(), text.size()));
EXPECT_EQ(i, p-text.data());
}
// The test owns both the compiled program and its reference to the regexp.
delete prog;
re->Decref();
}
} // namespace re2

View File

@ -5,6 +5,7 @@
#include <stddef.h>
#include <string>
#include <vector>
#include <utility>
#include "util/test.h"
#include "util/logging.h"
@ -201,4 +202,29 @@ TEST(Set, Prefix) {
ASSERT_EQ(v[0], 0);
}
// Exercises both move construction (s2 from s1) and move assignment
// (s1 from s2) of RE2::Set, checking that the destination behaves like the
// source did and that the moved-from set can be rebuilt from scratch.
TEST(Set, MoveSemantics) {
RE2::Set s1(RE2::DefaultOptions, RE2::UNANCHORED);
ASSERT_EQ(s1.Add("foo\\d+", NULL), 0);
ASSERT_EQ(s1.Compile(), true);
ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true);
ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false);
// The moved-to object should do what the moved-from object did.
RE2::Set s2 = std::move(s1);
ASSERT_EQ(s2.Match("abc foo1 xyz", NULL), true);
ASSERT_EQ(s2.Match("abc bar2 xyz", NULL), false);
// The moved-from object should have been reset and be reusable.
// (Add() is expected to return index 0 again.)
ASSERT_EQ(s1.Add("bar\\d+", NULL), 0);
ASSERT_EQ(s1.Compile(), true);
ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), false);
ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), true);
// Verify that "overwriting" works and also doesn't leak memory.
// (The latter will need a leak detector such as LeakSanitizer.)
// After the assignment s1 must behave like the "foo" set again.
s1 = std::move(s2);
ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true);
ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false);
}
} // namespace re2

View File

@ -111,4 +111,31 @@ void StringGenerator::GenerateNULL() {
hasnext_ = true;
}
// Builds a binary string of length (n - 1) + 2^n in which every n-bit
// window occurs. n must lie in [1, 29]; anything else trips a CHECK.
//
// The string starts with n - 1 zeros. After that, one character is emitted
// per step: the current window is shifted left by one bit, and a '1' is
// appended if the resulting window (with the low bit set) has not been
// produced yet, otherwise a '0'. Each step CHECKs that the window it lands
// on is new, so completing the loop means all 2^n windows were visited.
std::string DeBruijnString(int n) {
  CHECK_GE(n, 1);
  CHECK_LE(n, 29);
  const size_t size = size_t{1} << static_cast<size_t>(n);
  const size_t mask = size - 1;
  // did[w] records whether the n-bit window w has already been produced.
  std::vector<bool> did(size, false);
  // Leading run of n - 1 zeros, then room for one character per window.
  std::string s(static_cast<size_t>(n - 1), '0');
  s.reserve(static_cast<size_t>(n) + size);
  size_t bits = 0;
  for (size_t i = 0; i < size; i++) {
    // Slide the window: drop the oldest bit, make room for the next one.
    bits = (bits << 1) & mask;
    // Prefer appending a 1 whenever that window is still unused.
    const bool emit_one = !did[bits | 1];
    if (emit_one)
      bits |= 1;
    s.push_back(emit_one ? '1' : '0');
    CHECK(!did[bits]);
    did[bits] = true;
  }
  CHECK_EQ(s.size(), static_cast<size_t>(n - 1) + size);
  return s;
}
} // namespace re2

View File

@ -58,6 +58,19 @@ class StringGenerator {
StringGenerator& operator=(const StringGenerator&) = delete;
};
// Generates and returns a string over binary alphabet {0,1} that contains
// all possible binary sequences of length n as contiguous substrings. The
// obvious brute force method would generate a string of length n * 2^n, but
// this generates a string of length n-1 + 2^n: a De Bruijn cycle of length
// 2^n unrolled into a linear string.
// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
//
// Such a string is useful for testing a DFA. If you have a DFA
// where distinct last n bytes implies distinct states, then running on a
// DeBruijn string causes the DFA to need to create a new state at every
// position in the input, never reusing any states until it gets to the
// end of the string. This is the worst possible case for DFA execution.
std::string DeBruijnString(int n);
} // namespace re2
#endif // RE2_TESTING_STRING_GENERATOR_H_

View File

@ -18,14 +18,15 @@
#include "re2/re2.h"
#include "re2/regexp.h"
DEFINE_bool(dump_prog, false, "dump regexp program");
DEFINE_bool(log_okay, false, "log successful runs");
DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
DEFINE_FLAG(bool, dump_prog, false, "dump regexp program");
DEFINE_FLAG(bool, log_okay, false, "log successful runs");
DEFINE_FLAG(bool, dump_rprog, false, "dump reversed regexp program");
DEFINE_int32(max_regexp_failures, 100,
"maximum number of regexp test failures (-1 = unlimited)");
DEFINE_FLAG(int, max_regexp_failures, 100,
"maximum number of regexp test failures (-1 = unlimited)");
DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
DEFINE_FLAG(std::string, regexp_engines, "",
"pattern to select regexp engines to test");
namespace re2 {
@ -62,11 +63,11 @@ static uint32_t Engines() {
if (did_parse)
return cached_engines;
if (FLAGS_regexp_engines.empty()) {
if (GetFlag(FLAGS_regexp_engines).empty()) {
cached_engines = ~0;
} else {
for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
if (FLAGS_regexp_engines.find(EngineName(i)) != std::string::npos)
if (GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos)
cached_engines |= 1<<i;
}
@ -85,6 +86,20 @@ static uint32_t Engines() {
// The result of running a match.
struct TestInstance::Result {
// Starts with every flag cleared and every submatch slot reset, so that a
// freshly constructed Result does not depend on being zeroed by the caller.
Result()
: skipped(false),
matched(false),
untrusted(false),
have_submatch(false),
have_submatch0(false) {
ClearSubmatch();
}
// Resets the first kMaxSubmatch entries of submatch to default-constructed
// StringPiece values.
void ClearSubmatch() {
for (int i = 0; i < kMaxSubmatch; i++)
submatch[i] = StringPiece();
}
bool skipped; // test skipped: wasn't applicable
bool matched; // found a match
bool untrusted; // don't really trust the answer
@ -99,10 +114,11 @@ typedef TestInstance::Result Result;
// where a and b are the starting and ending offsets of s in text.
static std::string FormatCapture(const StringPiece& text,
const StringPiece& s) {
if (s.begin() == NULL)
if (s.data() == NULL)
return "(?,?)";
return StringPrintf("(%td,%td)",
s.begin() - text.begin(), s.end() - text.begin());
s.begin() - text.begin(),
s.end() - text.begin());
}
// Returns whether text contains non-ASCII (>= 0x80) bytes.
@ -198,7 +214,7 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
error_ = true;
return;
}
if (FLAGS_dump_prog) {
if (GetFlag(FLAGS_dump_prog)) {
LOG(INFO) << "Prog for "
<< " regexp "
<< CEscape(regexp_str_)
@ -216,7 +232,7 @@ TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
error_ = true;
return;
}
if (FLAGS_dump_rprog)
if (GetFlag(FLAGS_dump_rprog))
LOG(INFO) << rprog_->Dump();
}
@ -290,9 +306,6 @@ void TestInstance::RunSearch(Engine type,
const StringPiece& orig_context,
Prog::Anchor anchor,
Result* result) {
// Result is not trivial, so we cannot freely clear it with memset(3),
// but zeroing objects like so is safe and expedient for our purposes.
memset(reinterpret_cast<void*>(result), 0, sizeof *result);
if (regexp_ == NULL) {
result->skipped = true;
return;
@ -476,7 +489,7 @@ void TestInstance::RunSearch(Engine type,
}
if (!result->matched)
memset(result->submatch, 0, sizeof result->submatch);
result->ClearSubmatch();
}
// Checks whether r is okay given that correct is the right answer.
@ -489,7 +502,7 @@ static bool ResultOkay(const Result& r, const Result& correct) {
return false;
if (r.have_submatch || r.have_submatch0) {
for (int i = 0; i < kMaxSubmatch; i++) {
if (correct.submatch[i].begin() != r.submatch[i].begin() ||
if (correct.submatch[i].data() != r.submatch[i].data() ||
correct.submatch[i].size() != r.submatch[i].size())
return false;
if (!r.have_submatch)
@ -528,7 +541,7 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
Result r;
RunSearch(i, text, context, anchor, &r);
if (ResultOkay(r, correct)) {
if (FLAGS_log_okay)
if (GetFlag(FLAGS_log_okay))
LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
continue;
}
@ -555,8 +568,8 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
}
}
for (int i = 0; i < 1+num_captures_; i++) {
if (r.submatch[i].begin() != correct.submatch[i].begin() ||
r.submatch[i].end() != correct.submatch[i].end()) {
if (r.submatch[i].data() != correct.submatch[i].data() ||
r.submatch[i].size() != correct.submatch[i].size()) {
LOG(INFO) <<
StringPrintf(" $%d: should be %s is %s",
i,
@ -571,7 +584,10 @@ bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
}
if (!all_okay) {
if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
// This will be initialised once (after flags have been initialised)
// and that is desirable because we want to enforce a global limit.
static int max_regexp_failures = GetFlag(FLAGS_max_regexp_failures);
if (max_regexp_failures > 0 && --max_regexp_failures == 0)
LOG(QFATAL) << "Too many regexp failures.";
}
@ -640,7 +656,7 @@ static Prog::Anchor anchors[] = {
bool Tester::TestInput(const StringPiece& text) {
bool okay = TestInputInContext(text, text);
if (text.size() > 0) {
if (!text.empty()) {
StringPiece sp;
sp = text;
sp.remove_prefix(1);

View File

@ -13,7 +13,7 @@ import re
from six.moves import urllib
# Directory or URL where Unicode tables reside.
_UNICODE_DIR = "https://www.unicode.org/Public/12.1.0/ucd"
_UNICODE_DIR = "https://www.unicode.org/Public/13.0.0/ucd"
# Largest valid Unicode code value.
_RUNE_MAX = 0x10FFFF

View File

@ -7,7 +7,7 @@
namespace re2 {
// 1381 groups, 2792 pairs, 356 ranges
// 1384 groups, 2798 pairs, 358 ranges
const CaseFold unicode_casefold[] = {
{ 65, 90, 32 },
{ 97, 106, -32 },
@ -349,6 +349,8 @@ const CaseFold unicode_casefold[] = {
{ 42948, 42948, -48 },
{ 42949, 42949, -42307 },
{ 42950, 42950, -35384 },
{ 42951, 42954, OddEven },
{ 42997, 42998, OddEven },
{ 43859, 43859, -928 },
{ 43888, 43967, -38864 },
{ 65313, 65338, 32 },
@ -366,9 +368,9 @@ const CaseFold unicode_casefold[] = {
{ 125184, 125217, 34 },
{ 125218, 125251, -34 },
};
const int num_unicode_casefold = 356;
const int num_unicode_casefold = 358;
// 1381 groups, 1411 pairs, 198 ranges
// 1384 groups, 1414 pairs, 200 ranges
const CaseFold unicode_tolower[] = {
{ 65, 90, 32 },
{ 181, 181, 775 },
@ -560,6 +562,8 @@ const CaseFold unicode_tolower[] = {
{ 42948, 42948, -48 },
{ 42949, 42949, -42307 },
{ 42950, 42950, -35384 },
{ 42951, 42953, OddEvenSkip },
{ 42997, 42997, OddEven },
{ 43888, 43967, -38864 },
{ 65313, 65338, 32 },
{ 66560, 66599, 40 },
@ -569,7 +573,7 @@ const CaseFold unicode_tolower[] = {
{ 93760, 93791, 32 },
{ 125184, 125217, 34 },
};
const int num_unicode_tolower = 198;
const int num_unicode_tolower = 200;

File diff suppressed because it is too large Load Diff

View File

@ -89,7 +89,7 @@ template<typename T> class Regexp::Walker {
private:
// Walk state for the entire traversal.
std::stack<WalkState<T> >* stack_;
std::stack<WalkState<T>> stack_;
bool stopped_early_;
int max_visits_;
@ -119,7 +119,7 @@ template<typename T> T Regexp::Walker<T>::Copy(T arg) {
// State about a single level in the traversal.
template<typename T> struct WalkState {
WalkState<T>(Regexp* re, T parent)
WalkState(Regexp* re, T parent)
: re(re),
n(-1),
parent_arg(parent),
@ -134,24 +134,22 @@ template<typename T> struct WalkState {
};
template<typename T> Regexp::Walker<T>::Walker() {
stack_ = new std::stack<WalkState<T> >;
stopped_early_ = false;
}
template<typename T> Regexp::Walker<T>::~Walker() {
Reset();
delete stack_;
}
// Clears the stack. Should never be necessary, since
// Walk always enters and exits with an empty stack.
// Logs DFATAL if stack is not already clear.
template<typename T> void Regexp::Walker<T>::Reset() {
if (stack_ && stack_->size() > 0) {
if (!stack_.empty()) {
LOG(DFATAL) << "Stack not empty.";
while (stack_->size() > 0) {
delete stack_->top().child_args;
stack_->pop();
while (!stack_.empty()) {
delete[] stack_.top().child_args;
stack_.pop();
}
}
}
@ -165,12 +163,12 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
return top_arg;
}
stack_->push(WalkState<T>(re, top_arg));
stack_.push(WalkState<T>(re, top_arg));
WalkState<T>* s;
for (;;) {
T t;
s = &stack_->top();
s = &stack_.top();
Regexp* re = s->re;
switch (s->n) {
case -1: {
@ -201,7 +199,7 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
s->n++;
} else {
stack_->push(WalkState<T>(sub[s->n], s->pre_arg));
stack_.push(WalkState<T>(sub[s->n], s->pre_arg));
}
continue;
}
@ -214,12 +212,12 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
}
}
// We've finished stack_->top().
// We've finished stack_.top().
// Update next guy down.
stack_->pop();
if (stack_->size() == 0)
stack_.pop();
if (stack_.empty())
return t;
s = &stack_->top();
s = &stack_.top();
if (s->child_args != NULL)
s->child_args[s->n] = t;
else

View File

@ -1,12 +0,0 @@
# Copyright 2009 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Defines a Bazel macro that instantiates a native cc_test rule for an RE2 test.
def re2_test(name, deps=[], size="medium"):
native.cc_test(
name=name,
srcs=["re2/testing/%s.cc" % (name)],
deps=[":test"] + deps,
size=size,
)

0
extern/re2/runtests vendored Normal file → Executable file
View File

View File

@ -2,23 +2,26 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <re2/re2.h>
#include <re2/filtered_re2.h>
#include <stdio.h>
#include <re2/filtered_re2.h>
#include <re2/re2.h>
int main(void) {
re2::FilteredRE2 f;
int id;
f.Add("a.*b.*c", RE2::DefaultOptions, &id);
std::vector<std::string> v;
f.Compile(&v);
std::vector<int> ids;
f.FirstMatch("abbccc", ids);
int main() {
re2::FilteredRE2 f;
int id;
f.Add("a.*b.*c", RE2::DefaultOptions, &id);
std::vector<std::string> v;
f.Compile(&v);
std::vector<int> ids;
f.FirstMatch("abbccc", ids);
if(RE2::FullMatch("axbyc", "a.*b.*c")) {
printf("PASS\n");
return 0;
}
printf("FAIL\n");
return 2;
int n;
if (RE2::FullMatch("axbyc", "a.*b.*c") &&
RE2::PartialMatch("foo123bar", "(\\d+)", &n) && n == 123) {
printf("PASS\n");
return 0;
}
printf("FAIL\n");
return 2;
}

View File

@ -7,155 +7,125 @@
#include <stdlib.h>
#include <algorithm>
#include <chrono>
#include <thread>
#include "util/util.h"
#include "util/flags.h"
#include "util/benchmark.h"
#include "util/flags.h"
#include "re2/re2.h"
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
#ifdef _WIN32
#define snprintf _snprintf
#endif
using testing::Benchmark;
using ::testing::Benchmark;
static Benchmark* benchmarks[10000];
static int nbenchmarks;
void Benchmark::Register() {
benchmarks[nbenchmarks] = this;
if(lo < 1)
lo = 1;
if(hi < lo)
hi = lo;
nbenchmarks++;
lo_ = std::max(1, lo_);
hi_ = std::max(lo_, hi_);
benchmarks[nbenchmarks++] = this;
}
static int64_t nsec() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch()).count();
return std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch())
.count();
}
static int64_t bytes;
static int64_t ns;
static int64_t t0;
static int64_t ns;
static int64_t bytes;
static int64_t items;
void SetBenchmarkBytesProcessed(int64_t x) {
bytes = x;
void StartBenchmarkTiming() {
if (t0 == 0) {
t0 = nsec();
}
}
void StopBenchmarkTiming() {
if(t0 != 0)
ns += nsec() - t0;
t0 = 0;
if (t0 != 0) {
ns += nsec() - t0;
t0 = 0;
}
}
void StartBenchmarkTiming() {
if(t0 == 0)
t0 = nsec();
}
void SetBenchmarkBytesProcessed(int64_t b) { bytes = b; }
void SetBenchmarkItemsProcessed(int n) {
items = n;
}
void SetBenchmarkItemsProcessed(int64_t i) { items = i; }
void BenchmarkMemoryUsage() {
// TODO(rsc): Implement.
}
int NumCPUs() {
return static_cast<int>(std::thread::hardware_concurrency());
}
static void runN(Benchmark *b, int n, int siz) {
bytes = 0;
items = 0;
ns = 0;
t0 = nsec();
if(b->fn)
b->fn(n);
else if(b->fnr)
b->fnr(n, siz);
else {
fprintf(stderr, "%s: missing function\n", b->name);
abort();
}
if(t0 != 0)
ns += nsec() - t0;
static void RunFunc(Benchmark* b, int iters, int arg) {
t0 = nsec();
ns = 0;
bytes = 0;
items = 0;
b->func()(iters, arg);
StopBenchmarkTiming();
}
static int round(int n) {
int base = 1;
while(base*10 < n)
base *= 10;
if(n < 2*base)
return 2*base;
if(n < 5*base)
return 5*base;
return 10*base;
int base = 1;
while (base * 10 < n) base *= 10;
if (n < 2 * base) return 2 * base;
if (n < 5 * base) return 5 * base;
return 10 * base;
}
void RunBench(Benchmark* b, int nthread, int siz) {
int n, last;
static void RunBench(Benchmark* b, int arg) {
int iters, last;
// TODO(rsc): Threaded benchmarks.
if(nthread != 1)
return;
// run once in case it's expensive
n = 1;
runN(b, n, siz);
while(ns < (int)1e9 && n < (int)1e9) {
last = n;
if(ns/n == 0)
n = (int)1e9;
else
n = (int)1e9 / static_cast<int>(ns/n);
n = std::max(last+1, std::min(n+n/2, 100*last));
n = round(n);
runN(b, n, siz);
}
char mb[100];
char suf[100];
mb[0] = '\0';
suf[0] = '\0';
if(ns > 0 && bytes > 0)
snprintf(mb, sizeof mb, "\t%7.2f MB/s", ((double)bytes/1e6)/((double)ns/1e9));
if(b->fnr || b->lo != b->hi) {
if(siz >= (1<<20))
snprintf(suf, sizeof suf, "/%dM", siz/(1<<20));
else if(siz >= (1<<10))
snprintf(suf, sizeof suf, "/%dK", siz/(1<<10));
else
snprintf(suf, sizeof suf, "/%d", siz);
}
printf("%s%s\t%8lld\t%10lld ns/op%s\n", b->name, suf, (long long)n, (long long)ns/n, mb);
fflush(stdout);
// Run once just in case it's expensive.
iters = 1;
RunFunc(b, iters, arg);
while (ns < (int)1e9 && iters < (int)1e9) {
last = iters;
if (ns / iters == 0) {
iters = (int)1e9;
} else {
iters = (int)1e9 / static_cast<int>(ns / iters);
}
iters = std::max(last + 1, std::min(iters + iters / 2, 100 * last));
iters = round(iters);
RunFunc(b, iters, arg);
}
char mb[100];
char suf[100];
mb[0] = '\0';
suf[0] = '\0';
if (ns > 0 && bytes > 0)
snprintf(mb, sizeof mb, "\t%7.2f MB/s",
((double)bytes / 1e6) / ((double)ns / 1e9));
if (b->has_arg()) {
if (arg >= (1 << 20)) {
snprintf(suf, sizeof suf, "/%dM", arg / (1 << 20));
} else if (arg >= (1 << 10)) {
snprintf(suf, sizeof suf, "/%dK", arg / (1 << 10));
} else {
snprintf(suf, sizeof suf, "/%d", arg);
}
}
printf("%s%s\t%8d\t%10lld ns/op%s\n", b->name(), suf, iters,
(long long)ns / iters, mb);
fflush(stdout);
}
static int match(const char* name, int argc, const char** argv) {
if(argc == 1)
return 1;
for(int i = 1; i < argc; i++)
if(RE2::PartialMatch(name, argv[i]))
return 1;
return 0;
static bool WantBench(const char* name, int argc, const char** argv) {
if (argc == 1) return true;
for (int i = 1; i < argc; i++) {
if (RE2::PartialMatch(name, argv[i]))
return true;
}
return false;
}
int main(int argc, const char** argv) {
for(int i = 0; i < nbenchmarks; i++) {
Benchmark* b = benchmarks[i];
if(match(b->name, argc, argv))
for(int j = b->threadlo; j <= b->threadhi; j++)
for(int k = std::max(b->lo, 1); k <= std::max(b->hi, 1); k<<=1)
RunBench(b, j, k);
}
for (int i = 0; i < nbenchmarks; i++) {
Benchmark* b = benchmarks[i];
if (!WantBench(b->name(), argc, argv))
continue;
for (int arg = b->lo(); arg <= b->hi(); arg <<= 1)
RunBench(b, arg);
}
}

View File

@ -6,38 +6,151 @@
#define UTIL_BENCHMARK_H_
#include <stdint.h>
#include <functional>
#include "util/logging.h"
#include "util/util.h"
// Globals for the old benchmark API.
void StartBenchmarkTiming();
void StopBenchmarkTiming();
void SetBenchmarkBytesProcessed(int64_t b);
void SetBenchmarkItemsProcessed(int64_t i);
namespace benchmark {
// The new benchmark API implemented as a layer over the old benchmark API.
// (Please refer to https://github.com/google/benchmark for documentation.)
// Per-run benchmark state handed to a benchmark function.
//
// Iterating over a State with a range-based for loop executes the loop body
// iterations() times; timing is started by begin() and stopped when the
// iterator comparison reports that the loop is finished.
class State {
 private:
  // Countdown iterator used by the range-based for loop over a State.
  class Iterator {
   public:
    // Benchmark code looks like this:
    //
    //   for (auto _ : state) {
    //     // ...
    //   }
    //
    // We try to avoid compiler warnings about such variables being unused.
    struct ATTRIBUTE_UNUSED Value {};

    explicit Iterator(int64_t iters) : iters_(iters) {}

    // Returns true while the two iterators hold different counts. When they
    // are equal the loop is about to end, so timing is stopped first.
    bool operator!=(const Iterator& that) const {
      const bool more_to_do = (iters_ != that.iters_);
      if (!more_to_do) {
        StopBenchmarkTiming();
      }
      return more_to_do;
    }

    // Produces the placeholder value bound to the loop variable.
    Value operator*() const {
      return Value();
    }

    // Consumes one iteration by counting down.
    Iterator& operator++() {
      iters_ -= 1;
      return *this;
    }

   private:
    int64_t iters_;  // Remaining iterations.
  };

 public:
  // State for a benchmark that takes no argument.
  explicit State(int64_t iters)
      : iters_(iters), arg_(0), has_arg_(false) {}

  // State for a benchmark that takes the argument `arg`.
  State(int64_t iters, int64_t arg)
      : iters_(iters), arg_(arg), has_arg_(true) {}

  // Starts timing (the loop is about to begin) and returns an iterator that
  // counts down from iterations().
  Iterator begin() {
    StartBenchmarkTiming();
    return Iterator(iters_);
  }

  // Sentinel iterator: the loop ends when the countdown reaches zero.
  Iterator end() {
    return Iterator(0);
  }

  // Thin forwards to the old benchmark API globals.
  void SetBytesProcessed(int64_t b) {
    SetBenchmarkBytesProcessed(b);
  }
  void SetItemsProcessed(int64_t i) {
    SetBenchmarkItemsProcessed(i);
  }

  int64_t iterations() const {
    return iters_;
  }

  // Pretend to support multiple arguments: the single stored argument is
  // returned whatever `pos` is. CHECKs that an argument was supplied.
  int64_t range(int pos) const {
    CHECK(has_arg_);
    return arg_;
  }

 private:
  int64_t iters_;
  int64_t arg_;
  bool has_arg_;

  // Not copyable.
  State(const State&) = delete;
  State& operator=(const State&) = delete;
};
} // namespace benchmark
namespace testing {
struct Benchmark {
const char* name;
void (*fn)(int);
void (*fnr)(int, int);
int lo;
int hi;
int threadlo;
int threadhi;
class Benchmark {
public:
// Constructs a benchmark that takes no argument.
//
// `func` is wrapped in a callable taking (iters, arg); the wrapper builds a
// benchmark::State from `iters` only, so `arg` is ignored. lo_ and hi_ are
// both set to 0 and has_arg_ to false. Register() is called once all members
// are initialized.
Benchmark(const char* name, void (*func)(benchmark::State&))
: name_(name),
func_([func](int iters, int arg) {
benchmark::State state(iters);
func(state);
}),
lo_(0),
hi_(0),
has_arg_(false) {
Register();
}
// Constructs a benchmark that takes an argument in the range [lo, hi].
//
// `func` is wrapped in a callable taking (iters, arg); the wrapper builds a
// benchmark::State from both values, so the benchmark can read the argument
// back from the state. lo_ and hi_ record the range and has_arg_ is set to
// true. Register() is called once all members are initialized.
Benchmark(const char* name, void (*func)(benchmark::State&), int lo, int hi)
: name_(name),
func_([func](int iters, int arg) {
benchmark::State state(iters, arg);
func(state);
}),
lo_(lo),
hi_(hi),
has_arg_(true) {
Register();
}
// Pretend to support multiple threads.
Benchmark* ThreadRange(int lo, int hi) { return this; }
const char* name() const { return name_; }
const std::function<void(int, int)>& func() const { return func_; }
int lo() const { return lo_; }
int hi() const { return hi_; }
bool has_arg() const { return has_arg_; }
private:
void Register();
Benchmark(const char* name, void (*f)(int)) { Clear(name); fn = f; Register(); }
Benchmark(const char* name, void (*f)(int, int), int l, int h) { Clear(name); fnr = f; lo = l; hi = h; Register(); }
void Clear(const char* n) { name = n; fn = 0; fnr = 0; lo = 0; hi = 0; threadlo = 0; threadhi = 0; }
Benchmark* ThreadRange(int lo, int hi) { threadlo = lo; threadhi = hi; return this; }
const char* name_;
std::function<void(int, int)> func_;
int lo_;
int hi_;
bool has_arg_;
Benchmark(const Benchmark&) = delete;
Benchmark& operator=(const Benchmark&) = delete;
};
} // namespace testing
void SetBenchmarkBytesProcessed(int64_t);
void StopBenchmarkTiming();
void StartBenchmarkTiming();
void BenchmarkMemoryUsage();
void SetBenchmarkItemsProcessed(int);
#define BENCHMARK(f) \
::testing::Benchmark* _benchmark_##f = \
(new ::testing::Benchmark(#f, f))
int NumCPUs();
#define BENCHMARK(f) \
::testing::Benchmark* _benchmark_##f = (new ::testing::Benchmark(#f, f))
#define BENCHMARK_RANGE(f, lo, hi) \
::testing::Benchmark* _benchmark_##f = \
(new ::testing::Benchmark(#f, f, lo, hi))
#define BENCHMARK_RANGE(f, lo, hi) \
::testing::Benchmark* _benchmark_##f = \
(new ::testing::Benchmark(#f, f, lo, hi))
#endif // UTIL_BENCHMARK_H_

View File

@ -10,20 +10,17 @@
// If you want to do that, see
// https://gflags.github.io/gflags/
#include <stdint.h>
#define DEFINE_flag(type, name, deflt, desc) \
#define DEFINE_FLAG(type, name, deflt, desc) \
namespace re2 { type FLAGS_##name = deflt; }
#define DECLARE_flag(type, name) \
#define DECLARE_FLAG(type, name) \
namespace re2 { extern type FLAGS_##name; }
#define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc)
#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32_t, name, deflt, desc)
#define DEFINE_string(name, deflt, desc) DEFINE_flag(std::string, name, deflt, desc)
#define DECLARE_bool(name) DECLARE_flag(bool, name)
#define DECLARE_int32(name) DECLARE_flag(int32_t, name)
#define DECLARE_string(name) DECLARE_flag(std::string, name)
namespace re2 {
// Returns a copy of the current value of `flag`.
//
// Flags here are plain variables, so reading one is just a copy; call sites
// go through this accessor instead of naming the variable's value directly.
template <typename T>
T GetFlag(const T& flag) {
  T value = flag;
  return value;
}
} // namespace re2
#endif // UTIL_FLAGS_H_

19
extern/re2/util/malloc_counter.h vendored Normal file
View File

@ -0,0 +1,19 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_MALLOC_COUNTER_H_
#define UTIL_MALLOC_COUNTER_H_
namespace testing {
// Placeholder heap-usage counter.
//
// Nothing is measured: both growth queries always report 0 and Reset() does
// nothing. The class only provides the interface that callers compile
// against.
class MallocCounter {
 public:
  // Value callers can pass to the constructor.
  static const int THIS_THREAD_ONLY = 0;

  // The argument is accepted and ignored.
  MallocCounter(int) {}

  // No state to clear.
  void Reset() {}

  // Always 0.
  long long HeapGrowth() { return 0; }

  // Always 0.
  long long PeakHeapGrowth() { return 0; }
};
} // namespace testing
#endif // UTIL_MALLOC_COUNTER_H_

View File

@ -10,7 +10,13 @@
* You should assume the locks are *not* re-entrant.
*/
#if !defined(_WIN32)
#ifdef _WIN32
// Requires Windows Vista or Windows Server 2008 at minimum.
#include <windows.h>
#if defined(WINVER) && WINVER >= 0x0600
#define MUTEX_IS_WIN32_SRWLOCK
#endif
#else
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif
@ -20,7 +26,9 @@
#endif
#endif
#if defined(MUTEX_IS_PTHREAD_RWLOCK)
#if defined(MUTEX_IS_WIN32_SRWLOCK)
typedef SRWLOCK MutexType;
#elif defined(MUTEX_IS_PTHREAD_RWLOCK)
#include <pthread.h>
#include <stdlib.h>
typedef pthread_rwlock_t MutexType;
@ -56,7 +64,16 @@ class Mutex {
Mutex& operator=(const Mutex&) = delete;
};
#if defined(MUTEX_IS_PTHREAD_RWLOCK)
#if defined(MUTEX_IS_WIN32_SRWLOCK)
// Mutex operations implemented with a Windows slim reader/writer (SRW) lock.

// Puts the lock into its initial, unowned state.
Mutex::Mutex() {
  InitializeSRWLock(&mutex_);
}

// Intentionally empty: no call is made to tear the lock down.
Mutex::~Mutex() {
}

// Exclusive (writer) acquisition and release.
void Mutex::Lock() {
  AcquireSRWLockExclusive(&mutex_);
}

void Mutex::Unlock() {
  ReleaseSRWLockExclusive(&mutex_);
}

// Shared (reader) acquisition and release.
void Mutex::ReaderLock() {
  AcquireSRWLockShared(&mutex_);
}

void Mutex::ReaderUnlock() {
  ReleaseSRWLockShared(&mutex_);
}
#elif defined(MUTEX_IS_PTHREAD_RWLOCK)
#define SAFE_PTHREAD(fncall) \
do { \

View File

@ -22,9 +22,7 @@
#include "util/strutil.h"
// Silence warnings about the wacky formatting in the operator() functions.
// Note that we test for Clang first because it defines __GNUC__ as well.
#if defined(__clang__)
#elif defined(__GNUC__) && __GNUC__ >= 6
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6
#pragma GCC diagnostic ignored "-Wmisleading-indentation"
#endif
@ -35,9 +33,10 @@
// not exceed main thread stacks. Note that other threads
// often have smaller stacks, and therefore tightening
// regexp_stack_limit may frequently be necessary.
DEFINE_int32(regexp_stack_limit, 256<<10, "default PCRE stack limit (bytes)");
DEFINE_int32(regexp_match_limit, 1000000,
"default PCRE match limit (function calls)");
DEFINE_FLAG(int, regexp_stack_limit, 256 << 10,
"default PCRE stack limit (bytes)");
DEFINE_FLAG(int, regexp_match_limit, 1000000,
"default PCRE match limit (function calls)");
#ifndef USEPCRE
@ -523,12 +522,12 @@ int PCRE::TryMatch(const StringPiece& text,
int match_limit = match_limit_;
if (match_limit <= 0) {
match_limit = FLAGS_regexp_match_limit;
match_limit = GetFlag(FLAGS_regexp_match_limit);
}
int stack_limit = stack_limit_;
if (stack_limit <= 0) {
stack_limit = FLAGS_regexp_stack_limit;
stack_limit = GetFlag(FLAGS_regexp_stack_limit);
}
pcre_extra extra = { 0 };
@ -977,32 +976,7 @@ static bool parse_double_float(const char* str, size_t n, bool isfloat,
} else {
r = strtod(buf, &end);
}
if (end != buf + n) {
#ifdef _WIN32
// Microsoft's strtod() doesn't handle inf and nan, so we have to
// handle it explicitly. Speed is not important here because this
// code is only called in unit tests.
bool pos = true;
const char* i = buf;
if ('-' == *i) {
pos = false;
++i;
} else if ('+' == *i) {
++i;
}
if (0 == _stricmp(i, "inf") || 0 == _stricmp(i, "infinity")) {
r = std::numeric_limits<double>::infinity();
if (!pos)
r = -r;
} else if (0 == _stricmp(i, "nan")) {
r = std::numeric_limits<double>::quiet_NaN();
} else {
return false;
}
#else
return false; // Leftover junk
#endif
}
if (end != buf + n) return false; // Leftover junk
if (errno) return false;
if (dest == NULL) return true;
if (isfloat) {

View File

@ -555,7 +555,7 @@ class PCRE_Options {
// Hex/Octal/Binary?
// Special class for parsing into objects that define a ParseFrom() method
template <class T>
template <typename T>
class _PCRE_MatchObject {
public:
static inline bool Parse(const char* str, size_t n, void* dest) {
@ -600,9 +600,9 @@ class PCRE::Arg {
#undef MAKE_PARSER
// Generic constructor
template <class T> Arg(T*, Parser parser);
template <typename T> Arg(T*, Parser parser);
// Generic constructor template
template <class T> Arg(T* p)
template <typename T> Arg(T* p)
: arg_(p), parser_(_PCRE_MatchObject<T>::Parse) {
}

Some files were not shown because too many files have changed in this diff Show More