8
0
mirror of https://github.com/FirebirdSQL/firebird.git synced 2025-01-22 16:43:03 +01:00

Import re2 version 2019-08-01.

This commit is contained in:
Adriano dos Santos Fernandes 2019-08-11 11:10:29 -03:00
parent 28e18749ff
commit 65f003da0d
126 changed files with 39155 additions and 0 deletions

5
extern/re2/.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
*.pyc
*.orig
core
obj/
benchlog.*

179
extern/re2/.travis.yml vendored Normal file
View File

@ -0,0 +1,179 @@
# Travis CI configuration for RE2: every job builds with make and then runs
# the test suite with "make test".
language: cpp
sudo: false
dist: trusty
script:
- make
- make test
# One job per compiler. Each entry installs its compiler package from the
# listed apt sources and records the matching CC/CXX assignments in
# MATRIX_EVAL, which before_install (end of file) evaluates.
matrix:
include:
# GCC 4.8 through 9, from ubuntu-toolchain-r-test.
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.8
env:
- MATRIX_EVAL="CC=gcc-4.8 CXX=g++-4.8"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.9
env:
- MATRIX_EVAL="CC=gcc-4.9 CXX=g++-4.9"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-5
env:
- MATRIX_EVAL="CC=gcc-5 CXX=g++-5"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-6
env:
- MATRIX_EVAL="CC=gcc-6 CXX=g++-6"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-7
env:
- MATRIX_EVAL="CC=gcc-7 CXX=g++-7"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-8
env:
- MATRIX_EVAL="CC=gcc-8 CXX=g++-8"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-9
env:
- MATRIX_EVAL="CC=gcc-9 CXX=g++-9"
# Clang 3.5 through 5.0, from the named llvm-toolchain-* apt sources.
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-precise-3.5
packages:
- clang-3.5
env:
- MATRIX_EVAL="CC=clang-3.5 CXX=clang++-3.5"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-precise-3.6
packages:
- clang-3.6
env:
- MATRIX_EVAL="CC=clang-3.6 CXX=clang++-3.6"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-precise-3.7
packages:
- clang-3.7
env:
- MATRIX_EVAL="CC=clang-3.7 CXX=clang++-3.7"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-precise-3.8
packages:
- clang-3.8
env:
- MATRIX_EVAL="CC=clang-3.8 CXX=clang++-3.8"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-precise-3.9
packages:
- clang-3.9
env:
- MATRIX_EVAL="CC=clang-3.9 CXX=clang++-3.9"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-trusty-4.0
packages:
- clang-4.0
env:
- MATRIX_EVAL="CC=clang-4.0 CXX=clang++-4.0"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- llvm-toolchain-trusty-5.0
packages:
- clang-5.0
env:
- MATRIX_EVAL="CC=clang-5.0 CXX=clang++-5.0"
# Clang 6.0 through 8, from apt.llvm.org via an explicit sourceline and key.
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- sourceline: 'deb https://apt.llvm.org/trusty/ llvm-toolchain-trusty-6.0 main'
key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'
packages:
- clang-6.0
env:
- MATRIX_EVAL="CC=clang-6.0 CXX=clang++-6.0"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- sourceline: 'deb https://apt.llvm.org/trusty/ llvm-toolchain-trusty-7 main'
key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'
packages:
- clang-7
env:
- MATRIX_EVAL="CC=clang-7 CXX=clang++-7"
- os: linux
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- sourceline: 'deb https://apt.llvm.org/trusty/ llvm-toolchain-trusty-8 main'
key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'
packages:
- clang-8
env:
- MATRIX_EVAL="CC=clang-8 CXX=clang++-8"
# Evaluate the CC=... CXX=... assignments stored in the job's MATRIX_EVAL.
before_install:
- eval "${MATRIX_EVAL}"

13
extern/re2/AUTHORS vendored Normal file
View File

@ -0,0 +1,13 @@
# This is the official list of RE2 authors for copyright purposes.
# This file is distinct from the CONTRIBUTORS files.
# See the latter for an explanation.
# Names should be added to this file as
# Name or Organization <email address>
# The email address is not required for organizations.
# Please keep the list sorted.
Google Inc.
Samsung Electronics
Stefano Rivera <stefano.rivera@gmail.com>

239
extern/re2/BUILD vendored Normal file
View File

@ -0,0 +1,239 @@
# Copyright 2009 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Bazel (http://bazel.io/) BUILD file for RE2.
licenses(["notice"])
exports_files(["LICENSE"])
# Platform selectors referenced by the select() expressions of the re2 target.
# Matches builds whose "cpu" value is "darwin".
config_setting(
name = "darwin",
values = {"cpu": "darwin"},
)
# Matches builds whose "cpu" value is "x64_windows".
config_setting(
name = "windows",
values = {"cpu": "x64_windows"},
)
# Matches builds whose "cpu" value is "x64_windows_msvc".
config_setting(
name = "windows_msvc",
values = {"cpu": "x64_windows_msvc"},
)
# The RE2 library. The four headers under hdrs form the interface offered to
# dependents; the headers listed under srcs are used only to build the target.
cc_library(
name = "re2",
srcs = [
"re2/bitmap256.h",
"re2/bitstate.cc",
"re2/compile.cc",
"re2/dfa.cc",
"re2/filtered_re2.cc",
"re2/mimics_pcre.cc",
"re2/nfa.cc",
"re2/onepass.cc",
"re2/parse.cc",
"re2/perl_groups.cc",
"re2/prefilter.cc",
"re2/prefilter.h",
"re2/prefilter_tree.cc",
"re2/prefilter_tree.h",
"re2/prog.cc",
"re2/prog.h",
"re2/re2.cc",
"re2/regexp.cc",
"re2/regexp.h",
"re2/set.cc",
"re2/simplify.cc",
"re2/stringpiece.cc",
"re2/tostring.cc",
"re2/unicode_casefold.cc",
"re2/unicode_casefold.h",
"re2/unicode_groups.cc",
"re2/unicode_groups.h",
"re2/walker-inl.h",
"util/flags.h",
"util/logging.h",
"util/mix.h",
"util/mutex.h",
"util/pod_array.h",
"util/rune.cc",
"util/sparse_array.h",
"util/sparse_set.h",
"util/strutil.cc",
"util/strutil.h",
"util/utf.h",
"util/util.h",
],
hdrs = [
"re2/filtered_re2.h",
"re2/re2.h",
"re2/set.h",
"re2/stringpiece.h",
],
# Compile with -pthread everywhere except the two Windows configurations.
copts = select({
":windows": [],
":windows_msvc": [],
"//conditions:default": ["-pthread"],
}),
linkopts = select({
# Darwin doesn't need `-pthread' when linking and it appears that
# older versions of Clang will warn about the unused command line
# argument, so just don't pass it.
":darwin": [],
":windows": [],
":windows_msvc": [],
"//conditions:default": ["-pthread"],
}),
visibility = ["//visibility:public"],
)
# Test-only support library built on :re2; the :test and :benchmark targets
# in this file depend on it.
cc_library(
name = "testing",
testonly = 1,
srcs = [
"re2/testing/backtrack.cc",
"re2/testing/dump.cc",
"re2/testing/exhaustive_tester.cc",
"re2/testing/null_walker.cc",
"re2/testing/regexp_generator.cc",
"re2/testing/string_generator.cc",
"re2/testing/tester.cc",
"util/pcre.cc",
],
hdrs = [
"re2/testing/exhaustive_tester.h",
"re2/testing/regexp_generator.h",
"re2/testing/string_generator.h",
"re2/testing/tester.h",
"util/benchmark.h",
"util/pcre.h",
"util/test.h",
],
deps = [":re2"],
)
# Test-only library that compiles util/test.cc on top of :testing.
cc_library(
name = "test",
testonly = 1,
srcs = ["util/test.cc"],
deps = [":testing"],
)
# re2_test is loaded from re2_test.bzl in this package. Each call below
# declares one test by name together with its Bazel test size.
load(":re2_test.bzl", "re2_test")
# Tests declared with size = "small".
re2_test(
"charclass_test",
size = "small",
)
re2_test(
"compile_test",
size = "small",
)
re2_test(
"filtered_re2_test",
size = "small",
)
re2_test(
"mimics_pcre_test",
size = "small",
)
re2_test(
"parse_test",
size = "small",
)
re2_test(
"possible_match_test",
size = "small",
)
re2_test(
"re2_arg_test",
size = "small",
)
re2_test(
"re2_test",
size = "small",
)
re2_test(
"regexp_test",
size = "small",
)
re2_test(
"required_prefix_test",
size = "small",
)
re2_test(
"search_test",
size = "small",
)
re2_test(
"set_test",
size = "small",
)
re2_test(
"simplify_test",
size = "small",
)
re2_test(
"string_generator_test",
size = "small",
)
# Tests declared with size = "large".
re2_test(
"dfa_test",
size = "large",
)
re2_test(
"exhaustive1_test",
size = "large",
)
re2_test(
"exhaustive2_test",
size = "large",
)
re2_test(
"exhaustive3_test",
size = "large",
)
re2_test(
"exhaustive_test",
size = "large",
)
re2_test(
"random_test",
size = "large",
)
# Test-only library that compiles util/benchmark.cc on top of :testing.
cc_library(
name = "benchmark",
testonly = 1,
srcs = ["util/benchmark.cc"],
deps = [":testing"],
)
# Benchmark executable built from re2/testing/regexp_benchmark.cc and linked
# against :benchmark.
cc_binary(
name = "regexp_benchmark",
testonly = 1,
srcs = ["re2/testing/regexp_benchmark.cc"],
deps = [":benchmark"],
)

152
extern/re2/CMakeLists.txt vendored Normal file
View File

@ -0,0 +1,152 @@
# Copyright 2015 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Old enough to support Ubuntu Trusty.
cmake_minimum_required(VERSION 2.8.12)
# Select the NEW behaviour of policy CMP0048, but only on CMake versions that
# know about that policy.
if(POLICY CMP0048)
cmake_policy(SET CMP0048 NEW)
endif()
project(RE2 CXX)
include(CTest)
# User-facing switches; the first two default to OFF.
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
option(USEPCRE "use PCRE in tests and benchmarks" OFF)
# CMake seems to have no way to enable/disable testing per subproject,
# so we provide an option similar to BUILD_TESTING, but just for RE2.
option(RE2_BUILD_TESTING "enable testing for RE2" ON)
# Starts empty; the platform and PCRE sections below append to it, and every
# test and benchmark executable links against its contents.
set(EXTRA_TARGET_LINK_LIBRARIES)
# Compiler-specific settings: MSVC warning/encoding flags, otherwise the
# C++11 language standard flag.
if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
# MSVC_VERSION below 1900 is rejected with the Visual Studio 2015 message.
if(MSVC_VERSION LESS 1900)
message(FATAL_ERROR "you need Visual Studio 2015 or later")
endif()
if(BUILD_SHARED_LIBS)
# See http://www.kitware.com/blog/home/post/939 for details.
cmake_minimum_required(VERSION 3.4)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
# CMake defaults to /W3, but some users like /W4 (or /Wall) and /WX,
# so we disable various warnings that aren't particularly helpful.
add_compile_options(/wd4100 /wd4201 /wd4456 /wd4457 /wd4702 /wd4815)
# Without a byte order mark (BOM), Visual Studio assumes that the source
# file is encoded using the current user code page, so we specify UTF-8.
add_compile_options(/utf-8)
elseif(CYGWIN OR MINGW)
# See https://stackoverflow.com/questions/38139631 for details.
add_compile_options(-std=gnu++11)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
# GCC and Clang outside Cygwin/MinGW: plain C++11.
add_compile_options(-std=c++11)
endif()
# Platform settings: preprocessor definitions on Windows, -pthread for both
# compiling and linking on UNIX.
if(WIN32)
add_definitions(-DUNICODE -D_UNICODE -DSTRICT -DNOMINMAX)
add_definitions(-D_CRT_SECURE_NO_WARNINGS -D_SCL_SECURE_NO_WARNINGS)
elseif(UNIX)
add_compile_options(-pthread)
list(APPEND EXTRA_TARGET_LINK_LIBRARIES -pthread)
endif()
# With USEPCRE=ON, define USEPCRE for the sources and link the test and
# benchmark executables against pcre.
if(USEPCRE)
add_definitions(-DUSEPCRE)
list(APPEND EXTRA_TARGET_LINK_LIBRARIES pcre)
endif()
# Put the source root on the include path for every target defined here.
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
# Sources compiled into the re2 library.
set(RE2_SOURCES
re2/bitstate.cc
re2/compile.cc
re2/dfa.cc
re2/filtered_re2.cc
re2/mimics_pcre.cc
re2/nfa.cc
re2/onepass.cc
re2/parse.cc
re2/perl_groups.cc
re2/prefilter.cc
re2/prefilter_tree.cc
re2/prog.cc
re2/re2.cc
re2/regexp.cc
re2/set.cc
re2/simplify.cc
re2/stringpiece.cc
re2/tostring.cc
re2/unicode_casefold.cc
re2/unicode_groups.cc
util/rune.cc
util/strutil.cc
)
# No STATIC/SHARED keyword is given, so BUILD_SHARED_LIBS decides the type.
add_library(re2 ${RE2_SOURCES})
# Alias matching the re2:: namespace used by the exported config (see install).
add_library(re2::re2 ALIAS re2)
# Test and benchmark executables; built only when RE2_BUILD_TESTING is ON.
if(RE2_BUILD_TESTING)
# Support code linked into every test and benchmark as a static library.
set(TESTING_SOURCES
re2/testing/backtrack.cc
re2/testing/dump.cc
re2/testing/exhaustive_tester.cc
re2/testing/null_walker.cc
re2/testing/regexp_generator.cc
re2/testing/string_generator.cc
re2/testing/tester.cc
util/pcre.cc
)
add_library(testing STATIC ${TESTING_SOURCES})
# Each name doubles as the executable name and the stem of its source file
# under re2/testing/.
set(TEST_TARGETS
charclass_test
compile_test
filtered_re2_test
mimics_pcre_test
parse_test
possible_match_test
re2_test
re2_arg_test
regexp_test
required_prefix_test
search_test
set_test
simplify_test
string_generator_test
dfa_test
exhaustive1_test
exhaustive2_test
exhaustive3_test
exhaustive_test
random_test
)
set(BENCHMARK_TARGETS
regexp_benchmark
)
# Tests are compiled together with util/test.cc and registered with CTest.
foreach(target ${TEST_TARGETS})
add_executable(${target} re2/testing/${target}.cc util/test.cc)
target_link_libraries(${target} testing re2 ${EXTRA_TARGET_LINK_LIBRARIES})
add_test(NAME ${target} COMMAND ${target})
endforeach(target)
# Benchmarks are compiled together with util/benchmark.cc and are not
# registered as tests.
foreach(target ${BENCHMARK_TARGETS})
add_executable(${target} re2/testing/${target}.cc util/benchmark.cc)
target_link_libraries(${target} testing re2 ${EXTRA_TARGET_LINK_LIBRARIES})
endforeach(target)
endif()
# Headers installed under include/re2.
set(RE2_HEADERS
re2/filtered_re2.h
re2/re2.h
re2/set.h
re2/stringpiece.h
)
install(FILES ${RE2_HEADERS} DESTINATION include/re2)
# Install the library and export it as re2Config; the export file is
# installed to lib/cmake/re2 with targets placed in the re2:: namespace.
install(TARGETS re2 EXPORT re2Config ARCHIVE DESTINATION lib LIBRARY DESTINATION lib RUNTIME DESTINATION bin INCLUDES DESTINATION include)
install(EXPORT re2Config DESTINATION lib/cmake/re2 NAMESPACE re2::)

2
extern/re2/CONTRIBUTING.md vendored Normal file
View File

@ -0,0 +1,2 @@
RE2 uses Gerrit instead of GitHub pull requests.
See the [Contributing](https://github.com/google/re2/wiki/Contribute) wiki page.

41
extern/re2/CONTRIBUTORS vendored Normal file
View File

@ -0,0 +1,41 @@
# This is the official list of people who can contribute
# (and typically have contributed) code to the RE2 repository.
# The AUTHORS file lists the copyright holders; this file
# lists people. For example, Google employees are listed here
# but not in AUTHORS, because Google holds the copyright.
#
# The submission process automatically checks to make sure
# that people submitting code are listed in this file (by email address).
#
# Names should be added to this file only after verifying that
# the individual or the individual's organization has agreed to
# the appropriate Contributor License Agreement, found here:
#
# http://code.google.com/legal/individual-cla-v1.0.html
# http://code.google.com/legal/corporate-cla-v1.0.html
#
# The agreement for individuals can be filled out on the web.
#
# When adding J Random Contributor's name to this file,
# either J's name or J's organization's name should be
# added to the AUTHORS file, depending on whether the
# individual or corporate CLA was used.
# Names should be added to this file like so:
# Name <email address>
# Please keep the list sorted.
Dominic Battré <battre@chromium.org>
Doug Kwan <dougkwan@google.com>
Dmitriy Vyukov <dvyukov@google.com>
John Millikin <jmillikin@gmail.com>
Mike Nazarewicz <mpn@google.com>
Nico Weber <thakis@chromium.org>
Pawel Hajdan <phajdan.jr@gmail.com>
Rob Pike <r@google.com>
Russ Cox <rsc@swtch.com>
Sanjay Ghemawat <sanjay@google.com>
Stefano Rivera <stefano.rivera@gmail.com>
Srinivasan Venkatachary <vsri@google.com>
Viatcheslav Ostapenko <sl.ostapenko@samsung.com>

27
extern/re2/LICENSE vendored Normal file
View File

@ -0,0 +1,27 @@
// Copyright (c) 2009 The RE2 Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

38
extern/re2/README vendored Normal file
View File

@ -0,0 +1,38 @@
This is the source code repository for RE2, a regular expression library.
For documentation about how to install and use RE2,
visit https://github.com/google/re2/.
The short version is:
make
make test
make install
make testinstall
There is a fair amount of documentation (including code snippets) in
the re2.h header file.
More information can be found on the wiki:
https://github.com/google/re2/wiki
Issue tracker:
https://github.com/google/re2/issues
Mailing list:
https://groups.google.com/group/re2-dev
Unless otherwise noted, the RE2 source files are distributed
under the BSD-style license found in the LICENSE file.
RE2's native language is C++.
A C wrapper is at https://github.com/marcomaggi/cre2/.
An Erlang wrapper is at https://github.com/dukesoferl/re2/ and on Hex (hex.pm).
An Inferno wrapper is at https://github.com/powerman/inferno-re2/.
A Node.js wrapper is at https://github.com/uhop/node-re2/ and on NPM (npmjs.com).
An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM (opam.ocaml.org).
A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN (cpan.org).
A Python wrapper is at https://github.com/facebook/pyre2/ and on PyPI (pypi.org).
An R wrapper is at https://github.com/qinwf/re2r/ and on CRAN (cran.r-project.org).
A Ruby wrapper is at https://github.com/mudge/re2/ and on RubyGems (rubygems.org).

6
extern/re2/WORKSPACE vendored Normal file
View File

@ -0,0 +1,6 @@
# Copyright 2009 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Bazel (http://bazel.io/) WORKSPACE file for RE2.
workspace(name = "com_googlesource_code_re2")

98
extern/re2/benchlog/benchplot.py vendored Normal file
View File

@ -0,0 +1,98 @@
#!/usr/bin/env python
import argparse # for ArgumentParser
import subprocess # for Popen
import tempfile # for NamedTemporaryFile
import os # for remove
class gnuplot(object):
    """Turn an RE2 benchlog into a PNG plot by driving the gnuplot program.

    Intended to be used as a context manager:

        with gnuplot() as plot:
            plot.output = "name"
            plot.parse_re2_benchlog("benchlog.host")
            plot.run()

    Leaving the with-block deletes the temporary CSV files that gen_csv()
    created.
    """

    # Base name of the image; run() substitutes it into the
    # 'set output "{}.png"' line of the script below.
    output = "result.png"

    # gnuplot script header.  gen_csv() appends one plot clause per benchmark.
    script = """
set terminal png size 1024, 768
set output "{}.png"
set title "re2 benchlog"
set datafile separator ";"
set grid x y
set ylabel "MB/s"
set autoscale
plot """

    # One plot clause: CSV file name and curve title.  It ends with the three
    # characters comma, backslash, newline, which run() strips from the last
    # clause.
    template = """'{}' using 1:5:xticlabels(2) with linespoints linewidth 3 title "{}",\\\n"""

    def __init__(self):
        # These containers are mutated in place, so they must belong to the
        # instance: as class attributes they would be shared by every
        # gnuplot object, mixing data and temp files between plots.
        self.benchdata = dict()
        self.tempfiles = []

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        """
        Remove all temporary files created by gen_csv().
        """
        for filename in self.tempfiles:
            os.remove(filename)
        # Forget the removed names so a second exit does not try again.
        del self.tempfiles[:]

    def parse_re2_benchlog(self, filename):
        """
        Parse the benchlog `filename` into self.benchdata and return it.

        Only lines made of exactly four tab-separated fields are used.  The
        first field is split on '/'; its first component becomes the
        dictionary key and all remaining (whitespace-stripped) fields are
        appended to that key's list as one row.
        """
        benchdata = self.benchdata
        with open(filename) as f:
            for raw in f:
                data = raw.split('\t')
                if len(data) != 4:
                    continue
                data = data[0].split('/') + data[1:]
                data = list(map(str.strip, data))
                benchdata.setdefault(data[0], []).append(data[1:])
        return benchdata

    def gen_csv(self):
        """
        Write one temporary ';'-separated file per benchmark.

        Each row is prefixed with its index.  The file name is recorded in
        self.tempfiles and a matching plot clause is appended to self.script.
        """
        for name, data in self.benchdata.items():
            with tempfile.NamedTemporaryFile(delete=False) as f:
                for index, line in enumerate(data):
                    f.write('{};{}\n'.format(index, ';'.join(line)).encode())
                self.tempfiles.append(f.name)
                self.script = self.script + self.template.format(f.name, name)

    def run(self):
        """
        Generate the CSV files and pipe the finished script to gnuplot.
        """
        if not self.benchdata:
            # Without data no plot clause is appended, so stripping the
            # trailing three characters would corrupt the "plot" command.
            print('no benchmark data to plot')
            return
        self.gen_csv()
        # Drop the trailing ",\<newline>" left by the last plot clause.
        script = self.script[:-3].format(self.output)
        command = subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE)
        command.communicate(script.encode())
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='generate plots for benchlog')
    parser.add_argument('benchlog', type=str, help='benchlog generated by re2')
    args = parser.parse_args()

    # Probe for the gnuplot binary up front so that a missing installation
    # produces a hint instead of a traceback later on.
    try:
        probe = subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE)
    except FileNotFoundError:
        print('you can install "gnuplot" to generate plots automatically')
        # SystemExit works even when the site-provided exit() is unavailable.
        raise SystemExit(1)
    else:
        # Close the probe's stdin and wait for it, so the probing process
        # and its pipe are not leaked while the real plot is produced.
        probe.communicate()

    # The benchlog file name doubles as the base name of the output image.
    with gnuplot() as plot:
        plot.output = args.benchlog
        plot.parse_re2_benchlog(args.benchlog)
        plot.run()

155
extern/re2/benchlog/mktable vendored Normal file
View File

@ -0,0 +1,155 @@
#!/usr/bin/perl
# XXX
# Print an HTML table comparing PCRE and RE2 timings for one benchmark.
# $name is a sprintf format; it is expanded once with "PCRE" and once with
# "RE2" to form the benchmark names looked up in %data.  One row is printed
# per system in @sys, with the ns/op values divided by 1000 and shown as µs.
sub table() {
my ($name) = @_;
print <<'EOF';
<table border=0>
<tr><th>System</th><th>PCRE</th><th>RE2</th></tr>
EOF
foreach my $sys (@sys) {
my $ns_pcre = $data{$sys}->{sprintf($name, "PCRE")}->{'ns/op'};
my $ns_re2 = $data{$sys}->{sprintf($name, "RE2")}->{'ns/op'};
printf "<tr><td>%s</td><td>%.1f µs</td><td>%.1f µs</td></tr>\n", $sysname{$sys}, $ns_pcre/1000., $ns_re2/1000.;
}
print <<'EOF';
<tr height=5><td colspan=3></td></tr>
</table>
EOF
}
# Size suffixes of the benchmark names plotted by graph(); the index of an
# entry is its x coordinate.
@sizes = (
"8", "16", "32", "64", "128", "256", "512",
"1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K",
"1M", "2M", "4M", "8M", "16M"
);
# Value written after "color" on each engine's jgraph curve line
# (presumably an RGB triple -- confirm against the jgraph manual).
%color = (
"PCRE" => "0.7 0 0",
"RE2" => "0 0 1",
);
# Count of graphs generated so far; graph() uses it to number output files.
$ngraph = 0;
# Render one PCRE-vs-RE2 throughput graph and print an <img> tag for it.
# $name is a benchmark name prefix: for each engine and each entry of @sizes
# the key "<name><engine>/<size>" is looked up in the data of the "wreck"
# system and its MB/s value is plotted.  The jgraph input is written to
# regexp3g<N>.jgr; jgraph and gs are then run to produce the .eps and .png.
sub graph() {
my ($name) = @_;
my $sys = "wreck";
my $base = sprintf("regexp3g%d", ++$ngraph);
open(JGR, ">$base.jgr") || die "open >$base.jgr: $!";
printf JGR "bbox -20 -12 392 95\n";
printf JGR "newgraph clip x_translate 0.25 y_translate 0.25\n";
# Largest value seen, and the last plotted point of each curve.
$ymax = 0;
%lastx = ();
%lasty = ();
foreach my $who ("PCRE", "RE2") {
printf JGR "newcurve pts\n";
for(my $i=0; $i<@sizes; $i++) {
my $key = sprintf("%s%s/%s", $name, $who, $sizes[$i]);
my $val = $data{$sys}->{$key}->{'MB/s'};
# Sizes without a MB/s measurement are left out of the curve.
next if !defined($val);
if($val > $ymax) {
$ymax = $val;
}
$lastx{$who} = $i;
$lasty{$who} = $val;
printf JGR "$i %f (* %s *)\n", $val, $key;
}
my $color = $color{$who};
printf JGR "marktype none color $color linethickness 2 linetype solid label : $who\n";
}
my $n = @sizes;
printf JGR "xaxis min -1 max $n size 5 label : text size (bytes)\n";
printf JGR " no_auto_hash_marks hash_labels fontsize 9\n";
# Label every third size on the x axis.
for($i=0; $i<@sizes; $i+=3) {
printf JGR " hash_at $i hash_label at $i : $sizes[$i]\n";
}
# Choose the y-axis maximum: starting from 1, scale $y by ten while
# 10*$y <= $ymax, then take the smallest multiple (2..10) of it that
# exceeds $ymax.
my $y = 1;
while(10*$y <= $ymax) {
$y = 10*$y;
}
for($i=2; $i<=10; $i++) {
if($i*$y > $ymax) {
$y = $i*$y;
last;
}
}
# Label each curve slightly to the right of its last plotted point.  A label
# that would sit below 5% of the axis maximum is moved up to that height and
# switched from the "vjc" to the "vjb" justification.
foreach my $who ("PCRE", "RE2") {
$x1 = $lastx{$who};
$y1 = $lasty{$who};
$x1 *= 1.01;
my $v = "vjc";
if($y1 < 0.05 * $y) {
$v = "vjb";
$y1 = 0.05 * $y;
}
printf JGR "newstring x $x1 y $y1 hjl $v : $who\n";
}
printf JGR "yaxis min 0 max $y size 1 label : speed (MB/s)\n";
printf JGR " hash_labels fontsize 9\n";
# printf JGR "legend defaults font Times-Roman fontsize 10 x 0 y $y hjl vjt\n";
# Convert the jgraph description to EPS, then rasterize it to PNG with gs.
# The exit status of both commands is ignored.
system("jgraph $base.jgr >$base.eps"); # die "system: $!";
system("gs -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dEPSCrop -sDEVICE=png16m -r100 -sOutputFile=$base.png -dBATCH -dQUIT -dQUIET -dNOPAUSE $base.eps");
printf "<img src=$base.png>\n"
}
# Consume input lines up to and including the next line that starts with
# "<!-- -->".  Only that terminating line is printed; the lines before it
# are dropped.
sub skip() {
while(<>) {
if(/^<!-- -->/) {
print;
last;
}
}
}
# Systems whose benchlog.<sys> files are loaded, in table row order.
@sys = ("r70", "c2", "wreck", "mini");
# Description shown in the "System" column of table() for each system.
%sysname = (
"r70" => "AMD Opteron 8214 HE, 2.2 GHz",
"c2" => "Intel Core2 Duo E7200, 2.53 GHz",
"wreck" => "Intel Xeon 5150, 2.66 GHz (Mac Pro)",
"mini" => "Intel Core2 T5600, 1.83 GHz (Mac Mini)",
);
# Generators selectable by name from "<!-- benchlog NAME ... -->" markers.
%func = (
"table" => \&table,
"graph" => \&graph,
);
# Load benchlog.<sys> for every system into %data, so that
# $data{<sys>}{<benchmark name>} is a hash with the keys "name", "iter",
# "ns/op" and, when the line reports one, "MB/s".  Lines that do not match
# the "<name> <iterations> <n> ns/op" pattern are ignored.
foreach my $sys (@sys) {
open(F, "benchlog.$sys") || die "open benchlog.$sys: $!";
my %sysdat;
while(<F>) {
if(/^([A-Za-z0-9_\/]+)\s+(\d+)\s+(\d+) ns\/op/) {
my %row;
$row{"name"} = $1;
$row{"iter"} = $2;
$row{"ns/op"} = $3;
if(/([\d.]+) MB\/s/){
$row{"MB/s"} = $1;
}
$sysdat{$row{"name"}} = \%row;
}
}
close F;
$data{$sys} = \%sysdat;
}
# Filter the input to stdout.  Every line is echoed.  When a line is a
# "<!-- benchlog FUNC -->" or "<!-- benchlog FUNC ARG -->" marker, the named
# generator from %func is called (with ARG when given) and skip() then drops
# the following input up to the next "<!-- -->" line.
while(<>) {
print;
if(/^<!-- benchlog (\w+) -->/) {
$func{$1}();
skip();
next;
}
if(/^<!-- benchlog (\w+) ([%\w]+) -->/) {
$func{$1}($2);
skip();
next;
}
}

1
extern/re2/doc/README.xkcd vendored Normal file
View File

@ -0,0 +1 @@
xkcd.png is a cropped version of http://xkcd.com/208/

41
extern/re2/doc/mksyntaxgo vendored Normal file
View File

@ -0,0 +1,41 @@
#!/bin/sh
# Regenerate the documentation file of Go's regexp/syntax package from
# syntax.txt: the text is copied to doc.go under $GOROOT and then rewritten
# in place by the sam command script in the here-document below.
# Requires GOROOT to be set and must be run from the directory that holds
# syntax.txt.
set -e
out=$GOROOT/src/regexp/syntax/doc.go
cp syntax.txt $out
sam -d $out <<'!'
,x g/NOT SUPPORTED/d
/^Unicode character class/,$d
,s/[«»]//g
,x g/^Possessive repetitions:/d
,x g/\\C/d
,x g/Flag syntax/d
,s/.=(true|false)/flag &/g
,s/^Flags:/ Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:\n/
,s/\n\n\n+/\n\n/g
,x/(^.* .*\n)+/ | awk -F' ' '{printf(" %-14s %s\n", $1, $2)}'
1,2c
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution.
/*
Package syntax parses regular expressions into parse trees and compiles
parse trees into programs. Most clients of regular expressions will use the
facilities of package regexp (such as Compile and Match) instead of this package.
Syntax
The regular expression syntax understood by this package when parsing with the Perl flag is as follows.
Parts of the syntax can be disabled by passing alternate flags to Parse.
.
$a
*/
package syntax
.
w
q
!

42
extern/re2/doc/mksyntaxhtml vendored Normal file
View File

@ -0,0 +1,42 @@
#!/bin/sh
# Generate syntax.html from syntax.txt: the text is copied and then rewritten
# in place by the sam command script in the here-document below, which
# escapes HTML metacharacters, turns the entries into table rows and wraps
# the result in an HTML page.  Must be run from the directory that holds
# syntax.txt.
cp syntax.txt syntax.html
sam -d syntax.html <<'!'
,s/\&/\&amp;/g
,s/</\&lt;/g
,s/>/\&gt;/g
,s!== (([^()]|\([^()]*\))*)!≡ <code>\1</code>!g
,s!«!<code>!g
,s!»!</code>!g
,s! vim$! <font size=-2>VIM</font>!g
,s! pcre$! <font size=-2>PCRE</font>!g
,s! perl$! <font size=-2>PERL</font>!g
,x g/NOT SUPPORTED/ s!^[^ ]+!<font color=#808080>&</font>!
,s!NOT SUPPORTED!!g
,s!(^[^ ]+) (.*)\n!<tr><td><code>\1</code></td><td>\2</td></tr>\n!g
,s!.*:$!<b>&</b>!g
,s!^$!<tr><td></td></tr>!g
,x v/<tr>/ s!.*!<tr><td colspan=2>&</td></tr>!
1,2c
<html>
<!-- AUTOMATICALLY GENERATED by mksyntaxhtml -->
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>RE2 regular expression syntax reference</title>
</head>
<body>
<h1>RE2 regular expression syntax reference</h1>
<table border=0 cellpadding=2 cellspacing=2>
<tr><td colspan=2>This page lists the regular expression syntax accepted by RE2.</td></tr>
<tr><td colspan=2>It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
<tr><td colspan=2>Grayed out expressions are not supported by RE2.</td></tr>
.
$a
</table>
</body>
</html>
.
w
q
!

36
extern/re2/doc/mksyntaxwiki vendored Normal file
View File

@ -0,0 +1,36 @@
#!/bin/sh
# Generate syntax.wiki from syntax.txt: the text is copied and then rewritten
# in place by the sam command script in the here-document below, which marks
# up the entries, turns them into table rows and adds the wiki page header.
# Must be run from the directory that holds syntax.txt.
cp syntax.txt syntax.wiki
sam -d syntax.wiki <<'!'
,s!`!`````!g
,s!== (([^()]|\([^()]*\))*)!≡ `\1`!g
,s!«!`!g
,s!»!`!g
,s! vim$! <font size="1">VIM</font>!g
,s! pcre$! <font size="1">PCRE</font>!g
,s! perl$! <font size="1">PERL</font>!g
,s!(^[^ ]+) (.*)\n!`\1` \2\n!g
,x g/NOT SUPPORTED/ s!^[^ ]+!<font color="#808080">&</font>!
,s!NOT SUPPORTED!<font size="1">(&)</font>!g
,s!(^[^ ]+) (.*)\n!<tr><td>\1</td><td>\2</td></tr>\n!g
,s!.*:$!<b>&</b>!g
,s!^$!<tr><td></td></tr>!g
,x v/<tr>/ s!.*!<tr><td colspan="2">&</td></tr>!
1,2c
#summary I define UNIX as “30 definitions of regular expressions living under one roof.” —Don Knuth
<wiki:comment>
GENERATED BY mksyntaxwiki. DO NOT EDIT
</wiki:comment>
<table border="0" cellpadding="2" cellspacing="2">
<tr><td colspan="2">This page lists the regular expression syntax accepted by RE2.</td></tr>
<tr><td colspan="2">It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
<tr><td colspan="2">Grayed out expressions are not supported by RE2.</td></tr>
.
$a
</table>
.
w
q
!

409
extern/re2/doc/syntax.html vendored Normal file
View File

@ -0,0 +1,409 @@
<html>
<!-- AUTOMATICALLY GENERATED by mksyntaxhtml -->
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>RE2 regular expression syntax reference</title>
</head>
<body>
<h1>RE2 regular expression syntax reference</h1>
<table border=0 cellpadding=2 cellspacing=2>
<tr><td colspan=2>This page lists the regular expression syntax accepted by RE2.</td></tr>
<tr><td colspan=2>It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
<tr><td colspan=2>Grayed out expressions are not supported by RE2.</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Single characters:</b></td></tr>
<tr><td><code>.</code></td><td>any character, possibly including newline (s=true)</td></tr>
<tr><td><code>[xyz]</code></td><td>character class</td></tr>
<tr><td><code>[^xyz]</code></td><td>negated character class</td></tr>
<tr><td><code>\d</code></td><td>Perl character class</td></tr>
<tr><td><code>\D</code></td><td>negated Perl character class</td></tr>
<tr><td><code>[[:alpha:]]</code></td><td>ASCII character class</td></tr>
<tr><td><code>[[:^alpha:]]</code></td><td>negated ASCII character class</td></tr>
<tr><td><code>\pN</code></td><td>Unicode character class (one-letter name)</td></tr>
<tr><td><code>\p{Greek}</code></td><td>Unicode character class</td></tr>
<tr><td><code>\PN</code></td><td>negated Unicode character class (one-letter name)</td></tr>
<tr><td><code>\P{Greek}</code></td><td>negated Unicode character class</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Composites:</b></td></tr>
<tr><td><code>xy</code></td><td><code>x</code> followed by <code>y</code></td></tr>
<tr><td><code>x|y</code></td><td><code>x</code> or <code>y</code> (prefer <code>x</code>)</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Repetitions:</b></td></tr>
<tr><td><code>x*</code></td><td>zero or more <code>x</code>, prefer more</td></tr>
<tr><td><code>x+</code></td><td>one or more <code>x</code>, prefer more</td></tr>
<tr><td><code>x?</code></td><td>zero or one <code>x</code>, prefer one</td></tr>
<tr><td><code>x{n,m}</code></td><td><code>n</code> or <code>n</code>+1 or ... or <code>m</code> <code>x</code>, prefer more</td></tr>
<tr><td><code>x{n,}</code></td><td><code>n</code> or more <code>x</code>, prefer more</td></tr>
<tr><td><code>x{n}</code></td><td>exactly <code>n</code> <code>x</code></td></tr>
<tr><td><code>x*?</code></td><td>zero or more <code>x</code>, prefer fewer</td></tr>
<tr><td><code>x+?</code></td><td>one or more <code>x</code>, prefer fewer</td></tr>
<tr><td><code>x??</code></td><td>zero or one <code>x</code>, prefer zero</td></tr>
<tr><td><code>x{n,m}?</code></td><td><code>n</code> or <code>n</code>+1 or ... or <code>m</code> <code>x</code>, prefer fewer</td></tr>
<tr><td><code>x{n,}?</code></td><td><code>n</code> or more <code>x</code>, prefer fewer</td></tr>
<tr><td><code>x{n}?</code></td><td>exactly <code>n</code> <code>x</code></td></tr>
<tr><td><code><font color=#808080>x{}</font></code></td><td>(≡ <code>x*</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>x{-}</font></code></td><td>(≡ <code>x*?</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>x{-n}</font></code></td><td>(≡ <code>x{n}?</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>x=</font></code></td><td>(≡ <code>x?</code>) <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Possessive repetitions:</b></td></tr>
<tr><td><code><font color=#808080>x*+</font></code></td><td>zero or more <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x++</font></code></td><td>one or more <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x?+</font></code></td><td>zero or one <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x{n,m}+</font></code></td><td><code>n</code> or ... or <code>m</code> <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x{n,}+</font></code></td><td><code>n</code> or more <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x{n}+</font></code></td><td>exactly <code>n</code> <code>x</code>, possessive </td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Grouping:</b></td></tr>
<tr><td><code>(re)</code></td><td>numbered capturing group</td></tr>
<tr><td><code>(?P&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group</td></tr>
<tr><td><code><font color=#808080>(?&lt;name&gt;re)</font></code></td><td>named &amp; numbered capturing group </td></tr>
<tr><td><code><font color=#808080>(?'name're)</font></code></td><td>named &amp; numbered capturing group </td></tr>
<tr><td><code>(?:re)</code></td><td>non-capturing group</td></tr>
<tr><td><code>(?flags)</code></td><td>set flags within current group; non-capturing</td></tr>
<tr><td><code>(?flags:re)</code></td><td>set flags during re; non-capturing</td></tr>
<tr><td><code><font color=#808080>(?#text)</font></code></td><td>comment </td></tr>
<tr><td><code><font color=#808080>(?|x|y|z)</font></code></td><td>branch numbering reset </td></tr>
<tr><td><code><font color=#808080>(?&gt;re)</font></code></td><td>possessive match of <code>re</code> </td></tr>
<tr><td><code><font color=#808080>re@&gt;</font></code></td><td>possessive match of <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>%(re)</font></code></td><td>non-capturing group <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Flags:</b></td></tr>
<tr><td><code>i</code></td><td>case-insensitive (default false)</td></tr>
<tr><td><code>m</code></td><td>multi-line mode: <code>^</code> and <code>$</code> match begin/end line in addition to begin/end text (default false)</td></tr>
<tr><td><code>s</code></td><td>let <code>.</code> match <code>\n</code> (default false)</td></tr>
<tr><td><code>U</code></td><td>ungreedy: swap meaning of <code>x*</code> and <code>x*?</code>, <code>x+</code> and <code>x+?</code>, etc (default false)</td></tr>
<tr><td colspan=2>Flag syntax is <code>xyz</code> (set) or <code>-xyz</code> (clear) or <code>xy-z</code> (set <code>xy</code>, clear <code>z</code>).</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Empty strings:</b></td></tr>
<tr><td><code>^</code></td><td>at beginning of text or line (<code>m</code>=true)</td></tr>
<tr><td><code>$</code></td><td>at end of text (like <code>\z</code> not <code>\Z</code>) or line (<code>m</code>=true)</td></tr>
<tr><td><code>\A</code></td><td>at beginning of text</td></tr>
<tr><td><code>\b</code></td><td>at word boundary (<code>\w</code> on one side and <code>\W</code>, <code>\A</code>, or <code>\z</code> on the other)</td></tr>
<tr><td><code>\B</code></td><td>not a word boundary</td></tr>
<tr><td><code><font color=#808080>\G</font></code></td><td>at beginning of subtext being searched <font size=-2>PCRE</font></td></tr>
<tr><td><code><font color=#808080>\G</font></code></td><td>at end of last match <font size=-2>PERL</font></td></tr>
<tr><td><code><font color=#808080>\Z</font></code></td><td>at end of text, or before newline at end of text </td></tr>
<tr><td><code>\z</code></td><td>at end of text</td></tr>
<tr><td><code><font color=#808080>(?=re)</font></code></td><td>before text matching <code>re</code> </td></tr>
<tr><td><code><font color=#808080>(?!re)</font></code></td><td>before text not matching <code>re</code> </td></tr>
<tr><td><code><font color=#808080>(?&lt;=re)</font></code></td><td>after text matching <code>re</code> </td></tr>
<tr><td><code><font color=#808080>(?&lt;!re)</font></code></td><td>after text not matching <code>re</code> </td></tr>
<tr><td><code><font color=#808080>re&amp;</font></code></td><td>before text matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>re@=</font></code></td><td>before text matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>re@!</font></code></td><td>before text not matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>re@&lt;=</font></code></td><td>after text matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>re@&lt;!</font></code></td><td>after text not matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\zs</font></code></td><td>sets start of match (= \K) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\ze</font></code></td><td>sets end of match <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%^</font></code></td><td>beginning of file <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%$</font></code></td><td>end of file <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%V</font></code></td><td>on screen <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%#</font></code></td><td>cursor position <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%'m</font></code></td><td>mark <code>m</code> position <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%23l</font></code></td><td>in line 23 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%23c</font></code></td><td>in column 23 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%23v</font></code></td><td>in virtual column 23 <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Escape sequences:</b></td></tr>
<tr><td><code>\a</code></td><td>bell (≡ <code>\007</code>)</td></tr>
<tr><td><code>\f</code></td><td>form feed (≡ <code>\014</code>)</td></tr>
<tr><td><code>\t</code></td><td>horizontal tab (≡ <code>\011</code>)</td></tr>
<tr><td><code>\n</code></td><td>newline (≡ <code>\012</code>)</td></tr>
<tr><td><code>\r</code></td><td>carriage return (≡ <code>\015</code>)</td></tr>
<tr><td><code>\v</code></td><td>vertical tab character (≡ <code>\013</code>)</td></tr>
<tr><td><code>\*</code></td><td>literal <code>*</code>, for any punctuation character <code>*</code></td></tr>
<tr><td><code>\123</code></td><td>octal character code (up to three digits)</td></tr>
<tr><td><code>\x7F</code></td><td>hex character code (exactly two digits)</td></tr>
<tr><td><code>\x{10FFFF}</code></td><td>hex character code</td></tr>
<tr><td><code>\C</code></td><td>match a single byte even in UTF-8 mode</td></tr>
<tr><td><code>\Q...\E</code></td><td>literal text <code>...</code> even if <code>...</code> has punctuation</td></tr>
<tr><td></td></tr>
<tr><td><code><font color=#808080>\1</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\b</font></code></td><td>backspace (use <code>\010</code>)</td></tr>
<tr><td><code><font color=#808080>\cK</font></code></td><td>control char ^K (use <code>\001</code> etc)</td></tr>
<tr><td><code><font color=#808080>\e</font></code></td><td>escape (use <code>\033</code>)</td></tr>
<tr><td><code><font color=#808080>\g1</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\g{1}</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\g{+1}</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\g{-1}</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\g{name}</font></code></td><td>named backreference </td></tr>
<tr><td><code><font color=#808080>\g&lt;name&gt;</font></code></td><td>subroutine call </td></tr>
<tr><td><code><font color=#808080>\g'name'</font></code></td><td>subroutine call </td></tr>
<tr><td><code><font color=#808080>\k&lt;name&gt;</font></code></td><td>named backreference </td></tr>
<tr><td><code><font color=#808080>\k'name'</font></code></td><td>named backreference </td></tr>
<tr><td><code><font color=#808080>\lX</font></code></td><td>lowercase <code>X</code> </td></tr>
<tr><td><code><font color=#808080>\ux</font></code></td><td>uppercase <code>x</code> </td></tr>
<tr><td><code><font color=#808080>\L...\E</font></code></td><td>lowercase text <code>...</code> </td></tr>
<tr><td><code><font color=#808080>\K</font></code></td><td>reset beginning of <code>$0</code> </td></tr>
<tr><td><code><font color=#808080>\N{name}</font></code></td><td>named Unicode character </td></tr>
<tr><td><code><font color=#808080>\R</font></code></td><td>line break </td></tr>
<tr><td><code><font color=#808080>\U...\E</font></code></td><td>upper case text <code>...</code> </td></tr>
<tr><td><code><font color=#808080>\X</font></code></td><td>extended Unicode sequence </td></tr>
<tr><td></td></tr>
<tr><td><code><font color=#808080>\%d123</font></code></td><td>decimal character 123 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%xFF</font></code></td><td>hex character FF <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%o123</font></code></td><td>octal character 123 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%u1234</font></code></td><td>Unicode character 0x1234 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%U12345678</font></code></td><td>Unicode character 0x12345678 <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Character class elements:</b></td></tr>
<tr><td><code>x</code></td><td>single character</td></tr>
<tr><td><code>A-Z</code></td><td>character range (inclusive)</td></tr>
<tr><td><code>\d</code></td><td>Perl character class</td></tr>
<tr><td><code>[:foo:]</code></td><td>ASCII character class <code>foo</code></td></tr>
<tr><td><code>\p{Foo}</code></td><td>Unicode character class <code>Foo</code></td></tr>
<tr><td><code>\pF</code></td><td>Unicode character class <code>F</code> (one-letter name)</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Named character classes as character class elements:</b></td></tr>
<tr><td><code>[\d]</code></td><td>digits (≡ <code>\d</code>)</td></tr>
<tr><td><code>[^\d]</code></td><td>not digits (≡ <code>\D</code>)</td></tr>
<tr><td><code>[\D]</code></td><td>not digits (≡ <code>\D</code>)</td></tr>
<tr><td><code>[^\D]</code></td><td>not not digits (≡ <code>\d</code>)</td></tr>
<tr><td><code>[[:name:]]</code></td><td>named ASCII class inside character class (≡ <code>[:name:]</code>)</td></tr>
<tr><td><code>[^[:name:]]</code></td><td>named ASCII class inside negated character class (≡ <code>[:^name:]</code>)</td></tr>
<tr><td><code>[\p{Name}]</code></td><td>named Unicode property inside character class (≡ <code>\p{Name}</code>)</td></tr>
<tr><td><code>[^\p{Name}]</code></td><td>named Unicode property inside negated character class (≡ <code>\P{Name}</code>)</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Perl character classes:</b></td></tr>
<tr><td><code>\d</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr>
<tr><td><code>\D</code></td><td>not digits (≡ <code>[^0-9]</code>)</td></tr>
<tr><td><code>\s</code></td><td>whitespace (≡ <code>[\t\n\f\r ]</code>)</td></tr>
<tr><td><code>\S</code></td><td>not whitespace (≡ <code>[^\t\n\f\r ]</code>)</td></tr>
<tr><td><code>\w</code></td><td>word characters (≡ <code>[0-9A-Za-z_]</code>)</td></tr>
<tr><td><code>\W</code></td><td>not word characters (≡ <code>[^0-9A-Za-z_]</code>)</td></tr>
<tr><td></td></tr>
<tr><td><code><font color=#808080>\h</font></code></td><td>horizontal space </td></tr>
<tr><td><code><font color=#808080>\H</font></code></td><td>not horizontal space </td></tr>
<tr><td><code><font color=#808080>\v</font></code></td><td>vertical space </td></tr>
<tr><td><code><font color=#808080>\V</font></code></td><td>not vertical space </td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>ASCII character classes:</b></td></tr>
<tr><td><code>[[:alnum:]]</code></td><td>alphanumeric (≡ <code>[0-9A-Za-z]</code>)</td></tr>
<tr><td><code>[[:alpha:]]</code></td><td>alphabetic (≡ <code>[A-Za-z]</code>)</td></tr>
<tr><td><code>[[:ascii:]]</code></td><td>ASCII (≡ <code>[\x00-\x7F]</code>)</td></tr>
<tr><td><code>[[:blank:]]</code></td><td>blank (≡ <code>[\t ]</code>)</td></tr>
<tr><td><code>[[:cntrl:]]</code></td><td>control (≡ <code>[\x00-\x1F\x7F]</code>)</td></tr>
<tr><td><code>[[:digit:]]</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr>
<tr><td><code>[[:graph:]]</code></td><td>graphical (≡ <code>[!-~] == [A-Za-z0-9!"#$%&amp;'()*+,\-./:;&lt;=&gt;?@[\\\]^_`{|}~]</code>)</td></tr>
<tr><td><code>[[:lower:]]</code></td><td>lower case (≡ <code>[a-z]</code>)</td></tr>
<tr><td><code>[[:print:]]</code></td><td>printable (≡ <code>[ -~] == [ [:graph:]]</code>)</td></tr>
<tr><td><code>[[:punct:]]</code></td><td>punctuation (≡ <code>[!-/:-@[-`{-~]</code>)</td></tr>
<tr><td><code>[[:space:]]</code></td><td>whitespace (≡ <code>[\t\n\v\f\r ]</code>)</td></tr>
<tr><td><code>[[:upper:]]</code></td><td>upper case (≡ <code>[A-Z]</code>)</td></tr>
<tr><td><code>[[:word:]]</code></td><td>word characters (≡ <code>[0-9A-Za-z_]</code>)</td></tr>
<tr><td><code>[[:xdigit:]]</code></td><td>hex digit (≡ <code>[0-9A-Fa-f]</code>)</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Unicode character class names--general category:</b></td></tr>
<tr><td><code>C</code></td><td>other</td></tr>
<tr><td><code>Cc</code></td><td>control</td></tr>
<tr><td><code>Cf</code></td><td>format</td></tr>
<tr><td><code><font color=#808080>Cn</font></code></td><td>unassigned code points </td></tr>
<tr><td><code>Co</code></td><td>private use</td></tr>
<tr><td><code>Cs</code></td><td>surrogate</td></tr>
<tr><td><code>L</code></td><td>letter</td></tr>
<tr><td><code><font color=#808080>LC</font></code></td><td>cased letter </td></tr>
<tr><td><code><font color=#808080>L&amp;</font></code></td><td>cased letter </td></tr>
<tr><td><code>Ll</code></td><td>lowercase letter</td></tr>
<tr><td><code>Lm</code></td><td>modifier letter</td></tr>
<tr><td><code>Lo</code></td><td>other letter</td></tr>
<tr><td><code>Lt</code></td><td>titlecase letter</td></tr>
<tr><td><code>Lu</code></td><td>uppercase letter</td></tr>
<tr><td><code>M</code></td><td>mark</td></tr>
<tr><td><code>Mc</code></td><td>spacing mark</td></tr>
<tr><td><code>Me</code></td><td>enclosing mark</td></tr>
<tr><td><code>Mn</code></td><td>non-spacing mark</td></tr>
<tr><td><code>N</code></td><td>number</td></tr>
<tr><td><code>Nd</code></td><td>decimal number</td></tr>
<tr><td><code>Nl</code></td><td>letter number</td></tr>
<tr><td><code>No</code></td><td>other number</td></tr>
<tr><td><code>P</code></td><td>punctuation</td></tr>
<tr><td><code>Pc</code></td><td>connector punctuation</td></tr>
<tr><td><code>Pd</code></td><td>dash punctuation</td></tr>
<tr><td><code>Pe</code></td><td>close punctuation</td></tr>
<tr><td><code>Pf</code></td><td>final punctuation</td></tr>
<tr><td><code>Pi</code></td><td>initial punctuation</td></tr>
<tr><td><code>Po</code></td><td>other punctuation</td></tr>
<tr><td><code>Ps</code></td><td>open punctuation</td></tr>
<tr><td><code>S</code></td><td>symbol</td></tr>
<tr><td><code>Sc</code></td><td>currency symbol</td></tr>
<tr><td><code>Sk</code></td><td>modifier symbol</td></tr>
<tr><td><code>Sm</code></td><td>math symbol</td></tr>
<tr><td><code>So</code></td><td>other symbol</td></tr>
<tr><td><code>Z</code></td><td>separator</td></tr>
<tr><td><code>Zl</code></td><td>line separator</td></tr>
<tr><td><code>Zp</code></td><td>paragraph separator</td></tr>
<tr><td><code>Zs</code></td><td>space separator</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Unicode character class names--scripts:</b></td></tr>
<tr><td><code>Arabic</code></td><td>Arabic</td></tr>
<tr><td><code>Armenian</code></td><td>Armenian</td></tr>
<tr><td><code>Balinese</code></td><td>Balinese</td></tr>
<tr><td><code>Bamum</code></td><td>Bamum</td></tr>
<tr><td><code>Batak</code></td><td>Batak</td></tr>
<tr><td><code>Bengali</code></td><td>Bengali</td></tr>
<tr><td><code>Bopomofo</code></td><td>Bopomofo</td></tr>
<tr><td><code>Brahmi</code></td><td>Brahmi</td></tr>
<tr><td><code>Braille</code></td><td>Braille</td></tr>
<tr><td><code>Buginese</code></td><td>Buginese</td></tr>
<tr><td><code>Buhid</code></td><td>Buhid</td></tr>
<tr><td><code>Canadian_Aboriginal</code></td><td>Canadian Aboriginal</td></tr>
<tr><td><code>Carian</code></td><td>Carian</td></tr>
<tr><td><code>Chakma</code></td><td>Chakma</td></tr>
<tr><td><code>Cham</code></td><td>Cham</td></tr>
<tr><td><code>Cherokee</code></td><td>Cherokee</td></tr>
<tr><td><code>Common</code></td><td>characters not specific to one script</td></tr>
<tr><td><code>Coptic</code></td><td>Coptic</td></tr>
<tr><td><code>Cuneiform</code></td><td>Cuneiform</td></tr>
<tr><td><code>Cypriot</code></td><td>Cypriot</td></tr>
<tr><td><code>Cyrillic</code></td><td>Cyrillic</td></tr>
<tr><td><code>Deseret</code></td><td>Deseret</td></tr>
<tr><td><code>Devanagari</code></td><td>Devanagari</td></tr>
<tr><td><code>Egyptian_Hieroglyphs</code></td><td>Egyptian Hieroglyphs</td></tr>
<tr><td><code>Ethiopic</code></td><td>Ethiopic</td></tr>
<tr><td><code>Georgian</code></td><td>Georgian</td></tr>
<tr><td><code>Glagolitic</code></td><td>Glagolitic</td></tr>
<tr><td><code>Gothic</code></td><td>Gothic</td></tr>
<tr><td><code>Greek</code></td><td>Greek</td></tr>
<tr><td><code>Gujarati</code></td><td>Gujarati</td></tr>
<tr><td><code>Gurmukhi</code></td><td>Gurmukhi</td></tr>
<tr><td><code>Han</code></td><td>Han</td></tr>
<tr><td><code>Hangul</code></td><td>Hangul</td></tr>
<tr><td><code>Hanunoo</code></td><td>Hanunoo</td></tr>
<tr><td><code>Hebrew</code></td><td>Hebrew</td></tr>
<tr><td><code>Hiragana</code></td><td>Hiragana</td></tr>
<tr><td><code>Imperial_Aramaic</code></td><td>Imperial Aramaic</td></tr>
<tr><td><code>Inherited</code></td><td>inherit script from previous character</td></tr>
<tr><td><code>Inscriptional_Pahlavi</code></td><td>Inscriptional Pahlavi</td></tr>
<tr><td><code>Inscriptional_Parthian</code></td><td>Inscriptional Parthian</td></tr>
<tr><td><code>Javanese</code></td><td>Javanese</td></tr>
<tr><td><code>Kaithi</code></td><td>Kaithi</td></tr>
<tr><td><code>Kannada</code></td><td>Kannada</td></tr>
<tr><td><code>Katakana</code></td><td>Katakana</td></tr>
<tr><td><code>Kayah_Li</code></td><td>Kayah Li</td></tr>
<tr><td><code>Kharoshthi</code></td><td>Kharoshthi</td></tr>
<tr><td><code>Khmer</code></td><td>Khmer</td></tr>
<tr><td><code>Lao</code></td><td>Lao</td></tr>
<tr><td><code>Latin</code></td><td>Latin</td></tr>
<tr><td><code>Lepcha</code></td><td>Lepcha</td></tr>
<tr><td><code>Limbu</code></td><td>Limbu</td></tr>
<tr><td><code>Linear_B</code></td><td>Linear B</td></tr>
<tr><td><code>Lycian</code></td><td>Lycian</td></tr>
<tr><td><code>Lydian</code></td><td>Lydian</td></tr>
<tr><td><code>Malayalam</code></td><td>Malayalam</td></tr>
<tr><td><code>Mandaic</code></td><td>Mandaic</td></tr>
<tr><td><code>Meetei_Mayek</code></td><td>Meetei Mayek</td></tr>
<tr><td><code>Meroitic_Cursive</code></td><td>Meroitic Cursive</td></tr>
<tr><td><code>Meroitic_Hieroglyphs</code></td><td>Meroitic Hieroglyphs</td></tr>
<tr><td><code>Miao</code></td><td>Miao</td></tr>
<tr><td><code>Mongolian</code></td><td>Mongolian</td></tr>
<tr><td><code>Myanmar</code></td><td>Myanmar</td></tr>
<tr><td><code>New_Tai_Lue</code></td><td>New Tai Lue (aka Simplified Tai Lue)</td></tr>
<tr><td><code>Nko</code></td><td>Nko</td></tr>
<tr><td><code>Ogham</code></td><td>Ogham</td></tr>
<tr><td><code>Ol_Chiki</code></td><td>Ol Chiki</td></tr>
<tr><td><code>Old_Italic</code></td><td>Old Italic</td></tr>
<tr><td><code>Old_Persian</code></td><td>Old Persian</td></tr>
<tr><td><code>Old_South_Arabian</code></td><td>Old South Arabian</td></tr>
<tr><td><code>Old_Turkic</code></td><td>Old Turkic</td></tr>
<tr><td><code>Oriya</code></td><td>Oriya</td></tr>
<tr><td><code>Osmanya</code></td><td>Osmanya</td></tr>
<tr><td><code>Phags_Pa</code></td><td>'Phags Pa</td></tr>
<tr><td><code>Phoenician</code></td><td>Phoenician</td></tr>
<tr><td><code>Rejang</code></td><td>Rejang</td></tr>
<tr><td><code>Runic</code></td><td>Runic</td></tr>
<tr><td><code>Saurashtra</code></td><td>Saurashtra</td></tr>
<tr><td><code>Sharada</code></td><td>Sharada</td></tr>
<tr><td><code>Shavian</code></td><td>Shavian</td></tr>
<tr><td><code>Sinhala</code></td><td>Sinhala</td></tr>
<tr><td><code>Sora_Sompeng</code></td><td>Sora Sompeng</td></tr>
<tr><td><code>Sundanese</code></td><td>Sundanese</td></tr>
<tr><td><code>Syloti_Nagri</code></td><td>Syloti Nagri</td></tr>
<tr><td><code>Syriac</code></td><td>Syriac</td></tr>
<tr><td><code>Tagalog</code></td><td>Tagalog</td></tr>
<tr><td><code>Tagbanwa</code></td><td>Tagbanwa</td></tr>
<tr><td><code>Tai_Le</code></td><td>Tai Le</td></tr>
<tr><td><code>Tai_Tham</code></td><td>Tai Tham</td></tr>
<tr><td><code>Tai_Viet</code></td><td>Tai Viet</td></tr>
<tr><td><code>Takri</code></td><td>Takri</td></tr>
<tr><td><code>Tamil</code></td><td>Tamil</td></tr>
<tr><td><code>Telugu</code></td><td>Telugu</td></tr>
<tr><td><code>Thaana</code></td><td>Thaana</td></tr>
<tr><td><code>Thai</code></td><td>Thai</td></tr>
<tr><td><code>Tibetan</code></td><td>Tibetan</td></tr>
<tr><td><code>Tifinagh</code></td><td>Tifinagh</td></tr>
<tr><td><code>Ugaritic</code></td><td>Ugaritic</td></tr>
<tr><td><code>Vai</code></td><td>Vai</td></tr>
<tr><td><code>Yi</code></td><td>Yi</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Vim character classes:</b></td></tr>
<tr><td><code><font color=#808080>\i</font></code></td><td>identifier character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\I</font></code></td><td><code>\i</code> except digits <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\k</font></code></td><td>keyword character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\K</font></code></td><td><code>\k</code> except digits <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\f</font></code></td><td>file name character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\F</font></code></td><td><code>\f</code> except digits <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\p</font></code></td><td>printable character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\P</font></code></td><td><code>\p</code> except digits <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\s</font></code></td><td>whitespace character (≡ <code>[ \t]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\S</font></code></td><td>non-white space character (≡ <code>[^ \t]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code>\d</code></td><td>digits (≡ <code>[0-9]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code>\D</code></td><td>not <code>\d</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\x</font></code></td><td>hex digits (≡ <code>[0-9A-Fa-f]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\X</font></code></td><td>not <code>\x</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\o</font></code></td><td>octal digits (≡ <code>[0-7]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\O</font></code></td><td>not <code>\o</code> <font size=-2>VIM</font></td></tr>
<tr><td><code>\w</code></td><td>word character <font size=-2>VIM</font></td></tr>
<tr><td><code>\W</code></td><td>not <code>\w</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\h</font></code></td><td>head of word character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\H</font></code></td><td>not <code>\h</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\a</font></code></td><td>alphabetic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\A</font></code></td><td>not <code>\a</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\l</font></code></td><td>lowercase <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\L</font></code></td><td>not lowercase <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\u</font></code></td><td>uppercase <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\U</font></code></td><td>not uppercase <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\_x</font></code></td><td><code>\x</code> plus newline, for any <code>x</code> <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Vim flags:</b></td></tr>
<tr><td><code><font color=#808080>\c</font></code></td><td>ignore case <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\C</font></code></td><td>match case <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\m</font></code></td><td>magic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\M</font></code></td><td>nomagic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\v</font></code></td><td>verymagic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\V</font></code></td><td>verynomagic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\Z</font></code></td><td>ignore differences in Unicode combining characters <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Magic:</b></td></tr>
<tr><td><code><font color=#808080>(?{code})</font></code></td><td>arbitrary Perl code <font size=-2>PERL</font></td></tr>
<tr><td><code><font color=#808080>(??{code})</font></code></td><td>postponed arbitrary Perl code <font size=-2>PERL</font></td></tr>
<tr><td><code><font color=#808080>(?n)</font></code></td><td>recursive call to regexp capturing group <code>n</code> </td></tr>
<tr><td><code><font color=#808080>(?+n)</font></code></td><td>recursive call to relative group <code>+n</code> </td></tr>
<tr><td><code><font color=#808080>(?-n)</font></code></td><td>recursive call to relative group <code>-n</code> </td></tr>
<tr><td><code><font color=#808080>(?C)</font></code></td><td>PCRE callout <font size=-2>PCRE</font></td></tr>
<tr><td><code><font color=#808080>(?R)</font></code></td><td>recursive call to entire regexp (≡ <code>(?0)</code>) </td></tr>
<tr><td><code><font color=#808080>(?&amp;name)</font></code></td><td>recursive call to named group </td></tr>
<tr><td><code><font color=#808080>(?P=name)</font></code></td><td>named backreference </td></tr>
<tr><td><code><font color=#808080>(?P&gt;name)</font></code></td><td>recursive call to named group </td></tr>
<tr><td><code><font color=#808080>(?(cond)true|false)</font></code></td><td>conditional branch </td></tr>
<tr><td><code><font color=#808080>(?(cond)true)</font></code></td><td>conditional branch </td></tr>
<tr><td><code><font color=#808080>(*ACCEPT)</font></code></td><td>make regexps more like Prolog </td></tr>
<tr><td><code><font color=#808080>(*COMMIT)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*F)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*FAIL)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*MARK)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*PRUNE)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*SKIP)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*THEN)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*ANY)</font></code></td><td>set newline convention </td></tr>
<tr><td><code><font color=#808080>(*ANYCRLF)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*CR)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*CRLF)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*LF)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*BSR_ANYCRLF)</font></code></td><td>set \R convention <font size=-2>PCRE</font></td></tr>
<tr><td><code><font color=#808080>(*BSR_UNICODE)</font></code></td><td> <font size=-2>PCRE</font></td></tr>
<tr><td></td></tr>
</table>
</body>
</html>

452
extern/re2/doc/syntax.txt vendored Normal file
View File

@ -0,0 +1,452 @@
RE2 regular expression syntax reference
---------------------------------------
Single characters:
. any character, possibly including newline (s=true)
[xyz] character class
[^xyz] negated character class
\d Perl character class
\D negated Perl character class
[[:alpha:]] ASCII character class
[[:^alpha:]] negated ASCII character class
\pN Unicode character class (one-letter name)
\p{Greek} Unicode character class
\PN negated Unicode character class (one-letter name)
\P{Greek} negated Unicode character class
Composites:
xy «x» followed by «y»
x|y «x» or «y» (prefer «x»)
Repetitions:
x* zero or more «x», prefer more
x+ one or more «x», prefer more
x? zero or one «x», prefer one
x{n,m} «n» or «n»+1 or ... or «m» «x», prefer more
x{n,} «n» or more «x», prefer more
x{n} exactly «n» «x»
x*? zero or more «x», prefer fewer
x+? one or more «x», prefer fewer
x?? zero or one «x», prefer zero
x{n,m}? «n» or «n»+1 or ... or «m» «x», prefer fewer
x{n,}? «n» or more «x», prefer fewer
x{n}? exactly «n» «x»
x{} (== x*) NOT SUPPORTED vim
x{-} (== x*?) NOT SUPPORTED vim
x{-n} (== x{n}?) NOT SUPPORTED vim
x= (== x?) NOT SUPPORTED vim
Implementation restriction: The counting forms «x{n,m}», «x{n,}», and «x{n}»
reject forms that create a minimum or maximum repetition count above 1000.
Unlimited repetitions are not subject to this restriction.
Possessive repetitions:
x*+ zero or more «x», possessive NOT SUPPORTED
x++ one or more «x», possessive NOT SUPPORTED
x?+ zero or one «x», possessive NOT SUPPORTED
x{n,m}+ «n» or ... or «m» «x», possessive NOT SUPPORTED
x{n,}+ «n» or more «x», possessive NOT SUPPORTED
x{n}+ exactly «n» «x», possessive NOT SUPPORTED
Grouping:
(re) numbered capturing group (submatch)
(?P<name>re) named & numbered capturing group (submatch)
(?<name>re) named & numbered capturing group (submatch) NOT SUPPORTED
(?'name're) named & numbered capturing group (submatch) NOT SUPPORTED
(?:re) non-capturing group
(?flags) set flags within current group; non-capturing
(?flags:re) set flags during re; non-capturing
(?#text) comment NOT SUPPORTED
(?|x|y|z) branch numbering reset NOT SUPPORTED
(?>re) possessive match of «re» NOT SUPPORTED
re@> possessive match of «re» NOT SUPPORTED vim
%(re) non-capturing group NOT SUPPORTED vim
Flags:
i case-insensitive (default false)
m multi-line mode: «^» and «$» match begin/end line in addition to begin/end text (default false)
s let «.» match «\n» (default false)
U ungreedy: swap meaning of «x*» and «x*?», «x+» and «x+?», etc (default false)
Flag syntax is «xyz» (set) or «-xyz» (clear) or «xy-z» (set «xy», clear «z»).
Empty strings:
^ at beginning of text or line («m»=true)
$ at end of text (like «\z» not «\Z») or line («m»=true)
\A at beginning of text
\b at ASCII word boundary («\w» on one side and «\W», «\A», or «\z» on the other)
\B not at ASCII word boundary
\G at beginning of subtext being searched NOT SUPPORTED pcre
\G at end of last match NOT SUPPORTED perl
\Z at end of text, or before newline at end of text NOT SUPPORTED
\z at end of text
(?=re) before text matching «re» NOT SUPPORTED
(?!re) before text not matching «re» NOT SUPPORTED
(?<=re) after text matching «re» NOT SUPPORTED
(?<!re) after text not matching «re» NOT SUPPORTED
re& before text matching «re» NOT SUPPORTED vim
re@= before text matching «re» NOT SUPPORTED vim
re@! before text not matching «re» NOT SUPPORTED vim
re@<= after text matching «re» NOT SUPPORTED vim
re@<! after text not matching «re» NOT SUPPORTED vim
\zs sets start of match (= \K) NOT SUPPORTED vim
\ze sets end of match NOT SUPPORTED vim
\%^ beginning of file NOT SUPPORTED vim
\%$ end of file NOT SUPPORTED vim
\%V on screen NOT SUPPORTED vim
\%# cursor position NOT SUPPORTED vim
\%'m mark «m» position NOT SUPPORTED vim
\%23l in line 23 NOT SUPPORTED vim
\%23c in column 23 NOT SUPPORTED vim
\%23v in virtual column 23 NOT SUPPORTED vim
Escape sequences:
\a bell (== \007)
\f form feed (== \014)
\t horizontal tab (== \011)
\n newline (== \012)
\r carriage return (== \015)
\v vertical tab character (== \013)
\* literal «*», for any punctuation character «*»
\123 octal character code (up to three digits)
\x7F hex character code (exactly two digits)
\x{10FFFF} hex character code
\C match a single byte even in UTF-8 mode
\Q...\E literal text «...» even if «...» has punctuation
\1 backreference NOT SUPPORTED
\b backspace NOT SUPPORTED (use «\010»)
\cK control char ^K NOT SUPPORTED (use «\001» etc)
\e escape NOT SUPPORTED (use «\033»)
\g1 backreference NOT SUPPORTED
\g{1} backreference NOT SUPPORTED
\g{+1} backreference NOT SUPPORTED
\g{-1} backreference NOT SUPPORTED
\g{name} named backreference NOT SUPPORTED
\g<name> subroutine call NOT SUPPORTED
\g'name' subroutine call NOT SUPPORTED
\k<name> named backreference NOT SUPPORTED
\k'name' named backreference NOT SUPPORTED
\lX lowercase «X» NOT SUPPORTED
\ux uppercase «x» NOT SUPPORTED
\L...\E lowercase text «...» NOT SUPPORTED
\K reset beginning of «$0» NOT SUPPORTED
\N{name} named Unicode character NOT SUPPORTED
\R line break NOT SUPPORTED
\U...\E upper case text «...» NOT SUPPORTED
\X extended Unicode sequence NOT SUPPORTED
\%d123 decimal character 123 NOT SUPPORTED vim
\%xFF hex character FF NOT SUPPORTED vim
\%o123 octal character 123 NOT SUPPORTED vim
\%u1234 Unicode character 0x1234 NOT SUPPORTED vim
\%U12345678 Unicode character 0x12345678 NOT SUPPORTED vim
Character class elements:
x single character
A-Z character range (inclusive)
\d Perl character class
[:foo:] ASCII character class «foo»
\p{Foo} Unicode character class «Foo»
\pF Unicode character class «F» (one-letter name)
Named character classes as character class elements:
[\d] digits (== \d)
[^\d] not digits (== \D)
[\D] not digits (== \D)
[^\D] not not digits (== \d)
[[:name:]] named ASCII class inside character class (== [:name:])
[^[:name:]] named ASCII class inside negated character class (== [:^name:])
[\p{Name}] named Unicode property inside character class (== \p{Name})
[^\p{Name}] named Unicode property inside negated character class (== \P{Name})
Perl character classes (all ASCII-only):
\d digits (== [0-9])
\D not digits (== [^0-9])
\s whitespace (== [\t\n\f\r ])
\S not whitespace (== [^\t\n\f\r ])
\w word characters (== [0-9A-Za-z_])
\W not word characters (== [^0-9A-Za-z_])
\h horizontal space NOT SUPPORTED
\H not horizontal space NOT SUPPORTED
\v vertical space NOT SUPPORTED
\V not vertical space NOT SUPPORTED
ASCII character classes:
[[:alnum:]] alphanumeric (== [0-9A-Za-z])
[[:alpha:]] alphabetic (== [A-Za-z])
[[:ascii:]] ASCII (== [\x00-\x7F])
[[:blank:]] blank (== [\t ])
[[:cntrl:]] control (== [\x00-\x1F\x7F])
[[:digit:]] digits (== [0-9])
[[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
[[:lower:]] lower case (== [a-z])
[[:print:]] printable (== [ -~] == [ [:graph:]])
[[:punct:]] punctuation (== [!-/:-@[-`{-~])
[[:space:]] whitespace (== [\t\n\v\f\r ])
[[:upper:]] upper case (== [A-Z])
[[:word:]] word characters (== [0-9A-Za-z_])
[[:xdigit:]] hex digit (== [0-9A-Fa-f])
Unicode character class names--general category:
C other
Cc control
Cf format
Cn unassigned code points NOT SUPPORTED
Co private use
Cs surrogate
L letter
LC cased letter NOT SUPPORTED
L& cased letter NOT SUPPORTED
Ll lowercase letter
Lm modifier letter
Lo other letter
Lt titlecase letter
Lu uppercase letter
M mark
Mc spacing mark
Me enclosing mark
Mn non-spacing mark
N number
Nd decimal number
Nl letter number
No other number
P punctuation
Pc connector punctuation
Pd dash punctuation
Pe close punctuation
Pf final punctuation
Pi initial punctuation
Po other punctuation
Ps open punctuation
S symbol
Sc currency symbol
Sk modifier symbol
Sm math symbol
So other symbol
Z separator
Zl line separator
Zp paragraph separator
Zs space separator
Unicode character class names--scripts:
Adlam
Ahom
Anatolian_Hieroglyphs
Arabic
Armenian
Avestan
Balinese
Bamum
Bassa_Vah
Batak
Bengali
Bhaiksuki
Bopomofo
Brahmi
Braille
Buginese
Buhid
Canadian_Aboriginal
Carian
Caucasian_Albanian
Chakma
Cham
Cherokee
Common
Coptic
Cuneiform
Cypriot
Cyrillic
Deseret
Devanagari
Dogra
Duployan
Egyptian_Hieroglyphs
Elbasan
Elymaic
Ethiopic
Georgian
Glagolitic
Gothic
Grantha
Greek
Gujarati
Gunjala_Gondi
Gurmukhi
Han
Hangul
Hanifi_Rohingya
Hanunoo
Hatran
Hebrew
Hiragana
Imperial_Aramaic
Inherited
Inscriptional_Pahlavi
Inscriptional_Parthian
Javanese
Kaithi
Kannada
Katakana
Kayah_Li
Kharoshthi
Khmer
Khojki
Khudawadi
Lao
Latin
Lepcha
Limbu
Linear_A
Linear_B
Lisu
Lycian
Lydian
Mahajani
Makasar
Malayalam
Mandaic
Manichaean
Marchen
Masaram_Gondi
Medefaidrin
Meetei_Mayek
Mende_Kikakui
Meroitic_Cursive
Meroitic_Hieroglyphs
Miao
Modi
Mongolian
Mro
Multani
Myanmar
Nabataean
Nandinagari
New_Tai_Lue
Newa
Nko
Nushu
Nyiakeng_Puachue_Hmong
Ogham
Ol_Chiki
Old_Hungarian
Old_Italic
Old_North_Arabian
Old_Permic
Old_Persian
Old_Sogdian
Old_South_Arabian
Old_Turkic
Oriya
Osage
Osmanya
Pahawh_Hmong
Palmyrene
Pau_Cin_Hau
Phags_Pa
Phoenician
Psalter_Pahlavi
Rejang
Runic
Samaritan
Saurashtra
Sharada
Shavian
Siddham
SignWriting
Sinhala
Sogdian
Sora_Sompeng
Soyombo
Sundanese
Syloti_Nagri
Syriac
Tagalog
Tagbanwa
Tai_Le
Tai_Tham
Tai_Viet
Takri
Tamil
Tangut
Telugu
Thaana
Thai
Tibetan
Tifinagh
Tirhuta
Ugaritic
Vai
Wancho
Warang_Citi
Yi
Zanabazar_Square
Vim character classes:
\i identifier character NOT SUPPORTED vim
\I «\i» except digits NOT SUPPORTED vim
\k keyword character NOT SUPPORTED vim
\K «\k» except digits NOT SUPPORTED vim
\f file name character NOT SUPPORTED vim
\F «\f» except digits NOT SUPPORTED vim
\p printable character NOT SUPPORTED vim
\P «\p» except digits NOT SUPPORTED vim
\s whitespace character (== [ \t]) NOT SUPPORTED vim
\S non-white space character (== [^ \t]) NOT SUPPORTED vim
\d digits (== [0-9]) vim
\D not «\d» vim
\x hex digits (== [0-9A-Fa-f]) NOT SUPPORTED vim
\X not «\x» NOT SUPPORTED vim
\o octal digits (== [0-7]) NOT SUPPORTED vim
\O not «\o» NOT SUPPORTED vim
\w word character vim
\W not «\w» vim
\h head of word character NOT SUPPORTED vim
\H not «\h» NOT SUPPORTED vim
\a alphabetic NOT SUPPORTED vim
\A not «\a» NOT SUPPORTED vim
\l lowercase NOT SUPPORTED vim
\L not lowercase NOT SUPPORTED vim
\u uppercase NOT SUPPORTED vim
\U not uppercase NOT SUPPORTED vim
\_x «\x» plus newline, for any «x» NOT SUPPORTED vim
Vim flags:
\c ignore case NOT SUPPORTED vim
\C match case NOT SUPPORTED vim
\m magic NOT SUPPORTED vim
\M nomagic NOT SUPPORTED vim
\v verymagic NOT SUPPORTED vim
\V verynomagic NOT SUPPORTED vim
\Z ignore differences in Unicode combining characters NOT SUPPORTED vim
Magic:
(?{code}) arbitrary Perl code NOT SUPPORTED perl
(??{code}) postponed arbitrary Perl code NOT SUPPORTED perl
(?n) recursive call to regexp capturing group «n» NOT SUPPORTED
(?+n) recursive call to relative group «+n» NOT SUPPORTED
(?-n) recursive call to relative group «-n» NOT SUPPORTED
(?C) PCRE callout NOT SUPPORTED pcre
(?R) recursive call to entire regexp (== (?0)) NOT SUPPORTED
(?&name) recursive call to named group NOT SUPPORTED
(?P=name) named backreference NOT SUPPORTED
(?P>name) recursive call to named group NOT SUPPORTED
(?(cond)true|false) conditional branch NOT SUPPORTED
(?(cond)true) conditional branch NOT SUPPORTED
(*ACCEPT) make regexps more like Prolog NOT SUPPORTED
(*COMMIT) NOT SUPPORTED
(*F) NOT SUPPORTED
(*FAIL) NOT SUPPORTED
(*MARK) NOT SUPPORTED
(*PRUNE) NOT SUPPORTED
(*SKIP) NOT SUPPORTED
(*THEN) NOT SUPPORTED
(*ANY) set newline convention NOT SUPPORTED
(*ANYCRLF) NOT SUPPORTED
(*CR) NOT SUPPORTED
(*CRLF) NOT SUPPORTED
(*LF) NOT SUPPORTED
(*BSR_ANYCRLF) set \R convention NOT SUPPORTED pcre
(*BSR_UNICODE) NOT SUPPORTED pcre

BIN
extern/re2/doc/xkcd.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

26
extern/re2/kokoro/bazel.sh vendored Normal file
View File

@ -0,0 +1,26 @@
#!/bin/bash
# Kokoro CI driver for Bazel: for each compilation mode (dbg, then opt) it
# cleans, builds every target, and runs every test except the ones listed
# in EXCLUDED_TESTS.
set -eux

cd git/re2

# Negative target patterns passed to `bazel test` after `//:all`.
EXCLUDED_TESTS=(
  -//:dfa_test
  -//:exhaustive1_test
  -//:exhaustive2_test
  -//:exhaustive3_test
  -//:exhaustive_test
  -//:random_test
)

for mode in dbg opt; do
  bazel clean
  bazel build --compilation_mode="${mode}" -- //:all
  bazel test --compilation_mode="${mode}" --test_output=errors -- //:all \
    "${EXCLUDED_TESTS[@]}"
done

exit 0

25
extern/re2/kokoro/cmake.sh vendored Normal file
View File

@ -0,0 +1,25 @@
#!/bin/bash
# Kokoro CI driver for CMake: configures, rebuilds from clean and runs ctest
# for the Debug configuration and then for the Release configuration.
set -eux

cd git/re2

# Jobs whose name contains "/windows-" get an explicit generator and
# architecture; every other job uses CMake's defaults.
if [[ "${KOKORO_JOB_NAME}" == */windows-* ]]; then
  CMAKE_G_A_FLAGS=('-G' 'Visual Studio 14 2015' '-A' 'x64')
else
  CMAKE_G_A_FLAGS=()
  # Work around a bug in older versions of bash. :/
  set +u
fi

for config in Debug Release; do
  cmake -D CMAKE_BUILD_TYPE="${config}" "${CMAKE_G_A_FLAGS[@]}" .
  cmake --build . --config "${config}" --clean-first
  ctest -C "${config}" --output-on-failure -E 'dfa|exhaustive|random'
done

exit 0

1
extern/re2/kokoro/macos-bazel.cfg vendored Normal file
View File

@ -0,0 +1 @@
build_file: "re2/kokoro/macos-bazel.sh"

4
extern/re2/kokoro/macos-bazel.sh vendored Normal file
View File

@ -0,0 +1,4 @@
#!/bin/bash
# macOS Kokoro entry point: runs the shared Bazel script and exits with
# its status.
set -o errexit -o nounset -o xtrace
bash git/re2/kokoro/bazel.sh
exit $?

1
extern/re2/kokoro/macos-cmake.cfg vendored Normal file
View File

@ -0,0 +1 @@
build_file: "re2/kokoro/macos-cmake.sh"

4
extern/re2/kokoro/macos-cmake.sh vendored Normal file
View File

@ -0,0 +1,4 @@
#!/bin/bash
# macOS Kokoro entry point: runs the shared CMake script and exits with
# its status.
set -o errexit -o nounset -o xtrace
bash git/re2/kokoro/cmake.sh
exit $?

1
extern/re2/kokoro/ubuntu-bazel.cfg vendored Normal file
View File

@ -0,0 +1 @@
build_file: "re2/kokoro/ubuntu-bazel.sh"

4
extern/re2/kokoro/ubuntu-bazel.sh vendored Normal file
View File

@ -0,0 +1,4 @@
#!/bin/bash
# Ubuntu Kokoro entry point: runs the shared Bazel script and exits with
# its status.
set -o errexit -o nounset -o xtrace
bash git/re2/kokoro/bazel.sh
exit $?

2
extern/re2/kokoro/windows-bazel.bat vendored Normal file
View File

@ -0,0 +1,2 @@
@REM Windows Kokoro entry point: runs the shared Bazel script under bash and
@REM returns its exit status to the caller.
bash git/re2/kokoro/bazel.sh
EXIT /B %ERRORLEVEL%

1
extern/re2/kokoro/windows-bazel.cfg vendored Normal file
View File

@ -0,0 +1 @@
build_file: "re2/kokoro/windows-bazel.bat"

2
extern/re2/kokoro/windows-cmake.bat vendored Normal file
View File

@ -0,0 +1,2 @@
@REM Windows Kokoro entry point: runs the shared CMake script under bash and
@REM returns its exit status to the caller.
bash git/re2/kokoro/cmake.sh
EXIT /B %ERRORLEVEL%

1
extern/re2/kokoro/windows-cmake.cfg vendored Normal file
View File

@ -0,0 +1 @@
build_file: "re2/kokoro/windows-cmake.bat"

104
extern/re2/lib/git/commit-msg.hook vendored Normal file
View File

@ -0,0 +1,104 @@
#!/bin/sh
# From Gerrit Code Review 2.2.1
#
# Part of Gerrit Code Review (http://code.google.com/p/gerrit/)
#
# Copyright (C) 2009 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Regex alternation of footer tags. Leading footer lines that start with one
# of these tags (case-insensitively) stay above the inserted Change-Id line.
CHANGE_ID_AFTER="Bug|Issue"
# First argument of the hook: the file that is read, and possibly rewritten,
# as the commit message.
MSG="$1"
# Check for, and add if missing, a unique Change-Id
#
# Reads the message file "$MSG" and, unless the message is effectively empty
# or already carries a Change-Id line, rewrites "$MSG" in place with a
# "Change-Id: I<hash>" line added to its footer.
add_ChangeId() {
# Normalize the message for hashing: cut everything from the first
# "diff --git a/" line onwards, drop Signed-off-by and comment lines, then
# let `git stripspace` tidy the whitespace.
clean_message=`sed -e '
/^diff --git a\/.*/{
s///
q
}
/^Signed-off-by:/d
/^#/d
' "$MSG" | git stripspace`
# Nothing left after cleaning: leave the message file untouched.
if test -z "$clean_message"
then
return
fi
# A Change-Id line (any letter case) is already present: keep it.
if grep -i '^Change-Id:' "$MSG" >/dev/null
then
return
fi
# Hash derived from the current tree, parent, identities and clean message.
id=`_gen_ChangeId`
# The Perl program below locates the trailing footer block of the message
# and splices "Change-Id: I$id" in after any leading footer lines whose tag
# matches $CHANGE_ID_AFTER, then writes the result back to "$MSG".
perl -e '
$MSG = shift;
$id = shift;
$CHANGE_ID_AFTER = shift;
undef $/;
open(I, $MSG); $_ = <I>; close I;
s|^diff --git a/.*||ms;
s|^#.*$||mg;
exit unless $_;
@message = split /\n/;
$haveFooter = 0;
$startFooter = @message;
for($line = @message - 1; $line >= 0; $line--) {
$_ = $message[$line];
if (/^[a-zA-Z0-9-]+:/ && !m,^[a-z0-9-]+://,) {
$haveFooter++;
next;
}
next if /^[ []/;
$startFooter = $line if ($haveFooter && /^\r?$/);
last;
}
@footer = @message[$startFooter+1..@message];
@message = @message[0..$startFooter];
push(@footer, "") unless @footer;
for ($line = 0; $line < @footer; $line++) {
$_ = $footer[$line];
next if /^($CHANGE_ID_AFTER):/i;
last;
}
splice(@footer, $line, 0, "Change-Id: I$id");
$_ = join("\n", @message, @footer);
open(O, ">$MSG"); print O; close O;
' "$MSG" "$id" "$CHANGE_ID_AFTER"
}
# Prints the text that gets hashed into the Change-Id: a commit-like header
# (tree, optional parent, author, committer), a blank line, and the cleaned
# commit message held in $clean_message.
_gen_ChangeIdInput() {
	echo "tree $(git write-tree)"
	# The parent line is only emitted when HEAD resolves to a commit.
	if parent=$(git rev-parse HEAD^0 2>/dev/null)
	then
		echo "parent $parent"
	fi
	echo "author $(git var GIT_AUTHOR_IDENT)"
	echo "committer $(git var GIT_COMMITTER_IDENT)"
	echo
	printf '%s' "$clean_message"
}
# Prints the object hash git computes for the text produced by
# _gen_ChangeIdInput when it is treated as a commit object.
_gen_ChangeId() {
	_gen_ChangeIdInput | git hash-object -t commit --stdin
}
# Hook entry point: make sure the message file in $MSG carries a Change-Id.
add_ChangeId

16
extern/re2/libre2.symbols vendored Normal file
View File

@ -0,0 +1,16 @@
{
global:
# re2::RE2*
_ZN3re23RE2*;
_ZNK3re23RE2*;
# re2::StringPiece*
_ZN3re211StringPiece*;
_ZNK3re211StringPiece*;
# re2::operator<<*
_ZN3re2ls*;
# re2::FilteredRE2*
_ZN3re211FilteredRE2*;
_ZNK3re211FilteredRE2*;
local:
*;
};

12
extern/re2/libre2.symbols.darwin vendored Normal file
View File

@ -0,0 +1,12 @@
# Linker doesn't like these unmangled:
# re2::RE2*
__ZN3re23RE2*
__ZNK3re23RE2*
# re2::StringPiece*
__ZN3re211StringPiece*
__ZNK3re211StringPiece*
# re2::operator<<*
__ZN3re2ls*
# re2::FilteredRE2*
__ZN3re211FilteredRE2*
__ZNK3re211FilteredRE2*

10
extern/re2/re2.pc vendored Normal file
View File

@ -0,0 +1,10 @@
prefix=@prefix@
exec_prefix=@exec_prefix@
includedir=@includedir@
libdir=@libdir@
Name: re2
Description: RE2 is a fast, safe, thread-friendly regular expression engine.
Version: 0.0.0
Cflags: -std=c++11 -pthread -I${includedir}
Libs: -pthread -L${libdir} -lre2

118
extern/re2/re2/bitmap256.h vendored Normal file
View File

@ -0,0 +1,118 @@
// Copyright 2016 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_BITMAP256_H_
#define RE2_BITMAP256_H_
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <stdint.h>
#include <string.h>
#include "util/util.h"
#include "util/logging.h"
namespace re2 {
// A fixed-size set of 256 bits stored as four 64-bit words.
// Bit c lives in word c / 64 at bit position c % 64.
class Bitmap256 {
 public:
  // Constructs a bitmap with every bit cleared.
  Bitmap256() { Clear(); }

  // Clears all of the bits.
  void Clear() { memset(words_, 0, sizeof words_); }

  // Tests the bit with index c, which must lie in [0, 255].
  bool Test(int c) const {
    DCHECK_GE(c, 0);
    DCHECK_LE(c, 255);

    const uint64_t mask = 1ULL << (c % 64);
    return (words_[c / 64] & mask) != 0;
  }

  // Sets the bit with index c, which must lie in [0, 255].
  void Set(int c) {
    DCHECK_GE(c, 0);
    DCHECK_LE(c, 255);

    const uint64_t mask = 1ULL << (c % 64);
    words_[c / 64] |= mask;
  }

  // Finds the next non-zero bit with index >= c.
  // Returns -1 if no such bit exists.
  int FindNextSetBit(int c) const;

 private:
  // Returns the index of the least significant non-zero bit in n.
  // n must not be zero.
  static int FindLSBSet(uint64_t n) {
    DCHECK_NE(n, 0);

#if defined(__GNUC__)
    return __builtin_ctzll(n);
#elif defined(_MSC_VER) && defined(_M_X64)
    unsigned long index;
    _BitScanForward64(&index, n);
    return static_cast<int>(index);
#elif defined(_MSC_VER) && defined(_M_IX86)
    // Only a 32-bit scan is used here: try the low half, then the high half.
    unsigned long index;
    const uint32_t low = static_cast<uint32_t>(n);
    if (low != 0) {
      _BitScanForward(&index, low);
      return static_cast<int>(index);
    }
    _BitScanForward(&index, static_cast<uint32_t>(n >> 32));
    return static_cast<int>(index) + 32;
#else
    // Portable fallback: binary search on the shift distance. Whenever
    // shifting left by `shift` leaves a non-zero value, the lowest set bit
    // survived the shift, so keep the shifted value and lower the answer.
    int index = 63;
    for (int shift = 32; shift != 0; shift >>= 1) {
      const uint64_t shifted = n << shift;
      if (shifted != 0) {
        n = shifted;
        index -= shift;
      }
    }
    return index;
#endif
  }

  uint64_t words_[4];
};
// Finds the next non-zero bit with index >= c (c must lie in [0, 255]).
// Returns -1 if no such bit exists.
//
// Marked inline because the definition lives in a header: without it, every
// translation unit that includes this file would emit its own external
// definition, which violates the one-definition rule and fails at link time
// as soon as two such translation units are linked together.
inline int Bitmap256::FindNextSetBit(int c) const {
  DCHECK_GE(c, 0);
  DCHECK_LE(c, 255);

  // Check the word that contains the bit. Mask out any lower bits.
  int i = c / 64;
  uint64_t word = words_[i] & (~0ULL << (c % 64));
  if (word != 0)
    return (i * 64) + FindLSBSet(word);

  // Check any following words. Entering the switch at `i` and falling
  // through visits each remaining word in ascending order.
  i++;
  switch (i) {
    case 1:
      if (words_[1] != 0)
        return (1 * 64) + FindLSBSet(words_[1]);
      FALLTHROUGH_INTENDED;
    case 2:
      if (words_[2] != 0)
        return (2 * 64) + FindLSBSet(words_[2]);
      FALLTHROUGH_INTENDED;
    case 3:
      if (words_[3] != 0)
        return (3 * 64) + FindLSBSet(words_[3]);
      FALLTHROUGH_INTENDED;
    default:
      return -1;
  }
}
} // namespace re2
#endif // RE2_BITMAP256_H_

378
extern/re2/re2/bitstate.cc vendored Normal file
View File

@ -0,0 +1,378 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
// Prog::SearchBitState is a regular expression search with submatch
// tracking for small regular expressions and texts. Similarly to
// testing/backtrack.cc, it allocates a bitmap with (count of
// lists) * (length of prog) bits to make sure it never explores the
// same (instruction list, character position) multiple times. This
// limits the search to run in time linear in the length of the text.
//
// Unlike testing/backtrack.cc, SearchBitState is not recursive
// on the text.
//
// SearchBitState is a fast replacement for the NFA code on small
// regexps and texts when SearchOnePass cannot be used.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <limits>
#include <utility>
#include "util/logging.h"
#include "util/pod_array.h"
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
// One entry of BitState's explicit work stack.
// A non-negative id asks for instruction `id` to be explored at position p;
// a negative id records "restore capture register of instruction -id to p".
struct Job {
int id;
// Number of additional consecutive positions folded into this entry:
// the job stands for id at p, p+1, ..., p+rle.
int rle; // run length encoding
const char* p;
};
// Backtracking matcher over a Prog that keeps a bitmap of already-visited
// (list ID, text position) pairs so that no pair is explored twice.
class BitState {
public:
explicit BitState(Prog* prog);
// The usual Search prototype.
// Can only call Search once per BitState.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
private:
// Marks (list of id, p) as visited; returns false if it already was.
inline bool ShouldVisit(int id, const char* p);
// Pushes a job onto job_, growing the stack when it is full.
void Push(int id, const char* p);
// Doubles the capacity of job_, keeping the first njob_ entries.
void GrowStack();
// Runs the search from instruction id at position p.
bool TrySearch(int id, const char* p);
// Search parameters
Prog* prog_; // program being run
StringPiece text_; // text being searched
StringPiece context_; // greater context of text being searched
bool anchored_; // whether search is anchored at text.begin()
bool longest_; // whether search wants leftmost-longest match
bool endmatch_; // whether match must end at text.end()
StringPiece* submatch_; // submatches to fill in
int nsubmatch_; // # of submatches to fill in
// Search state
static const int VisitedBits = 32;
PODArray<uint32_t> visited_; // bitmap: (list ID, char*) pairs visited
PODArray<const char*> cap_; // capture registers
PODArray<Job> job_; // stack of text positions to explore
int njob_; // stack size
};
// Binds the matcher to `prog`. Every search parameter starts out cleared;
// Search() fills them in and allocates the scratch arrays.
BitState::BitState(Prog* prog)
    : prog_(prog),
      anchored_(false),
      longest_(false),
      endmatch_(false),
      submatch_(nullptr),
      nsubmatch_(0),
      njob_(0) {}
// Given id, which *must* be a list head, we can look up its list ID.
// Then the question is: Should the search visit the (list ID, p) pair?
// If so, remember that it was visited so that the next time,
// we don't repeat the visit.
//
// Returns true the first time a given pair is seen and false afterwards.
bool BitState::ShouldVisit(int id, const char* p) {
  // Flatten (list ID, offset of p in text_) into a single bit index.
  int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
          static_cast<int>(p-text_.begin());

  // Build the mask from an unsigned 32-bit one: with a plain `1`, selecting
  // bit 31 would left-shift a signed int into its sign bit.
  const uint32_t bit = uint32_t{1} << (n & (VisitedBits-1));
  uint32_t& word = visited_[n/VisitedBits];
  if (word & bit)
    return false;
  word |= bit;
  return true;
}
// Grow the stack.
void BitState::GrowStack() {
PODArray<Job> tmp(2*job_.size());
memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
job_ = std::move(tmp);
}
// Push (id, p) onto the stack, growing it if necessary.
void BitState::Push(int id, const char* p) {
if (njob_ >= job_.size()) {
GrowStack();
if (njob_ >= job_.size()) {
LOG(DFATAL) << "GrowStack() failed: "
<< "njob_ = " << njob_ << ", "
<< "job_.size() = " << job_.size();
return;
}
}
// If id < 0, it's undoing a Capture,
// so we mustn't interfere with that.
if (id >= 0 && njob_ > 0) {
Job* top = &job_[njob_-1];
if (id == top->id &&
p == top->p + top->rle + 1 &&
top->rle < std::numeric_limits<int>::max()) {
++top->rle;
return;
}
}
Job* top = &job_[njob_++];
top->id = id;
top->rle = 0;
top->p = p;
}
// Try a search from instruction id0 in state p0.
// Return whether it succeeded.
//
// The search is a depth-first walk driven by the explicit job_ stack:
// alternatives to try later are pushed, and the loop pops them once the
// current path ends. On a match, submatch_[0..nsubmatch_) is filled in from
// the capture registers.
bool BitState::TrySearch(int id0, const char* p0) {
bool matched = false;
const char* end = text_.end();
njob_ = 0;
// Push() no longer checks ShouldVisit(),
// so we must perform the check ourselves.
if (ShouldVisit(id0, p0))
Push(id0, p0);
while (njob_ > 0) {
// Pop job off stack.
--njob_;
int id = job_[njob_].id;
// Reference, not a copy: the run length is decremented in place below.
int& rle = job_[njob_].rle;
const char* p = job_[njob_].p;
if (id < 0) {
// Undo the Capture.
cap_[prog_->inst(-id)->cap()] = p;
continue;
}
if (rle > 0) {
// The entry stands for several consecutive positions: handle the last
// one now and leave the rest on the stack.
p += rle;
// Revivify job on stack.
--rle;
++njob_;
}
Loop:
// Visit id, p.
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
return false;
case kInstFail:
break;
case kInstAltMatch:
if (ip->greedy(prog_)) {
// out1 is the Match instruction.
id = ip->out1();
p = end;
goto Loop;
}
if (longest_) {
// ip must be non-greedy...
// out is the Match instruction.
id = ip->out();
p = end;
goto Loop;
}
goto Next;
case kInstByteRange: {
// c stays -1 at end of text, i.e. there is no byte to match.
int c = -1;
if (p < end)
c = *p & 0xFF;
if (!ip->Matches(c))
goto Next;
if (ip->hint() != 0)
Push(id+ip->hint(), p); // try the next when we're done
id = ip->out();
p++;
goto CheckAndLoop;
}
case kInstCapture:
if (!ip->last())
Push(id+1, p); // try the next when we're done
if (0 <= ip->cap() && ip->cap() < cap_.size()) {
// Capture p to register, but save old value first.
Push(-id, cap_[ip->cap()]); // undo when we're done
cap_[ip->cap()] = p;
}
id = ip->out();
goto CheckAndLoop;
case kInstEmptyWidth:
// Give up on this instruction if any required empty-width flag is not
// satisfied at p.
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
goto Next;
if (!ip->last())
Push(id+1, p); // try the next when we're done
id = ip->out();
goto CheckAndLoop;
case kInstNop:
if (!ip->last())
Push(id+1, p); // try the next when we're done
id = ip->out();
CheckAndLoop:
// Sanity check: id is the head of its list, which must
// be the case if id-1 is the last of *its* list. :)
DCHECK(id == 0 || prog_->inst(id-1)->last());
if (ShouldVisit(id, p))
goto Loop;
break;
case kInstMatch: {
if (endmatch_ && p != end)
goto Next;
// We found a match. If the caller doesn't care
// where the match is, no point going further.
if (nsubmatch_ == 0)
return true;
// Record best match so far.
// Only need to check end point, because this entire
// call is only considering one start position.
matched = true;
cap_[1] = p;
if (submatch_[0].data() == NULL ||
(longest_ && p > submatch_[0].end())) {
// Registers 2*i and 2*i+1 hold the start and end of submatch i.
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] =
StringPiece(cap_[2 * i],
static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
}
// If going for first match, we're done.
if (!longest_)
return true;
// If we used the entire text, no longer match is possible.
if (p == end)
return true;
// Otherwise, continue on in hope of a longer match.
// Note the absence of the ShouldVisit() check here
// due to execution remaining in the same list.
Next:
if (!ip->last()) {
id++;
goto Loop;
}
break;
}
}
}
return matched;
}
// Search text (within context) for prog_.
//
// text/context:       text to search and the larger text surrounding it;
//                     a context with a NULL begin() means "same as text".
// anchored/longest:   request an anchored and/or leftmost-longest search;
//                     both are also forced on by the program's own anchors.
// submatch/nsubmatch: output array and its length; entries are reset here
//                     and filled in when a match is found.
// Returns whether a match was found.
bool BitState::Search(const StringPiece& text, const StringPiece& context,
                      bool anchored, bool longest,
                      StringPiece* submatch, int nsubmatch) {
  // Search parameters.
  text_ = text;
  context_ = context;
  if (context_.begin() == NULL)
    context_ = text;
  // An anchored program cannot match unless text touches the matching edge
  // of context.
  if (prog_->anchor_start() && context_.begin() != text.begin())
    return false;
  if (prog_->anchor_end() && context_.end() != text.end())
    return false;
  anchored_ = anchored || prog_->anchor_start();
  longest_ = longest || prog_->anchor_end();
  endmatch_ = prog_->anchor_end();
  submatch_ = submatch;
  nsubmatch_ = nsubmatch;
  for (int i = 0; i < nsubmatch_; i++)
    submatch_[i] = StringPiece();

  // Allocate scratch space.
  // One bit per (list, text position) pair, rounded up to whole words.
  int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
  nvisited = (nvisited + VisitedBits-1) / VisitedBits;
  visited_ = PODArray<uint32_t>(nvisited);
  memset(visited_.data(), 0, nvisited*sizeof visited_[0]);

  // At least two registers, because cap_[0] and cap_[1] are always written.
  int ncap = 2*nsubmatch;
  if (ncap < 2)
    ncap = 2;
  cap_ = PODArray<const char*>(ncap);
  memset(cap_.data(), 0, ncap*sizeof cap_[0]);

  // When sizeof(Job) == 16, we start with a nice round 1KiB. :)
  job_ = PODArray<Job>(64);

  // Anchored search must start at text.begin().
  if (anchored_) {
    cap_[0] = text.begin();
    return TrySearch(prog_->start(), text.begin());
  }

  // Unanchored search, starting from each possible text position.
  // Notice that we have to try the empty string at the end of
  // the text, so the loop condition is p <= text.end(), not p < text.end().
  // This looks like it's quadratic in the size of the text,
  // but we are not clearing visited_ between calls to TrySearch,
  // so no work is duplicated and it ends up still being linear.
  for (const char* p = text.begin(); p <= text.end(); p++) {
    // Try to use memchr to find the first byte quickly.
    int fb = prog_->first_byte();
    if (fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
      p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
      if (p == NULL)
        p = text.end();
    }

    cap_[0] = p;
    if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
      return true;

    // The end-of-text attempt was the last one. Leave explicitly rather
    // than letting p++ step beyond one-past-the-end (or do arithmetic on a
    // NULL pointer when text is empty with NULL data).
    if (p == text.end())
      break;
  }
  return false;
}
// Bit-state search.
bool Prog::SearchBitState(const StringPiece& text,
const StringPiece& context,
Anchor anchor,
MatchKind kind,
StringPiece* match,
int nmatch) {
// If full match, we ask for an anchored longest match
// and then check that match[0] == text.
// So make sure match[0] exists.
StringPiece sp0;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch < 1) {
match = &sp0;
nmatch = 1;
}
}
// Run the search.
BitState b(this);
bool anchored = anchor == kAnchored;
bool longest = kind != kFirstMatch;
if (!b.Search(text, context, anchored, longest, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
} // namespace re2

1279
extern/re2/re2/compile.cc vendored Normal file

File diff suppressed because it is too large Load Diff

2151
extern/re2/re2/dfa.cc vendored Normal file

File diff suppressed because it is too large Load Diff

121
extern/re2/re2/filtered_re2.cc vendored Normal file
View File

@ -0,0 +1,121 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/filtered_re2.h"
#include <stddef.h>
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
namespace re2 {
// Creates an empty, not-yet-compiled filter backed by a default-constructed
// PrefilterTree.
FilteredRE2::FilteredRE2()
: compiled_(false),
prefilter_tree_(new PrefilterTree()) {
}
// Creates an empty, not-yet-compiled filter whose PrefilterTree is
// constructed with min_atom_len.
FilteredRE2::FilteredRE2(int min_atom_len)
: compiled_(false),
prefilter_tree_(new PrefilterTree(min_atom_len)) {
}
// Destroys every RE2 stored by Add() and then the prefilter tree.
FilteredRE2::~FilteredRE2() {
  for (RE2* re : re2_vec_) {
    delete re;
  }
  delete prefilter_tree_;
}
// Compiles `pattern` with `options` and, on success, stores the resulting
// RE2 and writes its index to *id. On failure nothing is stored and *id is
// left untouched; the failure is logged when options.log_errors() is set.
// Returns the RE2 error code of the compilation in either case.
RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
                                const RE2::Options& options, int* id) {
  RE2* re = new RE2(pattern, options);
  RE2::ErrorCode code = re->error_code();

  if (!re->ok()) {
    if (options.log_errors()) {
      // Log the pattern text. Streaming `re` itself would print the
      // address of the RE2 object, which tells the reader nothing.
      LOG(ERROR) << "Couldn't compile regular expression, skipping: "
                 << pattern << " due to error " << re->error();
    }
    delete re;
  } else {
    *id = static_cast<int>(re2_vec_.size());
    re2_vec_.push_back(re);
  }

  return code;
}
void FilteredRE2::Compile(std::vector<std::string>* atoms) {
if (compiled_) {
LOG(ERROR) << "Compile called already.";
return;
}
if (re2_vec_.empty()) {
LOG(ERROR) << "Compile called before Add.";
return;
}
for (size_t i = 0; i < re2_vec_.size(); i++) {
Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]);
prefilter_tree_->Add(prefilter);
}
atoms->clear();
prefilter_tree_->Compile(atoms);
compiled_ = true;
}
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
for (size_t i = 0; i < re2_vec_.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[i]))
return static_cast<int>(i);
return -1;
}
// Returns the index of the first regexp that passes the prefilter for the
// given matched `atoms` and also partially matches `text`, or -1 when there
// is none. Compile() must have been called first.
int FilteredRE2::FirstMatch(const StringPiece& text,
                            const std::vector<int>& atoms) const {
  if (!compiled_) {
    LOG(DFATAL) << "FirstMatch called before Compile.";
    return -1;
  }

  // Only regexps that survive the prefilter are actually run.
  std::vector<int> candidates;
  prefilter_tree_->RegexpsGivenStrings(atoms, &candidates);
  for (int candidate : candidates) {
    if (RE2::PartialMatch(text, *re2_vec_[candidate])) {
      return candidate;
    }
  }
  return -1;
}
// Replaces *matching_regexps with the indices of all regexps that pass the
// prefilter for the given matched `atoms` and also partially match `text`.
// Returns true when at least one index was stored.
bool FilteredRE2::AllMatches(
    const StringPiece& text,
    const std::vector<int>& atoms,
    std::vector<int>* matching_regexps) const {
  matching_regexps->clear();

  std::vector<int> candidates;
  prefilter_tree_->RegexpsGivenStrings(atoms, &candidates);
  for (int candidate : candidates) {
    if (RE2::PartialMatch(text, *re2_vec_[candidate])) {
      matching_regexps->push_back(candidate);
    }
  }

  return !matching_regexps->empty();
}
// Forwards atoms to PrefilterTree::RegexpsGivenStrings, which fills
// *potential_regexps.  No regexp is executed here; the result is only the
// set of regexps that pass the prefilter.
void FilteredRE2::AllPotentials(
const std::vector<int>& atoms,
std::vector<int>* potential_regexps) const {
prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps);
}
// Non-const pass-through to PrefilterTree::RegexpsGivenStrings: given the
// indices of the matched atoms, fills *passed_regexps.
void FilteredRE2::RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* passed_regexps) {
prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
}
// Pass-through to PrefilterTree::PrintPrefilter for the regexp with the
// given id.
void FilteredRE2::PrintPrefilter(int regexpid) {
prefilter_tree_->PrintPrefilter(regexpid);
}
} // namespace re2

109
extern/re2/re2/filtered_re2.h vendored Normal file
View File

@ -0,0 +1,109 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_FILTERED_RE2_H_
#define RE2_FILTERED_RE2_H_
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
// It provides a prefilter mechanism that helps in cutting down the
// number of regexps that need to be actually searched.
//
// By design, it does not include a string matching engine. This is to
// allow the user of the class to use their favorite string match
// engine. The overall flow is: Add all the regexps using Add, then
// Compile the FilteredRE2. The compile returns strings that need to
// be matched. Note that all returned strings are lowercase. For
// applying regexps to a search text, the caller does the string
// matching using the strings returned. When doing the string match,
// note that the caller has to do that on lower cased version of the
// search text. Then call FirstMatch or AllMatches with a vector of
// indices of strings that were found in the text to get the actual
// regexp matches.
#include <string>
#include <vector>
#include "re2/re2.h"
namespace re2 {
class PrefilterTree;
// Holds a set of RE2 regexps together with a PrefilterTree that is used to
// narrow down which of them need to be run against a text.  The RE2
// objects and the tree are owned through raw pointers and released by the
// destructor, so the class is non-copyable.
class FilteredRE2 {
public:
// Creates an empty, uncompiled set.  The int overload passes
// min_atom_len on to the PrefilterTree constructor.
FilteredRE2();
explicit FilteredRE2(int min_atom_len);
~FilteredRE2();
// Uses RE2 constructor to create a RE2 object (re). Returns
// re->error_code(). If error_code is other than NoError, then re is
// deleted and not added to re2_vec_.
// On success, *id receives the index of the newly added regexp.
RE2::ErrorCode Add(const StringPiece& pattern,
const RE2::Options& options,
int *id);
// Prepares the regexps added by Add for filtering. Returns a set
// of strings that the caller should check for in candidate texts.
// The returned strings are lowercased. When doing string matching,
// the search text should be lowercased first to find matching
// strings from the set of strings returned by Compile. Call after
// all Add calls are done.
void Compile(std::vector<std::string>* strings_to_match);
// Returns the index of the first matching regexp.
// Returns -1 on no match. Can be called prior to Compile.
// Does not do any filtering: simply tries to Match the
// regexps in a loop.
int SlowFirstMatch(const StringPiece& text) const;
// Returns the index of the first matching regexp.
// Returns -1 on no match. Compile has to be called before
// calling this.
int FirstMatch(const StringPiece& text,
const std::vector<int>& atoms) const;
// Returns the indices of all matching regexps, after first clearing
// matching_regexps.  The return value is true if at least one regexp
// matched.
bool AllMatches(const StringPiece& text,
const std::vector<int>& atoms,
std::vector<int>* matching_regexps) const;
// Returns the indices of all potentially matching regexps after first
// clearing potential_regexps.
// A regexp is potentially matching if it passes the filter.
// If a regexp passes the filter it may still not match.
// A regexp that does not pass the filter is guaranteed to not match.
void AllPotentials(const std::vector<int>& atoms,
std::vector<int>* potential_regexps) const;
// The number of regexps added.
int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
// Get the individual RE2 objects.
// regexpid is used as a direct index into re2_vec_ without bounds checking.
const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; }
private:
// Print prefilter.
void PrintPrefilter(int regexpid);
// Useful for testing and debugging.
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* passed_regexps);
// All the regexps in the FilteredRE2.
std::vector<RE2*> re2_vec_;
// Has the FilteredRE2 been compiled using Compile()
bool compiled_;
// An AND-OR tree of string atoms used for filtering regexps.
PrefilterTree* prefilter_tree_;
// Copying is disabled: the members above are raw owning pointers.
FilteredRE2(const FilteredRE2&) = delete;
FilteredRE2& operator=(const FilteredRE2&) = delete;
};
} // namespace re2
#endif // RE2_FILTERED_RE2_H_

173
extern/re2/re2/fuzzing/re2_fuzzer.cc vendored Normal file
View File

@ -0,0 +1,173 @@
// Copyright 2016 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stddef.h>
#include <stdint.h>
#include <map>
#include <memory>
#include <queue>
#include <string>
#include "re2/prefilter.h"
#include "re2/re2.h"
using re2::StringPiece;
// NOT static, NOT signed.
// Global sink: Test() adds the sizes returned by a few extra API calls to
// it so that those calls produce an observable result.
// NOTE(review): presumably non-static so the compiler cannot prove the
// value unused, and unsigned so that wraparound is well defined -- confirm.
uint8_t dummy = 0;
// Compiles pattern with options and, unless the resulting program is
// rejected by one of the cost limits below, exercises the RE2 matching,
// replacement and introspection APIs against text.  Returns silently when
// the pattern does not compile or a limit is exceeded; all match results
// are discarded, since the goal is only to execute the code paths.
void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) {
RE2 re(pattern, options);
if (!re.ok())
return;
// Don't waste time fuzzing programs with large substrings.
// They can cause bug reports due to fuzzer timeouts when they
// are repetitions (e.g. hundreds of NUL bytes) and matching is
// unanchored. And they aren't interesting for fuzzing purposes.
std::unique_ptr<re2::Prefilter> prefilter(re2::Prefilter::FromRE2(&re));
if (prefilter == nullptr)
return;
// Breadth-first walk over the prefilter tree; bail out as soon as any
// ATOM longer than 9 bytes is found.
std::queue<re2::Prefilter*> nodes;
nodes.push(prefilter.get());
while (!nodes.empty()) {
re2::Prefilter* node = nodes.front();
nodes.pop();
if (node->op() == re2::Prefilter::ATOM) {
if (node->atom().size() > 9)
return;
} else if (node->op() == re2::Prefilter::AND ||
node->op() == re2::Prefilter::OR) {
for (re2::Prefilter* sub : *node->subs())
nodes.push(sub);
}
}
// Don't waste time fuzzing high-size programs.
// They can cause bug reports due to fuzzer timeouts.
int size = re.ProgramSize();
if (size > 9999)
return;
int rsize = re.ReverseProgramSize();
if (rsize > 9999)
return;
// Don't waste time fuzzing high-fanout programs.
// They can cause bug reports due to fuzzer timeouts.
// The histogram contents are not inspected; only the returned fanout is.
std::map<int, int> histogram;
int fanout = re.ProgramFanout(&histogram);
if (fanout > 9)
return;
int rfanout = re.ReverseProgramFanout(&histogram);
if (rfanout > 9)
return;
if (re.NumberOfCapturingGroups() == 0) {
// Avoid early return due to too many arguments.
StringPiece sp = text;
RE2::FullMatch(sp, re);
RE2::PartialMatch(sp, re);
RE2::Consume(&sp, re);
sp = text; // Reset.
RE2::FindAndConsume(&sp, re);
} else {
// Okay, we have at least one capturing group...
// Try conversion for variously typed arguments.
StringPiece sp = text;
short s;
RE2::FullMatch(sp, re, &s);
long l;
RE2::PartialMatch(sp, re, &l);
float f;
RE2::Consume(&sp, re, &f);
sp = text; // Reset.
double d;
RE2::FindAndConsume(&sp, re, &d);
}
// Replacement APIs mutate their first argument, so each call gets a
// fresh copy of text.
std::string s = std::string(text);
RE2::Replace(&s, re, "");
s = std::string(text); // Reset.
RE2::GlobalReplace(&s, re, "");
std::string min, max;
re.PossibleMatchRange(&min, &max, /*maxlen=*/9);
// Exercise some other API functionality.
dummy += re.NamedCapturingGroups().size();
dummy += re.CapturingGroupNames().size();
dummy += RE2::QuoteMeta(pattern).size();
}
// Entry point for libFuzzer.
// Rejects empty, oversized (> 999 bytes) and character-class-heavy inputs,
// derives a set of RE2 options from a hash of the input bytes, and then
// calls Test() using the input as both pattern and text.  Always returns 0.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
  if (size == 0 || size > 999)
    return 0;

  // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
  // Otherwise, we will waste time on inputs that have long runs of various
  // character classes. The fuzzer has shown itself to be easily capable of
  // generating such patterns that fall within the other limits, but result
  // in timeouts nonetheless. The marginal cost is high - even more so when
  // counted repetition is involved - whereas the marginal benefit is zero.
  // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain.
  int char_class = 0;
  int backslash_p = 0;  // very expensive, so handle specially
  size_t pos = 0;
  while (pos < size) {
    const uint8_t cur = data[pos++];
    if (cur == '.')
      char_class++;
    if (cur != '\\')
      continue;
    // A trailing backslash has no escaped byte to look at.
    if (pos >= size)
      break;
    // Consume the escaped byte so it is not examined again.
    switch (data[pos++]) {
      case 'p':
      case 'P':
        backslash_p++;
        char_class++;
        break;
      case 'd':
      case 'D':
      case 's':
      case 'S':
      case 'w':
      case 'W':
        char_class++;
        break;
      default:
        break;
    }
  }
  if (char_class > 9 || backslash_p > 1)
    return 0;

  // The one-at-a-time hash by Bob Jenkins.
  uint32_t hash = 0;
  for (const uint8_t* byte = data; byte != data + size; ++byte) {
    hash += *byte;
    hash += (hash << 10);
    hash ^= (hash >> 6);
  }
  hash += (hash << 3);
  hash ^= (hash >> 11);
  hash += (hash << 15);

  // Each option is driven by one bit of the hash, so the choice of options
  // is a deterministic function of the input bytes.
  RE2::Options options;
  options.set_log_errors(false);
  options.set_max_mem(64 << 20);
  options.set_encoding((hash & 1) ? RE2::Options::EncodingLatin1
                                  : RE2::Options::EncodingUTF8);
  options.set_posix_syntax(hash & 2);
  options.set_longest_match(hash & 4);
  options.set_literal(hash & 8);
  options.set_never_nl(hash & 16);
  options.set_dot_nl(hash & 32);
  options.set_never_capture(hash & 64);
  options.set_case_sensitive(hash & 128);
  options.set_perl_classes(hash & 256);
  options.set_word_boundary(hash & 512);
  options.set_one_line(hash & 1024);

  const char* bytes = reinterpret_cast<const char*>(data);
  const int nbytes = static_cast<int>(size);
  StringPiece pattern(bytes, nbytes);
  StringPiece text(bytes, nbytes);
  Test(pattern, options, text);

  return 0;
}

116
extern/re2/re2/make_perl_groups.pl vendored Normal file
View File

@ -0,0 +1,116 @@
#!/usr/bin/perl
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Generate table entries giving character ranges
# for POSIX/Perl character classes. Rather than
# figure out what the definition is, it is easier to ask
# Perl about each letter from 0-128 and write down
# its answer.
# POSIX class names.  ComputeClass wraps each name in [...] to build the
# regexp, so "[:alnum:]" is tested as [[:alnum:]].
@posixclasses = (
"[:alnum:]",
"[:alpha:]",
"[:ascii:]",
"[:blank:]",
"[:cntrl:]",
"[:digit:]",
"[:graph:]",
"[:lower:]",
"[:print:]",
"[:punct:]",
"[:space:]",
"[:upper:]",
"[:word:]",
"[:xdigit:]",
);
# Perl escape classes; the doubled backslash yields a single literal
# backslash in each string.
@perlclasses = (
"\\d",
"\\s",
"\\w",
);
# Forced answers, keyed by "<class name>:<code point>".  When a key is
# present, ComputeClass uses its value instead of asking Perl.
%overrides = (
# Prior to Perl 5.18, \s did not match vertical tab.
# RE2 preserves that original behaviour.
"\\s:11" => 0,
);
# Returns a list of [lo, hi] array refs: the maximal runs of code points in
# 0..128 for which the class $cname matches, after applying %overrides.
sub ComputeClass($) {
my ($cname) = @_;
my @ranges;
my $regexp = qr/[$cname]/;
# Start of the run currently being accumulated; -1 when no run is open.
my $start = -1;
for (my $i=0; $i<=129; $i++) {
# Sentinel pass: after 128 jump to 256 so the test below fails and any run
# still open is closed with an upper bound of 255 ($i-1).  The subsequent
# $i++ also ends the loop.
if ($i == 129) { $i = 256; }
# `=~` binds tighter than `//`, so a defined override value replaces the
# regexp test entirely.
if ($i <= 128 && ($overrides{"$cname:$i"} // chr($i) =~ $regexp)) {
if ($start < 0) {
$start = $i;
}
} else {
if ($start >= 0) {
push @ranges, [$start, $i-1];
}
$start = -1;
}
}
return @ranges;
}
# Prints the C++ table `code<cnum>` holding @ranges (each an array ref
# [lo, hi]) for the class $cname, and returns two UGroup initializer
# strings that refer to it: one for the class itself (+1) and one for its
# negation (-1).  The negated name is "[:^name:]" for POSIX classes and the
# upper-cased escape (e.g. \D) for Perl classes.
sub PrintClass($$@) {
  my ($cnum, $cname, @ranges) = @_;
  print "static const URange16 code${cnum}[] = { /* $cname */\n";
  for (my $i=0; $i<@ranges; $i++) {
    my @a = @{$ranges[$i]};
    printf "\t{ 0x%x, 0x%x },\n", $a[0], $a[1];
  }
  print "};\n";
  my $n = @ranges;
  # Double every backslash so the name survives inside a C string literal.
  my $escname = $cname;
  $escname =~ s/\\/\\\\/g;
  # Declared lexically so it does not create or clobber a package global.
  my $negname = $escname;
  if ($negname =~ /:/) {
    $negname =~ s/:/:^/;
  } else {
    $negname =~ y/a-z/A-Z/;
  }
  return "{ \"$escname\", +1, code$cnum, $n }", "{ \"$negname\", -1, code$cnum, $n }";
}
# Running table number, shared by all PrintClasses calls so that every
# generated `codeN` array gets a distinct name.
my $cnum = 0;
# Emits the range table for every class in @classes (via ComputeClass and
# PrintClass), followed by the `<pname>_groups` array built from the
# entries PrintClass returned and the `num_<pname>_groups` count.
sub PrintClasses($@) {
my ($pname, @classes) = @_;
my @entries;
foreach my $cname (@classes) {
my @ranges = ComputeClass($cname);
# PrintClass returns two entries (positive and negated) per class.
push @entries, PrintClass(++$cnum, $cname, @ranges);
}
print "const UGroup ${pname}_groups[] = {\n";
foreach my $e (@entries) {
print "\t$e,\n";
}
print "};\n";
my $count = @entries;
print "const int num_${pname}_groups = $count;\n";
}
# Script body: write the generated-file banner, include and namespace
# opener, then the Perl and POSIX group tables, then close the namespace.
# Everything goes to standard output.
print <<EOF;
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
EOF
PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);
print <<EOF;
} // namespace re2
EOF

151
extern/re2/re2/make_unicode_casefold.py vendored Normal file
View File

@ -0,0 +1,151 @@
#!/usr/bin/python
# coding=utf-8
#
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# See unicode_casefold.h for description of case folding tables.
"""Generate C++ table for Unicode case folding."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import unicode
_header = """
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
// make_unicode_casefold.py >unicode_casefold.cc
#include "re2/unicode_casefold.h"
namespace re2 {
"""
_trailer = """
} // namespace re2
"""
def _Delta(a, b):
"""Compute the delta for b - a. Even/odd and odd/even
are handled specially, as described above."""
if a+1 == b:
if a%2 == 0:
return 'EvenOdd'
else:
return 'OddEven'
if a == b+1:
if a%2 == 0:
return 'OddEven'
else:
return 'EvenOdd'
return b - a
def _AddDelta(a, delta):
"""Return a + delta, handling EvenOdd and OddEven specially."""
if type(delta) == int:
return a+delta
if delta == 'EvenOdd':
if a%2 == 0:
return a+1
else:
return a-1
if delta == 'OddEven':
if a%2 == 1:
return a+1
else:
return a-1
print("Bad Delta:", delta, file=sys.stderr)
raise unicode.Error("Bad Delta")
def _MakeRanges(pairs):
  """Collapse (code point, folded code point) pairs into ranges.

  A list like [(65,97), (66, 98), ..., (90,122)] becomes [[65, 90, 32]].
  Each range is a list [lo, hi, delta]; delta is an int, 'EvenOdd',
  'OddEven', or one of the latter two with a 'Skip' suffix when the range
  covers every second code point.
  """
  ranges = []
  last = -100

  def extends_run(prev, lo, hi, rng):
    # lo directly follows the previous code point and maps according to
    # the delta of the range: grow the range by one.
    if lo == prev + 1 and hi == _AddDelta(lo, rng[2]):
      rng[1] = lo
      return True
    return False

  def extends_skip_run(prev, lo, hi, rng):
    # lo is two past the previous code point: the range can only grow as
    # an every-other-code-point ('...Skip') range with a string delta.
    if lo != prev + 2:
      return False
    delta = rng[2]
    if type(delta) is not str:
      return False
    if delta.endswith('Skip'):
      base, skip_delta = delta[:-4], delta
    else:
      base, skip_delta = delta, delta + 'Skip'
    if hi != _AddDelta(lo, base):
      return False
    rng[1] = lo
    rng[2] = skip_delta
    return True

  for a, b in pairs:
    extended = ranges and (extends_run(last, a, b, ranges[-1]) or
                           extends_skip_run(last, a, b, ranges[-1]))
    if not extended:
      ranges.append([a, a, _Delta(a, b)])
    last = a
  return ranges
# The maximum size of a case-folding group.
# Case folding is implemented in parse.cc by a recursive process
# with a recursion depth equal to the size of the largest
# case-folding group, so it is important that this bound be small.
# The current tables have no group bigger than 4.
# If there are ever groups bigger than 10 or so, it will be
# time to rework the code in parse.cc.
MaxCasefoldGroup = 4
def main():
  """Print the C++ case-folding and to-lower tables to standard output."""
  lowergroups, casegroups = unicode.CaseGroups()

  # Each group is treated as a cycle: every member folds to the next one,
  # and index -1 pairs the last member with the first.
  foldpairs = []
  seen = {}
  for c in casegroups:
    if len(c) > MaxCasefoldGroup:
      raise unicode.Error("casefold group too long: %s" % (c,))
    for i, cur in enumerate(c):
      prev = c[i-1]
      if prev in seen:
        raise unicode.Error("bad casegroups %d -> %d" % (prev, cur))
      seen[prev] = True
      foldpairs.append([prev, cur])

  lowerpairs = [[g, lower]
                for lower, group in lowergroups.items()
                for g in group
                if g != lower]

  def printpairs(name, pairs):
    # Sorts pairs in place, then prints them as the table unicode_<name>.
    pairs.sort()
    table = _MakeRanges(pairs)
    print("// %d groups, %d pairs, %d ranges" % (len(casegroups), len(pairs), len(table)))
    print("const CaseFold unicode_%s[] = {" % (name,))
    for lo, hi, delta in table:
      print("\t{ %d, %d, %s }," % (lo, hi, delta))
    print("};")
    print("const int num_unicode_%s = %d;" % (name, len(table)))
    print("")

  print(_header)
  printpairs("casefold", foldpairs)
  printpairs("tolower", lowerpairs)
  print(_trailer)
if __name__ == '__main__':
main()

117
extern/re2/re2/make_unicode_groups.py vendored Normal file
View File

@ -0,0 +1,117 @@
#!/usr/bin/python
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
"""Generate C++ tables for Unicode Script and Category groups."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import unicode
_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
"""
_trailer = """
} // namespace re2
"""
n16 = 0
n32 = 0
def MakeRanges(codes):
  """Group consecutive codes into [lo, hi] ranges.

  For example [1,2,3,7,8,9] becomes [[1,3], [7,9]].  A code extends the
  current range only when it is exactly one more than the previous code.
  """
  ranges = []
  prev = -100  # sentinel used before the first code has been seen
  for code in codes:
    if code != prev + 1:
      ranges.append([code, code])
    else:
      ranges[-1][1] = code
    prev = code
  return ranges
def PrintRanges(type, name, ranges):
  """Print ranges as a static const C++ array of element type `type`.

  The array is called `name`; every (lo, hi) pair becomes one
  tab-indented initializer line.
  """
  lines = ["static const %s %s[] = {" % (type, name)]
  lines.extend("\t{ %d, %d }," % (lo, hi) for lo, hi in ranges)
  lines.append("};")
  print("\n".join(lines))
# def PrintCodes(type, name, codes):
# """Print the codes as an array of type named name."""
# print("static %s %s[] = {" % (type, name))
# for c in codes:
# print("\t%d," % (c,))
# print("};")
def PrintGroup(name, codes):
  """Print the range tables for one group and return its UGroup literal.

  codes below 65536 go into a URange16 table and the rest into a URange32
  table; an empty table is not printed and is written as "0, 0" in the
  literal.  The global counters n16 and n32 are increased by the number
  of ranges produced.
  """
  global n16
  global n32

  # See unicode_groups.h for a description of the data structure.
  # Split codes into 16-bit ranges and 32-bit ranges.
  range16 = MakeRanges([c for c in codes if c < 65536])
  range32 = MakeRanges([c for c in codes if c >= 65536])

  # Disabled: pull singleton ranges out of range16 into a code16 table.
  # code16 = [lo for lo, hi in range16 if lo == hi]
  # range16 = [[lo, hi] for lo, hi in range16 if lo != hi]

  n16 += len(range16)
  n32 += len(range32)

  fields = ["\"%s\", +1" % (name,)]
  tables = (("URange16", "_range16", range16),
            ("URange32", "_range32", range32))
  for ctype, suffix, ranges in tables:
    if ranges:
      PrintRanges(ctype, name + suffix, ranges)
      fields.append("%s%s, %d" % (name, suffix, len(ranges)))
    else:
      fields.append("0, 0")
  return "{ " + ", ".join(fields) + " }"
def main():
  """Print the C++ tables for all Unicode categories and scripts."""
  categories = unicode.Categories()
  scripts = unicode.Scripts()
  print(_header)

  # Categories first, then scripts, each in sorted name order.  PrintGroup
  # prints the range tables as a side effect and returns the group literal.
  ugroups = []
  for table in (categories, scripts):
    for name in sorted(table):
      ugroups.append(PrintGroup(name, table[name]))

  print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
  print("const UGroup unicode_groups[] = {")
  for ug in sorted(ugroups):
    print("\t%s," % (ug,))
  print("};")
  print("const int num_unicode_groups = %d;" % (len(ugroups),))
  print(_trailer)
if __name__ == '__main__':
main()

187
extern/re2/re2/mimics_pcre.cc vendored Normal file
View File

@ -0,0 +1,187 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Determine whether this library should match PCRE exactly
// for a particular Regexp. (If so, the testing framework can
// check that it does.)
//
// This library matches PCRE except in these cases:
// * the regexp contains a repetition of an empty string,
// like (a*)* or (a*)+. In this case, PCRE will treat
// the repetition sequence as ending with an empty string,
// while this library does not.
// * Perl and PCRE differ on whether \v matches \n.
// For historical reasons, this library implements the Perl behavior.
// * Perl and PCRE allow $ in one-line mode to match either the very
// end of the text or just before a \n at the end of the text.
// This library requires it to match only the end of the text.
// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
// match the end of the text if the last character is a \n.
// This library does allow it.
//
// Regexp::MimicsPCRE checks for any of these conditions.
#include "util/util.h"
#include "util/logging.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Returns whether re might match an empty string.
static bool CanBeEmptyString(Regexp *re);
// Walker class to compute whether library handles a regexp
// exactly as PCRE would. See comment at top for conditions.
// Walker that computes, bottom-up, whether this library handles a regexp
// exactly as PCRE would.  The bool carried through the walk is "mimics
// PCRE so far"; see PostVisit for the individual conditions.
class PCREWalker : public Regexp::Walker<bool> {
 public:
  PCREWalker() {}
  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
                 int nchild_args);
  bool ShortVisit(Regexp* re, bool a) {
    // Should never be called: we use Walk not WalkExponential.
    // The message names this class so that a failure is attributed to the
    // right walker.
    LOG(DFATAL) << "PCREWalker::ShortVisit called";
    return a;
  }

 private:
  // Non-copyable, like EmptyStringWalker below.
  PCREWalker(const PCREWalker&) = delete;
  PCREWalker& operator=(const PCREWalker&) = delete;
};
// Called after visiting each of re's children and accumulating
// the return values in child_args. So child_args contains whether
// this library mimics PCRE for those subexpressions.
// Returns true if re, given that child_args holds the verdicts for its
// subexpressions, is handled the same way PCRE would handle it.
// parent_arg and pre_arg are not used.
bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args) {
// If children failed, so do we.
for (int i = 0; i < nchild_args; i++)
if (!child_args[i])
return false;
// Otherwise look for other reasons to fail.
switch (re->op()) {
// Look for repeated empty string.
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
if (CanBeEmptyString(re->sub()[0]))
return false;
break;
case kRegexpRepeat:
// Only repeats with max() == -1 are checked here.
if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
return false;
break;
// Look for \v
case kRegexpLiteral:
if (re->rune() == '\v')
return false;
break;
// Look for $ in single-line mode.
case kRegexpEndText:
case kRegexpEmptyMatch:
// The WasDollar parse flag marks nodes that were written as $.
if (re->parse_flags() & Regexp::WasDollar)
return false;
break;
// Look for ^ in multi-line mode.
case kRegexpBeginLine:
// No condition: in single-line mode ^ becomes kRegexpBeginText.
return false;
default:
break;
}
// Not proven guilty.
return true;
}
// Returns whether this regexp's behavior will mimic PCRE's exactly.
// Runs a PCREWalker over this regexp, starting from an initial verdict of
// true, and returns the walker's result.
bool Regexp::MimicsPCRE() {
PCREWalker w;
return w.Walk(this, true);
}
// Walker class to compute whether a Regexp can match an empty string.
// It is okay to overestimate. For example, \b\B cannot match an empty
// string, because \b and \B are mutually exclusive, but this isn't
// that smart and will say it can. Spurious empty strings
// will reduce the number of regexps we sanity check against PCRE,
// but they won't break anything.
// Walker whose bool result means "this subexpression can match the empty
// string".  The answer may be an overestimate (see the comment above).
class EmptyStringWalker : public Regexp::Walker<bool> {
public:
EmptyStringWalker() { }
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args);
bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
return a;
}
private:
// Non-copyable.
EmptyStringWalker(const EmptyStringWalker&) = delete;
EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
};
// Called after visiting re's children. child_args contains the return
// value from each of the children's PostVisits (i.e., whether each child
// can match an empty string). Returns whether this clause can match an
// empty string.
// Combines the children's answers (child_args[i] is true when child i can
// match the empty string) into the answer for re itself.  parent_arg and
// pre_arg are not used.  Operators not listed in the switch yield false.
bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                                  bool* child_args, int nchild_args) {
  switch (re->op()) {
    // Never empty.
    case kRegexpNoMatch:
    case kRegexpLiteral:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpCharClass:
    case kRegexpLiteralString:
      return false;

    // Always empty (assertions are empty whenever they match), or able to
    // match zero repetitions.
    case kRegexpEmptyMatch:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpNoWordBoundary:
    case kRegexpWordBoundary:
    case kRegexpBeginText:
    case kRegexpEndText:
    case kRegexpStar:
    case kRegexpQuest:
    case kRegexpHaveMatch:
      return true;

    // A concatenation is empty only if every child can be.
    case kRegexpConcat: {
      bool all_empty = true;
      for (int i = 0; all_empty && i < nchild_args; i++)
        all_empty = child_args[i];
      return all_empty;
    }

    // An alternation is empty if any child can be.
    case kRegexpAlternate: {
      bool any_empty = false;
      for (int i = 0; !any_empty && i < nchild_args; i++)
        any_empty = child_args[i];
      return any_empty;
    }

    // These are empty exactly when their single child can be.
    case kRegexpPlus:
    case kRegexpCapture:
      return child_args[0];

    // A repeat is empty if the child can be, or if zero copies are allowed.
    case kRegexpRepeat:
      return child_args[0] || re->min() == 0;
  }
  return false;
}
// Returns whether re can match an empty string.
// Runs an EmptyStringWalker over re, starting from an initial value of
// true, and returns the walker's result.
static bool CanBeEmptyString(Regexp* re) {
EmptyStringWalker w;
return w.Walk(re, true);
}
} // namespace re2

757
extern/re2/re2/nfa.cc vendored Normal file
View File

@ -0,0 +1,757 @@
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc.
//
// Prog::SearchNFA, an NFA search.
// This is an actual NFA like the theorists talk about,
// not the pseudo-NFA found in backtracking regexp implementations.
//
// IMPLEMENTATION
//
// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
// which is a variant of the one described in Thompson's 1968 CACM paper.
// See http://swtch.com/~rsc/regexp/ for various history. The main feature
// over the DFA implementation is that it tracks submatch boundaries.
//
// When the choice of submatch boundaries is ambiguous, this particular
// implementation makes the same choices that traditional backtracking
// implementations (in particular, Perl and PCRE) do.
// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
// time in the length of the input.
//
// Like Thompson's original machine and like the DFA implementation, this
// implementation notices a match only once it is one byte past it.
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
#include "re2/prog.h"
#include "re2/regexp.h"
#include "util/logging.h"
#include "util/pod_array.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
#include "util/strutil.h"
namespace re2 {
static const bool ExtraDebug = false;
// State for one NFA search over a Prog: the thread queues, the explicit
// stack used by AddToThreadq, a free list of recycled Thread objects and
// the best match found so far.  Non-copyable.
class NFA {
public:
NFA(Prog* prog);
~NFA();
// Searches for a matching string.
// * If anchored is true, only considers matches starting at offset.
// Otherwise finds leftmost match at or after offset.
// * If longest is true, returns the longest match starting
// at the chosen start point. Otherwise returns the so-called
// left-biased match, the one traditional backtracking engines
// (like Perl and PCRE) find.
// Records submatch boundaries in submatch[1..nsubmatch-1].
// Submatch[0] is the entire match. When there is a choice in
// which text matches each subexpression, the submatch boundaries
// are chosen to match what a backtracking implementation would choose.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
private:
// A reference-counted set of capture positions.  ref and next share
// storage: ref is the count while the thread is live, next links the
// thread into free_threads_ once it has been released.
struct Thread {
union {
int ref;
Thread* next; // when on free list
};
const char** capture;
};
// State for explicit stack in AddToThreadq.
struct AddState {
int id; // Inst to process
Thread* t; // if not null, set t0 = t before processing id
};
// Threadq is a list of threads. The list is sorted by the order
// in which Perl would explore that particular state -- the earlier
// choices appear earlier in the list.
typedef SparseArray<Thread*> Threadq;
// Thread bookkeeping: AllocThread returns a thread with ref == 1 (reusing
// one from the free list when possible), Incref adds a reference, and
// Decref drops one, returning the thread to the free list at zero.
inline Thread* AllocThread();
inline Thread* Incref(Thread* t);
inline void Decref(Thread* t);
// Follows all empty arrows from id0 and enqueues all the states reached.
// Enqueues only the ByteRange instructions that match byte c.
// context is used (with p) for evaluating empty-width specials.
// p is the current input position, and t0 is the current thread.
void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
const char* p, Thread* t0);
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
// context is used (with p) for evaluating empty-width specials.
// p is the position of byte c in the input string for AddToThreadq;
// p-1 will be used when processing Match instructions.
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
const char* p);
// Returns text version of capture information, for debugging.
std::string FormatCapture(const char** capture);
// Copies ncapture_ capture pointers from src to dst.
inline void CopyCapture(const char** dst, const char** src);
Prog* prog_; // underlying program
int start_; // start instruction in program
int ncapture_; // number of submatches to track
bool longest_; // whether searching for longest match
bool endmatch_; // whether match must end at text.end()
const char* btext_; // beginning of text being matched (for FormatSubmatch)
const char* etext_; // end of text being matched (for endmatch_)
Threadq q0_, q1_; // pre-allocated for Search.
PODArray<AddState> stack_; // pre-allocated for AddToThreadq
Thread* free_threads_; // free list
const char** match_; // best match so far
bool matched_; // any match so far?
NFA(const NFA&) = delete;
NFA& operator=(const NFA&) = delete;
};
// Prepares an NFA for prog: records the start instruction, sizes both
// thread queues to the program size and pre-allocates the explicit stack
// used by AddToThreadq.  The per-search fields start out zeroed/null.
NFA::NFA(Prog* prog)
    : prog_(prog),
      start_(prog->start()),
      ncapture_(0),
      longest_(false),
      endmatch_(false),
      btext_(NULL),
      etext_(NULL),
      free_threads_(NULL),
      match_(NULL),
      matched_(false) {
  q0_.resize(prog_->size());
  q1_.resize(prog_->size());

  // See NFA::AddToThreadq() for why this is so.
  int nstack = 2*prog_->inst_count(kInstCapture) +
               prog_->inst_count(kInstEmptyWidth) +
               prog_->inst_count(kInstNop) + 1;  // + 1 for start inst
  stack_ = PODArray<AddState>(nstack);
}
// Frees the best-match buffer and every Thread on the free list, together
// with each thread's capture array.
NFA::~NFA() {
  delete[] match_;

  Thread* t = free_threads_;
  while (t != NULL) {
    // Save the link first: it lives inside the object being deleted.
    Thread* following = t->next;
    delete[] t->capture;
    delete t;
    t = following;
  }
}
// Returns a Thread with a reference count of 1.  A thread from the free
// list is reused (keeping its capture array) when one is available;
// otherwise a new one is allocated with room for ncapture_ pointers.
NFA::Thread* NFA::AllocThread() {
  Thread* t = free_threads_;
  if (t != NULL) {
    // next shares storage with ref, so unlink before ref is written.
    free_threads_ = t->next;
  } else {
    t = new Thread;
    t->capture = new const char*[ncapture_];
  }
  t->ref = 1;
  return t;
}
// Adds one reference to t and returns t, so that the call can be used
// inline in an assignment.  t must not be NULL (checked in debug builds).
NFA::Thread* NFA::Incref(Thread* t) {
DCHECK(t != NULL);
t->ref++;
return t;
}
// Drops one reference to t; a NULL t is ignored.  When the count reaches
// zero the thread is pushed onto free_threads_ for reuse by AllocThread
// rather than being deleted.
void NFA::Decref(Thread* t) {
if (t == NULL)
return;
t->ref--;
if (t->ref > 0)
return;
DCHECK_EQ(t->ref, 0);
// ref and next share storage, so writing next here overwrites the count.
t->next = free_threads_;
free_threads_ = t;
}
// Copies the ncapture_ capture pointers in src to dst.
// A single memmove copies exactly ncapture_ entries; the previous
// pair-at-a-time loop would have touched one entry past the end had
// ncapture_ ever been odd.
void NFA::CopyCapture(const char** dst, const char** src) {
  memmove(dst, src, ncapture_*sizeof src[0]);
}
// Follows all empty arrows from id0 and enqueues all the states reached.
// Enqueues only the ByteRange instructions that match byte c.
// context is used (with p) for evaluating empty-width specials.
// p is the current input position, and t0 is the current thread.
void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
const char* p, Thread* t0) {
// Nothing to do for instruction id 0.
if (id0 == 0)
return;
// Use stack_ to hold our stack of instructions yet to process.
// It was preallocated as follows:
// two entries per Capture;
// one entry per EmptyWidth; and
// one entry per Nop.
// This reflects the maximum number of stack pushes that each can
// perform. (Each instruction can be processed at most once.)
AddState* stk = stack_.data();
int nstk = 0;
stk[nstk++] = {id0, NULL};
while (nstk > 0) {
DCHECK_LE(nstk, stack_.size());
AddState a = stk[--nstk];
// Re-entry point: cases below that have exactly one instruction to follow
// next set `a` and jump here instead of pushing and immediately popping.
Loop:
if (a.t != NULL) {
// t0 was a thread that we allocated and copied in order to
// record the capture, so we must now decref it.
Decref(t0);
t0 = a.t;
}
int id = a.id;
// id 0 is what the dummy "restore t0" entries carry: there is nothing
// further to process once t0 has been restored above.
if (id == 0)
continue;
// Already queued during this pass: do not process the instruction again.
if (q->has_index(id)) {
if (ExtraDebug)
fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str());
continue;
}
// Create entry in q no matter what. We might fill it in below,
// or we might not. Even if not, it is necessary to have it,
// so that we don't revisit id0 during the recursion.
q->set_new(id, NULL);
Thread** tp = &q->get_existing(id);
int j;
Thread* t;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
break;
case kInstFail:
break;
case kInstAltMatch:
// Save state; will pick up at next byte.
t = Incref(t0);
*tp = t;
DCHECK(!ip->last());
a = {id+1, NULL};
goto Loop;
case kInstNop:
// Unless this is the last instruction of its list, remember to come
// back for the following one (id+1).
if (!ip->last())
stk[nstk++] = {id+1, NULL};
// Continue on.
a = {ip->out(), NULL};
goto Loop;
case kInstCapture:
if (!ip->last())
stk[nstk++] = {id+1, NULL};
// Capture indices at or beyond ncapture_ are not recorded.
if ((j=ip->cap()) < ncapture_) {
// Push a dummy whose only job is to restore t0
// once we finish exploring this possibility.
stk[nstk++] = {0, t0};
// Record capture.
t = AllocThread();
CopyCapture(t->capture, t0->capture);
t->capture[j] = p;
t0 = t;
}
a = {ip->out(), NULL};
goto Loop;
case kInstByteRange:
if (!ip->Matches(c))
goto Next;
// Save state; will pick up at next byte.
t = Incref(t0);
*tp = t;
if (ExtraDebug)
fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
// A zero hint ends the exploration here; otherwise continue at the
// instruction hint() entries further on.
if (ip->hint() == 0)
break;
a = {id+ip->hint(), NULL};
goto Loop;
case kInstMatch:
// Save state; will pick up at next byte.
t = Incref(t0);
*tp = t;
if (ExtraDebug)
fprintf(stderr, " ! %d%s\n", id, FormatCapture(t0->capture).c_str());
// Shared tail, also reached from a non-matching ByteRange: move on to the
// next instruction (id+1) unless this was the last one.
Next:
if (ip->last())
break;
a = {id+1, NULL};
goto Loop;
case kInstEmptyWidth:
if (!ip->last())
stk[nstk++] = {id+1, NULL};
// Continue on if we have all the right flag bits.
if (ip->empty() & ~Prog::EmptyFlags(context, p))
break;
a = {ip->out(), NULL};
goto Loop;
}
}
}
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
// context is used (with p) for evaluating empty-width specials.
// p is the position of byte c in the input string for AddToThreadq;
// p-1 will be used when processing Match instructions.
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
//
// Returns 0 in the ordinary case. A non-zero return value is an
// instruction id obtained from a kInstAltMatch found at the head of runq:
// ip->out1() when that instruction is greedy, ip->out() otherwise.
// runq has been cleared on every return path.
int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
const char* p) {
nextq->clear();
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->value();
// AddToThreadq creates every queue entry with a NULL thread and only
// fills some of them in; entries left NULL carry no work.
if (t == NULL)
continue;
if (longest_) {
// Can skip any threads started after our current best match.
if (matched_ && match_[0] < t->capture[0]) {
Decref(t);
continue;
}
}
int id = i->index();
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
// Should only see the values handled below.
LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
break;
case kInstByteRange:
// AddToThreadq only stores a thread on a kInstByteRange whose
// Matches() test succeeded, so here the thread simply advances
// to ip->out() and is expanded against byte c.
AddToThreadq(nextq, ip->out(), c, context, p, t);
break;
case kInstAltMatch:
// Only honored when it is the very first entry on runq.
if (i != runq->begin())
break;
// The match is ours if we want it.
if (ip->greedy(prog_) || longest_) {
CopyCapture(match_, t->capture);
matched_ = true;
// Release this thread and every remaining entry on runq before
// returning the shortcut instruction id.
Decref(t);
for (++i; i != runq->end(); ++i)
Decref(i->value());
runq->clear();
if (ip->greedy(prog_))
return ip->out1();
return ip->out();
}
break;
case kInstMatch: {
// Avoid invoking undefined behavior when p happens
// to be null - and p-1 would be meaningless anyway.
if (p == NULL)
break;
// When endmatch_ is set, only a match ending exactly at etext_ counts.
if (endmatch_ && p-1 != etext_)
break;
if (longest_) {
// Leftmost-longest mode: save this match only if
// it is either farther to the left or at the same
// point but longer than an existing match.
if (!matched_ || t->capture[0] < match_[0] ||
(t->capture[0] == match_[0] && p-1 > match_[1])) {
CopyCapture(match_, t->capture);
match_[1] = p-1;
matched_ = true;
}
} else {
// Leftmost-biased mode: this match is by definition
// better than what we've already found (see next line).
CopyCapture(match_, t->capture);
match_[1] = p-1;
matched_ = true;
// Cut off the threads that can only find matches
// worse than the one we just found: don't run the
// rest of the current Threadq.
Decref(t);
for (++i; i != runq->end(); ++i)
Decref(i->value());
runq->clear();
return 0;
}
break;
}
}
// Every path that falls out of the switch is finished with t.
Decref(t);
}
runq->clear();
return 0;
}
// Renders the capture registers as a string of "(begin,end)" pairs for
// debugging output. Offsets are printed relative to btext_; a register
// that is still NULL is shown as '?'. Only the first ncapture_ registers
// are examined, two at a time.
std::string NFA::FormatCapture(const char** capture) {
  std::string out;
  int idx = 0;
  while (idx < ncapture_) {
    const char* begin = capture[idx];
    if (begin == NULL) {
      // Nothing recorded for this group at all.
      out += "(?,?)";
    } else {
      const char* end = capture[idx+1];
      const int begin_off = static_cast<int>(begin - btext_);
      if (end == NULL) {
        // Group opened but not yet closed.
        out += StringPrintf("(%d,?)", begin_off);
      } else {
        out += StringPrintf("(%d,%d)", begin_off,
                            static_cast<int>(end - btext_));
      }
    }
    idx += 2;
  }
  return out;
}
// Runs the NFA over text and reports whether a match was found.
//
// text is the string to search. const_context is the enclosing string
// used (via Step/AddToThreadq) for empty-width assertions; if its begin()
// is NULL, text itself is used as the context.
// anchored requires the match to start at text.begin(); it is also forced
// on when the program is anchored at the start.
// longest selects leftmost-longest matching; it is forced on when the
// program is anchored at the end.
// On success, submatch[0..nsubmatch-1] are filled in from the recorded
// capture positions and true is returned. Returns false when nothing
// matched or when the arguments are rejected (text not inside context,
// nsubmatch < 0, or an anchoring requirement that text/context cannot
// satisfy).
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch) {
// A start instruction of 0 means there is nothing to run.
if (start_ == 0)
return false;
StringPiece context = const_context;
if (context.begin() == NULL)
context = text;
// Sanity check: make sure that text lies within context.
if (text.begin() < context.begin() || text.end() > context.end()) {
LOG(DFATAL) << "context does not contain text";
return false;
}
// An anchored program can only match when text reaches the
// corresponding edge of context.
if (prog_->anchor_start() && context.begin() != text.begin())
return false;
if (prog_->anchor_end() && context.end() != text.end())
return false;
anchored |= prog_->anchor_start();
if (prog_->anchor_end()) {
// Step() will then only accept matches that end at etext_.
longest = true;
endmatch_ = true;
etext_ = text.end();
}
if (nsubmatch < 0) {
LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
return false;
}
// Save search parameters.
ncapture_ = 2*nsubmatch;
longest_ = longest;
if (nsubmatch == 0) {
// We need to maintain match[0], both to distinguish the
// longest match (if longest is true) and also to tell
// whether we've seen any matches at all.
ncapture_ = 2;
}
match_ = new const char*[ncapture_];
matched_ = false;
// For debugging prints.
btext_ = context.begin();
if (ExtraDebug)
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
std::string(text).c_str(), std::string(context).c_str(), anchored,
longest);
// Set up search.
Threadq* runq = &q0_;
Threadq* nextq = &q1_;
runq->clear();
nextq->clear();
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
// Loop over the text, stepping the machine.
// The loop has no condition of its own; it is left through the
// break statements below.
for (const char* p = text.begin();; p++) {
if (ExtraDebug) {
int c = 0;
if (p == context.begin())
c = '^';
else if (p > text.end())
c = '$';
else if (p < text.end())
c = p[0] & 0xFF;
fprintf(stderr, "%c:", c);
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->value();
if (t == NULL)
continue;
fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
}
fprintf(stderr, "\n");
}
// This is a no-op the first time around the loop because runq is empty.
// The byte passed is -1 once p has reached text.end().
int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, context, p);
DCHECK_EQ(runq->size(), 0);
// The queue Step just filled becomes the run queue for the next byte.
using std::swap;
swap(nextq, runq);
nextq->clear();
if (id != 0) {
// We're done: full match ahead.
// Follow the instruction chain from id, recording captures at the
// end of text, until a kInstMatch is reached.
p = text.end();
for (;;) {
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
break;
case kInstCapture:
if (ip->cap() < ncapture_)
match_[ip->cap()] = p;
id = ip->out();
continue;
case kInstNop:
id = ip->out();
continue;
case kInstMatch:
match_[1] = p;
matched_ = true;
break;
}
break;
}
break;
}
if (p > text.end())
break;
// Start a new thread if there have not been any matches.
// (No point in starting a new thread if there have been
// matches, since it would be to the right of the match
// we already found.)
if (!matched_ && (!anchored || p == text.begin())) {
// If there's a required first byte for an unanchored search
// and we're not in the middle of any possible matches,
// use memchr to search for the byte quickly.
int fb = prog_->first_byte();
if (!anchored && runq->size() == 0 &&
fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
if (p == NULL) {
// The byte never occurs; continue from the end of text.
p = text.end();
}
}
Thread* t = AllocThread();
CopyCapture(t->capture, match_);
t->capture[0] = p;
AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, context, p,
t);
// Drop our own reference to the seed thread.
Decref(t);
}
// If all the threads have died, stop early.
if (runq->size() == 0) {
if (ExtraDebug)
fprintf(stderr, "dead\n");
break;
}
}
// Release whatever threads are still queued.
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
Decref(i->value());
if (matched_) {
// match_[2*i] and match_[2*i+1] are the begin and end of submatch i.
for (int i = 0; i < nsubmatch; i++)
submatch[i] =
StringPiece(match_[2 * i],
static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
if (ExtraDebug)
fprintf(stderr, "match (%td,%td)\n",
match_[0] - btext_, match_[1] - btext_);
return true;
}
return false;
}
// Computes whether all successful matches have a common first byte,
// and if so, returns that byte. If not, returns -1.
int Prog::ComputeFirstByte() {
int b = -1;
SparseSet q(size());
q.insert(start());
for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
int id = *it;
Prog::Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
break;
case kInstMatch:
// The empty string matches: no first byte.
return -1;
case kInstByteRange:
if (!ip->last())
q.insert(id+1);
// Must match only a single byte
if (ip->lo() != ip->hi())
return -1;
if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
return -1;
// If we haven't seen any bytes yet, record it;
// otherwise must match the one we saw before.
if (b == -1)
b = ip->lo();
else if (b != ip->lo())
return -1;
break;
case kInstNop:
case kInstCapture:
case kInstEmptyWidth:
if (!ip->last())
q.insert(id+1);
// Continue on.
// Ignore ip->empty() flags for kInstEmptyWidth
// in order to be as conservative as possible
// (assume all possible empty-width flags are true).
if (ip->out())
q.insert(ip->out());
break;
case kInstAltMatch:
DCHECK(!ip->last());
q.insert(id+1);
break;
case kInstFail:
break;
}
}
return b;
}
bool
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch) {
if (ExtraDebug)
Dump();
NFA nfa(this);
StringPiece sp;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch == 0) {
match = &sp;
nmatch = 1;
}
}
if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
// For every instruction i that is the start instruction or the target of
// a reachable kInstByteRange, counts the kInstByteRange instructions that
// can be reached from i without consuming input and stores that count as
// fanout[i].
//
// fanout serves two purposes: it holds the results, and it is the work
// queue of the outer loop (new entries are appended while it is walked).
// The local set tracks what the inner flood from one entry has reached.
void Prog::Fanout(SparseArray<int>* fanout) {
  DCHECK_EQ(fanout->max_size(), size());
  SparseSet seen(size());
  fanout->clear();
  fanout->set_new(start(), 0);
  for (SparseArray<int>::iterator entry = fanout->begin();
       entry != fanout->end(); ++entry) {
    int& tally = entry->value();
    seen.clear();
    seen.insert(entry->index());
    for (SparseSet::iterator cur = seen.begin(); cur != seen.end(); ++cur) {
      const int id = *cur;
      Prog::Inst* op = inst(id);
      switch (op->opcode()) {
        case kInstFail:
          continue;

        case kInstAltMatch:
          DCHECK(!op->last());
          seen.insert(id+1);
          continue;

        case kInstMatch:
          if (!op->last())
            seen.insert(id+1);
          continue;

        case kInstCapture:
        case kInstEmptyWidth:
        case kInstNop:
          if (!op->last())
            seen.insert(id+1);
          seen.insert(op->out());
          continue;

        case kInstByteRange:
          if (!op->last())
            seen.insert(id+1);
          ++tally;
          // The target starts a later outer iteration of its own.
          if (!fanout->has_index(op->out())) {
            fanout->set_new(op->out(), 0);
          }
          continue;

        default:
          LOG(DFATAL) << "unhandled " << op->opcode() << " in Prog::Fanout()";
          continue;
      }
    }
  }
}
} // namespace re2

623
extern/re2/re2/onepass.cc vendored Normal file
View File

@ -0,0 +1,623 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc.
//
// Prog::SearchOnePass is an efficient implementation of
// regular expression search with submatch tracking for
// what I call "one-pass regular expressions". (An alternate
// name might be "backtracking-free regular expressions".)
//
// One-pass regular expressions have the property that
// at each input byte during an anchored match, there may be
// multiple alternatives but only one can proceed for any
// given input byte.
//
// For example, the regexp /x*yx*/ is one-pass: you read
// x's until a y, then you read the y, then you keep reading x's.
// At no point do you have to guess what to do or back up
// and try a different guess.
//
// On the other hand, /x*x/ is not one-pass: when you're
// looking at an input "x", it's not clear whether you should
// use it to extend the x* or as the final x.
//
// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
//
// A simple intuition for identifying one-pass regular expressions
// is that it's always immediately obvious when a repetition ends.
// It must also be immediately obvious which branch of an | to take:
//
// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
//
// The NFA-based search in nfa.cc does some bookkeeping to
// avoid the need for backtracking and its associated exponential blowup.
// But if we have a one-pass regular expression, there is no
// possibility of backtracking, so there is no need for the
// extra bookkeeping. Hence, this code.
//
// On a one-pass regular expression, the NFA code in nfa.cc
// runs at about 1/20 of the backtracking-based PCRE speed.
// In contrast, the code in this file runs at about the same
// speed as PCRE.
//
// One-pass regular expressions get used a lot when RE is
// used for parsing simple strings, so it pays off to
// notice them and handle them efficiently.
//
// See also Anne Brüggemann-Klein and Derick Wood,
// "One-unambiguous regular languages", Information and Computation 142(2).
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <map>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/pod_array.h"
#include "util/sparse_set.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/prog.h"
#include "re2/stringpiece.h"
// Silence "zero-sized array in struct/union" warning for OneState::action.
#ifdef _MSC_VER
#pragma warning(disable: 4200)
#endif
namespace re2 {
static const bool ExtraDebug = false;
// The key insight behind this implementation is that the
// non-determinism in an NFA for a one-pass regular expression
// is contained. To explain what that means, first a
// refresher about what regular expression programs look like
// and how the usual NFA execution runs.
//
// In a regular expression program, only the kInstByteRange
// instruction processes an input byte c and moves on to the
// next byte in the string (it does so if c is in the given range).
// The kInstByteRange instructions correspond to literal characters
// and character classes in the regular expression.
//
// The kInstAlt instructions are used as wiring to connect the
// kInstByteRange instructions together in interesting ways when
// implementing | + and *.
// The kInstAlt instruction forks execution, like a goto that
// jumps to ip->out() and ip->out1() in parallel. Each of the
// resulting computation paths is called a thread.
//
// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
// are interesting in their own right but like kInstAlt they don't
// advance the input pointer. Only kInstByteRange does.
//
// The automaton execution in nfa.cc runs all the possible
// threads of execution in lock-step over the input. To process
// a particular byte, each thread gets run until it either dies
// or finds a kInstByteRange instruction matching the byte.
// If the latter happens, the thread stops just past the
// kInstByteRange instruction (at ip->out()) and waits for
// the other threads to finish processing the input byte.
// Then, once all the threads have processed that input byte,
// the whole process repeats. The kInstAlt state instruction
// might create new threads during input processing, but no
// matter what, all the threads stop after a kInstByteRange
// and wait for the other threads to "catch up".
// Running in lock step like this ensures that the NFA reads
// the input string only once.
//
// Each thread maintains its own set of capture registers
// (the string positions at which it executed the kInstCapture
// instructions corresponding to capturing parentheses in the
// regular expression). Repeated copying of the capture registers
// is the main performance bottleneck in the NFA implementation.
//
// A regular expression program is "one-pass" if, no matter what
// the input string, there is only one thread that makes it
// past a kInstByteRange instruction at each input byte. This means
// that there is in some sense only one active thread throughout
// the execution. Other threads might be created during the
// processing of an input byte, but they are ephemeral: only one
// thread is left to start processing the next input byte.
// This is what I meant above when I said the non-determinism
// was "contained".
//
// To execute a one-pass regular expression program, we can build
// a DFA (no non-determinism) that has at most as many states as
// the NFA (compare this to the possibly exponential number of states
// in the general case). Each state records, for each possible
// input byte, the next state along with the conditions required
// before entering that state -- empty-width flags that must be true
// and capture operations that must be performed. It also records
// the set of conditions required to finish a match at that
// point in the input rather than process the next byte.
// A state in the one-pass NFA - just an array of actions indexed
// by the bytemap_[] of the next input byte. (The bytemap
// maps next input bytes into equivalence classes, to reduce
// the memory footprint.)
//
// States are stored back to back in a byte buffer. Each one occupies
// sizeof(OneState) + bytemap_range()*sizeof(uint32_t) bytes and is
// located with IndexToNode(), which is why action has no declared size.
struct OneState {
uint32_t matchcond; // conditions to match right now.
uint32_t action[]; // one packed action per byte class (bytemap_ value).
};
// The uint32_t conditions in the action are a combination of
// condition and capture bits and the next state. The bottom 16 bits
// are the condition and capture bits, and the top 16 are the index of
// the next state.
//
// Bits 0-5 are the empty-width flags from prog.h.
// Bit 6 is kMatchWins, which means the match takes
// priority over moving to next in a first-match search.
// The remaining bits mark capture registers that should
// be set to the current input position. The capture bits
// start at index 2, since the search loop can take care of
// cap[0], cap[1] (the overall match position).
// That means we can handle up to 5 capture groups: $0 (the overall match)
// plus the capturing parens $1 through $4.
// No input position can satisfy both kEmptyWordBoundary
// and kEmptyNonWordBoundary, so we can use that as a sentinel
// instead of needing an extra bit.
// Bit layout of a packed action / matchcond word.
static const int kIndexShift = 16; // number of bits below index
static const int kEmptyShift = 6; // number of empty flags in prog.h
// First bit used for a capture register: one past the kMatchWins bit (7).
static const int kRealCapShift = kEmptyShift + 1;
// Number of capture bits that fit below the index, rounded down to an
// even count so that registers come in begin/end pairs (8).
static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
// Parameters used to skip over cap[0], cap[1].
// With these, capture register i (i >= 2) is bit kCapShift + i, and
// kMaxCap (10) is the total number of registers including cap[0], cap[1].
static const int kCapShift = kRealCapShift - 2;
static const int kMaxCap = kRealMaxCap + 2;
// Bit 6: the match takes priority over moving to the next state.
static const uint32_t kMatchWins = 1 << kEmptyShift;
// Mask selecting all capture-register bits.
static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
// Sentinel: a flag combination that no input position can satisfy.
static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
// Check, at compile time, that prog.h agrees with math above.
// This function is never called.
// It exists only as a place to put the static_asserts; they are
// evaluated when this file is compiled.
void OnePass_Checks() {
// The low kEmptyShift bits must be exactly the empty-width flag set.
static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags,
"kEmptyShift disagrees with kEmptyAllFlags");
// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
static_assert(kMaxCap == Prog::kMaxOnePassCapture*2,
"kMaxCap disagrees with kMaxOnePassCapture");
}
// Reports whether every empty-width flag requested in cond holds at
// position p of context. Bits of cond outside kEmptyAllFlags are ignored.
static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) {
  const uint32_t holds = Prog::EmptyFlags(context, p);
  const uint32_t unmet = cond & kEmptyAllFlags & ~holds;
  return unmet == 0;
}
// Stores p into each capture register whose bit is set in cond.
// Register i corresponds to bit (kCapShift + i). Registers 0 and 1 are
// never touched here, and only registers below ncap are considered.
static void ApplyCaptures(uint32_t cond, const char* p,
                          const char** cap, int ncap) {
  int slot = 2;
  while (slot < ncap) {
    const bool selected = (cond & (1 << kCapShift << slot)) != 0;
    if (selected)
      cap[slot] = p;
    ++slot;
  }
}
// Returns the address of state number nodeindex inside the byte buffer
// nodes, where consecutive states are statesize bytes apart.
static inline OneState* IndexToNode(uint8_t* nodes, int statesize,
                                    int nodeindex) {
  uint8_t* const where = nodes + statesize*nodeindex;
  return reinterpret_cast<OneState*>(where);
}
// Runs the one-pass automaton stored in onepass_nodes_ (built by
// IsOnePass()) over text.
//
// Only anchored searches are supported: either anchor is kAnchored or
// kind is kFullMatch. const_context supplies the surrounding text for
// empty-width assertions; if its begin() is NULL, text is used.
// nmatch must not exceed kMaxCap/2 (Prog::kMaxOnePassCapture).
// On success, match[0..nmatch-1] are filled in and true is returned.
// Returns false if there is no match or if the arguments are rejected.
bool Prog::SearchOnePass(const StringPiece& text,
                         const StringPiece& const_context,
                         Anchor anchor, MatchKind kind,
                         StringPiece* match, int nmatch) {
  if (anchor != kAnchored && kind != kFullMatch) {
    LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
    return false;
  }

  // cap[] and matchcap[] below are fixed arrays of kMaxCap pointers.
  // Reject requests for more submatches than they can hold instead of
  // writing past the end of them.
  if (nmatch > kMaxCap/2) {
    LOG(DFATAL) << "Too many submatches for SearchOnePass: nmatch=" << nmatch;
    return false;
  }

  // Make sure we have at least cap[1],
  // because we use it to tell if we matched.
  int ncap = 2*nmatch;
  if (ncap < 2)
    ncap = 2;

  // cap holds the registers of the thread being run; matchcap holds the
  // registers of the best match recorded so far.
  const char* cap[kMaxCap];
  for (int i = 0; i < ncap; i++)
    cap[i] = NULL;

  const char* matchcap[kMaxCap];
  for (int i = 0; i < ncap; i++)
    matchcap[i] = NULL;

  StringPiece context = const_context;
  if (context.begin() == NULL)
    context = text;
  if (anchor_start() && context.begin() != text.begin())
    return false;
  if (anchor_end() && context.end() != text.end())
    return false;
  if (anchor_end())
    kind = kFullMatch;

  uint8_t* nodes = onepass_nodes_.data();
  int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
  // start() is always mapped to the zeroth OneState.
  OneState* state = IndexToNode(nodes, statesize, 0);
  uint8_t* bytemap = bytemap_;
  const char* bp = text.begin();
  const char* ep = text.end();
  const char* p;
  bool matched = false;
  matchcap[0] = bp;
  cap[0] = bp;
  uint32_t nextmatchcond = state->matchcond;
  for (p = bp; p < ep; p++) {
    int c = bytemap[*p & 0xFF];
    uint32_t matchcond = nextmatchcond;
    uint32_t cond = state->action[c];

    // Determine whether we can reach act->next.
    // If so, advance state and nextmatchcond.
    if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
      uint32_t nextindex = cond >> kIndexShift;
      state = IndexToNode(nodes, statesize, nextindex);
      nextmatchcond = state->matchcond;
    } else {
      state = NULL;
      nextmatchcond = kImpossible;
    }

    // This code section is carefully tuned.
    // The goto sequence is about 10% faster than the
    // obvious rewrite as a large if statement in the
    // ASCIIMatchRE2 and DotMatchRE2 benchmarks.

    // Saving the match capture registers is expensive.
    // Is this intermediate match worth thinking about?

    // Not if we want a full match.
    if (kind == kFullMatch)
      goto skipmatch;

    // Not if it's impossible.
    if (matchcond == kImpossible)
      goto skipmatch;

    // Not if the possible match is beaten by the certain
    // match at the next byte. When this test is useless
    // (e.g., HTTPPartialMatchRE2) it slows the loop by
    // about 10%, but when it avoids work (e.g., DotMatchRE2),
    // it cuts the loop execution by about 45%.
    if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
      goto skipmatch;

    // Finally, the match conditions must be satisfied.
    if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
      for (int i = 2; i < 2*nmatch; i++)
        matchcap[i] = cap[i];
      if (nmatch > 1 && (matchcond & kCapMask))
        ApplyCaptures(matchcond, p, matchcap, ncap);
      matchcap[1] = p;
      matched = true;

      // If we're in longest match mode, we have to keep
      // going and see if we find a longer match.
      // In first match mode, we can stop if the match
      // takes priority over the next state for this input byte.
      // That bit is per-input byte and thus in cond, not matchcond.
      if (kind == kFirstMatch && (cond & kMatchWins))
        goto done;
    }

  skipmatch:
    // No transition was possible on this byte: stop scanning.
    if (state == NULL)
      goto done;
    if ((cond & kCapMask) && nmatch > 1)
      ApplyCaptures(cond, p, cap, ncap);
  }

  // Look for match at end of input.
  // (state is non-NULL here: the loop leaves through done when it is NULL.)
  {
    uint32_t matchcond = state->matchcond;
    if (matchcond != kImpossible &&
        ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
      if (nmatch > 1 && (matchcond & kCapMask))
        ApplyCaptures(matchcond, p, cap, ncap);
      for (int i = 2; i < ncap; i++)
        matchcap[i] = cap[i];
      matchcap[1] = p;
      matched = true;
    }
  }

done:
  if (!matched)
    return false;
  // matchcap[2*i] and matchcap[2*i+1] are the begin and end of submatch i.
  for (int i = 0; i < nmatch; i++)
    match[i] =
        StringPiece(matchcap[2 * i],
                    static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));
  return true;
}
// Analysis to determine whether a given regexp program is one-pass.
// Adds instruction id to the work queue q.
// Returns true if id was newly added, and false if it was already there
// (in which case q is left unchanged).
// An id of 0 is never added; the call pretends it was and returns true.
typedef SparseSet Instq;
static bool AddQ(Instq *q, int id) {
  if (id == 0)
    return true;
  const bool fresh = !q->contains(id);
  if (fresh)
    q->insert(id);
  return fresh;
}
// An entry on the manual stack used by Prog::IsOnePass: an instruction
// still to be explored, together with the condition bits accumulated on
// the path that led to it.
struct InstCond {
int id; // instruction id to explore
uint32_t cond; // empty-width and capture bits gathered so far
};
// Returns whether this is a one-pass program; that is,
// returns whether it is safe to use SearchOnePass on this program.
// These conditions must be true for any instruction ip:
//
// (1) for any other Inst nip, there is at most one input-free
// path from ip to nip.
// (2) there is at most one kInstByteRange instruction reachable from
// ip that matches any particular byte c.
// (3) there is at most one input-free path from ip to a kInstMatch
// instruction.
//
// This is actually just a conservative approximation: it might
// return false when the answer is true, when kInstEmptyWidth
// instructions are involved.
// Constructs and saves corresponding one-pass NFA on success.
//
// The analysis runs at most once per Prog: did_onepass_ records that it
// has been attempted, and later calls just report whether
// onepass_nodes_ was filled in.
bool Prog::IsOnePass() {
if (did_onepass_)
return onepass_nodes_.data() != NULL;
did_onepass_ = true;
if (start() == 0) // no match
return false;
// Steal memory for the one-pass NFA from the overall DFA budget.
// Willing to use at most 1/4 of the DFA budget (heuristic).
// Limit max node count to 65000 as a conservative estimate to
// avoid overflowing 16-bit node index in encoding.
int maxnodes = 2 + inst_count(kInstByteRange);
int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
return false;
// Flood the graph starting at the start state, and check
// that in each reachable state, each possible byte leads
// to a unique next state.
// Only kInstCapture, kInstEmptyWidth and kInstNop push onto the stack
// below, hence the size.
int stacksize = inst_count(kInstCapture) +
inst_count(kInstEmptyWidth) +
inst_count(kInstNop) + 1; // + 1 for start inst
PODArray<InstCond> stack(stacksize);
int size = this->size();
PODArray<int> nodebyid(size); // indexed by ip
// Filling with 0xFF bytes makes every entry -1, meaning "no node yet".
memset(nodebyid.data(), 0xFF, size*sizeof nodebyid[0]);
// Originally, nodes was a uint8_t[maxnodes*statesize], but that was
// unnecessarily optimistic: why allocate a large amount of memory
// upfront for a large program when it is unlikely to be one-pass?
std::vector<uint8_t> nodes;
// tovisit: instructions that begin a node (start and byte-range targets).
// workq: instructions already reached while flooding the current node.
Instq tovisit(size), workq(size);
AddQ(&tovisit, start());
nodebyid[start()] = 0;
int nalloc = 1;
nodes.insert(nodes.end(), statesize, 0);
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
// Flood graph using manual stack, filling in actions as found.
// Default is none.
for (int b = 0; b < bytemap_range_; b++)
node->action[b] = kImpossible;
node->matchcond = kImpossible;
workq.clear();
// Set once a kInstMatch has been reached from this node.
bool matched = false;
int nstack = 0;
stack[nstack].id = id;
stack[nstack++].cond = 0;
while (nstack > 0) {
// NOTE: this id shadows the outer id for the rest of the while body.
int id = stack[--nstack].id;
uint32_t cond = stack[nstack].cond;
Loop:
Prog::Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
case kInstAltMatch:
// TODO(rsc): Ignoring kInstAltMatch optimization.
// Should implement it in this engine, but it's subtle.
DCHECK(!ip->last());
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
id = id+1;
goto Loop;
case kInstByteRange: {
// Find, or allocate, the node for the instruction that follows
// this byte range.
int nextindex = nodebyid[ip->out()];
if (nextindex == -1) {
if (nalloc >= maxnodes) {
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);
goto fail;
}
nextindex = nalloc;
AddQ(&tovisit, ip->out());
nodebyid[ip->out()] = nalloc;
nalloc++;
nodes.insert(nodes.end(), statesize, 0);
// Update node because it might have been invalidated.
node = IndexToNode(nodes.data(), statesize, nodeindex);
}
// Record the action for every byte class covered by [lo, hi].
for (int c = ip->lo(); c <= ip->hi(); c++) {
int b = bytemap_[c];
// Skip any bytes immediately after c that are also in b.
while (c < 256-1 && bytemap_[c+1] == b)
c++;
uint32_t act = node->action[b];
// NOTE(review): nextindex is an int that may be as large as
// maxnodes-1 (< 65000); shifting it left by kIndexShift can exceed
// INT_MAX before the conversion to uint32_t - confirm this is
// acceptable for the supported compilers.
uint32_t newact = (nextindex << kIndexShift) | cond;
if (matched)
newact |= kMatchWins;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
// A different action is already recorded for this byte class.
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: conflict on byte %#x at state %d", c, *it);
goto fail;
}
}
if (ip->foldcase()) {
// Repeat for the upper-case counterparts of the part of the
// range that lies within 'a'..'z'.
Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a';
Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a';
for (int c = lo; c <= hi; c++) {
int b = bytemap_[c];
// Skip any bytes immediately after c that are also in b.
while (c < 256-1 && bytemap_[c+1] == b)
c++;
uint32_t act = node->action[b];
uint32_t newact = (nextindex << kIndexShift) | cond;
if (matched)
newact |= kMatchWins;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: conflict on byte %#x at state %d", c, *it);
goto fail;
}
}
}
if (ip->last())
break;
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
id = id+1;
goto Loop;
}
case kInstCapture:
case kInstEmptyWidth:
case kInstNop:
if (!ip->last()) {
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
// Explore the next instruction in the list later, with the
// conditions as they were before this instruction.
stack[nstack].id = id+1;
stack[nstack++].cond = cond;
}
if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)
cond |= (1 << kCapShift) << ip->cap();
if (ip->opcode() == kInstEmptyWidth)
cond |= ip->empty();
// kInstCapture and kInstNop always proceed to ip->out().
// kInstEmptyWidth only sometimes proceeds to ip->out(),
// but as a conservative approximation we assume it always does.
// We could be a little more precise by looking at what c
// is, but that seems like overkill.
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out())) {
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: multiple paths %d -> %d\n", *it, ip->out());
goto fail;
}
id = ip->out();
goto Loop;
case kInstMatch:
if (matched) {
// (3) is violated
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: multiple matches from %d\n", *it);
goto fail;
}
matched = true;
node->matchcond = cond;
if (ip->last())
break;
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
id = id+1;
goto Loop;
case kInstFail:
break;
}
}
}
if (ExtraDebug) { // For debugging, dump one-pass NFA to LOG(ERROR).
LOG(ERROR) << "bytemap:\n" << DumpByteMap();
LOG(ERROR) << "prog:\n" << Dump();
// Inverse of nodebyid: node index -> instruction id.
std::map<int, int> idmap;
for (int i = 0; i < size; i++)
if (nodebyid[i] != -1)
idmap[nodebyid[i]] = i;
std::string dump;
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
if (nodeindex == -1)
continue;
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
dump += StringPrintf("node %d id=%d: matchcond=%#x\n",
nodeindex, id, node->matchcond);
for (int i = 0; i < bytemap_range_; i++) {
if ((node->action[i] & kImpossible) == kImpossible)
continue;
dump += StringPrintf(" %d cond %#x -> %d id=%d\n",
i, node->action[i] & 0xFFFF,
node->action[i] >> kIndexShift,
idmap[node->action[i] >> kIndexShift]);
}
}
LOG(ERROR) << "nodes:\n" << dump;
}
// Success: charge the nodes against the DFA budget and keep a copy of
// exactly the nalloc states that were built.
dfa_mem_ -= nalloc*statesize;
onepass_nodes_ = PODArray<uint8_t>(nalloc*statesize);
memmove(onepass_nodes_.data(), nodes.data(), nalloc*statesize);
return true;
fail:
return false;
}
} // namespace re2

2463
extern/re2/re2/parse.cc vendored Normal file

File diff suppressed because it is too large Load Diff

119
extern/re2/re2/perl_groups.cc vendored Normal file
View File

@ -0,0 +1,119 @@
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
static const URange16 code1[] = { /* \d */
{ 0x30, 0x39 },
};
static const URange16 code2[] = { /* \s */
{ 0x9, 0xa },
{ 0xc, 0xd },
{ 0x20, 0x20 },
};
static const URange16 code3[] = { /* \w */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
};
const UGroup perl_groups[] = {
{ "\\d", +1, code1, 1 },
{ "\\D", -1, code1, 1 },
{ "\\s", +1, code2, 3 },
{ "\\S", -1, code2, 3 },
{ "\\w", +1, code3, 4 },
{ "\\W", -1, code3, 4 },
};
const int num_perl_groups = 6;
// Range tables for the POSIX bracket classes. Each codeN table lists
// { lo, hi } code point pairs for the class named in its trailing comment.
// All values are within 0x0-0x7f.
static const URange16 code4[] = { /* [:alnum:] */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x61, 0x7a },
};
static const URange16 code5[] = { /* [:alpha:] */
{ 0x41, 0x5a },
{ 0x61, 0x7a },
};
static const URange16 code6[] = { /* [:ascii:] */
{ 0x0, 0x7f },
};
static const URange16 code7[] = { /* [:blank:] */
{ 0x9, 0x9 },
{ 0x20, 0x20 },
};
static const URange16 code8[] = { /* [:cntrl:] */
{ 0x0, 0x1f },
{ 0x7f, 0x7f },
};
static const URange16 code9[] = { /* [:digit:] */
{ 0x30, 0x39 },
};
static const URange16 code10[] = { /* [:graph:] */
{ 0x21, 0x7e },
};
static const URange16 code11[] = { /* [:lower:] */
{ 0x61, 0x7a },
};
static const URange16 code12[] = { /* [:print:] */
{ 0x20, 0x7e },
};
static const URange16 code13[] = { /* [:punct:] */
{ 0x21, 0x2f },
{ 0x3a, 0x40 },
{ 0x5b, 0x60 },
{ 0x7b, 0x7e },
};
static const URange16 code14[] = { /* [:space:] */
{ 0x9, 0xd },
{ 0x20, 0x20 },
};
static const URange16 code15[] = { /* [:upper:] */
{ 0x41, 0x5a },
};
static const URange16 code16[] = { /* [:word:] */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
};
static const URange16 code17[] = { /* [:xdigit:] */
{ 0x30, 0x39 },
{ 0x41, 0x46 },
{ 0x61, 0x66 },
};
// Lookup table for the POSIX classes. Each row holds the class text, a sign
// (+1 or -1), a range table and the number of rows in that range table. Every
// class appears twice: once as [:name:] with +1 and once as [:^name:] with -1,
// both pointing at the same range table.
// NOTE(review): the -1 rows presumably denote the negated class -- confirm
// against the UGroup definition in unicode_groups.h.
const UGroup posix_groups[] = {
{ "[:alnum:]", +1, code4, 3 },
{ "[:^alnum:]", -1, code4, 3 },
{ "[:alpha:]", +1, code5, 2 },
{ "[:^alpha:]", -1, code5, 2 },
{ "[:ascii:]", +1, code6, 1 },
{ "[:^ascii:]", -1, code6, 1 },
{ "[:blank:]", +1, code7, 2 },
{ "[:^blank:]", -1, code7, 2 },
{ "[:cntrl:]", +1, code8, 2 },
{ "[:^cntrl:]", -1, code8, 2 },
{ "[:digit:]", +1, code9, 1 },
{ "[:^digit:]", -1, code9, 1 },
{ "[:graph:]", +1, code10, 1 },
{ "[:^graph:]", -1, code10, 1 },
{ "[:lower:]", +1, code11, 1 },
{ "[:^lower:]", -1, code11, 1 },
{ "[:print:]", +1, code12, 1 },
{ "[:^print:]", -1, code12, 1 },
{ "[:punct:]", +1, code13, 4 },
{ "[:^punct:]", -1, code13, 4 },
{ "[:space:]", +1, code14, 2 },
{ "[:^space:]", -1, code14, 2 },
{ "[:upper:]", +1, code15, 1 },
{ "[:^upper:]", -1, code15, 1 },
{ "[:word:]", +1, code16, 4 },
{ "[:^word:]", -1, code16, 4 },
{ "[:xdigit:]", +1, code17, 3 },
{ "[:^xdigit:]", -1, code17, 3 },
};
// Number of rows in posix_groups.
const int num_posix_groups = 28;
} // namespace re2

710
extern/re2/re2/prefilter.cc vendored Normal file
View File

@ -0,0 +1,710 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/prefilter.h"
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/re2.h"
#include "re2/unicode_casefold.h"
#include "re2/walker-inl.h"
namespace re2 {
static const bool ExtraDebug = false;
typedef std::set<std::string>::iterator SSIter;
typedef std::set<std::string>::const_iterator ConstSSIter;
// Initializes a Prefilter, allocating subs_ as necessary.
// Only AND and OR nodes own a child vector; every other op leaves subs_ NULL.
// unique_id_ starts at -1 so that unique_id() never yields an indeterminate
// value for a node that has not been given an id yet.
Prefilter::Prefilter(Op op)
    : op_(op),
      subs_(NULL),
      unique_id_(-1) {
  if (op_ == AND || op_ == OR)
    subs_ = new std::vector<Prefilter*>;
}
// Releases the node: every child held in subs_ is deleted, then the vector
// itself. Nodes without a child vector have nothing to free.
Prefilter::~Prefilter() {
  if (subs_ == NULL)
    return;
  for (Prefilter* child : *subs_)
    delete child;
  delete subs_;
  subs_ = NULL;
}
// Simplify if the node is an empty Or or And.
// Returns the node that should be used in place of this one. The receiver
// may be deleted by this call (single-child case), so callers must continue
// with the returned pointer rather than with the one they called it on.
Prefilter* Prefilter::Simplify() {
// Nodes other than AND/OR are returned unchanged.
if (op_ != AND && op_ != OR) {
return this;
}
// Nothing left in the AND/OR.
if (subs_->empty()) {
if (op_ == AND)
op_ = ALL; // AND of nothing is true
else
op_ = NONE; // OR of nothing is false
return this;
}
// Just one subnode: throw away wrapper.
if (subs_->size() == 1) {
Prefilter* a = (*subs_)[0];
// Detach the child first so that the destructor does not delete it as well.
subs_->clear();
delete this;
// Only the local a is used from here on; no member is touched after the
// delete above.
return a->Simplify();
}
return this;
}
// Combines two Prefilters together to create an "op" (AND or OR).
// The passed Prefilters will be part of the returned Prefilter or deleted.
// Does lots of work to avoid creating unnecessarily complicated structures.
// In every path below, each of a and b is either returned, stored in the
// subs() of the returned node, or deleted.
Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
// If a, b can be rewritten as op, do so.
// Simplify() may delete its receiver, so only the returned pointers are used.
a = a->Simplify();
b = b->Simplify();
// Canonicalize: a->op <= b->op.
if (a->op() > b->op()) {
Prefilter* t = a;
a = b;
b = t;
}
// Trivial cases.
// ALL AND b = b
// NONE OR b = b
// ALL OR b = ALL
// NONE AND b = NONE
// Don't need to look at b, because of canonicalization above.
// ALL and NONE are smallest opcodes.
if (a->op() == ALL || a->op() == NONE) {
if ((a->op() == ALL && op == AND) ||
(a->op() == NONE && op == OR)) {
// a is the identity element for op: the result is b alone.
delete a;
return b;
} else {
// a absorbs b: the result is a alone.
delete b;
return a;
}
}
// If a and b match op, merge their contents.
if (a->op() == op && b->op() == op) {
for (size_t i = 0; i < b->subs()->size(); i++) {
Prefilter* bb = (*b->subs())[i];
a->subs()->push_back(bb);
}
// The children now belong to a; empty b so that deleting it leaves them alone.
b->subs()->clear();
delete b;
return a;
}
// If a already has the same op as the op that is under construction
// add in b (similarly if b already has the same op, add in a).
if (b->op() == op) {
Prefilter* t = a;
a = b;
b = t;
}
if (a->op() == op) {
a->subs()->push_back(b);
return a;
}
// Otherwise just return the op.
// Neither operand has the requested op: wrap both in a fresh node.
Prefilter* c = new Prefilter(op);
c->subs()->push_back(a);
c->subs()->push_back(b);
return c;
}
// Returns the AND of a and b by forwarding to AndOr(AND, a, b); a and b end up
// inside the returned Prefilter or are deleted.
Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
return AndOr(AND, a, b);
}
// Returns the OR of a and b by forwarding to AndOr(OR, a, b); a and b end up
// inside the returned Prefilter or are deleted.
Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
return AndOr(OR, a, b);
}
// Removes redundant strings from *ss in place.
//
// If we know "ab" is a required string, then it doesn't help at all to know
// that "abc" is also a required string: when a string search is used to
// filter regexps, matching "ab" already makes the regexp a candidate, so
// "abc" is deleted. Every string that contains another non-empty member of
// the set as a substring is erased, wherever the two strings fall in the
// set's ordering (e.g. "abc" is dropped in favour of "bc" even though "abc"
// sorts first).
//
// The empty string is never used as a needle because find() would find it at
// the start of everything and thus we would end up erasing everything.
static void SimplifyStringSet(std::set<std::string>* ss) {
  typedef std::set<std::string>::iterator Iter;
  for (Iter needle = ss->begin(); needle != ss->end(); ++needle) {
    if (needle->empty())
      continue;
    Iter hay = ss->begin();
    while (hay != ss->end()) {
      // A set holds no duplicates, so any other string that contains *needle
      // is strictly longer; the size test also keeps needle itself from
      // being erased.
      if (hay->size() > needle->size() &&
          hay->find(*needle) != std::string::npos) {
        // Erasing another element leaves needle valid.
        hay = ss->erase(hay);
        continue;
      }
      ++hay;
    }
  }
}
// Builds the OR of one ATOM per string in *ss. The set is first passed
// through SimplifyStringSet(), which modifies it in place. An empty set
// yields a NONE node.
Prefilter* Prefilter::OrStrings(std::set<std::string>* ss) {
  Prefilter* result = new Prefilter(NONE);
  SimplifyStringSet(ss);
  for (const std::string& text : *ss)
    result = Or(result, FromString(text));
  return result;
}
// Lower-cases r. Runes below Runeself only have 'A'..'Z' shifted to
// 'a'..'z'; all other runes go through the unicode_tolower fold table and
// are returned unchanged when no entry applies.
static Rune ToLowerRune(Rune r) {
  if (r >= Runeself) {
    const CaseFold* fold =
        LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
    if (fold != NULL && r >= fold->lo)
      return ApplyFold(fold, r);
    return r;
  }
  if (r >= 'A' && r <= 'Z')
    r += 'a' - 'A';
  return r;
}
// Lower-cases r for Latin-1 input: only 'A'..'Z' are shifted to 'a'..'z',
// every other value is returned unchanged.
static Rune ToLowerRuneLatin1(Rune r) {
  if (r < 'A' || r > 'Z')
    return r;
  r += 'a' - 'A';
  return r;
}
// Returns a new ATOM node whose atom is a copy of str. The caller owns the
// returned node.
Prefilter* Prefilter::FromString(const std::string& str) {
  Prefilter* leaf = new Prefilter(ATOM);
  leaf->atom_ = str;
  return leaf;
}
// Information about a regexp used during computation of Prefilter.
// Can be thought of as information about the set of strings matching
// the given regular expression.
// An Info is in one of two states: exact (is_exact_ is true and exact_ lists
// the strings) or inexact (match_ holds the Prefilter built so far).
class Prefilter::Info {
public:
Info();
~Info();
// More constructors. They delete their Info* arguments.
// (Concat and And instead return the other argument as is when one argument
// is NULL; see their definitions.)
static Info* Alt(Info* a, Info* b);
static Info* Concat(Info* a, Info* b);
static Info* And(Info* a, Info* b);
static Info* Star(Info* a);
static Info* Plus(Info* a);
static Info* Quest(Info* a);
// Constructors that take no Info and build a fresh one.
static Info* EmptyString();
static Info* NoMatch();
static Info* AnyCharOrAnyByte();
static Info* CClass(CharClass* cc, bool latin1);
static Info* Literal(Rune r);
static Info* LiteralLatin1(Rune r);
static Info* AnyMatch();
// Format Info as a string.
std::string ToString();
// Caller takes ownership of the Prefilter.
// An exact Info is converted to an OR of its strings first and is no longer
// exact afterwards.
Prefilter* TakeMatch();
// Direct access to the set of exact strings.
std::set<std::string>& exact() { return exact_; }
bool is_exact() const { return is_exact_; }
// Regexp walker that computes an Info per regexp node; defined below.
class Walker;
private:
std::set<std::string> exact_;
// When is_exact_ is true, the strings that match
// are placed in exact_. When it is no longer an exact
// set of strings that match this RE, then is_exact_
// is false and the match_ contains the required match
// criteria.
bool is_exact_;
// Accumulated Prefilter query that any
// match for this regexp is guaranteed to match.
// Owned by this Info (deleted in the destructor) until TakeMatch() hands it
// to the caller.
Prefilter* match_;
};
// Creates an Info that is not exact and has no match_ yet.
Prefilter::Info::Info()
: is_exact_(false),
match_(NULL) {
}
// Deletes match_. After TakeMatch() match_ is NULL, so this is then a no-op.
Prefilter::Info::~Info() {
delete match_;
}
// Hands the accumulated Prefilter to the caller and leaves match_ NULL.
// An exact Info is converted first: its strings are OR-ed together into
// match_ and the Info stops being exact.
Prefilter* Prefilter::Info::TakeMatch() {
  if (is_exact_) {
    match_ = Prefilter::OrStrings(&exact_);
    is_exact_ = false;
  }
  Prefilter* taken = match_;
  match_ = NULL;
  return taken;
}
// Format a Info in string form.
std::string Prefilter::Info::ToString() {
if (is_exact_) {
int n = 0;
std::string s;
for (SSIter i = exact_.begin(); i != exact_.end(); ++i) {
if (n++ > 0)
s += ",";
s += *i;
}
return s;
}
if (match_)
return match_->DebugString();
return "";
}
// Inserts every string of src into *dst; strings already present in *dst
// are left as they are.
static void CopyIn(const std::set<std::string>& src,
                   std::set<std::string>* dst) {
  dst->insert(src.begin(), src.end());
}
// Inserts into *dst the concatenation head + tail for every head in a and
// every tail in b. If either set is empty nothing is added.
static void CrossProduct(const std::set<std::string>& a,
                         const std::set<std::string>& b,
                         std::set<std::string>* dst) {
  for (const std::string& head : a) {
    for (const std::string& tail : b)
      dst->insert(head + tail);
  }
}
// Returns the exact Info for a followed by b, whose strings are the
// cross product of the two exact sets. Both arguments must be exact and are
// deleted. A NULL a means "nothing accumulated yet": b is returned as is.
Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
  if (a == NULL)
    return b;
  DCHECK(a->is_exact_);
  DCHECK(b && b->is_exact_);

  Info* product = new Info();
  CrossProduct(a->exact_, b->exact_, &product->exact_);
  product->is_exact_ = true;

  delete a;
  delete b;
  return product;
}
// Returns an inexact Info for a followed by b whose match_ is the AND of
// both arguments' matches. Meant for the cases where a or b is not exact, or
// where the exact cross product would be too large. Both arguments are
// deleted; if either one is NULL the other is returned unchanged.
Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
  if (a == NULL)
    return b;
  if (b == NULL)
    return a;

  Info* both = new Info();
  both->is_exact_ = false;
  both->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());

  delete a;
  delete b;
  return both;
}
// Returns the Info for a|b and deletes both arguments.
// Two exact inputs give an exact result holding the union of their strings.
// Otherwise each side is turned into a Prefilter with TakeMatch() (which
// converts an exact side as needed) and the result is the inexact OR of the
// two.
Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
  Info* either = new Info();
  const bool both_exact = a->is_exact_ && b->is_exact_;
  if (!both_exact) {
    either->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
    either->is_exact_ = false;
  } else {
    CopyIn(a->exact_, &either->exact_);
    CopyIn(b->exact_, &either->exact_);
    either->is_exact_ = true;
  }
  delete a;
  delete b;
  return either;
}
// Returns the Info for a? and deletes a. Nothing from a is carried over: the
// result is an inexact Info whose match_ is ALL.
Prefilter::Info* Prefilter::Info::Quest(Info* a) {
  Info* optional = new Info();
  optional->match_ = new Prefilter(ALL);
  optional->is_exact_ = false;
  delete a;
  return optional;
}
// Constructs Info for a* given a.
// Same as a? -- not much to do.
// Forwards to Quest(), so a is deleted and the result matches ALL.
Prefilter::Info* Prefilter::Info::Star(Info *a) {
return Quest(a);
}
// Returns the Info for a+ and deletes a. The result takes over a's match
// (an exact a is converted by TakeMatch()) and is never exact.
Prefilter::Info* Prefilter::Info::Plus(Info* a) {
  Info* repeated = new Info();
  repeated->is_exact_ = false;
  repeated->match_ = a->TakeMatch();
  delete a;
  return repeated;
}
// Returns the bytes that runetochar() produces for r as a string.
static std::string RuneToString(Rune r) {
  char encoded[UTFmax];
  const int len = runetochar(encoded, &r);
  return std::string(encoded, len);
}
// Returns a one-byte string holding the low 8 bits of r.
static std::string RuneToStringLatin1(Rune r) {
  const char byte = r & 0xff;
  return std::string(1, byte);
}
// Returns an exact Info whose only string is the lower-cased rune r,
// converted with RuneToString().
Prefilter::Info* Prefilter::Info::Literal(Rune r) {
  const std::string folded = RuneToString(ToLowerRune(r));
  Info* lit = new Info();
  lit->is_exact_ = true;
  lit->exact_.insert(folded);
  return lit;
}
// Latin-1 counterpart of Literal(): returns an exact Info whose only string
// is the single byte for the lower-cased rune r.
Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
  const std::string folded = RuneToStringLatin1(ToLowerRuneLatin1(r));
  Info* lit = new Info();
  lit->is_exact_ = true;
  lit->exact_.insert(folded);
  return lit;
}
// Returns the Info for dot (any character) or \C (any byte): an inexact Info
// whose match_ is ALL.
Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
  Info* any = new Info();
  any->match_ = new Prefilter(ALL);
  return any;
}
// Returns the Info for a regexp that cannot match anything: an inexact Info
// whose match_ is NONE.
Prefilter::Info* Prefilter::Info::NoMatch() {
  Info* nothing = new Info();
  nothing->match_ = new Prefilter(NONE);
  return nothing;
}
// Returns an Info that is valid for any regular expression: an inexact Info
// whose match_ is ALL, i.e. one that asserts nothing whatsoever about the
// strings being matched.
Prefilter::Info* Prefilter::Info::AnyMatch() {
  Info* anything = new Info();
  anything->match_ = new Prefilter(ALL);
  return anything;
}
// Returns an exact Info whose only string is the empty string.
Prefilter::Info* Prefilter::Info::EmptyString() {
  Info* empty = new Info();
  empty->exact_.insert("");
  empty->is_exact_ = true;
  return empty;
}
// Returns the Info for a character class.
// A class with more than 10 members is overestimated as "any character".
// Otherwise the result is exact and holds one string per member, each
// lower-cased and converted according to latin1.
typedef CharClass::iterator CCIter;
Prefilter::Info* Prefilter::Info::CClass(CharClass* cc, bool latin1) {
  if (ExtraDebug) {
    LOG(ERROR) << "CharClassInfo:";
    for (CCIter range = cc->begin(); range != cc->end(); ++range)
      LOG(ERROR) << " " << range->lo << "-" << range->hi;
  }

  // If the class is too large, it's okay to overestimate.
  if (cc->size() > 10)
    return AnyCharOrAnyByte();

  Info* members = new Info();
  for (CCIter range = cc->begin(); range != cc->end(); ++range) {
    for (Rune r = range->lo; r <= range->hi; r++) {
      if (!latin1)
        members->exact_.insert(RuneToString(ToLowerRune(r)));
      else
        members->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
    }
  }
  members->is_exact_ = true;

  if (ExtraDebug)
    LOG(ERROR) << " = " << members->ToString();
  return members;
}
// Regexp walker that produces a Prefilter::Info* for each visited node.
// latin1 selects the Latin-1 variants of the literal and character-class
// constructors (see PostVisit).
class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
public:
Walker(bool latin1) : latin1_(latin1) {}
// Builds the Info for re from the Infos of its children.
virtual Info* PostVisit(
Regexp* re, Info* parent_arg,
Info* pre_arg,
Info** child_args, int nchild_args);
// Returns an AnyMatch() Info without looking at re.
// NOTE(review): presumably invoked by Regexp::Walker when the walk is cut
// short -- confirm in walker-inl.h.
virtual Info* ShortVisit(
Regexp* re,
Info* parent_arg);
bool latin1() { return latin1_; }
private:
bool latin1_;
// Not copyable.
Walker(const Walker&) = delete;
Walker& operator=(const Walker&) = delete;
};
// Walks re and returns the Info computed for it; the caller owns the result.
// Returns NULL (after deleting the partial result) when the walker reports
// that it stopped early. re must not be NULL.
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
  if (ExtraDebug)
    LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();

  const bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
  Prefilter::Info::Walker walker(latin1);
  Prefilter::Info* result = walker.WalkExponential(re, NULL, 100000);
  if (!walker.stopped_early())
    return result;

  delete result;
  return NULL;
}
// Returns an Info that makes no assertion about the match (AnyMatch());
// neither re nor parent_arg is examined.
Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
Regexp* re, Prefilter::Info* parent_arg) {
return AnyMatch();
}
// Constructs the Prefilter::Info for the given regular expression.
// Assumes re is simplified.
// child_args holds the Infos already computed for the nchild_args children
// of re. Those Infos are consumed here: they are either returned directly or
// passed to Info constructors that delete them. parent_arg and pre_arg are
// not used.
Prefilter::Info* Prefilter::Info::Walker::PostVisit(
Regexp* re, Prefilter::Info* parent_arg,
Prefilter::Info* pre_arg, Prefilter::Info** child_args,
int nchild_args) {
Prefilter::Info *info;
switch (re->op()) {
// Unknown ops and kRegexpRepeat are reported as errors and treated as the
// empty string. (The header comment above says re is assumed simplified.)
default:
case kRegexpRepeat:
LOG(DFATAL) << "Bad regexp op " << re->op();
info = EmptyString();
break;
case kRegexpNoMatch:
info = NoMatch();
break;
// These ops match the empty string:
case kRegexpEmptyMatch: // anywhere
case kRegexpBeginLine: // at beginning of line
case kRegexpEndLine: // at end of line
case kRegexpBeginText: // at beginning of text
case kRegexpEndText: // at end of text
case kRegexpWordBoundary: // at word boundary
case kRegexpNoWordBoundary: // not at word boundary
info = EmptyString();
break;
case kRegexpLiteral:
if (latin1()) {
info = LiteralLatin1(re->rune());
}
else {
info = Literal(re->rune());
}
break;
case kRegexpLiteralString:
if (re->nrunes() == 0) {
info = NoMatch();
break;
}
// Concatenate the exact one-rune Infos, rune by rune.
if (latin1()) {
info = LiteralLatin1(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
info = Concat(info, LiteralLatin1(re->runes()[i]));
}
} else {
info = Literal(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
info = Concat(info, Literal(re->runes()[i]));
}
}
break;
case kRegexpConcat: {
// Accumulate in info.
// Exact is concat of recent contiguous exact nodes.
info = NULL;
Info* exact = NULL;
for (int i = 0; i < nchild_args; i++) {
Info* ci = child_args[i]; // child info
// The run of exact children ends at an inexact child, or when the cross
// product with this child would exceed 16 strings.
if (!ci->is_exact() ||
(exact && ci->exact().size() * exact->exact().size() > 16)) {
// Exact run is over.
info = And(info, exact);
exact = NULL;
// Add this child's info.
info = And(info, ci);
} else {
// Append to exact run.
exact = Concat(exact, ci);
}
}
// Fold in whatever exact run is still pending.
info = And(info, exact);
}
break;
case kRegexpAlternate:
info = child_args[0];
for (int i = 1; i < nchild_args; i++)
info = Alt(info, child_args[i]);
break;
case kRegexpStar:
info = Star(child_args[0]);
break;
case kRegexpQuest:
info = Quest(child_args[0]);
break;
case kRegexpPlus:
info = Plus(child_args[0]);
break;
case kRegexpAnyChar:
case kRegexpAnyByte:
// Claim nothing, except that it's not empty.
info = AnyCharOrAnyByte();
break;
case kRegexpCharClass:
info = CClass(re->cc(), latin1());
break;
case kRegexpCapture:
// These don't affect the set of matching strings.
info = child_args[0];
break;
}
if (ExtraDebug)
LOG(ERROR) << "BuildInfo " << re->ToString()
<< ": " << (info ? info->ToString() : "");
return info;
}
// Builds the Prefilter for re. The caller owns the returned Prefilter.
// Returns NULL when re is NULL, when simplifying re yields no regexp, or
// when BuildInfo() gives up on it.
Prefilter* Prefilter::FromRegexp(Regexp* re) {
  if (re == NULL)
    return NULL;

  Regexp* simple = re->Simplify();
  // BuildInfo() dereferences its argument unconditionally, so a missing
  // simplified regexp has to be reported here rather than passed on.
  if (simple == NULL)
    return NULL;

  Prefilter::Info* info = BuildInfo(simple);
  // The simplified copy is only needed while the Info is being built.
  simple->Decref();
  if (info == NULL)
    return NULL;

  Prefilter* m = info->TakeMatch();
  delete info;
  return m;
}
std::string Prefilter::DebugString() const {
switch (op_) {
default:
LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
return StringPrintf("op%d", op_);
case NONE:
return "*no-matches*";
case ATOM:
return atom_;
case ALL:
return "";
case AND: {
std::string s = "";
for (size_t i = 0; i < subs_->size(); i++) {
if (i > 0)
s += " ";
Prefilter* sub = (*subs_)[i];
s += sub ? sub->DebugString() : "<nil>";
}
return s;
}
case OR: {
std::string s = "(";
for (size_t i = 0; i < subs_->size(); i++) {
if (i > 0)
s += "|";
Prefilter* sub = (*subs_)[i];
s += sub ? sub->DebugString() : "<nil>";
}
s += ")";
return s;
}
}
}
// Returns the Prefilter for re2, or NULL when re2 is NULL, when it has no
// Regexp, or when no Prefilter can be formed. The caller owns the result.
Prefilter* Prefilter::FromRE2(const RE2* re2) {
  if (re2 != NULL) {
    Regexp* regexp = re2->Regexp();
    if (regexp != NULL)
      return FromRegexp(regexp);
  }
  return NULL;
}
} // namespace re2

108
extern/re2/re2/prefilter.h vendored Normal file
View File

@ -0,0 +1,108 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_PREFILTER_H_
#define RE2_PREFILTER_H_
// Prefilter is the class used to extract string guards from regexps.
// Rather than using Prefilter class directly, use FilteredRE2.
// See filtered_re2.h
#include <set>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
namespace re2 {
class RE2;
class Regexp;
// A node of the AND/OR tree of string guards extracted from a regexp.
class Prefilter {
  // Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
 public:
  // The numeric order matters: Prefilter::AndOr canonicalizes its operands by
  // op and relies on ALL and NONE being the smallest values.
  enum Op {
    ALL = 0,  // Everything matches
    NONE,  // Nothing matches
    ATOM,  // The string atom() must match
    AND,  // All in subs() must match
    OR,  // One of subs() must match
  };

  explicit Prefilter(Op op);
  ~Prefilter();

  // Kind of this node. Declared const so that it can also be queried
  // through a const Prefilter.
  Op op() const { return op_; }

  // String to match; used by ATOM nodes.
  const std::string& atom() const { return atom_; }

  void set_unique_id(int id) { unique_id_ = id; }
  int unique_id() const { return unique_id_; }

  // The children of the Prefilter node.
  // Only meaningful for AND and OR nodes (checked with DCHECK).
  std::vector<Prefilter*>* subs() {
    DCHECK(op_ == AND || op_ == OR);
    return subs_;
  }

  // Set the children vector. Prefilter takes ownership of subs and
  // subs_ will be deleted when Prefilter is deleted.
  // The previously held vector, if any, is not freed here.
  void set_subs(std::vector<Prefilter*>* subs) { subs_ = subs; }

  // Given a RE2, return a Prefilter. The caller takes ownership of
  // the Prefilter and should deallocate it. Returns NULL if Prefilter
  // cannot be formed.
  static Prefilter* FromRE2(const RE2* re2);

  // Returns a readable debug string of the prefilter.
  std::string DebugString() const;

 private:
  class Info;

  // Combines two prefilters together to create an AND. The passed
  // Prefilters will be part of the returned Prefilter or deleted.
  static Prefilter* And(Prefilter* a, Prefilter* b);

  // Combines two prefilters together to create an OR. The passed
  // Prefilters will be part of the returned Prefilter or deleted.
  static Prefilter* Or(Prefilter* a, Prefilter* b);

  // Generalized And/Or
  static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);

  // Builds the Prefilter for a regexp; returns NULL if none can be formed.
  static Prefilter* FromRegexp(Regexp* a);

  // Returns a new ATOM node for str.
  static Prefilter* FromString(const std::string& str);

  // Returns the OR of one ATOM per string in *ss; *ss is simplified in place.
  static Prefilter* OrStrings(std::set<std::string>* ss);

  static Info* BuildInfo(Regexp* re);

  // Collapses an empty or single-child AND/OR. May delete this node; use the
  // returned pointer afterwards.
  Prefilter* Simplify();

  // Kind of Prefilter.
  Op op_;

  // Sub-matches for AND or OR Prefilter.
  std::vector<Prefilter*>* subs_;

  // Actual string to match in leaf node.
  std::string atom_;

  // If different prefilters have the same string atom, or if they are
  // structurally the same (e.g., OR of same atom strings) they are
  // considered the same unique nodes. This is the id for each unique
  // node. PrefilterTree::AssignUniqueIds sets it with set_unique_id(); a
  // duplicate node receives the id of its canonical node.
  int unique_id_;

  Prefilter(const Prefilter&) = delete;
  Prefilter& operator=(const Prefilter&) = delete;
};
} // namespace re2
#endif // RE2_PREFILTER_H_

407
extern/re2/re2/prefilter_tree.cc vendored Normal file
View File

@ -0,0 +1,407 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/prefilter_tree.h"
#include <stddef.h>
#include <algorithm>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/prefilter.h"
#include "re2/re2.h"
namespace re2 {
static const bool ExtraDebug = false;
// Creates an uncompiled tree with a minimum atom length of 3.
PrefilterTree::PrefilterTree()
    : PrefilterTree(3) {
}
// Creates an uncompiled tree. min_atom_len is the length below which
// KeepNode() rejects an ATOM node.
PrefilterTree::PrefilterTree(int min_atom_len)
: compiled_(false),
min_atom_len_(min_atom_len) {
}
// Deletes every prefilter handed over through Add() and the parent map of
// every entry.
PrefilterTree::~PrefilterTree() {
  for (Prefilter* prefilter : prefilter_vec_)
    delete prefilter;
  for (Entry& entry : entries_)
    delete entry.parents;
}
// Registers the prefilter of the next regexp and takes ownership of it.
// A prefilter that KeepNode() rejects is deleted and recorded as NULL, as is
// a NULL argument; such regexps are later reported as unfiltered. Calling
// Add() after Compile() is an error and has no effect.
void PrefilterTree::Add(Prefilter* prefilter) {
  if (compiled_) {
    LOG(DFATAL) << "Add called after Compile.";
    return;
  }

  const bool discard = (prefilter != NULL) && !KeepNode(prefilter);
  if (discard) {
    delete prefilter;
    prefilter = NULL;
  }
  prefilter_vec_.push_back(prefilter);
}
// Finalizes the tree: assigns unique ids to all nodes, fills *atom_vec with
// the atoms to search for, and prunes triggers that fan out to too many
// parents. Calling it a second time is an error; calling it before any Add()
// does nothing and leaves the tree uncompiled.
void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
if (compiled_) {
LOG(DFATAL) << "Compile called already.";
return;
}
// Some legacy users of PrefilterTree call Compile() before
// adding any regexps and expect Compile() to have no effect.
if (prefilter_vec_.empty())
return;
compiled_ = true;
// TODO(junyer): Use std::unordered_set<Prefilter*> instead?
NodeMap nodes;
AssignUniqueIds(&nodes, atom_vec);
// Identify nodes that are too common among prefilters and are
// triggering too many parents. Then get rid of them if possible.
// Note that getting rid of a prefilter node simply means they are
// no longer necessary for their parent to trigger; that is, we do
// not miss out on any regexps triggering by getting rid of a
// prefilter node.
for (size_t i = 0; i < entries_.size(); i++) {
StdIntMap* parents = entries_[i].parents;
if (parents->size() > 8) {
// This one triggers too many things. If all the parents are AND
// nodes and have other things guarding them, then get rid of
// this trigger. TODO(vsri): Adjust the threshold appropriately,
// make it a function of total number of nodes?
bool have_other_guard = true;
// A parent has another guard when it waits for more than one child.
for (StdIntMap::iterator it = parents->begin();
it != parents->end(); ++it) {
have_other_guard = have_other_guard &&
(entries_[it->first].propagate_up_at_count > 1);
}
if (have_other_guard) {
// Each parent now needs one child fewer, and this node stops
// triggering them.
for (StdIntMap::iterator it = parents->begin();
it != parents->end(); ++it)
entries_[it->first].propagate_up_at_count -= 1;
parents->clear(); // Forget the parents
}
}
}
if (ExtraDebug)
PrintDebugInfo(&nodes);
}
// Looks up the node registered in *nodes under NodeString(node).
// Returns NULL when no node with that string has been registered.
Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
  std::map<std::string, Prefilter*>::iterator found =
      nodes->find(NodeString(node));
  if (found != nodes->end())
    return found->second;
  return NULL;
}
// Returns the key that identifies node in a NodeMap: the numeric op, a ':',
// and then either the atom (ATOM nodes) or the comma-separated unique ids of
// the children. The leading op disambiguates AND/OR/atom nodes.
std::string PrefilterTree::NodeString(Prefilter* node) const {
  std::string key = StringPrintf("%d", node->op()) + ":";
  if (node->op() == Prefilter::ATOM) {
    key += node->atom();
    return key;
  }

  const std::vector<Prefilter*>& children = *node->subs();
  for (size_t k = 0; k < children.size(); k++) {
    if (k > 0)
      key += ',';
    key += StringPrintf("%d", children[k]->unique_id());
  }
  return key;
}
// Decides whether node is worth keeping as a filter.
//  - NULL, ALL and NONE nodes are rejected.
//  - An ATOM is kept when its atom has at least min_atom_len_ characters.
//  - An AND is pruned in place: rejected children are deleted and removed,
//    and the node is kept if at least one child remains.
//  - An OR is kept only if every child is kept; it returns at the first
//    rejected child and deletes nothing itself.
// The recursive calls may already have pruned AND nodes further down even
// when the overall answer is false.
bool PrefilterTree::KeepNode(Prefilter* node) const {
if (node == NULL)
return false;
switch (node->op()) {
default:
LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
return false;
case Prefilter::ALL:
case Prefilter::NONE:
return false;
case Prefilter::ATOM:
return node->atom().size() >= static_cast<size_t>(min_atom_len_);
case Prefilter::AND: {
// j counts the children kept so far; they are compacted to the front.
int j = 0;
std::vector<Prefilter*>* subs = node->subs();
for (size_t i = 0; i < subs->size(); i++)
if (KeepNode((*subs)[i]))
(*subs)[j++] = (*subs)[i];
else
delete (*subs)[i];
// Drop the tail left over after compaction.
subs->resize(j);
return j > 0;
}
case Prefilter::OR:
for (size_t i = 0; i < node->subs()->size(); i++)
if (!KeepNode((*node->subs())[i]))
return false;
return true;
}
}
// Numbers all prefilter nodes and builds the trigger tables used at search
// time:
//  - *atom_vec is cleared and receives the atom of every unique ATOM node;
//    atom_index_to_id_ gets the unique id of each of those atoms, in the
//    same order.
//  - *nodes maps NodeString(node) to the canonical node for that string.
//  - unfiltered_ receives the index of every regexp whose prefilter is NULL.
//  - entries_ gets one Entry per unique node, holding its parents, its
//    propagate_up_at_count and the regexps it triggers directly.
void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
std::vector<std::string>* atom_vec) {
atom_vec->clear();
// Build vector of all filter nodes, sorted topologically
// from top to bottom in v.
std::vector<Prefilter*> v;
// Add the top level nodes of each regexp prefilter.
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
Prefilter* f = prefilter_vec_[i];
if (f == NULL)
unfiltered_.push_back(static_cast<int>(i));
// We push NULL also on to v, so that we maintain the
// mapping of index==regexpid for level=0 prefilter nodes.
v.push_back(f);
}
// Now add all the descendant nodes.
// v grows while it is scanned, so the children appended here are visited
// later in the same loop.
for (size_t i = 0; i < v.size(); i++) {
Prefilter* f = v[i];
if (f == NULL)
continue;
if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
const std::vector<Prefilter*>& subs = *f->subs();
for (size_t j = 0; j < subs.size(); j++)
v.push_back(subs[j]);
}
}
// Identify unique nodes.
// Walking v backwards handles children before their parents, so a parent's
// NodeString() sees the final ids of its children.
int unique_id = 0;
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
Prefilter *node = v[i];
if (node == NULL)
continue;
node->set_unique_id(-1);
Prefilter* canonical = CanonicalNode(nodes, node);
if (canonical == NULL) {
// Any further nodes that have the same node string
// will find this node as the canonical node.
nodes->emplace(NodeString(node), node);
if (node->op() == Prefilter::ATOM) {
atom_vec->push_back(node->atom());
atom_index_to_id_.push_back(unique_id);
}
node->set_unique_id(unique_id++);
} else {
// Duplicate of an earlier node: share the canonical node's id.
node->set_unique_id(canonical->unique_id());
}
}
entries_.resize(nodes->size());
// Create parent StdIntMap for the entries.
// Only canonical nodes own an entry; duplicates are skipped.
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
if (CanonicalNode(nodes, prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
entry->parents = new StdIntMap();
}
// Fill the entries.
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
if (CanonicalNode(nodes, prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
switch (prefilter->op()) {
default:
case Prefilter::ALL:
LOG(DFATAL) << "Unexpected op: " << prefilter->op();
return;
case Prefilter::ATOM:
entry->propagate_up_at_count = 1;
break;
case Prefilter::OR:
case Prefilter::AND: {
std::set<int> uniq_child;
for (size_t j = 0; j < prefilter->subs()->size(); j++) {
Prefilter* child = (*prefilter->subs())[j];
Prefilter* canonical = CanonicalNode(nodes, child);
if (canonical == NULL) {
LOG(DFATAL) << "Null canonical node";
return;
}
int child_id = canonical->unique_id();
uniq_child.insert(child_id);
// To the child, we want to add to parent indices.
Entry* child_entry = &entries_[child_id];
if (child_entry->parents->find(prefilter->unique_id()) ==
child_entry->parents->end()) {
(*child_entry->parents)[prefilter->unique_id()] = 1;
}
}
// An AND fires once all of its distinct children have fired; an OR fires
// on the first one.
entry->propagate_up_at_count = prefilter->op() == Prefilter::AND
? static_cast<int>(uniq_child.size())
: 1;
break;
}
}
}
// For top level nodes, populate regexp id.
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
if (prefilter_vec_[i] == NULL)
continue;
int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
DCHECK_LE(0, id);
Entry* entry = &entries_[id];
entry->regexps.push_back(static_cast<int>(i));
}
}
// Functions for triggering during search.
void PrefilterTree::RegexpsGivenStrings(
const std::vector<int>& matched_atoms,
std::vector<int>* regexps) const {
regexps->clear();
if (!compiled_) {
// Some legacy users of PrefilterTree call Compile() before
// adding any regexps and expect Compile() to have no effect.
// This kludge is a counterpart to that kludge.
if (prefilter_vec_.empty())
return;
LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
for (size_t i = 0; i < prefilter_vec_.size(); i++)
regexps->push_back(static_cast<int>(i));
} else {
IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
std::vector<int> matched_atom_ids;
for (size_t j = 0; j < matched_atoms.size(); j++)
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
PropagateMatch(matched_atom_ids, &regexps_map);
for (IntMap::iterator it = regexps_map.begin();
it != regexps_map.end();
++it)
regexps->push_back(it->index());
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
}
std::sort(regexps->begin(), regexps->end());
}
void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
IntMap* regexps) const {
IntMap count(static_cast<int>(entries_.size()));
IntMap work(static_cast<int>(entries_.size()));
for (size_t i = 0; i < atom_ids.size(); i++)
work.set(atom_ids[i], 1);
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
const Entry& entry = entries_[it->index()];
// Record regexps triggered.
for (size_t i = 0; i < entry.regexps.size(); i++)
regexps->set(entry.regexps[i], 1);
int c;
// Pass trigger up to parents.
for (StdIntMap::iterator it = entry.parents->begin();
it != entry.parents->end();
++it) {
int j = it->first;
const Entry& parent = entries_[j];
// Delay until all the children have succeeded.
if (parent.propagate_up_at_count > 1) {
if (count.has_index(j)) {
c = count.get_existing(j) + 1;
count.set_existing(j, c);
} else {
c = 1;
count.set_new(j, c);
}
if (c < parent.propagate_up_at_count)
continue;
}
// Trigger the parent.
work.set(j, 1);
}
}
}
// Debugging help.
void PrefilterTree::PrintPrefilter(int regexpid) {
LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
}
// Logs the compiled tree for debugging: the number of unique atoms and
// nodes, then for every entry its id, parent count, regexp count and parent
// ids, and finally the unique id and node string of every node in *nodes.
void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
  LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
  LOG(ERROR) << "#Unique Nodes: " << entries_.size();

  for (size_t id = 0; id < entries_.size(); id++) {
    StdIntMap* parent_ids = entries_[id].parents;
    const std::vector<int>& triggered = entries_[id].regexps;
    LOG(ERROR) << "EntryId: " << id
               << " N: " << parent_ids->size() << " R: " << triggered.size();
    for (StdIntMap::iterator up = parent_ids->begin();
         up != parent_ids->end(); ++up)
      LOG(ERROR) << up->first;
  }

  LOG(ERROR) << "Map:";
  for (std::map<std::string, Prefilter*>::const_iterator entry =
           nodes->begin();
       entry != nodes->end(); ++entry)
    LOG(ERROR) << "NodeId: " << entry->second->unique_id()
               << " Str: " << entry->first;
}
// Returns a readable rendering of the subtree rooted at node. An ATOM is
// rendered as its atom; any other node as "AND(...)" or "OR(...)" listing
// each child as "<unique id>:<child rendering>", separated by commas.
// node must not be NULL.
std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
  if (node->op() == Prefilter::ATOM) {
    DCHECK(!node->atom().empty());
    return node->atom();
  }

  // Adding the operation disambiguates AND and OR nodes.
  std::string out = node->op() == Prefilter::AND ? "AND" : "OR";
  out += "(";
  for (size_t k = 0; k < node->subs()->size(); k++) {
    Prefilter* child = (*node->subs())[k];
    if (k > 0)
      out += ',';
    out += StringPrintf("%d", child->unique_id());
    out += ":";
    out += DebugNodeString(child);
  }
  out += ")";
  return out;
}
} // namespace re2

139
extern/re2/re2/prefilter_tree.h vendored Normal file
View File

@ -0,0 +1,139 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_PREFILTER_TREE_H_
#define RE2_PREFILTER_TREE_H_
// The PrefilterTree class is used to form an AND-OR tree of strings
// that would trigger each regexp. The 'prefilter' of each regexp is
// added to PrefilterTree, and then PrefilterTree is used to find all
// the unique strings across the prefilters. During search, by using
// matches from a string matching engine, PrefilterTree deduces the
// set of regexps that are to be triggered. The 'string matching
// engine' itself is outside of this class, and the caller can use any
// favorite engine. PrefilterTree provides a set of strings (called
// atoms) that the user of this class should use to do the string
// matching.
#include <map>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/sparse_array.h"
#include "re2/prefilter.h"
namespace re2 {
// Collects the prefilters of a set of regexps into an AND-OR tree of atoms.
// Intended call order: Add() once per regexp, then Compile(), then any
// number of RegexpsGivenStrings() calls. Not copyable.
class PrefilterTree {
public:
PrefilterTree();
explicit PrefilterTree(int min_atom_len);
~PrefilterTree();
// Adds the prefilter for the next regexp. Note that we assume that
// Add is called sequentially for all regexps. All Add calls
// must precede Compile.
void Add(Prefilter* prefilter);
// Compile returns a vector of strings in atom_vec.
// Call this after all the prefilters are added through Add.
// No calls to Add after Compile are allowed.
// The caller should use the returned set of strings to do string matching.
// Each time a string matches, the corresponding index then has to be
// collected and passed to RegexpsGivenStrings below.
void Compile(std::vector<std::string>* atom_vec);
// Given the indices of the atoms that matched, returns the indexes
// of regexps that should be searched. The matched_atoms should
// contain all the ids of string atoms that were found to match the
// content. The caller can use any string match engine to perform
// this function. This function is thread safe.
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* regexps) const;
// Print debug prefilter. Also prints unique ids associated with
// nodes of the prefilter of the regexp.
void PrintPrefilter(int regexpid);
private:
typedef SparseArray<int> IntMap;
typedef std::map<int, int> StdIntMap;
typedef std::map<std::string, Prefilter*> NodeMap;
// Each unique node has a corresponding Entry that helps in
// passing the matching trigger information along the tree.
struct Entry {
public:
// How many children should match before this node triggers the
// parent. For an atom and an OR node, this is 1 and for an AND
// node, it is the number of unique children.
int propagate_up_at_count;
// When this node is ready to trigger the parent, what are the indices
// of the parent nodes to trigger. The reason there may be more than
// one is because of sharing. For example (abc | def) and (xyz | def)
// are two different nodes, but they share the atom 'def'. So when
// 'def' matches, it triggers two parents, corresponding to the two
// different OR nodes.
StdIntMap* parents;
// When this node is ready to trigger the parent, what are the
// regexps that are triggered.
std::vector<int> regexps;
};
// Returns true if the prefilter node should be kept.
bool KeepNode(Prefilter* node) const;
// This function assigns unique ids to various parts of the
// prefilter, by checking whether these nodes are already in the
// PrefilterTree.
void AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec);
// Given the matching atoms, find the regexps to be triggered.
void PropagateMatch(const std::vector<int>& atom_ids,
IntMap* regexps) const;
// Returns the prefilter node that has the same NodeString as this
// node. For the canonical node, returns node.
Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node);
// A string that uniquely identifies the node. Assumes that the
// children of node have already been assigned unique ids.
std::string NodeString(Prefilter* node) const;
// Recursively constructs a readable prefilter string.
std::string DebugNodeString(Prefilter* node) const;
// Used for debugging.
void PrintDebugInfo(NodeMap* nodes);
// These are all the nodes formed by Compile. Essentially, there is
// one node for each unique atom and each unique AND/OR node.
std::vector<Entry> entries_;
// Indices of regexps that always pass through the filter (since we
// found no required literals in these regexps).
std::vector<int> unfiltered_;
// Vector of Prefilter for all regexps.
std::vector<Prefilter*> prefilter_vec_;
// Atom index in returned strings to entry id mapping.
std::vector<int> atom_index_to_id_;
// Has the prefilter tree been compiled.
bool compiled_;
// Strings less than this length are not stored as atoms.
const int min_atom_len_;
// Copying is disabled.
PrefilterTree(const PrefilterTree&) = delete;
PrefilterTree& operator=(const PrefilterTree&) = delete;
};
} // namespace
#endif // RE2_PREFILTER_TREE_H_

921
extern/re2/re2/prog.cc vendored Normal file
View File

@ -0,0 +1,921 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Compiled regular expression representation.
// Tested by compile_test.cc
#include "re2/prog.h"
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <memory>
#include <utility>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/bitmap256.h"
#include "re2/stringpiece.h"
namespace re2 {
// Constructors per Inst opcode
// Turns a still-zeroed instruction into an Alt that chooses between
// out and out1.
void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
  DCHECK_EQ(out_opcode_, 0);
  out1_ = out1;
  set_out_opcode(out, kInstAlt);
}
// Turns a still-zeroed instruction into a ByteRange matching [lo, hi]
// and continuing at out. Only the low 8 bits of lo and hi and the low
// bit of foldcase are kept; the hint bits start out as zero.
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
  DCHECK_EQ(out_opcode_, 0);
  hint_foldcase_ = foldcase & 1;
  hi_ = hi & 0xFF;
  lo_ = lo & 0xFF;
  set_out_opcode(out, kInstByteRange);
}
// Turns a still-zeroed instruction into a Capture that records into
// register cap and continues at out.
void Prog::Inst::InitCapture(int cap, uint32_t out) {
  DCHECK_EQ(out_opcode_, 0);
  cap_ = cap;
  set_out_opcode(out, kInstCapture);
}
// Turns a still-zeroed instruction into an EmptyWidth assertion for the
// flags in empty, continuing at out.
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
  DCHECK_EQ(out_opcode_, 0);
  empty_ = empty;
  set_out_opcode(out, kInstEmptyWidth);
}
// Turns a still-zeroed instruction into a Match carrying match id `id`.
// The parameter is spelled `int` to agree with the in-class declaration
// `void InitMatch(int id)`; spelling it `int32_t` only compiles where
// int32_t happens to be a typedef for int.
void Prog::Inst::InitMatch(int id) {
  // Precondition (debug builds only): the instruction has not been set up.
  DCHECK_EQ(out_opcode_, 0);
  set_opcode(kInstMatch);
  match_id_ = id;
}
void Prog::Inst::InitNop(uint32_t out) {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstNop);
}
// Turns a still-zeroed instruction into a Fail instruction.
void Prog::Inst::InitFail() {
// Precondition (debug builds only): the instruction has not been set up.
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstFail);
}
std::string Prog::Inst::Dump() {
switch (opcode()) {
default:
return StringPrintf("opcode %d", static_cast<int>(opcode()));
case kInstAlt:
return StringPrintf("alt -> %d | %d", out(), out1_);
case kInstAltMatch:
return StringPrintf("altmatch -> %d | %d", out(), out1_);
case kInstByteRange:
return StringPrintf("byte%s [%02x-%02x] %d -> %d",
foldcase() ? "/i" : "",
lo_, hi_, hint(), out());
case kInstCapture:
return StringPrintf("capture %d -> %d", cap_, out());
case kInstEmptyWidth:
return StringPrintf("emptywidth %#x -> %d",
static_cast<int>(empty_), out());
case kInstMatch:
return StringPrintf("match! %d", match_id());
case kInstNop:
return StringPrintf("nop -> %d", out());
case kInstFail:
return StringPrintf("fail");
}
}
// Constructs an empty program: no anchors, not reversed, not yet flattened
// or one-pass analysed, zero instructions, and no DFAs allocated.
// first_byte_ starts at -1 and is overwritten lazily by first_byte().
Prog::Prog()
: anchor_start_(false),
anchor_end_(false),
reversed_(false),
did_flatten_(false),
did_onepass_(false),
start_(0),
start_unanchored_(0),
size_(0),
bytemap_range_(0),
first_byte_(-1),
flags_(0),
list_count_(0),
dfa_mem_(0),
dfa_first_(NULL),
dfa_longest_(NULL) {
}
// Releases both DFAs through DeleteDFA; the pointers are NULL if the
// corresponding DFA was never assigned (see the constructor).
Prog::~Prog() {
DeleteDFA(dfa_longest_);
DeleteDFA(dfa_first_);
}
typedef SparseSet Workq;
// Inserts id into q unless id is 0; instruction 0 is never queued.
static inline void AddToQueue(Workq* q, int id) {
  if (id == 0)
    return;
  q->insert(id);
}
// Dumps every instruction reachable from the ids already in q, one per
// line as "<id>. <dump>". Successors are appended to q while it is being
// walked, so q->end() is re-read on every iteration.
static std::string ProgToString(Prog* prog, Workq* q) {
  std::string text;
  for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
    const int id = *it;
    Prog::Inst* inst = prog->inst(id);
    text += StringPrintf("%d. %s\n", id, inst->Dump().c_str());
    AddToQueue(q, inst->out());
    switch (inst->opcode()) {
      case kInstAlt:
      case kInstAltMatch:
        // Only alternations have a second successor.
        AddToQueue(q, inst->out1());
        break;
      default:
        break;
    }
  }
  return text;
}
// Dumps the instructions of a flattened program from start up to
// prog->size(), one per line. The id is followed by '.' when the
// instruction is the last one of its list and by '+' otherwise.
static std::string FlattenedProgToString(Prog* prog, int start) {
  std::string text;
  int id = start;
  while (id < prog->size()) {
    Prog::Inst* inst = prog->inst(id);
    if (!inst->last()) {
      text += StringPrintf("%d+ %s\n", id, inst->Dump().c_str());
    } else {
      text += StringPrintf("%d. %s\n", id, inst->Dump().c_str());
    }
    ++id;
  }
  return text;
}
std::string Prog::Dump() {
if (did_flatten_)
return FlattenedProgToString(this, start_);
Workq q(size_);
AddToQueue(&q, start_);
return ProgToString(this, &q);
}
std::string Prog::DumpUnanchored() {
if (did_flatten_)
return FlattenedProgToString(this, start_unanchored_);
Workq q(size_);
AddToQueue(&q, start_unanchored_);
return ProgToString(this, &q);
}
// Returns a textual dump of bytemap_: one "[lo-hi] -> class" line for each
// maximal run of consecutive bytes that map to the same value.
std::string Prog::DumpByteMap() {
  std::string text;
  int lo = 0;
  while (lo < 256) {
    const int byte_class = bytemap_[lo];
    // Extend the run for as long as the next byte has the same class.
    int hi = lo;
    while (hi < 256-1 && bytemap_[hi+1] == byte_class)
      ++hi;
    text += StringPrintf("[%02x-%02x] -> %d\n", lo, hi, byte_class);
    lo = hi + 1;
  }
  return text;
}
int Prog::first_byte() {
std::call_once(first_byte_once_, [](Prog* prog) {
prog->first_byte_ = prog->ComputeFirstByte();
}, this);
return first_byte_;
}
static bool IsMatch(Prog*, Prog::Inst*);
// Peep-hole optimizer.
void Prog::Optimize() {
Workq q(size_);
// Eliminate nops. Most are taken out during compilation
// but a few are hard to avoid.
q.clear();
AddToQueue(&q, start_);
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
int id = *i;
Inst* ip = inst(id);
int j = ip->out();
Inst* jp;
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
j = jp->out();
}
ip->set_out(j);
AddToQueue(&q, ip->out());
if (ip->opcode() == kInstAlt) {
j = ip->out1();
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
j = jp->out();
}
ip->out1_ = j;
AddToQueue(&q, ip->out1());
}
}
// Insert kInstAltMatch instructions
// Look for
// ip: Alt -> j | k
// j: ByteRange [00-FF] -> ip
// k: Match
// or the reverse (the above is the greedy one).
// Rewrite Alt to AltMatch.
q.clear();
AddToQueue(&q, start_);
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
int id = *i;
Inst* ip = inst(id);
AddToQueue(&q, ip->out());
if (ip->opcode() == kInstAlt)
AddToQueue(&q, ip->out1());
if (ip->opcode() == kInstAlt) {
Inst* j = inst(ip->out());
Inst* k = inst(ip->out1());
if (j->opcode() == kInstByteRange && j->out() == id &&
j->lo() == 0x00 && j->hi() == 0xFF &&
IsMatch(this, k)) {
ip->set_opcode(kInstAltMatch);
continue;
}
if (IsMatch(this, j) &&
k->opcode() == kInstByteRange && k->out() == id &&
k->lo() == 0x00 && k->hi() == 0xFF) {
ip->set_opcode(kInstAltMatch);
}
}
}
}
// Is ip a guaranteed match at end of text, perhaps after some capturing?
// Follows Capture and Nop instructions starting at ip and reports whether
// the chain ends in a Match instruction. Any other opcode ends the walk
// with false.
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
  while (true) {
    const InstOp op = ip->opcode();
    if (op == kInstMatch)
      return true;
    if (op == kInstCapture || op == kInstNop) {
      // Transparent for this purpose: keep following the chain.
      ip = prog->inst(ip->out());
      continue;
    }
    const bool known_non_match = op == kInstAlt ||
                                 op == kInstAltMatch ||
                                 op == kInstByteRange ||
                                 op == kInstFail ||
                                 op == kInstEmptyWidth;
    if (!known_non_match)
      LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
    return false;
  }
}
// Returns the set of kEmpty* flags that hold at position p within text.
// Exactly one of kEmptyWordBoundary / kEmptyNonWordBoundary is always set.
uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
  const bool at_begin = (p == text.begin());
  const bool at_end = (p == text.end());
  int flags = 0;

  // ^ and \A
  if (at_begin) {
    flags |= kEmptyBeginText | kEmptyBeginLine;
  } else if (p[-1] == '\n') {
    flags |= kEmptyBeginLine;
  }

  // $ and \z
  if (at_end) {
    flags |= kEmptyEndText | kEmptyEndLine;
  } else if (p < text.end() && p[0] == '\n') {
    flags |= kEmptyEndLine;
  }

  // \b and \B
  // A side that lies outside the text counts as a non-word character, so
  // a boundary exists exactly when the two sides disagree.
  const bool word_before = !at_begin && IsWordChar(p[-1]);
  const bool word_after = !at_end && IsWordChar(p[0]);
  if (word_before != word_after) {
    flags |= kEmptyWordBoundary;
  } else {
    flags |= kEmptyNonWordBoundary;
  }
  return flags;
}
// ByteMapBuilder implements a coloring algorithm.
//
// The first phase is a series of "mark and merge" batches: we mark one or more
// [lo-hi] ranges, then merge them into our internal state. Batching is not for
// performance; rather, it means that the ranges are treated indistinguishably.
//
// Internally, the ranges are represented using a bitmap that stores the splits
// and a vector that stores the colors; both of them are indexed by the ranges'
// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
// hi (if not already split), then recolor each range in between. The color map
// (i.e. from the old color to the new color) is maintained for the lifetime of
// the batch and so underpins this somewhat obscure approach to set operations.
//
// The second phase builds the bytemap from our internal state: we recolor each
// range, then store the new color (which is now the byte class) in each of the
// corresponding array elements. Finally, we output the number of byte classes.
class ByteMapBuilder {
 public:
  // Initial state: the [0-255] range has color 256.
  // This will avoid problems during the second phase,
  // in which we assign byte classes numbered from 0.
  ByteMapBuilder() : nextcolor_(257) {
    splits_.Set(255);
    colors_[255] = 256;
  }

  // Not copyable.
  ByteMapBuilder(const ByteMapBuilder&) = delete;
  ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;

  // Queues the [lo-hi] range for the current batch.
  void Mark(int lo, int hi);
  // Applies the queued ranges to the internal state and ends the batch.
  void Merge();
  // Writes the byte class of every byte into bytemap and the number of
  // byte classes into *bytemap_range.
  void Build(uint8_t* bytemap, int* bytemap_range);

 private:
  // Maps oldcolor to its replacement color for the current batch.
  int Recolor(int oldcolor);

  Bitmap256 splits_;   // bit set at the last byte of each range
  int colors_[256];    // color of each range, indexed by its last byte
  int nextcolor_;      // next unused color
  std::vector<std::pair<int, int>> colormap_;  // (old, new) colors
  std::vector<std::pair<int, int>> ranges_;    // ranges marked so far
};
// Queues the inclusive range [lo-hi] for the next Merge(). Both bounds must
// lie in [0, 255] with lo <= hi (checked in debug builds).
void ByteMapBuilder::Mark(int lo, int hi) {
  DCHECK_GE(lo, 0);
  DCHECK_GE(hi, 0);
  DCHECK_LE(lo, 255);
  DCHECK_LE(hi, 255);
  DCHECK_LE(lo, hi);

  // Ignore any [0-255] ranges. They cause us to recolor every range, which
  // has no effect on the eventual result and is therefore a waste of time.
  const bool covers_everything = (lo == 0 && hi == 255);
  if (!covers_everything)
    ranges_.emplace_back(lo, hi);
}
void ByteMapBuilder::Merge() {
for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
it != ranges_.end();
++it) {
int lo = it->first-1;
int hi = it->second;
if (0 <= lo && !splits_.Test(lo)) {
splits_.Set(lo);
int next = splits_.FindNextSetBit(lo+1);
colors_[lo] = colors_[next];
}
if (!splits_.Test(hi)) {
splits_.Set(hi);
int next = splits_.FindNextSetBit(hi+1);
colors_[hi] = colors_[next];
}
int c = lo+1;
while (c < 256) {
int next = splits_.FindNextSetBit(c);
colors_[next] = Recolor(colors_[next]);
if (next == hi)
break;
c = next+1;
}
}
colormap_.clear();
ranges_.clear();
}
// Fills bytemap[0..255] with byte classes and stores the number of classes
// in *bytemap_range.
void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
  // Assign byte classes numbered from 0.
  nextcolor_ = 0;

  for (int c = 0; c < 256;) {
    const int range_end = splits_.FindNextSetBit(c);
    const uint8_t byte_class =
        static_cast<uint8_t>(Recolor(colors_[range_end]));
    // Every byte up to and including range_end shares this class.
    for (; c <= range_end; ++c)
      bytemap[c] = byte_class;
  }

  *bytemap_range = nextcolor_;
}
// Returns the replacement for oldcolor in the current batch, allocating a
// fresh color from nextcolor_ the first time oldcolor is seen.
int ByteMapBuilder::Recolor(int oldcolor) {
  // Yes, this is a linear search. There can be at most 256
  // colors and there will typically be far fewer than that.
  // Also, we need to consider keys *and* values in order to
  // avoid recoloring a given range more than once per batch.
  for (const std::pair<int, int>& kv : colormap_) {
    if (kv.first == oldcolor || kv.second == oldcolor)
      return kv.second;
  }

  const int newcolor = nextcolor_++;
  colormap_.emplace_back(oldcolor, newcolor);
  return newcolor;
}
// Computes bytemap_ and bytemap_range_ by scanning every instruction:
// ByteRange instructions contribute their [lo-hi] range (plus the
// upper-case counterpart when case folding applies), and EmptyWidth
// instructions contribute '\n' and/or the word-character ranges.
void Prog::ComputeByteMap() {
// Fill in bytemap with byte classes for the program.
// Ranges of bytes that are treated indistinguishably
// will be mapped to a single byte class.
ByteMapBuilder builder;
// Don't repeat the work for ^ and $.
bool marked_line_boundaries = false;
// Don't repeat the work for \b and \B.
bool marked_word_boundaries = false;
for (int id = 0; id < size(); id++) {
Inst* ip = inst(id);
if (ip->opcode() == kInstByteRange) {
int lo = ip->lo();
int hi = ip->hi();
builder.Mark(lo, hi);
// With case folding, also mark the upper-case image of the part of
// [lo-hi] that overlaps [a-z].
if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
int foldlo = lo;
int foldhi = hi;
if (foldlo < 'a')
foldlo = 'a';
if (foldhi > 'z')
foldhi = 'z';
if (foldlo <= foldhi) {
foldlo += 'A' - 'a';
foldhi += 'A' - 'a';
builder.Mark(foldlo, foldhi);
}
}
// If this Inst is not the last Inst in its list AND the next Inst is
// also a ByteRange AND the Insts have the same out, defer the merge.
if (!ip->last() &&
inst(id+1)->opcode() == kInstByteRange &&
ip->out() == inst(id+1)->out())
continue;
builder.Merge();
} else if (ip->opcode() == kInstEmptyWidth) {
if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
!marked_line_boundaries) {
builder.Mark('\n', '\n');
builder.Merge();
marked_line_boundaries = true;
}
if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
!marked_word_boundaries) {
// We require two batches here: the first for ranges that are word
// characters, the second for ranges that are not word characters.
for (bool isword : {true, false}) {
int j;
// [i, j-1] is a maximal run of bytes with the same IsWordChar() result.
for (int i = 0; i < 256; i = j) {
for (j = i + 1; j < 256 &&
Prog::IsWordChar(static_cast<uint8_t>(i)) ==
Prog::IsWordChar(static_cast<uint8_t>(j));
j++)
;
if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
builder.Mark(i, j - 1);
}
builder.Merge();
}
marked_word_boundaries = true;
}
}
}
builder.Build(bytemap_, &bytemap_range_);
if (0) { // For debugging, use trivial bytemap.
LOG(ERROR) << "Using trivial bytemap.";
for (int i = 0; i < 256; i++)
bytemap_[i] = static_cast<uint8_t>(i);
bytemap_range_ = 256;
}
}
// Prog::Flatten() implements a graph rewriting algorithm.
//
// The overall process is similar to epsilon removal, but retains some epsilon
// transitions: those from Capture and EmptyWidth instructions; and those from
// nullable subexpressions. (The latter avoids quadratic blowup in transitions
// in the worst case.) It might be best thought of as Alt instruction elision.
//
// In conceptual terms, it divides the Prog into "trees" of instructions, then
// traverses the "trees" in order to produce "lists" of instructions. A "tree"
// is one or more instructions that grow from one "root" instruction to one or
// more "leaf" instructions; if a "tree" has exactly one instruction, then the
// "root" is also the "leaf". In most cases, a "root" is the successor of some
// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
// and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
// EmptyWidth or Match instruction. However, this is insufficient for handling
// nested nullable subexpressions correctly, so in some cases, a "root" is the
// dominator of the instructions reachable from some "successor root" (i.e. it
// has an unreachable predecessor) and is considered a "dominator root". Since
// only Alt instructions can be "dominator roots" (other instructions would be
// "leaves"), only Alt instructions are required to be marked as predecessors.
//
// Dividing the Prog into "trees" comprises two passes: marking the "successor
// roots" and the predecessors; and marking the "dominator roots". Sorting the
// "successor roots" by their bytecode offsets enables iteration in order from
// greatest to least during the second pass; by working backwards in this case
// and flooding the graph no further than "leaves" and already marked "roots",
// it becomes possible to mark "dominator roots" without doing excessive work.
//
// Traversing the "trees" is just iterating over the "roots" in order of their
// marking and flooding the graph no further than "leaves" and "roots". When a
// "leaf" is reached, the instruction is copied with its successor remapped to
// its "root" number. When a "root" is reached, a Nop instruction is generated
// with its successor remapped similarly. As each "list" is produced, its last
// instruction is marked as such. After all of the "lists" have been produced,
// a pass over their instructions remaps their successors to bytecode offsets.
// Rewrites the program into "lists" of instructions and replaces inst_
// with the result. Idempotent: a second call returns immediately because
// did_flatten_ is set on the first.
void Prog::Flatten() {
if (did_flatten_)
return;
did_flatten_ = true;
// Scratch structures. It's important that these are reused by functions
// that we call in loops because they would thrash the heap otherwise.
SparseSet reachable(size());
std::vector<int> stk;
stk.reserve(size());
// First pass: Marks "successor roots" and predecessors.
// Builds the mapping from inst-ids to root-ids.
SparseArray<int> rootmap(size());
SparseArray<int> predmap(size());
std::vector<std::vector<int>> predvec;
MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
// Second pass: Marks "dominator roots".
// Walks a sorted copy of rootmap from its last entry down to, but not
// including, its first entry (presumably instruction 0 - TODO confirm
// the ordering defined by sorted.less).
SparseArray<int> sorted(rootmap);
std::sort(sorted.begin(), sorted.end(), sorted.less);
for (SparseArray<int>::const_iterator i = sorted.end() - 1;
i != sorted.begin();
--i) {
if (i->index() != start_unanchored() && i->index() != start())
MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
}
// Third pass: Emits "lists". Remaps outs to root-ids.
// Builds the mapping from root-ids to flat-ids.
std::vector<int> flatmap(rootmap.size());
std::vector<Inst> flat;
flat.reserve(size());
for (SparseArray<int>::const_iterator i = rootmap.begin();
i != rootmap.end();
++i) {
flatmap[i->value()] = static_cast<int>(flat.size());
EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
flat.back().set_last();
// We have the bounds of the "list", so this is the
// most convenient point at which to compute hints.
ComputeHints(&flat, flatmap[i->value()], static_cast<int>(flat.size()));
}
list_count_ = static_cast<int>(flatmap.size());
for (int i = 0; i < kNumInst; i++)
inst_count_[i] = 0;
// Fourth pass: Remaps outs to flat-ids.
// Counts instructions by opcode.
for (int id = 0; id < static_cast<int>(flat.size()); id++) {
Inst* ip = &flat[id];
if (ip->opcode() != kInstAltMatch) // handled in EmitList()
ip->set_out(flatmap[ip->out()]);
inst_count_[ip->opcode()]++;
}
// Sanity check: the per-opcode counts must add up to the total.
int total = 0;
for (int i = 0; i < kNumInst; i++)
total += inst_count_[i];
DCHECK_EQ(total, static_cast<int>(flat.size()));
// Remap start_unanchored and start.
// MarkSuccessors() gives root-id 1 to start_unanchored and, when start
// is a different instruction, root-id 2 to start.
if (start_unanchored() == 0) {
DCHECK_EQ(start(), 0);
} else if (start_unanchored() == start()) {
set_start_unanchored(flatmap[1]);
set_start(flatmap[1]);
} else {
set_start_unanchored(flatmap[1]);
set_start(flatmap[2]);
}
// Finally, replace the old instructions with the new instructions.
size_ = static_cast<int>(flat.size());
inst_ = PODArray<Inst>(size_);
memmove(inst_.data(), flat.data(), size_*sizeof inst_[0]);
// Populate the list heads for BitState.
// 512 instructions limits the memory footprint to 1KiB.
if (size_ <= 512) {
list_heads_ = PODArray<uint16_t>(size_);
// 0xFF makes it more obvious if we try to look up a non-head.
memset(list_heads_.data(), 0xFF, size_*sizeof list_heads_[0]);
for (int i = 0; i < list_count_; ++i)
list_heads_[flatmap[i]] = i;
}
}
// First pass of Flatten(). Walks the instructions reachable from
// start_unanchored() and records:
//   - in rootmap: instruction 0, start_unanchored(), start() and the out
//     of every ByteRange, Capture and EmptyWidth instruction, each mapped
//     to a root-id assigned in order of discovery;
//   - in predmap/predvec: for each out of an Alt or AltMatch instruction,
//     the ids of the Alt/AltMatch instructions that lead to it.
// reachable and stk are scratch space and are cleared on entry.
void Prog::MarkSuccessors(SparseArray<int>* rootmap,
SparseArray<int>* predmap,
std::vector<std::vector<int>>* predvec,
SparseSet* reachable, std::vector<int>* stk) {
// Mark the kInstFail instruction.
rootmap->set_new(0, rootmap->size());
// Mark the start_unanchored and start instructions.
if (!rootmap->has_index(start_unanchored()))
rootmap->set_new(start_unanchored(), rootmap->size());
if (!rootmap->has_index(start()))
rootmap->set_new(start(), rootmap->size());
reachable->clear();
stk->clear();
stk->push_back(start_unanchored());
while (!stk->empty()) {
int id = stk->back();
stk->pop_back();
// Re-entered via goto to follow a single successor without touching stk.
Loop:
if (reachable->contains(id))
continue;
reachable->insert_new(id);
Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
case kInstAltMatch:
case kInstAlt:
// Mark this instruction as a predecessor of each out.
for (int out : {ip->out(), ip->out1()}) {
if (!predmap->has_index(out)) {
predmap->set_new(out, static_cast<int>(predvec->size()));
predvec->emplace_back();
}
(*predvec)[predmap->get_existing(out)].emplace_back(id);
}
// Visit out now and out1 later.
stk->push_back(ip->out1());
id = ip->out();
goto Loop;
case kInstByteRange:
case kInstCapture:
case kInstEmptyWidth:
// Mark the out of this instruction as a "root".
if (!rootmap->has_index(ip->out()))
rootmap->set_new(ip->out(), rootmap->size());
id = ip->out();
goto Loop;
case kInstNop:
id = ip->out();
goto Loop;
case kInstMatch:
case kInstFail:
break;
}
}
}
// Second pass of Flatten(), run once per root. Floods from root through
// Alt, AltMatch and Nop instructions, stopping at other roots and at
// ByteRange, Capture, EmptyWidth, Match and Fail instructions. Any visited
// instruction that has a recorded predecessor outside the visited set is
// then added to rootmap. reachable and stk are scratch space and are
// cleared on entry.
void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
SparseArray<int>* predmap,
std::vector<std::vector<int>>* predvec,
SparseSet* reachable, std::vector<int>* stk) {
reachable->clear();
stk->clear();
stk->push_back(root);
while (!stk->empty()) {
int id = stk->back();
stk->pop_back();
// Re-entered via goto to follow a single successor without touching stk.
Loop:
if (reachable->contains(id))
continue;
reachable->insert_new(id);
if (id != root && rootmap->has_index(id)) {
// We reached another "tree" via epsilon transition.
continue;
}
Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
case kInstAltMatch:
case kInstAlt:
// Visit out now and out1 later.
stk->push_back(ip->out1());
id = ip->out();
goto Loop;
case kInstByteRange:
case kInstCapture:
case kInstEmptyWidth:
// These end the flood: their successors are not followed.
break;
case kInstNop:
id = ip->out();
goto Loop;
case kInstMatch:
case kInstFail:
break;
}
}
// Check the recorded predecessors of everything that was visited.
for (SparseSet::const_iterator i = reachable->begin();
i != reachable->end();
++i) {
int id = *i;
if (predmap->has_index(id)) {
for (int pred : (*predvec)[predmap->get_existing(id)]) {
if (!reachable->contains(pred)) {
// id has a predecessor that cannot be reached from root!
// Therefore, id must be a "root" too - mark it as such.
if (!rootmap->has_index(id))
rootmap->set_new(id, rootmap->size());
}
}
}
}
}
// Third pass of Flatten(), run once per root. Appends to flat the "list"
// of instructions for the "tree" that starts at root. The out of every
// emitted ByteRange, Capture, EmptyWidth and Nop is a root-id taken from
// rootmap, not yet a position in flat. reachable and stk are scratch space
// and are cleared on entry.
void Prog::EmitList(int root, SparseArray<int>* rootmap,
std::vector<Inst>* flat,
SparseSet* reachable, std::vector<int>* stk) {
reachable->clear();
stk->clear();
stk->push_back(root);
while (!stk->empty()) {
int id = stk->back();
stk->pop_back();
// Re-entered via goto to follow a single successor without touching stk.
Loop:
if (reachable->contains(id))
continue;
reachable->insert_new(id);
if (id != root && rootmap->has_index(id)) {
// We reached another "tree" via epsilon transition. Emit a kInstNop
// instruction so that the Prog does not become quadratically larger.
flat->emplace_back();
flat->back().set_opcode(kInstNop);
flat->back().set_out(rootmap->get_existing(id));
continue;
}
Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
case kInstAltMatch:
// Emit the AltMatch itself; its out and out1 are set to the two
// positions in flat that follow it, so they are already flat-ids.
flat->emplace_back();
flat->back().set_opcode(kInstAltMatch);
flat->back().set_out(static_cast<int>(flat->size()));
flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
FALLTHROUGH_INTENDED;
case kInstAlt:
// A plain Alt emits nothing itself; only its successors are visited.
stk->push_back(ip->out1());
id = ip->out();
goto Loop;
case kInstByteRange:
case kInstCapture:
case kInstEmptyWidth:
// Copy the instruction and replace its out with the root-id.
flat->emplace_back();
memmove(&flat->back(), ip, sizeof *ip);
flat->back().set_out(rootmap->get_existing(ip->out()));
break;
case kInstNop:
id = ip->out();
goto Loop;
case kInstMatch:
case kInstFail:
// Copied unchanged.
flat->emplace_back();
memmove(&flat->back(), ip, sizeof *ip);
break;
}
}
}
// For each ByteRange instruction in [begin, end), computes a hint to execution
// engines: the delta to the next instruction (in flat) worth exploring iff the
// current instruction matched.
//
// Implements a coloring algorithm related to ByteMapBuilder, but in this case,
// colors are instructions and recoloring ranges precisely identifies conflicts
// between instructions. Iterating backwards over [begin, end) is guaranteed to
// identify the nearest conflict (if any) with only linear complexity.
// Walks flat from index end down to begin and ORs a hint into
// hint_foldcase_ of each ByteRange instruction for which a conflict was
// found. The hint is the distance to the conflicting instruction, capped
// at 32767.
void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {
Bitmap256 splits;
int colors[256];
// True once splits has been modified since it was last cleared.
bool dirty = false;
for (int id = end; id >= begin; --id) {
if (id == end ||
(*flat)[id].opcode() != kInstByteRange) {
if (dirty) {
dirty = false;
splits.Clear();
}
splits.Set(255);
colors[255] = id;
// At this point, the [0-255] range is colored with id.
// Thus, hints cannot point beyond id; and if id == end,
// hints that would have pointed to id will be 0 instead.
continue;
}
dirty = true;
// We recolor the [lo-hi] range with id. Note that first ratchets backwards
// from end to the nearest conflict (if any) during recoloring.
int first = end;
auto Recolor = [&](int lo, int hi) {
// Like ByteMapBuilder, we split at lo-1 and at hi.
--lo;
if (0 <= lo && !splits.Test(lo)) {
splits.Set(lo);
int next = splits.FindNextSetBit(lo+1);
colors[lo] = colors[next];
}
if (!splits.Test(hi)) {
splits.Set(hi);
int next = splits.FindNextSetBit(hi+1);
colors[hi] = colors[next];
}
int c = lo+1;
while (c < 256) {
int next = splits.FindNextSetBit(c);
// Ratchet backwards...
first = std::min(first, colors[next]);
// Recolor with id - because it's the new nearest conflict!
colors[next] = id;
if (next == hi)
break;
c = next+1;
}
};
Inst* ip = &(*flat)[id];
int lo = ip->lo();
int hi = ip->hi();
Recolor(lo, hi);
// With case folding, also recolor the upper-case image of the part of
// [lo-hi] that overlaps [a-z].
if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
int foldlo = lo;
int foldhi = hi;
if (foldlo < 'a')
foldlo = 'a';
if (foldhi > 'z')
foldhi = 'z';
if (foldlo <= foldhi) {
foldlo += 'A' - 'a';
foldhi += 'A' - 'a';
Recolor(foldlo, foldhi);
}
}
// first == end means no conflict was found; the hint then stays as is.
if (first != end) {
uint16_t hint = static_cast<uint16_t>(std::min(first - id, 32767));
// The low bit of hint_foldcase_ holds foldcase, so the hint is shifted
// past it.
ip->hint_foldcase_ |= hint<<1;
}
}
}
} // namespace re2

432
extern/re2/re2/prog.h vendored Normal file
View File

@ -0,0 +1,432 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_PROG_H_
#define RE2_PROG_H_
// Compiled representation of regular expressions.
// See regexp.h for the Regexp class, which represents a regular
// expression symbolically.
#include <stdint.h>
#include <functional>
#include <mutex>
#include <string>
#include <vector>
#include <type_traits>
#include "util/util.h"
#include "util/logging.h"
#include "util/pod_array.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
#include "re2/re2.h"
namespace re2 {
// Opcodes for Inst.
// Inst::opcode() extracts the opcode from 3 bits, so the eight opcodes
// below use every available value. kNumInst is the number of opcodes,
// not an opcode itself.
enum InstOp {
kInstAlt = 0, // choose between out_ and out1_
kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
kInstCapture, // capturing parenthesis number cap_
kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
kInstMatch, // found a match!
kInstNop, // no-op; occasionally unavoidable
kInstFail, // never match; occasionally unavoidable
kNumInst,
};
// Bit flags for empty-width specials.
// Each flag is a distinct bit so that several can be ORed together;
// kEmptyAllFlags is the mask of all six.
enum EmptyOp {
kEmptyBeginLine = 1<<0, // ^ - beginning of line
kEmptyEndLine = 1<<1, // $ - end of line
kEmptyBeginText = 1<<2, // \A - beginning of text
kEmptyEndText = 1<<3, // \z - end of text
kEmptyWordBoundary = 1<<4, // \b - word boundary
kEmptyNonWordBoundary = 1<<5, // \B - not \b
kEmptyAllFlags = (1<<6)-1,
};
class DFA;
class Regexp;
// Compiled form of regexp program.
class Prog {
public:
Prog();
~Prog();
// Single instruction in regexp program.
// The opcode, the "last" flag and the primary successor share the single
// word out_opcode_; opcode-specific arguments live in the anonymous union.
class Inst {
public:
// See the assertion below for why this is so.
Inst() = default;
// Copyable.
Inst(const Inst&) = default;
Inst& operator=(const Inst&) = default;
// Constructors per opcode
void InitAlt(uint32_t out, uint32_t out1);
void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
void InitCapture(int cap, uint32_t out);
void InitEmptyWidth(EmptyOp empty, uint32_t out);
void InitMatch(int id);
void InitNop(uint32_t out);
void InitFail();
// Getters
// Accessors for opcode-specific fields DCHECK that the opcode matches.
int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
int last() { return (out_opcode_>>3)&1; }
int out() { return out_opcode_>>4; }
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_&1; }
int hint() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_>>1; }
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
// For an AltMatch: true if out() is a ByteRange, or a Nop whose own out
// is a ByteRange.
bool greedy(Prog* p) {
DCHECK_EQ(opcode(), kInstAltMatch);
return p->inst(out())->opcode() == kInstByteRange ||
(p->inst(out())->opcode() == kInstNop &&
p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
}
// Does this inst (a kInstByteRange) match c?
// With foldcase set, an upper-case ASCII c is lowered before the range
// check.
inline bool Matches(int c) {
DCHECK_EQ(opcode(), kInstByteRange);
if (foldcase() && 'A' <= c && c <= 'Z')
c += 'a' - 'A';
return lo_ <= c && c <= hi_;
}
// Returns string representation for debugging.
std::string Dump();
// Maximum instruction id.
// (Must fit in out_opcode_. PatchList/last steal another bit.)
static const int kMaxInst = (1<<28) - 1;
private:
// Each setter rewrites out_opcode_ as a whole, preserving the fields it
// is not changing.
void set_opcode(InstOp opcode) {
out_opcode_ = (out()<<4) | (last()<<3) | opcode;
}
void set_last() {
out_opcode_ = (out()<<4) | (1<<3) | opcode();
}
void set_out(int out) {
out_opcode_ = (out<<4) | (last()<<3) | opcode();
}
void set_out_opcode(int out, InstOp opcode) {
out_opcode_ = (out<<4) | (last()<<3) | opcode;
}
uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
union { // additional instruction arguments:
uint32_t out1_; // opcode == kInstAlt or kInstAltMatch
// alternate next instruction
int32_t cap_; // opcode == kInstCapture
// Index of capture register (holds text
// position recorded by capturing parentheses).
// For \n (the submatch for the nth parentheses),
// the left parenthesis captures into register 2*n
// and the right one captures into register 2*n+1.
int32_t match_id_; // opcode == kInstMatch
// Match ID to identify this match (for re2::Set).
struct { // opcode == kInstByteRange
uint8_t lo_; // byte range is lo_-hi_ inclusive
uint8_t hi_; //
uint16_t hint_foldcase_; // 15 bits: hint, 1 (low) bit: foldcase
// hint to execution engines: the delta to the
// next instruction (in the current list) worth
// exploring iff this instruction matched; 0
// means there are no remaining possibilities,
// which is most likely for character classes.
// foldcase: A-Z -> a-z before checking range.
};
EmptyOp empty_; // opcode == kInstEmptyWidth
// empty_ is bitwise OR of kEmpty* flags above.
};
friend class Compiler;
friend struct PatchList;
friend class Prog;
};
// Inst must be trivial so that we can freely clear it with memset(3).
// Arrays of Inst are initialised by copying the initial elements with
// memmove(3) and then clearing any remaining elements with memset(3).
static_assert(std::is_trivial<Inst>::value, "Inst must be trivial");
// Whether to anchor the search.
enum Anchor {
kUnanchored, // match anywhere
kAnchored, // match only starting at beginning of text
};
// Kind of match to look for (for anchor != kFullMatch)
//
// kLongestMatch mode finds the overall longest
// match but still makes its submatch choices the way
// Perl would, not in the way prescribed by POSIX.
// The POSIX rules are much more expensive to implement,
// and no one has needed them.
//
// kFullMatch is not strictly necessary -- we could use
// kLongestMatch and then check the length of the match -- but
// the matching code can run faster if it knows to consider only
// full matches.
enum MatchKind {
kFirstMatch, // like Perl, PCRE
kLongestMatch, // like egrep or POSIX
kFullMatch, // match only entire text; implies anchor==kAnchored
kManyMatch // for SearchDFA, records set of matches
};
// Trivial accessors and mutators for the program's state.

// Returns a pointer to the instruction with the given id (an index into
// the instruction array). This accessor itself does no range check.
Inst *inst(int id) { return &inst_[id]; }
// Entry point for the program.
int start() { return start_; }
// Unanchored entry point for the program.
int start_unanchored() { return start_unanchored_; }
void set_start(int start) { start_ = start; }
void set_start_unanchored(int start) { start_unanchored_ = start; }
// Number of instructions.
int size() { return size_; }
// Whether the program runs backward over the input.
bool reversed() { return reversed_; }
void set_reversed(bool reversed) { reversed_ = reversed; }
// Count of lists.
int list_count() { return list_count_; }
// Count of instructions with the given opcode.
int inst_count(InstOp op) { return inst_count_[op]; }
// Raw pointer to the list-heads array; see CanBitState() for the case
// where this is NULL.
uint16_t* list_heads() { return list_heads_.data(); }
// Maximum memory for DFAs.
void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
int64_t dfa_mem() { return dfa_mem_; }
// Regexp parse flags.
int flags() { return flags_; }
void set_flags(int flags) { flags_ = flags; }
// Whether the regexp has an explicit start anchor.
bool anchor_start() { return anchor_start_; }
void set_anchor_start(bool b) { anchor_start_ = b; }
// Whether the regexp has an explicit end anchor.
bool anchor_end() { return anchor_end_; }
void set_anchor_end(bool b) { anchor_end_ = b; }
// Every entry of bytemap() is less than bytemap_range().
int bytemap_range() { return bytemap_range_; }
// Map from input bytes to byte classes (256 entries).
const uint8_t* bytemap() { return bytemap_; }
// Lazily computed.
int first_byte();
// Returns string representation of program for debugging.
std::string Dump();
std::string DumpUnanchored();
std::string DumpByteMap();
// Returns the set of kEmpty flags that are in effect at
// position p within context.
static uint32_t EmptyFlags(const StringPiece& context, const char* p);
// Returns whether byte c is a word character: ASCII only.
// Used by the implementation of \b and \B.
// This is not right for Unicode, but:
// - it's hard to get right in a byte-at-a-time matching world
// (the DFA has only one-byte lookahead).
// - even if the lookahead were possible, the Progs would be huge.
// This crude approximation is the same one PCRE uses.
static bool IsWordChar(uint8_t c) {
  // The underscore is the only non-alphanumeric word character.
  if (c == '_')
    return true;
  // Everything else must be an ASCII digit or an ASCII letter.
  const bool is_digit = !(c < '0' || '9' < c);
  const bool is_upper = !(c < 'A' || 'Z' < c);
  const bool is_lower = !(c < 'a' || 'z' < c);
  return is_upper || is_lower || is_digit;
}
// Execution engines. They all search for the regexp (run the prog)
// in text, which is in the larger context (used for ^ $ \b etc).
// Anchor and kind control the kind of search.
// Returns true if match found, false if not.
// If match found, fills match[0..nmatch-1] with submatch info.
// match[0] is overall match, match[1] is first set of parens, etc.
// If a particular submatch is not matched during the regexp match,
// it is set to NULL.
//
// Matching text == StringPiece(NULL, 0) is treated as any other empty
// string, but note that on return, it will not be possible to distinguish
// submatches that matched that empty string from submatches that didn't
// match anything. Either way, match[i] == NULL.
// Search using NFA: can find submatches but kind of slow.
bool SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Search using DFA: much faster than NFA but only finds
// end of match and can use a lot more memory.
// Returns whether a match was found.
// If the DFA runs out of memory, sets *failed to true and returns false.
// If matches != NULL and kind == kManyMatch and there is a match,
// SearchDFA fills matches with the match IDs of the final matching state.
bool SearchDFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind, StringPiece* match0,
bool* failed, SparseSet* matches);
// The callback issued after building each DFA state with BuildEntireDFA().
// If next is null, then the memory budget has been exhausted and building
// will halt. Otherwise, the state has been built and next points to an array
// of bytemap_range()+1 slots holding the next states as per the bytemap and
// kByteEndText. The number of the state is implied by the callback sequence:
// the first callback is for state 0, the second callback is for state 1, ...
// match indicates whether the state is a matching state.
using DFAStateCallback = std::function<void(const int* next, bool match)>;
// Build the entire DFA for the given match kind.
// Usually the DFA is built out incrementally, as needed, which
// avoids lots of unnecessary work.
// If cb is not empty, it receives one callback per state built.
// Returns the number of states built.
// FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
// Controls whether the DFA should bail out early if the NFA would be faster.
// FOR TESTING ONLY.
static void TEST_dfa_should_bail_when_slow(bool b);
// Compute bytemap.
void ComputeByteMap();
// Computes whether all matches must begin with the same first
// byte, and if so, returns that byte. If not, returns -1.
int ComputeFirstByte();
// Run peep-hole optimizer on program.
void Optimize();
// One-pass NFA: only correct if IsOnePass() is true,
// but much faster than NFA (competitive with PCRE)
// for those expressions.
bool IsOnePass();
bool SearchOnePass(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Bit-state backtracking. Fast on small cases but uses memory
// proportional to the product of the list count and the text size.
// Returns true when the list-heads array has storage, i.e. list_heads_
// holds a non-NULL data pointer. The array is not populated if size_ is
// overly large, in which case this returns false.
bool CanBitState() { return list_heads_.data() != NULL; }
bool SearchBitState(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
static const int kMaxOnePassCapture = 5; // $0 through $4
// Backtracking search: the gold standard against which the other
// implementations are checked. FOR TESTING ONLY.
// It allocates a ton of memory to avoid running forever.
// It is also recursive, so can't use in production (will overflow stacks).
// The name "Unsafe" here is supposed to be a flag that
// you should not be using this function.
bool UnsafeSearchBacktrack(const StringPiece& text,
const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
// maximum desired length of string returned.
//
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
// string s that is an anchored match for this regexp satisfies
// min <= s && s <= max.
//
// Note that PossibleMatchRange() will only consider the first copy of an
// infinitely repeated element (i.e., any regexp element followed by a '*' or
// '+' operator). Regexps with "{N}" constructions are not affected, as those
// do not compile down to infinite repetitions.
//
// Returns true on success, false on error.
bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
// EXPERIMENTAL! SUBJECT TO CHANGE!
// Outputs the program fanout into the given sparse array.
void Fanout(SparseArray<int>* fanout);
// Compiles a collection of regexps to Prog. Each regexp will have
// its own Match instruction recording the index in the output vector.
static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
// Flattens the Prog from "tree" form to "list" form. This is an in-place
// operation in the sense that the old instructions are lost.
void Flatten();
// Walks the Prog; the "successor roots" or predecessors of the reachable
// instructions are marked in rootmap or predmap/predvec, respectively.
// reachable and stk are preallocated scratch structures.
void MarkSuccessors(SparseArray<int>* rootmap,
SparseArray<int>* predmap,
std::vector<std::vector<int>>* predvec,
SparseSet* reachable, std::vector<int>* stk);
// Walks the Prog from the given "root" instruction; the "dominator root"
// of the reachable instructions (if such exists) is marked in rootmap.
// reachable and stk are preallocated scratch structures.
void MarkDominator(int root, SparseArray<int>* rootmap,
SparseArray<int>* predmap,
std::vector<std::vector<int>>* predvec,
SparseSet* reachable, std::vector<int>* stk);
// Walks the Prog from the given "root" instruction; the reachable
// instructions are emitted in "list" form and appended to flat.
// reachable and stk are preallocated scratch structures.
void EmitList(int root, SparseArray<int>* rootmap,
std::vector<Inst>* flat,
SparseSet* reachable, std::vector<int>* stk);
// Computes hints for ByteRange instructions in [begin, end).
void ComputeHints(std::vector<Inst>* flat, int begin, int end);
private:
friend class Compiler;
DFA* GetDFA(MatchKind kind);
void DeleteDFA(DFA* dfa);
bool anchor_start_; // regexp has explicit start anchor
bool anchor_end_; // regexp has explicit end anchor
bool reversed_; // whether program runs backward over input
bool did_flatten_; // has Flatten been called?
bool did_onepass_; // has IsOnePass been called?
int start_; // entry point for program
int start_unanchored_; // unanchored entry point for program
int size_; // number of instructions
int bytemap_range_; // bytemap_[x] < bytemap_range_
int first_byte_; // required first byte for match, or -1 if none
int flags_; // regexp parse flags
int list_count_; // count of lists (see above)
int inst_count_[kNumInst]; // count of instructions by opcode
PODArray<uint16_t> list_heads_; // sparse array enumerating list heads
// not populated if size_ is overly large
PODArray<Inst> inst_; // pointer to instruction array
PODArray<uint8_t> onepass_nodes_; // data for OnePass nodes
int64_t dfa_mem_; // Maximum memory for DFAs.
DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch
DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch
uint8_t bytemap_[256]; // map from input bytes to byte classes
std::once_flag first_byte_once_;
std::once_flag dfa_first_once_;
std::once_flag dfa_longest_once_;
Prog(const Prog&) = delete;
Prog& operator=(const Prog&) = delete;
};
} // namespace re2
#endif // RE2_PROG_H_

1236
extern/re2/re2/re2.cc vendored Normal file

File diff suppressed because it is too large Load Diff

959
extern/re2/re2/re2.h vendored Normal file
View File

@ -0,0 +1,959 @@
// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_RE2_H_
#define RE2_RE2_H_
// C++ interface to the re2 regular-expression library.
// RE2 supports Perl-style regular expressions (with extensions like
// \d, \w, \s, ...).
//
// -----------------------------------------------------------------------
// REGEXP SYNTAX:
//
// This module uses the re2 library and hence supports
// its syntax for regular expressions, which is similar to Perl's with
// some of the more complicated things thrown away. In particular,
// backreferences and generalized assertions are not available, nor is \Z.
//
// See https://github.com/google/re2/wiki/Syntax for the syntax
// supported by RE2, and a comparison with PCRE and PERL regexps.
//
// For those not familiar with Perl's regular expressions,
// here are some examples of the most commonly used extensions:
//
// "hello (\\w+) world" -- \w matches a "word" character
// "version (\\d+)" -- \d matches a digit
// "hello\\s+world" -- \s matches any whitespace character
// "\\b(\\w+)\\b" -- \b matches the empty string at a word boundary
// "(?i)hello" -- (?i) turns on case-insensitive matching
// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
//
// -----------------------------------------------------------------------
// MATCHING INTERFACE:
//
// The "FullMatch" operation checks that supplied text matches a
// supplied pattern exactly.
//
// Example: successful match
// CHECK(RE2::FullMatch("hello", "h.*o"));
//
// Example: unsuccessful match (requires full match):
// CHECK(!RE2::FullMatch("hello", "e"));
//
// -----------------------------------------------------------------------
// UTF-8 AND THE MATCHING INTERFACE:
//
// By default, the pattern and input text are interpreted as UTF-8.
// The RE2::Latin1 option causes them to be interpreted as Latin-1.
//
// Example:
// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
//
// -----------------------------------------------------------------------
// MATCHING WITH SUBSTRING EXTRACTION:
//
// You can supply extra pointer arguments to extract matched substrings.
// On match failure, none of the pointees will have been modified.
// On match success, the substrings will be converted (as necessary) and
// their values will be assigned to their pointees until all conversions
// have succeeded or one conversion has failed.
// On conversion failure, the pointees will be in an indeterminate state
// because the caller has no way of knowing which conversion failed.
// However, conversion cannot fail for types like string and StringPiece
// that do not inspect the substring contents. Hence, in the common case
// where all of the pointees are of such types, failure is always due to
// match failure and thus none of the pointees will have been modified.
//
// Example: extracts "ruby" into "s" and 1234 into "i"
// int i;
// std::string s;
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
//
// Example: fails because string cannot be stored in integer
// CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
//
// Example: fails because there aren't enough sub-patterns
// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
//
// Example: does not try to extract any extra sub-patterns
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
//
// Example: does not try to extract into NULL
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
//
// Example: integer overflow causes failure
// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
//
// NOTE(rsc): Asking for substrings slows successful matches quite a bit.
// This may get a little faster in the future, but right now is slower
// than PCRE. On the other hand, failed matches run *very* fast (faster
// than PCRE), as do matches without substring extraction.
//
// -----------------------------------------------------------------------
// PARTIAL MATCHES
//
// You can use the "PartialMatch" operation when you want the pattern
// to match any substring of the text.
//
// Example: simple search for a string:
// CHECK(RE2::PartialMatch("hello", "ell"));
//
// Example: find first number in a string
// int number;
// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
// CHECK_EQ(number, 100);
//
// -----------------------------------------------------------------------
// PRE-COMPILED REGULAR EXPRESSIONS
//
// RE2 makes it easy to use any string as a regular expression, without
// requiring a separate compilation step.
//
// If speed is of the essence, you can create a pre-compiled "RE2"
// object from the pattern and use it multiple times. If you do so,
// you can typically parse text faster than with sscanf.
//
// Example: precompile pattern for faster matching:
// RE2 pattern("h.*o");
// while (ReadLine(&str)) {
// if (RE2::FullMatch(str, pattern)) ...;
// }
//
// -----------------------------------------------------------------------
// SCANNING TEXT INCREMENTALLY
//
// The "Consume" operation may be useful if you want to repeatedly
// match regular expressions at the front of a string and skip over
// them as they match. This requires use of the "StringPiece" type,
// which represents a sub-range of a real string.
//
// Example: read lines of the form "var = value" from a string.
// std::string contents = ...; // Fill string somehow
// StringPiece input(contents); // Wrap a StringPiece around it
//
// std::string var;
// int value;
// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
// ...;
// }
//
// Each successful call to "Consume" will set "var/value", and also
// advance "input" so it points past the matched text. Note that if the
// regular expression matches an empty string, input will advance
// by 0 bytes. If the regular expression being used might match
// an empty string, the loop body must check for this case and either
// advance the string or break out of the loop.
//
// The "FindAndConsume" operation is similar to "Consume" but does not
// anchor your match at the beginning of the string. For example, you
// could extract all words from a string by repeatedly calling
// RE2::FindAndConsume(&input, "(\\w+)", &word)
//
// -----------------------------------------------------------------------
// USING VARIABLE NUMBER OF ARGUMENTS
//
// The above operations require you to know the number of arguments
// when you write the code. This is not always possible or easy (for
// example, the regular expression may be calculated at run time).
// You can use the "N" version of the operations when the number of
// match arguments is determined at run time.
//
// Example:
// const RE2::Arg* args[10];
// int n;
// // ... populate args with pointers to RE2::Arg values ...
// // ... set n to the number of RE2::Arg objects ...
// bool match = RE2::FullMatchN(input, pattern, args, n);
//
// The last statement is equivalent to
//
// bool match = RE2::FullMatch(input, pattern,
// *args[0], *args[1], ..., *args[n - 1]);
//
// -----------------------------------------------------------------------
// PARSING HEX/OCTAL/C-RADIX NUMBERS
//
// By default, if you pass a pointer to a numeric value, the
// corresponding text is interpreted as a base-10 number. You can
// instead wrap the pointer with a call to one of the operators Hex(),
// Octal(), or CRadix() to interpret the text in another base. The
// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
// prefixes, but defaults to base-10.
//
// Example:
// int a, b, c, d;
// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)));
// will leave 64 in a, b, c, and d.
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <map>
#include <mutex>
#include <string>
#include "re2/stringpiece.h"
namespace re2 {
class Prog;
class Regexp;
} // namespace re2
namespace re2 {
// Interface for regular expression matching. Also corresponds to a
// pre-compiled regular expression. An "RE2" object is safe for
// concurrent use by multiple threads.
class RE2 {
public:
// We convert user-passed pointers into special Arg objects
class Arg;
class Options;
// Defined in set.h.
class Set;
enum ErrorCode {
NoError = 0,
// Unexpected error
ErrorInternal,
// Parse errors
ErrorBadEscape, // bad escape sequence
ErrorBadCharClass, // bad character class
ErrorBadCharRange, // bad character class range
ErrorMissingBracket, // missing closing ]
ErrorMissingParen, // missing closing )
ErrorTrailingBackslash, // trailing \ at end of regexp
ErrorRepeatArgument, // repeat argument missing, e.g. "*"
ErrorRepeatSize, // bad repetition argument
ErrorRepeatOp, // bad repetition operator
ErrorBadPerlOp, // bad perl operator
ErrorBadUTF8, // invalid UTF-8 in regexp
ErrorBadNamedCapture, // bad named capture group
ErrorPatternTooLarge // pattern too large (compile failed)
};
// Predefined common options.
// If you need more complicated things, instantiate
// an Options class, possibly passing one of these to
// the Options constructor, change the settings, and pass that
// Options class to the RE2 constructor.
enum CannedOptions {
DefaultOptions = 0,
Latin1, // treat input as Latin-1 (default UTF-8)
POSIX, // POSIX syntax, leftmost-longest match
Quiet // do not log about regexp parse errors
};
// Need to have the const char* and const std::string& forms for implicit
// conversions when passing string literals to FullMatch and PartialMatch.
// Otherwise the StringPiece form would be sufficient.
#ifndef SWIG
RE2(const char* pattern);
RE2(const std::string& pattern);
#endif
RE2(const StringPiece& pattern);
RE2(const StringPiece& pattern, const Options& options);
~RE2();
// Returns whether RE2 was created properly.
bool ok() const { return error_code() == NoError; }
// The string specification for this RE2. E.g.
// RE2 re("ab*c?d+");
// re.pattern(); // "ab*c?d+"
const std::string& pattern() const { return pattern_; }
// If RE2 could not be created properly, returns an error string.
// Else returns the empty string.
const std::string& error() const { return *error_; }
// If RE2 could not be created properly, returns an error code.
// Else returns RE2::NoError (== 0).
ErrorCode error_code() const { return error_code_; }
// If RE2 could not be created properly, returns the offending
// portion of the regexp.
const std::string& error_arg() const { return error_arg_; }
// Returns the program size, a very approximate measure of a regexp's "cost".
// Larger numbers are more expensive than smaller numbers.
int ProgramSize() const;
int ReverseProgramSize() const;
// EXPERIMENTAL! SUBJECT TO CHANGE!
// Outputs the program fanout as a histogram bucketed by powers of 2.
// Returns the number of the largest non-empty bucket.
int ProgramFanout(std::map<int, int>* histogram) const;
int ReverseProgramFanout(std::map<int, int>* histogram) const;
// Returns the underlying Regexp; not for general use.
// Returns entire_regexp_ so that callers don't need
// to know about prefix_ and prefix_foldcase_.
re2::Regexp* Regexp() const { return entire_regexp_; }
/***** The array-based matching interface ******/
// The functions here have names ending in 'N' and are used to implement
// the functions whose names are the prefix before the 'N'. It is sometimes
// useful to invoke them directly, but the syntax is awkward, so the 'N'-less
// versions should be preferred.
static bool FullMatchN(const StringPiece& text, const RE2& re,
const Arg* const args[], int n);
static bool PartialMatchN(const StringPiece& text, const RE2& re,
const Arg* const args[], int n);
static bool ConsumeN(StringPiece* input, const RE2& re,
const Arg* const args[], int n);
static bool FindAndConsumeN(StringPiece* input, const RE2& re,
const Arg* const args[], int n);
#ifndef SWIG
private:
template <typename F, typename SP>
static inline bool Apply(F f, SP sp, const RE2& re) {
return f(sp, re, NULL, 0);
}
template <typename F, typename SP, typename... A>
static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) {
const Arg* const args[] = {&a...};
const int n = sizeof...(a);
return f(sp, re, args, n);
}
public:
// In order to allow FullMatch() et al. to be called with a varying number
// of arguments of varying types, we use two layers of variadic templates.
// The first layer constructs the temporary Arg objects. The second layer
// (above) constructs the array of pointers to the temporary Arg objects.
/***** The useful part: the matching interface *****/
// Matches "text" against "re". If pointer arguments are
// supplied, copies matched sub-patterns into them.
//
// You can pass in a "const char*" or a "std::string" for "text".
// You can pass in a "const char*" or a "std::string" or a "RE2" for "re".
//
// The provided pointer arguments can be pointers to any scalar numeric
// type, or one of:
// std::string (matched piece is copied to string)
// StringPiece (StringPiece is mutated to point to matched piece)
// T (where "bool T::ParseFrom(const char*, size_t)" exists)
// (void*)NULL (the corresponding matched sub-pattern is not copied)
//
// Returns true iff all of the following conditions are satisfied:
// a. "text" matches "re" exactly
// b. The number of matched sub-patterns is >= number of supplied pointers
// c. The "i"th argument has a suitable type for holding the
// string captured as the "i"th sub-pattern. If you pass in
// NULL for the "i"th argument, or pass fewer arguments than
// number of sub-patterns, "i"th captured sub-pattern is
// ignored.
//
// CAVEAT: An optional sub-pattern that does not exist in the
// matched string is assigned the empty string. Therefore, the
// following will return false (because the empty string is not a
// valid number):
// int number;
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
template <typename... A>
static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) {
return Apply(FullMatchN, text, re, Arg(std::forward<A>(a))...);
}
// Exactly like FullMatch(), except that "re" is allowed to match
// a substring of "text".
template <typename... A>
static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) {
return Apply(PartialMatchN, text, re, Arg(std::forward<A>(a))...);
}
// Like FullMatch() and PartialMatch(), except that "re" has to match
// a prefix of the text, and "input" is advanced past the matched
// text. Note: "input" is modified iff this routine returns true
// and "re" matched a non-empty substring of "text".
template <typename... A>
static bool Consume(StringPiece* input, const RE2& re, A&&... a) {
return Apply(ConsumeN, input, re, Arg(std::forward<A>(a))...);
}
// Like Consume(), but does not anchor the match at the beginning of
// the text. That is, "re" need not start its match at the beginning
// of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds
// the next word in "s" and stores it in "word".
template <typename... A>
static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) {
return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...);
}
#endif
// Replace the first match of "re" in "str" with "rewrite".
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
// used to insert text matching corresponding parenthesized group
// from the pattern. \0 in "rewrite" refers to the entire matching
// text. E.g.,
//
// std::string s = "yabba dabba doo";
// CHECK(RE2::Replace(&s, "b+", "d"));
//
// will leave "s" containing "yada dabba doo"
//
// Returns true if the pattern matches and a replacement occurs,
// false otherwise.
static bool Replace(std::string* str,
const RE2& re,
const StringPiece& rewrite);
// Like Replace(), except replaces successive non-overlapping occurrences
// of the pattern in the string with the rewrite. E.g.
//
// std::string s = "yabba dabba doo";
// CHECK(RE2::GlobalReplace(&s, "b+", "d"));
//
// will leave "s" containing "yada dada doo"
// Replacements are not subject to re-matching.
//
// Because GlobalReplace only replaces non-overlapping matches,
// replacing "ana" within "banana" makes only one replacement, not two.
//
// Returns the number of replacements made.
static int GlobalReplace(std::string* str,
const RE2& re,
const StringPiece& rewrite);
// Like Replace, except that if the pattern matches, "rewrite"
// is copied into "out" with substitutions. The non-matching
// portions of "text" are ignored.
//
// Returns true iff a match occurred and the extraction happened
// successfully; if no match occurs, the string is left unaffected.
//
// REQUIRES: "text" must not alias any part of "*out".
static bool Extract(const StringPiece& text,
const RE2& re,
const StringPiece& rewrite,
std::string* out);
// Escapes all potentially meaningful regexp characters in
// 'unquoted'. The returned string, used as a regular expression,
// will exactly match the original string. For example,
// 1.5-2.0?
// may become:
// 1\.5\-2\.0\?
static std::string QuoteMeta(const StringPiece& unquoted);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
// maximum desired length of string returned.
//
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
// string s that is an anchored match for this regexp satisfies
// min <= s && s <= max.
//
// Note that PossibleMatchRange() will only consider the first copy of an
// infinitely repeated element (i.e., any regexp element followed by a '*' or
// '+' operator). Regexps with "{N}" constructions are not affected, as those
// do not compile down to infinite repetitions.
//
// Returns true on success, false on error.
bool PossibleMatchRange(std::string* min, std::string* max,
int maxlen) const;
// Generic matching interface
// Type of match.
enum Anchor {
UNANCHORED, // No anchoring
ANCHOR_START, // Anchor at start only
ANCHOR_BOTH // Anchor at start and end
};
// Return the number of capturing subpatterns, or -1 if the
// regexp wasn't valid on construction. The overall match ($0)
// does not count: if the regexp is "(a)(b)", returns 2.
int NumberOfCapturingGroups() const { return num_captures_; }
// Return a map from names to capturing indices.
// The map records the index of the leftmost group
// with the given name.
// Only valid until the re is deleted.
const std::map<std::string, int>& NamedCapturingGroups() const;
// Return a map from capturing indices to names.
// The map has no entries for unnamed groups.
// Only valid until the re is deleted.
const std::map<int, std::string>& CapturingGroupNames() const;
// General matching routine.
// Match against text starting at offset startpos
// and stopping the search at offset endpos.
// Returns true if match found, false if not.
// On a successful match, fills in submatch[] (up to nsubmatch entries)
// with information about submatches.
// I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, with
// submatch[0] = "barbaz", submatch[1].data() = NULL, submatch[2] = "bar",
// submatch[3].data() = NULL, ..., up to submatch[nsubmatch-1].data() = NULL.
// Caveat: submatch[] may be clobbered even on match failure.
//
// Don't ask for more match information than you will use:
// runs much faster with nsubmatch == 1 than nsubmatch > 1, and
// runs even faster if nsubmatch == 0.
// Doesn't make sense to use nsubmatch > 1 + NumberOfCapturingGroups(),
// but will be handled correctly.
//
// Passing text == StringPiece(NULL, 0) will be handled like any other
// empty string, but note that on return, it will not be possible to tell
// whether submatch i matched the empty string or did not match:
// either way, submatch[i].data() == NULL.
bool Match(const StringPiece& text,
size_t startpos,
size_t endpos,
Anchor re_anchor,
StringPiece* submatch,
int nsubmatch) const;
// Check that the given rewrite string is suitable for use with this
// regular expression. It checks that:
// * The regular expression has enough parenthesized subexpressions
// to satisfy all of the \N tokens in rewrite
// * The rewrite string doesn't have any syntax errors. E.g.,
// '\' followed by anything other than a digit or '\'.
// A true return value guarantees that Replace() and Extract() won't
// fail because of a bad rewrite string.
bool CheckRewriteString(const StringPiece& rewrite,
std::string* error) const;
// Returns the maximum submatch needed for the rewrite to be done by
// Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.
static int MaxSubmatch(const StringPiece& rewrite);
// Append the "rewrite" string, with backslash substitutions from "vec",
// to string "out".
// Returns true on success. This method can fail because of a malformed
// rewrite string. CheckRewriteString guarantees that the rewrite will
// be successful.
bool Rewrite(std::string* out,
const StringPiece& rewrite,
const StringPiece* vec,
int veclen) const;
// Constructor options
class Options {
 public:
  // The options are (defaults in parentheses):
  //
  //   utf8             (true)  text and pattern are UTF-8; otherwise Latin-1
  //   posix_syntax     (false) restrict regexps to POSIX egrep syntax
  //   longest_match    (false) search for longest match, not first match
  //   log_errors       (true)  log syntax and execution errors to ERROR
  //   max_mem          (see below)  approx. max memory footprint of RE2
  //   literal          (false) interpret string as literal, not regexp
  //   never_nl         (false) never match \n, even if it is in regexp
  //   dot_nl           (false) dot matches everything including new line
  //   never_capture    (false) parse all parens as non-capturing
  //   case_sensitive   (true)  match is case-sensitive (regexp can override
  //                              with (?i) unless in posix_syntax mode)
  //
  // The following options are only consulted when posix_syntax == true.
  // When posix_syntax == false, these features are always enabled and
  // cannot be turned off; to perform multi-line matching in that case,
  // begin the regexp with (?m).
  //   perl_classes     (false) allow Perl's \d \s \w \D \S \W
  //   word_boundary    (false) allow Perl's \b \B (word boundary and not)
  //   one_line         (false) ^ and $ only match beginning and end of text
  //
  // The max_mem option controls how much memory can be used
  // to hold the compiled form of the regexp (the Prog) and
  // its cached DFA graphs.  Code Search placed limits on the number
  // of Prog instructions and DFA states: 10,000 for both.
  // In RE2, those limits would translate to about 240 KB per Prog
  // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
  // better job of keeping them small than Code Search did).
  // Each RE2 has two Progs (one forward, one reverse), and each Prog
  // can have two DFAs (one first match, one longest match).
  // That makes 4 DFAs:
  //
  //   forward, first-match    - used for UNANCHORED or ANCHOR_START searches
  //                               if opt.longest_match() == false
  //   forward, longest-match  - used for all ANCHOR_BOTH searches,
  //                               and the other two kinds if
  //                               opt.longest_match() == true
  //   reverse, first-match    - never used
  //   reverse, longest-match  - used as second phase for unanchored searches
  //
  // The RE2 memory budget is statically divided between the two
  // Progs and then the DFAs: two thirds to the forward Prog
  // and one third to the reverse Prog.  The forward Prog gives half
  // of what it has left over to each of its DFAs.  The reverse Prog
  // gives it all to its longest-match DFA.
  //
  // Once a DFA fills its budget, it flushes its cache and starts over.
  // If this happens too often, RE2 falls back on the NFA implementation.

  // For now, make the default budget something close to Code Search.
  static const int kDefaultMaxMem = 8<<20;

  enum Encoding {
    EncodingUTF8 = 1,
    EncodingLatin1
  };

  // Builds the default option set listed in the table above.
  Options()
      : encoding_(EncodingUTF8),
        posix_syntax_(false),
        longest_match_(false),
        log_errors_(true),
        max_mem_(kDefaultMaxMem),
        literal_(false),
        never_nl_(false),
        dot_nl_(false),
        never_capture_(false),
        case_sensitive_(true),
        perl_classes_(false),
        word_boundary_(false),
        one_line_(false) {}

  /*implicit*/ Options(CannedOptions);

  // Text/pattern encoding.
  Encoding encoding() const { return encoding_; }
  void set_encoding(Encoding encoding) { encoding_ = encoding; }

  // Legacy interface to encoding.
  // TODO(rsc): Remove once clients have been converted.
  bool utf8() const { return encoding_ == EncodingUTF8; }
  void set_utf8(bool b) { encoding_ = b ? EncodingUTF8 : EncodingLatin1; }

  // Plain getter/setter pairs, one per option in the table above.
  bool posix_syntax() const { return posix_syntax_; }
  void set_posix_syntax(bool b) { posix_syntax_ = b; }

  bool longest_match() const { return longest_match_; }
  void set_longest_match(bool b) { longest_match_ = b; }

  bool log_errors() const { return log_errors_; }
  void set_log_errors(bool b) { log_errors_ = b; }

  int64_t max_mem() const { return max_mem_; }
  void set_max_mem(int64_t m) { max_mem_ = m; }

  bool literal() const { return literal_; }
  void set_literal(bool b) { literal_ = b; }

  bool never_nl() const { return never_nl_; }
  void set_never_nl(bool b) { never_nl_ = b; }

  bool dot_nl() const { return dot_nl_; }
  void set_dot_nl(bool b) { dot_nl_ = b; }

  bool never_capture() const { return never_capture_; }
  void set_never_capture(bool b) { never_capture_ = b; }

  bool case_sensitive() const { return case_sensitive_; }
  void set_case_sensitive(bool b) { case_sensitive_ = b; }

  bool perl_classes() const { return perl_classes_; }
  void set_perl_classes(bool b) { perl_classes_ = b; }

  bool word_boundary() const { return word_boundary_; }
  void set_word_boundary(bool b) { word_boundary_ = b; }

  bool one_line() const { return one_line_; }
  void set_one_line(bool b) { one_line_ = b; }

  // Overwrites every option in *this with the values held by src.
  void Copy(const Options& src) {
    *this = src;
  }

  int ParseFlags() const;

 private:
  Encoding encoding_;
  bool posix_syntax_;
  bool longest_match_;
  bool log_errors_;
  int64_t max_mem_;
  bool literal_;
  bool never_nl_;
  bool dot_nl_;
  bool never_capture_;
  bool case_sensitive_;
  bool perl_classes_;
  bool word_boundary_;
  bool one_line_;
};
// Returns the options set in the constructor.
const Options& options() const { return options_; }
// Argument converters; see below.
static inline Arg CRadix(short* x);
static inline Arg CRadix(unsigned short* x);
static inline Arg CRadix(int* x);
static inline Arg CRadix(unsigned int* x);
static inline Arg CRadix(long* x);
static inline Arg CRadix(unsigned long* x);
static inline Arg CRadix(long long* x);
static inline Arg CRadix(unsigned long long* x);
static inline Arg Hex(short* x);
static inline Arg Hex(unsigned short* x);
static inline Arg Hex(int* x);
static inline Arg Hex(unsigned int* x);
static inline Arg Hex(long* x);
static inline Arg Hex(unsigned long* x);
static inline Arg Hex(long long* x);
static inline Arg Hex(unsigned long long* x);
static inline Arg Octal(short* x);
static inline Arg Octal(unsigned short* x);
static inline Arg Octal(int* x);
static inline Arg Octal(unsigned int* x);
static inline Arg Octal(long* x);
static inline Arg Octal(unsigned long* x);
static inline Arg Octal(long long* x);
static inline Arg Octal(unsigned long long* x);
private:
void Init(const StringPiece& pattern, const Options& options);
bool DoMatch(const StringPiece& text,
Anchor re_anchor,
size_t* consumed,
const Arg* const args[],
int n) const;
re2::Prog* ReverseProg() const;
std::string pattern_; // string regular expression
Options options_; // option flags
std::string prefix_; // required prefix (before regexp_)
bool prefix_foldcase_; // prefix is ASCII case-insensitive
re2::Regexp* entire_regexp_; // parsed regular expression
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
re2::Prog* prog_; // compiled program for regexp
int num_captures_; // Number of capturing groups
bool is_one_pass_; // can use prog_->SearchOnePass?
mutable re2::Prog* rprog_; // reverse program for regexp
mutable const std::string* error_; // Error indicator
// (or points to empty string)
mutable ErrorCode error_code_; // Error code
mutable std::string error_arg_; // Fragment of regexp showing error
// Map from capture names to indices
mutable const std::map<std::string, int>* named_groups_;
// Map from capture indices to names
mutable const std::map<int, std::string>* group_names_;
// Onces for lazy computations.
mutable std::once_flag rprog_once_;
mutable std::once_flag named_groups_once_;
mutable std::once_flag group_names_once_;
RE2(const RE2&) = delete;
RE2& operator=(const RE2&) = delete;
};
/***** Implementation details *****/
// Hex/Octal/Binary?
// Adapter for parsing into objects of any type T that defines a
// ParseFrom(const char*, size_t) method.
template <class T>
class _RE2_MatchObject {
 public:
  // Forwards the n bytes at str to dest->ParseFrom() and returns its result.
  // A NULL dest is accepted and reported as success without parsing.
  static inline bool Parse(const char* str, size_t n, void* dest) {
    if (dest != NULL)
      return reinterpret_cast<T*>(dest)->ParseFrom(str, n);
    return true;
  }
};
// An Arg pairs a destination pointer (arg_) with the parser function
// (parser_) that Parse() invokes on it.
class RE2::Arg {
public:
// Empty constructor so we can declare arrays of RE2::Arg
Arg();
// Constructor specially designed for NULL arguments
Arg(void*);
Arg(std::nullptr_t);
// Signature shared by every parser: receives the n bytes at str and the
// destination pointer, and returns a bool result.
typedef bool (*Parser)(const char* str, size_t n, void* dest);
// Type-specific parsers
// MAKE_PARSER(type, name) emits two constructors taking a type*: one that
// selects the built-in parser `name`, and one that accepts a caller-supplied
// Parser.
#define MAKE_PARSER(type, name) \
Arg(type* p) : arg_(p), parser_(name) {} \
Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
MAKE_PARSER(char, parse_char)
MAKE_PARSER(signed char, parse_schar)
MAKE_PARSER(unsigned char, parse_uchar)
MAKE_PARSER(float, parse_float)
MAKE_PARSER(double, parse_double)
MAKE_PARSER(std::string, parse_string)
MAKE_PARSER(StringPiece, parse_stringpiece)
MAKE_PARSER(short, parse_short)
MAKE_PARSER(unsigned short, parse_ushort)
MAKE_PARSER(int, parse_int)
MAKE_PARSER(unsigned int, parse_uint)
MAKE_PARSER(long, parse_long)
MAKE_PARSER(unsigned long, parse_ulong)
MAKE_PARSER(long long, parse_longlong)
MAKE_PARSER(unsigned long long, parse_ulonglong)
#undef MAKE_PARSER
// Generic constructor templates
// For any other T, the one-argument form parses through
// _RE2_MatchObject<T>::Parse.
template <class T> Arg(T* p)
: arg_(p), parser_(_RE2_MatchObject<T>::Parse) { }
template <class T> Arg(T* p, Parser parser)
: arg_(p), parser_(parser) { }
// Parse the data
bool Parse(const char* str, size_t n) const;
private:
// Destination pointer handed to parser_ as `dest`.
void* arg_;
// Function invoked by Parse().
Parser parser_;
static bool parse_null (const char* str, size_t n, void* dest);
static bool parse_char (const char* str, size_t n, void* dest);
static bool parse_schar (const char* str, size_t n, void* dest);
static bool parse_uchar (const char* str, size_t n, void* dest);
static bool parse_float (const char* str, size_t n, void* dest);
static bool parse_double (const char* str, size_t n, void* dest);
static bool parse_string (const char* str, size_t n, void* dest);
static bool parse_stringpiece (const char* str, size_t n, void* dest);
// DECLARE_INTEGER_PARSER(name) declares, for one integer kind, a private
// parse_<name> and parse_<name>_radix plus public _hex, _octal and _cradix
// variants.  Note that the macro leaves the access level at `public`.
#define DECLARE_INTEGER_PARSER(name) \
private: \
static bool parse_##name(const char* str, size_t n, void* dest); \
static bool parse_##name##_radix(const char* str, size_t n, void* dest, \
int radix); \
\
public: \
static bool parse_##name##_hex(const char* str, size_t n, void* dest); \
static bool parse_##name##_octal(const char* str, size_t n, void* dest); \
static bool parse_##name##_cradix(const char* str, size_t n, void* dest);
DECLARE_INTEGER_PARSER(short)
DECLARE_INTEGER_PARSER(ushort)
DECLARE_INTEGER_PARSER(int)
DECLARE_INTEGER_PARSER(uint)
DECLARE_INTEGER_PARSER(long)
DECLARE_INTEGER_PARSER(ulong)
DECLARE_INTEGER_PARSER(longlong)
DECLARE_INTEGER_PARSER(ulonglong)
#undef DECLARE_INTEGER_PARSER
};
// The default, void* and nullptr constructors all select parse_null as the
// parser; only the stored destination pointer differs.
inline RE2::Arg::Arg()
    : arg_(NULL), parser_(parse_null) {}

inline RE2::Arg::Arg(void* p)
    : arg_(p), parser_(parse_null) {}

inline RE2::Arg::Arg(std::nullptr_t p)
    : arg_(p), parser_(parse_null) {}

// Hands the n bytes at str, together with the stored destination pointer,
// to the stored parser and returns its result.
inline bool RE2::Arg::Parse(const char* str, size_t n) const {
  return parser_(str, n, arg_);
}
// This part of the parser, appropriate only for ints, deals with bases
// MAKE_INTEGER_PARSER(type, name) defines RE2::Hex, RE2::Octal and
// RE2::CRadix for one integer type; each returns an Arg bound to ptr and to
// the matching RE2::Arg::parse_<name>_{hex,octal,cradix} function.
#define MAKE_INTEGER_PARSER(type, name) \
inline RE2::Arg RE2::Hex(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex); \
} \
inline RE2::Arg RE2::Octal(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal); \
} \
inline RE2::Arg RE2::CRadix(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \
}
MAKE_INTEGER_PARSER(short, short)
MAKE_INTEGER_PARSER(unsigned short, ushort)
MAKE_INTEGER_PARSER(int, int)
MAKE_INTEGER_PARSER(unsigned int, uint)
MAKE_INTEGER_PARSER(long, long)
MAKE_INTEGER_PARSER(unsigned long, ulong)
MAKE_INTEGER_PARSER(long long, longlong)
MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
#undef MAKE_INTEGER_PARSER
#ifndef SWIG
// Silence warnings about missing initializers for members of LazyRE2.
// Note that we test for Clang first because it defines __GNUC__ as well.
#if defined(__clang__)
#elif defined(__GNUC__) && __GNUC__ >= 6
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
#endif
// Helper for writing global or static RE2s safely.
// Write
// static LazyRE2 re = {".*"};
// and then use *re instead of writing
// static RE2 re(".*");
// The former is more careful about multithreaded
// situations than the latter.
//
// N.B. This class never deletes the RE2 object that
// it constructs: that's a feature, so that it can be used
// for global and function static variables.
class LazyRE2 {
private:
// Empty tag type; its only use is the barrier member declared below.
struct NoArg {};
public:
typedef RE2 element_type; // support std::pointer_traits
// Constructor omitted to preserve braced initialization in C++98.
// Pretend to be a pointer to Type (never NULL due to on-demand creation):
RE2& operator*() const { return *get(); }
RE2* operator->() const { return get(); }
// Named accessor/initializer:
// Runs Init() through std::call_once on once_, then returns ptr_.
RE2* get() const {
std::call_once(once_, &LazyRE2::Init, this);
return ptr_;
}
// All data fields must be public to support {"foo"} initialization.
// Pattern and canned options forwarded to the RE2 constructor by Init().
const char* pattern_;
RE2::CannedOptions options_;
// NoArg is private, so callers cannot name a value for this member or for
// the ones after it in a braced initializer.
NoArg barrier_against_excess_initializers_;
// Set by Init(); mutable because get() is const.
mutable RE2* ptr_;
mutable std::once_flag once_;
private:
// Allocates the RE2 from pattern_ and options_ and stores it in ptr_.
static void Init(const LazyRE2* lazy_re2) {
lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_);
}
void operator=(const LazyRE2&); // disallowed
};
#endif // SWIG
} // namespace re2
using re2::RE2;
using re2::LazyRE2;
#endif // RE2_RE2_H_

971
extern/re2/re2/regexp.cc vendored Normal file
View File

@ -0,0 +1,971 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression representation.
// Tested by parse_test.cc
#include "re2/regexp.h"
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <map>
#include <mutex>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/mutex.h"
#include "util/utf.h"
#include "re2/stringpiece.h"
#include "re2/walker-inl.h"
namespace re2 {
// Constructor.  Produces a node for operator `op` with the given parse
// flags.  The node starts with a reference count of one, no
// sub-expressions, and a zero-filled payload union; nothing is allocated
// here.
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
    : op_(static_cast<uint8_t>(op)),
      simple_(false),
      parse_flags_(static_cast<uint16_t>(parse_flags)),
      ref_(1),
      nsub_(0),
      down_(NULL) {
  subone_ = NULL;
  memset(the_union_, 0, sizeof(the_union_));
}
// Destructor.  Private: callers release a Regexp with Decref(), never with
// delete.  Sub-expressions must already have been released by the time we
// get here; releasing them from the destructor could recurse arbitrarily
// deep, so that work is left to Decref()/Destroy().
Regexp::~Regexp() {
  if (nsub_ > 0)
    LOG(DFATAL) << "Regexp not destroyed.";

  // Free whatever operator-specific payload this node owns.
  if (op_ == kRegexpCapture) {
    delete name_;
  } else if (op_ == kRegexpLiteralString) {
    delete[] runes_;
  } else if (op_ == kRegexpCharClass) {
    if (cc_)
      cc_->Delete();
    delete ccb_;
  }
}
// Deletes this regexp and returns true when it has no sub-expressions,
// i.e. when destruction cannot recurse.  Otherwise leaves it untouched
// and returns false.
bool Regexp::QuickDestroy() {
  if (nsub_ != 0)
    return false;
  delete this;
  return true;
}
// Overflow storage for reference counts that have reached kMaxRef.
// Both objects are lazily allocated (see Incref()).
static Mutex* ref_mutex;
static std::map<Regexp*, int>* ref_map;

// Returns the current reference count, consulting the overflow map once
// the in-object counter has saturated at kMaxRef.
int Regexp::Ref() {
  if (ref_ >= kMaxRef) {
    MutexLock l(ref_mutex);
    return (*ref_map)[this];
  }
  return ref_;
}
// Increments reference count, returns object as convenience.
// While the count is below kMaxRef-1 it lives in ref_; from then on the
// real count is kept in ref_map under ref_mutex.
Regexp* Regexp::Incref() {
  // Common case: the in-object counter still has room.
  if (ref_ < kMaxRef-1) {
    ref_++;
    return this;
  }

  // Create the overflow mutex and map exactly once.
  static std::once_flag ref_once;
  std::call_once(ref_once, []() {
    ref_mutex = new Mutex;
    ref_map = new std::map<Regexp*, int>;
  });

  // Store ref count in overflow map.
  MutexLock l(ref_mutex);
  if (ref_ != kMaxRef) {
    // overflowing now
    (*ref_map)[this] = kMaxRef;
    ref_ = kMaxRef;
  } else {
    // already overflowed
    (*ref_map)[this]++;
  }
  return this;
}
// Decrements reference count and deletes this object if count reaches 0.
void Regexp::Decref() {
  if (ref_ != kMaxRef) {
    // Count is held in the object itself.
    if (--ref_ == 0)
      Destroy();
    return;
  }

  // Ref count is stored in overflow map.
  MutexLock l(ref_mutex);
  int remaining = (*ref_map)[this] - 1;
  if (remaining >= kMaxRef) {
    (*ref_map)[this] = remaining;
  } else {
    // Dropped back below the threshold: move the count into the object
    // and forget the map entry.
    ref_ = static_cast<uint16_t>(remaining);
    ref_map->erase(this);
  }
}
// Deletes this object; its reference count has reached 0.
// Sub-expressions whose count also drops to 0 are deleted too, using an
// explicit stack threaded through down_ rather than recursion.
void Regexp::Destroy() {
if (QuickDestroy())
return;
// Handle recursive Destroy with explicit stack
// to avoid arbitrarily deep recursion on process stack [sigh].
down_ = NULL;
Regexp* stack = this;
while (stack != NULL) {
// Pop the next node awaiting deletion.
Regexp* re = stack;
stack = re->down_;
if (re->ref_ != 0)
LOG(DFATAL) << "Bad reference count " << re->ref_;
if (re->nsub_ > 0) {
Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
Regexp* sub = subs[i];
if (sub == NULL)
continue;
// A saturated counter means the real count lives in the overflow map,
// so go through Decref(); otherwise decrement in place without
// triggering a nested Destroy().
if (sub->ref_ == kMaxRef)
sub->Decref();
else
--sub->ref_;
// Childless subs are deleted right away; the rest are pushed.
if (sub->ref_ == 0 && !sub->QuickDestroy()) {
sub->down_ = stack;
stack = sub;
}
}
// Only a multi-element sub array is freed here.
// NOTE(review): presumably a single sub is stored inline by AllocSub --
// confirm there.
if (re->nsub_ > 1)
delete[] subs;
// Clear nsub_ so the destructor's "Regexp not destroyed." check passes.
re->nsub_ = 0;
}
delete re;
}
}
// Appends rune r to this literal-string node.  Storage starts at 8 runes
// and doubles each time the current length is a power of two >= 8.
void Regexp::AddRuneToString(Rune r) {
  DCHECK(op_ == kRegexpLiteralString);
  const bool at_power_of_two = (nrunes_ & (nrunes_ - 1)) == 0;
  if (nrunes_ == 0) {
    // start with 8
    runes_ = new Rune[8];
  } else if (nrunes_ >= 8 && at_power_of_two) {
    // double on powers of two
    Rune* grown = new Rune[nrunes_ * 2];
    for (int k = 0; k < nrunes_; k++)
      grown[k] = runes_[k];
    delete[] runes_;
    runes_ = grown;
  }
  runes_[nrunes_++] = r;
}
// Returns a new kRegexpHaveMatch node carrying match_id.
Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
  Regexp* node = new Regexp(kRegexpHaveMatch, flags);
  node->match_id_ = match_id;
  return node;
}
// Applies the unary operator op (Star, Plus or Quest) to sub, collapsing
// stacked repetition operators when both layers carry the same flags.
Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
  if (flags == sub->parse_flags()) {
    // Squash **, ++ and ??.
    if (sub->op() == op)
      return sub;

    // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
    // op is Star/Plus/Quest, we just have to check that sub->op() is too.
    switch (sub->op()) {
      case kRegexpStar:
        // If sub is Star, no need to rewrite it.
        return sub;
      case kRegexpPlus:
      case kRegexpQuest: {
        // Rewrite sub to Star.
        Regexp* star = new Regexp(kRegexpStar, flags);
        star->AllocSub(1);
        star->sub()[0] = sub->sub()[0]->Incref();
        sub->Decref();  // We didn't consume the reference after all.
        return star;
      }
      default:
        break;
    }
  }

  // No squashing applies: wrap sub in a fresh node.
  Regexp* wrapped = new Regexp(op, flags);
  wrapped->AllocSub(1);
  wrapped->sub()[0] = sub;
  return wrapped;
}
// Returns sub wrapped in a Plus; forwards to StarPlusOrQuest, which may
// squash stacked repetition operators instead of adding a new node.
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
return StarPlusOrQuest(kRegexpPlus, sub, flags);
}
// Same as Plus(), for the Star operator.
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
return StarPlusOrQuest(kRegexpStar, sub, flags);
}
// Same as Plus(), for the Quest operator.
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
return StarPlusOrQuest(kRegexpQuest, sub, flags);
}
// Builds a Concat or Alternate node (per op) over the nsub regexps in sub.
// A single sub is returned as-is; zero subs yield NoMatch for alternation
// and EmptyMatch for concatenation.  When can_factor is set, alternations
// are first run through FactorAlternation on a private copy of sub.
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
                                  ParseFlags flags, bool can_factor) {
  if (nsub == 1)
    return sub[0];

  if (nsub == 0) {
    RegexpOp empty_op = kRegexpEmptyMatch;
    if (op == kRegexpAlternate)
      empty_op = kRegexpNoMatch;
    return new Regexp(empty_op, flags);
  }

  Regexp** scratch = NULL;
  if (can_factor && op == kRegexpAlternate) {
    // Going to edit sub; make a copy so we don't step on caller.
    scratch = new Regexp*[nsub];
    memmove(scratch, sub, nsub * sizeof sub[0]);
    sub = scratch;
    nsub = FactorAlternation(sub, nsub, flags);
    if (nsub == 1) {
      Regexp* only = sub[0];
      delete[] scratch;
      return only;
    }
  }

  Regexp* result = new Regexp(op, flags);
  if (nsub > kMaxNsub) {
    // Too many subexpressions to fit in a single Regexp.
    // Make a two-level tree.  Two levels gets us to 65535^2.
    const int nchunks = (nsub+kMaxNsub-1)/kMaxNsub;
    result->AllocSub(nchunks);
    Regexp** chunks = result->sub();
    for (int c = 0; c < nchunks; c++) {
      const int start = c*kMaxNsub;
      // Every chunk is full except possibly the last one.
      int len = kMaxNsub;
      if (c == nchunks - 1)
        len = nsub - start;
      chunks[c] = ConcatOrAlternate(op, sub+start, len, flags, false);
    }
  } else {
    result->AllocSub(nsub);
    Regexp** slots = result->sub();
    for (int k = 0; k < nsub; k++)
      slots[k] = sub[k];
  }

  delete[] scratch;
  return result;
}
// Concatenation of the nsub regexps in sub; no factoring is requested.
Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
}
// Alternation of the nsub regexps in sub, with factoring enabled.
Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
}
// Alternation of the nsub regexps in sub, with factoring disabled.
Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
}
// Returns a new capture node with index cap whose only child is sub.
Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
  Regexp* group = new Regexp(kRegexpCapture, flags);
  group->AllocSub(1);
  group->sub()[0] = sub;
  group->cap_ = cap;
  return group;
}
// Returns a new repeat node over sub, recording the min and max counts.
Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
  Regexp* rep = new Regexp(kRegexpRepeat, flags);
  rep->AllocSub(1);
  rep->sub()[0] = sub;
  rep->min_ = min;
  rep->max_ = max;
  return rep;
}
// Returns a new single-rune literal node.
Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
  Regexp* lit = new Regexp(kRegexpLiteral, flags);
  lit->rune_ = rune;
  return lit;
}
// Returns a node matching the nrunes runes in runes: a literal-string node
// for two or more runes, a single literal for one, and EmptyMatch when
// nrunes <= 0.
Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
  if (nrunes > 1) {
    Regexp* str = new Regexp(kRegexpLiteralString, flags);
    for (int k = 0; k < nrunes; k++)
      str->AddRuneToString(runes[k]);
    return str;
  }
  if (nrunes == 1)
    return NewLiteral(runes[0], flags);
  return new Regexp(kRegexpEmptyMatch, flags);
}
// Returns a new character-class node that stores cc in its cc_ field.
Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
  Regexp* node = new Regexp(kRegexpCharClass, flags);
  node->cc_ = cc;
  return node;
}
// Exchanges the raw object representations of *this and *that.
void Regexp::Swap(Regexp* that) {
  // Regexp is not trivially copyable, so we cannot freely copy it with
  // memmove(3), but swapping objects like so is safe for our purposes.
  char scratch[sizeof *this];
  void* const self = reinterpret_cast<void*>(this);
  void* const other = reinterpret_cast<void*>(that);
  memmove(scratch, self, sizeof scratch);
  memmove(self, other, sizeof scratch);
  memmove(other, scratch, sizeof scratch);
}
// Tests equality of all top-level structure but not subregexps.
// Returns true when a and b have the same operator and agree on the
// operator-specific attributes compared below (literal runes, repeat
// bounds, capture index and name, character-class contents, ...).
// Sub-expressions are not compared; Regexp::Equal() handles those.
static bool TopEqual(Regexp* a, Regexp* b) {
  if (a->op() != b->op())
    return false;

  switch (a->op()) {
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpBeginText:
      return true;

    case kRegexpEndText:
      // The parse flags remember whether it's \z or (?-m:$),
      // which matters when testing against PCRE.
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;

    case kRegexpLiteral:
      return a->rune() == b->rune() &&
             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;

    case kRegexpLiteralString:
      return a->nrunes() == b->nrunes() &&
             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
             memcmp(a->runes(), b->runes(),
                    a->nrunes() * sizeof a->runes()[0]) == 0;

    case kRegexpAlternate:
    case kRegexpConcat:
      return a->nsub() == b->nsub();

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;

    case kRegexpRepeat:
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
             a->min() == b->min() &&
             a->max() == b->max();

    case kRegexpCapture: {
      if (a->cap() != b->cap())
        return false;
      // name() yields a pointer to the group's name, or NULL for an unnamed
      // group.  Comparing the pointers alone would treat two groups with the
      // same name text but separately allocated strings as different, so
      // compare the pointees whenever both groups are named.
      const std::string* aname = a->name();
      const std::string* bname = b->name();
      if (aname == NULL || bname == NULL)
        return aname == bname;  // equal only if both are unnamed
      return *aname == *bname;
    }

    case kRegexpHaveMatch:
      return a->match_id() == b->match_id();

    case kRegexpCharClass: {
      CharClass* acc = a->cc();
      CharClass* bcc = b->cc();
      return acc->size() == bcc->size() &&
             acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
             memcmp(acc->begin(), bcc->begin(),
                    (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
    }
  }

  LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
  return false;
}
// Returns whether a and b are equal: TopEqual() must hold for the two roots
// and for every corresponding pair of sub-expressions.  Two NULL pointers
// are equal; a NULL and a non-NULL pointer are not.
// The comparison is iterative, using an explicit stack of pending pairs.
bool Regexp::Equal(Regexp* a, Regexp* b) {
if (a == NULL || b == NULL)
return a == b;
if (!TopEqual(a, b))
return false;
// Fast path:
// return without allocating vector if there are no subregexps.
switch (a->op()) {
case kRegexpAlternate:
case kRegexpConcat:
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
case kRegexpCapture:
break;
default:
return true;
}
// Committed to doing real work.
// The stack (vector) has pairs of regexps waiting to
// be compared. The regexps are only equal if
// all the pairs end up being equal.
std::vector<Regexp*> stk;
for (;;) {
// Invariant: TopEqual(a, b) == true.
Regexp* a2;
Regexp* b2;
switch (a->op()) {
default:
break;
case kRegexpAlternate:
case kRegexpConcat:
// TopEqual already established a->nsub() == b->nsub() for these ops,
// so indexing b with a's count is in range.
for (int i = 0; i < a->nsub(); i++) {
a2 = a->sub()[i];
b2 = b->sub()[i];
if (!TopEqual(a2, b2))
return false;
stk.push_back(a2);
stk.push_back(b2);
}
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
case kRegexpCapture:
a2 = a->sub()[0];
b2 = b->sub()[0];
if (!TopEqual(a2, b2))
return false;
// Really:
// stk.push_back(a2);
// stk.push_back(b2);
// break;
// but faster to assign directly and loop.
a = a2;
b = b2;
continue;
}
// Pop the next pending pair, or finish when none remain.
size_t n = stk.size();
if (n == 0)
break;
DCHECK_GE(n, 2);
a = stk[n-2];
b = stk[n-1];
stk.resize(n-2);
}
return true;
}
// Keep in sync with enum RegexpStatusCode in regexp.h
static const char *kErrorStrings[] = {
"no error",
"unexpected error",
"invalid escape sequence",
"invalid character class",
"invalid character class range",
"missing ]",
"missing )",
"trailing \\",
"no argument for repetition operator",
"invalid repetition size",
"bad repetition operator",
"invalid perl operator",
"invalid UTF-8",
"invalid named capture group",
};
// Returns the entry of kErrorStrings for code.  Codes outside the table
// are reported using the kRegexpInternalError entry.
std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
  const bool in_table = !(code < 0 || code >= arraysize(kErrorStrings));
  if (!in_table)
    code = kRegexpInternalError;
  return kErrorStrings[code];
}
// Returns the text for code_, followed by ": " and error_arg_ when
// error_arg_ is not empty.
std::string RegexpStatus::Text() const {
  std::string msg = CodeText(code_);
  if (!error_arg_.empty()) {
    msg.append(": ");
    msg.append(error_arg_.data(), error_arg_.size());
  }
  return msg;
}
// Copies the code and the error argument from status into this object.
void RegexpStatus::Copy(const RegexpStatus& status) {
code_ = status.code_;
error_arg_ = status.error_arg_;
}
typedef int Ignored; // Walker<void> doesn't exist
// Walker subclass that counts the kRegexpCapture nodes it visits.
class NumCapturesWalker : public Regexp::Walker<Ignored> {
 public:
  NumCapturesWalker() : ncapture_(0) {}

  // Number of capture nodes seen so far.
  int ncapture() { return ncapture_; }

  virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    const bool is_capture = re->op() == kRegexpCapture;
    if (is_capture)
      ++ncapture_;
    return ignored;
  }

  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    // Should never be called: we use Walk not WalkExponential.
    LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
    return ignored;
  }

 private:
  int ncapture_;

  NumCapturesWalker(const NumCapturesWalker&) = delete;
  NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
};
int Regexp::NumCaptures() {
NumCapturesWalker w;
w.Walk(this, 0);
return w.ncapture();
}
// Walker class to build map of named capture groups and their indices.
// The map is allocated on the first named group; TakeMap() hands ownership
// to the caller and returns NULL if no named group was seen.
class NamedCapturesWalker : public Regexp::Walker<Ignored> {
 public:
  NamedCapturesWalker() : map_(NULL) {}
  ~NamedCapturesWalker() { delete map_; }

  std::map<std::string, int>* TakeMap() {
    std::map<std::string, int>* taken = map_;
    map_ = NULL;
    return taken;
  }

  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    if (re->op() != kRegexpCapture || re->name() == NULL)
      return ignored;

    // Allocate map once we find a name.
    if (map_ == NULL)
      map_ = new std::map<std::string, int>;

    // Record first occurrence of each name.
    // (The rule is that if you have the same name
    // multiple times, only the leftmost one counts.)
    // insert() leaves an existing entry untouched.
    map_->insert(std::make_pair(*re->name(), re->cap()));
    return ignored;
  }

  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    // Should never be called: we use Walk not WalkExponential.
    LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
    return ignored;
  }

 private:
  std::map<std::string, int>* map_;

  NamedCapturesWalker(const NamedCapturesWalker&) = delete;
  NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
};
std::map<std::string, int>* Regexp::NamedCaptures() {
NamedCapturesWalker w;
w.Walk(this, 0);
return w.TakeMap();
}
// Walker class to build map from capture group indices to their names.
// The map is allocated on the first named group; TakeMap() hands ownership
// to the caller and returns NULL if no named group was seen.
class CaptureNamesWalker : public Regexp::Walker<Ignored> {
 public:
  CaptureNamesWalker() : map_(NULL) {}
  ~CaptureNamesWalker() { delete map_; }

  std::map<int, std::string>* TakeMap() {
    std::map<int, std::string>* taken = map_;
    map_ = NULL;
    return taken;
  }

  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    if (re->op() != kRegexpCapture || re->name() == NULL)
      return ignored;

    // Allocate map once we find a name.
    if (map_ == NULL)
      map_ = new std::map<int, std::string>;

    (*map_)[re->cap()] = *re->name();
    return ignored;
  }

  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    // Should never be called: we use Walk not WalkExponential.
    LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
    return ignored;
  }

 private:
  std::map<int, std::string>* map_;

  CaptureNamesWalker(const CaptureNamesWalker&) = delete;
  CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
};
std::map<int, std::string>* Regexp::CaptureNames() {
CaptureNamesWalker w;
w.Walk(this, 0);
return w.TakeMap();
}
// Determines whether regexp matches must be anchored
// with a fixed string prefix. If so, returns the prefix and
// the regexp that remains after the prefix. The prefix might
// be ASCII case-insensitive.
//
// On success returns true and fills *prefix (encoded per the literal's
// Latin1 flag), *foldcase (whether the literal carries FoldCase) and
// *suffix (a regexp for the remainder).  On failure returns false with
// *prefix empty, *foldcase false and *suffix NULL.
bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
Regexp** suffix) {
// No need for a walker: the regexp must be of the form
// 1. some number of ^ anchors
// 2. a literal char or string
// 3. the rest
prefix->clear();
*foldcase = false;
*suffix = NULL;
if (op_ != kRegexpConcat)
return false;
// Some number of anchors, then a literal or concatenation.
int i = 0;
Regexp** sub = this->sub();
while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
i++;
// Fail if there was no leading anchor or nothing follows the anchors.
if (i == 0 || i >= nsub_)
return false;
Regexp* re = sub[i];
switch (re->op_) {
default:
return false;
case kRegexpLiteralString:
// Convert to string in proper encoding.
if (re->parse_flags() & Latin1) {
// Latin-1: each rune becomes one char.
prefix->resize(re->nrunes_);
for (int j = 0; j < re->nrunes_; j++)
(*prefix)[j] = static_cast<char>(re->runes_[j]);
} else {
// Convert to UTF-8 in place.
// Assume worst-case space and then trim.
prefix->resize(re->nrunes_ * UTFmax);
char *p = &(*prefix)[0];
for (int j = 0; j < re->nrunes_; j++) {
Rune r = re->runes_[j];
// Runes below Runeself are written as a single char; others go
// through runetochar, which returns how far to advance p.
if (r < Runeself)
*p++ = static_cast<char>(r);
else
p += runetochar(p, &r);
}
prefix->resize(p - &(*prefix)[0]);
}
break;
case kRegexpLiteral:
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
prefix->append(1, static_cast<char>(re->rune_));
} else {
char buf[UTFmax];
prefix->append(buf, runetochar(buf, &re->rune_));
}
break;
}
*foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
i++;
// The rest.
if (i < nsub_) {
// The suffix shares the remaining subs, so take a reference on each
// before handing them to Concat.
for (int j = i; j < nsub_; j++)
sub[j]->Incref();
re = Concat(sub + i, nsub_ - i, parse_flags());
} else {
// Nothing left after the literal: the suffix is an empty match.
re = new Regexp(kRegexpEmptyMatch, parse_flags());
}
*suffix = re;
return true;
}
// Character class builder is a balanced binary tree (STL set)
// containing non-overlapping, non-abutting RuneRanges.
// The less-than operator used in the tree treats two
// ranges as equal if they overlap at all, so that
// lookups for a particular Rune are possible.
//
// A new builder holds no runes and has both ASCII-letter bitmaps cleared.
CharClassBuilder::CharClassBuilder() {
  upper_ = 0;
  lower_ = 0;
  nrunes_ = 0;
}
// Add lo-hi to the class; return whether class got bigger.
// An inverted range (hi < lo) and a range already fully contained in one
// existing range both return false.  Otherwise existing ranges that abut or
// overlap [lo, hi] are merged into a single range, and nrunes_ is kept in
// step with every removal and insertion.
bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
if (hi < lo)
return false;
if (lo <= 'z' && hi >= 'A') {
// Overlaps some alpha, maybe not all.
// Update bitmaps telling which ASCII letters are in the set.
// Bit k of upper_ stands for 'A'+k and bit k of lower_ for 'a'+k.
Rune lo1 = std::max<Rune>(lo, 'A');
Rune hi1 = std::min<Rune>(hi, 'Z');
if (lo1 <= hi1)
upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
lo1 = std::max<Rune>(lo, 'a');
hi1 = std::min<Rune>(hi, 'z');
if (lo1 <= hi1)
lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
}
{ // Check whether lo, hi is already in the class.
iterator it = ranges_.find(RuneRange(lo, lo));
if (it != end() && it->lo <= lo && hi <= it->hi)
return false;
}
// Look for a range abutting lo on the left.
// If it exists, take it out and increase our range.
if (lo > 0) {
iterator it = ranges_.find(RuneRange(lo-1, lo-1));
if (it != end()) {
lo = it->lo;
// The found range may extend past hi as well.
if (it->hi > hi)
hi = it->hi;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
}
// Look for a range abutting hi on the right.
// If it exists, take it out and increase our range.
if (hi < Runemax) {
iterator it = ranges_.find(RuneRange(hi+1, hi+1));
if (it != end()) {
hi = it->hi;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
}
// Look for ranges between lo and hi. Take them out.
// This is only safe because the set has no overlapping ranges.
// We've already removed any ranges abutting lo and hi, so
// any that overlap [lo, hi] must be contained within it.
for (;;) {
iterator it = ranges_.find(RuneRange(lo, hi));
if (it == end())
break;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
// Finally, add [lo, hi].
nrunes_ += hi - lo + 1;
ranges_.insert(RuneRange(lo, hi));
return true;
}
// Adds every range stored in *cc to this builder.
void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
  iterator cur = cc->begin();
  while (cur != cc->end()) {
    AddRange(cur->lo, cur->hi);
    ++cur;
  }
}
// Reports whether rune r belongs to the class.
bool CharClassBuilder::Contains(Rune r) {
  // A one-rune probe compares equal to whichever stored range overlaps it.
  const RuneRange probe(r, r);
  iterator hit = ranges_.find(probe);
  return hit != end();
}
// Reports whether the class treats A-Z exactly as it treats a-z, i.e.
// whether the uppercase and lowercase bitmaps agree on all 26 letters.
bool CharClassBuilder::FoldsASCII() {
  const uint32_t mismatched = (upper_ ^ lower_) & AlphaMask;
  return mismatched == 0;
}
// Returns a newly allocated builder holding the same ranges, letter
// bitmaps and rune count as this one.
CharClassBuilder* CharClassBuilder::Copy() {
  CharClassBuilder* dup = new CharClassBuilder;
  dup->ranges_.insert(begin(), end());
  dup->nrunes_ = nrunes_;
  dup->lower_ = lower_;
  dup->upper_ = upper_;
  return dup;
}
// Removes every rune greater than r from the class, updating the
// ASCII letter bitmaps and the rune count to match.
void CharClassBuilder::RemoveAbove(Rune r) {
// Nothing can lie above Runemax, so there is nothing to remove.
if (r >= Runemax)
return;
// Drop the lowercase letters above r from the a-z bitmap.
if (r < 'z') {
if (r < 'a')
lower_ = 0;
else
// Shifting AlphaMask right by ('z' - r) keeps only the bits for 'a'..r.
lower_ &= AlphaMask >> ('z' - r);
}
// Same for the uppercase letters and the A-Z bitmap.
if (r < 'Z') {
if (r < 'A')
upper_ = 0;
else
upper_ &= AlphaMask >> ('Z' - r);
}
// Repeatedly pull out any range that overlaps [r+1, Runemax].
for (;;) {
iterator it = ranges_.find(RuneRange(r + 1, Runemax));
if (it == end())
break;
// Copy the range before erasing it; the iterator is invalid afterwards.
RuneRange rr = *it;
ranges_.erase(it);
nrunes_ -= rr.hi - rr.lo + 1;
// A range that straddles r keeps its lower part, truncated to end at r.
if (rr.lo <= r) {
rr.hi = r;
ranges_.insert(rr);
nrunes_ += rr.hi - rr.lo + 1;
}
}
}
void CharClassBuilder::Negate() {
// Build up negation and then copy in.
// Could edit ranges in place, but C++ won't let me.
std::vector<RuneRange> v;
v.reserve(ranges_.size() + 1);
// In negation, first range begins at 0, unless
// the current class begins at 0.
iterator it = begin();
if (it == end()) {
v.push_back(RuneRange(0, Runemax));
} else {
int nextlo = 0;
if (it->lo == 0) {
nextlo = it->hi + 1;
++it;
}
for (; it != end(); ++it) {
v.push_back(RuneRange(nextlo, it->lo - 1));
nextlo = it->hi + 1;
}
if (nextlo <= Runemax)
v.push_back(RuneRange(nextlo, Runemax));
}
ranges_.clear();
for (size_t i = 0; i < v.size(); i++)
ranges_.insert(v[i]);
upper_ = AlphaMask & ~upper_;
lower_ = AlphaMask & ~lower_;
nrunes_ = Runemax+1 - nrunes_;
}
// A CharClass is a sorted list of ranges stored in the same heap block
// as the CharClass header itself.  That is why it is created through
// this special allocator and released through Delete().
//
// Allocates a CharClass with room for maxranges ranges.  The result
// starts out empty: no ranges, no runes, folds_ascii_ false.
CharClass* CharClass::New(int maxranges) {
  const size_t header_bytes = sizeof(CharClass);
  const size_t ranges_bytes = maxranges * sizeof(RuneRange);
  uint8_t* block = new uint8_t[header_bytes + ranges_bytes];

  CharClass* cc = reinterpret_cast<CharClass*>(block);
  // The range array begins immediately after the header.
  cc->ranges_ = reinterpret_cast<RuneRange*>(block + header_bytes);
  cc->nranges_ = 0;
  cc->nrunes_ = 0;
  cc->folds_ascii_ = false;
  return cc;
}
// Releases the block obtained from New().  The block was allocated as a
// uint8_t array, so it is freed as one.
void CharClass::Delete() {
  delete[] reinterpret_cast<uint8_t*>(this);
}
// Returns a newly allocated CharClass matching exactly the runes in
// [0, Runemax] that this class does not match.  folds_ascii_ is carried
// over unchanged.
CharClass* CharClass::Negate() {
  // The complement of n ranges has at most n+1 ranges.
  CharClass* neg = CharClass::New(nranges_+1);
  neg->folds_ascii_ = folds_ascii_;
  neg->nrunes_ = Runemax + 1 - nrunes_;

  int count = 0;
  int gap_start = 0;  // first rune not yet known to be in this class
  for (iterator cur = begin(); cur != end(); ++cur) {
    // Emit the gap before this range, unless the range starts right at it.
    if (cur->lo != gap_start)
      neg->ranges_[count++] = RuneRange(gap_start, cur->lo - 1);
    gap_start = cur->hi + 1;
  }
  if (gap_start <= Runemax)
    neg->ranges_[count++] = RuneRange(gap_start, Runemax);

  neg->nranges_ = count;
  return neg;
}
// Reports whether rune r falls inside one of the ranges, using a binary
// search over the ranges_ array.
bool CharClass::Contains(Rune r) {
  int first = 0;          // first candidate index
  int limit = nranges_;   // one past the last candidate index
  while (first < limit) {
    const int mid = first + (limit - first) / 2;
    const RuneRange& probe = ranges_[mid];
    if (r > probe.hi) {
      first = mid + 1;
    } else if (r < probe.lo) {
      limit = mid;
    } else {
      // probe.lo <= r && r <= probe.hi
      return true;
    }
  }
  return false;
}
// Builds a newly allocated CharClass holding a copy of this builder's
// ranges, in set order, along with its rune count and ASCII-folding
// property.
CharClass* CharClassBuilder::GetCharClass() {
  CharClass* result = CharClass::New(static_cast<int>(ranges_.size()));

  int n = 0;
  iterator cur = begin();
  while (cur != end()) {
    result->ranges_[n] = *cur;
    ++n;
    ++cur;
  }
  result->nranges_ = n;
  DCHECK_LE(n, static_cast<int>(ranges_.size()));

  result->folds_ascii_ = FoldsASCII();
  result->nrunes_ = nrunes_;
  return result;
}
} // namespace re2

652
extern/re2/re2/regexp.h vendored Normal file
View File

@ -0,0 +1,652 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_REGEXP_H_
#define RE2_REGEXP_H_
// --- SPONSORED LINK --------------------------------------------------
// If you want to use this library for regular expression matching,
// you should use re2/re2.h, which provides a class RE2 that
// mimics the PCRE interface provided by PCRE's C++ wrappers.
// This header describes the low-level interface used to implement RE2
// and may change in backwards-incompatible ways from time to time.
// In contrast, RE2's interface will not.
// ---------------------------------------------------------------------
// Regular expression library: parsing, execution, and manipulation
// of regular expressions.
//
// Any operation that traverses the Regexp structures should be written
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
// regular expressions such as x++++++++++++++++++++... might cause recursive
// traversals to overflow the stack.
//
// It is the caller's responsibility to provide appropriate mutual exclusion
// around manipulation of the regexps. RE2 does this.
//
// PARSING
//
// Regexp::Parse parses regular expressions encoded in UTF-8.
// The default syntax is POSIX extended regular expressions,
// with the following changes:
//
// 1. Backreferences (optional in POSIX EREs) are not supported.
// (Supporting them precludes the use of DFA-based
// matching engines.)
//
// 2. Collating elements and collation classes are not supported.
// (No one has needed or wanted them.)
//
// The exact syntax accepted can be modified by passing flags to
// Regexp::Parse. In particular, many of the basic Perl additions
// are available. The flags are documented below (search for LikePerl).
//
// If parsed with the flag Regexp::Latin1, both the regular expression
// and the input to the matching routines are assumed to be encoded in
// Latin-1, not UTF-8.
//
// EXECUTION
//
// Once Regexp has parsed a regular expression, it provides methods
// to search text using that regular expression. These methods are
// implemented via calling out to other regular expression libraries.
// (Let's call them the sublibraries.)
//
// To call a sublibrary, Regexp does not simply prepare a
// string version of the regular expression and hand it to the
// sublibrary. Instead, Regexp prepares, from its own parsed form, the
// corresponding internal representation used by the sublibrary.
// This has the drawback of needing to know the internal representation
// used by the sublibrary, but it has two important benefits:
//
// 1. The syntax and meaning of regular expressions is guaranteed
// to be that used by Regexp's parser, not the syntax expected
// by the sublibrary. Regexp might accept a restricted or
// expanded syntax for regular expressions as compared with
// the sublibrary. As long as Regexp can translate from its
// internal form into the sublibrary's, clients need not know
// exactly which sublibrary they are using.
//
// 2. The sublibrary parsers are bypassed. For whatever reason,
// sublibrary regular expression parsers often have security
// problems. For example, plan9grep's regular expression parser
// has a buffer overflow in its handling of large character
// classes, and PCRE's parser has had buffer overflow problems
// in the past. Security-team requires sandboxing of sublibrary
// regular expression parsers. Avoiding the sublibrary parsers
// avoids the sandbox.
//
// The execution methods we use now are provided by the compiled form,
// Prog, described in prog.h
//
// MANIPULATION
//
// Unlike other regular expression libraries, Regexp makes its parsed
// form accessible to clients, so that client code can analyze the
// parsed regular expressions.
#include <stdint.h>
#include <map>
#include <set>
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "util/utf.h"
#include "re2/stringpiece.h"
namespace re2 {
// Operator kinds for the nodes of a parsed regular expression.
// Values start at 1, and kMaxRegexpOp aliases the last operator.
// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
enum RegexpOp {
// Matches no strings.
kRegexpNoMatch = 1,
// Matches empty string.
kRegexpEmptyMatch,
// Matches rune_.
kRegexpLiteral,
// Matches runes_.
kRegexpLiteralString,
// Matches concatenation of sub_[0..nsub-1].
kRegexpConcat,
// Matches union of sub_[0..nsub-1].
kRegexpAlternate,
// Matches sub_[0] zero or more times.
kRegexpStar,
// Matches sub_[0] one or more times.
kRegexpPlus,
// Matches sub_[0] zero or one times.
kRegexpQuest,
// Matches sub_[0] at least min_ times, at most max_ times.
// max_ == -1 means no upper limit.
kRegexpRepeat,
// Parenthesized (capturing) subexpression. Index is cap_.
// Optionally, capturing name is name_.
kRegexpCapture,
// Matches any character.
kRegexpAnyChar,
// Matches any byte [sic].
kRegexpAnyByte,
// Matches empty string at beginning of line.
kRegexpBeginLine,
// Matches empty string at end of line.
kRegexpEndLine,
// Matches word boundary "\b".
kRegexpWordBoundary,
// Matches not-a-word boundary "\B".
kRegexpNoWordBoundary,
// Matches empty string at beginning of text.
kRegexpBeginText,
// Matches empty string at end of text.
kRegexpEndText,
// Matches character class given by cc_.
kRegexpCharClass,
// Forces match of entire expression right now,
// with match ID match_id_ (used by RE2::Set).
kRegexpHaveMatch,
// Largest operator value; equal to the last operator above.
kMaxRegexpOp = kRegexpHaveMatch,
};
// Status codes reported through RegexpStatus.
// kRegexpSuccess is 0; every other value denotes an error.
// Keep in sync with string list in regexp.cc
enum RegexpStatusCode {
// No error
kRegexpSuccess = 0,
// Unexpected error
kRegexpInternalError,
// Parse errors
kRegexpBadEscape, // bad escape sequence
kRegexpBadCharClass, // bad character class
kRegexpBadCharRange, // bad character class range
kRegexpMissingBracket, // missing closing ]
kRegexpMissingParen, // missing closing )
kRegexpTrailingBackslash, // at end of regexp
kRegexpRepeatArgument, // repeat argument missing, e.g. "*"
kRegexpRepeatSize, // bad repetition argument
kRegexpRepeatOp, // bad repetition operator
kRegexpBadPerlOp, // bad perl operator
kRegexpBadUTF8, // invalid UTF-8 in regexp
kRegexpBadNamedCapture, // bad named capture
};
// Error status for certain operations.
// Holds a status code plus the piece of the regexp the error refers to.
// A freshly constructed status reports success.
class RegexpStatus {
public:
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
// Deletes the temporary string, if one was handed over via set_tmp().
~RegexpStatus() { delete tmp_; }
void set_code(RegexpStatusCode code) { code_ = code; }
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
// Takes ownership of tmp, deleting any string held previously.
void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; }
RegexpStatusCode code() const { return code_; }
const StringPiece& error_arg() const { return error_arg_; }
// True when the code is kRegexpSuccess.
bool ok() const { return code() == kRegexpSuccess; }
// Copies state from status.
void Copy(const RegexpStatus& status);
// Returns text equivalent of code, e.g.:
// "Bad character class"
static std::string CodeText(RegexpStatusCode code);
// Returns text describing error, e.g.:
// "Bad character class: [z-a]"
std::string Text() const;
private:
RegexpStatusCode code_; // Kind of error
StringPiece error_arg_; // Piece of regexp containing syntax error.
std::string* tmp_; // Temporary storage, possibly where error_arg_ is.
// Not copyable; use Copy() to transfer state explicitly.
RegexpStatus(const RegexpStatus&) = delete;
RegexpStatus& operator=(const RegexpStatus&) = delete;
};
// Compiled form; see prog.h
class Prog;
// A range of runes with bounds lo and hi.
// The default-constructed range has both bounds set to 0.
struct RuneRange {
RuneRange() : lo(0), hi(0) { }
RuneRange(int l, int h) : lo(l), hi(h) { }
Rune lo; // lower bound
Rune hi; // upper bound
};
// Strict ordering on RuneRanges: a precedes b only when a ends before b
// begins.  Two ranges that overlap at all are therefore equivalent, which
// lets a set lookup with a one-rune range find the stored range covering
// that Rune.
struct RuneRangeLess {
  bool operator()(const RuneRange& a, const RuneRange& b) const {
    return b.lo > a.hi;
  }
};
class CharClassBuilder;
// A character class stored as an array of RuneRanges.
// Instances are created through the private New() (CharClassBuilder is a
// friend) and must be released with Delete(); the constructor and
// destructor are declared but not implemented.
class CharClass {
public:
// Frees this object; use instead of delete.
void Delete();
typedef RuneRange* iterator;
// Iteration over the stored ranges.
iterator begin() { return ranges_; }
iterator end() { return ranges_ + nranges_; }
// Number of runes in the class (not the number of ranges).
int size() { return nrunes_; }
bool empty() { return nrunes_ == 0; }
// True when the class holds all Runemax+1 runes.
bool full() { return nrunes_ == Runemax+1; }
bool FoldsASCII() { return folds_ascii_; }
// Reports whether r lies within one of the ranges.
bool Contains(Rune r);
// Returns a newly allocated class holding the complement of this one.
CharClass* Negate();
private:
CharClass(); // not implemented
~CharClass(); // not implemented
// Allocates an empty class with room for maxranges ranges.
static CharClass* New(int maxranges);
friend class CharClassBuilder;
bool folds_ascii_;
int nrunes_; // total number of runes covered by the ranges
RuneRange *ranges_; // the ranges
int nranges_; // number of entries in ranges_
// Not copyable.
CharClass(const CharClass&) = delete;
CharClass& operator=(const CharClass&) = delete;
};
// A node of a parsed regular expression.
// Nodes are reference-counted (see Incref and Decref) and are created by
// Parse or by the static convenience constructors below; the constructor
// and destructor are private.
class Regexp {
public:
// Flags for parsing. Can be ORed together.
enum ParseFlags {
NoParseFlags = 0,
FoldCase = 1<<0, // Fold case during matching (case-insensitive).
Literal = 1<<1, // Treat s as literal string instead of a regexp.
ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
// and [[:space:]] to match newline.
DotNL = 1<<3, // Allow . to match newline.
MatchNL = ClassNL | DotNL,
OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
// end of text, not around embedded newlines.
// (Perl's default)
Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
PerlClasses = 1<<7, // Allow Perl character classes like \d.
PerlB = 1<<8, // Allow Perl's \b and \B.
PerlX = 1<<9, // Perl extensions:
// non-capturing parens - (?: )
// non-greedy operators - *? +? ?? {}?
// flag edits - (?i) (?-i) (?i: )
// i - FoldCase
// m - !OneLine
// s - DotNL
// U - NonGreedy
// line ends: \A \z
// \Q and \E to disable/enable metacharacters
// (?P<name>expr) for named captures
// \C to match any single byte
UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
// and \P{Han} for its negation.
NeverNL = 1<<11, // Never match NL, even if the regexp mentions
// it explicitly.
NeverCapture = 1<<12, // Parse all parens as non-capturing.
// As close to Perl as we can get.
LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
UnicodeGroups,
// Internal use only.
WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text
AllParseFlags = (1<<14)-1,
};
// Get. No set, Regexps are logically immutable once created.
RegexpOp op() { return static_cast<RegexpOp>(op_); }
int nsub() { return nsub_; }
bool simple() { return simple_ != 0; }
ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
int Ref(); // For testing.
// Returns the array of subexpressions.  With at most one subexpression
// this is the address of the inline subone_ slot; otherwise it is the
// separately allocated submany_ array.
Regexp** sub() {
if(nsub_ <= 1)
return &subone_;
else
return submany_;
}
// Operator-specific accessors.  Each one DCHECKs that op_ is the
// operator the requested field belongs to.
int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
const std::string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
// Increments reference count, returns object as convenience.
Regexp* Incref();
// Decrements reference count and deletes this object if count reaches 0.
void Decref();
// Parses string s to produce regular expression, returned.
// Caller must release return value with re->Decref().
// On failure, sets *status (if status != NULL) and returns NULL.
static Regexp* Parse(const StringPiece& s, ParseFlags flags,
RegexpStatus* status);
// Returns a _new_ simplified version of the current regexp.
// Does not edit the current regexp.
// Caller must release return value with re->Decref().
// Simplified means that counted repetition has been rewritten
// into simpler terms and all Perl/POSIX features have been
// removed. The result will capture exactly the same
// subexpressions the original did, unless formatted with ToString.
Regexp* Simplify();
friend class CoalesceWalker;
friend class SimplifyWalker;
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *status (if status != NULL) on parse error.
static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
std::string* dst, RegexpStatus* status);
// Returns the number of capturing groups in the regexp.
int NumCaptures();
friend class NumCapturesWalker;
// Returns a map from names to capturing group indices,
// or NULL if the regexp contains no named capture groups.
// The caller is responsible for deleting the map.
std::map<std::string, int>* NamedCaptures();
// Returns a map from capturing group indices to capturing group
// names or NULL if the regexp contains no named capture groups. The
// caller is responsible for deleting the map.
std::map<int, std::string>* CaptureNames();
// Returns a string representation of the current regexp,
// using as few parentheses as possible.
std::string ToString();
// Convenience functions. They consume the passed reference,
// so in many cases you should use, e.g., Plus(re->Incref(), flags).
// They do not consume allocated arrays like subs or runes.
static Regexp* Plus(Regexp* sub, ParseFlags flags);
static Regexp* Star(Regexp* sub, ParseFlags flags);
static Regexp* Quest(Regexp* sub, ParseFlags flags);
static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
static Regexp* NewLiteral(Rune rune, ParseFlags flags);
static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
static Regexp* HaveMatch(int match_id, ParseFlags flags);
// Like Alternate but does not factor out common prefixes.
static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
// Debugging function. Returns string format for regexp
// that makes structure clear. Does NOT use regexp syntax.
std::string Dump();
// Helper traversal class, defined fully in walker-inl.h.
template<typename T> class Walker;
// Compile to Prog. See prog.h
// Reverse prog expects to be run over text backward.
// Construction and execution of prog will
// stay within approximately max_mem bytes of memory.
// If max_mem <= 0, a reasonable default is used.
Prog* CompileToProg(int64_t max_mem);
Prog* CompileToReverseProg(int64_t max_mem);
// Whether to expect this library to find exactly the same answer as PCRE
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
// obscure cases behave differently. Technically this is more a property
// of the Prog than the Regexp, but the computation is much easier to do
// on the Regexp. See mimics_pcre.cc for the exact conditions.
bool MimicsPCRE();
// Benchmarking function.
void NullWalk();
// Whether every match of this regexp must be anchored and
// begin with a non-empty fixed string (perhaps after ASCII
// case-folding). If so, returns the prefix and the sub-regexp that
// follows it.
// Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
// regardless of the return value.
bool RequiredPrefix(std::string* prefix, bool* foldcase,
Regexp** suffix);
private:
// Constructor allocates vectors as appropriate for operator.
explicit Regexp(RegexpOp op, ParseFlags parse_flags);
// Use Decref() instead of delete to release Regexps.
// This is private to catch deletes at compile time.
~Regexp();
void Destroy();
bool QuickDestroy();
// Helpers for Parse. Listed here so they can edit Regexps.
class ParseState;
friend class ParseState;
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
RegexpStatus* status);
// Helper for testing [sic].
friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
// Computes whether Regexp is already simple.
bool ComputeSimple();
// Constructor that generates a Star, Plus or Quest,
// squashing the pair if sub is also a Star, Plus or Quest.
static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
// Constructor that generates a concatenation or alternation,
// enforcing the limit on the number of subexpressions for
// a particular Regexp.
static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
ParseFlags flags, bool can_factor);
// Returns the leading string that re starts with.
// The returned Rune* points into a piece of re,
// so it must not be used after the caller calls re->Decref().
static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
// Removes the first n leading runes from the beginning of re.
// Edits re in place.
static void RemoveLeadingString(Regexp* re, int n);
// Returns the leading regexp in re's top-level concatenation.
// The returned Regexp* points at re or a sub-expression of re,
// so it must not be used after the caller calls re->Decref().
static Regexp* LeadingRegexp(Regexp* re);
// Removes LeadingRegexp(re) from re and returns the remainder.
// Might edit re in place.
static Regexp* RemoveLeadingRegexp(Regexp* re);
// Simplifies an alternation of literal strings by factoring out
// common prefixes.
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
friend class FactorAlternationImpl;
// Is a == b? Only efficient on regexps that have not been through
// Simplify yet - the expansion of a kRegexpRepeat will make this
// take a long time. Do not call on such regexps, hence private.
static bool Equal(Regexp* a, Regexp* b);
// Allocate space for n sub-regexps.
// A separate array is allocated only when n > 1; n must fit in uint16_t.
void AllocSub(int n) {
DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
if (n > 1)
submany_ = new Regexp*[n];
nsub_ = static_cast<uint16_t>(n);
}
// Add Rune to LiteralString
void AddRuneToString(Rune r);
// Swaps this with that, in place.
void Swap(Regexp *that);
// Operator. See description of operators above.
// uint8_t instead of RegexpOp to control space usage.
uint8_t op_;
// Is this regexp structure already simple
// (has it been returned by Simplify)?
// uint8_t instead of bool to control space usage.
uint8_t simple_;
// Flags saved from parsing and used during execution.
// (Only FoldCase is used.)
// uint16_t instead of ParseFlags to control space usage.
uint16_t parse_flags_;
// Reference count. Exists so that SimplifyRegexp can build
// regexp structures that are dags rather than trees to avoid
// exponential blowup in space requirements.
// uint16_t to control space usage.
// The standard regexp routines will never generate a
// ref greater than the maximum repeat count (kMaxRepeat),
// but even so, Incref and Decref consult an overflow map
// when ref_ reaches kMaxRef.
uint16_t ref_;
static const uint16_t kMaxRef = 0xffff;
// Subexpressions.
// uint16_t to control space usage.
// Concat and Alternate handle larger numbers of subexpressions
// by building concatenation or alternation trees.
// Other routines should call Concat or Alternate instead of
// filling in sub() by hand.
uint16_t nsub_;
static const uint16_t kMaxNsub = 0xffff;
union {
Regexp** submany_; // if nsub_ > 1
Regexp* subone_; // if nsub_ == 1
};
// Extra space for parse and teardown stacks.
Regexp* down_;
// Arguments to operator. See description of operators above.
union {
struct { // Repeat
int max_;
int min_;
};
struct { // Capture
int cap_;
std::string* name_;
};
struct { // LiteralString
int nrunes_;
Rune* runes_;
};
struct { // CharClass
// These two could be in separate union members,
// but it wouldn't save any space (there are other two-word structs)
// and keeping them separate avoids confusion during parsing.
CharClass* cc_;
CharClassBuilder* ccb_;
};
Rune rune_; // Literal
int match_id_; // HaveMatch
void *the_union_[2]; // as big as any other element, for memset
};
// Not copyable; share a Regexp through Incref() instead.
Regexp(const Regexp&) = delete;
Regexp& operator=(const Regexp&) = delete;
};
// Character class set: contains non-overlapping, non-abutting RuneRanges.
typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;
// Mutable character class under construction.
// Stores its ranges in a RuneRangeSet and tracks, in two bitmaps, which
// ASCII letters are present.  GetCharClass() produces a CharClass from it.
class CharClassBuilder {
public:
CharClassBuilder();
typedef RuneRangeSet::iterator iterator;
// Iteration over the stored ranges.
iterator begin() { return ranges_.begin(); }
iterator end() { return ranges_.end(); }
// Number of runes in the class (not the number of ranges).
int size() { return nrunes_; }
bool empty() { return nrunes_ == 0; }
// True when the class holds all Runemax+1 runes.
bool full() { return nrunes_ == Runemax+1; }
bool Contains(Rune r);
bool FoldsASCII();
bool AddRange(Rune lo, Rune hi); // returns whether class changed
// Returns a newly allocated copy of this builder.
CharClassBuilder* Copy();
// Adds all ranges of cc to this builder.
void AddCharClass(CharClassBuilder* cc);
// Replaces the class with its complement.
void Negate();
// Removes all runes greater than r.
void RemoveAbove(Rune r);
// Returns a newly allocated CharClass with the same contents.
CharClass* GetCharClass();
void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
private:
// Mask covering the low 26 bits, one per letter of the alphabet.
static const uint32_t AlphaMask = (1<<26) - 1;
uint32_t upper_; // bitmap of A-Z
uint32_t lower_; // bitmap of a-z
int nrunes_; // total number of runes covered by ranges_
RuneRangeSet ranges_;
// Not copyable; use Copy() to duplicate a builder.
CharClassBuilder(const CharClassBuilder&) = delete;
CharClassBuilder& operator=(const CharClassBuilder&) = delete;
};
// Bitwise ops on ParseFlags produce ParseFlags.
//
// Bitwise OR: the result has every flag that is set in a or in b.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int bits = static_cast<int>(a) | static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
// Bitwise XOR: the result has every flag that is set in exactly one of
// a and b.
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int bits = static_cast<int>(a) ^ static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
// Bitwise AND: the result has only the flags that are set in both a and b.
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int bits = static_cast<int>(a) & static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
// Bitwise NOT, restricted to the defined flag bits.
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
  // Attempting to produce a value out of enum's range has undefined
  // behaviour, so the inverted bits are masked with AllParseFlags.
  const int all_flags = static_cast<int>(Regexp::AllParseFlags);
  const int inverted = ~static_cast<int>(a);
  return static_cast<Regexp::ParseFlags>(inverted & all_flags);
}
} // namespace re2
#endif // RE2_REGEXP_H_

153
extern/re2/re2/set.cc vendored Normal file
View File

@ -0,0 +1,153 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/set.h"
#include <stddef.h>
#include <algorithm>
#include <memory>
#include "util/util.h"
#include "util/logging.h"
#include "util/pod_array.h"
#include "re2/stringpiece.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
namespace re2 {
// Creates an empty, uncompiled set that will use a copy of options (with
// never_capture forced on) and the given anchor.
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor)
    : anchor_(anchor),
      prog_(NULL),
      compiled_(false),
      size_(0) {
  options_.Copy(options);
  options_.set_never_capture(true);  // might unblock some optimisations
}
// Drops the references to any regexps still held in elem_ and deletes the
// compiled program.
RE2::Set::~Set() {
  for (Elem& elem : elem_)
    elem.second->Decref();
  delete prog_;
}
int RE2::Set::Add(const StringPiece& pattern, std::string* error) {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Add() called after compiling";
return -1;
}
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
RegexpStatus status;
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
if (re == NULL) {
if (error != NULL)
*error = status.Text();
if (options_.log_errors())
LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
return -1;
}
// Concatenate with match index and push on vector.
int n = static_cast<int>(elem_.size());
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
if (re->op() == kRegexpConcat) {
int nsub = re->nsub();
PODArray<re2::Regexp*> sub(nsub + 1);
for (int i = 0; i < nsub; i++)
sub[i] = re->sub()[i]->Incref();
sub[nsub] = m;
re->Decref();
re = re2::Regexp::Concat(sub.data(), nsub + 1, pf);
} else {
re2::Regexp* sub[2];
sub[0] = re;
sub[1] = m;
re = re2::Regexp::Concat(sub, 2, pf);
}
elem_.emplace_back(std::string(pattern), re);
return n;
}
bool RE2::Set::Compile() {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Compile() called more than once";
return false;
}
compiled_ = true;
size_ = static_cast<int>(elem_.size());
// Sort the elements by their patterns. This is good enough for now
// until we have a Regexp comparison function. (Maybe someday...)
std::sort(elem_.begin(), elem_.end(),
[](const Elem& a, const Elem& b) -> bool {
return a.first < b.first;
});
PODArray<re2::Regexp*> sub(size_);
for (int i = 0; i < size_; i++)
sub[i] = elem_[i].second;
elem_.clear();
elem_.shrink_to_fit();
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
re2::Regexp* re = re2::Regexp::Alternate(sub.data(), size_, pf);
prog_ = Prog::CompileSet(re, anchor_, options_.max_mem());
re->Decref();
return prog_ != NULL;
}
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const {
return Match(text, v, NULL);
}
// Matches text against the compiled set.
// Returns true if text matches at least one regexp in the set; in that
// case v (if not NULL) is filled with the indices of the matching regexps.
// Returns false otherwise.  error_info (if not NULL) is always populated:
//   kNoError       - the search ran; the return value says whether it matched
//   kNotCompiled   - Compile() was never called, or it failed
//   kOutOfMemory   - the DFA ran out of memory
//   kInconsistent  - the DFA reported a match but produced no indices
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v,
                     ErrorInfo* error_info) const {
  if (!compiled_) {
    LOG(DFATAL) << "RE2::Set::Match() called before compiling";
    if (error_info != NULL)
      error_info->kind = kNotCompiled;
    return false;
  }
  // Compile() sets compiled_ before it knows whether compilation succeeded,
  // so a failed Compile() leaves prog_ NULL.  Report that the same way as an
  // uncompiled set rather than dereferencing the null program below.
  if (prog_ == NULL) {
    LOG(DFATAL) << "RE2::Set::Match() called after Compile() failed";
    if (error_info != NULL)
      error_info->kind = kNotCompiled;
    return false;
  }
  bool dfa_failed = false;
  // The match set is only needed when the caller wants the indices.
  std::unique_ptr<SparseSet> matches;
  if (v != NULL) {
    matches.reset(new SparseSet(size_));
    v->clear();
  }
  bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch,
                              NULL, &dfa_failed, matches.get());
  if (dfa_failed) {
    if (options_.log_errors())
      LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
                 << "bytemap range " << prog_->bytemap_range() << ", "
                 << "list count " << prog_->list_count();
    if (error_info != NULL)
      error_info->kind = kOutOfMemory;
    return false;
  }
  if (ret == false) {
    if (error_info != NULL)
      error_info->kind = kNoError;
    return false;
  }
  if (v != NULL) {
    if (matches->empty()) {
      LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!";
      if (error_info != NULL)
        error_info->kind = kInconsistent;
      return false;
    }
    v->assign(matches->begin(), matches->end());
  }
  if (error_info != NULL)
    error_info->kind = kNoError;
  return true;
}
} // namespace re2

80
extern/re2/re2/set.h vendored Normal file
View File

@ -0,0 +1,80 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_SET_H_
#define RE2_SET_H_
#include <string>
#include <utility>
#include <vector>
#include "re2/re2.h"
namespace re2 {
class Prog;
class Regexp;
} // namespace re2
namespace re2 {
// An RE2::Set represents a collection of regexps that can
// be searched for simultaneously.
// Usage order: Add() each pattern, call Compile() once, then Match().
class RE2::Set {
public:
// Reason reported through ErrorInfo by the three-argument Match().
enum ErrorKind {
kNoError = 0,
kNotCompiled, // The set is not compiled.
kOutOfMemory, // The DFA ran out of memory.
kInconsistent, // The result is inconsistent. This should never happen.
};
struct ErrorInfo {
ErrorKind kind;
};
Set(const RE2::Options& options, RE2::Anchor anchor);
~Set();
// Adds pattern to the set using the options passed to the constructor.
// Returns the index that will identify the regexp in the output of Match(),
// or -1 if the regexp cannot be parsed.
// Indices are assigned in sequential order starting from 0.
// Errors do not increment the index; if error is not NULL, *error will hold
// the error message from the parser.
int Add(const StringPiece& pattern, std::string* error);
// Compiles the set in preparation for matching.
// Returns false if the compiler runs out of memory.
// Add() must not be called again after Compile().
// Compile() must be called before Match().
bool Compile();
// Returns true if text matches at least one of the regexps in the set.
// Fills v (if not NULL) with the indices of the matching regexps.
// Callers must not expect v to be sorted.
bool Match(const StringPiece& text, std::vector<int>* v) const;
// As above, but populates error_info (if not NULL) when none of the regexps
// in the set matched. This can inform callers when DFA execution fails, for
// example, because they might wish to handle that case differently.
bool Match(const StringPiece& text, std::vector<int>* v,
ErrorInfo* error_info) const;
private:
// A pattern string paired with its parsed regexp.
typedef std::pair<std::string, re2::Regexp*> Elem;
RE2::Options options_; // copy of the constructor's options
RE2::Anchor anchor_;
std::vector<Elem> elem_; // patterns added so far
re2::Prog* prog_; // compiled program; NULL until Compile() produces one
bool compiled_; // whether Compile() has been called
int size_; // number of patterns, recorded by Compile()
// Not copyable.
Set(const Set&) = delete;
Set& operator=(const Set&) = delete;
};
} // namespace re2
#endif // RE2_SET_H_

655
extern/re2/re2/simplify.cc vendored Normal file
View File

@ -0,0 +1,655 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Rewrite POSIX and other features in re
// to use simple extended regular expression features.
// Also sort and simplify character classes.
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "util/pod_array.h"
#include "util/utf.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *error (if error != NULL) on error.
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
std::string* dst, RegexpStatus* status) {
Regexp* re = Parse(src, flags, status);
if (re == NULL)
return false;
Regexp* sre = re->Simplify();
re->Decref();
if (sre == NULL) {
// Should not happen, since Simplify never fails.
LOG(ERROR) << "Simplify failed on " << src;
if (status) {
status->set_code(kRegexpInternalError);
status->set_error_arg(src);
}
return false;
}
*dst = sre->ToString();
sre->Decref();
return true;
}
// Assuming the simple_ flags on the children are accurate,
// is this Regexp* simple?
bool Regexp::ComputeSimple() {
Regexp** subs;
switch (op_) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
return true;
case kRegexpConcat:
case kRegexpAlternate:
// These are simple as long as the subpieces are simple.
subs = sub();
for (int i = 0; i < nsub_; i++)
if (!subs[i]->simple())
return false;
return true;
case kRegexpCharClass:
// Simple as long as the char class is not empty, not full.
if (ccb_ != NULL)
return !ccb_->empty() && !ccb_->full();
return !cc_->empty() && !cc_->full();
case kRegexpCapture:
subs = sub();
return subs[0]->simple();
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
subs = sub();
if (!subs[0]->simple())
return false;
switch (subs[0]->op_) {
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpEmptyMatch:
case kRegexpNoMatch:
return false;
default:
break;
}
return true;
case kRegexpRepeat:
return false;
}
LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
return false;
}
// Walker subclass used by Simplify.
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
// occurrences of that literal into repeats of that literal. It also works for
// char classes, any char and any byte.
// PostVisit creates the coalesced result, which should then be simplified.
// The walk produces Regexp* values; the class cannot be copied or assigned.
class CoalesceWalker : public Regexp::Walker<Regexp*> {
public:
CoalesceWalker() {}
// Builds the (possibly coalesced) replacement for re from child_args, the
// results already produced for its sub-expressions.
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
Regexp** child_args, int nchild_args);
// Returns re->Incref(); no new node is built.
virtual Regexp* Copy(Regexp* re);
// Not expected to be called; logs and returns re->Incref().
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
private:
// These functions are declared inside CoalesceWalker so that
// they can edit the private fields of the Regexps they construct.
// Returns true if r1 and r2 can be coalesced. In particular, ensures that
// the parse flags are consistent. (They will not be checked again later.)
static bool CanCoalesce(Regexp* r1, Regexp* r2);
// Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
// will be empty match and the coalesced op. In other cases, where part of a
// literal string was removed to be coalesced, the array elements afterwards
// will be the coalesced op and the remainder of the literal string.
static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
// Copying and assignment are disabled.
CoalesceWalker(const CoalesceWalker&) = delete;
CoalesceWalker& operator=(const CoalesceWalker&) = delete;
};
// Walker subclass used by Simplify.
// The simplify walk is purely post-recursive: given the simplified children,
// PostVisit creates the simplified result.
// The child_args are simplified Regexp*s.
// The class cannot be copied or assigned.
class SimplifyWalker : public Regexp::Walker<Regexp*> {
public:
SimplifyWalker() {}
// Stops the descent at sub-expressions that are already simple.
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
// Builds the simplified replacement for re from its simplified children.
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
Regexp** child_args, int nchild_args);
// Returns re->Incref(); no new node is built.
virtual Regexp* Copy(Regexp* re);
// Not expected to be called; logs and returns re->Incref().
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
private:
// These functions are declared inside SimplifyWalker so that
// they can edit the private fields of the Regexps they construct.
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
// Caller must Decref return value when done with it.
static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
// Simplifies the expression re{min,max} in terms of *, +, and ?.
// max == -1 means there is no upper bound.
// Returns a new regexp. Does not edit re. Does not consume reference to re.
// Caller must Decref return value when done with it.
static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
Regexp::ParseFlags parse_flags);
// Simplifies a character class by expanding any named classes
// into rune ranges. Does not edit re. Does not consume ref to re.
// Caller must Decref return value when done with it.
static Regexp* SimplifyCharClass(Regexp* re);
// Copying and assignment are disabled.
SimplifyWalker(const SimplifyWalker&) = delete;
SimplifyWalker& operator=(const SimplifyWalker&) = delete;
};
// Returns a simplified version of this regexp as a new Regexp.
// The result uses traditional Unix egrep features only, plus the Perl (?:)
// non-capturing parentheses; no other POSIX or Perl additions remain. It
// captures exactly the same subexpressions (with the same indices) as the
// original.
// Works in two passes: a CoalesceWalker pass followed by a SimplifyWalker
// pass over the coalesced tree.
// Does not edit the current object.
// Caller must Decref() the return value when done with it.
Regexp* Regexp::Simplify() {
  CoalesceWalker coalescer;
  Regexp* coalesced = coalescer.Walk(this, NULL);
  if (coalesced == NULL)
    return NULL;

  SimplifyWalker simplifier;
  Regexp* simplified = simplifier.Walk(coalesced, NULL);
  // The intermediate tree is no longer needed once it has been walked.
  coalesced->Decref();
  return simplified;
}
#define Simplify DontCallSimplify // Avoid accidental recursion
// Helper for PostVisit implementations: reports whether any entry of
// child_args differs from the corresponding entry of re->sub().
// Returns true as soon as a difference is found, leaving child_args
// untouched. In the common case where nothing changed, calls Decref() on
// every child_args entry and returns false, so PostVisit must then return
// re->Incref().
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
  const int nsub = re->nsub();
  Regexp** subs = re->sub();

  int same = 0;
  while (same < nsub && child_args[same] == subs[same])
    same++;
  if (same < nsub)
    return true;

  // Nothing changed: drop the references held through child_args.
  for (int i = 0; i < nsub; i++)
    child_args[i]->Decref();
  return false;
}
// Override of the Walker Copy() hook: returns re->Incref() instead of
// building a duplicate node.
Regexp* CoalesceWalker::Copy(Regexp* re) {
return re->Incref();
}
// Override of the Walker ShortVisit() hook. Reaching it is treated as a
// programming error (logged at DFATAL); it returns re->Incref().
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
// This should never be called, since we use Walk and not
// WalkExponential.
LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
return re->Incref();
}
// Produces the coalesced replacement for re, given child_args, the results
// already computed for its sub-expressions.
// - A node without sub-expressions is returned as is (via Incref()).
// - If no adjacent pair of children can be coalesced (always the case for
//   anything but a concatenation), re is returned via Incref() when no child
//   changed, and otherwise rebuilt around the new children.
// - Otherwise adjacent children are coalesced pairwise and a new
//   concatenation is built from every child that is not an empty match.
Regexp* CoalesceWalker::PostVisit(Regexp* re,
                                  Regexp* parent_arg,
                                  Regexp* pre_arg,
                                  Regexp** child_args,
                                  int nchild_args) {
  const int nsub = re->nsub();
  if (nsub == 0)
    return re->Incref();

  // Only a concatenation can have neighbouring children that coalesce.
  bool mergeable = false;
  if (re->op() == kRegexpConcat) {
    for (int i = 0; i + 1 < nsub && !mergeable; i++)
      mergeable = CanCoalesce(child_args[i], child_args[i + 1]);
  }

  if (!mergeable) {
    if (!ChildArgsChanged(re, child_args))
      return re->Incref();

    // Something changed. Build a new op around the new children.
    Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
    rebuilt->AllocSub(nsub);
    Regexp** dst = rebuilt->sub();
    for (int i = 0; i < nsub; i++)
      dst[i] = child_args[i];

    // Repeats and Captures have additional data that must be copied.
    if (re->op() == kRegexpCapture) {
      rebuilt->cap_ = re->cap();
    } else if (re->op() == kRegexpRepeat) {
      rebuilt->min_ = re->min();
      rebuilt->max_ = re->max();
    }
    return rebuilt;
  }

  // Coalesce left to right; a coalesced result may coalesce again with the
  // child that follows it.
  for (int i = 0; i + 1 < nsub; i++) {
    if (CanCoalesce(child_args[i], child_args[i + 1]))
      DoCoalesce(&child_args[i], &child_args[i + 1]);
  }

  // Count the children that survive, i.e. that are not empty matches.
  int kept = 0;
  for (int i = 0; i < nsub; i++) {
    if (child_args[i]->op() != kRegexpEmptyMatch)
      kept++;
  }

  // Build a new op holding only the survivors.
  Regexp* merged = new Regexp(re->op(), re->parse_flags());
  merged->AllocSub(kept);
  Regexp** out = merged->sub();
  for (int i = 0; i < nsub; i++) {
    Regexp* child = child_args[i];
    if (child->op() == kRegexpEmptyMatch)
      child->Decref();
    else
      *out++ = child;
  }
  return merged;
}
// Reports whether the adjacent concatenation elements r1 and r2 can be
// merged into one repeat.
// r1 must be a star/plus/quest/repeat whose operand is a literal, char class,
// any char or any byte. r2 must then be one of:
//  - a star/plus/quest/repeat of an equal operand with the same NonGreedy
//    flag as r1;
//  - an occurrence of that operand itself;
//  - a literal string that begins with that literal, with the same FoldCase
//    flag as the literal.
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
  const bool r1_repeats = r1->op() == kRegexpStar ||
                          r1->op() == kRegexpPlus ||
                          r1->op() == kRegexpQuest ||
                          r1->op() == kRegexpRepeat;
  if (!r1_repeats)
    return false;

  Regexp* operand = r1->sub()[0];
  const bool operand_ok = operand->op() == kRegexpLiteral ||
                          operand->op() == kRegexpCharClass ||
                          operand->op() == kRegexpAnyChar ||
                          operand->op() == kRegexpAnyByte;
  if (!operand_ok)
    return false;

  // r2 repeats the same operand; the parse flags must be consistent.
  const bool r2_repeats = r2->op() == kRegexpStar ||
                          r2->op() == kRegexpPlus ||
                          r2->op() == kRegexpQuest ||
                          r2->op() == kRegexpRepeat;
  if (r2_repeats &&
      Regexp::Equal(operand, r2->sub()[0]) &&
      ((r1->parse_flags() & Regexp::NonGreedy) ==
       (r2->parse_flags() & Regexp::NonGreedy)))
    return true;

  // r2 is an occurrence of the operand itself.
  if (Regexp::Equal(operand, r2))
    return true;

  // r2 is a literal string that begins with the literal; the parse flags
  // must be consistent.
  return operand->op() == kRegexpLiteral &&
         r2->op() == kRegexpLiteralString &&
         r2->runes()[0] == operand->rune() &&
         ((operand->parse_flags() & Regexp::FoldCase) ==
          (r2->parse_flags() & Regexp::FoldCase));
}
// Merges the adjacent elements *r1ptr and *r2ptr into a single repeat of
// r1's operand whose bounds are the sum of what r1 and r2 each contribute
// (max == -1 stands for "no upper bound").
// On success the two slots are overwritten -- with an empty match and the
// repeat, or with the repeat and the remainder of a literal string -- and
// the references to the old r1 and r2 are dropped.
// On an unexpected op the failure is logged, the new repeat is released and
// both slots are left unchanged.
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
Regexp* r1 = *r1ptr;
Regexp* r2 = *r2ptr;
// Start from a repeat of r1's operand; the bounds are filled in below.
Regexp* nre = Regexp::Repeat(
r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
// Bounds contributed by r1.
switch (r1->op()) {
case kRegexpStar:
nre->min_ = 0;
nre->max_ = -1;
break;
case kRegexpPlus:
nre->min_ = 1;
nre->max_ = -1;
break;
case kRegexpQuest:
nre->min_ = 0;
nre->max_ = 1;
break;
case kRegexpRepeat:
nre->min_ = r1->min();
nre->max_ = r1->max();
break;
default:
LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
nre->Decref();
return;
}
// Add the bounds contributed by r2. An unbounded max (-1) stays unbounded.
switch (r2->op()) {
case kRegexpStar:
nre->max_ = -1;
goto LeaveEmpty;
case kRegexpPlus:
nre->min_++;
nre->max_ = -1;
goto LeaveEmpty;
case kRegexpQuest:
if (nre->max() != -1)
nre->max_++;
goto LeaveEmpty;
case kRegexpRepeat:
nre->min_ += r2->min();
if (r2->max() == -1)
nre->max_ = -1;
else if (nre->max() != -1)
nre->max_ += r2->max();
goto LeaveEmpty;
// A single occurrence of the operand counts as exactly one repetition.
case kRegexpLiteral:
case kRegexpCharClass:
case kRegexpAnyChar:
case kRegexpAnyByte:
nre->min_++;
if (nre->max() != -1)
nre->max_++;
goto LeaveEmpty;
// Shared exit for every case in which r2 is absorbed completely:
// the first slot becomes an empty match, the second the merged repeat.
LeaveEmpty:
*r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
*r2ptr = nre;
break;
case kRegexpLiteralString: {
Rune r = r1->sub()[0]->rune();
// Determine how much of the literal string is removed.
// We know that we have at least one rune. :)
int n = 1;
while (n < r2->nrunes() && r2->runes()[n] == r)
n++;
// The n leading runes equal to r become n more mandatory repetitions.
nre->min_ += n;
if (nre->max() != -1)
nre->max_ += n;
// The whole string was absorbed: same outcome as the other cases.
if (n == r2->nrunes())
goto LeaveEmpty;
// Otherwise the repeat goes first, followed by the rest of the string.
*r1ptr = nre;
*r2ptr = Regexp::LiteralString(
&r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
break;
}
default:
LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
nre->Decref();
return;
}
// The slots no longer point at the old r1 and r2; release them.
r1->Decref();
r2->Decref();
}
// Override of the Walker Copy() hook: returns re->Incref() instead of
// building a duplicate node.
Regexp* SimplifyWalker::Copy(Regexp* re) {
return re->Incref();
}
// Override of the Walker ShortVisit() hook. Reaching it is treated as a
// programming error (logged at DFATAL); it returns re->Incref().
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
// This should never be called, since we use Walk and not
// WalkExponential.
LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
return re->Incref();
}
// Called before the children of re are visited. A regexp that is already
// simple needs no work: *stop is set to true and re is returned via
// Incref(). Otherwise returns NULL and leaves *stop alone.
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
  if (!re->simple())
    return NULL;
  *stop = true;
  return re->Incref();
}
// Produces the simplified replacement for re, given child_args, the already
// simplified results for its sub-expressions. The references held through
// child_args are either stored in the returned node, returned directly, or
// released here. Every node returned from a handled case, other than a
// child handed back unchanged, has simple_ set to true.
Regexp* SimplifyWalker::PostVisit(Regexp* re,
Regexp* parent_arg,
Regexp* pre_arg,
Regexp** child_args,
int nchild_args) {
switch (re->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
// All these are always simple.
re->simple_ = true;
return re->Incref();
case kRegexpConcat:
case kRegexpAlternate: {
// These are simple as long as the subpieces are simple.
// ChildArgsChanged() has already released child_args when it returns false.
if (!ChildArgsChanged(re, child_args)) {
re->simple_ = true;
return re->Incref();
}
// Some child was replaced: rebuild the node around the new children.
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub());
Regexp** nre_subs = nre->sub();
for (int i = 0; i < re->nsub(); i++)
nre_subs[i] = child_args[i];
nre->simple_ = true;
return nre;
}
case kRegexpCapture: {
Regexp* newsub = child_args[0];
// Child unchanged: release the extra reference and reuse re.
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
// Child replaced: build a new capture with the same capture index.
Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->cap_ = re->cap();
nre->simple_ = true;
return nre;
}
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest: {
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
// These are simple as long as the subpiece is simple.
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
// These are also idempotent if flags are constant.
// (The same operator applied twice with equal flags collapses to the child.)
if (re->op() == newsub->op() &&
re->parse_flags() == newsub->parse_flags())
return newsub;
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->simple_ = true;
return nre;
}
case kRegexpRepeat: {
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
// SimplifyRepeat() does not consume newsub, so it is released afterwards.
Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
re->parse_flags());
newsub->Decref();
nre->simple_ = true;
return nre;
}
case kRegexpCharClass: {
Regexp* nre = SimplifyCharClass(re);
nre->simple_ = true;
return nre;
}
}
// Reached only for an op that none of the cases above handles.
LOG(ERROR) << "Simplify case not handled: " << re->op();
return re->Incref();
}
// Builds the two-element concatenation re1 re2 with the given parse flags.
// Takes over the caller's references to re1 and re2 and returns a new
// Regexp whose reference belongs to the caller.
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
                                Regexp::ParseFlags parse_flags) {
  Regexp* pair = new Regexp(kRegexpConcat, parse_flags);
  pair->AllocSub(2);
  Regexp** slots = pair->sub();
  slots[0] = re1;
  slots[1] = re2;
  return pair;
}
// Rewrites the counted repetition re{min,max} using only *, + and ?.
// max == -1 denotes a repetition without upper bound, re{min,}.
// Returns a new regexp that the caller must Decref when done with it.
// Does not edit re and does not consume a reference to it.
// The result will *not* necessarily have the right capturing parens
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
// but in the Regexp* representation, both (x) are marked as $1.
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
                                       Regexp::ParseFlags f) {
  if (max == -1) {
    // x{n,} means at least n matches of x.
    // x{0,} is x* and x{1,} is x+.
    if (min == 0)
      return Regexp::Star(re->Incref(), f);
    if (min == 1)
      return Regexp::Plus(re->Incref(), f);

    // General case: min-1 copies of x followed by x+, e.g. x{4,} is xxxx+.
    const int last = min - 1;
    PODArray<Regexp*> pieces(min);
    for (int i = 0; i < last; i++)
      pieces[i] = re->Incref();
    pieces[last] = Regexp::Plus(re->Incref(), f);
    return Regexp::Concat(pieces.data(), min, f);
  }

  // (x){0} matches only the empty string.
  if (min == 0 && max == 0)
    return new Regexp(kRegexpEmptyMatch, f);

  // x{1} is just x.
  if (min == 1 && max == 1)
    return re->Incref();

  // General case: x{n,m} is n copies of x followed by m-n optional copies.
  // The machine will do less work if the optional copies are nested,
  // so that x{2,5} = xx(x(x(x)?)?)?

  // Mandatory prefix: min copies of x.
  Regexp* result = NULL;
  if (min > 0) {
    PODArray<Regexp*> copies(min);
    for (int i = 0; i < min; i++)
      copies[i] = re->Incref();
    result = Regexp::Concat(copies.data(), min, f);
  }

  // Optional suffix, built from the innermost x? outwards: (x(x(x)?)?)?
  if (max > min) {
    Regexp* optional = Regexp::Quest(re->Incref(), f);
    for (int i = min + 1; i < max; i++)
      optional = Regexp::Quest(Concat2(re->Incref(), optional, f), f);
    result = (result == NULL) ? optional : Concat2(result, optional, f);
  }

  if (result == NULL) {
    // Some degenerate case, like min > max, or min < max < 0.
    // This shouldn't happen, because the parser rejects such regexps.
    LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
    return new Regexp(kRegexpNoMatch, f);
  }
  return result;
}
// Simplifies the character class held by re: an empty class becomes a
// no-match regexp, a full class becomes an any-char regexp (both keeping
// re's parse flags), and any other class is returned as re itself via
// Incref().
// Caller must Decref return value when done with it.
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
  CharClass* chars = re->cc();
  Regexp* simplified;
  if (chars->empty()) {
    simplified = new Regexp(kRegexpNoMatch, re->parse_flags());
  } else if (chars->full()) {
    simplified = new Regexp(kRegexpAnyChar, re->parse_flags());
  } else {
    simplified = re->Incref();
  }
  return simplified;
}
} // namespace re2

65
extern/re2/re2/stringpiece.cc vendored Normal file
View File

@ -0,0 +1,65 @@
// Copyright 2004 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/stringpiece.h"
#include <ostream>
#include "util/util.h"
namespace re2 {
const StringPiece::size_type StringPiece::npos; // initialized in stringpiece.h
// Copies at most n characters, starting at offset pos, into buf and returns
// the number of characters copied. Nothing is copied and 0 is returned when
// pos lies at or beyond the end of the piece; previously pos > size() made
// size_ - pos wrap around, so memcpy read past the end of the piece.
// buf must have room for the returned number of characters; no terminating
// NUL is written.
StringPiece::size_type StringPiece::copy(char* buf, size_type n,
                                         size_type pos) const {
  if (pos >= size_)
    return 0;
  size_type ret = std::min(size_ - pos, n);
  // Skip memcpy for an empty copy so it is never handed a NULL buf.
  if (ret > 0)
    memcpy(buf, data_ + pos, ret);
  return ret;
}
// Returns the sub-piece that starts at offset pos and spans at most n
// characters. pos is clamped to size() and n to the characters remaining
// after pos, so the result always lies within this piece.
StringPiece StringPiece::substr(size_type pos, size_type n) const {
  const size_type start = std::min(pos, size_);
  const size_type count = std::min(n, size_ - start);
  return StringPiece(data_ + start, count);
}
// Returns the offset of the first occurrence of s at or after pos, or npos
// if there is none or if pos > size(). An empty s is found at pos.
StringPiece::size_type StringPiece::find(const StringPiece& s,
                                         size_type pos) const {
  if (pos > size_)
    return npos;
  const_pointer hit = std::search(data_ + pos, data_ + size_,
                                  s.data_, s.data_ + s.size_);
  const size_type offset = hit - data_;
  // std::search reports "not found" as the end of the range, where a
  // non-empty s no longer fits.
  if (offset + s.size_ > size_)
    return npos;
  return offset;
}
// Returns the offset of the first occurrence of c at or after pos, or npos
// if there is none, if the piece is empty, or if pos >= size().
StringPiece::size_type StringPiece::find(char c, size_type pos) const {
  if (size_ == 0 || pos >= size_)
    return npos;
  const_pointer end = data_ + size_;
  const_pointer hit = std::find(data_ + pos, end, c);
  if (hit == end)
    return npos;
  return hit - data_;
}
// Returns the offset of the last occurrence of s that starts at or before
// pos, or npos if there is none. An empty s is found at min(size(), pos).
StringPiece::size_type StringPiece::rfind(const StringPiece& s,
                                          size_type pos) const {
  if (size_ < s.size_)
    return npos;
  if (s.size_ == 0)
    return std::min(size_, pos);
  // A match may start no later than min(size_ - s.size_, pos), so the
  // searched range ends s.size_ characters after that offset.
  const size_type latest_start = std::min(size_ - s.size_, pos);
  const_pointer limit = data_ + latest_start + s.size_;
  const_pointer hit = std::find_end(data_, limit, s.data_, s.data_ + s.size_);
  if (hit == limit)
    return npos;
  return hit - data_;
}
// Returns the offset of the last occurrence of c at or before pos, or npos
// if there is none or the piece is empty. With pos == npos (the default)
// the whole piece is searched.
StringPiece::size_type StringPiece::rfind(char c, size_type pos) const {
  if (size_ == 0)
    return npos;
  // Number of leading characters to examine: offsets [0, pos], clamped to
  // the piece. This is a comparison rather than std::min(pos + 1, size_)
  // because pos + 1 wraps around to 0 when pos == npos, which made the
  // default-argument call skip the loop and always return npos.
  size_type i = pos < size_ ? pos + 1 : size_;
  while (i != 0) {
    --i;
    if (data_[i] == c)
      return i;
  }
  return npos;
}
// Writes the size() characters of p to o as unformatted output (via
// ostream::write) and returns the stream.
std::ostream& operator<<(std::ostream& o, const StringPiece& p) {
  return o.write(p.data(), p.size());
}
} // namespace re2

210
extern/re2/re2/stringpiece.h vendored Normal file
View File

@ -0,0 +1,210 @@
// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_STRINGPIECE_H_
#define RE2_STRINGPIECE_H_
// A string-like object that points to a sized piece of memory.
//
// Functions or methods may use const StringPiece& parameters to accept either
// a "const char*" or a "string" value that will be implicitly converted to
// a StringPiece. The implicit conversion means that it is often appropriate
// to include this .h file in other files rather than forward-declaring
// StringPiece as would be appropriate for most other Google classes.
//
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
// conversions from "const char*" to "string" and back again.
//
//
// Arghh! I wish C++ literals were "string".
// Doing this simplifies the logic below.
#ifndef __has_include
#define __has_include(x) 0
#endif
#include <stddef.h>
#include <string.h>
#include <algorithm>
#include <iosfwd>
#include <iterator>
#include <string>
#if __has_include(<string_view>) && __cplusplus >= 201703L
#include <string_view>
#endif
namespace re2 {
// A view of a contiguous run of chars: a pointer plus a length.
// StringPiece neither copies nor frees the memory it points at, so that
// memory must stay valid for as long as the piece is used. A
// default-constructed piece has a NULL data() and size 0.
class StringPiece {
 public:
  typedef std::char_traits<char> traits_type;
  typedef char value_type;
  typedef char* pointer;
  typedef const char* const_pointer;
  typedef char& reference;
  typedef const char& const_reference;
  typedef const char* const_iterator;
  typedef const_iterator iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef const_reverse_iterator reverse_iterator;
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;
  // Returned by the find functions when nothing is found; also the default
  // "up to the end" length/position argument.
  static const size_type npos = static_cast<size_type>(-1);

  // We provide non-explicit singleton constructors so users can pass
  // in a "const char*" or a "string" wherever a "StringPiece" is
  // expected.
  StringPiece()
      : data_(NULL), size_(0) {}
#if __has_include(<string_view>) && __cplusplus >= 201703L
  StringPiece(const std::string_view& str)
      : data_(str.data()), size_(str.size()) {}
#endif
  StringPiece(const std::string& str)
      : data_(str.data()), size_(str.size()) {}
  // A NULL str yields an empty piece; otherwise str must be NUL-terminated.
  StringPiece(const char* str)
      : data_(str), size_(str == NULL ? 0 : strlen(str)) {}
  StringPiece(const char* str, size_type len)
      : data_(str), size_(len) {}

  const_iterator begin() const { return data_; }
  const_iterator end() const { return data_ + size_; }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(data_ + size_);
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(data_);
  }

  size_type size() const { return size_; }
  size_type length() const { return size_; }
  bool empty() const { return size_ == 0; }

  // No bounds checking: i must be less than size().
  const_reference operator[](size_type i) const { return data_[i]; }
  const_pointer data() const { return data_; }

  // Drops the first n characters. No bounds checking: n must not exceed
  // size().
  void remove_prefix(size_type n) {
    data_ += n;
    size_ -= n;
  }

  // Drops the last n characters. No bounds checking: n must not exceed
  // size().
  void remove_suffix(size_type n) {
    size_ -= n;
  }

  // Re-points the piece at str; a NULL str yields an empty piece.
  void set(const char* str) {
    data_ = str;
    size_ = str == NULL ? 0 : strlen(str);
  }

  void set(const char* str, size_type len) {
    data_ = str;
    size_ = len;
  }

  // Converts to `std::basic_string`.
  template <typename A>
  explicit operator std::basic_string<char, traits_type, A>() const {
    if (!data_) return {};
    return std::basic_string<char, traits_type, A>(data_, size_);
  }

  // Returns a std::string copy of the viewed characters. Like the
  // conversion operator above, a piece with NULL data() converts to an
  // empty string instead of handing a NULL pointer to std::string.
  std::string as_string() const {
    if (data_ == NULL) return std::string();
    return std::string(data_, size_);
  }

  // We also define ToString() here, since many other string-like
  // interfaces name the routine that converts to a C++ string
  // "ToString", and it's confusing to have the method that does that
  // for a StringPiece be called "as_string()". We also leave the
  // "as_string()" method defined here for existing code.
  std::string ToString() const {
    if (data_ == NULL) return std::string();
    return std::string(data_, size_);
  }

  // Replaces the contents of *target with the viewed characters; a piece
  // with NULL data() clears *target.
  void CopyToString(std::string* target) const {
    if (data_ == NULL) {
      target->clear();
      return;
    }
    target->assign(data_, size_);
  }

  // Appends the viewed characters to *target; a piece with NULL data()
  // appends nothing.
  void AppendToString(std::string* target) const {
    if (data_ == NULL) return;
    target->append(data_, size_);
  }

  size_type copy(char* buf, size_type n, size_type pos = 0) const;
  StringPiece substr(size_type pos = 0, size_type n = npos) const;

  // Three-way comparison: bytewise (memcmp) over the common prefix, then by
  // length. Returns -1, 0 or 1.
  int compare(const StringPiece& x) const {
    size_type min_size = std::min(size(), x.size());
    if (min_size > 0) {
      int r = memcmp(data(), x.data(), min_size);
      if (r < 0) return -1;
      if (r > 0) return 1;
    }
    if (size() < x.size()) return -1;
    if (size() > x.size()) return 1;
    return 0;
  }

  // Does "this" start with "x"?
  bool starts_with(const StringPiece& x) const {
    return x.empty() ||
           (size() >= x.size() && memcmp(data(), x.data(), x.size()) == 0);
  }

  // Does "this" end with "x"?
  bool ends_with(const StringPiece& x) const {
    return x.empty() ||
           (size() >= x.size() &&
            memcmp(data() + (size() - x.size()), x.data(), x.size()) == 0);
  }

  // Does "this" contain "s" anywhere?
  bool contains(const StringPiece& s) const {
    return find(s) != npos;
  }

  size_type find(const StringPiece& s, size_type pos = 0) const;
  size_type find(char c, size_type pos = 0) const;
  size_type rfind(const StringPiece& s, size_type pos = npos) const;
  size_type rfind(char c, size_type pos = npos) const;

 private:
  const_pointer data_;
  size_type size_;
};
// Two pieces are equal when they have the same length and the same bytes.
// memcmp is skipped when both are empty or both view the same memory.
inline bool operator==(const StringPiece& x, const StringPiece& y) {
  const StringPiece::size_type n = x.size();
  if (n != y.size())
    return false;
  if (x.data() == y.data() || n == 0)
    return true;
  return memcmp(x.data(), y.data(), n) == 0;
}
// Negation of operator== for StringPiece.
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
return !(x == y);
}
// Orders pieces bytewise (memcmp) over their common prefix; when the prefix
// is equal, the shorter piece sorts first.
inline bool operator<(const StringPiece& x, const StringPiece& y) {
  const StringPiece::size_type common = std::min(x.size(), y.size());
  if (common != 0) {
    const int r = memcmp(x.data(), y.data(), common);
    if (r != 0)
      return r < 0;
  }
  return x.size() < y.size();
}
// x > y is defined as y < x.
inline bool operator>(const StringPiece& x, const StringPiece& y) {
return y < x;
}
// x <= y is defined as !(x > y).
inline bool operator<=(const StringPiece& x, const StringPiece& y) {
return !(x > y);
}
// x >= y is defined as !(x < y).
inline bool operator>=(const StringPiece& x, const StringPiece& y) {
return !(x < y);
}
// Allow StringPiece to be logged.
std::ostream& operator<<(std::ostream& o, const StringPiece& p);
} // namespace re2
#endif // RE2_STRINGPIECE_H_

273
extern/re2/re2/testing/backtrack.cc vendored Normal file
View File

@ -0,0 +1,273 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
//
// Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
// except that it remembers where it has been, trading a lot of
// memory for a lot of time. It exists only for testing purposes.
//
// Let me repeat that.
//
// THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
// - It uses a ton of memory.
// - It uses a ton of stack.
// - It uses CHECK and LOG(FATAL).
// - It implements unanchored search by repeated anchored search.
//
// On the other hand, it is very simple and a good reference
// implementation for the more complicated regexp packages.
//
// In BUILD, this file is linked into the ":testing" library,
// not the main library, in order to make it harder to pick up
// accidentally.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "util/util.h"
#include "util/logging.h"
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
// Backtracker holds the state for a backtracking search.
//
// Excluding the search parameters, the main search state
// is just the "capture registers", which record, for the
// current execution, the string position at which each
// parenthesis was passed. cap_[0] and cap_[1] are the
// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
//
// To avoid infinite loops during backtracking on expressions
// like (a*)*, the visited_[] bitmap marks the (state, string-position)
// pairs that have already been explored and are thus not worth
// re-exploring if we get there via another path. Modern backtracking
// libraries engineer their program representation differently, to make
// such infinite loops possible to avoid without keeping a giant visited_
// bitmap, but visited_ works fine for a reference implementation
// and it has the nice benefit of making the search run in linear time.
class Backtracker {
public:
explicit Backtracker(Prog* prog);
~Backtracker();
// Runs one search of text (within context) and returns whether a match was
// found. On a match, up to nsubmatch entries of submatch are filled in from
// the capture registers. anchored and longest are combined with the
// program's own start/end anchoring.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
private:
// Explores from instruction id at string position p looking for a match.
// Returns true if found (so that caller can stop trying other possibilities).
bool Visit(int id, const char* p);
// Tries instruction id at string position p.
// Returns true if a match is found.
bool Try(int id, const char* p);
// Search parameters
Prog* prog_; // program being run
StringPiece text_; // text being searched
StringPiece context_; // greater context of text being searched
bool anchored_; // whether search is anchored at text.begin()
bool longest_; // whether search wants leftmost-longest match
bool endmatch_; // whether search must end at text.end()
StringPiece *submatch_; // submatches to fill in
int nsubmatch_; // # of submatches to fill in
// Search state
const char* cap_[64]; // capture registers
uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked
size_t nvisited_; // # of words in bitmap
};
// Binds the backtracker to prog. The search parameters get neutral values
// and no visited_ bitmap is allocated yet; Search() sets both up per call.
// cap_ is not initialized here (Search() clears it).
Backtracker::Backtracker(Prog* prog)
: prog_(prog),
anchored_(false),
longest_(false),
endmatch_(false),
submatch_(NULL),
nsubmatch_(0),
visited_(NULL),
nvisited_(0) {
}
// Releases the visited_ bitmap allocated by the last Search(), if any.
Backtracker::~Backtracker() {
delete[] visited_;
}
// Runs a backtracking search of text, which lies within context (a context
// whose begin() is NULL is replaced by text).
// The search is anchored at text.begin() if anchored is set or the program
// is anchored at the start, and prefers the longest match if longest is set
// or the program is anchored at the end.
// Returns true if a match was found; the first nsubmatch entries of submatch
// then describe it.
bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
                         bool anchored, bool longest,
                         StringPiece* submatch, int nsubmatch) {
  text_ = text;
  context_ = context;
  if (context_.begin() == NULL)
    context_ = text;

  // A program anchored at either end cannot match if text does not reach
  // that end of the context.
  if (prog_->anchor_start() && text.begin() > context_.begin())
    return false;
  if (prog_->anchor_end() && text.end() < context_.end())
    return false;

  anchored_ = anchored | prog_->anchor_start();
  longest_ = longest | prog_->anchor_end();
  endmatch_ = prog_->anchor_end();
  submatch_ = submatch;
  nsubmatch_ = nsubmatch;
  CHECK_LT(2*nsubmatch_, static_cast<int>(arraysize(cap_)));
  memset(cap_, 0, sizeof cap_);

  // submatch_[0] is used for our own bookkeeping, so it has to exist even
  // if the caller asked for no submatches.
  StringPiece sp0;
  if (nsubmatch < 1) {
    submatch_ = &sp0;
    nsubmatch_ = 1;
  }
  submatch_[0] = StringPiece();

  // The visited_ bitmap has one bit per (instruction, text position) pair,
  // so its size depends on text and it is reallocated on every call.
  delete[] visited_;
  nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
  visited_ = new uint32_t[nvisited_];
  memset(visited_, 0, nvisited_*sizeof visited_[0]);

  if (anchored_) {
    // Anchored search must start at text.begin().
    cap_[0] = text.begin();
    return Visit(prog_->start(), text.begin());
  }

  // Unanchored search: try every start position from left to right,
  // including text.end() so that an empty match at the end can be found.
  // The first start position that matches is the leftmost match.
  const char* p = text.begin();
  while (p <= text.end()) {
    cap_[0] = p;
    if (Visit(prog_->start(), p))
      return true;
    p++;
  }
  return false;
}
// Explores from instruction id at string position p looking for a match.
// Returns true if found (so that caller can stop trying other possibilities).
// Each (id, p) pair is explored at most once per Search(); a repeated visit
// returns false immediately.
bool Backtracker::Visit(int id, const char* p) {
  // Check bitmap.  If we've already explored from here,
  // either it didn't match or it did but we're hoping for a better match.
  // Either way, don't go down that road again.
  CHECK(p <= text_.end());
  size_t n = id*(text_.size()+1) + (p - text_.begin());
  CHECK_LT(n/32, nvisited_);
  // Build the mask in uint32_t, the bitmap's word type: with a plain int,
  // 1 << 31 shifts into the sign bit.
  const uint32_t bit = static_cast<uint32_t>(1) << (n&31);
  if (visited_[n/32] & bit)
    return false;
  visited_[n/32] |= bit;

  Prog::Inst* ip = prog_->inst(id);
  if (Try(id, p)) {
    // A longest-match search keeps exploring the instruction's remaining
    // alternatives in case one of them yields a longer match.
    if (longest_ && !ip->last())
      Visit(id+1, p);
    return true;
  }
  // No match from here: move on to the next instruction unless this was
  // the last one of its list.
  if (!ip->last())
    return Visit(id+1, p);
  return false;
}
// Tries instruction id at string position p.
// Returns true if a match is found.
// Dispatches on the instruction's opcode and continues the exploration
// through Visit() for every opcode that has a successor.
bool Backtracker::Try(int id, const char* p) {
// Pick out byte at current position. If at end of string,
// have to explore in hope of finishing a match. Use impossible byte -1.
int c = -1;
if (p < text_.end())
c = *p & 0xFF;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
return false; // not reached
case kInstAltMatch:
// Ignored.
return false;
case kInstByteRange:
// Consume one byte if it is in range and continue after it.
if (ip->Matches(c))
return Visit(ip->out(), p+1);
return false;
case kInstCapture:
// Capture indices outside cap_ are not recorded; the search just continues.
if (0 <= ip->cap() &&
ip->cap() < static_cast<int>(arraysize(cap_))) {
// Capture p to register, but save old value.
const char* q = cap_[ip->cap()];
cap_[ip->cap()] = p;
bool ret = Visit(ip->out(), p);
// Restore old value as we backtrack.
cap_[ip->cap()] = q;
return ret;
}
return Visit(ip->out(), p);
case kInstEmptyWidth:
// Fail if the instruction requires an empty-width flag that does not
// hold at p within context_.
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
return false;
return Visit(ip->out(), p);
case kInstNop:
return Visit(ip->out(), p);
case kInstMatch:
// We found a match. If it's the best so far, record the
// parameters in the caller's submatch_ array.
if (endmatch_ && p != context_.end())
return false;
// cap_[1] is the end of $0.
cap_[1] = p;
if (submatch_[0].data() == NULL || // First match so far ...
(longest_ && p > submatch_[0].end())) { // ... or better match
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] = StringPiece(
cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
}
return true;
case kInstFail:
return false;
}
}
// Runs a backtracking search.
bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
const StringPiece& context,
Anchor anchor,
MatchKind kind,
StringPiece* match,
int nmatch) {
// If full match, we ask for an anchored longest match
// and then check that match[0] == text.
// So make sure match[0] exists.
StringPiece sp0;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch < 1) {
match = &sp0;
nmatch = 1;
}
}
// Run the search.
Backtracker b(this);
bool anchored = anchor == kAnchored;
bool longest = kind != kFirstMatch;
if (!b.Search(text, context, anchored, longest, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
} // namespace re2

226
extern/re2/re2/testing/charclass_test.cc vendored Normal file
View File

@ -0,0 +1,226 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test character class manipulations.
#include <stdio.h>
#include "util/test.h"
#include "util/utf.h"
#include "re2/regexp.h"
namespace re2 {
// One character-class test case: ranges to add, an optional upper bound to
// cut at, and the ranges expected afterwards.
struct CCTest {
  // Inclusive rune range.  An entry whose lo is negative ends a list.
  struct Range {
    Rune lo;
    Rune hi;
  };

  Range add[10];    // ranges passed to AddRange(), in order
  int remove;       // RemoveAbove() argument; applied only when >= 0
  Range final[10];  // ranges expected in the resulting class
};
// Table of cases run by TEST(TestCharClassBuilder, Adds).  Each entry gives
// the ranges to add (list ended by a {-1} sentinel), the RemoveAbove() bound
// (applied only when it is >= 0), and the ranges expected afterwards (list
// also ended by a {-1} sentinel).
static CCTest tests[] = {
// Touching and overlapping ranges are expected to come out merged.
{ { { 10, 20 }, {-1} }, -1,
{ { 10, 20 }, {-1} } },
{ { { 10, 20 }, { 20, 30 }, {-1} }, -1,
{ { 10, 30 }, {-1} } },
{ { { 10, 20 }, { 30, 40 }, { 20, 30 }, {-1} }, -1,
{ { 10, 40 }, {-1} } },
{ { { 0, 50 }, { 20, 30 }, {-1} }, -1,
{ { 0, 50 }, {-1} } },
// Disjoint ranges stay separate and are expected in ascending order,
// whatever order they were added in.
{ { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }, -1,
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
// NOTE(review): the next entry is identical to the previous one.
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
// A final wide range swallows or bridges the earlier small ones.
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 5, 25 }, {-1} }, -1,
{ { 5, 25 }, {-1} } },
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 12, 21 }, {-1} }, -1,
{ { 10, 23 }, {-1} } },
// These check boundary cases during negation.
{ { { 0, Runemax }, {-1} }, -1,
{ { 0, Runemax }, {-1} } },
{ { { 0, 50 }, {-1} }, -1,
{ { 0, 50 }, {-1} } },
{ { { 50, Runemax }, {-1} }, -1,
{ { 50, Runemax }, {-1} } },
// Check RemoveAbove.
{ { { 50, Runemax }, {-1} }, 255,
{ { 50, 255 }, {-1} } },
{ { { 50, Runemax }, {-1} }, 65535,
{ { 50, 65535 }, {-1} } },
{ { { 50, Runemax }, {-1} }, Runemax,
{ { 50, Runemax }, {-1} } },
{ { { 50, 60 }, { 250, 260 }, { 350, 360 }, {-1} }, 255,
{ { 50, 60 }, { 250, 255 }, {-1} } },
{ { { 50, 60 }, {-1} }, 255,
{ { 50, 60 }, {-1} } },
// Everything lies above the bound, or nothing was added: expect empty.
{ { { 350, 360 }, {-1} }, 255,
{ {-1} } },
{ { {-1} }, 255,
{ {-1} } },
};
// Prints a diagnostic for a character class that failed a check.
// With a test case t: prints the ranges that were added, the RemoveAbove
// bound (if any) and the wanted ranges, then the ranges cc actually holds.
// With t == NULL: prints only a tab, desc and a colon before cc's ranges.
template<class CharClass>
static void Broke(const char *desc, const CCTest* t, CharClass* cc) {
  if (t != NULL) {
    printf("\n");
    printf("CharClass added: [%s]", desc);
    int k = 0;
    while (t->add[k].lo >= 0) {
      printf(" %d-%d", t->add[k].lo, t->add[k].hi);
      k++;
    }
    printf("\n");
    if (t->remove >= 0)
      printf("Removed > %d\n", t->remove);
    printf("\twant:");
    k = 0;
    while (t->final[k].lo >= 0) {
      printf(" %d-%d", t->final[k].lo, t->final[k].hi);
      k++;
    }
    printf("\n");
    printf("\thave:");
  } else {
    printf("\t%s:", desc);
  }

  // Ranges currently held by cc, in iteration order.
  typename CharClass::iterator range = cc->begin();
  while (range != cc->end()) {
    printf(" %d-%d", range->lo, range->hi);
    ++range;
  }
  printf("\n");
}
// Reports whether x lies inside any of the expected (final) ranges of t.
bool ShouldContain(CCTest *t, int x) {
  int idx = 0;
  while (t->final[idx].lo >= 0) {  // a negative lo ends the list
    if (x >= t->final[idx].lo && x <= t->final[idx].hi)
      return true;
    idx++;
  }
  return false;
}
// Overloads that let the templated CorrectCC treat CharClass and
// CharClassBuilder alike: Negate() hands back a separate negated object
// and Delete() disposes of it.

// CharClass: Negate() already returns the negated class.
CharClass* Negate(CharClass *cc) {
  return cc->Negate();
}

// CharClass objects are released through their own Delete() method.
void Delete(CharClass* cc) {
  cc->Delete();
}

// CharClassBuilder: negate a copy so that the argument is left untouched.
CharClassBuilder* Negate(CharClassBuilder* cc) {
  CharClassBuilder* negated = cc->Copy();
  negated->Negate();
  return negated;
}

// CharClassBuilder objects are plain heap objects.
void Delete(CharClassBuilder* cc) {
  delete cc;
}
// Checks cc against the expectations in t.  Verifies, in order: the exact
// list of ranges, the reported size, Contains() on the probe runes 0..99
// and Runemax, and finally the same probes plus the size on the negation
// of cc.  Prints a diagnostic (labelled with desc) and returns false at the
// first discrepancy; returns true when everything agrees.
template<class CharClass>
bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
  // Walk the wanted and actual range lists side by side.
  typename CharClass::iterator have = cc->begin();
  int want_size = 0;
  int j = 0;
  while (t->final[j].lo >= 0) {
    const bool same = have != cc->end() &&
                      have->lo == t->final[j].lo &&
                      have->hi == t->final[j].hi;
    if (!same) {
      Broke(desc, t, cc);
      return false;
    }
    want_size += have->hi - have->lo + 1;
    j++;
    ++have;
  }
  // cc must not hold ranges beyond the wanted ones.
  if (have != cc->end()) {
    Broke(desc, t, cc);
    return false;
  }
  if (cc->size() != want_size) {
    Broke(desc, t, cc);
    printf("wrong size: want %d have %d\n", want_size, cc->size());
    return false;
  }

  // Membership probes: 0 through 99, then Runemax.
  for (int i = 0; i <= 100; i++) {
    const int r = (i == 100) ? Runemax : i;
    if (ShouldContain(t, r) != cc->Contains(r)) {
      Broke(desc, t, cc);
      printf("want contains(%d)=%d, got %d\n",
             r, ShouldContain(t, r), cc->Contains(r));
      return false;
    }
  }

  // The negation must disagree on every probe and account for the
  // remaining runes.
  CharClass* ncc = Negate(cc);
  bool ok = true;
  for (int i = 0; ok && i <= 100; i++) {
    const int r = (i == 100) ? Runemax : i;
    if (ShouldContain(t, r) == ncc->Contains(r)) {
      Broke(desc, t, cc);
      Broke("ncc", NULL, ncc);
      printf("want ncc contains(%d)!=%d, got %d\n",
             r, ShouldContain(t, r), ncc->Contains(r));
      ok = false;
    } else if (ncc->size() != Runemax+1 - cc->size()) {
      Broke(desc, t, cc);
      Broke("ncc", NULL, ncc);
      printf("ncc size should be %d is %d\n",
             Runemax+1 - cc->size(), ncc->size());
      ok = false;
    }
  }
  Delete(ncc);
  return ok;
}
// Runs every entry of tests[]: builds the class, then validates the
// builder, the CharClass made from it, a copy of the builder, and a
// CharClass made after the copy.
TEST(TestCharClassBuilder, Adds) {
  int nfail = 0;
  for (CCTest& entry : tests) {
    CCTest* t = &entry;

    // Build the class the entry describes.
    CharClassBuilder ccb;
    for (int j = 0; t->add[j].lo >= 0; j++)
      ccb.AddRange(t->add[j].lo, t->add[j].hi);
    if (t->remove >= 0)
      ccb.RemoveAbove(t->remove);

    nfail += CorrectCC(&ccb, t, "before copy (CharClassBuilder)") ? 0 : 1;

    CharClass* frozen = ccb.GetCharClass();
    nfail += CorrectCC(frozen, t, "before copy (CharClass)") ? 0 : 1;
    frozen->Delete();

    CharClassBuilder* copy = ccb.Copy();
    nfail += CorrectCC(copy, t, "after copy (CharClassBuilder)") ? 0 : 1;

    // NOTE(review): this CharClass is taken from the original builder,
    // not from the copy -- confirm that is what was intended.
    CharClass* refrozen = ccb.GetCharClass();
    nfail += CorrectCC(refrozen, t, "after copy (CharClass)") ? 0 : 1;
    refrozen->Delete();

    delete copy;
  }
  EXPECT_EQ(nfail, 0);
}
} // namespace re2

397
extern/re2/re2/testing/compile_test.cc vendored Normal file
View File

@ -0,0 +1,397 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test prog.cc, compile.cc
#include <string>
#include "util/test.h"
#include "util/logging.h"
#include "re2/regexp.h"
#include "re2/prog.h"
namespace re2 {
// Simple input/output tests checking that
// the regexp compiles to the expected code.
// These are just to sanity check the basic implementation.
// The real confidence tests happen by testing the NFA/DFA
// that run the compiled code.
//
// One case: a pattern and the program listing it must compile to.
struct Test {
const char* regexp;  // pattern handed to Regexp::Parse()
const char* code;  // exact text expected from Prog::Dump()
};
// Cases for TEST(TestRegexpCompileToProg, Simple).  Each pattern is parsed
// with Regexp::PerlX|Regexp::Latin1, compiled, and the Prog::Dump() output
// is compared verbatim against the listing given here.
// NOTE(review): the listing syntax ("N." / "N+" prefixes, "-> M" targets)
// is produced by Prog::Dump(), which is defined outside this file; consult
// it before editing any expected text.
static Test tests[] = {
{ "a",
"3. byte [61-61] 0 -> 4\n"
"4. match! 0\n" },
{ "ab",
"3. byte [61-61] 0 -> 4\n"
"4. byte [62-62] 0 -> 5\n"
"5. match! 0\n" },
{ "a|c",
"3+ byte [61-61] 0 -> 5\n"
"4. byte [63-63] 0 -> 5\n"
"5. match! 0\n" },
{ "a|b",
"3. byte [61-62] 0 -> 4\n"
"4. match! 0\n" },
{ "[ab]",
"3. byte [61-62] 0 -> 4\n"
"4. match! 0\n" },
{ "a+",
"3. byte [61-61] 0 -> 4\n"
"4+ nop -> 3\n"
"5. match! 0\n" },
{ "a+?",
"3. byte [61-61] 0 -> 4\n"
"4+ match! 0\n"
"5. nop -> 3\n" },
{ "a*",
"3+ byte [61-61] 1 -> 3\n"
"4. match! 0\n" },
{ "a*?",
"3+ match! 0\n"
"4. byte [61-61] 0 -> 3\n" },
{ "a?",
"3+ byte [61-61] 1 -> 5\n"
"4. nop -> 5\n"
"5. match! 0\n" },
{ "a??",
"3+ nop -> 5\n"
"4. byte [61-61] 0 -> 5\n"
"5. match! 0\n" },
{ "a{4}",
"3. byte [61-61] 0 -> 4\n"
"4. byte [61-61] 0 -> 5\n"
"5. byte [61-61] 0 -> 6\n"
"6. byte [61-61] 0 -> 7\n"
"7. match! 0\n" },
{ "(a)",
"3. capture 2 -> 4\n"
"4. byte [61-61] 0 -> 5\n"
"5. capture 3 -> 6\n"
"6. match! 0\n" },
{ "(?:a)",
"3. byte [61-61] 0 -> 4\n"
"4. match! 0\n" },
{ "",
"3. match! 0\n" },
{ ".",
"3+ byte [00-09] 0 -> 5\n"
"4. byte [0b-ff] 0 -> 5\n"
"5. match! 0\n" },
{ "[^ab]",
"3+ byte [00-09] 0 -> 6\n"
"4+ byte [0b-60] 0 -> 6\n"
"5. byte [63-ff] 0 -> 6\n"
"6. match! 0\n" },
{ "[Aa]",
"3. byte/i [61-61] 0 -> 4\n"
"4. match! 0\n" },
{ "\\C+",
"3. byte [00-ff] 0 -> 4\n"
"4+ altmatch -> 5 | 6\n"
"5+ nop -> 3\n"
"6. match! 0\n" },
{ "\\C*",
"3+ altmatch -> 4 | 5\n"
"4+ byte [00-ff] 1 -> 3\n"
"5. match! 0\n" },
{ "\\C?",
"3+ byte [00-ff] 1 -> 5\n"
"4. nop -> 5\n"
"5. match! 0\n" },
// Issue 20992936
{ "[[-`]",
"3. byte [5b-60] 0 -> 4\n"
"4. match! 0\n" },
};
// Compiles every pattern in tests[] and compares the dumped program with
// the expected listing.  Failures are logged and counted so that all
// entries are exercised before the final check.
TEST(TestRegexpCompileToProg, Simple) {
  int failed = 0;
  for (const re2::Test& t : tests) {
    Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
    if (re == NULL) {
      failed++;
      LOG(ERROR) << "Cannot parse: " << t.regexp;
      continue;
    }

    Prog* prog = re->CompileToProg(0);
    if (prog == NULL) {
      failed++;
      LOG(ERROR) << "Cannot compile: " << t.regexp;
      re->Decref();
      continue;
    }

    // Compiling with a memory budget of 1 is expected to fail.
    ASSERT_TRUE(re->CompileToProg(1) == NULL);

    const std::string got = prog->Dump();
    if (got != t.code) {
      LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
      LOG(ERROR) << "Want:\n" << t.code;
      LOG(ERROR) << "Got:\n" << got;
      failed++;
    }

    delete prog;
    re->Decref();
  }
  EXPECT_EQ(failed, 0);
}
// Parses pattern with flags, compiles it, and stores the byte map dump of
// the resulting program (Prog::DumpByteMap()) in *bytemap.
static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags,
                        std::string* bytemap) {
  Regexp* re = Regexp::Parse(pattern, flags, NULL);
  EXPECT_TRUE(re != NULL);

  Prog* prog = re->CompileToProg(0);
  EXPECT_TRUE(prog != NULL);

  bytemap->operator=(prog->DumpByteMap());

  // Release the program first, then the regexp it was compiled from.
  delete prog;
  re->Decref();
}
// Checks the byte map dumped for "." in Latin-1 mode against a golden
// listing.
TEST(TestCompile, Latin1Ranges) {
// The distinct byte ranges involved in the Latin-1 dot ([^\n]).
std::string bytemap;
DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
// Expected: 0x0a gets its own class; every other byte shares class 0.
EXPECT_EQ("[00-09] -> 0\n"
"[0a-0a] -> 1\n"
"[0b-ff] -> 0\n",
bytemap);
}
// Further golden byte map checks in Latin-1 mode.  The same bytemap string
// is overwritten by each DumpByteMap() call before being compared.
TEST(TestCompile, OtherByteMapTests) {
std::string bytemap;
// Test that "absent" ranges are mapped to the same byte class.
DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
EXPECT_EQ("[00-2f] -> 0\n"
"[30-39] -> 1\n"
"[3a-40] -> 0\n"
"[41-46] -> 1\n"
"[47-60] -> 0\n"
"[61-66] -> 1\n"
"[67-ff] -> 0\n",
bytemap);
// Test the byte classes for \b.
DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
EXPECT_EQ("[00-2f] -> 0\n"
"[30-39] -> 1\n"
"[3a-40] -> 0\n"
"[41-5a] -> 1\n"
"[5b-5e] -> 0\n"
"[5f-5f] -> 1\n"
"[60-60] -> 0\n"
"[61-7a] -> 1\n"
"[7b-ff] -> 0\n",
bytemap);
// Bug in the ASCII case-folding optimization created too many byte classes.
DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
EXPECT_EQ("[00-5e] -> 0\n"
"[5f-5f] -> 1\n"
"[60-ff] -> 0\n",
bytemap);
}
// Checks the byte map dumped for "." without the Latin1 flag against a
// golden listing.
TEST(TestCompile, UTF8Ranges) {
// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
// Once, erroneously split between 0x3f and 0x40 because it is
// a 6-bit boundary.
std::string bytemap;
DumpByteMap(".", Regexp::PerlX, &bytemap);
// Note that [0b-7f] is expected as a single range, i.e. no split at
// 0x3f/0x40.
EXPECT_EQ("[00-09] -> 0\n"
"[0a-0a] -> 1\n"
"[0b-7f] -> 0\n"
"[80-8f] -> 2\n"
"[90-9f] -> 3\n"
"[a0-bf] -> 4\n"
"[c0-c1] -> 1\n"
"[c2-df] -> 5\n"
"[e0-e0] -> 6\n"
"[e1-ef] -> 7\n"
"[f0-f0] -> 8\n"
"[f1-f3] -> 9\n"
"[f4-f4] -> 10\n"
"[f5-ff] -> 1\n",
bytemap);
}
// Compiling with a memory budget that is too small must fail cleanly.
TEST(TestCompile, InsufficientMemory) {
  const char* const pattern =
      "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$";
  Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL);
  EXPECT_TRUE(re != NULL);

  // If the memory budget has been exhausted, compilation should fail
  // and return NULL instead of trying to do anything with NoMatch().
  Prog* prog = re->CompileToProg(920);
  EXPECT_TRUE(prog == NULL);

  re->Decref();
}
// Parses pattern with flags and stores the Prog::Dump() listing of the
// forward program in *forward and of the reverse program in *reverse.
// Either output pointer may be NULL, in which case that program is not
// compiled at all.
static void Dump(StringPiece pattern, Regexp::ParseFlags flags,
                 std::string* forward, std::string* reverse) {
  Regexp* re = Regexp::Parse(pattern, flags, NULL);
  EXPECT_TRUE(re != NULL);

  // Dumps a freshly compiled program into *out and frees the program.
  auto dump_into = [](Prog* prog, std::string* out) {
    EXPECT_TRUE(prog != NULL);
    *out = prog->Dump();
    delete prog;
  };

  if (forward != NULL)
    dump_into(re->CompileToProg(0), forward);
  if (reverse != NULL)
    dump_into(re->CompileToReverseProg(0), reverse);

  re->Decref();
}
// Golden listings for three Unicode character classes.  The forward and
// reverse strings are overwritten by each Dump() call before being compared.
TEST(TestCompile, Bug26705922) {
// Bug in the compiler caused inefficient bytecode to be generated for Unicode
// groups: common suffixes were cached, but common prefixes were not factored.
std::string forward, reverse;
Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n"
"4. byte [90-90] 0 -> 5\n"
"5. byte [80-80] 0 -> 6\n"
"6+ byte [80-80] 0 -> 8\n"
"7. byte [90-90] 0 -> 8\n"
"8. match! 0\n",
forward);
EXPECT_EQ("3+ byte [80-80] 0 -> 5\n"
"4. byte [90-90] 0 -> 5\n"
"5. byte [80-80] 0 -> 6\n"
"6. byte [90-90] 0 -> 7\n"
"7. byte [f0-f0] 0 -> 8\n"
"8. match! 0\n",
reverse);
Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n"
"4. byte [f0-f0] 0 -> 8\n"
"5. byte [80-bf] 0 -> 6\n"
"6. byte [80-bf] 0 -> 7\n"
"7. match! 0\n"
"8. byte [90-90] 0 -> 5\n",
forward);
EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
"4. byte [80-bf] 0 -> 5\n"
"5+ byte [e8-ef] 0 -> 7\n"
"6. byte [90-90] 0 -> 8\n"
"7. match! 0\n"
"8. byte [f0-f0] 0 -> 7\n",
reverse);
// Only the reverse program is requested (and checked) for this class.
Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse);
EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
"4+ byte [c2-df] 0 -> 7\n"
"5+ byte [a0-bf] 1 -> 8\n"
"6. byte [80-bf] 0 -> 9\n"
"7. match! 0\n"
"8. byte [e0-e0] 0 -> 7\n"
"9+ byte [e1-ef] 0 -> 7\n"
"10+ byte [90-bf] 1 -> 13\n"
"11+ byte [80-bf] 1 -> 14\n"
"12. byte [80-8f] 0 -> 15\n"
"13. byte [f0-f0] 0 -> 7\n"
"14. byte [f1-f3] 0 -> 7\n"
"15. byte [f4-f4] 0 -> 7\n",
reverse);
}
// Golden forward listings for three patterns with nested nullable
// subexpressions.  Only the forward program is dumped (reverse is NULL), and
// the forward string is overwritten by each Dump() call before being
// compared.
TEST(TestCompile, Bug35237384) {
// Bug in the compiler caused inefficient bytecode to be generated for
// nested nullable subexpressions.
std::string forward;
Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
EXPECT_EQ("3+ byte [61-61] 1 -> 3\n"
"4. nop -> 5\n"
"5+ byte [61-61] 1 -> 5\n"
"6. nop -> 7\n"
"7+ byte [61-61] 1 -> 7\n"
"8. match! 0\n",
forward);
Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
EXPECT_EQ("3+ nop -> 6\n"
"4+ nop -> 8\n"
"5. nop -> 21\n"
"6+ byte [61-61] 1 -> 6\n"
"7. nop -> 3\n"
"8+ byte [62-62] 1 -> 8\n"
"9. nop -> 3\n"
"10+ byte [61-61] 1 -> 10\n"
"11. nop -> 21\n"
"12+ byte [62-62] 1 -> 12\n"
"13. nop -> 21\n"
"14+ byte [61-61] 1 -> 14\n"
"15. nop -> 18\n"
"16+ byte [62-62] 1 -> 16\n"
"17. nop -> 18\n"
"18+ nop -> 14\n"
"19+ nop -> 16\n"
"20. match! 0\n"
"21+ nop -> 10\n"
"22+ nop -> 12\n"
"23. nop -> 18\n",
forward);
Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
EXPECT_EQ("3+ nop -> 36\n"
"4+ nop -> 31\n"
"5. nop -> 33\n"
"6+ byte [00-09] 0 -> 8\n"
"7. byte [0b-ff] 0 -> 8\n"
"8+ nop -> 6\n"
"9+ nop -> 29\n"
"10. nop -> 28\n"
"11+ byte [00-09] 0 -> 13\n"
"12. byte [0b-ff] 0 -> 13\n"
"13+ nop -> 11\n"
"14+ nop -> 26\n"
"15. nop -> 28\n"
"16+ byte [00-09] 0 -> 18\n"
"17. byte [0b-ff] 0 -> 18\n"
"18+ nop -> 16\n"
"19+ nop -> 36\n"
"20. nop -> 33\n"
"21+ byte [00-09] 0 -> 23\n"
"22. byte [0b-ff] 0 -> 23\n"
"23+ nop -> 21\n"
"24+ nop -> 31\n"
"25. nop -> 33\n"
"26+ nop -> 28\n"
"27. byte [53-53] 0 -> 11\n"
"28. match! 0\n"
"29+ nop -> 28\n"
"30. byte [53-53] 0 -> 6\n"
"31+ nop -> 33\n"
"32. byte [53-53] 0 -> 21\n"
"33+ nop -> 29\n"
"34+ nop -> 26\n"
"35. nop -> 28\n"
"36+ nop -> 33\n"
"37. byte [53-53] 0 -> 16\n",
forward);
}
} // namespace re2

381
extern/re2/re2/testing/dfa_test.cc vendored Normal file
View File

@ -0,0 +1,381 @@
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stdint.h>
#include <string>
#include <thread>
#include <vector>
#include "util/test.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/testing/regexp_generator.h"
#include "re2/testing/string_generator.h"
// Gates the heap-usage assertions in the SingleThreaded tests below; they
// are skipped while this is false.
static const bool UsingMallocCounter = false;
// FLAGS_size: number of "[ab]" repetitions in the pattern built by
// Multithreaded.BuildEntireDFA.
DEFINE_int32(size, 8, "log2(number of DFA nodes)");
// FLAGS_repeat: how many times the Multithreaded tests rerun their
// thread fan-out.
DEFINE_int32(repeat, 2, "Repetition count.");
// FLAGS_threads: number of threads started per fan-out.
DEFINE_int32(threads, 4, "number of threads");
namespace re2 {
// Check that multithreaded access to DFA class works.
// Helper function: builds entire DFA for prog.
// Used as the thread body in Multithreaded.BuildEntireDFA; asserts that
// BuildEntireDFA() returns a value that tests as true.
static void DoBuild(Prog* prog) {
ASSERT_TRUE(prog->BuildEntireDFA(Prog::kFirstMatch, nullptr));
}
// Builds the whole DFA for one program from several threads at once.
TEST(Multithreaded, BuildEntireDFA) {
  // Create regexp with 2^FLAGS_size states in DFA.
  std::string pattern("a");
  for (int i = 0; i < FLAGS_size; i++)
    pattern.append("[ab]");
  pattern.append("b");
  Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL);
  ASSERT_TRUE(re != NULL);

  // Baseline: a single worker thread must be able to build the DFA.
  {
    Prog* prog = re->CompileToProg(0);
    ASSERT_TRUE(prog != NULL);
    std::thread worker(DoBuild, prog);
    worker.join();
    delete prog;
  }

  // Build the DFA simultaneously in a bunch of threads.
  for (int round = 0; round < FLAGS_repeat; round++) {
    Prog* prog = re->CompileToProg(0);
    ASSERT_TRUE(prog != NULL);

    std::vector<std::thread> workers;
    for (int j = 0; j < FLAGS_threads; j++)
      workers.emplace_back(DoBuild, prog);
    for (std::thread& worker : workers)
      worker.join();

    // One more build on this thread, to make sure everything is okay.
    prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
    delete prog;
  }

  re->Decref();
}
// Check that DFA size requirements are followed.
// BuildEntireDFA will, like SearchDFA, stop building out
// the DFA once the memory limits are reached.
TEST(SingleThreaded, BuildEntireDFA) {
  // Create regexp with 2^30 states in DFA.
  Regexp* re = Regexp::Parse("a[ab]{30}b", Regexp::LikePerl, NULL);
  ASSERT_TRUE(re != NULL);

  // Try memory limits of 2^17 through 2^23.
  for (int shift = 17; shift <= 23; shift++) {
    const int64_t limit = int64_t{1} << shift;
    int64_t usage;
    //int64_t progusage, dfamem;
    {
      // The counter is scoped so that only this build is measured.
      testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
      Prog* prog = re->CompileToProg(limit);
      ASSERT_TRUE(prog != NULL);
      //progusage = m.HeapGrowth();
      //dfamem = prog->dfa_mem();
      prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
      prog->BuildEntireDFA(Prog::kLongestMatch, nullptr);
      usage = m.HeapGrowth();
      delete prog;
    }

    if (!UsingMallocCounter)
      continue;
    //LOG(INFO) << "limit " << limit << ", "
    //          << "prog usage " << progusage << ", "
    //          << "DFA budget " << dfamem << ", "
    //          << "total " << usage;
    // Tolerate +/- 10%.
    ASSERT_GT(usage, limit*9/10);
    ASSERT_LT(usage, limit*11/10);
  }

  re->Decref();
}
// Generates and returns a string over binary alphabet {0,1} that contains
// all possible binary sequences of length n as substrings.  The obvious
// brute force method would generate a string of length n * 2^n, but this
// generates a string of length n + 2^n - 1 called a De Bruijn cycle.
// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
// Such a string is useful for testing a DFA.  If you have a DFA
// where distinct last n bytes implies distinct states, then running on a
// DeBruijn string causes the DFA to need to create a new state at every
// position in the input, never reusing any states until it gets to the
// end of the string.  This is the worst possible case for DFA execution.
//
// n must be positive and small enough that 1<<n fits in an int.
static std::string DeBruijnString(int n) {
  // 1<<n is evaluated as an int below, so leave room for the sign bit.
  CHECK_LT(n, static_cast<int>(8*sizeof(int)) - 1);
  CHECK_GT(n, 0);

  // did[w] records whether the n-bit window w has been produced yet.
  // The sized constructor already initializes every element to false.
  std::vector<bool> did(size_t{1}<<n);

  // Start with n-1 zeros so that the first appended bit completes the
  // first n-bit window.
  std::string s(static_cast<size_t>(n-1), '0');

  int bits = 0;  // the most recent n bits of s
  int mask = (1<<n) - 1;
  for (int i = 0; i < (1<<n); i++) {
    bits <<= 1;
    bits &= mask;
    // Prefer appending a 1 whenever that yields a window not seen before.
    if (!did[bits|1]) {
      bits |= 1;
      s.append("1");
    } else {
      s.append("0");
    }
    // Every window must be new, otherwise this is not a De Bruijn cycle.
    CHECK(!did[bits]);
    did[bits] = true;
  }
  return s;
}
// Test that the DFA gets the right result even if it runs
// out of memory during a search.  The regular expression
// 0[01]{n}$ matches a binary string of 0s and 1s only if
// the (n+1)th-to-last character is a 0.  Matching this in
// a single forward pass (as done by the DFA) requires
// keeping one bit for each of the last n+1 characters
// (whether each was a 0), or 2^(n+1) possible states.
// If we run this regexp to search in a string that contains
// every possible n-character binary string as a substring,
// then it will have to run through at least 2^n states.
// States are big data structures -- certainly more than 1 byte --
// so if the DFA can search correctly while staying within a
// 2^n byte limit, it must be handling out-of-memory conditions
// gracefully.
TEST(SingleThreaded, SearchDFA) {
  // The De Bruijn string is the worst case input for this regexp.
  // By default, the DFA will notice that it is flushing its cache
  // too frequently and will bail out early, so that RE2 can use the
  // NFA implementation instead.  (The DFA loses its speed advantage
  // if it can't get a good cache hit rate.)
  // Tell the DFA to trudge along instead.
  Prog::TEST_dfa_should_bail_when_slow(false);

  // Choice of n is mostly arbitrary, except that:
  //   * making n too big makes the test run for too long.
  //   * making n too small makes the DFA refuse to run,
  //     because it has so little memory compared to the program size.
  // Empirically, n = 18 is a good compromise between the two.
  const int n = 18;

  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
                             Regexp::LikePerl, NULL);
  ASSERT_TRUE(re != NULL);

  // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
  // which is not a match for 0[01]{n}$.  Adding one more 0 is a match.
  const std::string no_match = DeBruijnString(n);
  const std::string match = no_match + "0";

  int64_t usage;
  int64_t peak_usage;
  {
    // The counter is scoped so that only the searches are measured.
    testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
    Prog* prog = re->CompileToProg(1<<n);
    ASSERT_TRUE(prog != NULL);
    for (int round = 0; round < 10; round++) {
      bool failed = false;
      bool matched =
          prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
                          Prog::kFirstMatch, NULL, &failed, NULL);
      ASSERT_FALSE(failed);
      ASSERT_TRUE(matched);
      matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
                                Prog::kFirstMatch, NULL, &failed, NULL);
      ASSERT_FALSE(failed);
      ASSERT_FALSE(matched);
    }
    usage = m.HeapGrowth();
    peak_usage = m.PeakHeapGrowth();
    delete prog;
  }

  if (UsingMallocCounter) {
    //LOG(INFO) << "usage " << usage << ", "
    //          << "peak usage " << peak_usage;
    ASSERT_LT(usage, 1<<n);
    ASSERT_LT(peak_usage, 1<<n);
  }
  re->Decref();

  // Reset to original behaviour.
  Prog::TEST_dfa_should_bail_when_slow(true);
}
// Thread body for Multithreaded.SearchDFA: twice over, runs an unanchored
// first-match DFA search for match (which must be found) and for no_match
// (which must not be), asserting each time that the DFA did not fail.
static void DoSearch(Prog* prog, const StringPiece& match,
                     const StringPiece& no_match) {
  bool failed = false;
  // Runs one DFA search over text; the failure status lands in `failed`.
  auto search = [&](const StringPiece& text) {
    return prog->SearchDFA(text, StringPiece(), Prog::kUnanchored,
                           Prog::kFirstMatch, NULL, &failed, NULL);
  };

  for (int pass = 0; pass < 2; pass++) {
    failed = false;
    bool matched = search(match);
    ASSERT_FALSE(failed);
    ASSERT_TRUE(matched);
    matched = search(no_match);
    ASSERT_FALSE(failed);
    ASSERT_FALSE(matched);
  }
}
// Runs the memory-constrained DFA search from several threads that share
// one program.
TEST(Multithreaded, SearchDFA) {
  // Keep the DFA going even when it has to flush its cache often.
  Prog::TEST_dfa_should_bail_when_slow(false);

  // Same as single-threaded test above.
  const int n = 18;
  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
                             Regexp::LikePerl, NULL);
  ASSERT_TRUE(re != NULL);
  std::string no_match = DeBruijnString(n);
  std::string match = no_match + "0";

  // Baseline: a single worker thread must get the right answers.
  {
    Prog* prog = re->CompileToProg(1<<n);
    ASSERT_TRUE(prog != NULL);
    std::thread worker(DoSearch, prog, match, no_match);
    worker.join();
    delete prog;
  }

  // Run the search simultaneously in a bunch of threads.
  // Reuses the same flags as Multithreaded.BuildEntireDFA above.
  for (int round = 0; round < FLAGS_repeat; round++) {
    Prog* prog = re->CompileToProg(1<<n);
    ASSERT_TRUE(prog != NULL);

    std::vector<std::thread> workers;
    for (int j = 0; j < FLAGS_threads; j++)
      workers.emplace_back(DoSearch, prog, match, no_match);
    for (std::thread& worker : workers)
      worker.join();

    delete prog;
  }
  re->Decref();

  // Reset to original behaviour.
  Prog::TEST_dfa_should_bail_when_slow(true);
}
// One case for TEST(DFA, ReverseMatch).
struct ReverseTest {
const char* regexp;  // pattern, compiled with CompileToReverseProg()
const char* text;  // text to search
bool match;  // whether the search is expected to report a match
};
// Test that reverse DFA handles anchored/unanchored correctly.
// It's in the DFA interface but not used by RE2.
ReverseTest reverse_tests[] = {
{ "\\A(a|b)", "abc", true },
{ "(a|b)\\z", "cba", true },
{ "\\A(a|b)", "cba", false },
{ "(a|b)\\z", "abc", false },
};
// Runs every entry of reverse_tests through a reverse program with an
// unanchored first-match DFA search and compares the outcome with the
// expectation.  A DFA failure is counted as a test failure rather than
// being mistaken for "no match".
TEST(DFA, ReverseMatch) {
  int nfail = 0;
  for (size_t i = 0; i < arraysize(reverse_tests); i++) {
    const ReverseTest& t = reverse_tests[i];
    Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
    ASSERT_TRUE(re != NULL);
    Prog* prog = re->CompileToReverseProg(0);
    ASSERT_TRUE(prog != NULL);
    bool failed = false;
    bool matched = prog->SearchDFA(t.text, StringPiece(), Prog::kUnanchored,
                                   Prog::kFirstMatch, NULL, &failed, NULL);
    if (failed) {
      // The DFA gave up, so `matched` carries no information.
      LOG(ERROR) << t.regexp << " on " << t.text << ": DFA search failed";
      nfail++;
    } else if (matched != t.match) {
      LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
      nfail++;
    }
    delete prog;
    re->Decref();
  }
  EXPECT_EQ(nfail, 0);
}
// One case for TEST(DFA, Callback).
struct CallbackTest {
const char* regexp;  // pattern to compile
const char* dump;  // expected text assembled from the BuildEntireDFA() callbacks
};
// Test that DFA::BuildAllStates() builds the expected DFA states
// and issues the expected callbacks. These test cases reflect the
// very compact encoding of the callbacks, but that also makes them
// very difficult to understand, so let's work through "\\Aa\\z".
// There are three slots per DFA state because the bytemap has two
// equivalence classes and there is a third slot for kByteEndText:
// 0: all bytes that are not 'a'
// 1: the byte 'a'
// 2: kByteEndText
// -1 means that there is no transition from that DFA state to any
// other DFA state for that slot. The valid transitions are thus:
// state 0 --slot 1--> state 1
// state 1 --slot 2--> state 2
// The double brackets indicate that state 2 is a matching state.
// Putting it together, this means that the DFA must consume the
// byte 'a' and then hit end of text. Q.E.D.
//
// In each dump string the states appear in callback order, separated by
// single spaces.
CallbackTest callback_tests[] = {
{ "\\Aa\\z", "[-1,1,-1] [-1,-1,2] [[-1,-1,-1]]" },
{ "\\Aab\\z", "[-1,1,-1,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
{ "\\Aa*b\\z", "[-1,0,1,-1] [-1,-1,-1,2] [[-1,-1,-1,-1]]" },
{ "\\Aa+b\\z", "[-1,1,-1,-1] [-1,1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
{ "\\Aa?b\\z", "[-1,1,2,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
{ "\\Aa\\C*\\z", "[-1,1,-1] [1,1,2] [[-1,-1,-1]]" },
{ "\\Aa\\C*", "[-1,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
{ "a\\C*", "[0,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
{ "\\C*", "[1,2] [[1,1]] [[-1,-1]]" },
{ "a", "[0,1,-1] [2,2,2] [[-1,-1,-1]]"} ,
};
// For every entry of callback_tests, builds the entire longest-match DFA
// and renders each state reported through the callback as "[a,b,...]"
// (double brackets for matching states), joined by spaces.  The rendering
// must equal the expected dump.
TEST(DFA, Callback) {
  int nfail = 0;
  for (const CallbackTest& t : callback_tests) {
    Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
    ASSERT_TRUE(re != NULL);
    Prog* prog = re->CompileToProg(0);
    ASSERT_TRUE(prog != NULL);

    std::string dump;
    // Appends one state's transition slots to dump.
    auto record = [&](const int* next, bool match) {
      ASSERT_TRUE(next != NULL);
      if (!dump.empty())
        dump += " ";
      dump += match ? "[[" : "[";
      // One slot per bytemap class plus one more for end of text.
      const int nslots = prog->bytemap_range() + 1;
      for (int b = 0; b < nslots; b++)
        dump += StringPrintf("%d,", next[b]);
      dump.pop_back();  // drop the trailing comma
      dump += match ? "]]" : "]";
    };
    prog->BuildEntireDFA(Prog::kLongestMatch, record);

    if (dump != t.dump) {
      LOG(ERROR) << t.regexp << " bytemap:\n" << prog->DumpByteMap();
      LOG(ERROR) << t.regexp << " dump:\ngot " << dump << "\nwant " << t.dump;
      nfail++;
    }

    delete prog;
    re->Decref();
  }
  EXPECT_EQ(nfail, 0);
}
} // namespace re2

169
extern/re2/re2/testing/dump.cc vendored Normal file
View File

@ -0,0 +1,169 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Dump the regexp into a string showing structure.
// Tested by parse_unittest.cc
// This function traverses the regexp recursively,
// meaning that on inputs like Regexp::Simplify of
// a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100},
// it takes time and space exponential in the size of the
// original regular expression. It can also use stack space
// linear in the size of the regular expression for inputs
// like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*.
// IT IS NOT SAFE TO CALL FROM PRODUCTION CODE.
// As a result, Dump is provided only in the testing
// library (see BUILD).
#include <string>
#include "util/test.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/stringpiece.h"
#include "re2/regexp.h"
// Cause a link error if this file is used outside of testing.
DECLARE_string(test_tmpdir);
namespace re2 {
// Short names used by DumpRegexpAppending(), indexed by re->op().
// NOTE(review): the order presumably mirrors the RegexpOp enum declared in
// re2/regexp.h, with "bad" filling an unused index 0 -- confirm whenever
// that enum changes.
static const char* kOpcodeNames[] = {
"bad",
"no",
"emp",
"lit",
"str",
"cat",
"alt",
"star",
"plus",
"que",
"rep",
"cap",
"dot",
"byte",
"bol",
"eol",
"wb", // kRegexpWordBoundary
"nwb", // kRegexpNoWordBoundary
"bot",
"eot",
"cc",
"match",
};
// Appends to *s a string representation of re with explicit structure:
// an operator name (with "n" in front for non-greedy repetitions and
// "fold" behind for case-folded literals containing a-z) followed by the
// operator's payload and subexpressions in braces.
// Nothing pretty, just for testing.  Recurses into subexpressions.
static void DumpRegexpAppending(Regexp* re, std::string* s) {
  // Appends the UTF-8 encoding of r.
  auto append_rune = [s](Rune r) {
    char buf[UTFmax+1];
    buf[runetochar(buf, &r)] = 0;
    s->append(buf);
  };
  auto is_ascii_lower = [](Rune r) { return 'a' <= r && r <= 'z'; };

  // Operator name, with its decorations.
  if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) {
    *s += StringPrintf("op%d", re->op());
  } else {
    const bool repetition = re->op() == kRegexpStar ||
                            re->op() == kRegexpPlus ||
                            re->op() == kRegexpQuest ||
                            re->op() == kRegexpRepeat;
    if (repetition && (re->parse_flags() & Regexp::NonGreedy))
      s->append("n");

    s->append(kOpcodeNames[re->op()]);

    if (re->parse_flags() & Regexp::FoldCase) {
      bool has_lower = false;
      if (re->op() == kRegexpLiteral) {
        has_lower = is_ascii_lower(re->rune());
      } else if (re->op() == kRegexpLiteralString) {
        for (int i = 0; i < re->nrunes() && !has_lower; i++)
          has_lower = is_ascii_lower(re->runes()[i]);
      }
      if (has_lower)
        s->append("fold");
    }
  }

  // Payload and subexpressions.
  s->append("{");
  switch (re->op()) {
    case kRegexpLiteral:
      append_rune(re->rune());
      break;

    case kRegexpLiteralString:
      for (int i = 0; i < re->nrunes(); i++)
        append_rune(re->runes()[i]);
      break;

    case kRegexpEndText:
      if (!(re->parse_flags() & Regexp::WasDollar))
        s->append("\\z");
      break;

    case kRegexpCharClass: {
      const char* sep = "";
      for (CharClass::iterator it = re->cc()->begin();
           it != re->cc()->end(); ++it) {
        const RuneRange rr = *it;
        s->append(sep);
        sep = " ";
        if (rr.lo != rr.hi)
          s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi));
        else
          s->append(StringPrintf("%#x", rr.lo));
      }
      break;
    }

    case kRegexpConcat:
    case kRegexpAlternate:
      for (int i = 0; i < re->nsub(); i++)
        DumpRegexpAppending(re->sub()[i], s);
      break;

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
      DumpRegexpAppending(re->sub()[0], s);
      break;

    case kRegexpRepeat:
      s->append(StringPrintf("%d,%d ", re->min(), re->max()));
      DumpRegexpAppending(re->sub()[0], s);
      break;

    case kRegexpCapture:
      if (re->cap() == 0)
        LOG(DFATAL) << "kRegexpCapture cap() == 0";
      if (re->name()) {
        s->append(*re->name());
        s->append(":");
      }
      DumpRegexpAppending(re->sub()[0], s);
      break;

    default:
      break;
  }
  s->append("}");
}
// Returns the structural dump of this regexp (see DumpRegexpAppending).
// Logs an error and returns an empty string when FLAGS_test_tmpdir is
// empty, which is taken to mean the caller is not a unit test.
std::string Regexp::Dump() {
  // Make sure being called from a unit test.
  if (FLAGS_test_tmpdir.empty()) {
    LOG(ERROR) << "Cannot use except for testing.";
    return std::string();
  }

  std::string out;
  DumpRegexpAppending(this, &out);
  return out;
}
} // namespace re2

View File

@ -0,0 +1,44 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
DECLARE_string(regexp_engines);
namespace re2 {
// Exhaustively tests the simple repetition operators over the atoms
// a, b, c and ".", first on strings over {a,b} and then on longer strings
// over {a}.
TEST(Repetition, Simple) {
  const char* const kOperators =
      "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
      "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
      "%s* %s+ %s? %s*? %s+? %s??";
  const std::vector<std::string> ops = Split(" ", kOperators);
  const std::vector<std::string> atoms = Explode("abc.");

  // Strings over {a,b} with length bound 6.
  ExhaustiveTest(3, 2, atoms, ops,
                 6, Explode("ab"), "(?:%s)", "");
  // Strings over {a} with length bound 40.
  ExhaustiveTest(3, 2, atoms, ops,
                 40, Explode("a"), "(?:%s)", "");
}
// Exercises capturing parens -- (a) -- nested inside repetition operators.
TEST(Repetition, Capturing) {
  static const char kRepetitionOps[] =
      "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
      "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
      "%s* %s+ %s? %s*? %s+? %s??";
  const std::vector<std::string> ops = Split(" ", kRepetitionOps);

  ExhaustiveTest(3, 2, Split(" ", "a (a) b"), ops,
                 7, Explode("ab"), "(?:%s)", "");

  // This would be a great test, but it runs forever when PCRE is enabled.
  const bool pcre_enabled =
      FLAGS_regexp_engines.find("PCRE") != std::string::npos;
  if (!pcre_enabled) {
    ExhaustiveTest(3, 2, Split(" ", "a (a)"), ops,
                   50, Explode("a"), "(?:%s)", "");
  }
}
} // namespace re2

View File

@ -0,0 +1,73 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include <stddef.h>
#include <memory>
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/re2.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {
// Exercises empty-string matches, written as "(?:)".
TEST(EmptyString, Exhaustive) {
  const std::vector<std::string> atoms = Split(" ", "(?:) a");
  const std::vector<std::string> letters = Split("", "ab");
  ExhaustiveTest(2, 2, atoms, RegexpGenerator::EgrepOps(),
                 5, letters, "", "");
}
// Exercises backslash-escaped versions of the regexp syntax characters:
// each atom is "\" followed by one punctuation character, and the input
// strings are drawn from the unescaped characters.
TEST(Punctuation, Literals) {
  const std::vector<std::string> alphabet = Explode("()*+?{}[]\\^$.");
  std::vector<std::string> escaped;
  escaped.reserve(alphabet.size());
  for (const std::string& ch : alphabet)
    escaped.push_back("\\" + ch);
  ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(),
                 2, alphabet, "", "");
}
// Exercises ^ $ . \A \z against inputs that contain line endings.
// The empty-width atoms are wrapped in (?:) so that they can be
// repeated -- PCRE rejects ^* but allows (?:^)*
TEST(LineEnds, Exhaustive) {
  const std::vector<std::string> atoms =
      Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)");
  const std::vector<std::string> letters = Explode("ab\n");
  ExhaustiveTest(2, 2, atoms, RegexpGenerator::EgrepOps(),
                 4, letters, "", "");
}
// Test what does and does not match \n.
// This would be a good test, except that PCRE seems to have a bug:
// in single-byte character set mode (the default),
// [^a] matches \n, but in UTF-8 mode it does not.
// So when we run the test, the tester complains that
// we don't agree with PCRE, but it's PCRE that is at fault.
// For what it's worth, Perl gets this right (matches
// regardless of whether UTF-8 input is selected):
//
// #!/usr/bin/perl
// use POSIX qw(locale_h);
// print "matches in latin1\n" if "\n" =~ /[^a]/;
// setlocale("en_US.utf8");
// print "matches in utf8\n" if "\n" =~ /[^a]/;
//
// The rule chosen for RE2 is that by default, like Perl,
// dot does not match \n but negated character classes [^a] do.
// (?s) will allow dot to match \n; there is no way in RE2
// to stop [^a] from matching \n, though the underlying library
// provides a mechanism, and RE2 could add new syntax if needed.
//
// TEST(Newlines, Exhaustive) {
// std::vector<std::string> empty_vector;
// ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
// RegexpGenerator::EgrepOps(),
// 4, Explode("a\n"), "");
// }
} // namespace re2

View File

@ -0,0 +1,100 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include <stddef.h>
#include <memory>
#include <string>
#include <vector>
#include "util/test.h"
#include "util/utf.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {
// Exercises simple character classes on their own (no wrapper).
TEST(CharacterClasses, Exhaustive) {
  static const char kClassAtoms[] =
      "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .";
  const std::vector<std::string> atoms = Split(" ", kClassAtoms);
  const std::vector<std::string> letters = Explode("ab");
  ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
                 5, letters, "", "");
}
// Exercises the same character classes placed between a and b via the
// "a%sb" wrapper (for example, a[a]b).
TEST(CharacterClasses, ExhaustiveAB) {
  static const char kClassAtoms[] =
      "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .";
  const std::vector<std::string> atoms = Split(" ", kClassAtoms);
  const std::vector<std::string> letters = Explode("ab");
  ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
                 5, letters, "a%sb", "");
}
// Encodes Rune r with runetochar and returns the bytes as a
// NUL-terminated-buffer-constructed std::string.
static std::string UTF8(Rune r) {
  char encoded[UTFmax+1];
  const int len = runetochar(encoded, &r);
  encoded[len] = 0;  // terminate so the C-string constructor can be used
  return std::string(encoded);
}
// Returns a vector of "interesting" UTF8 characters.
// Unicode is now too big to just return all of them,
// so InterestingUTF8 returns a set likely to be good test cases.
//
// The vector is built exactly once, on first call, using a C++11
// function-local static (whose initialization is thread-safe) instead
// of a hand-maintained "already initialized" flag.
static const std::vector<std::string>& InterestingUTF8() {
  static const std::vector<std::string> v = [] {
    std::vector<std::string> chars;

    // All the Latin1 equivalents are interesting.
    for (int i = 1; i < 256; i++)
      chars.push_back(UTF8(i));

    // After that, the codes near bit boundaries are
    // interesting, because they span byte sequence lengths.
    for (int j = 0; j < 8; j++)
      chars.push_back(UTF8(256 + j));
    for (int i = 512; i < Runemax; i <<= 1)
      for (int j = -8; j < 8; j++)
        chars.push_back(UTF8(i + j));

    // The codes near Runemax, including Runemax itself, are interesting.
    for (int j = -8; j <= 0; j++)
      chars.push_back(UTF8(Runemax + j));

    return chars;
  }();
  return v;
}
// Matches each interesting UTF-8 character against single atoms
// (escapes, assertions and character classes) with no operators.
TEST(InterestingUTF8, SingleOps) {
  static const char kAtoms[] =
      ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
      "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
      "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
      "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]";
  const std::vector<std::string> atoms = Split(" ", kAtoms);
  const std::vector<std::string> no_ops;  // deliberately empty
  ExhaustiveTest(1, 0, atoms, no_ops,
                 1, InterestingUTF8(), "", "");
}
// Same atoms as the single-op test, but both sides are wrapped in AB:
// every input string becomes "a" + char + "b" and every regexp is
// generated through the "a%sb" wrapper.
TEST(InterestingUTF8, AB) {
  static const char kAtoms[] =
      ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
      "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
      "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
      "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]";
  const std::vector<std::string> atoms = Split(" ", kAtoms);
  const std::vector<std::string> no_ops;  // deliberately empty

  std::vector<std::string> wrapped = InterestingUTF8();
  for (std::string& s : wrapped)
    s = "a" + s + "b";

  ExhaustiveTest(1, 0, atoms, no_ops,
                 1, wrapped, "a%sb", "");
}
} // namespace re2

View File

@ -0,0 +1,36 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {
// Very simple expressions: lowercase atoms against lowercase strings.
TEST(EgrepLiterals, Lowercase) {
  const int max_atoms = 3;
  const int max_ops = 2;
  const int max_strlen = 3;
  EgrepTest(max_atoms, max_ops, "abc.", max_strlen, "abc", "");
}
// Mixed-case atoms against mixed-case strings.
TEST(EgrepLiterals, MixedCase) {
  const int max_atoms = 3;
  const int max_ops = 2;
  const int max_strlen = 2;
  EgrepTest(max_atoms, max_ops, "AaBb.", max_strlen, "AaBb", "");
}
// Mixed-case atoms in case-insensitive mode, via the "(?i:%s)" wrapper.
TEST(EgrepLiterals, FoldCase) {
  // The punctuation characters surround A-Z and a-z
  // in the ASCII table.  This looks for bugs in the
  // bytemap range code in the DFA.
  const int max_atoms = 3;
  const int max_ops = 2;
  const int max_strlen = 2;
  EgrepTest(max_atoms, max_ops, "abAB.", max_strlen, "aBc@_~", "(?i:%s)");
}
// Simple expressions against strings whose alphabet contains the bytes
// of a multi-byte UTF-8 sequence (E2 98 BA).
TEST(EgrepLiterals, UTF8) {
  const int max_atoms = 3;
  const int max_ops = 2;
  const int max_strlen = 4;
  EgrepTest(max_atoms, max_ops, "ab.", max_strlen, "a\xE2\x98\xBA", "");
}
} // namespace re2

View File

@ -0,0 +1,188 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
// Each test picks an alphabet (e.g., "abc"), a maximum string length,
// a maximum regular expression length, and a maximum number of letters
// that can appear in the regular expression. Given these parameters,
// it tries every possible regular expression and string, verifying that
// the NFA, DFA, and a trivial backtracking implementation agree about
// the location of the match.
#include <stdio.h>
#include "util/test.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/testing/exhaustive_tester.h"
#include "re2/testing/tester.h"
// For target `log' in the Makefile.
// LOGGING defaults to 0. When it is nonzero, HandleRegexp prints test
// cases and their answers instead of running the Tester, and
// ExhaustiveTest omits its summary line.
#ifndef LOGGING
#define LOGGING 0
#endif
// When set, HandleRegexp prints each regexp (preceded by "\r") and
// flushes stdout before processing it.
DEFINE_bool(show_regexps, false, "show regexps during testing");
// Number of failing input strings after which HandleRegexp stops
// testing the current regexp.
DEFINE_int32(max_bad_regexp_inputs, 1,
"Stop testing a regular expression after finding this many "
"strings that break it.");
namespace re2 {
// Returns sp as a double-quoted string in which backslash and double
// quote are backslash-escaped and newline is written as \n.
// The result lives in a function-local static buffer, so it is
// overwritten by the next call. Input that would not fit in the buffer
// is reported with LOG(FATAL).
static char* escape(const StringPiece& sp) {
  static char buf[512];
  char* const limit = buf + sizeof buf;
  char* out = buf;
  *out++ = '\"';
  for (size_t i = 0; i < sp.size(); i++) {
    if (out + 5 >= limit)
      LOG(FATAL) << "ExhaustiveTester escape: too long";
    const char c = sp[i];
    switch (c) {
      case '\\':
      case '\"':
        *out++ = '\\';
        *out++ = c;
        break;
      case '\n':
        *out++ = '\\';
        *out++ = 'n';
        break;
      default:
        *out++ = c;
        break;
    }
  }
  *out++ = '\"';
  *out = '\0';
  return buf;
}
static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) {
if (!re.Match(input, 0, input.size(), anchor, m, n)) {
printf("-");
return;
}
for (int i = 0; i < n; i++) {
if (i > 0)
printf(" ");
if (m[i].begin() == NULL)
printf("-");
else
printf("%td-%td",
m[i].begin() - input.begin(), m[i].end() - input.begin());
}
}
// Processes a single generated regexp.
// Compiles it using Regexp interface and PCRE, and then
// checks that NFA, DFA, and PCRE all return the same results.
void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) {
regexps_++;
std::string regexp = const_regexp;
if (!topwrapper_.empty())
regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str());
if (FLAGS_show_regexps) {
printf("\r%s", regexp.c_str());
fflush(stdout);
}
if (LOGGING) {
// Write out test cases and answers for use in testing
// other implementations, such as Go's regexp package.
if (randomstrings_)
LOG(ERROR) << "Cannot log with random strings.";
if (regexps_ == 1) { // first
printf("strings\n");
strgen_.Reset();
while (strgen_.HasNext())
printf("%s\n", escape(strgen_.Next()));
printf("regexps\n");
}
printf("%s\n", escape(regexp));
RE2 re(regexp);
RE2::Options longest;
longest.set_longest_match(true);
RE2 relongest(regexp, longest);
int ngroup = re.NumberOfCapturingGroups()+1;
StringPiece* group = new StringPiece[ngroup];
strgen_.Reset();
while (strgen_.HasNext()) {
StringPiece input = strgen_.Next();
PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup);
printf(";");
PrintResult(re, input, RE2::UNANCHORED, group, ngroup);
printf(";");
PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup);
printf(";");
PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup);
printf("\n");
}
delete[] group;
return;
}
Tester tester(regexp);
if (tester.error())
return;
strgen_.Reset();
strgen_.GenerateNULL();
if (randomstrings_)
strgen_.Random(stringseed_, stringcount_);
int bad_inputs = 0;
while (strgen_.HasNext()) {
tests_++;
if (!tester.TestInput(strgen_.Next())) {
failures_++;
if (++bad_inputs >= FLAGS_max_bad_regexp_inputs)
break;
}
}
}
// Runs an exhaustive test on the given parameters.
void ExhaustiveTest(int maxatoms, int maxops,
const std::vector<std::string>& alphabet,
const std::vector<std::string>& ops,
int maxstrlen,
const std::vector<std::string>& stralphabet,
const std::string& wrapper,
const std::string& topwrapper) {
if (RE2_DEBUG_MODE) {
if (maxatoms > 1)
maxatoms--;
if (maxops > 1)
maxops--;
if (maxstrlen > 1)
maxstrlen--;
}
ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
maxstrlen, stralphabet, wrapper,
topwrapper);
t.Generate();
if (!LOGGING) {
printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
}
EXPECT_EQ(0, t.failures());
}
// Runs an exhaustive test using the given parameters and
// the basic egrep operators.
// ExhaustiveTest is invoked once per top-level wrapper: none,
// "^(?:%s)", "(?:%s)$" and "^(?:%s)$". Both alphabets are split with an
// empty separator before being passed on.
void EgrepTest(int maxatoms, int maxops, const std::string& alphabet,
               int maxstrlen, const std::string& stralphabet,
               const std::string& wrapper) {
  static const char* const kTopWrappers[] = {
    "", "^(?:%s)", "(?:%s)$", "^(?:%s)$"
  };
  const std::vector<std::string> atoms = Split("", alphabet);
  const std::vector<std::string> letters = Split("", stralphabet);

  for (const char* top : kTopWrappers) {
    ExhaustiveTest(maxatoms, maxops,
                   atoms,
                   RegexpGenerator::EgrepOps(),
                   maxstrlen,
                   letters,
                   wrapper,
                   top);
  }
}
} // namespace re2

View File

@ -0,0 +1,105 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H_
#define RE2_TESTING_EXHAUSTIVE_TESTER_H_
#include <stdint.h>
#include <string>
#include <vector>
#include "util/util.h"
#include "re2/testing/regexp_generator.h"
#include "re2/testing/string_generator.h"
namespace re2 {
// Doing this simplifies the logic below.
#ifndef __has_feature
#define __has_feature(x) 0
#endif

// RE2_DEBUG_MODE is true in debug builds and in sanitizer builds.
// Sanitizers are detected both through __has_feature (Clang) and through
// the __SANITIZE_ADDRESS__ / __SANITIZE_THREAD__ macros that GCC
// predefines, so that GCC sanitizer builds are recognised as well.
#if !defined(NDEBUG)
// We are in a debug build.
const bool RE2_DEBUG_MODE = true;
#elif __has_feature(address_sanitizer) || __has_feature(memory_sanitizer) || __has_feature(thread_sanitizer)
// Not a debug build, but still under sanitizers.
const bool RE2_DEBUG_MODE = true;
#elif defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
// Not a debug build, but still under sanitizers (GCC-style macros).
const bool RE2_DEBUG_MODE = true;
#else
const bool RE2_DEBUG_MODE = false;
#endif
// Exhaustive regular expression test: generate all regexps within parameters,
// then generate all strings of a given length over a given alphabet,
// then check that NFA, DFA, and PCRE agree about whether each regexp matches
// each possible string, and if so, where the match is.
//
// Can also be used in a "random" mode that generates a given number
// of random regexps and strings, allowing testing of larger expressions
// and inputs.
class ExhaustiveTester : public RegexpGenerator {
 public:
  // maxatoms, maxops, alphabet and ops are forwarded to RegexpGenerator;
  // maxstrlen and stralphabet configure the string generator.
  // All counters start at zero and random-string mode starts disabled.
  ExhaustiveTester(int maxatoms,
                   int maxops,
                   const std::vector<std::string>& alphabet,
                   const std::vector<std::string>& ops,
                   int maxstrlen,
                   const std::vector<std::string>& stralphabet,
                   const std::string& wrapper,
                   const std::string& topwrapper)
    : RegexpGenerator(maxatoms, maxops, alphabet, ops),
      strgen_(maxstrlen, stralphabet),
      wrapper_(wrapper),
      topwrapper_(topwrapper),
      regexps_(0), tests_(0), failures_(0),
      randomstrings_(false), stringseed_(0), stringcount_(0) { }

  // Read-only counters; const so they can be queried on a const tester.
  int regexps() const  { return regexps_; }
  int tests() const    { return tests_; }
  int failures() const { return failures_; }

  // Needed for RegexpGenerator interface.
  void HandleRegexp(const std::string& regexp);

  // Causes testing to generate random input strings.
  void RandomStrings(int32_t seed, int32_t count) {
    randomstrings_ = true;
    stringseed_ = seed;
    stringcount_ = count;
  }

 private:
  StringGenerator strgen_;
  std::string wrapper_;      // Regexp wrapper - either empty or has one %s.
  std::string topwrapper_;   // Regexp top-level wrapper.
  int regexps_;              // Number of HandleRegexp calls
  int tests_;                // Number of regexp tests.
  int failures_;             // Number of tests failed.
  bool randomstrings_;       // Whether to use random strings
  int32_t stringseed_;       // If so, the seed.
  int stringcount_;          // If so, how many to generate.

  ExhaustiveTester(const ExhaustiveTester&) = delete;
  ExhaustiveTester& operator=(const ExhaustiveTester&) = delete;
};
// Runs an exhaustive test on the given parameters.
// All arguments are handed to an ExhaustiveTester: maxatoms, maxops,
// alphabet and ops describe the regexps to generate; maxstrlen and
// stralphabet describe the input strings; wrapper and topwrapper are
// the regexp wrapper and top-level wrapper.
void ExhaustiveTest(int maxatoms, int maxops,
const std::vector<std::string>& alphabet,
const std::vector<std::string>& ops,
int maxstrlen,
const std::vector<std::string>& stralphabet,
const std::string& wrapper,
const std::string& topwrapper);
// Runs an exhaustive test using the given parameters and
// the basic egrep operators.
// alphabet and stralphabet are given as single strings rather than as
// vectors; wrapper is the regexp wrapper.
void EgrepTest(int maxatoms, int maxops, const std::string& alphabet,
int maxstrlen, const std::string& stralphabet,
const std::string& wrapper);
} // namespace re2
#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H_

View File

@ -0,0 +1,294 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stddef.h>
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "util/test.h"
#include "util/logging.h"
#include "re2/filtered_re2.h"
#include "re2/re2.h"
namespace re2 {
// Bundles a FilteredRE2 with the vectors and options the tests pass to it.
struct FilterTestVars {
  // Uses FilteredRE2's default construction.
  FilterTestVars() = default;
  // Constructs the FilteredRE2 with an explicit minimum atom length.
  explicit FilterTestVars(int min_atom_len) : f(min_atom_len) {}

  std::vector<std::string> atoms;   // filled in by FilteredRE2::Compile
  std::vector<int> atom_indices;    // atom indices passed to AllMatches
  std::vector<int> matches;         // output of AllMatches
  RE2::Options opts;                // options used when adding regexps
  FilteredRE2 f;
};
// Compiling and matching with no regexps added yields no atoms and no
// matches.
TEST(FilteredRE2Test, EmptyTest) {
  FilterTestVars vars;

  vars.f.Compile(&vars.atoms);
  EXPECT_EQ(0, vars.atoms.size());

  // Compile has no effect at all when called before Add: it will not
  // record that it has been called and it will not clear the vector.
  // The second point does not matter here, but the first point means
  // that an error will be logged during the call to AllMatches.
  vars.f.AllMatches("foo", vars.atom_indices, &vars.matches);
  EXPECT_EQ(0, vars.matches.size());
}
// With the minimum atom length raised to 4, "(foo|bar)" produces no
// atoms, yet the regexp is still reported as matching "lemurs bar".
TEST(FilteredRE2Test, SmallOrTest) {
  FilterTestVars vars(4);  // override the minimum atom length
  int regexp_id;
  vars.f.Add("(foo|bar)", vars.opts, &regexp_id);

  vars.f.Compile(&vars.atoms);
  EXPECT_EQ(0, vars.atoms.size());

  vars.f.AllMatches("lemurs bar", vars.atom_indices, &vars.matches);
  EXPECT_EQ(1, vars.matches.size());
  EXPECT_EQ(regexp_id, vars.matches[0]);
}
// A Latin-1 encoded pattern yields a single atom in which the ASCII
// letter is lowercased and the other bytes are unchanged.
TEST(FilteredRE2Test, SmallLatinTest) {
  FilterTestVars vars;
  vars.opts.set_encoding(RE2::Options::EncodingLatin1);

  int regexp_id;
  vars.f.Add("\xde\xadQ\xbe\xef", vars.opts, &regexp_id);
  vars.f.Compile(&vars.atoms);
  EXPECT_EQ(1, vars.atoms.size());
  EXPECT_EQ(vars.atoms[0], "\xde\xadq\xbe\xef");

  // Report atom 0 as present and check that the regexp matches.
  vars.atom_indices.push_back(0);
  vars.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", vars.atom_indices,
                    &vars.matches);
  EXPECT_EQ(1, vars.matches.size());
  EXPECT_EQ(regexp_id, vars.matches[0]);
}
// One atom-extraction test case: a name, the regexps to add, and the
// atoms expected after compiling. Both arrays are NULL-terminated by
// aggregate zero-initialization; readers count entries up to the
// first NULL.
struct AtomTest {
const char* testname;
// If any test needs more than this many regexps or atoms, increase
// the size of the corresponding array.
const char* regexps[20];
const char* atoms[20];
};
// Table of atom-extraction test cases.
// MatchEmptyPattern and MatchTests refer to entries 0 and 2 by index
// (and check the test name), so keep the order stable when adding cases.
AtomTest atom_tests[] = {
{
// This test checks to make sure empty patterns are allowed.
"CheckEmptyPattern",
{""},
{}
}, {
// This test checks that all atoms of length greater than min length
// are found, and no atoms that are of smaller length are found.
"AllAtomsGtMinLengthFound", {
"(abc123|def456|ghi789).*mnop[x-z]+",
"abc..yyy..zz",
"mnmnpp[a-z]+PPP"
}, {
"abc123",
"def456",
"ghi789",
"mnop",
"abc",
"yyy",
"mnmnpp",
"ppp"
}
}, {
// Test to make sure that any atoms that have another atom as a
// substring in an OR are removed; that is, only the shortest
// substring is kept.
"SubstrAtomRemovesSuperStrInOr", {
"(abc123|abc|ghi789|abc1234).*[x-z]+",
"abcd..yyy..yyyzzz",
"mnmnpp[a-z]+PPP"
}, {
"abc",
"ghi789",
"abcd",
"yyy",
"yyyzzz",
"mnmnpp",
"ppp"
}
}, {
// Test character class expansion.
"CharClassExpansion", {
"m[a-c][d-f]n.*[x-z]+",
"[x-y]bcde[ab]"
}, {
"madn", "maen", "mafn",
"mbdn", "mben", "mbfn",
"mcdn", "mcen", "mcfn",
"xbcdea", "xbcdeb",
"ybcdea", "ybcdeb"
}
}, {
// Test upper/lower of non-ASCII.
"UnicodeLower", {
"(?i)ΔδΠϖπΣςσ",
"ΛΜΝΟΠ",
"ψρστυ",
}, {
"δδπππσσσ",
"λμνοπ",
"ψρστυ",
},
},
};
// Adds the first n entries of regexps to v->f using v->opts, then
// compiles the filter, storing the resulting atoms in v->atoms.
// The ids assigned by Add are discarded.
void AddRegexpsAndCompile(const char* regexps[],
                          size_t n,
                          struct FilterTestVars* v) {
  const char** const end = regexps + n;
  for (const char** re = regexps; re != end; ++re) {
    int ignored_id;
    v->f.Add(*re, v->opts, &ignored_id);
  }
  v->f.Compile(&v->atoms);
}
// Returns whether v->atoms holds exactly the first n strings of atoms,
// ignoring order. Note that v->atoms is sorted in place as a side
// effect. On mismatch, the expected and found atoms are logged under
// testname.
bool CheckExpectedAtoms(const char* atoms[],
                        size_t n,
                        const char* testname,
                        struct FilterTestVars* v) {
  std::vector<std::string> expected(atoms, atoms + n);

  const bool same_count = expected.size() == v->atoms.size();
  std::sort(v->atoms.begin(), v->atoms.end());
  std::sort(expected.begin(), expected.end());
  const bool pass = same_count && expected == v->atoms;

  if (pass)
    return true;

  LOG(ERROR) << "Failed " << testname;
  LOG(ERROR) << "Expected #atoms = " << expected.size();
  for (const std::string& atom : expected)
    LOG(ERROR) << atom;
  LOG(ERROR) << "Found #atoms = " << v->atoms.size();
  for (const std::string& atom : v->atoms)
    LOG(ERROR) << atom;
  return false;
}
// Runs every entry of atom_tests through a fresh filter and expects
// that none of them reports a mismatch.
TEST(FilteredRE2Test, AtomTests) {
  int nfail = 0;
  for (AtomTest& t : atom_tests) {
    FilterTestVars v;

    // Both arrays end at the first NULL entry (or at capacity).
    size_t nregexp = 0;
    while (nregexp < arraysize(t.regexps) && t.regexps[nregexp] != NULL)
      nregexp++;
    size_t natom = 0;
    while (natom < arraysize(t.atoms) && t.atoms[natom] != NULL)
      natom++;

    AddRegexpsAndCompile(t.regexps, nregexp, &v);
    const bool ok = CheckExpectedAtoms(t.atoms, natom, t.testname, &v);
    if (!ok)
      nfail++;
  }
  EXPECT_EQ(0, nfail);
}
// Replaces *atom_indices with, for each string in matched_atoms (in
// order), the index of its first occurrence in atoms. Strings that do
// not occur in atoms contribute nothing.
void FindAtomIndices(const std::vector<std::string>& atoms,
                     const std::vector<std::string>& matched_atoms,
                     std::vector<int>* atom_indices) {
  atom_indices->clear();
  for (const std::string& wanted : matched_atoms) {
    std::vector<std::string>::const_iterator hit =
        std::find(atoms.begin(), atoms.end(), wanted);
    if (hit == atoms.end())
      continue;
    atom_indices->push_back(static_cast<int>(hit - atoms.begin()));
  }
}
// With only the empty pattern added, FirstMatch on ordinary text and no
// atoms is expected to return 0.
TEST(FilteredRE2Test, MatchEmptyPattern) {
  FilterTestVars v;
  AtomTest* t = &atom_tests[0];
  // We are using the regexps used in one of the atom tests
  // for this test. Adding the EXPECT here to make sure
  // the index we use for the test is for the correct test.
  EXPECT_EQ("CheckEmptyPattern", std::string(t->testname));

  // The regexps array ends at the first NULL entry (or at capacity).
  size_t nregexp;
  for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
    if (t->regexps[nregexp] == NULL)
      break;
  AddRegexpsAndCompile(t->regexps, nregexp, &v);

  std::string text = "0123";
  std::vector<int> atom_ids;  // deliberately empty: no atoms matched
  EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids));
}
// Checks AllMatches against three texts, each time passing the indices
// of the atoms the text is taken to contain.
TEST(FilteredRE2Test, MatchTests) {
  FilterTestVars v;
  AtomTest* t = &atom_tests[2];
  // We are using the regexps used in one of the atom tests
  // for this test.
  EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", std::string(t->testname));

  size_t nregexp = 0;
  while (nregexp < arraysize(t->regexps) && t->regexps[nregexp] != NULL)
    nregexp++;
  AddRegexpsAndCompile(t->regexps, nregexp, &v);

  std::vector<int> atom_ids;
  std::vector<std::string> atoms;
  std::vector<int> matching_regexps;

  // atoms = abc
  std::string text = "abc121212xyz";
  atoms = {"abc"};
  FindAtomIndices(v.atoms, atoms, &atom_ids);
  v.f.AllMatches(text, atom_ids, &matching_regexps);
  EXPECT_EQ(1, matching_regexps.size());

  text = "abc12312yyyzzz";
  atoms = {"abc", "yyy", "yyyzzz"};
  FindAtomIndices(v.atoms, atoms, &atom_ids);
  v.f.AllMatches(text, atom_ids, &matching_regexps);
  EXPECT_EQ(1, matching_regexps.size());

  text = "abcd12yyy32yyyzzz";
  atoms = {"abc", "abcd", "yyy", "yyyzzz"};
  FindAtomIndices(v.atoms, atoms, &atom_ids);
  LOG(INFO) << "S: " << atom_ids.size();
  for (size_t i = 0; i < atom_ids.size(); i++)
    LOG(INFO) << "i: " << i << " : " << atom_ids[i];
  v.f.AllMatches(text, atom_ids, &matching_regexps);
  EXPECT_EQ(2, matching_regexps.size());
}
// Regression test for an empty string inside a string set.
TEST(FilteredRE2Test, EmptyStringInStringSetBug) {
  // Bug due to find() finding "" at the start of everything in a string
  // set and thus SimplifyStringSet() would end up erasing everything.
  // In order to test this, we have to keep PrefilterTree from discarding
  // the OR entirely, so we have to make the minimum atom length zero.
  FilterTestVars vars(0);  // override the minimum atom length

  const char* patterns[] = {"-R.+(|ADD=;AA){12}}"};
  const char* wanted_atoms[] = {"", "-r", "add=;aa", "}"};

  AddRegexpsAndCompile(patterns, arraysize(patterns), &vars);
  const bool atoms_ok = CheckExpectedAtoms(wanted_atoms,
                                           arraysize(wanted_atoms),
                                           "EmptyStringInStringSetBug",
                                           &vars);
  EXPECT_TRUE(atoms_ok);
}
} // namespace re2

View File

@ -0,0 +1,77 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "util/logging.h"
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
// One test case: a regexp and the value Regexp::MimicsPCRE() is
// expected to return for it.
struct PCRETest {
const char* regexp;
bool should_match;
};
// Table of regexps and the expected MimicsPCRE() result for each.
// NOTE(review): "(a+)+" and "\\d" each appear twice below; the repeats
// look redundant -- confirm whether different patterns were intended.
static PCRETest tests[] = {
// Most things should behave exactly.
{ "abc", true },
{ "(a|b)c", true },
{ "(a*|b)c", true },
{ "(a|b*)c", true },
{ "a(b|c)d", true },
{ "a(()|())c", true },
{ "ab*c", true },
{ "ab+c", true },
{ "a(b*|c*)d", true },
{ "\\W", true },
{ "\\W{1,2}", true },
{ "\\d", true },
// Check that repeated empty strings do not.
{ "(a*)*", false },
{ "x(a*)*y", false },
{ "(a*)+", false },
{ "(a+)*", true },
{ "(a+)+", true },
{ "(a+)+", true },
// \v is the only character class that shouldn't.
{ "\\b", true },
{ "\\v", false },
{ "\\d", true },
// The handling of ^ in multi-line mode is different, as is
// the handling of $ in single-line mode. (Both involve
// boundary cases if the string ends with \n.)
{ "\\A", true },
{ "\\z", true },
{ "(?m)^", false },
{ "(?m)$", true },
{ "(?-m)^", true },
{ "(?-m)$", false }, // In PCRE, == \Z
{ "(?m)\\A", true },
{ "(?m)\\z", true },
{ "(?-m)\\A", true },
{ "(?-m)\\z", true },
};
// Parses every table entry twice -- first with Latin1 added to the
// LikePerl flags, then with LikePerl alone -- and checks MimicsPCRE()
// against the expected value.
TEST(MimicsPCRE, SimpleTests) {
  for (const PCRETest& t : tests) {
    for (int pass = 0; pass < 2; pass++) {
      const bool latin1 = (pass == 0);
      Regexp::ParseFlags flags = Regexp::LikePerl;
      if (latin1)
        flags = flags | Regexp::Latin1;

      Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
      ASSERT_TRUE(re != NULL) << " " << t.regexp;
      ASSERT_EQ(t.should_match, re->MimicsPCRE())
        << " " << t.regexp << " "
        << (latin1 ? "latin1" : "utf");
      re->Decref();
    }
  }
}
} // namespace re2

46
extern/re2/re2/testing/null_walker.cc vendored Normal file
View File

@ -0,0 +1,46 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "util/logging.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Null walker.  For benchmarking the walker itself.
// Non-copyable.
class NullWalker : public Regexp::Walker<bool> {
 public:
  NullWalker() = default;

  // Defined out of line.
  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                 bool* child_args, int nchild_args);

  // Should never be called: we use Walk not WalkExponential.
  // Logs at DFATAL severity and hands back its argument.
  bool ShortVisit(Regexp* re, bool a) {
    LOG(DFATAL) << "NullWalker::ShortVisit called";
    return a;
  }

 private:
  NullWalker(const NullWalker&) = delete;
  NullWalker& operator=(const NullWalker&) = delete;
};
// Called after visiting re's children. This walker performs no
// analysis: every argument (including the children's results in
// child_args) is ignored and the result is always false.
bool NullWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args) {
return false;
}
// Walks this regexp with a NullWalker, starting with a top-level
// argument of false. The walk's result is discarded and nothing is
// returned.
void Regexp::NullWalk() {
NullWalker w;
w.Walk(this, false);
}
} // namespace re2

508
extern/re2/re2/testing/parse_test.cc vendored Normal file
View File

@ -0,0 +1,508 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test parse.cc, dump.cc, and tostring.cc.
#include <string>
#include "util/test.h"
#include "util/logging.h"
#include "re2/regexp.h"
namespace re2 {
// In the past, we used 1<<30 here and zeroed the bit later, but that
// has undefined behaviour, so now we use an internal-only flag because
// otherwise we would have to introduce a new flag value just for this.
static const Regexp::ParseFlags TestZeroFlags = Regexp::WasDollar;
// One parser test case.
struct Test {
// Regexp source text to parse.
const char* regexp;
// Expected textual form of the parse.
// NOTE(review): presumably compared against Regexp::Dump() output; the
// comparison itself is outside this view -- confirm.
const char* parse;
// Parse flags for this case. Table entries that omit this field leave
// it zero-initialized.
Regexp::ParseFlags flags;
};
// Flag combination: MatchNL | PerlX | PerlClasses | UnicodeGroups.
// NOTE(review): presumably the default for table entries that do not
// specify flags; its use is not visible here -- confirm.
static Regexp::ParseFlags kTestFlags = Regexp::MatchNL |
Regexp::PerlX |
Regexp::PerlClasses |
Regexp::UnicodeGroups;
static Test tests[] = {
// Base cases
{ "a", "lit{a}" },
{ "a.", "cat{lit{a}dot{}}" },
{ "a.b", "cat{lit{a}dot{}lit{b}}" },
{ "ab", "str{ab}" },
{ "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
{ "abc", "str{abc}" },
{ "a|^", "alt{lit{a}bol{}}" },
{ "a|b", "cc{0x61-0x62}" },
{ "(a)", "cap{lit{a}}" },
{ "(a)|b", "alt{cap{lit{a}}lit{b}}" },
{ "a*", "star{lit{a}}" },
{ "a+", "plus{lit{a}}" },
{ "a?", "que{lit{a}}" },
{ "a{2}", "rep{2,2 lit{a}}" },
{ "a{2,3}", "rep{2,3 lit{a}}" },
{ "a{2,}", "rep{2,-1 lit{a}}" },
{ "a*?", "nstar{lit{a}}" },
{ "a+?", "nplus{lit{a}}" },
{ "a??", "nque{lit{a}}" },
{ "a{2}?", "nrep{2,2 lit{a}}" },
{ "a{2,3}?", "nrep{2,3 lit{a}}" },
{ "a{2,}?", "nrep{2,-1 lit{a}}" },
{ "", "emp{}" },
{ "|", "alt{emp{}emp{}}" },
{ "|x|", "alt{emp{}lit{x}emp{}}" },
{ ".", "dot{}" },
{ "^", "bol{}" },
{ "$", "eol{}" },
{ "\\|", "lit{|}" },
{ "\\(", "lit{(}" },
{ "\\)", "lit{)}" },
{ "\\*", "lit{*}" },
{ "\\+", "lit{+}" },
{ "\\?", "lit{?}" },
{ "{", "lit{{}" },
{ "}", "lit{}}" },
{ "\\.", "lit{.}" },
{ "\\^", "lit{^}" },
{ "\\$", "lit{$}" },
{ "\\\\", "lit{\\}" },
{ "[ace]", "cc{0x61 0x63 0x65}" },
{ "[abc]", "cc{0x61-0x63}" },
{ "[a-z]", "cc{0x61-0x7a}" },
{ "[a]", "lit{a}" },
{ "\\-", "lit{-}" },
{ "-", "lit{-}" },
{ "\\_", "lit{_}" },
// Posix and Perl extensions
{ "[[:lower:]]", "cc{0x61-0x7a}" },
{ "[a-z]", "cc{0x61-0x7a}" },
{ "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
{ "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
{ "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
{ "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
{ "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
{ "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
{ "\\d", "cc{0x30-0x39}" },
{ "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
{ "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
{ "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
{ "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
{ "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
{ "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
{ "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
{ "\\C", "byte{}" },
// Unicode, negatives, and a double negative.
{ "\\p{Braille}", "cc{0x2800-0x28ff}" },
{ "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
{ "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
{ "\\P{^Braille}", "cc{0x2800-0x28ff}" },
// More interesting regular expressions.
{ "a{,2}", "str{a{,2}}" },
{ "\\.\\^\\$\\\\", "str{.^$\\}" },
{ "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
{ "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
{ "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8
{ "a*{", "cat{star{lit{a}}lit{{}}" },
// Test precedences
{ "(?:ab)*", "star{str{ab}}" },
{ "(ab)*", "star{cap{str{ab}}}" },
{ "ab|cd", "alt{str{ab}str{cd}}" },
{ "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
// Test squashing of **, ++, ?? et cetera.
{ "(?:(?:a)*)*", "star{lit{a}}" },
{ "(?:(?:a)+)+", "plus{lit{a}}" },
{ "(?:(?:a)?)?", "que{lit{a}}" },
{ "(?:(?:a)*)+", "star{lit{a}}" },
{ "(?:(?:a)*)?", "star{lit{a}}" },
{ "(?:(?:a)+)*", "star{lit{a}}" },
{ "(?:(?:a)+)?", "star{lit{a}}" },
{ "(?:(?:a)?)*", "star{lit{a}}" },
{ "(?:(?:a)?)+", "star{lit{a}}" },
// Test flattening.
{ "(?:a)", "lit{a}" },
{ "(?:ab)(?:cd)", "str{abcd}" },
{ "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
{ "a|c", "cc{0x61 0x63}" },
{ "a|[cd]", "cc{0x61 0x63-0x64}" },
{ "a|.", "dot{}" },
{ "[ab]|c", "cc{0x61-0x63}" },
{ "[ab]|[cd]", "cc{0x61-0x64}" },
{ "[ab]|.", "dot{}" },
{ ".|c", "dot{}" },
{ ".|[cd]", "dot{}" },
{ ".|.", "dot{}" },
// Test Perl quoted literals
{ "\\Q+|*?{[\\E", "str{+|*?{[}" },
{ "\\Q+\\E+", "plus{lit{+}}" },
{ "\\Q\\\\E", "lit{\\}" },
{ "\\Q\\\\\\E", "str{\\\\}" },
{ "\\Qa\\E*", "star{lit{a}}" },
{ "\\Qab\\E*", "cat{lit{a}star{lit{b}}}" },
{ "\\Qabc\\E*", "cat{str{ab}star{lit{c}}}" },
// Test Perl \A and \z
{ "(?m)^", "bol{}" },
{ "(?m)$", "eol{}" },
{ "(?-m)^", "bot{}" },
{ "(?-m)$", "eot{}" },
{ "(?m)\\A", "bot{}" },
{ "(?m)\\z", "eot{\\z}" },
{ "(?-m)\\A", "bot{}" },
{ "(?-m)\\z", "eot{\\z}" },
// Test named captures
{ "(?P<name>a)", "cap{name:lit{a}}" },
// Case-folded literals
{ "[Aa]", "litfold{a}" },
// Strings
{ "abcde", "str{abcde}" },
{ "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
// Reported bug involving \n leaking in despite use of NeverNL.
{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags },
{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags },
{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags },
{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses },
{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses | Regexp::FoldCase },
{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses | Regexp::NeverNL },
{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses },
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses | Regexp::FoldCase },
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses | Regexp::NeverNL },
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
// Bug in Regexp::ToString() that emitted [^], which
// would (obviously) fail to parse when fed back in.
{ "[\\s\\S]", "cc{0-0x10ffff}" },
};
// Forwards to Regexp::Equal so the tests in this file can compare two parsed
// regexps through a free function. Returns whatever Regexp::Equal(a, b) returns.
bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
return Regexp::Equal(a, b);
}
void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
const std::string& title) {
Regexp** re = new Regexp*[ntests];
for (int i = 0; i < ntests; i++) {
RegexpStatus status;
Regexp::ParseFlags f = flags;
if (tests[i].flags != 0) {
f = tests[i].flags & ~TestZeroFlags;
}
re[i] = Regexp::Parse(tests[i].regexp, f, &status);
ASSERT_TRUE(re[i] != NULL)
<< " " << tests[i].regexp << " " << status.Text();
std::string s = re[i]->Dump();
EXPECT_EQ(std::string(tests[i].parse), s)
<< "Regexp: " << tests[i].regexp
<< "\nparse: " << std::string(tests[i].parse)
<< " s: " << s << " flag=" << f;
}
for (int i = 0; i < ntests; i++) {
for (int j = 0; j < ntests; j++) {
EXPECT_EQ(std::string(tests[i].parse) == std::string(tests[j].parse),
RegexpEqualTestingOnly(re[i], re[j]))
<< "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
}
}
for (int i = 0; i < ntests; i++)
re[i]->Decref();
delete[] re;
}
// Runs the main `tests` table through TestParse with kTestFlags and checks
// that every regexp parses to its expected structure.
TEST(TestParse, SimpleRegexps) {
  const int count = arraysize(tests);
  TestParse(tests, count, kTestFlags, "simple");
}
// Cases for TEST(TestParse, FoldCase), which parses them with Regexp::FoldCase.
// Each entry pairs a regexp with the Dump() output expected for it.
Test foldcase_tests[] = {
{ "AbCdE", "strfold{abcde}" },
{ "[Aa]", "litfold{a}" },
{ "a", "litfold{a}" },
// 0x17F is an old English long s (looks like an f) and folds to s.
// 0x212A is the Kelvin symbol and folds to k.
{ "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" }, // [Aa][A-z...]
{ "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
{ "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
};
// Checks foldcase_tests when parsed with the Regexp::FoldCase flag.
TEST(TestParse, FoldCase) {
  const int count = arraysize(foldcase_tests);
  TestParse(foldcase_tests, count, Regexp::FoldCase, "foldcase");
}
// Single case for TEST(TestParse, Literal), parsed with Regexp::Literal:
// a string full of regexp punctuation is expected to dump as one plain str{}.
Test literal_tests[] = {
{ "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" },
};
// Checks literal_tests when parsed with the Regexp::Literal flag.
TEST(TestParse, Literal) {
  const int count = arraysize(literal_tests);
  TestParse(literal_tests, count, Regexp::Literal, "literal");
}
// Cases for TEST(TestParse, MatchNL), parsed with Regexp::MatchNL.
// Compare with nomatchnl_tests: here "." dumps as dot{} and "[^a]" keeps 0xa
// inside its range.
Test matchnl_tests[] = {
{ ".", "dot{}" },
{ "\n", "lit{\n}" },
{ "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
{ "[a\\n]", "cc{0xa 0x61}" },
};
// Checks matchnl_tests when parsed with the Regexp::MatchNL flag.
// (MatchNL is also exercised by the simple cases above.)
TEST(TestParse, MatchNL) {
  const int count = arraysize(matchnl_tests);
  TestParse(matchnl_tests, count, Regexp::MatchNL, "with MatchNL");
}
// Cases for TEST(TestParse, NoMatchNL), parsed with Regexp::NoParseFlags.
// The expected dumps for "." and "[^a]" leave out 0xa (newline), while an
// explicit "\n" or "[a\\n]" still includes it.
Test nomatchnl_tests[] = {
{ ".", "cc{0-0x9 0xb-0x10ffff}" },
{ "\n", "lit{\n}" },
{ "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
{ "[a\\n]", "cc{0xa 0x61}" },
};
// Checks nomatchnl_tests when parsed with no flags at all (so without MatchNL).
TEST(TestParse, NoMatchNL) {
  const int count = arraysize(nomatchnl_tests);
  TestParse(nomatchnl_tests, count, Regexp::NoParseFlags, "without MatchNL");
}
// Cases for TEST(TestParse, Prefix), parsed with Regexp::PerlX.
// Each entry is an alternation paired with the Dump() expected once common
// prefixes have been factored out of its branches. Adjacent string literals
// in an entry are concatenated by the compiler into a single expected dump.
Test prefix_tests[] = {
{ "abc|abd", "cat{str{ab}cc{0x63-0x64}}" },
{ "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" },
{ "abc|abd|aef|bcx|bcy",
"alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
"cat{str{bc}cc{0x78-0x79}}}" },
{ "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
{ "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
{ "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
{ ".c|.d", "cat{cc{0-0x9 0xb-0x10ffff}cc{0x63-0x64}}" },
{ "\\Cc|\\Cd", "cat{byte{}cc{0x63-0x64}}" },
{ "x{2}|x{2}[0-9]",
"cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
{ "x{2}y|x{2}[0-9]y",
"cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
{ "n|r|rs",
"alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" },
{ "n|rs|r",
"alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" },
{ "r|rs|n",
"alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" },
{ "rs|r|n",
"alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" },
{ "a\\C*?c|a\\C*?b",
"cat{lit{a}alt{cat{nstar{byte{}}lit{c}}cat{nstar{byte{}}lit{b}}}}" },
{ "^/a/bc|^/a/de",
"cat{bol{}cat{str{/a/}alt{str{bc}str{de}}}}" },
// In the past, factoring was limited to kFactorAlternationMaxDepth (8).
{ "a|aa|aaa|aaaa|aaaaa|aaaaaa|aaaaaaa|aaaaaaaa|aaaaaaaaa|aaaaaaaaaa",
"cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
"cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
"cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
"lit{a}}}}}}}}}}}}}}}}}}}" },
{ "a|aardvark|aardvarks|abaci|aback|abacus|abacuses|abaft|abalone|abalones",
"cat{lit{a}alt{emp{}cat{str{ardvark}alt{emp{}lit{s}}}"
"cat{str{ba}alt{cat{lit{c}alt{cc{0x69 0x6b}cat{str{us}alt{emp{}str{es}}}}}"
"str{ft}cat{str{lone}alt{emp{}lit{s}}}}}}}" },
};
// Checks that the prefix_tests alternations factor as expected under PerlX.
TEST(TestParse, Prefix) {
  const int count = arraysize(prefix_tests);
  TestParse(prefix_tests, count, Regexp::PerlX, "prefix");
}
// Cases for TEST(TestParse, Nested), parsed with Regexp::PerlX.
// Each regexp nests counted repetitions inside capture groups; the expected
// dump mirrors that nesting as alternating cap{} / rep{} nodes.
Test nested_tests[] = {
{ "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))",
"cap{cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}}}}}}}}" },
{ "((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
"cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{1,1 lit{x}}}}}}}}}}}}}}}}}}}}}" },
{ "((((((((((x{0}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
"cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{0,0 lit{x}}}}}}}}}}}}}}}}}}}}}" },
{ "((((((x{2}){2}){2}){5}){5}){5})",
"cap{rep{5,5 cap{rep{5,5 cap{rep{5,5 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}" },
};
// Checks that the nested counted repetitions in nested_tests parse under PerlX.
TEST(TestParse, Nested) {
  const int count = arraysize(nested_tests);
  TestParse(nested_tests, count, Regexp::PerlX, "nested");
}
// Invalid regular expressions.
// TEST(TestParse, InvalidRegexps) expects Regexp::Parse to return NULL for
// each of these under both Regexp::PerlX and Regexp::NoParseFlags.
const char* badtests[] = {
"(",
")",
"(a",
"(a|b|",
"(a|b",
"[a-z",
"([a-z)",
"x{1001}",
"\xff", // Invalid UTF-8
"[\xff]",
"[\\\xff]",
"\\\xff",
"(?P<name>a",
"(?P<name>",
"(?P<name",
"(?P<x y>a)",
"(?P<>a)",
"[a-Z]",
"(?i)[a-Z]",
"a{100000}",
"a{100000,}",
"((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
"(((x{7}){11}){13})",
"\\Q\\E*",
};
// Valid in Perl, bad in POSIX.
// TEST(TestParse, InvalidRegexps) expects these to parse with Regexp::PerlX
// and to be rejected with Regexp::NoParseFlags.
const char* only_perl[] = {
"[a-b-c]",
"\\Qabc\\E",
"\\Q*+?{[\\E",
"\\Q\\\\E",
"\\Q\\\\\\E",
"\\Q\\\\\\\\E",
"\\Q\\\\\\\\\\E",
"(?:a)",
"(?P<name>a)",
};
// Valid in POSIX, bad in Perl.
// TEST(TestParse, InvalidRegexps) expects these to parse with
// Regexp::NoParseFlags and to be rejected with Regexp::PerlX.
const char* only_posix[] = {
"a++",
"a**",
"a?*",
"a+*",
"a{1}*",
};
// Verifies which patterns the parser rejects:
//   - badtests must fail under both PerlX and NoParseFlags;
//   - only_posix must fail under PerlX but parse under NoParseFlags;
//   - only_perl must fail under NoParseFlags but parse under PerlX.
TEST(TestParse, InvalidRegexps) {
  for (const char* pattern : badtests) {
    Regexp* as_perl = Regexp::Parse(pattern, Regexp::PerlX, NULL);
    ASSERT_TRUE(as_perl == NULL) << " " << pattern;
    Regexp* as_posix = Regexp::Parse(pattern, Regexp::NoParseFlags, NULL);
    ASSERT_TRUE(as_posix == NULL) << " " << pattern;
  }

  for (const char* pattern : only_posix) {
    Regexp* as_perl = Regexp::Parse(pattern, Regexp::PerlX, NULL);
    ASSERT_TRUE(as_perl == NULL) << " " << pattern;
    Regexp* as_posix = Regexp::Parse(pattern, Regexp::NoParseFlags, NULL);
    ASSERT_TRUE(as_posix != NULL) << " " << pattern;
    as_posix->Decref();
  }

  for (const char* pattern : only_perl) {
    Regexp* as_posix = Regexp::Parse(pattern, Regexp::NoParseFlags, NULL);
    ASSERT_TRUE(as_posix == NULL) << " " << pattern;
    Regexp* as_perl = Regexp::Parse(pattern, Regexp::PerlX, NULL);
    ASSERT_TRUE(as_perl != NULL) << " " << pattern;
    as_perl->Decref();
  }
}
// For every entry of `tests`: parse it, confirm the dump, then check that
// ToString() yields either the original text or a regexp that reparses to
// the same dump and the same ToString() output.
TEST(TestToString, EquivalentParse) {
  for (const Test& tc : tests) {
    RegexpStatus status;
    Regexp::ParseFlags f = kTestFlags;
    if (tc.flags != 0) {
      f = tc.flags & ~TestZeroFlags;
    }

    Regexp* re = Regexp::Parse(tc.regexp, f, &status);
    ASSERT_TRUE(re != NULL) << " " << tc.regexp << " " << status.Text();

    std::string s = re->Dump();
    EXPECT_EQ(std::string(tc.parse), s)
        << "Regexp: " << tc.regexp
        << "\nparse: " << std::string(tc.parse)
        << " s: " << s << " flag=" << f;

    std::string t = re->ToString();
    if (t == tc.regexp) {
      // Round-tripped exactly; nothing more to check for this entry.
      re->Decref();
      continue;
    }

    // ToString returned something other than the original, presumably a
    // form with fewer parentheses. A length comparison is not possible,
    // because ToString writes "\\{" for a literal brace while the shorter
    // "{" is equivalent; the check that used to live here was:
    //   ASSERT_LT(t.size(), strlen(tests[i].regexp))
    //       << " t=" << t << " regexp=" << tests[i].regexp;
    // Instead, reparse the new text and require the same structure.
    Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
    ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text();
    std::string ss = nre->Dump();
    std::string tt = nre->ToString();
    if (s != ss || t != tt) {
      LOG(INFO) << "ToString(" << tc.regexp << ") = " << t;
    }
    EXPECT_EQ(s, ss);
    EXPECT_EQ(t, tt);
    nre->Decref();
    re->Decref();
  }
}
// Checks that a malformed named capture is reported with the code
// kRegexpBadNamedCapture and with the offending text as the error argument.
TEST(NamedCaptures, ErrorArgs) {
  RegexpStatus status;

  // Name never terminated by '>'.
  Regexp* unterminated =
      Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
  EXPECT_TRUE(unterminated == NULL);
  EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
  EXPECT_EQ(status.error_arg(), "(?P<name");

  // Name containing a space.
  Regexp* spaced =
      Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status);
  EXPECT_TRUE(spaced == NULL);
  EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
  EXPECT_EQ(status.error_arg(), "(?P<space bar>");
}
} // namespace re2

View File

@ -0,0 +1,247 @@
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <string.h>
#include <string>
#include <vector>
#include "util/test.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/testing/exhaustive_tester.h"
#include "re2/testing/regexp_generator.h"
#include "re2/testing/string_generator.h"
namespace re2 {
// Sanity check that std::string ordering treats bytes as unsigned: the byte
// 0x70 must sort before 0xA0. PossibleMatchRange itself does not rely on
// this, but code consuming its min/max strings probably will.
TEST(CplusplusStrings, EightBit) {
  const std::string low("\x70");
  const std::string high("\xA0");
  EXPECT_LT(low, high);
}
// One hand-written PossibleMatchRange case
// (consumed by TEST(PossibleMatchRange, HandWritten)).
struct PrefixTest {
const char* regexp;  // pattern handed to Regexp::Parse / RE2
int maxlen;  // passed as the third argument of PossibleMatchRange
const char* min;  // expected value of the returned min string
const char* max;  // expected value of the returned max string
};
// Hand-written cases: { regexp, maxlen, expected min, expected max }.
// The same set of patterns appears four times: plain, with a (?i) prefix,
// with a \A prefix, and with both.
static PrefixTest tests[] = {
{ "", 10, "", "", },
{ "Abcdef", 10, "Abcdef", "Abcdef" },
{ "abc(def|ghi)", 10, "abcdef", "abcghi" },
{ "a+hello", 10, "aa", "ahello" },
{ "a*hello", 10, "a", "hello" },
{ "def|abc", 10, "abc", "def" },
{ "a(b)(c)[d]", 10, "abcd", "abcd" },
{ "ab(cab|cat)", 10, "abcab", "abcat" },
{ "ab(cab|ca)x", 10, "abcabx", "abcax" },
{ "(ab|x)(c|de)", 10, "abc", "xde" },
{ "(ab|x)?(c|z)?", 10, "", "z" },
{ "[^\\s\\S]", 10, "", "" },
{ "(abc)+", 5, "abc", "abcac" },
{ "(abc)+", 2, "ab", "ac" },
{ "(abc)+", 1, "a", "b" },
{ "[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
{ "a*", 10, "", "ab" },
{ "(?i)Abcdef", 10, "ABCDEF", "abcdef" },
{ "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" },
{ "(?i)a+hello", 10, "AA", "ahello" },
{ "(?i)a*hello", 10, "A", "hello" },
{ "(?i)def|abc", 10, "ABC", "def" },
{ "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" },
{ "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" },
{ "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" },
{ "(?i)(ab|x)(c|de)", 10, "ABC", "xde" },
{ "(?i)(ab|x)?(c|z)?", 10, "", "z" },
{ "(?i)[^\\s\\S]", 10, "", "" },
{ "(?i)(abc)+", 5, "ABC", "abcac" },
{ "(?i)(abc)+", 2, "AB", "ac" },
{ "(?i)(abc)+", 1, "A", "b" },
{ "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
{ "(?i)a*", 10, "", "ab" },
{ "(?i)A*", 10, "", "ab" },
{ "\\AAbcdef", 10, "Abcdef", "Abcdef" },
{ "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" },
{ "\\Aa+hello", 10, "aa", "ahello" },
{ "\\Aa*hello", 10, "a", "hello" },
{ "\\Adef|abc", 10, "abc", "def" },
{ "\\Aa(b)(c)[d]", 10, "abcd", "abcd" },
{ "\\Aab(cab|cat)", 10, "abcab", "abcat" },
{ "\\Aab(cab|ca)x", 10, "abcabx", "abcax" },
{ "\\A(ab|x)(c|de)", 10, "abc", "xde" },
{ "\\A(ab|x)?(c|z)?", 10, "", "z" },
{ "\\A[^\\s\\S]", 10, "", "" },
{ "\\A(abc)+", 5, "abc", "abcac" },
{ "\\A(abc)+", 2, "ab", "ac" },
{ "\\A(abc)+", 1, "a", "b" },
{ "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
{ "\\Aa*", 10, "", "ab" },
{ "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" },
{ "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" },
{ "(?i)\\Aa+hello", 10, "AA", "ahello" },
{ "(?i)\\Aa*hello", 10, "A", "hello" },
{ "(?i)\\Adef|abc", 10, "ABC", "def" },
{ "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" },
{ "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" },
{ "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" },
{ "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" },
{ "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" },
{ "(?i)\\A[^\\s\\S]", 10, "", "" },
{ "(?i)\\A(abc)+", 5, "ABC", "abcac" },
{ "(?i)\\A(abc)+", 2, "AB", "ac" },
{ "(?i)\\A(abc)+", 1, "A", "b" },
{ "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
{ "(?i)\\Aa*", 10, "", "ab" },
{ "(?i)\\AA*", 10, "", "ab" },
};
// Runs every hand-written case twice, once through Prog::PossibleMatchRange
// on a directly compiled program and once through RE2::PossibleMatchRange,
// and checks that both report the expected min and max strings.
TEST(PossibleMatchRange, HandWritten) {
  for (const PrefixTest& t : tests) {
    for (int pass = 0; pass < 2; pass++) {
      std::string min, max;
      if (pass != 0) {
        // Second pass: go through the public RE2 wrapper.
        ASSERT_TRUE(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
      } else {
        // First pass: parse and compile by hand, then query the Prog.
        LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
        Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
        ASSERT_TRUE(re != NULL);
        Prog* prog = re->CompileToProg(0);
        ASSERT_TRUE(prog != NULL);
        ASSERT_TRUE(prog->PossibleMatchRange(&min, &max, t.maxlen))
            << " " << t.regexp;
        delete prog;
        re->Decref();
      }
      EXPECT_EQ(t.min, min) << t.regexp;
      EXPECT_EQ(t.max, max) << t.regexp;
    }
  }
}
// Cases in which PossibleMatchRange is expected to return false.
TEST(PossibleMatchRange, Failures) {
  std::string min, max;

  // maxlen of 0 leaves no room to write max.
  EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));

  // These have no usable max: any non-empty string matches or begins a
  // match. Latin-1 is required, because no valid UTF-8 string begins with
  // the byte 0xFF.
  static const char* const kUnboundedLatin1[] = {
    "[\\s\\S]+",
    "[\\0-\xFF]+",
    ".+hello",
    ".*hello",
    ".*",
  };
  for (const char* pattern : kUnboundedLatin1) {
    EXPECT_FALSE(RE2(pattern, RE2::Latin1).
                 PossibleMatchRange(&min, &max, 10))
        << "min=" << CEscape(min) << ", max=" << CEscape(max);
  }

  // Same situation with the default options, using \C to match any byte.
  EXPECT_FALSE(RE2("\\C*").
               PossibleMatchRange(&min, &max, 10))
      << "min=" << CEscape(min) << ", max=" << CEscape(max);

  // A malformed regexp cannot provide a range either.
  EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
      << "min=" << CEscape(min) << ", max=" << CEscape(max);
}
// Exhaustive test: generate all regexps within parameters,
// then generate all strings of a given length over a given alphabet,
// then check that the prefix information agrees with whether
// the regexp matches each of the strings.
//
// The regexp side comes from the RegexpGenerator base class, which calls
// HandleRegexp for each generated regexp; the string side comes from the
// StringGenerator member.
class PossibleMatchTester : public RegexpGenerator {
public:
// maxatoms, maxops, alphabet and ops are forwarded to RegexpGenerator;
// maxstrlen and stralphabet configure the string generator.
// Both counters start at zero.
PossibleMatchTester(int maxatoms,
int maxops,
const std::vector<std::string>& alphabet,
const std::vector<std::string>& ops,
int maxstrlen,
const std::vector<std::string>& stralphabet)
: RegexpGenerator(maxatoms, maxops, alphabet, ops),
strgen_(maxstrlen, stralphabet),
regexps_(0), tests_(0) { }
// Counter accessors, reported by the test once generation is finished.
int regexps() { return regexps_; }
int tests() { return tests_; }
// Needed for RegexpGenerator interface.
void HandleRegexp(const std::string& regexp);
private:
StringGenerator strgen_;
int regexps_; // Number of HandleRegexp calls
int tests_; // Number of regexp tests.
// Not copyable.
PossibleMatchTester(const PossibleMatchTester&) = delete;
PossibleMatchTester& operator=(const PossibleMatchTester&) = delete;
};
// Handles one generated regexp: computes its possible match range, then
// asserts that every generated string the regexp fully matches lies within
// [min, max].
void PossibleMatchTester::HandleRegexp(const std::string& regexp) {
  regexps_++;
  VLOG(3) << CEscape(regexp);

  RE2 re(regexp, RE2::Latin1);
  ASSERT_EQ(re.error(), "");

  std::string min, max;
  const bool have_range = re.PossibleMatchRange(&min, &max, 10);
  if (!have_range) {
    // "\\C*" has no good max. A substring search is used rather than an
    // exact comparison because the operator is sometimes embedded in a
    // larger expression.
    if (strstr(regexp.c_str(), "\\C*") != NULL) {
      return;
    }
    LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
  }

  for (strgen_.Reset(); strgen_.HasNext(); ) {
    const StringPiece& s = strgen_.Next();
    tests_++;
    if (RE2::FullMatch(s, re)) {
      ASSERT_GE(s, min) << " regexp: " << regexp << " max: " << max;
      ASSERT_LE(s, max) << " regexp: " << regexp << " min: " << min;
    }
  }
}
// Drives PossibleMatchTester over all small egrep-style regexps built from
// the atoms "a", "b" and "[0-9]", checked against strings over "ab4".
// Debug builds use smaller limits.
TEST(PossibleMatchRange, Exhaustive) {
  const int natom = RE2_DEBUG_MODE ? 2 : 3;
  const int noperator = 3;
  const int stringlen = RE2_DEBUG_MODE ? 3 : 5;

  PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
                        RegexpGenerator::EgrepOps(),
                        stringlen, Explode("ab4"));
  t.Generate();
  LOG(INFO) << t.regexps() << " regexps, "
            << t.tests() << " tests";
}
} // namespace re2

99
extern/re2/re2/testing/random_test.cc vendored Normal file
View File

@ -0,0 +1,99 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Random testing of regular expression matching.
#include <stdio.h>
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
// Flags controlling the random tests; RandomTest below reads them as
// FLAGS_regexpseed, FLAGS_regexpcount, FLAGS_stringseed and FLAGS_stringcount.
DEFINE_int32(regexpseed, 404, "Random regexp seed.");
DEFINE_int32(regexpcount, 100, "How many random regexps to generate.");
DEFINE_int32(stringseed, 200, "Random string seed.");
DEFINE_int32(stringcount, 100, "How many random strings to generate.");
namespace re2 {
// Runs a random test on the given parameters.
// (Always uses the same random seeds for reproducibility.
// Can give different seeds on command line.)
static void RandomTest(int maxatoms, int maxops,
const std::vector<std::string>& alphabet,
const std::vector<std::string>& ops,
int maxstrlen,
const std::vector<std::string>& stralphabet,
const std::string& wrapper) {
// Limit to smaller test cases in debug mode,
// because everything is so much slower.
if (RE2_DEBUG_MODE) {
maxatoms--;
maxops--;
maxstrlen /= 2;
}
ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
maxstrlen, stralphabet, wrapper, "");
t.RandomStrings(FLAGS_stringseed, FLAGS_stringcount);
t.GenerateRandom(FLAGS_regexpseed, FLAGS_regexpcount);
printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
EXPECT_EQ(0, t.failures());
}
// Random small regexps (up to 5 atoms, 5 operators) built from the literals
// a, b, c and "." with the egrep operators.
TEST(Random, SmallEgrepLiterals) {
  const std::vector<std::string> atoms = Explode("abc.");
  const std::vector<std::string> string_alphabet = Explode("abc");
  RandomTest(5, 5, atoms, RegexpGenerator::EgrepOps(),
             15, string_alphabet, "");
}
// Random bigger regexps (up to 10 atoms, 10 operators) built from the
// literals a, b, c and "." with the egrep operators.
TEST(Random, BigEgrepLiterals) {
  const std::vector<std::string> atoms = Explode("abc.");
  const std::vector<std::string> string_alphabet = Explode("abc");
  RandomTest(10, 10, atoms, RegexpGenerator::EgrepOps(),
             15, string_alphabet, "");
}
// Random small regexps (up to 5 atoms, 5 operators) whose atoms are a
// literal, a capturing group "(b)" and ".", combined with egrep operators.
TEST(Random, SmallEgrepCaptures) {
  const std::vector<std::string> atoms = Split(" ", "a (b) .");
  const std::vector<std::string> string_alphabet = Explode("abc");
  RandomTest(5, 5, atoms, RegexpGenerator::EgrepOps(),
             15, string_alphabet, "");
}
// Random bigger regexps (up to 10 atoms, 10 operators) whose atoms are a
// literal, a capturing group "(b)" and ".", combined with egrep operators.
TEST(Random, BigEgrepCaptures) {
  const std::vector<std::string> atoms = Split(" ", "a (b) .");
  const std::vector<std::string> string_alphabet = Explode("abc");
  RandomTest(10, 10, atoms, RegexpGenerator::EgrepOps(),
             15, string_alphabet, "");
}
// Random large expressions using the full operator set, a few literals,
// parenthesized literals and the predefined classes such as \d.
// (Larger character classes are left out to keep the search space down.)
TEST(Random, Complicated) {
  // Every operator template, including non-greedy and counted repetition.
  const std::vector<std::string> operators(Split(" ",
      "%s%s %s|%s %s* %s*? %s+ %s+? %s? %s?? "
      "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} %s{1,2} "
      "%s{2} %s{2,} %s{3,4} %s{4,5}"));

  // \b, \B, ^ and $ are wrapped in (?:...) because PCRE rejects forms
  // such as \b* but accepts (?:\b)*.
  const std::vector<std::string> atom_list(Split(" ",
      ". (?:^) (?:$) \\a \\f \\n \\r \\t \\v "
      "\\d \\D \\s \\S \\w \\W (?:\\b) (?:\\B) "
      "a (a) b c - \\\\"));

  const std::vector<std::string> string_alphabet(
      Explode("abc123\001\002\003\t\r\n\v\f\a"));

  RandomTest(10, 10, atom_list, operators, 20, string_alphabet, "");
}
} // namespace re2

135
extern/re2/re2/testing/re2_arg_test.cc vendored Normal file
View File

@ -0,0 +1,135 @@
// Copyright 2005 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This tests to make sure numbers are parsed from strings
// correctly.
// Todo: Expand the test to validate strings parsed to the other types
// supported by RE2::Arg class
#include <stdint.h>
#include <string.h>
#include "util/test.h"
#include "re2/re2.h"
namespace re2 {
// One row of the parse-expectation table used by the RE2ArgTest cases.
struct SuccessTable {
const char * value_string;  // text handed to RE2::Arg::Parse
int64_t value;  // expected result, cast to the target type before comparing
bool success[6];  // expected Parse() return value, indexed by type column
};
// Test boundary cases for different integral sizes.
// Specifically, make sure that values outside the boundaries
// of an integral type will fail and that negative numbers will fail
// for unsigned types. The following table contains the boundaries for
// the various integral types and has entries for whether or not each
// type can contain the given value.
const SuccessTable kSuccessTable[] = {
// Columns: string, integer value, then the success flags in the order
// i16, u16, i32, u32, i64, u64.
// 0 to 2^7-1
{ "0", 0, { true, true, true, true, true, true }},
{ "127", 127, { true, true, true, true, true, true }},
// -1 to -2^7
{ "-1", -1, { true, false, true, false, true, false }},
{ "-128", -128, { true, false, true, false, true, false }},
// 2^7 to 2^8-1
{ "128", 128, { true, true, true, true, true, true }},
{ "255", 255, { true, true, true, true, true, true }},
// 2^8 to 2^15-1
{ "256", 256, { true, true, true, true, true, true }},
{ "32767", 32767, { true, true, true, true, true, true }},
// -2^7-1 to -2^15
{ "-129", -129, { true, false, true, false, true, false }},
{ "-32768", -32768, { true, false, true, false, true, false }},
// 2^15 to 2^16-1
{ "32768", 32768, { false, true, true, true, true, true }},
{ "65535", 65535, { false, true, true, true, true, true }},
// 2^16 to 2^31-1
{ "65536", 65536, { false, false, true, true, true, true }},
{ "2147483647", 2147483647, { false, false, true, true, true, true }},
// -2^15-1 to -2^31
{ "-32769", -32769, { false, false, true, false, true, false }},
{ "-2147483648", static_cast<int64_t>(0xFFFFFFFF80000000LL),
{ false, false, true, false, true, false }},
// 2^31 to 2^32-1
{ "2147483648", 2147483648U, { false, false, false, true, true, true }},
{ "4294967295", 4294967295U, { false, false, false, true, true, true }},
// 2^32 to 2^63-1
{ "4294967296", 4294967296LL, { false, false, false, false, true, true }},
{ "9223372036854775807",
9223372036854775807LL, { false, false, false, false, true, true }},
// -2^31-1 to -2^63
{ "-2147483649", -2147483649LL, { false, false, false, false, true, false }},
{ "-9223372036854775808", static_cast<int64_t>(0x8000000000000000LL),
{ false, false, false, false, true, false }},
// 2^63 to 2^64-1
{ "9223372036854775808", static_cast<int64_t>(9223372036854775808ULL),
{ false, false, false, false, false, true }},
{ "18446744073709551615", static_cast<int64_t>(18446744073709551615ULL),
{ false, false, false, false, false, true }},
// >= 2^64
{ "18446744073709551616", 0, { false, false, false, false, false, false }},
};
// Number of rows in kSuccessTable.
const int kNumStrings = arraysize(kSuccessTable);
// PARSE_FOR_TYPE(type, column) parses every kSuccessTable row into a variable
// of the given type via RE2::Arg and expects Parse() to return
// success[column]; for rows expected to succeed it also expects the parsed
// value to equal the row's value cast to that type.
//
// It's ugly to use a macro, but we apparently can't use the EXPECT_EQ
// macro outside of a TEST block and this seems to be the only way to
// avoid code duplication. The macro also uses stringification (#type) to
// put the type name into the failure message.
#define PARSE_FOR_TYPE(type, column) { \
type r; \
for (int i = 0; i < kNumStrings; ++i) { \
RE2::Arg arg(&r); \
const char* const p = kSuccessTable[i].value_string; \
bool retval = arg.Parse(p, strlen(p)); \
bool success = kSuccessTable[i].success[column]; \
EXPECT_EQ(retval, success) \
<< "Parsing '" << p << "' for type " #type " should return " \
<< success; \
if (success) { \
EXPECT_EQ(r, (type)kSuccessTable[i].value); \
} \
} \
}
// Parses every table row as int16_t and checks it against success column 0.
TEST(RE2ArgTest, Int16Test) {
PARSE_FOR_TYPE(int16_t, 0);
}
// Parses every table row as uint16_t and checks it against success column 1.
TEST(RE2ArgTest, Uint16Test) {
PARSE_FOR_TYPE(uint16_t, 1);
}
// Parses every table row as int32_t and checks it against success column 2.
TEST(RE2ArgTest, Int32Test) {
PARSE_FOR_TYPE(int32_t, 2);
}
// Parses every table row as uint32_t and checks it against success column 3.
TEST(RE2ArgTest, Uint32Test) {
PARSE_FOR_TYPE(uint32_t, 3);
}
// Parses every table row as int64_t and checks it against success column 4.
TEST(RE2ArgTest, Int64Test) {
PARSE_FOR_TYPE(int64_t, 4);
}
// Parses every table row as uint64_t and checks it against success column 5.
TEST(RE2ArgTest, Uint64Test) {
PARSE_FOR_TYPE(uint64_t, 5);
}
} // namespace re2

1631
extern/re2/re2/testing/re2_test.cc vendored Normal file

File diff suppressed because it is too large Load Diff

1586
extern/re2/re2/testing/regexp_benchmark.cc vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,276 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression generator: generates all possible
// regular expressions within parameters (see regexp_generator.h for details).
// The regexp generator first generates a sequence of commands in a simple
// postfix language. Each command in the language is a string,
// like "a" or "%s*" or "%s|%s".
//
// To evaluate a command, enough arguments are popped from the value stack to
// plug into the %s slots. Then the result is pushed onto the stack.
// For example, the command sequence
// a b %s%s c
// results in the stack
// ab c
//
// GeneratePostfix generates all possible command sequences.
// Then RunPostfix turns each sequence into a regular expression
// and passes the regexp to HandleRegexp.
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <memory>
#include <stack>
#include <string>
#include <vector>
#include "util/test.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/testing/regexp_generator.h"
namespace re2 {
// Returns the egrep operator templates: concatenation, alternation, star,
// plus, question mark, and a trailing \C*. The vector is built once, on the
// first call, and the same object is returned every time.
const std::vector<std::string>& RegexpGenerator::EgrepOps() {
  static const std::vector<std::string> egrep_ops = {
    "%s%s",
    "%s|%s",
    "%s*",
    "%s+",
    "%s?",
    "%s\\C*",
  };
  return egrep_ops;
}
// Stores the generation limits and the atom/operator vocabularies.
// Degenerate case: an empty atom list forces the atom limit to 0, and an
// empty operator list forces the operator limit to 0.
RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
                                 const std::vector<std::string>& atoms,
                                 const std::vector<std::string>& ops)
    : maxatoms_(atoms.empty() ? 0 : maxatoms),
      maxops_(ops.empty() ? 0 : maxops),
      atoms_(atoms),
      ops_(ops) {
}
// Enumerates every regular expression allowed by the configured limits.
// HandleRegexp is invoked for each one (via GeneratePostfix / RunPostfix).
void RegexpGenerator::Generate() {
  std::vector<std::string> sequence;  // starts as the empty command sequence
  GeneratePostfix(&sequence, 0, 0, 0);
}
// Seeds the generator's RNG with `seed`, then makes `n` attempts at building
// a random regular expression; each completed one is passed to HandleRegexp.
void RegexpGenerator::GenerateRandom(int32_t seed, int n) {
  rng_.seed(seed);
  for (int remaining = n; remaining > 0; remaining--) {
    std::vector<std::string> sequence;
    GenerateRandomPostfix(&sequence, 0, 0, 0);
  }
}
// Returns how many non-overlapping "%s" placeholders appear in s,
// scanning left to right.
static int CountArgs(const std::string& s) {
  int count = 0;
  for (const char* hit = strstr(s.c_str(), "%s"); hit != NULL;
       hit = strstr(hit + 2, "%s")) {
    count++;
  }
  return count;
}
// Recursively enumerates all postfix command sequences within the limits.
// Every sequence that leaves exactly one value on the stack is handed to
// RunPostfix, which turns it into a regular expression.
//
// Arguments:
//   post:  the command sequence built so far
//   nstk:  how many values executing `post` would leave on the stack
//   ops:   how many operators `post` contains
//   atoms: how many atoms `post` contains
//
// Example: for post = ["a", "b", "%s%s", "c"],
// nstk = 2, ops = 1 and atoms = 3.
//
// Start the recursion with GeneratePostfix([empty vector], 0, 0, 0).
void RegexpGenerator::GeneratePostfix(std::vector<std::string>* post,
                                      int nstk, int ops, int atoms) {
  // A single stack entry means `post` is a complete expression.
  if (nstk == 1) {
    RunPostfix(*post);
  }

  // Prune: too many operators already, or not enough operators left to
  // reduce the stack back to a single expression using binary operators.
  if (ops + nstk - 1 > maxops_) {
    return;
  }

  // Extend with each atom in turn, if the atom budget allows.
  if (atoms < maxatoms_) {
    for (const std::string& atom : atoms_) {
      post->push_back(atom);
      GeneratePostfix(post, nstk + 1, ops, atoms + 1);
      post->pop_back();
    }
  }

  // Extend with each operator whose arguments are available on the stack.
  if (ops >= maxops_) {
    return;
  }
  for (const std::string& fmt : ops_) {
    const int nargs = CountArgs(fmt);
    if (nargs > nstk) {
      continue;
    }
    post->push_back(fmt);
    GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
    post->pop_back();
  }
}
// Generates a random postfix command sequence.
// Stops and returns true once a single sequence has been generated and
// passed to RunPostfix. Returns false when this branch of the search cannot
// be completed within the operator limit, or when nothing can be generated
// at all (see the guard below).
//
// Arguments have the same meaning as for GeneratePostfix:
//   post:  the command sequence built so far
//   nstk:  how many values executing `post` would leave on the stack
//   ops:   how many operators `post` contains
//   atoms: how many atoms `post` contains
bool RegexpGenerator::GenerateRandomPostfix(std::vector<std::string>* post,
                                            int nstk, int ops, int atoms) {
  // Guard against a loop that can never make progress: with an empty stack,
  // no atom budget left, and no usable operator that takes zero arguments,
  // neither branch below can ever push anything, so the for(;;) would spin
  // forever. (GeneratePostfix simply returns in the same situation.)
  if (nstk == 0 && atoms >= maxatoms_) {
    bool can_push = false;
    if (ops < maxops_) {
      for (size_t i = 0; i < ops_.size(); i++) {
        if (CountArgs(ops_[i]) == 0) {
          can_push = true;
          break;
        }
      }
    }
    if (!can_push)
      return false;
  }

  // std::uniform_int_distribution requires a <= b. When ops_ or atoms_ is
  // empty (the degenerate case the constructor allows), size() - 1 would be
  // -1, so clamp the upper bound to 0. The corresponding distribution is
  // never sampled in that case because maxops_ / maxatoms_ is 0.
  const int last_op = ops_.empty() ? 0 : static_cast<int>(ops_.size()) - 1;
  const int last_atom =
      atoms_.empty() ? 0 : static_cast<int>(atoms_.size()) - 1;

  std::uniform_int_distribution<int> random_stop(0, maxatoms_ - atoms);
  std::uniform_int_distribution<int> random_bit(0, 1);
  std::uniform_int_distribution<int> random_ops_index(0, last_op);
  std::uniform_int_distribution<int> random_atoms_index(0, last_atom);

  for (;;) {
    // Stop if we get to a single element, but only sometimes.
    if (nstk == 1 && random_stop(rng_) == 0) {
      RunPostfix(*post);
      return true;
    }

    // Early out: if used too many operators or can't
    // get back down to a single expression on the stack
    // using binary operators, give up.
    if (ops + nstk - 1 > maxops_)
      return false;

    // Add operators if there are enough arguments.
    if (ops < maxops_ && random_bit(rng_) == 0) {
      const std::string& fmt = ops_[random_ops_index(rng_)];
      int nargs = CountArgs(fmt);
      if (nargs <= nstk) {
        post->push_back(fmt);
        bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
                                         ops + 1, atoms);
        post->pop_back();
        if (ret)
          return true;
      }
    }

    // Add atoms if there is room.
    if (atoms < maxatoms_ && random_bit(rng_) == 0) {
      post->push_back(atoms_[random_atoms_index(rng_)]);
      bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
      post->pop_back();
      if (ret)
        return true;
    }
  }
}
// Evaluates the postfix command sequence on a stack of strings to build a
// single regular expression, then passes that expression -- as is and with
// ^ and/or $ anchors around it -- to HandleRegexp.  The result of every
// operator (for example %s|%s) is wrapped in (?: ) so that no precedence
// table has to be maintained.
void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) {
  std::stack<std::string> regexps;

  // Removes and returns the top of the stack.
  auto pop = [&regexps]() {
    std::string top = regexps.top();
    regexps.pop();
    return top;
  };

  for (const std::string& cmd : post) {
    switch (CountArgs(cmd)) {
      default:
        LOG(FATAL) << "Bad operator: " << cmd;
      case 0:
        // No arguments: the command is an atom and is pushed unchanged.
        regexps.push(cmd);
        break;
      case 1: {
        std::string a = pop();
        std::string applied = StringPrintf(cmd.c_str(), a.c_str());
        regexps.push("(?:" + applied + ")");
        break;
      }
      case 2: {
        // The second operand was pushed last, so it comes off first.
        std::string b = pop();
        std::string a = pop();
        std::string applied =
            StringPrintf(cmd.c_str(), a.c_str(), b.c_str());
        regexps.push("(?:" + applied + ")");
        break;
      }
    }
  }

  if (regexps.size() != 1) {
    // Internal error - should never happen.  Dump the program and whatever
    // is left on the stack before failing.
    printf("Bad regexp program:\n");
    for (const std::string& cmd : post)
      printf(" %s\n", CEscape(cmd).c_str());
    printf("Stack after running program:\n");
    for (; !regexps.empty(); regexps.pop())
      printf(" %s\n", CEscape(regexps.top()).c_str());
    LOG(FATAL) << "Bad regexp program.";
  }

  const std::string& re = regexps.top();
  HandleRegexp(re);
  HandleRegexp("^(?:" + re + ")$");
  HandleRegexp("^(?:" + re + ")");
  HandleRegexp("(?:" + re + ")$");
}
// Splits s into a vector of strings, one for each UTF-8 character.
std::vector<std::string> Explode(const StringPiece& s) {
  std::vector<std::string> chars;
  const char* pos = s.begin();
  while (pos < s.end()) {
    const char* start = pos;
    Rune r;
    // chartorune reports how many bytes the character at pos occupies.
    pos += chartorune(&r, pos);
    chars.push_back(std::string(start, pos - start));
  }
  return chars;
}
// Splits s at every occurrence of sep and returns the pieces in order.
// An empty sep splits s into its individual characters (see Explode).
// Text after the last separator is only appended when it is non-empty.
std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s) {
  if (sep.size() == 0)
    return Explode(s);

  std::vector<std::string> pieces;
  const char* piece = s.begin();  // start of the piece being collected
  const char* scan = s.begin();   // candidate position for a separator
  while (scan + sep.size() <= s.end()) {
    if (StringPiece(scan, sep.size()) == sep) {
      pieces.push_back(std::string(piece, scan - piece));
      // Resume scanning right after the separator.
      scan += sep.size();
      piece = scan;
    } else {
      scan++;
    }
  }
  if (piece < s.end())
    pieces.push_back(std::string(piece, s.end() - piece));
  return pieces;
}
} // namespace re2

View File

@ -0,0 +1,77 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_TESTING_REGEXP_GENERATOR_H_
#define RE2_TESTING_REGEXP_GENERATOR_H_
// Regular expression generator: generates all possible
// regular expressions within given parameters (see below for details).
#include <stdint.h>
#include <random>
#include <string>
#include <vector>
#include "util/util.h"
#include "re2/stringpiece.h"
namespace re2 {
// Regular expression generator.
//
// Given a set of atom expressions like "a", "b", or "."
// and operators like "%s*", generates all possible regular expressions
// using at most maxatoms atom expressions and maxops operators.
// For each such expression re, calls HandleRegexp(re).
//
// Callers are expected to subclass RegexpGenerator and provide HandleRegexp.
//
class RegexpGenerator {
 public:
  // maxatoms and maxops bound the number of atoms and operators in each
  // generated expression; atoms and ops are the building blocks to use.
  RegexpGenerator(int maxatoms, int maxops,
                  const std::vector<std::string>& atoms,
                  const std::vector<std::string>& ops);
  virtual ~RegexpGenerator() {}

  // Generates all the regular expressions, calling HandleRegexp(re) for each.
  void Generate();

  // Generates n random regular expressions, calling HandleRegexp(re) for each.
  void GenerateRandom(int32_t seed, int n);

  // Handles a regular expression. Must be provided by subclass.
  virtual void HandleRegexp(const std::string& regexp) = 0;

  // The egrep regexp operators: * + ? | and concatenation.
  static const std::vector<std::string>& EgrepOps();

 private:
  // Interprets a postfix command sequence and hands the resulting
  // regular expression to HandleRegexp.
  void RunPostfix(const std::vector<std::string>& post);

  // Recursive generators over postfix command sequences.  post is the
  // sequence so far; nstk is the stack depth after executing it; ops and
  // atoms are the number of operators and atoms it uses.
  void GeneratePostfix(std::vector<std::string>* post,
                       int nstk, int ops, int atoms);
  bool GenerateRandomPostfix(std::vector<std::string>* post,
                             int nstk, int ops, int atoms);

  int maxatoms_;                    // Maximum number of atoms allowed in expr.
  int maxops_;                      // Maximum number of ops allowed in expr.
  std::vector<std::string> atoms_;  // Possible atoms.
  std::vector<std::string> ops_;    // Possible ops.
  std::minstd_rand0 rng_;           // Random number generator.

  // Not copyable.
  RegexpGenerator(const RegexpGenerator&) = delete;
  RegexpGenerator& operator=(const RegexpGenerator&) = delete;
};
// Helpers for preparing arguments to RegexpGenerator constructor.
// Returns one string for each character in s.
std::vector<std::string> Explode(const StringPiece& s);
// Splits string everywhere sep is found, returning
// vector of pieces.
std::vector<std::string> Split(const StringPiece& sep, const StringPiece& s);
} // namespace re2
#endif // RE2_TESTING_REGEXP_GENERATOR_H_

86
extern/re2/re2/testing/regexp_test.cc vendored Normal file
View File

@ -0,0 +1,86 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test parse.cc, dump.cc, and tostring.cc.
#include <stddef.h>
#include <map>
#include <string>
#include <vector>
#include "util/test.h"
#include "util/logging.h"
#include "re2/regexp.h"
namespace re2 {
// Checks that overflowed reference counts work: after many Incref calls and
// the same number of Decref calls, the count must be back at 1.
TEST(Regexp, BigRef) {
  const int kExtraRefs = 100000;
  Regexp* re = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
  for (int taken = 0; taken < kExtraRefs; taken++)
    re->Incref();
  for (int left = kExtraRefs; left > 0; left--)
    re->Decref();
  ASSERT_EQ(re->Ref(), 1);
  re->Decref();
}
// Checks that a Concat with a very large number of children works.
// Depends on overflowed ref counts working.
TEST(Regexp, BigConcat) {
  Regexp* x = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
  const size_t kChildren = 90000;  // ToString bails out at 100000
  std::vector<Regexp*> subs(kChildren, x);
  // One reference per vector slot; every slot points at x.
  for (Regexp* sub : subs)
    sub->Incref();
  ASSERT_EQ(x->Ref(), 1 + static_cast<int>(subs.size())) << x->Ref();

  Regexp* re = Regexp::Concat(subs.data(), static_cast<int>(subs.size()),
                              Regexp::NoParseFlags);
  ASSERT_EQ(re->ToString(), std::string(subs.size(), 'x'));
  re->Decref();
  ASSERT_EQ(x->Ref(), 1) << x->Ref();
  x->Decref();
}
// Checks the name -> index map returned by NamedCaptures() for a regexp
// that uses the name 'g1' twice.
TEST(Regexp, NamedCaptures) {
  RegexpStatus status;
  Regexp* re = Regexp::Parse(
      "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
  EXPECT_TRUE(status.ok());
  EXPECT_EQ(4, re->NumCaptures());

  const std::map<std::string, int>* have = re->NamedCaptures();
  EXPECT_TRUE(have != NULL);
  EXPECT_EQ(2, have->size());  // there are only two named groups in
                               // the regexp: 'g1' and 'g2'.
  const std::map<std::string, int> want = {{"g1", 1}, {"g2", 3}};
  EXPECT_EQ(want, *have);

  re->Decref();
  delete have;
}
// Checks the index -> name map returned by CaptureNames(); the unnamed
// group (index 2) is expected to be absent from it.
TEST(Regexp, CaptureNames) {
  RegexpStatus status;
  Regexp* re = Regexp::Parse(
      "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
  EXPECT_TRUE(status.ok());
  EXPECT_EQ(4, re->NumCaptures());

  const std::map<int, std::string>* have = re->CaptureNames();
  EXPECT_TRUE(have != NULL);
  EXPECT_EQ(3, have->size());
  const std::map<int, std::string> want = {{1, "g1"}, {3, "g2"}, {4, "g1"}};
  EXPECT_EQ(want, *have);

  re->Decref();
  delete have;
}
} // namespace re2

View File

@ -0,0 +1,72 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <string>
#include "util/test.h"
#include "util/logging.h"
#include "re2/regexp.h"
namespace re2 {
// One RequiredPrefix test case.  The last three fields are only checked
// by the test when return_value is true.
struct PrefixTest {
// Regular expression to parse.
const char* regexp;
// Expected return value of RequiredPrefix().
bool return_value;
// Expected prefix string reported by RequiredPrefix().
const char* prefix;
// Expected fold-case flag reported by RequiredPrefix().
bool foldcase;
// Expected ToString() of the suffix regexp reported by RequiredPrefix().
const char* suffix;
};
// RequiredPrefix test cases.  Entries written as { regexp, false } leave
// prefix, foldcase and suffix value-initialized; the test does not read
// them for those entries.
static PrefixTest tests[] = {
// If the regexp is missing a ^, there's no required prefix.
{ "abc", false },
{ "", false },
{ "(?m)^", false },
// If the regexp immediately goes into
// something not a literal match, there's no required prefix.
{ "^(abc)", false },
{ "^a*", false },
// Otherwise, it should work.
{ "^abc$", true, "abc", false, "(?-m:$)" },
{ "^abc", true, "abc", false, "" },
{ "^(?i)abc", true, "abc", true, "" },
{ "^abcd*", true, "abc", false, "d*" },
{ "^[Aa][Bb]cd*", true, "ab", true, "cd*" },
{ "^ab[Cc]d*", true, "ab", false, "[Cc]d*" },
{ "^☺abc", true, "☺abc", false, "" },
};
// Runs every entry of tests[] twice -- first parsed with the Latin1 flag,
// then without it -- and compares what RequiredPrefix reports against the
// expectations recorded in the entry.
TEST(RequiredPrefix, SimpleTests) {
  for (const PrefixTest& t : tests) {
    for (int pass = 0; pass < 2; pass++) {
      const bool latin1 = (pass == 0);
      const char* encoding = latin1 ? "latin1" : "utf";

      Regexp::ParseFlags flags = Regexp::LikePerl;
      if (latin1)
        flags = flags | Regexp::Latin1;
      Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
      ASSERT_TRUE(re != NULL) << " " << t.regexp;

      std::string prefix;
      bool foldcase;
      Regexp* suffix;
      ASSERT_EQ(t.return_value,
                re->RequiredPrefix(&prefix, &foldcase, &suffix))
          << " " << t.regexp << " " << encoding
          << " " << re->Dump();
      if (t.return_value) {
        // The outputs are only inspected when a prefix was reported.
        ASSERT_EQ(prefix, std::string(t.prefix))
            << " " << t.regexp << " " << encoding;
        ASSERT_EQ(foldcase, t.foldcase)
            << " " << t.regexp << " " << encoding;
        ASSERT_EQ(suffix->ToString(), std::string(t.suffix))
            << " " << t.regexp << " " << encoding;
        suffix->Decref();
      }
      re->Decref();
    }
  }
}
} // namespace re2

332
extern/re2/re2/testing/search_test.cc vendored Normal file
View File

@ -0,0 +1,332 @@
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "re2/prog.h"
#include "re2/regexp.h"
#include "re2/testing/tester.h"
#include "re2/testing/exhaustive_tester.h"
// For target `log' in the Makefile.
#ifndef LOGGING
#define LOGGING 0
#endif
namespace re2 {
// One search test case: a regular expression and the text it is run on.
struct RegexpTest {
// Regular expression source.
const char* regexp;
// Text handed to TestRegexpOnText together with regexp.
const char* text;
};
// Table of (regexp, text) pairs exercised by the SearchTests test below.
// Each pair is passed to TestRegexpOnText; the table records no expected
// results of its own.
RegexpTest simple_tests[] = {
{ "a", "a" },
{ "a", "zyzzyva" },
{ "a+", "aa" },
{ "(a+|b)+", "ab" },
{ "ab|cd", "xabcdx" },
{ "h.*od?", "hello\ngoodbye\n" },
{ "h.*o", "hello\ngoodbye\n" },
{ "h.*o", "goodbye\nhello\n" },
{ "h.*o", "hello world" },
{ "h.*o", "othello, world" },
{ "[^\\s\\S]", "aaaaaaa" },
{ "a", "aaaaaaa" },
{ "a*", "aaaaaaa" },
{ "a*", "" },
{ "ab|cd", "xabcdx" },
{ "a", "cab" },
{ "a*b", "cab" },
{ "((((((((((((((((((((x))))))))))))))))))))", "x" },
{ "[abcd]", "xxxabcdxxx" },
{ "[^x]", "xxxabcdxxx" },
{ "[abcd]+", "xxxabcdxxx" },
{ "[^x]+", "xxxabcdxxx" },
{ "(fo|foo)", "fo" },
{ "(foo|fo)", "foo" },
{ "aa", "aA" },
{ "a", "Aa" },
{ "a", "A" },
{ "ABC", "abc" },
{ "abc", "XABCY" },
{ "ABC", "xabcy" },
// Make sure ^ and $ work.
// The pathological cases didn't work
// in the original grep code.
{ "foo|bar|[A-Z]", "foo" },
{ "^(foo|bar|[A-Z])", "foo" },
{ "(foo|bar|[A-Z])$", "foo\n" },
{ "(foo|bar|[A-Z])$", "foo" },
{ "^(foo|bar|[A-Z])$", "foo\n" },
{ "^(foo|bar|[A-Z])$", "foo" },
{ "^(foo|bar|[A-Z])$", "bar" },
{ "^(foo|bar|[A-Z])$", "X" },
{ "^(foo|bar|[A-Z])$", "XY" },
{ "^(fo|foo)$", "fo" },
{ "^(fo|foo)$", "foo" },
{ "^^(fo|foo)$", "fo" },
{ "^^(fo|foo)$", "foo" },
{ "^$", "" },
{ "^$", "x" },
{ "^^$", "" },
{ "^$$", "" },
{ "^^$", "x" },
{ "^$$", "x" },
{ "^^$$", "" },
{ "^^$$", "x" },
{ "^^^^^^^^$$$$$$$$", "" },
{ "^", "x" },
{ "$", "x" },
// Word boundaries.
{ "\\bfoo\\b", "nofoo foo that" },
{ "a\\b", "faoa x" },
{ "\\bbar", "bar x" },
{ "\\bbar", "foo\nbar x" },
{ "bar\\b", "foobar" },
{ "bar\\b", "foobar\nxxx" },
{ "(foo|bar|[A-Z])\\b", "foo" },
{ "(foo|bar|[A-Z])\\b", "foo\n" },
{ "\\b", "" },
{ "\\b", "x" },
{ "\\b(foo|bar|[A-Z])", "foo" },
{ "\\b(foo|bar|[A-Z])\\b", "X" },
{ "\\b(foo|bar|[A-Z])\\b", "XY" },
{ "\\b(foo|bar|[A-Z])\\b", "bar" },
{ "\\b(foo|bar|[A-Z])\\b", "foo" },
{ "\\b(foo|bar|[A-Z])\\b", "foo\n" },
{ "\\b(foo|bar|[A-Z])\\b", "ffoo bbar N x" },
{ "\\b(fo|foo)\\b", "fo" },
{ "\\b(fo|foo)\\b", "foo" },
{ "\\b\\b", "" },
{ "\\b\\b", "x" },
{ "\\b$", "" },
{ "\\b$", "x" },
{ "\\b$", "y x" },
{ "\\b.$", "x" },
{ "^\\b(fo|foo)\\b", "fo" },
{ "^\\b(fo|foo)\\b", "foo" },
{ "^\\b", "" },
{ "^\\b", "x" },
{ "^\\b\\b", "" },
{ "^\\b\\b", "x" },
{ "^\\b$", "" },
{ "^\\b$", "x" },
{ "^\\b.$", "x" },
{ "^\\b.\\b$", "x" },
{ "^^^^^^^^\\b$$$$$$$", "" },
{ "^^^^^^^^\\b.$$$$$$", "x" },
{ "^^^^^^^^\\b$$$$$$$", "x" },
// Non-word boundaries.
{ "\\Bfoo\\B", "n foo xfoox that" },
{ "a\\B", "faoa x" },
{ "\\Bbar", "bar x" },
{ "\\Bbar", "foo\nbar x" },
{ "bar\\B", "foobar" },
{ "bar\\B", "foobar\nxxx" },
{ "(foo|bar|[A-Z])\\B", "foox" },
{ "(foo|bar|[A-Z])\\B", "foo\n" },
{ "\\B", "" },
{ "\\B", "x" },
{ "\\B(foo|bar|[A-Z])", "foo" },
{ "\\B(foo|bar|[A-Z])\\B", "xXy" },
{ "\\B(foo|bar|[A-Z])\\B", "XY" },
{ "\\B(foo|bar|[A-Z])\\B", "XYZ" },
{ "\\B(foo|bar|[A-Z])\\B", "abara" },
{ "\\B(foo|bar|[A-Z])\\B", "xfoo_" },
{ "\\B(foo|bar|[A-Z])\\B", "xfoo\n" },
{ "\\B(foo|bar|[A-Z])\\B", "foo bar vNx" },
{ "\\B(fo|foo)\\B", "xfoo" },
{ "\\B(foo|fo)\\B", "xfooo" },
{ "\\B\\B", "" },
{ "\\B\\B", "x" },
{ "\\B$", "" },
{ "\\B$", "x" },
{ "\\B$", "y x" },
{ "\\B.$", "x" },
{ "^\\B(fo|foo)\\B", "fo" },
{ "^\\B(fo|foo)\\B", "foo" },
{ "^\\B", "" },
{ "^\\B", "x" },
{ "^\\B\\B", "" },
{ "^\\B\\B", "x" },
{ "^\\B$", "" },
{ "^\\B$", "x" },
{ "^\\B.$", "x" },
{ "^\\B.\\B$", "x" },
{ "^^^^^^^^\\B$$$$$$$", "" },
{ "^^^^^^^^\\B.$$$$$$", "x" },
{ "^^^^^^^^\\B$$$$$$$", "x" },
// PCRE uses only ASCII for \b computation.
// All non-ASCII are *not* word characters.
{ "\\bx\\b", "x" },
{ "\\bx\\b", "x>" },
{ "\\bx\\b", "<x" },
{ "\\bx\\b", "<x>" },
{ "\\bx\\b", "ax" },
{ "\\bx\\b", "xb" },
{ "\\bx\\b", "axb" },
{ "\\bx\\b", "«x" },
// NOTE(review): the empty text on the next line sits between the "«x" and
// "«x»" cases; it may have lost a non-ASCII literal -- confirm against the
// upstream RE2 sources.
{ "\\bx\\b", "" },
{ "\\bx\\b", "«x»" },
{ "\\bx\\b", "axb" },
{ "\\bx\\b", "áxβ" },
{ "\\Bx\\B", "axb" },
{ "\\Bx\\B", "áxβ" },
// Weird boundary cases.
{ "^$^$", "" },
{ "^$^", "" },
{ "$^$", "" },
{ "^$^$", "x" },
{ "^$^", "x" },
{ "$^$", "x" },
{ "^$^$", "x\ny" },
{ "^$^", "x\ny" },
{ "$^$", "x\ny" },
{ "^$^$", "x\n\ny" },
{ "^$^", "x\n\ny" },
{ "$^$", "x\n\ny" },
{ "^(foo\\$)$", "foo$bar" },
{ "(foo\\$)", "foo$bar" },
{ "^...$", "abc" },
// UTF-8
{ "^\xe6\x9c\xac$", "\xe6\x9c\xac" },
{ "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
{ "^...$", ".\xe6\x9c\xac." },
{ "^\\C\\C\\C$", "\xe6\x9c\xac" },
{ "^\\C$", "\xe6\x9c\xac" },
{ "^\\C\\C\\C$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
// Latin1
{ "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
{ "^.........$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
{ "^...$", ".\xe6\x9c\xac." },
{ "^.....$", ".\xe6\x9c\xac." },
// Perl v Posix
{ "\\B(fo|foo)\\B", "xfooo" },
{ "(fo|foo)", "foo" },
// Octal escapes.
{ "\\141", "a" },
{ "\\060", "0" },
{ "\\0600", "00" },
{ "\\608", "08" },
{ "\\01", "\01" },
{ "\\018", "\01" "8" },
// Hexadecimal escapes
{ "\\x{61}", "a" },
{ "\\x61", "a" },
{ "\\x{00000061}", "a" },
// Unicode scripts.
{ "\\p{Greek}+", "aαβb" },
{ "\\P{Greek}+", "aαβb" },
{ "\\p{^Greek}+", "aαβb" },
{ "\\P{^Greek}+", "aαβb" },
// Unicode properties. Nd is decimal number. N is any number.
{ "[^0-9]+", "abc123" },
{ "\\p{Nd}+", "abc123²³¼½¾₀₉" },
{ "\\p{^Nd}+", "abc123²³¼½¾₀₉" },
{ "\\P{Nd}+", "abc123²³¼½¾₀₉" },
{ "\\P{^Nd}+", "abc123²³¼½¾₀₉" },
{ "\\pN+", "abc123²³¼½¾₀₉" },
{ "\\p{N}+", "abc123²³¼½¾₀₉" },
{ "\\p{^N}+", "abc123²³¼½¾₀₉" },
{ "\\p{Any}+", "abc123" },
// Character classes & case folding.
{ "(?i)[@-A]+", "@AaB" }, // matches @Aa but not B
{ "(?i)[A-Z]+", "aAzZ" },
{ "(?i)[^\\\\]+", "Aa\\" }, // \\ is between A-Z and a-z -
// splits the ranges in an interesting way.
// would like to use, but PCRE mishandles in full-match, non-greedy mode
// { "(?i)[\\\\]+", "Aa" },
{ "(?i)[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },
// Character classes & case folding.
{ "[@-A]+", "@AaB" },
{ "[A-Z]+", "aAzZ" },
{ "[^\\\\]+", "Aa\\" },
{ "[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },
// Anchoring. (^abc in aabcdef was a former bug)
// The tester checks for a match in the text and
// subpieces of the text with a byte removed on either side.
{ "^abc", "abcdef" },
{ "^abc", "aabcdef" },
{ "^[ay]*[bx]+c", "abcdef" },
{ "^[ay]*[bx]+c", "aabcdef" },
{ "def$", "abcdef" },
{ "def$", "abcdeff" },
{ "d[ex][fy]$", "abcdef" },
{ "d[ex][fy]$", "abcdeff" },
{ "[dz][ex][fy]$", "abcdef" },
{ "[dz][ex][fy]$", "abcdeff" },
{ "(?m)^abc", "abcdef" },
{ "(?m)^abc", "aabcdef" },
{ "(?m)^[ay]*[bx]+c", "abcdef" },
{ "(?m)^[ay]*[bx]+c", "aabcdef" },
{ "(?m)def$", "abcdef" },
{ "(?m)def$", "abcdeff" },
{ "(?m)d[ex][fy]$", "abcdef" },
{ "(?m)d[ex][fy]$", "abcdeff" },
{ "(?m)[dz][ex][fy]$", "abcdef" },
{ "(?m)[dz][ex][fy]$", "abcdeff" },
{ "^", "a" },
{ "^^", "a" },
// Context.
// The tester checks for a match in the text and
// subpieces of the text with a byte removed on either side.
{ "a", "a" },
{ "ab*", "a" },
{ "a\\C*", "a" },
{ "a\\C+", "a" },
{ "a\\C?", "a" },
{ "a\\C*?", "a" },
{ "a\\C+?", "a" },
{ "a\\C??", "a" },
// Former bugs.
{ "a\\C*|ba\\C", "baba" },
{ "\\w*I\\w*", "Inc." },
};
// Runs TestRegexpOnText on every entry of simple_tests and expects no
// failures.  When LOGGING is non-zero each entry is additionally replayed
// through ExhaustiveTest so that the test case gets logged.
TEST(Regexp, SearchTests) {
  int failures = 0;
  for (const RegexpTest& t : simple_tests) {
    const bool passed = TestRegexpOnText(t.regexp, t.text);
    if (!passed)
      failures++;

    if (LOGGING) {
      // Build a dummy ExhaustiveTest call that will trigger just
      // this one test, so that we log the test case.
      std::vector<std::string> atom(1, std::string(t.regexp));
      std::vector<std::string> alpha(1, std::string(t.text));
      std::vector<std::string> ops;
      ExhaustiveTest(1, 0, atom, ops, 1, alpha, "", "");
    }
  }
  EXPECT_EQ(failures, 0);
}
} // namespace re2

204
extern/re2/re2/testing/set_test.cc vendored Normal file
View File

@ -0,0 +1,204 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stddef.h>
#include <string>
#include <vector>
#include "util/test.h"
#include "util/logging.h"
#include "re2/re2.h"
#include "re2/set.h"
namespace re2 {
// Unanchored set holding "foo" and "bar".
TEST(Set, Unanchored) {
  RE2::Set set(RE2::DefaultOptions, RE2::UNANCHORED);

  // Adding "(" is expected to return -1; the next pattern still gets
  // index 1.
  ASSERT_EQ(set.Add("foo", NULL), 0);
  ASSERT_EQ(set.Add("(", NULL), -1);
  ASSERT_EQ(set.Add("bar", NULL), 1);
  ASSERT_EQ(set.Compile(), true);

  // Match without collecting indices.
  ASSERT_EQ(set.Match("foobar", NULL), true);
  ASSERT_EQ(set.Match("fooba", NULL), true);
  ASSERT_EQ(set.Match("oobar", NULL), true);

  // Match collecting the indices of the matching patterns.
  std::vector<int> matched;
  ASSERT_EQ(set.Match("foobar", &matched), true);
  ASSERT_EQ(matched.size(), 2);
  ASSERT_EQ(matched[0], 0);
  ASSERT_EQ(matched[1], 1);

  ASSERT_EQ(set.Match("fooba", &matched), true);
  ASSERT_EQ(matched.size(), 1);
  ASSERT_EQ(matched[0], 0);

  ASSERT_EQ(set.Match("oobar", &matched), true);
  ASSERT_EQ(matched.size(), 1);
  ASSERT_EQ(matched[0], 1);
}
// Unanchored set whose two patterns, "foo" and "foobar", share a prefix.
TEST(Set, UnanchoredFactored) {
  RE2::Set set(RE2::DefaultOptions, RE2::UNANCHORED);

  ASSERT_EQ(set.Add("foo", NULL), 0);
  ASSERT_EQ(set.Add("(", NULL), -1);
  ASSERT_EQ(set.Add("foobar", NULL), 1);
  ASSERT_EQ(set.Compile(), true);

  // Match without collecting indices.
  ASSERT_EQ(set.Match("foobar", NULL), true);
  ASSERT_EQ(set.Match("obarfoobaroo", NULL), true);
  ASSERT_EQ(set.Match("fooba", NULL), true);
  ASSERT_EQ(set.Match("oobar", NULL), false);

  // Match collecting the indices of the matching patterns.
  std::vector<int> matched;
  ASSERT_EQ(set.Match("foobar", &matched), true);
  ASSERT_EQ(matched.size(), 2);
  ASSERT_EQ(matched[0], 0);
  ASSERT_EQ(matched[1], 1);

  ASSERT_EQ(set.Match("obarfoobaroo", &matched), true);
  ASSERT_EQ(matched.size(), 2);
  ASSERT_EQ(matched[0], 0);
  ASSERT_EQ(matched[1], 1);

  ASSERT_EQ(set.Match("fooba", &matched), true);
  ASSERT_EQ(matched.size(), 1);
  ASSERT_EQ(matched[0], 0);

  ASSERT_EQ(set.Match("oobar", &matched), false);
  ASSERT_EQ(matched.size(), 0);
}
// Unanchored set with a single pattern that ends in $.
TEST(Set, UnanchoredDollar) {
  RE2::Set set(RE2::DefaultOptions, RE2::UNANCHORED);

  ASSERT_EQ(set.Add("foo$", NULL), 0);
  ASSERT_EQ(set.Compile(), true);

  // Match without collecting indices.
  ASSERT_EQ(set.Match("foo", NULL), true);
  ASSERT_EQ(set.Match("foobar", NULL), false);

  // Match collecting the indices of the matching patterns.
  std::vector<int> matched;
  ASSERT_EQ(set.Match("foo", &matched), true);
  ASSERT_EQ(matched.size(), 1);
  ASSERT_EQ(matched[0], 0);

  ASSERT_EQ(set.Match("foobar", &matched), false);
  ASSERT_EQ(matched.size(), 0);
}
// Unanchored set with a single pattern that ends in a word boundary.
TEST(Set, UnanchoredWordBoundary) {
  RE2::Set set(RE2::DefaultOptions, RE2::UNANCHORED);

  ASSERT_EQ(set.Add("foo\\b", NULL), 0);
  ASSERT_EQ(set.Compile(), true);

  // Match without collecting indices.
  ASSERT_EQ(set.Match("foo", NULL), true);
  ASSERT_EQ(set.Match("foobar", NULL), false);
  ASSERT_EQ(set.Match("foo bar", NULL), true);

  // Match collecting the indices of the matching patterns.
  std::vector<int> matched;
  ASSERT_EQ(set.Match("foo", &matched), true);
  ASSERT_EQ(matched.size(), 1);
  ASSERT_EQ(matched[0], 0);

  ASSERT_EQ(set.Match("foobar", &matched), false);
  ASSERT_EQ(matched.size(), 0);

  ASSERT_EQ(set.Match("foo bar", &matched), true);
  ASSERT_EQ(matched.size(), 1);
  ASSERT_EQ(matched[0], 0);
}
// Set anchored at both ends: only the exact texts "foo" and "bar" match.
TEST(Set, Anchored) {
  RE2::Set set(RE2::DefaultOptions, RE2::ANCHOR_BOTH);

  ASSERT_EQ(set.Add("foo", NULL), 0);
  ASSERT_EQ(set.Add("(", NULL), -1);
  ASSERT_EQ(set.Add("bar", NULL), 1);
  ASSERT_EQ(set.Compile(), true);

  // Match without collecting indices.
  ASSERT_EQ(set.Match("foobar", NULL), false);
  ASSERT_EQ(set.Match("fooba", NULL), false);
  ASSERT_EQ(set.Match("oobar", NULL), false);
  ASSERT_EQ(set.Match("foo", NULL), true);
  ASSERT_EQ(set.Match("bar", NULL), true);

  // Match collecting the indices of the matching patterns.
  std::vector<int> matched;
  ASSERT_EQ(set.Match("foobar", &matched), false);
  ASSERT_EQ(matched.size(), 0);

  ASSERT_EQ(set.Match("fooba", &matched), false);
  ASSERT_EQ(matched.size(), 0);

  ASSERT_EQ(set.Match("oobar", &matched), false);
  ASSERT_EQ(matched.size(), 0);

  ASSERT_EQ(set.Match("foo", &matched), true);
  ASSERT_EQ(matched.size(), 1);
  ASSERT_EQ(matched[0], 0);

  ASSERT_EQ(set.Match("bar", &matched), true);
  ASSERT_EQ(matched.size(), 1);
  ASSERT_EQ(matched[0], 1);
}
// An unanchored set with no patterns compiles and matches nothing.
TEST(Set, EmptyUnanchored) {
  RE2::Set set(RE2::DefaultOptions, RE2::UNANCHORED);

  ASSERT_EQ(set.Compile(), true);

  // Match without collecting indices.
  ASSERT_EQ(set.Match("", NULL), false);
  ASSERT_EQ(set.Match("foobar", NULL), false);

  // Match collecting the indices of the matching patterns.
  std::vector<int> matched;
  ASSERT_EQ(set.Match("", &matched), false);
  ASSERT_EQ(matched.size(), 0);

  ASSERT_EQ(set.Match("foobar", &matched), false);
  ASSERT_EQ(matched.size(), 0);
}
// A set anchored at both ends with no patterns compiles and matches nothing.
TEST(Set, EmptyAnchored) {
  RE2::Set set(RE2::DefaultOptions, RE2::ANCHOR_BOTH);

  ASSERT_EQ(set.Compile(), true);

  // Match without collecting indices.
  ASSERT_EQ(set.Match("", NULL), false);
  ASSERT_EQ(set.Match("foobar", NULL), false);

  // Match collecting the indices of the matching patterns.
  std::vector<int> matched;
  ASSERT_EQ(set.Match("", &matched), false);
  ASSERT_EQ(matched.size(), 0);

  ASSERT_EQ(set.Match("foobar", &matched), false);
  ASSERT_EQ(matched.size(), 0);
}
// Set anchored at both ends with one pattern: a literal prefix followed by
// optional digits.
TEST(Set, Prefix) {
  RE2::Set set(RE2::DefaultOptions, RE2::ANCHOR_BOTH);

  ASSERT_EQ(set.Add("/prefix/\\d*", NULL), 0);
  ASSERT_EQ(set.Compile(), true);

  // Match without collecting indices.
  ASSERT_EQ(set.Match("/prefix", NULL), false);
  ASSERT_EQ(set.Match("/prefix/", NULL), true);
  ASSERT_EQ(set.Match("/prefix/42", NULL), true);

  // Match collecting the indices of the matching patterns.
  std::vector<int> matched;
  ASSERT_EQ(set.Match("/prefix", &matched), false);
  ASSERT_EQ(matched.size(), 0);

  ASSERT_EQ(set.Match("/prefix/", &matched), true);
  ASSERT_EQ(matched.size(), 1);
  ASSERT_EQ(matched[0], 0);

  ASSERT_EQ(set.Match("/prefix/42", &matched), true);
  ASSERT_EQ(matched.size(), 1);
  ASSERT_EQ(matched[0], 0);
}
} // namespace re2

273
extern/re2/re2/testing/simplify_test.cc vendored Normal file
View File

@ -0,0 +1,273 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test simplify.cc.
#include <string.h>
#include <string>
#include "util/test.h"
#include "util/logging.h"
#include "re2/regexp.h"
namespace re2 {
// One Simplify test case.
struct Test {
// Regular expression to parse and simplify.
const char* regexp;
// Expected ToString() of the simplified regular expression.
const char* simplified;
};
// Table of (regexp, expected simplified form) pairs checked by the
// SimpleRegexps test below.
static Test tests[] = {
// Already-simple constructs
{ "a", "a" },
{ "ab", "ab" },
{ "a|b", "[a-b]" },
{ "ab|cd", "ab|cd" },
{ "(ab)*", "(ab)*" },
{ "(ab)+", "(ab)+" },
{ "(ab)?", "(ab)?" },
{ ".", "." },
{ "^", "^" },
{ "$", "$" },
{ "[ac]", "[ac]" },
{ "[^ac]", "[^ac]" },
// Posix character classes
{ "[[:alnum:]]", "[0-9A-Za-z]" },
{ "[[:alpha:]]", "[A-Za-z]" },
{ "[[:blank:]]", "[\\t ]" },
{ "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },
{ "[[:digit:]]", "[0-9]" },
{ "[[:graph:]]", "[!-~]" },
{ "[[:lower:]]", "[a-z]" },
{ "[[:print:]]", "[ -~]" },
{ "[[:punct:]]", "[!-/:-@\\[-`{-~]" },
{ "[[:space:]]" , "[\\t-\\r ]" },
{ "[[:upper:]]", "[A-Z]" },
{ "[[:xdigit:]]", "[0-9A-Fa-f]" },
// Perl character classes
{ "\\d", "[0-9]" },
{ "\\s", "[\\t-\\n\\f-\\r ]" },
{ "\\w", "[0-9A-Z_a-z]" },
{ "\\D", "[^0-9]" },
{ "\\S", "[^\\t-\\n\\f-\\r ]" },
{ "\\W", "[^0-9A-Z_a-z]" },
{ "[\\d]", "[0-9]" },
{ "[\\s]", "[\\t-\\n\\f-\\r ]" },
{ "[\\w]", "[0-9A-Z_a-z]" },
{ "[\\D]", "[^0-9]" },
{ "[\\S]", "[^\\t-\\n\\f-\\r ]" },
{ "[\\W]", "[^0-9A-Z_a-z]" },
// Posix repetitions
{ "a{1}", "a" },
{ "a{2}", "aa" },
{ "a{5}", "aaaaa" },
{ "a{0,1}", "a?" },
// The next three are illegible because Simplify inserts (?:)
// parens instead of () parens to avoid creating extra
// captured subexpressions. The comments show a version with fewer parens.
{ "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)?
{ "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)?
{ "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)?
{ "a{0,2}", "(?:aa?)?" }, // (aa?)?
{ "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)?
{ "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)?
{ "a{0,}", "a*" },
{ "a{1,}", "a+" },
{ "a{2,}", "aa+" },
{ "a{5,}", "aaaaa+" },
// Test that operators simplify their arguments.
// (Simplify used to not simplify arguments to a {} repeat.)
{ "(?:a{1,}){1,}", "a+" },
{ "(a{1,}b{1,})", "(a+b+)" },
{ "a{1,}|b{1,}", "a+|b+" },
{ "(?:a{1,})*", "(?:a+)*" },
{ "(?:a{1,})+", "a+" },
{ "(?:a{1,})?", "(?:a+)?" },
{ "a{0}", "" },
// Character class simplification
{ "[ab]", "[a-b]" },
{ "[a-za-za-z]", "[a-z]" },
{ "[A-Za-zA-Za-z]", "[A-Za-z]" },
{ "[ABCDEFGH]", "[A-H]" },
{ "[AB-CD-EF-GH]", "[A-H]" },
{ "[W-ZP-XE-R]", "[E-Z]" },
{ "[a-ee-gg-m]", "[a-m]" },
{ "[a-ea-ha-m]", "[a-m]" },
{ "[a-ma-ha-e]", "[a-m]" },
{ "[a-zA-Z0-9 -~]", "[ -~]" },
// Empty character classes
{ "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },
// Full character classes
{ "[[:cntrl:][:^cntrl:]]", "." },
// Unicode case folding.
{ "(?i)A", "[Aa]" },
{ "(?i)a", "[Aa]" },
{ "(?i)K", "[Kk\\x{212a}]" },
{ "(?i)k", "[Kk\\x{212a}]" },
{ "(?i)\\x{212a}", "[Kk\\x{212a}]" },
{ "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },
{ "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },
{ "(?i)[\\x00-\\x{10ffff}]", "." },
// Empty string as a regular expression.
// Empty string must be preserved inside parens in order
// to make submatches work right, so these are less
// interesting than they used to be. ToString inserts
// explicit (?:) in place of non-parenthesized empty strings,
// to make them easier to spot for other parsers.
{ "(a|b|)", "([a-b]|(?:))" },
{ "(|)", "((?:)|(?:))" },
{ "a()", "a()" },
{ "(()|())", "(()|())" },
{ "(a|)", "(a|(?:))" },
{ "ab()cd()", "ab()cd()" },
{ "()", "()" },
{ "()*", "()*" },
{ "()+", "()+" },
{ "()?" , "()?" },
{ "(){0}", "" },
{ "(){1}", "()" },
{ "(){1,}", "()+" },
{ "(){0,2}", "(?:()()?)?" },
// Test that coalescing occurs and that the resulting repeats are simplified.
// Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal:
{ "a*a*", "a*" },
{ "a*a+", "a+" },
{ "a*a?", "a*" },
{ "a*a{2}", "aa+" },
{ "a*a{2,}", "aa+" },
{ "a*a{2,3}", "aa+" },
{ "a+a*", "a+" },
{ "a+a+", "aa+" },
{ "a+a?", "a+" },
{ "a+a{2}", "aaa+" },
{ "a+a{2,}", "aaa+" },
{ "a+a{2,3}", "aaa+" },
{ "a?a*", "a*" },
{ "a?a+", "a+" },
{ "a?a?", "(?:aa?)?" },
{ "a?a{2}", "aaa?" },
{ "a?a{2,}", "aa+" },
{ "a?a{2,3}", "aa(?:aa?)?" },
{ "a{2}a*", "aa+" },
{ "a{2}a+", "aaa+" },
{ "a{2}a?", "aaa?" },
{ "a{2}a{2}", "aaaa" },
{ "a{2}a{2,}", "aaaa+" },
{ "a{2}a{2,3}", "aaaaa?" },
{ "a{2,}a*", "aa+" },
{ "a{2,}a+", "aaa+" },
{ "a{2,}a?", "aa+" },
{ "a{2,}a{2}", "aaaa+" },
{ "a{2,}a{2,}", "aaaa+" },
{ "a{2,}a{2,3}", "aaaa+" },
{ "a{2,3}a*", "aa+" },
{ "a{2,3}a+", "aaa+" },
{ "a{2,3}a?", "aa(?:aa?)?" },
{ "a{2,3}a{2}", "aaaaa?" },
{ "a{2,3}a{2,}", "aaaa+" },
{ "a{2,3}a{2,3}", "aaaa(?:aa?)?" },
// With a char class, any char and any byte:
{ "\\d*\\d*", "[0-9]*" },
{ ".*.*", ".*" },
{ "\\C*\\C*", "\\C*" },
// FoldCase works, but must be consistent:
{ "(?i)A*a*", "[Aa]*" },
{ "(?i)a+A+", "[Aa][Aa]+" },
{ "(?i)A*(?-i)a*", "[Aa]*a*" },
{ "(?i)a+(?-i)A+", "[Aa]+A+" },
// NonGreedy works, but must be consistent:
{ "a*?a*?", "a*?" },
{ "a+?a+?", "aa+?" },
{ "a*?a*", "a*?a*" },
{ "a+a+?", "a+a+?" },
// The second element is the literal, char class, any char or any byte:
{ "a*a", "a+" },
{ "\\d*\\d", "[0-9]+" },
{ ".*.", ".+" },
{ "\\C*\\C", "\\C+" },
// FoldCase works, but must be consistent:
{ "(?i)A*a", "[Aa]+" },
{ "(?i)a+A", "[Aa][Aa]+" },
{ "(?i)A*(?-i)a", "[Aa]*a" },
{ "(?i)a+(?-i)A", "[Aa]+A" },
// The second element is a literal string that begins with the literal:
{ "a*aa", "aa+" },
{ "a*aab", "aa+b" },
// FoldCase works, but must be consistent:
{ "(?i)a*aa", "[Aa][Aa]+" },
{ "(?i)a*aab", "[Aa][Aa]+[Bb]" },
{ "(?i)a*(?-i)aa", "[Aa]*aa" },
{ "(?i)a*(?-i)aab", "[Aa]*aab" },
// Negative tests with mismatching ops:
{ "a*b*", "a*b*" },
{ "\\d*\\D*", "[0-9]*[^0-9]*" },
{ "a+b", "a+b" },
{ "\\d+\\D", "[0-9]+[^0-9]" },
{ "a?bb", "a?bb" },
// Negative tests with capturing groups:
{ "(a*)a*", "(a*)a*" },
{ "a+(a)", "a+(a)" },
{ "(a?)(aa)", "(a?)(aa)" },
// Just for fun:
{ "aa*aa+aa?aa{2}aaa{2,}aaa{2,3}a", "aaaaaaaaaaaaaaaa+" },
// During coalescing, the child of the repeat changes, so we build a new
// repeat. The new repeat must have the min and max of the old repeat.
// Failure to copy them results in min=0 and max=0 -> empty match.
{ "(?:a*aab){2}", "aa+baa+b" },
// During coalescing, the child of the capture changes, so we build a new
// capture. The new capture must have the cap of the old capture.
// Failure to copy it results in cap=0 -> ToString() logs a fatal error.
{ "(a*aab)", "(aa+b)" },
// Test squashing of **, ++, ?? et cetera.
{ "(?:(?:a){0,}){0,}", "a*" },
{ "(?:(?:a){1,}){1,}", "a+" },
{ "(?:(?:a){0,1}){0,1}", "a?" },
{ "(?:(?:a){0,}){1,}", "a*" },
{ "(?:(?:a){0,}){0,1}", "a*" },
{ "(?:(?:a){1,}){0,}", "a*" },
{ "(?:(?:a){1,}){0,1}", "a*" },
{ "(?:(?:a){0,1}){0,}", "a*" },
{ "(?:(?:a){0,1}){1,}", "a*" },
};
// Parses each entry of tests[], simplifies it and compares the simplified
// regexp's ToString() with the expected text.  Entries whose input and
// expected output are the same string must additionally come back from
// Simplify() as the very same Regexp object.
TEST(TestSimplify, SimpleRegexps) {
  for (const Test& t : tests) {
    RegexpStatus status;
    VLOG(1) << "Testing " << t.regexp;
    Regexp* re = Regexp::Parse(t.regexp,
                               Regexp::MatchNL | (Regexp::LikePerl &
                                                  ~Regexp::OneLine),
                               &status);
    ASSERT_TRUE(re != NULL) << " " << t.regexp << " " << status.Text();
    Regexp* sre = re->Simplify();
    ASSERT_TRUE(sre != NULL);

    // Check that already-simple regexps don't allocate new ones.
    const bool already_simple = strcmp(t.regexp, t.simplified) == 0;
    if (already_simple) {
      ASSERT_TRUE(re == sre) << " " << t.regexp
                             << " " << re->ToString() << " " << sre->ToString();
    }

    EXPECT_EQ(t.simplified, sre->ToString())
        << " " << t.regexp << " " << sre->Dump();
    re->Decref();
    sre->Decref();
  }
}
} // namespace re2

View File

@ -0,0 +1,114 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// String generator: generates all possible strings of up to
// maxlen letters using the set of letters in alpha.
// Fetch strings using a Java-like Next()/HasNext() interface.
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <vector>
#include "util/test.h"
#include "util/logging.h"
#include "re2/testing/string_generator.h"
namespace re2 {
// Creates a generator for strings of up to maxlen letters taken from
// alphabet.  With an empty alphabet (degenerate case: no letters, no
// non-empty strings) the maximum length is forced to 0.
StringGenerator::StringGenerator(int maxlen,
                                 const std::vector<std::string>& alphabet)
    : maxlen_(alphabet.empty() ? 0 : maxlen),
      alphabet_(alphabet),
      generate_null_(false),
      random_(false),
      nrandom_(0) {
  // digits_ starts out empty, so the first Next() returns the empty string.
  hasnext_ = true;
}
// Puts the generator back at the start of the sequence: leaves random and
// NULL-generating modes and makes the empty string the next result.
void StringGenerator::Reset() {
  // Leave the special modes.
  generate_null_ = false;
  random_ = false;
  nrandom_ = 0;

  // Restart the enumeration at the empty string.
  digits_.clear();
  hasnext_ = true;
}
// Advances digits_, read as a number written in base alphabet_.size(), to
// the next value.  Returns true on success and false once every number of
// up to maxlen_ digits has been used.
bool StringGenerator::IncrementDigits() {
  const int base = static_cast<int>(alphabet_.size());

  // Increment from the last digit towards the first, carrying on wrap.
  for (size_t pos = digits_.size(); pos-- > 0; ) {
    if (++digits_[pos] < base)
      return true;
    digits_[pos] = 0;
  }

  // Every digit wrapped around to 0: move on to a longer number, if allowed.
  if (static_cast<int>(digits_.size()) >= maxlen_)
    return false;
  digits_.push_back(0);
  return true;
}
// Fills digits_ with a random length (0..maxlen_) and random letter indices.
// Returns true if successful, false if the random sequence is over.
bool StringGenerator::RandomDigits() {
  if (--nrandom_ <= 0)
    return false;

  // Pick length.
  std::uniform_int_distribution<int> random_len(0, maxlen_);
  int len = random_len(rng_);
  digits_.resize(len);

  // Pick letters.  The index distribution is only constructed when at least
  // one letter is needed: uniform_int_distribution requires a <= b, and with
  // an empty alphabet the bounds would be (0, -1).  The constructor forces
  // maxlen_ to 0 for an empty alphabet, so len > 0 implies a non-empty one.
  if (len > 0) {
    std::uniform_int_distribution<int> random_alphabet_index(
        0, static_cast<int>(alphabet_.size()) - 1);
    for (int i = 0; i < len; i++)
      digits_[i] = random_alphabet_index(rng_);
  }
  return true;
}
// Hands out the string currently described by digits_ and then advances
// the generator (randomly or sequentially), so that HasNext() already knows
// the answer for the following call.  A pending GenerateNULL() request takes
// priority: it yields a default-constructed StringPiece and leaves digits_
// and hasnext_ untouched.
const StringPiece& StringGenerator::Next() {
  CHECK(hasnext_);

  if (!generate_null_) {
    // Spell out the current digits using the alphabet.
    s_.clear();
    for (int digit : digits_)
      s_ += alphabet_[digit];

    // Advance only after the string has been built from the old digits.
    hasnext_ = random_ ? RandomDigits() : IncrementDigits();
    sp_ = s_;
  } else {
    generate_null_ = false;
    sp_ = StringPiece();
  }
  return sp_;
}
// Switches the generator to random mode: reseeds the random number
// generator with |seed| and allows |n| further calls to Next().
void StringGenerator::Random(int32_t seed, int n) {
  rng_.seed(seed);
  nrandom_ = n;
  random_ = true;
  // Nothing can be fetched unless at least one string was requested.
  hasnext_ = (n > 0);
}
// Arranges for the next call to Next() to return a default-constructed
// (NULL) StringPiece; HasNext() reports true until it has been fetched.
void StringGenerator::GenerateNULL() {
  hasnext_ = true;
  generate_null_ = true;
}
} // namespace re2

View File

@ -0,0 +1,63 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_TESTING_STRING_GENERATOR_H_
#define RE2_TESTING_STRING_GENERATOR_H_
// String generator: generates all possible strings of up to
// maxlen letters using the set of letters in alpha.
// Fetch strings using a Java-like Next()/HasNext() interface.
#include <stdint.h>
#include <random>
#include <string>
#include <vector>
#include "util/util.h"
#include "re2/stringpiece.h"
namespace re2 {
// Enumerates strings over a fixed alphabet, shortest first, or hands out
// random strings once Random() has been called.  Each "letter" of the
// alphabet is itself a std::string.
class StringGenerator {
 public:
  // Creates a generator for strings of at most maxlen letters of alphabet.
  StringGenerator(int maxlen, const std::vector<std::string>& alphabet);
  ~StringGenerator() {}

  // Returns the next string.  The returned reference is to a member that is
  // overwritten by the following call to Next().
  const StringPiece& Next();

  // Returns whether Next() may be called.
  bool HasNext() { return hasnext_; }

  // Resets generator to start sequence over.
  void Reset();

  // Causes generator to emit random strings for next n calls to Next().
  void Random(int32_t seed, int n);

  // Causes generator to emit a NULL as the next call.
  void GenerateNULL();

 private:
  // Advances digits_ to the next string in sequence; false when exhausted.
  bool IncrementDigits();
  // Replaces digits_ with a random string; false when the random run is over.
  bool RandomDigits();

  // Global state.
  int maxlen_;                         // Maximum length string to generate.
  std::vector<std::string> alphabet_;  // Alphabet, one string per letter.

  // Iteration state.
  StringPiece sp_;           // Last StringPiece returned by Next().
  std::string s_;            // String data in last StringPiece returned by Next().
  bool hasnext_;             // Whether Next() can be called again.
  std::vector<int> digits_;  // Alphabet indices for next string.
  bool generate_null_;       // Whether to generate a NULL StringPiece next.
  bool random_;              // Whether generated strings are random.
  int nrandom_;              // Number of random strings left to generate.
  std::minstd_rand0 rng_;    // Random number generator.

  // Not copyable.
  StringGenerator(const StringGenerator&) = delete;
  StringGenerator& operator=(const StringGenerator&) = delete;
};
} // namespace re2
#endif // RE2_TESTING_STRING_GENERATOR_H_

View File

@ -0,0 +1,110 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test StringGenerator.
#include <stdint.h>
#include <string>
#include "util/test.h"
#include "util/utf.h"
#include "re2/testing/string_generator.h"
#include "re2/testing/regexp_generator.h"
namespace re2 {
// Returns i raised to the power e, accumulated in 64 bits.
// A zero or negative exponent yields 1.
static int64_t IntegerPower(int i, int e) {
  int64_t result = 1;
  for (int step = 0; step < e; step++)
    result *= i;
  return result;
}
// Checks that for given settings of the string generator:
// * it generates strings that are non-decreasing in length.
// * strings of the same length are sorted in alphabet order.
// * it doesn't generate the same string twice.
// * it generates the right number of strings.
//
// If all of these hold, the StringGenerator is behaving.
// Assumes that the alphabet is sorted, so that the generated
// strings can just be compared lexicographically.
//
// len is the maximum string length, alphabet is the UTF-8 text whose
// characters form the alphabet, and donull additionally checks that
// GenerateNULL() produces a NULL StringPiece before the regular sequence.
static void RunTest(int len, const std::string& alphabet, bool donull) {
  StringGenerator g(len, Explode(alphabet));

  int n = 0;        // Number of strings generated so far.
  int last_l = -1;  // Length (in runes) of the previous string.
  std::string last_s;

  if (donull) {
    g.GenerateNULL();
    EXPECT_TRUE(g.HasNext());
    StringPiece sp = g.Next();
    EXPECT_EQ(sp.data(), static_cast<const char*>(NULL));
    EXPECT_EQ(sp.size(), 0);
  }

  while (g.HasNext()) {
    std::string s = std::string(g.Next());
    n++;

    // Check that all characters in s appear in alphabet.
    for (const char *p = s.c_str(); *p != '\0'; ) {
      Rune r;
      p += chartorune(&r, p);
      EXPECT_TRUE(utfrune(alphabet.c_str(), r) != NULL);
    }

    // Check that string is properly ordered w.r.t. previous string.
    int l = utflen(s.c_str());
    EXPECT_LE(l, len);
    if (last_l < l) {
      // Strictly longer than before: a new length class begins.
      last_l = l;
    } else {
      // Same length class: must be strictly greater than its predecessor,
      // which also rules out duplicates.
      EXPECT_EQ(last_l, l);
      EXPECT_LT(last_s, s);
    }
    last_s = s;
  }

  // Check total string count.
  // The expected total is the sum of alpha^i for i in [0, len].
  int64_t m = 0;
  int alpha = utflen(alphabet.c_str());
  if (alpha == 0)  // Degenerate case.
    len = 0;
  for (int i = 0; i <= len; i++)
    m += IntegerPower(alpha, i);
  EXPECT_EQ(n, m);
}
// Maximum length 0 with a non-empty alphabet.
TEST(StringGenerator, NoLength) {
  RunTest(0, "abc", false);
}

// Maximum length 0 and an empty alphabet.
TEST(StringGenerator, NoLengthNoAlphabet) {
  RunTest(0, "", false);
}

// Positive maximum length but an empty alphabet.
TEST(StringGenerator, NoAlphabet) {
  RunTest(5, "", false);
}

// Three ASCII letters, strings of up to three letters.
TEST(StringGenerator, Simple) {
  RunTest(3, "abc", false);
}

// Alphabet containing a multi-byte UTF-8 character (bytes E2 98 BA).
TEST(StringGenerator, UTF8) {
  RunTest(4, "abc\xE2\x98\xBA", false);
}

// Repeats all of the configurations above with a leading GenerateNULL().
TEST(StringGenerator, GenNULL) {
  RunTest(0, "abc", true);
  RunTest(0, "", true);
  RunTest(5, "", true);
  RunTest(3, "abc", true);
  RunTest(4, "abc\xE2\x98\xBA", true);
}
} // namespace re2

669
extern/re2/re2/testing/tester.cc vendored Normal file
View File

@ -0,0 +1,669 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression engine tester -- test all the implementations against each other.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <string>
#include "util/util.h"
#include "util/flags.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/testing/tester.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
// Command-line flags controlling the tester.
// dump_prog / dump_rprog log the forward / reversed compiled program when a
// TestInstance is constructed; log_okay also logs engines that agreed with
// the reference result.
DEFINE_bool(dump_prog, false, "dump regexp program");
DEFINE_bool(log_okay, false, "log successful runs");
DEFINE_bool(dump_rprog, false, "dump reversed regexp program");

// Counted down by TestInstance::RunCase on each failing case; reaching zero
// ends the run with a fatal log message.
DEFINE_int32(max_regexp_failures, 100,
             "maximum number of regexp test failures (-1 = unlimited)");

// Engines whose name occurs as a substring of this flag are enabled; an
// empty value enables every engine.
DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
namespace re2 {
// Capacity of the submatch array kept for each search result.
enum {
  kMaxSubmatch = 1+16,  // $0...$16
};

// Printable engine names, indexed by Engine value.  These are also the
// names looked for in the regexp_engines flag.
const char* engine_names[kEngineMax] = {
  "Backtrack",
  "NFA",
  "DFA",
  "DFA1",
  "OnePass",
  "BitState",
  "RE2",
  "RE2a",
  "RE2b",
  "PCRE",
};
// Returns the name of the engine.
// CHECK-fails if e lies outside engine_names or has no name recorded.
static const char* EngineName(Engine e) {
  CHECK_GE(e, 0);
  CHECK_LT(e, arraysize(engine_names));
  CHECK(engine_names[e] != NULL);
  return engine_names[e];
}
// Returns the bit mask of enabled engines (bit i set means Engine i is in
// use).  The mask is derived from the regexp_engines flag on the first call
// and cached in function-local statics for all later calls.
static uint32_t Engines() {
  static bool did_parse = false;
  static uint32_t cached_engines = 0;

  if (!did_parse) {
    if (!FLAGS_regexp_engines.empty()) {
      // Enable each engine whose name occurs somewhere in the flag value.
      for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
        if (FLAGS_regexp_engines.find(EngineName(i)) != std::string::npos)
          cached_engines |= 1<<i;
      }
    } else {
      // No selection given: turn everything on.
      cached_engines = ~0;
    }

    if (cached_engines == 0)
      LOG(INFO) << "Warning: no engines enabled.";

    // PCRE can only be exercised when it is actually available.
    if (!UsingPCRE)
      cached_engines &= ~(1<<kEnginePCRE);

    // Report the final selection.
    for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
      if (cached_engines & (1<<i))
        LOG(INFO) << EngineName(i) << " enabled";
    }

    did_parse = true;
  }
  return cached_engines;
}
// The result of running a match.
// RunSearch zero-fills the whole structure before filling it in, so every
// flag starts out false and every submatch starts out NULL.
struct TestInstance::Result {
  bool skipped;         // test skipped: wasn't applicable
  bool matched;         // found a match
  bool untrusted;       // don't really trust the answer
  bool have_submatch;   // computed all submatch info
  bool have_submatch0;  // computed just submatch[0]
  StringPiece submatch[kMaxSubmatch];
};

// Shorthand so that the file-local helpers can name the type.
typedef TestInstance::Result Result;
// Renders the capture s as "(a,b)", where a and b are the offsets of the
// start and end of s measured from the start of text.  A capture with a
// NULL start (i.e. one that was not set) is rendered as "(?,?)".
static std::string FormatCapture(const StringPiece& text,
                                 const StringPiece& s) {
  if (s.begin() != NULL) {
    const ptrdiff_t start = s.begin() - text.begin();
    const ptrdiff_t limit = s.end() - text.begin();
    return StringPrintf("(%td,%td)", start, limit);
  }
  return "(?,?)";
}
// Returns whether text contains non-ASCII (>= 0x80) bytes.
static bool NonASCII(const StringPiece& text) {
for (size_t i = 0; i < text.size(); i++)
if ((uint8_t)text[i] >= 0x80)
return true;
return false;
}
// Returns a human-readable name for the match kind, or "???" for a value
// that is not one of the known enumerators.
static std::string FormatKind(Prog::MatchKind kind) {
  const char* name = "???";
  switch (kind) {
    case Prog::kFullMatch:
      name = "full match";
      break;
    case Prog::kLongestMatch:
      name = "longest match";
      break;
    case Prog::kFirstMatch:
      name = "first match";
      break;
    case Prog::kManyMatch:
      name = "many match";
      break;
  }
  return name;
}
// Returns a human-readable name for the anchoring mode, or "???" for a
// value that is neither kAnchored nor kUnanchored.
static std::string FormatAnchor(Prog::Anchor anchor) {
  if (anchor == Prog::kAnchored)
    return "anchored";
  if (anchor == Prog::kUnanchored)
    return "unanchored";
  return "???";
}
// A set of parse flags together with the description used in log output.
struct ParseMode {
  Regexp::ParseFlags parse_flags;
  std::string desc;
};

// Perl-like parsing, as is.
static const Regexp::ParseFlags single_line =
  Regexp::LikePerl;
// Perl-like parsing with the OneLine flag cleared.
static const Regexp::ParseFlags multi_line =
  static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);

// Every parse mode a Tester exercises; FormatMode() looks descriptions up
// in this table.
static ParseMode parse_modes[] = {
  { single_line,                   "single-line"          },
  { single_line|Regexp::Latin1,    "single-line, latin1"  },
  { multi_line,                    "multiline"            },
  { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
  { multi_line|Regexp::Latin1,     "multiline, latin1"    },
};
// Returns the description of the parse mode whose flags equal |flags|
// exactly.  Flag combinations missing from parse_modes are rendered as a
// hexadecimal number instead.
static std::string FormatMode(Regexp::ParseFlags flags) {
  for (const ParseMode& mode : parse_modes) {
    if (mode.parse_flags == flags)
      return mode.desc;
  }
  return StringPrintf("%#x", static_cast<uint32_t>(flags));
}
// Constructs and saves all the matching engines that
// will be required for the given tests.
//
// regexp_str is the pattern text, kind the match kind to test and flags the
// parse flags to parse it with.  Any failure (parse, compile, reverse
// compile, RE2 or PCRE construction) is logged, sets error_ and stops the
// construction early; the objects built up to that point are released by
// the destructor.
// NOTE(review): regexp_str_ is a StringPiece member, so the pattern's bytes
// presumably have to outlive this object - confirm against callers.
TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
                           Regexp::ParseFlags flags)
  : regexp_str_(regexp_str),
    kind_(kind),
    flags_(flags),
    error_(false),
    regexp_(NULL),
    num_captures_(0),
    prog_(NULL),
    rprog_(NULL),
    re_(NULL),
    re2_(NULL) {

  VLOG(1) << CEscape(regexp_str);

  // Compile regexp to prog.
  // Always required - needed for backtracking (reference implementation).
  RegexpStatus status;
  regexp_ = Regexp::Parse(regexp_str, flags, &status);
  if (regexp_ == NULL) {
    LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
              << " mode: " << FormatMode(flags);
    error_ = true;
    return;
  }
  num_captures_ = regexp_->NumCaptures();
  prog_ = regexp_->CompileToProg(0);
  if (prog_ == NULL) {
    LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
    error_ = true;
    return;
  }
  if (FLAGS_dump_prog) {
    LOG(INFO) << "Prog for "
              << " regexp "
              << CEscape(regexp_str_)
              << " (" << FormatKind(kind_)
              << ", " << FormatMode(flags_)
              << ")\n"
              << prog_->Dump();
  }

  // Compile regexp to reversed prog.  Only needed for DFA engines.
  if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
    rprog_ = regexp_->CompileToReverseProg(0);
    if (rprog_ == NULL) {
      LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
      error_ = true;
      return;
    }
    if (FLAGS_dump_rprog)
      LOG(INFO) << rprog_->Dump();
  }

  // Create re string that will be used for RE and RE2.
  std::string re = std::string(regexp_str);

  // Accommodate flags.
  // Regexp::Latin1 will be accommodated below.
  // The remaining flags are expressed as inline flag groups prepended to
  // the pattern text.
  if (!(flags & Regexp::OneLine))
    re = "(?m)" + re;
  if (flags & Regexp::NonGreedy)
    re = "(?U)" + re;
  if (flags & Regexp::DotNL)
    re = "(?s)" + re;

  // Compile regexp to RE2.
  if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
    RE2::Options options;
    if (flags & Regexp::Latin1)
      options.set_encoding(RE2::Options::EncodingLatin1);
    if (kind_ == Prog::kLongestMatch)
      options.set_longest_match(true);
    re2_ = new RE2(re, options);
    if (!re2_->error().empty()) {
      LOG(INFO) << "Cannot RE2: " << CEscape(re);
      error_ = true;
      return;
    }
  }

  // Compile regexp to RE.
  // PCRE as exposed by the RE interface isn't always usable.
  // 1. It disagrees about handling of empty-string repetitions
  //    like matching (a*)* against "b".  PCRE treats the (a*) as
  //    occurring once, while we treat it as occurring not at all.
  // 2. It treats $ as this weird thing meaning end of string
  //    or before the \n at the end of the string.
  // 3. It doesn't implement POSIX leftmost-longest matching.
  // 4. It lets \s match vertical tab.
  // MimicsPCRE() detects 1 and 2.
  // Item 3 is handled by excluding kLongestMatch in the condition below.
  if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
      kind_ != Prog::kLongestMatch) {
    PCRE_Options o;
    o.set_option(PCRE::UTF8);
    // For Latin1 the UTF8 option set above is replaced by None.
    if (flags & Regexp::Latin1)
      o.set_option(PCRE::None);
    // PCRE has interface bug keeping us from finding $0, so
    // add one more layer of parens.
    re_ = new PCRE("("+re+")", o);
    if (!re_->error().empty()) {
      LOG(INFO) << "Cannot PCRE: " << CEscape(re);
      error_ = true;
      return;
    }
  }
}
// Releases everything the constructor managed to build.  The parsed regexp
// is reference counted and is therefore dropped with Decref(); the compiled
// programs and the RE2/PCRE objects are owned outright and deleted.
TestInstance::~TestInstance() {
  if (regexp_ != NULL) {
    regexp_->Decref();
  }
  delete prog_;
  delete rprog_;
  delete re_;
  delete re2_;
}
// Runs a single search using the named engine type.
// This interface hides all the irregularities of the various
// engine interfaces from the rest of this file.
//
// orig_text is searched within orig_context using the given anchoring and
// the outcome is stored in *result.  result->skipped is set when the engine
// cannot be used for this regexp or input.  At most kMaxSubmatch submatches
// are requested, and all submatches are cleared again when there is no match.
void TestInstance::RunSearch(Engine type,
                             const StringPiece& orig_text,
                             const StringPiece& orig_context,
                             Prog::Anchor anchor,
                             Result* result) {
  // Result is not trivial, so we cannot freely clear it with memset(3),
  // but zeroing objects like so is safe and expedient for our purposes.
  memset(reinterpret_cast<void*>(result), 0, sizeof *result);
  // The regexp failed to parse: nothing can be run.
  if (regexp_ == NULL) {
    result->skipped = true;
    return;
  }
  int nsubmatch = 1 + num_captures_;  // NumCaptures doesn't count $0
  if (nsubmatch > kMaxSubmatch)
    nsubmatch = kMaxSubmatch;

  StringPiece text = orig_text;
  StringPiece context = orig_context;

  switch (type) {
    default:
      LOG(FATAL) << "Bad RunSearch type: " << (int)type;

    case kEngineBacktrack:
      if (prog_ == NULL) {
        result->skipped = true;
        break;
      }
      result->matched =
        prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
                                     result->submatch, nsubmatch);
      result->have_submatch = true;
      break;

    case kEngineNFA:
      if (prog_ == NULL) {
        result->skipped = true;
        break;
      }
      result->matched =
        prog_->SearchNFA(text, context, anchor, kind_,
                        result->submatch, nsubmatch);
      result->have_submatch = true;
      break;

    case kEngineDFA:
      // Match/no-match only: no submatch storage is passed in.
      // NOTE(review): SearchDFA writes result->skipped through its
      // out-parameter, presumably when the DFA could not finish - confirm
      // against Prog::SearchDFA.
      if (prog_ == NULL) {
        result->skipped = true;
        break;
      }
      result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
                                         &result->skipped, NULL);
      break;

    case kEngineDFA1:
      // Forward DFA pass to locate submatch[0], followed by a reverse DFA
      // pass over the span that was found.
      if (prog_ == NULL || rprog_ == NULL) {
        result->skipped = true;
        break;
      }
      result->matched =
        prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
                         &result->skipped, NULL);
      // If anchored, no need for second run,
      // but do it anyway to find more bugs.
      if (result->matched) {
        if (!rprog_->SearchDFA(result->submatch[0], context,
                               Prog::kAnchored, Prog::kLongestMatch,
                               result->submatch,
                               &result->skipped, NULL)) {
          LOG(ERROR) << "Reverse DFA inconsistency: "
                     << CEscape(regexp_str_)
                     << " on " << CEscape(text);
          result->matched = false;
        }
      }
      result->have_submatch0 = true;
      break;

    case kEngineOnePass:
      // Skipped unless the program is one-pass, the search is anchored and
      // the capture count fits within Prog::kMaxOnePassCapture.
      if (prog_ == NULL ||
          !prog_->IsOnePass() ||
          anchor == Prog::kUnanchored ||
          nsubmatch > Prog::kMaxOnePassCapture) {
        result->skipped = true;
        break;
      }
      result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
                                      result->submatch, nsubmatch);
      result->have_submatch = true;
      break;

    case kEngineBitState:
      if (prog_ == NULL ||
          !prog_->CanBitState()) {
        result->skipped = true;
        break;
      }
      result->matched = prog_->SearchBitState(text, context, anchor, kind_,
                                              result->submatch, nsubmatch);
      result->have_submatch = true;
      break;

    case kEngineRE2:
    case kEngineRE2a:
    case kEngineRE2b: {
      // All three RE2 variants take the same path here.  The case is
      // skipped when text does not end where context ends.
      if (!re2_ || text.end() != context.end()) {
        result->skipped = true;
        break;
      }

      // A full-match kind overrides the requested anchoring.
      RE2::Anchor re_anchor;
      if (anchor == Prog::kAnchored)
        re_anchor = RE2::ANCHOR_START;
      else
        re_anchor = RE2::UNANCHORED;
      if (kind_ == Prog::kFullMatch)
        re_anchor = RE2::ANCHOR_BOTH;

      // RE2::Match is handed the whole context plus the offsets of text
      // within it.
      result->matched = re2_->Match(
          context,
          static_cast<size_t>(text.begin() - context.begin()),
          static_cast<size_t>(text.end() - context.begin()),
          re_anchor,
          result->submatch,
          nsubmatch);
      result->have_submatch = nsubmatch > 0;
      break;
    }

    case kEnginePCRE: {
      // Skipped unless text covers the whole of context.
      if (!re_ || text.begin() != context.begin() ||
          text.end() != context.end()) {
        result->skipped = true;
        break;
      }

      // In Perl/PCRE, \v matches any character considered vertical
      // whitespace, not just vertical tab. Regexp::MimicsPCRE() is
      // unable to handle all cases of this, unfortunately, so just
      // catch them here. :(
      if (regexp_str_.find("\\v") != StringPiece::npos &&
          (text.find('\n') != StringPiece::npos ||
           text.find('\f') != StringPiece::npos ||
           text.find('\r') != StringPiece::npos)) {
        result->skipped = true;
        break;
      }

      // PCRE 8.34 or so started allowing vertical tab to match \s,
      // following a change made in Perl 5.18. RE2 does not.
      if ((regexp_str_.find("\\s") != StringPiece::npos ||
           regexp_str_.find("\\S") != StringPiece::npos) &&
          text.find('\v') != StringPiece::npos) {
        result->skipped = true;
        break;
      }

      // One PCRE::Arg per submatch, each bound to the matching slot of
      // result->submatch.  Both arrays are freed on every path below.
      const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
      PCRE::Arg *a = new PCRE::Arg[nsubmatch];
      for (int i = 0; i < nsubmatch; i++) {
        a[i] = PCRE::Arg(&result->submatch[i]);
        argptr[i] = &a[i];
      }
      size_t consumed;
      PCRE::Anchor pcre_anchor;
      if (anchor == Prog::kAnchored)
        pcre_anchor = PCRE::ANCHOR_START;
      else
        pcre_anchor = PCRE::UNANCHORED;
      if (kind_ == Prog::kFullMatch)
        pcre_anchor = PCRE::ANCHOR_BOTH;
      re_->ClearHitLimit();
      result->matched =
        re_->DoMatch(text,
                     pcre_anchor,
                     &consumed,
                     argptr, nsubmatch);
      // A search that hit PCRE's limit is reported as untrusted and is not
      // marked as having submatch information.
      if (re_->HitLimit()) {
        result->untrusted = true;
        delete[] argptr;
        delete[] a;
        break;
      }
      result->have_submatch = true;
      delete[] argptr;
      delete[] a;
      break;
    }
  }

  // Without a match, whatever the engine left in submatch is meaningless.
  if (!result->matched)
    memset(result->submatch, 0, sizeof result->submatch);
}
// Checks whether r is okay given that correct is the right answer.
// Specifically, r's answers have to match (but it doesn't have to
// claim to have all the answers).
static bool ResultOkay(const Result& r, const Result& correct) {
if (r.skipped)
return true;
if (r.matched != correct.matched)
return false;
if (r.have_submatch || r.have_submatch0) {
for (int i = 0; i < kMaxSubmatch; i++) {
if (correct.submatch[i].begin() != r.submatch[i].begin() ||
correct.submatch[i].size() != r.submatch[i].size())
return false;
if (!r.have_submatch)
break;
}
}
return true;
}
// Runs a single test.
bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
Prog::Anchor anchor) {
// Backtracking is the gold standard.
Result correct;
RunSearch(kEngineBacktrack, text, context, anchor, &correct);
if (correct.skipped) {
if (regexp_ == NULL)
return true;
LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
<< " " << FormatMode(flags_);
return false;
}
VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
<< " text " << CEscape(text)
<< " (" << FormatKind(kind_)
<< ", " << FormatAnchor(anchor)
<< ", " << FormatMode(flags_)
<< ")";
// Compare the others.
bool all_okay = true;
for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
if (!(Engines() & (1<<i)))
continue;
Result r;
RunSearch(i, text, context, anchor, &r);
if (ResultOkay(r, correct)) {
if (FLAGS_log_okay)
LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
continue;
}
// We disagree with PCRE on the meaning of some Unicode matches.
// In particular, we treat non-ASCII UTF-8 as non-word characters.
// We also treat "empty" character sets like [^\w\W] as being
// impossible to match, while PCRE apparently excludes some code
// points (e.g., 0x0080) from both \w and \W.
if (i == kEnginePCRE && NonASCII(text))
continue;
if (!r.untrusted)
all_okay = false;
LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
context, anchor);
if (r.matched != correct.matched) {
if (r.matched) {
LOG(INFO) << " Should not match (but does).";
} else {
LOG(INFO) << " Should match (but does not).";
continue;
}
}
for (int i = 0; i < 1+num_captures_; i++) {
if (r.submatch[i].begin() != correct.submatch[i].begin() ||
r.submatch[i].end() != correct.submatch[i].end()) {
LOG(INFO) <<
StringPrintf(" $%d: should be %s is %s",
i,
FormatCapture(text, correct.submatch[i]).c_str(),
FormatCapture(text, r.submatch[i]).c_str());
} else {
LOG(INFO) <<
StringPrintf(" $%d: %s ok", i,
FormatCapture(text, r.submatch[i]).c_str());
}
}
}
if (!all_okay) {
if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
LOG(QFATAL) << "Too many regexp failures.";
}
return all_okay;
}
// Logs one line describing a search: the prefix, the engine name, the
// regexp as given and as re-serialised by the parser, the text with its
// begin/end offsets inside context, the context itself, and the match kind,
// anchoring and parse mode in effect.
void TestInstance::LogMatch(const char* prefix, Engine e,
                            const StringPiece& text, const StringPiece& context,
                            Prog::Anchor anchor) {
  // Position of text relative to the start of context.
  const ptrdiff_t text_begin = text.begin() - context.begin();
  const ptrdiff_t text_end = text.end() - context.begin();

  LOG(INFO) << prefix << EngineName(e)
            << " regexp " << CEscape(regexp_str_)
            << " " << CEscape(regexp_->ToString())
            << " text " << CEscape(text)
            << " (" << text_begin << "," << text_end
            << ") of context " << CEscape(context)
            << " (" << FormatKind(kind_)
            << ", " << FormatAnchor(anchor)
            << ", " << FormatMode(flags_)
            << ")";
}
// Match kinds exercised by every Tester.  Prog::kManyMatch is not included.
static Prog::MatchKind kinds[] = {
  Prog::kFirstMatch,
  Prog::kLongestMatch,
  Prog::kFullMatch,
};
// Creates one TestInstance for every (match kind, parse mode) combination
// of the given regexp.  error_ ends up true if any of them failed to
// construct cleanly.
Tester::Tester(const StringPiece& regexp) {
  error_ = false;
  for (Prog::MatchKind kind : kinds) {
    for (const ParseMode& mode : parse_modes) {
      TestInstance* instance = new TestInstance(regexp, kind, mode.parse_flags);
      error_ |= instance->error();
      v_.push_back(instance);
    }
  }
}
// Destroys every TestInstance created by the constructor.
Tester::~Tester() {
  for (TestInstance* instance : v_)
    delete instance;
}
bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
Prog::Anchor anchor) {
bool okay = true;
for (size_t i = 0; i < v_.size(); i++)
okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
return okay;
}
// Anchoring modes exercised by Tester::TestInputInContext.
static Prog::Anchor anchors[] = {
  Prog::kAnchored,
  Prog::kUnanchored
};
bool Tester::TestInput(const StringPiece& text) {
bool okay = TestInputInContext(text, text);
if (text.size() > 0) {
StringPiece sp;
sp = text;
sp.remove_prefix(1);
okay &= TestInputInContext(sp, text);
sp = text;
sp.remove_suffix(1);
okay &= TestInputInContext(sp, text);
}
return okay;
}
bool Tester::TestInputInContext(const StringPiece& text,
const StringPiece& context) {
bool okay = true;
for (size_t i = 0; i < arraysize(anchors); i++)
okay &= TestCase(text, context, anchors[i]);
return okay;
}
// Convenience wrapper: builds a throwaway Tester for regexp and runs the
// full set of input tests on text with it.
bool TestRegexpOnText(const StringPiece& regexp,
                      const StringPiece& text) {
  return Tester(regexp).TestInput(text);
}
} // namespace re2

123
extern/re2/re2/testing/tester.h vendored Normal file
View File

@ -0,0 +1,123 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_TESTING_TESTER_H_
#define RE2_TESTING_TESTER_H_
// Comparative tester for regular expression matching.
// Checks all implementations against each other.
#include <vector>
#include "re2/stringpiece.h"
#include "re2/prog.h"
#include "re2/regexp.h"
#include "re2/re2.h"
#include "util/pcre.h"
namespace re2 {
// All the supported regexp engines.
// kEngineMax is not an engine; it is the number of engines.
enum Engine {
  kEngineBacktrack = 0,  // Prog::UnsafeSearchBacktrack
  kEngineNFA,            // Prog::SearchNFA
  kEngineDFA,            // Prog::SearchDFA, only ask whether it matched
  kEngineDFA1,           // Prog::SearchDFA, ask for match[0]
  kEngineOnePass,        // Prog::SearchOnePass, if applicable
  kEngineBitState,       // Prog::SearchBitState
  kEngineRE2,            // RE2, all submatches
  kEngineRE2a,           // RE2, only ask for match[0]
  kEngineRE2b,           // RE2, only ask whether it matched
  kEnginePCRE,           // PCRE (util/pcre.h)
  kEngineMax,
};

// Arithmetic helpers that keep the Engine type.  C++ provides no ++ for
// enums and e+1 would otherwise be a plain int.

// Postfix increment: steps e to the next enumerator.  Returns nothing.
static inline void operator++(Engine& e, int unused) {
  const int next = static_cast<int>(e) + 1;
  e = static_cast<Engine>(next);
}

// Returns the engine i positions after e.
static inline Engine operator+(Engine e, int i) {
  const int sum = static_cast<int>(e) + i;
  return static_cast<Engine>(sum);
}
// A TestInstance caches per-regexp state for a given
// regular expression in a given configuration
// (UTF-8 vs Latin1, longest vs first match, etc.).
class TestInstance {
 public:
  // Outcome of one search; defined in tester.cc.
  struct Result;

  // Parses and compiles regexp for every enabled engine.  Check error()
  // afterwards to find out whether that succeeded.
  TestInstance(const StringPiece& regexp, Prog::MatchKind kind,
               Regexp::ParseFlags flags);
  ~TestInstance();
  Regexp::ParseFlags flags() { return flags_; }
  bool error() { return error_; }

  // Runs a single test case: search in text, which is in context,
  // using the given anchoring.
  bool RunCase(const StringPiece& text, const StringPiece& context,
               Prog::Anchor anchor);

 private:
  // Runs a single search using the named engine type.
  void RunSearch(Engine type,
                 const StringPiece& text, const StringPiece& context,
                 Prog::Anchor anchor,
                 Result *result);

  // Logs a description of one search, introduced by prefix.
  void LogMatch(const char* prefix, Engine e, const StringPiece& text,
                const StringPiece& context, Prog::Anchor anchor);

  const StringPiece regexp_str_;    // regexp being tested
  Prog::MatchKind kind_;            // kind of match
  Regexp::ParseFlags flags_;        // flags for parsing regexp_str_
  bool error_;                      // error during constructor?

  Regexp* regexp_;                  // parsed regexp
  int num_captures_;                // regexp_->NumCaptures() cached
  Prog* prog_;                      // compiled program
  Prog* rprog_;                     // compiled reverse program
  PCRE* re_;                        // PCRE implementation
  RE2* re2_;                        // RE2 implementation

  // Not copyable: the instance owns the pointers above.
  TestInstance(const TestInstance&) = delete;
  TestInstance& operator=(const TestInstance&) = delete;
};
// A group of TestInstances for all possible configurations.
class Tester {
 public:
  // Builds the TestInstances for regexp.
  explicit Tester(const StringPiece& regexp);
  ~Tester();

  // Whether any TestInstance reported an error during construction.
  bool error() { return error_; }

  // Runs a single test case: search in text, which is in context,
  // using the given anchoring.
  bool TestCase(const StringPiece& text, const StringPiece& context,
               Prog::Anchor anchor);

  // Run TestCase(text, text, anchor) for all anchoring modes.
  bool TestInput(const StringPiece& text);

  // Run TestCase(text, context, anchor) for all anchoring modes.
  bool TestInputInContext(const StringPiece& text, const StringPiece& context);

 private:
  bool error_;                     // Accumulated TestInstance::error().
  std::vector<TestInstance*> v_;   // Owned; deleted by the destructor.

  // Not copyable.
  Tester(const Tester&) = delete;
  Tester& operator=(const Tester&) = delete;
};
// Run all possible tests using regexp and text.
bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text);
} // namespace re2
#endif // RE2_TESTING_TESTER_H_

351
extern/re2/re2/tostring.cc vendored Normal file
View File

@ -0,0 +1,351 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Format a regular expression structure as a string.
// Tested by parse_test.cc
#include <string.h>
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Context precedence levels passed down the tree by ToStringWalker.
// A node wraps itself in (?:...) when the precedence it receives from its
// parent is numerically lower than its own.  ToString() starts the walk at
// PrecToplevel.
enum {
  PrecAtom,
  PrecUnary,
  PrecConcat,
  PrecAlternate,
  PrecEmpty,
  PrecParen,
  PrecToplevel,
};
// Helper function. See description below.
static void AppendCCRange(std::string* t, Rune lo, Rune hi);
// Walker to generate string in t_.
// The arg pointers are actually integers giving the
// context precedence.
// The child_args are always NULL.
class ToStringWalker : public Regexp::Walker<int> {
 public:
  // t is the string that output is appended to; it is not owned.
  explicit ToStringWalker(std::string* t) : t_(t) {}

  // Called before a node's children are visited; returns the precedence
  // to hand to those children.
  virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
  // Called after a node's children have been visited.
  virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
                        int* child_args, int nchild_args);
  // Appends nothing and returns 0.
  virtual int ShortVisit(Regexp* re, int parent_arg) {
    return 0;
  }

 private:
  std::string* t_;  // The string the walker appends to.

  // Not copyable.
  ToStringWalker(const ToStringWalker&) = delete;
  ToStringWalker& operator=(const ToStringWalker&) = delete;
};
std::string Regexp::ToString() {
std::string t;
ToStringWalker w(&t);
w.WalkExponential(this, PrecToplevel, 100000);
if (w.stopped_early())
t += " [truncated]";
return t;
}
#define ToString DontCallToString // Avoid accidental recursion.
// Called for re before its children are processed.
// Opens a group when one is needed - "(?:" for an operator that binds
// looser than its context allows, "(" plus an optional "?P<name>" for a
// capture - and returns the precedence the children are to be printed at.
int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
  const int prec = parent_arg;

  switch (re->op()) {
    // Leaves: nothing to open.
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpLiteral:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpBeginText:
    case kRegexpEndText:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpCharClass:
    case kRegexpHaveMatch:
      return PrecAtom;

    case kRegexpConcat:
    case kRegexpLiteralString:
      if (prec < PrecConcat)
        t_->append("(?:");
      return PrecConcat;

    case kRegexpAlternate:
      if (prec < PrecAlternate)
        t_->append("(?:");
      return PrecAlternate;

    case kRegexpCapture:
      t_->append("(");
      if (re->cap() == 0)
        LOG(DFATAL) << "kRegexpCapture cap() == 0";
      if (re->name()) {
        t_->append("?P<");
        t_->append(*re->name());
        t_->append(">");
      }
      return PrecParen;

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
    case kRegexpRepeat:
      if (prec < PrecUnary)
        t_->append("(?:");
      // The operand is printed at PrecAtom rather than PrecUnary
      // because PCRE treats two unary ops in a row as a parse error.
      return PrecAtom;
  }

  // Any op not listed above is treated like an atom.
  return PrecAtom;
}
// Appends the literal rune r to *t as pattern text.
//  * Regexp metacharacters are escaped with a backslash.
//  * With foldcase set, a lowercase ASCII letter is written as the
//    two-letter class [Xx].
//  * Everything else is delegated to AppendCCRange(t, r, r).
static void AppendLiteral(std::string *t, Rune r, bool foldcase) {
  const bool is_meta =
      r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r);
  if (is_meta) {
    t->append(1, '\\');
    t->append(1, static_cast<char>(r));
    return;
  }

  if (foldcase && 'a' <= r && r <= 'z') {
    const char lower = static_cast<char>(r);
    const char upper = static_cast<char>(r - ('a' - 'A'));
    t->append(1, '[');
    t->append(1, upper);
    t->append(1, lower);
    t->append(1, ']');
    return;
  }

  AppendCCRange(t, r, r);
}
// Visits re after children are processed.
// For childless regexps, all the work is done here.
// For regexps with children, append any unary suffixes or ).
//
// parent_arg is the precedence this node was asked to print at; it decides
// whether the group opened in PreVisit has to be closed here.  If the parent
// is an alternation, a trailing | is appended for it.  Always returns 0.
int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
                              int* child_args, int nchild_args) {
  int prec = parent_arg;
  switch (re->op()) {
    case kRegexpNoMatch:
      // There's no simple symbol for "no match", but
      // [^0-Runemax] excludes everything.
      t_->append("[^\\x00-\\x{10ffff}]");
      break;

    case kRegexpEmptyMatch:
      // Append (?:) to make empty string visible,
      // unless this is already being parenthesized.
      if (prec < PrecEmpty)
        t_->append("(?:)");
      break;

    case kRegexpLiteral:
      AppendLiteral(t_, re->rune(),
                    (re->parse_flags() & Regexp::FoldCase) != 0);
      break;

    case kRegexpLiteralString:
      for (int i = 0; i < re->nrunes(); i++)
        AppendLiteral(t_, re->runes()[i],
                      (re->parse_flags() & Regexp::FoldCase) != 0);
      if (prec < PrecConcat)
        t_->append(")");
      break;

    case kRegexpConcat:
      if (prec < PrecConcat)
        t_->append(")");
      break;

    case kRegexpAlternate:
      // Clumsy but workable: the children all appended |
      // at the end of their strings, so just remove the last one.
      // The emptiness check keeps the index in range even if nothing was
      // appended, and the diagnostic prints the string itself rather than
      // the address of the pointer.
      if (!t_->empty() && (*t_)[t_->size()-1] == '|')
        t_->erase(t_->size()-1);
      else
        LOG(DFATAL) << "Bad final char: " << *t_;
      if (prec < PrecAlternate)
        t_->append(")");
      break;

    case kRegexpStar:
      t_->append("*");
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpPlus:
      t_->append("+");
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpQuest:
      t_->append("?");
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpRepeat:
      // A max of -1 means the repetition has no upper bound.
      if (re->max() == -1)
        t_->append(StringPrintf("{%d,}", re->min()));
      else if (re->min() == re->max())
        t_->append(StringPrintf("{%d}", re->min()));
      else
        t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpAnyChar:
      t_->append(".");
      break;

    case kRegexpAnyByte:
      t_->append("\\C");
      break;

    case kRegexpBeginLine:
      t_->append("^");
      break;

    case kRegexpEndLine:
      t_->append("$");
      break;

    case kRegexpBeginText:
      t_->append("(?-m:^)");
      break;

    case kRegexpEndText:
      if (re->parse_flags() & Regexp::WasDollar)
        t_->append("(?-m:$)");
      else
        t_->append("\\z");
      break;

    case kRegexpWordBoundary:
      t_->append("\\b");
      break;

    case kRegexpNoWordBoundary:
      t_->append("\\B");
      break;

    case kRegexpCharClass: {
      if (re->cc()->size() == 0) {
        t_->append("[^\\x00-\\x{10ffff}]");
        break;
      }
      t_->append("[");
      // Heuristic: show class as negated if it contains the
      // non-character 0xFFFE and yet somehow isn't full.
      CharClass* cc = re->cc();
      if (cc->Contains(0xFFFE) && !cc->full()) {
        cc = cc->Negate();
        t_->append("^");
      }
      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
        AppendCCRange(t_, i->lo, i->hi);
      // Negate() produced a temporary class that has to be released.
      if (cc != re->cc())
        cc->Delete();
      t_->append("]");
      break;
    }

    case kRegexpCapture:
      t_->append(")");
      break;

    case kRegexpHaveMatch:
      // There's no syntax accepted by the parser to generate
      // this node (it is generated by RE2::Set) so make something
      // up that is readable but won't compile.
      // The text must be formatted first: std::string::append(const char*,
      // size_t) does no formatting, it copies that many bytes of the
      // literal.
      t_->append(StringPrintf("(?HaveMatch:%d)", re->match_id()));
      break;
  }

  // If the parent is an alternation, append the | for it.
  if (prec == PrecAlternate)
    t_->append("|");

  return 0;
}
// Appends to *t the spelling of rune r as it should appear inside a
// character class.  Printable ASCII is written literally, with a
// backslash in front of the characters that are special inside [...].
// CR, TAB, LF and FF use their short escapes; every other rune is
// written in hexadecimal.
static void AppendCCChar(std::string* t, Rune r) {
  const bool printable_ascii = r >= 0x20 && r <= 0x7E;
  if (printable_ascii) {
    // Brackets, caret, dash and backslash need escaping in a class.
    if (strchr("[]^-\\", r) != NULL)
      t->append("\\");
    t->append(1, static_cast<char>(r));
    return;
  }

  // Control characters that have a conventional short escape.
  if (r == '\r') {
    t->append("\\r");
    return;
  }
  if (r == '\t') {
    t->append("\\t");
    return;
  }
  if (r == '\n') {
    t->append("\\n");
    return;
  }
  if (r == '\f') {
    t->append("\\f");
    return;
  }

  // Hex fallback: two digits below 0x100, braced form otherwise.
  const int code = static_cast<int>(r);
  if (r < 0x100)
    *t += StringPrintf("\\x%02x", code);
  else
    *t += StringPrintf("\\x{%x}", code);
}
// Appends to *t the character-class spelling of the range lo-hi.
// An empty range (lo > hi) appends nothing; a single-rune range
// appends just that rune, without the dash.
static void AppendCCRange(std::string* t, Rune lo, Rune hi) {
  if (lo > hi)
    return;

  AppendCCChar(t, lo);
  if (lo == hi)
    return;

  t->append("-");
  AppendCCChar(t, hi);
}
} // namespace re2

303
extern/re2/re2/unicode.py vendored Normal file
View File

@ -0,0 +1,303 @@
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
"""Parser for Unicode data files (as distributed by unicode.org)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import re
from six.moves import urllib
# Directory or URL where Unicode tables reside.  This is the default
# unicode_dir argument of CaseGroups, Scripts and Categories.
_UNICODE_DIR = "https://www.unicode.org/Public/12.1.0/ucd"
# Largest valid Unicode code value; _UInt and _UStr reject anything above it.
_RUNE_MAX = 0x10FFFF
class Error(Exception):
  """Base class for the errors raised by this module."""
class InputError(Error):
  """Raised when Unicode input (a value, range or table line) is invalid."""
def _UInt(s):
  """Converts string to Unicode code point ('263A' => 0x263a).

  The string must be 4 to 6 hexadecimal digits and nothing else.  The
  digits are validated explicitly because int(s, 16) on its own is more
  permissive: it also accepts a sign, a '0x' prefix, surrounding white
  space and (on newer Pythons) underscores between digits, so strings
  such as '+263A' or '0x41' would otherwise be taken as code points.

  Args:
    s: string to convert

  Returns:
    Unicode code point

  Raises:
    InputError: the string is not a valid Unicode value.
  """
  # \Z (not $) so that a trailing newline is rejected too.
  if not re.match(r"[0-9A-Fa-f]{4,6}\Z", s):
    raise InputError("invalid Unicode value %s" % (s,))
  v = int(s, 16)
  # Six hex digits can still exceed the Unicode range.
  if v > _RUNE_MAX:
    raise InputError("invalid Unicode value %s" % (s,))
  return v
def _URange(s):
  """Converts a string to the Unicode code points it denotes.

  '0001..0003' => code points 1, 2, 3.
  '0001' => [1].

  Args:
    s: string to convert; a single code value or 'lo..hi'

  Returns:
    Sequence of code points: a one-element list for a single value,
    range(lo, hi + 1) for a 'lo..hi' string.

  Raises:
    InputError: the string is not a valid Unicode range.
  """
  parts = s.split("..")
  count = len(parts)
  if count == 1:
    return [_UInt(parts[0])]
  if count == 2:
    lo, hi = _UInt(parts[0]), _UInt(parts[1])
    # Only strictly increasing ranges are accepted; anything else
    # falls through to the error below.
    if lo < hi:
      return range(lo, hi + 1)
  raise InputError("invalid Unicode range %s" % (s,))
def _UStr(v):
  """Formats a Unicode code point as an upper-case hex string.

  0x263a => '0x263A'.  Values are zero-padded to at least four digits.

  Args:
    v: code point to convert

  Returns:
    Unicode string

  Raises:
    InputError: the argument is not a valid Unicode value.
  """
  out_of_range = v < 0 or v > _RUNE_MAX
  if out_of_range:
    raise InputError("invalid Unicode value %s" % (v,))
  return "0x%04X" % (v,)
def _ParseContinue(s):
"""Parses a Unicode continuation field.
These are of the form '<Name, First>' or '<Name, Last>'.
Instead of giving an explicit range in a single table entry,
some Unicode tables use two entries, one for the first
code value in the range and one for the last.
The first entry's description is '<Name, First>' instead of 'Name'
and the second is '<Name, Last>'.
'<Name, First>' => ('Name', 'First')
'<Name, Last>' => ('Name', 'Last')
'Anything else' => ('Anything else', None)
Args:
s: continuation field string
Returns:
pair: name and ('First', 'Last', or None)
"""
match = re.match("<(.*), (First|Last)>", s)
if match is not None:
return match.groups()
return (s, None)
def ReadUnicodeTable(filename, nfields, doline):
  """Generic Unicode table text file reader.

  The reader takes care of stripping out comments and also
  parsing the two different ways that the Unicode tables specify
  code ranges (using the .. notation and splitting the range across
  multiple lines).

  Each non-comment line in the table is expected to have the given
  number of fields.  The first field is known to be the Unicode value
  and the second field its description.

  The reader calls doline(codes, fields) for each entry in the table.
  If handling a line raises an exception (in doline or in the reader's
  own validation), the reader prints that exception, prefixed with the
  file name and line number, and re-raises it immediately; the rest of
  the file is not processed.

  Arguments:
    filename: the Unicode data file to read (a local path or an
      https:// URL), or a file-like object yielding lines as bytes
      or as text.
    nfields: the number of expected fields per line in that file.
    doline: the function to call for each table entry.

  Raises:
    InputError: nfields is invalid (must be >= 2), a line is
      malformed, or the file ends inside a First/Last range.
  """
  if nfields < 2:
    raise InputError("invalid number of fields %d" % (nfields,))

  # Only a handle opened here is closed here; a caller-supplied
  # file object stays under the caller's control.
  opened_here = isinstance(filename, str)
  if opened_here:
    if filename.startswith("https://"):
      fil = urllib.request.urlopen(filename)
    else:
      fil = open(filename, "rb")
  else:
    fil = filename

  first = None        # first code in multiline range
  expect_last = None  # tag expected for "Last" line in multiline range
  lineno = 0          # current line number
  try:
    for line in fil:
      lineno += 1
      try:
        # Handles opened above yield bytes; a text-mode file-like
        # object yields str, which needs no decoding.
        if isinstance(line, bytes):
          line = line.decode('latin1')

        # Chop # comments and white space; ignore empty lines.
        sharp = line.find("#")
        if sharp >= 0:
          line = line[:sharp]
        line = line.strip()
        if not line:
          continue

        # Split fields on ";", chop more white space.
        # Must have the expected number of fields.
        fields = [s.strip() for s in line.split(";")]
        if len(fields) != nfields:
          raise InputError("wrong number of fields %d %d - %s" %
                           (len(fields), nfields, line))

        # The Unicode text files have two different ways
        # to list a Unicode range.  Either the first field is
        # itself a range (0000..FFFF), or the range is split
        # across two lines, with the second field noting
        # the continuation.
        codes = _URange(fields[0])
        (name, cont) = _ParseContinue(fields[1])

        if expect_last is not None:
          # If the last line gave the First code in a range,
          # this one had better give the Last one.
          if (len(codes) != 1 or codes[0] <= first or
              cont != "Last" or name != expect_last):
            raise InputError("expected Last line for %s" %
                             (expect_last,))
          codes = range(first, codes[0] + 1)
          first = None
          expect_last = None
          fields[0] = "%04X..%04X" % (codes[0], codes[-1])
          fields[1] = name
        elif cont == "First":
          # Otherwise, if this is the First code in a range,
          # remember it and go to the next line.
          if len(codes) != 1:
            raise InputError("bad First line: range given")
          expect_last = name
          first = codes[0]
          continue

        doline(codes, fields)

      except Exception as e:
        print("%s:%d: %s" % (filename, lineno, e))
        raise
  finally:
    if opened_here:
      fil.close()

  if expect_last is not None:
    raise InputError("expected Last line for %s; got EOF" %
                     (expect_last,))
def CaseGroups(unicode_dir=_UNICODE_DIR):
  """Returns the groups of code points equivalent under case folding.

  Only the 'C' and 'S' entries of CaseFolding.txt are used.
  Each group is a sorted list of code points, and the list of
  groups is sorted by first code point in the group.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    pair (togroup, groups): togroup is a dict mapping each folded
    code point to its group, and groups is the sorted list of those
    same group lists.
  """
  # Dict mapping folded (lowercase) code point to fold-equivalent group.
  togroup = {}

  def AddFold(codes, fields):
    """Records one CaseFolding.txt line in togroup."""
    (_code, foldtype, folded, _name) = fields
    if foldtype in ("C", "S"):
      folded = _UInt(folded)
      # A new group starts out holding the folded code point itself.
      togroup.setdefault(folded, [folded]).extend(codes)

  ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, AddFold)

  for members in togroup.values():
    members.sort()
  groups = sorted(togroup.values())
  return togroup, groups
def Scripts(unicode_dir=_UNICODE_DIR):
  """Reads Scripts.txt and groups its code points by script name.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping script names to code lists
  """
  scripts = {}

  def Collect(codes, fields):
    """Adds the codes of one Scripts.txt line to their script's list."""
    (_code, script) = fields
    if script not in scripts:
      scripts[script] = []
    scripts[script].extend(codes)

  ReadUnicodeTable(unicode_dir+"/Scripts.txt", 2, Collect)
  return scripts
def Categories(unicode_dir=_UNICODE_DIR):
  """Reads UnicodeData.txt and groups its code points by category.

  Codes are listed under their full category name and, when that
  name is longer than one character, under its first letter as well
  (so codes in Lu also appear in L).

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping category names to code lists
  """
  categories = {}

  def Add(name, codes):
    """Appends codes to the list kept for category name."""
    categories.setdefault(name, []).extend(codes)

  def Collect(codes, fields):
    """Files the codes of one UnicodeData.txt line under its category."""
    category = fields[2]
    Add(category, codes)
    # Add codes from Lu into L, etc.
    if len(category) > 1:
      Add(category[0], codes)

  ReadUnicodeTable(unicode_dir+"/UnicodeData.txt", 15, Collect)
  return categories

578
extern/re2/re2/unicode_casefold.cc vendored Normal file
View File

@ -0,0 +1,578 @@
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
// make_unicode_casefold.py >unicode_casefold.cc
#include "re2/unicode_casefold.h"
namespace re2 {
// 1381 groups, 2792 pairs, 356 ranges
const CaseFold unicode_casefold[] = {
{ 65, 90, 32 },
{ 97, 106, -32 },
{ 107, 107, 8383 },
{ 108, 114, -32 },
{ 115, 115, 268 },
{ 116, 122, -32 },
{ 181, 181, 743 },
{ 192, 214, 32 },
{ 216, 222, 32 },
{ 223, 223, 7615 },
{ 224, 228, -32 },
{ 229, 229, 8262 },
{ 230, 246, -32 },
{ 248, 254, -32 },
{ 255, 255, 121 },
{ 256, 303, EvenOdd },
{ 306, 311, EvenOdd },
{ 313, 328, OddEven },
{ 330, 375, EvenOdd },
{ 376, 376, -121 },
{ 377, 382, OddEven },
{ 383, 383, -300 },
{ 384, 384, 195 },
{ 385, 385, 210 },
{ 386, 389, EvenOdd },
{ 390, 390, 206 },
{ 391, 392, OddEven },
{ 393, 394, 205 },
{ 395, 396, OddEven },
{ 398, 398, 79 },
{ 399, 399, 202 },
{ 400, 400, 203 },
{ 401, 402, OddEven },
{ 403, 403, 205 },
{ 404, 404, 207 },
{ 405, 405, 97 },
{ 406, 406, 211 },
{ 407, 407, 209 },
{ 408, 409, EvenOdd },
{ 410, 410, 163 },
{ 412, 412, 211 },
{ 413, 413, 213 },
{ 414, 414, 130 },
{ 415, 415, 214 },
{ 416, 421, EvenOdd },
{ 422, 422, 218 },
{ 423, 424, OddEven },
{ 425, 425, 218 },
{ 428, 429, EvenOdd },
{ 430, 430, 218 },
{ 431, 432, OddEven },
{ 433, 434, 217 },
{ 435, 438, OddEven },
{ 439, 439, 219 },
{ 440, 441, EvenOdd },
{ 444, 445, EvenOdd },
{ 447, 447, 56 },
{ 452, 452, EvenOdd },
{ 453, 453, OddEven },
{ 454, 454, -2 },
{ 455, 455, OddEven },
{ 456, 456, EvenOdd },
{ 457, 457, -2 },
{ 458, 458, EvenOdd },
{ 459, 459, OddEven },
{ 460, 460, -2 },
{ 461, 476, OddEven },
{ 477, 477, -79 },
{ 478, 495, EvenOdd },
{ 497, 497, OddEven },
{ 498, 498, EvenOdd },
{ 499, 499, -2 },
{ 500, 501, EvenOdd },
{ 502, 502, -97 },
{ 503, 503, -56 },
{ 504, 543, EvenOdd },
{ 544, 544, -130 },
{ 546, 563, EvenOdd },
{ 570, 570, 10795 },
{ 571, 572, OddEven },
{ 573, 573, -163 },
{ 574, 574, 10792 },
{ 575, 576, 10815 },
{ 577, 578, OddEven },
{ 579, 579, -195 },
{ 580, 580, 69 },
{ 581, 581, 71 },
{ 582, 591, EvenOdd },
{ 592, 592, 10783 },
{ 593, 593, 10780 },
{ 594, 594, 10782 },
{ 595, 595, -210 },
{ 596, 596, -206 },
{ 598, 599, -205 },
{ 601, 601, -202 },
{ 603, 603, -203 },
{ 604, 604, 42319 },
{ 608, 608, -205 },
{ 609, 609, 42315 },
{ 611, 611, -207 },
{ 613, 613, 42280 },
{ 614, 614, 42308 },
{ 616, 616, -209 },
{ 617, 617, -211 },
{ 618, 618, 42308 },
{ 619, 619, 10743 },
{ 620, 620, 42305 },
{ 623, 623, -211 },
{ 625, 625, 10749 },
{ 626, 626, -213 },
{ 629, 629, -214 },
{ 637, 637, 10727 },
{ 640, 640, -218 },
{ 642, 642, 42307 },
{ 643, 643, -218 },
{ 647, 647, 42282 },
{ 648, 648, -218 },
{ 649, 649, -69 },
{ 650, 651, -217 },
{ 652, 652, -71 },
{ 658, 658, -219 },
{ 669, 669, 42261 },
{ 670, 670, 42258 },
{ 837, 837, 84 },
{ 880, 883, EvenOdd },
{ 886, 887, EvenOdd },
{ 891, 893, 130 },
{ 895, 895, 116 },
{ 902, 902, 38 },
{ 904, 906, 37 },
{ 908, 908, 64 },
{ 910, 911, 63 },
{ 913, 929, 32 },
{ 931, 931, 31 },
{ 932, 939, 32 },
{ 940, 940, -38 },
{ 941, 943, -37 },
{ 945, 945, -32 },
{ 946, 946, 30 },
{ 947, 948, -32 },
{ 949, 949, 64 },
{ 950, 951, -32 },
{ 952, 952, 25 },
{ 953, 953, 7173 },
{ 954, 954, 54 },
{ 955, 955, -32 },
{ 956, 956, -775 },
{ 957, 959, -32 },
{ 960, 960, 22 },
{ 961, 961, 48 },
{ 962, 962, EvenOdd },
{ 963, 965, -32 },
{ 966, 966, 15 },
{ 967, 968, -32 },
{ 969, 969, 7517 },
{ 970, 971, -32 },
{ 972, 972, -64 },
{ 973, 974, -63 },
{ 975, 975, 8 },
{ 976, 976, -62 },
{ 977, 977, 35 },
{ 981, 981, -47 },
{ 982, 982, -54 },
{ 983, 983, -8 },
{ 984, 1007, EvenOdd },
{ 1008, 1008, -86 },
{ 1009, 1009, -80 },
{ 1010, 1010, 7 },
{ 1011, 1011, -116 },
{ 1012, 1012, -92 },
{ 1013, 1013, -96 },
{ 1015, 1016, OddEven },
{ 1017, 1017, -7 },
{ 1018, 1019, EvenOdd },
{ 1021, 1023, -130 },
{ 1024, 1039, 80 },
{ 1040, 1071, 32 },
{ 1072, 1073, -32 },
{ 1074, 1074, 6222 },
{ 1075, 1075, -32 },
{ 1076, 1076, 6221 },
{ 1077, 1085, -32 },
{ 1086, 1086, 6212 },
{ 1087, 1088, -32 },
{ 1089, 1090, 6210 },
{ 1091, 1097, -32 },
{ 1098, 1098, 6204 },
{ 1099, 1103, -32 },
{ 1104, 1119, -80 },
{ 1120, 1122, EvenOdd },
{ 1123, 1123, 6180 },
{ 1124, 1153, EvenOdd },
{ 1162, 1215, EvenOdd },
{ 1216, 1216, 15 },
{ 1217, 1230, OddEven },
{ 1231, 1231, -15 },
{ 1232, 1327, EvenOdd },
{ 1329, 1366, 48 },
{ 1377, 1414, -48 },
{ 4256, 4293, 7264 },
{ 4295, 4295, 7264 },
{ 4301, 4301, 7264 },
{ 4304, 4346, 3008 },
{ 4349, 4351, 3008 },
{ 5024, 5103, 38864 },
{ 5104, 5109, 8 },
{ 5112, 5117, -8 },
{ 7296, 7296, -6254 },
{ 7297, 7297, -6253 },
{ 7298, 7298, -6244 },
{ 7299, 7299, -6242 },
{ 7300, 7300, EvenOdd },
{ 7301, 7301, -6243 },
{ 7302, 7302, -6236 },
{ 7303, 7303, -6181 },
{ 7304, 7304, 35266 },
{ 7312, 7354, -3008 },
{ 7357, 7359, -3008 },
{ 7545, 7545, 35332 },
{ 7549, 7549, 3814 },
{ 7566, 7566, 35384 },
{ 7680, 7776, EvenOdd },
{ 7777, 7777, 58 },
{ 7778, 7829, EvenOdd },
{ 7835, 7835, -59 },
{ 7838, 7838, -7615 },
{ 7840, 7935, EvenOdd },
{ 7936, 7943, 8 },
{ 7944, 7951, -8 },
{ 7952, 7957, 8 },
{ 7960, 7965, -8 },
{ 7968, 7975, 8 },
{ 7976, 7983, -8 },
{ 7984, 7991, 8 },
{ 7992, 7999, -8 },
{ 8000, 8005, 8 },
{ 8008, 8013, -8 },
{ 8017, 8017, 8 },
{ 8019, 8019, 8 },
{ 8021, 8021, 8 },
{ 8023, 8023, 8 },
{ 8025, 8025, -8 },
{ 8027, 8027, -8 },
{ 8029, 8029, -8 },
{ 8031, 8031, -8 },
{ 8032, 8039, 8 },
{ 8040, 8047, -8 },
{ 8048, 8049, 74 },
{ 8050, 8053, 86 },
{ 8054, 8055, 100 },
{ 8056, 8057, 128 },
{ 8058, 8059, 112 },
{ 8060, 8061, 126 },
{ 8064, 8071, 8 },
{ 8072, 8079, -8 },
{ 8080, 8087, 8 },
{ 8088, 8095, -8 },
{ 8096, 8103, 8 },
{ 8104, 8111, -8 },
{ 8112, 8113, 8 },
{ 8115, 8115, 9 },
{ 8120, 8121, -8 },
{ 8122, 8123, -74 },
{ 8124, 8124, -9 },
{ 8126, 8126, -7289 },
{ 8131, 8131, 9 },
{ 8136, 8139, -86 },
{ 8140, 8140, -9 },
{ 8144, 8145, 8 },
{ 8152, 8153, -8 },
{ 8154, 8155, -100 },
{ 8160, 8161, 8 },
{ 8165, 8165, 7 },
{ 8168, 8169, -8 },
{ 8170, 8171, -112 },
{ 8172, 8172, -7 },
{ 8179, 8179, 9 },
{ 8184, 8185, -128 },
{ 8186, 8187, -126 },
{ 8188, 8188, -9 },
{ 8486, 8486, -7549 },
{ 8490, 8490, -8415 },
{ 8491, 8491, -8294 },
{ 8498, 8498, 28 },
{ 8526, 8526, -28 },
{ 8544, 8559, 16 },
{ 8560, 8575, -16 },
{ 8579, 8580, OddEven },
{ 9398, 9423, 26 },
{ 9424, 9449, -26 },
{ 11264, 11310, 48 },
{ 11312, 11358, -48 },
{ 11360, 11361, EvenOdd },
{ 11362, 11362, -10743 },
{ 11363, 11363, -3814 },
{ 11364, 11364, -10727 },
{ 11365, 11365, -10795 },
{ 11366, 11366, -10792 },
{ 11367, 11372, OddEven },
{ 11373, 11373, -10780 },
{ 11374, 11374, -10749 },
{ 11375, 11375, -10783 },
{ 11376, 11376, -10782 },
{ 11378, 11379, EvenOdd },
{ 11381, 11382, OddEven },
{ 11390, 11391, -10815 },
{ 11392, 11491, EvenOdd },
{ 11499, 11502, OddEven },
{ 11506, 11507, EvenOdd },
{ 11520, 11557, -7264 },
{ 11559, 11559, -7264 },
{ 11565, 11565, -7264 },
{ 42560, 42570, EvenOdd },
{ 42571, 42571, -35267 },
{ 42572, 42605, EvenOdd },
{ 42624, 42651, EvenOdd },
{ 42786, 42799, EvenOdd },
{ 42802, 42863, EvenOdd },
{ 42873, 42876, OddEven },
{ 42877, 42877, -35332 },
{ 42878, 42887, EvenOdd },
{ 42891, 42892, OddEven },
{ 42893, 42893, -42280 },
{ 42896, 42899, EvenOdd },
{ 42900, 42900, 48 },
{ 42902, 42921, EvenOdd },
{ 42922, 42922, -42308 },
{ 42923, 42923, -42319 },
{ 42924, 42924, -42315 },
{ 42925, 42925, -42305 },
{ 42926, 42926, -42308 },
{ 42928, 42928, -42258 },
{ 42929, 42929, -42282 },
{ 42930, 42930, -42261 },
{ 42931, 42931, 928 },
{ 42932, 42943, EvenOdd },
{ 42946, 42947, EvenOdd },
{ 42948, 42948, -48 },
{ 42949, 42949, -42307 },
{ 42950, 42950, -35384 },
{ 43859, 43859, -928 },
{ 43888, 43967, -38864 },
{ 65313, 65338, 32 },
{ 65345, 65370, -32 },
{ 66560, 66599, 40 },
{ 66600, 66639, -40 },
{ 66736, 66771, 40 },
{ 66776, 66811, -40 },
{ 68736, 68786, 64 },
{ 68800, 68850, -64 },
{ 71840, 71871, 32 },
{ 71872, 71903, -32 },
{ 93760, 93791, 32 },
{ 93792, 93823, -32 },
{ 125184, 125217, 34 },
{ 125218, 125251, -34 },
};
const int num_unicode_casefold = 356;
// 1381 groups, 1411 pairs, 198 ranges
const CaseFold unicode_tolower[] = {
{ 65, 90, 32 },
{ 181, 181, 775 },
{ 192, 214, 32 },
{ 216, 222, 32 },
{ 256, 302, EvenOddSkip },
{ 306, 310, EvenOddSkip },
{ 313, 327, OddEvenSkip },
{ 330, 374, EvenOddSkip },
{ 376, 376, -121 },
{ 377, 381, OddEvenSkip },
{ 383, 383, -268 },
{ 385, 385, 210 },
{ 386, 388, EvenOddSkip },
{ 390, 390, 206 },
{ 391, 391, OddEven },
{ 393, 394, 205 },
{ 395, 395, OddEven },
{ 398, 398, 79 },
{ 399, 399, 202 },
{ 400, 400, 203 },
{ 401, 401, OddEven },
{ 403, 403, 205 },
{ 404, 404, 207 },
{ 406, 406, 211 },
{ 407, 407, 209 },
{ 408, 408, EvenOdd },
{ 412, 412, 211 },
{ 413, 413, 213 },
{ 415, 415, 214 },
{ 416, 420, EvenOddSkip },
{ 422, 422, 218 },
{ 423, 423, OddEven },
{ 425, 425, 218 },
{ 428, 428, EvenOdd },
{ 430, 430, 218 },
{ 431, 431, OddEven },
{ 433, 434, 217 },
{ 435, 437, OddEvenSkip },
{ 439, 439, 219 },
{ 440, 440, EvenOdd },
{ 444, 444, EvenOdd },
{ 452, 452, 2 },
{ 453, 453, OddEven },
{ 455, 455, 2 },
{ 456, 456, EvenOdd },
{ 458, 458, 2 },
{ 459, 475, OddEvenSkip },
{ 478, 494, EvenOddSkip },
{ 497, 497, 2 },
{ 498, 500, EvenOddSkip },
{ 502, 502, -97 },
{ 503, 503, -56 },
{ 504, 542, EvenOddSkip },
{ 544, 544, -130 },
{ 546, 562, EvenOddSkip },
{ 570, 570, 10795 },
{ 571, 571, OddEven },
{ 573, 573, -163 },
{ 574, 574, 10792 },
{ 577, 577, OddEven },
{ 579, 579, -195 },
{ 580, 580, 69 },
{ 581, 581, 71 },
{ 582, 590, EvenOddSkip },
{ 837, 837, 116 },
{ 880, 882, EvenOddSkip },
{ 886, 886, EvenOdd },
{ 895, 895, 116 },
{ 902, 902, 38 },
{ 904, 906, 37 },
{ 908, 908, 64 },
{ 910, 911, 63 },
{ 913, 929, 32 },
{ 931, 939, 32 },
{ 962, 962, EvenOdd },
{ 975, 975, 8 },
{ 976, 976, -30 },
{ 977, 977, -25 },
{ 981, 981, -15 },
{ 982, 982, -22 },
{ 984, 1006, EvenOddSkip },
{ 1008, 1008, -54 },
{ 1009, 1009, -48 },
{ 1012, 1012, -60 },
{ 1013, 1013, -64 },
{ 1015, 1015, OddEven },
{ 1017, 1017, -7 },
{ 1018, 1018, EvenOdd },
{ 1021, 1023, -130 },
{ 1024, 1039, 80 },
{ 1040, 1071, 32 },
{ 1120, 1152, EvenOddSkip },
{ 1162, 1214, EvenOddSkip },
{ 1216, 1216, 15 },
{ 1217, 1229, OddEvenSkip },
{ 1232, 1326, EvenOddSkip },
{ 1329, 1366, 48 },
{ 4256, 4293, 7264 },
{ 4295, 4295, 7264 },
{ 4301, 4301, 7264 },
{ 5112, 5117, -8 },
{ 7296, 7296, -6222 },
{ 7297, 7297, -6221 },
{ 7298, 7298, -6212 },
{ 7299, 7300, -6210 },
{ 7301, 7301, -6211 },
{ 7302, 7302, -6204 },
{ 7303, 7303, -6180 },
{ 7304, 7304, 35267 },
{ 7312, 7354, -3008 },
{ 7357, 7359, -3008 },
{ 7680, 7828, EvenOddSkip },
{ 7835, 7835, -58 },
{ 7838, 7838, -7615 },
{ 7840, 7934, EvenOddSkip },
{ 7944, 7951, -8 },
{ 7960, 7965, -8 },
{ 7976, 7983, -8 },
{ 7992, 7999, -8 },
{ 8008, 8013, -8 },
{ 8025, 8025, -8 },
{ 8027, 8027, -8 },
{ 8029, 8029, -8 },
{ 8031, 8031, -8 },
{ 8040, 8047, -8 },
{ 8072, 8079, -8 },
{ 8088, 8095, -8 },
{ 8104, 8111, -8 },
{ 8120, 8121, -8 },
{ 8122, 8123, -74 },
{ 8124, 8124, -9 },
{ 8126, 8126, -7173 },
{ 8136, 8139, -86 },
{ 8140, 8140, -9 },
{ 8152, 8153, -8 },
{ 8154, 8155, -100 },
{ 8168, 8169, -8 },
{ 8170, 8171, -112 },
{ 8172, 8172, -7 },
{ 8184, 8185, -128 },
{ 8186, 8187, -126 },
{ 8188, 8188, -9 },
{ 8486, 8486, -7517 },
{ 8490, 8490, -8383 },
{ 8491, 8491, -8262 },
{ 8498, 8498, 28 },
{ 8544, 8559, 16 },
{ 8579, 8579, OddEven },
{ 9398, 9423, 26 },
{ 11264, 11310, 48 },
{ 11360, 11360, EvenOdd },
{ 11362, 11362, -10743 },
{ 11363, 11363, -3814 },
{ 11364, 11364, -10727 },
{ 11367, 11371, OddEvenSkip },
{ 11373, 11373, -10780 },
{ 11374, 11374, -10749 },
{ 11375, 11375, -10783 },
{ 11376, 11376, -10782 },
{ 11378, 11378, EvenOdd },
{ 11381, 11381, OddEven },
{ 11390, 11391, -10815 },
{ 11392, 11490, EvenOddSkip },
{ 11499, 11501, OddEvenSkip },
{ 11506, 11506, EvenOdd },
{ 42560, 42604, EvenOddSkip },
{ 42624, 42650, EvenOddSkip },
{ 42786, 42798, EvenOddSkip },
{ 42802, 42862, EvenOddSkip },
{ 42873, 42875, OddEvenSkip },
{ 42877, 42877, -35332 },
{ 42878, 42886, EvenOddSkip },
{ 42891, 42891, OddEven },
{ 42893, 42893, -42280 },
{ 42896, 42898, EvenOddSkip },
{ 42902, 42920, EvenOddSkip },
{ 42922, 42922, -42308 },
{ 42923, 42923, -42319 },
{ 42924, 42924, -42315 },
{ 42925, 42925, -42305 },
{ 42926, 42926, -42308 },
{ 42928, 42928, -42258 },
{ 42929, 42929, -42282 },
{ 42930, 42930, -42261 },
{ 42931, 42931, 928 },
{ 42932, 42942, EvenOddSkip },
{ 42946, 42946, EvenOdd },
{ 42948, 42948, -48 },
{ 42949, 42949, -42307 },
{ 42950, 42950, -35384 },
{ 43888, 43967, -38864 },
{ 65313, 65338, 32 },
{ 66560, 66599, 40 },
{ 66736, 66771, 40 },
{ 68736, 68786, 64 },
{ 71840, 71871, 32 },
{ 93760, 93791, 32 },
{ 125184, 125217, 34 },
};
const int num_unicode_tolower = 198;
} // namespace re2

Some files were not shown because too many files have changed in this diff Show More