Add 'outside/re2/' from commit '539b44fc4c5a49c3453b80e3af85d297f4cab4bf'

git-subtree-dir: outside/re2
git-subtree-mainline: f94738bfd171ae447133e0964843addbb497894f
git-subtree-split: 539b44fc4c5a49c3453b80e3af85d297f4cab4bf
This commit is contained in:
Steve Dee 2014-04-10 11:36:47 -07:00
parent ec13f53941
commit 27dd121d14
122 changed files with 49659 additions and 0 deletions

7
outside/re2/.hgignore Normal file
View File

@ -0,0 +1,7 @@
syntax:glob
*.pyc
*.orig
core
syntax:regexp
^obj/

13
outside/re2/AUTHORS Normal file
View File

@ -0,0 +1,13 @@
# This is the official list of RE2 authors for copyright purposes.
# This file is distinct from the CONTRIBUTORS files.
# See the latter for an explanation.
# Names should be added to this file as
# Name or Organization <email address>
# The email address is not required for organizations.
# Please keep the list sorted.
Google Inc.
Samsung Electronics
Stefano Rivera <stefano.rivera@gmail.com>

40
outside/re2/CONTRIBUTORS Normal file
View File

@ -0,0 +1,40 @@
# This is the official list of people who can contribute
# (and typically have contributed) code to the RE2 repository.
# The AUTHORS file lists the copyright holders; this file
# lists people. For example, Google employees are listed here
# but not in AUTHORS, because Google holds the copyright.
#
# The submission process automatically checks to make sure
# that people submitting code are listed in this file (by email address).
#
# Names should be added to this file only after verifying that
# the individual or the individual's organization has agreed to
# the appropriate Contributor License Agreement, found here:
#
# http://code.google.com/legal/individual-cla-v1.0.html
# http://code.google.com/legal/corporate-cla-v1.0.html
#
# The agreement for individuals can be filled out on the web.
#
# When adding J Random Contributor's name to this file,
# either J's name or J's organization's name should be
# added to the AUTHORS file, depending on whether the
# individual or corporate CLA was used.
# Names should be added to this file like so:
# Name <email address>
# Please keep the list sorted.
Dominic Battré <battre@chromium.org>
Doug Kwan <dougkwan@google.com>
Dmitriy Vyukov <dvyukov@google.com>
John Millikin <jmillikin@gmail.com>
Mike Nazarewicz <mpn@google.com>
Pawel Hajdan <phajdan.jr@gmail.com>
Rob Pike <r@google.com>
Russ Cox <rsc@swtch.com>
Sanjay Ghemawat <sanjay@google.com>
Stefano Rivera <stefano.rivera@gmail.com>
Srinivasan Venkatachary <vsri@google.com>
Viatcheslav Ostapenko <sl.ostapenko@samsung.com>

27
outside/re2/LICENSE Normal file
View File

@ -0,0 +1,27 @@
// Copyright (c) 2009 The RE2 Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

291
outside/re2/Makefile Normal file
View File

@ -0,0 +1,291 @@
# Copyright 2009 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Default goal (the first rule in the file): build both the static archive
# and the shared library.
all: obj/libre2.a obj/so/libre2.so
# to build against PCRE for testing or benchmarking,
# uncomment the next two lines
# CCPCRE=-I/usr/local/include -DUSEPCRE
# LDPCRE=-L/usr/local/lib -lpcre
# Toolchain settings.  Each is assigned with ?=, so a value that is already
# defined (for example from the environment) takes precedence over the
# default written here.
# NOTE(review): make predefines CXX, AR and ARFLAGS itself, so the ?= defaults
# for those three may never take effect -- confirm against the GNU make manual.
CXX?=g++
CXXFLAGS?=-Wall -O3 -g -pthread # can override
RE2_CXXFLAGS?=-Wno-sign-compare -c -I. $(CCPCRE) # required
LDFLAGS?=
AR?=ar
ARFLAGS?=rsc
NM?=nm
NMFLAGS?=-p
# Variables mandated by GNU, the arbiter of all good taste on the internet.
# http://www.gnu.org/prep/standards/standards.html
# The install rule prefixes includedir and libdir with $(DESTDIR).
prefix=/usr/local
exec_prefix=$(prefix)
bindir=$(exec_prefix)/bin
includedir=$(prefix)/include
libdir=$(exec_prefix)/lib
INSTALL=install
INSTALL_PROGRAM=$(INSTALL)
INSTALL_DATA=$(INSTALL) -m 644
# ABI version
# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
# SONAME is appended to the shared-library file name (libre2.so.$(SONAME)).
SONAME=0
# To rebuild the Tables generated by Perl and Python scripts (requires Internet
# access for Unicode data), uncomment the following line:
# REBUILD_TABLES=1
# Command prefix used to link the shared library, chosen from `uname` output.
# Darwin: build a dynamiclib and restrict exports via libre2.symbols.darwin.
# Elsewhere: build with -shared, set the soname to libre2.so.$(SONAME) and
# restrict exports with the libre2.symbols version script.
ifeq ($(shell uname),Darwin)
MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib $(LDFLAGS) -exported_symbols_list libre2.symbols.darwin
else
MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.so.$(SONAME),--version-script=libre2.symbols $(LDFLAGS)
endif
# Public headers: the install rule copies these into $(includedir)/re2.
# The final entry deliberately has no trailing backslash, so the list ends
# here regardless of what follows it in the file.
INSTALL_HFILES=\
	re2/filtered_re2.h\
	re2/re2.h\
	re2/set.h\
	re2/stringpiece.h\
	re2/variadic_function.h
# Every header in the tree.  All object pattern rules list $(HFILES) as a
# prerequisite, so touching any header rebuilds every object file.
# The final entry deliberately has no trailing backslash, so the list ends
# here regardless of what follows it in the file.
HFILES=\
	util/arena.h\
	util/atomicops.h\
	util/benchmark.h\
	util/flags.h\
	util/logging.h\
	util/mutex.h\
	util/pcre.h\
	util/random.h\
	util/sparse_array.h\
	util/sparse_set.h\
	util/test.h\
	util/utf.h\
	util/util.h\
	util/valgrind.h\
	re2/filtered_re2.h\
	re2/prefilter.h\
	re2/prefilter_tree.h\
	re2/prog.h\
	re2/re2.h\
	re2/regexp.h\
	re2/set.h\
	re2/stringpiece.h\
	re2/testing/exhaustive_tester.h\
	re2/testing/regexp_generator.h\
	re2/testing/string_generator.h\
	re2/testing/tester.h\
	re2/unicode_casefold.h\
	re2/unicode_groups.h\
	re2/variadic_function.h\
	re2/walker-inl.h
# Objects that make up the library itself (archived into obj/libre2.a; the
# obj/so/ and obj/dbg/ variants are derived from this list).
# The final entry deliberately has no trailing backslash, so the list ends
# here regardless of what follows it in the file.
OFILES=\
	obj/util/arena.o\
	obj/util/hash.o\
	obj/util/rune.o\
	obj/util/stringpiece.o\
	obj/util/stringprintf.o\
	obj/util/strutil.o\
	obj/util/valgrind.o\
	obj/re2/bitstate.o\
	obj/re2/compile.o\
	obj/re2/dfa.o\
	obj/re2/filtered_re2.o\
	obj/re2/mimics_pcre.o\
	obj/re2/nfa.o\
	obj/re2/onepass.o\
	obj/re2/parse.o\
	obj/re2/perl_groups.o\
	obj/re2/prefilter.o\
	obj/re2/prefilter_tree.o\
	obj/re2/prog.o\
	obj/re2/re2.o\
	obj/re2/regexp.o\
	obj/re2/set.o\
	obj/re2/simplify.o\
	obj/re2/tostring.o\
	obj/re2/unicode_casefold.o\
	obj/re2/unicode_groups.o
# Support objects linked into every test binary and into the benchmark, in
# addition to the library archive.
# The final entry deliberately has no trailing backslash, so the list ends
# here regardless of what follows it in the file.
TESTOFILES=\
	obj/util/pcre.o\
	obj/util/random.o\
	obj/util/thread.o\
	obj/re2/testing/backtrack.o\
	obj/re2/testing/dump.o\
	obj/re2/testing/exhaustive_tester.o\
	obj/re2/testing/null_walker.o\
	obj/re2/testing/regexp_generator.o\
	obj/re2/testing/string_generator.o\
	obj/re2/testing/tester.o
# Test binaries run by the static-test target (and, through the derived
# DTESTS/STESTS lists, by debug-test and shared-test).
# The final entry deliberately has no trailing backslash, so the list ends
# here regardless of what follows it in the file.
TESTS=\
	obj/test/charclass_test\
	obj/test/compile_test\
	obj/test/filtered_re2_test\
	obj/test/mimics_pcre_test\
	obj/test/parse_test\
	obj/test/possible_match_test\
	obj/test/re2_test\
	obj/test/re2_arg_test\
	obj/test/regexp_test\
	obj/test/required_prefix_test\
	obj/test/search_test\
	obj/test/set_test\
	obj/test/simplify_test\
	obj/test/string_generator_test
# Additional test binaries that only the *-bigtest targets build and run.
# The final entry deliberately has no trailing backslash, so the list ends
# here regardless of what follows it in the file.
BIGTESTS=\
	obj/test/dfa_test\
	obj/test/exhaustive1_test\
	obj/test/exhaustive2_test\
	obj/test/exhaustive3_test\
	obj/test/exhaustive_test\
	obj/test/random_test
# Shared-library (obj/so/) and debug (obj/dbg/) counterparts of the object and
# test lists, obtained by rewriting the leading obj/ directory of every entry.
# Shared-library flavour:
SOFILES=$(OFILES:obj/%=obj/so/%)
STESTOFILES=$(TESTOFILES:obj/%=obj/so/%)
STESTS=$(TESTS:obj/%=obj/so/%)
SBIGTESTS=$(BIGTESTS:obj/%=obj/so/%)
# Debug flavour:
DOFILES=$(OFILES:obj/%=obj/dbg/%)
DTESTOFILES=$(TESTOFILES:obj/%=obj/dbg/%)
DTESTS=$(TESTS:obj/%=obj/dbg/%)
DBIGTESTS=$(BIGTESTS:obj/%=obj/dbg/%)
# Compile rules, one per build flavour.  Each creates the object's directory
# first ($(@D) is the directory part of the target) and then compiles the
# matching .cc file.  Recipe lines must begin with a tab.
#
# Optimised static objects: compiled with -DNDEBUG.
obj/%.o: %.cc $(HFILES)
	@mkdir -p $(@D)
	$(CXX) -o $@ $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc

# Debug objects: -fPIC, and no -DNDEBUG.
obj/dbg/%.o: %.cc $(HFILES)
	@mkdir -p $(@D)
	$(CXX) -o $@ -fPIC $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) $*.cc

# Shared-library objects: -fPIC plus -DNDEBUG.
obj/so/%.o: %.cc $(HFILES)
	@mkdir -p $(@D)
	$(CXX) -o $@ -fPIC $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc

# Library rules.  Recipe lines must begin with a tab.
#
# Optimised static archive.
obj/libre2.a: $(OFILES)
	@mkdir -p obj
	$(AR) $(ARFLAGS) obj/libre2.a $(OFILES)

# Debug static archive.
obj/dbg/libre2.a: $(DOFILES)
	@mkdir -p obj/dbg
	$(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES)

# Shared library: the real file is written as libre2.so.$(SONAME); the rule's
# own target, obj/so/libre2.so, is then created as a symlink to it.
obj/so/libre2.so: $(SOFILES)
	@mkdir -p obj/so
	$(MAKE_SHARED_LIBRARY) -o $@.$(SONAME) $(SOFILES)
	ln -sf libre2.so.$(SONAME) $@

# Link rules for the test binaries and the benchmark.  Every binary links its
# own object from re2/testing/, the shared $(TESTOFILES), a driver object
# (util/test.o or util/benchmark.o) and the library.  Recipe lines must begin
# with a tab.
#
# Static tests, linked against obj/libre2.a.
obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
	@mkdir -p obj/test
	$(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(LDFLAGS) $(LDPCRE)

# Debug tests, linked against obj/dbg/libre2.a.
obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o
	@mkdir -p obj/dbg/test
	$(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(LDFLAGS) $(LDPCRE)

# Shared-library tests: linked with -lre2 from obj/so, with the static archive
# also on the link line after it.
obj/so/test/%: obj/so/libre2.so obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o
	@mkdir -p obj/so/test
	$(CXX) -o $@ obj/so/re2/testing/$*.o $(STESTOFILES) obj/so/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE)

# Benchmark binary: uses util/benchmark.o as its driver instead of util/test.o.
obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o
	@mkdir -p obj/test
	$(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o obj/libre2.a $(LDFLAGS) $(LDPCRE)

# Table regeneration, enabled only when REBUILD_TABLES is defined.  Each
# generator script's standard output becomes the corresponding .cc file.
# Recipe lines must begin with a tab.
ifdef REBUILD_TABLES
re2/perl_groups.cc: re2/make_perl_groups.pl
	perl $< > $@

re2/unicode_%.cc: re2/make_unicode_%.py
	python $< > $@
endif

# Neither target names a file, so declare both phony: a stray file or
# directory called "clean" or "distclean" must not suppress them.
.PHONY: distclean clean

# distclean: everything clean does, plus removal of the generated table sources.
distclean: clean
	rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc

# clean: remove the whole obj/ build tree and any compiled Python files in re2/.
clean:
	rm -rf obj
	rm -f re2/*.pyc

# Test entry points.  None of these names a file produced by its recipe, so
# they are declared phony; otherwise a same-named file or directory in the
# source tree would make them appear up to date and silently skip the run.
.PHONY: testofiles test debug-test static-test shared-test \
	debug-bigtest static-bigtest shared-bigtest benchmark

# Build only the support objects shared by the tests.
testofiles: $(TESTOFILES)

# Build all three flavours of the regular tests, then run each flavour.
test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test

debug-test: $(DTESTS)
	@echo
	@echo Running debug binary tests.
	@echo
	@./runtests $(DTESTS)

static-test: $(TESTS)
	@echo
	@echo Running static binary tests.
	@echo
	@./runtests $(TESTS)

# The shared tests need obj/so on the loader path to find libre2.so.
shared-test: $(STESTS)
	@echo
	@echo Running dynamic binary tests.
	@echo
	@LD_LIBRARY_PATH=obj/so:$(LD_LIBRARY_PATH) ./runtests $(STESTS)

# The bigtest targets run the regular tests plus the BIGTESTS set.
debug-bigtest: $(DTESTS) $(DBIGTESTS)
	@./runtests $(DTESTS) $(DBIGTESTS)

static-bigtest: $(TESTS) $(BIGTESTS)
	@./runtests $(TESTS) $(BIGTESTS)

shared-bigtest: $(STESTS) $(SBIGTESTS)
	@LD_LIBRARY_PATH=obj/so:$(LD_LIBRARY_PATH) ./runtests $(STESTS) $(SBIGTESTS)

# Build (but do not run) the benchmark binary.
benchmark: obj/test/regexp_benchmark

# "install" is an action, not a file: declare it phony so a file or directory
# of that name cannot make it appear up to date.
.PHONY: install

# Install the public headers, the static archive and the shared library under
# $(DESTDIR).  The shared library is installed as libre2.so.$(SONAME).0.0, and
# libre2.so.$(SONAME) and libre2.so are created as symlinks to it.
install: obj/libre2.a obj/so/libre2.so
	mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)
	$(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
	$(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a
	$(INSTALL) obj/so/libre2.so $(DESTDIR)$(libdir)/libre2.so.$(SONAME).0.0
	ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so.$(SONAME)
	ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so

# "testinstall" is an action, not a file: declare it phony.
.PHONY: testinstall

# Smoke-test an installed copy: compile testinstall.cc inside obj/ against the
# headers and library under $(DESTDIR), then run the result with the installed
# library directory on LD_LIBRARY_PATH.
testinstall:
	@mkdir -p obj
	cp testinstall.cc obj
	(cd obj && $(CXX) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -pthread -o testinstall)
	LD_LIBRARY_PATH=$(DESTDIR)$(libdir) obj/testinstall

# The recipe appends to benchlog.<short hostname>, never to a file called
# "benchlog", so the target is phony; without this, a file or directory named
# "benchlog" that is newer than the benchmark binary would suppress the run.
.PHONY: benchlog

# Run the PCRE and RE2 benchmarks and append the output to the per-host log,
# preceded by a banner and "# "-prefixed system, compiler and revision details.
benchlog: obj/test/regexp_benchmark
	(echo '==BENCHMARK==' `hostname` `date`; \
	  (uname -a; $(CXX) --version; hg identify; file obj/test/regexp_benchmark) | sed 's/^/# /'; \
	  echo; \
	  ./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//')

# Keep gmake from deleting intermediate files it creates.
# This makes repeated builds faster and preserves debug info on OS X.
# NOTE(review): obj/so/libre2.a is listed below, but no rule in this file
# builds it (the shared-library rule produces obj/so/libre2.so) -- confirm
# whether obj/so/libre2.so was intended.
.PRECIOUS: obj/%.o obj/dbg/%.o obj/so/%.o obj/libre2.a \
obj/dbg/libre2.a obj/so/libre2.a \
obj/test/% obj/so/test/% obj/dbg/test/%
# "log" is an action, not a file: declare it phony.
.PHONY: log

# Rebuild the exhaustive and search tests with -DLOGGING=1 and capture every
# output line other than "PASS" into re2-exhaustive.txt and re2-search.txt.
#
# Sub-makes are invoked through $(MAKE) so that they use the same make program
# and inherit its command-line flags.  The four exhaustive test targets are
# spelled out in full because recipes run under /bin/sh, which is not
# guaranteed to support brace expansion.
log:
	$(MAKE) clean
	$(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/exhaustive_test obj/test/exhaustive1_test obj/test/exhaustive2_test obj/test/exhaustive3_test
	echo '#' RE2 exhaustive tests built by make log >re2-exhaustive.txt
	echo '#' $$(date) >>re2-exhaustive.txt
	obj/test/exhaustive_test |grep -v '^PASS$$' >>re2-exhaustive.txt
	obj/test/exhaustive1_test |grep -v '^PASS$$' >>re2-exhaustive.txt
	obj/test/exhaustive2_test |grep -v '^PASS$$' >>re2-exhaustive.txt
	obj/test/exhaustive3_test |grep -v '^PASS$$' >>re2-exhaustive.txt
	$(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test
	echo '#' RE2 basic search tests built by make $@ >re2-search.txt
	echo '#' $$(date) >>re2-search.txt
	obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt

# Scratch program: build ./x from x.cc against the static library.  Uses
# $(CXX) rather than a hard-coded g++ so it follows the compiler chosen for
# the rest of the build.
x: x.cc obj/libre2.a
	$(CXX) -I. -o x x.cc obj/libre2.a

19
outside/re2/README Normal file
View File

@ -0,0 +1,19 @@
This is the source code repository for RE2, a regular expression library.
For documentation about how to install and use RE2,
visit http://code.google.com/p/re2/.
The short version is:
make
make test
make install
make testinstall
Unless otherwise noted, the RE2 source files are distributed
under the BSD-style license found in the LICENSE file.
RE2's native language is C++.
An Inferno wrapper is at http://code.google.com/p/inferno-re2/.
A Python wrapper is at http://github.com/facebook/pyre2/.
A Ruby wrapper is at http://github.com/axic/rre2/.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,582 @@
hw.ncpu: 2
hw.byteorder: 1234
hw.memsize: 4294967296
hw.activecpu: 2
hw.physicalcpu: 2
hw.physicalcpu_max: 2
hw.logicalcpu: 2
hw.logicalcpu_max: 2
hw.cputype: 7
hw.cpusubtype: 4
hw.cpu64bit_capable: 1
hw.cpufamily: 1114597871
hw.cacheconfig: 2 1 2 0 0 0 0 0 0 0
hw.cachesize: 3221225472 32768 2097152 0 0 0 0 0 0 0
hw.pagesize: 4096
hw.busfrequency: 664000000
hw.busfrequency_min: 664000000
hw.busfrequency_max: 664000000
hw.cpufrequency: 1830000000
hw.cpufrequency_min: 1830000000
hw.cpufrequency_max: 1830000000
hw.cachelinesize: 64
hw.l1icachesize: 32768
hw.l1dcachesize: 32768
hw.l2cachesize: 2097152
hw.tbfrequency: 1000000000
hw.packages: 1
hw.optional.floatingpoint: 1
hw.optional.mmx: 1
hw.optional.sse: 1
hw.optional.sse2: 1
hw.optional.sse3: 1
hw.optional.supplementalsse3: 1
hw.optional.sse4_1: 0
hw.optional.sse4_2: 0
hw.optional.x86_64: 1
hw.machine = i386
hw.model = Macmini2,1
hw.ncpu = 2
hw.byteorder = 1234
hw.physmem = 2147483648
hw.usermem = 1849147392
hw.pagesize = 4096
hw.epoch = 0
hw.vectorunit = 1
hw.busfrequency = 664000000
hw.cpufrequency = 1830000000
hw.cachelinesize = 64
hw.l1icachesize = 32768
hw.l1dcachesize = 32768
hw.l2settings = 1
hw.l2cachesize = 2097152
hw.tbfrequency = 1000000000
hw.memsize = 4294967296
hw.availcpu = 2
machdep.cpu.max_basic: 10
machdep.cpu.max_ext: 2147483656
machdep.cpu.vendor: GenuineIntel
machdep.cpu.brand_string: Intel(R) Core(TM)2 CPU T5600 @ 1.83GHz
machdep.cpu.family: 6
machdep.cpu.model: 15
machdep.cpu.extmodel: 0
machdep.cpu.extfamily: 0
machdep.cpu.stepping: 2
machdep.cpu.feature_bits: 3219913727 58301
machdep.cpu.extfeature_bits: 537921536 1
machdep.cpu.signature: 1778
machdep.cpu.brand: 0
machdep.cpu.features: FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM SSE3 MON DSCPL VMX EST TM2 SSSE3 CX16 TPR PDCM
machdep.cpu.extfeatures: SYSCALL XD EM64T
machdep.cpu.logical_per_package: 2
machdep.cpu.cores_per_package: 2
machdep.cpu.microcode_version: 87
machdep.cpu.mwait.linesize_min: 64
machdep.cpu.mwait.linesize_max: 64
machdep.cpu.mwait.extensions: 3
machdep.cpu.mwait.sub_Cstates: 139808
machdep.cpu.thermal.sensor: 1
machdep.cpu.thermal.dynamic_acceleration: 0
machdep.cpu.thermal.thresholds: 2
machdep.cpu.thermal.ACNT_MCNT: 1
machdep.cpu.arch_perf.version: 2
machdep.cpu.arch_perf.number: 2
machdep.cpu.arch_perf.width: 40
machdep.cpu.arch_perf.events_number: 7
machdep.cpu.arch_perf.events: 0
machdep.cpu.arch_perf.fixed_number: 0
machdep.cpu.arch_perf.fixed_width: 0
machdep.cpu.cache.linesize: 64
machdep.cpu.cache.L2_associativity: 6
machdep.cpu.cache.size: 2048
machdep.cpu.tlb.inst.small: 128
machdep.cpu.tlb.inst.large: 8
machdep.cpu.tlb.data.small: 16
machdep.cpu.tlb.data.small_level1: 256
machdep.cpu.tlb.data.large: 16
machdep.cpu.tlb.data.large_level1: 32
machdep.cpu.address_bits.physical: 36
machdep.cpu.address_bits.virtual: 48
machdep.cpu.core_count: 2
machdep.cpu.thread_count: 2
==BENCHMARK== mini.local Fri Feb 26 16:57:10 PST 2010
# Darwin mini.local 10.2.0 Darwin Kernel Version 10.2.0: Tue Nov 3 10:37:10 PST 2009; root:xnu-1486.2.11~1/RELEASE_I386 i386
# i686-apple-darwin10-g++-4.2.1 (GCC) 4.2.1 (Apple Inc. build 5646) (dot 1)
# Copyright (C) 2007 Free Software Foundation, Inc.
# This is free software; see the source for copying conditions. There is NO
# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# a94585d91e66+ tip
# obj/test/regexp_benchmark: Mach-O 64-bit executable x86_64
Search_Easy0_CachedPCRE/8 10000000 176 ns/op 45.40 MB/s
Search_Easy0_CachedPCRE/16 10000000 209 ns/op 76.41 MB/s
Search_Easy0_CachedPCRE/32 10000000 269 ns/op 118.53 MB/s
Search_Easy0_CachedPCRE/64 5000000 398 ns/op 160.77 MB/s
Search_Easy0_CachedPCRE/128 5000000 536 ns/op 238.69 MB/s
Search_Easy0_CachedPCRE/256 2000000 897 ns/op 285.22 MB/s
Search_Easy0_CachedPCRE/512 1000000 2161 ns/op 236.92 MB/s
Search_Easy0_CachedPCRE/1K 500000 4769 ns/op 214.70 MB/s
Search_Easy0_CachedPCRE/2K 200000 8031 ns/op 255.00 MB/s
Search_Easy0_CachedPCRE/4K 100000 16208 ns/op 252.71 MB/s
Search_Easy0_CachedPCRE/8K 50000 32219 ns/op 254.26 MB/s
Search_Easy0_CachedPCRE/16K 50000 63347 ns/op 258.64 MB/s
Search_Easy0_CachedPCRE/32K 10000 125875 ns/op 260.32 MB/s
Search_Easy0_CachedPCRE/64K 10000 247829 ns/op 264.44 MB/s
Search_Easy0_CachedPCRE/128K 5000 498699 ns/op 262.83 MB/s
Search_Easy0_CachedPCRE/256K 2000 978021 ns/op 268.04 MB/s
Search_Easy0_CachedPCRE/512K 1000 1975059 ns/op 265.45 MB/s
Search_Easy0_CachedPCRE/1M 500 3994258 ns/op 262.52 MB/s
Search_Easy0_CachedPCRE/2M 200 7959640 ns/op 263.47 MB/s
Search_Easy0_CachedPCRE/4M 100 15950300 ns/op 262.96 MB/s
Search_Easy0_CachedPCRE/8M 50 32435540 ns/op 258.62 MB/s
Search_Easy0_CachedPCRE/16M 50 64686180 ns/op 259.36 MB/s
Search_Easy0_CachedRE2/8 5000000 535 ns/op 14.95 MB/s
Search_Easy0_CachedRE2/16 5000000 557 ns/op 28.70 MB/s
Search_Easy0_CachedRE2/32 5000000 595 ns/op 53.75 MB/s
Search_Easy0_CachedRE2/64 5000000 643 ns/op 99.50 MB/s
Search_Easy0_CachedRE2/128 2000000 759 ns/op 168.64 MB/s
Search_Easy0_CachedRE2/256 2000000 972 ns/op 263.30 MB/s
Search_Easy0_CachedRE2/512 1000000 1458 ns/op 351.13 MB/s
Search_Easy0_CachedRE2/1K 1000000 2544 ns/op 402.51 MB/s
Search_Easy0_CachedRE2/2K 500000 4551 ns/op 449.99 MB/s
Search_Easy0_CachedRE2/4K 200000 8677 ns/op 472.01 MB/s
Search_Easy0_CachedRE2/8K 100000 17188 ns/op 476.59 MB/s
Search_Easy0_CachedRE2/16K 50000 33869 ns/op 483.73 MB/s
Search_Easy0_CachedRE2/32K 50000 67787 ns/op 483.39 MB/s
Search_Easy0_CachedRE2/64K 10000 133362 ns/op 491.41 MB/s
Search_Easy0_CachedRE2/128K 10000 266469 ns/op 491.88 MB/s
Search_Easy0_CachedRE2/256K 5000 536980 ns/op 488.18 MB/s
Search_Easy0_CachedRE2/512K 2000 1050843 ns/op 498.92 MB/s
Search_Easy0_CachedRE2/1M 1000 2120649 ns/op 494.46 MB/s
Search_Easy0_CachedRE2/2M 500 4273918 ns/op 490.69 MB/s
Search_Easy0_CachedRE2/4M 200 8591285 ns/op 488.20 MB/s
Search_Easy0_CachedRE2/8M 100 17197390 ns/op 487.78 MB/s
Search_Easy0_CachedRE2/16M 50 34338780 ns/op 488.58 MB/s
Search_Easy1_CachedPCRE/8 10000000 174 ns/op 45.74 MB/s
Search_Easy1_CachedPCRE/16 10000000 206 ns/op 77.31 MB/s
Search_Easy1_CachedPCRE/32 10000000 270 ns/op 118.43 MB/s
Search_Easy1_CachedPCRE/64 5000000 402 ns/op 159.07 MB/s
Search_Easy1_CachedPCRE/128 5000000 540 ns/op 236.84 MB/s
Search_Easy1_CachedPCRE/256 2000000 909 ns/op 281.34 MB/s
Search_Easy1_CachedPCRE/512 1000000 1852 ns/op 276.34 MB/s
Search_Easy1_CachedPCRE/1K 500000 4318 ns/op 237.12 MB/s
Search_Easy1_CachedPCRE/2K 200000 8346 ns/op 245.37 MB/s
Search_Easy1_CachedPCRE/4K 100000 16214 ns/op 252.62 MB/s
Search_Easy1_CachedPCRE/8K 50000 32438 ns/op 252.54 MB/s
Search_Easy1_CachedPCRE/16K 50000 62914 ns/op 260.42 MB/s
Search_Easy1_CachedPCRE/32K 10000 124792 ns/op 262.58 MB/s
Search_Easy1_CachedPCRE/64K 10000 250941 ns/op 261.16 MB/s
Search_Easy1_CachedPCRE/128K 5000 498405 ns/op 262.98 MB/s
Search_Easy1_CachedPCRE/256K 2000 997305 ns/op 262.85 MB/s
Search_Easy1_CachedPCRE/512K 1000 2023179 ns/op 259.14 MB/s
Search_Easy1_CachedPCRE/1M 500 4005202 ns/op 261.80 MB/s
Search_Easy1_CachedPCRE/2M 200 8116410 ns/op 258.38 MB/s
Search_Easy1_CachedPCRE/4M 100 16145970 ns/op 259.77 MB/s
Search_Easy1_CachedPCRE/8M 50 32471260 ns/op 258.34 MB/s
Search_Easy1_CachedPCRE/16M 50 64734020 ns/op 259.17 MB/s
Search_Easy1_CachedRE2/8 5000000 543 ns/op 14.72 MB/s
Search_Easy1_CachedRE2/16 5000000 570 ns/op 28.07 MB/s
Search_Easy1_CachedRE2/32 5000000 605 ns/op 52.81 MB/s
Search_Easy1_CachedRE2/64 5000000 643 ns/op 99.39 MB/s
Search_Easy1_CachedRE2/128 2000000 764 ns/op 167.45 MB/s
Search_Easy1_CachedRE2/256 2000000 970 ns/op 263.85 MB/s
Search_Easy1_CachedRE2/512 1000000 1455 ns/op 351.75 MB/s
Search_Easy1_CachedRE2/1K 1000000 2506 ns/op 408.48 MB/s
Search_Easy1_CachedRE2/2K 500000 4571 ns/op 447.97 MB/s
Search_Easy1_CachedRE2/4K 200000 8812 ns/op 464.81 MB/s
Search_Easy1_CachedRE2/8K 100000 17079 ns/op 479.65 MB/s
Search_Easy1_CachedRE2/16K 50000 33802 ns/op 484.70 MB/s
Search_Easy1_CachedRE2/32K 50000 67171 ns/op 487.83 MB/s
Search_Easy1_CachedRE2/64K 10000 131505 ns/op 498.35 MB/s
Search_Easy1_CachedRE2/128K 10000 263228 ns/op 497.94 MB/s
Search_Easy1_CachedRE2/256K 5000 528135 ns/op 496.36 MB/s
Search_Easy1_CachedRE2/512K 2000 1052768 ns/op 498.01 MB/s
Search_Easy1_CachedRE2/1M 1000 2112714 ns/op 496.32 MB/s
Search_Easy1_CachedRE2/2M 500 4289478 ns/op 488.91 MB/s
Search_Easy1_CachedRE2/4M 200 8519430 ns/op 492.32 MB/s
Search_Easy1_CachedRE2/8M 100 17002860 ns/op 493.36 MB/s
Search_Easy1_CachedRE2/16M 50 34341100 ns/op 488.55 MB/s
Search_Medium_CachedPCRE/8 10000000 175 ns/op 45.48 MB/s
Search_Medium_CachedPCRE/16 10000000 206 ns/op 77.31 MB/s
Search_Medium_CachedPCRE/32 10000000 273 ns/op 117.10 MB/s
Search_Medium_CachedPCRE/64 5000000 427 ns/op 149.60 MB/s
Search_Medium_CachedPCRE/128 200000 9382 ns/op 13.64 MB/s
Search_Medium_CachedPCRE/256 100000 15339 ns/op 16.69 MB/s
Search_Medium_CachedPCRE/512 50000 35837 ns/op 14.29 MB/s
Search_Medium_CachedPCRE/1K 50000 71109 ns/op 14.40 MB/s
Search_Medium_CachedPCRE/2K 10000 111371 ns/op 18.39 MB/s
Search_Medium_CachedPCRE/4K 10000 264964 ns/op 15.46 MB/s
Search_Medium_CachedPCRE/8K 5000 554964 ns/op 14.76 MB/s
Search_Medium_CachedPCRE/16K 2000 1122116 ns/op 14.60 MB/s
Search_Medium_CachedPCRE/32K 1000 2305129 ns/op 14.22 MB/s
Search_Medium_CachedPCRE/64K 500 4401888 ns/op 14.89 MB/s
Search_Medium_CachedPCRE/128K 200 8591800 ns/op 15.26 MB/s
Search_Medium_CachedPCRE/256K 100 17534580 ns/op 14.95 MB/s
Search_Medium_CachedRE2/8 5000000 531 ns/op 15.06 MB/s
Search_Medium_CachedRE2/16 5000000 579 ns/op 27.60 MB/s
Search_Medium_CachedRE2/32 5000000 666 ns/op 47.99 MB/s
Search_Medium_CachedRE2/64 2000000 817 ns/op 78.31 MB/s
Search_Medium_CachedRE2/128 1000000 1174 ns/op 108.94 MB/s
Search_Medium_CachedRE2/256 1000000 1824 ns/op 140.30 MB/s
Search_Medium_CachedRE2/512 500000 3097 ns/op 165.29 MB/s
Search_Medium_CachedRE2/1K 500000 6101 ns/op 167.84 MB/s
Search_Medium_CachedRE2/2K 200000 12024 ns/op 170.32 MB/s
Search_Medium_CachedRE2/4K 100000 21483 ns/op 190.66 MB/s
Search_Medium_CachedRE2/8K 50000 41321 ns/op 198.25 MB/s
Search_Medium_CachedRE2/16K 20000 82227 ns/op 199.25 MB/s
Search_Medium_CachedRE2/32K 10000 166314 ns/op 197.02 MB/s
Search_Medium_CachedRE2/64K 5000 334190 ns/op 196.10 MB/s
Search_Medium_CachedRE2/128K 5000 672222 ns/op 194.98 MB/s
Search_Medium_CachedRE2/256K 2000 1335691 ns/op 196.26 MB/s
Search_Medium_CachedRE2/512K 1000 2650973 ns/op 197.77 MB/s
Search_Medium_CachedRE2/1M 500 5401168 ns/op 194.14 MB/s
Search_Medium_CachedRE2/2M 100 10724160 ns/op 195.55 MB/s
Search_Medium_CachedRE2/4M 100 21647840 ns/op 193.75 MB/s
Search_Medium_CachedRE2/8M 50 43369000 ns/op 193.42 MB/s
Search_Medium_CachedRE2/16M 20 85095750 ns/op 197.16 MB/s
Search_Hard_CachedPCRE/8 10000000 178 ns/op 44.77 MB/s
Search_Hard_CachedPCRE/16 10000000 211 ns/op 75.54 MB/s
Search_Hard_CachedPCRE/32 10000000 274 ns/op 116.75 MB/s
Search_Hard_CachedPCRE/64 5000000 401 ns/op 159.58 MB/s
Search_Hard_CachedPCRE/128 5000 331833 ns/op 0.39 MB/s
Search_Hard_CachedPCRE/256 2000 1299658 ns/op 0.20 MB/s
Search_Hard_CachedPCRE/512 500 5361070 ns/op 0.10 MB/s
Search_Hard_CachedPCRE/1K 100 20744900 ns/op 0.05 MB/s
Search_Hard_CachedPCRE/2K 20 78382950 ns/op 0.03 MB/s
Search_Hard_CachedPCRE/4K 5 335826800 ns/op 0.01 MB/s
Search_Hard_CachedRE2/8 5000000 550 ns/op 14.53 MB/s
Search_Hard_CachedRE2/16 5000000 600 ns/op 26.66 MB/s
Search_Hard_CachedRE2/32 5000000 683 ns/op 46.80 MB/s
Search_Hard_CachedRE2/64 2000000 834 ns/op 76.69 MB/s
Search_Hard_CachedRE2/128 1000000 1168 ns/op 109.57 MB/s
Search_Hard_CachedRE2/256 1000000 1833 ns/op 139.65 MB/s
Search_Hard_CachedRE2/512 500000 3069 ns/op 166.81 MB/s
Search_Hard_CachedRE2/1K 500000 5780 ns/op 177.14 MB/s
Search_Hard_CachedRE2/2K 200000 11060 ns/op 185.17 MB/s
Search_Hard_CachedRE2/4K 100000 21511 ns/op 190.41 MB/s
Search_Hard_CachedRE2/8K 50000 41962 ns/op 195.22 MB/s
Search_Hard_CachedRE2/16K 20000 82460 ns/op 198.69 MB/s
Search_Hard_CachedRE2/32K 10000 164209 ns/op 199.55 MB/s
Search_Hard_CachedRE2/64K 5000 326354 ns/op 200.81 MB/s
Search_Hard_CachedRE2/128K 5000 659142 ns/op 198.85 MB/s
Search_Hard_CachedRE2/256K 2000 1333642 ns/op 196.56 MB/s
Search_Hard_CachedRE2/512K 1000 2687422 ns/op 195.09 MB/s
Search_Hard_CachedRE2/1M 500 5351592 ns/op 195.94 MB/s
Search_Hard_CachedRE2/2M 100 10581690 ns/op 198.19 MB/s
Search_Hard_CachedRE2/4M 100 21324320 ns/op 196.69 MB/s
Search_Hard_CachedRE2/8M 50 41892520 ns/op 200.24 MB/s
Search_Hard_CachedRE2/16M 20 85475700 ns/op 196.28 MB/s
Search_Parens_CachedPCRE/8 10000000 298 ns/op 26.80 MB/s
Search_Parens_CachedRE2/8 5000000 562 ns/op 14.21 MB/s
Search_Parens_CachedRE2/16 5000000 598 ns/op 26.71 MB/s
Search_Parens_CachedRE2/32 5000000 676 ns/op 47.27 MB/s
Search_Parens_CachedRE2/64 2000000 828 ns/op 77.21 MB/s
Search_Parens_CachedRE2/128 1000000 1155 ns/op 110.73 MB/s
Search_Parens_CachedRE2/256 1000000 1788 ns/op 143.13 MB/s
Search_Parens_CachedRE2/512 500000 3064 ns/op 167.09 MB/s
Search_Parens_CachedRE2/1K 500000 5698 ns/op 179.69 MB/s
Search_Parens_CachedRE2/2K 200000 10961 ns/op 186.84 MB/s
Search_Parens_CachedRE2/4K 100000 21527 ns/op 190.27 MB/s
Search_Parens_CachedRE2/8K 50000 41923 ns/op 195.40 MB/s
Search_Parens_CachedRE2/16K 20000 85505 ns/op 191.61 MB/s
Search_Parens_CachedRE2/32K 10000 164437 ns/op 199.27 MB/s
Search_Parens_CachedRE2/64K 5000 332654 ns/op 197.01 MB/s
Search_Parens_CachedRE2/128K 5000 677745 ns/op 193.39 MB/s
Search_Parens_CachedRE2/256K 2000 1331012 ns/op 196.95 MB/s
Search_Parens_CachedRE2/512K 1000 2692594 ns/op 194.71 MB/s
Search_Parens_CachedRE2/1M 500 5355880 ns/op 195.78 MB/s
Search_Parens_CachedRE2/2M 100 10822340 ns/op 193.78 MB/s
Search_Parens_CachedRE2/4M 100 21464430 ns/op 195.41 MB/s
Search_Parens_CachedRE2/8M 50 42875940 ns/op 195.65 MB/s
Search_Parens_CachedRE2/16M 20 84654300 ns/op 198.19 MB/s
Search_BigFixed_CachedPCRE/8 5000000 360 ns/op 22.21 MB/s
Search_BigFixed_CachedPCRE/16 5000000 442 ns/op 36.15 MB/s
Search_BigFixed_CachedPCRE/32 5000000 606 ns/op 52.73 MB/s
Search_BigFixed_CachedPCRE/64 2000000 935 ns/op 68.39 MB/s
Search_BigFixed_CachedPCRE/128 1000000 1525 ns/op 83.91 MB/s
Search_BigFixed_CachedPCRE/256 1000000 2718 ns/op 94.18 MB/s
Search_BigFixed_CachedPCRE/512 500000 5020 ns/op 101.98 MB/s
Search_BigFixed_CachedPCRE/1K 200000 9761 ns/op 104.90 MB/s
Search_BigFixed_CachedPCRE/2K 100000 19275 ns/op 106.25 MB/s
Search_BigFixed_CachedPCRE/4K 50000 38488 ns/op 106.42 MB/s
Search_BigFixed_CachedPCRE/8K 20000 76229 ns/op 107.46 MB/s
Search_BigFixed_CachedPCRE/16K 10000 155350 ns/op 105.46 MB/s
Search_BigFixed_CachedPCRE/32K 5000 309242 ns/op 105.96 MB/s
Search_BigFixed_CachedRE2/8 10000000 194 ns/op 41.03 MB/s
Search_BigFixed_CachedRE2/16 5000000 589 ns/op 27.15 MB/s
Search_BigFixed_CachedRE2/32 5000000 655 ns/op 48.83 MB/s
Search_BigFixed_CachedRE2/64 5000000 715 ns/op 89.46 MB/s
Search_BigFixed_CachedRE2/128 2000000 882 ns/op 145.09 MB/s
Search_BigFixed_CachedRE2/256 1000000 1293 ns/op 197.97 MB/s
Search_BigFixed_CachedRE2/512 1000000 1924 ns/op 266.06 MB/s
Search_BigFixed_CachedRE2/1K 500000 3294 ns/op 310.79 MB/s
Search_BigFixed_CachedRE2/2K 500000 6057 ns/op 338.10 MB/s
Search_BigFixed_CachedRE2/4K 200000 11475 ns/op 356.93 MB/s
Search_BigFixed_CachedRE2/8K 100000 22395 ns/op 365.79 MB/s
Search_BigFixed_CachedRE2/16K 50000 44333 ns/op 369.56 MB/s
Search_BigFixed_CachedRE2/32K 20000 88061 ns/op 372.11 MB/s
Search_BigFixed_CachedRE2/64K 10000 173649 ns/op 377.40 MB/s
Search_BigFixed_CachedRE2/128K 5000 347251 ns/op 377.46 MB/s
Search_BigFixed_CachedRE2/256K 2000 702561 ns/op 373.13 MB/s
Search_BigFixed_CachedRE2/512K 1000 1408041 ns/op 372.35 MB/s
Search_BigFixed_CachedRE2/1M 500 3003070 ns/op 349.17 MB/s
Search_Success_PCRE/8 500000 3891 ns/op 2.06 MB/s
Search_Success_PCRE/16 500000 3865 ns/op 4.14 MB/s
Search_Success_PCRE/32 500000 3861 ns/op 8.29 MB/s
Search_Success_PCRE/64 500000 3921 ns/op 16.32 MB/s
Search_Success_PCRE/128 500000 4677 ns/op 27.37 MB/s
Search_Success_PCRE/256 500000 5362 ns/op 47.73 MB/s
Search_Success_PCRE/512 500000 7125 ns/op 71.85 MB/s
Search_Success_PCRE/1K 200000 10643 ns/op 96.21 MB/s
Search_Success_PCRE/2K 100000 17620 ns/op 116.23 MB/s
Search_Success_PCRE/4K 50000 31657 ns/op 129.39 MB/s
Search_Success_PCRE/8K 50000 59290 ns/op 138.17 MB/s
Search_Success_PCRE/16K 10000 115346 ns/op 142.04 MB/s
Search_Success_PCRE/32K 10000 225258 ns/op 145.47 MB/s
Search_Success_PCRE/64K 5000 452994 ns/op 144.67 MB/s
Search_Success_PCRE/128K 2000 904745 ns/op 144.87 MB/s
Search_Success_PCRE/256K 1000 1786683 ns/op 146.72 MB/s
Search_Success_PCRE/512K 500 3600316 ns/op 145.62 MB/s
Search_Success_PCRE/1M 200 7413055 ns/op 141.45 MB/s
Search_Success_PCRE/2M 100 15261930 ns/op 137.41 MB/s
Search_Success_PCRE/4M 50 32827960 ns/op 127.77 MB/s
Search_Success_PCRE/8M 20 73886450 ns/op 113.53 MB/s
Search_Success_PCRE/16M 5 247881200 ns/op 67.68 MB/s
Search_Success_RE2/8 100000 18948 ns/op 0.42 MB/s
Search_Success_RE2/16 50000 40076 ns/op 0.40 MB/s
Search_Success_RE2/32 50000 40543 ns/op 0.79 MB/s
Search_Success_RE2/64 50000 40520 ns/op 1.58 MB/s
Search_Success_RE2/128 50000 41222 ns/op 3.11 MB/s
Search_Success_RE2/256 50000 41361 ns/op 6.19 MB/s
Search_Success_RE2/512 50000 42418 ns/op 12.07 MB/s
Search_Success_RE2/1K 50000 45239 ns/op 22.64 MB/s
Search_Success_RE2/2K 50000 50568 ns/op 40.50 MB/s
Search_Success_RE2/4K 50000 60722 ns/op 67.45 MB/s
Search_Success_RE2/8K 20000 82046 ns/op 99.85 MB/s
Search_Success_RE2/16K 10000 125412 ns/op 130.64 MB/s
Search_Success_RE2/32K 10000 211805 ns/op 154.71 MB/s
Search_Success_RE2/64K 5000 373132 ns/op 175.64 MB/s
Search_Success_RE2/128K 2000 710166 ns/op 184.57 MB/s
Search_Success_RE2/256K 2000 1392231 ns/op 188.29 MB/s
Search_Success_RE2/512K 1000 2763051 ns/op 189.75 MB/s
Search_Success_RE2/1M 500 5547628 ns/op 189.01 MB/s
Search_Success_RE2/2M 100 11709090 ns/op 179.10 MB/s
Search_Success_RE2/4M 50 25220160 ns/op 166.31 MB/s
Search_Success_RE2/8M 20 59411600 ns/op 141.19 MB/s
Search_Success_RE2/16M 5 219468600 ns/op 76.44 MB/s
Search_Success_CachedPCRE/8 5000000 328 ns/op 24.35 MB/s
Search_Success_CachedPCRE/16 5000000 389 ns/op 41.06 MB/s
Search_Success_CachedPCRE/32 5000000 507 ns/op 63.11 MB/s
Search_Success_CachedPCRE/64 2000000 754 ns/op 84.80 MB/s
Search_Success_CachedPCRE/128 1000000 1164 ns/op 109.89 MB/s
Search_Success_CachedPCRE/256 1000000 2051 ns/op 124.81 MB/s
Search_Success_CachedPCRE/512 500000 3831 ns/op 133.64 MB/s
Search_Success_CachedPCRE/1K 500000 7280 ns/op 140.66 MB/s
Search_Success_CachedPCRE/2K 200000 14254 ns/op 143.67 MB/s
Search_Success_CachedPCRE/4K 100000 28223 ns/op 145.13 MB/s
Search_Success_CachedPCRE/8K 50000 55445 ns/op 147.75 MB/s
Search_Success_CachedPCRE/16K 10000 112739 ns/op 145.33 MB/s
Search_Success_CachedPCRE/32K 10000 219943 ns/op 148.98 MB/s
Search_Success_CachedPCRE/64K 5000 440884 ns/op 148.65 MB/s
Search_Success_CachedPCRE/128K 2000 898950 ns/op 145.81 MB/s
Search_Success_CachedPCRE/256K 1000 1775905 ns/op 147.61 MB/s
Search_Success_CachedPCRE/512K 500 3579178 ns/op 146.48 MB/s
Search_Success_CachedPCRE/1M 200 7278075 ns/op 144.07 MB/s
Search_Success_CachedPCRE/2M 100 14954670 ns/op 140.23 MB/s
Search_Success_CachedPCRE/4M 50 31865060 ns/op 131.63 MB/s
Search_Success_CachedPCRE/8M 20 73977900 ns/op 113.39 MB/s
Search_Success_CachedPCRE/16M 5 250587400 ns/op 66.95 MB/s
Search_Success_CachedRE2/8 10000000 206 ns/op 38.80 MB/s
Search_Success_CachedRE2/16 5000000 598 ns/op 26.75 MB/s
Search_Success_CachedRE2/32 5000000 675 ns/op 47.39 MB/s
Search_Success_CachedRE2/64 2000000 847 ns/op 75.52 MB/s
Search_Success_CachedRE2/128 1000000 1211 ns/op 105.65 MB/s
Search_Success_CachedRE2/256 1000000 1886 ns/op 135.73 MB/s
Search_Success_CachedRE2/512 500000 3123 ns/op 163.90 MB/s
Search_Success_CachedRE2/1K 500000 5754 ns/op 177.94 MB/s
Search_Success_CachedRE2/2K 200000 10929 ns/op 187.38 MB/s
Search_Success_CachedRE2/4K 100000 20887 ns/op 196.10 MB/s
Search_Success_CachedRE2/8K 50000 41295 ns/op 198.37 MB/s
Search_Success_CachedRE2/16K 20000 82338 ns/op 198.98 MB/s
Search_Success_CachedRE2/32K 10000 168893 ns/op 194.02 MB/s
Search_Success_CachedRE2/64K 5000 337449 ns/op 194.21 MB/s
Search_Success_CachedRE2/128K 5000 670247 ns/op 195.56 MB/s
Search_Success_CachedRE2/256K 2000 1342666 ns/op 195.24 MB/s
Search_Success_CachedRE2/512K 1000 2711677 ns/op 193.34 MB/s
Search_Success_CachedRE2/1M 500 5403052 ns/op 194.07 MB/s
Search_Success_CachedRE2/2M 100 11697250 ns/op 179.29 MB/s
Search_Success_CachedRE2/4M 50 24796680 ns/op 169.15 MB/s
Search_Success_CachedRE2/8M 20 59587450 ns/op 140.78 MB/s
Search_Success_CachedRE2/16M 5 225415400 ns/op 74.43 MB/s
Search_Success1_PCRE/8 500000 4063 ns/op 1.97 MB/s
Search_Success1_PCRE/16 500000 4104 ns/op 3.90 MB/s
Search_Success1_PCRE/32 500000 4162 ns/op 7.69 MB/s
Search_Success1_PCRE/64 500000 4284 ns/op 14.94 MB/s
Search_Success1_PCRE/128 500000 4857 ns/op 26.35 MB/s
Search_Success1_PCRE/256 500000 5507 ns/op 46.48 MB/s
Search_Success1_PCRE/512 500000 7203 ns/op 71.08 MB/s
Search_Success1_PCRE/1K 200000 10470 ns/op 97.80 MB/s
Search_Success1_PCRE/2K 100000 17455 ns/op 117.33 MB/s
Search_Success1_PCRE/4K 50000 31564 ns/op 129.77 MB/s
Search_Success1_PCRE/8K 50000 59112 ns/op 138.58 MB/s
Search_Success1_PCRE/16K 10000 115903 ns/op 141.36 MB/s
Search_Success1_PCRE/32K 10000 223311 ns/op 146.74 MB/s
Search_Success1_PCRE/64K 5000 447509 ns/op 146.45 MB/s
Search_Success1_PCRE/128K 2000 874543 ns/op 149.87 MB/s
Search_Success1_PCRE/256K 1000 1836342 ns/op 142.75 MB/s
Search_Success1_PCRE/512K 500 3636250 ns/op 144.18 MB/s
Search_Success1_PCRE/1M 200 7256345 ns/op 144.50 MB/s
Search_Success1_PCRE/2M 100 15093450 ns/op 138.94 MB/s
Search_Success1_PCRE/4M 50 32167920 ns/op 130.39 MB/s
Search_Success1_PCRE/8M 20 74735800 ns/op 112.24 MB/s
Search_Success1_PCRE/16M 5 252818600 ns/op 66.36 MB/s
Search_Success1_RE2/8 50000 51778 ns/op 0.15 MB/s
Search_Success1_RE2/16 50000 50754 ns/op 0.32 MB/s
Search_Success1_RE2/32 50000 51127 ns/op 0.63 MB/s
Search_Success1_RE2/64 50000 51305 ns/op 1.25 MB/s
Search_Success1_RE2/128 50000 51580 ns/op 2.48 MB/s
Search_Success1_RE2/256 50000 52019 ns/op 4.92 MB/s
Search_Success1_RE2/512 50000 53145 ns/op 9.63 MB/s
Search_Success1_RE2/1K 50000 55871 ns/op 18.33 MB/s
Search_Success1_RE2/2K 50000 61477 ns/op 33.31 MB/s
Search_Success1_RE2/4K 50000 71875 ns/op 56.99 MB/s
Search_Success1_RE2/8K 20000 94822 ns/op 86.39 MB/s
Search_Success1_RE2/16K 10000 137021 ns/op 119.57 MB/s
Search_Success1_RE2/32K 10000 220596 ns/op 148.54 MB/s
Search_Success1_RE2/64K 5000 377808 ns/op 173.46 MB/s
Search_Success1_RE2/128K 5000 707546 ns/op 185.25 MB/s
Search_Success1_RE2/256K 2000 1367308 ns/op 191.72 MB/s
Search_Success1_RE2/512K 1000 2729291 ns/op 192.10 MB/s
Search_Success1_RE2/1M 500 5439634 ns/op 192.77 MB/s
Search_Success1_RE2/2M 100 11626860 ns/op 180.37 MB/s
Search_Success1_RE2/4M 50 24603160 ns/op 170.48 MB/s
Search_Success1_RE2/8M 20 59001300 ns/op 142.18 MB/s
Search_Success1_RE2/16M 5 219520200 ns/op 76.43 MB/s
Search_Success1_Cached_PCRE/8 5000000 373 ns/op 21.41 MB/s
Search_Success1_Cached_PCRE/16 5000000 437 ns/op 36.61 MB/s
Search_Success1_Cached_PCRE/32 5000000 543 ns/op 58.84 MB/s
Search_Success1_Cached_PCRE/64 2000000 784 ns/op 81.60 MB/s
Search_Success1_Cached_PCRE/128 1000000 1193 ns/op 107.29 MB/s
Search_Success1_Cached_PCRE/256 1000000 2044 ns/op 125.23 MB/s
Search_Success1_Cached_PCRE/512 500000 3734 ns/op 137.10 MB/s
Search_Success1_Cached_PCRE/1K 500000 7121 ns/op 143.78 MB/s
Search_Success1_Cached_PCRE/2K 200000 13767 ns/op 148.76 MB/s
Search_Success1_Cached_PCRE/4K 100000 27176 ns/op 150.72 MB/s
Search_Success1_Cached_PCRE/8K 50000 54155 ns/op 151.27 MB/s
Search_Success1_Cached_PCRE/16K 10000 109309 ns/op 149.89 MB/s
Search_Success1_Cached_PCRE/32K 10000 215890 ns/op 151.78 MB/s
Search_Success1_Cached_PCRE/64K 5000 432550 ns/op 151.51 MB/s
Search_Success1_Cached_PCRE/128K 2000 870568 ns/op 150.56 MB/s
Search_Success1_Cached_PCRE/256K 1000 1756215 ns/op 149.27 MB/s
Search_Success1_Cached_PCRE/512K 500 3671994 ns/op 142.78 MB/s
Search_Success1_Cached_PCRE/1M 200 7134810 ns/op 146.97 MB/s
Search_Success1_Cached_PCRE/2M 100 14672580 ns/op 142.93 MB/s
Search_Success1_Cached_PCRE/4M 50 31146040 ns/op 134.67 MB/s
Search_Success1_Cached_PCRE/8M 20 72224500 ns/op 116.15 MB/s
Search_Success1_Cached_PCRE/16M 5 243683800 ns/op 68.85 MB/s
Search_Success1_Cached_RE2/8 5000000 544 ns/op 14.69 MB/s
Search_Success1_Cached_RE2/16 5000000 583 ns/op 27.43 MB/s
Search_Success1_Cached_RE2/32 5000000 661 ns/op 48.37 MB/s
Search_Success1_Cached_RE2/64 2000000 818 ns/op 78.23 MB/s
Search_Success1_Cached_RE2/128 1000000 1148 ns/op 111.40 MB/s
Search_Success1_Cached_RE2/256 1000000 1778 ns/op 143.95 MB/s
Search_Success1_Cached_RE2/512 500000 3036 ns/op 168.64 MB/s
Search_Success1_Cached_RE2/1K 500000 5549 ns/op 184.53 MB/s
Search_Success1_Cached_RE2/2K 200000 10580 ns/op 193.56 MB/s
Search_Success1_Cached_RE2/4K 100000 20645 ns/op 198.39 MB/s
Search_Success1_Cached_RE2/8K 50000 40775 ns/op 200.90 MB/s
Search_Success1_Cached_RE2/16K 20000 81030 ns/op 202.20 MB/s
Search_Success1_Cached_RE2/32K 10000 162338 ns/op 201.85 MB/s
Search_Success1_Cached_RE2/64K 5000 324387 ns/op 202.03 MB/s
Search_Success1_Cached_RE2/128K 5000 648468 ns/op 202.13 MB/s
Search_Success1_Cached_RE2/256K 2000 1299439 ns/op 201.74 MB/s
Search_Success1_Cached_RE2/512K 1000 2608958 ns/op 200.96 MB/s
Search_Success1_Cached_RE2/1M 500 5263964 ns/op 199.20 MB/s
Search_Success1_Cached_RE2/2M 200 10793175 ns/op 194.30 MB/s
Search_Success1_Cached_RE2/4M 50 24138120 ns/op 173.76 MB/s
Search_Success1_Cached_RE2/8M 20 58223300 ns/op 144.08 MB/s
Search_Success1_Cached_RE2/16M 5 215741400 ns/op 77.77 MB/s
Search_Digits_PCRE 500000 7534 ns/op
Search_Digits_RE2 50000 44162 ns/op
Parse_Digits_PCRE 200000 7664 ns/op
Parse_Digits_RE2 100000 22595 ns/op
Parse_CachedDigits_PCRE 5000000 721 ns/op
Parse_CachedDigits_RE2 5000000 413 ns/op
Parse_DigitDs_PCRE 500000 7095 ns/op
Parse_DigitDs_RE2 100000 22259 ns/op
Parse_CachedDigitDs_PCRE 5000000 704 ns/op
Parse_CachedDigitDs_RE2 5000000 415 ns/op
Parse_Split_PCRE 500000 5540 ns/op
Parse_Split_RE2 100000 23817 ns/op
Parse_CachedSplit_PCRE 5000000 490 ns/op
Parse_CachedSplit_RE2 10000000 251 ns/op
Parse_SplitHard_PCRE 500000 5410 ns/op
Parse_SplitHard_RE2 100000 28518 ns/op
Parse_CachedSplitHard_PCRE 5000000 488 ns/op
Parse_CachedSplitHard_RE2 1000000 2489 ns/op
Parse_CachedSplitBig1_PCRE 500 7171752 ns/op
Parse_CachedSplitBig1_RE2 2000 990722 ns/op
Parse_CachedSplitBig2_PCRE 5000 658331 ns/op
Parse_CachedSplitBig2_RE2 20 81205250 ns/op
BM_PCRE_Compile 500000 6443 ns/op
BM_RE2_Compile 100000 24103 ns/op
SearchPhone_CachedPCRE/8 1000000 2010 ns/op 3.98 MB/s
SearchPhone_CachedPCRE/16 500000 3286 ns/op 4.87 MB/s
SearchPhone_CachedPCRE/32 500000 5953 ns/op 5.37 MB/s
SearchPhone_CachedPCRE/64 200000 11181 ns/op 5.72 MB/s
SearchPhone_CachedPCRE/128 100000 21634 ns/op 5.92 MB/s
SearchPhone_CachedPCRE/256 50000 42315 ns/op 6.05 MB/s
SearchPhone_CachedPCRE/512 20000 83969 ns/op 6.10 MB/s
SearchPhone_CachedPCRE/1K 10000 166005 ns/op 6.17 MB/s
SearchPhone_CachedPCRE/2K 5000 327433 ns/op 6.25 MB/s
SearchPhone_CachedPCRE/4K 5000 654794 ns/op 6.26 MB/s
SearchPhone_CachedPCRE/8K 2000 1302747 ns/op 6.29 MB/s
SearchPhone_CachedPCRE/16K 1000 2601137 ns/op 6.30 MB/s
SearchPhone_CachedPCRE/32K 500 5170166 ns/op 6.34 MB/s
SearchPhone_CachedPCRE/64K 100 10378910 ns/op 6.31 MB/s
SearchPhone_CachedPCRE/128K 100 20783360 ns/op 6.31 MB/s
SearchPhone_CachedPCRE/256K 50 41632940 ns/op 6.30 MB/s
SearchPhone_CachedPCRE/512K 20 83663300 ns/op 6.27 MB/s
SearchPhone_CachedPCRE/1M 10 167093400 ns/op 6.28 MB/s
SearchPhone_CachedPCRE/2M 5 335078800 ns/op 6.26 MB/s
SearchPhone_CachedPCRE/4M 5 673405400 ns/op 6.23 MB/s
SearchPhone_CachedPCRE/8M 1 1335761000 ns/op 6.28 MB/s
SearchPhone_CachedPCRE/16M 1 2682908000 ns/op 6.25 MB/s
SearchPhone_CachedRE2/8 1000000 1470 ns/op 5.44 MB/s
SearchPhone_CachedRE2/16 1000000 1496 ns/op 10.69 MB/s
SearchPhone_CachedRE2/32 1000000 1570 ns/op 20.38 MB/s
SearchPhone_CachedRE2/64 1000000 1770 ns/op 36.15 MB/s
SearchPhone_CachedRE2/128 1000000 2082 ns/op 61.46 MB/s
SearchPhone_CachedRE2/256 1000000 2701 ns/op 94.78 MB/s
SearchPhone_CachedRE2/512 500000 3963 ns/op 129.19 MB/s
SearchPhone_CachedRE2/1K 500000 6487 ns/op 157.85 MB/s
SearchPhone_CachedRE2/2K 200000 11527 ns/op 177.67 MB/s
SearchPhone_CachedRE2/4K 100000 21579 ns/op 189.81 MB/s
SearchPhone_CachedRE2/8K 50000 41804 ns/op 195.96 MB/s
SearchPhone_CachedRE2/16K 20000 82228 ns/op 199.25 MB/s
SearchPhone_CachedRE2/32K 10000 163444 ns/op 200.48 MB/s
SearchPhone_CachedRE2/64K 5000 325307 ns/op 201.46 MB/s
SearchPhone_CachedRE2/128K 5000 648559 ns/op 202.10 MB/s
SearchPhone_CachedRE2/256K 2000 1295574 ns/op 202.34 MB/s
SearchPhone_CachedRE2/512K 1000 2591267 ns/op 202.33 MB/s
SearchPhone_CachedRE2/1M 500 5178738 ns/op 202.48 MB/s
SearchPhone_CachedRE2/2M 100 10389680 ns/op 201.85 MB/s
SearchPhone_CachedRE2/4M 100 20851510 ns/op 201.15 MB/s
SearchPhone_CachedRE2/8M 50 41763800 ns/op 200.86 MB/s
SearchPhone_CachedRE2/16M 20 83492800 ns/op 200.94 MB/s
EmptyPartialMatchPCRE 10000000 195 ns/op
EmptyPartialMatchRE2 5000000 497 ns/op
SimplePartialMatchPCRE 10000000 276 ns/op
SimplePartialMatchRE2 5000000 548 ns/op
HTTPPartialMatchPCRE 2000000 826 ns/op
HTTPPartialMatchRE2 2000000 894 ns/op
SmallHTTPPartialMatchPCRE 2000000 825 ns/op
SmallHTTPPartialMatchRE2 2000000 895 ns/op
DotMatchPCRE 2000000 810 ns/op
DotMatchRE2 2000000 976 ns/op
ASCIIMatchPCRE 5000000 604 ns/op
ASCIIMatchRE2 2000000 976 ns/op

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

155
outside/re2/benchlog/mktable Executable file
View File

@ -0,0 +1,155 @@
#!/usr/bin/perl
# XXX
# table($name): print an HTML table with one row per benchmark system,
# comparing the PCRE and RE2 timings of a single benchmark.
#
# $name is used as a sprintf format: it is expanded once with "PCRE" and
# once with "RE2" to form the benchmark names looked up in %data.  The
# logged ns/op figures are divided by 1000 and printed as microseconds.
# Systems are listed in @sys order and labelled via %sysname.
#
# The sub is declared without the former empty prototype "()": it takes an
# argument, so the prototype was wrong and would have rejected a direct
# call such as table("...") at compile time.
sub table {
    my ($name) = @_;

    print <<'EOF';
<table border=0>
<tr><th>System</th><th>PCRE</th><th>RE2</th></tr>
EOF
    foreach my $sys (@sys) {
        my $ns_pcre = $data{$sys}->{sprintf($name, "PCRE")}->{'ns/op'};
        my $ns_re2 = $data{$sys}->{sprintf($name, "RE2")}->{'ns/op'};
        printf "<tr><td>%s</td><td>%.1f µs</td><td>%.1f µs</td></tr>\n", $sysname{$sys}, $ns_pcre/1000., $ns_re2/1000.;
    }
    # Thin spacer row closes the table.
    print <<'EOF';
<tr height=5><td colspan=3></td></tr>
</table>
EOF
}
# Text-size suffixes that appear after the "/" in benchmark names
# (e.g. "Search_Success_RE2/4K"), in increasing order.  graph() uses the
# index into this list as the x coordinate.
@sizes = qw(
    8 16 32 64 128 256 512
    1K 2K 4K 8K 16K 32K 64K 128K 256K 512K
    1M 2M 4M 8M 16M
);

# Value handed to jgraph's "color" attribute for each engine's curve
# (presumably R G B components in the range 0..1).
%color = (
    PCRE => "0.7 0 0",
    RE2  => "0 0 1",
);

# Number of graphs generated so far; graph() increments it to name its
# output files.
$ngraph = 0;
# graph($name): plot PCRE versus RE2 throughput for one benchmark family
# and print an <img> tag that references the rendered PNG.
#
# $name is the benchmark-name prefix.  For each engine ("PCRE", "RE2") and
# each entry of @sizes the key "$name$engine/$size" is looked up in %data
# and its MB/s figure is plotted; sizes without data are skipped.  Only the
# "wreck" system is plotted.
#
# Files written in the current directory: regexp3gN.jgr (jgraph input),
# regexp3gN.eps and regexp3gN.png, where N is the incremented $ngraph.
#
# Differences from the earlier version of this sub:
#   * the .jgr file is closed, and the close checked, before jgraph is
#     run, so the file is complete on disk and write errors are reported;
#   * working variables are lexical instead of leaking into globals;
#   * the empty prototype "()" is gone, since the sub takes an argument.
sub graph {
    my ($name) = @_;
    my $sys  = "wreck";
    my $base = sprintf("regexp3g%d", ++$ngraph);

    open(my $jgr, ">", "$base.jgr") || die "open >$base.jgr: $!";
    print {$jgr} "bbox -20 -12 392 95\n";
    print {$jgr} "newgraph clip x_translate 0.25 y_translate 0.25\n";

    # One curve per engine.  Track the largest value (for the y axis) and
    # the last plotted point of each curve (for its label).
    my $ymax = 0;
    my (%lastx, %lasty);
    foreach my $who ("PCRE", "RE2") {
        print {$jgr} "newcurve pts\n";
        for (my $i = 0; $i < @sizes; $i++) {
            my $key = sprintf("%s%s/%s", $name, $who, $sizes[$i]);
            my $val = $data{$sys}->{$key}->{'MB/s'};
            next if !defined($val);
            $ymax = $val if $val > $ymax;
            $lastx{$who} = $i;
            $lasty{$who} = $val;
            printf {$jgr} "%d %f (* %s *)\n", $i, $val, $key;
        }
        print {$jgr} "marktype none color $color{$who} linethickness 2 linetype solid label : $who\n";
    }

    # X axis: one unit per entry of @sizes, labelled at every third entry.
    my $n = @sizes;
    print {$jgr} "xaxis min -1 max $n size 5 label : text size (bytes)\n";
    print {$jgr} " no_auto_hash_marks hash_labels fontsize 9\n";
    for (my $i = 0; $i < @sizes; $i += 3) {
        print {$jgr} " hash_at $i hash_label at $i : $sizes[$i]\n";
    }

    # Y axis upper bound: take the largest power of ten not exceeding
    # $ymax, then the first multiple (2x..10x) of it that exceeds $ymax.
    my $y = 1;
    while (10 * $y <= $ymax) {
        $y = 10 * $y;
    }
    foreach my $mult (2 .. 10) {
        if ($mult * $y > $ymax) {
            $y = $mult * $y;
            last;
        }
    }

    # Label each curve just to the right of its last point.  Labels that
    # would sit in the bottom 5% of the plot are raised to that height and
    # bottom-justified instead of centred.
    foreach my $who ("PCRE", "RE2") {
        my $x1 = $lastx{$who};
        my $y1 = $lasty{$who};
        $x1 *= 1.01;
        my $v = "vjc";
        if ($y1 < 0.05 * $y) {
            $v = "vjb";
            $y1 = 0.05 * $y;
        }
        print {$jgr} "newstring x $x1 y $y1 hjl $v : $who\n";
    }

    print {$jgr} "yaxis min 0 max $y size 1 label : speed (MB/s)\n";
    print {$jgr} " hash_labels fontsize 9\n";
    # Disabled legend:
    # printf {$jgr} "legend defaults font Times-Roman fontsize 10 x 0 y $y hjl vjt\n";

    # Finish the file before jgraph reads it.
    close($jgr) || die "close $base.jgr: $!";

    # Rendering is best-effort: the exit status of the external tools is
    # not checked, as before.
    system("jgraph $base.jgr >$base.eps");
    system("gs -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dEPSCrop -sDEVICE=png16m -r100 -sOutputFile=$base.png -dBATCH -dQUIT -dQUIET -dNOPAUSE $base.eps");
    print "<img src=$base.png>\n";
}
# skip(): discard lines from the <> input stream up to the next line that
# begins with the "<!-- -->" marker.  The marker line itself is echoed to
# standard output; the discarded lines are not.  If the input ends before
# a marker is seen, nothing is printed.
sub skip() {
    while (my $line = <>) {
        next unless $line =~ /^<!-- -->/;
        print $line;
        return;
    }
}
# Benchmark systems, in the order their rows appear in generated tables.
# Each name X has its log read from the file "benchlog.X".
@sys = qw(r70 c2 wreck mini);

# Human-readable description printed for each system.
%sysname = (
    r70   => "AMD Opteron 8214 HE, 2.2 GHz",
    c2    => "Intel Core2 Duo E7200, 2.53 GHz",
    wreck => "Intel Xeon 5150, 2.66 GHz (Mac Pro)",
    mini  => "Intel Core2 T5600, 1.83 GHz (Mac Mini)",
);

# Dispatch table: directive name found in a "<!-- benchlog NAME ... -->"
# comment => generator to run for it.
%func = (
    table => \&table,
    graph => \&graph,
);
# Load every system's benchmark log into %data:
#   $data{SYSTEM}{BENCHMARK} = { name, iter, "ns/op" [, "MB/s"] }
# Only lines of the form "NAME ITERATIONS NNN ns/op" are used; a trailing
# "X MB/s" figure is recorded when the line has one.  A later line with
# the same benchmark name replaces an earlier one.
foreach my $sys (@sys) {
    open(my $log, "<", "benchlog.$sys") || die "open benchlog.$sys: $!";

    my %sysdat;
    while (my $line = <$log>) {
        my ($bench, $iters, $ns) =
            $line =~ /^([A-Za-z0-9_\/]+)\s+(\d+)\s+(\d+) ns\/op/
            or next;

        my %row = (
            "name"  => $bench,
            "iter"  => $iters,
            "ns/op" => $ns,
        );
        # Throughput is only reported by the size-parameterised benchmarks.
        $row{"MB/s"} = $1 if $line =~ /([\d.]+) MB\/s/;

        $sysdat{$bench} = \%row;
    }
    close $log;

    $data{$sys} = \%sysdat;
}
# Main filter: copy the input to standard output, regenerating the content
# that follows each benchlog directive.
#
# A directive is a line starting with one of
#     <!-- benchlog FUNC -->
#     <!-- benchlog FUNC ARG -->
# where FUNC is a key of %func and ARG (word characters and "%") is passed
# to the generator.  The directive line is echoed, the generator prints
# fresh content, and skip() then drops the old content up to and including
# the search for the closing "<!-- -->" marker line, which is echoed.
#
# A directive naming a function that is not in %func is reported with the
# offending name and input line number rather than failing with a generic
# error from calling through an undefined table entry.
while (<>) {
    print;

    next unless /^<!-- benchlog (\w+)(?: ([%\w]+))? -->/;
    my ($directive, $arg) = ($1, $2);

    my $generator = $func{$directive}
        or die "unknown benchlog directive '$directive' at input line $.\n";

    # Pass the argument only when the directive supplied one.
    $generator->(defined($arg) ? $arg : ());
    skip();
}

View File

@ -0,0 +1 @@
xkcd.png is a cropped version of http://xkcd.com/208/

41
outside/re2/doc/mksyntaxgo Executable file
View File

@ -0,0 +1,41 @@
#!/bin/sh
# mksyntaxgo: generate the Go regexp/syntax package documentation file
# ($GOROOT/src/pkg/regexp/syntax/doc.go) from syntax.txt.
#
# Run from the directory that contains syntax.txt, with GOROOT set.
# syntax.txt is copied to the destination and then edited in place by the
# sam script in the here-document: it removes material that does not apply
# to Go (lines marked NOT SUPPORTED, everything from the Unicode character
# class listing onward, the guillemet markers, \C, ...), reformats the
# two-column entries with awk, replaces the first two lines with the Go
# license header and package comment (1,2c), appends the comment terminator
# and package clause ($a), then writes the file and quits.
#
# The here-document is fed to sam verbatim; do not add comments inside it.
set -e

# Refuse to run with GOROOT unset or empty; otherwise the destination would
# silently become /src/pkg/regexp/syntax/doc.go.
: "${GOROOT:?GOROOT must be set}"

out="$GOROOT/src/pkg/regexp/syntax/doc.go"
cp syntax.txt "$out"
sam -d "$out" <<'!'
,x g/NOT SUPPORTED/d
/^Unicode character class/,$d
,s/[«»]//g
,x g/^Possessive repetitions:/d
,x g/\\C/d
,x g/Flag syntax/d
,s/.=(true|false)/flag &/g
,s/^Flags:/ Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:\n/
,s/\n\n\n+/\n\n/g
,x/(^.* .*\n)+/ | awk -F' ' '{printf(" %-14s %s\n", $1, $2)}'
1,2c
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution.
/*
Package syntax parses regular expressions into parse trees and compiles
parse trees into programs. Most clients of regular expressions will use the
facilities of package regexp (such as Compile and Match) instead of this package.
Syntax
The regular expression syntax understood by this package when parsing with the Perl flag is as follows.
Parts of the syntax can be disabled by passing alternate flags to Parse.
.
$a
*/
package syntax
.
w
q
!

42
outside/re2/doc/mksyntaxhtml Executable file
View File

@ -0,0 +1,42 @@
#!/bin/sh
# mksyntaxhtml: generate syntax.html from syntax.txt.
#
# Run from the directory that contains syntax.txt.  The text is copied to
# syntax.html and edited in place by the sam script in the here-document:
# it escapes &, < and >, wraps the marked-up expressions in <code>, tags
# the vim/pcre/perl annotations, greys out entries marked NOT SUPPORTED,
# turns every line into a table row, replaces the first two lines with the
# HTML preamble (1,2c), appends the closing tags ($a), then writes the file
# and quits.
#
# The here-document is fed to sam verbatim; do not add comments inside it.

# Stop if the copy fails, so sam is never run on a stale syntax.html
# (matches mksyntaxgo).
set -e

cp syntax.txt syntax.html
sam -d syntax.html <<'!'
,s/\&/\&amp;/g
,s/</\&lt;/g
,s/>/\&gt;/g
,s!== (([^()]|\([^()]*\))*)!≡ <code>\1</code>!g
,s!«!<code>!g
,s!»!</code>!g
,s! vim$! <font size=-2>VIM</font>!g
,s! pcre$! <font size=-2>PCRE</font>!g
,s! perl$! <font size=-2>PERL</font>!g
,x g/NOT SUPPORTED/ s!^[^ ]+!<font color=#808080>&</font>!
,s!NOT SUPPORTED!!g
,s!(^[^ ]+) (.*)\n!<tr><td><code>\1</code></td><td>\2</td></tr>\n!g
,s!.*:$!<b>&</b>!g
,s!^$!<tr><td></td></tr>!g
,x v/<tr>/ s!.*!<tr><td colspan=2>&</td></tr>!
1,2c
<html>
<!-- AUTOMATICALLY GENERATED by mksyntaxhtml -->
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>RE2 regular expression syntax reference</title>
</head>
<body>
<h1>RE2 regular expression syntax reference</h1>
<table border=0 cellpadding=2 cellspacing=2>
<tr><td colspan=2>This page lists the regular expression syntax accepted by RE2.</td></tr>
<tr><td colspan=2>It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
<tr><td colspan=2>Grayed out expressions are not supported by RE2.</td></tr>
.
$a
</table>
</body>
</html>
.
w
q
!

36
outside/re2/doc/mksyntaxwiki Executable file
View File

@ -0,0 +1,36 @@
#!/bin/sh
# mksyntaxwiki: generate syntax.wiki from syntax.txt.
#
# Run from the directory that contains syntax.txt.  The text is copied to
# syntax.wiki and edited in place by the sam script in the here-document:
# it rewrites backquotes, wraps the marked-up expressions in backquotes,
# tags the vim/pcre/perl annotations, greys out entries marked
# NOT SUPPORTED, turns every line into a table row, replaces the first two
# lines with the wiki summary and table opener (1,2c), appends the closing
# </table> ($a), then writes the file and quits.
#
# The here-document is fed to sam verbatim; do not add comments inside it.

# Stop if the copy fails, so sam is never run on a stale syntax.wiki
# (matches mksyntaxgo).
set -e

cp syntax.txt syntax.wiki
sam -d syntax.wiki <<'!'
,s!`!`````!g
,s!== (([^()]|\([^()]*\))*)!≡ `\1`!g
,s!«!`!g
,s!»!`!g
,s! vim$! <font size="1">VIM</font>!g
,s! pcre$! <font size="1">PCRE</font>!g
,s! perl$! <font size="1">PERL</font>!g
,s!(^[^ ]+) (.*)\n!`\1` \2\n!g
,x g/NOT SUPPORTED/ s!^[^ ]+!<font color="#808080">&</font>!
,s!NOT SUPPORTED!<font size="1">(&)</font>!g
,s!(^[^ ]+) (.*)\n!<tr><td>\1</td><td>\2</td></tr>\n!g
,s!.*:$!<b>&</b>!g
,s!^$!<tr><td></td></tr>!g
,x v/<tr>/ s!.*!<tr><td colspan="2">&</td></tr>!
1,2c
#summary I define UNIX as “30 definitions of regular expressions living under one roof.” —Don Knuth
<wiki:comment>
GENERATED BY mksyntaxwiki. DO NOT EDIT
</wiki:comment>
<table border="0" cellpadding="2" cellspacing="2">
<tr><td colspan="2">This page lists the regular expression syntax accepted by RE2.</td></tr>
<tr><td colspan="2">It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
<tr><td colspan="2">Grayed out expressions are not supported by RE2.</td></tr>
.
$a
</table>
.
w
q
!

388
outside/re2/doc/syntax.html Normal file
View File

@ -0,0 +1,388 @@
<html>
<!-- AUTOMATICALLY GENERATED by mksyntaxhtml -->
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>RE2 regular expression syntax reference</title>
</head>
<body>
<h1>RE2 regular expression syntax reference</h1>
<table border=0 cellpadding=2 cellspacing=2>
<tr><td colspan=2>This page lists the regular expression syntax accepted by RE2.</td></tr>
<tr><td colspan=2>It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
<tr><td colspan=2>Grayed out expressions are not supported by RE2.</td></tr>
<tr><td colspan=2>See <a href="http://go/re2">http://go/re2</a> and <a href="http://go/re2quick">http://go/re2quick</a>.</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Single characters:</b></td></tr>
<tr><td><code>.</code></td><td>any character, possibly including newline (s=true)</td></tr>
<tr><td><code>[xyz]</code></td><td>character class</td></tr>
<tr><td><code>[^xyz]</code></td><td>negated character class</td></tr>
<tr><td><code>\d</code></td><td>Perl character class</td></tr>
<tr><td><code>\D</code></td><td>negated Perl character class</td></tr>
<tr><td><code>[:alpha:]</code></td><td>ASCII character class</td></tr>
<tr><td><code>[:^alpha:]</code></td><td>negated ASCII character class</td></tr>
<tr><td><code>\pN</code></td><td>Unicode character class (one-letter name)</td></tr>
<tr><td><code>\p{Greek}</code></td><td>Unicode character class</td></tr>
<tr><td><code>\PN</code></td><td>negated Unicode character class (one-letter name)</td></tr>
<tr><td><code>\P{Greek}</code></td><td>negated Unicode character class</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Composites:</b></td></tr>
<tr><td><code>xy</code></td><td><code>x</code> followed by <code>y</code></td></tr>
<tr><td><code>x|y</code></td><td><code>x</code> or <code>y</code> (prefer <code>x</code>)</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Repetitions:</b></td></tr>
<tr><td><code>x*</code></td><td>zero or more <code>x</code>, prefer more</td></tr>
<tr><td><code>x+</code></td><td>one or more <code>x</code>, prefer more</td></tr>
<tr><td><code>x?</code></td><td>zero or one <code>x</code>, prefer one</td></tr>
<tr><td><code>x{n,m}</code></td><td><code>n</code> or <code>n</code>+1 or ... or <code>m</code> <code>x</code>, prefer more</td></tr>
<tr><td><code>x{n,}</code></td><td><code>n</code> or more <code>x</code>, prefer more</td></tr>
<tr><td><code>x{n}</code></td><td>exactly <code>n</code> <code>x</code></td></tr>
<tr><td><code>x*?</code></td><td>zero or more <code>x</code>, prefer fewer</td></tr>
<tr><td><code>x+?</code></td><td>one or more <code>x</code>, prefer fewer</td></tr>
<tr><td><code>x??</code></td><td>zero or one <code>x</code>, prefer zero</td></tr>
<tr><td><code>x{n,m}?</code></td><td><code>n</code> or <code>n</code>+1 or ... or <code>m</code> <code>x</code>, prefer fewer</td></tr>
<tr><td><code>x{n,}?</code></td><td><code>n</code> or more <code>x</code>, prefer fewer</td></tr>
<tr><td><code>x{n}?</code></td><td>exactly <code>n</code> <code>x</code></td></tr>
<tr><td><code><font color=#808080>x{}</font></code></td><td>(≡ <code>x*</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>x{-}</font></code></td><td>(≡ <code>x*?</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>x{-n}</font></code></td><td>(≡ <code>x{n}?</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>x=</font></code></td><td>(≡ <code>x?</code>) <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Possessive repetitions:</b></td></tr>
<tr><td><code><font color=#808080>x*+</font></code></td><td>zero or more <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x++</font></code></td><td>one or more <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x?+</font></code></td><td>zero or one <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x{n,m}+</font></code></td><td><code>n</code> or ... or <code>m</code> <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x{n,}+</font></code></td><td><code>n</code> or more <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x{n}+</font></code></td><td>exactly <code>n</code> <code>x</code>, possessive </td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Grouping:</b></td></tr>
<tr><td><code>(re)</code></td><td>numbered capturing group</td></tr>
<tr><td><code>(?P&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group</td></tr>
<tr><td><code><font color=#808080>(?&lt;name&gt;re)</font></code></td><td>named &amp; numbered capturing group </td></tr>
<tr><td><code><font color=#808080>(?'name're)</font></code></td><td>named &amp; numbered capturing group </td></tr>
<tr><td><code>(?:re)</code></td><td>non-capturing group</td></tr>
<tr><td><code>(?flags)</code></td><td>set flags within current group; non-capturing</td></tr>
<tr><td><code>(?flags:re)</code></td><td>set flags during re; non-capturing</td></tr>
<tr><td><code><font color=#808080>(?#text)</font></code></td><td>comment </td></tr>
<tr><td><code><font color=#808080>(?|x|y|z)</font></code></td><td>branch numbering reset </td></tr>
<tr><td><code><font color=#808080>(?&gt;re)</font></code></td><td>possessive match of <code>re</code> </td></tr>
<tr><td><code><font color=#808080>re@&gt;</font></code></td><td>possessive match of <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>%(re)</font></code></td><td>non-capturing group <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Flags:</b></td></tr>
<tr><td><code>i</code></td><td>case-insensitive (default false)</td></tr>
<tr><td><code>m</code></td><td>multi-line mode (default false)</td></tr>
<tr><td><code>s</code></td><td>let <code>.</code> match <code>\n</code> (default false)</td></tr>
<tr><td><code>U</code></td><td>ungreedy: swap meaning of <code>x*</code> and <code>x*?</code>, <code>x+</code> and <code>x+?</code>, etc (default false)</td></tr>
<tr><td colspan=2>Flag syntax is <code>xyz</code> (set) or <code>-xyz</code> (clear) or <code>xy-z</code> (set <code>xy</code>, clear <code>z</code>).</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Empty strings:</b></td></tr>
<tr><td><code>^</code></td><td>at beginning of text or line (<code>m</code>=true)</td></tr>
<tr><td><code>$</code></td><td>at end of text or line (<code>m</code>=true)</td></tr>
<tr><td><code>\A</code></td><td>at beginning of text</td></tr>
<tr><td><code>\b</code></td><td>at word boundary (<code>\w</code> to left and <code>\W</code> to right or vice versa)</td></tr>
<tr><td><code>\B</code></td><td>not a word boundary</td></tr>
<tr><td><code><font color=#808080>\G</font></code></td><td>at beginning of subtext being searched <font size=-2>PCRE</font></td></tr>
<tr><td><code><font color=#808080>\G</font></code></td><td>at end of last match <font size=-2>PERL</font></td></tr>
<tr><td><code><font color=#808080>\Z</font></code></td><td>at end of text, or before newline at end of text </td></tr>
<tr><td><code>\z</code></td><td>at end of text</td></tr>
<tr><td><code><font color=#808080>(?=re)</font></code></td><td>before text matching <code>re</code> </td></tr>
<tr><td><code><font color=#808080>(?!re)</font></code></td><td>before text not matching <code>re</code> </td></tr>
<tr><td><code><font color=#808080>(?&lt;=re)</font></code></td><td>after text matching <code>re</code> </td></tr>
<tr><td><code><font color=#808080>(?&lt;!re)</font></code></td><td>after text not matching <code>re</code> </td></tr>
<tr><td><code><font color=#808080>re&amp;</font></code></td><td>before text matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>re@=</font></code></td><td>before text matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>re@!</font></code></td><td>before text not matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>re@&lt;=</font></code></td><td>after text matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>re@&lt;!</font></code></td><td>after text not matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\zs</font></code></td><td>sets start of match (= \K) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\ze</font></code></td><td>sets end of match <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%^</font></code></td><td>beginning of file <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%$</font></code></td><td>end of file <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%V</font></code></td><td>on screen <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%#</font></code></td><td>cursor position <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%'m</font></code></td><td>mark <code>m</code> position <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%23l</font></code></td><td>in line 23 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%23c</font></code></td><td>in column 23 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%23v</font></code></td><td>in virtual column 23 <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Escape sequences:</b></td></tr>
<tr><td><code>\a</code></td><td>bell (≡ <code>\007</code>)</td></tr>
<tr><td><code>\f</code></td><td>form feed (≡ <code>\014</code>)</td></tr>
<tr><td><code>\t</code></td><td>horizontal tab (≡ <code>\011</code>)</td></tr>
<tr><td><code>\n</code></td><td>newline (≡ <code>\012</code>)</td></tr>
<tr><td><code>\r</code></td><td>carriage return (≡ <code>\015</code>)</td></tr>
<tr><td><code>\v</code></td><td>vertical tab character (≡ <code>\013</code>)</td></tr>
<tr><td><code>\*</code></td><td>literal <code>*</code>, for any punctuation character <code>*</code></td></tr>
<tr><td><code>\123</code></td><td>octal character code (up to three digits)</td></tr>
<tr><td><code>\x7F</code></td><td>hex character code (exactly two digits)</td></tr>
<tr><td><code>\x{10FFFF}</code></td><td>hex character code</td></tr>
<tr><td><code>\C</code></td><td>match a single byte even in UTF-8 mode</td></tr>
<tr><td><code>\Q...\E</code></td><td>literal text <code>...</code> even if <code>...</code> has punctuation</td></tr>
<tr><td></td></tr>
<tr><td><code><font color=#808080>\1</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\b</font></code></td><td>backspace (use <code>\010</code>)</td></tr>
<tr><td><code><font color=#808080>\cK</font></code></td><td>control char ^K (use <code>\001</code> etc)</td></tr>
<tr><td><code><font color=#808080>\e</font></code></td><td>escape (use <code>\033</code>)</td></tr>
<tr><td><code><font color=#808080>\g1</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\g{1}</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\g{+1}</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\g{-1}</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\g{name}</font></code></td><td>named backreference </td></tr>
<tr><td><code><font color=#808080>\g&lt;name&gt;</font></code></td><td>subroutine call </td></tr>
<tr><td><code><font color=#808080>\g'name'</font></code></td><td>subroutine call </td></tr>
<tr><td><code><font color=#808080>\k&lt;name&gt;</font></code></td><td>named backreference </td></tr>
<tr><td><code><font color=#808080>\k'name'</font></code></td><td>named backreference </td></tr>
<tr><td><code><font color=#808080>\lX</font></code></td><td>lowercase <code>X</code> </td></tr>
<tr><td><code><font color=#808080>\ux</font></code></td><td>uppercase <code>x</code> </td></tr>
<tr><td><code><font color=#808080>\L...\E</font></code></td><td>lowercase text <code>...</code> </td></tr>
<tr><td><code><font color=#808080>\K</font></code></td><td>reset beginning of <code>$0</code> </td></tr>
<tr><td><code><font color=#808080>\N{name}</font></code></td><td>named Unicode character </td></tr>
<tr><td><code><font color=#808080>\R</font></code></td><td>line break </td></tr>
<tr><td><code><font color=#808080>\U...\E</font></code></td><td>upper case text <code>...</code> </td></tr>
<tr><td><code><font color=#808080>\X</font></code></td><td>extended Unicode sequence </td></tr>
<tr><td></td></tr>
<tr><td><code><font color=#808080>\%d123</font></code></td><td>decimal character 123 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%xFF</font></code></td><td>hex character FF <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%o123</font></code></td><td>octal character 123 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%u1234</font></code></td><td>Unicode character 0x1234 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%U12345678</font></code></td><td>Unicode character 0x12345678 <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Character class elements:</b></td></tr>
<tr><td><code>x</code></td><td>single character</td></tr>
<tr><td><code>A-Z</code></td><td>character range (inclusive)</td></tr>
<tr><td><code>\d</code></td><td>Perl character class</td></tr>
<tr><td><code>[:foo:]</code></td><td>ASCII character class <code>foo</code></td></tr>
<tr><td><code>\p{Foo}</code></td><td>Unicode character class <code>Foo</code></td></tr>
<tr><td><code>\pF</code></td><td>Unicode character class <code>F</code> (one-letter name)</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Named character classes as character class elements:</b></td></tr>
<tr><td><code>[\d]</code></td><td>digits (≡ <code>\d</code>)</td></tr>
<tr><td><code>[^\d]</code></td><td>not digits (≡ <code>\D</code>)</td></tr>
<tr><td><code>[\D]</code></td><td>not digits (≡ <code>\D</code>)</td></tr>
<tr><td><code>[^\D]</code></td><td>not not digits (≡ <code>\d</code>)</td></tr>
<tr><td><code>[[:name:]]</code></td><td>named ASCII class inside character class (≡ <code>[:name:]</code>)</td></tr>
<tr><td><code>[^[:name:]]</code></td><td>named ASCII class inside negated character class (≡ <code>[:^name:]</code>)</td></tr>
<tr><td><code>[\p{Name}]</code></td><td>named Unicode property inside character class (≡ <code>\p{Name}</code>)</td></tr>
<tr><td><code>[^\p{Name}]</code></td><td>named Unicode property inside negated character class (≡ <code>\P{Name}</code>)</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Perl character classes:</b></td></tr>
<tr><td><code>\d</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr>
<tr><td><code>\D</code></td><td>not digits (≡ <code>[^0-9]</code>)</td></tr>
<tr><td><code>\s</code></td><td>whitespace (≡ <code>[\t\n\f\r ]</code>)</td></tr>
<tr><td><code>\S</code></td><td>not whitespace (≡ <code>[^\t\n\f\r ]</code>)</td></tr>
<tr><td><code>\w</code></td><td>word characters (≡ <code>[0-9A-Za-z_]</code>)</td></tr>
<tr><td><code>\W</code></td><td>not word characters (≡ <code>[^0-9A-Za-z_]</code>)</td></tr>
<tr><td></td></tr>
<tr><td><code><font color=#808080>\h</font></code></td><td>horizontal space </td></tr>
<tr><td><code><font color=#808080>\H</font></code></td><td>not horizontal space </td></tr>
<tr><td><code><font color=#808080>\v</font></code></td><td>vertical space </td></tr>
<tr><td><code><font color=#808080>\V</font></code></td><td>not vertical space </td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>ASCII character classes:</b></td></tr>
<tr><td><code>[:alnum:]</code></td><td>alphanumeric (≡ <code>[0-9A-Za-z]</code>)</td></tr>
<tr><td><code>[:alpha:]</code></td><td>alphabetic (≡ <code>[A-Za-z]</code>)</td></tr>
<tr><td><code>[:ascii:]</code></td><td>ASCII (≡ <code>[\x00-\x7F]</code>)</td></tr>
<tr><td><code>[:blank:]</code></td><td>blank (≡ <code>[\t ]</code>)</td></tr>
<tr><td><code>[:cntrl:]</code></td><td>control (≡ <code>[\x00-\x1F\x7F]</code>)</td></tr>
<tr><td><code>[:digit:]</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr>
<tr><td><code>[:graph:]</code></td><td>graphical (≡ <code>[!-~] == [A-Za-z0-9!"#$%&amp;'()*+,\-./:;&lt;=&gt;?@[\\\]^_`{|}~]</code>)</td></tr>
<tr><td><code>[:lower:]</code></td><td>lower case (≡ <code>[a-z]</code>)</td></tr>
<tr><td><code>[:print:]</code></td><td>printable (≡ <code>[ -~] == [ [:graph:]]</code>)</td></tr>
<tr><td><code>[:punct:]</code></td><td>punctuation (≡ <code>[!-/:-@[-`{-~]</code>)</td></tr>
<tr><td><code>[:space:]</code></td><td>whitespace (≡ <code>[\t\n\v\f\r ]</code>)</td></tr>
<tr><td><code>[:upper:]</code></td><td>upper case (≡ <code>[A-Z]</code>)</td></tr>
<tr><td><code>[:word:]</code></td><td>word characters (≡ <code>[0-9A-Za-z_]</code>)</td></tr>
<tr><td><code>[:xdigit:]</code></td><td>hex digit (≡ <code>[0-9A-Fa-f]</code>)</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Unicode character class names--general category:</b></td></tr>
<tr><td><code>C</code></td><td>other</td></tr>
<tr><td><code>Cc</code></td><td>control</td></tr>
<tr><td><code>Cf</code></td><td>format</td></tr>
<tr><td><code><font color=#808080>Cn</font></code></td><td>unassigned code points </td></tr>
<tr><td><code>Co</code></td><td>private use</td></tr>
<tr><td><code>Cs</code></td><td>surrogate</td></tr>
<tr><td><code>L</code></td><td>letter</td></tr>
<tr><td><code><font color=#808080>LC</font></code></td><td>cased letter </td></tr>
<tr><td><code><font color=#808080>L&amp;</font></code></td><td>cased letter </td></tr>
<tr><td><code>Ll</code></td><td>lowercase letter</td></tr>
<tr><td><code>Lm</code></td><td>modifier letter</td></tr>
<tr><td><code>Lo</code></td><td>other letter</td></tr>
<tr><td><code>Lt</code></td><td>titlecase letter</td></tr>
<tr><td><code>Lu</code></td><td>uppercase letter</td></tr>
<tr><td><code>M</code></td><td>mark</td></tr>
<tr><td><code>Mc</code></td><td>spacing mark</td></tr>
<tr><td><code>Me</code></td><td>enclosing mark</td></tr>
<tr><td><code>Mn</code></td><td>non-spacing mark</td></tr>
<tr><td><code>N</code></td><td>number</td></tr>
<tr><td><code>Nd</code></td><td>decimal number</td></tr>
<tr><td><code>Nl</code></td><td>letter number</td></tr>
<tr><td><code>No</code></td><td>other number</td></tr>
<tr><td><code>P</code></td><td>punctuation</td></tr>
<tr><td><code>Pc</code></td><td>connector punctuation</td></tr>
<tr><td><code>Pd</code></td><td>dash punctuation</td></tr>
<tr><td><code>Pe</code></td><td>close punctuation</td></tr>
<tr><td><code>Pf</code></td><td>final punctuation</td></tr>
<tr><td><code>Pi</code></td><td>initial punctuation</td></tr>
<tr><td><code>Po</code></td><td>other punctuation</td></tr>
<tr><td><code>Ps</code></td><td>open punctuation</td></tr>
<tr><td><code>S</code></td><td>symbol</td></tr>
<tr><td><code>Sc</code></td><td>currency symbol</td></tr>
<tr><td><code>Sk</code></td><td>modifier symbol</td></tr>
<tr><td><code>Sm</code></td><td>math symbol</td></tr>
<tr><td><code>So</code></td><td>other symbol</td></tr>
<tr><td><code>Z</code></td><td>separator</td></tr>
<tr><td><code>Zl</code></td><td>line separator</td></tr>
<tr><td><code>Zp</code></td><td>paragraph separator</td></tr>
<tr><td><code>Zs</code></td><td>space separator</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Unicode character class names--scripts:</b></td></tr>
<tr><td><code>Arabic</code></td><td>Arabic</td></tr>
<tr><td><code>Armenian</code></td><td>Armenian</td></tr>
<tr><td><code>Balinese</code></td><td>Balinese</td></tr>
<tr><td><code>Bengali</code></td><td>Bengali</td></tr>
<tr><td><code>Bopomofo</code></td><td>Bopomofo</td></tr>
<tr><td><code>Braille</code></td><td>Braille</td></tr>
<tr><td><code>Buginese</code></td><td>Buginese</td></tr>
<tr><td><code>Buhid</code></td><td>Buhid</td></tr>
<tr><td><code>Canadian_Aboriginal</code></td><td>Canadian Aboriginal</td></tr>
<tr><td><code>Carian</code></td><td>Carian</td></tr>
<tr><td><code>Cham</code></td><td>Cham</td></tr>
<tr><td><code>Cherokee</code></td><td>Cherokee</td></tr>
<tr><td><code>Common</code></td><td>characters not specific to one script</td></tr>
<tr><td><code>Coptic</code></td><td>Coptic</td></tr>
<tr><td><code>Cuneiform</code></td><td>Cuneiform</td></tr>
<tr><td><code>Cypriot</code></td><td>Cypriot</td></tr>
<tr><td><code>Cyrillic</code></td><td>Cyrillic</td></tr>
<tr><td><code>Deseret</code></td><td>Deseret</td></tr>
<tr><td><code>Devanagari</code></td><td>Devanagari</td></tr>
<tr><td><code>Ethiopic</code></td><td>Ethiopic</td></tr>
<tr><td><code>Georgian</code></td><td>Georgian</td></tr>
<tr><td><code>Glagolitic</code></td><td>Glagolitic</td></tr>
<tr><td><code>Gothic</code></td><td>Gothic</td></tr>
<tr><td><code>Greek</code></td><td>Greek</td></tr>
<tr><td><code>Gujarati</code></td><td>Gujarati</td></tr>
<tr><td><code>Gurmukhi</code></td><td>Gurmukhi</td></tr>
<tr><td><code>Han</code></td><td>Han</td></tr>
<tr><td><code>Hangul</code></td><td>Hangul</td></tr>
<tr><td><code>Hanunoo</code></td><td>Hanunoo</td></tr>
<tr><td><code>Hebrew</code></td><td>Hebrew</td></tr>
<tr><td><code>Hiragana</code></td><td>Hiragana</td></tr>
<tr><td><code>Inherited</code></td><td>inherit script from previous character</td></tr>
<tr><td><code>Kannada</code></td><td>Kannada</td></tr>
<tr><td><code>Katakana</code></td><td>Katakana</td></tr>
<tr><td><code>Kayah_Li</code></td><td>Kayah Li</td></tr>
<tr><td><code>Kharoshthi</code></td><td>Kharoshthi</td></tr>
<tr><td><code>Khmer</code></td><td>Khmer</td></tr>
<tr><td><code>Lao</code></td><td>Lao</td></tr>
<tr><td><code>Latin</code></td><td>Latin</td></tr>
<tr><td><code>Lepcha</code></td><td>Lepcha</td></tr>
<tr><td><code>Limbu</code></td><td>Limbu</td></tr>
<tr><td><code>Linear_B</code></td><td>Linear B</td></tr>
<tr><td><code>Lycian</code></td><td>Lycian</td></tr>
<tr><td><code>Lydian</code></td><td>Lydian</td></tr>
<tr><td><code>Malayalam</code></td><td>Malayalam</td></tr>
<tr><td><code>Mongolian</code></td><td>Mongolian</td></tr>
<tr><td><code>Myanmar</code></td><td>Myanmar</td></tr>
<tr><td><code>New_Tai_Lue</code></td><td>New Tai Lue (aka Simplified Tai Lue)</td></tr>
<tr><td><code>Nko</code></td><td>Nko</td></tr>
<tr><td><code>Ogham</code></td><td>Ogham</td></tr>
<tr><td><code>Ol_Chiki</code></td><td>Ol Chiki</td></tr>
<tr><td><code>Old_Italic</code></td><td>Old Italic</td></tr>
<tr><td><code>Old_Persian</code></td><td>Old Persian</td></tr>
<tr><td><code>Oriya</code></td><td>Oriya</td></tr>
<tr><td><code>Osmanya</code></td><td>Osmanya</td></tr>
<tr><td><code>Phags_Pa</code></td><td>'Phags Pa</td></tr>
<tr><td><code>Phoenician</code></td><td>Phoenician</td></tr>
<tr><td><code>Rejang</code></td><td>Rejang</td></tr>
<tr><td><code>Runic</code></td><td>Runic</td></tr>
<tr><td><code>Saurashtra</code></td><td>Saurashtra</td></tr>
<tr><td><code>Shavian</code></td><td>Shavian</td></tr>
<tr><td><code>Sinhala</code></td><td>Sinhala</td></tr>
<tr><td><code>Sundanese</code></td><td>Sundanese</td></tr>
<tr><td><code>Syloti_Nagri</code></td><td>Syloti Nagri</td></tr>
<tr><td><code>Syriac</code></td><td>Syriac</td></tr>
<tr><td><code>Tagalog</code></td><td>Tagalog</td></tr>
<tr><td><code>Tagbanwa</code></td><td>Tagbanwa</td></tr>
<tr><td><code>Tai_Le</code></td><td>Tai Le</td></tr>
<tr><td><code>Tamil</code></td><td>Tamil</td></tr>
<tr><td><code>Telugu</code></td><td>Telugu</td></tr>
<tr><td><code>Thaana</code></td><td>Thaana</td></tr>
<tr><td><code>Thai</code></td><td>Thai</td></tr>
<tr><td><code>Tibetan</code></td><td>Tibetan</td></tr>
<tr><td><code>Tifinagh</code></td><td>Tifinagh</td></tr>
<tr><td><code>Ugaritic</code></td><td>Ugaritic</td></tr>
<tr><td><code>Vai</code></td><td>Vai</td></tr>
<tr><td><code>Yi</code></td><td>Yi</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Vim character classes:</b></td></tr>
<tr><td><code><font color=#808080>\i</font></code></td><td>identifier character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\I</font></code></td><td><code>\i</code> except digits <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\k</font></code></td><td>keyword character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\K</font></code></td><td><code>\k</code> except digits <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\f</font></code></td><td>file name character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\F</font></code></td><td><code>\f</code> except digits <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\p</font></code></td><td>printable character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\P</font></code></td><td><code>\p</code> except digits <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\s</font></code></td><td>whitespace character (≡ <code>[ \t]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\S</font></code></td><td>non-white space character (≡ <code>[^ \t]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code>\d</code></td><td>digits (≡ <code>[0-9]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code>\D</code></td><td>not <code>\d</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\x</font></code></td><td>hex digits (≡ <code>[0-9A-Fa-f]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\X</font></code></td><td>not <code>\x</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\o</font></code></td><td>octal digits (≡ <code>[0-7]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\O</font></code></td><td>not <code>\o</code> <font size=-2>VIM</font></td></tr>
<tr><td><code>\w</code></td><td>word character <font size=-2>VIM</font></td></tr>
<tr><td><code>\W</code></td><td>not <code>\w</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\h</font></code></td><td>head of word character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\H</font></code></td><td>not <code>\h</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\a</font></code></td><td>alphabetic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\A</font></code></td><td>not <code>\a</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\l</font></code></td><td>lowercase <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\L</font></code></td><td>not lowercase <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\u</font></code></td><td>uppercase <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\U</font></code></td><td>not uppercase <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\_x</font></code></td><td><code>\x</code> plus newline, for any <code>x</code> <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Vim flags:</b></td></tr>
<tr><td><code><font color=#808080>\c</font></code></td><td>ignore case <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\C</font></code></td><td>match case <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\m</font></code></td><td>magic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\M</font></code></td><td>nomagic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\v</font></code></td><td>verymagic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\V</font></code></td><td>verynomagic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\Z</font></code></td><td>ignore differences in Unicode combining characters <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Magic:</b></td></tr>
<tr><td><code><font color=#808080>(?{code})</font></code></td><td>arbitrary Perl code <font size=-2>PERL</font></td></tr>
<tr><td><code><font color=#808080>(??{code})</font></code></td><td>postponed arbitrary Perl code <font size=-2>PERL</font></td></tr>
<tr><td><code><font color=#808080>(?n)</font></code></td><td>recursive call to regexp capturing group <code>n</code> </td></tr>
<tr><td><code><font color=#808080>(?+n)</font></code></td><td>recursive call to relative group <code>+n</code> </td></tr>
<tr><td><code><font color=#808080>(?-n)</font></code></td><td>recursive call to relative group <code>-n</code> </td></tr>
<tr><td><code><font color=#808080>(?C)</font></code></td><td>PCRE callout <font size=-2>PCRE</font></td></tr>
<tr><td><code><font color=#808080>(?R)</font></code></td><td>recursive call to entire regexp (≡ <code>(?0)</code>) </td></tr>
<tr><td><code><font color=#808080>(?&amp;name)</font></code></td><td>recursive call to named group </td></tr>
<tr><td><code><font color=#808080>(?P=name)</font></code></td><td>named backreference </td></tr>
<tr><td><code><font color=#808080>(?P&gt;name)</font></code></td><td>recursive call to named group </td></tr>
<tr><td><code><font color=#808080>(?(cond)true|false)</font></code></td><td>conditional branch </td></tr>
<tr><td><code><font color=#808080>(?(cond)true)</font></code></td><td>conditional branch </td></tr>
<tr><td><code><font color=#808080>(*ACCEPT)</font></code></td><td>make regexps more like Prolog </td></tr>
<tr><td><code><font color=#808080>(*COMMIT)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*F)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*FAIL)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*MARK)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*PRUNE)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*SKIP)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*THEN)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*ANY)</font></code></td><td>set newline convention </td></tr>
<tr><td><code><font color=#808080>(*ANYCRLF)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*CR)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*CRLF)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*LF)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*BSR_ANYCRLF)</font></code></td><td>set \R convention <font size=-2>PCRE</font></td></tr>
<tr><td><code><font color=#808080>(*BSR_UNICODE)</font></code></td><td> <font size=-2>PCRE</font></td></tr>
<tr><td></td></tr>
</table>
</body>
</html>

395
outside/re2/doc/syntax.txt Normal file
View File

@ -0,0 +1,395 @@
RE2 regular expression syntax reference
---------------------------------------
Single characters:
. any character, possibly including newline (s=true)
[xyz] character class
[^xyz] negated character class
\d Perl character class
\D negated Perl character class
[:alpha:] ASCII character class
[:^alpha:] negated ASCII character class
\pN Unicode character class (one-letter name)
\p{Greek} Unicode character class
\PN negated Unicode character class (one-letter name)
\P{Greek} negated Unicode character class
Composites:
xy «x» followed by «y»
x|y «x» or «y» (prefer «x»)
Repetitions:
x* zero or more «x», prefer more
x+ one or more «x», prefer more
x? zero or one «x», prefer one
x{n,m} «n» or «n»+1 or ... or «m» «x», prefer more
x{n,} «n» or more «x», prefer more
x{n} exactly «n» «x»
x*? zero or more «x», prefer fewer
x+? one or more «x», prefer fewer
x?? zero or one «x», prefer zero
x{n,m}? «n» or «n»+1 or ... or «m» «x», prefer fewer
x{n,}? «n» or more «x», prefer fewer
x{n}? exactly «n» «x»
x{} (== x*) NOT SUPPORTED vim
x{-} (== x*?) NOT SUPPORTED vim
x{-n} (== x{n}?) NOT SUPPORTED vim
x= (== x?) NOT SUPPORTED vim
Possessive repetitions:
x*+ zero or more «x», possessive NOT SUPPORTED
x++ one or more «x», possessive NOT SUPPORTED
x?+ zero or one «x», possessive NOT SUPPORTED
x{n,m}+ «n» or ... or «m» «x», possessive NOT SUPPORTED
x{n,}+ «n» or more «x», possessive NOT SUPPORTED
x{n}+ exactly «n» «x», possessive NOT SUPPORTED
Grouping:
(re) numbered capturing group
(?P<name>re) named & numbered capturing group
(?<name>re) named & numbered capturing group NOT SUPPORTED
(?'name're) named & numbered capturing group NOT SUPPORTED
(?:re) non-capturing group
(?flags) set flags within current group; non-capturing
(?flags:re) set flags during re; non-capturing
(?#text) comment NOT SUPPORTED
(?|x|y|z) branch numbering reset NOT SUPPORTED
(?>re) possessive match of «re» NOT SUPPORTED
re@> possessive match of «re» NOT SUPPORTED vim
%(re) non-capturing group NOT SUPPORTED vim
Flags:
i case-insensitive (default false)
m multi-line mode: «^» and «$» match begin/end line in addition to begin/end text (default false)
s let «.» match «\n» (default false)
U ungreedy: swap meaning of «x*» and «x*?», «x+» and «x+?», etc (default false)
Flag syntax is «xyz» (set) or «-xyz» (clear) or «xy-z» (set «xy», clear «z»).
Empty strings:
^ at beginning of text or line («m»=true)
$ at end of text (like «\z» not «\Z») or line («m»=true)
\A at beginning of text
\b at word boundary («\w» on one side and «\W», «\A», or «\z» on the other)
\B not a word boundary
\G at beginning of subtext being searched NOT SUPPORTED pcre
\G at end of last match NOT SUPPORTED perl
\Z at end of text, or before newline at end of text NOT SUPPORTED
\z at end of text
(?=re) before text matching «re» NOT SUPPORTED
(?!re) before text not matching «re» NOT SUPPORTED
(?<=re) after text matching «re» NOT SUPPORTED
(?<!re) after text not matching «re» NOT SUPPORTED
re& before text matching «re» NOT SUPPORTED vim
re@= before text matching «re» NOT SUPPORTED vim
re@! before text not matching «re» NOT SUPPORTED vim
re@<= after text matching «re» NOT SUPPORTED vim
re@<! after text not matching «re» NOT SUPPORTED vim
\zs sets start of match (== \K) NOT SUPPORTED vim
\ze sets end of match NOT SUPPORTED vim
\%^ beginning of file NOT SUPPORTED vim
\%$ end of file NOT SUPPORTED vim
\%V on screen NOT SUPPORTED vim
\%# cursor position NOT SUPPORTED vim
\%'m mark «m» position NOT SUPPORTED vim
\%23l in line 23 NOT SUPPORTED vim
\%23c in column 23 NOT SUPPORTED vim
\%23v in virtual column 23 NOT SUPPORTED vim
Escape sequences:
\a bell (== \007)
\f form feed (== \014)
\t horizontal tab (== \011)
\n newline (== \012)
\r carriage return (== \015)
\v vertical tab character (== \013)
\* literal «*», for any punctuation character «*»
\123 octal character code (up to three digits)
\x7F hex character code (exactly two digits)
\x{10FFFF} hex character code
\C match a single byte even in UTF-8 mode
\Q...\E literal text «...» even if «...» has punctuation
\1 backreference NOT SUPPORTED
\b backspace NOT SUPPORTED (use «\010»)
\cK control char ^K NOT SUPPORTED (use «\001» etc)
\e escape NOT SUPPORTED (use «\033»)
\g1 backreference NOT SUPPORTED
\g{1} backreference NOT SUPPORTED
\g{+1} backreference NOT SUPPORTED
\g{-1} backreference NOT SUPPORTED
\g{name} named backreference NOT SUPPORTED
\g<name> subroutine call NOT SUPPORTED
\g'name' subroutine call NOT SUPPORTED
\k<name> named backreference NOT SUPPORTED
\k'name' named backreference NOT SUPPORTED
\lX lowercase «X» NOT SUPPORTED
\ux uppercase «x» NOT SUPPORTED
\L...\E lowercase text «...» NOT SUPPORTED
\K reset beginning of «$0» NOT SUPPORTED
\N{name} named Unicode character NOT SUPPORTED
\R line break NOT SUPPORTED
\U...\E upper case text «...» NOT SUPPORTED
\X extended Unicode sequence NOT SUPPORTED
\%d123 decimal character 123 NOT SUPPORTED vim
\%xFF hex character FF NOT SUPPORTED vim
\%o123 octal character 123 NOT SUPPORTED vim
\%u1234 Unicode character 0x1234 NOT SUPPORTED vim
\%U12345678 Unicode character 0x12345678 NOT SUPPORTED vim
Character class elements:
x single character
A-Z character range (inclusive)
\d Perl character class
[:foo:] ASCII character class «foo»
\p{Foo} Unicode character class «Foo»
\pF Unicode character class «F» (one-letter name)
Named character classes as character class elements:
[\d] digits (== \d)
[^\d] not digits (== \D)
[\D] not digits (== \D)
[^\D] not not digits (== \d)
[[:name:]] named ASCII class inside character class (== [:name:])
[^[:name:]] named ASCII class inside negated character class (== [:^name:])
[\p{Name}] named Unicode property inside character class (== \p{Name})
[^\p{Name}] named Unicode property inside negated character class (== \P{Name})
Perl character classes:
\d digits (== [0-9])
\D not digits (== [^0-9])
\s whitespace (== [\t\n\f\r ])
\S not whitespace (== [^\t\n\f\r ])
\w word characters (== [0-9A-Za-z_])
\W not word characters (== [^0-9A-Za-z_])
\h horizontal space NOT SUPPORTED
\H not horizontal space NOT SUPPORTED
\v vertical space NOT SUPPORTED
\V not vertical space NOT SUPPORTED
ASCII character classes:
[:alnum:] alphanumeric (== [0-9A-Za-z])
[:alpha:] alphabetic (== [A-Za-z])
[:ascii:] ASCII (== [\x00-\x7F])
[:blank:] blank (== [\t ])
[:cntrl:] control (== [\x00-\x1F\x7F])
[:digit:] digits (== [0-9])
[:graph:] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
[:lower:] lower case (== [a-z])
[:print:] printable (== [ -~] == [ [:graph:]])
[:punct:] punctuation (== [!-/:-@[-`{-~])
[:space:] whitespace (== [\t\n\v\f\r ])
[:upper:] upper case (== [A-Z])
[:word:] word characters (== [0-9A-Za-z_])
[:xdigit:] hex digit (== [0-9A-Fa-f])
Unicode character class names--general category:
C other
Cc control
Cf format
Cn unassigned code points NOT SUPPORTED
Co private use
Cs surrogate
L letter
LC cased letter NOT SUPPORTED
L& cased letter NOT SUPPORTED
Ll lowercase letter
Lm modifier letter
Lo other letter
Lt titlecase letter
Lu uppercase letter
M mark
Mc spacing mark
Me enclosing mark
Mn non-spacing mark
N number
Nd decimal number
Nl letter number
No other number
P punctuation
Pc connector punctuation
Pd dash punctuation
Pe close punctuation
Pf final punctuation
Pi initial punctuation
Po other punctuation
Ps open punctuation
S symbol
Sc currency symbol
Sk modifier symbol
Sm math symbol
So other symbol
Z separator
Zl line separator
Zp paragraph separator
Zs space separator
Unicode character class names--scripts:
Arabic Arabic
Armenian Armenian
Balinese Balinese
Bamum Bamum
Batak Batak
Bengali Bengali
Bopomofo Bopomofo
Brahmi Brahmi
Braille Braille
Buginese Buginese
Buhid Buhid
Canadian_Aboriginal Canadian Aboriginal
Carian Carian
Chakma Chakma
Cham Cham
Cherokee Cherokee
Common characters not specific to one script
Coptic Coptic
Cuneiform Cuneiform
Cypriot Cypriot
Cyrillic Cyrillic
Deseret Deseret
Devanagari Devanagari
Egyptian_Hieroglyphs Egyptian Hieroglyphs
Ethiopic Ethiopic
Georgian Georgian
Glagolitic Glagolitic
Gothic Gothic
Greek Greek
Gujarati Gujarati
Gurmukhi Gurmukhi
Han Han
Hangul Hangul
Hanunoo Hanunoo
Hebrew Hebrew
Hiragana Hiragana
Imperial_Aramaic Imperial Aramaic
Inherited inherit script from previous character
Inscriptional_Pahlavi Inscriptional Pahlavi
Inscriptional_Parthian Inscriptional Parthian
Javanese Javanese
Kaithi Kaithi
Kannada Kannada
Katakana Katakana
Kayah_Li Kayah Li
Kharoshthi Kharoshthi
Khmer Khmer
Lao Lao
Latin Latin
Lepcha Lepcha
Limbu Limbu
Linear_B Linear B
Lycian Lycian
Lydian Lydian
Malayalam Malayalam
Mandaic Mandaic
Meetei_Mayek Meetei Mayek
Meroitic_Cursive Meroitic Cursive
Meroitic_Hieroglyphs Meroitic Hieroglyphs
Miao Miao
Mongolian Mongolian
Myanmar Myanmar
New_Tai_Lue New Tai Lue (aka Simplified Tai Lue)
Nko Nko
Ogham Ogham
Ol_Chiki Ol Chiki
Old_Italic Old Italic
Old_Persian Old Persian
Old_South_Arabian Old South Arabian
Old_Turkic Old Turkic
Oriya Oriya
Osmanya Osmanya
Phags_Pa 'Phags Pa
Phoenician Phoenician
Rejang Rejang
Runic Runic
Saurashtra Saurashtra
Sharada Sharada
Shavian Shavian
Sinhala Sinhala
Sora_Sompeng Sora Sompeng
Sundanese Sundanese
Syloti_Nagri Syloti Nagri
Syriac Syriac
Tagalog Tagalog
Tagbanwa Tagbanwa
Tai_Le Tai Le
Tai_Tham Tai Tham
Tai_Viet Tai Viet
Takri Takri
Tamil Tamil
Telugu Telugu
Thaana Thaana
Thai Thai
Tibetan Tibetan
Tifinagh Tifinagh
Ugaritic Ugaritic
Vai Vai
Yi Yi
Vim character classes:
\i identifier character NOT SUPPORTED vim
\I «\i» except digits NOT SUPPORTED vim
\k keyword character NOT SUPPORTED vim
\K «\k» except digits NOT SUPPORTED vim
\f file name character NOT SUPPORTED vim
\F «\f» except digits NOT SUPPORTED vim
\p printable character NOT SUPPORTED vim
\P «\p» except digits NOT SUPPORTED vim
\s whitespace character (== [ \t]) NOT SUPPORTED vim
\S non-white space character (== [^ \t]) NOT SUPPORTED vim
\d digits (== [0-9]) vim
\D not «\d» vim
\x hex digits (== [0-9A-Fa-f]) NOT SUPPORTED vim
\X not «\x» NOT SUPPORTED vim
\o octal digits (== [0-7]) NOT SUPPORTED vim
\O not «\o» NOT SUPPORTED vim
\w word character vim
\W not «\w» vim
\h head of word character NOT SUPPORTED vim
\H not «\h» NOT SUPPORTED vim
\a alphabetic NOT SUPPORTED vim
\A not «\a» NOT SUPPORTED vim
\l lowercase NOT SUPPORTED vim
\L not lowercase NOT SUPPORTED vim
\u uppercase NOT SUPPORTED vim
\U not uppercase NOT SUPPORTED vim
\_x «\x» plus newline, for any «x» NOT SUPPORTED vim
Vim flags:
\c ignore case NOT SUPPORTED vim
\C match case NOT SUPPORTED vim
\m magic NOT SUPPORTED vim
\M nomagic NOT SUPPORTED vim
\v verymagic NOT SUPPORTED vim
\V verynomagic NOT SUPPORTED vim
\Z ignore differences in Unicode combining characters NOT SUPPORTED vim
Magic:
(?{code}) arbitrary Perl code NOT SUPPORTED perl
(??{code}) postponed arbitrary Perl code NOT SUPPORTED perl
(?n) recursive call to regexp capturing group «n» NOT SUPPORTED
(?+n) recursive call to relative group «+n» NOT SUPPORTED
(?-n) recursive call to relative group «-n» NOT SUPPORTED
(?C) PCRE callout NOT SUPPORTED pcre
(?R) recursive call to entire regexp (== (?0)) NOT SUPPORTED
(?&name) recursive call to named group NOT SUPPORTED
(?P=name) named backreference NOT SUPPORTED
(?P>name) recursive call to named group NOT SUPPORTED
(?(cond)true|false) conditional branch NOT SUPPORTED
(?(cond)true) conditional branch NOT SUPPORTED
(*ACCEPT) make regexps more like Prolog NOT SUPPORTED
(*COMMIT) NOT SUPPORTED
(*F) NOT SUPPORTED
(*FAIL) NOT SUPPORTED
(*MARK) NOT SUPPORTED
(*PRUNE) NOT SUPPORTED
(*SKIP) NOT SUPPORTED
(*THEN) NOT SUPPORTED
(*ANY) set newline convention NOT SUPPORTED
(*ANYCRLF) NOT SUPPORTED
(*CR) NOT SUPPORTED
(*CRLF) NOT SUPPORTED
(*LF) NOT SUPPORTED
(*BSR_ANYCRLF) set \R convention NOT SUPPORTED pcre
(*BSR_UNICODE) NOT SUPPORTED pcre

BIN
outside/re2/doc/xkcd.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

View File

@ -0,0 +1 @@
defaultcc: re2-dev@googlegroups.com

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,16 @@
{
global:
# re2::RE2*
_ZN3re23RE2*;
_ZNK3re23RE2*;
# re2::StringPiece*
_ZN3re211StringPiece*;
_ZNK3re211StringPiece*;
# operator<<(std::ostream&, re2::StringPiece const&)
_ZlsRSoRKN3re211StringPieceE;
# re2::FilteredRE2*
_ZN3re211FilteredRE2*;
_ZNK3re211FilteredRE210AllMatches*;
local:
*;
};

View File

@ -0,0 +1,13 @@
# Linker doesn't like these unmangled:
# re2::RE2*
__ZN3re23RE2*
__ZNK3re23RE2*
# re2::StringPiece*
__ZN3re211StringPiece*
__ZNK3re211StringPiece*
# operator<<(std::ostream&, re2::StringPiece const&)
__ZlsRNSt3__113basic_ostreamIcNS_11char_traitsIcEEEERKN3re211StringPieceE
# re2::FilteredRE2*
__ZN3re211FilteredRE2*
__ZNK3re211FilteredRE210AllMatches*

1
outside/re2/re2/Makefile Normal file
View File

@ -0,0 +1 @@

378
outside/re2/re2/bitstate.cc Normal file
View File

@ -0,0 +1,378 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
// Prog::SearchBitState is a regular expression search with submatch
// tracking for small regular expressions and texts. Like
// testing/backtrack.cc, it allocates a bit vector with (length of
// text) * (length of prog) bits, to make sure it never explores the
// same (character position, instruction) state multiple times. This
// limits the search to run in time linear in the length of the text.
//
// Unlike testing/backtrack.cc, SearchBitState is not recursive
// on the text.
//
// SearchBitState is a fast replacement for the NFA code on small
// regexps and texts when SearchOnePass cannot be used.
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
// One entry on BitState's explicit backtracking stack (see BitState::Push
// and BitState::TrySearch).
struct Job {
// Instruction id; resolved with Prog::inst(id) when the job is popped.
int id;
// 0 for a first visit of (id, p); nonzero when resuming an instruction
// that re-pushed itself (kInstAlt, kInstCapture).
int arg;
// Text position to resume at.  For a kInstCapture resume job this instead
// carries the capture register's previous value, to be restored.
const char* p;
};
// Parameters and scratch state for one bit-state backtracking search over
// a Prog.  The scratch buffers are allocated by Search() and released by
// the destructor.
class BitState {
public:
explicit BitState(Prog* prog);
~BitState();
// The usual Search prototype.
// Can only call Search once per BitState.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
private:
// Returns true the first time it is asked about (id, p) and marks the pair
// in visited_; returns false on every later call for the same pair.
inline bool ShouldVisit(int id, const char* p);
// Pushes the job (id, p, arg) onto job_, growing the stack if it is full.
// Silently drops the job if id is a kInstFail instruction or if arg == 0
// and (id, p) was already visited.
void Push(int id, const char* p, int arg);
// Doubles the capacity of job_.  Returns false (after logging) if the
// stack is still full afterwards.
bool GrowStack();
// Runs the backtracking loop from instruction id at text position p.
// Returns whether a match was found.
bool TrySearch(int id, const char* p);
// Search parameters
Prog* prog_; // program being run
StringPiece text_; // text being searched
StringPiece context_; // greater context of text being searched
bool anchored_; // whether search is anchored at text.begin()
bool longest_; // whether search wants leftmost-longest match
bool endmatch_; // whether match must end at text.end()
StringPiece *submatch_; // submatches to fill in
int nsubmatch_; // # of submatches to fill in
// Search state
const char** cap_; // capture registers
int ncap_; // # of entries in cap_ (always at least 2)
// Number of bits per word of visited_.
static const int VisitedBits = 32;
uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
int nvisited_; // # of words in bitmap
Job *job_; // stack of text positions to explore
int njob_; // # of jobs currently on the stack
int maxjob_; // allocated capacity of job_, in jobs
};
// Builds an idle BitState bound to prog.  All search parameters start
// cleared and all scratch buffers start unallocated; Search() fills them in.
BitState::BitState(Prog* prog)
    : prog_(prog),
      // Search parameters.
      anchored_(false), longest_(false), endmatch_(false),
      submatch_(NULL), nsubmatch_(0),
      // Search state (scratch buffers).
      cap_(NULL), ncap_(0),
      visited_(NULL), nvisited_(0),
      job_(NULL), njob_(0), maxjob_(0) {}
// Releases the scratch buffers.  Each pointer is either NULL (Search() was
// never called) or an array from new[], so delete[] is valid for all three.
BitState::~BitState() {
  delete[] cap_;
  delete[] job_;
  delete[] visited_;
}
// Should the search visit the pair ip, p?
// If so, remember that it was visited so that the next time,
// we don't repeat the visit.
bool BitState::ShouldVisit(int id, const char* p) {
uint n = id * (text_.size() + 1) + (p - text_.begin());
if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
return false;
visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
return true;
}
// Grow the stack.
bool BitState::GrowStack() {
// VLOG(0) << "Reallocate.";
maxjob_ *= 2;
Job* newjob = new Job[maxjob_];
memmove(newjob, job_, njob_*sizeof job_[0]);
delete[] job_;
job_ = newjob;
if (njob_ >= maxjob_) {
LOG(DFATAL) << "Job stack overflow.";
return false;
}
return true;
}
// Push the triple (id, p, arg) onto the stack, growing it if necessary.
void BitState::Push(int id, const char* p, int arg) {
if (njob_ >= maxjob_) {
if (!GrowStack())
return;
}
int op = prog_->inst(id)->opcode();
if (op == kInstFail)
return;
// Only check ShouldVisit when arg == 0.
// When arg > 0, we are continuing a previous visit.
if (arg == 0 && !ShouldVisit(id, p))
return;
Job* j = &job_[njob_++];
j->id = id;
j->p = p;
j->arg = arg;
}
// Try a search from instruction id0 in state p0.
// Return whether it succeeded.
//
// Runs an explicit-stack backtracking loop over job_.  When a match is
// found and nsubmatch_ > 0, the best match so far is written to submatch_:
// the first match found, or, if longest_ is set, the match ending furthest
// to the right.  The caller is expected to have set cap_[0] to p0.
bool BitState::TrySearch(int id0, const char* p0) {
  bool matched = false;
  const char* end = text_.end();
  njob_ = 0;
  Push(id0, p0, 0);
  while (njob_ > 0) {
    // Pop job off stack.
    --njob_;
    int id = job_[njob_].id;
    const char* p = job_[njob_].p;
    int arg = job_[njob_].arg;

    // Optimization: rather than push and pop,
    // code that is going to Push and continue
    // the loop simply updates ip, p, and arg
    // and jumps to CheckAndLoop.  We have to
    // do the ShouldVisit check that Push
    // would have, but we avoid the stack
    // manipulation.
    if (0) {
    CheckAndLoop:
      if (!ShouldVisit(id, p))
        continue;
    }

    // Visit ip, p.
    // VLOG(0) << "Job: " << ip->id() << " "
    //         << (p - text_.begin()) << " " << arg;
    Prog::Inst* ip = prog_->inst(id);
    switch (ip->opcode()) {
      case kInstFail:
      default:
        LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
        return false;

      case kInstAlt:
        // Cannot just
        //   Push(ip->out1(), p, 0);
        //   Push(ip->out(), p, 0);
        // If, during the processing of ip->out(), we encounter
        // ip->out1() via another path, we want to process it then.
        // Pushing it here will inhibit that.  Instead, re-push
        // ip with arg==1 as a reminder to push ip->out1() later.
        switch (arg) {
          case 0:
            Push(id, p, 1);  // come back when we're done
            id = ip->out();
            goto CheckAndLoop;

          case 1:
            // Finished ip->out(); try ip->out1().
            arg = 0;
            id = ip->out1();
            goto CheckAndLoop;
        }
        // Name this opcode in the diagnostic (it previously said
        // kInstCapture, which pointed at the wrong case).
        LOG(DFATAL) << "Bad arg in kInstAlt: " << arg;
        continue;

      case kInstAltMatch:
        // One opcode is byte range; the other leads to match.
        if (ip->greedy(prog_)) {
          // out1 is the match
          Push(ip->out1(), p, 0);
          id = ip->out1();
          p = end;
          goto CheckAndLoop;
        }
        // out is the match - non-greedy
        Push(ip->out(), end, 0);
        id = ip->out();
        goto CheckAndLoop;

      case kInstByteRange: {
        // c stays -1 at end of text, so Matches() is asked about a
        // value that is not a byte.
        int c = -1;
        if (p < end)
          c = *p & 0xFF;
        if (ip->Matches(c)) {
          id = ip->out();
          p++;
          goto CheckAndLoop;
        }
        continue;
      }

      case kInstCapture:
        switch (arg) {
          case 0:
            if (0 <= ip->cap() && ip->cap() < ncap_) {
              // Capture p to register, but save old value.
              Push(id, cap_[ip->cap()], 1);  // come back when we're done
              cap_[ip->cap()] = p;
            }
            // Continue on.
            id = ip->out();
            goto CheckAndLoop;

          case 1:
            // Finished ip->out(); restore the old value.
            // (Here p is the saved register value pushed by case 0,
            // which only pushes when cap() is in range.)
            cap_[ip->cap()] = p;
            continue;
        }
        LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
        continue;

      case kInstEmptyWidth:
        // Give up on this path if any required empty-width flag is not
        // satisfied at p.
        if (ip->empty() & ~Prog::EmptyFlags(context_, p))
          continue;
        id = ip->out();
        goto CheckAndLoop;

      case kInstNop:
        id = ip->out();
        goto CheckAndLoop;

      case kInstMatch: {
        if (endmatch_ && p != text_.end())
          continue;

        // VLOG(0) << "Found match.";
        // We found a match. If the caller doesn't care
        // where the match is, no point going further.
        if (nsubmatch_ == 0)
          return true;

        // Record best match so far.
        // Only need to check end point, because this entire
        // call is only considering one start position.
        matched = true;
        cap_[1] = p;
        if (submatch_[0].data() == NULL ||
            (longest_ && p > submatch_[0].end())) {
          for (int i = 0; i < nsubmatch_; i++)
            submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
        }

        // If going for first match, we're done.
        if (!longest_)
          return true;

        // If we used the entire text, no longer match is possible.
        if (p == text_.end())
          return true;

        // Otherwise, continue on in hope of a longer match.
        continue;
      }
    }
  }
  return matched;
}
// Search text (within context) for prog_.
// Fills submatch[0..nsubmatch) on success and returns whether a match was
// found.  Allocates the scratch buffers, which are freed only by the
// destructor.
bool BitState::Search(const StringPiece& text, const StringPiece& context,
                      bool anchored, bool longest,
                      StringPiece* submatch, int nsubmatch) {
  // Search parameters.  An unset context defaults to the text itself.
  text_ = text;
  context_ = context;
  if (context_.begin() == NULL)
    context_ = text;

  // A program anchored at one end of the context cannot match unless the
  // text reaches that end.
  if (prog_->anchor_start() && context_.begin() != text.begin())
    return false;
  if (prog_->anchor_end() && context_.end() != text.end())
    return false;

  anchored_ = anchored || prog_->anchor_start();
  longest_ = longest || prog_->anchor_end();
  endmatch_ = prog_->anchor_end();
  submatch_ = submatch;
  nsubmatch_ = nsubmatch;
  for (int k = 0; k < nsubmatch_; k++)
    submatch_[k] = NULL;

  // Allocate scratch space: one visited bit per (instruction, text offset)
  // pair, rounded up to whole words.
  nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
  visited_ = new uint32[nvisited_];
  memset(visited_, 0, nvisited_*sizeof visited_[0]);
  // VLOG(0) << "nvisited_ = " << nvisited_;

  // Always keep at least the two registers for the overall match.
  ncap_ = (2*nsubmatch < 2) ? 2 : 2*nsubmatch;
  cap_ = new const char*[ncap_];
  memset(cap_, 0, ncap_*sizeof cap_[0]);

  maxjob_ = 256;
  job_ = new Job[maxjob_];

  if (!anchored_) {
    // Unanchored search, starting from each possible text position.
    // Notice that we have to try the empty string at the end of
    // the text, so the loop runs while start <= text.end(), not <.
    // This looks like it's quadratic in the size of the text,
    // but we are not clearing visited_ between calls to TrySearch,
    // so no work is duplicated and it ends up still being linear.
    const char* start = text.begin();
    while (start <= text.end()) {
      cap_[0] = start;
      if (TrySearch(prog_->start(), start))  // Match must be leftmost; done.
        return true;
      start++;
    }
    return false;
  }

  // Anchored search must start at text.begin().
  cap_[0] = text.begin();
  return TrySearch(prog_->start(), text.begin());
}
// Bit-state search.
// Runs a BitState search of text (within context) and reports whether it
// matched; match[0..nmatch) receives the submatches.
bool Prog::SearchBitState(const StringPiece& text,
                          const StringPiece& context,
                          Anchor anchor,
                          MatchKind kind,
                          StringPiece* match,
                          int nmatch) {
  // If full match, we ask for an anchored longest match
  // and then check that match[0] == text.
  // So make sure match[0] exists.
  StringPiece sp0;
  const bool full = (kind == kFullMatch);
  if (full) {
    anchor = kAnchored;
    if (nmatch < 1) {
      match = &sp0;
      nmatch = 1;
    }
  }

  // Run the search.
  BitState b(this);
  const bool found = b.Search(text, context,
                              anchor == kAnchored,   // anchored
                              kind != kFirstMatch,   // longest
                              match, nmatch);
  if (!found)
    return false;

  // A full match must extend to the end of the text.
  return !(full && match[0].end() != text.end());
}
} // namespace re2

1140
outside/re2/re2/compile.cc Normal file

File diff suppressed because it is too large Load Diff

2115
outside/re2/re2/dfa.cc Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,102 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <string>
#include "util/util.h"
#include "re2/filtered_re2.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
namespace re2 {
// Starts out uncompiled, with no regexps and a fresh, empty prefilter tree
// that this object owns.
FilteredRE2::FilteredRE2()
    : compiled_(false), prefilter_tree_(new PrefilterTree()) {}
// Destroys every RE2 stored by Add(), then the prefilter tree.
FilteredRE2::~FilteredRE2() {
  for (size_t i = 0; i < re2_vec_.size(); i++) {
    delete re2_vec_[i];
  }
  delete prefilter_tree_;
}
// Compiles pattern with options and, on success, appends the resulting RE2
// to re2_vec_ and stores its index in *id.
//
// Returns the RE2's error code.  When compilation fails the RE2 is deleted,
// *id is left untouched, and the failure is logged if options.log_errors()
// is set.
RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
                                const RE2::Options& options, int* id) {
  RE2* re = new RE2(pattern, options);
  RE2::ErrorCode code = re->error_code();

  if (!re->ok()) {
    if (options.log_errors()) {
      // Log the pattern text itself; streaming `re` would print only the
      // address of the RE2 object.
      LOG(ERROR) << "Couldn't compile regular expression, skipping: "
                 << pattern << " due to error " << re->error();
    }
    delete re;
  } else {
    *id = re2_vec_.size();
    re2_vec_.push_back(re);
  }

  return code;
}
// Builds the prefilter tree from every regexp added so far and returns, in
// *atoms, the strings the tree's Compile() produces.  Does nothing except
// log if already compiled or if no regexps were added; in that case *atoms
// is left untouched.
void FilteredRE2::Compile(vector<string>* atoms) {
  const bool nothing_to_do = compiled_ || re2_vec_.empty();
  if (nothing_to_do) {
    LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
    return;
  }

  for (size_t i = 0; i < re2_vec_.size(); i++) {
    prefilter_tree_->Add(Prefilter::FromRE2(re2_vec_[i]));
  }

  atoms->clear();
  prefilter_tree_->Compile(atoms);
  compiled_ = true;
}
// Tries every regexp in the order added, with no prefiltering.
// Returns the index of the first one that partially matches text, or -1.
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
  for (size_t idx = 0; idx < re2_vec_.size(); idx++) {
    if (RE2::PartialMatch(text, *re2_vec_[idx])) {
      return static_cast<int>(idx);
    }
  }
  return -1;
}
int FilteredRE2::FirstMatch(const StringPiece& text,
const vector<int>& atoms) const {
if (!compiled_) {
LOG(DFATAL) << "FirstMatch called before Compile";
return -1;
}
vector<int> regexps;
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
for (int i = 0; i < regexps.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
return regexps[i];
return -1;
}
bool FilteredRE2::AllMatches(
const StringPiece& text,
const vector<int>& atoms,
vector<int>* matching_regexps) const {
matching_regexps->clear();
vector<int> regexps;
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
for (int i = 0; i < regexps.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
matching_regexps->push_back(regexps[i]);
return !matching_regexps->empty();
}
// Thin forwarder to PrefilterTree::RegexpsGivenStrings: the prefilter tree
// fills *passed_regexps from matched_atoms.
void FilteredRE2::RegexpsGivenStrings(
    const vector<int>& matched_atoms,
    vector<int>* passed_regexps) {
  prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
}
// Thin forwarder to PrefilterTree::PrintPrefilter for the regexp with
// index regexpid.
void FilteredRE2::PrintPrefilter(int regexpid) {
  prefilter_tree_->PrintPrefilter(regexpid);
}
} // namespace re2

View File

@ -0,0 +1,101 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
// It provides a prefilter mechanism that helps in cutting down the
// number of regexps that need to be actually searched.
//
// By design, it does not include a string matching engine. This is to
// allow the user of the class to use their favorite string match
// engine. The overall flow is: Add all the regexps using Add, then
// Compile the FilteredRE2. The compile returns strings that need to
// be matched. Note that all returned strings are lowercase. For
// applying regexps to a search text, the caller does the string
// matching using the strings returned. When doing the string match,
// note that the caller has to do that on lower cased version of the
// search text. Then call FirstMatch or AllMatches with a vector of
// indices of strings that were found in the text to get the actual
// regexp matches.
#ifndef RE2_FILTERED_RE2_H_
#define RE2_FILTERED_RE2_H_
#include <vector>
#include "re2/re2.h"
namespace re2 {
using std::vector;
class PrefilterTree;
// Wraps a collection of RE2 objects together with a prefilter tree that is
// used to narrow down which of them need to be run against a text.
class FilteredRE2 {
public:
FilteredRE2();
~FilteredRE2();
// Uses RE2 constructor to create a RE2 object (re). Returns
// re->error_code(). If error_code is other than NoError, then re is
// deleted and not added to re2_vec_.
// On success, *id receives the index of the new regexp; on failure
// *id is not written.
RE2::ErrorCode Add(const StringPiece& pattern,
const RE2::Options& options,
int *id);
// Prepares the regexps added by Add for filtering. Returns a set
// of strings that the caller should check for in candidate texts.
// The returned strings are lowercased. When doing string matching,
// the search text should be lowercased first to find matching
// strings from the set of strings returned by Compile. Call after
// all Add calls are done.
void Compile(vector<string>* strings_to_match);
// Returns the index of the first matching regexp.
// Returns -1 on no match. Can be called prior to Compile.
// Does not do any filtering: simply tries to Match the
// regexps in a loop.
int SlowFirstMatch(const StringPiece& text) const;
// Returns the index of the first matching regexp.
// Returns -1 on no match. Compile has to be called before
// calling this.
// atoms holds the indices of the strings (as returned by Compile)
// that were found in the text.
int FirstMatch(const StringPiece& text,
const vector<int>& atoms) const;
// Returns the indices of all matching regexps in matching_regexps,
// after first clearing matching_regexps.
// The return value is true if at least one regexp matched.
bool AllMatches(const StringPiece& text,
const vector<int>& atoms,
vector<int>* matching_regexps) const;
// The number of regexps added.
int NumRegexps() const { return re2_vec_.size(); }
private:
// Get the individual RE2 objects. Useful for testing.
// regexpid is used as an index into re2_vec_ without a bounds check.
RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
// Print prefilter.
void PrintPrefilter(int regexpid);
// Useful for testing and debugging.
void RegexpsGivenStrings(const vector<int>& matched_atoms,
vector<int>* passed_regexps);
// All the regexps in the FilteredRE2.
// The pointed-to RE2 objects are deleted by the destructor.
vector<RE2*> re2_vec_;
// Has the FilteredRE2 been compiled using Compile()
bool compiled_;
// An AND-OR tree of string atoms used for filtering regexps.
PrefilterTree* prefilter_tree_;
// Copying is disallowed: the copy constructor and assignment operator
// are declared private here (in place of the macro below).
//DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2);
FilteredRE2(const FilteredRE2&);
void operator=(const FilteredRE2&);
};
} // namespace re2
#endif // RE2_FILTERED_RE2_H_

View File

@ -0,0 +1,110 @@
#!/usr/bin/perl
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Generate table entries giving character ranges
# for POSIX/Perl character classes. Rather than
# figure out what the definition is, it is easier to ask
# Perl about each letter from 0-128 and write down
# its answer.
# POSIX bracket classes to tabulate, in the order they are emitted.
@posixclasses = qw(
  [:alnum:] [:alpha:] [:ascii:] [:blank:] [:cntrl:]
  [:digit:] [:graph:] [:lower:] [:print:] [:punct:]
  [:space:] [:upper:] [:word:] [:xdigit:]
);

# Perl escape classes to tabulate.  PrintClass derives the negated,
# uppercase form of each one.
@perlclasses = ('\d', '\s', '\w');
# ComputeClass($class) returns a list of [lo, hi] code-point ranges, found
# by asking Perl whether chr(0) .. chr(128) match "[$class]".
# A final sentinel value of 256 never matches; it closes a range that is
# still open after 128, and such a range is reported as ending at 255.
sub ComputeClass($) {
  my ($class) = @_;
  my $regexp = "[$class]";
  my @ranges;
  my $start = -1;
  foreach my $i (0 .. 128, 256) {
    if ($i <= 128 && chr($i) =~ $regexp) {
      # Inside a run of matching characters; remember where it began.
      $start = $i if $start < 0;
      next;
    }
    # A non-matching character ends the current run, if there is one.
    push @ranges, [$start, $i-1] if $start >= 0;
    $start = -1;
  }
  return @ranges;
}
# PrintClass($cname, $name, @ranges) prints the C++ table
#   static const URange16 code<cname>[] = { ... };
# holding @ranges (each element a [lo, hi] array reference), and returns
# two UGroup initializer strings that refer to it: one for the class itself
# (sign +1) and one for its negation (sign -1).
#
# The negated name is "[:^foo:]" for a POSIX class and the uppercased
# escape (e.g. \D for \d) otherwise.
sub PrintClass($$@) {
  my ($cname, $name, @ranges) = @_;
  print "static const URange16 code${cname}[] = { /* $name */\n";
  foreach my $range (@ranges) {
    printf "\t{ 0x%x, 0x%x },\n", $range->[0], $range->[1];
  }
  print "};\n";
  my $n = @ranges;

  # Double the backslashes so the name survives inside a C++ string literal.
  my $escname = $name;
  $escname =~ s/\\/\\\\/g;

  # Lexical, so that no package-global $negname is created as a side effect.
  my $negname = $escname;
  if ($negname =~ /:/) {
    $negname =~ s/:/:^/;
  } else {
    $negname =~ y/a-z/A-Z/;
  }
  return "{ \"$escname\", +1, code$cname, $n }", "{ \"$negname\", -1, code$cname, $n }";
}
# Running counter used to give every generated code<N>[] table a unique
# suffix across all PrintClasses calls.
my $gen = 0;

# PrintClasses($cname, @classes) prints one range table per class, then the
# <cname>_groups[] array of UGroup entries that refer to those tables, and
# finally the num_<cname>_groups count.
sub PrintClasses($@) {
  my ($cname, @classes) = @_;
  my @entries;
  for my $class (@classes) {
    my @ranges = ComputeClass($class);
    $gen++;
    push @entries, PrintClass($gen, $class, @ranges);
  }
  print "const UGroup ${cname}_groups[] = {\n";
  print "\t$_,\n" for @entries;
  print "};\n";
  my $count = scalar @entries;
  print "const int num_${cname}_groups = $count;\n";
}
# Emit the generated C++ source on standard output: a fixed preamble that
# opens namespace re2, ...
print <<EOF;
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
EOF
# ... the tables for the Perl classes followed by the POSIX classes
# (table numbering from $gen continues across the two calls), ...
PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);
# ... and the line that closes the namespace.
print <<EOF;
} // namespace re2
EOF

View File

@ -0,0 +1,146 @@
#!/usr/bin/python
# coding=utf-8
#
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# See unicode_casefold.h for description of case folding tables.
"""Generate C++ table for Unicode case folding."""
import unicode, sys
_header = """
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
// make_unicode_casefold.py >unicode_casefold.cc
#include "re2/unicode_casefold.h"
namespace re2 {
"""
_trailer = """
} // namespace re2
"""
def _Delta(a, b):
"""Compute the delta for b - a. Even/odd and odd/even
are handled specially, as described above."""
if a+1 == b:
if a%2 == 0:
return 'EvenOdd'
else:
return 'OddEven'
if a == b+1:
if a%2 == 0:
return 'OddEven'
else:
return 'EvenOdd'
return b - a
def _AddDelta(a, delta):
"""Return a + delta, handling EvenOdd and OddEven specially."""
if type(delta) == int:
return a+delta
if delta == 'EvenOdd':
if a%2 == 0:
return a+1
else:
return a-1
if delta == 'OddEven':
if a%2 == 1:
return a+1
else:
return a-1
print >>sys.stderr, "Bad Delta: ", delta
raise "Bad Delta"
def _MakeRanges(pairs):
  """Turn a list like [(65,97), (66, 98), ..., (90,122)]
  into [(65, 90, +32)].

  Each result entry is a list [lo, hi, delta].  A pair extends the most
  recent entry when its code point directly follows the previous one and
  obeys that entry's delta, or when it lies two past the previous one and
  obeys the entry's string delta (the entry's delta is then given a
  'Skip' suffix).  Otherwise the pair starts a new entry."""
  ranges = []
  last = -100

  def extends_run(prev, a, b, run):
    # Next consecutive code point, mapped the way the run's delta says.
    if a == prev+1 and b == _AddDelta(a, run[2]):
      run[1] = a
      return True
    return False

  def extends_skip_run(prev, a, b, run):
    # Every-other-code-point runs exist only for the string deltas.
    if a != prev+2:
      return False
    delta = run[2]
    if type(delta) is not str:
      return False
    if delta.endswith('Skip'):
      base, skipped = delta[:-4], delta
    else:
      base, skipped = delta, delta + 'Skip'
    if b != _AddDelta(a, base):
      return False
    run[1] = a
    run[2] = skipped
    return True

  for a, b in pairs:
    merged = bool(ranges) and (extends_run(last, a, b, ranges[-1]) or
                               extends_skip_run(last, a, b, ranges[-1]))
    if not merged:
      ranges.append([a, a, _Delta(a, b)])
    last = a
  return ranges
# The maximum size of a case-folding group.
# Case folding is implemented in parse.cc by a recursive process
# with a recursion depth equal to the size of the largest
# case-folding group, so it is important that this bound be small.
# The current tables have no group bigger than 4.
# If there are ever groups bigger than 10 or so, it will be
# time to rework the code in parse.cc.
MaxCasefoldGroup = 4
def main():
lowergroups, casegroups = unicode.CaseGroups()
foldpairs = []
seen = {}
for c in casegroups:
if len(c) > MaxCasefoldGroup:
raise unicode.Error("casefold group too long: %s" % (c,))
for i in range(len(c)):
if c[i-1] in seen:
raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
seen[c[i-1]] = True
foldpairs.append([c[i-1], c[i]])
lowerpairs = []
for lower, group in lowergroups.iteritems():
for g in group:
if g != lower:
lowerpairs.append([g, lower])
def printpairs(name, foldpairs):
foldpairs.sort()
foldranges = _MakeRanges(foldpairs)
print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))
print "const CaseFold unicode_%s[] = {" % (name,)
for lo, hi, delta in foldranges:
print "\t{ %d, %d, %s }," % (lo, hi, delta)
print "};"
print "const int num_unicode_%s = %d;" % (name, len(foldranges),)
print ""
print _header
printpairs("casefold", foldpairs)
printpairs("tolower", lowerpairs)
print _trailer
if __name__ == '__main__':
main()

View File

@ -0,0 +1,111 @@
#!/usr/bin/python
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
"""Generate C++ tables for Unicode Script and Category groups."""
import sys
import unicode
_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
"""
_trailer = """
} // namespace re2
"""
n16 = 0
n32 = 0
def MakeRanges(codes):
  """Group consecutive codes: [1,2,3,7,8,9] becomes [[1,3], [7,9]].

  A code extends the newest range only when it is exactly one more
  than the code seen just before it; otherwise it opens a new range.
  """
  spans = []
  prev = -100
  for code in codes:
    if code != prev + 1:
      spans.append([code, code])
    else:
      spans[-1][1] = code
    prev = code
  return spans
def PrintRanges(type, name, ranges):
"""Print the ranges as an array of type named name."""
print "static const %s %s[] = {" % (type, name,)
for lo, hi in ranges:
print "\t{ %d, %d }," % (lo, hi)
print "};"
# def PrintCodes(type, name, codes):
# """Print the codes as an array of type named name."""
# print "static %s %s[] = {" % (type, name,)
# for c in codes:
# print "\t%d," % (c,)
# print "};"
def PrintGroup(name, codes):
  """Print the range tables for one group and return its UGroup literal.

  The codes are split into those below 65536 and those at or above
  it; each non-empty half is printed as its own array.  The returned
  string is the C++ initializer naming those arrays (or "0, 0" for a
  missing half).  Also adds the range counts to the module-level
  n16 / n32 totals.
  """
  # See unicode_groups.h for a description of the data structure.
  global n16
  global n32

  # Split codes into 16-bit ranges and 32-bit ranges.
  range16 = MakeRanges([c for c in codes if c < 65536])
  range32 = MakeRanges([c for c in codes if c >= 65536])

  # Pull singleton ranges out of range16.
  # code16 = [lo for lo, hi in range16 if lo == hi]
  # range16 = [[lo, hi] for lo, hi in range16 if lo != hi]

  n16 += len(range16)
  n32 += len(range32)

  fields = ["{ \"%s\", +1" % (name,)]

  # if len(code16) > 0:
  # PrintCodes("uint16", name+"_code16", code16)
  # ugroup += ", %s_code16, %d" % (name, len(code16))
  # else:
  # ugroup += ", 0, 0"

  for ctype, suffix, spans in (("URange16", "_range16", range16),
                               ("URange32", "_range32", range32)):
    if len(spans) > 0:
      PrintRanges(ctype, name+suffix, spans)
      fields.append(", %s%s, %d" % (name, suffix, len(spans)))
    else:
      fields.append(", 0, 0")
  fields.append(" }")
  return "".join(fields)
def main():
print _header
ugroups = []
for name, codes in unicode.Categories().iteritems():
ugroups.append(PrintGroup(name, codes))
for name, codes in unicode.Scripts().iteritems():
ugroups.append(PrintGroup(name, codes))
print "// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32)
print "const UGroup unicode_groups[] = {";
ugroups.sort()
for ug in ugroups:
print "\t%s," % (ug,)
print "};"
print "const int num_unicode_groups = %d;" % (len(ugroups),)
print _trailer
if __name__ == '__main__':
main()

View File

@ -0,0 +1,185 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Determine whether this library should match PCRE exactly
// for a particular Regexp. (If so, the testing framework can
// check that it does.)
//
// This library matches PCRE except in these cases:
// * the regexp contains a repetition of an empty string,
// like (a*)* or (a*)+. In this case, PCRE will treat
// the repetition sequence as ending with an empty string,
// while this library does not.
// * Perl and PCRE differ on whether \v matches \n.
// For historical reasons, this library implements the Perl behavior.
// * Perl and PCRE allow $ in one-line mode to match either the very
// end of the text or just before a \n at the end of the text.
// This library requires it to match only the end of the text.
// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
// match the end of the text if the last character is a \n.
// This library does allow it.
//
// Regexp::MimicsPCRE checks for any of these conditions.
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Returns whether re might match an empty string.
static bool CanBeEmptyString(Regexp *re);
// Walker class to compute whether library handles a regexp
// exactly as PCRE would. See comment at top for conditions.
class PCREWalker : public Regexp::Walker<bool> {
 public:
  PCREWalker() {}

  // Called after the children of re have been visited; child_args
  // holds their results.  Returns whether this library mimics PCRE
  // for re.
  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
                 int nchild_args);

  // Returns a unchanged after logging the unexpected call.
  bool ShortVisit(Regexp* re, bool a) {
    // Should never be called: we use Walk not WalkExponential.
    LOG(DFATAL) << "PCREWalker::ShortVisit called";
    return a;
  }
};
// Called after visiting each of re's children and accumulating
// the return values in child_args. So child_args contains whether
// this library mimics PCRE for those subexpressions.
bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                           bool* child_args, int nchild_args) {
  // One subexpression that diverges from PCRE is enough to fail.
  int n = 0;
  while (n < nchild_args) {
    if (!child_args[n])
      return false;
    n++;
  }

  // All children are fine; now examine this node itself.
  bool mimics = true;
  switch (re->op()) {
    // Repetition whose operand may match the empty string.
    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
      if (CanBeEmptyString(re->sub()[0]))
        mimics = false;
      break;

    // Same check, but only for unbounded x{n,}.
    case kRegexpRepeat:
      if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
        mimics = false;
      break;

    // \v
    case kRegexpLiteral:
      if (re->rune() == '\v')
        mimics = false;
      break;

    // $ in single-line mode.
    case kRegexpEndText:
    case kRegexpEmptyMatch:
      if (re->parse_flags() & Regexp::WasDollar)
        mimics = false;
      break;

    // ^ in multi-line mode.  No condition needed: in single-line
    // mode ^ becomes kRegexpBeginText.
    case kRegexpBeginLine:
      mimics = false;
      break;

    default:
      break;
  }
  return mimics;
}
// Returns whether this regexp's behavior will mimic PCRE's exactly.
bool Regexp::MimicsPCRE() {
PCREWalker w;
return w.Walk(this, true);
}
// Walker class to compute whether a Regexp can match an empty string.
// It is okay to overestimate. For example, \b\B cannot match an empty
// string, because \b and \B are mutually exclusive, but this isn't
// that smart and will say it can. Spurious empty strings
// will reduce the number of regexps we sanity check against PCRE,
// but they won't break anything.
class EmptyStringWalker : public Regexp::Walker<bool> {
 public:
  EmptyStringWalker() { }

  // Called after the children of re have been visited; child_args
  // holds, per child, whether that child can match an empty string.
  // Returns the same answer for re.
  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                 bool* child_args, int nchild_args);

  // Logs the unexpected call and returns a unchanged.
  bool ShortVisit(Regexp* re, bool a) {
    // Should never be called: we use Walk not WalkExponential.
    LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
    return a;
  }

 private:
  DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
};
// Called after visiting re's children. child_args contains the return
// value from each of the children's PostVisits (i.e., whether each child
// can match an empty string). Returns whether this clause can match an
// empty string.
bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                                  bool* child_args, int nchild_args) {
  switch (re->op()) {
    // Never empty.
    case kRegexpNoMatch:
    case kRegexpLiteral:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpCharClass:
    case kRegexpLiteralString:
      return false;

    // Always empty (the assertions are empty whenever they match),
    // or able to match zero repetitions.
    case kRegexpEmptyMatch:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpNoWordBoundary:
    case kRegexpWordBoundary:
    case kRegexpBeginText:
    case kRegexpEndText:
    case kRegexpStar:
    case kRegexpQuest:
    case kRegexpHaveMatch:
      return true;

    // Concatenation is empty only when every piece can be.
    case kRegexpConcat: {
      int i = 0;
      while (i < nchild_args && child_args[i])
        i++;
      return i >= nchild_args;
    }

    // Alternation is empty when at least one branch can be.
    case kRegexpAlternate: {
      int i = 0;
      while (i < nchild_args && !child_args[i])
        i++;
      return i < nchild_args;
    }

    // x+ and (x) are exactly as empty as x.
    case kRegexpPlus:
    case kRegexpCapture:
      return child_args[0];

    // x{n,m} is empty if x can be, or if zero copies are allowed.
    case kRegexpRepeat:
      return re->min() == 0 || child_args[0];
  }
  return false;
}
// Returns whether re can match an empty string.
static bool CanBeEmptyString(Regexp* re) {
  // Delegate to the walker; the top-level argument is true.
  EmptyStringWalker walker;
  const bool can_be_empty = walker.Walk(re, true);
  return can_be_empty;
}
} // namespace re2

709
outside/re2/re2/nfa.cc Normal file
View File

@ -0,0 +1,709 @@
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc.
//
// Prog::SearchNFA, an NFA search.
// This is an actual NFA like the theorists talk about,
// not the pseudo-NFA found in backtracking regexp implementations.
//
// IMPLEMENTATION
//
// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
// which is a variant of the one described in Thompson's 1968 CACM paper.
// See http://swtch.com/~rsc/regexp/ for various history. The main feature
// over the DFA implementation is that it tracks submatch boundaries.
//
// When the choice of submatch boundaries is ambiguous, this particular
// implementation makes the same choices that traditional backtracking
// implementations (in particular, Perl and PCRE) do.
// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
// time in the length of the input.
//
// Like Thompson's original machine and like the DFA implementation, this
// implementation notices a match only once it is one byte past it.
#include "re2/prog.h"
#include "re2/regexp.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
namespace re2 {
// Runs a compiled Prog as an NFA over a piece of text, tracking
// submatch boundaries for each thread of execution.
class NFA {
 public:
  NFA(Prog* prog);
  ~NFA();

  // Searches for a matching string.
  //   * If anchored is true, only considers matches starting at offset.
  //     Otherwise finds leftmost match at or after offset.
  //   * If longest is true, returns the longest match starting
  //     at the chosen start point.  Otherwise returns the so-called
  //     left-biased match, the one traditional backtracking engines
  //     (like Perl and PCRE) find.
  // Records submatch boundaries in submatch[1..nsubmatch-1].
  // Submatch[0] is the entire match.  When there is a choice in
  // which text matches each subexpression, the submatch boundaries
  // are chosen to match what a backtracking implementation would choose.
  bool Search(const StringPiece& text, const StringPiece& context,
              bool anchored, bool longest,
              StringPiece* submatch, int nsubmatch);

  // Non-zero enables the fprintf(stderr, ...) tracing in this file.
  static const int Debug = 0;

 private:
  // One thread of execution: an instruction id plus its capture
  // registers.  The id slot doubles as the free-list link.
  struct Thread {
    union {
      int id;
      Thread* next;  // when on free list
    };
    const char** capture;
  };

  // State for explicit stack in AddToThreadq.
  struct AddState {
    int id;  // Inst to process
    int j;
    const char* cap_j;  // if j>=0, set capture[j] = cap_j before processing ip

    AddState()
      : id(0), j(-1), cap_j(NULL) {}
    explicit AddState(int id)
      : id(id), j(-1), cap_j(NULL) {}
    AddState(int id, const char* cap_j, int j)
      : id(id), j(j), cap_j(cap_j) {}
  };

  // Threadq is a list of threads.  The list is sorted by the order
  // in which Perl would explore that particular state -- the earlier
  // choices appear earlier in the list.
  typedef SparseArray<Thread*> Threadq;

  // Takes a Thread from the free list, or allocates a new one.
  inline Thread* AllocThread();
  // Puts a Thread back on the free list; NULL is ignored.
  inline void FreeThread(Thread*);

  // Add id (or its children, following unlabeled arrows)
  // to the workqueue q with associated capture info.
  void AddToThreadq(Threadq* q, int id, int flag,
                    const char* p, const char** capture);

  // Run runq on byte c, appending new states to nextq.
  // Updates matched_ and match_ as new, better matches are found.
  // p is the position of byte c in the input string (Search passes
  // the position just before the current one), used when processing
  // capturing parens.
  // flag is the bitwise or of Bol, Eol, etc., specifying whether
  // ^, $ and \b match the current input point (after c).
  inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);

  // Returns text version of capture information, for debugging.
  string FormatCapture(const char** capture);

  // Copies ncapture_ capture registers from src to dst.
  inline void CopyCapture(const char** dst, const char** src);

  // Computes whether all matches must begin with the same first
  // byte, and if so, returns that byte.  If not, returns -1.
  int ComputeFirstByte();

  Prog* prog_;            // underlying program
  int start_;             // start instruction in program
  int ncapture_;          // number of submatches to track
  bool longest_;          // whether searching for longest match
  bool endmatch_;         // whether match must end at text.end()
  const char* btext_;     // beginning of text being matched (for FormatCapture)
  const char* etext_;     // end of text being matched (for endmatch_)
  Threadq q0_, q1_;       // pre-allocated for Search.
  const char** match_;    // best match so far
  bool matched_;          // any match so far?
  AddState* astack_;      // pre-allocated for AddToThreadq
  int nastack_;           // number of entries in astack_
  int first_byte_;        // required first byte for match, or -1 if none

  Thread* free_threads_;  // free list

  DISALLOW_EVIL_CONSTRUCTORS(NFA);
};
// Prepares an NFA for prog: sizes both thread queues and the
// AddToThreadq stack from the program size and precomputes the
// required first byte, if any.
NFA::NFA(Prog* prog)
    : prog_(prog),
      start_(prog->start()),
      ncapture_(0),
      longest_(false),
      endmatch_(false),
      btext_(NULL),
      etext_(NULL),
      match_(NULL),
      matched_(false),
      free_threads_(NULL) {
  q0_.resize(prog_->size());
  q1_.resize(prog_->size());
  // Two stack slots per instruction; see AddToThreadq.
  nastack_ = 2*prog_->size();
  astack_ = new AddState[nastack_];
  // Needs prog_ and start_, both set above.
  first_byte_ = ComputeFirstByte();
}
// Releases the match buffer, the AddToThreadq stack and every
// Thread (with its capture array) parked on the free list.
NFA::~NFA() {
  delete[] match_;
  delete[] astack_;
  Thread* t = free_threads_;
  while (t != NULL) {
    // Grab the link before the node goes away.
    Thread* following = t->next;
    delete[] t->capture;
    delete t;
    t = following;
  }
}
// Pushes t onto the head of the free list.  A NULL t is ignored.
void NFA::FreeThread(Thread *t) {
  if (t != NULL) {
    t->next = free_threads_;
    free_threads_ = t;
  }
}
// Returns a Thread to use: the head of the free list when there is
// one, otherwise a new Thread with a capture array of ncapture_
// entries.
NFA::Thread* NFA::AllocThread() {
  if (free_threads_ != NULL) {
    Thread* reused = free_threads_;
    free_threads_ = reused->next;
    return reused;
  }
  Thread* fresh = new Thread;
  fresh->capture = new const char*[ncapture_];
  return fresh;
}
// Copies the capture registers from src to dst, one
// (begin, end) pair at a time, up to ncapture_.
void NFA::CopyCapture(const char** dst, const char** src) {
  int k = 0;
  while (k < ncapture_) {
    dst[k] = src[k];
    dst[k+1] = src[k+1];
    k += 2;
  }
}
// Follows all empty arrows from id0 and enqueues all the states reached.
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
// The pointer p is the current input position, and capture is the
// current set of match boundaries.
void NFA::AddToThreadq(Threadq* q, int id0, int flag,
                       const char* p, const char** capture) {
  // Instruction id 0 is never enqueued.
  if (id0 == 0)
    return;

  // Astack_ is pre-allocated to avoid resize operations.
  // It has room for 2*prog_->size() entries, which is enough:
  // Each inst in prog can be processed at most once,
  // pushing at most two entries on stk.

  int nstk = 0;
  AddState* stk = astack_;
  stk[nstk++] = AddState(id0);

  while (nstk > 0) {
    DCHECK_LE(nstk, nastack_);
    // a refers to a stack slot that later pushes may overwrite;
    // everything needed from it is read before any push below.
    const AddState& a = stk[--nstk];
    // Entries carrying j >= 0 restore (or set) a capture register.
    if (a.j >= 0)
      capture[a.j] = a.cap_j;

    int id = a.id;
    if (id == 0)
      continue;
    // Already reached through an earlier path: leave that entry alone.
    if (q->has_index(id)) {
      if (Debug)
        fprintf(stderr, "  [%d%s]\n", id, FormatCapture(capture).c_str());
      continue;
    }

    // Create entry in q no matter what.  We might fill it in below,
    // or we might not.  Even if not, it is necessary to have it,
    // so that we don't revisit id during the traversal.
    q->set_new(id, NULL);

    Thread** tp = &q->find(id)->second;
    int j;
    Thread* t;
    Prog::Inst* ip = prog_->inst(id);
    switch (ip->opcode()) {
    default:
      LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
      break;

    case kInstFail:
      break;

    case kInstAltMatch:
      // Save state; will pick up at next byte.
      t = AllocThread();
      t->id = id;
      CopyCapture(t->capture, capture);
      *tp = t;
      // fall through

    case kInstAlt:
      // Explore alternatives.
      // out1 is pushed first so that out is popped, and therefore
      // explored, first.
      stk[nstk++] = AddState(ip->out1());
      stk[nstk++] = AddState(ip->out());
      break;

    case kInstNop:
      // Continue on.
      stk[nstk++] = AddState(ip->out());
      break;

    case kInstCapture:
      // Registers at or beyond ncapture_ are not tracked.
      if ((j=ip->cap()) < ncapture_) {
        // Push a dummy whose only job is to restore capture[j]
        // once we finish exploring this possibility.
        stk[nstk++] = AddState(0, capture[j], j);

        // Record capture.
        capture[j] = p;
      }
      stk[nstk++] = AddState(ip->out());
      break;

    case kInstMatch:
    case kInstByteRange:
      // Save state; will pick up at next byte.
      t = AllocThread();
      t->id = id;
      CopyCapture(t->capture, capture);
      *tp = t;
      if (Debug)
        fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
      break;

    case kInstEmptyWidth:
      // Continue on if we have all the right flag bits.
      if (ip->empty() & ~flag)
        break;
      stk[nstk++] = AddState(ip->out());
      break;
    }
  }
}
// Run runq on byte c, appending new states to nextq.
// Updates match as new, better matches are found.
// p is position of the byte c in the input string,
// used when processing capturing parens.
// flag is the bitwise or of Bol, Eol, etc., specifying whether
// ^, $ and \b match the current input point (after c).
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
  nextq->clear();

  for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
    Thread* t = i->second;
    // Placeholder entries created by AddToThreadq carry no thread.
    if (t == NULL)
      continue;

    if (longest_) {
      // Can skip any threads started after our current best match.
      if (matched_ && match_[0] < t->capture[0]) {
        FreeThread(t);
        continue;
      }
    }

    int id = t->id;
    Prog::Inst* ip = prog_->inst(id);

    switch (ip->opcode()) {
      default:
        // Should only see the values handled below.
        LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
        break;

      case kInstByteRange:
        // The byte is consumed, so successors start at p+1.
        if (ip->Matches(c))
          AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
        break;

      case kInstAltMatch:
        // Only the first thread in runq may take the shortcut.
        if (i != runq->begin())
          break;
        // The match is ours if we want it.
        if (ip->greedy(prog_) || longest_) {
          CopyCapture((const char**)match_, t->capture);
          FreeThread(t);
          // Release every remaining thread before abandoning runq.
          for (++i; i != runq->end(); ++i)
            FreeThread(i->second);
          runq->clear();
          matched_ = true;
          // Return the instruction Search should continue from.
          if (ip->greedy(prog_))
            return ip->out1();
          return ip->out();
        }
        break;

      case kInstMatch:
        if (endmatch_ && p != etext_)
          break;

        const char* old = t->capture[1];  // previous end pointer
        t->capture[1] = p;
        if (longest_) {
          // Leftmost-longest mode: save this match only if
          // it is either farther to the left or at the same
          // point but longer than an existing match.
          if (!matched_ || t->capture[0] < match_[0] ||
              (t->capture[0] == match_[0] && t->capture[1] > match_[1]))
            CopyCapture((const char**)match_, t->capture);
        } else {
          // Leftmost-biased mode: this match is by definition
          // better than what we've already found (see next line).
          CopyCapture((const char**)match_, t->capture);

          // Cut off the threads that can only find matches
          // worse than the one we just found: don't run the
          // rest of the current Threadq.
          // NOTE(review): old was read from capture[1] but is written
          // to capture[0]; t is freed immediately afterwards, so this
          // looks harmless -- confirm before relying on it.
          t->capture[0] = old;
          FreeThread(t);
          for (++i; i != runq->end(); ++i)
            FreeThread(i->second);
          runq->clear();
          matched_ = true;
          return 0;
        }
        // NOTE(review): same capture[1]/capture[0] mismatch as above;
        // t is freed after the switch.
        t->capture[0] = old;
        matched_ = true;
        break;
    }
    // Every thread taken from runq is returned to the free list.
    FreeThread(t);
  }
  runq->clear();
  // No shortcut found.
  return 0;
}
// Renders the capture registers as a run of "(begin,end)" pairs of
// offsets from btext_, with '?' standing in for an unset register.
// An unset begin prints as "(?,?)" without looking at the end.
string NFA::FormatCapture(const char** capture) {
  string out;
  int k = 0;
  while (k < ncapture_) {
    if (capture[k] == NULL) {
      StringAppendF(&out, "(?,?)");
    } else {
      const int begin = (int)(capture[k] - btext_);
      if (capture[k+1] == NULL)
        StringAppendF(&out, "(%d,?)", begin);
      else
        StringAppendF(&out, "(%d,%d)", begin, (int)(capture[k+1] - btext_));
    }
    k += 2;
  }
  return out;
}
// Returns whether haystack contains needle's memory.
static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
  // needle must not start before haystack ...
  if (needle.begin() < haystack.begin())
    return false;
  // ... and must not run past its end.
  return needle.end() <= haystack.end();
}
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch) {
if (start_ == 0)
return false;
StringPiece context = const_context;
if (context.begin() == NULL)
context = text;
if (!StringPieceContains(context, text)) {
LOG(FATAL) << "Bad args: context does not contain text "
<< reinterpret_cast<const void*>(context.begin())
<< "+" << context.size() << " "
<< reinterpret_cast<const void*>(text.begin())
<< "+" << text.size();
return false;
}
if (prog_->anchor_start() && context.begin() != text.begin())
return false;
if (prog_->anchor_end() && context.end() != text.end())
return false;
anchored |= prog_->anchor_start();
if (prog_->anchor_end()) {
longest = true;
endmatch_ = true;
etext_ = text.end();
}
if (nsubmatch < 0) {
LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
return false;
}
// Save search parameters.
ncapture_ = 2*nsubmatch;
longest_ = longest;
if (nsubmatch == 0) {
// We need to maintain match[0], both to distinguish the
// longest match (if longest is true) and also to tell
// whether we've seen any matches at all.
ncapture_ = 2;
}
match_ = new const char*[ncapture_];
matched_ = false;
memset(match_, 0, ncapture_*sizeof match_[0]);
// For debugging prints.
btext_ = context.begin();
if (Debug) {
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
text.as_string().c_str(), context.as_string().c_str(), anchored,
longest);
}
// Set up search.
Threadq* runq = &q0_;
Threadq* nextq = &q1_;
runq->clear();
nextq->clear();
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
const char* bp = context.begin();
int c = -1;
int wasword = 0;
if (text.begin() > context.begin()) {
c = text.begin()[-1] & 0xFF;
wasword = Prog::IsWordChar(c);
}
// Loop over the text, stepping the machine.
for (const char* p = text.begin();; p++) {
// Check for empty-width specials.
int flag = 0;
// ^ and \A
if (p == context.begin())
flag |= kEmptyBeginText | kEmptyBeginLine;
else if (p <= context.end() && p[-1] == '\n')
flag |= kEmptyBeginLine;
// $ and \z
if (p == context.end())
flag |= kEmptyEndText | kEmptyEndLine;
else if (p < context.end() && p[0] == '\n')
flag |= kEmptyEndLine;
// \b and \B
int isword = 0;
if (p < context.end())
isword = Prog::IsWordChar(p[0] & 0xFF);
if (isword != wasword)
flag |= kEmptyWordBoundary;
else
flag |= kEmptyNonWordBoundary;
if (Debug) {
fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->second;
if (t == NULL)
continue;
fprintf(stderr, " %d%s", t->id,
FormatCapture((const char**)t->capture).c_str());
}
fprintf(stderr, "\n");
}
// Process previous character (waited until now to avoid
// repeating the flag computation above).
// This is a no-op the first time around the loop, because
// runq is empty.
int id = Step(runq, nextq, c, flag, p-1);
DCHECK_EQ(runq->size(), 0);
swap(nextq, runq);
nextq->clear();
if (id != 0) {
// We're done: full match ahead.
p = text.end();
for (;;) {
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
break;
case kInstCapture:
match_[ip->cap()] = p;
id = ip->out();
continue;
case kInstNop:
id = ip->out();
continue;
case kInstMatch:
match_[1] = p;
matched_ = true;
break;
case kInstEmptyWidth:
if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
break;
}
id = ip->out();
continue;
}
break;
}
break;
}
if (p > text.end())
break;
// Start a new thread if there have not been any matches.
// (No point in starting a new thread if there have been
// matches, since it would be to the right of the match
// we already found.)
if (!matched_ && (!anchored || p == text.begin())) {
// If there's a required first byte for an unanchored search
// and we're not in the middle of any possible matches,
// use memchr to search for the byte quickly.
if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
p < text.end() && (p[0] & 0xFF) != first_byte_) {
p = reinterpret_cast<const char*>(memchr(p, first_byte_,
text.end() - p));
if (p == NULL) {
p = text.end();
isword = 0;
} else {
isword = Prog::IsWordChar(p[0] & 0xFF);
}
flag = Prog::EmptyFlags(context, p);
}
// Steal match storage (cleared but unused as of yet)
// temporarily to hold match boundaries for new thread.
match_[0] = p;
AddToThreadq(runq, start_, flag, p, match_);
match_[0] = NULL;
}
// If all the threads have died, stop early.
if (runq->size() == 0) {
if (Debug)
fprintf(stderr, "dead\n");
break;
}
if (p == text.end())
c = 0;
else
c = *p & 0xFF;
wasword = isword;
// Will run step(runq, nextq, c, ...) on next iteration. See above.
}
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
FreeThread(i->second);
if (matched_) {
for (int i = 0; i < nsubmatch; i++)
submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
if (Debug)
fprintf(stderr, "match (%d,%d)\n",
static_cast<int>(match_[0] - btext_),
static_cast<int>(match_[1] - btext_));
return true;
}
VLOG(1) << "No matches found";
return false;
}
// Computes whether all successful matches have a common first byte,
// and if so, returns that byte. If not, returns -1.
int NFA::ComputeFirstByte() {
if (start_ == 0)
return -1;
int b = -1; // first byte, not yet computed
typedef SparseSet Workq;
Workq q(prog_->size());
q.insert(start_);
for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
int id = *it;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
break;
case kInstMatch:
// The empty string matches: no first byte.
return -1;
case kInstByteRange:
// Must match only a single byte
if (ip->lo() != ip->hi())
return -1;
if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
return -1;
// If we haven't seen any bytes yet, record it;
// otherwise must match the one we saw before.
if (b == -1)
b = ip->lo();
else if (b != ip->lo())
return -1;
break;
case kInstNop:
case kInstCapture:
case kInstEmptyWidth:
// Continue on.
// Ignore ip->empty() flags for kInstEmptyWidth
// in order to be as conservative as possible
// (assume all possible empty-width flags are true).
if (ip->out())
q.insert(ip->out());
break;
case kInstAlt:
case kInstAltMatch:
// Explore alternatives.
if (ip->out())
q.insert(ip->out());
if (ip->out1())
q.insert(ip->out1());
break;
case kInstFail:
break;
}
}
return b;
}
bool
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch) {
if (NFA::Debug)
Dump();
NFA nfa(this);
StringPiece sp;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch == 0) {
match = &sp;
nmatch = 1;
}
}
if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
} // namespace re2

614
outside/re2/re2/onepass.cc Normal file
View File

@ -0,0 +1,614 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc.
//
// Prog::SearchOnePass is an efficient implementation of
// regular expression search with submatch tracking for
// what I call "one-pass regular expressions". (An alternate
// name might be "backtracking-free regular expressions".)
//
// One-pass regular expressions have the property that
// at each input byte during an anchored match, there may be
// multiple alternatives but only one can proceed for any
// given input byte.
//
// For example, the regexp /x*yx*/ is one-pass: you read
// x's until a y, then you read the y, then you keep reading x's.
// At no point do you have to guess what to do or back up
// and try a different guess.
//
// On the other hand, /x*x/ is not one-pass: when you're
// looking at an input "x", it's not clear whether you should
// use it to extend the x* or as the final x.
//
// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
//
// A simple intuition for identifying one-pass regular expressions
// is that it's always immediately obvious when a repetition ends.
// It must also be immediately obvious which branch of an | to take:
//
// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
//
// The NFA-based search in nfa.cc does some bookkeeping to
// avoid the need for backtracking and its associated exponential blowup.
// But if we have a one-pass regular expression, there is no
// possibility of backtracking, so there is no need for the
// extra bookkeeping. Hence, this code.
//
// On a one-pass regular expression, the NFA code in nfa.cc
// runs at about 1/20 of the backtracking-based PCRE speed.
// In contrast, the code in this file runs at about the same
// speed as PCRE.
//
// One-pass regular expressions get used a lot when RE is
// used for parsing simple strings, so it pays off to
// notice them and handle them efficiently.
//
// See also Anne Brüggemann-Klein and Derick Wood,
// "One-unambiguous regular languages", Information and Computation 142(2).
#include <string.h>
#include <map>
#include "util/util.h"
#include "util/arena.h"
#include "util/sparse_set.h"
#include "re2/prog.h"
#include "re2/stringpiece.h"
namespace re2 {
static const int Debug = 0;
// The key insight behind this implementation is that the
// non-determinism in an NFA for a one-pass regular expression
// is contained. To explain what that means, first a
// refresher about what regular expression programs look like
// and how the usual NFA execution runs.
//
// In a regular expression program, only the kInstByteRange
// instruction processes an input byte c and moves on to the
// next byte in the string (it does so if c is in the given range).
// The kInstByteRange instructions correspond to literal characters
// and character classes in the regular expression.
//
// The kInstAlt instructions are used as wiring to connect the
// kInstByteRange instructions together in interesting ways when
// implementing | + and *.
// The kInstAlt instruction forks execution, like a goto that
// jumps to ip->out() and ip->out1() in parallel. Each of the
// resulting computation paths is called a thread.
//
// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
// are interesting in their own right but like kInstAlt they don't
// advance the input pointer. Only kInstByteRange does.
//
// The automaton execution in nfa.cc runs all the possible
// threads of execution in lock-step over the input. To process
// a particular byte, each thread gets run until it either dies
// or finds a kInstByteRange instruction matching the byte.
// If the latter happens, the thread stops just past the
// kInstByteRange instruction (at ip->out()) and waits for
// the other threads to finish processing the input byte.
// Then, once all the threads have processed that input byte,
// the whole process repeats. The kInstAlt state instruction
// might create new threads during input processing, but no
// matter what, all the threads stop after a kInstByteRange
// and wait for the other threads to "catch up".
// Running in lock step like this ensures that the NFA reads
// the input string only once.
//
// Each thread maintains its own set of capture registers
// (the string positions at which it executed the kInstCapture
// instructions corresponding to capturing parentheses in the
// regular expression). Repeated copying of the capture registers
// is the main performance bottleneck in the NFA implementation.
//
// A regular expression program is "one-pass" if, no matter what
// the input string, there is only one thread that makes it
// past a kInstByteRange instruction at each input byte. This means
// that there is in some sense only one active thread throughout
// the execution. Other threads might be created during the
// processing of an input byte, but they are ephemeral: only one
// thread is left to start processing the next input byte.
// This is what I meant above when I said the non-determinism
// was "contained".
//
// To execute a one-pass regular expression program, we can build
// a DFA (no non-determinism) that has at most as many states as
// the NFA (compare this to the possibly exponential number of states
// in the general case). Each state records, for each possible
// input byte, the next state along with the conditions required
// before entering that state -- empty-width flags that must be true
// and capture operations that must be performed. It also records
// the set of conditions required to finish a match at that
// point in the input rather than process the next byte.
// A state in the one-pass NFA (aka DFA) - just an array of actions.
struct OneState;
// A state in the one-pass NFA - just an array of actions indexed
// by the bytemap_[] of the next input byte. (The bytemap
// maps next input bytes into equivalence classes, to reduce
// the memory footprint.)
//
// action[] is declared with a single element, but IsOnePass() sizes
// each state as sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32),
// so every state really carries bytemap_range_ actions stored
// contiguously after matchcond. States are therefore addressed by
// byte offset (see IndexToNode), never by OneState array indexing.
struct OneState {
uint32 matchcond; // conditions to match right now.
uint32 action[1];
};
// The uint32 conditions in the action are a combination of
// condition and capture bits and the next state. The bottom 16 bits
// are the condition and capture bits, and the top 16 are the index of
// the next state.
//
// Bits 0-5 are the empty-width flags from prog.h.
// Bit 6 is kMatchWins, which means the match takes
// priority over moving to next in a first-match search.
// The remaining bits mark capture registers that should
// be set to the current input position. The capture bits
// start at index 2, since the search loop can take care of
// cap[0], cap[1] (the overall match position).
// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
// No input position can satisfy both kEmptyWordBoundary
// and kEmptyNonWordBoundary, so we can use that as a sentinel
// instead of needing an extra bit.
// Bit layout of an action / matchcond word, derived from the constants below:
//   bits 0-5    empty-width flags
//   bit  6      kMatchWins
//   bits 7-14   capture bits for cap[2]..cap[9]
//   bits 16-31  index of the next state
static const int kIndexShift = 16; // number of bits below index
static const int kEmptyShift = 6; // number of empty flags in prog.h
// First bit used for a capture register: 6 + 1 = 7.
static const int kRealCapShift = kEmptyShift + 1;
// Number of capture bits, rounded down to an even count: (16 - 7) / 2 * 2 = 8.
static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
// Parameters used to skip over cap[0], cap[1].
// kCapShift is 5, so that (1 << kCapShift << i) for i >= 2 lands on bit 7 and up.
static const int kCapShift = kRealCapShift - 2;
// Total capture pointers including cap[0], cap[1]: 8 + 2 = 10.
static const int kMaxCap = kRealMaxCap + 2;
static const uint32 kMatchWins = 1 << kEmptyShift;
// Mask selecting the kRealMaxCap capture bits starting at kRealCapShift.
static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
// Both word-boundary flags at once can never be satisfied; used as a sentinel.
static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
// Check, at compile time, that prog.h agrees with math above.
// This function is never called; it exists only to host the
// COMPILE_ASSERTs, which fail the build if the constants drift apart.
void OnePass_Checks() {
// The empty-width flags must occupy exactly the low kEmptyShift bits.
COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags,
kEmptyShift_disagrees_with_kEmptyAllFlags);
// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2,
kMaxCap_disagrees_with_kMaxOnePassCapture);
}
// Reports whether every empty-width flag requested in cond holds at
// position p within context. Non-flag bits of cond are ignored.
static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
  const uint32 flags_here = Prog::EmptyFlags(context, p);
  const uint32 unmet = cond & kEmptyAllFlags & ~flags_here;
  return unmet == 0;
}
// Records p in every capture register whose bit is set in cond.
// Registers 0 and 1 are skipped: the search loop maintains them itself.
// Only registers below ncap are considered.
static void ApplyCaptures(uint32 cond, const char* p,
                          const char** cap, int ncap) {
  for (int reg = 2; reg < ncap; ++reg) {
    const uint32 bit = 1 << kCapShift << reg;
    if ((cond & bit) != 0)
      cap[reg] = p;
  }
}
// Returns the state stored at byte offset statesize*nodeindex in nodes.
// States are variable-sized (see OneState), so plain array indexing
// on OneState* would compute the wrong address.
static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
                                    int nodeindex) {
  volatile uint8* where = nodes + statesize*nodeindex;
  return reinterpret_cast<OneState*>(const_cast<uint8*>(where));
}
// Runs the one-pass automaton built by IsOnePass() over text.
//
//   text           the bytes to match against.
//   const_context  the surrounding text used to evaluate empty-width
//                  flags; if its begin() is NULL, text is used instead.
//   anchor, kind   the search must be anchored (anchor == kAnchored) or a
//                  full match (kind == kFullMatch); anything else is
//                  reported via LOG(DFATAL) and fails.
//   match, nmatch  on success, match[0..nmatch-1] receive the submatches.
//                  nmatch may be at most kMaxCap/2, because the capture
//                  registers live in fixed-size arrays of kMaxCap pointers.
//
// Returns true if a match was found, false otherwise (including misuse).
bool Prog::SearchOnePass(const StringPiece& text,
                         const StringPiece& const_context,
                         Anchor anchor, MatchKind kind,
                         StringPiece* match, int nmatch) {
  if (anchor != kAnchored && kind != kFullMatch) {
    LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
    return false;
  }

  // cap[] and matchcap[] below hold only kMaxCap pointers and the
  // action words only have capture bits for that many registers.
  // A larger nmatch would write past the arrays, so refuse it.
  if (nmatch > kMaxCap/2) {
    LOG(DFATAL) << "SearchOnePass called with too many submatches: "
                << nmatch;
    return false;
  }

  // Make sure we have at least cap[1],
  // because we use it to tell if we matched.
  int ncap = 2*nmatch;
  if (ncap < 2)
    ncap = 2;

  const char* cap[kMaxCap];
  for (int i = 0; i < ncap; i++)
    cap[i] = NULL;

  const char* matchcap[kMaxCap];
  for (int i = 0; i < ncap; i++)
    matchcap[i] = NULL;

  StringPiece context = const_context;
  if (context.begin() == NULL)
    context = text;
  if (anchor_start() && context.begin() != text.begin())
    return false;
  if (anchor_end() && context.end() != text.end())
    return false;
  if (anchor_end())
    kind = kFullMatch;

  // State and act are marked volatile to
  // keep the compiler from re-ordering the
  // memory accesses walking over the NFA.
  // This is worth about 5%.
  volatile OneState* state = onepass_start_;
  volatile uint8* nodes = onepass_nodes_;
  volatile uint32 statesize = onepass_statesize_;
  uint8* bytemap = bytemap_;
  const char* bp = text.begin();
  const char* ep = text.end();
  const char* p;
  bool matched = false;
  matchcap[0] = bp;
  cap[0] = bp;
  uint32 nextmatchcond = state->matchcond;
  for (p = bp; p < ep; p++) {
    int c = bytemap[*p & 0xFF];
    uint32 matchcond = nextmatchcond;
    uint32 cond = state->action[c];

    // Determine whether we can reach act->next.
    // If so, advance state and nextmatchcond.
    if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
      uint32 nextindex = cond >> kIndexShift;
      state = IndexToNode(nodes, statesize, nextindex);
      nextmatchcond = state->matchcond;
    } else {
      state = NULL;
      nextmatchcond = kImpossible;
    }

    // This code section is carefully tuned.
    // The goto sequence is about 10% faster than the
    // obvious rewrite as a large if statement in the
    // ASCIIMatchRE2 and DotMatchRE2 benchmarks.

    // Saving the match capture registers is expensive.
    // Is this intermediate match worth thinking about?

    // Not if we want a full match.
    if (kind == kFullMatch)
      goto skipmatch;

    // Not if it's impossible.
    if (matchcond == kImpossible)
      goto skipmatch;

    // Not if the possible match is beaten by the certain
    // match at the next byte. When this test is useless
    // (e.g., HTTPPartialMatchRE2) it slows the loop by
    // about 10%, but when it avoids work (e.g., DotMatchRE2),
    // it cuts the loop execution by about 45%.
    if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
      goto skipmatch;

    // Finally, the match conditions must be satisfied.
    if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
      for (int i = 2; i < 2*nmatch; i++)
        matchcap[i] = cap[i];
      if (nmatch > 1 && (matchcond & kCapMask))
        ApplyCaptures(matchcond, p, matchcap, ncap);
      matchcap[1] = p;
      matched = true;

      // If we're in longest match mode, we have to keep
      // going and see if we find a longer match.
      // In first match mode, we can stop if the match
      // takes priority over the next state for this input byte.
      // That bit is per-input byte and thus in cond, not matchcond.
      if (kind == kFirstMatch && (cond & kMatchWins))
        goto done;
    }

  skipmatch:
    // A NULL state means the transition on this byte was not possible.
    if (state == NULL)
      goto done;
    if ((cond & kCapMask) && nmatch > 1)
      ApplyCaptures(cond, p, cap, ncap);
  }

  // Look for match at end of input.
  // (state is non-NULL here: a NULL state leaves the loop via goto done.)
  {
    uint32 matchcond = state->matchcond;
    if (matchcond != kImpossible &&
        ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
      if (nmatch > 1 && (matchcond & kCapMask))
        ApplyCaptures(matchcond, p, cap, ncap);
      for (int i = 2; i < ncap; i++)
        matchcap[i] = cap[i];
      matchcap[1] = p;
      matched = true;
    }
  }

done:
  if (!matched)
    return false;
  // Convert the begin/end pointer pairs into StringPieces for the caller.
  for (int i = 0; i < nmatch; i++)
    match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]);
  return true;
}
// Analysis to determine whether a given regexp program is one-pass.

// Work-queue helper for that analysis.
// If id is not yet in q, inserts it and returns true.
// If id is already in q, leaves q alone and returns false.
// If id is 0, leaves q alone and returns true (pretends to add it).
typedef SparseSet Instq;
static bool AddQ(Instq *q, int id) {
  if (id != 0) {
    if (q->contains(id))
      return false;
    q->insert(id);
  }
  return true;
}
// Entry on the manual stack used by Prog::IsOnePass():
// an instruction still to be explored, together with the condition
// bits accumulated on the path that reached it.
struct InstCond {
int id;
uint32 cond;
};
// Returns whether this is a one-pass program; that is,
// returns whether it is safe to use SearchOnePass on this program.
// These conditions must be true for any instruction ip:
//
// (1) for any other Inst nip, there is at most one input-free
// path from ip to nip.
// (2) there is at most one kInstByte instruction reachable from
// ip that matches any particular byte c.
// (3) there is at most one input-free path from ip to a kInstMatch
// instruction.
//
// This is actually just a conservative approximation: it might
// return false when the answer is true, when kInstEmptyWidth
// instructions are involved.
// Constructs and saves corresponding one-pass NFA on success.
bool Prog::IsOnePass() {
if (did_onepass_)
return onepass_start_ != NULL;
did_onepass_ = true;
if (start() == 0) // no match
return false;
// Steal memory for the one-pass NFA from the overall DFA budget.
// Willing to use at most 1/4 of the DFA budget (heuristic).
// Limit max node count to 65000 as a conservative estimate to
// avoid overflowing 16-bit node index in encoding.
int maxnodes = 2 + byte_inst_count_;
int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
return false;
// Flood the graph starting at the start state, and check
// that in each reachable state, each possible byte leads
// to a unique next state.
int size = this->size();
InstCond *stack = new InstCond[size];
int* nodebyid = new int[size]; // indexed by ip
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
uint8* nodes = new uint8[maxnodes*statesize];
uint8* nodep = nodes;
Instq tovisit(size), workq(size);
AddQ(&tovisit, start());
nodebyid[start()] = 0;
nodep += statesize;
int nalloc = 1;
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
OneState* node = IndexToNode(nodes, statesize, nodeindex);
// Flood graph using manual stack, filling in actions as found.
// Default is none.
for (int b = 0; b < bytemap_range_; b++)
node->action[b] = kImpossible;
node->matchcond = kImpossible;
workq.clear();
bool matched = false;
int nstack = 0;
stack[nstack].id = id;
stack[nstack++].cond = 0;
while (nstack > 0) {
int id = stack[--nstack].id;
Prog::Inst* ip = inst(id);
uint32 cond = stack[nstack].cond;
switch (ip->opcode()) {
case kInstAltMatch:
// TODO(rsc): Ignoring kInstAltMatch optimization.
// Should implement it in this engine, but it's subtle.
// Fall through.
case kInstAlt:
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
goto fail;
stack[nstack].id = ip->out1();
stack[nstack++].cond = cond;
stack[nstack].id = ip->out();
stack[nstack++].cond = cond;
break;
case kInstByteRange: {
int nextindex = nodebyid[ip->out()];
if (nextindex == -1) {
if (nalloc >= maxnodes) {
if (Debug)
LOG(ERROR)
<< StringPrintf("Not OnePass: hit node limit %d > %d",
nalloc, maxnodes);
goto fail;
}
nextindex = nalloc;
nodep += statesize;
nodebyid[ip->out()] = nextindex;
nalloc++;
AddQ(&tovisit, ip->out());
}
if (matched)
cond |= kMatchWins;
for (int c = ip->lo(); c <= ip->hi(); c++) {
int b = bytemap_[c];
c = unbytemap_[b]; // last c in byte class
uint32 act = node->action[b];
uint32 newact = (nextindex << kIndexShift) | cond;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
if (Debug) {
LOG(ERROR)
<< StringPrintf("Not OnePass: conflict on byte "
"%#x at state %d",
c, *it);
}
goto fail;
}
}
if (ip->foldcase()) {
Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a';
Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
for (int c = lo; c <= hi; c++) {
int b = bytemap_[c];
c = unbytemap_[b]; // last c in class
uint32 act = node->action[b];
uint32 newact = (nextindex << kIndexShift) | cond;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
if (Debug) {
LOG(ERROR)
<< StringPrintf("Not OnePass: conflict on byte "
"%#x at state %d",
c, *it);
}
goto fail;
}
}
}
break;
}
case kInstCapture:
if (ip->cap() < kMaxCap)
cond |= (1 << kCapShift) << ip->cap();
goto QueueEmpty;
case kInstEmptyWidth:
cond |= ip->empty();
goto QueueEmpty;
case kInstNop:
QueueEmpty:
// kInstCapture and kInstNop always proceed to ip->out().
// kInstEmptyWidth only sometimes proceeds to ip->out(),
// but as a conservative approximation we assume it always does.
// We could be a little more precise by looking at what c
// is, but that seems like overkill.
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out())) {
if (Debug) {
LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
" %d -> %d\n",
*it, ip->out());
}
goto fail;
}
stack[nstack].id = ip->out();
stack[nstack++].cond = cond;
break;
case kInstMatch:
if (matched) {
// (3) is violated
if (Debug) {
LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
" from %d\n", *it);
}
goto fail;
}
matched = true;
node->matchcond = cond;
break;
case kInstFail:
break;
}
}
}
if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR).
string dump = "prog dump:\n" + Dump() + "node dump\n";
map<int, int> idmap;
for (int i = 0; i < size; i++)
if (nodebyid[i] != -1)
idmap[nodebyid[i]] = i;
StringAppendF(&dump, "byte ranges:\n");
int i = 0;
for (int b = 0; b < bytemap_range_; b++) {
int lo = i;
while (bytemap_[i] == b)
i++;
StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
}
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
if (nodeindex == -1)
continue;
OneState* node = IndexToNode(nodes, statesize, nodeindex);
string s;
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
nodeindex, id, node->matchcond);
for (int i = 0; i < bytemap_range_; i++) {
if ((node->action[i] & kImpossible) == kImpossible)
continue;
StringAppendF(&dump, " %d cond %#x -> %d id=%d\n",
i, node->action[i] & 0xFFFF,
node->action[i] >> kIndexShift,
idmap[node->action[i] >> kIndexShift]);
}
}
LOG(ERROR) << dump;
}
// Overallocated earlier; cut down to actual size.
nodep = new uint8[nalloc*statesize];
memmove(nodep, nodes, nalloc*statesize);
delete[] nodes;
nodes = nodep;
onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
onepass_nodes_ = nodes;
onepass_statesize_ = statesize;
dfa_mem_ -= nalloc*statesize;
delete[] stack;
delete[] nodebyid;
return true;
fail:
delete[] stack;
delete[] nodebyid;
delete[] nodes;
return false;
}
} // namespace re2

2216
outside/re2/re2/parse.cc Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,119 @@
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
static const URange16 code1[] = { /* \d */
{ 0x30, 0x39 },
};
static const URange16 code2[] = { /* \s */
{ 0x9, 0xa },
{ 0xc, 0xd },
{ 0x20, 0x20 },
};
static const URange16 code3[] = { /* \w */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
};
const UGroup perl_groups[] = {
{ "\\d", +1, code1, 1 },
{ "\\D", -1, code1, 1 },
{ "\\s", +1, code2, 3 },
{ "\\S", -1, code2, 3 },
{ "\\w", +1, code3, 4 },
{ "\\W", -1, code3, 4 },
};
const int num_perl_groups = 6;
static const URange16 code4[] = { /* [:alnum:] */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x61, 0x7a },
};
static const URange16 code5[] = { /* [:alpha:] */
{ 0x41, 0x5a },
{ 0x61, 0x7a },
};
static const URange16 code6[] = { /* [:ascii:] */
{ 0x0, 0x7f },
};
static const URange16 code7[] = { /* [:blank:] */
{ 0x9, 0x9 },
{ 0x20, 0x20 },
};
static const URange16 code8[] = { /* [:cntrl:] */
{ 0x0, 0x1f },
{ 0x7f, 0x7f },
};
static const URange16 code9[] = { /* [:digit:] */
{ 0x30, 0x39 },
};
static const URange16 code10[] = { /* [:graph:] */
{ 0x21, 0x7e },
};
static const URange16 code11[] = { /* [:lower:] */
{ 0x61, 0x7a },
};
static const URange16 code12[] = { /* [:print:] */
{ 0x20, 0x7e },
};
static const URange16 code13[] = { /* [:punct:] */
{ 0x21, 0x2f },
{ 0x3a, 0x40 },
{ 0x5b, 0x60 },
{ 0x7b, 0x7e },
};
static const URange16 code14[] = { /* [:space:] */
{ 0x9, 0xd },
{ 0x20, 0x20 },
};
static const URange16 code15[] = { /* [:upper:] */
{ 0x41, 0x5a },
};
static const URange16 code16[] = { /* [:word:] */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
};
static const URange16 code17[] = { /* [:xdigit:] */
{ 0x30, 0x39 },
{ 0x41, 0x46 },
{ 0x61, 0x66 },
};
const UGroup posix_groups[] = {
{ "[:alnum:]", +1, code4, 3 },
{ "[:^alnum:]", -1, code4, 3 },
{ "[:alpha:]", +1, code5, 2 },
{ "[:^alpha:]", -1, code5, 2 },
{ "[:ascii:]", +1, code6, 1 },
{ "[:^ascii:]", -1, code6, 1 },
{ "[:blank:]", +1, code7, 2 },
{ "[:^blank:]", -1, code7, 2 },
{ "[:cntrl:]", +1, code8, 2 },
{ "[:^cntrl:]", -1, code8, 2 },
{ "[:digit:]", +1, code9, 1 },
{ "[:^digit:]", -1, code9, 1 },
{ "[:graph:]", +1, code10, 1 },
{ "[:^graph:]", -1, code10, 1 },
{ "[:lower:]", +1, code11, 1 },
{ "[:^lower:]", -1, code11, 1 },
{ "[:print:]", +1, code12, 1 },
{ "[:^print:]", -1, code12, 1 },
{ "[:punct:]", +1, code13, 4 },
{ "[:^punct:]", -1, code13, 4 },
{ "[:space:]", +1, code14, 2 },
{ "[:^space:]", -1, code14, 2 },
{ "[:upper:]", +1, code15, 1 },
{ "[:^upper:]", -1, code15, 1 },
{ "[:word:]", +1, code16, 4 },
{ "[:^word:]", -1, code16, 4 },
{ "[:xdigit:]", +1, code17, 3 },
{ "[:^xdigit:]", -1, code17, 3 },
};
const int num_posix_groups = 28;
} // namespace re2

View File

@ -0,0 +1,715 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
#include "re2/prefilter.h"
#include "re2/re2.h"
#include "re2/unicode_casefold.h"
#include "re2/walker-inl.h"
namespace re2 {
// When true, CClass/BuildInfo/PostVisit log extra trace output.
static const int Trace = false;
// Shorthand iterator types for the string sets used throughout this file.
typedef set<string>::iterator SSIter;
typedef set<string>::const_iterator ConstSSIter;
// Next id handed to a newly constructed Prefilter (see the constructor).
static int alloc_id = 100000; // Used for debugging.
// Builds a node of kind op. Only AND and OR nodes get a child vector;
// every other kind leaves subs_ NULL. Each node also takes the next
// debugging id from alloc_id.
Prefilter::Prefilter(Op op) {
  op_ = op;
  const bool has_children = (op == AND || op == OR);
  subs_ = has_children ? new vector<Prefilter*> : NULL;
  alloc_id_ = alloc_id++;
  VLOG(10) << "alloc_id: " << alloc_id_;
}
// Destroys a Prefilter together with every child it still owns.
Prefilter::~Prefilter() {
  VLOG(10) << "Deleted: " << alloc_id_;
  if (subs_ == NULL)
    return;
  for (vector<Prefilter*>::iterator child = subs_->begin();
       child != subs_->end(); ++child)
    delete *child;
  delete subs_;
  subs_ = NULL;
}
// Collapses degenerate AND/OR nodes.
//   - Non-AND/OR nodes are returned unchanged.
//   - An empty AND becomes ALL; an empty OR becomes NONE.
//   - A node with exactly one child is deleted and replaced by that
//     child (itself simplified).
// Callers must use the returned pointer: this may have been deleted.
Prefilter* Prefilter::Simplify() {
  const bool is_compound = (op_ == AND || op_ == OR);
  if (!is_compound)
    return this;

  if (subs_->size() == 0) {
    // AND of nothing is true, OR of nothing is false.
    op_ = (op_ == AND) ? ALL : NONE;
    return this;
  }

  if (subs_->size() != 1)
    return this;

  // Just one subnode: detach it so the destructor leaves it alone,
  // then throw away the wrapper.
  Prefilter* only_child = (*subs_)[0];
  subs_->clear();
  delete this;
  return only_child->Simplify();
}
// Combines a and b under op (AND or OR).
// Both arguments are consumed: each ends up inside the returned
// Prefilter or is deleted. Existing nodes are reused wherever possible
// so the result stays as flat as it can.
Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
  a = a->Simplify();
  b = b->Simplify();

  // Canonicalize so that a->op() <= b->op().
  if (a->op() > b->op()) {
    Prefilter* tmp = b;
    b = a;
    a = tmp;
  }

  // Trivial cases. ALL and NONE are the smallest opcodes, so after
  // canonicalization only a needs to be inspected.
  //   ALL AND b = b      NONE OR b = b
  //   ALL OR b = ALL     NONE AND b = NONE
  if (a->op() == ALL || a->op() == NONE) {
    const bool a_is_identity = (a->op() == ALL && op == AND) ||
                               (a->op() == NONE && op == OR);
    Prefilter* kept = a_is_identity ? b : a;
    Prefilter* dropped = a_is_identity ? a : b;
    delete dropped;
    return kept;
  }

  // Both sides are already op nodes: move b's children into a.
  if (a->op() == op && b->op() == op) {
    for (int k = 0; k < b->subs()->size(); k++)
      a->subs()->push_back((*b->subs())[k]);
    b->subs()->clear();  // children now belong to a
    delete b;
    return a;
  }

  // Exactly one side is an op node: append the other side to it.
  // (b is checked first, matching the original preference.)
  Prefilter* holder = NULL;
  Prefilter* extra = NULL;
  if (b->op() == op) {
    holder = b;
    extra = a;
  } else if (a->op() == op) {
    holder = a;
    extra = b;
  }
  if (holder != NULL) {
    holder->subs()->push_back(extra);
    return holder;
  }

  // Neither side is an op node: wrap both in a fresh one.
  Prefilter* combined = new Prefilter(op);
  combined->subs()->push_back(a);
  combined->subs()->push_back(b);
  return combined;
}
// Conjunction of a and b; both arguments are consumed (see AndOr).
Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
  Prefilter* both = AndOr(AND, a, b);
  return both;
}
// Disjunction of a and b; both arguments are consumed (see AndOr).
Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
  Prefilter* either = AndOr(OR, a, b);
  return either;
}
// Removes redundant strings from *ss.
// If "ab" is a required string, knowing that "abc" is also required
// adds nothing: a search for "ab" already makes the regexp a candidate.
// So every string that contains an earlier string of the set as a
// substring is erased.
static void SimplifyStringSet(set<string> *ss) {
  for (SSIter needle = ss->begin(); needle != ss->end(); ++needle) {
    SSIter candidate = needle;
    ++candidate;
    while (candidate != ss->end()) {
      if (candidate->find(*needle) != string::npos) {
        // Post-increment steps past the element before it is erased,
        // so the iterator we keep is never the invalidated one.
        ss->erase(candidate++);
      } else {
        ++candidate;
      }
    }
  }
}
// Builds the OR of one ATOM per string in *ss, after removing
// redundant strings from the set (which modifies *ss).
// Returns NULL when the set is empty.
Prefilter* Prefilter::OrStrings(set<string>* ss) {
  SimplifyStringSet(ss);
  if (ss->empty())
    return NULL;

  // Start from NONE, the identity for OR, and fold each atom in.
  Prefilter* result = new Prefilter(NONE);
  for (SSIter it = ss->begin(); it != ss->end(); ++it)
    result = Or(result, FromString(*it));
  return result;
}
// Lower-cases a rune. Runes below Runeself take the ASCII fast path;
// everything else goes through the unicode_tolower case-folding table.
static Rune ToLowerRune(Rune r) {
  if (r >= Runeself) {
    const CaseFold *fold =
        LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
    // No entry, or r falls before the entry's range: leave r unchanged.
    if (fold == NULL || r < fold->lo)
      return r;
    return ApplyFold(fold, r);
  }

  if ('A' <= r && r <= 'Z')
    r += 'a' - 'A';
  return r;
}
// Lower-cases a Latin-1 rune: only 'A'..'Z' are changed.
static Rune ToLowerRuneLatin1(Rune r) {
  const bool is_ascii_upper = ('A' <= r && r <= 'Z');
  if (!is_ascii_upper)
    return r;
  r += 'a' - 'A';
  return r;
}
// Returns a new ATOM node whose atom is str. Caller owns the result.
Prefilter* Prefilter::FromString(const string& str) {
  Prefilter* atom_node = new Prefilter(Prefilter::ATOM);
  atom_node->atom_ = str;
  return atom_node;
}
// Information about a regexp used during computation of Prefilter.
// Can be thought of as information about the set of strings matching
// the given regular expression.
//
// An Info is in one of two modes: exact (is_exact_ is true and exact_
// lists the matching strings) or inexact (match_ holds a Prefilter).
class Prefilter::Info {
public:
Info();
~Info();
// More constructors. They delete their Info* arguments.
static Info* Alt(Info* a, Info* b);
static Info* Concat(Info* a, Info* b);
static Info* And(Info* a, Info* b);
static Info* Star(Info* a);
static Info* Plus(Info* a);
static Info* Quest(Info* a);
// Leaf constructors; each returns a newly allocated Info.
static Info* EmptyString();
static Info* NoMatch();
static Info* AnyChar();
static Info* CClass(CharClass* cc, bool latin1);
static Info* Literal(Rune r);
static Info* LiteralLatin1(Rune r);
static Info* AnyMatch();
// Format Info as a string.
string ToString();
// Caller takes ownership of the Prefilter.
// An exact Info is first converted to a Prefilter, after which it
// is no longer exact.
Prefilter* TakeMatch();
set<string>& exact() { return exact_; }
bool is_exact() const { return is_exact_; }
// Regexp walker that computes an Info bottom-up; declared here.
class Walker;
private:
set<string> exact_;
// When is_exact_ is true, the strings that match
// are placed in exact_. When it is no longer an exact
// set of strings that match this RE, then is_exact_
// is false and the match_ contains the required match
// criteria.
bool is_exact_;
// Accumulated Prefilter query that any
// match for this regexp is guaranteed to match.
// Owned by this Info until TakeMatch() hands it out.
Prefilter* match_;
};
// Creates an inexact Info with no match criteria yet.
Prefilter::Info::Info() {
  is_exact_ = false;
  match_ = NULL;
}
// Releases the Prefilter still held in match_, if any.
// match_ is NULL after TakeMatch() has handed it out.
Prefilter::Info::~Info() {
delete match_;
}
// Hands the accumulated Prefilter to the caller, who takes ownership.
// An exact Info is first converted into an OR of its strings and
// stops being exact. Afterwards this Info holds no Prefilter.
Prefilter* Prefilter::Info::TakeMatch() {
  if (is_exact_) {
    match_ = Prefilter::OrStrings(&exact_);
    is_exact_ = false;
  }
  Prefilter* released = match_;
  match_ = NULL;
  return released;
}
// Formats an Info for debugging.
// Exact: the strings of exact_ joined by commas.
// Inexact: the DebugString of match_, or "" when there is none.
string Prefilter::Info::ToString() {
  if (this == NULL) {
    // Some callers iterate over children whose Info may be NULL and
    // do not check for it, so the check lives here instead.
    return "";
  }

  if (!is_exact_) {
    if (match_)
      return match_->DebugString();
    return "";
  }

  string joined;
  for (set<string>::iterator it = exact_.begin(); it != exact_.end(); ++it) {
    if (it != exact_.begin())
      joined += ",";
    joined += *it;
  }
  return joined;
}
// Inserts every string of src into *dst (set union into dst).
static void CopyIn(const set<string>& src, set<string>* dst) {
  dst->insert(src.begin(), src.end());
}
// Add the cross-product of a and b to dst.
// (For each string i in a and j in b, add i+j.)
static void CrossProduct(const set<string>& a,
const set<string>& b,
set<string>* dst) {
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
dst->insert(*i + *j);
}
// Concatenation of two exact Infos: the result's exact set is the
// cross product of the two input sets. Both inputs are deleted.
// A NULL a stands for "nothing so far"; b is then returned as is.
Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
  if (a == NULL)
    return b;
  DCHECK(a->is_exact_);
  DCHECK(b && b->is_exact_);

  Info *product = new Info();
  product->is_exact_ = true;
  CrossProduct(a->exact_, b->exact_, &product->exact_);

  delete a;
  delete b;
  return product;
}
// Inexact Info for the concatenation ab: whatever a requires AND
// whatever b requires. Used when a or b is not exact, or when the
// exact cross product would likely be too big.
// If either side is NULL the other is returned untouched; otherwise
// both inputs are deleted.
Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
  if (a == NULL)
    return b;
  if (b == NULL)
    return a;

  Info *conj = new Info();
  conj->is_exact_ = false;
  conj->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());

  delete a;
  delete b;
  return conj;
}
// Info for the alternation a|b. Both inputs are deleted.
Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
  Info *alt = new Info();
  const bool both_exact = a->is_exact_ && b->is_exact_;
  if (!both_exact) {
    // At least one side is inexact. TakeMatch() converts an exact
    // side into a Prefilter, and the result is the OR of the two.
    alt->is_exact_ = false;
    alt->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
  } else {
    // Two exact sets: the union is still exact.
    alt->is_exact_ = true;
    CopyIn(a->exact_, &alt->exact_);
    CopyIn(b->exact_, &alt->exact_);
  }
  delete a;
  delete b;
  return alt;
}
// Info for a?. The operand is optional, so nothing can be required:
// the result is inexact and matches everything. Deletes a.
Prefilter::Info* Prefilter::Info::Quest(Info *a) {
  delete a;
  Info *optional = new Info();
  optional->match_ = new Prefilter(ALL);
  optional->is_exact_ = false;
  return optional;
}
// Info for a*. Like a?, the operand may be absent, so this simply
// delegates to Quest (which deletes a).
Prefilter::Info* Prefilter::Info::Star(Info *a) {
  Info* repeated = Quest(a);
  return repeated;
}
// Info for a+. At least one copy of a must appear, so a's match
// criteria carry over, but the result is never exact. Deletes a.
Prefilter::Info* Prefilter::Info::Plus(Info *a) {
  Info *repeated = new Info();
  repeated->is_exact_ = false;
  repeated->match_ = a->TakeMatch();
  delete a;
  return repeated;
}
// Converts rune r to a string using runetochar; the string holds the
// bytes runetochar wrote.
static string RuneToString(Rune r) {
  char encoded[UTFmax];
  const int len = runetochar(encoded, &r);
  return string(encoded, len);
}
// Returns the one-byte string holding the low 8 bits of r.
static string RuneToStringLatin1(Rune r) {
  const char byte = r & 0xff;
  return string(1, byte);
}
// Exact Info for a single literal rune: its exact set holds the
// lower-cased rune converted by RuneToString.
Prefilter::Info* Prefilter::Info::Literal(Rune r) {
  const string lowered = RuneToString(ToLowerRune(r));
  Info* lit = new Info();
  lit->is_exact_ = true;
  lit->exact_.insert(lowered);
  return lit;
}
// Exact Info for a single literal rune of a Latin-1 encoded string:
// its exact set holds the lower-cased rune as one byte.
Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
  const string lowered = RuneToStringLatin1(ToLowerRuneLatin1(r));
  Info* lit = new Info();
  lit->is_exact_ = true;
  lit->exact_.insert(lowered);
  return lit;
}
// Info for dot (any character): inexact, with an ALL prefilter,
// i.e. it requires nothing of the text.
Prefilter::Info* Prefilter::Info::AnyChar() {
  Prefilter::Info* any = new Prefilter::Info();
  Prefilter* everything = new Prefilter(ALL);
  any->match_ = everything;
  return any;
}
// Info for a regexp that can never match: inexact, with a NONE
// prefilter.
Prefilter::Info* Prefilter::Info::NoMatch() {
  Prefilter::Info* never = new Prefilter::Info();
  Prefilter* nothing = new Prefilter(NONE);
  never->match_ = nothing;
  return never;
}
// Info that is valid for any regular expression whatsoever: it makes
// no assertion about the strings being matched (an ALL prefilter).
Prefilter::Info* Prefilter::Info::AnyMatch() {
  Prefilter::Info *unconstrained = new Prefilter::Info();
  Prefilter* everything = new Prefilter(ALL);
  unconstrained->match_ = everything;
  return unconstrained;
}
// Exact Info whose only matching string is the empty string.
Prefilter::Info* Prefilter::Info::EmptyString() {
  Prefilter::Info* empty = new Prefilter::Info();
  empty->exact_.insert("");
  empty->is_exact_ = true;
  return empty;
}
// Info for a character class.
// A class with more than 10 members is treated like "any character"
// (overestimating is always safe). Otherwise the result is exact,
// with one lower-cased single-character string per member.
typedef CharClass::iterator CCIter;
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
                                         bool latin1) {
  if (Trace) {
    VLOG(0) << "CharClassInfo:";
    for (CCIter range = cc->begin(); range != cc->end(); ++range)
      VLOG(0) << "  " << range->lo << "-" << range->hi;
  }

  // If the class is too large, it's okay to overestimate.
  if (cc->size() > 10)
    return AnyChar();

  Prefilter::Info *members = new Prefilter::Info();
  for (CCIter range = cc->begin(); range != cc->end(); ++range) {
    for (Rune r = range->lo; r <= range->hi; r++) {
      const string lowered = latin1
          ? RuneToStringLatin1(ToLowerRuneLatin1(r))
          : RuneToString(ToLowerRune(r));
      members->exact_.insert(lowered);
    }
  }
  members->is_exact_ = true;

  if (Trace) {
    VLOG(0) << " = " << members->ToString();
  }
  return members;
}
// Regexp walker that computes a Prefilter::Info for each node.
// latin1 selects the Latin-1 variants of the literal and character
// class constructors over the default rune-based ones.
class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
public:
Walker(bool latin1) : latin1_(latin1) {}
// Combines the children's Infos into the Info for re.
virtual Info* PostVisit(
Regexp* re, Info* parent_arg,
Info* pre_arg,
Info** child_args, int nchild_args);
// Returns a conservative "anything matches" Info.
// NOTE(review): presumably invoked when the walk is cut short;
// confirm against Regexp::Walker.
virtual Info* ShortVisit(
Regexp* re,
Info* parent_arg);
bool latin1() { return latin1_; }
private:
bool latin1_;
DISALLOW_EVIL_CONSTRUCTORS(Walker);
};
// Walks re and returns its Prefilter::Info, or NULL if the walk
// stopped early. The caller owns the returned Info.
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
  if (Trace) {
    LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
  }

  const bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
  Prefilter::Info::Walker walker(latin1);
  Prefilter::Info* result = walker.WalkExponential(re, NULL, 100000);
  if (!walker.stopped_early())
    return result;

  // A walk that stopped early is not used: discard its result.
  delete result;
  return NULL;
}
// Returns an Info that claims nothing about the match, which is
// valid for any regexp. Both arguments are ignored.
Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
    Regexp* re, Prefilter::Info* parent_arg) {
  Prefilter::Info* unconstrained = AnyMatch();
  return unconstrained;
}
// Constructs the Prefilter::Info for the given regular expression.
// Assumes re is simplified.
// child_args[0..nchild_args-1] are the Infos already computed for re's
// children; the Info constructors called here delete the ones they are
// given. parent_arg and pre_arg are not used.
Prefilter::Info* Prefilter::Info::Walker::PostVisit(
Regexp* re, Prefilter::Info* parent_arg,
Prefilter::Info* pre_arg, Prefilter::Info** child_args,
int nchild_args) {
Prefilter::Info *info;
switch (re->op()) {
default:
case kRegexpRepeat:
// Repeat is treated as a bad op here (re is assumed simplified).
LOG(DFATAL) << "Bad regexp op " << re->op();
info = EmptyString();
break;
case kRegexpNoMatch:
info = NoMatch();
break;
// These ops match the empty string:
case kRegexpEmptyMatch: // anywhere
case kRegexpBeginLine: // at beginning of line
case kRegexpEndLine: // at end of line
case kRegexpBeginText: // at beginning of text
case kRegexpEndText: // at end of text
case kRegexpWordBoundary: // at word boundary
case kRegexpNoWordBoundary: // not at word boundary
info = EmptyString();
break;
case kRegexpLiteral:
if (latin1()) {
info = LiteralLatin1(re->rune());
}
else {
info = Literal(re->rune());
}
break;
case kRegexpLiteralString:
if (re->nrunes() == 0) {
info = NoMatch();
break;
}
// Build the exact string one rune at a time by concatenation.
if (latin1()) {
info = LiteralLatin1(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
info = Concat(info, LiteralLatin1(re->runes()[i]));
}
} else {
info = Literal(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
info = Concat(info, Literal(re->runes()[i]));
}
}
break;
case kRegexpConcat: {
// Accumulate in info.
// Exact is concat of recent contiguous exact nodes.
info = NULL;
Info* exact = NULL;
for (int i = 0; i < nchild_args; i++) {
Info* ci = child_args[i]; // child info
// End the exact run when the child is inexact, or when the
// cross product with the run would exceed 16 strings.
if (!ci->is_exact() ||
(exact && ci->exact().size() * exact->exact().size() > 16)) {
// Exact run is over.
info = And(info, exact);
exact = NULL;
// Add this child's info.
info = And(info, ci);
} else {
// Append to exact run.
exact = Concat(exact, ci);
}
}
// Fold in whatever exact run is still pending.
info = And(info, exact);
}
break;
case kRegexpAlternate:
info = child_args[0];
for (int i = 1; i < nchild_args; i++)
info = Alt(info, child_args[i]);
VLOG(10) << "Alt: " << info->ToString();
break;
case kRegexpStar:
info = Star(child_args[0]);
break;
case kRegexpQuest:
info = Quest(child_args[0]);
break;
case kRegexpPlus:
info = Plus(child_args[0]);
break;
case kRegexpAnyChar:
// Claim nothing, except that it's not empty.
info = AnyChar();
break;
case kRegexpCharClass:
info = CClass(re->cc(), latin1());
break;
case kRegexpCapture:
// These don't affect the set of matching strings.
info = child_args[0];
break;
}
if (Trace) {
VLOG(0) << "BuildInfo " << re->ToString()
<< ": " << info->ToString();
}
return info;
}
// Builds the Prefilter for the regexp re.
// The regexp is simplified, an Info is built from the simplified form,
// and the Info's match is handed to the caller, who owns it.
// Returns NULL if re is NULL, if simplification yields NULL, or if
// no Info could be built.
Prefilter* Prefilter::FromRegexp(Regexp* re) {
  if (re == NULL)
    return NULL;

  Regexp* simple = re->Simplify();
  // Do not walk or Decref a simplification that produced nothing.
  if (simple == NULL)
    return NULL;

  Prefilter::Info* info = BuildInfo(simple);
  // The simplified regexp is only needed while building the Info.
  simple->Decref();
  if (info == NULL)
    return NULL;

  // Take the match out of the Info before the Info is destroyed.
  Prefilter* m = info->TakeMatch();
  delete info;
  return m;
}
// Renders this prefilter as a readable string for debugging:
//   NONE -> "*no-matches*"
//   ATOM -> the atom itself
//   ALL  -> ""
//   AND  -> the children's strings separated by single spaces
//   OR   -> the children's strings separated by '|', in parentheses
// A NULL receiver yields "<nil>".
string Prefilter::DebugString() const {
  if (this == NULL)
    return "<nil>";

  if (op_ == NONE)
    return "*no-matches*";
  if (op_ == ATOM)
    return atom_;
  if (op_ == ALL)
    return "";

  if (op_ == AND) {
    string joined = "";
    for (int k = 0; k < subs_->size(); k++) {
      if (k != 0)
        joined += " ";
      joined += (*subs_)[k]->DebugString();
    }
    return joined;
  }

  if (op_ == OR) {
    string joined = "(";
    for (int k = 0; k < subs_->size(); k++) {
      if (k != 0)
        joined += "|";
      joined += (*subs_)[k]->DebugString();
    }
    joined += ")";
    return joined;
  }

  // Not one of the known ops.
  LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
  return StringPrintf("op%d", op_);
}
// Returns the Prefilter for the regexp held by re2, or NULL when re2 is
// NULL, when it has no Regexp, or when FromRegexp cannot build one.
Prefilter* Prefilter::FromRE2(const RE2* re2) {
  Regexp* parsed = (re2 != NULL) ? re2->Regexp() : NULL;
  return (parsed != NULL) ? FromRegexp(parsed) : NULL;
}
} // namespace re2

105
outside/re2/re2/prefilter.h Normal file
View File

@ -0,0 +1,105 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Prefilter is the class used to extract string guards from regexps.
// Rather than using Prefilter class directly, use FilteredRE2.
// See filtered_re2.h
#ifndef RE2_PREFILTER_H_
#define RE2_PREFILTER_H_
#include "util/util.h"
namespace re2 {
class RE2;
class Regexp;
// A Prefilter is a node in an AND/OR tree of literal strings (atoms).
// The node kinds are listed in Op below.
class Prefilter {
// Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
public:
// Kind of node.
enum Op {
ALL = 0, // Everything matches
NONE, // Nothing matches
ATOM, // The string atom() must match
AND, // All in subs() must match
OR, // One of subs() must match
};
explicit Prefilter(Op op);
~Prefilter();
// Kind of this node.
Op op() { return op_; }
// The string held by this node.
const string& atom() const { return atom_; }
// Accessors for the id stored in unique_id_ (see the field comment below).
void set_unique_id(int id) { unique_id_ = id; }
int unique_id() const { return unique_id_; }
// The children of the Prefilter node.
// CHECK-fails unless this node is an AND or an OR.
vector<Prefilter*>* subs() {
CHECK(op_ == AND || op_ == OR);
return subs_;
}
// Set the children vector. Prefilter takes ownership of subs and
// subs_ will be deleted when Prefilter is deleted.
void set_subs(vector<Prefilter*>* subs) { subs_ = subs; }
// Given a RE2, return a Prefilter. The caller takes ownership of
// the Prefilter and should deallocate it. Returns NULL if Prefilter
// cannot be formed.
static Prefilter* FromRE2(const RE2* re2);
// Returns a readable debug string of the prefilter.
string DebugString() const;
private:
class Info;
// Combines two prefilters together to create an AND. The passed
// Prefilters will be part of the returned Prefilter or deleted.
static Prefilter* And(Prefilter* a, Prefilter* b);
// Combines two prefilters together to create an OR. The passed
// Prefilters will be part of the returned Prefilter or deleted.
static Prefilter* Or(Prefilter* a, Prefilter* b);
// Generalized And/Or
static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
static Prefilter* FromRegexp(Regexp* a);
static Prefilter* FromString(const string& str);
static Prefilter* OrStrings(set<string>* ss);
static Info* BuildInfo(Regexp* re);
Prefilter* Simplify();
// Kind of Prefilter.
Op op_;
// Sub-matches for AND or OR Prefilter.
vector<Prefilter*>* subs_;
// Actual string to match in leaf node.
string atom_;
// If different prefilters have the same string atom, or if they are
// structurally the same (e.g., OR of same atom strings) they are
// considered the same unique nodes. This is the id for each unique
// node. PrefilterTree::AssignUniqueIds gives each canonical node a
// fresh id and gives every duplicate node the id of its canonical
// node; the field is -1 only transiently while ids are being assigned.
int unique_id_;
// Used for debugging, helps in tracking memory leaks.
int alloc_id_;
DISALLOW_EVIL_CONSTRUCTORS(Prefilter);
};
} // namespace re2
#endif // RE2_PREFILTER_H_

View File

@ -0,0 +1,397 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
#include "util/flags.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
#include "re2/re2.h"
DEFINE_int32(filtered_re2_min_atom_len,
3,
"Strings less than this length are not stored as atoms");
namespace re2 {
// Creates an empty tree on which Compile has not yet run.
PrefilterTree::PrefilterTree() : compiled_(false) {}
// Deletes every prefilter passed to Add and the parent map of every entry.
PrefilterTree::~PrefilterTree() {
  for (vector<Prefilter*>::iterator pf = prefilter_vec_.begin();
       pf != prefilter_vec_.end(); ++pf)
    delete *pf;

  for (vector<Entry>::iterator entry = entries_.begin();
       entry != entries_.end(); ++entry)
    delete entry->parents;
}
// Functions used for adding and Compiling prefilters to the
// PrefilterTree.
// Decides whether prefilter is worth keeping, pruning it in place.
//   NULL, ALL, NONE: not kept.
//   ATOM: kept only if it is at least FLAGS_filtered_re2_min_atom_len long.
//   AND:  children that are not kept are deleted and removed; the node
//         is kept if at least one child survives.
//   OR:   kept only if every child is kept.
// level is the depth of prefilter below the node passed to the
// outermost call; it is only passed along to the recursive calls.
// Returns true if prefilter should be kept.
static bool KeepPart(Prefilter* prefilter, int level) {
  if (prefilter == NULL)
    return false;

  switch (prefilter->op()) {
    default:
      LOG(DFATAL) << "Unexpected op in KeepPart: "
                  << prefilter->op();
      return false;

    case Prefilter::ALL:
    case Prefilter::NONE:
      // NONE is a legitimate op, not an unexpected one: drop the node
      // quietly instead of falling into the DFATAL default above.
      return false;

    case Prefilter::ATOM:
      return prefilter->atom().size() >=
             FLAGS_filtered_re2_min_atom_len;

    case Prefilter::AND: {
      // Compact the surviving children to the front of the vector.
      int j = 0;
      vector<Prefilter*>* subs = prefilter->subs();
      for (int i = 0; i < subs->size(); i++)
        if (KeepPart((*subs)[i], level + 1))
          (*subs)[j++] = (*subs)[i];
        else
          delete (*subs)[i];

      subs->resize(j);
      return j > 0;
    }

    case Prefilter::OR:
      // One child that cannot be kept makes the whole OR useless.
      for (int i = 0; i < prefilter->subs()->size(); i++)
        if (!KeepPart((*prefilter->subs())[i], level + 1))
          return false;
      return true;
  }
}
// Records the prefilter for the next regexp.  A prefilter that KeepPart
// rejects is deleted and recorded as NULL.  Logs DFATAL and does nothing
// if Compile has already run.
void PrefilterTree::Add(Prefilter *f) {
  if (compiled_) {
    LOG(DFATAL) << "Add after Compile.";
    return;
  }

  Prefilter* kept = f;
  if (kept != NULL && !KeepPart(kept, 0)) {
    delete kept;
    kept = NULL;
  }
  // A slot is pushed even for NULL.
  prefilter_vec_.push_back(kept);
}
// Assigns unique ids to all added prefilters, fills atom_vec with the
// atoms to search for, and prunes triggers that fire too many parents.
// Logs DFATAL and returns if called a second time.
void PrefilterTree::Compile(vector<string>* atom_vec) {
  if (compiled_) {
    LOG(DFATAL) << "Compile after Compile.";
    return;
  }

  // Some legacy users call Compile before adding any regexps and expect
  // it to have no effect, so leave the tree uncompiled in that case.
  if (prefilter_vec_.empty())
    return;

  compiled_ = true;
  AssignUniqueIds(atom_vec);

  // Find nodes that trigger too many parents and drop them as triggers
  // where possible.  Dropping a node only means its parents no longer
  // wait for it, so no regexp is missed as a result.
  for (int e = 0; e < entries_.size(); e++) {
    StdIntMap* parents = entries_[e].parents;
    if (parents->size() <= 8)
      continue;

    // This one triggers too many things.  It can go only if every
    // parent still has another child guarding it.
    // TODO(vsri): Adjust the threshold appropriately,
    // make it a function of total number of nodes?
    bool every_parent_guarded = true;
    for (StdIntMap::iterator p = parents->begin(); p != parents->end(); ++p) {
      if (entries_[p->first].propagate_up_at_count <= 1) {
        every_parent_guarded = false;
        break;
      }
    }
    if (!every_parent_guarded)
      continue;

    // Each parent now waits for one child fewer.
    for (StdIntMap::iterator p = parents->begin(); p != parents->end(); ++p)
      entries_[p->first].propagate_up_at_count -= 1;
    parents->clear();  // Forget the parents
  }

  PrintDebugInfo();
}
// Looks up the node registered in node_map_ under node's NodeString.
// Returns NULL if no node with that string has been registered.
Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
  map<string, Prefilter*>::iterator found = node_map_.find(NodeString(node));
  return (found != node_map_.end()) ? found->second : NULL;
}
// Returns the decimal representation of n.
static string Itoa(int n) {
  char digits[100];
  // snprintf reports how many characters it produced; the buffer is far
  // larger than any int needs, so the output is never truncated.
  const int len = snprintf(digits, sizeof digits, "%d", n);
  return string(digits, len);
}
// Builds the key that identifies node in node_map_: the numeric op, a
// colon, then either the atom (for ATOM nodes) or the comma-separated
// unique ids of the children.  The children must already have their
// unique ids assigned.
string PrefilterTree::NodeString(Prefilter* node) const {
  // Leading with the operation disambiguates AND/OR/atom nodes.
  string key = Itoa(node->op());
  key += ":";

  if (node->op() == Prefilter::ATOM) {
    key += node->atom();
    return key;
  }

  vector<Prefilter*>* children = node->subs();
  for (int c = 0; c < children->size(); c++) {
    if (c != 0)
      key += ',';
    key += Itoa((*children)[c]->unique_id());
  }
  return key;
}
// Assigns a unique id to every distinct prefilter node, builds one Entry
// per unique node, and fills atom_vec with the distinct atoms.
// The passes below run in this order:
//   1. collect every node into v, top-level nodes first;
//   2. walk v backwards to number the nodes and register canonical ones;
//   3. allocate the parent map of every canonical node's entry;
//   4. fill each entry's trigger count and link children to parents;
//   5. attach regexp indices to the entries of the top-level nodes.
void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
atom_vec->clear();
// Build vector of all filter nodes, sorted topologically
// from top to bottom in v.
vector<Prefilter*> v;
// Add the top level nodes of each regexp prefilter.
for (int i = 0; i < prefilter_vec_.size(); i++) {
Prefilter* f = prefilter_vec_[i];
if (f == NULL)
unfiltered_.push_back(i);
// We push NULL also on to v, so that we maintain the
// mapping of index==regexpid for level=0 prefilter nodes.
v.push_back(f);
}
// Now add all the descendant nodes.
// v grows while it is being walked, so v.size() is re-read every time.
for (int i = 0; i < v.size(); i++) {
Prefilter* f = v[i];
if (f == NULL)
continue;
if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
const vector<Prefilter*>& subs = *f->subs();
for (int j = 0; j < subs.size(); j++)
v.push_back(subs[j]);
}
}
// Identify unique nodes.
// Children sit after their parents in v, so walking backwards numbers
// the children first; NodeString needs the children's ids.
int unique_id = 0;
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter *node = v[i];
if (node == NULL)
continue;
node->set_unique_id(-1);
Prefilter* canonical = CanonicalNode(node);
if (canonical == NULL) {
// Any further nodes that have the same node string
// will find this node as the canonical node.
node_map_[NodeString(node)] = node;
if (node->op() == Prefilter::ATOM) {
// Position in atom_vec -> unique id of the atom's node.
atom_vec->push_back(node->atom());
atom_index_to_id_.push_back(unique_id);
}
node->set_unique_id(unique_id++);
} else {
// A duplicate shares the id of its canonical node.
node->set_unique_id(canonical->unique_id());
}
}
// One entry per canonical node, indexed by unique id.
entries_.resize(node_map_.size());
// Create parent IntMap for the entries.
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
// Only canonical nodes own an entry.
if (CanonicalNode(prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
entry->parents = new StdIntMap();
}
// Fill the entries.
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
if (CanonicalNode(prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
switch (prefilter->op()) {
default:
case Prefilter::ALL:
LOG(DFATAL) << "Unexpected op: " << prefilter->op();
return;
case Prefilter::ATOM:
entry->propagate_up_at_count = 1;
break;
case Prefilter::OR:
case Prefilter::AND: {
// Distinct children by unique id; duplicates collapse here.
set<int> uniq_child;
for (int j = 0; j < prefilter->subs()->size() ; j++) {
Prefilter* child = (*prefilter->subs())[j];
Prefilter* canonical = CanonicalNode(child);
if (canonical == NULL) {
LOG(DFATAL) << "Null canonical node";
return;
}
int child_id = canonical->unique_id();
uniq_child.insert(child_id);
// To the child, we want to add to parent indices.
Entry* child_entry = &entries_[child_id];
if (child_entry->parents->find(prefilter->unique_id()) == child_entry->parents->end())
(*child_entry->parents)[prefilter->unique_id()] = 1;
}
// An AND waits for all of its distinct children; an OR for just one.
entry->propagate_up_at_count =
prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
break;
}
}
}
// For top level nodes, populate regexp id.
for (int i = 0; i < prefilter_vec_.size(); i++) {
if (prefilter_vec_[i] == NULL)
continue;
int id = CanonicalNode(prefilter_vec_[i])->unique_id();
DCHECK_LE(0, id);
Entry* entry = &entries_[id];
entry->regexps.push_back(i);
}
}
// Functions for triggering during search.
void PrefilterTree::RegexpsGivenStrings(
const vector<int>& matched_atoms,
vector<int>* regexps) const {
regexps->clear();
if (!compiled_) {
LOG(WARNING) << "Compile() not called";
for (int i = 0; i < prefilter_vec_.size(); ++i)
regexps->push_back(i);
} else {
if (!prefilter_vec_.empty()) {
IntMap regexps_map(prefilter_vec_.size());
vector<int> matched_atom_ids;
for (int j = 0; j < matched_atoms.size(); j++) {
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
}
PropagateMatch(matched_atom_ids, &regexps_map);
for (IntMap::iterator it = regexps_map.begin();
it != regexps_map.end();
++it)
regexps->push_back(it->index());
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
}
}
sort(regexps->begin(), regexps->end());
}
void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
IntMap* regexps) const {
IntMap count(entries_.size());
IntMap work(entries_.size());
for (int i = 0; i < atom_ids.size(); i++)
work.set(atom_ids[i], 1);
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
const Entry& entry = entries_[it->index()];
VLOG(10) << "Processing: " << it->index();
// Record regexps triggered.
for (int i = 0; i < entry.regexps.size(); i++) {
VLOG(10) << "Regexp triggered: " << entry.regexps[i];
regexps->set(entry.regexps[i], 1);
}
int c;
// Pass trigger up to parents.
for (StdIntMap::iterator it = entry.parents->begin();
it != entry.parents->end();
++it) {
int j = it->first;
const Entry& parent = entries_[j];
VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
// Delay until all the children have succeeded.
if (parent.propagate_up_at_count > 1) {
if (count.has_index(j)) {
c = count.get_existing(j) + 1;
count.set_existing(j, c);
} else {
c = 1;
count.set_new(j, c);
}
if (c < parent.propagate_up_at_count)
continue;
}
VLOG(10) << "Triggering: " << j;
// Trigger the parent.
work.set(j, 1);
}
}
}
// Debugging help.
void PrefilterTree::PrintPrefilter(int regexpid) {
LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
}
void PrefilterTree::PrintDebugInfo() {
VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
VLOG(10) << "#Unique Nodes: " << entries_.size();
for (int i = 0; i < entries_.size(); ++i) {
StdIntMap* parents = entries_[i].parents;
const vector<int>& regexps = entries_[i].regexps;
VLOG(10) << "EntryId: " << i
<< " N: " << parents->size() << " R: " << regexps.size();
for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
VLOG(10) << it->first;
}
VLOG(10) << "Map:";
for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
iter != node_map_.end(); ++iter)
VLOG(10) << "NodeId: " << (*iter).second->unique_id()
<< " Str: " << (*iter).first;
}
// Recursively renders node for debugging: an atom is rendered as itself,
// any other node as AND(...) or OR(...) holding "<unique id>:<child>"
// items separated by commas.
string PrefilterTree::DebugNodeString(Prefilter* node) const {
  if (node->op() == Prefilter::ATOM) {
    DCHECK(!node->atom().empty());
    return node->atom();
  }

  // Naming the operation disambiguates AND and OR nodes.
  string text = (node->op() == Prefilter::AND) ? "AND" : "OR";
  text += "(";
  vector<Prefilter*>* children = node->subs();
  for (int c = 0; c < children->size(); c++) {
    Prefilter* child = (*children)[c];
    if (c != 0)
      text += ',';
    text += Itoa(child->unique_id());
    text += ":";
    text += DebugNodeString(child);
  }
  text += ")";
  return text;
}
} // namespace re2

View File

@ -0,0 +1,131 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The PrefilterTree class is used to form an AND-OR tree of strings
// that would trigger each regexp. The 'prefilter' of each regexp is
// added to PrefilterTree, and then PrefilterTree is used to find all
// the unique strings across the prefilters. During search, by using
// matches from a string matching engine, PrefilterTree deduces the
// set of regexps that are to be triggered. The 'string matching
// engine' itself is outside of this class, and the caller can use any
// favorite engine. PrefilterTree provides a set of strings (called
// atoms) that the user of this class should use to do the string
// matching.
//
#ifndef RE2_PREFILTER_TREE_H_
#define RE2_PREFILTER_TREE_H_
#include "util/util.h"
#include "util/sparse_array.h"
namespace re2 {
typedef SparseArray<int> IntMap;
typedef map<int,int> StdIntMap;
class Prefilter;
// Collects the prefilters of a set of regexps and, once compiled, maps
// the atoms found in a text to the regexps that need to be run on it.
class PrefilterTree {
public:
PrefilterTree();
~PrefilterTree();
// Adds the prefilter for the next regexp. Note that we assume that
// Add is called sequentially for all regexps. All Add calls
// must precede Compile.
void Add(Prefilter* prefilter);
// The Compile returns a vector of string in atom_vec.
// Call this after all the prefilters are added through Add.
// No calls to Add after Compile are allowed.
// The caller should use the returned set of strings to do string matching.
// Each time a string matches, the index of that string in atom_vec
// has to be passed to RegexpsGivenStrings below.
void Compile(vector<string>* atom_vec);
// Given the indices of the atoms that matched, returns the indexes
// of regexps that should be searched. The matched_atoms should
// contain all the ids of string atoms that were found to match the
// content. The caller can use any string match engine to perform
// this function. This function is thread safe.
void RegexpsGivenStrings(const vector<int>& matched_atoms,
vector<int>* regexps) const;
// Print debug prefilter. Also prints unique ids associated with
// nodes of the prefilter of the regexp.
void PrintPrefilter(int regexpid);
// Each unique node has a corresponding Entry that helps in
// passing the matching trigger information along the tree.
struct Entry {
public:
// How many children should match before this node triggers the
// parent. For an atom and an OR node, this is 1 and for an AND
// node, it is the number of unique children.
int propagate_up_at_count;
// When this node is ready to trigger the parent, what are the indices
// of the parent nodes to trigger. The reason there may be more than
// one is because of sharing. For example (abc | def) and (xyz | def)
// are two different nodes, but they share the atom 'def'. So when
// 'def' matches, it triggers two parents, corresponding to the two
// different OR nodes.
// Allocated in AssignUniqueIds and deleted by ~PrefilterTree.
StdIntMap* parents;
// When this node is ready to trigger the parent, what are the
// regexps that are triggered.
vector<int> regexps;
};
private:
// This function assigns unique ids to various parts of the
// prefilter, by looking at if these nodes are already in the
// PrefilterTree.
void AssignUniqueIds(vector<string>* atom_vec);
// Given the matching atoms, find the regexps to be triggered.
void PropagateMatch(const vector<int>& atom_ids,
IntMap* regexps) const;
// Returns the prefilter node that has the same NodeString as this
// node. For the canonical node, returns node.
// Returns NULL if no node with that NodeString is registered.
Prefilter* CanonicalNode(Prefilter* node);
// A string that uniquely identifies the node. Assumes that the
// children of node have already been assigned unique ids.
string NodeString(Prefilter* node) const;
// Recursively constructs a readable prefilter string.
string DebugNodeString(Prefilter* node) const;
// Used for debugging.
void PrintDebugInfo();
// These are all the nodes formed by Compile. Essentially, there is
// one node for each unique atom and each unique AND/OR node.
vector<Entry> entries_;
// Map node string to canonical Prefilter node.
map<string, Prefilter*> node_map_;
// indices of regexps that always pass through the filter (since we
// found no required literals in these regexps).
vector<int> unfiltered_;
// vector of Prefilter for all regexps.
vector<Prefilter*> prefilter_vec_;
// Atom index in returned strings to entry id mapping.
vector<int> atom_index_to_id_;
// Has the prefilter tree been compiled.
bool compiled_;
DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
};
} // namespace
#endif // RE2_PREFILTER_TREE_H_

343
outside/re2/re2/prog.cc Normal file
View File

@ -0,0 +1,343 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Compiled regular expression representation.
// Tested by compile_test.cc
#include "util/util.h"
#include "util/sparse_set.h"
#include "re2/prog.h"
#include "re2/stringpiece.h"
namespace re2 {
// Constructors per Inst opcode
// Turns this blank instruction into an Alt whose two successors are out
// and out1.
void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
  DCHECK_EQ(out_opcode_, 0);
  out1_ = out1;
  set_out_opcode(out, kInstAlt);
}
// Turns this blank instruction into a ByteRange for [lo, hi] that
// continues at out.  Only the low eight bits of lo and hi are stored.
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
  DCHECK_EQ(out_opcode_, 0);
  foldcase_ = foldcase;
  hi_ = hi & 0xFF;
  lo_ = lo & 0xFF;
  set_out_opcode(out, kInstByteRange);
}
// Turns this blank instruction into a Capture for register cap that
// continues at out.
void Prog::Inst::InitCapture(int cap, uint32 out) {
  DCHECK_EQ(out_opcode_, 0);
  cap_ = cap;
  set_out_opcode(out, kInstCapture);
}
// Turns this blank instruction into an EmptyWidth test for the flags in
// empty that continues at out.
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
  DCHECK_EQ(out_opcode_, 0);
  empty_ = empty;
  set_out_opcode(out, kInstEmptyWidth);
}
// Turns this blank instruction into a Match carrying the match id.
void Prog::Inst::InitMatch(int32 id) {
  DCHECK_EQ(out_opcode_, 0);
  match_id_ = id;
  set_opcode(kInstMatch);
}
// Turns this blank instruction into a Nop that continues at out.
// The successor is recorded like in the other Init* methods that take
// one, instead of being dropped; out == 0 leaves out() at 0, exactly as
// if only the opcode had been set.
void Prog::Inst::InitNop(uint32 out) {
  DCHECK_EQ(out_opcode_, 0);
  set_out_opcode(out, kInstNop);
}
// Turns this blank instruction into a Fail.  Only the opcode is set, so
// out() stays 0.
void Prog::Inst::InitFail() {
  DCHECK_EQ(out_opcode_, 0);
  set_opcode(kInstFail);
}
// Returns a one-line description of this instruction for debugging.
string Prog::Inst::Dump() {
  const InstOp op = opcode();

  if (op == kInstAlt)
    return StringPrintf("alt -> %d | %d", out(), out1_);
  if (op == kInstAltMatch)
    return StringPrintf("altmatch -> %d | %d", out(), out1_);
  if (op == kInstByteRange)
    return StringPrintf("byte%s [%02x-%02x] -> %d",
                        foldcase_ ? "/i" : "",
                        lo_, hi_, out());
  if (op == kInstCapture)
    return StringPrintf("capture %d -> %d", cap_, out());
  if (op == kInstEmptyWidth)
    return StringPrintf("emptywidth %#x -> %d",
                        static_cast<int>(empty_), out());
  if (op == kInstMatch)
    return StringPrintf("match! %d", match_id());
  if (op == kInstNop)
    return StringPrintf("nop -> %d", out());
  if (op == kInstFail)
    return StringPrintf("fail");

  // Not one of the known opcodes.
  return StringPrintf("opcode %d", static_cast<int>(op));
}
// Constructs an empty program: every flag is false, every count, id and
// size is zero, and every pointer (including the delete_dfa_ callback)
// is NULL.
Prog::Prog()
: anchor_start_(false),
anchor_end_(false),
reversed_(false),
did_onepass_(false),
start_(0),
start_unanchored_(0),
size_(0),
byte_inst_count_(0),
bytemap_range_(0),
flags_(0),
onepass_statesize_(0),
inst_(NULL),
dfa_first_(NULL),
dfa_longest_(NULL),
dfa_mem_(0),
delete_dfa_(NULL),
unbytemap_(NULL),
onepass_nodes_(NULL),
onepass_start_(NULL) {
}
// Releases the DFAs through the delete_dfa_ callback (when one is set)
// and frees the one-pass nodes, the instruction array and the unbytemap
// table.
Prog::~Prog() {
  if (delete_dfa_ != NULL) {
    if (dfa_first_ != NULL)
      delete_dfa_(dfa_first_);
    if (dfa_longest_ != NULL)
      delete_dfa_(dfa_longest_);
  }

  delete[] onepass_nodes_;
  delete[] inst_;
  delete[] unbytemap_;
}
typedef SparseSet Workq;
// Inserts instruction id into q; id 0 is never queued.
static inline void AddToQueue(Workq* q, int id) {
  if (id == 0)
    return;
  q->insert(id);
}
// Dumps, one per line, every instruction reachable from the ids already
// in q.  Successors are added to q while it is being walked, so q->end()
// is re-read on every iteration.
static string ProgToString(Prog* prog, Workq* q) {
  string listing;
  for (Workq::iterator cur = q->begin(); cur != q->end(); ++cur) {
    const int id = *cur;
    Prog::Inst* inst = prog->inst(id);
    StringAppendF(&listing, "%d. %s\n", id, inst->Dump().c_str());

    AddToQueue(q, inst->out());
    // Only the two Alt flavors have a second successor.
    switch (inst->opcode()) {
      case kInstAlt:
      case kInstAltMatch:
        AddToQueue(q, inst->out1());
        break;
      default:
        break;
    }
  }
  return listing;
}
// Returns a listing of the instructions reachable from start_.
// The byte-map section is compiled in but disabled.
string Prog::Dump() {
  string bytemap_text;
  if (false) {  // Debugging
    StringAppendF(&bytemap_text, "byte map:\n");
    int range_start = 0;
    for (int c = 0; c < bytemap_range_; c++) {
      StringAppendF(&bytemap_text, "\t%d. [%02x-%02x]\n", c, range_start, unbytemap_[c]);
      range_start = unbytemap_[c] + 1;
    }
    StringAppendF(&bytemap_text, "\n");
  }

  Workq reachable(size_);
  AddToQueue(&reachable, start_);
  return bytemap_text + ProgToString(this, &reachable);
}
// Returns a listing of the instructions reachable from start_unanchored_.
string Prog::DumpUnanchored() {
  Workq reachable(size_);
  AddToQueue(&reachable, start_unanchored_);
  return ProgToString(this, &reachable);
}
static bool IsMatch(Prog*, Prog::Inst*);
// Peep-hole optimizer.
void Prog::Optimize() {
Workq q(size_);
// Eliminate nops. Most are taken out during compilation
// but a few are hard to avoid.
q.clear();
AddToQueue(&q, start_);
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
int id = *i;
Inst* ip = inst(id);
int j = ip->out();
Inst* jp;
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
j = jp->out();
}
ip->set_out(j);
AddToQueue(&q, ip->out());
if (ip->opcode() == kInstAlt) {
j = ip->out1();
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
j = jp->out();
}
ip->out1_ = j;
AddToQueue(&q, ip->out1());
}
}
// Insert kInstAltMatch instructions
// Look for
// ip: Alt -> j | k
// j: ByteRange [00-FF] -> ip
// k: Match
// or the reverse (the above is the greedy one).
// Rewrite Alt to AltMatch.
q.clear();
AddToQueue(&q, start_);
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
int id = *i;
Inst* ip = inst(id);
AddToQueue(&q, ip->out());
if (ip->opcode() == kInstAlt)
AddToQueue(&q, ip->out1());
if (ip->opcode() == kInstAlt) {
Inst* j = inst(ip->out());
Inst* k = inst(ip->out1());
if (j->opcode() == kInstByteRange && j->out() == id &&
j->lo() == 0x00 && j->hi() == 0xFF &&
IsMatch(this, k)) {
ip->set_opcode(kInstAltMatch);
continue;
}
if (IsMatch(this, j) &&
k->opcode() == kInstByteRange && k->out() == id &&
k->lo() == 0x00 && k->hi() == 0xFF) {
ip->set_opcode(kInstAltMatch);
}
}
}
}
// Is ip a guaranteed match at end of text, perhaps after some capturing?
// Is ip a guaranteed match at end of text, perhaps after some capturing?
// Follows Capture and Nop instructions to their successor; returns true
// on reaching a Match and false on any other instruction.
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
  for (;;) {
    const InstOp op = ip->opcode();

    if (op == kInstCapture || op == kInstNop) {
      ip = prog->inst(ip->out());
      continue;
    }
    if (op == kInstMatch)
      return true;
    if (op == kInstAlt || op == kInstAltMatch || op == kInstByteRange ||
        op == kInstFail || op == kInstEmptyWidth)
      return false;

    LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
    return false;
  }
}
// Returns the kEmpty* flags that hold at position p of text:
// begin/end of text and line, plus exactly one of kEmptyWordBoundary and
// kEmptyNonWordBoundary.
uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
  const bool at_begin = (p == text.begin());
  const bool at_end = (p == text.end());
  int flags = 0;

  // ^ and \A
  if (at_begin)
    flags |= kEmptyBeginText | kEmptyBeginLine;
  else if (p[-1] == '\n')
    flags |= kEmptyBeginLine;

  // $ and \z
  if (at_end)
    flags |= kEmptyEndText | kEmptyEndLine;
  else if (p < text.end() && p[0] == '\n')
    flags |= kEmptyEndLine;

  // \b and \B
  bool boundary;
  if (at_begin && at_end)
    boundary = false;  // empty text: no word boundary here
  else if (at_begin)
    boundary = IsWordChar(p[0]);
  else if (at_end)
    boundary = IsWordChar(p[-1]);
  else
    boundary = IsWordChar(p[-1]) != IsWordChar(p[0]);

  if (boundary)
    flags |= kEmptyWordBoundary;
  else
    flags |= kEmptyNonWordBoundary;

  return flags;
}
// Records that bytes in [lo, hi] are treated differently from the bytes
// outside that range, by setting bit lo-1 (when lo > 0) and bit hi in
// byterange_.
void Prog::MarkByteRange(int lo, int hi) {
  DCHECK_GE(lo, 0);
  DCHECK_GE(hi, 0);
  DCHECK_LE(lo, 255);
  DCHECK_LE(hi, 255);
  DCHECK_LE(lo, hi);

  // The DCHECKs vanish in optimized builds, so re-check the bounds here.
  const bool lo_markable = (lo > 0) && (lo <= 255);
  if (lo_markable)
    byterange_.Set(lo - 1);

  const bool hi_markable = (hi >= 0) && (hi <= 255);
  if (hi_markable)
    byterange_.Set(hi);
}
void Prog::ComputeByteMap() {
// Fill in bytemap with byte classes for prog_.
// Ranges of bytes that are treated as indistinguishable
// by the regexp program are mapped to a single byte class.
// The vector prog_->byterange() marks the end of each
// such range.
const Bitmap<256>& v = byterange();
COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
uint8 n = 0;
uint32 bits = 0;
for (int i = 0; i < 256; i++) {
if ((i&31) == 0)
bits = v.Word(i >> 5);
bytemap_[i] = n;
n += bits & 1;
bits >>= 1;
}
bytemap_range_ = bytemap_[255] + 1;
unbytemap_ = new uint8[bytemap_range_];
for (int i = 0; i < 256; i++)
unbytemap_[bytemap_[i]] = i;
if (0) { // For debugging: use trivial byte map.
for (int i = 0; i < 256; i++) {
bytemap_[i] = i;
unbytemap_[i] = i;
}
bytemap_range_ = 256;
LOG(INFO) << "Using trivial bytemap.";
}
}
} // namespace re2

376
outside/re2/re2/prog.h Normal file
View File

@ -0,0 +1,376 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Compiled representation of regular expressions.
// See regexp.h for the Regexp class, which represents a regular
// expression symbolically.
#ifndef RE2_PROG_H__
#define RE2_PROG_H__
#include "util/util.h"
#include "re2/re2.h"
namespace re2 {
// Simple fixed-size bitmap.
// Simple fixed-size bitmap of Bits bits, stored in 32-bit words.
// Bit k lives in word k >> 5 at position k & 31.  The bit masks are
// built from an unsigned one, so that bit 31 of a word does not shift
// into the sign bit of an int.
template<int Bits>
class Bitmap {
 public:
  // Starts with every bit clear.
  Bitmap() { Reset(); }

  // Number of bits in the bitmap.
  int Size() { return Bits; }

  // Clears every bit.
  void Reset() {
    for (int i = 0; i < Words; i++)
      w_[i] = 0;
  }

  // Returns whether bit k is set.
  bool Get(int k) const {
    return (w_[k >> WordLog] & (static_cast<uint32>(1) << (k & 31))) != 0;
  }

  // Sets bit k.
  void Set(int k) {
    w_[k >> WordLog] |= static_cast<uint32>(1) << (k & 31);
  }

  // Clears bit k.
  void Clear(int k) {
    w_[k >> WordLog] &= ~(static_cast<uint32>(1) << (k & 31));
  }

  // Returns the i'th 32-bit word, which holds bits 32*i .. 32*i+31.
  uint32 Word(int i) const {
    return w_[i];
  }

 private:
  static const int WordLog = 5;            // log2 of the bits per word
  static const int Words = (Bits+31)/32;   // words needed for Bits bits
  uint32 w_[Words];
  DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
};
// Opcodes for Inst
// Opcodes for Inst.
// Prog::Inst keeps the opcode in the low three bits of out_opcode_
// (opcode() masks with 7), so there is room for exactly these eight.
enum InstOp {
kInstAlt = 0, // choose between out_ and out1_
kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
kInstCapture, // capturing parenthesis number cap_
kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
kInstMatch, // found a match!
kInstNop, // no-op; occasionally unavoidable
kInstFail, // never match; occasionally unavoidable
};
// Bit flags for empty-width specials
// Bit flags for empty-width specials.
// Each flag is a distinct bit, so several can be OR'ed together.
enum EmptyOp {
kEmptyBeginLine = 1<<0, // ^ - beginning of line
kEmptyEndLine = 1<<1, // $ - end of line
kEmptyBeginText = 1<<2, // \A - beginning of text
kEmptyEndText = 1<<3, // \z - end of text
kEmptyWordBoundary = 1<<4, // \b - word boundary
kEmptyNonWordBoundary = 1<<5, // \B - not \b
kEmptyAllFlags = (1<<6)-1, // mask with all six flags above set
};
class Regexp;
class DFA;
struct OneState;
// Compiled form of regexp program.
class Prog {
public:
Prog();
~Prog();
// Single instruction in regexp program.
class Inst {
public:
Inst() : out_opcode_(0), out1_(0) { }
// Constructors per opcode
void InitAlt(uint32 out, uint32 out1);
void InitByteRange(int lo, int hi, int foldcase, uint32 out);
void InitCapture(int cap, uint32 out);
void InitEmptyWidth(EmptyOp empty, uint32 out);
void InitMatch(int id);
void InitNop(uint32 out);
void InitFail();
// Getters
int id(Prog* p) { return this - p->inst_; }
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
int out() { return out_opcode_>>3; }
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
bool greedy(Prog *p) {
DCHECK_EQ(opcode(), kInstAltMatch);
return p->inst(out())->opcode() == kInstByteRange;
}
// Does this inst (an kInstByteRange) match c?
inline bool Matches(int c) {
DCHECK_EQ(opcode(), kInstByteRange);
if (foldcase_ && 'A' <= c && c <= 'Z')
c += 'a' - 'A';
return lo_ <= c && c <= hi_;
}
// Returns string representation for debugging.
string Dump();
// Maximum instruction id.
// (Must fit in out_opcode_, and PatchList steals another bit.)
static const int kMaxInst = (1<<28) - 1;
private:
void set_opcode(InstOp opcode) {
out_opcode_ = (out()<<3) | opcode;
}
void set_out(int out) {
out_opcode_ = (out<<3) | opcode();
}
void set_out_opcode(int out, InstOp opcode) {
out_opcode_ = (out<<3) | opcode;
}
uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
union { // additional instruction arguments:
uint32 out1_; // opcode == kInstAlt
// alternate next instruction
int32 cap_; // opcode == kInstCapture
// Index of capture register (holds text
// position recorded by capturing parentheses).
// For \n (the submatch for the nth parentheses),
// the left parenthesis captures into register 2*n
// and the right one captures into register 2*n+1.
int32 match_id_; // opcode == kInstMatch
// Match ID to identify this match (for re2::Set).
struct { // opcode == kInstByteRange
uint8 lo_; // byte range is lo_-hi_ inclusive
uint8 hi_; //
uint8 foldcase_; // convert A-Z to a-z before checking range.
};
EmptyOp empty_; // opcode == kInstEmptyWidth
// empty_ is bitwise OR of kEmpty* flags above.
};
friend class Compiler;
friend struct PatchList;
friend class Prog;
DISALLOW_EVIL_CONSTRUCTORS(Inst);
};
// Whether to anchor the search.
enum Anchor {
kUnanchored, // match anywhere
kAnchored, // match only starting at beginning of text
};
// Kind of match to look for (for anchor != kFullMatch)
//
// kLongestMatch mode finds the overall longest
// match but still makes its submatch choices the way
// Perl would, not in the way prescribed by POSIX.
// The POSIX rules are much more expensive to implement,
// and no one has needed them.
//
// kFullMatch is not strictly necessary -- we could use
// kLongestMatch and then check the length of the match -- but
// the matching code can run faster if it knows to consider only
// full matches.
enum MatchKind {
kFirstMatch, // like Perl, PCRE
kLongestMatch, // like egrep or POSIX
kFullMatch, // match only entire text; implies anchor==kAnchored
kManyMatch // for SearchDFA, records set of matches
};
// Address of instruction number id in the instruction array.
// No bounds check is performed on id.
Inst *inst(int id) {
  return inst_ + id;
}

// Entry point for the program.
int start() {
  return start_;
}
void set_start(int start) {
  start_ = start;
}

// Unanchored entry point for the program.
int start_unanchored() {
  return start_unanchored_;
}
void set_start_unanchored(int start) {
  start_unanchored_ = start;
}

// Number of instructions.
int64 size() {
  return size_;
}

// Whether the program runs backward over its input.
bool reversed() {
  return reversed_;
}
void set_reversed(bool reversed) {
  reversed_ = reversed;
}

// Number of kInstByteRange instructions.
int64 byte_inst_count() {
  return byte_inst_count_;
}

// Bitmap in which bit x is set if x ends a commonly-treated byte range.
const Bitmap<256>& byterange() {
  return byterange_;
}

// Maximum memory for DFAs.
void set_dfa_mem(int64 dfa_mem) {
  dfa_mem_ = dfa_mem;
}
int64 dfa_mem() {
  return dfa_mem_;
}

// Regexp parse flags.
int flags() {
  return flags_;
}
void set_flags(int flags) {
  flags_ = flags;
}

// Whether the regexp has an explicit start anchor.
bool anchor_start() {
  return anchor_start_;
}
void set_anchor_start(bool b) {
  anchor_start_ = b;
}

// Whether the regexp has an explicit end anchor.
bool anchor_end() {
  return anchor_end_;
}
void set_anchor_end(bool b) {
  anchor_end_ = b;
}

// Upper bound on byte classes: bytemap()[x] < bytemap_range().
int bytemap_range() {
  return bytemap_range_;
}

// Map from input bytes to byte classes.
const uint8* bytemap() {
  return bytemap_;
}
// Returns string representation of program for debugging.
string Dump();
string DumpUnanchored();
// Record that at some point in the prog, the bytes in the range
// lo-hi (inclusive) are treated as different from bytes outside the range.
// Tracking this lets the DFA collapse commonly-treated byte ranges
// when recording state pointers, greatly reducing its memory footprint.
void MarkByteRange(int lo, int hi);
// Returns the set of kEmpty flags that are in effect at
// position p within context.
static uint32 EmptyFlags(const StringPiece& context, const char* p);
// Returns whether byte c is a word character: ASCII only.
// Used by the implementation of \b and \B.
// This is not right for Unicode, but:
// - it's hard to get right in a byte-at-a-time matching world
// (the DFA has only one-byte lookahead).
// - even if the lookahead were possible, the Progs would be huge.
// This crude approximation is the same one PCRE uses.
static bool IsWordChar(uint8 c) {
  // ASCII-only word class: [A-Za-z0-9_].
  if (c == '_')
    return true;
  if ('0' <= c && c <= '9')
    return true;
  const bool is_upper = 'A' <= c && c <= 'Z';
  const bool is_lower = 'a' <= c && c <= 'z';
  return is_upper || is_lower;
}
// Execution engines. They all search for the regexp (run the prog)
// in text, which is in the larger context (used for ^ $ \b etc).
// Anchor and kind control the kind of search.
// Returns true if match found, false if not.
// If match found, fills match[0..nmatch-1] with submatch info.
// match[0] is overall match, match[1] is first set of parens, etc.
// If a particular submatch is not matched during the regexp match,
// it is set to NULL.
//
// Matching text == StringPiece(NULL, 0) is treated as any other empty
// string, but note that on return, it will not be possible to distinguish
// submatches that matched that empty string from submatches that didn't
// match anything. Either way, match[i] == NULL.
// Search using NFA: can find submatches but kind of slow.
bool SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Search using DFA: much faster than NFA but only finds
// end of match and can use a lot more memory.
// Returns whether a match was found.
// If the DFA runs out of memory, sets *failed to true and returns false.
// If matches != NULL and kind == kManyMatch and there is a match,
// SearchDFA fills matches with the match IDs of the final matching state.
bool SearchDFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match0, bool* failed,
vector<int>* matches);
// Build the entire DFA for the given match kind. FOR TESTING ONLY.
// Usually the DFA is built out incrementally, as needed, which
// avoids lots of unnecessary work. This function is useful only
// for testing purposes. Returns number of states.
int BuildEntireDFA(MatchKind kind);
// Compute byte map.
void ComputeByteMap();
// Run peep-hole optimizer on program.
void Optimize();
// One-pass NFA: only correct if IsOnePass() is true,
// but much faster than NFA (competitive with PCRE)
// for those expressions.
bool IsOnePass();
bool SearchOnePass(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Bit-state backtracking. Fast on small cases but uses memory
// proportional to the product of the program size and the text size.
bool SearchBitState(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
static const int kMaxOnePassCapture = 5; // $0 through $4
// Backtracking search: the gold standard against which the other
// implementations are checked. FOR TESTING ONLY.
// It allocates a ton of memory to avoid running forever.
// It is also recursive, so can't use in production (will overflow stacks).
// The name "Unsafe" here is supposed to be a flag that
// you should not be using this function.
bool UnsafeSearchBacktrack(const StringPiece& text,
const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
// maximum desired length of string returned.
//
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
// string s that is an anchored match for this regexp satisfies
// min <= s && s <= max.
//
// Note that PossibleMatchRange() will only consider the first copy of an
// infinitely repeated element (i.e., any regexp element followed by a '*' or
// '+' operator). Regexps with "{N}" constructions are not affected, as those
// do not compile down to infinite repetitions.
//
// Returns true on success, false on error.
bool PossibleMatchRange(string* min, string* max, int maxlen);
// Compiles a collection of regexps to Prog. Each regexp will have
// its own Match instruction recording the index in the vector.
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
Regexp* re);
private:
friend class Compiler;
DFA* GetDFA(MatchKind kind);
bool anchor_start_; // regexp has explicit start anchor
bool anchor_end_; // regexp has explicit end anchor
bool reversed_; // whether program runs backward over input
bool did_onepass_; // has IsOnePass been called?
int start_; // entry point for program
int start_unanchored_; // unanchored entry point for program
int size_; // number of instructions
int byte_inst_count_; // number of kInstByteRange instructions
int bytemap_range_; // bytemap_[x] < bytemap_range_
int flags_; // regexp parse flags
int onepass_statesize_; // byte size of each OneState* node
Inst* inst_; // pointer to instruction array
Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
DFA* volatile dfa_first_; // DFA cached for kFirstMatch
DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
int64 dfa_mem_; // Maximum memory for DFAs.
void (*delete_dfa_)(DFA* dfa);
Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
// commonly-treated byte range.
uint8 bytemap_[256]; // map from input bytes to byte classes
uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
uint8* onepass_nodes_; // data for OnePass nodes
OneState* onepass_start_; // start node for OnePass program
DISALLOW_EVIL_CONSTRUCTORS(Prog);
};
} // namespace re2
#endif // RE2_PROG_H__

1218
outside/re2/re2/re2.cc Normal file

File diff suppressed because it is too large Load Diff

877
outside/re2/re2/re2.h Normal file
View File

@ -0,0 +1,877 @@
// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_RE2_H
#define RE2_RE2_H
// C++ interface to the re2 regular-expression library.
// RE2 supports Perl-style regular expressions (with extensions like
// \d, \w, \s, ...).
//
// -----------------------------------------------------------------------
// REGEXP SYNTAX:
//
// This module uses the re2 library and hence supports
// its syntax for regular expressions, which is similar to Perl's with
// some of the more complicated things thrown away. In particular,
// backreferences and generalized assertions are not available, nor is \Z.
//
// See http://code.google.com/p/re2/wiki/Syntax for the syntax
// supported by RE2, and a comparison with PCRE and PERL regexps.
//
// For those not familiar with Perl's regular expressions,
// here are some examples of the most commonly used extensions:
//
// "hello (\\w+) world" -- \w matches a "word" character
// "version (\\d+)" -- \d matches a digit
// "hello\\s+world" -- \s matches any whitespace character
// "\\b(\\w+)\\b" -- \b matches the empty string at a word boundary
// "(?i)hello" -- (?i) turns on case-insensitive matching
// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
//
// -----------------------------------------------------------------------
// MATCHING INTERFACE:
//
// The "FullMatch" operation checks that supplied text matches a
// supplied pattern exactly.
//
// Example: successful match
// CHECK(RE2::FullMatch("hello", "h.*o"));
//
// Example: unsuccessful match (requires full match):
// CHECK(!RE2::FullMatch("hello", "e"));
//
// -----------------------------------------------------------------------
// UTF-8 AND THE MATCHING INTERFACE:
//
// By default, the pattern and input text are interpreted as UTF-8.
// The RE2::Latin1 option causes them to be interpreted as Latin-1.
//
// Example:
// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
//
// -----------------------------------------------------------------------
// MATCHING WITH SUB-STRING EXTRACTION:
//
// You can supply extra pointer arguments to extract matched subpieces.
//
// Example: extracts "ruby" into "s" and 1234 into "i"
// int i;
// string s;
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
//
// Example: fails because string cannot be stored in integer
// CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
//
// Example: fails because there aren't enough sub-patterns:
// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
//
// Example: does not try to extract any extra sub-patterns
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
//
// Example: does not try to extract into NULL
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
//
// Example: integer overflow causes failure
// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
//
// NOTE(rsc): Asking for substrings slows successful matches quite a bit.
// This may get a little faster in the future, but right now is slower
// than PCRE. On the other hand, failed matches run *very* fast (faster
// than PCRE), as do matches without substring extraction.
//
// -----------------------------------------------------------------------
// PARTIAL MATCHES
//
// You can use the "PartialMatch" operation when you want the pattern
// to match any substring of the text.
//
// Example: simple search for a string:
// CHECK(RE2::PartialMatch("hello", "ell"));
//
// Example: find first number in a string
// int number;
// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
// CHECK_EQ(number, 100);
//
// -----------------------------------------------------------------------
// PRE-COMPILED REGULAR EXPRESSIONS
//
// RE2 makes it easy to use any string as a regular expression, without
// requiring a separate compilation step.
//
// If speed is of the essence, you can create a pre-compiled "RE2"
// object from the pattern and use it multiple times. If you do so,
// you can typically parse text faster than with sscanf.
//
// Example: precompile pattern for faster matching:
// RE2 pattern("h.*o");
// while (ReadLine(&str)) {
// if (RE2::FullMatch(str, pattern)) ...;
// }
//
// -----------------------------------------------------------------------
// SCANNING TEXT INCREMENTALLY
//
// The "Consume" operation may be useful if you want to repeatedly
// match regular expressions at the front of a string and skip over
// them as they match. This requires use of the "StringPiece" type,
// which represents a sub-range of a real string.
//
// Example: read lines of the form "var = value" from a string.
// string contents = ...; // Fill string somehow
// StringPiece input(contents); // Wrap a StringPiece around it
//
// string var;
// int value;
// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
// ...;
// }
//
// Each successful call to "Consume" will set "var/value", and also
// advance "input" so it points past the matched text. Note that if the
// regular expression matches an empty string, input will advance
// by 0 bytes. If the regular expression being used might match
// an empty string, the loop body must check for this case and either
// advance the string or break out of the loop.
//
// The "FindAndConsume" operation is similar to "Consume" but does not
// anchor your match at the beginning of the string. For example, you
// could extract all words from a string by repeatedly calling
// RE2::FindAndConsume(&input, "(\\w+)", &word)
//
// -----------------------------------------------------------------------
// USING VARIABLE NUMBER OF ARGUMENTS
//
// The above operations require you to know the number of arguments
// when you write the code. This is not always possible or easy (for
// example, the regular expression may be calculated at run time).
// You can use the "N" version of the operations when the number of
// match arguments is determined at run time.
//
// Example:
// const RE2::Arg* args[10];
// int n;
// // ... populate args with pointers to RE2::Arg values ...
// // ... set n to the number of RE2::Arg objects ...
// bool match = RE2::FullMatchN(input, pattern, args, n);
//
// The last statement is equivalent to
//
// bool match = RE2::FullMatch(input, pattern,
// *args[0], *args[1], ..., *args[n - 1]);
//
// -----------------------------------------------------------------------
// PARSING HEX/OCTAL/C-RADIX NUMBERS
//
// By default, if you pass a pointer to a numeric value, the
// corresponding text is interpreted as a base-10 number. You can
// instead wrap the pointer with a call to one of the operators Hex(),
// Octal(), or CRadix() to interpret the text in another base. The
// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
// prefixes, but defaults to base-10.
//
// Example:
// int a, b, c, d;
// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)));
// will leave 64 in a, b, c, and d.
#include <stdint.h>
#include <map>
#include <string>
#include "re2/stringpiece.h"
#include "re2/variadic_function.h"
#ifndef RE2_HAVE_LONGLONG
#define RE2_HAVE_LONGLONG 1
#endif
namespace re2 {
using std::string;
using std::map;
class Mutex;
class Prog;
class Regexp;
// The following enum should be used only as a constructor argument to indicate
// that the variable has static storage class, and that the constructor should
// do nothing to its state. It indicates to the reader that it is legal to
// declare a static instance of the class, provided the constructor is given
// the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a
// static variable that has a constructor or a destructor because invocation
// order is undefined. However, IF the type can be initialized by filling with
// zeroes (which the loader does for static variables), AND the type's
// destructor does nothing to the storage, then a constructor for static
// initialization can be declared as
// explicit MyClass(LinkerInitialized x) {}
// and invoked as
// static MyClass my_variable_name(LINKER_INITIALIZED);
enum LinkerInitialized {
  // Sole enumerator; passed as a constructor argument to select the
  // do-nothing, linker-initialized constructor.
  LINKER_INITIALIZED
};
// Interface for regular expression matching. Also corresponds to a
// pre-compiled regular expression. An "RE2" object is safe for
// concurrent use by multiple threads.
class RE2 {
public:
// We convert user-passed pointers into special Arg objects
class Arg;
class Options;
// Defined in set.h.
class Set;
enum ErrorCode {
  // Success; guaranteed to be zero.
  NoError = 0,

  // Unexpected error.
  ErrorInternal,

  // Parse errors follow.

  // Bad escape sequence.
  ErrorBadEscape,
  // Bad character class.
  ErrorBadCharClass,
  // Bad character class range.
  ErrorBadCharRange,
  // Missing closing ].
  ErrorMissingBracket,
  // Missing closing ).
  ErrorMissingParen,
  // Trailing backslash at end of regexp.
  ErrorTrailingBackslash,
  // Repeat argument missing, e.g. "*".
  ErrorRepeatArgument,
  // Bad repetition argument.
  ErrorRepeatSize,
  // Bad repetition operator.
  ErrorRepeatOp,
  // Bad perl operator.
  ErrorBadPerlOp,
  // Invalid UTF-8 in regexp.
  ErrorBadUTF8,
  // Bad named capture group.
  ErrorBadNamedCapture,
  // Pattern too large (compile failed).
  ErrorPatternTooLarge
};
// Predefined common options.
// If you need more complicated things, instantiate
// an Options object, possibly passing one of these to
// the Options constructor, change the settings, and pass that
// Options object to the RE2 constructor.
enum CannedOptions {
  // Guaranteed to be zero.
  DefaultOptions = 0,

  // Treat input as Latin-1 (default UTF-8).
  Latin1,

  // POSIX syntax, leftmost-longest match.
  POSIX,

  // Do not log about regexp parse errors.
  Quiet
};
// Need to have the const char* and const string& forms for implicit
// conversions when passing string literals to FullMatch and PartialMatch.
// Otherwise the StringPiece form would be sufficient.
#ifndef SWIG
RE2(const char* pattern);
RE2(const string& pattern);
#endif
RE2(const StringPiece& pattern);
RE2(const StringPiece& pattern, const Options& option);
~RE2();
// Returns whether RE2 was created properly.
bool ok() const {
  // Success is defined as the recorded error code being NoError.
  return error_code() == NoError;
}
// The string specification for this RE2. E.g.
// RE2 re("ab*c?d+");
// re.pattern(); // "ab*c?d+"
const string& pattern() const {
  // pattern_ stores the regular expression in string form.
  return pattern_;
}
// If RE2 could not be created properly, returns an error string.
// Else returns the empty string.
const string& error() const { return *error_; }
// If RE2 could not be created properly, returns an error code.
// Else returns RE2::NoError (== 0).
ErrorCode error_code() const {
  // Plain read of the stored error code.
  return error_code_;
}
// If RE2 could not be created properly, returns the offending
// portion of the regexp.
const string& error_arg() const {
  // error_arg_ holds the fragment of the regexp showing the error.
  return error_arg_;
}
// Returns the program size, a very approximate measure of a regexp's "cost".
// Larger numbers are more expensive than smaller numbers.
int ProgramSize() const;
// Returns the underlying Regexp; not for general use.
// Returns entire_regexp_ so that callers don't need
// to know about prefix_ and prefix_foldcase_.
re2::Regexp* Regexp() const {
  // Hands back the parsed form of the whole regular expression.
  return entire_regexp_;
}
/***** The useful part: the matching interface *****/
// Matches "text" against "pattern". If pointer arguments are
// supplied, copies matched sub-patterns into them.
//
// You can pass in a "const char*" or a "string" for "text".
// You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
//
// The provided pointer arguments can be pointers to any scalar numeric
// type, or one of:
// string (matched piece is copied to string)
// StringPiece (StringPiece is mutated to point to matched piece)
// T (where "bool T::ParseFrom(const char*, int)" exists)
// (void*)NULL (the corresponding matched sub-pattern is not copied)
//
// Returns true iff all of the following conditions are satisfied:
// a. "text" matches "pattern" exactly
// b. The number of matched sub-patterns is >= number of supplied pointers
// c. The "i"th argument has a suitable type for holding the
// string captured as the "i"th sub-pattern. If you pass in
// NULL for the "i"th argument, or pass fewer arguments than
// number of sub-patterns, "i"th captured sub-pattern is
// ignored.
//
// CAVEAT: An optional sub-pattern that does not exist in the
// matched string is assigned the empty string. Therefore, the
// following will return false (because the empty string is not a
// valid number):
// int number;
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
static bool FullMatchN(const StringPiece& text, const RE2& re,
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
// Exactly like FullMatch(), except that "pattern" is allowed to match
// a substring of "text".
static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
// Like FullMatch() and PartialMatch(), except that pattern has to
// match a prefix of "text", and "input" is advanced past the matched
// text. Note: "input" is modified iff this routine returns true.
static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
// Like Consume(..), but does not anchor the match at the beginning of the
// string. That is, "pattern" need not start its match at the beginning of
// "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
// word in "s" and stores it in "word".
static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
// Replace the first match of "pattern" in "str" with "rewrite".
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
// used to insert text matching corresponding parenthesized group
// from the pattern. \0 in "rewrite" refers to the entire matching
// text. E.g.,
//
// string s = "yabba dabba doo";
// CHECK(RE2::Replace(&s, "b+", "d"));
//
// will leave "s" containing "yada dabba doo"
//
// Returns true if the pattern matches and a replacement occurs,
// false otherwise.
static bool Replace(string *str,
const RE2& pattern,
const StringPiece& rewrite);
// Like Replace(), except replaces successive non-overlapping occurrences
// of the pattern in the string with the rewrite. E.g.
//
// string s = "yabba dabba doo";
// CHECK(RE2::GlobalReplace(&s, "b+", "d"));
//
// will leave "s" containing "yada dada doo"
// Replacements are not subject to re-matching.
//
// Because GlobalReplace only replaces non-overlapping matches,
// replacing "ana" within "banana" makes only one replacement, not two.
//
// Returns the number of replacements made.
static int GlobalReplace(string *str,
const RE2& pattern,
const StringPiece& rewrite);
// Like Replace, except that if the pattern matches, "rewrite"
// is copied into "out" with substitutions. The non-matching
// portions of "text" are ignored.
//
// Returns true iff a match occurred and the extraction happened
// successfully; if no match occurs, the string is left unaffected.
static bool Extract(const StringPiece &text,
const RE2& pattern,
const StringPiece &rewrite,
string *out);
// Escapes all potentially meaningful regexp characters in
// 'unquoted'. The returned string, used as a regular expression,
// will exactly match the original string. For example,
// 1.5-2.0?
// may become:
// 1\.5\-2\.0\?
static string QuoteMeta(const StringPiece& unquoted);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
// maximum desired length of string returned.
//
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
// string s that is an anchored match for this regexp satisfies
// min <= s && s <= max.
//
// Note that PossibleMatchRange() will only consider the first copy of an
// infinitely repeated element (i.e., any regexp element followed by a '*' or
// '+' operator). Regexps with "{N}" constructions are not affected, as those
// do not compile down to infinite repetitions.
//
// Returns true on success, false on error.
bool PossibleMatchRange(string* min, string* max, int maxlen) const;
// Generic matching interface
// Type of match.
enum Anchor {
  // No anchoring.
  UNANCHORED,

  // Anchor at start only.
  ANCHOR_START,

  // Anchor at start and end.
  ANCHOR_BOTH
};
// Return the number of capturing subpatterns, or -1 if the
// regexp wasn't valid on construction. The overall match ($0)
// does not count: if the regexp is "(a)(b)", returns 2.
int NumberOfCapturingGroups() const;
// Return a map from names to capturing indices.
// The map records the index of the leftmost group
// with the given name.
// Only valid until the re is deleted.
const map<string, int>& NamedCapturingGroups() const;
// Return a map from capturing indices to names.
// The map has no entries for unnamed groups.
// Only valid until the re is deleted.
const map<int, string>& CapturingGroupNames() const;
// General matching routine.
// Match against text starting at offset startpos
// and stopping the search at offset endpos.
// Returns true if match found, false if not.
// On a successful match, fills in match[] (up to nmatch entries)
// with information about submatches.
// I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
// setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
// match[3] = NULL, ..., up to match[nmatch-1] = NULL.
//
// Don't ask for more match information than you will use:
// runs much faster with nmatch == 1 than nmatch > 1, and
// runs even faster if nmatch == 0.
// Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),
// but will be handled correctly.
//
// Passing text == StringPiece(NULL, 0) will be handled like any other
// empty string, but note that on return, it will not be possible to tell
// whether submatch i matched the empty string or did not match:
// either way, match[i] == NULL.
bool Match(const StringPiece& text,
int startpos,
int endpos,
Anchor anchor,
StringPiece *match,
int nmatch) const;
// Check that the given rewrite string is suitable for use with this
// regular expression. It checks that:
// * The regular expression has enough parenthesized subexpressions
// to satisfy all of the \N tokens in rewrite
// * The rewrite string doesn't have any syntax errors. E.g.,
// '\' followed by anything other than a digit or '\'.
// A true return value guarantees that Replace() and Extract() won't
// fail because of a bad rewrite string.
bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
// Returns the maximum submatch needed for the rewrite to be done by
// Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.
static int MaxSubmatch(const StringPiece& rewrite);
// Append the "rewrite" string, with backslash substitutions from "vec",
// to string "out".
// Returns true on success. This method can fail because of a malformed
// rewrite string. CheckRewriteString guarantees that the rewrite will
// be successful.
bool Rewrite(string *out,
const StringPiece &rewrite,
const StringPiece* vec,
int veclen) const;
// Constructor options
class Options {
 public:
  // Settings held by this class, with each default in parentheses:
  //
  //   utf8             (true)  text and pattern are UTF-8; otherwise Latin-1
  //   posix_syntax     (false) restrict regexps to POSIX egrep syntax
  //   longest_match    (false) search for longest match, not first match
  //   log_errors       (true)  log syntax and execution errors to ERROR
  //   max_mem          (see below)  approx. max memory footprint of RE2
  //   literal          (false) interpret string as literal, not regexp
  //   never_nl         (false) never match \n, even if it is in regexp
  //   dot_nl           (false) dot matches everything including new line
  //   never_capture    (false) parse all parens as non-capturing
  //   case_sensitive   (true)  match is case-sensitive (regexp can override
  //                            with (?i) unless in posix_syntax mode)
  //
  // The next three are consulted only when posix_syntax == true. When
  // posix_syntax == false the features are always enabled and cannot be
  // turned off.
  //
  //   perl_classes     (false) allow Perl's \d \s \w \D \S \W
  //   word_boundary    (false) allow Perl's \b \B (word boundary and not)
  //   one_line         (false) ^ and $ only match beginning and end of text
  //
  // About max_mem: it limits how much memory may be used to hold the
  // compiled form of the regexp (the Prog) together with its cached DFA
  // graphs. Code Search placed limits on the number of Prog instructions
  // and DFA states: 10,000 for both. In RE2, those limits would translate
  // to about 240 KB per Prog and perhaps 2.5 MB per DFA (DFA state sizes
  // vary by regexp; RE2 does a better job of keeping them small than
  // Code Search did).
  //
  // Each RE2 has two Progs (one forward, one reverse), and each Prog can
  // have two DFAs (one first match, one longest match), for 4 DFAs:
  //
  //   forward, first-match    - used for UNANCHORED or ANCHOR_START searches
  //                             if opt.longest_match() == false
  //   forward, longest-match  - used for all ANCHOR_BOTH searches,
  //                             and the other two kinds if
  //                             opt.longest_match() == true
  //   reverse, first-match    - never used
  //   reverse, longest-match  - used as second phase for unanchored searches
  //
  // The budget is split statically, first between the two Progs and then
  // among the DFAs: two thirds go to the forward Prog and one third to the
  // reverse Prog. The forward Prog gives half of what it has left over to
  // each of its DFAs; the reverse Prog gives it all to its longest-match
  // DFA.
  //
  // Once a DFA fills its budget, it flushes its cache and starts over.
  // If this happens too often, RE2 falls back on the NFA implementation.
  // For now, make the default budget something close to Code Search.
  static const int kDefaultMaxMem = 8<<20;

  // Text encodings understood by the parser and matcher.
  enum Encoding {
    EncodingUTF8 = 1,
    EncodingLatin1
  };

  // Builds an Options object carrying the defaults listed above.
  Options()
      : encoding_(EncodingUTF8),
        posix_syntax_(false),
        longest_match_(false),
        log_errors_(true),
        max_mem_(kDefaultMaxMem),
        literal_(false),
        never_nl_(false),
        dot_nl_(false),
        never_capture_(false),
        case_sensitive_(true),
        perl_classes_(false),
        word_boundary_(false),
        one_line_(false) {}

  // Converting constructor from one of the canned option sets.
  /*implicit*/ Options(CannedOptions);

  Encoding encoding() const {
    return encoding_;
  }
  void set_encoding(Encoding value) {
    encoding_ = value;
  }

  // Legacy interface to encoding.
  // TODO(rsc): Remove once clients have been converted.
  bool utf8() const {
    return encoding_ == EncodingUTF8;
  }
  void set_utf8(bool value) {
    // Anything that is not UTF-8 is treated as Latin-1.
    encoding_ = value ? EncodingUTF8 : EncodingLatin1;
  }

  bool posix_syntax() const {
    return posix_syntax_;
  }
  void set_posix_syntax(bool value) {
    posix_syntax_ = value;
  }

  bool longest_match() const {
    return longest_match_;
  }
  void set_longest_match(bool value) {
    longest_match_ = value;
  }

  bool log_errors() const {
    return log_errors_;
  }
  void set_log_errors(bool value) {
    log_errors_ = value;
  }

  int64_t max_mem() const {
    return max_mem_;
  }
  void set_max_mem(int64_t limit) {
    max_mem_ = limit;
  }

  bool literal() const {
    return literal_;
  }
  void set_literal(bool value) {
    literal_ = value;
  }

  bool never_nl() const {
    return never_nl_;
  }
  void set_never_nl(bool value) {
    never_nl_ = value;
  }

  bool dot_nl() const {
    return dot_nl_;
  }
  void set_dot_nl(bool value) {
    dot_nl_ = value;
  }

  bool never_capture() const {
    return never_capture_;
  }
  void set_never_capture(bool value) {
    never_capture_ = value;
  }

  bool case_sensitive() const {
    return case_sensitive_;
  }
  void set_case_sensitive(bool value) {
    case_sensitive_ = value;
  }

  bool perl_classes() const {
    return perl_classes_;
  }
  void set_perl_classes(bool value) {
    perl_classes_ = value;
  }

  bool word_boundary() const {
    return word_boundary_;
  }
  void set_word_boundary(bool value) {
    word_boundary_ = value;
  }

  bool one_line() const {
    return one_line_;
  }
  void set_one_line(bool value) {
    one_line_ = value;
  }

  // Overwrites every setting of this object with the one from other.
  void Copy(const Options& other) {
    encoding_ = other.encoding_;
    posix_syntax_ = other.posix_syntax_;
    longest_match_ = other.longest_match_;
    log_errors_ = other.log_errors_;
    max_mem_ = other.max_mem_;
    literal_ = other.literal_;
    never_nl_ = other.never_nl_;
    dot_nl_ = other.dot_nl_;
    never_capture_ = other.never_capture_;
    case_sensitive_ = other.case_sensitive_;
    perl_classes_ = other.perl_classes_;
    word_boundary_ = other.word_boundary_;
    one_line_ = other.one_line_;
  }

  int ParseFlags() const;

 private:
  Encoding encoding_;
  bool posix_syntax_;
  bool longest_match_;
  bool log_errors_;
  int64_t max_mem_;
  bool literal_;
  bool never_nl_;
  bool dot_nl_;
  bool never_capture_;
  bool case_sensitive_;
  bool perl_classes_;
  bool word_boundary_;
  bool one_line_;

  // Copy construction and assignment are declared private (spelled out by
  // hand in place of DISALLOW_EVIL_CONSTRUCTORS(Options)); Copy() is the
  // public way to duplicate settings.
  Options(const Options&);
  void operator=(const Options&);
};
// Returns the options set in the constructor.
const Options& options() const {
  // options_ holds this RE2's option flags. The stray ';' that used to
  // follow the closing brace was removed: it is an empty declaration that
  // triggers extra-semicolon warnings.
  return options_;
}
// Argument converters; see below.
static inline Arg CRadix(short* x);
static inline Arg CRadix(unsigned short* x);
static inline Arg CRadix(int* x);
static inline Arg CRadix(unsigned int* x);
static inline Arg CRadix(long* x);
static inline Arg CRadix(unsigned long* x);
#ifdef RE2_HAVE_LONGLONG
static inline Arg CRadix(long long* x);
static inline Arg CRadix(unsigned long long* x);
#endif
static inline Arg Hex(short* x);
static inline Arg Hex(unsigned short* x);
static inline Arg Hex(int* x);
static inline Arg Hex(unsigned int* x);
static inline Arg Hex(long* x);
static inline Arg Hex(unsigned long* x);
#ifdef RE2_HAVE_LONGLONG
static inline Arg Hex(long long* x);
static inline Arg Hex(unsigned long long* x);
#endif
static inline Arg Octal(short* x);
static inline Arg Octal(unsigned short* x);
static inline Arg Octal(int* x);
static inline Arg Octal(unsigned int* x);
static inline Arg Octal(long* x);
static inline Arg Octal(unsigned long* x);
#ifdef RE2_HAVE_LONGLONG
static inline Arg Octal(long long* x);
static inline Arg Octal(unsigned long long* x);
#endif
private:
// Internal helpers; declared here, defined outside this header section.
void Init(const StringPiece& pattern, const Options& options);
bool DoMatch(const StringPiece& text,
Anchor anchor,
int* consumed,
const Arg* const args[],
int n) const;
re2::Prog* ReverseProg() const;
// Members marked `mutable` below can be written from const member functions.
mutable Mutex* mutex_;
string pattern_; // string regular expression
Options options_; // option flags
string prefix_; // required prefix (before regexp_)
bool prefix_foldcase_; // prefix is ASCII case-insensitive
re2::Regexp* entire_regexp_; // parsed regular expression
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
re2::Prog* prog_; // compiled program for regexp
mutable re2::Prog* rprog_; // reverse program for regexp
bool is_one_pass_; // can use prog_->SearchOnePass?
mutable const string* error_; // Error indicator
// (or points to empty string)
mutable ErrorCode error_code_; // Error code
mutable string error_arg_; // Fragment of regexp showing error
mutable int num_captures_; // Number of capturing groups
// Map from capture names to indices
mutable const map<string, int>* named_groups_;
// Map from capture indices to names
mutable const map<int, string>* group_names_;
//DISALLOW_EVIL_CONSTRUCTORS(RE2);
// Copy construction and assignment are declared private, so RE2 objects
// cannot be copied by outside code.
RE2(const RE2&);
void operator=(const RE2&);
};
/***** Implementation details *****/
// Hex/Octal/Binary?
// Adapter for destination types that supply their own
// ParseFrom(const char*, int) method.
template <class T>
class _RE2_MatchObject {
 public:
  // Hands str[0..n) to dest->ParseFrom() and returns its result.
  // A NULL dest means the value is not wanted, which counts as success.
  static inline bool Parse(const char* str, int n, void* dest) {
    T* target = reinterpret_cast<T*>(dest);
    return target == NULL || target->ParseFrom(str, n);
  }
};
// One destination for a captured submatch: an untyped pointer to the caller's
// variable plus the function that parses matched text into it.
class RE2::Arg {
 public:
  // Empty constructor so we can declare arrays of RE2::Arg
  Arg();

  // Constructor specially designed for NULL arguments
  Arg(void*);

  // Signature shared by all parsers: parse str[0..n) into *dest and report
  // success.
  typedef bool (*Parser)(const char* str, int n, void* dest);

// Type-specific parsers
// MAKE_PARSER(type, name) declares two constructors taking a type*: one that
// uses the default parser `name`, and one that takes an explicit parser.
// The last line of the definition must NOT end in a backslash: a continuation
// there splices the following MAKE_PARSER(...) use into the macro body.
#define MAKE_PARSER(type,name) \
  Arg(type* p) : arg_(p), parser_(name) { } \
  Arg(type* p, Parser parser) : arg_(p), parser_(parser) { }

  MAKE_PARSER(char, parse_char);
  MAKE_PARSER(signed char, parse_char);
  MAKE_PARSER(unsigned char, parse_uchar);
  MAKE_PARSER(short, parse_short);
  MAKE_PARSER(unsigned short, parse_ushort);
  MAKE_PARSER(int, parse_int);
  MAKE_PARSER(unsigned int, parse_uint);
  MAKE_PARSER(long, parse_long);
  MAKE_PARSER(unsigned long, parse_ulong);
#ifdef RE2_HAVE_LONGLONG
  MAKE_PARSER(long long, parse_longlong);
  MAKE_PARSER(unsigned long long, parse_ulonglong);
#endif
  MAKE_PARSER(float, parse_float);
  MAKE_PARSER(double, parse_double);
  MAKE_PARSER(string, parse_string);
  MAKE_PARSER(StringPiece, parse_stringpiece);

#undef MAKE_PARSER

  // Generic constructor
  template <class T> Arg(T*, Parser parser);

  // Generic constructor template
  // Used for any other pointee type; parsing is delegated to T::ParseFrom()
  // through _RE2_MatchObject<T>.
  template <class T> Arg(T* p)
      : arg_(p), parser_(_RE2_MatchObject<T>::Parse) {
  }

  // Parse the data
  bool Parse(const char* str, int n) const;

 private:
  void* arg_;      // destination passed to parser_
  Parser parser_;  // function invoked by Parse()

  static bool parse_null (const char* str, int n, void* dest);
  static bool parse_char (const char* str, int n, void* dest);
  static bool parse_uchar (const char* str, int n, void* dest);
  static bool parse_float (const char* str, int n, void* dest);
  static bool parse_double (const char* str, int n, void* dest);
  static bool parse_string (const char* str, int n, void* dest);
  static bool parse_stringpiece (const char* str, int n, void* dest);

// DECLARE_INTEGER_PARSER(name) declares, for one integer type, the private
// default and radix-taking parsers and the public hex/octal/cradix parsers.
#define DECLARE_INTEGER_PARSER(name) \
 private: \
  static bool parse_ ## name(const char* str, int n, void* dest); \
  static bool parse_ ## name ## _radix( \
      const char* str, int n, void* dest, int radix); \
 public: \
  static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
  static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
  static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)

  DECLARE_INTEGER_PARSER(short);
  DECLARE_INTEGER_PARSER(ushort);
  DECLARE_INTEGER_PARSER(int);
  DECLARE_INTEGER_PARSER(uint);
  DECLARE_INTEGER_PARSER(long);
  DECLARE_INTEGER_PARSER(ulong);
#ifdef RE2_HAVE_LONGLONG
  DECLARE_INTEGER_PARSER(longlong);
  DECLARE_INTEGER_PARSER(ulonglong);
#endif

#undef DECLARE_INTEGER_PARSER
};
// Default Arg: no destination, paired with the null parser.
inline RE2::Arg::Arg()
    : arg_(NULL),
      parser_(parse_null) {
}

// Arg built from an untyped pointer: stores the pointer but still uses the
// null parser.
inline RE2::Arg::Arg(void* p)
    : arg_(p),
      parser_(parse_null) {
}

// Runs the stored parser on str[0..n) with the stored destination.
inline bool RE2::Arg::Parse(const char* str, int n) const {
  return parser_(str, n, arg_);
}
// This part of the parser, appropriate only for ints, deals with bases
// MAKE_INTEGER_PARSER(type, name) defines the inline RE2::Hex, RE2::Octal and
// RE2::CRadix overloads for `type`. Each returns an Arg pairing the caller's
// pointer with RE2::Arg::parse_<name>_hex, _octal or _cradix respectively.
#define MAKE_INTEGER_PARSER(type, name) \
inline RE2::Arg RE2::Hex(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
inline RE2::Arg RE2::Octal(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
inline RE2::Arg RE2::CRadix(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
MAKE_INTEGER_PARSER(short, short)
MAKE_INTEGER_PARSER(unsigned short, ushort)
MAKE_INTEGER_PARSER(int, int)
MAKE_INTEGER_PARSER(unsigned int, uint)
MAKE_INTEGER_PARSER(long, long)
MAKE_INTEGER_PARSER(unsigned long, ulong)
// long long variants are generated only when RE2_HAVE_LONGLONG is defined.
#ifdef RE2_HAVE_LONGLONG
MAKE_INTEGER_PARSER(long long, longlong)
MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
#endif
#undef MAKE_INTEGER_PARSER
} // namespace re2
using re2::RE2;
#endif /* RE2_RE2_H */

931
outside/re2/re2/regexp.cc Normal file
View File

@ -0,0 +1,931 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression representation.
// Tested by parse_test.cc
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/stringpiece.h"
#include "re2/walker-inl.h"
namespace re2 {
// Constructor. Produces a childless node for operator `op` holding a single
// reference, with its operator-specific storage zero-filled.
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
    : op_(op),
      simple_(false),
      parse_flags_(static_cast<uint16>(parse_flags)),
      ref_(1),
      nsub_(0),
      down_(NULL) {
  subone_ = NULL;
  memset(the_union_, 0, sizeof(the_union_));
}
// Destructor. Assumes already cleaned up children.
// Private: use Decref() instead of delete to destroy Regexps.
// Can't call Decref on the sub-Regexps here because
// that could cause arbitrarily deep recursion, so
// required Decref() to have handled them for us.
Regexp::~Regexp() {
  if (nsub_ > 0)
    LOG(DFATAL) << "Regexp not destroyed.";

  // Release the operator-specific payload owned by this node.
  switch (op_) {
    default:
      break;
    case kRegexpCapture:
      delete name_;
      break;
    case kRegexpLiteralString:
      delete[] runes_;
      break;
    case kRegexpCharClass:
      // cc_ can be NULL here (CharClass::Delete() explicitly tolerates it).
      // Check at the call site: invoking a member function through a null
      // pointer is undefined behavior, and compilers may optimize away a
      // `this == NULL` test inside the callee.
      if (cc_ != NULL)
        cc_->Delete();
      delete ccb_;
      break;
  }
}
// Deletes this node and returns true when it has no subexpressions, so no
// traversal is needed. Otherwise leaves the node alone and returns false.
bool Regexp::QuickDestroy() {
  if (nsub_ != 0)
    return false;
  delete this;
  return true;
}
// Overflow table for reference counts: once a Regexp's inline ref_ reaches
// kMaxRef, its count is kept here, keyed by object address. Allocated on
// first use by Incref().
static map<Regexp*, int> *ref_map;
// Locked around every ref_map access in Ref(), Incref() and Decref().
GLOBAL_MUTEX(ref_mutex);
// Returns the current reference count. Counts below kMaxRef live in ref_;
// otherwise the count is read from the overflow map under ref_mutex
// (0 if the map has never been allocated).
int Regexp::Ref() {
  if (ref_ < kMaxRef)
    return ref_;

  GLOBAL_MUTEX_LOCK(ref_mutex);
  int overflow_count = 0;
  if (ref_map != NULL)
    overflow_count = (*ref_map)[this];
  GLOBAL_MUTEX_UNLOCK(ref_mutex);
  return overflow_count;
}
// Adds one reference and returns this object for call chaining.
// Counts that no longer fit in ref_ spill into the overflow map.
Regexp* Regexp::Incref() {
  // Common case: the inline counter still has room.
  if (ref_ < kMaxRef-1) {
    ref_++;
    return this;
  }

  // Store ref count in overflow map.
  GLOBAL_MUTEX_LOCK(ref_mutex);
  if (ref_map == NULL)
    ref_map = new map<Regexp*, int>;
  if (ref_ != kMaxRef) {
    // overflowing now
    (*ref_map)[this] = kMaxRef;
    ref_ = kMaxRef;
  } else {
    // already overflowed
    (*ref_map)[this]++;
  }
  GLOBAL_MUTEX_UNLOCK(ref_mutex);
  return this;
}
// Drops one reference; destroys the object when the count reaches 0.
void Regexp::Decref() {
  if (ref_ != kMaxRef) {
    // Inline counter: decrement, and tear down on the last reference.
    ref_--;
    if (ref_ == 0)
      Destroy();
    return;
  }

  // Ref count is stored in overflow map.
  GLOBAL_MUTEX_LOCK(ref_mutex);
  const int remaining = (*ref_map)[this] - 1;
  if (remaining >= kMaxRef) {
    (*ref_map)[this] = remaining;
  } else {
    // Fits inline again: move the count back into ref_.
    ref_ = remaining;
    ref_map->erase(this);
  }
  GLOBAL_MUTEX_UNLOCK(ref_mutex);
}
// Deletes this object; ref count has reached 0.
// Subexpressions whose count also drops to 0 are deleted too, using the
// down_ pointers as an intrusive linked stack instead of recursion.
void Regexp::Destroy() {
// Leaf nodes need no traversal.
if (QuickDestroy())
return;
// Handle recursive Destroy with explicit stack
// to avoid arbitrarily deep recursion on process stack [sigh].
down_ = NULL;
Regexp* stack = this;
while (stack != NULL) {
// Pop the next node to tear down.
Regexp* re = stack;
stack = re->down_;
if (re->ref_ != 0)
LOG(DFATAL) << "Bad reference count " << re->ref_;
if (re->nsub_ > 0) {
Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
Regexp* sub = subs[i];
if (sub == NULL)
continue;
// Drop this node's reference to the child. Overflowed counts must go
// through Decref() so the overflow map is updated.
if (sub->ref_ == kMaxRef)
sub->Decref();
else
--sub->ref_;
// A child that cannot be deleted on the spot is pushed for later.
if (sub->ref_ == 0 && !sub->QuickDestroy()) {
sub->down_ = stack;
stack = sub;
}
}
// The subexpression array is heap-allocated only for more than one child.
if (re->nsub_ > 1)
delete[] subs;
// Zero the count so the destructor does not complain about live children.
re->nsub_ = 0;
}
delete re;
}
}
// Appends r to the rune array of a kRegexpLiteralString node.
// Storage starts at 8 runes and doubles whenever the current length is a
// power of two that is at least 8.
void Regexp::AddRuneToString(Rune r) {
  DCHECK(op_ == kRegexpLiteralString);
  if (nrunes_ == 0) {
    // start with 8
    runes_ = new Rune[8];
  } else {
    const bool at_power_of_two = (nrunes_ & (nrunes_ - 1)) == 0;
    if (nrunes_ >= 8 && at_power_of_two) {
      // double on powers of two
      Rune* grown = new Rune[nrunes_ * 2];
      for (int i = 0; i < nrunes_; i++)
        grown[i] = runes_[i];
      delete[] runes_;
      runes_ = grown;
    }
  }
  runes_[nrunes_++] = r;
}
// Creates a kRegexpHaveMatch node carrying the given match id.
Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
  Regexp* node = new Regexp(kRegexpHaveMatch, flags);
  node->match_id_ = match_id;
  return node;
}
// Wraps sub in a kRegexpPlus node. If sub is already a plus with identical
// flags, it is returned unchanged.
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
  const bool already_plus =
      sub->op() == kRegexpPlus && sub->parse_flags() == flags;
  if (already_plus)
    return sub;

  Regexp* node = new Regexp(kRegexpPlus, flags);
  node->AllocSub(1);
  Regexp** children = node->sub();
  children[0] = sub;
  return node;
}
// Wraps sub in a kRegexpStar node. If sub is already a star with identical
// flags, it is returned unchanged.
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
  const bool already_star =
      sub->op() == kRegexpStar && sub->parse_flags() == flags;
  if (already_star)
    return sub;

  Regexp* node = new Regexp(kRegexpStar, flags);
  node->AllocSub(1);
  Regexp** children = node->sub();
  children[0] = sub;
  return node;
}
// Wraps sub in a kRegexpQuest node. If sub is already a quest with identical
// flags, it is returned unchanged.
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
  const bool already_quest =
      sub->op() == kRegexpQuest && sub->parse_flags() == flags;
  if (already_quest)
    return sub;

  Regexp* node = new Regexp(kRegexpQuest, flags);
  node->AllocSub(1);
  Regexp** children = node->sub();
  children[0] = sub;
  return node;
}
// Builds a node of kind `op` (concatenation or alternation) over
// sub[0..nsub). The sub pointers are stored as-is; no Incref is done here.
// When op is kRegexpAlternate and can_factor is true, the operands are first
// run through FactorAlternation on a private copy of the array.
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
ParseFlags flags, bool can_factor) {
// A single operand needs no wrapper node.
if (nsub == 1)
return sub[0];
Regexp** subcopy = NULL;
if (op == kRegexpAlternate && can_factor) {
// Going to edit sub; make a copy so we don't step on caller.
subcopy = new Regexp*[nsub];
memmove(subcopy, sub, nsub * sizeof sub[0]);
sub = subcopy;
nsub = FactorAlternation(sub, nsub, flags);
// Factoring may have collapsed everything into one operand.
if (nsub == 1) {
Regexp* re = sub[0];
delete[] subcopy;
return re;
}
}
if (nsub > kMaxNsub) {
// Too many subexpressions to fit in a single Regexp.
// Make a two-level tree. Two levels gets us to 65535^2.
int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
Regexp* re = new Regexp(op, flags);
re->AllocSub(nbigsub);
Regexp** subs = re->sub();
// Every chunk but the last holds exactly kMaxNsub operands; the recursive
// calls pass can_factor=false.
for (int i = 0; i < nbigsub - 1; i++)
subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
// The last chunk takes whatever remains.
subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
nsub - (nbigsub-1)*kMaxNsub, flags,
false);
delete[] subcopy;
return re;
}
// Ordinary case: one node holding all operands directly.
Regexp* re = new Regexp(op, flags);
re->AllocSub(nsub);
Regexp** subs = re->sub();
for (int i = 0; i < nsub; i++)
subs[i] = sub[i];
// delete[] on NULL is a no-op when no copy was made.
delete[] subcopy;
return re;
}
// Concatenation of sub[0..nsub); factoring is never applied.
Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = false;
  return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, can_factor);
}
// Alternation of sub[0..nsub), with factoring enabled.
Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = true;
  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, can_factor);
}
// Alternation of sub[0..nsub), with factoring disabled.
Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = false;
  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, can_factor);
}
// Creates a capturing group with index cap around sub.
Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
  Regexp* group = new Regexp(kRegexpCapture, flags);
  group->AllocSub(1);
  Regexp** children = group->sub();
  children[0] = sub;
  group->cap_ = cap;
  return group;
}
// Creates a counted repetition of sub with bounds min and max.
Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
  Regexp* rep = new Regexp(kRegexpRepeat, flags);
  rep->AllocSub(1);
  Regexp** children = rep->sub();
  children[0] = sub;
  rep->min_ = min;
  rep->max_ = max;
  return rep;
}
// Creates a kRegexpLiteral node matching the single rune given.
Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
  Regexp* lit = new Regexp(kRegexpLiteral, flags);
  lit->rune_ = rune;
  return lit;
}
// Creates a node matching runes[0..nrunes) literally. A non-positive count
// yields an empty match, one rune a plain literal, and anything longer a
// kRegexpLiteralString.
Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
  if (nrunes <= 0)
    return new Regexp(kRegexpEmptyMatch, flags);
  if (nrunes == 1)
    return NewLiteral(runes[0], flags);

  Regexp* str = new Regexp(kRegexpLiteralString, flags);
  for (Rune* p = runes; p != runes + nrunes; ++p)
    str->AddRuneToString(*p);
  return str;
}
// Creates a kRegexpCharClass node that stores the given CharClass pointer.
Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
  Regexp* node = new Regexp(kRegexpCharClass, flags);
  node->cc_ = cc;
  return node;
}
// Exchanges the complete contents of *this and *that in place.
void Regexp::Swap(Regexp* that) {
  // A raw byte swap is fine because Regexp is just a struct (no vtable).
  char scratch[sizeof(Regexp)];
  memmove(scratch, this, sizeof scratch);
  memmove(this, that, sizeof scratch);
  memmove(that, scratch, sizeof scratch);
}
// Tests equality of all top-level structure but not subregexps.
// Compares the operator and whatever per-operator data lives directly in the
// two nodes. Children are the caller's responsibility (see Regexp::Equal).
static bool TopEqual(Regexp* a, Regexp* b) {
  if (a->op() != b->op())
    return false;

  switch (a->op()) {
    // Operators with no per-node data beyond the opcode.
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpBeginText:
      return true;

    case kRegexpEndText:
      // The parse flags remember whether it's \z or (?-m:$),
      // which matters when testing against PCRE.
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;

    case kRegexpLiteral:
      return a->rune() == b->rune() &&
             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;

    case kRegexpLiteralString:
      return a->nrunes() == b->nrunes() &&
             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
             memcmp(a->runes(), b->runes(),
                    a->nrunes() * sizeof a->runes()[0]) == 0;

    case kRegexpAlternate:
    case kRegexpConcat:
      return a->nsub() == b->nsub();

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;

    case kRegexpRepeat:
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
             a->min() == b->min() &&
             a->max() == b->max();

    case kRegexpCapture: {
      if (a->cap() != b->cap())
        return false;
      // name() is a pointer that is NULL for unnamed groups. Compare the
      // strings themselves: two separately parsed regexps never share the
      // same name object, so comparing pointers would report equal names
      // as different.
      const string* aname = a->name();
      const string* bname = b->name();
      if (aname == NULL || bname == NULL)
        return aname == bname;
      return *aname == *bname;
    }

    case kRegexpHaveMatch:
      return a->match_id() == b->match_id();

    case kRegexpCharClass: {
      CharClass* acc = a->cc();
      CharClass* bcc = b->cc();
      // Same rune count, same number of ranges, and identical range data.
      return acc->size() == bcc->size() &&
             acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
             memcmp(acc->begin(), bcc->begin(),
                    (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
    }
  }

  LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
  return false;
}
// Returns whether a and b have identical structure, comparing node by node
// with TopEqual. Two NULL pointers are equal; NULL never equals non-NULL.
// The traversal is iterative, using an explicit vector as its stack.
bool Regexp::Equal(Regexp* a, Regexp* b) {
if (a == NULL || b == NULL)
return a == b;
if (!TopEqual(a, b))
return false;
// Fast path:
// return without allocating vector if there are no subregexps.
switch (a->op()) {
case kRegexpAlternate:
case kRegexpConcat:
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
case kRegexpCapture:
break;
default:
return true;
}
// Committed to doing real work.
// The stack (vector) has pairs of regexps waiting to
// be compared. The regexps are only equal if
// all the pairs end up being equal.
vector<Regexp*> stk;
for (;;) {
// Invariant: TopEqual(a, b) == true.
Regexp* a2;
Regexp* b2;
switch (a->op()) {
default:
break;
case kRegexpAlternate:
case kRegexpConcat:
// TopEqual already established a->nsub() == b->nsub(), so indexing b's
// children with a's count is in range.
for (int i = 0; i < a->nsub(); i++) {
a2 = a->sub()[i];
b2 = b->sub()[i];
if (!TopEqual(a2, b2))
return false;
stk.push_back(a2);
stk.push_back(b2);
}
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
case kRegexpCapture:
a2 = a->sub()[0];
b2 = b->sub()[0];
if (!TopEqual(a2, b2))
return false;
// Really:
// stk.push_back(a2);
// stk.push_back(b2);
// break;
// but faster to assign directly and loop.
a = a2;
b = b2;
continue;
}
// Pop the next pending pair, or finish when none remain.
int n = stk.size();
if (n == 0)
break;
a = stk[n-2];
b = stk[n-1];
stk.resize(n-2);
}
return true;
}
// Keep in sync with enum RegexpStatusCode in regexp.h
// Indexed directly by the status code in RegexpStatus::CodeText(), so the
// order of entries must match the order of the enumerators.
static const char *kErrorStrings[] = {
"no error",
"unexpected error",
"invalid escape sequence",
"invalid character class",
"invalid character class range",
"missing ]",
"missing )",
"trailing \\",
"no argument for repetition operator",
"invalid repetition size",
"bad repetition operator",
"invalid perl operator",
"invalid UTF-8",
"invalid named capture group",
};
// Returns the message for code. Values outside the table are reported with
// the kRegexpInternalError message.
string RegexpStatus::CodeText(enum RegexpStatusCode code) {
  const bool in_table = code >= 0 && code < arraysize(kErrorStrings);
  if (!in_table)
    code = kRegexpInternalError;
  return kErrorStrings[code];
}
// Returns the code's message, followed by ": " and the error argument when
// one is set.
string RegexpStatus::Text() const {
  string text = CodeText(code_);
  if (!error_arg_.empty()) {
    text.append(": ");
    text.append(error_arg_.data(), error_arg_.size());
  }
  return text;
}
// Copies the code and error argument from status; no other member is
// touched.
void RegexpStatus::Copy(const RegexpStatus& status) {
  error_arg_ = status.error_arg_;
  code_ = status.code_;
}
typedef int Ignored;  // Walker<void> doesn't exist

// Walker that counts the kRegexpCapture nodes it visits.
class NumCapturesWalker : public Regexp::Walker<Ignored> {
 public:
  NumCapturesWalker() : ncapture_(0) {}

  // Number of capture nodes seen so far.
  int ncapture() { return ncapture_; }

  virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    const bool is_capture = (re->op() == kRegexpCapture);
    if (is_capture)
      ++ncapture_;
    return ignored;
  }

  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    // Should never be called: we use Walk not WalkExponential.
    LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
    return ignored;
  }

 private:
  int ncapture_;
  DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
};
int Regexp::NumCaptures() {
NumCapturesWalker w;
w.Walk(this, 0);
return w.ncapture();
}
// Walker that collects a name -> index map for named capture groups.
// The map is created lazily, so it remains NULL if no named group is seen.
class NamedCapturesWalker : public Regexp::Walker<Ignored> {
 public:
  NamedCapturesWalker() : map_(NULL) {}
  ~NamedCapturesWalker() { delete map_; }

  // Hands the map (possibly NULL) to the caller, who now owns it.
  map<string, int>* TakeMap() {
    map<string, int>* released = map_;
    map_ = NULL;
    return released;
  }

  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    // Only named capture groups are of interest.
    if (re->op() != kRegexpCapture || re->name() == NULL)
      return ignored;

    // Allocate map once we find a name.
    if (map_ == NULL)
      map_ = new map<string, int>;

    // Record first occurrence of each name.
    // (The rule is that if you have the same name
    // multiple times, only the leftmost one counts.)
    const bool first_time = map_->find(*re->name()) == map_->end();
    if (first_time)
      (*map_)[*re->name()] = re->cap();
    return ignored;
  }

  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    // Should never be called: we use Walk not WalkExponential.
    LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
    return ignored;
  }

 private:
  map<string, int>* map_;
  DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
};
// Returns a newly built name -> index map for this regexp's named groups,
// or NULL when there are none. The caller owns the result.
map<string, int>* Regexp::NamedCaptures() {
  NamedCapturesWalker collector;
  collector.Walk(this, 0);
  return collector.TakeMap();
}
// Walker that collects an index -> name map for named capture groups.
// The map is created lazily, so it remains NULL if no named group is seen.
class CaptureNamesWalker : public Regexp::Walker<Ignored> {
 public:
  CaptureNamesWalker() : map_(NULL) {}
  ~CaptureNamesWalker() { delete map_; }

  // Hands the map (possibly NULL) to the caller, who now owns it.
  map<int, string>* TakeMap() {
    map<int, string>* released = map_;
    map_ = NULL;
    return released;
  }

  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    // Only named capture groups are of interest.
    if (re->op() != kRegexpCapture || re->name() == NULL)
      return ignored;

    // Allocate map once we find a name.
    if (map_ == NULL)
      map_ = new map<int, string>;
    (*map_)[re->cap()] = *re->name();
    return ignored;
  }

  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    // Should never be called: we use Walk not WalkExponential.
    LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
    return ignored;
  }

 private:
  map<int, string>* map_;
  DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
};
// Returns a newly built index -> name map for this regexp's named groups,
// or NULL when there are none. The caller owns the result.
map<int, string>* Regexp::CaptureNames() {
  CaptureNamesWalker collector;
  collector.Walk(this, 0);
  return collector.TakeMap();
}
// Determines whether regexp matches must be anchored
// with a fixed string prefix. If so, returns the prefix and
// the regexp that remains after the prefix. The prefix might
// be ASCII case-insensitive.
//
// Outputs are always reset first (prefix cleared, *foldcase false,
// *suffix NULL). On success returns true with *prefix holding the literal
// bytes, *foldcase reflecting the literal's FoldCase flag, and *suffix set
// to a regexp for the remainder. Returns false when this regexp is not a
// concatenation starting with at least one begin-text anchor followed by a
// literal or literal string.
bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
// No need for a walker: the regexp must be of the form
// 1. some number of ^ anchors
// 2. a literal char or string
// 3. the rest
prefix->clear();
*foldcase = false;
*suffix = NULL;
if (op_ != kRegexpConcat)
return false;
// Some number of anchors, then a literal or concatenation.
int i = 0;
Regexp** sub = this->sub();
while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
i++;
// Need at least one anchor, and something after the anchors.
if (i == 0 || i >= nsub_)
return false;
Regexp* re = sub[i];
switch (re->op_) {
default:
return false;
case kRegexpLiteralString:
// Convert to string in proper encoding.
if (re->parse_flags() & Latin1) {
// Latin-1: each rune is stored as a single char.
prefix->resize(re->nrunes_);
for (int j = 0; j < re->nrunes_; j++)
(*prefix)[j] = re->runes_[j];
} else {
// Convert to UTF-8 in place.
// Assume worst-case space and then trim.
prefix->resize(re->nrunes_ * UTFmax);
char *p = &(*prefix)[0];
for (int j = 0; j < re->nrunes_; j++) {
Rune r = re->runes_[j];
// Runes below Runeself are copied as one byte; others go through
// runetochar.
if (r < Runeself)
*p++ = r;
else
p += runetochar(p, &r);
}
prefix->resize(p - &(*prefix)[0]);
}
break;
case kRegexpLiteral:
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
prefix->append(1, re->rune_);
} else {
char buf[UTFmax];
prefix->append(buf, runetochar(buf, &re->rune_));
}
break;
}
*foldcase = (sub[i]->parse_flags() & FoldCase);
i++;
// The rest.
if (i < nsub_) {
// The new concatenation refers to the same sub-regexps, so take an extra
// reference on each one first.
for (int j = i; j < nsub_; j++)
sub[j]->Incref();
re = Concat(sub + i, nsub_ - i, parse_flags());
} else {
// Nothing follows the literal: the suffix is an empty match.
re = new Regexp(kRegexpEmptyMatch, parse_flags());
}
*suffix = re;
return true;
}
// Character class builder is a balanced binary tree (STL set)
// containing non-overlapping, non-abutting RuneRanges.
// The less-than operator used in the tree treats two
// ranges as equal if they overlap at all, so that
// lookups for a particular Rune are possible.
//
// A new builder is empty: zero runes and both ASCII letter bitmaps cleared.
CharClassBuilder::CharClassBuilder() {
  upper_ = 0;
  lower_ = 0;
  nrunes_ = 0;
}
// Add lo-hi to the class; return whether class got bigger.
// Returns false for an empty range (hi < lo) or when [lo, hi] already lies
// inside a single stored range. Otherwise merges the new range with any
// stored ranges it touches or overlaps, updates nrunes_, and returns true.
bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
if (hi < lo)
return false;
if (lo <= 'z' && hi >= 'A') {
// Overlaps some alpha, maybe not all.
// Update bitmaps telling which ASCII letters are in the set.
// Bit k of upper_/lower_ stands for the k-th letter of A-Z / a-z.
Rune lo1 = max<Rune>(lo, 'A');
Rune hi1 = min<Rune>(hi, 'Z');
if (lo1 <= hi1)
upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
lo1 = max<Rune>(lo, 'a');
hi1 = min<Rune>(hi, 'z');
if (lo1 <= hi1)
lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
}
{ // Check whether lo, hi is already in the class.
iterator it = ranges_.find(RuneRange(lo, lo));
if (it != end() && it->lo <= lo && hi <= it->hi)
return false;
}
// Look for a range abutting lo on the left.
// If it exists, take it out and increase our range.
if (lo > 0) {
iterator it = ranges_.find(RuneRange(lo-1, lo-1));
if (it != end()) {
lo = it->lo;
if (it->hi > hi)
hi = it->hi;
// The removed range is counted again when [lo, hi] is added below.
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
}
// Look for a range abutting hi on the right.
// If it exists, take it out and increase our range.
if (hi < Runemax) {
iterator it = ranges_.find(RuneRange(hi+1, hi+1));
if (it != end()) {
hi = it->hi;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
}
// Look for ranges between lo and hi. Take them out.
// This is only safe because the set has no overlapping ranges.
// We've already removed any ranges abutting lo and hi, so
// any that overlap [lo, hi] must be contained within it.
for (;;) {
iterator it = ranges_.find(RuneRange(lo, hi));
if (it == end())
break;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
// Finally, add [lo, hi].
nrunes_ += hi - lo + 1;
ranges_.insert(RuneRange(lo, hi));
return true;
}
// Adds every range held by cc to this builder.
void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
  iterator it = cc->begin();
  while (it != cc->end()) {
    AddRange(it->lo, it->hi);
    ++it;
  }
}
// Returns whether rune r falls inside one of the stored ranges.
bool CharClassBuilder::Contains(Rune r) {
  // Look up the one-rune range [r, r] in the set.
  iterator hit = ranges_.find(RuneRange(r, r));
  return hit != end();
}
// Does the character class behave the same on A-Z as on a-z?
// True when the upper-case and lower-case letter bitmaps agree within
// AlphaMask.
bool CharClassBuilder::FoldsASCII() {
  return (upper_ & AlphaMask) == (lower_ & AlphaMask);
}
// Returns a newly allocated builder with the same ranges, letter bitmaps
// and rune count as this one.
CharClassBuilder* CharClassBuilder::Copy() {
  CharClassBuilder* dup = new CharClassBuilder;
  iterator it = begin();
  while (it != end()) {
    dup->ranges_.insert(RuneRange(it->lo, it->hi));
    ++it;
  }
  dup->upper_ = upper_;
  dup->lower_ = lower_;
  dup->nrunes_ = nrunes_;
  return dup;
}
// Removes every rune greater than r from the class, trimming a range that
// straddles r so that it ends at r. No-op when r >= Runemax.
void CharClassBuilder::RemoveAbove(Rune r) {
if (r >= Runemax)
return;
// Clear the ASCII letter bitmap bits for letters above r.
if (r < 'z') {
if (r < 'a')
lower_ = 0;
else
lower_ &= AlphaMask >> ('z' - r);
}
if (r < 'Z') {
if (r < 'A')
upper_ = 0;
else
upper_ &= AlphaMask >> ('Z' - r);
}
// Repeatedly find a stored range matching [r+1, Runemax] under the set's
// comparator and remove it, until none is left.
for (;;) {
iterator it = ranges_.find(RuneRange(r + 1, Runemax));
if (it == end())
break;
RuneRange rr = *it;
ranges_.erase(it);
nrunes_ -= rr.hi - rr.lo + 1;
// The range began at or below r: put back the part up to r.
if (rr.lo <= r) {
rr.hi = r;
ranges_.insert(rr);
nrunes_ += rr.hi - rr.lo + 1;
}
}
}
void CharClassBuilder::Negate() {
// Build up negation and then copy in.
// Could edit ranges in place, but C++ won't let me.
vector<RuneRange> v;
v.reserve(ranges_.size() + 1);
// In negation, first range begins at 0, unless
// the current class begins at 0.
iterator it = begin();
if (it == end()) {
v.push_back(RuneRange(0, Runemax));
} else {
int nextlo = 0;
if (it->lo == 0) {
nextlo = it->hi + 1;
++it;
}
for (; it != end(); ++it) {
v.push_back(RuneRange(nextlo, it->lo - 1));
nextlo = it->hi + 1;
}
if (nextlo <= Runemax)
v.push_back(RuneRange(nextlo, Runemax));
}
ranges_.clear();
for (int i = 0; i < v.size(); i++)
ranges_.insert(v[i]);
upper_ = AlphaMask & ~upper_;
lower_ = AlphaMask & ~lower_;
nrunes_ = Runemax+1 - nrunes_;
}
// Character class is a sorted list of ranges.
// The ranges are allocated in the same block as the header,
// necessitating a special allocator and Delete method.
//
// Allocates one byte block big enough for the header plus maxranges ranges
// and returns it as an empty class.
CharClass* CharClass::New(int maxranges) {
  uint8* block = new uint8[sizeof(CharClass) + maxranges*sizeof(RuneRange)];
  CharClass* cc = reinterpret_cast<CharClass*>(block);
  // The range array starts immediately after the header.
  cc->ranges_ = reinterpret_cast<RuneRange*>(block + sizeof(CharClass));
  cc->nranges_ = 0;
  cc->folds_ascii_ = false;
  cc->nrunes_ = 0;
  return cc;
}
// Releases a CharClass obtained from CharClass::New(). The object occupies
// a uint8 array, so it is freed with delete[] on that array rather than with
// delete on the object.
void CharClass::Delete() {
// NOTE(review): this guard is meant to tolerate calls through a NULL
// pointer, but calling a member function on NULL is undefined behavior and
// compilers may drop the check. Callers should test for NULL themselves.
if (this == NULL)
return;
uint8 *data = reinterpret_cast<uint8*>(this);
delete[] data;
}
// Returns a newly allocated class holding the complement of this one over
// [0, Runemax]. folds_ascii_ is carried over unchanged.
CharClass* CharClass::Negate() {
  // The complement has at most one range more than the original.
  CharClass* neg = CharClass::New(nranges_+1);
  neg->folds_ascii_ = folds_ascii_;
  neg->nrunes_ = Runemax + 1 - nrunes_;

  int count = 0;
  int nextlo = 0;
  for (CharClass::iterator it = begin(); it != end(); ++it) {
    // Emit the gap before this range, unless the range starts right at
    // nextlo.
    if (it->lo != nextlo)
      neg->ranges_[count++] = RuneRange(nextlo, it->lo - 1);
    nextlo = it->hi + 1;
  }
  if (nextlo <= Runemax)
    neg->ranges_[count++] = RuneRange(nextlo, Runemax);
  neg->nranges_ = count;
  return neg;
}
// Returns whether rune r lies in one of the ranges, by binary search over
// the sorted range array.
bool CharClass::Contains(Rune r) {
  RuneRange* base = ranges_;
  for (int remaining = nranges_; remaining > 0; ) {
    const int mid = remaining / 2;
    const RuneRange& probe = base[mid];
    if (probe.hi < r) {
      // r is beyond this range: continue in the upper part.
      base += mid + 1;
      remaining -= mid + 1;
    } else if (r < probe.lo) {
      // r is before this range: continue in the lower part.
      remaining = mid;
    } else {
      // probe.lo <= r && r <= probe.hi
      return true;
    }
  }
  return false;
}
// Returns a newly allocated CharClass with this builder's ranges (in set
// iteration order), rune count and ASCII-folding flag.
CharClass* CharClassBuilder::GetCharClass() {
  CharClass* cc = CharClass::New(ranges_.size());
  int n = 0;
  iterator it = begin();
  while (it != end()) {
    cc->ranges_[n] = *it;
    ++n;
    ++it;
  }
  cc->nranges_ = n;
  DCHECK_LE(n, ranges_.size());
  cc->nrunes_ = nrunes_;
  cc->folds_ascii_ = FoldsASCII();
  return cc;
}
} // namespace re2

633
outside/re2/re2/regexp.h Normal file
View File

@ -0,0 +1,633 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// --- SPONSORED LINK --------------------------------------------------
// If you want to use this library for regular expression matching,
// you should use re2/re2.h, which provides a class RE2 that
// mimics the PCRE interface provided by PCRE's C++ wrappers.
// This header describes the low-level interface used to implement RE2
// and may change in backwards-incompatible ways from time to time.
// In contrast, RE2's interface will not.
// ---------------------------------------------------------------------
// Regular expression library: parsing, execution, and manipulation
// of regular expressions.
//
// Any operation that traverses the Regexp structures should be written
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
// regular expressions such as x++++++++++++++++++++... might cause recursive
// traversals to overflow the stack.
//
// It is the caller's responsibility to provide appropriate mutual exclusion
// around manipulation of the regexps. RE2 does this.
//
// PARSING
//
// Regexp::Parse parses regular expressions encoded in UTF-8.
// The default syntax is POSIX extended regular expressions,
// with the following changes:
//
// 1. Backreferences (optional in POSIX EREs) are not supported.
// (Supporting them precludes the use of DFA-based
// matching engines.)
//
// 2. Collating elements and collation classes are not supported.
// (No one has needed or wanted them.)
//
// The exact syntax accepted can be modified by passing flags to
// Regexp::Parse. In particular, many of the basic Perl additions
// are available. The flags are documented below (search for LikePerl).
//
// If parsed with the flag Regexp::Latin1, both the regular expression
// and the input to the matching routines are assumed to be encoded in
// Latin-1, not UTF-8.
//
// EXECUTION
//
// Once Regexp has parsed a regular expression, it provides methods
// to search text using that regular expression. These methods are
// implemented via calling out to other regular expression libraries.
// (Let's call them the sublibraries.)
//
// To call a sublibrary, Regexp does not simply prepare a
// string version of the regular expression and hand it to the
// sublibrary. Instead, Regexp prepares, from its own parsed form, the
// corresponding internal representation used by the sublibrary.
// This has the drawback of needing to know the internal representation
// used by the sublibrary, but it has two important benefits:
//
// 1. The syntax and meaning of regular expressions are guaranteed
// to be that used by Regexp's parser, not the syntax expected
// by the sublibrary. Regexp might accept a restricted or
// expanded syntax for regular expressions as compared with
// the sublibrary. As long as Regexp can translate from its
// internal form into the sublibrary's, clients need not know
// exactly which sublibrary they are using.
//
// 2. The sublibrary parsers are bypassed. For whatever reason,
// sublibrary regular expression parsers often have security
// problems. For example, plan9grep's regular expression parser
// has a buffer overflow in its handling of large character
// classes, and PCRE's parser has had buffer overflow problems
// in the past. Security-team requires sandboxing of sublibrary
// regular expression parsers. Avoiding the sublibrary parsers
// avoids the sandbox.
//
// The execution methods we use now are provided by the compiled form,
// Prog, described in prog.h
//
// MANIPULATION
//
// Unlike other regular expression libraries, Regexp makes its parsed
// form accessible to clients, so that client code can analyze the
// parsed regular expressions.
#ifndef RE2_REGEXP_H__
#define RE2_REGEXP_H__
#include "util/util.h"
#include "re2/stringpiece.h"
namespace re2 {
// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
// Operator tag of a Regexp node. Enumerators start at 1 and are consecutive,
// and kMaxRegexpOp aliases the last one.
enum RegexpOp {
// Matches no strings.
kRegexpNoMatch = 1,
// Matches empty string.
kRegexpEmptyMatch,
// Matches rune_.
kRegexpLiteral,
// Matches runes_.
kRegexpLiteralString,
// Matches concatenation of sub_[0..nsub-1].
kRegexpConcat,
// Matches union of sub_[0..nsub-1].
kRegexpAlternate,
// Matches sub_[0] zero or more times.
kRegexpStar,
// Matches sub_[0] one or more times.
kRegexpPlus,
// Matches sub_[0] zero or one times.
kRegexpQuest,
// Matches sub_[0] at least min_ times, at most max_ times.
// max_ == -1 means no upper limit.
kRegexpRepeat,
// Parenthesized (capturing) subexpression. Index is cap_.
// Optionally, capturing name is name_.
kRegexpCapture,
// Matches any character.
kRegexpAnyChar,
// Matches any byte [sic].
kRegexpAnyByte,
// Matches empty string at beginning of line.
kRegexpBeginLine,
// Matches empty string at end of line.
kRegexpEndLine,
// Matches word boundary "\b".
kRegexpWordBoundary,
// Matches not-a-word boundary "\B".
kRegexpNoWordBoundary,
// Matches empty string at beginning of text.
kRegexpBeginText,
// Matches empty string at end of text.
kRegexpEndText,
// Matches character class given by cc_.
kRegexpCharClass,
// Forces match of entire expression right now,
// with match ID match_id_ (used by RE2::Set).
kRegexpHaveMatch,
// Largest valid operator value; update when adding an operator.
kMaxRegexpOp = kRegexpHaveMatch,
};
// Status codes reported through RegexpStatus.
// The enumerator order must stay in sync with the string list in regexp.cc.
enum RegexpStatusCode {
  // No error.
  kRegexpSuccess = 0,

  // Unexpected error.
  kRegexpInternalError,

  // Parse errors.
  kRegexpBadEscape,          // bad escape sequence
  kRegexpBadCharClass,       // bad character class
  kRegexpBadCharRange,       // bad character class range
  kRegexpMissingBracket,     // missing closing ]
  kRegexpMissingParen,       // missing closing )
  kRegexpTrailingBackslash,  // backslash at end of regexp
  kRegexpRepeatArgument,     // repeat argument missing, e.g. "*"
  kRegexpRepeatSize,         // bad repetition argument
  kRegexpRepeatOp,           // bad repetition operator
  kRegexpBadPerlOp,          // bad perl operator
  kRegexpBadUTF8,            // invalid UTF-8 in regexp
  kRegexpBadNamedCapture,    // bad named capture
};
// Error status for certain operations.
class RegexpStatus {
public:
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
~RegexpStatus() { delete tmp_; }
void set_code(enum RegexpStatusCode code) { code_ = code; }
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
enum RegexpStatusCode code() const { return code_; }
const StringPiece& error_arg() const { return error_arg_; }
bool ok() const { return code() == kRegexpSuccess; }
// Copies state from status.
void Copy(const RegexpStatus& status);
// Returns text equivalent of code, e.g.:
// "Bad character class"
static string CodeText(enum RegexpStatusCode code);
// Returns text describing error, e.g.:
// "Bad character class: [z-a]"
string Text() const;
private:
enum RegexpStatusCode code_; // Kind of error
StringPiece error_arg_; // Piece of regexp containing syntax error.
string* tmp_; // Temporary storage, possibly where error_arg_ is.
DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus);
};
// Walker to implement Simplify.
class SimplifyWalker;
// Compiled form; see prog.h
class Prog;
// A range of runes delimited by lo and hi.
// NOTE(review): both ends appear to be inclusive (RuneRangeLess treats
// ranges that share an endpoint as overlapping) — confirm against users.
struct RuneRange {
  // Default range is [0, 0].
  RuneRange() : lo(0), hi(0) {}
  RuneRange(int low, int high) : lo(low), hi(high) {}

  Rune lo;
  Rune hi;
};
// Strict ordering on RuneRanges: a sorts before b only when a ends
// before b begins.  Ranges that overlap at all therefore compare as
// equivalent, which lets a lookup in an ordered set find the range
// covering a particular Rune.
struct RuneRangeLess {
  bool operator()(const RuneRange& a, const RuneRange& b) const {
    return b.lo > a.hi;
  }
};
class CharClassBuilder;
// Character class: an array of RuneRanges plus summary counters.
// The constructor and destructor are private and unimplemented; objects
// come from the private New(), which CharClassBuilder (a friend) can call.
// NOTE(review): presumably instances are released through Delete() — confirm.
class CharClass {
 public:
  void Delete();

  // Iteration over the stored ranges.
  typedef RuneRange* iterator;
  iterator begin() { return ranges_; }
  iterator end() { return ranges_ + nranges_; }

  // Number of runes in the class (not the number of ranges).
  int size() { return nrunes_; }
  bool empty() { return nrunes_ == 0; }
  bool full() { return nrunes_ == Runemax+1; }
  bool FoldsASCII() { return folds_ascii_; }

  bool Contains(Rune r);
  CharClass* Negate();

 private:
  CharClass();   // not implemented
  ~CharClass();  // not implemented
  static CharClass* New(int maxranges);

  friend class CharClassBuilder;

  bool folds_ascii_;
  int nrunes_;
  RuneRange *ranges_;
  int nranges_;

  DISALLOW_EVIL_CONSTRUCTORS(CharClass);
};
// Parsed form of a regular expression: a tree (or, after simplification,
// a dag) of Regexp nodes, each labelled with a RegexpOp.
// Nodes are reference counted; release them with Decref(), never with
// delete (the destructor is private). Regexps are logically immutable
// once created.
class Regexp {
public:
// Flags for parsing. Can be ORed together.
enum ParseFlags {
NoParseFlags = 0,
FoldCase = 1<<0, // Fold case during matching (case-insensitive).
Literal = 1<<1, // Treat s as literal string instead of a regexp.
ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
// and [[:space:]] to match newline.
DotNL = 1<<3, // Allow . to match newline.
MatchNL = ClassNL | DotNL,
OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
// end of text, not around embedded newlines.
// (Perl's default)
Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
PerlClasses = 1<<7, // Allow Perl character classes like \d.
PerlB = 1<<8, // Allow Perl's \b and \B.
PerlX = 1<<9, // Perl extensions:
// non-capturing parens - (?: )
// non-greedy operators - *? +? ?? {}?
// flag edits - (?i) (?-i) (?i: )
// i - FoldCase
// m - !OneLine
// s - DotNL
// U - NonGreedy
// line ends: \A \z
// \Q and \E to disable/enable metacharacters
// (?P<name>expr) for named captures
// \C to match any single byte
UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
// and \P{Han} for its negation.
NeverNL = 1<<11, // Never match NL, even if the regexp mentions
// it explicitly.
NeverCapture = 1<<12, // Parse all parens as non-capturing.
// As close to Perl as we can get.
LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
UnicodeGroups,
// Internal use only.
WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text
};
// Get. No set, Regexps are logically immutable once created.
// op_ and parse_flags_ are stored in narrow integer fields and cast
// back to their enum types here.
RegexpOp op() { return static_cast<RegexpOp>(op_); }
int nsub() { return nsub_; }
bool simple() { return simple_; }
enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
int Ref(); // For testing.
// Returns the array of nsub() subexpressions. With at most one
// subexpression the pointer is stored inline in subone_, so the address
// of that member is returned; otherwise the array submany_ is returned.
Regexp** sub() {
if(nsub_ <= 1)
return &subone_;
else
return submany_;
}
// Operator-specific accessors. Each one DCHECKs that op_ is the operator
// the field belongs to.
int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
// Increments reference count, returns object as convenience.
Regexp* Incref();
// Decrements reference count and deletes this object if count reaches 0.
void Decref();
// Parses string s to produce regular expression, returned.
// Caller must release return value with re->Decref().
// On failure, sets *status (if status != NULL) and returns NULL.
static Regexp* Parse(const StringPiece& s, ParseFlags flags,
RegexpStatus* status);
// Returns a _new_ simplified version of the current regexp.
// Does not edit the current regexp.
// Caller must release return value with re->Decref().
// Simplified means that counted repetition has been rewritten
// into simpler terms and all Perl/POSIX features have been
// removed. The result will capture exactly the same
// subexpressions the original did, unless formatted with ToString.
Regexp* Simplify();
friend class SimplifyWalker;
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *status (if status != NULL) on parse error.
static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
string* dst,
RegexpStatus* status);
// Returns the number of capturing groups in the regexp.
int NumCaptures();
friend class NumCapturesWalker;
// Returns a map from names to capturing group indices,
// or NULL if the regexp contains no named capture groups.
// The caller is responsible for deleting the map.
map<string, int>* NamedCaptures();
// Returns a map from capturing group indices to capturing group
// names or NULL if the regexp contains no named capture groups. The
// caller is responsible for deleting the map.
map<int, string>* CaptureNames();
// Returns a string representation of the current regexp,
// using as few parentheses as possible.
string ToString();
// Convenience functions. They consume the passed reference,
// so in many cases you should use, e.g., Plus(re->Incref(), flags).
// They do not consume allocated arrays like subs or runes.
static Regexp* Plus(Regexp* sub, ParseFlags flags);
static Regexp* Star(Regexp* sub, ParseFlags flags);
static Regexp* Quest(Regexp* sub, ParseFlags flags);
static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
static Regexp* NewLiteral(Rune rune, ParseFlags flags);
static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
static Regexp* HaveMatch(int match_id, ParseFlags flags);
// Like Alternate but does not factor out common prefixes.
static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
// Debugging function. Returns string format for regexp
// that makes structure clear. Does NOT use regexp syntax.
string Dump();
// Helper traversal class, defined fully in walker-inl.h.
template<typename T> class Walker;
// Compile to Prog. See prog.h
// Reverse prog expects to be run over text backward.
// Construction and execution of prog will
// stay within approximately max_mem bytes of memory.
// If max_mem <= 0, a reasonable default is used.
Prog* CompileToProg(int64 max_mem);
Prog* CompileToReverseProg(int64 max_mem);
// Whether to expect this library to find exactly the same answer as PCRE
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
// obscure cases behave differently. Technically this is more a property
// of the Prog than the Regexp, but the computation is much easier to do
// on the Regexp. See mimics_pcre.cc for the exact conditions.
bool MimicsPCRE();
// Benchmarking function.
void NullWalk();
// Whether every match of this regexp must be anchored and
// begin with a non-empty fixed string (perhaps after ASCII
// case-folding). If so, returns the prefix and the sub-regexp that
// follows it.
bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix);
private:
// Constructor allocates vectors as appropriate for operator.
explicit Regexp(RegexpOp op, ParseFlags parse_flags);
// Use Decref() instead of delete to release Regexps.
// This is private to catch deletes at compile time.
~Regexp();
void Destroy();
bool QuickDestroy();
// Helpers for Parse. Listed here so they can edit Regexps.
class ParseState;
friend class ParseState;
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
RegexpStatus* status);
// Helper for testing [sic].
friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
// Computes whether Regexp is already simple.
bool ComputeSimple();
// Constructor that generates a concatenation or alternation,
// enforcing the limit on the number of subexpressions for
// a particular Regexp.
static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
ParseFlags flags, bool can_factor);
// Returns the leading string that re starts with.
// The returned Rune* points into a piece of re,
// so it must not be used after the caller calls re->Decref().
static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
// Removes the first n leading runes from the beginning of re.
// Edits re in place.
static void RemoveLeadingString(Regexp* re, int n);
// Returns the leading regexp in re's top-level concatenation.
// The returned Regexp* points at re or a sub-expression of re,
// so it must not be used after the caller calls re->Decref().
static Regexp* LeadingRegexp(Regexp* re);
// Removes LeadingRegexp(re) from re and returns the remainder.
// Might edit re in place.
static Regexp* RemoveLeadingRegexp(Regexp* re);
// Simplifies an alternation of literal strings by factoring out
// common prefixes.
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
static int FactorAlternationRecursive(Regexp** sub, int nsub,
ParseFlags flags, int maxdepth);
// Is a == b? Only efficient on regexps that have not been through
// Simplify yet - the expansion of a kRegexpRepeat will make this
// take a long time. Do not call on such regexps, hence private.
static bool Equal(Regexp* a, Regexp* b);
// Allocate space for n sub-regexps.
// n must be non-negative and fit in a uint16; otherwise LOG(FATAL).
// An array is allocated only when n > 1: a single sub-regexp is stored
// inline in subone_ (see sub()).
void AllocSub(int n) {
if (n < 0 || static_cast<uint16>(n) != n)
LOG(FATAL) << "Cannot AllocSub " << n;
if (n > 1)
submany_ = new Regexp*[n];
nsub_ = n;
}
// Add Rune to LiteralString
void AddRuneToString(Rune r);
// Swaps this with that, in place.
void Swap(Regexp *that);
// Operator. See description of operators above.
// uint8 instead of RegexpOp to control space usage.
uint8 op_;
// Is this regexp structure already simple
// (has it been returned by Simplify)?
// uint8 instead of bool to control space usage.
uint8 simple_;
// Flags saved from parsing and used during execution.
// (Only FoldCase is used.)
// uint16 instead of ParseFlags to control space usage.
uint16 parse_flags_;
// Reference count. Exists so that SimplifyRegexp can build
// regexp structures that are dags rather than trees to avoid
// exponential blowup in space requirements.
// uint16 to control space usage.
// The standard regexp routines will never generate a
// ref greater than the maximum repeat count (100),
// but even so, Incref and Decref consult an overflow map
// when ref_ reaches kMaxRef.
uint16 ref_;
static const uint16 kMaxRef = 0xffff;
// Subexpressions.
// uint16 to control space usage.
// Concat and Alternate handle larger numbers of subexpressions
// by building concatenation or alternation trees.
// Other routines should call Concat or Alternate instead of
// filling in sub() by hand.
uint16 nsub_;
static const uint16 kMaxNsub = 0xffff;
// Storage for the subexpressions; which member is in use is decided
// by nsub_ (see sub() and AllocSub()).
union {
Regexp** submany_; // if nsub_ > 1
Regexp* subone_; // if nsub_ == 1
};
// Extra space for parse and teardown stacks.
Regexp* down_;
// Arguments to operator. See description of operators above.
// The members share storage; the accessors above DCHECK op_ before
// reading the member that belongs to a given operator.
union {
struct {  // Repeat
int max_;
int min_;
};
struct {  // Capture
int cap_;
string* name_;
};
struct {  // LiteralString
int nrunes_;
Rune* runes_;
};
struct {  // CharClass
// These two could be in separate union members,
// but it wouldn't save any space (there are other two-word structs)
// and keeping them separate avoids confusion during parsing.
CharClass* cc_;
CharClassBuilder* ccb_;
};
Rune rune_; // Literal
int match_id_; // HaveMatch
void *the_union_[2]; // as big as any other element, for memset
};
DISALLOW_EVIL_CONSTRUCTORS(Regexp);
};
// Character class set: contains non-overlapping, non-abutting RuneRanges.
typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
// Mutable builder for character classes, backed by a RuneRangeSet.
class CharClassBuilder {
 public:
  CharClassBuilder();

  // Iteration over the ranges collected so far.
  typedef RuneRangeSet::iterator iterator;
  iterator begin() { return ranges_.begin(); }
  iterator end() { return ranges_.end(); }

  // Number of runes in the class (not the number of ranges).
  int size() { return nrunes_; }
  bool empty() { return nrunes_ == 0; }
  bool full() { return nrunes_ == Runemax+1; }

  bool Contains(Rune r);
  bool FoldsASCII();
  bool AddRange(Rune lo, Rune hi);  // returns whether class changed
  CharClassBuilder* Copy();
  void AddCharClass(CharClassBuilder* cc);
  void Negate();
  void RemoveAbove(Rune r);
  CharClass* GetCharClass();
  void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);

 private:
  // Mask with the low 26 bits set, one bit per letter of the alphabet.
  static const uint32 AlphaMask = (1<<26) - 1;
  uint32 upper_;  // bitmap of A-Z
  uint32 lower_;  // bitmap of a-z
  int nrunes_;
  RuneRangeSet ranges_;

  DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
};
// Tell g++ that bitwise ops on ParseFlags produce ParseFlags.

// Bitwise OR of two flag sets.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int bits = static_cast<int>(a) | static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
// Bitwise XOR of two flag sets; the result is again a ParseFlags.
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int bits = static_cast<int>(a) ^ static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
// Bitwise AND of two flag sets; the result is again a ParseFlags.
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int bits = static_cast<int>(a) & static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
// Bitwise complement of a flag set; the result is again a ParseFlags.
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
  const int inverted = ~static_cast<int>(a);
  return static_cast<Regexp::ParseFlags>(inverted);
}
} // namespace re2
#endif // RE2_REGEXP_H__

113
outside/re2/re2/set.cc Normal file
View File

@ -0,0 +1,113 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/set.h"
#include "util/util.h"
#include "re2/stringpiece.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
using namespace re2;
// Creates an empty, not-yet-compiled set that will parse its patterns
// with a copy of `options` and use `anchor` when compiled.
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor)
    : anchor_(anchor),
      prog_(NULL),
      compiled_(false) {
  options_.Copy(options);
}
// Releases the references to any regexps still held in re_ and deletes
// the compiled program.  (Compile() empties re_, so after a Compile the
// loop has nothing to do.)
RE2::Set::~Set() {
  // Index with size_t: comparing an int against re_.size() mixes signed
  // and unsigned operands.
  for (size_t i = 0; i < re_.size(); i++)
    re_[i]->Decref();
  delete prog_;
}
int RE2::Set::Add(const StringPiece& pattern, string* error) {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Add after Compile";
return -1;
}
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
RegexpStatus status;
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
if (re == NULL) {
if (error != NULL)
*error = status.Text();
if (options_.log_errors())
LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
return -1;
}
// Concatenate with match index and push on vector.
int n = re_.size();
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
if (re->op() == kRegexpConcat) {
int nsub = re->nsub();
re2::Regexp** sub = new re2::Regexp*[nsub + 1];
for (int i = 0; i < nsub; i++)
sub[i] = re->sub()[i]->Incref();
sub[nsub] = m;
re->Decref();
re = re2::Regexp::Concat(sub, nsub + 1, pf);
delete[] sub;
} else {
re2::Regexp* sub[2];
sub[0] = re;
sub[1] = m;
re = re2::Regexp::Concat(sub, 2, pf);
}
re_.push_back(re);
return n;
}
bool RE2::Set::Compile() {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Compile multiple times";
return false;
}
compiled_ = true;
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]),
re_.size(), pf);
re_.clear();
re2::Regexp* sre = re->Simplify();
re->Decref();
re = sre;
if (re == NULL) {
if (options_.log_errors())
LOG(ERROR) << "Error simplifying during Compile.";
return false;
}
prog_ = Prog::CompileSet(options_, anchor_, re);
return prog_ != NULL;
}
// Runs the compiled set over text.  Returns true if at least one pattern
// matches and stores the indices of the matching patterns in *v.
// Returns false (after logging DFATAL) when called before Compile() or
// after a Compile() that failed.  *v is cleared on every call that gets
// past the "not compiled" check.
bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
  if (!compiled_) {
    LOG(DFATAL) << "RE2::Set::Match without Compile";
    return false;
  }
  v->clear();
  if (prog_ == NULL) {
    // Compile() sets compiled_ before it builds prog_, so a failed
    // Compile() leaves compiled_ == true with no program to run.
    // Without this check the call below would dereference NULL.
    LOG(DFATAL) << "RE2::Set::Match after failed Compile";
    return false;
  }
  bool failed = false;
  bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
                              Prog::kManyMatch, NULL, &failed, v);
  if (failed)
    LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
  if (ret == false)
    return false;
  if (v->size() == 0) {
    LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
    return false;
  }
  return true;
}

55
outside/re2/re2/set.h Normal file
View File

@ -0,0 +1,55 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_SET_H
#define RE2_SET_H
#include <utility>
#include <vector>
#include "re2/re2.h"
namespace re2 {
using std::vector;
// An RE2::Set represents a collection of regexps that can
// be searched for simultaneously.
class RE2::Set {
 public:
  Set(const RE2::Options& options, RE2::Anchor anchor);
  ~Set();

  // Adds the regexp `pattern` to the set, interpreted using the RE2 options.
  // (The RE2 constructor's default options parameter is RE2::UTF8.)
  // Returns the index that will identify the regexp in the result of
  // Match, or -1 if the regexp cannot be parsed.
  // Indices are assigned in sequential order starting from 0.
  // Error returns do not increment the index.
  // If an error occurs and error != NULL, *error will hold an error message.
  int Add(const StringPiece& pattern, string* error);

  // Prepares the Set for matching.
  // Add must not be called again after Compile.
  // Compile must be called before Match.
  // Compile may return false if it runs out of memory.
  bool Compile();

  // Returns true if text matches any of the regexps in the set.
  // If so, fills v with the indices of the matching regexps.
  bool Match(const StringPiece& text, vector<int>* v) const;

 private:
  RE2::Options options_;
  RE2::Anchor anchor_;
  vector<re2::Regexp*> re_;
  re2::Prog* prog_;
  bool compiled_;

  // Copy constructor and assignment are declared private by hand, in
  // place of DISALLOW_EVIL_CONSTRUCTORS(Set).
  Set(const Set&);
  void operator=(const Set&);
};
} // namespace re2
#endif // RE2_SET_H

393
outside/re2/re2/simplify.cc Normal file
View File

@ -0,0 +1,393 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Rewrite POSIX and other features in re
// to use simple extended regular expression features.
// Also sort and simplify character classes.
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *error (if error != NULL) on error.
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
string* dst,
RegexpStatus* status) {
Regexp* re = Parse(src, flags, status);
if (re == NULL)
return false;
Regexp* sre = re->Simplify();
re->Decref();
if (sre == NULL) {
// Should not happen, since Simplify never fails.
LOG(ERROR) << "Simplify failed on " << src;
if (status) {
status->set_code(kRegexpInternalError);
status->set_error_arg(src);
}
return false;
}
*dst = sre->ToString();
sre->Decref();
return true;
}
// Assuming the simple_ flags on the children are accurate,
// is this Regexp* simple?
bool Regexp::ComputeSimple() {
Regexp** subs;
switch (op_) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
return true;
case kRegexpConcat:
case kRegexpAlternate:
// These are simple as long as the subpieces are simple.
subs = sub();
for (int i = 0; i < nsub_; i++)
if (!subs[i]->simple_)
return false;
return true;
case kRegexpCharClass:
// Simple as long as the char class is not empty, not full.
if (ccb_ != NULL)
return !ccb_->empty() && !ccb_->full();
return !cc_->empty() && !cc_->full();
case kRegexpCapture:
subs = sub();
return subs[0]->simple_;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
subs = sub();
if (!subs[0]->simple_)
return false;
switch (subs[0]->op_) {
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpEmptyMatch:
case kRegexpNoMatch:
return false;
default:
break;
}
return true;
case kRegexpRepeat:
return false;
}
LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
return false;
}
// Walker subclass used by Simplify.
// The walk is purely post-recursive: PostVisit receives the already
// simplified children (child_args, each a simplified Regexp*) and builds
// the simplified result from them.
class SimplifyWalker : public Regexp::Walker<Regexp*> {
 public:
  SimplifyWalker() {}

  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
  virtual Regexp* PostVisit(Regexp* re,
                            Regexp* parent_arg,
                            Regexp* pre_arg,
                            Regexp** child_args, int nchild_args);
  virtual Regexp* Copy(Regexp* re);
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);

 private:
  // The helpers below live inside SimplifyWalker so that they can edit
  // the private fields of the Regexps they construct.

  // Builds the concatenation of re1 and re2, consuming the references
  // to both.  Caller must Decref the return value when done with it.
  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);

  // Rewrites re{min,max} in terms of *, +, and ?.
  // Returns a new regexp.  Neither edits re nor consumes a reference to it.
  // Caller must Decref the return value when done with it.
  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
                                Regexp::ParseFlags parse_flags);

  // Simplifies a character class by expanding any named classes
  // into rune ranges.  Neither edits re nor consumes a reference to it.
  // Caller must Decref the return value when done with it.
  static Regexp* SimplifyCharClass(Regexp* re);

  DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
};
// Returns a simplified form of this regexp.
// The result uses traditional Unix egrep features only, plus the Perl
// (?:) non-capturing parentheses; no other POSIX or Perl additions.
// It captures exactly the same subexpressions (with the same indices)
// as the original.
// Does not edit the current object: a regexp that is already simple is
// returned as a new reference to itself, anything else is rebuilt by a
// SimplifyWalker.
// Caller must Decref() the return value when done with it.
Regexp* Regexp::Simplify() {
  if (!simple_) {
    SimplifyWalker walker;
    return walker.Walk(this, NULL);
  }
  return Incref();
}
#define Simplify DontCallSimplify // Avoid accidental recursion
// Copying a walk result just takes one more reference to it.
Regexp* SimplifyWalker::Copy(Regexp* re) {
  Regexp* same = re->Incref();
  return same;
}
// Not expected to run: the simplifier uses Walk rather than
// WalkExponential.  If it is called anyway, logs (DFATAL) and hands back
// a new reference to re unchanged.
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
  Regexp* unchanged = re->Incref();
  return unchanged;
}
// Short-circuits the walk at nodes that are already simple: sets *stop
// and returns a new reference to re.  For any other node returns NULL and
// leaves *stop untouched.
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
  if (!re->simple_)
    return NULL;

  *stop = true;
  return re->Incref();
}
// Builds the simplified form of re from its already simplified children.
// child_args[i] is the simplified version of re's i-th subexpression.
// In the cases handled below, each child_args[i] reference is either
// stored in the node that is returned (or returned itself) or released
// with Decref(). When nothing changed, re is marked simple and a new
// reference to re itself is returned instead of allocating a copy.
Regexp* SimplifyWalker::PostVisit(Regexp* re,
Regexp* parent_arg,
Regexp* pre_arg,
Regexp** child_args,
int nchild_args) {
switch (re->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
// All these are always simple.
re->simple_ = true;
return re->Incref();
case kRegexpConcat:
case kRegexpAlternate: {
// These are simple as long as the subpieces are simple.
// Two passes to avoid allocation in the common case.
// First pass: did simplification replace any child?
bool changed = false;
Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
Regexp* sub = subs[i];
Regexp* newsub = child_args[i];
if (newsub != sub) {
changed = true;
break;
}
}
if (!changed) {
// Every child is unchanged: drop the extra references and reuse re.
for (int i = 0; i < re->nsub_; i++) {
Regexp* newsub = child_args[i];
newsub->Decref();
}
re->simple_ = true;
return re->Incref();
}
// Second pass: build a new node that takes over the child references.
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub_);
Regexp** nre_subs = nre->sub();
for (int i = 0; i <re->nsub_; i++)
nre_subs[i] = child_args[i];
nre->simple_ = true;
return nre;
}
case kRegexpCapture: {
Regexp* newsub = child_args[0];
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
// Only cap_ is copied to the new node; name_ is left as the
// constructor initialized it.
Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->cap_ = re->cap_;
nre->simple_ = true;
return nre;
}
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest: {
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
// These are simple as long as the subpiece is simple.
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
// These are also idempotent if flags are constant.
// (The child already is the same operator with the same flags, so
// the child's reference is returned as the result.)
if (re->op() == newsub->op() &&
re->parse_flags() == newsub->parse_flags())
return newsub;
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->simple_ = true;
return nre;
}
case kRegexpRepeat: {
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
// SimplifyRepeat does not consume the reference to newsub,
// so it is released here once the expansion has been built.
Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
re->parse_flags());
newsub->Decref();
nre->simple_ = true;
return nre;
}
case kRegexpCharClass: {
Regexp* nre = SimplifyCharClass(re);
nre->simple_ = true;
return nre;
}
}
// Not reached for the operators handled above; log and return re as is.
LOG(ERROR) << "Simplify case not handled: " << re->op();
return re->Incref();
}
// Builds a two-element concatenation node from re1 followed by re2.
// The references to re1 and re2 are consumed (stored in the new node);
// the reference to the new node is handed to the caller.
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
                                Regexp::ParseFlags parse_flags) {
  Regexp* pair = new Regexp(kRegexpConcat, parse_flags);
  pair->AllocSub(2);
  Regexp** slots = pair->sub();
  slots[0] = re1;
  slots[1] = re2;
  return pair;
}
// Rewrites the counted repetition re{min,max} in terms of *, +, and ?.
// max == -1 means there is no upper bound.
// Returns a new regexp.  Neither edits re nor consumes a reference to it.
// Caller must Decref the return value when done with it.
// The result will *not* necessarily have the right capturing parens
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
// but in the Regexp* representation, both (x) are marked as $1.
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
                                       Regexp::ParseFlags f) {
  // Unbounded repetition: x{n,} means at least n matches of x.
  if (max == -1) {
    switch (min) {
      case 0:
        // x{0,} is x*
        return Regexp::Star(re->Incref(), f);
      case 1:
        // x{1,} is x+
        return Regexp::Plus(re->Incref(), f);
      default:
        break;
    }

    // General case: x{4,} is xxxx+, i.e. min-1 plain copies
    // followed by one copy under +.
    Regexp* seq = new Regexp(kRegexpConcat, f);
    seq->AllocSub(min);
    VLOG(1) << "Simplify " << min;
    Regexp** parts = seq->sub();
    const int last = min - 1;
    for (int k = 0; k < last; k++)
      parts[k] = re->Incref();
    parts[last] = Regexp::Plus(re->Incref(), f);
    return seq;
  }

  // (x){0} matches only the empty string.
  if (min == 0 && max == 0)
    return new Regexp(kRegexpEmptyMatch, f);

  // x{1} is just x.
  if (min == 1 && max == 1)
    return re->Incref();

  // General case: x{n,m} means n copies of x and m copies of x?.
  // The machine will do less work if the optional copies are nested,
  // so that x{2,5} = xx(x(x(x)?)?)?

  // Mandatory prefix: min copies of x.
  Regexp* result = NULL;
  if (min > 0) {
    result = new Regexp(kRegexpConcat, f);
    result->AllocSub(min);
    Regexp** parts = result->sub();
    for (int k = 0; k < min; k++)
      parts[k] = re->Incref();
  }

  // Optional suffix: (x(x(x)?)?)?, built from the innermost ? outwards
  // and then attached to the prefix, if there is one.
  if (max > min) {
    Regexp* tail = Regexp::Quest(re->Incref(), f);
    for (int k = min+1; k < max; k++)
      tail = Regexp::Quest(Concat2(re->Incref(), tail, f), f);
    result = (result == NULL) ? tail : Concat2(result, tail, f);
  }

  if (result != NULL)
    return result;

  // Nothing was built: a degenerate case such as min < max < 0.
  // This shouldn't happen, because the parser rejects such regexps.
  // (Note that a malformed min > max with min > 0 does not end up here:
  // the prefix above was built, so min copies of x are returned.)
  LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
  return new Regexp(kRegexpNoMatch, f);
}
// Simplifies a character-class node.
// An empty class becomes a new no-match node and a full class a new
// any-character node, both carrying re's parse flags; any other class is
// returned as a new reference to re itself.
// Caller must Decref the return value when done with it.
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
  CharClass* klass = re->cc();

  if (klass->empty())
    return new Regexp(kRegexpNoMatch, re->parse_flags());

  if (klass->full())
    return new Regexp(kRegexpAnyChar, re->parse_flags());

  return re->Incref();
}
} // namespace re2

View File

@ -0,0 +1,182 @@
// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// A string-like object that points to a sized piece of memory.
//
// Functions or methods may use const StringPiece& parameters to accept either
// a "const char*" or a "string" value that will be implicitly converted to
// a StringPiece. The implicit conversion means that it is often appropriate
// to include this .h file in other files rather than forward-declaring
// StringPiece as would be appropriate for most other Google classes.
//
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
// conversions from "const char*" to "string" and back again.
//
//
// Arghh! I wish C++ literals were "string".
#ifndef STRINGS_STRINGPIECE_H__
#define STRINGS_STRINGPIECE_H__
#include <string.h>
#include <cstddef>
#include <iosfwd>
#include <string>
namespace re2 {
// A non-owning view of a sized run of bytes, stored as a (pointer, length)
// pair.  A StringPiece never copies or frees the bytes it refers to.
class StringPiece {
 private:
  const char* ptr_;  // first byte of the view; NULL after StringPiece()/clear()
  int length_;       // number of bytes in the view

 public:
  // We provide non-explicit singleton constructors so users can pass
  // in a "const char*" or a "string" wherever a "StringPiece" is
  // expected.
  StringPiece() : ptr_(NULL), length_(0) { }
  StringPiece(const char* str)
    : ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
  StringPiece(const std::string& str)
    : ptr_(str.data()), length_(static_cast<int>(str.size())) { }
  StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }

  // data() may return a pointer to a buffer with embedded NULs, and the
  // returned buffer may or may not be null terminated.  Therefore it is
  // typically a mistake to pass data() to a routine that expects a NUL
  // terminated string.
  const char* data() const { return ptr_; }
  int size() const { return length_; }
  int length() const { return length_; }
  bool empty() const { return length_ == 0; }

  void clear() { ptr_ = NULL; length_ = 0; }
  void set(const char* data, int len) { ptr_ = data; length_ = len; }
  // Points at a NUL-terminated string; a NULL str yields an empty piece.
  void set(const char* str) {
    ptr_ = str;
    if (str != NULL)
      length_ = static_cast<int>(strlen(str));
    else
      length_ = 0;
  }
  void set(const void* data, int len) {
    ptr_ = reinterpret_cast<const char*>(data);
    length_ = len;
  }

  // No bounds checking is performed.
  char operator[](int i) const { return ptr_[i]; }

  // Drops the first n bytes from the view.  No bounds checking is performed.
  void remove_prefix(int n) {
    ptr_ += n;
    length_ -= n;
  }

  // Drops the last n bytes from the view.  No bounds checking is performed.
  void remove_suffix(int n) {
    length_ -= n;
  }

  // Byte-wise three-way comparison.  Returns a negative value, zero, or a
  // positive value when *this sorts before, equal to, or after x.  When one
  // piece is a prefix of the other, the shorter one sorts first.
  int compare(const StringPiece& x) const {
    const int common = (length_ < x.length_) ? length_ : x.length_;
    // memcmp must not be given a NULL pointer, even for a zero length, and
    // an empty piece may well hold one -- so skip the call in that case.
    int r = (common > 0) ? memcmp(ptr_, x.ptr_, common) : 0;
    if (r == 0) {
      if (length_ < x.length_) r = -1;
      else if (length_ > x.length_) r = +1;
    }
    return r;
  }

  std::string as_string() const {
    return std::string(data(), size());
  }
  // We also define ToString() here, since many other string-like
  // interfaces name the routine that converts to a C++ string
  // "ToString", and it's confusing to have the method that does that
  // for a StringPiece be called "as_string()".  We also leave the
  // "as_string()" method defined here for existing code.
  std::string ToString() const {
    return std::string(data(), size());
  }

  void CopyToString(std::string* target) const;
  void AppendToString(std::string* target) const;

  // Does "this" start with "x"
  bool starts_with(const StringPiece& x) const {
    // An empty x is a prefix of anything; checking that first also keeps a
    // possibly-NULL pointer away from memcmp.
    return ((length_ >= x.length_) &&
            (x.empty() || memcmp(ptr_, x.ptr_, x.length_) == 0));
  }

  // Does "this" end with "x"
  bool ends_with(const StringPiece& x) const {
    // Same empty-x shortcut as starts_with().
    return ((length_ >= x.length_) &&
            (x.empty() ||
             memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
  }

  // standard STL container boilerplate
  typedef char value_type;
  typedef const char* pointer;
  typedef const char& reference;
  typedef const char& const_reference;
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;
  static const size_type npos;
  typedef const char* const_iterator;
  typedef const char* iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef std::reverse_iterator<iterator> reverse_iterator;
  iterator begin() const { return ptr_; }
  iterator end() const { return ptr_ + length_; }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(ptr_ + length_);
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(ptr_);
  }
  // STLS says return size_type, but Google says return int
  int max_size() const { return length_; }
  int capacity() const { return length_; }

  int copy(char* buf, size_type n, size_type pos = 0) const;

  int find(const StringPiece& s, size_type pos = 0) const;
  int find(char c, size_type pos = 0) const;
  int rfind(const StringPiece& s, size_type pos = npos) const;
  int rfind(char c, size_type pos = npos) const;

  StringPiece substr(size_type pos, size_type n = npos) const;

  static bool _equal(const StringPiece&, const StringPiece&);
};
// Equality is delegated to StringPiece::_equal.
inline bool operator==(const StringPiece& x, const StringPiece& y) {
  return StringPiece::_equal(x, y);
}

inline bool operator!=(const StringPiece& x, const StringPiece& y) {
  return !(x == y);
}

// Byte-wise lexicographic ordering; when one piece is a prefix of the
// other, the shorter one sorts first.
inline bool operator<(const StringPiece& x, const StringPiece& y) {
  const int common = (x.size() < y.size()) ? x.size() : y.size();
  // memcmp must not be given a NULL pointer, even for a zero length, and an
  // empty piece may hold one -- so only call it when there are bytes to compare.
  const int r = (common > 0) ? memcmp(x.data(), y.data(), common) : 0;
  return ((r < 0) || ((r == 0) && (x.size() < y.size())));
}

// The remaining relational operators are all derived from operator<.
inline bool operator>(const StringPiece& x, const StringPiece& y) {
  return y < x;
}

inline bool operator<=(const StringPiece& x, const StringPiece& y) {
  return !(x > y);
}

inline bool operator>=(const StringPiece& x, const StringPiece& y) {
  return !(x < y);
}
} // namespace re2
// allow StringPiece to be logged
extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
#endif // STRINGS_STRINGPIECE_H__

View File

@ -0,0 +1,254 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
//
// Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
// except that it remembers where it has been, trading a lot of
// memory for a lot of time. It exists only for testing purposes.
//
// Let me repeat that.
//
// THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
// - It uses a ton of memory.
// - It uses a ton of stack.
// - It uses CHECK and LOG(FATAL).
// - It implements unanchored search by repeated anchored search.
//
// On the other hand, it is very simple and a good reference
// implementation for the more complicated regexp packages.
//
// In BUILD, this file is linked into the ":testing" library,
// not the main library, in order to make it harder to pick up
// accidentally.
#include "util/util.h"
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
// Backtracker holds the state for a backtracking search.
//
// Excluding the search parameters, the main search state
// is just the "capture registers", which record, for the
// current execution, the string position at which each
// parenthesis was passed. cap_[0] and cap_[1] are the
// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
//
// To avoid infinite loops during backtracking on expressions
// like (a*)*, the visited_[] bitmap marks the (state, string-position)
// pairs that have already been explored and are thus not worth
// re-exploring if we get there via another path. Modern backtracking
// libraries engineer their program representation differently, to make
// such infinite loops possible to avoid without keeping a giant visited_
// bitmap, but visited_ works fine for a reference implementation
// and it has the nice benefit of making the search run in linear time.
class Backtracker {
 public:
  explicit Backtracker(Prog* prog);
  ~Backtracker();

  // Searches text (a piece of context) for a match of the program.
  // anchored requests a match starting at text.begin(); longest requests
  // the longest match rather than the first one found.  Up to nsubmatch
  // entries of submatch are filled in on success.
  // Returns true if a match was found.
  bool Search(const StringPiece& text, const StringPiece& context,
              bool anchored, bool longest,
              StringPiece* submatch, int nsubmatch);

 private:
  // Not copyable: the destructor delete[]s visited_, so a copied object
  // would free the same bitmap twice.  Declared but intentionally never
  // defined.
  Backtracker(const Backtracker&);
  void operator=(const Backtracker&);

  // Explores from instruction id at string position p looking for a match.
  // Returns true if found (so that caller can stop trying other possibilities).
  bool Visit(int id, const char* p);

  // Search parameters
  Prog* prog_;              // program being run
  StringPiece text_;        // text being searched
  StringPiece context_;     // greater context of text being searched
  bool anchored_;           // whether search is anchored at text.begin()
  bool longest_;            // whether search wants leftmost-longest match
  bool endmatch_;           // whether search must end at text.end()
  StringPiece *submatch_;   // submatches to fill in
  int nsubmatch_;           // # of submatches to fill in

  // Search state
  const char* cap_[64];     // capture registers
  uint32 *visited_;         // bitmap: (Inst*, char*) pairs already backtracked
  int nvisited_;            // # of words in bitmap
};
// Binds the backtracker to prog and puts every search parameter into its
// idle state.  No bitmap is allocated yet, and cap_ is not initialized here.
Backtracker::Backtracker(Prog* prog)
  : prog_(prog),
    anchored_(false), longest_(false), endmatch_(false),
    submatch_(NULL), nsubmatch_(0),
    visited_(NULL), nvisited_(0) {
}
// Frees the visited-state bitmap, if one was ever allocated.
Backtracker::~Backtracker() {
  delete[] visited_;  // harmless when visited_ is still NULL
}
// Runs a backtracking search of text, which lies within context.
// anchored forces the match to begin at text.begin(); longest asks for the
// longest match instead of the first one found.  On success, up to
// nsubmatch entries of submatch are filled in.
// Returns true if a match was found.
bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
                         bool anchored, bool longest,
                         StringPiece* submatch, int nsubmatch) {
  text_ = text;
  context_ = context;
  if (context_.begin() == NULL)
    context_ = text;

  // A program anchored at the start (end) can only match if text actually
  // begins (ends) where the context does.
  if (prog_->anchor_start() && text.begin() > context_.begin())
    return false;
  if (prog_->anchor_end() && text.end() < context_.end())
    return false;

  anchored_ = anchored | prog_->anchor_start();
  longest_ = longest | prog_->anchor_end();
  endmatch_ = prog_->anchor_end();
  submatch_ = submatch;
  nsubmatch_ = nsubmatch;
  CHECK(2*nsubmatch_ < arraysize(cap_));
  memset(cap_, 0, sizeof cap_);

  // We use submatch_[0] for our own bookkeeping,
  // so it had better exist.
  StringPiece sp0;
  if (nsubmatch < 1) {
    submatch_ = &sp0;
    nsubmatch_ = 1;
  }
  submatch_[0] = NULL;

  // Allocate new visited_ bitmap -- size is proportional
  // to text, so have to reallocate on each call to Search.
  delete[] visited_;
  visited_ = NULL;  // never leave a dangling pointer for the destructor
  nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
  visited_ = new uint32[nvisited_];
  memset(visited_, 0, nvisited_*sizeof visited_[0]);

  // Anchored search must start at text.begin().
  if (anchored_) {
    cap_[0] = text.begin();
    return Visit(prog_->start(), text.begin());
  }

  // Unanchored search, starting from each possible text position.
  // Notice that we have to try the empty string at the end of
  // the text, so the loop condition is p <= text.end(), not p < text.end().
  for (const char* p = text.begin(); p <= text.end(); p++) {
    cap_[0] = p;
    if (Visit(prog_->start(), p))  // Match must be leftmost; done.
      return true;
    // An empty text with no buffer has begin() == end() == NULL.  That one
    // position has now been tried; stop here rather than doing pointer
    // arithmetic on NULL in the loop increment.
    if (p == NULL)
      break;
  }
  return false;
}
// Explores from instruction id at string position p looking for a match.
// Return true if found (so that caller can stop trying other possibilities).
bool Backtracker::Visit(int id, const char* p) {
  // Check bitmap.  If we've already explored from here,
  // either it didn't match or it did but we're hoping for a better match.
  // Either way, don't go down that road again.
  CHECK(p <= text_.end());
  int n = id*(text_.size()+1) + (p - text_.begin());
  CHECK_LT(n/32, nvisited_);
  // Build the mask from an unsigned 1 so that selecting bit 31 never
  // shifts into the sign bit of a signed int.
  const unsigned int bit = 1U << (n&31);
  if (visited_[n/32] & bit)
    return false;
  visited_[n/32] |= bit;

  // Pick out byte at current position.  If at end of string,
  // have to explore in hope of finishing a match.  Use impossible byte -1.
  int c = -1;
  if (p < text_.end())
    c = *p & 0xFF;

  Prog::Inst* ip = prog_->inst(id);
  switch (ip->opcode()) {
    default:
      LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
      return false;  // not reached

    case kInstAlt:
    case kInstAltMatch:
      // Try both possible next states: out is preferred to out1.
      if (Visit(ip->out(), p)) {
        // When looking for the longest match, the second branch is still
        // explored because it might record a longer match.
        if (longest_)
          Visit(ip->out1(), p);
        return true;
      }
      return Visit(ip->out1(), p);

    case kInstByteRange:
      if (ip->Matches(c))
        return Visit(ip->out(), p+1);
      return false;

    case kInstCapture:
      if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) {
        // Capture p to register, but save old value.
        const char* q = cap_[ip->cap()];
        cap_[ip->cap()] = p;
        bool ret = Visit(ip->out(), p);
        // Restore old value as we backtrack.
        cap_[ip->cap()] = q;
        return ret;
      }
      // Register index out of range: continue without recording it.
      return Visit(ip->out(), p);

    case kInstEmptyWidth:
      // Fail if the instruction requires an empty-width flag that does not
      // hold at p.
      if (ip->empty() & ~Prog::EmptyFlags(context_, p))
        return false;
      return Visit(ip->out(), p);

    case kInstNop:
      return Visit(ip->out(), p);

    case kInstMatch:
      // We found a match.  If it's the best so far, record the
      // parameters in the caller's submatch_ array.
      if (endmatch_ && p != context_.end())
        return false;
      cap_[1] = p;
      if (submatch_[0].data() == NULL ||           // First match so far ...
          (longest_ && p > submatch_[0].end())) {  // ... or better match
        for (int i = 0; i < nsubmatch_; i++)
          submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
      }
      return true;

    case kInstFail:
      return false;
  }
}
// Runs a backtracking search.
bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
const StringPiece& context,
Anchor anchor,
MatchKind kind,
StringPiece* match,
int nmatch) {
// If full match, we ask for an anchored longest match
// and then check that match[0] == text.
// So make sure match[0] exists.
StringPiece sp0;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch < 1) {
match = &sp0;
nmatch = 1;
}
}
// Run the search.
Backtracker b(this);
bool anchored = anchor == kAnchored;
bool longest = kind != kFirstMatch;
if (!b.Search(text, context, anchored, longest, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
} // namespace re2

View File

@ -0,0 +1,223 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test character class manipulations.
#include "util/test.h"
#include "re2/regexp.h"
namespace re2 {
// One character-class test case: the ranges to add, an optional
// RemoveAbove cutoff, and the ranges expected afterwards.
struct CCTest {
  // An inclusive rune range.  A negative lo ends a list of ranges.
  struct Range {
    Rune lo;
    Rune hi;
  };

  Range add[10];    // ranges fed to AddRange, in this order
  int remove;       // if >= 0, argument passed to RemoveAbove
  Range final[10];  // ranges the resulting class must contain
};
// Table of character-class test cases.  Each entry is
// { ranges to add, remove cutoff (-1 for none), expected final ranges };
// both range lists end with a {-1} entry.
static CCTest tests[] = {
{ { { 10, 20 }, {-1} }, -1,
{ { 10, 20 }, {-1} } },
{ { { 10, 20 }, { 20, 30 }, {-1} }, -1,
{ { 10, 30 }, {-1} } },
{ { { 10, 20 }, { 30, 40 }, { 20, 30 }, {-1} }, -1,
{ { 10, 40 }, {-1} } },
{ { { 0, 50 }, { 20, 30 }, {-1} }, -1,
{ { 0, 50 }, {-1} } },
{ { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }, -1,
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
// NOTE(review): this entry is identical to the previous one -- possibly an
// accidental duplicate; confirm before removing.
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 5, 25 }, {-1} }, -1,
{ { 5, 25 }, {-1} } },
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 12, 21 }, {-1} }, -1,
{ { 10, 23 }, {-1} } },
// These check boundary cases during negation.
{ { { 0, Runemax }, {-1} }, -1,
{ { 0, Runemax }, {-1} } },
{ { { 0, 50 }, {-1} }, -1,
{ { 0, 50 }, {-1} } },
{ { { 50, Runemax }, {-1} }, -1,
{ { 50, Runemax }, {-1} } },
// Check RemoveAbove.
{ { { 50, Runemax }, {-1} }, 255,
{ { 50, 255 }, {-1} } },
{ { { 50, Runemax }, {-1} }, 65535,
{ { 50, 65535 }, {-1} } },
{ { { 50, Runemax }, {-1} }, Runemax,
{ { 50, Runemax }, {-1} } },
{ { { 50, 60 }, { 250, 260 }, { 350, 360 }, {-1} }, 255,
{ { 50, 60 }, { 250, 255 }, {-1} } },
{ { { 50, 60 }, {-1} }, 255,
{ { 50, 60 }, {-1} } },
{ { { 350, 360 }, {-1} }, 255,
{ {-1} } },
{ { {-1} }, 255,
{ {-1} } },
};
// Prints a failure report for cc.  With a test case t, the report shows the
// ranges that were added, the RemoveAbove cutoff (if any) and the expected
// ranges, followed by the ranges actually present.  With t == NULL only a
// short "desc:" label precedes the actual ranges.
template<class CharClass>
static void Broke(const char *desc, const CCTest* t, CharClass* cc) {
  if (t != NULL) {
    printf("\n");
    printf("CharClass added: [%s]", desc);
    for (int i = 0; t->add[i].lo >= 0; i++)
      printf(" %d-%d", t->add[i].lo, t->add[i].hi);
    printf("\n");

    if (t->remove >= 0)
      printf("Removed > %d\n", t->remove);

    printf("\twant:");
    for (int i = 0; t->final[i].lo >= 0; i++)
      printf(" %d-%d", t->final[i].lo, t->final[i].hi);
    printf("\n");

    printf("\thave:");
  } else {
    printf("\t%s:", desc);
  }

  // Ranges actually held by the class under test.
  typename CharClass::iterator range = cc->begin();
  while (range != cc->end()) {
    printf(" %d-%d", range->lo, range->hi);
    ++range;
  }
  printf("\n");
}
// Reports whether x lies inside any of the expected final ranges of t.
bool ShouldContain(CCTest *t, int x) {
  int idx = 0;
  while (t->final[idx].lo >= 0) {
    const bool inside = t->final[idx].lo <= x && x <= t->final[idx].hi;
    if (inside)
      return true;
    idx++;
  }
  return false;
}
// Overloaded adapters that let the templated CorrectCC treat CharClass and
// CharClassBuilder uniformly: Negate returns a new negated object, and
// Delete disposes of it.

// CharClass: Negate() already returns the negated class.
CharClass* Negate(CharClass *cc) {
  CharClass* negated = cc->Negate();
  return negated;
}

// CharClass objects are released through their own Delete().
void Delete(CharClass* cc) {
  cc->Delete();
}

// CharClassBuilder: negate a copy so that the argument is left untouched.
CharClassBuilder* Negate(CharClassBuilder* cc) {
  CharClassBuilder* duplicate = cc->Copy();
  duplicate->Negate();
  return duplicate;
}

// CharClassBuilder objects are released with plain delete.
void Delete(CharClassBuilder* cc) {
  delete cc;
}
// Verifies that cc holds exactly the final ranges of test case t: same
// ranges in the same order, matching size(), matching Contains() on the
// probe runes 0..99 and Runemax, and a negation that disagrees on every
// probe and has the complementary size.
// Prints a report (labelled desc) and returns false on the first mismatch.
template<class CC>
bool CorrectCC(CC *cc, CCTest *t, const char *desc) {
  // Walk the expected ranges and the actual ranges in lock step.
  int size = 0;
  typename CC::iterator it = cc->begin();
  int want = 0;
  while (t->final[want].lo >= 0) {
    const bool same_range = it != cc->end() &&
                            it->lo == t->final[want].lo &&
                            it->hi == t->final[want].hi;
    if (!same_range) {
      Broke(desc, t, cc);
      return false;
    }
    size += it->hi - it->lo + 1;
    want++;
    ++it;
  }

  // The class must not hold ranges beyond the expected ones.
  if (it != cc->end()) {
    Broke(desc, t, cc);
    return false;
  }

  if (cc->size() != size) {
    Broke(desc, t, cc);
    printf("wrong size: want %d have %d\n", size, cc->size());
    return false;
  }

  // Membership probes: 0..99, then Runemax.
  for (int k = 0; k <= 100; k++) {
    const int r = (k == 100) ? Runemax : k;
    if (ShouldContain(t, r) != cc->Contains(r)) {
      Broke(desc, t, cc);
      printf("want contains(%d)=%d, got %d\n",
             r, ShouldContain(t, r), cc->Contains(r));
      return false;
    }
  }

  // The negated class must disagree on every probe.
  CC* ncc = Negate(cc);
  for (int k = 0; k <= 100; k++) {
    const int r = (k == 100) ? Runemax : k;
    if (ShouldContain(t, r) == ncc->Contains(r)) {
      Broke(desc, t, cc);
      Broke("ncc", NULL, ncc);
      printf("want ncc contains(%d)!=%d, got %d\n",
             r, ShouldContain(t, r), ncc->Contains(r));
      Delete(ncc);
      return false;
    }
    if (ncc->size() != Runemax+1 - cc->size()) {
      Broke(desc, t, cc);
      Broke("ncc", NULL, ncc);
      printf("ncc size should be %d is %d\n",
             Runemax+1 - cc->size(), ncc->size());
      Delete(ncc);
      return false;
    }
  }

  Delete(ncc);
  return true;
}
// Runs every entry of tests[] through a CharClassBuilder and checks the
// builder, a CharClass obtained from it, and a Copy() of the builder.
TEST(TestCharClassBuilder, Adds) {
  int nfail = 0;
  for (int i = 0; i < arraysize(tests); i++) {
    CCTest* const t = &tests[i];

    // Build the class described by the test case.
    CharClassBuilder ccb;
    for (int k = 0; t->add[k].lo >= 0; k++)
      ccb.AddRange(t->add[k].lo, t->add[k].hi);
    if (t->remove >= 0)
      ccb.RemoveAbove(t->remove);

    nfail += CorrectCC(&ccb, t, "before copy (CharClassBuilder)") ? 0 : 1;

    CharClass* cc = ccb.GetCharClass();
    nfail += CorrectCC(cc, t, "before copy (CharClass)") ? 0 : 1;
    cc->Delete();

    CharClassBuilder *ccb1 = ccb.Copy();
    nfail += CorrectCC(ccb1, t, "after copy (CharClassBuilder)") ? 0 : 1;

    // NOTE(review): this re-checks the original builder, not ccb1 --
    // presumably to confirm that Copy() left it intact; confirm intent.
    cc = ccb.GetCharClass();
    nfail += CorrectCC(cc, t, "after copy (CharClass)") ? 0 : 1;
    cc->Delete();

    delete ccb1;
  }
  EXPECT_EQ(nfail, 0);
}
} // namespace re2

View File

@ -0,0 +1,171 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test prog.cc, compile.cc
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/regexp.h"
#include "re2/prog.h"
DEFINE_string(show, "", "regular expression to compile and dump");
namespace re2 {
// Simple input/output tests checking that
// the regexp compiles to the expected code.
// These are just to sanity check the basic implementation.
// The real confidence tests happen by testing the NFA/DFA
// that run the compiled code.
struct Test {
  const char* regexp;  // regular expression source
  const char* code;    // expected program dump for that regexp
};

static Test tests[] = {
  // Literals and concatenation
  { "a",
    "1. byte [61-61] -> 2\n"
    "2. match! 0\n" },
  { "ab",
    "1. byte [61-61] -> 2\n"
    "2. byte [62-62] -> 3\n"
    "3. match! 0\n" },

  // Alternation and character classes
  { "a|c",
    "3. alt -> 1 | 2\n"
    "1. byte [61-61] -> 4\n"
    "2. byte [63-63] -> 4\n"
    "4. match! 0\n" },
  { "a|b",
    "1. byte [61-62] -> 2\n"
    "2. match! 0\n" },
  { "[ab]",
    "1. byte [61-62] -> 2\n"
    "2. match! 0\n" },

  // Repetition operators
  { "a+",
    "1. byte [61-61] -> 2\n"
    "2. alt -> 1 | 3\n"
    "3. match! 0\n" },
  { "a+?",
    "1. byte [61-61] -> 2\n"
    "2. alt -> 3 | 1\n"
    "3. match! 0\n" },
  { "a*",
    "2. alt -> 1 | 3\n"
    "1. byte [61-61] -> 2\n"
    "3. match! 0\n" },
  { "a*?",
    "2. alt -> 3 | 1\n"
    "3. match! 0\n"
    "1. byte [61-61] -> 2\n" },
  { "a?",
    "2. alt -> 1 | 3\n"
    "1. byte [61-61] -> 3\n"
    "3. match! 0\n" },
  { "a??",
    "2. alt -> 3 | 1\n"
    "3. match! 0\n"
    "1. byte [61-61] -> 3\n" },
  { "a{4}",
    "1. byte [61-61] -> 2\n"
    "2. byte [61-61] -> 3\n"
    "3. byte [61-61] -> 4\n"
    "4. byte [61-61] -> 5\n"
    "5. match! 0\n" },

  // Groups
  { "(a)",
    "2. capture 2 -> 1\n"
    "1. byte [61-61] -> 3\n"
    "3. capture 3 -> 4\n"
    "4. match! 0\n" },
  { "(?:a)",
    "1. byte [61-61] -> 2\n"
    "2. match! 0\n" },

  // Empty regexp, dot, negated class, case folding
  { "",
    "2. match! 0\n" },
  { ".",
    "3. alt -> 1 | 2\n"
    "1. byte [00-09] -> 4\n"
    "2. byte [0b-ff] -> 4\n"
    "4. match! 0\n" },
  { "[^ab]",
    "5. alt -> 3 | 4\n"
    "3. alt -> 1 | 2\n"
    "4. byte [63-ff] -> 6\n"
    "1. byte [00-09] -> 6\n"
    "2. byte [0b-60] -> 6\n"
    "6. match! 0\n" },
  { "[Aa]",
    "1. byte/i [61-61] -> 2\n"
    "2. match! 0\n" },
};
// Compiles every regexp in tests[] and compares the program dump with the
// expected listing, counting parse failures, compile failures and
// mismatches.
TEST(TestRegexpCompileToProg, Simple) {
  int failed = 0;
  for (int i = 0; i < arraysize(tests); i++) {
    const re2::Test& t = tests[i];

    Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
    if (re == NULL) {
      LOG(ERROR) << "Cannot parse: " << t.regexp;
      failed++;
      continue;
    }

    Prog* prog = re->CompileToProg(0);
    if (prog == NULL) {
      LOG(ERROR) << "Cannot compile: " << t.regexp;
      re->Decref();
      failed++;
      continue;
    }

    // Compiling with an argument of 1 is expected to fail.
    CHECK(re->CompileToProg(1) == NULL);

    const string dump = prog->Dump();
    const bool as_expected = !(dump != t.code);
    if (!as_expected) {
      LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
      LOG(ERROR) << "Want:\n" << t.code;
      LOG(ERROR) << "Got:\n" << dump;
      failed++;
    }

    delete prog;
    re->Decref();
  }
  EXPECT_EQ(failed, 0);
}
// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
// Once, erroneously split between 0x3f and 0x40 because it is
// a 6-bit boundary.
// The ranges are listed in ascending order and together cover every byte
// value 0x00-0xFF exactly once, so that no byte escapes the bytemap check.
static struct UTF8ByteRange {
  int lo;  // first byte of the range (inclusive)
  int hi;  // last byte of the range (inclusive)
} utf8ranges[] = {
  { 0x00, 0x09 },
  { 0x0A, 0x0A },
  { 0x0B, 0x7F },  // was 0x10, which left 0x0B-0x0F unchecked
  { 0x80, 0x8F },
  { 0x90, 0x9F },
  { 0xA0, 0xBF },
  { 0xC0, 0xC1 },
  { 0xC2, 0xDF },
  { 0xE0, 0xE0 },
  { 0xE1, 0xEF },
  { 0xF0, 0xF0 },
  { 0xF1, 0xF3 },
  { 0xF4, 0xF4 },
  { 0xF5, 0xFF },
};
// Compiles "." and checks that the program's bytemap assigns every byte
// listed in utf8ranges[] to the class numbered by its range's index.
TEST(TestCompile, ByteRanges) {
  Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL);
  EXPECT_TRUE(re != NULL);
  Prog* prog = re->CompileToProg(0);
  EXPECT_TRUE(prog != NULL);

  EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges));
  for (int i = 0; i < arraysize(utf8ranges); i++) {
    int j = utf8ranges[i].lo;
    while (j <= utf8ranges[i].hi) {
      EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
      j++;
    }
  }

  delete prog;
  re->Decref();
}
} // namespace re2

View File

@ -0,0 +1,344 @@
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "util/thread.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/testing/regexp_generator.h"
#include "re2/testing/string_generator.h"
DECLARE_bool(re2_dfa_bail_when_slow);
DEFINE_int32(size, 8, "log2(number of DFA nodes)");
DEFINE_int32(repeat, 2, "Repetition count.");
DEFINE_int32(threads, 4, "number of threads");
namespace re2 {
// Check that multithreaded access to DFA class works.

// Worker thread whose Run() builds the entire first-match DFA for a Prog
// and CHECK-fails if the build reports failure.  The Prog is not owned.
class BuildThread : public Thread {
 public:
  BuildThread(Prog* prog) : prog_(prog) {}

  virtual void Run() {
    CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch));
  }

 private:
  Prog* prog_;  // program whose DFA is built
};
// Builds the whole DFA for a[ab]{FLAGS_size}b, first from a single helper
// thread and then from FLAGS_threads helper threads at once, repeating the
// concurrent round FLAGS_repeat times.
TEST(Multithreaded, BuildEntireDFA) {
  // Create regexp with 2^FLAGS_size states in DFA.
  string s = "a";
  for (int k = 0; k < FLAGS_size; k++)
    s += "[ab]";
  s += "b";

  // Check that single-threaded code works.
  {
    Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
    CHECK(re);
    Prog* prog = re->CompileToProg(0);
    CHECK(prog);

    BuildThread* solo = new BuildThread(prog);
    solo->SetJoinable(true);
    solo->Start();
    solo->Join();
    delete solo;

    delete prog;
    re->Decref();
  }

  // Build the DFA simultaneously in a bunch of threads.
  for (int round = 0; round < FLAGS_repeat; round++) {
    Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
    CHECK(re);
    Prog* prog = re->CompileToProg(0);
    CHECK(prog);

    // Create all workers first, then start them together.
    vector<BuildThread*> workers;
    for (int j = 0; j < FLAGS_threads; j++) {
      BuildThread *worker = new BuildThread(prog);
      worker->SetJoinable(true);
      workers.push_back(worker);
    }
    for (int j = 0; j < FLAGS_threads; j++)
      workers[j]->Start();
    for (int j = 0; j < FLAGS_threads; j++) {
      workers[j]->Join();
      delete workers[j];
    }

    // One more compile, to make sure everything is okay.
    prog->BuildEntireDFA(Prog::kFirstMatch);

    delete prog;
    re->Decref();
  }
}
// Check that DFA size requirements are followed.
// BuildEntireDFA will, like SearchDFA, stop building out
// the DFA once the memory limits are reached.
// For each budget from 2^17 to 2^23 bytes, the measured heap growth must be
// more than 90% of the budget and less than the budget plus 16kB.
TEST(SingleThreaded, BuildEntireDFA) {
  // Create regexp with 2^30 states in DFA.
  string s = "a";
  for (int k = 0; k < 30; k++)
    s += "[ab]";
  s += "b";

  Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
  CHECK(re);

  const int kMaxShift = 24;
  for (int shift = 17; shift < kMaxShift; shift++) {
    int limit = 1<<shift;
    int usage;
    {
      // Measure heap growth while the program exists and both DFAs are built.
      testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
      Prog* prog = re->CompileToProg(limit);
      CHECK(prog);
      prog->BuildEntireDFA(Prog::kFirstMatch);
      prog->BuildEntireDFA(Prog::kLongestMatch);
      usage = m.HeapGrowth();
      delete prog;
    }

    // Without a malloc counter there is nothing to verify.
    if (!UsingMallocCounter)
      continue;

    CHECK_GT(usage, limit*9/10);
    CHECK_LT(usage, limit + (16<<10));  // 16kB of slop okay
  }

  re->Decref();
}
// Generates and returns a string over binary alphabet {0,1} that contains
// all possible binary sequences of length n as (contiguous) substrings.
// The obvious brute force method would generate a string of length n * 2^n,
// but this generates a string of length n + 2^n - 1 called a De Bruijn cycle.
// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
//
// Such a string is useful for testing a DFA.  If you have a DFA
// where distinct last n bytes implies distinct states, then running on a
// DeBruijn string causes the DFA to need to create a new state at every
// position in the input, never reusing any states until it gets to the
// end of the string.  This is the worst possible case for DFA execution.
static string DeBruijnString(int n) {
  CHECK_LT(n, 8*sizeof(int));
  CHECK_GT(n, 0);

  // did[w] records whether the n-bit window w has been produced already.
  // The vector starts out all false.
  vector<bool> did(1<<n);

  // Lead-in of n-1 zeros, so that the first appended bit completes a window.
  string s;
  for (int k = 0; k < n-1; k++)
    s.append("0");

  const int mask = (1<<n) - 1;
  int bits = 0;
  for (int step = 0; step < (1<<n); step++) {
    // Slide the window by one bit, preferring to append a 1 whenever the
    // resulting window has not been seen yet.
    bits = (bits << 1) & mask;
    const bool take_one = !did[bits|1];
    if (take_one)
      bits |= 1;
    s.append(take_one ? "1" : "0");

    CHECK(!did[bits]);
    did[bits] = true;
  }
  return s;
}
// Test that the DFA gets the right result even if it runs
// out of memory during a search.  The regular expression
// 0[01]{n}$ matches a binary string of 0s and 1s only if
// the (n+1)th-to-last character is a 0.  Matching this in
// a single forward pass (as done by the DFA) requires
// keeping one bit for each of the last n+1 characters
// (whether each was a 0), or 2^(n+1) possible states.
// If we run this regexp to search in a string that contains
// every possible n-character binary string as a substring,
// then it will have to run through at least 2^n states.
// States are big data structures -- certainly more than 1 byte --
// so if the DFA can search correctly while staying within a
// 2^n byte limit, it must be handling out-of-memory conditions
// gracefully.
TEST(SingleThreaded, SearchDFA) {
  // Choice of n is mostly arbitrary, except that:
  //   * making n too big makes the test run for too long.
  //   * making n too small makes the DFA refuse to run,
  //     because it has so little memory compared to the program size.
  // Empirically, n = 18 is a good compromise between the two.
  const int n = 18;

  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
                             Regexp::LikePerl, NULL);
  CHECK(re);

  // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
  // which is not a match for 0[01]{n}$.  Adding one more 0 is a match.
  string no_match = DeBruijnString(n);
  string match = no_match + "0";

  // The De Bruijn string is the worst case input for this regexp.
  // By default, the DFA will notice that it is flushing its cache
  // too frequently and will bail out early, so that RE2 can use the
  // NFA implementation instead.  (The DFA loses its speed advantage
  // if it can't get a good cache hit rate.)
  // Tell the DFA to trudge along instead.
  FLAGS_re2_dfa_bail_when_slow = false;

  int64 usage;
  int64 peak_usage;
  {
    testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
    Prog* prog = re->CompileToProg(1<<n);
    CHECK(prog);

    // Alternate a search that must match with one that must not, ten times.
    for (int round = 0; round < 10; round++) {
      bool failed = false;
      bool matched = prog->SearchDFA(match, NULL,
                                     Prog::kUnanchored, Prog::kFirstMatch,
                                     NULL, &failed, NULL);
      CHECK(!failed);
      CHECK(matched);

      matched = prog->SearchDFA(no_match, NULL,
                                Prog::kUnanchored, Prog::kFirstMatch,
                                NULL, &failed, NULL);
      CHECK(!failed);
      CHECK(!matched);
    }

    usage = m.HeapGrowth();
    peak_usage = m.PeakHeapGrowth();
    delete prog;
  }
  re->Decref();

  // The memory bounds can only be checked when a malloc counter is in use.
  if (!UsingMallocCounter)
    return;
  CHECK_LT(usage, 1<<n);
  CHECK_LT(peak_usage, 1<<n);
}
// Worker thread that runs two rounds of DFA searches on a shared Prog:
// each round searches match (which must match) and then no_match (which
// must not), CHECK-failing on a DFA failure or on an unexpected result.
class SearchThread : public Thread {
 public:
  SearchThread(Prog* prog, const StringPiece& match,
               const StringPiece& no_match)
    : prog_(prog), match_(match), no_match_(no_match) {}

  virtual void Run() {
    for (int round = 0; round < 2; round++) {
      bool failed = false;
      bool matched = prog_->SearchDFA(match_, NULL,
                                      Prog::kUnanchored, Prog::kFirstMatch,
                                      NULL, &failed, NULL);
      CHECK(!failed);
      CHECK(matched);

      matched = prog_->SearchDFA(no_match_, NULL,
                                 Prog::kUnanchored, Prog::kFirstMatch,
                                 NULL, &failed, NULL);
      CHECK(!failed);
      CHECK(!matched);
    }
  }

 private:
  Prog* prog_;            // program to search with
  StringPiece match_;     // text that must match
  StringPiece no_match_;  // text that must not match
};
// Runs the memory-constrained DFA search from the single-threaded test
// above, first from one helper thread and then from FLAGS_threads helper
// threads sharing a single Prog.
TEST(Multithreaded, SearchDFA) {
  // Same as single-threaded test above.
  const int n = 18;
  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
                             Regexp::LikePerl, NULL);
  CHECK(re);
  string no_match = DeBruijnString(n);
  string match = no_match + "0";
  FLAGS_re2_dfa_bail_when_slow = false;

  // Check that single-threaded code works.
  {
    Prog* prog = re->CompileToProg(1<<n);
    CHECK(prog);

    SearchThread* solo = new SearchThread(prog, match, no_match);
    solo->SetJoinable(true);
    solo->Start();
    solo->Join();
    delete solo;

    delete prog;
  }

  // Run the search simultaneously in a bunch of threads.
  // Reuse same flags for Multithreaded.BuildEntireDFA above.
  for (int round = 0; round < FLAGS_repeat; round++) {
    Prog* prog = re->CompileToProg(1<<n);
    CHECK(prog);

    // Create all workers first, then start them together.
    vector<SearchThread*> workers;
    for (int j = 0; j < FLAGS_threads; j++) {
      SearchThread *worker = new SearchThread(prog, match, no_match);
      worker->SetJoinable(true);
      workers.push_back(worker);
    }
    for (int j = 0; j < FLAGS_threads; j++)
      workers[j]->Start();
    for (int j = 0; j < FLAGS_threads; j++) {
      workers[j]->Join();
      delete workers[j];
    }

    delete prog;
  }

  re->Decref();
}
// One reverse-DFA test case.
struct ReverseTest {
  const char *regexp;  // pattern to compile into a reverse program
  const char *text;    // text to search
  bool match;          // whether the search is expected to match
};

// Test that reverse DFA handles anchored/unanchored correctly.
// It's in the DFA interface but not used by RE2.
ReverseTest reverse_tests[] = {
  // Expected to match.
  { "\\A(a|b)", "abc", true },
  { "(a|b)\\z", "cba", true },

  // Expected not to match.
  { "\\A(a|b)", "cba", false },
  { "(a|b)\\z", "abc", false },
};
// Compiles each reverse_tests[] pattern into a reverse program, runs an
// unanchored first-match DFA search over the text, and compares the outcome
// with the expected one.  A search that reports failure through its
// `failed` out-parameter is counted as a test failure, so that it cannot be
// mistaken for an expected non-match.
TEST(DFA, ReverseMatch) {
  int nfail = 0;
  for (int i = 0; i < arraysize(reverse_tests); i++) {
    const ReverseTest& t = reverse_tests[i];
    Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
    CHECK(re);
    Prog *prog = re->CompileToReverseProg(0);
    CHECK(prog);

    bool failed = false;
    bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored,
                                   Prog::kFirstMatch, NULL, &failed, NULL);
    if (failed) {
      // The DFA gave up, so `matched` carries no information.
      LOG(ERROR) << t.regexp << " on " << t.text << ": DFA search failed";
      nfail++;
    } else if (matched != t.match) {
      LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
      nfail++;
    }

    delete prog;
    re->Decref();
  }
  EXPECT_EQ(nfail, 0);
}
} // namespace re2

View File

@ -0,0 +1,164 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Dump the regexp into a string showing structure.
// Tested by parse_unittest.cc
// This function traverses the regexp recursively,
// meaning that on inputs like Regexp::Simplify of
// a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100},
// it takes time and space exponential in the size of the
// original regular expression. It can also use stack space
// linear in the size of the regular expression for inputs
// like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*.
// IT IS NOT SAFE TO CALL FROM PRODUCTION CODE.
// As a result, Dump is provided only in the testing
// library (see BUILD).
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/stringpiece.h"
#include "re2/regexp.h"
// Cause a link error if this file is used outside of testing.
DECLARE_string(test_tmpdir);
namespace re2 {
// Short mnemonics used in the dumped form of a regexp, indexed by the
// numeric value of re->op() (DumpRegexpAppending bounds-checks the op
// against this table before indexing it).
// NOTE(review): the entries presumably follow the declaration order of the
// regexp op enum -- confirm against its definition when adding an op.
static const char* kOpcodeNames[] = {
"bad",
"no",
"emp",
"lit",
"str",
"cat",
"alt",
"star",
"plus",
"que",
"rep",
"cap",
"dot",
"byte",
"bol",
"eol",
"wb", // kRegexpWordBoundary
"nwb", // kRegexpNoWordBoundary
"bot",
"eot",
"cc",
"match",
};
// Create string representation of regexp with explicit structure.
// Nothing pretty, just for testing.
static void DumpRegexpAppending(Regexp* re, string* s) {
if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) {
StringAppendF(s, "op%d", re->op());
} else {
switch (re->op()) {
default:
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
if (re->parse_flags() & Regexp::NonGreedy)
s->append("n");
break;
}
s->append(kOpcodeNames[re->op()]);
if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) {
Rune r = re->rune();
if ('a' <= r && r <= 'z')
s->append("fold");
}
if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) {
for (int i = 0; i < re->nrunes(); i++) {
Rune r = re->runes()[i];
if ('a' <= r && r <= 'z') {
s->append("fold");
break;
}
}
}
}
s->append("{");
switch (re->op()) {
default:
break;
case kRegexpEndText:
if (!(re->parse_flags() & Regexp::WasDollar)) {
s->append("\\z");
}
break;
case kRegexpLiteral: {
Rune r = re->rune();
char buf[UTFmax+1];
buf[runetochar(buf, &r)] = 0;
s->append(buf);
break;
}
case kRegexpLiteralString:
for (int i = 0; i < re->nrunes(); i++) {
Rune r = re->runes()[i];
char buf[UTFmax+1];
buf[runetochar(buf, &r)] = 0;
s->append(buf);
}
break;
case kRegexpConcat:
case kRegexpAlternate:
for (int i = 0; i < re->nsub(); i++)
DumpRegexpAppending(re->sub()[i], s);
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
DumpRegexpAppending(re->sub()[0], s);
break;
case kRegexpCapture:
if (re->name()) {
s->append(*re->name());
s->append(":");
}
DumpRegexpAppending(re->sub()[0], s);
break;
case kRegexpRepeat:
s->append(StringPrintf("%d,%d ", re->min(), re->max()));
DumpRegexpAppending(re->sub()[0], s);
break;
case kRegexpCharClass: {
string sep;
for (CharClass::iterator it = re->cc()->begin();
it != re->cc()->end(); ++it) {
RuneRange rr = *it;
s->append(sep);
if (rr.lo == rr.hi)
s->append(StringPrintf("%#x", rr.lo));
else
s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi));
sep = " ";
}
break;
}
}
s->append("}");
}
// Returns the structural dump of this regexp.
// Only usable from unit tests: when the test_tmpdir flag is empty an
// error is logged and an empty string is returned instead.
string Regexp::Dump() {
  string out;
  if (!FLAGS_test_tmpdir.empty()) {
    DumpRegexpAppending(this, &out);
  } else {
    LOG(ERROR) << "Cannot use except for testing.";
  }
  return out;
}
} // namespace re2

View File

@ -0,0 +1,42 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
DECLARE_string(regexp_engines);
namespace re2 {
// Exhaustively tests the counted ({n}, {n,}, {n,m}) and symbolic
// (* + ?, greedy and non-greedy) repetition operators on plain atoms.
TEST(Repetition, Simple) {
  vector<string> repeat_ops = Split(" ",
      "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
      "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
      "%s* %s+ %s? %s*? %s+? %s??");

  // Short strings over a two-letter alphabet.
  ExhaustiveTest(3, 2, Explode("abc."), repeat_ops,
                 6, Explode("ab"), "(?:%s)", "");

  // Much longer strings over a one-letter alphabet.
  ExhaustiveTest(3, 2, Explode("abc."), repeat_ops,
                 40, Explode("a"), "(?:%s)", "");
}
// Tests repetition operators applied to atoms that include a capturing
// group -- (a).
TEST(Repetition, Capturing) {
  vector<string> repeat_ops = Split(" ",
      "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
      "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
      "%s* %s+ %s? %s*? %s+? %s??");

  ExhaustiveTest(3, 2, Split(" ", "a (a) b"), repeat_ops,
                 7, Explode("ab"), "(?:%s)", "");

  // This would be a great test, but it runs forever when PCRE is enabled.
  // Note the argument order: the haystack is the literal "PCRE" and the
  // needle is the flag value, so PCRE counts as selected whenever the flag
  // text occurs inside "PCRE" -- which includes an empty flag.
  const bool pcre_selected =
      strstr("PCRE", FLAGS_regexp_engines.c_str()) != NULL;
  if (!pcre_selected)
    ExhaustiveTest(4, 3, Split(" ", "a (a)"), repeat_ops,
                   100, Explode("a"), "(?:%s)", "");
}
} // namespace re2

View File

@ -0,0 +1,70 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include "util/test.h"
#include "re2/re2.h"
#include "re2/testing/exhaustive_tester.h"
DECLARE_string(regexp_engines);
namespace re2 {
// Tests regexps built from the empty regexp "(?:)" and the literal "a",
// combined with the egrep operators.
TEST(EmptyString, Exhaustive) {
  const vector<string> atoms = Split(" ", "(?:) a");
  const vector<string> string_alphabet = Split("", "ab");
  ExhaustiveTest(2, 2, atoms,
                 RegexpGenerator::EgrepOps(),
                 5, string_alphabet, "", "");
}
// Tests backslash-escaped regexp punctuation: every regexp atom is a
// punctuation character preceded by a backslash, and the test strings are
// drawn from the same, unescaped, punctuation characters.
TEST(Punctuation, Literals) {
  const vector<string> alphabet = Explode("()*+?{}[]\\^$.");
  vector<string> escaped;
  for (size_t i = 0; i < alphabet.size(); i++)
    escaped.push_back("\\" + alphabet[i]);
  ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(),
                 2, alphabet, "", "");
}
// Tests ^ $ . \A \z against strings that contain line endings.
// The empty-width operators are wrapped in (?:) so that they can be
// repeated -- PCRE rejects ^* but allows (?:^)*
TEST(LineEnds, Exhaustive) {
  const vector<string> atoms =
      Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)");
  const vector<string> string_alphabet = Explode("ab\n");
  ExhaustiveTest(2, 2, atoms,
                 RegexpGenerator::EgrepOps(),
                 4, string_alphabet, "", "");
}
// Test what does and does not match \n.
// This would be a good test, except that PCRE seems to have a bug:
// in single-byte character set mode (the default),
// [^a] matches \n, but in UTF-8 mode it does not.
// So when we run the test, the tester complains that
// we don't agree with PCRE, but it's PCRE that is at fault.
// For what it's worth, Perl gets this right (matches
// regardless of whether UTF-8 input is selected):
//
// #!/usr/bin/perl
// use POSIX qw(locale_h);
// print "matches in latin1\n" if "\n" =~ /[^a]/;
// setlocale("en_US.utf8");
// print "matches in utf8\n" if "\n" =~ /[^a]/;
//
// The rule chosen for RE2 is that by default, like Perl,
// dot does not match \n but negated character classes [^a] do.
// (?s) will allow dot to match \n; there is no way in RE2
// to stop [^a] from matching \n, though the underlying library
// provides a mechanism, and RE2 could add new syntax if needed.
//
// TEST(Newlines, Exhaustive) {
// vector<string> empty_vector;
// ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
// RegexpGenerator::EgrepOps(),
// 4, Explode("a\n"), "");
// }
} // namespace re2

View File

@ -0,0 +1,94 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {
// Tests a collection of small bracket expressions (plus a, b and .)
// used directly as regexp atoms.
TEST(CharacterClasses, Exhaustive) {
  const vector<string> class_atoms = Split(" ",
      "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
  const vector<string> string_alphabet = Explode("ab");
  ExhaustiveTest(2, 1, class_atoms, RegexpGenerator::EgrepOps(),
                 5, string_alphabet, "", "");
}
// Same atoms as the plain character-class test, but run with the
// wrapper "a%sb" (for example, a[a]b).
TEST(CharacterClasses, ExhaustiveAB) {
  const vector<string> class_atoms = Split(" ",
      "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
  const vector<string> string_alphabet = Explode("ab");
  ExhaustiveTest(2, 1, class_atoms, RegexpGenerator::EgrepOps(),
                 5, string_alphabet, "a%sb", "");
}
// Returns the UTF-8 encoding of Rune r as a string.
// The bytes are NUL-terminated before being converted, so the result
// stops at the first zero byte of the encoding.
static string UTF8(Rune r) {
  char encoded[UTFmax+1];
  const int nbytes = runetochar(encoded, &r);
  encoded[nbytes] = 0;
  return string(encoded);
}
// Returns a vector of "interesting" UTF-8 encoded characters.
// Unicode is now too big to just return all of them,
// so InterestingUTF8 returns a set likely to be good test cases.
// The vector is filled in on the first call and kept in a function-local
// static; every later call returns that same vector.
static const vector<string>& InterestingUTF8() {
  static bool built;
  static vector<string> chars;
  if (!built) {
    built = true;

    // All the Latin1 equivalents (1 through 255) are interesting.
    for (int r = 1; r < 256; r++)
      chars.push_back(UTF8(r));

    // After that, the codes near bit boundaries are
    // interesting, because they span byte sequence lengths.
    for (int r = 256; r < 256 + 8; r++)
      chars.push_back(UTF8(r));
    for (int base = 512; base < Runemax; base <<= 1)
      for (int r = base - 8; r < base + 8; r++)
        chars.push_back(UTF8(r));

    // The codes near Runemax, including Runemax itself, are interesting.
    for (int r = Runemax - 8; r <= Runemax; r++)
      chars.push_back(UTF8(r));
  }
  return chars;
}
// Matches each interesting UTF-8 character against single escapes,
// anchors and character classes, with no operators applied.
TEST(InterestingUTF8, SingleOps) {
  const vector<string> class_atoms = Split(" ",
      ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
      "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
      "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
      "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
  const vector<string> no_ops;  // deliberately empty
  ExhaustiveTest(1, 0, class_atoms, no_ops,
                 1, InterestingUTF8(), "", "");
}
// Like the single-operator test, but every test string is an interesting
// character surrounded by "a" and "b", and the wrapper "a%sb" is used.
TEST(InterestingUTF8, AB) {
  const vector<string> class_atoms = Split(" ",
      ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
      "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
      "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
      "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
  const vector<string> no_ops;  // deliberately empty

  // Work on a copy so the cached vector is left untouched.
  vector<string> wrapped = InterestingUTF8();
  for (vector<string>::iterator it = wrapped.begin();
       it != wrapped.end(); ++it)
    *it = "a" + *it + "b";

  ExhaustiveTest(1, 0, class_atoms, no_ops,
                 1, wrapped, "a%sb", "");
}
} // namespace re2

View File

@ -0,0 +1,38 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {
DECLARE_string(regexp_engines);
// EgrepTest arguments, in order: maximum atoms per regexp, maximum
// operators per regexp, regexp alphabet, maximum test string length,
// test string alphabet, and a wrapper format string.

// Test very simple expressions.
TEST(EgrepLiterals, Lowercase) {
EgrepTest(3, 2, "abc.", 3, "abc", "");
}
// Test mixed-case expressions.
TEST(EgrepLiterals, MixedCase) {
EgrepTest(3, 2, "AaBb.", 2, "AaBb", "");
}
// Test mixed-case in case-insensitive mode.
TEST(EgrepLiterals, FoldCase) {
// The punctuation characters surround A-Z and a-z
// in the ASCII table. This looks for bugs in the
// bytemap range code in the DFA.
EgrepTest(3, 2, "abAB.", 2, "aBc@_~", "(?i:%s)");
}
// Test simple expressions against strings containing a multi-byte
// UTF-8 sequence (the bytes E2 98 BA).
TEST(EgrepLiterals, UTF8) {
EgrepTest(3, 2, "ab.", 4, "a\xE2\x98\xBA", "");
}
} // namespace re2

View File

@ -0,0 +1,188 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
// Each test picks an alphabet (e.g., "abc"), a maximum string length,
// a maximum regular expression length, and a maximum number of letters
// that can appear in the regular expression. Given these parameters,
// it tries every possible regular expression and string, verifying that
// the NFA, DFA, and a trivial backtracking implementation agree about
// the location of the match.
#include <stdlib.h>
#include <stdio.h>
#ifndef LOGGING
#define LOGGING 0
#endif
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
#include "re2/testing/tester.h"
DEFINE_bool(show_regexps, false, "show regexps during testing");
DEFINE_int32(max_bad_regexp_inputs, 1,
"Stop testing a regular expression after finding this many "
"strings that break it.");
// Compiled in debug mode, the usual tests run for over an hour.
// Have to cut it down to make the unit test machines happy.
DEFINE_bool(quick_debug_mode, true, "Run fewer tests in debug mode.");
namespace re2 {
// Returns sp as a double-quoted string in which backslash and double
// quote are backslash-escaped and newline is written as \n.
// The result lives in a function-local static buffer, so it is
// overwritten by the next call. Logs a fatal error if sp does not fit.
static char* escape(const StringPiece& sp) {
  static char buf[512];
  char* const limit = buf + sizeof buf;
  char* out = buf;
  *out++ = '\"';
  for (int i = 0; i < sp.size(); i++) {
    // Keep room for the widest escape plus the closing quote and NUL.
    if (out + 5 >= limit)
      LOG(FATAL) << "ExhaustiveTester escape: too long";
    const char c = sp[i];
    switch (c) {
      case '\\':
      case '\"':
        *out++ = '\\';
        *out++ = c;
        break;
      case '\n':
        *out++ = '\\';
        *out++ = 'n';
        break;
      default:
        *out++ = c;
        break;
    }
  }
  *out++ = '\"';
  *out = '\0';
  return buf;
}
static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) {
if (!re.Match(input, 0, input.size(), anchor, m, n)) {
printf("-");
return;
}
for (int i = 0; i < n; i++) {
if (i > 0)
printf(" ");
if (m[i].begin() == NULL)
printf("-");
else
printf("%d-%d", static_cast<int>(m[i].begin() - input.begin()), static_cast<int>(m[i].end() - input.begin()));
}
}
// Processes a single generated regexp.
// Compiles it using Regexp interface and PCRE, and then
// checks that NFA, DFA, and PCRE all return the same results.
//
// Two modes, selected by the compile-time LOGGING macro:
//  - LOGGING != 0: nothing is checked. The test strings (once, on the
//    first regexp), the regexp, and RE2's match results are printed to
//    stdout, and the function returns.
//  - LOGGING == 0: every generated string is run through a Tester and
//    the tests_/failures_ counters are updated.
void ExhaustiveTester::HandleRegexp(const string& const_regexp) {
regexps_++;
string regexp = const_regexp;
// topwrapper_ is used as a printf-style format taking the regexp.
if (!topwrapper_.empty())
regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str());
if (FLAGS_show_regexps) {
printf("\r%s", regexp.c_str());
fflush(stdout);
}
if (LOGGING) {
// Write out test cases and answers for use in testing
// other implementations, such as Go's regexp package.
if (randomstrings_)
LOG(ERROR) << "Cannot log with random strings.";
if (regexps_ == 1) { // first
printf("strings\n");
strgen_.Reset();
while (strgen_.HasNext())
printf("%s\n", escape(strgen_.Next()));
printf("regexps\n");
}
printf("%s\n", escape(regexp));
RE2 re(regexp);
RE2::Options longest;
longest.set_longest_match(true);
RE2 relongest(regexp, longest);
// One slot per capturing group plus one for the overall match.
int ngroup = re.NumberOfCapturingGroups()+1;
StringPiece* group = new StringPiece[ngroup];
strgen_.Reset();
// One output line per input, with four ';'-separated results:
// default options anchored at both ends, default options unanchored,
// longest-match anchored at both ends, longest-match unanchored.
while (strgen_.HasNext()) {
StringPiece input = strgen_.Next();
PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup);
printf(";");
PrintResult(re, input, RE2::UNANCHORED, group, ngroup);
printf(";");
PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup);
printf(";");
PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup);
printf("\n");
}
delete[] group;
return;
}
Tester tester(regexp);
// Regexps for which the Tester reports an error are skipped silently.
if (tester.error())
return;
strgen_.Reset();
strgen_.GenerateNULL();
if (randomstrings_)
strgen_.Random(stringseed_, stringcount_);
// Stop testing this regexp once it has failed on
// FLAGS_max_bad_regexp_inputs inputs.
int bad_inputs = 0;
while (strgen_.HasNext()) {
tests_++;
if (!tester.TestInput(strgen_.Next())) {
failures_++;
if (++bad_inputs >= FLAGS_max_bad_regexp_inputs)
break;
}
}
}
// Runs an exhaustive test on the given parameters.
void ExhaustiveTest(int maxatoms, int maxops,
const vector<string>& alphabet,
const vector<string>& ops,
int maxstrlen, const vector<string>& stralphabet,
const string& wrapper,
const string& topwrapper) {
if (DEBUG_MODE && FLAGS_quick_debug_mode) {
if (maxatoms > 1)
maxatoms--;
if (maxops > 1)
maxops--;
if (maxstrlen > 1)
maxstrlen--;
}
ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
maxstrlen, stralphabet, wrapper,
topwrapper);
t.Generate();
if (!LOGGING) {
printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
}
EXPECT_EQ(0, t.failures());
}
// Runs an exhaustive test using the given parameters and
// the basic egrep operators.
void EgrepTest(int maxatoms, int maxops, const string& alphabet,
int maxstrlen, const string& stralphabet,
const string& wrapper) {
const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" };
for (int i = 0; i < arraysize(tops); i++) {
ExhaustiveTest(maxatoms, maxops,
Split("", alphabet),
RegexpGenerator::EgrepOps(),
maxstrlen,
Split("", stralphabet),
wrapper,
tops[i]);
}
}
} // namespace re2

View File

@ -0,0 +1,85 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H__
#define RE2_TESTING_EXHAUSTIVE_TESTER_H__
#include <string>
#include <vector>
#include "util/util.h"
#include "re2/testing/regexp_generator.h"
#include "re2/testing/string_generator.h"
namespace re2 {
// Exhaustive regular expression test: generate all regexps within parameters,
// then generate all strings of a given length over a given alphabet,
// then check that NFA, DFA, and PCRE agree about whether each regexp matches
// each possible string, and if so, where the match is.
//
// Can also be used in a "random" mode that generates a given number
// of random regexp and strings, allowing testing of larger expressions
// and inputs.
class ExhaustiveTester : public RegexpGenerator {
 public:
  // maxatoms, maxops, alphabet and ops are forwarded to RegexpGenerator;
  // maxstrlen and stralphabet configure the string generator.
  // wrapper and topwrapper are stored as given. All counters start at
  // zero and random-string mode starts disabled.
  ExhaustiveTester(int maxatoms,
                   int maxops,
                   const vector<string>& alphabet,
                   const vector<string>& ops,
                   int maxstrlen,
                   const vector<string>& stralphabet,
                   const string& wrapper,
                   const string& topwrapper)
    : RegexpGenerator(maxatoms, maxops, alphabet, ops),
      strgen_(maxstrlen, stralphabet),
      wrapper_(wrapper),
      topwrapper_(topwrapper),
      regexps_(0), tests_(0), failures_(0),
      randomstrings_(false), stringseed_(0), stringcount_(0) { }

  // Read-only access to the counters; callable on const objects.
  int regexps() const { return regexps_; }
  int tests() const { return tests_; }
  int failures() const { return failures_; }

  // Needed for RegexpGenerator interface.
  void HandleRegexp(const string& regexp);

  // Causes testing to generate random input strings.
  void RandomStrings(int32 seed, int32 count) {
    randomstrings_ = true;
    stringseed_ = seed;
    stringcount_ = count;
  }

 private:
  StringGenerator strgen_;
  string wrapper_;      // Regexp wrapper - either empty or has one %s.
  string topwrapper_;   // Regexp top-level wrapper.
  int regexps_;         // Number of HandleRegexp calls
  int tests_;           // Number of regexp tests.
  int failures_;        // Number of tests failed.
  bool randomstrings_;  // Whether to use random strings
  int32 stringseed_;    // If so, the seed.
  int stringcount_;     // If so, how many to generate.
  DISALLOW_EVIL_CONSTRUCTORS(ExhaustiveTester);
};
// Runs an exhaustive test on the given parameters.
//   maxatoms, maxops   - limits handed to the regexp generator
//   alphabet, ops      - atoms and operators used to build regexps
//   maxstrlen          - maximum length of generated test strings
//   stralphabet        - pieces used to build test strings
//   wrapper            - regexp wrapper; either empty or has one %s
//   topwrapper         - top-level regexp wrapper
void ExhaustiveTest(int maxatoms, int maxops,
const vector<string>& alphabet,
const vector<string>& ops,
int maxstrlen, const vector<string>& stralphabet,
const string& wrapper,
const string& topwrapper);
// Runs an exhaustive test using the given parameters and
// the basic egrep operators.
// alphabet and stralphabet are plain strings here (not vectors of pieces)
// and no top-level wrapper is supplied by the caller.
void EgrepTest(int maxatoms, int maxops, const string& alphabet,
int maxstrlen, const string& stralphabet,
const string& wrapper);
} // namespace re2
#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H__

View File

@ -0,0 +1,275 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "re2/filtered_re2.h"
#include "re2/re2.h"
DECLARE_int32(filtered_re2_min_atom_len); // From prefilter_tree.cc
namespace re2 {
// Bundles the state a FilteredRE2 test needs, so each test can start
// from a fresh instance.
struct FilterTestVars {
vector<string> atoms;      // Filled in by f.Compile().
vector<int> atom_indices;  // Indices of matched atoms, passed to f.AllMatches().
vector<int> matches;       // Output of f.AllMatches().
RE2::Options opts;         // Options passed to f.Add().
FilteredRE2 f;             // The object under test.
};
// With no regexps added, AllMatches must report no matches.
TEST(FilteredRE2Test, EmptyTest) {
  FilterTestVars vars;
  vars.f.AllMatches("foo", vars.atom_indices, &vars.matches);
  EXPECT_EQ(0, vars.matches.size());
}
// With a minimum atom length of 4, "(foo|bar)" is expected to yield no
// atoms at all, yet the regexp must still be reported as matching.
TEST(FilteredRE2Test, SmallOrTest) {
  FLAGS_filtered_re2_min_atom_len = 4;

  FilterTestVars vars;
  int regexp_id;
  vars.f.Add("(foo|bar)", vars.opts, &regexp_id);
  vars.f.Compile(&vars.atoms);
  EXPECT_EQ(0, vars.atoms.size());

  vars.f.AllMatches("lemurs bar", vars.atom_indices, &vars.matches);
  EXPECT_EQ(1, vars.matches.size());
  EXPECT_EQ(regexp_id, vars.matches[0]);
}
// With UTF-8 turned off, a pattern containing high bytes is expected to
// produce a single atom in which only the ASCII letter is lower-cased.
TEST(FilteredRE2Test, SmallLatinTest) {
  FLAGS_filtered_re2_min_atom_len = 3;

  FilterTestVars vars;
  int regexp_id;
  vars.opts.set_utf8(false);
  vars.f.Add("\xde\xadQ\xbe\xef", vars.opts, &regexp_id);
  vars.f.Compile(&vars.atoms);
  EXPECT_EQ(1, vars.atoms.size());
  EXPECT_EQ(vars.atoms[0], "\xde\xadq\xbe\xef");

  // Report atom 0 as present and check that the regexp matches.
  vars.atom_indices.push_back(0);
  vars.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", vars.atom_indices,
                    &vars.matches);
  EXPECT_EQ(1, vars.matches.size());
  EXPECT_EQ(regexp_id, vars.matches[0]);
}
// One table-driven atom test: a set of regexps and the atoms that
// compiling them is expected to produce. The tests that consume this
// table treat the first NULL entry in each array as its end.
struct AtomTest {
const char* testname;
// If any test needs more than this many regexps or atoms, increase
// the size of the corresponding array.
const char* regexps[20];
const char* atoms[20];
};
// Table of atom tests. Each entry is { testname, { regexps }, { atoms } }.
// NOTE: MatchEmptyPattern and MatchTests below pick entries by position
// (indices 0 and 2), so new entries should be appended rather than
// inserted in the middle.
AtomTest atom_tests[] = {
{
// This test checks to make sure empty patterns are allowed.
"CheckEmptyPattern",
{""},
{}
}, {
// This test checks that all atoms of length greater than min length
// are found, and no atoms that are of smaller length are found.
"AllAtomsGtMinLengthFound", {
"(abc123|def456|ghi789).*mnop[x-z]+",
"abc..yyy..zz",
"mnmnpp[a-z]+PPP"
}, {
"abc123",
"def456",
"ghi789",
"mnop",
"abc",
"yyy",
"mnmnpp",
"ppp"
}
}, {
// Test to make sure that any atoms that have another atom as a
// substring in an OR are removed; that is, only the shortest
// substring is kept.
"SubstrAtomRemovesSuperStrInOr", {
"(abc123|abc|ghi789|abc1234).*[x-z]+",
"abcd..yyy..yyyzzz",
"mnmnpp[a-z]+PPP"
}, {
"abc",
"ghi789",
"abcd",
"yyy",
"yyyzzz",
"mnmnpp",
"ppp"
}
}, {
// Test character class expansion.
"CharClassExpansion", {
"m[a-c][d-f]n.*[x-z]+",
"[x-y]bcde[ab]"
}, {
"madn", "maen", "mafn",
"mbdn", "mben", "mbfn",
"mcdn", "mcen", "mcfn",
"xbcdea", "xbcdeb",
"ybcdea", "ybcdeb"
}
}, {
// Test upper/lower of non-ASCII.
"UnicodeLower", {
"(?i)ΔδΠϖπΣςσ",
"ΛΜΝΟΠ",
"ψρστυ",
}, {
"δδπππσσσ",
"λμνοπ",
"ψρστυ",
},
},
};
// Adds the first n entries of regexps to v->f using v->opts, then
// compiles the filter, storing the resulting atoms in v->atoms.
// The ids assigned by Add are discarded.
void AddRegexpsAndCompile(const char* regexps[],
                          int n,
                          struct FilterTestVars* v) {
  int ignored_id;
  for (const char** re = regexps; re != regexps + n; ++re)
    v->f.Add(*re, v->opts, &ignored_id);
  v->f.Compile(&v->atoms);
}
// Returns true when v->atoms holds exactly the first n strings of atoms,
// ignoring order. Sorts v->atoms in place as a side effect. On a
// mismatch, logs the test name plus both atom lists and returns false.
bool CheckExpectedAtoms(const char* atoms[],
                        int n,
                        const char* testname,
                        struct FilterTestVars* v) {
  vector<string> expected(atoms, atoms + n);
  sort(v->atoms.begin(), v->atoms.end());
  sort(expected.begin(), expected.end());

  // Equal sizes and equal elements once both sides are sorted.
  if (expected == v->atoms)
    return true;

  LOG(WARNING) << "Failed " << testname;
  LOG(WARNING) << "Expected #atoms = " << expected.size();
  for (size_t i = 0; i < expected.size(); i++)
    LOG(WARNING) << expected[i];
  LOG(WARNING) << "Found #atoms = " << v->atoms.size();
  for (size_t i = 0; i < v->atoms.size(); i++)
    LOG(WARNING) << v->atoms[i];
  return false;
}
// Runs every entry of atom_tests with a minimum atom length of 3 and
// expects each one to produce exactly its listed atoms.
TEST(FilteredRE2Test, AtomTests) {
  FLAGS_filtered_re2_min_atom_len = 3;

  int failed = 0;
  for (int i = 0; i < arraysize(atom_tests); i++) {
    AtomTest& test = atom_tests[i];

    // Both arrays end at the first NULL entry (or at full capacity).
    int nregexp = 0;
    while (nregexp < arraysize(test.regexps) &&
           test.regexps[nregexp] != NULL)
      nregexp++;
    int natom = 0;
    while (natom < arraysize(test.atoms) && test.atoms[natom] != NULL)
      natom++;

    FilterTestVars v;
    AddRegexpsAndCompile(test.regexps, nregexp, &v);
    if (!CheckExpectedAtoms(test.atoms, natom, test.testname, &v))
      failed++;
  }
  EXPECT_EQ(0, failed);
}
// Translates atom strings into their positions within atoms.
//   atoms         - the full atom list (e.g. as produced by Compile)
//   matched_atoms - the atoms to look up
//   atom_indices  - cleared, then receives, for each matched atom in
//                   order, the index of its first occurrence in atoms
// Every matched atom is expected to be present in atoms; a missing one
// trips the EXPECT_LT below.
void FindAtomIndices(const vector<string>& atoms,
                     const vector<string>& matched_atoms,
                     vector<int>* atom_indices) {
  atom_indices->clear();
  for (size_t i = 0; i < matched_atoms.size(); i++) {
    size_t j = 0;
    for (; j < atoms.size(); j++) {
      if (matched_atoms[i] == atoms[j]) {
        atom_indices->push_back(static_cast<int>(j));
        break;
      }
    }
    // The check has to run after the search: inside the loop it is
    // implied by the loop condition and can never fail. Here j equals
    // atoms.size() exactly when the atom was not found.
    EXPECT_LT(j, atoms.size());
  }
}
// Compiles the single empty pattern from atom_tests[0] and checks that
// FirstMatch, given no matched atoms, returns 0 for a non-empty text.
TEST(FilteredRE2Test, MatchEmptyPattern) {
  FLAGS_filtered_re2_min_atom_len = 3;

  FilterTestVars v;
  AtomTest* t = &atom_tests[0];
  // We are using the regexps used in one of the atom tests
  // for this test. Adding the EXPECT here to make sure
  // the index we use for the test is for the correct test.
  EXPECT_EQ("CheckEmptyPattern", string(t->testname));

  // The regexp list ends at the first NULL entry.
  int nregexp;
  for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
    if (t->regexps[nregexp] == NULL)
      break;
  AddRegexpsAndCompile(t->regexps, nregexp, &v);

  string text = "0123";
  vector<int> atom_ids;  // deliberately empty: no atoms matched
  EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids));
}
// Compiles the regexps of atom_tests[2] and checks, for three texts, how
// many regexps AllMatches reports when given the indices of a hand-picked
// list of atoms for each text.
TEST(FilteredRE2Test, MatchTests) {
FLAGS_filtered_re2_min_atom_len = 3;
FilterTestVars v;
AtomTest* t = &atom_tests[2];
// We are using the regexps used in one of the atom tests
// for this test.
EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", string(t->testname));
// The regexp list ends at the first NULL entry.
int nregexp;
for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
if (t->regexps[nregexp] == NULL)
break;
AddRegexpsAndCompile(t->regexps, nregexp, &v);
// Case 1: only "abc" is supplied; one regexp is expected to match.
string text = "abc121212xyz";
// atoms = abc
vector<int> atom_ids;
vector<string> atoms;
atoms.push_back("abc");
FindAtomIndices(v.atoms, atoms, &atom_ids);
vector<int> matching_regexps;
v.f.AllMatches(text, atom_ids, &matching_regexps);
EXPECT_EQ(1, matching_regexps.size());
// Case 2: "abc", "yyy" and "yyyzzz" are supplied; still one match expected.
text = "abc12312yyyzzz";
atoms.clear();
atoms.push_back("abc");
atoms.push_back("yyy");
atoms.push_back("yyyzzz");
FindAtomIndices(v.atoms, atoms, &atom_ids);
v.f.AllMatches(text, atom_ids, &matching_regexps);
EXPECT_EQ(1, matching_regexps.size());
// Case 3: "abcd" is supplied as well; two regexps are expected to match.
text = "abcd12yyy32yyyzzz";
atoms.clear();
atoms.push_back("abc");
atoms.push_back("abcd");
atoms.push_back("yyy");
atoms.push_back("yyyzzz");
FindAtomIndices(v.atoms, atoms, &atom_ids);
LOG(INFO) << "S: " << atom_ids.size();
for (int i = 0; i < atom_ids.size(); i++)
LOG(INFO) << "i: " << i << " : " << atom_ids[i];
v.f.AllMatches(text, atom_ids, &matching_regexps);
EXPECT_EQ(2, matching_regexps.size());
}
} // namespace re2

View File

@ -0,0 +1,76 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
// One MimicsPCRE test case.
struct PCRETest {
const char* regexp;  // Regexp source to parse.
bool should_match;   // Expected result of Regexp::MimicsPCRE().
};
// Regexps paired with the expected value of MimicsPCRE().
// NOTE(review): "\\d" and "(a+)+" each appear twice; the repeats look
// redundant but are harmless.
static PCRETest tests[] = {
// Most things should behave exactly.
{ "abc", true },
{ "(a|b)c", true },
{ "(a*|b)c", true },
{ "(a|b*)c", true },
{ "a(b|c)d", true },
{ "a(()|())c", true },
{ "ab*c", true },
{ "ab+c", true },
{ "a(b*|c*)d", true },
{ "\\W", true },
{ "\\W{1,2}", true },
{ "\\d", true },
// Check that repeated empty strings do not.
{ "(a*)*", false },
{ "x(a*)*y", false },
{ "(a*)+", false },
{ "(a+)*", true },
{ "(a+)+", true },
{ "(a+)+", true },
// \v is the only character class that shouldn't.
{ "\\b", true },
{ "\\v", false },
{ "\\d", true },
// The handling of ^ in multi-line mode is different, as is
// the handling of $ in single-line mode. (Both involve
// boundary cases if the string ends with \n.)
{ "\\A", true },
{ "\\z", true },
{ "(?m)^", false },
{ "(?m)$", true },
{ "(?-m)^", true },
{ "(?-m)$", false }, // In PCRE, == \Z
{ "(?m)\\A", true },
{ "(?m)\\z", true },
{ "(?-m)\\A", true },
{ "(?-m)\\z", true },
};
// Parses every table entry twice -- first with the Latin1 flag added to
// LikePerl, then with LikePerl alone -- and checks MimicsPCRE() against
// the expected value each time.
TEST(MimicsPCRE, SimpleTests) {
  for (int i = 0; i < arraysize(tests); i++) {
    const PCRETest& t = tests[i];
    for (int pass = 0; pass < 2; pass++) {
      const bool latin1 = (pass == 0);
      Regexp::ParseFlags flags = Regexp::LikePerl;
      if (latin1)
        flags = flags | Regexp::Latin1;

      Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
      CHECK(re) << " " << t.regexp;
      CHECK_EQ(t.should_match, re->MimicsPCRE())
        << " " << t.regexp << " "
        << (latin1 ? "latin1" : "utf");
      re->Decref();
    }
  }
}
} // namespace re2

View File

@ -0,0 +1,44 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Null walker. For benchmarking the walker itself.
// Its callbacks compute nothing, so a walk with it measures only the
// traversal.
class NullWalker : public Regexp::Walker<bool> {
public:
NullWalker() { }
// Defined out of line; see NullWalker::PostVisit.
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args);
// Logs a DFATAL message and returns its argument unchanged.
bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "NullWalker::ShortVisit called";
return a;
}
private:
DISALLOW_EVIL_CONSTRUCTORS(NullWalker);
};
// Called after visiting re's children. child_args contains the return
// value from each of the children's PostVisits.
// This walker does no analysis: all arguments are ignored and the
// result is always false.
bool NullWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args) {
return false;
}
// Walks this regexp with a NullWalker and discards the result.
// Returns nothing; useful only for exercising or timing the walker.
void Regexp::NullWalk() {
NullWalker w;
w.Walk(this, false);
}
} // namespace re2

View File

@ -0,0 +1,433 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test parse.cc, dump.cc, and tostring.cc.
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/regexp.h"
namespace re2 {
// Marker bit for a Test entry that wants to be parsed with no flags.
// A flags value of 0 in a Test means "use the caller's default flags",
// so an explicit zero has to be spelled as TestZeroFlags; TestParse
// masks this bit off before parsing.
static const Regexp::ParseFlags TestZeroFlags = Regexp::ParseFlags(1<<30);
// One parser test case.
struct Test {
const char* regexp;        // Regexp source to parse.
const char* parse;         // Expected output of Regexp::Dump().
Regexp::ParseFlags flags;  // Per-test flags; 0 (the default when the
                           // initializer omits it) means "use the
                           // flags passed to TestParse".
};
// Default parse flags for the entries of the main tests table that do
// not specify their own flags.
static Regexp::ParseFlags kTestFlags = Regexp::MatchNL |
Regexp::PerlX |
Regexp::PerlClasses |
Regexp::UnicodeGroups;
// Main parser test table. Each entry is
//   { regexp, expected Dump() output [, flags] }.
// Entries without a third field are parsed with the default flags passed
// to TestParse; entries with one are parsed with exactly those flags
// (TestZeroFlags standing for "no flags at all").
// TestParse also cross-checks every pair of entries: two entries must
// compare equal as Regexps exactly when their expected dumps are equal.
static Test tests[] = {
// Base cases
{ "a", "lit{a}" },
{ "a.", "cat{lit{a}dot{}}" },
{ "a.b", "cat{lit{a}dot{}lit{b}}" },
{ "ab", "str{ab}" },
{ "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
{ "abc", "str{abc}" },
{ "a|^", "alt{lit{a}bol{}}" },
{ "a|b", "cc{0x61-0x62}" },
{ "(a)", "cap{lit{a}}" },
{ "(a)|b", "alt{cap{lit{a}}lit{b}}" },
{ "a*", "star{lit{a}}" },
{ "a+", "plus{lit{a}}" },
{ "a?", "que{lit{a}}" },
{ "a{2}", "rep{2,2 lit{a}}" },
{ "a{2,3}", "rep{2,3 lit{a}}" },
{ "a{2,}", "rep{2,-1 lit{a}}" },
{ "a*?", "nstar{lit{a}}" },
{ "a+?", "nplus{lit{a}}" },
{ "a??", "nque{lit{a}}" },
{ "a{2}?", "nrep{2,2 lit{a}}" },
{ "a{2,3}?", "nrep{2,3 lit{a}}" },
{ "a{2,}?", "nrep{2,-1 lit{a}}" },
{ "", "emp{}" },
{ "|", "emp{}" }, // alt{emp{}emp{}} but got factored
{ "|x|", "alt{emp{}lit{x}emp{}}" },
{ ".", "dot{}" },
{ "^", "bol{}" },
{ "$", "eol{}" },
{ "\\|", "lit{|}" },
{ "\\(", "lit{(}" },
{ "\\)", "lit{)}" },
{ "\\*", "lit{*}" },
{ "\\+", "lit{+}" },
{ "\\?", "lit{?}" },
{ "{", "lit{{}" },
{ "}", "lit{}}" },
{ "\\.", "lit{.}" },
{ "\\^", "lit{^}" },
{ "\\$", "lit{$}" },
{ "\\\\", "lit{\\}" },
{ "[ace]", "cc{0x61 0x63 0x65}" },
{ "[abc]", "cc{0x61-0x63}" },
{ "[a-z]", "cc{0x61-0x7a}" },
{ "[a]", "lit{a}" },
{ "\\-", "lit{-}" },
{ "-", "lit{-}" },
{ "\\_", "lit{_}" },
// Posix and Perl extensions
{ "[[:lower:]]", "cc{0x61-0x7a}" },
{ "[a-z]", "cc{0x61-0x7a}" },
{ "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
{ "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
{ "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
{ "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
{ "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
{ "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
{ "\\d", "cc{0x30-0x39}" },
{ "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
{ "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
{ "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
{ "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
{ "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
{ "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
{ "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
{ "\\C", "byte{}" },
// Unicode, negatives, and a double negative.
{ "\\p{Braille}", "cc{0x2800-0x28ff}" },
{ "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
{ "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
{ "\\P{^Braille}", "cc{0x2800-0x28ff}" },
// More interesting regular expressions.
{ "a{,2}", "str{a{,2}}" },
{ "\\.\\^\\$\\\\", "str{.^$\\}" },
{ "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
{ "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
{ "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8
{ "a*{", "cat{star{lit{a}}lit{{}}" },
// Test precedences
{ "(?:ab)*", "star{str{ab}}" },
{ "(ab)*", "star{cap{str{ab}}}" },
{ "ab|cd", "alt{str{ab}str{cd}}" },
{ "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
// Test flattening.
{ "(?:a)", "lit{a}" },
{ "(?:ab)(?:cd)", "str{abcd}" },
{ "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
{ "a|.", "dot{}" },
{ ".|a", "dot{}" },
// Test Perl quoted literals
{ "\\Q+|*?{[\\E", "str{+|*?{[}" },
{ "\\Q+\\E+", "plus{lit{+}}" },
{ "\\Q\\\\E", "lit{\\}" },
{ "\\Q\\\\\\E", "str{\\\\}" },
// Test Perl \A and \z
{ "(?m)^", "bol{}" },
{ "(?m)$", "eol{}" },
{ "(?-m)^", "bot{}" },
{ "(?-m)$", "eot{}" },
{ "(?m)\\A", "bot{}" },
{ "(?m)\\z", "eot{\\z}" },
{ "(?-m)\\A", "bot{}" },
{ "(?-m)\\z", "eot{\\z}" },
// Test named captures
{ "(?P<name>a)", "cap{name:lit{a}}" },
// Case-folded literals
{ "[Aa]", "litfold{a}" },
// Strings
{ "abcde", "str{abcde}" },
{ "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
// Reported bug involving \n leaking in despite use of NeverNL.
{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags },
{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags },
{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags },
{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
{ "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses },
{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses | Regexp::FoldCase },
{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses | Regexp::NeverNL },
{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses },
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses | Regexp::FoldCase },
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses | Regexp::NeverNL },
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
};
// Test-only helper: reports whether a and b compare equal according to
// Regexp::Equal.
bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
  const bool equal = Regexp::Equal(a, b);
  return equal;
}
// Parses every entry of a test table and checks the results.
//   tests, ntests: the table and its length; each entry supplies a regexp,
//                  the expected Dump() text (parse), and a flags value.
//   flags: parse flags applied to entries whose own flags value is zero.
//   title: descriptive label for the table; the body does not read it.
// Besides comparing each Dump() with its expected text, every ordered pair
// of entries is checked so that RegexpEqualTestingOnly agrees with string
// equality of the two expected texts.
void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
const string& title) {
// Parsed regexps are kept so the pairwise comparison below can reuse them.
Regexp** re = new Regexp*[ntests];
for (int i = 0; i < ntests; i++) {
RegexpStatus status;
Regexp::ParseFlags f = flags;
// A nonzero per-entry flags value replaces the caller's flags, with the
// TestZeroFlags bit cleared before parsing.
if (tests[i].flags != 0) {
f = tests[i].flags & ~TestZeroFlags;
}
re[i] = Regexp::Parse(tests[i].regexp, f, &status);
CHECK(re[i] != NULL) << " " << tests[i].regexp << " "
<< status.Text();
string s = re[i]->Dump();
EXPECT_EQ(string(tests[i].parse), s) << "Regexp: " << tests[i].regexp
<< "\nparse: " << tests[i].parse << " s: " << s << " flag=" << f;
}
// Structural equality must hold exactly when the expected dumps match.
for (int i = 0; i < ntests; i++) {
for (int j = 0; j < ntests; j++) {
EXPECT_EQ(string(tests[i].parse) == tests[j].parse,
RegexpEqualTestingOnly(re[i], re[j]))
<< "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
}
}
// Drop the reference held on each parsed regexp, then free the array.
for (int i = 0; i < ntests; i++)
re[i]->Decref();
delete[] re;
}
// Runs the main `tests` table through TestParse under kTestFlags to check
// that regexps parse to the expected structures.
TEST(TestParse, SimpleRegexps) {
  const int ntests = arraysize(tests);
  TestParse(tests, ntests, kTestFlags, "simple");
}
// Cases for parsing with Regexp::FoldCase. Each entry gives the regexp and
// the expected Dump() text; the flags field is left out (zero), so TestParse
// applies the flags passed by the calling test.
Test foldcase_tests[] = {
{ "AbCdE", "strfold{abcde}" },
{ "[Aa]", "litfold{a}" },
{ "a", "litfold{a}" },
// 0x17F is an old English long s (looks like an f) and folds to s.
// 0x212A is the Kelvin symbol and folds to k.
{ "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" }, // [Aa][A-z...]
{ "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
{ "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
};
// Checks the foldcase_tests table when parsing with Regexp::FoldCase.
TEST(TestParse, FoldCase) {
  const int ntests = arraysize(foldcase_tests);
  TestParse(foldcase_tests, ntests, Regexp::FoldCase, "foldcase");
}
// Cases for parsing with Regexp::Literal: the single entry is made of
// characters that are normally regexp operators, and the expected Dump()
// text is one str{} node containing them unchanged.
Test literal_tests[] = {
{ "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" },
};
// Checks the literal_tests table when parsing with Regexp::Literal.
TEST(TestParse, Literal) {
  const int ntests = arraysize(literal_tests);
  TestParse(literal_tests, ntests, Regexp::Literal, "literal");
}
// Cases for parsing with Regexp::MatchNL: regexp and expected Dump() text.
// The second entry uses a real newline character in both strings (the C++
// escape "\n"), while the last entry uses the regexp escape \n ("\\n").
Test matchnl_tests[] = {
{ ".", "dot{}" },
{ "\n", "lit{\n}" },
{ "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
{ "[a\\n]", "cc{0xa 0x61}" },
};
// Checks the matchnl_tests table when parsing with Regexp::MatchNL.
// (The simple cases above cover MatchNL as well.)
TEST(TestParse, MatchNL) {
  const int ntests = arraysize(matchnl_tests);
  TestParse(matchnl_tests, ntests, Regexp::MatchNL, "with MatchNL");
}
// Same regexps as matchnl_tests, with the Dump() text expected when parsing
// without MatchNL: "." and "[^a]" leave out 0xa, while an explicit newline
// (literal or inside a class) is still accepted.
Test nomatchnl_tests[] = {
{ ".", "cc{0-0x9 0xb-0x10ffff}" },
{ "\n", "lit{\n}" },
{ "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
{ "[a\\n]", "cc{0xa 0x61}" },
};
// Checks the nomatchnl_tests table when parsing with no flags at all,
// i.e. without MatchNL.
TEST(TestParse, NoMatchNL) {
  const int ntests = arraysize(nomatchnl_tests);
  TestParse(nomatchnl_tests, ntests, Regexp::NoParseFlags, "without MatchNL");
}
// Cases for prefix factoring of alternations: regexp and expected Dump()
// text. The expected texts show a shared leading part pulled out into a
// cat{} node; in "abc|x|abd" the two "ab..." branches are separated by "x"
// and the expected dump keeps all three branches as written.
Test prefix_tests[] = {
{ "abc|abd", "cat{str{ab}cc{0x63-0x64}}" },
{ "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" },
{ "abc|abd|aef|bcx|bcy",
"alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
"cat{str{bc}cc{0x78-0x79}}}" },
{ "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
{ "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
{ "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
{ "(?:xx|yy)c|(?:xx|yy)d",
"cat{alt{str{xx}str{yy}}cc{0x63-0x64}}" },
{ "x{2}|x{2}[0-9]",
"cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
{ "x{2}y|x{2}[0-9]y",
"cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
};
// Checks the prefix_tests table (prefix factoring) when parsing with
// Regexp::PerlX.
TEST(TestParse, Prefix) {
  const int ntests = arraysize(prefix_tests);
  TestParse(prefix_tests, ntests, Regexp::PerlX, "prefix");
}
// Invalid regular expressions.
// The InvalidRegexps test expects Regexp::Parse to return NULL for each of
// these under both Regexp::PerlX and Regexp::NoParseFlags.
const char* badtests[] = {
"(",
")",
"(a",
"(a|b|",
"(a|b",
"[a-z",
"([a-z)",
"x{1001}",
"\xff", // Invalid UTF-8
"[\xff]",
"[\\\xff]",
"\\\xff",
"(?P<name>a",
"(?P<name>",
"(?P<name",
"(?P<x y>a)",
"(?P<>a)",
"[a-Z]",
"(?i)[a-Z]",
"a{100000}",
"a{100000,}",
};
// Valid in Perl, bad in POSIX.
// The InvalidRegexps test expects these to parse under Regexp::PerlX and to
// be rejected under Regexp::NoParseFlags.
const char* only_perl[] = {
"[a-b-c]",
"\\Qabc\\E",
"\\Q*+?{[\\E",
"\\Q\\\\E",
"\\Q\\\\\\E",
"\\Q\\\\\\\\E",
"\\Q\\\\\\\\\\E",
"(?:a)",
"(?P<name>a)",
};
// Valid in POSIX, bad in Perl.
// The InvalidRegexps test expects these to parse under Regexp::NoParseFlags
// and to be rejected under Regexp::PerlX. Every entry applies a repetition
// operator to an already repeated expression.
const char* only_posix[] = {
"a++",
"a**",
"a?*",
"a+*",
"a{1}*",
};
// Test that parser rejects bad regexps.
// Three tables are checked: badtests must fail under both flag sets,
// only_posix must fail under PerlX but parse under NoParseFlags, and
// only_perl must fail under NoParseFlags but parse under PerlX.
TEST(TestParse, InvalidRegexps) {
// Rejected in both modes.
for (int i = 0; i < arraysize(badtests); i++) {
CHECK(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL)
<< " " << badtests[i];
CHECK(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL)
<< " " << badtests[i];
}
// Rejected with PerlX, accepted with no flags.
for (int i = 0; i < arraysize(only_posix); i++) {
CHECK(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL)
<< " " << only_posix[i];
Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL);
CHECK(re) << " " << only_posix[i];
// Release the reference returned by the successful parse.
re->Decref();
}
// Rejected with no flags, accepted with PerlX.
for (int i = 0; i < arraysize(only_perl); i++) {
CHECK(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL)
<< " " << only_perl[i];
Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL);
CHECK(re) << " " << only_perl[i];
// Release the reference returned by the successful parse.
re->Decref();
}
}
// Test that ToString produces original regexp or equivalent one.
// Each entry of `tests` is parsed (using its own flags when nonzero, else
// kTestFlags) and its Dump() checked. When ToString() differs from the
// original text, the ToString() output is parsed again and must produce the
// same Dump() and the same ToString().
TEST(TestToString, EquivalentParse) {
for (int i = 0; i < arraysize(tests); i++) {
RegexpStatus status;
Regexp::ParseFlags f = kTestFlags;
// Same flag selection as TestParse: a nonzero per-entry value wins, with
// the TestZeroFlags bit cleared.
if (tests[i].flags != 0) {
f = tests[i].flags & ~TestZeroFlags;
}
Regexp* re = Regexp::Parse(tests[i].regexp, f, &status);
CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text();
string s = re->Dump();
EXPECT_EQ(string(tests[i].parse), s) << " " << tests[i].regexp << " " << string(tests[i].parse) << " " << s;
string t = re->ToString();
if (t != tests[i].regexp) {
// If ToString didn't return the original regexp,
// it must have found one with fewer parens.
// Unfortunately we can't check the length here, because
// ToString produces "\\{" for a literal brace,
// but "{" is a shorter equivalent.
// CHECK_LT(t.size(), strlen(tests[i].regexp))
// << " t=" << t << " regexp=" << tests[i].regexp;
// Test that if we parse the new regexp we get the same structure.
// Note that the reparse always uses MatchNL | PerlX rather than f.
Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
CHECK(nre != NULL) << " reparse " << t << " " << status.Text();
string ss = nre->Dump();
string tt = nre->ToString();
// Log the round-tripped text before the checks below report a mismatch.
if (s != ss || t != tt)
LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t;
EXPECT_EQ(s, ss);
EXPECT_EQ(t, tt);
nre->Decref();
}
re->Decref();
}
}
// Test that capture error args are correct.
// Both patterns contain a malformed named group; the parse must fail with
// kRegexpBadNamedCapture, and error_arg() must be the offending group text
// without the leading "test".
TEST(NamedCaptures, ErrorArgs) {
RegexpStatus status;
Regexp* re;
// Named group that is never closed.
re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
EXPECT_TRUE(re == NULL);
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
EXPECT_EQ(status.error_arg(), "(?P<name");
// Group name containing a space.
re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status);
EXPECT_TRUE(re == NULL);
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
EXPECT_EQ(status.error_arg(), "(?P<space bar>");
}
} // namespace re2

View File

@ -0,0 +1,240 @@
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <vector>
#include "util/test.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/testing/regexp_generator.h"
#include "re2/testing/string_generator.h"
namespace re2 {
// Checks that C++ string comparison treats bytes as uint8s rather than
// int8s: the byte 0x70 must sort before 0xA0.
// PossibleMatchRange itself does not rely on this, but callers probably will.
TEST(CplusplusStrings, EightBit) {
  const string s("\x70");
  const string t("\xA0");
  EXPECT_LT(s, t);
}
// One hand-written PossibleMatchRange case.
struct PrefixTest {
const char* regexp;  // Pattern to analyze.
int maxlen;  // Passed as the maxlen argument of PossibleMatchRange.
const char* min;  // Expected contents of the min output string.
const char* max;  // Expected contents of the max output string.
};
// Hand-written PossibleMatchRange cases.
// Columns: regexp, maxlen, expected min, expected max.
// The table holds four groups over the same patterns: plain, with a (?i)
// prefix, with a \A anchor, and with both.
static PrefixTest tests[] = {
// Plain patterns.
{ "", 10, "", "", },
{ "Abcdef", 10, "Abcdef", "Abcdef" },
{ "abc(def|ghi)", 10, "abcdef", "abcghi" },
{ "a+hello", 10, "aa", "ahello" },
{ "a*hello", 10, "a", "hello" },
{ "def|abc", 10, "abc", "def" },
{ "a(b)(c)[d]", 10, "abcd", "abcd" },
{ "ab(cab|cat)", 10, "abcab", "abcat" },
{ "ab(cab|ca)x", 10, "abcabx", "abcax" },
{ "(ab|x)(c|de)", 10, "abc", "xde" },
{ "(ab|x)?(c|z)?", 10, "", "z" },
{ "[^\\s\\S]", 10, "", "" },
{ "(abc)+", 5, "abc", "abcac" },
{ "(abc)+", 2, "ab", "ac" },
{ "(abc)+", 1, "a", "b" },
{ "[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
{ "a*", 10, "", "ab" },
// Case-insensitive variants: min is upper case, max is lower case.
{ "(?i)Abcdef", 10, "ABCDEF", "abcdef" },
{ "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" },
{ "(?i)a+hello", 10, "AA", "ahello" },
{ "(?i)a*hello", 10, "A", "hello" },
{ "(?i)def|abc", 10, "ABC", "def" },
{ "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" },
{ "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" },
{ "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" },
{ "(?i)(ab|x)(c|de)", 10, "ABC", "xde" },
{ "(?i)(ab|x)?(c|z)?", 10, "", "z" },
{ "(?i)[^\\s\\S]", 10, "", "" },
{ "(?i)(abc)+", 5, "ABC", "abcac" },
{ "(?i)(abc)+", 2, "AB", "ac" },
{ "(?i)(abc)+", 1, "A", "b" },
{ "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
{ "(?i)a*", 10, "", "ab" },
{ "(?i)A*", 10, "", "ab" },
// \A-anchored variants: same expectations as the plain patterns.
{ "\\AAbcdef", 10, "Abcdef", "Abcdef" },
{ "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" },
{ "\\Aa+hello", 10, "aa", "ahello" },
{ "\\Aa*hello", 10, "a", "hello" },
{ "\\Adef|abc", 10, "abc", "def" },
{ "\\Aa(b)(c)[d]", 10, "abcd", "abcd" },
{ "\\Aab(cab|cat)", 10, "abcab", "abcat" },
{ "\\Aab(cab|ca)x", 10, "abcabx", "abcax" },
{ "\\A(ab|x)(c|de)", 10, "abc", "xde" },
{ "\\A(ab|x)?(c|z)?", 10, "", "z" },
{ "\\A[^\\s\\S]", 10, "", "" },
{ "\\A(abc)+", 5, "abc", "abcac" },
{ "\\A(abc)+", 2, "ab", "ac" },
{ "\\A(abc)+", 1, "a", "b" },
{ "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
{ "\\Aa*", 10, "", "ab" },
// Case-insensitive and \A-anchored variants.
{ "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" },
{ "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" },
{ "(?i)\\Aa+hello", 10, "AA", "ahello" },
{ "(?i)\\Aa*hello", 10, "A", "hello" },
{ "(?i)\\Adef|abc", 10, "ABC", "def" },
{ "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" },
{ "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" },
{ "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" },
{ "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" },
{ "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" },
{ "(?i)\\A[^\\s\\S]", 10, "", "" },
{ "(?i)\\A(abc)+", 5, "ABC", "abcac" },
{ "(?i)\\A(abc)+", 2, "AB", "ac" },
{ "(?i)\\A(abc)+", 1, "A", "b" },
{ "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
{ "(?i)\\Aa*", 10, "", "ab" },
{ "(?i)\\AA*", 10, "", "ab" },
};
// Runs every entry of `tests` through PossibleMatchRange twice and compares
// the resulting min/max strings with the expected ones:
//   j == 0 parses and compiles by hand and calls Prog::PossibleMatchRange;
//   j == 1 calls RE2::PossibleMatchRange on a temporary RE2 object.
TEST(PossibleMatchRange, HandWritten) {
for (int i = 0; i < arraysize(tests); i++) {
for (int j = 0; j < 2; j++) {
const PrefixTest& t = tests[i];
string min, max;
if (j == 0) {
LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
CHECK(re);
Prog* prog = re->CompileToProg(0);
CHECK(prog);
CHECK(prog->PossibleMatchRange(&min, &max, t.maxlen))
<< " " << t.regexp;
// The compiled program is deleted and the regexp reference released here.
delete prog;
re->Decref();
} else {
CHECK(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
}
EXPECT_EQ(t.min, min) << t.regexp;
EXPECT_EQ(t.max, max) << t.regexp;
}
}
}
// Test cases where PossibleMatchRange should return false.
// On failure each check prints the min and max strings (escaped with
// CEscape) that the call left behind.
TEST(PossibleMatchRange, Failures) {
string min, max;
// Fails because no room to write max.
EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));
// Fails because there is no max -- any non-empty string matches
// or begins a match. Have to use Latin-1 input, because there
// are no valid UTF-8 strings beginning with byte 0xFF.
EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
EXPECT_FALSE(RE2(".*", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
// This one is built without the Latin1 option.
EXPECT_FALSE(RE2("\\C*").
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
// Fails because it's a malformed regexp.
EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
}
// Exhaustive test: generate all regexps within parameters,
// then generate all strings of a given length over a given alphabet,
// then check that the prefix information agrees with whether
// the regexp matches each of the strings.
class PossibleMatchTester : public RegexpGenerator {
public:
// maxatoms, maxops, alphabet and ops are forwarded to RegexpGenerator;
// maxstrlen and stralphabet configure the StringGenerator that supplies
// the candidate strings. Both counters start at zero.
PossibleMatchTester(int maxatoms,
int maxops,
const vector<string>& alphabet,
const vector<string>& ops,
int maxstrlen,
const vector<string>& stralphabet)
: RegexpGenerator(maxatoms, maxops, alphabet, ops),
strgen_(maxstrlen, stralphabet),
regexps_(0), tests_(0) { }
// Accessors for the two counters below.
int regexps() { return regexps_; }
int tests() { return tests_; }
// Needed for RegexpGenerator interface.
void HandleRegexp(const string& regexp);
private:
StringGenerator strgen_;  // Source of candidate strings, reset per regexp.
int regexps_; // Number of HandleRegexp calls
int tests_; // Number of regexp tests.
DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester);
};
// Processes a single generated regexp.
// Checks that all accepted strings agree with the prefix range.
// The regexp is compiled in Latin-1 mode and must compile without error.
// Every generated string that fully matches must lie within [min, max] as
// reported by PossibleMatchRange with maxlen 10.
void PossibleMatchTester::HandleRegexp(const string& regexp) {
regexps_++;
VLOG(3) << CEscape(regexp);
RE2 re(regexp, RE2::Latin1);
CHECK_EQ(re.error(), "");
string min, max;
if(!re.PossibleMatchRange(&min, &max, 10)) {
// There's no good max for "\\C*". Can't use strcmp
// because sometimes it gets embedded in more
// complicated expressions.
if(strstr(regexp.c_str(), "\\C*"))
return;
// Any other failure to produce a range is treated as fatal.
LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
}
// Restart string generation for this regexp.
strgen_.Reset();
while (strgen_.HasNext()) {
const StringPiece& s = strgen_.Next();
tests_++;
// Only strings that match in full are compared against the range.
if (!RE2::FullMatch(s, re))
continue;
CHECK_GE(s, min) << " regexp: " << regexp << " max: " << max;
CHECK_LE(s, max) << " regexp: " << regexp << " min: " << min;
}
}
// Drives PossibleMatchTester over every regexp built from the atoms
// "a", "b" and "[0-9]" with the egrep operators, and over strings drawn
// from the alphabet "ab4". Debug mode uses smaller atom and string limits.
TEST(PossibleMatchRange, Exhaustive) {
  const int natom = DEBUG_MODE ? 2 : 3;
  const int noperator = 3;  // Same limit in both modes.
  const int stringlen = DEBUG_MODE ? 3 : 5;

  PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
                        RegexpGenerator::EgrepOps(),
                        stringlen, Explode("ab4"));
  t.Generate();

  // Report how much work was done.
  LOG(INFO) << t.regexps() << " regexps, "
            << t.tests() << " tests";
}
} // namespace re2

View File

@ -0,0 +1,95 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Random testing of regular expression matching.
#include <stdio.h>
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
// Flags controlling the random tests. RandomTest passes the string seed and
// count to ExhaustiveTester::RandomStrings and the regexp seed and count to
// GenerateRandom; the defaults keep runs reproducible.
DEFINE_int32(regexpseed, 404, "Random regexp seed.");
DEFINE_int32(regexpcount, 100, "How many random regexps to generate.");
DEFINE_int32(stringseed, 200, "Random string seed.");
DEFINE_int32(stringcount, 100, "How many random strings to generate.");
namespace re2 {
// Runs a random test on the given parameters.
// (Always uses the same random seeds for reproducibility.
// Can give different seeds on command line.)
static void RandomTest(int maxatoms, int maxops,
const vector<string>& alphabet,
const vector<string>& ops,
int maxstrlen, const vector<string>& stralphabet,
const string& wrapper) {
// Limit to smaller test cases in debug mode,
// because everything is so much slower.
if (DEBUG_MODE) {
maxatoms--;
maxops--;
maxstrlen /= 2;
}
ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
maxstrlen, stralphabet, wrapper, "");
t.RandomStrings(FLAGS_stringseed, FLAGS_stringcount);
t.GenerateRandom(FLAGS_regexpseed, FLAGS_regexpcount);
printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
EXPECT_EQ(0, t.failures());
}
// Small random regexps built from the literals a, b, c and "." combined
// with the egrep operators, run against strings over "abc".
TEST(Random, SmallEgrepLiterals) {
  const vector<string> atoms = Explode("abc.");
  const vector<string> stralphabet = Explode("abc");
  RandomTest(5, 5, atoms, RegexpGenerator::EgrepOps(),
             15, stralphabet,
             "");
}
// Bigger random regexps built from the literals a, b, c and "." combined
// with the egrep operators, run against strings over "abc".
TEST(Random, BigEgrepLiterals) {
  const vector<string> atoms = Explode("abc.");
  const vector<string> stralphabet = Explode("abc");
  RandomTest(10, 10, atoms, RegexpGenerator::EgrepOps(),
             15, stralphabet,
             "");
}
// Small random regexps built from a literal, a capturing group and "."
// combined with the egrep operators, run against strings over "abc".
TEST(Random, SmallEgrepCaptures) {
  const vector<string> atoms = Split(" ", "a (b) .");
  const vector<string> stralphabet = Explode("abc");
  RandomTest(5, 5, atoms, RegexpGenerator::EgrepOps(),
             15, stralphabet,
             "");
}
// Bigger random regexps built from a literal, a capturing group and "."
// combined with the egrep operators, run against strings over "abc".
TEST(Random, BigEgrepCaptures) {
  const vector<string> atoms = Split(" ", "a (b) .");
  const vector<string> stralphabet = Explode("abc");
  RandomTest(10, 10, atoms, RegexpGenerator::EgrepOps(),
             15, stralphabet,
             "");
}
// Random large, complicated expressions: all the operators, a few literals,
// a parenthesized literal, and predefined character classes such as \d.
// (Larger character classes are left out because they would make for too
// many possibilities.)
TEST(Random, Complicated) {
  // Operator templates; each %s stands for an operand.
  const vector<string> operators = Split(" ",
    "%s%s %s|%s %s* %s*? %s+ %s+? %s? %s?? "
    "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} %s{1,2} "
    "%s{2} %s{2,} %s{3,4} %s{4,5}");

  // \b and \B are written as (?:\b) and (?:\B) because PCRE rejects \b*
  // but accepts (?:\b)*. The same goes for ^ and $.
  const vector<string> atom_list = Split(" ",
    ". (?:^) (?:$) \\a \\f \\n \\r \\t \\v "
    "\\d \\D \\s \\S \\w \\W (?:\\b) (?:\\B) "
    "a (a) b c - \\\\");

  // Characters the random test strings are drawn from.
  const vector<string> str_alphabet = Explode("abc123\001\002\003\t\r\n\v\f\a");

  RandomTest(10, 10, atom_list, operators, 20, str_alphabet, "");
}
} // namespace re2

View File

@ -0,0 +1,133 @@
// Copyright 2005 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This tests to make sure numbers are parsed from strings
// correctly.
// Todo: Expand the test to validate strings parsed to the other types
// supported by RE2::Arg class
#include "util/test.h"
#include "re2/re2.h"
namespace re2 {
// One row of the integer-parsing table.
struct SuccessTable {
const char * value_string;  // Text handed to RE2::Arg::Parse.
int64 value;  // Value the parsed result is compared with on success.
// Whether parsing should succeed for each target type, indexed by the
// column numbers the tests use: 0 int16, 1 uint16, 2 int, 3 uint32,
// 4 int64, 5 uint64.
bool success[6];
};
// Test boundary cases for different integral sizes.
// Specifically I want to make sure that values outside the boundaries
// of an integral type will fail and that negative numbers will fail
// for unsigned types. The following table contains the boundaries for
// the various integral types and has entries for whether or not each
// type can contain the given value.
const SuccessTable kSuccessTable[] = {
// string integer value short ushort int uint int64 uint64
// 0 to 2^7-1
{ "0", 0, { true, true, true, true, true, true }},
{ "127", 127, { true, true, true, true, true, true }},
// -1 to -2^7
{ "-1", -1, { true, false, true, false, true, false }},
{ "-128", -128, { true, false, true, false, true, false }},
// 2^7 to 2^8-1
{ "128", 128, { true, true, true, true, true, true }},
{ "255", 255, { true, true, true, true, true, true }},
// 2^8 to 2^15-1
{ "256", 256, { true, true, true, true, true, true }},
{ "32767", 32767, { true, true, true, true, true, true }},
// -2^7-1 to -2^15
{ "-129", -129, { true, false, true, false, true, false }},
{ "-32768", -32768, { true, false, true, false, true, false }},
// 2^15 to 2^16-1
{ "32768", 32768, { false, true, true, true, true, true }},
{ "65535", 65535, { false, true, true, true, true, true }},
// 2^16 to 2^31-1
{ "65536", 65536, { false, false, true, true, true, true }},
{ "2147483647", 2147483647, { false, false, true, true, true, true }},
// -2^15-1 to -2^31
{ "-32769", -32769, { false, false, true, false, true, false }},
{ "-2147483648",
static_cast<int64>(0xFFFFFFFF80000000LL),
{ false, false, true, false, true, false }},
// 2^31 to 2^32-1
{ "2147483648", 2147483648U, { false, false, false, true, true, true }},
{ "4294967295", 4294967295U, { false, false, false, true, true, true }},
// 2^32 to 2^63-1
{ "4294967296", 4294967296LL, { false, false, false, false, true, true }},
{ "9223372036854775807",
9223372036854775807LL, { false, false, false, false, true, true }},
// -2^31-1 to -2^63
{ "-2147483649", -2147483649LL, { false, false, false, false, true, false }},
{ "-9223372036854775808", static_cast<int64>(0x8000000000000000LL),
{ false, false, false, false, true, false }},
// 2^63 to 2^64-1
// (these values do not fit in int64, so the expected value is the
// unsigned constant cast to int64)
{ "9223372036854775808", static_cast<int64>(9223372036854775808ULL),
{ false, false, false, false, false, true }},
{ "18446744073709551615", static_cast<int64>(18446744073709551615ULL),
{ false, false, false, false, false, true }},
// >= 2^64
// (no type succeeds, so the value 0 is never compared)
{ "18446744073709551616", 0, { false, false, false, false, false, false }},
};
// Number of rows in kSuccessTable; the loop bound used by PARSE_FOR_TYPE.
const int kNumStrings = ARRAYSIZE(kSuccessTable);
// It's ugly to use a macro, but we apparently can't use the ASSERT_TRUE_M
// macro outside of a TEST block and this seems to be the only way to
// avoid code duplication. I can also pull off a couple nice tricks
// using concatenation for the type I'm checking against.
//
// PARSE_FOR_TYPE(type, column) parses every kSuccessTable string into a
// variable of the given type through RE2::Arg::Parse. The return value must
// equal success[column] for that row, and when success is expected the
// parsed value must equal the row's value. The type name is spliced into
// the failure message with #type.
#define PARSE_FOR_TYPE(type, column) { \
type r; \
for ( int i = 0; i < kNumStrings; ++i ) { \
RE2::Arg arg(&r); \
const char* const p = kSuccessTable[i].value_string; \
bool retval = arg.Parse(p, strlen(p)); \
bool success = kSuccessTable[i].success[column]; \
ASSERT_TRUE_M(retval == success, \
StringPrintf("Parsing '%s' for type " #type " should return %d", \
p, success).c_str()); \
if ( success ) { \
ASSERT_EQUALS(r, kSuccessTable[i].value); \
} \
} \
}
// One test per integral type. The second macro argument is the index into
// SuccessTable::success that holds the expectations for that type.
TEST(REArgTest, Int16Test) {
PARSE_FOR_TYPE(int16, 0);
}
TEST(REArgTest, Uint16Test) {
PARSE_FOR_TYPE(uint16, 1);
}
TEST(REArgTest, IntTest) {
PARSE_FOR_TYPE(int, 2);
}
TEST(REArgTest, UInt32Test) {
PARSE_FOR_TYPE(uint32, 3);
}
// NOTE(review): "Iint64Test" looks like a typo for "Int64Test"; renaming it
// would change the registered test name, so confirm before changing.
TEST(REArgTest, Iint64Test) {
PARSE_FOR_TYPE(int64, 4);
}
TEST(REArgTest, Uint64Test) {
PARSE_FOR_TYPE(uint64, 5);
}
} // namespace re2

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,264 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression generator: generates all possible
// regular expressions within parameters (see regexp_generator.h for details).
// The regexp generator first generates a sequence of commands in a simple
// postfix language. Each command in the language is a string,
// like "a" or "%s*" or "%s|%s".
//
// To evaluate a command, enough arguments are popped from the value stack to
// plug into the %s slots. Then the result is pushed onto the stack.
// For example, the command sequence
// a b %s%s c
// results in the stack
// ab c
//
// GeneratePostfix generates all possible command sequences.
// Then RunPostfix turns each sequence into a regular expression
// and passes the regexp to HandleRegexp.
#include <string.h>
#include <string>
#include <stack>
#include <vector>
#include "util/test.h"
#include "re2/testing/regexp_generator.h"
namespace re2 {
// Returns the egrep operator templates as a vector: concatenation,
// alternation, star, plus, question mark, and a "\C*" suffix form.
// The vector is built once and the same object is returned on every call.
const vector<string>& RegexpGenerator::EgrepOps() {
  static const char *egrep_templates[] = {
    "%s%s",
    "%s|%s",
    "%s*",
    "%s+",
    "%s?",
    "%s\\C*",
  };
  static vector<string> egrep_ops(
      egrep_templates, egrep_templates + arraysize(egrep_templates));
  return egrep_ops;
}
// Builds a generator that uses at most maxatoms atoms and maxops operators
// per expression, chosen from the given atom and operator lists (both are
// copied).
RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
                                 const vector<string>& atoms,
                                 const vector<string>& ops)
    : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops),
      // No random generator is attached until GenerateRandom installs one;
      // start from NULL rather than an indeterminate pointer.
      acm_(NULL) {
  // Degenerate case: with an empty list there is nothing to pick from,
  // so the corresponding limit drops to zero.
  if (atoms_.size() == 0)
    maxatoms_ = 0;
  if (ops_.size() == 0)
    maxops_ = 0;
}
// Generates all possible regular expressions (within the parameters),
// calling HandleRegexp for each one.
void RegexpGenerator::Generate() {
vector<string> postfix;
GeneratePostfix(&postfix, 0, 0, 0);
}
// Produces random regular expressions from a generator seeded with `seed`,
// making n calls to GenerateRandomPostfix; each finished expression is
// delivered through HandleRegexp.
void RegexpGenerator::GenerateRandom(int32 seed, int n) {
  // The generator lives on this stack frame; acm_ points at it only for the
  // duration of the call.
  ACMRandom rng(seed);
  acm_ = &rng;

  int remaining = n;
  while (remaining > 0) {
    vector<string> sequence;
    GenerateRandomPostfix(&sequence, 0, 0, 0);
    remaining--;
  }

  acm_ = NULL;
}
// Returns how many non-overlapping "%s" markers appear in s, scanning left
// to right.
static int CountArgs(const string& s) {
  int count = 0;
  // Each search resumes just past the previous marker.
  for (const char* hit = strstr(s.c_str(), "%s");
       hit != NULL;
       hit = strstr(hit + 2, "%s")) {
    ++count;
  }
  return count;
}
// Enumerates every postfix command sequence that extends *post, handing
// each complete one to RunPostfix to be turned into a regular expression.
// Arguments:
//   post:  the command sequence built so far
//   nstk:  how many values the stack would hold after running post
//   ops:   how many operators post contains
//   atoms: how many atoms post contains
// Example: post = ["a", "b", "%s%s", "c"] corresponds to
// nstk = 2, ops = 1, atoms = 3.
//
// The top-level call is GeneratePostfix([empty vector], 0, 0, 0).
//
void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk,
                                      int ops, int atoms) {
  // Exactly one value on the stack: post is a complete expression.
  if (nstk == 1)
    RunPostfix(*post);

  // Prune the search when too many operators are in use, or when the
  // stack could no longer be reduced to a single expression with binary
  // operators within the operator budget.
  const int ops_committed = ops + nstk - 1;
  if (ops_committed > maxops_)
    return;

  // Try each atom as the next command, if the atom budget allows.
  if (atoms < maxatoms_) {
    for (vector<string>::const_iterator atom = atoms_.begin();
         atom != atoms_.end(); ++atom) {
      post->push_back(*atom);
      GeneratePostfix(post, nstk + 1, ops, atoms + 1);
      post->pop_back();
    }
  }

  // Try each operator whose operands are already on the stack, if the
  // operator budget allows.
  if (ops >= maxops_)
    return;
  for (vector<string>::const_iterator op = ops_.begin();
       op != ops_.end(); ++op) {
    const int nargs = CountArgs(*op);
    if (nargs > nstk)
      continue;
    post->push_back(*op);
    GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
    post->pop_back();
  }
}
// Generates a random postfix command sequence.
// Stops and returns true once a single sequence has been generated.
// Returns false when this branch has to be abandoned (see the early out
// below). The arguments have the same meaning as in GeneratePostfix.
// Every random draw goes through acm_, so the order of the draws below
// determines which sequences a given seed produces.
bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk,
int ops, int atoms) {
// NOTE(review): the loop exits only through the returns below. If neither
// branch can make progress (for example nstk == 0 with atoms == maxatoms_
// and no operator taking zero arguments) it looks like it would spin
// forever -- confirm callers never reach that state.
for (;;) {
// Stop if we get to a single element, but only sometimes.
// (The argument to Uniform shrinks as more atoms are used; presumably
// Uniform(k) draws from [0, k), making a stop more likely -- confirm.)
if (nstk == 1 && acm_->Uniform(maxatoms_ + 1 - atoms) == 0) {
RunPostfix(*post);
return true;
}
// Early out: if used too many operators or can't
// get back down to a single expression on the stack
// using binary operators, give up.
if (ops + nstk - 1 > maxops_)
return false;
// Add operators if there are enough arguments.
if (ops < maxops_ && acm_->Uniform(2) == 0) {
const string& fmt = ops_[acm_->Uniform(ops_.size())];
int nargs = CountArgs(fmt);
if (nargs <= nstk) {
post->push_back(fmt);
bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
ops + 1, atoms);
// Undo the tentative operator before reporting or retrying.
post->pop_back();
if (ret)
return true;
}
}
// Add atoms if there is room.
if (atoms < maxatoms_ && acm_->Uniform(2) == 0) {
post->push_back(atoms_[acm_->Uniform(atoms_.size())]);
bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
// Undo the tentative atom before reporting or retrying.
post->pop_back();
if (ret)
return true;
}
}
}
// Interprets the postfix command sequence to create a regular expression
// passed to HandleRegexp. The results of operators like %s|%s are wrapped
// in (?: ) to avoid needing to maintain a precedence table.
void RegexpGenerator::RunPostfix(const vector<string>& post) {
stack<string> regexps;
for (int i = 0; i < post.size(); i++) {
switch (CountArgs(post[i])) {
default:
LOG(FATAL) << "Bad operator: " << post[i];
case 0:
regexps.push(post[i]);
break;
case 1: {
string a = regexps.top();
regexps.pop();
regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")");
break;
}
case 2: {
string b = regexps.top();
regexps.pop();
string a = regexps.top();
regexps.pop();
regexps.push("(?:" +
StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) +
")");
break;
}
}
}
if (regexps.size() != 1) {
// Internal error - should never happen.
printf("Bad regexp program:\n");
for (int i = 0; i < post.size(); i++) {
printf(" %s\n", CEscape(post[i]).c_str());
}
printf("Stack after running program:\n");
while (!regexps.empty()) {
printf(" %s\n", CEscape(regexps.top()).c_str());
regexps.pop();
}
LOG(FATAL) << "Bad regexp program.";
}
HandleRegexp(regexps.top());
HandleRegexp("^(?:" + regexps.top() + ")$");
HandleRegexp("^(?:" + regexps.top() + ")");
HandleRegexp("(?:" + regexps.top() + ")$");
}
// Breaks s into a vector of strings, one per UTF-8 character as decoded by
// chartorune.
vector<string> Explode(const StringPiece& s) {
  vector<string> pieces;
  const char* cur = s.begin();
  while (cur < s.end()) {
    Rune r;
    // chartorune reports how many bytes the character at cur occupies.
    const int nbytes = chartorune(&r, cur);
    pieces.push_back(string(cur, nbytes));
    cur += nbytes;
  }
  return pieces;
}
// Cuts s at every occurrence of sep and returns the pieces in order.
// An empty sep falls back to Explode(s). A piece between two adjacent
// separators is kept as an empty string, but nothing is added after a
// separator that ends s.
vector<string> Split(const StringPiece& sep, const StringPiece& s) {
  if (sep.size() == 0)
    return Explode(s);

  vector<string> pieces;
  const char* piece_start = s.begin();
  const char* scan = s.begin();
  while (scan + sep.size() <= s.end()) {
    if (StringPiece(scan, sep.size()) == sep) {
      // Emit the text before the separator and resume right after it.
      pieces.push_back(string(piece_start, scan - piece_start));
      scan += sep.size();
      piece_start = scan;
    } else {
      scan++;
    }
  }

  // Whatever follows the last separator, unless it is empty.
  if (piece_start < s.end())
    pieces.push_back(string(piece_start, s.end() - piece_start));
  return pieces;
}
} // namespace re2

View File

@ -0,0 +1,70 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression generator: generates all possible
// regular expressions within given parameters (see below for details).
#ifndef RE2_TESTING_REGEXP_GENERATOR_H__
#define RE2_TESTING_REGEXP_GENERATOR_H__
#include <string>
#include <vector>
#include "util/random.h"
#include "util/util.h"
#include "re2/stringpiece.h"
namespace re2 {
// Regular expression generator.
//
// Given a set of atom expressions like "a", "b", or "."
// and operators like "%s*", generates all possible regular expressions
// using at most maxatoms atom expressions and maxops operators.
// For each such expression re, calls HandleRegexp(re).
//
// Callers are expected to subclass RegexpGenerator and provide HandleRegexp.
//
// Base class for regexp generators: subclasses override HandleRegexp to
// receive each generated expression.
class RegexpGenerator {
public:
// maxatoms and maxops cap the number of atoms and operators used in any
// one expression; atoms and ops list the choices for each.
RegexpGenerator(int maxatoms, int maxops, const vector<string>& atoms,
const vector<string>& ops);
virtual ~RegexpGenerator() {}
// Generates all the regular expressions, calling HandleRegexp(re) for each.
void Generate();
// Generates n random regular expressions, calling HandleRegexp(re) for each.
void GenerateRandom(int32 seed, int n);
// Handles a regular expression. Must be provided by subclass.
virtual void HandleRegexp(const string& regexp) = 0;
// The egrep regexp operators: * + ? | and concatenation
// (the returned list also contains a "%s\\C*" template).
static const vector<string>& EgrepOps();
private:
// Turns a postfix command sequence into a regexp and calls HandleRegexp.
void RunPostfix(const vector<string>& post);
// Exhaustive and random builders of postfix command sequences. The last
// parameter is named `atoms` in the definitions.
void GeneratePostfix(vector<string>* post, int nstk, int ops, int lits);
bool GenerateRandomPostfix(vector<string>* post, int nstk, int ops, int lits);
int maxatoms_; // Maximum number of atoms allowed in expr.
int maxops_; // Maximum number of ops allowed in expr.
vector<string> atoms_; // Possible atoms.
vector<string> ops_; // Possible ops.
ACMRandom* acm_; // Random generator; set by GenerateRandom for the duration of its run.
DISALLOW_EVIL_CONSTRUCTORS(RegexpGenerator);
};
// Helpers for preparing arguments to RegexpGenerator constructor.
// Returns one string for each UTF-8 character in s, in order.
vector<string> Explode(const StringPiece& s);
// Splits string everywhere sep is found, returning
// vector of pieces. An empty sep behaves like Explode(s).
vector<string> Split(const StringPiece& sep, const StringPiece& s);
} // namespace re2
#endif // RE2_TESTING_REGEXP_GENERATOR_H__

View File

@ -0,0 +1,81 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test parse.cc, dump.cc, and tostring.cc.
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/regexp.h"
namespace re2 {
// Checks that reference counts which have overflowed still work: after
// taking and releasing 100000 extra references, exactly one must remain.
TEST(Regexp, BigRef) {
  const int kExtraRefs = 100000;
  Regexp* re = Regexp::Parse("x", Regexp::NoParseFlags, NULL);

  int taken = 0;
  while (taken < kExtraRefs) {
    re->Incref();
    taken++;
  }
  int released = 0;
  while (released < kExtraRefs) {
    re->Decref();
    released++;
  }

  CHECK_EQ(re->Ref(), 1);
  re->Decref();
}
// Checks that a very large Concat works.
// Relies on overflowed reference counts working.
TEST(Regexp, BigConcat) {
  Regexp* x = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
  vector<Regexp*> v(90000, x);  // ToString bails out at 100000

  // Take one extra reference for each slot in the vector.
  for (vector<Regexp*>::size_type slot = 0; slot != v.size(); ++slot)
    x->Incref();
  CHECK_EQ(x->Ref(), 1 + v.size()) << x->Ref();

  Regexp* re = Regexp::Concat(&v[0], v.size(), Regexp::NoParseFlags);
  CHECK_EQ(re->ToString(), string(v.size(), 'x'));

  // Once the concatenation is released only the original reference to x
  // is expected to remain.
  re->Decref();
  CHECK_EQ(x->Ref(), 1) << x->Ref();
  x->Decref();
}
// Checks the name -> index map returned by NamedCaptures() for a pattern
// with four capture groups, three of them named; the name g1 is used twice
// and is expected to map to group 1.
TEST(Regexp, NamedCaptures) {
Regexp* x;
RegexpStatus status;
x = Regexp::Parse(
"(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
EXPECT_TRUE(status.ok());
EXPECT_EQ(4, x->NumCaptures());
const map<string, int>* have = x->NamedCaptures();
EXPECT_TRUE(have != NULL);
EXPECT_EQ(2, have->size()); // there are only two named groups in
// the regexp: 'g1' and 'g2'.
map<string, int> want;
want["g1"] = 1;
want["g2"] = 3;
EXPECT_EQ(want, *have);
x->Decref();
// The test deletes the returned map itself.
delete have;
}
// Checks NumCaptures() and the index -> name map returned by
// CaptureNames() for a pattern with three named groups and one unnamed.
TEST(Regexp, CaptureNames) {
  RegexpStatus status;
  Regexp* parsed = Regexp::Parse(
      "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
  EXPECT_TRUE(status.ok());
  EXPECT_EQ(4, parsed->NumCaptures());

  const map<int, string>* actual = parsed->CaptureNames();
  EXPECT_TRUE(actual != NULL);
  EXPECT_EQ(3, actual->size());

  // Group 2, "(e)", is unnamed and so is expected to be absent from the map.
  map<int, string> expected;
  expected[4] = "g1";
  expected[3] = "g2";
  expected[1] = "g1";
  EXPECT_EQ(expected, *actual);

  parsed->Decref();
  delete actual;
}
} // namespace re2

View File

@ -0,0 +1,67 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "re2/regexp.h"
namespace re2 {
// One RequiredPrefix test case.
struct PrefixTest {
  const char* regexp;   // Pattern to parse.
  bool return_value;    // Expected result of Regexp::RequiredPrefix.
  // The remaining fields are only compared when return_value is true;
  // rows that expect false leave them value-initialized.
  const char* prefix;   // Expected required prefix.
  bool foldcase;        // Expected case-folding flag.
  const char* suffix;   // Expected ToString() of the remaining regexp.
};

// Cases exercised by the SimpleTests test.
static PrefixTest tests[] = {
  // If the regexp is missing a ^, there's no required prefix.
  { "abc", false },
  { "", false },
  { "(?m)^", false },

  // If the regexp immediately goes into
  // something not a literal match, there's no required prefix.
  { "^(abc)", false },
  { "^a*", false },

  // Otherwise, it should work.
  { "^abc$", true, "abc", false, "(?-m:$)" },
  // return_value used to be the string literal "true" here; it only
  // worked through pointer-to-bool conversion, which is a narrowing
  // conversion inside a braced initializer.
  { "^abc", true, "abc", false, "" },
  { "^(?i)abc", true, "abc", true, "" },
  { "^abcd*", true, "abc", false, "d*" },
  { "^[Aa][Bb]cd*", true, "ab", true, "cd*" },
  { "^ab[Cc]d*", true, "ab", false, "[Cc]d*" },
  { "^☺abc", true, "☺abc", false, "" },
};
// Runs every row of tests through Regexp::RequiredPrefix twice: first
// parsed with the Latin1 flag added, then with plain LikePerl flags.
TEST(RequiredPrefix, SimpleTests) {
  for (int row = 0; row < arraysize(tests); row++) {
    const PrefixTest& tc = tests[row];
    for (int pass = 0; pass < 2; pass++) {
      const bool latin1 = (pass == 0);
      Regexp::ParseFlags flags = Regexp::LikePerl;
      if (latin1)
        flags = flags | Regexp::Latin1;

      Regexp* re = Regexp::Parse(tc.regexp, flags, NULL);
      CHECK(re) << " " << tc.regexp;

      string prefix;
      bool foldcase = false;
      Regexp* suffix = NULL;
      CHECK_EQ(tc.return_value,
               re->RequiredPrefix(&prefix, &foldcase, &suffix))
        << " " << tc.regexp << " " << (latin1 ? "latin1" : "utf")
        << " " << re->Dump();

      // The outputs are only inspected (and the suffix only released)
      // when a required prefix was expected.
      if (tc.return_value) {
        CHECK_EQ(prefix, string(tc.prefix))
          << " " << tc.regexp << " " << (latin1 ? "latin1" : "utf");
        CHECK_EQ(foldcase, tc.foldcase)
          << " " << tc.regexp << " " << (latin1 ? "latin1" : "utf");
        CHECK_EQ(suffix->ToString(), string(tc.suffix))
          << " " << tc.regexp << " " << (latin1 ? "latin1" : "utf");
        suffix->Decref();
      }
      re->Decref();
    }
  }
}
} // namespace re2

View File

@ -0,0 +1,325 @@
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stdlib.h>
#include <vector>
#include "util/test.h"
#include "re2/prog.h"
#include "re2/regexp.h"
#include "re2/testing/tester.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {
// One search test case: a regular expression and the text to run it on.
struct RegexpTest {
const char* regexp;  // Pattern source.
const char* text;  // Text to search; the table below includes one NULL text.
};
// Table of (regexp, text) pairs. The SearchTests test below passes each
// pair to TestRegexpOnText and counts the pairs for which it fails.
RegexpTest simple_tests[] = {
{ "a", "a" },
{ "a", "zyzzyva" },
{ "a+", "aa" },
{ "(a+|b)+", "ab" },
{ "ab|cd", "xabcdx" },
{ "h.*od?", "hello\ngoodbye\n" },
{ "h.*o", "hello\ngoodbye\n" },
{ "h.*o", "goodbye\nhello\n" },
{ "h.*o", "hello world" },
{ "h.*o", "othello, world" },
{ "[^\\s\\S]", "aaaaaaa" },
{ "a", "aaaaaaa" },
{ "a*", "aaaaaaa" },
{ "a*", "" },
// NULL text, as opposed to the empty text in the row above.
{ "a*", NULL },
{ "ab|cd", "xabcdx" },
{ "a", "cab" },
{ "a*b", "cab" },
{ "((((((((((((((((((((x))))))))))))))))))))", "x" },
{ "[abcd]", "xxxabcdxxx" },
{ "[^x]", "xxxabcdxxx" },
{ "[abcd]+", "xxxabcdxxx" },
{ "[^x]+", "xxxabcdxxx" },
{ "(fo|foo)", "fo" },
{ "(foo|fo)", "foo" },
{ "aa", "aA" },
{ "a", "Aa" },
{ "a", "A" },
{ "ABC", "abc" },
{ "abc", "XABCY" },
{ "ABC", "xabcy" },
// Make sure ^ and $ work.
// The pathological cases didn't work
// in the original grep code.
{ "foo|bar|[A-Z]", "foo" },
{ "^(foo|bar|[A-Z])", "foo" },
{ "(foo|bar|[A-Z])$", "foo\n" },
{ "(foo|bar|[A-Z])$", "foo" },
{ "^(foo|bar|[A-Z])$", "foo\n" },
{ "^(foo|bar|[A-Z])$", "foo" },
{ "^(foo|bar|[A-Z])$", "bar" },
{ "^(foo|bar|[A-Z])$", "X" },
{ "^(foo|bar|[A-Z])$", "XY" },
{ "^(fo|foo)$", "fo" },
{ "^(fo|foo)$", "foo" },
{ "^^(fo|foo)$", "fo" },
{ "^^(fo|foo)$", "foo" },
{ "^$", "" },
{ "^$", "x" },
{ "^^$", "" },
{ "^$$", "" },
{ "^^$", "x" },
{ "^$$", "x" },
{ "^^$$", "" },
{ "^^$$", "x" },
{ "^^^^^^^^$$$$$$$$", "" },
{ "^", "x" },
{ "$", "x" },
// Word boundaries.
{ "\\bfoo\\b", "nofoo foo that" },
{ "a\\b", "faoa x" },
{ "\\bbar", "bar x" },
{ "\\bbar", "foo\nbar x" },
{ "bar\\b", "foobar" },
{ "bar\\b", "foobar\nxxx" },
{ "(foo|bar|[A-Z])\\b", "foo" },
{ "(foo|bar|[A-Z])\\b", "foo\n" },
{ "\\b", "" },
{ "\\b", "x" },
{ "\\b(foo|bar|[A-Z])", "foo" },
{ "\\b(foo|bar|[A-Z])\\b", "X" },
{ "\\b(foo|bar|[A-Z])\\b", "XY" },
{ "\\b(foo|bar|[A-Z])\\b", "bar" },
{ "\\b(foo|bar|[A-Z])\\b", "foo" },
{ "\\b(foo|bar|[A-Z])\\b", "foo\n" },
{ "\\b(foo|bar|[A-Z])\\b", "ffoo bbar N x" },
{ "\\b(fo|foo)\\b", "fo" },
{ "\\b(fo|foo)\\b", "foo" },
{ "\\b\\b", "" },
{ "\\b\\b", "x" },
{ "\\b$", "" },
{ "\\b$", "x" },
{ "\\b$", "y x" },
{ "\\b.$", "x" },
{ "^\\b(fo|foo)\\b", "fo" },
{ "^\\b(fo|foo)\\b", "foo" },
{ "^\\b", "" },
{ "^\\b", "x" },
{ "^\\b\\b", "" },
{ "^\\b\\b", "x" },
{ "^\\b$", "" },
{ "^\\b$", "x" },
{ "^\\b.$", "x" },
{ "^\\b.\\b$", "x" },
{ "^^^^^^^^\\b$$$$$$$", "" },
{ "^^^^^^^^\\b.$$$$$$", "x" },
{ "^^^^^^^^\\b$$$$$$$", "x" },
// Non-word boundaries.
{ "\\Bfoo\\B", "n foo xfoox that" },
{ "a\\B", "faoa x" },
{ "\\Bbar", "bar x" },
{ "\\Bbar", "foo\nbar x" },
{ "bar\\B", "foobar" },
{ "bar\\B", "foobar\nxxx" },
{ "(foo|bar|[A-Z])\\B", "foox" },
{ "(foo|bar|[A-Z])\\B", "foo\n" },
{ "\\B", "" },
{ "\\B", "x" },
{ "\\B(foo|bar|[A-Z])", "foo" },
{ "\\B(foo|bar|[A-Z])\\B", "xXy" },
{ "\\B(foo|bar|[A-Z])\\B", "XY" },
{ "\\B(foo|bar|[A-Z])\\B", "XYZ" },
{ "\\B(foo|bar|[A-Z])\\B", "abara" },
{ "\\B(foo|bar|[A-Z])\\B", "xfoo_" },
{ "\\B(foo|bar|[A-Z])\\B", "xfoo\n" },
{ "\\B(foo|bar|[A-Z])\\B", "foo bar vNx" },
{ "\\B(fo|foo)\\B", "xfoo" },
{ "\\B(foo|fo)\\B", "xfooo" },
{ "\\B\\B", "" },
{ "\\B\\B", "x" },
{ "\\B$", "" },
{ "\\B$", "x" },
{ "\\B$", "y x" },
{ "\\B.$", "x" },
{ "^\\B(fo|foo)\\B", "fo" },
{ "^\\B(fo|foo)\\B", "foo" },
{ "^\\B", "" },
{ "^\\B", "x" },
{ "^\\B\\B", "" },
{ "^\\B\\B", "x" },
{ "^\\B$", "" },
{ "^\\B$", "x" },
{ "^\\B.$", "x" },
{ "^\\B.\\B$", "x" },
{ "^^^^^^^^\\B$$$$$$$", "" },
{ "^^^^^^^^\\B.$$$$$$", "x" },
{ "^^^^^^^^\\B$$$$$$$", "x" },
// PCRE uses only ASCII for \b computation.
// All non-ASCII are *not* word characters.
{ "\\bx\\b", "x" },
{ "\\bx\\b", "x>" },
{ "\\bx\\b", "<x" },
{ "\\bx\\b", "<x>" },
{ "\\bx\\b", "ax" },
{ "\\bx\\b", "xb" },
{ "\\bx\\b", "axb" },
{ "\\bx\\b", "«x" },
{ "\\bx\\b", "" },
{ "\\bx\\b", "«x»" },
{ "\\bx\\b", "axb" },
{ "\\bx\\b", "áxβ" },
{ "\\Bx\\B", "axb" },
{ "\\Bx\\B", "áxβ" },
// Weird boundary cases.
{ "^$^$", "" },
{ "^$^", "" },
{ "$^$", "" },
{ "^$^$", "x" },
{ "^$^", "x" },
{ "$^$", "x" },
{ "^$^$", "x\ny" },
{ "^$^", "x\ny" },
{ "$^$", "x\ny" },
{ "^$^$", "x\n\ny" },
{ "^$^", "x\n\ny" },
{ "$^$", "x\n\ny" },
{ "^(foo\\$)$", "foo$bar" },
{ "(foo\\$)", "foo$bar" },
{ "^...$", "abc" },
// UTF-8
{ "^\xe6\x9c\xac$", "\xe6\x9c\xac" },
{ "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
{ "^...$", ".\xe6\x9c\xac." },
{ "^\\C\\C\\C$", "\xe6\x9c\xac" },
{ "^\\C$", "\xe6\x9c\xac" },
{ "^\\C\\C\\C$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
// Latin1
{ "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
{ "^.........$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
{ "^...$", ".\xe6\x9c\xac." },
{ "^.....$", ".\xe6\x9c\xac." },
// Perl v Posix
{ "\\B(fo|foo)\\B", "xfooo" },
{ "(fo|foo)", "foo" },
// Octal escapes.
{ "\\141", "a" },
{ "\\060", "0" },
{ "\\0600", "00" },
{ "\\608", "08" },
{ "\\01", "\01" },
{ "\\018", "\01" "8" },
// Hexadecimal escapes
{ "\\x{61}", "a" },
{ "\\x61", "a" },
{ "\\x{00000061}", "a" },
// Unicode scripts.
{ "\\p{Greek}+", "aαβb" },
{ "\\P{Greek}+", "aαβb" },
{ "\\p{^Greek}+", "aαβb" },
{ "\\P{^Greek}+", "aαβb" },
// Unicode properties. Nd is decimal number. N is any number.
{ "[^0-9]+", "abc123" },
{ "\\p{Nd}+", "abc123²³¼½¾₀₉" },
{ "\\p{^Nd}+", "abc123²³¼½¾₀₉" },
{ "\\P{Nd}+", "abc123²³¼½¾₀₉" },
{ "\\P{^Nd}+", "abc123²³¼½¾₀₉" },
{ "\\pN+", "abc123²³¼½¾₀₉" },
{ "\\p{N}+", "abc123²³¼½¾₀₉" },
{ "\\p{^N}+", "abc123²³¼½¾₀₉" },
{ "\\p{Any}+", "abc123" },
// Character classes & case folding.
{ "(?i)[@-A]+", "@AaB" }, // matches @Aa but not B
{ "(?i)[A-Z]+", "aAzZ" },
{ "(?i)[^\\\\]+", "Aa\\" }, // \\ is between A-Z and a-z -
// splits the ranges in an interesting way.
// would like to use, but PCRE mishandles in full-match, non-greedy mode
// { "(?i)[\\\\]+", "Aa" },
{ "(?i)[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },
// The same character classes as above, this time without (?i) case folding.
{ "[@-A]+", "@AaB" },
{ "[A-Z]+", "aAzZ" },
{ "[^\\\\]+", "Aa\\" },
{ "[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },
// Anchoring. (^abc in aabcdef was a former bug)
// The tester checks for a match in the text and
// subpieces of the text with a byte removed on either side.
{ "^abc", "abcdef" },
{ "^abc", "aabcdef" },
{ "^[ay]*[bx]+c", "abcdef" },
{ "^[ay]*[bx]+c", "aabcdef" },
{ "def$", "abcdef" },
{ "def$", "abcdeff" },
{ "d[ex][fy]$", "abcdef" },
{ "d[ex][fy]$", "abcdeff" },
{ "[dz][ex][fy]$", "abcdef" },
{ "[dz][ex][fy]$", "abcdeff" },
{ "(?m)^abc", "abcdef" },
{ "(?m)^abc", "aabcdef" },
{ "(?m)^[ay]*[bx]+c", "abcdef" },
{ "(?m)^[ay]*[bx]+c", "aabcdef" },
{ "(?m)def$", "abcdef" },
{ "(?m)def$", "abcdeff" },
{ "(?m)d[ex][fy]$", "abcdef" },
{ "(?m)d[ex][fy]$", "abcdeff" },
{ "(?m)[dz][ex][fy]$", "abcdef" },
{ "(?m)[dz][ex][fy]$", "abcdeff" },
{ "^", "a" },
{ "^^", "a" },
// Context.
// The tester checks for a match in the text and
// subpieces of the text with a byte removed on either side.
{ "a", "a" },
{ "ab*", "a" },
{ "a\\C*", "a" },
// Former bugs.
{ "a\\C*|ba\\C", "baba" },
};
// Runs every entry of simple_tests through TestRegexpOnText and expects
// that none of them fails.
TEST(Regexp, SearchTests) {
  int nfail = 0;
  for (int idx = 0; idx < arraysize(simple_tests); idx++) {
    const RegexpTest& tc = simple_tests[idx];
    const bool passed = TestRegexpOnText(tc.regexp, tc.text);
    if (!passed) {
      nfail++;
    }

#ifdef LOGGING
    // Build a dummy ExhaustiveTest call that will trigger just
    // this one test, so that we log the test case.
    vector<string> atom, alpha, ops;
    atom.push_back(StringPiece(tc.regexp).as_string());
    alpha.push_back(StringPiece(tc.text).as_string());
    ExhaustiveTest(1, 0, atom, ops, 1, alpha, "", "");
#endif
  }
  EXPECT_EQ(nfail, 0);
}
} // namespace re2

View File

@ -0,0 +1,114 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <sys/types.h>
#include <sys/stat.h>
#include <vector>
#include "util/test.h"
#include "re2/re2.h"
#include "re2/set.h"
namespace re2 {
// An unanchored set of two disjoint literals: each literal is reported
// whenever it occurs anywhere in the text.
TEST(Set, Unanchored) {
  RE2::Set patterns(RE2::DefaultOptions, RE2::UNANCHORED);

  // Add() yields the new pattern's index. The malformed "(" is expected
  // to be rejected with -1 without using up an index.
  CHECK_EQ(patterns.Add("foo", NULL), 0);
  CHECK_EQ(patterns.Add("(", NULL), -1);
  CHECK_EQ(patterns.Add("bar", NULL), 1);
  CHECK_EQ(patterns.Compile(), true);

  vector<int> hits;

  // Both literals occur.
  CHECK_EQ(patterns.Match("foobar", &hits), true);
  CHECK_EQ(hits.size(), 2);
  CHECK_EQ(hits[0], 0);
  CHECK_EQ(hits[1], 1);

  // Only "foo" occurs.
  hits.clear();
  CHECK_EQ(patterns.Match("fooba", &hits), true);
  CHECK_EQ(hits.size(), 1);
  CHECK_EQ(hits[0], 0);

  // Only "bar" occurs.
  hits.clear();
  CHECK_EQ(patterns.Match("oobar", &hits), true);
  CHECK_EQ(hits.size(), 1);
  CHECK_EQ(hits[0], 1);
}
// An unanchored set whose two literals share the prefix "foo".
TEST(Set, UnanchoredFactored) {
  RE2::Set patterns(RE2::DefaultOptions, RE2::UNANCHORED);

  // As elsewhere in this file, the malformed "(" is expected to be
  // rejected with -1 without using up an index.
  CHECK_EQ(patterns.Add("foo", NULL), 0);
  CHECK_EQ(patterns.Add("(", NULL), -1);
  CHECK_EQ(patterns.Add("foobar", NULL), 1);
  CHECK_EQ(patterns.Compile(), true);

  vector<int> hits;

  // "foobar" contains both "foo" and "foobar".
  CHECK_EQ(patterns.Match("foobar", &hits), true);
  CHECK_EQ(hits.size(), 2);
  CHECK_EQ(hits[0], 0);
  CHECK_EQ(hits[1], 1);

  // Both are still found when they sit in the middle of the text.
  hits.clear();
  CHECK_EQ(patterns.Match("obarfoobaroo", &hits), true);
  CHECK_EQ(hits.size(), 2);
  CHECK_EQ(hits[0], 0);
  CHECK_EQ(hits[1], 1);

  // Only the shorter literal occurs.
  hits.clear();
  CHECK_EQ(patterns.Match("fooba", &hits), true);
  CHECK_EQ(hits.size(), 1);
  CHECK_EQ(hits[0], 0);

  // Neither literal occurs.
  hits.clear();
  CHECK_EQ(patterns.Match("oobar", &hits), false);
  CHECK_EQ(hits.size(), 0);
}
// A pattern ending in $ inside an unanchored set must still match
// text that ends with the literal.
TEST(Set, UnanchoredDollar) {
  RE2::Set patterns(RE2::DefaultOptions, RE2::UNANCHORED);
  CHECK_EQ(patterns.Add("foo$", NULL), 0);
  CHECK_EQ(patterns.Compile(), true);

  vector<int> hits;
  CHECK_EQ(patterns.Match("foo", &hits), true);
  CHECK_EQ(hits.size(), 1);
  CHECK_EQ(hits[0], 0);
}
// A set anchored at both ends: a pattern is reported only when it
// matches the entire text.
TEST(Set, Anchored) {
  RE2::Set patterns(RE2::DefaultOptions, RE2::ANCHOR_BOTH);

  // The malformed "(" is expected to be rejected with -1 without using
  // up an index.
  CHECK_EQ(patterns.Add("foo", NULL), 0);
  CHECK_EQ(patterns.Add("(", NULL), -1);
  CHECK_EQ(patterns.Add("bar", NULL), 1);
  CHECK_EQ(patterns.Compile(), true);

  // Note that, unlike the unanchored tests, hits is never cleared
  // between calls here.
  vector<int> hits;

  // Texts that merely contain a literal do not match.
  CHECK_EQ(patterns.Match("foobar", &hits), false);
  CHECK_EQ(hits.size(), 0);
  CHECK_EQ(patterns.Match("fooba", &hits), false);
  CHECK_EQ(hits.size(), 0);
  CHECK_EQ(patterns.Match("oobar", &hits), false);
  CHECK_EQ(hits.size(), 0);

  // Texts equal to a literal match exactly that pattern.
  CHECK_EQ(patterns.Match("foo", &hits), true);
  CHECK_EQ(hits.size(), 1);
  CHECK_EQ(hits[0], 0);
  CHECK_EQ(patterns.Match("bar", &hits), true);
  CHECK_EQ(hits.size(), 1);
  CHECK_EQ(hits[0], 1);
}
} // namespace re2

View File

@ -0,0 +1,167 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test simplify.cc.
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/regexp.h"
namespace re2 {
// One simplification test case.
struct Test {
const char* regexp;  // Pattern to parse and simplify.
const char* simplified;  // Expected ToString() of the simplified regexp.
};
// Table of (regexp, expected simplified form) pairs used by the
// SimpleRegexps test below. Rows whose two strings are identical are
// additionally required by that test to simplify to the same Regexp object.
static Test tests[] = {
// Already-simple constructs
{ "a", "a" },
{ "ab", "ab" },
{ "a|b", "[a-b]" },
{ "ab|cd", "ab|cd" },
{ "(ab)*", "(ab)*" },
{ "(ab)+", "(ab)+" },
{ "(ab)?", "(ab)?" },
{ ".", "." },
{ "^", "^" },
{ "$", "$" },
{ "[ac]", "[ac]" },
{ "[^ac]", "[^ac]" },
// Posix character classes
{ "[[:alnum:]]", "[0-9A-Za-z]" },
{ "[[:alpha:]]", "[A-Za-z]" },
{ "[[:blank:]]", "[\\t ]" },
{ "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },
{ "[[:digit:]]", "[0-9]" },
{ "[[:graph:]]", "[!-~]" },
{ "[[:lower:]]", "[a-z]" },
{ "[[:print:]]", "[ -~]" },
{ "[[:punct:]]", "[!-/:-@\\[-`{-~]" },
{ "[[:space:]]" , "[\\t-\\r ]" },
{ "[[:upper:]]", "[A-Z]" },
{ "[[:xdigit:]]", "[0-9A-Fa-f]" },
// Perl character classes
{ "\\d", "[0-9]" },
{ "\\s", "[\\t-\\n\\f-\\r ]" },
{ "\\w", "[0-9A-Z_a-z]" },
{ "\\D", "[^0-9]" },
{ "\\S", "[^\\t-\\n\\f-\\r ]" },
{ "\\W", "[^0-9A-Z_a-z]" },
{ "[\\d]", "[0-9]" },
{ "[\\s]", "[\\t-\\n\\f-\\r ]" },
{ "[\\w]", "[0-9A-Z_a-z]" },
{ "[\\D]", "[^0-9]" },
{ "[\\S]", "[^\\t-\\n\\f-\\r ]" },
{ "[\\W]", "[^0-9A-Z_a-z]" },
// Posix repetitions
{ "a{1}", "a" },
{ "a{2}", "aa" },
{ "a{5}", "aaaaa" },
{ "a{0,1}", "a?" },
// The next six are hard to read because Simplify inserts (?:)
// parens instead of () parens to avoid creating extra
// captured subexpressions. The comments show a version with fewer parens.
{ "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)?
{ "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)?
{ "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)?
{ "a{0,2}", "(?:aa?)?" }, // (aa?)?
{ "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)?
{ "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)?
{ "a{0,}", "a*" },
{ "a{1,}", "a+" },
{ "a{2,}", "aa+" },
{ "a{5,}", "aaaaa+" },
// Test that operators simplify their arguments.
// (Simplify used to not simplify arguments to a {} repeat.)
{ "(?:a{1,}){1,}", "a+" },
{ "(a{1,}b{1,})", "(a+b+)" },
{ "a{1,}|b{1,}", "a+|b+" },
{ "(?:a{1,})*", "(?:a+)*" },
{ "(?:a{1,})+", "a+" },
{ "(?:a{1,})?", "(?:a+)?" },
{ "a{0}", "" },
// Character class simplification
{ "[ab]", "[a-b]" },
{ "[a-za-za-z]", "[a-z]" },
{ "[A-Za-zA-Za-z]", "[A-Za-z]" },
{ "[ABCDEFGH]", "[A-H]" },
{ "[AB-CD-EF-GH]", "[A-H]" },
{ "[W-ZP-XE-R]", "[E-Z]" },
{ "[a-ee-gg-m]", "[a-m]" },
{ "[a-ea-ha-m]", "[a-m]" },
{ "[a-ma-ha-e]", "[a-m]" },
{ "[a-zA-Z0-9 -~]", "[ -~]" },
// Empty character classes
{ "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },
// Full character classes
{ "[[:cntrl:][:^cntrl:]]", "." },
// Unicode case folding.
{ "(?i)A", "[Aa]" },
{ "(?i)a", "[Aa]" },
{ "(?i)K", "[Kk\\x{212a}]" },
{ "(?i)k", "[Kk\\x{212a}]" },
{ "(?i)\\x{212a}", "[Kk\\x{212a}]" },
{ "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },
{ "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },
{ "(?i)[\\x00-\\x{10ffff}]", "." },
// Empty string as a regular expression.
// Empty string must be preserved inside parens in order
// to make submatches work right, so these are less
// interesting than they used to be. ToString inserts
// explicit (?:) in place of non-parenthesized empty strings,
// to make them easier to spot for other parsers.
{ "(a|b|)", "([a-b]|(?:))" },
{ "(|)", "()" },
{ "a()", "a()" },
{ "(()|())", "(()|())" },
{ "(a|)", "(a|(?:))" },
{ "ab()cd()", "ab()cd()" },
{ "()", "()" },
{ "()*", "()*" },
{ "()+", "()+" },
{ "()?" , "()?" },
{ "(){0}", "" },
{ "(){1}", "()" },
{ "(){1,}", "()+" },
{ "(){0,2}", "(?:()()?)?" },
};
// Parses each row of tests, simplifies it, and compares the printed
// form of the result against the row's expected string.
TEST(TestSimplify, SimpleRegexps) {
  for (int idx = 0; idx < arraysize(tests); idx++) {
    const Test& tc = tests[idx];
    RegexpStatus status;
    VLOG(1) << "Testing " << tc.regexp;

    Regexp* parsed = Regexp::Parse(tc.regexp,
                                   Regexp::MatchNL | (Regexp::LikePerl &
                                                      ~Regexp::OneLine),
                                   &status);
    CHECK(parsed != NULL) << " " << tc.regexp << " " << status.Text();

    Regexp* simplified = parsed->Simplify();
    CHECK(simplified != NULL);

    // Check that already-simple regexps don't allocate new ones.
    const bool already_simple = (strcmp(tc.regexp, tc.simplified) == 0);
    if (already_simple) {
      CHECK(parsed == simplified) << " " << tc.regexp
        << " " << parsed->ToString() << " " << simplified->ToString();
    }

    EXPECT_EQ(tc.simplified, simplified->ToString())
      << " " << tc.regexp << " " << simplified->Dump();

    parsed->Decref();
    simplified->Decref();
  }
}
} // namespace re2

View File

@ -0,0 +1,113 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// String generator: generates all possible strings of up to
// maxlen letters using the set of letters in alpha.
// Fetch strings using a Java-like Next()/HasNext() interface.
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/testing/string_generator.h"
namespace re2 {
// Sets up a generator for strings of at most maxlen letters drawn from
// alphabet. The generator starts in sequential (non-random) mode with no
// NULL pending, and the first Next() returns the empty string because
// digits_ starts out empty.
StringGenerator::StringGenerator(int maxlen, const vector<string>& alphabet)
    : maxlen_(maxlen),
      alphabet_(alphabet),
      hasnext_(true),
      generate_null_(false),
      random_(false),
      nrandom_(0),
      acm_(NULL) {
  // Degenerate case: no letters, no non-empty strings.
  if (alphabet_.empty()) {
    maxlen_ = 0;
  }
}
// Frees the random number generator, if Random() ever created one
// (acm_ is NULL otherwise, which delete accepts).
StringGenerator::~StringGenerator() {
delete acm_;
}
// Puts the generator back at the start of its sequential enumeration:
// leaves random mode, drops any pending NULL, and empties digits_ so
// that the next string returned is the empty string.
void StringGenerator::Reset() {
  // Leave random mode.
  random_ = false;
  nrandom_ = 0;

  // Forget any pending NULL request.
  generate_null_ = false;

  // Restart the enumeration.
  digits_.clear();
  hasnext_ = true;
}
// Advances digits_, read as a number in base alphabet_.size(), to the
// next value. Returns true on success and false once every number of up
// to maxlen_ digits has been used.
bool StringGenerator::IncrementDigits() {
  // Odometer-style increment, starting from the last digit.
  int pos = digits_.size();
  while (--pos >= 0) {
    digits_[pos]++;
    if (digits_[pos] < alphabet_.size())
      return true;
    // This digit wrapped around; zero it and carry into the one before.
    digits_[pos] = 0;
  }

  // Every digit wrapped, so the only way forward is a longer number.
  if (digits_.size() >= maxlen_)
    return false;
  digits_.push_back(0);
  return true;
}
// Fills digits_ with a random length and random letter indices.
// Returns true on success and false when the random sequence is over.
bool StringGenerator::RandomDigits() {
  nrandom_--;
  if (nrandom_ <= 0)
    return false;

  // Pick the length first, then one alphabet index per position.
  const int len = acm_->Uniform(maxlen_+1);
  digits_.resize(len);
  for (int pos = 0; pos < len; pos++) {
    digits_[pos] = acm_->Uniform(alphabet_.size());
  }
  return true;
}
// Returns the next string of the iteration: the one that digits_
// currently describes, or a NULL piece if GenerateNULL() was requested.
// After building a regular string it advances digits_ right away, so
// that HasNext() already knows the answer for the following call.
const StringPiece& StringGenerator::Next() {
  CHECK(hasnext_);

  if (!generate_null_) {
    // Spell out the current digits using the alphabet.
    s_.clear();
    for (vector<int>::const_iterator digit = digits_.begin();
         digit != digits_.end(); ++digit) {
      s_ += alphabet_[*digit];
    }

    // Line up the string after this one.
    if (random_) {
      hasnext_ = RandomDigits();
    } else {
      hasnext_ = IncrementDigits();
    }
    sp_ = s_;
  } else {
    // A pending NULL is handed out once and leaves the enumeration
    // state (digits_, hasnext_) untouched.
    generate_null_ = false;
    sp_ = NULL;
  }
  return sp_;
}
// Switches the generator into random mode for n strings, seeding the
// random number generator with seed. The generator object is created on
// first use and re-seeded on later calls.
void StringGenerator::Random(int32 seed, int n) {
  if (acm_ != NULL) {
    acm_->Reset(seed);
  } else {
    acm_ = new ACMRandom(seed);
  }

  nrandom_ = n;
  hasnext_ = n > 0;
  random_ = true;
}
// Arranges for the next call to Next() to return a NULL StringPiece.
// HasNext() becomes true so that the call is permitted.
void StringGenerator::GenerateNULL() {
  hasnext_ = true;
  generate_null_ = true;
}
} // namespace re2

View File

@ -0,0 +1,58 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// String generator: generates all possible strings of up to
// maxlen letters using the set of letters in alpha.
// Fetch strings using a Java-like Next()/HasNext() interface.
#ifndef RE2_TESTING_STRING_GENERATOR_H__
#define RE2_TESTING_STRING_GENERATOR_H__
#include <string>
#include <vector>
#include "util/util.h"
#include "util/random.h"
#include "re2/stringpiece.h"
namespace re2 {
// Generates strings of up to maxlen letters over a given alphabet and
// hands them out one at a time through Next()/HasNext().
class StringGenerator {
public:
// Creates a generator of strings of at most maxlen letters, where
// alphabet holds one string per letter.
StringGenerator(int maxlen, const vector<string>& alphabet);
~StringGenerator();
// Returns the next string. May only be called while HasNext() is true.
// NOTE(review): the result refers to the member sp_ ("last StringPiece
// returned by Next()"), so it is presumably only valid until the
// following call to Next() -- confirm against the .cc.
const StringPiece& Next();
// Returns whether Next() can be called again.
bool HasNext() { return hasnext_; }
// Resets generator to start sequence over.
void Reset();
// Causes generator to emit random strings for next n calls to Next().
void Random(int32 seed, int n);
// Causes generator to emit a NULL as the next call.
void GenerateNULL();
private:
// Advance digits_ to the next string: sequentially or at random.
bool IncrementDigits();
bool RandomDigits();
// Global state.
int maxlen_; // Maximum length string to generate.
vector<string> alphabet_; // Alphabet, one string per letter.
// Iteration state.
StringPiece sp_; // Last StringPiece returned by Next().
string s_; // String data in last StringPiece returned by Next().
bool hasnext_; // Whether Next() can be called again.
vector<int> digits_; // Alphabet indices for next string.
bool generate_null_; // Whether to generate a NULL StringPiece next.
bool random_; // Whether generated strings are random.
int nrandom_; // Number of random strings left to generate.
ACMRandom* acm_; // Random number generator
DISALLOW_EVIL_CONSTRUCTORS(StringGenerator);
};
} // namespace re2
#endif // RE2_TESTING_STRING_GENERATOR_H__

View File

@ -0,0 +1,109 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test StringGenerator.
#include <stdlib.h>
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/testing/string_generator.h"
#include "re2/testing/regexp_generator.h"
namespace re2 {
// Returns i raised to the power e, computed by repeated multiplication.
// For e <= 0 the result is 1.
static int64 IntegerPower(int i, int e) {
  int64 result = 1;
  for (int step = 0; step < e; step++) {
    result *= i;
  }
  return result;
}
// Checks that for given settings of the string generator:
// * it generates strings that are non-decreasing in length.
// * strings of the same length are sorted in alphabet order.
// * it doesn't generate the same string twice.
// * it generates the right number of strings.
//
// If all of these hold, the StringGenerator is behaving.
// Assumes that the alphabet is sorted, so that the generated
// strings can just be compared lexicographically.
static void RunTest(int len, string alphabet, bool donull) {
StringGenerator g(len, Explode(alphabet));
int n = 0;
int last_l = -1;
string last_s;
if (donull) {
g.GenerateNULL();
EXPECT_TRUE(g.HasNext());
StringPiece sp = g.Next();
EXPECT_EQ(sp.data(), static_cast<const char*>(NULL));
EXPECT_EQ(sp.size(), 0);
}
while (g.HasNext()) {
string s = g.Next().as_string();
n++;
// Check that all characters in s appear in alphabet.
for (const char *p = s.c_str(); *p != '\0'; ) {
Rune r;
p += chartorune(&r, p);
EXPECT_TRUE(utfrune(alphabet.c_str(), r) != NULL);
}
// Check that string is properly ordered w.r.t. previous string.
int l = utflen(s.c_str());
EXPECT_LE(l, len);
if (last_l < l) {
last_l = l;
} else {
EXPECT_EQ(last_l, l);
EXPECT_LT(last_s, s);
}
last_s = s;
}
// Check total string count.
int64 m = 0;
int alpha = utflen(alphabet.c_str());
if (alpha == 0) // Degenerate case.
len = 0;
for (int i = 0; i <= len; i++)
m += IntegerPower(alpha, i);
EXPECT_EQ(n, m);
}
// Maximum length 0 with a non-empty alphabet.
TEST(StringGenerator, NoLength) {
RunTest(0, "abc", false);
}
// Maximum length 0 and an empty alphabet.
TEST(StringGenerator, NoLengthNoAlphabet) {
RunTest(0, "", false);
}
// Empty alphabet with a positive maximum length.
TEST(StringGenerator, NoAlphabet) {
RunTest(5, "", false);
}
// Ordinary case: three ASCII letters, up to three letters per string.
TEST(StringGenerator, Simple) {
RunTest(3, "abc", false);
}
// Alphabet containing a multi-byte UTF-8 letter (\xE2\x98\xBA).
TEST(StringGenerator, UTF8) {
RunTest(4, "abc\xE2\x98\xBA", false);
}
// Repeats all of the configurations above with a NULL string requested first.
TEST(StringGenerator, GenNULL) {
RunTest(0, "abc", true);
RunTest(0, "", true);
RunTest(5, "", true);
RunTest(3, "abc", true);
RunTest(4, "abc\xE2\x98\xBA", true);
}
} // namespace re2

View File

@ -0,0 +1,640 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression engine tester -- test all the implementations against each other.
#include "util/util.h"
#include "util/flags.h"
#include "re2/testing/tester.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
// Command-line flags controlling the tester.
// dump_prog / dump_rprog: log the compiled forward / reversed program
// when a TestInstance is constructed.
DEFINE_bool(dump_prog, false, "dump regexp program");
DEFINE_bool(log_okay, false, "log successful runs");
DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
DEFINE_int32(max_regexp_failures, 100,
"maximum number of regexp test failures (-1 = unlimited)");
// regexp_engines: when non-empty, only engines whose name contains this
// string are tested (see Engines()).
DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
namespace re2 {
// Size of the submatch array kept in each Result; RunSearch caps the
// number of submatches it asks an engine for at this value.
enum {
kMaxSubmatch = 1+16, // $0...$16
};
// Printable engine names, indexed by Engine value (see EngineString).
// NOTE(review): the order presumably mirrors the Engine enum declared in
// tester.h -- confirm when adding or reordering engines.
const char* engine_types[kEngineMax] = {
"Backtrack",
"NFA",
"DFA",
"DFA1",
"OnePass",
"BitState",
"RE2",
"RE2a",
"RE2b",
"PCRE",
};
// Returns the printable name of engine type t. Values outside the
// engine_types table, or whose table entry is NULL, are rendered as
// "type<N>" instead.
static string EngineString(Engine t) {
  const bool in_table = t >= 0 && t < arraysize(engine_types);
  if (in_table && engine_types[t] != NULL) {
    return engine_types[t];
  }
  return StringPrintf("type%d", static_cast<int>(t));
}
// Returns the bit mask of engines to use: bit (1<<e) is set for every
// enabled Engine e. The mask is computed from FLAGS_regexp_engines on
// the first call and cached for all later calls.
static uint32 Engines() {
  static uint32 cached_engines;
  static bool did_parse;

  if (!did_parse) {
    if (!FLAGS_regexp_engines.empty()) {
      // Enable each engine whose name contains the flag value.
      for (Engine e = static_cast<Engine>(0); e < kEngineMax; e++) {
        if (strstr(EngineString(e).c_str(),
                   FLAGS_regexp_engines.c_str()) != NULL)
          cached_engines |= 1<<e;
      }
    } else {
      // No selection given: enable everything.
      cached_engines = ~0;
    }

    if (cached_engines == 0)
      LOG(INFO) << "Warning: no engines enabled.";

    // PCRE can only be tested when it is actually available.
    if (!UsingPCRE)
      cached_engines &= ~(1<<kEnginePCRE);

    // Log the final selection once.
    for (Engine e = static_cast<Engine>(0); e < kEngineMax; e++) {
      if ((cached_engines & (1<<e)) != 0)
        LOG(INFO) << EngineString(e) << " enabled";
    }

    did_parse = true;
  }
  return cached_engines;
}
// The result of running a match with one engine.
// RunSearch zero-fills the whole struct before filling it in, so every
// flag starts out false.
struct TestInstance::Result {
bool skipped; // test skipped: wasn't applicable
bool matched; // found a match
bool untrusted; // don't really trust the answer
bool have_submatch; // computed all submatch info
bool have_submatch0; // computed just submatch[0]
StringPiece submatch[kMaxSubmatch];
};
// Shorthand used throughout the rest of this file.
typedef TestInstance::Result Result;
// Formats the capture range s as "(a,b)", where a and b are the offsets
// of the start and end of s within text. A capture with no data
// (begin() == NULL) is formatted as "(?,?)".
static string FormatCapture(const StringPiece& text, const StringPiece& s) {
  if (s.begin() != NULL) {
    const int start = static_cast<int>(s.begin() - text.begin());
    const int limit = static_cast<int>(s.end() - text.begin());
    return StringPrintf("(%d,%d)", start, limit);
  }
  return "(?,?)";
}
// Returns whether text contains non-ASCII (>= 0x80) bytes.
static bool NonASCII(const StringPiece& text) {
for (int i = 0; i < text.size(); i++)
if ((uint8)text[i] >= 0x80)
return true;
return false;
}
// Returns a human-readable name for the match kind, or "???" for a
// value that is not one of the four known kinds.
static string FormatKind(Prog::MatchKind kind) {
  if (kind == Prog::kFullMatch)
    return "full match";
  if (kind == Prog::kLongestMatch)
    return "longest match";
  if (kind == Prog::kFirstMatch)
    return "first match";
  if (kind == Prog::kManyMatch)
    return "many match";
  return "???";
}
// Returns a human-readable name for the anchor kind, or "???" for a
// value that is neither kAnchored nor kUnanchored.
static string FormatAnchor(Prog::Anchor anchor) {
  if (anchor == Prog::kAnchored)
    return "anchored";
  if (anchor == Prog::kUnanchored)
    return "unanchored";
  return "???";
}
// A set of parse flags together with the description used for it in logs.
struct ParseMode {
Regexp::ParseFlags parse_flags;  // Flags passed to the parser.
string desc;  // Human-readable name; returned by FormatMode.
};
// Base flag sets: single_line is LikePerl unchanged; multi_line is
// LikePerl with the OneLine bit cleared.
static const Regexp::ParseFlags single_line =
Regexp::LikePerl;
static const Regexp::ParseFlags multi_line =
static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
// The named parse modes. FormatMode looks a flag set up here by exact
// equality to find its description.
static ParseMode parse_modes[] = {
{ single_line, "single-line" },
{ single_line|Regexp::Latin1, "single-line, latin1" },
{ multi_line, "multiline" },
{ multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
{ multi_line|Regexp::Latin1, "multiline, latin1" },
};
// Returns the description of the parse mode whose flags equal flags
// exactly. Flag sets that are not listed in parse_modes are rendered
// in hexadecimal instead.
static string FormatMode(Regexp::ParseFlags flags) {
  const ParseMode* const end = parse_modes + arraysize(parse_modes);
  for (const ParseMode* mode = parse_modes; mode != end; ++mode) {
    if (mode->parse_flags == flags) {
      return mode->desc;
    }
  }
  return StringPrintf("%#x", static_cast<uint>(flags));
}
// Constructs and saves all the matching engines that
// will be required for the given tests.
//
// regexp_str is the pattern source, kind the match kind the engines will
// be run with, and flags the parse flags. Each step below that fails logs
// the reason, sets error_ and returns early; objects created before that
// point stay in their members and the rest remain NULL.
TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
Regexp::ParseFlags flags)
: regexp_str_(regexp_str),
kind_(kind),
flags_(flags),
error_(false),
regexp_(NULL),
num_captures_(0),
prog_(NULL),
rprog_(NULL),
re_(NULL),
re2_(NULL) {
VLOG(1) << CEscape(regexp_str);
// Compile regexp to prog.
// Always required - needed for backtracking (reference implementation).
RegexpStatus status;
regexp_ = Regexp::Parse(regexp_str, flags, &status);
if (regexp_ == NULL) {
LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
<< " mode: " << FormatMode(flags);
error_ = true;
return;
}
num_captures_ = regexp_->NumCaptures();
prog_ = regexp_->CompileToProg(0);
if (prog_ == NULL) {
LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
error_ = true;
return;
}
if (FLAGS_dump_prog) {
LOG(INFO) << "Prog for "
<< " regexp "
<< CEscape(regexp_str_)
<< " (" << FormatKind(kind_)
<< ", " << FormatMode(flags_)
<< ")\n"
<< prog_->Dump();
}
// Compile regexp to reversed prog. Only needed for DFA engines.
if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
rprog_ = regexp_->CompileToReverseProg(0);
if (rprog_ == NULL) {
LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
error_ = true;
return;
}
if (FLAGS_dump_rprog)
LOG(INFO) << rprog_->Dump();
}
// Create re string that will be used for RE and RE2.
string re = regexp_str.as_string();
// Accommodate flags by prepending the equivalent inline flag groups.
// Regexp::Latin1 will be accommodated below.
if (!(flags & Regexp::OneLine))
re = "(?m)" + re;
if (flags & Regexp::NonGreedy)
re = "(?U)" + re;
if (flags & Regexp::DotNL)
re = "(?s)" + re;
// Compile regexp to RE2.
if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
RE2::Options options;
if (flags & Regexp::Latin1)
options.set_encoding(RE2::Options::EncodingLatin1);
if (kind_ == Prog::kLongestMatch)
options.set_longest_match(true);
re2_ = new RE2(re, options);
if (!re2_->error().empty()) {
LOG(INFO) << "Cannot RE2: " << CEscape(re);
error_ = true;
return;
}
}
// Compile regexp to RE.
// PCRE as exposed by the RE interface isn't always usable.
// 1. It disagrees about handling of empty-string repetitions
// like matching (a*)* against "b". PCRE treats the (a*) as
// occurring once, while we treat it as occurring not at all.
// 2. It treats $ as this weird thing meaning end of string
// or before the \n at the end of the string.
// 3. It doesn't implement POSIX leftmost-longest matching.
// MimicsPCRE() detects 1 and 2.
if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
kind_ != Prog::kLongestMatch) {
PCRE_Options o;
o.set_option(PCRE::UTF8);
if (flags & Regexp::Latin1)
o.set_option(PCRE::None);
// PCRE has interface bug keeping us from finding $0, so
// add one more layer of parens.
re_ = new PCRE("("+re+")", o);
if (!re_->error().empty()) {
LOG(INFO) << "Cannot PCRE: " << CEscape(re);
error_ = true;
return;
}
}
}
// Releases everything the constructor created. regexp_ is released with
// Decref() and guarded because it is NULL when parsing failed; the other
// members are deleted directly (deleting NULL is harmless).
TestInstance::~TestInstance() {
if (regexp_)
regexp_->Decref();
delete prog_;
delete rprog_;
delete re_;
delete re2_;
}
// Runs a single search using the named engine type.
// This interface hides all the irregularities of the various
// engine interfaces from the rest of this file.
//
// The outcome is written to *result, which is zeroed before anything else:
//   skipped        - the engine was not (or could not be) run on this input
//   matched        - the engine's match/no-match answer
//   have_submatch  - result->submatch was requested from the engine
//   have_submatch0 - only result->submatch[0] was requested (DFA1)
//   untrusted      - PCRE reported hitting its limit, so its answer is suspect
// When the search did not match, result->submatch is cleared on the way out.
void TestInstance::RunSearch(Engine type,
const StringPiece& orig_text,
const StringPiece& orig_context,
Prog::Anchor anchor,
Result *result) {
memset(result, 0, sizeof *result);
// Without a parsed regexp there is nothing any engine could run.
if (regexp_ == NULL) {
result->skipped = true;
return;
}
int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0
// Never ask an engine for more slots than are requested below;
// kMaxSubmatch is the cap used throughout this file.
if (nsubmatch > kMaxSubmatch)
nsubmatch = kMaxSubmatch;
StringPiece text = orig_text;
StringPiece context = orig_context;
switch (type) {
default:
LOG(FATAL) << "Bad RunSearch type: " << (int)type;
// Reference engine: the backtracker, asked for all submatches.
case kEngineBacktrack:
if (prog_ == NULL) {
result->skipped = true;
break;
}
result->matched =
prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
result->submatch, nsubmatch);
result->have_submatch = true;
break;
case kEngineNFA:
if (prog_ == NULL) {
result->skipped = true;
break;
}
result->matched =
prog_->SearchNFA(text, context, anchor, kind_,
result->submatch, nsubmatch);
result->have_submatch = true;
break;
// DFA, match/no-match only: no match position is requested (NULL), and
// SearchDFA is handed &result->skipped as its own output flag.
case kEngineDFA:
if (prog_ == NULL) {
result->skipped = true;
break;
}
result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
&result->skipped, NULL);
break;
// DFA asked for match[0]: a forward search with prog_, then a second
// search with rprog_ over the span the forward search reported.
case kEngineDFA1:
if (prog_ == NULL || rprog_ == NULL) {
result->skipped = true;
break;
}
result->matched =
prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
&result->skipped, NULL);
// If anchored, no need for second run,
// but do it anyway to find more bugs.
if (result->matched) {
if (!rprog_->SearchDFA(result->submatch[0], context,
Prog::kAnchored, Prog::kLongestMatch,
result->submatch,
&result->skipped, NULL)) {
LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_)
<< " on " << CEscape(text);
result->matched = false;
}
}
result->have_submatch0 = true;
break;
// One-pass is only attempted for anchored searches on programs that
// report IsOnePass() and need no more than kMaxOnePassCapture slots.
case kEngineOnePass:
if (prog_ == NULL ||
anchor == Prog::kUnanchored ||
!prog_->IsOnePass() ||
nsubmatch > Prog::kMaxOnePassCapture) {
result->skipped = true;
break;
}
result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
result->submatch, nsubmatch);
result->have_submatch = true;
break;
case kEngineBitState:
if (prog_ == NULL) {
result->skipped = true;
break;
}
result->matched = prog_->SearchBitState(text, context, anchor, kind_,
result->submatch, nsubmatch);
result->have_submatch = true;
break;
// All three RE2 engine values share this code path; each one calls
// RE2::Match with the same nsubmatch.
case kEngineRE2:
case kEngineRE2a:
case kEngineRE2b: {
// Skipped unless text ends exactly where context ends.
if (!re2_ || text.end() != context.end()) {
result->skipped = true;
break;
}
// Translate Prog anchoring into RE2 anchoring; a full-match kind
// overrides whatever anchor was requested.
RE2::Anchor re_anchor;
if (anchor == Prog::kAnchored)
re_anchor = RE2::ANCHOR_START;
else
re_anchor = RE2::UNANCHORED;
if (kind_ == Prog::kFullMatch)
re_anchor = RE2::ANCHOR_BOTH;
// text is passed as a pair of offsets into context.
result->matched = re2_->Match(context,
text.begin() - context.begin(),
text.end() - context.begin(),
re_anchor, result->submatch, nsubmatch);
result->have_submatch = nsubmatch > 0;
break;
}
case kEnginePCRE: {
// Skipped unless text covers exactly the same span as context.
if (!re_ || text.begin() != context.begin() ||
text.end() != context.end()) {
result->skipped = true;
break;
}
// Build the argument array DoMatch expects: one Arg per submatch slot,
// each writing into the corresponding result->submatch entry.
// Both arrays are freed on every exit path below.
const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
PCRE::Arg *a = new PCRE::Arg[nsubmatch];
for (int i = 0; i < nsubmatch; i++) {
a[i] = PCRE::Arg(&result->submatch[i]);
argptr[i] = &a[i];
}
int consumed;
PCRE::Anchor pcre_anchor;
if (anchor == Prog::kAnchored)
pcre_anchor = PCRE::ANCHOR_START;
else
pcre_anchor = PCRE::UNANCHORED;
if (kind_ == Prog::kFullMatch)
pcre_anchor = PCRE::ANCHOR_BOTH;
re_->ClearHitLimit();
result->matched =
re_->DoMatch(text,
pcre_anchor,
&consumed,
argptr, nsubmatch);
// If PCRE hit its limit, flag the answer as untrusted and stop here
// (have_submatch stays false).
if (re_->HitLimit()) {
result->untrusted = true;
delete[] argptr;
delete[] a;
break;
}
result->have_submatch = true;
// Work around RE interface bug: PCRE returns -1 as the
// offsets for an unmatched subexpression, and RE should
// turn that into StringPiece(NULL) but in fact it uses
// StringPiece(text.begin() - 1, 0). Oops.
for (int i = 0; i < nsubmatch; i++)
if (result->submatch[i].begin() == text.begin() - 1)
result->submatch[i] = NULL;
delete[] argptr;
delete[] a;
break;
}
}
// A failed search reports no submatches, whatever the engine left behind.
if (!result->matched)
memset(result->submatch, 0, sizeof result->submatch);
}
// Decides whether engine result r is acceptable given the reference
// result `correct`.  A skipped result is always acceptable.  Otherwise the
// match/no-match answers must agree, and so must whatever submatches r
// claims to have: every slot when r.have_submatch is set, only slot 0
// when just r.have_submatch0 is set.
static bool ResultOkay(const Result& r, const Result& correct) {
  if (r.skipped)
    return true;
  if (r.matched != correct.matched)
    return false;
  if (!r.have_submatch && !r.have_submatch0)
    return true;

  // With only $0 available, just the first slot is meaningful.
  const int ncheck = r.have_submatch ? kMaxSubmatch : 1;
  for (int k = 0; k < ncheck; k++) {
    const bool same_start =
        correct.submatch[k].begin() == r.submatch[k].begin();
    const bool same_size =
        correct.submatch[k].size() == r.submatch[k].size();
    if (!same_start || !same_size)
      return false;
  }
  return true;
}
// Runs a single test.
bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
Prog::Anchor anchor) {
// Backtracking is the gold standard.
Result correct;
RunSearch(kEngineBacktrack, text, context, anchor, &correct);
if (correct.skipped) {
if (regexp_ == NULL)
return true;
LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
<< " " << FormatMode(flags_);
return false;
}
VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
<< " text " << CEscape(text)
<< " (" << FormatKind(kind_)
<< ", " << FormatAnchor(anchor)
<< ", " << FormatMode(flags_)
<< ")";
// Compare the others.
bool all_okay = true;
for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
if (!(Engines() & (1<<i)))
continue;
Result r;
RunSearch(i, text, context, anchor, &r);
if (ResultOkay(r, correct)) {
if (FLAGS_log_okay)
LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
continue;
}
// We disagree with PCRE on the meaning of some Unicode matches.
// In particular, we treat all non-ASCII UTF-8 as word characters.
// We also treat "empty" character sets like [^\w\W] as being
// impossible to match, while PCRE apparently excludes some code
// points (e.g., 0x0080) from both \w and \W.
if (i == kEnginePCRE && NonASCII(text))
continue;
if (!r.untrusted)
all_okay = false;
LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
context, anchor);
if (r.matched != correct.matched) {
if (r.matched) {
LOG(INFO) << " Should not match (but does).";
} else {
LOG(INFO) << " Should match (but does not).";
continue;
}
}
for (int i = 0; i < 1+num_captures_; i++) {
if (r.submatch[i].begin() != correct.submatch[i].begin() ||
r.submatch[i].end() != correct.submatch[i].end()) {
LOG(INFO) <<
StringPrintf(" $%d: should be %s is %s",
i,
FormatCapture(text, correct.submatch[i]).c_str(),
FormatCapture(text, r.submatch[i]).c_str());
} else {
LOG(INFO) <<
StringPrintf(" $%d: %s ok", i,
FormatCapture(text, r.submatch[i]).c_str());
}
}
}
if (!all_okay) {
if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
LOG(QFATAL) << "Too many regexp failures.";
}
return all_okay;
}
// Logs one INFO line describing a search: the given prefix, the engine
// name, the regexp both as originally written and as regexp_->ToString()
// renders it, the text together with its begin/end offsets inside context,
// the context itself, and the match kind, anchoring and parse mode.
void TestInstance::LogMatch(const char* prefix, Engine e,
const StringPiece& text, const StringPiece& context,
Prog::Anchor anchor) {
LOG(INFO) << prefix
<< EngineString(e)
<< " regexp "
<< CEscape(regexp_str_)
<< " "
<< CEscape(regexp_->ToString())
<< " text "
<< CEscape(text)
<< " ("
<< text.begin() - context.begin()
<< ","
<< text.end() - context.begin()
<< ") of context "
<< CEscape(context)
<< " (" << FormatKind(kind_)
<< ", " << FormatAnchor(anchor)
<< ", " << FormatMode(flags_)
<< ")";
}
// Match kinds exercised for every regexp.
static Prog::MatchKind kinds[] = {
  Prog::kFirstMatch,
  Prog::kLongestMatch,
  Prog::kFullMatch,
};

// Test all possible match kinds and parse modes.
// Creates one TestInstance per (match kind, parse mode) pair, keeps them
// all in v_, and records in error_ whether any of them reported an error.
Tester::Tester(const StringPiece& regexp) {
  error_ = false;
  for (int k = 0; k < arraysize(kinds); k++) {
    const Prog::MatchKind kind = kinds[k];
    for (int m = 0; m < arraysize(parse_modes); m++) {
      TestInstance* instance =
          new TestInstance(regexp, kind, parse_modes[m].parse_flags);
      if (instance->error())
        error_ = true;
      v_.push_back(instance);
    }
  }
}
// Deletes every TestInstance created by the constructor.
Tester::~Tester() {
  for (size_t idx = 0; idx < v_.size(); idx++) {
    delete v_[idx];
  }
}
bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
Prog::Anchor anchor) {
bool okay = true;
for (int i = 0; i < v_.size(); i++)
okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
return okay;
}
// Anchoring modes that TestInputInContext tries for every (text, context)
// pair.
static Prog::Anchor anchors[] = {
Prog::kAnchored,
Prog::kUnanchored
};
bool Tester::TestInput(const StringPiece& text) {
bool okay = TestInputInContext(text, text);
if (text.size() > 0) {
StringPiece sp;
sp = text;
sp.remove_prefix(1);
okay &= TestInputInContext(sp, text);
sp = text;
sp.remove_suffix(1);
okay &= TestInputInContext(sp, text);
}
return okay;
}
// Runs TestCase(text, context, anchor) once for each entry of anchors.
// All anchoring modes are tried even after one fails.
bool Tester::TestInputInContext(const StringPiece& text,
                                const StringPiece& context) {
  bool okay = true;
  for (int a = 0; a < arraysize(anchors); a++) {
    if (!TestCase(text, context, anchors[a]))
      okay = false;
  }
  return okay;
}
bool TestRegexpOnText(const StringPiece& regexp,
const StringPiece& text) {
Tester t(regexp);
return t.TestInput(text);
}
} // namespace re2

View File

@ -0,0 +1,121 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Comparative tester for regular expression matching.
// Checks all implementations against each other.
#ifndef RE2_TESTING_TESTER_H__
#define RE2_TESTING_TESTER_H__
#include "re2/stringpiece.h"
#include "re2/prog.h"
#include "re2/regexp.h"
#include "re2/re2.h"
#include "util/pcre.h"
namespace re2 {
class Regexp;
// All the supported regexp engines.
// The values are consecutive starting at 0, so they can be used as bit
// positions (1<<engine) and iterated up to kEngineMax.
enum Engine {
  kEngineBacktrack = 0,  // Prog::UnsafeSearchBacktrack
  kEngineNFA,            // Prog::SearchNFA
  kEngineDFA,            // Prog::SearchDFA, only ask whether it matched
  kEngineDFA1,           // Prog::SearchDFA, ask for match[0]
  kEngineOnePass,        // Prog::SearchOnePass, if applicable
  kEngineBitState,       // Prog::SearchBitState
  kEngineRE2,            // RE2, all submatches
  kEngineRE2a,           // RE2, only ask for match[0]
  kEngineRE2b,           // RE2, only ask whether it matched
  kEnginePCRE,           // PCRE (util/pcre.h)
  kEngineMax,
};

// Make normal math on the enum preserve the type.
// By default, C++ doesn't define ++ on enum, and e+1 has type int.

// Postfix increment: advances e to the next enumerator.  Returns nothing.
static inline void operator++(Engine& e, int unused) {
  const int next = static_cast<int>(e) + 1;
  e = static_cast<Engine>(next);
}

// Engine + int, yielding an Engine instead of an int.
static inline Engine operator+(Engine e, int i) {
  const int sum = static_cast<int>(e) + i;
  return static_cast<Engine>(sum);
}
// A TestInstance caches per-regexp state for a given
// regular expression in a given configuration
// (UTF-8 vs Latin1, longest vs first match, etc.).
class TestInstance {
 public:
  struct Result;

  TestInstance(const StringPiece& regexp, Prog::MatchKind kind,
               Regexp::ParseFlags flags);
  ~TestInstance();

  // Parse flags this instance was created with.
  Regexp::ParseFlags flags() { return flags_; }
  // Whether the constructor recorded an error.
  bool error() { return error_; }

  // Runs a single test case: search in text, which is in context,
  // using the given anchoring.
  bool RunCase(const StringPiece& text, const StringPiece& context,
               Prog::Anchor anchor);

 private:
  // Runs a single search using the named engine type.
  void RunSearch(Engine type,
                 const StringPiece& text, const StringPiece& context,
                 Prog::Anchor anchor,
                 Result *result);

  // Logs a one-line description of a search, starting with prefix.
  void LogMatch(const char* prefix, Engine e, const StringPiece& text,
                const StringPiece& context, Prog::Anchor anchor);

  // Held by value, not by reference: the constructor takes a
  // const StringPiece&, which may be bound to a temporary that is gone by
  // the time RunCase or LogMatch read this member.  The copy still points
  // at the caller's character data, which must outlive this object.
  const StringPiece regexp_str_;    // regexp being tested
  Prog::MatchKind kind_;            // kind of match
  Regexp::ParseFlags flags_;        // flags for parsing regexp_str_
  bool error_;                      // error during constructor?

  Regexp* regexp_;                  // parsed regexp
  int num_captures_;                // regexp_->NumCaptures() cached
  Prog* prog_;                      // compiled program
  Prog* rprog_;                     // compiled reverse program
  PCRE* re_;                        // PCRE implementation
  RE2* re2_;                        // RE2 implementation

  DISALLOW_EVIL_CONSTRUCTORS(TestInstance);
};
// A group of TestInstances for all possible configurations.
class Tester {
 public:
  explicit Tester(const StringPiece& regexp);
  ~Tester();

  // True if any of the underlying TestInstances reported an error
  // while being constructed.
  bool error() { return error_; }

  // Runs a single test case: search in text, which is in context,
  // using the given anchoring.
  bool TestCase(const StringPiece& text, const StringPiece& context,
                Prog::Anchor anchor);

  // Run TestCase(text, text, anchor) for all anchoring modes.
  bool TestInput(const StringPiece& text);

  // Run TestCase(text, context, anchor) for all anchoring modes.
  bool TestInputInContext(const StringPiece& text, const StringPiece& context);

 private:
  bool error_;                  // did any instance fail to set up?
  vector<TestInstance*> v_;     // one instance per configuration

  DISALLOW_EVIL_CONSTRUCTORS(Tester);
};
// Run all possible tests using regexp and text.
bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text);
} // namespace re2
#endif // RE2_TESTING_TESTER_H__

View File

@ -0,0 +1,207 @@
#!/usr/bin/python2.4
#
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
"""Unittest for the util/regexp/re2/unicode.py module."""
import os
import StringIO
from google3.pyglib import flags
from google3.testing.pybase import googletest
from google3.util.regexp.re2 import unicode
_UNICODE_DIR = os.path.join(flags.FLAGS.test_srcdir, "google3", "third_party",
"unicode", "ucd-5.1.0")
class ConvertTest(googletest.TestCase):
  """Checks the string <-> code point conversion helpers."""

  def testUInt(self):
    good = (("0000", 0x0000), ("263A", 0x263A), ("10FFFF", 0x10FFFF))
    for text, want in good:
      self.assertEquals(want, unicode._UInt(text))
    # Too few digits, too many digits, and a value that must be rejected.
    for text in ("263", "263AAAA", "110000"):
      self.assertRaises(unicode.InputError, unicode._UInt, text)

  def testURange(self):
    self.assertEquals([1, 2, 3], unicode._URange("0001..0003"))
    self.assertEquals([1], unicode._URange("0001"))
    # Three endpoints, a reversed range, and a range of a single value.
    for text in ("0001..0003..0005", "0003..0001", "0001..0001"):
      self.assertRaises(unicode.InputError, unicode._URange, text)

  def testUStr(self):
    good = ((0x263a, "0x263A"), (0x10FFFF, "0x10FFFF"))
    for value, want in good:
      self.assertEquals(want, unicode._UStr(value))
    for value in (0x110000, -1):
      self.assertRaises(unicode.InputError, unicode._UStr, value)
_UNICODE_TABLE = """# Commented line, should be ignored.
# The next line is blank and should be ignored.
0041;Capital A;Line 1
0061..007A;Lowercase;Line 2
1F00;<Greek, First>;Ignored
1FFE;<Greek, Last>;Line 3
10FFFF;Runemax;Line 4
0000;Zero;Line 5
"""
_BAD_TABLE1 = """
111111;Not a code point;
"""
_BAD_TABLE2 = """
0000;<Zero, First>;Missing <Zero, Last>
"""
_BAD_TABLE3 = """
0010..0001;Bad range;
"""
class AbortError(Exception):
  """Function should not have been called."""


def Abort(*unused_args):
  """Callback that must never run; raises AbortError if it is called.

  It is handed to ReadUnicodeTable as the per-line callback in tests where
  the reader is expected to fail first.  It accepts (and ignores) any
  positional arguments so that being called as doline(codes, fields)
  raises AbortError rather than a TypeError about the argument count.

  Raises:
    AbortError: always.
  """
  raise AbortError("Abort")
def StringTable(s, n, f):
  """Runs unicode.ReadUnicodeTable on the table text s via a StringIO.

  Args:
    s: contents of the table, as a string
    n: number of fields passed through to ReadUnicodeTable
    f: per-line callback passed through to ReadUnicodeTable
  """
  table = StringIO.StringIO(s)
  unicode.ReadUnicodeTable(table, n, f)
class ReadUnicodeTableTest(googletest.TestCase):
  """Test the ReadUnicodeTable function."""

  def testSimpleTable(self):
    # Expected (codes, fields[0], fields[1]) for each callback, in order.
    expected = [
        ([0x0041], "0041", "Capital A"),
        (range(0x0061, 0x007A + 1), "0061..007A", "Lowercase"),
        (range(0x1F00, 0x1FFE + 1), "1F00..1FFE", "Greek"),
        ([0x10FFFF], "10FFFF", "Runemax"),
        ([0x0000], "0000", "Zero"),
    ]
    ncall = [0]  # can't assign to ordinary int in DoLine

    def DoLine(codes, fields):
      self.assertEquals(3, len(fields))
      ncall[0] += 1
      self.assertEquals("Line %d" % (ncall[0],), fields[2])
      if ncall[0] <= len(expected):
        want_codes, want_range, want_name = expected[ncall[0] - 1]
        self.assertEquals(want_codes, codes)
        self.assertEquals(want_range, fields[0])
        self.assertEquals(want_name, fields[1])

    StringTable(_UNICODE_TABLE, 3, DoLine)
    self.assertEquals(5, ncall[0])

  def testErrorTables(self):
    # Each (table, nfields) pair must be rejected with InputError.
    cases = [
        (_UNICODE_TABLE, 4),
        (_UNICODE_TABLE, 2),
        (_BAD_TABLE1, 3),
        (_BAD_TABLE2, 3),
        (_BAD_TABLE3, 3),
    ]
    for table, nfields in cases:
      self.assertRaises(unicode.InputError, StringTable, table, nfields, Abort)
class ParseContinueTest(googletest.TestCase):
  """Test the ParseContinue function."""

  def testParseContinue(self):
    # (input, expected result) pairs; an unrecognized tag is returned whole.
    cases = [
        ("<Private Use, First>", ("Private Use", "First")),
        ("<Private Use, Last>", ("Private Use", "Last")),
        ("<Private Use, Blah>", ("<Private Use, Blah>", None)),
    ]
    for text, want in cases:
      self.assertEquals(want, unicode._ParseContinue(text))
class CaseGroupsTest(googletest.TestCase):
  """Test the CaseGroups function (and the CaseFoldingReader)."""

  def FindGroup(self, c):
    """Returns the first group in self.groups containing c, or None.

    Args:
      c: a code point, or a one-character string
    """
    if type(c) == str:
      c = ord(c)
    for g in self.groups:
      if c in g:
        return g
    return None

  def testCaseGroups(self):
    # CaseGroups returns a (togroup, groups) pair; FindGroup searches the
    # list of groups, so keep only the second element.
    _, self.groups = unicode.CaseGroups(unicode_dir=_UNICODE_DIR)
    self.assertEquals([ord("A"), ord("a")], self.FindGroup("a"))
    self.assertEquals(None, self.FindGroup("0"))
class ScriptsTest(googletest.TestCase):
  """Test the Scripts function (and the ScriptsReader)."""

  def FindScript(self, c):
    """Returns the name of a script whose code list contains c, or None.

    Args:
      c: a code point, or a one-character string
    """
    if type(c) == str:
      c = ord(c)
    for script, codes in self.scripts.items():
      if c in codes:
        return script
    return None

  def testScripts(self):
    self.scripts = unicode.Scripts(unicode_dir=_UNICODE_DIR)
    expectations = (("a", "Latin"), ("0", "Common"), (0xFFFE, None))
    for c, want in expectations:
      self.assertEquals(want, self.FindScript(c))
class CategoriesTest(googletest.TestCase):
  """Test the Categories function (and the UnicodeDataReader)."""

  def FindCategory(self, c):
    """Returns a category containing c, preferring two-letter names.

    A category whose name is longer than one character (e.g. Nd) is returned
    as soon as it is found; otherwise the first one-letter category that
    contains c is returned, or None if no category contains it.

    Args:
      c: a code point, or a one-character string
    """
    if type(c) == str:
      c = ord(c)
    fallback = None
    for category, codes in self.categories.items():
      if c not in codes:
        continue
      # prefer category Nd over N
      if len(category) > 1:
        return category
      if fallback == None:
        fallback = category
    return fallback

  def testCategories(self):
    self.categories = unicode.Categories(unicode_dir=_UNICODE_DIR)
    self.assertEquals("Ll", self.FindCategory("a"))
    self.assertEquals("Nd", self.FindCategory("0"))
    self.assertEquals("Lo", self.FindCategory(0xAD00))  # in First, Last range
    self.assertEquals(None, self.FindCategory(0xFFFE))
    for code in (0x8B5A, 0x6C38, 0x92D2):
      self.assertEquals("Lo", self.FindCategory(code))
    # Codes must also be listed under the one-letter parent category.
    self.assertTrue(ord("a") in self.categories["L"])
    self.assertTrue(ord("0") in self.categories["N"])
    for code in (0x8B5A, 0x6C38, 0x92D2):
      self.assertTrue(code in self.categories["L"])
def main():
  """Runs the tests in this module through googletest."""
  googletest.main()
if __name__ == "__main__":
main()

341
outside/re2/re2/tostring.cc Normal file
View File

@ -0,0 +1,341 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Format a regular expression structure as a string.
// Tested by parse_test.cc
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Precedence levels, passed to the walker as the context argument.
// PreVisit compares the context level against the level of the node being
// printed (e.g. prec < PrecConcat) and wraps the node in (?: ) when the
// context level is the smaller of the two.  PostVisit uses PrecEmpty to
// decide whether an empty match must be spelled out as "(?:)", children of
// a capture group are visited with PrecParen, and Regexp::ToString starts
// the walk at PrecToplevel.
enum {
PrecAtom,
PrecUnary,
PrecConcat,
PrecAlternate,
PrecEmpty,
PrecParen,
PrecToplevel,
};
// Helper function. See description below.
static void AppendCCRange(string* t, Rune lo, Rune hi);
// Walker that appends the textual form of a regexp to *t_.
// The arg values are integers giving the context precedence
// (one of the Prec* constants).
// The child_args are always NULL.
class ToStringWalker : public Regexp::Walker<int> {
 public:
  explicit ToStringWalker(string* t) : t_(t) {}

  // Called before a node's children: opens any needed group and returns
  // the precedence the children are visited with.
  virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);

  // Called after a node's children: appends the node's own text, suffix
  // operators and closing parenthesis.
  virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
                        int* child_args, int nchild_args);

  // Appends nothing and returns 0.
  virtual int ShortVisit(Regexp* re, int parent_arg) {
    return 0;
  }

 private:
  string* t_;  // The string the walker appends to.

  DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker);
};
// Formats this regexp as a string by running a ToStringWalker over it,
// starting at top-level precedence.  100000 is handed to WalkExponential
// as its limit argument (presumably a visit budget -- confirm against
// walker-inl.h); if the walker reports that it stopped early,
// " [truncated]" is appended to the result.
string Regexp::ToString() {
  string out;
  ToStringWalker walker(&out);
  walker.WalkExponential(this, PrecToplevel, 100000);
  if (walker.stopped_early()) {
    out += " [truncated]";
  }
  return out;
}
#define ToString DontCallToString // Avoid accidental recursion.
// Visits re before children are processed.
// Appends ( if needed and passes new precedence to children.
// The return value is the precedence the children will see as context.
int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
  const int context_prec = parent_arg;

  switch (re->op()) {
    // Childless nodes: nothing to open here.
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpLiteral:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpBeginText:
    case kRegexpEndText:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpCharClass:
    case kRegexpHaveMatch:
      return PrecAtom;

    case kRegexpConcat:
    case kRegexpLiteralString:
      if (context_prec < PrecConcat)
        t_->append("(?:");
      return PrecConcat;

    case kRegexpAlternate:
      if (context_prec < PrecAlternate)
        t_->append("(?:");
      return PrecAlternate;

    case kRegexpCapture:
      t_->append("(");
      if (re->name()) {
        t_->append("?P<");
        t_->append(*re->name());
        t_->append(">");
      }
      return PrecParen;

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
    case kRegexpRepeat:
      if (context_prec < PrecUnary)
        t_->append("(?:");
      // The subprecedence here is PrecAtom instead of PrecUnary
      // because PCRE treats two unary ops in a row as a parse error.
      return PrecAtom;
  }

  // Any op not listed above is treated like an atom.
  return PrecAtom;
}
// Appends the literal rune r to *t.
// Regexp metacharacters get a backslash in front.  With foldcase set, a
// lowercase ASCII letter is written as a two-character class holding the
// upper- and lowercase forms, e.g. [Aa].  Everything else is delegated to
// AppendCCRange(t, r, r).
static void AppendLiteral(string *t, Rune r, bool foldcase) {
  const bool is_meta =
      r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r) != NULL;
  if (is_meta) {
    t->append(1, '\\');
    t->append(1, r);
    return;
  }

  if (foldcase && 'a' <= r && r <= 'z') {
    // r is known to be lowercase here; shift it to the uppercase letter.
    const Rune upper = r + ('A' - 'a');
    t->append(1, '[');
    t->append(1, upper);
    t->append(1, upper + 'a' - 'A');
    t->append(1, ']');
    return;
  }

  AppendCCRange(t, r, r);
}
// Visits re after children are processed.
// For childless regexps, all the work is done here.
// For regexps with children, append any unary suffixes or ).
// parent_arg is the context precedence the node was visited with; it
// decides whether a group opened by PreVisit has to be closed and whether
// a trailing | is owed to a parent alternation.  Always returns 0.
int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
                              int* child_args, int nchild_args) {
  int prec = parent_arg;
  switch (re->op()) {
    case kRegexpNoMatch:
      // There's no simple symbol for "no match", but
      // [^0-Runemax] excludes everything.
      t_->append("[^\\x00-\\x{10ffff}]");
      break;

    case kRegexpEmptyMatch:
      // Append (?:) to make empty string visible,
      // unless this is already being parenthesized.
      if (prec < PrecEmpty)
        t_->append("(?:)");
      break;

    case kRegexpLiteral:
      AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
      break;

    case kRegexpLiteralString:
      for (int i = 0; i < re->nrunes(); i++)
        AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
      if (prec < PrecConcat)
        t_->append(")");
      break;

    case kRegexpConcat:
      if (prec < PrecConcat)
        t_->append(")");
      break;

    case kRegexpAlternate:
      // Clumsy but workable: the children all appended |
      // at the end of their strings, so just remove the last one.
      // Check for an empty string first so the index below is always valid,
      // and log the string itself rather than the pointer to it.
      if (!t_->empty() && (*t_)[t_->size()-1] == '|')
        t_->erase(t_->size()-1);
      else
        LOG(DFATAL) << "Bad final char: " << *t_;
      if (prec < PrecAlternate)
        t_->append(")");
      break;

    case kRegexpStar:
      t_->append("*");
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpPlus:
      t_->append("+");
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpQuest:
      t_->append("?");
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpRepeat:
      // A max of -1 is printed as an open-ended {min,}.
      if (re->max() == -1)
        t_->append(StringPrintf("{%d,}", re->min()));
      else if (re->min() == re->max())
        t_->append(StringPrintf("{%d}", re->min()));
      else
        t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpAnyChar:
      t_->append(".");
      break;

    case kRegexpAnyByte:
      t_->append("\\C");
      break;

    case kRegexpBeginLine:
      t_->append("^");
      break;

    case kRegexpEndLine:
      t_->append("$");
      break;

    case kRegexpBeginText:
      t_->append("(?-m:^)");
      break;

    case kRegexpEndText:
      if (re->parse_flags() & Regexp::WasDollar)
        t_->append("(?-m:$)");
      else
        t_->append("\\z");
      break;

    case kRegexpWordBoundary:
      t_->append("\\b");
      break;

    case kRegexpNoWordBoundary:
      t_->append("\\B");
      break;

    case kRegexpCharClass: {
      if (re->cc()->size() == 0) {
        t_->append("[^\\x00-\\x{10ffff}]");
        break;
      }
      t_->append("[");
      // Heuristic: show class as negated if it contains the
      // non-character 0xFFFE.
      CharClass* cc = re->cc();
      if (cc->Contains(0xFFFE)) {
        cc = cc->Negate();
        t_->append("^");
      }
      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
        AppendCCRange(t_, i->lo, i->hi);
      // Negate() handed back a separate class; release it once printed.
      if (cc != re->cc())
        cc->Delete();
      t_->append("]");
      break;
    }

    case kRegexpCapture:
      t_->append(")");
      break;

    case kRegexpHaveMatch:
      // There's no syntax accepted by the parser to generate
      // this node (it is generated by RE2::Set) so make something
      // up that is readable but won't compile.
      // The text must be formatted first: string::append(const char*, n)
      // would treat the match id as a character count, not as a value
      // for %d.
      t_->append(StringPrintf("(?HaveMatch:%d)", re->match_id()));
      break;
  }

  // If the parent is an alternation, append the | for it.
  if (prec == PrecAlternate)
    t_->append("|");

  return 0;
}
// Appends a rune for use in a character class to the string t.
// Printable ASCII (0x20-0x7E) is written as itself, with a backslash
// first for the characters [ ] ^ - and backslash.  \r, \t, \n and \f use
// their usual escapes.  Anything else is written in hex: \xNN for values
// below 0x100 and \x{...} otherwise.
static void AppendCCChar(string* t, Rune r) {
  const bool printable = 0x20 <= r && r <= 0x7E;
  if (printable) {
    if (strchr("[]^-\\", r))
      t->append("\\");
    t->append(1, r);
    return;
  }

  const char* escape = NULL;
  switch (r) {
    case '\r': escape = "\\r"; break;
    case '\t': escape = "\\t"; break;
    case '\n': escape = "\\n"; break;
    case '\f': escape = "\\f"; break;
    default: break;
  }
  if (escape != NULL) {
    t->append(escape);
    return;
  }

  if (r < 0x100)
    StringAppendF(t, "\\x%02x", static_cast<int>(r));
  else
    StringAppendF(t, "\\x{%x}", static_cast<int>(r));
}
// Appends the character-class range lo-hi to t.
// An inverted range (lo > hi) appends nothing, and a range of a single
// rune is written as just that rune, without the dash.
static void AppendCCRange(string* t, Rune lo, Rune hi) {
  if (lo > hi)
    return;
  AppendCCChar(t, lo);
  if (!(lo < hi))
    return;
  t->append("-");
  AppendCCChar(t, hi);
}
} // namespace re2

297
outside/re2/re2/unicode.py Normal file
View File

@ -0,0 +1,297 @@
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
"""Parser for Unicode data files (as distributed by unicode.org)."""
import os
import re
import urllib2
# Directory or URL where Unicode tables reside.
_UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd"
# Largest valid Unicode code value.
_RUNE_MAX = 0x10FFFF
class Error(Exception):
  """Base class for the exceptions defined in this module."""


class InputError(Error):
  """Raised when Unicode input (a value, range or table line) is invalid."""
def _UInt(s):
  """Converts string to Unicode code point ('263A' => 0x263a).

  The string must be 4 to 6 hexadecimal digits and its value must not
  exceed _RUNE_MAX.

  Args:
    s: string to convert

  Returns:
    Unicode code point

  Raises:
    InputError: the string is not a valid Unicode value.
  """
  try:
    value = int(s, 16)
  except ValueError:
    value = -1  # not hexadecimal; rejected by the range check below
  bad_length = len(s) < 4 or len(s) > 6
  bad_value = value < 0 or value > _RUNE_MAX
  if bad_length or bad_value:
    raise InputError("invalid Unicode value %s" % (s,))
  return value
def _URange(s):
  """Converts string to Unicode range.

    '0001..0003' => [1, 2, 3].
    '0001' => [1].

  A range must be strictly increasing: 'X..X' and reversed ranges are
  rejected, as is anything with more than one '..'.

  Args:
    s: string to convert

  Returns:
    Unicode range

  Raises:
    InputError: the string is not a valid Unicode range.
  """
  pieces = s.split("..")
  npieces = len(pieces)
  if npieces == 1:
    return [_UInt(pieces[0])]
  if npieces == 2:
    lo, hi = [_UInt(piece) for piece in pieces]
    if lo < hi:
      return range(lo, hi + 1)
  raise InputError("invalid Unicode range %s" % (s,))
def _UStr(v):
  """Converts Unicode code point to hex string.

    0x263a => '0x263A'.

  The result is uppercase hex, zero-padded to at least four digits.

  Args:
    v: code point to convert

  Returns:
    Unicode string

  Raises:
    InputError: the argument is not a valid Unicode value.
  """
  out_of_range = v < 0 or v > _RUNE_MAX
  if out_of_range:
    raise InputError("invalid Unicode value %s" % (v,))
  return "0x%04X" % (v,)
def _ParseContinue(s):
"""Parses a Unicode continuation field.
These are of the form '<Name, First>' or '<Name, Last>'.
Instead of giving an explicit range in a single table entry,
some Unicode tables use two entries, one for the first
code value in the range and one for the last.
The first entry's description is '<Name, First>' instead of 'Name'
and the second is '<Name, Last>'.
'<Name, First>' => ('Name', 'First')
'<Name, Last>' => ('Name', 'Last')
'Anything else' => ('Anything else', None)
Args:
s: continuation field string
Returns:
pair: name and ('First', 'Last', or None)
"""
match = re.match("<(.*), (First|Last)>", s)
if match is not None:
return match.groups()
return (s, None)
def ReadUnicodeTable(filename, nfields, doline):
  """Generic Unicode table text file reader.

  The reader takes care of stripping out comments and also
  parsing the two different ways that the Unicode tables specify
  code ranges (using the .. notation and splitting the range across
  multiple lines).

  Each non-comment line in the table is expected to have the given
  number of fields.  The first field is known to be the Unicode value
  and the second field its description.

  The reader calls doline(codes, fields) for each entry in the table.
  If handling a line raises an exception (because the line is malformed
  or because doline raised), the reader prints that exception, prefixed
  with the file name and line number, and re-raises it immediately; the
  rest of the file is not processed.

  A file or URL opened by the reader is closed before it returns or
  raises.  A file-like object passed in by the caller is left open.

  Arguments:
    filename: the Unicode data file to read, or a file-like object.
      A string starting with "http://" is fetched with urllib2.
    nfields: the number of expected fields per line in that file.
    doline: the function to call for each table entry.

  Raises:
    InputError: nfields is invalid (must be >= 2), a line does not have
      nfields fields or holds an invalid code or range, or a First line
      is not followed by its matching Last line.
  """
  if nfields < 2:
    raise InputError("invalid number of fields %d" % (nfields,))

  # Only close what was opened here; a caller-supplied object stays open.
  opened_here = type(filename) == str
  if opened_here:
    if filename.startswith("http://"):
      fil = urllib2.urlopen(filename)
    else:
      fil = open(filename, "r")
  else:
    fil = filename

  first = None        # first code in multiline range
  expect_last = None  # tag expected for "Last" line in multiline range
  lineno = 0          # current line number
  try:
    for line in fil:
      lineno += 1
      try:
        # Chop # comments and white space; ignore empty lines.
        sharp = line.find("#")
        if sharp >= 0:
          line = line[:sharp]
        line = line.strip()
        if not line:
          continue

        # Split fields on ";", chop more white space.
        # Must have the expected number of fields.
        fields = [s.strip() for s in line.split(";")]
        if len(fields) != nfields:
          raise InputError("wrong number of fields %d %d - %s" %
                           (len(fields), nfields, line))

        # The Unicode text files have two different ways
        # to list a Unicode range.  Either the first field is
        # itself a range (0000..FFFF), or the range is split
        # across two lines, with the second field noting
        # the continuation.
        codes = _URange(fields[0])
        (name, cont) = _ParseContinue(fields[1])

        if expect_last is not None:
          # If the last line gave the First code in a range,
          # this one had better give the Last one.
          if (len(codes) != 1 or codes[0] <= first or
              cont != "Last" or name != expect_last):
            raise InputError("expected Last line for %s" %
                             (expect_last,))
          codes = range(first, codes[0] + 1)
          first = None
          expect_last = None
          fields[0] = "%04X..%04X" % (codes[0], codes[-1])
          fields[1] = name
        elif cont == "First":
          # Otherwise, if this is the First code in a range,
          # remember it and go to the next line.
          if len(codes) != 1:
            raise InputError("bad First line: range given")
          expect_last = name
          first = codes[0]
          continue

        doline(codes, fields)

      except Exception as e:
        # Say where the failure happened, then let it propagate.
        print("%s:%d: %s" % (filename, lineno, e))
        raise

    if expect_last is not None:
      raise InputError("expected Last line for %s; got EOF" %
                       (expect_last,))
  finally:
    if opened_here:
      fil.close()
def CaseGroups(unicode_dir=_UNICODE_DIR):
  """Returns the Unicode code groups equivalent under case folding.

  Each group is a sorted list of code points,
  and the list of groups is sorted by first code point
  in the group.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    pair (togroup, groups): togroup is a dict mapping each folded
    (lowercase) code point to its group, and groups is the sorted list
    of those same group lists.
  """
  # Dict mapping lowercase code point to fold-equivalent group.
  togroup = {}

  def DoLine(codes, fields):
    """Process single CaseFolding.txt line, updating togroup."""
    (_, foldtype, lower, _) = fields
    # Only lines whose fold type is C or S are used.
    if foldtype not in ("C", "S"):
      return
    lower = _UInt(lower)
    togroup.setdefault(lower, [lower]).extend(codes)

  ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)

  # Copy into a real list so it can be sorted even where dict.values()
  # returns a view; the group lists themselves are shared with togroup.
  groups = list(togroup.values())
  for g in groups:
    g.sort()
  groups.sort()
  return togroup, groups
def Scripts(unicode_dir=_UNICODE_DIR):
  """Returns dict mapping script names to code lists.

  The data is read from Scripts.txt under unicode_dir.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping script names to code lists
  """
  by_script = {}

  def DoLine(codes, fields):
    """Process single Scripts.txt line, updating by_script."""
    _, script_name = fields
    if script_name not in by_script:
      by_script[script_name] = []
    by_script[script_name].extend(codes)

  ReadUnicodeTable(unicode_dir + "/Scripts.txt", 2, DoLine)
  return by_script
def Categories(unicode_dir=_UNICODE_DIR):
  """Returns dict mapping category names to code lists.

  The data is read from UnicodeData.txt under unicode_dir.  Codes in a
  category with a multi-character name (e.g. Lu) are also listed under
  the category named by its first character (e.g. L).

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping category names to code lists
  """
  by_category = {}

  def DoLine(codes, fields):
    """Process single UnicodeData.txt line, updating by_category."""
    category = fields[2]
    names = [category]
    # Add codes from Lu into L, etc.
    if len(category) > 1:
      names.append(category[0])
    for name in names:
      by_category.setdefault(name, []).extend(codes)

  ReadUnicodeTable(unicode_dir + "/UnicodeData.txt", 15, DoLine)
  return by_category

View File

@ -0,0 +1,480 @@
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
// make_unicode_casefold.py >unicode_casefold.cc
#include "re2/unicode_casefold.h"
namespace re2 {
// 1034 groups, 2089 pairs, 289 ranges
const CaseFold unicode_casefold[] = {
{ 65, 90, 32 },
{ 97, 106, -32 },
{ 107, 107, 8383 },
{ 108, 114, -32 },
{ 115, 115, 268 },
{ 116, 122, -32 },
{ 181, 181, 743 },
{ 192, 214, 32 },
{ 216, 222, 32 },
{ 223, 223, 7615 },
{ 224, 228, -32 },
{ 229, 229, 8262 },
{ 230, 246, -32 },
{ 248, 254, -32 },
{ 255, 255, 121 },
{ 256, 303, EvenOdd },
{ 306, 311, EvenOdd },
{ 313, 328, OddEven },
{ 330, 375, EvenOdd },
{ 376, 376, -121 },
{ 377, 382, OddEven },
{ 383, 383, -300 },
{ 384, 384, 195 },
{ 385, 385, 210 },
{ 386, 389, EvenOdd },
{ 390, 390, 206 },
{ 391, 392, OddEven },
{ 393, 394, 205 },
{ 395, 396, OddEven },
{ 398, 398, 79 },
{ 399, 399, 202 },
{ 400, 400, 203 },
{ 401, 402, OddEven },
{ 403, 403, 205 },
{ 404, 404, 207 },
{ 405, 405, 97 },
{ 406, 406, 211 },
{ 407, 407, 209 },
{ 408, 409, EvenOdd },
{ 410, 410, 163 },
{ 412, 412, 211 },
{ 413, 413, 213 },
{ 414, 414, 130 },
{ 415, 415, 214 },
{ 416, 421, EvenOdd },
{ 422, 422, 218 },
{ 423, 424, OddEven },
{ 425, 425, 218 },
{ 428, 429, EvenOdd },
{ 430, 430, 218 },
{ 431, 432, OddEven },
{ 433, 434, 217 },
{ 435, 438, OddEven },
{ 439, 439, 219 },
{ 440, 441, EvenOdd },
{ 444, 445, EvenOdd },
{ 447, 447, 56 },
{ 452, 452, EvenOdd },
{ 453, 453, OddEven },
{ 454, 454, -2 },
{ 455, 455, OddEven },
{ 456, 456, EvenOdd },
{ 457, 457, -2 },
{ 458, 458, EvenOdd },
{ 459, 459, OddEven },
{ 460, 460, -2 },
{ 461, 476, OddEven },
{ 477, 477, -79 },
{ 478, 495, EvenOdd },
{ 497, 497, OddEven },
{ 498, 498, EvenOdd },
{ 499, 499, -2 },
{ 500, 501, EvenOdd },
{ 502, 502, -97 },
{ 503, 503, -56 },
{ 504, 543, EvenOdd },
{ 544, 544, -130 },
{ 546, 563, EvenOdd },
{ 570, 570, 10795 },
{ 571, 572, OddEven },
{ 573, 573, -163 },
{ 574, 574, 10792 },
{ 575, 576, 10815 },
{ 577, 578, OddEven },
{ 579, 579, -195 },
{ 580, 580, 69 },
{ 581, 581, 71 },
{ 582, 591, EvenOdd },
{ 592, 592, 10783 },
{ 593, 593, 10780 },
{ 594, 594, 10782 },
{ 595, 595, -210 },
{ 596, 596, -206 },
{ 598, 599, -205 },
{ 601, 601, -202 },
{ 603, 603, -203 },
{ 608, 608, -205 },
{ 611, 611, -207 },
{ 613, 613, 42280 },
{ 614, 614, 42308 },
{ 616, 616, -209 },
{ 617, 617, -211 },
{ 619, 619, 10743 },
{ 623, 623, -211 },
{ 625, 625, 10749 },
{ 626, 626, -213 },
{ 629, 629, -214 },
{ 637, 637, 10727 },
{ 640, 640, -218 },
{ 643, 643, -218 },
{ 648, 648, -218 },
{ 649, 649, -69 },
{ 650, 651, -217 },
{ 652, 652, -71 },
{ 658, 658, -219 },
{ 837, 837, 84 },
{ 880, 883, EvenOdd },
{ 886, 887, EvenOdd },
{ 891, 893, 130 },
{ 902, 902, 38 },
{ 904, 906, 37 },
{ 908, 908, 64 },
{ 910, 911, 63 },
{ 913, 929, 32 },
{ 931, 931, 31 },
{ 932, 939, 32 },
{ 940, 940, -38 },
{ 941, 943, -37 },
{ 945, 945, -32 },
{ 946, 946, 30 },
{ 947, 948, -32 },
{ 949, 949, 64 },
{ 950, 951, -32 },
{ 952, 952, 25 },
{ 953, 953, 7173 },
{ 954, 954, 54 },
{ 955, 955, -32 },
{ 956, 956, -775 },
{ 957, 959, -32 },
{ 960, 960, 22 },
{ 961, 961, 48 },
{ 962, 962, EvenOdd },
{ 963, 965, -32 },
{ 966, 966, 15 },
{ 967, 968, -32 },
{ 969, 969, 7517 },
{ 970, 971, -32 },
{ 972, 972, -64 },
{ 973, 974, -63 },
{ 975, 975, 8 },
{ 976, 976, -62 },
{ 977, 977, 35 },
{ 981, 981, -47 },
{ 982, 982, -54 },
{ 983, 983, -8 },
{ 984, 1007, EvenOdd },
{ 1008, 1008, -86 },
{ 1009, 1009, -80 },
{ 1010, 1010, 7 },
{ 1012, 1012, -92 },
{ 1013, 1013, -96 },
{ 1015, 1016, OddEven },
{ 1017, 1017, -7 },
{ 1018, 1019, EvenOdd },
{ 1021, 1023, -130 },
{ 1024, 1039, 80 },
{ 1040, 1071, 32 },
{ 1072, 1103, -32 },
{ 1104, 1119, -80 },
{ 1120, 1153, EvenOdd },
{ 1162, 1215, EvenOdd },
{ 1216, 1216, 15 },
{ 1217, 1230, OddEven },
{ 1231, 1231, -15 },
{ 1232, 1319, EvenOdd },
{ 1329, 1366, 48 },
{ 1377, 1414, -48 },
{ 4256, 4293, 7264 },
{ 4295, 4295, 7264 },
{ 4301, 4301, 7264 },
{ 7545, 7545, 35332 },
{ 7549, 7549, 3814 },
{ 7680, 7776, EvenOdd },
{ 7777, 7777, 58 },
{ 7778, 7829, EvenOdd },
{ 7835, 7835, -59 },
{ 7838, 7838, -7615 },
{ 7840, 7935, EvenOdd },
{ 7936, 7943, 8 },
{ 7944, 7951, -8 },
{ 7952, 7957, 8 },
{ 7960, 7965, -8 },
{ 7968, 7975, 8 },
{ 7976, 7983, -8 },
{ 7984, 7991, 8 },
{ 7992, 7999, -8 },
{ 8000, 8005, 8 },
{ 8008, 8013, -8 },
{ 8017, 8017, 8 },
{ 8019, 8019, 8 },
{ 8021, 8021, 8 },
{ 8023, 8023, 8 },
{ 8025, 8025, -8 },
{ 8027, 8027, -8 },
{ 8029, 8029, -8 },
{ 8031, 8031, -8 },
{ 8032, 8039, 8 },
{ 8040, 8047, -8 },
{ 8048, 8049, 74 },
{ 8050, 8053, 86 },
{ 8054, 8055, 100 },
{ 8056, 8057, 128 },
{ 8058, 8059, 112 },
{ 8060, 8061, 126 },
{ 8064, 8071, 8 },
{ 8072, 8079, -8 },
{ 8080, 8087, 8 },
{ 8088, 8095, -8 },
{ 8096, 8103, 8 },
{ 8104, 8111, -8 },
{ 8112, 8113, 8 },
{ 8115, 8115, 9 },
{ 8120, 8121, -8 },
{ 8122, 8123, -74 },
{ 8124, 8124, -9 },
{ 8126, 8126, -7289 },
{ 8131, 8131, 9 },
{ 8136, 8139, -86 },
{ 8140, 8140, -9 },
{ 8144, 8145, 8 },
{ 8152, 8153, -8 },
{ 8154, 8155, -100 },
{ 8160, 8161, 8 },
{ 8165, 8165, 7 },
{ 8168, 8169, -8 },
{ 8170, 8171, -112 },
{ 8172, 8172, -7 },
{ 8179, 8179, 9 },
{ 8184, 8185, -128 },
{ 8186, 8187, -126 },
{ 8188, 8188, -9 },
{ 8486, 8486, -7549 },
{ 8490, 8490, -8415 },
{ 8491, 8491, -8294 },
{ 8498, 8498, 28 },
{ 8526, 8526, -28 },
{ 8544, 8559, 16 },
{ 8560, 8575, -16 },
{ 8579, 8580, OddEven },
{ 9398, 9423, 26 },
{ 9424, 9449, -26 },
{ 11264, 11310, 48 },
{ 11312, 11358, -48 },
{ 11360, 11361, EvenOdd },
{ 11362, 11362, -10743 },
{ 11363, 11363, -3814 },
{ 11364, 11364, -10727 },
{ 11365, 11365, -10795 },
{ 11366, 11366, -10792 },
{ 11367, 11372, OddEven },
{ 11373, 11373, -10780 },
{ 11374, 11374, -10749 },
{ 11375, 11375, -10783 },
{ 11376, 11376, -10782 },
{ 11378, 11379, EvenOdd },
{ 11381, 11382, OddEven },
{ 11390, 11391, -10815 },
{ 11392, 11491, EvenOdd },
{ 11499, 11502, OddEven },
{ 11506, 11507, EvenOdd },
{ 11520, 11557, -7264 },
{ 11559, 11559, -7264 },
{ 11565, 11565, -7264 },
{ 42560, 42605, EvenOdd },
{ 42624, 42647, EvenOdd },
{ 42786, 42799, EvenOdd },
{ 42802, 42863, EvenOdd },
{ 42873, 42876, OddEven },
{ 42877, 42877, -35332 },
{ 42878, 42887, EvenOdd },
{ 42891, 42892, OddEven },
{ 42893, 42893, -42280 },
{ 42896, 42899, EvenOdd },
{ 42912, 42921, EvenOdd },
{ 42922, 42922, -42308 },
{ 65313, 65338, 32 },
{ 65345, 65370, -32 },
{ 66560, 66599, 40 },
{ 66600, 66639, -40 },
};
const int num_unicode_casefold = 289;
// 1034 groups, 1055 pairs, 167 ranges
const CaseFold unicode_tolower[] = {
{ 65, 90, 32 },
{ 181, 181, 775 },
{ 192, 214, 32 },
{ 216, 222, 32 },
{ 256, 302, EvenOddSkip },
{ 306, 310, EvenOddSkip },
{ 313, 327, OddEvenSkip },
{ 330, 374, EvenOddSkip },
{ 376, 376, -121 },
{ 377, 381, OddEvenSkip },
{ 383, 383, -268 },
{ 385, 385, 210 },
{ 386, 388, EvenOddSkip },
{ 390, 390, 206 },
{ 391, 391, OddEven },
{ 393, 394, 205 },
{ 395, 395, OddEven },
{ 398, 398, 79 },
{ 399, 399, 202 },
{ 400, 400, 203 },
{ 401, 401, OddEven },
{ 403, 403, 205 },
{ 404, 404, 207 },
{ 406, 406, 211 },
{ 407, 407, 209 },
{ 408, 408, EvenOdd },
{ 412, 412, 211 },
{ 413, 413, 213 },
{ 415, 415, 214 },
{ 416, 420, EvenOddSkip },
{ 422, 422, 218 },
{ 423, 423, OddEven },
{ 425, 425, 218 },
{ 428, 428, EvenOdd },
{ 430, 430, 218 },
{ 431, 431, OddEven },
{ 433, 434, 217 },
{ 435, 437, OddEvenSkip },
{ 439, 439, 219 },
{ 440, 440, EvenOdd },
{ 444, 444, EvenOdd },
{ 452, 452, 2 },
{ 453, 453, OddEven },
{ 455, 455, 2 },
{ 456, 456, EvenOdd },
{ 458, 458, 2 },
{ 459, 475, OddEvenSkip },
{ 478, 494, EvenOddSkip },
{ 497, 497, 2 },
{ 498, 500, EvenOddSkip },
{ 502, 502, -97 },
{ 503, 503, -56 },
{ 504, 542, EvenOddSkip },
{ 544, 544, -130 },
{ 546, 562, EvenOddSkip },
{ 570, 570, 10795 },
{ 571, 571, OddEven },
{ 573, 573, -163 },
{ 574, 574, 10792 },
{ 577, 577, OddEven },
{ 579, 579, -195 },
{ 580, 580, 69 },
{ 581, 581, 71 },
{ 582, 590, EvenOddSkip },
{ 837, 837, 116 },
{ 880, 882, EvenOddSkip },
{ 886, 886, EvenOdd },
{ 902, 902, 38 },
{ 904, 906, 37 },
{ 908, 908, 64 },
{ 910, 911, 63 },
{ 913, 929, 32 },
{ 931, 939, 32 },
{ 962, 962, EvenOdd },
{ 975, 975, 8 },
{ 976, 976, -30 },
{ 977, 977, -25 },
{ 981, 981, -15 },
{ 982, 982, -22 },
{ 984, 1006, EvenOddSkip },
{ 1008, 1008, -54 },
{ 1009, 1009, -48 },
{ 1012, 1012, -60 },
{ 1013, 1013, -64 },
{ 1015, 1015, OddEven },
{ 1017, 1017, -7 },
{ 1018, 1018, EvenOdd },
{ 1021, 1023, -130 },
{ 1024, 1039, 80 },
{ 1040, 1071, 32 },
{ 1120, 1152, EvenOddSkip },
{ 1162, 1214, EvenOddSkip },
{ 1216, 1216, 15 },
{ 1217, 1229, OddEvenSkip },
{ 1232, 1318, EvenOddSkip },
{ 1329, 1366, 48 },
{ 4256, 4293, 7264 },
{ 4295, 4295, 7264 },
{ 4301, 4301, 7264 },
{ 7680, 7828, EvenOddSkip },
{ 7835, 7835, -58 },
{ 7838, 7838, -7615 },
{ 7840, 7934, EvenOddSkip },
{ 7944, 7951, -8 },
{ 7960, 7965, -8 },
{ 7976, 7983, -8 },
{ 7992, 7999, -8 },
{ 8008, 8013, -8 },
{ 8025, 8025, -8 },
{ 8027, 8027, -8 },
{ 8029, 8029, -8 },
{ 8031, 8031, -8 },
{ 8040, 8047, -8 },
{ 8072, 8079, -8 },
{ 8088, 8095, -8 },
{ 8104, 8111, -8 },
{ 8120, 8121, -8 },
{ 8122, 8123, -74 },
{ 8124, 8124, -9 },
{ 8126, 8126, -7173 },
{ 8136, 8139, -86 },
{ 8140, 8140, -9 },
{ 8152, 8153, -8 },
{ 8154, 8155, -100 },
{ 8168, 8169, -8 },
{ 8170, 8171, -112 },
{ 8172, 8172, -7 },
{ 8184, 8185, -128 },
{ 8186, 8187, -126 },
{ 8188, 8188, -9 },
{ 8486, 8486, -7517 },
{ 8490, 8490, -8383 },
{ 8491, 8491, -8262 },
{ 8498, 8498, 28 },
{ 8544, 8559, 16 },
{ 8579, 8579, OddEven },
{ 9398, 9423, 26 },
{ 11264, 11310, 48 },
{ 11360, 11360, EvenOdd },
{ 11362, 11362, -10743 },
{ 11363, 11363, -3814 },
{ 11364, 11364, -10727 },
{ 11367, 11371, OddEvenSkip },
{ 11373, 11373, -10780 },
{ 11374, 11374, -10749 },
{ 11375, 11375, -10783 },
{ 11376, 11376, -10782 },
{ 11378, 11378, EvenOdd },
{ 11381, 11381, OddEven },
{ 11390, 11391, -10815 },
{ 11392, 11490, EvenOddSkip },
{ 11499, 11501, OddEvenSkip },
{ 11506, 11506, EvenOdd },
{ 42560, 42604, EvenOddSkip },
{ 42624, 42646, EvenOddSkip },
{ 42786, 42798, EvenOddSkip },
{ 42802, 42862, EvenOddSkip },
{ 42873, 42875, OddEvenSkip },
{ 42877, 42877, -35332 },
{ 42878, 42886, EvenOddSkip },
{ 42891, 42891, OddEven },
{ 42893, 42893, -42280 },
{ 42896, 42898, EvenOddSkip },
{ 42912, 42920, EvenOddSkip },
{ 42922, 42922, -42308 },
{ 65313, 65338, 32 },
{ 66560, 66599, 40 },
};
const int num_unicode_tolower = 167;
} // namespace re2

View File

@ -0,0 +1,75 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Unicode case folding tables.
// The Unicode case folding tables encode the mapping from one Unicode point
// to the next largest Unicode point with equivalent folding. The largest
// point wraps back to the first. For example, the tables map:
//
// 'A' -> 'a'
// 'a' -> 'A'
//
// 'K' -> 'k'
// 'k' -> 'K' (U+212A, Kelvin sign)
// 'K' (U+212A) -> 'K'
//
// Like everything Unicode, these tables are big. If we represent the table
// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB.
// Most table entries look like the ones around them:
// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
// Instead of listing all the pairs explicitly, we make a list of ranges
// and deltas, so that the table entries for 'A' through 'Z' can be represented
// as a single entry { 'A', 'Z', +32 }.
//
// In addition to blocks that map to each other (A-Z mapping to a-z)
// there are blocks of pairs that individually map to each other
// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
// For those, the special delta value EvenOdd marks even/odd pairs
// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
//
// In this form, the table has 274 entries, about 3kB. If we were to split
// the table into one for 16-bit codes and an overflow table for larger ones,
// we could get it down to about 1.5kB, but that's not worth the complexity.
//
// The grouped form also allows for efficient fold range calculations
// rather than looping one character at a time.
#ifndef RE2_UNICODE_CASEFOLD_H__
#define RE2_UNICODE_CASEFOLD_H__
#include "util/util.h"
namespace re2 {
// Special CaseFold::delta values for ranges made of adjacent pairs
// rather than blocks at a fixed offset from each other.
enum {
  // Even/odd pairs: if even, add 1; if odd, subtract 1.
  EvenOdd = +1,
  // Odd/even pairs: the same pairing with the parity reversed.
  OddEven = -1,
  // NOTE(review): the Skip variants appear only in unicode_tolower;
  // presumably they apply the same pairing to every other code point in
  // the range -- confirm against ApplyFold.
  EvenOddSkip = 1 << 30,
  OddEvenSkip = EvenOddSkip + 1,
};
// One entry of a case folding table: a rule covering the code points
// lo through hi (e.g. { 'A', 'Z', +32 } for 'A' through 'Z').
// The member order matters: the tables are written as brace-initialized
// { lo, hi, delta } triples.
struct CaseFold {
// First code point covered by this entry.
uint32 lo;
// Last code point covered by this entry.
uint32 hi;
// Offset to add to a code point in the range, or one of the special
// values EvenOdd, OddEven, EvenOddSkip, OddEvenSkip.
int32 delta;
};
extern const CaseFold unicode_casefold[];
extern const int num_unicode_casefold;
extern const CaseFold unicode_tolower[];
extern const int num_unicode_tolower;
// Returns the CaseFold* in the tables that contains rune.
// If rune is not in the tables, returns the first CaseFold* after rune.
// If rune is larger than any value in the tables, returns NULL.
extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune);
// Returns the result of applying the fold f to the rune r.
extern Rune ApplyFold(const CaseFold *f, Rune r);
} // namespace re2
#endif // RE2_UNICODE_CASEFOLD_H__

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,64 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Unicode character groups.
// The codes get split into ranges of 16-bit codes
// and ranges of 32-bit codes. It would be simpler
// to use only 32-bit ranges, but these tables are large
// enough to warrant extra care.
//
// Using just 32-bit ranges gives 27 kB of data.
// Adding 16-bit ranges gives 18 kB of data.
// Adding an extra table of 16-bit singletons would reduce
// to 16.5 kB of data but make the data harder to use;
// we don't bother.
#ifndef RE2_UNICODE_GROUPS_H__
#define RE2_UNICODE_GROUPS_H__
#include "util/util.h"
namespace re2 {
// A range of 16-bit code points.
// NOTE(review): lo and hi are presumably both inclusive -- confirm
// against the code that consumes these tables.
struct URange16
{
uint16 lo;
uint16 hi;
};
// A range of 32-bit code points, for codes that do not fit a URange16.
// NOTE(review): lo and hi are presumably both inclusive -- confirm
// against the code that consumes these tables.
struct URange32
{
uint32 lo;
uint32 hi;
};
// A named character group whose codes are split into a table of
// 16-bit ranges and a table of 32-bit ranges.
struct UGroup
{
// Group name, e.g. "Nd", "Han", "[:alpha:]" or "\\d".
const char *name;
int sign; // +1 for [abc], -1 for [^abc]
// The 16-bit ranges; nr16 is presumably the number of entries in r16.
const URange16 *r16;
int nr16;
// The 32-bit ranges; nr32 is presumably the number of entries in r32.
const URange32 *r32;
int nr32;
};
// Named by property or script name (e.g., "Nd", "N", "Han").
// Negated groups are not included.
extern const UGroup unicode_groups[];
extern const int num_unicode_groups;
// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]").
// Negated groups are included.
extern const UGroup posix_groups[];
extern const int num_posix_groups;
// Named by Perl name (e.g., "\\d", "\\D").
// Negated groups are included.
extern const UGroup perl_groups[];
extern const int num_perl_groups;
} // namespace re2
#endif // RE2_UNICODE_GROUPS_H__

View File

@ -0,0 +1,344 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_VARIADIC_FUNCTION_H_
#define RE2_VARIADIC_FUNCTION_H_
namespace re2 {
template <typename Result, typename Param0, typename Param1, typename Arg,
Result (*Func)(Param0, Param1, const Arg* const [], int count)>
class VariadicFunction2 {
public:
Result operator()(Param0 p0, Param1 p1) const {
return Func(p0, p1, 0, 0);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0) const {
const Arg* const args[] = { &a0 };
return Func(p0, p1, args, 1);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1) const {
const Arg* const args[] = { &a0, &a1 };
return Func(p0, p1, args, 2);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2) const {
const Arg* const args[] = { &a0, &a1, &a2 };
return Func(p0, p1, args, 3);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3 };
return Func(p0, p1, args, 4);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4 };
return Func(p0, p1, args, 5);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5 };
return Func(p0, p1, args, 6);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6 };
return Func(p0, p1, args, 7);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7 };
return Func(p0, p1, args, 8);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8 };
return Func(p0, p1, args, 9);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9 };
return Func(p0, p1, args, 10);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10 };
return Func(p0, p1, args, 11);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11 };
return Func(p0, p1, args, 12);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12 };
return Func(p0, p1, args, 13);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13 };
return Func(p0, p1, args, 14);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14 };
return Func(p0, p1, args, 15);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15 };
return Func(p0, p1, args, 16);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16 };
return Func(p0, p1, args, 17);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17 };
return Func(p0, p1, args, 18);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18 };
return Func(p0, p1, args, 19);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19 };
return Func(p0, p1, args, 20);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19,
&a20 };
return Func(p0, p1, args, 21);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21 };
return Func(p0, p1, args, 22);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22 };
return Func(p0, p1, args, 23);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23 };
return Func(p0, p1, args, 24);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24 };
return Func(p0, p1, args, 25);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25 };
return Func(p0, p1, args, 26);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26 };
return Func(p0, p1, args, 27);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27 };
return Func(p0, p1, args, 28);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28 };
return Func(p0, p1, args, 29);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29 };
return Func(p0, p1, args, 30);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
const Arg& a30) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30 };
return Func(p0, p1, args, 31);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
const Arg& a30, const Arg& a31) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30, &a31 };
return Func(p0, p1, args, 32);
}
};
} // namespace re2
#endif // RE2_VARIADIC_FUNCTION_H_

View File

@ -0,0 +1,244 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Helper class for traversing Regexps without recursion.
// Clients should declare their own subclasses that override
// the PreVisit and PostVisit methods, which are called before
// and after visiting the subexpressions.
// Not quite the Visitor pattern, because (among other things)
// the Visitor pattern is recursive.
#ifndef RE2_WALKER_INL_H__
#define RE2_WALKER_INL_H__
#include "re2/regexp.h"
namespace re2 {
template<typename T> struct WalkState;
// Base class for traversing a Regexp without recursion.
// Clients subclass it, overriding PreVisit and PostVisit (called before
// and after a node's subexpressions are visited) and implementing
// ShortVisit.  T is the type of the values passed between the visits.
template<typename T> class Regexp::Walker {
public:
Walker();
virtual ~Walker();
// Virtual method called before visiting re's children.
// PreVisit passes ownership of its return value to its caller.
// The T that PreVisit returns will be passed to PostVisit as pre_arg
// and passed to the child PreVisits and PostVisits as parent_arg.
// At the top-most Regexp, parent_arg is the top_arg passed to Walk.
// If PreVisit sets *stop to true, the walk does not recurse
// into the children. Instead it behaves as though the return
// value from PreVisit is the return value from PostVisit.
// The default PreVisit returns parent_arg.
virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
// Virtual method called after visiting re's children.
// The pre_arg is the T that PreVisit returned.
// The child_args is an array of the nchild_args Ts that the
// child PostVisits returned.
// PostVisit takes ownership of pre_arg.
// PostVisit takes ownership of the Ts
// in *child_args, but not the array itself.
// PostVisit passes ownership of its return value
// to its caller.
// The default PostVisit simply returns pre_arg.
virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
T* child_args, int nchild_args);
// Virtual method called to copy a T,
// when Walk notices that more than one child is the same re.
// The default Copy returns arg unchanged.
virtual T Copy(T arg);
// Virtual method called to do a "quick visit" of the re,
// but not its children. Only called once the visit budget
// has been used up and we're trying to abort the walk
// as quickly as possible. Should return a value that
// makes sense for the parent PostVisits still to be run.
// This function is (hopefully) only called by
// WalkExponential, but must be implemented by all clients,
// just in case.
virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
// Walks over a regular expression.
// Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
// Returns the T returned by PostVisit on re.
T Walk(Regexp* re, T top_arg);
// Like Walk, but doesn't use Copy. This can lead to
// exponential runtimes on cross-linked Regexps like the
// ones generated by Simplify. To help limit this,
// at most max_visits nodes will be visited and then
// the walk will be cut off early.
// If the walk *is* cut off early, ShortVisit(re)
// will be called on regexps that cannot be fully
// visited rather than calling PreVisit/PostVisit.
T WalkExponential(Regexp* re, T top_arg, int max_visits);
// Clears the stack. Should never be necessary, since
// Walk always enters and exits with an empty stack.
// Logs DFATAL if stack is not already clear.
void Reset();
// Returns whether walk was cut off.
bool stopped_early() { return stopped_early_; }
private:
// Walk state for the entire traversal.
// Allocated by the constructor and deleted by the destructor.
stack<WalkState<T> >* stack_;
// Value reported by stopped_early(); the constructor sets it to false.
bool stopped_early_;
// NOTE(review): presumably the remaining visit budget (see
// WalkExponential) -- confirm in WalkInternal.
int max_visits_;
// NOTE(review): presumably the shared implementation of Walk and
// WalkExponential, with use_copy selecting whether Copy is used --
// confirm against its definition.
T WalkInternal(Regexp* re, T top_arg, bool use_copy);
DISALLOW_EVIL_CONSTRUCTORS(Walker);
};
// Default PreVisit: ignores re, leaves *stop untouched (so the walk
// descends into the children) and hands parent_arg back as the pre_arg.
template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
T parent_arg,
bool* stop) {
return parent_arg;
}
// Default PostVisit: ignores re, parent_arg and the child results and
// returns pre_arg unchanged.
template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re,
T parent_arg,
T pre_arg,
T* child_args,
int nchild_args) {
return pre_arg;
}
// Default Copy: returns arg itself, i.e. relies on T's own copy semantics.
template<typename T> T Regexp::Walker<T>::Copy(T arg) {
return arg;
}
// State about a single level in the traversal.
// One WalkState is kept on the walker's stack for every regexp that is
// currently being visited.
template<typename T> struct WalkState {
  // Creates the state for a regexp that has not been pre-visited yet.
  // The constructor is spelled with the injected class name (not
  // WalkState<T>), because a template-id is not a valid constructor
  // name as of C++20. pre_arg and child_arg are value-initialized so
  // that copying the state onto the stack never reads an
  // indeterminate value.
  WalkState(Regexp* re, T parent)
    : re(re),
      n(-1),
      parent_arg(parent),
      pre_arg(),
      child_arg(),
      child_args(NULL) { }

  Regexp* re;      // The regexp
  int n;           // The index of the next child to process; -1 means need to PreVisit
  T parent_arg;    // Accumulated arguments.
  T pre_arg;       // Value returned by PreVisit for re.
  T child_arg;     // One-element buffer for child_args.
  T* child_args;   // Results of the children: NULL, &child_arg, or a new[] array.
};
// Constructs a walker with an empty traversal stack.
// Every member is given a defined value here; max_visits_ is
// overwritten by Walk/WalkExponential before each traversal, but is
// zeroed so the object never carries an indeterminate int.
template<typename T> Regexp::Walker<T>::Walker()
    : stack_(new stack<WalkState<T> >),
      stopped_early_(false),
      max_visits_(0) {
}
// Destroys the walker: Reset() drains (and complains about) any state
// left on the stack, then the stack object allocated by the constructor
// is deleted.
template<typename T> Regexp::Walker<T>::~Walker() {
Reset();
delete stack_;
}
// Clears the stack.  Should never be necessary, since
// Walk always enters and exits with an empty stack.
// Logs DFATAL if stack is not already clear.
template<typename T> void Regexp::Walker<T>::Reset() {
  if (stack_ && stack_->size() > 0) {
    LOG(DFATAL) << "Stack not empty.";
    while (stack_->size() > 0) {
      WalkState<T>& top = stack_->top();
      // child_args is one of: NULL (not pre-visited yet or no children),
      // a pointer to the state's own child_arg buffer (exactly one
      // child), or an array obtained from new T[] (several children).
      // Only the last form owns memory, and it must be released with
      // delete[], never with scalar delete.
      if (top.child_args != NULL && top.child_args != &top.child_arg)
        delete[] top.child_args;
      stack_->pop();
    }
  }
}
// Shared implementation of Walk and WalkExponential.
// Performs a depth-first traversal of re using the explicit stack_
// instead of recursion: PreVisit is called when a node is first reached,
// its children are then walked in order, and PostVisit is called once
// all child results are available.
//   re        - root of the traversal; NULL logs DFATAL and returns top_arg.
//   top_arg   - passed as parent_arg to the hooks for the root.
//   use_copy  - if true, a child identical to the child just before it is
//               not re-walked; its result is produced with Copy instead.
// Returns the value computed for the root (from PostVisit, from PreVisit
// if it set *stop, or from ShortVisit if the visit budget ran out).
template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
bool use_copy) {
Reset();
if (re == NULL) {
LOG(DFATAL) << "Walk NULL";
return top_arg;
}
stack_->push(WalkState<T>(re, top_arg));
WalkState<T>* s;
for (;;) {
// t receives the finished result of the node on top of the stack.
T t;
s = &stack_->top();
// Shadows the parameter: from here on re is the node being processed.
Regexp* re = s->re;
switch (s->n) {
case -1: {
// First time this node is seen: charge it against the visit budget.
if (--max_visits_ < 0) {
// Budget exhausted: take the cheap path and skip the children.
// NOTE(review): stopped_early_ is only ever cleared in the
// constructor, so a reused Walker keeps reporting true - confirm
// that is intended.
stopped_early_ = true;
t = ShortVisit(re, s->parent_arg);
break;
}
bool stop = false;
s->pre_arg = PreVisit(re, s->parent_arg, &stop);
if (stop) {
// PreVisit asked not to descend; its result stands in for PostVisit's.
t = s->pre_arg;
break;
}
s->n = 0;
// Pick storage for the child results: none for leaves, the inline
// one-element buffer for a single child, a heap array otherwise.
s->child_args = NULL;
if (re->nsub_ == 1)
s->child_args = &s->child_arg;
else if (re->nsub_ > 1)
s->child_args = new T[re->nsub_];
// Fall through.
}
default: {
if (re->nsub_ > 0) {
Regexp** sub = re->sub();
if (s->n < re->nsub_) {
if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
// Same regexp as the previous child: reuse its result via Copy.
s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
s->n++;
} else {
// Descend into the next child; it inherits pre_arg as its parent_arg.
stack_->push(WalkState<T>(sub[s->n], s->pre_arg));
}
continue;
}
}
// All children are done (or there are none): finish this node.
t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
// Only the multi-child case owns a heap array.
if (re->nsub_ > 1)
delete[] s->child_args;
break;
}
}
// We've finished stack_->top().
// Update next guy down.
stack_->pop();
if (stack_->size() == 0)
return t;
s = &stack_->top();
// Record t as the result of the parent's child number s->n.
if (s->child_args != NULL)
s->child_args[s->n] = t;
else
s->child_arg = t;
s->n++;
}
}
// Walks re with Copy enabled for repeated adjacent children and returns
// the value computed for re.
template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
  // Because repeated children are copied rather than re-walked, the
  // traversal is not exponential; this budget is therefore generous
  // enough for any regexp while still bounding the CPU time spent.
  const int kVisitBudget = 1000000;
  const bool kUseCopy = true;

  max_visits_ = kVisitBudget;
  return WalkInternal(re, top_arg, kUseCopy);
}
// Walks re without using Copy, visiting at most max_visits nodes
// before the remaining ones are handled by ShortVisit.
template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
                                                          int max_visits) {
  const bool kUseCopy = false;

  // The caller decides how much work this traversal may do.
  max_visits_ = max_visits;
  return WalkInternal(re, top_arg, kUseCopy);
}
} // namespace re2
#endif // RE2_WALKER_INL_H__

21
outside/re2/runtests Executable file
View File

@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Runs every test command given on the command line, one after another.
# The combined stdout/stderr of test T is captured in T.log and a
# PASS/FAIL line is printed per test. Exits 0 only if all tests passed.

success=true
for i
do
	# Quote the name so it is printed verbatim as a single argument
	# (no word splitting or glob expansion).
	printf "%-40s" "$i"
	# The test itself is run through sh -c so that it may carry arguments.
	if sh -c "$i >$i.log 2>&1" 2>/dev/null
	then
		echo PASS
	else
		echo "FAIL; output in $i.log"
		success=false
	fi
done

if $success; then
	echo 'ALL TESTS PASSED.'
	exit 0
fi
echo 'TESTS FAILED.'
exit 1

View File

@ -0,0 +1,26 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <re2/re2.h>
#include <re2/filtered_re2.h>
#include <stdio.h>
using namespace re2;
// Installation smoke test: exercises FilteredRE2 and RE2 from the
// installed headers/library. Prints PASS and returns 0 when the
// full match succeeds, otherwise prints FAIL and returns 2.
int main(void) {
  // Build and compile a one-pattern filter, then query it once.
  FilteredRE2 f;
  int id;
  f.Add("a.*b.*c", RE2::DefaultOptions, &id);
  vector<string> v;
  f.Compile(&v);
  vector<int> ids;
  f.FirstMatch("abbccc", ids);

  // The verdict depends only on this plain RE2 match.
  if (!RE2::FullMatch("axbyc", "a.*b.*c")) {
    printf("FAIL\n");
    return 2;
  }
  printf("PASS\n");
  return 0;
}

567
outside/re2/ucs2.diff Normal file
View File

@ -0,0 +1,567 @@
This is a dump from Google's source control system of the change
that removed UCS-2 support from RE2. As the explanation below
says, UCS-2 mode is fundamentally at odds with things like ^ and $,
so it never really worked very well. But if you are interested in using
it without those operators, it did work for that. It assumed that the
UCS-2 data was in the native host byte order.
If you are interested in adding UCS-2 mode back, this patch might
be a good starting point.
Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15
Retire UCS-2 mode.
I added it as an experiment for V8, but it
requires 2-byte lookahead to do completely,
and RE2 has 1-byte lookahead (enough for UTF-8)
as a fairly deep fundamental assumption,
so it did not support ^ or $.
==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====
re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319
cap_[0] = p;
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
return true;
- if (prog_->flags() & Regexp::UCS2)
- p++;
}
return false;
}
==== re2/compile.cc#17 - re2/compile.cc#18 ====
re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100
// Input encodings.
enum Encoding {
kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
- kEncodingUCS2, // UCS-2 (0-FFFF), native byte order
kEncodingLatin1, // Latin1 (0-FF)
};
re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172
void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
void Add_80_10ffff();
- void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);
- void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
- uint8 lo2, uint8 hi2, bool fold2);
// New suffix that matches the byte range lo-hi, then goes to next.
Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);
re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477
// Converts rune range lo-hi into a fragment that recognizes
// the bytes that would make up those runes in the current
- // encoding (Latin 1, UTF-8, or UCS-2).
+ // encoding (Latin 1 or UTF-8).
// This lets the machine work byte-by-byte even when
// using multibyte encodings.
re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489
case kEncodingLatin1:
AddRuneRangeLatin1(lo, hi, foldcase);
break;
- case kEncodingUCS2:
- AddRuneRangeUCS2(lo, hi, foldcase);
- break;
}
}
re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501
AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));
}
- // Test whether 16-bit values are big or little endian.
- static bool BigEndian() {
- union {
- char byte[2];
- int16 endian;
- } u;
-
- u.byte[0] = 1;
- u.byte[1] = 2;
- return u.endian == 0x0102;
- }
-
- void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
- uint8 lo2, uint8 hi2, bool fold2) {
- Inst* ip;
- if (reversed_) {
- ip = RuneByteSuffix(lo1, hi1, fold1, NULL);
- ip = RuneByteSuffix(lo2, hi2, fold2, ip);
- } else {
- ip = RuneByteSuffix(lo2, hi2, fold2, NULL);
- ip = RuneByteSuffix(lo1, hi1, fold1, ip);
- }
- AddSuffix(ip);
- }
-
- void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {
- if (lo > hi || lo > 0xFFFF)
- return;
- if (hi > 0xFFFF)
- hi = 0xFFFF;
-
- // We'll assemble a pattern assuming big endian.
- // If the machine isn't, tell Cat to reverse its arguments.
- bool oldreversed = reversed_;
- if (!BigEndian()) {
- reversed_ = !oldreversed;
- }
-
- // Split into bytes.
- int lo1 = lo >> 8;
- int lo2 = lo & 0xFF;
- int hi1 = hi >> 8;
- int hi2 = hi & 0xFF;
-
- if (lo1 == hi1) {
- // Easy case: high bits are same in both.
- // Only do ASCII case folding on the second byte if the top byte is 00.
- AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);
- } else {
- // Harder case: different second byte ranges depending on first byte.
-
- // Initial fragment.
- if (lo2 > 0) {
- AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);
- lo1++;
- }
-
- // Trailing fragment.
- if (hi2 < 0xFF) {
- AddUCS2Pair(hi1, hi1, false, 0, hi2, false);
- hi1--;
- }
-
- // Inner ranges.
- if (lo1 <= hi1) {
- AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);
- }
- }
-
- // Restore reverse setting.
- reversed_ = oldreversed;
- }
-
// Table describing how to make a UTF-8 matching machine
// for the rune range 80-10FFFF (Runeself-Runemax).
// This range happens frequently enough (for example /./ and /[^a-z]/)
re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634
Frag Compiler::Literal(Rune r, bool foldcase) {
switch (encoding_) {
- default: // UCS-2 or something new
- BeginRange();
- AddRuneRange(r, r, foldcase);
- return EndRange();
+ default:
+ return kNullFrag;
case kEncodingLatin1:
return ByteRange(r, r, foldcase);
re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850
if (re->parse_flags() & Regexp::Latin1)
c.encoding_ = kEncodingLatin1;
- else if (re->parse_flags() & Regexp::UCS2)
- c.encoding_ = kEncodingUCS2;
c.reversed_ = reversed;
if (max_mem <= 0) {
c.max_inst_ = 100000; // more than enough
re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905
c.prog_->set_start_unanchored(c.prog_->start());
} else {
Frag dot;
- if (c.encoding_ == kEncodingUCS2) {
- dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));
- } else {
- dot = c.ByteRange(0x00, 0xFF, false);
- }
+ dot = c.ByteRange(0x00, 0xFF, false);
Frag dotloop = c.Star(dot, true);
Frag unanchored = c.Cat(dotloop, all);
c.prog_->set_start_unanchored(unanchored.begin);
==== re2/nfa.cc#8 - re2/nfa.cc#9 ====
re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431
const char* bp = context.begin();
int c = -1;
int wasword = 0;
- bool ucs2 = prog_->flags() & Regexp::UCS2;
if (text.begin() > context.begin()) {
c = text.begin()[-1] & 0xFF;
re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497
// If there's a required first byte for an unanchored search
// and we're not in the middle of any possible matches,
// use memchr to search for the byte quickly.
- if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&
+ if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
p < text.end() && (p[0] & 0xFF) != first_byte_) {
p = reinterpret_cast<const char*>(memchr(p, first_byte_,
text.end() - p));
re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514
flag = Prog::EmptyFlags(context, p);
}
- // In UCS-2 mode, if we need to start a new thread,
- // make sure to do it on an even boundary.
- if(ucs2 && runq->size() == 0 &&
- (p - context.begin()) % 2 && p < text.end()) {
- p++;
- flag = Prog::EmptyFlags(context, p);
- }
-
// Steal match storage (cleared but unused as of yet)
// temporarily to hold match boundaries for new thread.
- // In UCS-2 mode, only start the thread on a 2-byte boundary.
- if(!ucs2 || (p - context.begin()) % 2 == 0) {
- match_[0] = p;
- AddToThreadq(runq, start_, flag, p, match_);
- match_[0] = NULL;
- }
+ match_[0] = p;
+ AddToThreadq(runq, start_, flag, p, match_);
+ match_[0] = NULL;
}
// If all the threads have died, stop early.
==== re2/parse.cc#22 - re2/parse.cc#23 ====
re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165
status_(status), stacktop_(NULL), ncap_(0) {
if (flags_ & Latin1)
rune_max_ = 0xFF;
- else if (flags & UCS2)
- rune_max_ = 0xFFFF;
else
rune_max_ = Runemax;
}
re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374
bool Regexp::ParseState::PushCarat() {
if (flags_ & OneLine) {
return PushSimpleOp(kRegexpBeginText);
- } else {
- if (flags_ & UCS2) {
- status_->set_code(kRegexpUnsupported);
- status_->set_error_arg("multiline ^ in UCS-2 mode");
- return false;
- }
- return PushSimpleOp(kRegexpBeginLine);
}
+ return PushSimpleOp(kRegexpBeginLine);
}
// Pushes a \b or \B onto the stack.
bool Regexp::ParseState::PushWordBoundary(bool word) {
- if (flags_ & UCS2) {
- status_->set_code(kRegexpUnsupported);
- status_->set_error_arg("\\b or \\B in UCS-2 mode");
- return false;
- }
if (word)
return PushSimpleOp(kRegexpWordBoundary);
return PushSimpleOp(kRegexpNoWordBoundary);
re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389
bool ret = PushSimpleOp(kRegexpEndText);
flags_ = oflags;
return ret;
- }
- if (flags_ & UCS2) {
- status_->set_code(kRegexpUnsupported);
- status_->set_error_arg("multiline $ in UCS-2 mode");
- return false;
}
return PushSimpleOp(kRegexpEndLine);
}
==== re2/re2.cc#34 - re2/re2.cc#35 ====
re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84
return RE2::ErrorBadUTF8;
case re2::kRegexpBadNamedCapture:
return RE2::ErrorBadNamedCapture;
- case re2::kRegexpUnsupported:
- return RE2::ErrorUnsupported;
}
return RE2::ErrorInternal;
}
re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125
break;
case RE2::Options::EncodingLatin1:
flags |= Regexp::Latin1;
- break;
- case RE2::Options::EncodingUCS2:
- flags |= Regexp::UCS2;
break;
}
==== re2/re2.h#36 - re2/re2.h#37 ====
re2/re2.h#36:246,252 - re2/re2.h#37:246,251
ErrorBadUTF8, // invalid UTF-8 in regexp
ErrorBadNamedCapture, // bad named capture group
ErrorPatternTooLarge, // pattern too large (compile failed)
- ErrorUnsupported, // unsupported feature (in UCS-2 mode)
};
// Predefined common options.
re2/re2.h#36:570,576 - re2/re2.h#37:569,574
enum Encoding {
EncodingUTF8 = 1,
- EncodingUCS2, // 16-bit Unicode 0-FFFF only
EncodingLatin1
};
==== re2/regexp.cc#15 - re2/regexp.cc#16 ====
re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329
// the regexp that remains after the prefix. The prefix might
// be ASCII case-insensitive.
bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
- // Don't even bother for UCS-2; it's time to throw that code away.
- if (parse_flags_ & UCS2)
- return false;
-
// No need for a walker: the regexp must be of the form
// 1. some number of ^ anchors
// 2. a literal char or string
==== re2/regexp.h#20 - re2/regexp.h#21 ====
re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192
kRegexpBadPerlOp, // bad perl operator
kRegexpBadUTF8, // invalid UTF-8 in regexp
kRegexpBadNamedCapture, // bad named capture
- kRegexpUnsupported, // unsupported operator
};
// Error status for certain operations.
re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314
// \Q and \E to disable/enable metacharacters
// (?P<name>expr) for named captures
// \C to match any single byte
- UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8.
- UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group
+ UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
// and \P{Han} for its negation.
- NeverNL = 1<<12, // Never match NL, even if the regexp mentions
+ NeverNL = 1<<11, // Never match NL, even if the regexp mentions
// it explicitly.
// As close to Perl as we can get.
==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====
re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139
cap_[0] = p;
if (Visit(prog_->start(), p)) // Match must be leftmost; done.
return true;
- if (prog_->flags() & Regexp::UCS2)
- p++;
}
return false;
}
==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====
re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152
static ParseMode parse_modes[] = {
{ single_line, "single-line" },
{ single_line|Regexp::Latin1, "single-line, latin1" },
- { single_line|Regexp::UCS2, "single-line, ucs2" },
{ multi_line, "multiline" },
{ multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
{ multi_line|Regexp::Latin1, "multiline, latin1" },
- { multi_line|Regexp::UCS2, "multiline, ucs2" },
};
static string FormatMode(Regexp::ParseFlags flags) {
re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185
RegexpStatus status;
regexp_ = Regexp::Parse(regexp_str, flags, &status);
if (regexp_ == NULL) {
- if (status.code() != kRegexpUnsupported) {
- LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
- << " mode: " << FormatMode(flags);
- error_ = true;
- }
+ LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
+ << " mode: " << FormatMode(flags);
+ error_ = true;
return;
}
prog_ = regexp_->CompileToProg(0);
re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231
RE2::Options options;
if (flags & Regexp::Latin1)
options.set_encoding(RE2::Options::EncodingLatin1);
- else if (flags & Regexp::UCS2)
- options.set_encoding(RE2::Options::EncodingUCS2);
if (kind_ == Prog::kLongestMatch)
options.set_longest_match(true);
re2_ = new RE2(re, options);
re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280
delete re2_;
}
- // Converts UTF-8 string in text into UCS-2 string in new_text.
- static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {
- const char* p = text.begin();
- const char* ep = text.end();
- uint16* q = new uint16[ep - p];
- uint16* q0 = q;
-
- int n;
- Rune r;
- for (; p < ep; p += n) {
- if (!fullrune(p, ep - p)) {
- delete[] q0;
- return false;
- }
- n = chartorune(&r, p);
- if (r > 0xFFFF) {
- delete[] q0;
- return false;
- }
- *q++ = r;
- }
- *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));
- return true;
- }
-
- // Rewrites *sp from being a pointer into text8 (UTF-8)
- // to being a pointer into text16 (equivalent text but in UCS-2).
- static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,
- StringPiece *sp) {
- if (sp->begin() == NULL && text8.begin() != NULL)
- return;
-
- int nrune = 0;
- int n;
- Rune r;
- const char* p = text8.begin();
- const char* ep = text8.end();
- const char* spbegin = NULL;
- const char* spend = NULL;
- for (;;) {
- if (p == sp->begin())
- spbegin = text16.begin() + sizeof(uint16)*nrune;
- if (p == sp->end())
- spend = text16.begin() + sizeof(uint16)*nrune;
- if (p >= ep)
- break;
- n = chartorune(&r, p);
- p += n;
- nrune++;
- }
- if (spbegin == NULL || spend == NULL) {
- LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "
- << CEscape(text8) << " "
- << (int)(sp->begin() - text8.begin()) << " "
- << (int)(sp->end() - text8.begin());
- }
- *sp = StringPiece(spbegin, spend - spbegin);
- }
-
- // Rewrites *sp from begin a pointer into text16 (UCS-2)
- // to being a pointer into text8 (equivalent text but in UTF-8).
- static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,
- StringPiece* sp) {
- if (sp->begin() == NULL)
- return;
-
- int nrune = 0;
- int n;
- Rune r;
- const char* p = text8.begin();
- const char* ep = text8.end();
- const char* spbegin = NULL;
- const char* spend = NULL;
- for (;;) {
- if (nrune == (sp->begin() - text16.begin())/2)
- spbegin = p;
- if (nrune == (sp->end() - text16.begin())/2)
- spend = p;
- if (p >= ep)
- break;
- n = chartorune(&r, p);
- p += n;
- nrune++;
- }
- if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) {
- LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "
- << CEscape(text16) << " "
- << (int)(sp->begin() - text16.begin()) << " "
- << (int)(sp->end() - text16.begin());
- }
- *sp = StringPiece(spbegin, spend - spbegin);
- }
-
// Runs a single search using the named engine type.
// This interface hides all the irregularities of the various
// engine interfaces from the rest of this file.
re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300
StringPiece text = orig_text;
StringPiece context = orig_context;
- bool ucs2 = false;
- if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {
- if (!ConvertUTF8ToUCS2(orig_context, &context)) {
- result->skipped = true;
- return;
- }
-
- // Rewrite context to refer to new text.
- AdjustUTF8ToUCS2(orig_context, context, &text);
- ucs2 = true;
- }
-
switch (type) {
default:
LOG(FATAL) << "Bad RunSearch type: " << (int)type;
re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451
}
}
- // If we did UCS-2 matching, rewrite the matches to refer
- // to the original UTF-8 text.
- if (ucs2) {
- if (result->matched) {
- if (result->have_submatch0) {
- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);
- } else if (result->have_submatch) {
- for (int i = 0; i < nsubmatch; i++) {
- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);
- }
- }
- }
- delete[] context.begin();
- }
-
if (!result->matched)
memset(result->submatch, 0, sizeof result->submatch);
}
re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475
return true;
}
- // Check whether text uses only Unicode points <= 0xFFFF
- // (in the BMP).
- static bool IsBMP(const StringPiece& text) {
- const char* p = text.begin();
- const char* ep = text.end();
- while (p < ep) {
- if (!fullrune(p, ep - p))
- return false;
- Rune r;
- p += chartorune(&r, p);
- if (r > 0xFFFF)
- return false;
- }
- return true;
- }
-
// Runs a single test.
bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
Prog::Anchor anchor) {
re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483
Result correct;
RunSearch(kEngineBacktrack, text, context, anchor, &correct);
if (correct.skipped) {
- if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode
+ if (regexp_ == NULL)
return true;
LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
<< " " << FormatMode(flags_);

168
outside/re2/util/arena.cc Normal file
View File

@ -0,0 +1,168 @@
// Copyright 2000 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
namespace re2 {
// ----------------------------------------------------------------------
// UnsafeArena::UnsafeArena()
// UnsafeArena::~UnsafeArena()
// Destroying the arena automatically calls Reset()
// ----------------------------------------------------------------------
// Creates an arena that hands out memory from blocks of block_size
// bytes. The first block is allocated immediately and kept for the
// lifetime of the arena.
UnsafeArena::UnsafeArena(const size_t block_size)
    : block_size_(block_size),
      freestart_(NULL),                   // set for real in Reset()
      last_alloc_(NULL),
      remaining_(0),
      blocks_alloced_(1),
      overflow_blocks_(NULL) {
  assert(block_size > kDefaultAlignment);

  // Set up the permanent first block, then let Reset() point the
  // allocation cursor at it.
  AllocatedBlock& first = first_blocks_[0];
  first.size = block_size_;
  first.mem = static_cast<char*>(malloc(block_size_));

  Reset();
}
// Releases every block owned by the arena, including the first one
// that FreeBlocks() deliberately keeps.
UnsafeArena::~UnsafeArena() {
  FreeBlocks();
  assert(overflow_blocks_ == NULL);    // FreeBlocks() should do that

  // Whatever FreeBlocks() left allocated in first_blocks_ goes now.
  int idx = 0;
  while (idx < blocks_alloced_) {
    free(first_blocks_[idx].mem);
    ++idx;
  }
}
// ----------------------------------------------------------------------
// UnsafeArena::Reset()
// Clears all the memory an arena is using.
// ----------------------------------------------------------------------
void UnsafeArena::Reset() {
FreeBlocks();
freestart_ = first_blocks_[0].mem;
remaining_ = first_blocks_[0].size;
last_alloc_ = NULL;
// We do not know for sure whether or not the first block is aligned,
// so we fix that right now.
const int overage = reinterpret_cast<uintptr_t>(freestart_) &
(kDefaultAlignment-1);
if (overage > 0) {
const int waste = kDefaultAlignment - overage;
freestart_ += waste;
remaining_ -= waste;
}
freestart_when_empty_ = freestart_;
assert(!(reinterpret_cast<uintptr_t>(freestart_)&(kDefaultAlignment-1)));
}
// -------------------------------------------------------------
// UnsafeArena::AllocNewBlock()
// Adds and returns an AllocatedBlock.
// The returned AllocatedBlock* is valid until the next call
// to AllocNewBlock or Reset. (i.e. anything that might
// affect overflow_blocks_).
// -------------------------------------------------------------
// Allocates a block of block_size bytes with malloc, records it in the
// arena's bookkeeping and returns its descriptor. The fixed array
// first_blocks_ is filled first; later blocks go into overflow_blocks_,
// which is created on demand.
UnsafeArena::AllocatedBlock* UnsafeArena::AllocNewBlock(const size_t block_size) {
  AllocatedBlock* slot = NULL;

  if (blocks_alloced_ >= arraysize(first_blocks_)) {
    // The pre-allocated descriptors are used up: grow the vector by one
    // and use its new last element.
    if (overflow_blocks_ == NULL)
      overflow_blocks_ = new vector<AllocatedBlock>;
    overflow_blocks_->push_back(AllocatedBlock());
    slot = &overflow_blocks_->back();
  } else {
    // Take the next free pre-allocated descriptor.
    slot = &first_blocks_[blocks_alloced_];
    blocks_alloced_++;
  }

  slot->size = block_size;
  slot->mem = reinterpret_cast<char*>(malloc(block_size));
  return slot;
}
// ----------------------------------------------------------------------
// UnsafeArena::GetMemoryFallback()
// We take memory out of our pool, aligned on the byte boundary
// requested. If we don't have space in our current pool, we
// allocate a new block (wasting the remaining space in the
// current block) and give you that. If your memory needs are
// too big for a single block, we make a special your-memory-only
// allocation -- this is equivalent to not using the arena at all.
// ----------------------------------------------------------------------
// Slow path of GetMemory.
//   size  - number of bytes requested; 0 yields NULL.
//   align - required alignment in bytes; must be a power of two.
// Returns a pointer to size bytes owned by the arena.
void* UnsafeArena::GetMemoryFallback(const size_t size, const int align) {
if (size == 0)
return NULL; // stl/stl_alloc.h says this is okay
assert(align > 0 && 0 == (align & (align - 1))); // must be power of 2
// If the object is more than a quarter of the block size, allocate
// it separately to avoid wasting too much space in leftover bytes
if (block_size_ == 0 || size > block_size_/4) {
// then it gets its own block in the arena
// NOTE(review): the block actually comes from malloc (see
// AllocNewBlock), not from new as the comment below says.
assert(align <= kDefaultAlignment); // because that's what new gives us
// This block stays separate from the rest of the world; in particular
// we don't update last_alloc_ so you can't reclaim space on this block.
return AllocNewBlock(size)->mem;
}
// Number of bytes freestart_ is past the previous align boundary.
const int overage =
(reinterpret_cast<uintptr_t>(freestart_) & (align-1));
if (overage) {
// Skip ahead to the next boundary, giving up the padding bytes;
// remaining_ is clamped at 0 rather than allowed to wrap.
const int waste = align - overage;
freestart_ += waste;
if (waste < remaining_) {
remaining_ -= waste;
} else {
remaining_ = 0;
}
}
if (size > remaining_) {
// Not enough room left: abandon the rest of the current block and
// continue in a fresh one.
// NOTE(review): freestart_ is not re-aligned here, so the assert at
// the end relies on malloc returning memory aligned to at least align.
AllocatedBlock *block = AllocNewBlock(block_size_);
freestart_ = block->mem;
remaining_ = block->size;
}
// Carve the allocation off the front of the free space.
remaining_ -= size;
last_alloc_ = freestart_;
freestart_ += size;
assert((reinterpret_cast<uintptr_t>(last_alloc_) & (align-1)) == 0);
return reinterpret_cast<void*>(last_alloc_);
}
// ----------------------------------------------------------------------
// UnsafeArena::FreeBlocks()
// Unlike GetMemory(), which does actual work, ReturnMemory() is a
// no-op: we don't "free" memory until Reset() is called. We do
// update some stats, though. Note we do no checking that the
// pointer you pass in was actually allocated by us, or that it
// was allocated for the size you say, so be careful here!
// FreeBlocks() does the work for Reset(), actually freeing all
// memory allocated in one fell swoop.
// ----------------------------------------------------------------------
void UnsafeArena::FreeBlocks() {
for ( int i = 1; i < blocks_alloced_; ++i ) { // keep first block alloced
free(first_blocks_[i].mem);
first_blocks_[i].mem = NULL;
first_blocks_[i].size = 0;
}
blocks_alloced_ = 1;
if (overflow_blocks_ != NULL) {
vector<AllocatedBlock>::iterator it;
for (it = overflow_blocks_->begin(); it != overflow_blocks_->end(); ++it) {
free(it->mem);
}
delete overflow_blocks_; // These should be used very rarely
overflow_blocks_ = NULL;
}
}
} // namespace re2

103
outside/re2/util/arena.h Normal file
View File

@ -0,0 +1,103 @@
// Copyright 2000 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Sometimes it is necessary to allocate a large number of small
// objects. Doing this the usual way (malloc, new) is slow,
// especially for multithreaded programs. An UnsafeArena provides a
// mark/release method of memory management: it asks for a large chunk
// from the operating system and doles it out bit by bit as required.
// Then you free all the memory at once by calling UnsafeArena::Reset().
// The "Unsafe" refers to the fact that UnsafeArena is not safe to
// call from multiple threads.
//
// A placement form of the global operator new is provided; use it as follows:
//
// #include "lib/arena-inl.h"
//
// UnsafeArena arena(1000);
// Foo* foo = new (AllocateInArena, &arena) Foo;
//
#ifndef RE2_UTIL_ARENA_H_
#define RE2_UTIL_ARENA_H_
namespace re2 {
// This class is thread-compatible.
// An UnsafeArena hands out memory from malloc'ed blocks of block_size
// bytes and releases it all at once in Reset() or the destructor.
class UnsafeArena {
public:
// Creates an arena whose regular blocks are block_size bytes each.
UnsafeArena(const size_t block_size);
virtual ~UnsafeArena();
// Frees everything allocated so far except the first block, which is
// reused for subsequent allocations.
void Reset();
// This should be the worst-case alignment for any type. This is
// good for IA-32, SPARC version 7 (the last one I know), and
// supposedly Alpha. i386 would be more time-efficient with a
// default alignment of 8, but ::operator new() uses alignment of 4,
// and an assertion will fail below after the call to MakeNewBlock()
// if you try to use a larger alignment.
// NOTE(review): no MakeNewBlock() is declared in this class; the comment
// presumably refers to AllocNewBlock() - confirm.
#ifdef __i386__
static const int kDefaultAlignment = 4;
#else
static const int kDefaultAlignment = 8;
#endif
private:
// Slow path of GetMemory: handles alignment, zero-sized requests and
// requests that do not fit in the current block.
void* GetMemoryFallback(const size_t size, const int align);
public:
// Returns size bytes aligned to align (a power of two).
// The inline fast path only serves byte-aligned (align == 1) requests
// that are strictly smaller than the space left in the current block;
// everything else goes through GetMemoryFallback.
void* GetMemory(const size_t size, const int align) {
if ( size > 0 && size < remaining_ && align == 1 ) { // common case
last_alloc_ = freestart_;
freestart_ += size;
remaining_ -= size;
return reinterpret_cast<void*>(last_alloc_);
}
return GetMemoryFallback(size, align);
}
private:
// Descriptor of one malloc'ed block owned by the arena.
struct AllocatedBlock {
char *mem;
size_t size;
};
// The returned AllocatedBlock* is valid until the next call to AllocNewBlock
// or Reset (i.e. anything that might affect overflow_blocks_).
AllocatedBlock *AllocNewBlock(const size_t block_size);
// NOTE(review): declared here, but no definition is visible alongside
// the other member functions - confirm it is defined or unused.
const AllocatedBlock *IndexToBlock(int index) const;
const size_t block_size_;
char* freestart_; // beginning of the free space in most recent block
char* freestart_when_empty_; // beginning of the free space when we're empty
// NOTE(review): no ReturnBytes() is declared in this class - the
// comment on last_alloc_ looks stale.
char* last_alloc_; // used to make sure ReturnBytes() is safe
size_t remaining_;
// STL vector isn't as efficient as it could be, so we use an array at first
int blocks_alloced_; // how many of the first_blocks_ have been alloced
AllocatedBlock first_blocks_[16]; // the length of this array is arbitrary
// if the first_blocks_ aren't enough, expand into overflow_blocks_.
vector<AllocatedBlock>* overflow_blocks_;
void FreeBlocks(); // Frees all except first block
DISALLOW_EVIL_CONSTRUCTORS(UnsafeArena);
};
// Operators for allocation on the arena
// Syntax: new (AllocateInArena, arena) MyClass;
// where arena is a pointer to an UnsafeArena.
// STL containers, etc.
// AllocateInArenaType is a tag type: its single enumerator only selects
// the arena overload of operator new and carries no other meaning.
enum AllocateInArenaType { AllocateInArena };
} // namespace re2
// Placement operator new that constructs an object in memory obtained
// from arena:
//
//   Foo* foo = new (AllocateInArena, &arena) Foo;
//
// The storage is requested with the arena's default alignment rather
// than byte alignment: with an alignment of 1 the returned address
// simply follows the previous allocation and may be misaligned for the
// object being constructed. The memory is reclaimed only when the arena
// is reset or destroyed; no destructor is run for the object.
inline void* operator new(size_t size,
                          re2::AllocateInArenaType /* unused */,
                          re2::UnsafeArena *arena) {
  return arena->GetMemory(size, re2::UnsafeArena::kDefaultAlignment);
}
#endif // RE2_UTIL_ARENA_H_

View File

@ -0,0 +1,137 @@
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UTIL_ATOMICOPS_H__
#define RE2_UTIL_ATOMICOPS_H__
// The memory ordering constraints resemble the ones in C11.
// RELAXED - no memory ordering, just an atomic operation.
// CONSUME - data-dependent ordering.
// ACQUIRE - prevents memory accesses from hoisting above the operation.
// RELEASE - prevents memory accesses from sinking below the operation.
// Compiler selection.
//
// Clang >= 3.3 and GCC >= 4.8.1 provide the __atomic_* builtins, which express
// the orderings listed above directly.  The GCC version is encoded as
// major * 10000 + minor * 100 + patchlevel, so that 4.8.1 compares as 40801.
// (With a multiplier of 1000, GCC 4.8.1 would evaluate to 4801 and no released
// GCC would ever satisfy the test.)  Identifiers the compiler does not define
// evaluate to 0 inside #if.
//
// Usage, for either branch:
//   ATOMIC_LOAD_*(x, p)   assigns to x the value read through pointer p.
//   ATOMIC_STORE_*(p, v)  writes v through pointer p.
#if (__clang_major__ * 100 + __clang_minor__ >= 303) || \
    (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__ >= 40801)

#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_RELAXED); } while (0)
#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_CONSUME); } while (0)
#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_ACQUIRE); } while (0)
#define ATOMIC_STORE_RELAXED(p, v) __atomic_store_n((p), (v), __ATOMIC_RELAXED)
#define ATOMIC_STORE_RELEASE(p, v) __atomic_store_n((p), (v), __ATOMIC_RELEASE)

#else  // old compiler

// Fallback: a plain load or store combined with a hand-written barrier.
// Loads place the barrier after the access, the release store places it
// before the access.
#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = *(p); } while (0)
#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = *(p); MaybeReadMemoryBarrier(); } while (0)
#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = *(p); ReadMemoryBarrier(); } while (0)
#define ATOMIC_STORE_RELAXED(p, v) do { *(p) = (v); } while (0)
#define ATOMIC_STORE_RELEASE(p, v) do { WriteMemoryBarrier(); *(p) = (v); } while (0)

// WriteMemoryBarrier(), ReadMemoryBarrier() and MaybeReadMemoryBarrier()
// are an implementation detail and must not be used in the rest of the code.

#if defined(__i386__)

static inline void WriteMemoryBarrier() {
  int x;
  int* addr = &x;
  // xchg overwrites both the register and the word the register points at.
  // The pointer is therefore a read-write ("+r") operand rather than an input,
  // and "memory" is clobbered; the clobber also keeps the compiler from moving
  // loads and stores across the barrier.
  __asm__ __volatile__("xchgl (%0),%0"  // The lock prefix is implicit for xchg.
                       : "+r" (addr)
                       :
                       : "memory");
}

#elif defined(__x86_64__)

// 64-bit implementations of memory barrier can be simpler, because
// "sfence" is guaranteed to exist.
static inline void WriteMemoryBarrier() {
  __asm__ __volatile__("sfence" : : : "memory");
}

#elif defined(__ppc__)

static inline void WriteMemoryBarrier() {
  __asm__ __volatile__("eieio" : : : "memory");
}

#elif defined(__alpha__)

static inline void WriteMemoryBarrier() {
  __asm__ __volatile__("wmb" : : : "memory");
}

#elif defined(__aarch64__)

static inline void WriteMemoryBarrier() {
  __asm__ __volatile__("dmb st" : : : "memory");
}

#else

#include "util/mutex.h"

static inline void WriteMemoryBarrier() {
  // Slight overkill, but good enough:
  // any mutex implementation must have
  // a read barrier after the lock operation and
  // a write barrier before the unlock operation.
  //
  // It may be worthwhile to write architecture-specific
  // barriers for the common platforms, as above, but
  // this is a correct fallback.
  re2::Mutex mu;
  re2::MutexLock l(&mu);
}

/*
#error Need WriteMemoryBarrier for architecture.
// Windows
inline void WriteMemoryBarrier() {
LONG x;
::InterlockedExchange(&x, 0);
}
*/

#endif

// Alpha has very weak memory ordering. If relying on WriteBarriers, one must
// use read barriers for the readers too.
#if defined(__alpha__)

static inline void MaybeReadMemoryBarrier() {
  __asm__ __volatile__("mb" : : : "memory");
}

#else

static inline void MaybeReadMemoryBarrier() {}

#endif  // __alpha__

// Read barrier for various targets.
// NOTE(review): on every target other than aarch64 and alpha this is an empty
// function, so it emits neither a hardware fence nor a compiler barrier.
// Confirm that this is sufficient for each remaining target that can reach
// this branch.
#if defined(__aarch64__)

static inline void ReadMemoryBarrier() {
  __asm__ __volatile__("dmb ld" : : : "memory");
}

#elif defined(__alpha__)

static inline void ReadMemoryBarrier() {
  __asm__ __volatile__("mb" : : : "memory");
}

#else

static inline void ReadMemoryBarrier() {}

#endif

#endif  // old compiler
// Defines NO_THREAD_SAFETY_ANALYSIS as an empty macro unless something else
// has already defined it, so code can use it as an annotation that expands to
// nothing here.
// NOTE(review): presumably this mirrors a compiler thread-safety attribute
// macro supplied elsewhere -- confirm.
#ifndef NO_THREAD_SAFETY_ANALYSIS
#define NO_THREAD_SAFETY_ANALYSIS
#endif
#endif // RE2_UTIL_ATOMICOPS_H__

View File

@ -0,0 +1,153 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <limits.h>

#include "util/util.h"
#include "util/flags.h"
#include "util/benchmark.h"
#include "re2/re2.h"
// Defines the flag variable FLAGS_test_tmpdir with default value "/var/tmp".
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
using testing::Benchmark;
using namespace re2;
// Registry filled in by Benchmark::Register(): a fixed-capacity table of
// registered benchmarks, and the number of entries currently in use.
static Benchmark* benchmarks[10000];
static int nbenchmarks;
void Benchmark::Register() {
benchmarks[nbenchmarks] = this;
if(lo < 1)
lo = 1;
if(hi < lo)
hi = lo;
nbenchmarks++;
}
// Returns the time reported by gettimeofday(), converted to nanoseconds,
// or -1 if gettimeofday() fails.
static int64 nsec() {
  struct timeval now;
  const bool failed = gettimeofday(&now, 0) < 0;
  if(failed)
    return -1;
  const int64 whole_seconds_ns = (int64)now.tv_sec*1000*1000*1000;
  return whole_seconds_ns + now.tv_usec*1000;
}
// Measurement state shared by runN() and the Set*/Start*/Stop* hooks.
// Value most recently passed to SetBenchmarkBytesProcessed(); runN() resets it.
static int64 bytes;
// Nanoseconds accumulated while the timer was running; runN() resets it.
static int64 ns;
// nsec() reading taken when the timer was started; 0 while the timer is stopped.
static int64 t0;
// Value most recently passed to SetBenchmarkItemsProcessed(); runN() resets it.
static int64 items;
void SetBenchmarkBytesProcessed(long long x) {
bytes = x;
}
void StopBenchmarkTiming() {
if(t0 != 0)
ns += nsec() - t0;
t0 = 0;
}
// Resumes the timer by recording the current time as its start point.
// Does nothing if the timer is already running.
void StartBenchmarkTiming() {
  if(t0 != 0)
    return;
  t0 = nsec();
}
void SetBenchmarkItemsProcessed(int n) {
items = n;
}
// Placeholder: currently does nothing.
void BenchmarkMemoryUsage() {
// TODO(rsc): Implement.
}
// Reports the number of CPUs to benchmark with.  This implementation always
// answers 1.
int NumCPUs() {
  const int kReportedCpus = 1;
  return kReportedCpus;
}
// Runs benchmark |b| once with |n| iterations, leaving the elapsed time in
// ns and whatever the benchmark reported in bytes/items.
//
// The counters are cleared and the timer started before the benchmark body
// runs.  b->fn is preferred when set; otherwise b->fnr is called with the
// size argument |siz|.  A benchmark with neither function is a fatal error
// (message on stderr, exit status 2).  If the benchmark left the timer
// running, the remaining interval is added to ns on the way out.
static void runN(Benchmark *b, int n, int siz) {
  bytes = 0;
  items = 0;
  ns = 0;
  t0 = nsec();

  if(!b->fn && !b->fnr) {
    fprintf(stderr, "%s: missing function\n", b->name);
    exit(2);
  }

  if(b->fn)
    b->fn(n);
  else
    b->fnr(n, siz);

  const bool timer_running = (t0 != 0);
  if(timer_running)
    ns += nsec() - t0;
}
// Rounds an iteration count to a "nice" value.
//
// Let base be the smallest power of ten (at least 1) with base*10 >= n.
// The result is 2*base if n < 2*base, else 5*base if n < 5*base, else
// 10*base.  Examples: 1 -> 2, 2 -> 5, 5 -> 10, 10 -> 10, 11 -> 20.
//
// The arithmetic is done in 64 bits because base*10 and the candidate results
// can exceed INT_MAX when n is above one billion; a result that does not fit
// in an int is clamped to INT_MAX.
static int round(int n) {
  long long base = 1;
  while(base*10 < n)
    base *= 10;

  long long rounded = 10*base;
  if(n < 2*base)
    rounded = 2*base;
  else if(n < 5*base)
    rounded = 5*base;

  if(rounded > INT_MAX)
    return INT_MAX;
  return static_cast<int>(rounded);
}
// Runs benchmark |b| with size argument |siz| and prints one result line:
//   name[/size] <tab> iterations <tab> ns/op [<tab> MB/s]
//
// Only nthread == 1 is supported; any other value returns without running.
//
// The iteration count starts at 1 and is grown until a run lasts at least one
// second or the count reaches one billion.  Each step estimates the count
// needed for one second from the last run, adds 50% headroom, limits growth
// to 100x the previous count, insists on at least one more iteration than
// before, and rounds the result with round().
//
// The growth arithmetic is done in int64 and the count is capped at one
// billion before rounding: with int arithmetic, 100*last overflows once last
// exceeds about 21 million, and a count above one billion would overflow
// inside round().
void RunBench(Benchmark* b, int nthread, int siz) {
  // TODO(rsc): Threaded benchmarks.
  if(nthread != 1)
    return;

  const int64 kTargetNs = 1000000000;  // run for about one second
  const int64 kMaxIters = 1000000000;  // never request more iterations

  // run once in case it's expensive
  int n = 1;
  runN(b, n, siz);
  while(ns < kTargetNs && n < kMaxIters) {
    const int64 last = n;
    const int64 ns_per_op = ns/n;
    int64 next = (ns_per_op == 0) ? kMaxIters : kTargetNs/ns_per_op;
    next = max(last+1, min(next+next/2, 100*last));
    if(next > kMaxIters)
      next = kMaxIters;
    n = round(static_cast<int>(next));
    runN(b, n, siz);
  }

  char mb[100];
  char suf[100];
  mb[0] = '\0';
  suf[0] = '\0';
  // Throughput is printed only when the benchmark reported a byte count.
  if(ns > 0 && bytes > 0)
    snprintf(mb, sizeof mb, "\t%7.2f MB/s", ((double)bytes/1e6)/((double)ns/1e9));
  // Ranged benchmarks get a "/<size>" suffix, scaled to K or M when possible.
  if(b->fnr || b->lo != b->hi) {
    if(siz >= (1<<20))
      snprintf(suf, sizeof suf, "/%dM", siz/(1<<20));
    else if(siz >= (1<<10))
      snprintf(suf, sizeof suf, "/%dK", siz/(1<<10));
    else
      snprintf(suf, sizeof suf, "/%d", siz);
  }
  printf("%s%s\t%8lld\t%10lld ns/op%s\n", b->name, suf, (long long)n, (long long)ns/n, mb);
  fflush(stdout);
}
// Decides whether the benchmark called |name| was selected on the command
// line.  With no arguments besides the program name every benchmark is
// selected; otherwise each argument is used as a regular expression and the
// benchmark is selected if any of them partially matches |name|.
// Returns 1 when selected and 0 otherwise.
static int match(const char* name, int argc, const char** argv) {
  if(argc == 1)
    return 1;
  int arg = 1;
  while(arg < argc) {
    if(RE2::PartialMatch(name, argv[arg]))
      return 1;
    ++arg;
  }
  return 0;
}
int main(int argc, const char** argv) {
for(int i = 0; i < nbenchmarks; i++) {
Benchmark* b = benchmarks[i];
if(match(b->name, argc, argv))
for(int j = b->threadlo; j <= b->threadhi; j++)
for(int k = max(b->lo, 1); k <= max(b->hi, 1); k<<=1)
RunBench(b, j, k);
}
}

View File

@ -0,0 +1,41 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UTIL_BENCHMARK_H__
#define RE2_UTIL_BENCHMARK_H__
namespace testing {

// Description of one benchmark.  Exactly one of fn / fnr is set by the
// constructors; every constructor finishes by calling Register().
struct Benchmark {
  const char* name;       // benchmark name
  void (*fn)(int);        // plain benchmark: fn(iterations)
  void (*fnr)(int, int);  // ranged benchmark: fnr(iterations, size)
  int lo;                 // size range, lower bound
  int hi;                 // size range, upper bound
  int threadlo;           // thread-count range, lower bound
  int threadhi;           // thread-count range, upper bound

  void Register();

  // Plain benchmark taking only an iteration count.
  Benchmark(const char* name, void (*f)(int)) {
    Clear(name);
    this->fn = f;
    Register();
  }

  // Ranged benchmark; l and h become the size range [lo, hi].
  Benchmark(const char* name, void (*f)(int, int), int l, int h) {
    Clear(name);
    this->fnr = f;
    this->lo = l;
    this->hi = h;
    Register();
  }

  // Sets the name to n and zeroes every other field.
  void Clear(const char* n) {
    name = n;
    fn = 0;
    fnr = 0;
    lo = hi = 0;
    threadlo = threadhi = 0;
  }

  // Sets the thread-count range and returns this so the call can be chained.
  Benchmark* ThreadRange(int lo, int hi) {
    this->threadlo = lo;
    this->threadhi = hi;
    return this;
  }
};

}  // namespace testing
// Hooks a benchmark body may call while it runs.
// Reports how many bytes the run processed.
void SetBenchmarkBytesProcessed(long long);
// Pauses the benchmark timer.
void StopBenchmarkTiming();
// Resumes the benchmark timer.
void StartBenchmarkTiming();
// NOTE(review): presumably intended to report memory usage -- confirm against
// the implementation.
void BenchmarkMemoryUsage();
// Reports how many items the run processed.
void SetBenchmarkItemsProcessed(int);
// Returns the number of CPUs to benchmark with.
int NumCPUs();
// BENCHMARK(f) defines a pointer variable named _benchmark_<f> initialised
// with a heap-allocated ::testing::Benchmark whose name is the spelling of f
// and whose function is f.  The expansion has no trailing semicolon and ends
// in a parenthesised expression, so a member call such as
// ->ThreadRange(lo, hi) can be appended before the terminating ';'.
// NOTE(review): names beginning with an underscore are reserved to the
// implementation in the global namespace.
#define BENCHMARK(f) \
::testing::Benchmark* _benchmark_##f = (new ::testing::Benchmark(#f, f))
// BENCHMARK_RANGE(f, lo, hi) is the same for a two-argument benchmark
// function, passing lo and hi to the Benchmark constructor as its size range.
#define BENCHMARK_RANGE(f, lo, hi) \
::testing::Benchmark* _benchmark_##f = \
(new ::testing::Benchmark(#f, f, lo, hi))
#endif // RE2_UTIL_BENCHMARK_H__

27
outside/re2/util/flags.h Normal file
View File

@ -0,0 +1,27 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Simplified version of Google's command line flags.
// Does not support parsing the command line.
// If you want to do that, see
// http://code.google.com/p/google-gflags
#ifndef RE2_UTIL_FLAGS_H__
#define RE2_UTIL_FLAGS_H__
// DEFINE_flag defines the variable re2::FLAGS_<name> of the given type,
// initialised to deflt.  The desc argument does not appear in the expansion.
#define DEFINE_flag(type, name, deflt, desc) \
namespace re2 { type FLAGS_##name = deflt; }
// DECLARE_flag declares re2::FLAGS_<name> as extern, for use in files other
// than the one containing the matching DEFINE_flag.
#define DECLARE_flag(type, name) \
namespace re2 { extern type FLAGS_##name; }
// Per-type conveniences.  The type names bool, int32 and string are used
// unqualified, so they must be visible where the macro is expanded.
#define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc)
#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32, name, deflt, desc)
#define DEFINE_string(name, deflt, desc) DEFINE_flag(string, name, deflt, desc)
#define DECLARE_bool(name) DECLARE_flag(bool, name)
#define DECLARE_int32(name) DECLARE_flag(int32, name)
#define DECLARE_string(name) DECLARE_flag(string, name)
#endif // RE2_UTIL_FLAGS_H__

Some files were not shown because too many files have changed in this diff Show More