From bb6ce3fee4b72fc8ef5d38d92f62244b969b9cf6 Mon Sep 17 00:00:00 2001 From: Gavin Whelan Date: Wed, 9 Apr 2014 11:57:48 -0700 Subject: [PATCH 01/13] Regex! --- Makefile | 22 +- gen164/5/repg.c | 155 ++ gen164/5/rexp.c | 154 ++ gen164/watt.c | 4 + outside/cre2/share/doc/cre2/COPYING | 35 + outside/cre2/share/doc/cre2/LICENSE.re2 | 27 + outside/cre2/share/doc/cre2/README | 175 ++ outside/cre2/share/info/cre2.info | 1489 +++++++++++++++++ outside/cre2/share/info/dir | 18 + outside/cre2/src/.gitignore | 44 + outside/cre2/src/COPYING | 35 + outside/cre2/src/INSTALL | 365 ++++ outside/cre2/src/LICENSE.re2 | 27 + outside/cre2/src/Makefile.am | 48 + outside/cre2/src/README | 175 ++ outside/cre2/src/autogen.sh | 11 + outside/cre2/src/build.sh | 17 + outside/cre2/src/clean.sh | 9 + outside/cre2/src/configure.ac | 58 + outside/cre2/src/configure.sh | 24 + outside/cre2/src/doc/cre2.texi | 1474 ++++++++++++++++ outside/cre2/src/doc/fdl-1.3.texi | 509 ++++++ outside/cre2/src/prepare.sh | 10 + outside/cre2/src/src/cre2.cpp | 605 +++++++ outside/cre2/src/src/cre2.h | 299 ++++ outside/cre2/src/tests/test-consume-match.c | 335 ++++ outside/cre2/src/tests/test-easy-matching.c | 103 ++ .../src/tests/test-find-and-consume-match.c | 335 ++++ outside/cre2/src/tests/test-full-match.c | 308 ++++ outside/cre2/src/tests/test-matching.c | 122 ++ outside/cre2/src/tests/test-misc.c | 119 ++ outside/cre2/src/tests/test-options.c | 43 + outside/cre2/src/tests/test-partial-match.c | 308 ++++ outside/cre2/src/tests/test-replace.c | 257 +++ outside/cre2/src/tests/test-rex-alloc.c | 113 ++ outside/cre2/src/tests/test-version.c | 30 + 36 files changed, 7855 insertions(+), 7 deletions(-) create mode 100644 gen164/5/repg.c create mode 100644 gen164/5/rexp.c create mode 100644 outside/cre2/share/doc/cre2/COPYING create mode 100644 outside/cre2/share/doc/cre2/LICENSE.re2 create mode 100644 outside/cre2/share/doc/cre2/README create mode 100644 outside/cre2/share/info/cre2.info create mode 100644 
outside/cre2/share/info/dir create mode 100644 outside/cre2/src/.gitignore create mode 100644 outside/cre2/src/COPYING create mode 100644 outside/cre2/src/INSTALL create mode 100644 outside/cre2/src/LICENSE.re2 create mode 100644 outside/cre2/src/Makefile.am create mode 100644 outside/cre2/src/README create mode 100644 outside/cre2/src/autogen.sh create mode 100644 outside/cre2/src/build.sh create mode 100644 outside/cre2/src/clean.sh create mode 100644 outside/cre2/src/configure.ac create mode 100644 outside/cre2/src/configure.sh create mode 100644 outside/cre2/src/doc/cre2.texi create mode 100644 outside/cre2/src/doc/fdl-1.3.texi create mode 100644 outside/cre2/src/prepare.sh create mode 100644 outside/cre2/src/src/cre2.cpp create mode 100644 outside/cre2/src/src/cre2.h create mode 100644 outside/cre2/src/tests/test-consume-match.c create mode 100644 outside/cre2/src/tests/test-easy-matching.c create mode 100644 outside/cre2/src/tests/test-find-and-consume-match.c create mode 100644 outside/cre2/src/tests/test-full-match.c create mode 100644 outside/cre2/src/tests/test-matching.c create mode 100644 outside/cre2/src/tests/test-misc.c create mode 100644 outside/cre2/src/tests/test-options.c create mode 100644 outside/cre2/src/tests/test-partial-match.c create mode 100644 outside/cre2/src/tests/test-replace.c create mode 100644 outside/cre2/src/tests/test-rex-alloc.c create mode 100644 outside/cre2/src/tests/test-version.c diff --git a/Makefile b/Makefile index 5b2abc7d5..fa62da733 100644 --- a/Makefile +++ b/Makefile @@ -34,7 +34,7 @@ LIB=$(PWD)/lib RM=rm -f CC=gcc -CLD=gcc -O2 -g -L/usr/local/lib -L/opt/local/lib +CLD=g++ -O2 -g -L/usr/local/lib -L/opt/local/lib YACC=bison -v -b$(GENERATED)/y LEX=lex @@ -43,24 +43,25 @@ ifeq ($(OS),osx) OSLIBS=-framework CoreServices -framework CoreFoundation endif ifeq ($(OS),linux) - OSLIBS=-lpthread -lrt -lcurses + OSLIBS=-lcrypto -lpthread -lrt -lcurses DEFINES=-D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE endif ifeq 
($(OS),bsd) - OSLIBS=-lpthread -lncurses -lkvm + OSLIBS=-lcrypto -lpthread -lncurses -lkvm endif -LIBS=-lssl -lcrypto -lgmp -lncurses -lsigsegv $(OSLIBS) +LIBS=-lgmp -lncurses -lsigsegv $(OSLIBS) -lre2 INCLUDE=include GENERATED=generated MDEFINES=-DU2_OS_$(OS) -DU2_OS_ENDIAN_$(ENDIAN) -D U2_LIB=\"$(LIB)\" -CFLAGS=-O2 -g \ +CFLAGS= -O2 -g \ -I/usr/local/include \ -I/opt/local/include \ -I$(INCLUDE) \ -Ioutside/libuv/include \ + -Ioutside/cre2/include \ -I $(GENERATED) \ $(DEFINES) \ $(MDEFINES) @@ -451,6 +452,8 @@ J164_5_OFILES=\ gen164/5/mat.o \ gen164/5/mink.o \ gen164/5/parse.o \ + gen164/5/repg.o \ + gen164/5/rexp.o \ gen164/5/rub.o \ gen164/5/shax.o \ gen164/5/lore.o \ @@ -549,17 +552,21 @@ VERE_OFILES=\ $(OUT_OFILES) LIBUV=outside/libuv/libuv.a +LIBCRE=outside/cre2/lib/libcre2.a all: $(BIN)/vere $(LIBUV): $(MAKE) -C outside/libuv libuv.a +$(LIBCRE): + cd outside/cre2/src && sh build.sh + $(V_OFILES) f/loom.o f/trac.o: include/v/vere.h -$(BIN)/vere: $(VERE_OFILES) $(LIBUV) $(CAPN) +$(BIN)/vere: $(LIBCRE) $(VERE_OFILES) $(LIBUV) $(CAPN) mkdir -p $(BIN) - $(CLD) $(CLDOSFLAGS) -o $(BIN)/vere $(VERE_OFILES) $(LIBUV) $(CAPN) $(LIBS) + $(CLD) $(CLDOSFLAGS) -o $(BIN)/vere $(VERE_OFILES) $(LIBUV) $(LIBCRE) $(CAPN) $(LIBS) tags: ctags -R -f .tags --exclude=root @@ -570,4 +577,5 @@ etags: clean: $(RM) $(VERE_OFILES) $(BIN)/vere $(BIN)/eyre $(MAKE) -C outside/libuv clean + cd outside/cre2/src && sh clean.sh diff --git a/gen164/5/repg.c b/gen164/5/repg.c new file mode 100644 index 000000000..6982a533b --- /dev/null +++ b/gen164/5/repg.c @@ -0,0 +1,155 @@ +/* j/5/repg.c +** +** This file is in the public domain. 
+*/ +#include "all.h" +#include "../pit.h" +#include "cre2.h" +#include + + u2_noun // produce + j2_mbc(Pt5, repg)(u2_wire wir_r, + u2_noun lub, + u2_noun rad, + u2_noun rep) // retain + { + c3_y* lub_y = u2_cr_tape(lub); + c3_y* rad_y = u2_cr_tape(rad); + c3_y* rep_y = u2_cr_tape(rep); + + + + char* rec = (char*)lub_y; + char* end; + while(*rec != 0) { + if(*rec == '\\') { + rec++; + switch (*rec) { + case 'P': + case 'p': + free(lub_y); + free(rad_y); + return u2_nul; + case 'Q': + end = strstr(rec, "\\E"); + if(end == NULL) rec += strlen(rec) - 1; + else rec = end; + } + rec++; + } + else if(*rec == '(') { + rec++; + if(*rec == '?') { + rec++; + if(*rec != ':') { + free(lub_y); + free(rad_y); + return u2_nul; + } + rec++; + } + } + else + rec++; + } + + fprintf(stderr, "\r\nrepg: \r\n%s : %s\r\n", lub_y, rad_y); + + cre2_regexp_t * rex; + cre2_options_t * opt; + + opt = cre2_opt_new(); + if (opt) { + cre2_opt_set_log_errors(opt, 0); + cre2_opt_set_encoding(opt, CRE2_Latin1); + cre2_opt_set_perl_classes(opt, 1); + cre2_opt_set_one_line(opt, 1); + cre2_opt_set_longest_match(opt, 1); + rex = cre2_new((const char *)lub_y, strlen((char *)lub_y), opt); + if (rex) { + if (!cre2_error_code(rex)) { + int text_len = strlen((char *)rad_y); + cre2_string_t matches[1]; + int ic = 0; + + u2_noun ret = u2_nul; + while (ic <= text_len) { + int match = cre2_match(rex, (const char*)rad_y, text_len, ic, text_len, CRE2_ANCHOR_START, matches, 1); + + if (!match) { + if(rad_y[ic]) + ret = u2_cn_cell((u2_atom)rad_y[ic], ret); + ic++; + } + else { + int mlen = matches[0].length; + if (mlen == 0) { + ret = u2_ckb_weld(u2_ckb_flop(u2_ci_tape((char *) rad_y+ic)), u2_ckb_flop(u2_ci_tape((char *)rep_y))); + ic = text_len + 1; + } + else { + ret = u2_ckb_weld(u2_ckb_flop(u2_ci_tape((char *)rep_y)), ret); + ic += mlen; + } + } + } + cre2_opt_delete(opt); + cre2_delete(rex); + free(lub_y); + free(rad_y); + free(rep_y); + return u2_cn_cell(u2_nul, u2_ckb_flop(ret)); + } + else { + // Compiling 
the regular expression failed + cre2_opt_delete(opt); + cre2_delete(rex); + free(lub_y); + free(rad_y); + return u2_nul; + } + cre2_opt_delete(opt); + cre2_delete(rex); + } + else { + // rex Allocation Error + cre2_opt_delete(opt); + free(lub_y); + free(rad_y); + u2_bl_bail(wir_r, c3__exit); + } + cre2_opt_delete(opt); + } + // opt Allocation Error + free(lub_y); + free(rad_y); + u2_bl_bail(wir_r, c3__exit); + return u2_nul; + } + + u2_weak // produce + j2_mb(Pt5, repg)(u2_wire wir_r, + u2_noun cor) // retain + { + u2_noun lub; + u2_noun rad; + u2_noun rep; + + if ( (u2_none == (lub = u2_frag(u2_cv_sam_2, cor))) || + (u2_none == (rad = u2_frag(u2_cv_sam_6, cor))) || + (u2_none == (rep = u2_frag(u2_cv_sam_7, cor))) ) + { + return u2_bl_bail(wir_r, c3__fail); + } else { + return j2_mbc(Pt5, repg)(wir_r, lub, rad, rep); + } + } + + +/* structures +*/ + u2_ho_jet + j2_mbj(Pt5, repg)[] = { + { ".2", c3__lite, j2_mb(Pt5, repg), u2_jet_live | u2_jet_test, u2_none, u2_none }, + { } + }; diff --git a/gen164/5/rexp.c b/gen164/5/rexp.c new file mode 100644 index 000000000..d2363e699 --- /dev/null +++ b/gen164/5/rexp.c @@ -0,0 +1,154 @@ +/* j/5/rexp.c +** +** This file is in the public domain. 
+*/ +#include "all.h" +#include "../pit.h" +#include "cre2.h" +#include + + u2_noun // produce + j2_mbc(Pt5, rexp)(u2_wire wir_r, + u2_noun lub, + u2_noun rad) // retain + { + c3_y* lub_y = u2_cr_tape(lub); + c3_y* rad_y = u2_cr_tape(rad); + + u2k(lub); + int lub_l = u2_ckb_lent(lub); + if (lub_l != strlen(lub_y)) { + free(lub_y); + free(rad_y); + return u2_nul; + } + + char* rec = (char*)lub_y; + char* end; + while(*rec != 0) { + if(*rec > 127) { + free(lub_y); + free(rad_y); + return u2_nul; + } + else if(*rec == '\\') { + rec++; + switch (*rec) { + case 'P': + case 'p': + free(lub_y); + free(rad_y); + return u2_nul; + case 'Q': + end = strstr(rec, "\\E"); + if(end == NULL) rec += strlen(rec) - 1; + else rec = end; + } + } + else if(*rec == '(') { + rec++; + if(*rec == '?') { + rec++; + if(*rec != ':') { + free(lub_y); + free(rad_y); + return u2_nul; + } + rec++; + } + } + else + rec++; + } + + fprintf(stderr, "\r\n%s : %s\r\n", lub_y, rad_y); + + cre2_regexp_t * rex; + cre2_options_t * opt; + + opt = cre2_opt_new(); + if (opt) { + cre2_opt_set_log_errors(opt, 0); + cre2_opt_set_encoding(opt, CRE2_UTF8); + cre2_opt_set_perl_classes(opt, 1); + cre2_opt_set_one_line(opt, 1); + cre2_opt_set_longest_match(opt, 1); + rex = cre2_new((const char *)lub_y, strlen((char *)lub_y), opt); + if (rex) { + if (!cre2_error_code(rex)) { + int text_len = strlen((char *)rad_y); + int captures = cre2_num_capturing_groups(rex); + cre2_string_t matches[captures+1]; + + int match = cre2_match(rex, (const char*)rad_y, text_len, 0, text_len, CRE2_UNANCHORED, matches, captures+1); + + if (!match) { + // No matches + cre2_opt_delete(opt); + cre2_delete(rex); + free(lub_y); + free(rad_y); + return u2_cn_cell(u2_nul, u2_nul); + } + + u2_noun map = u2_nul; + + int i; + for (i = 0; i < captures+1; i++) { + char * buf = malloc(matches[i].length + 1); + memcpy(buf, matches[i].data, matches[i].length); + buf[matches[i].length] = 0; + fprintf(stderr, "%d: %s\r\n", i, buf); + map = 
u2_ckd_by_put(map, i, u2_ci_tape(buf)); + free(buf); + } + + cre2_opt_delete(opt); + cre2_delete(rex); + free(lub_y); + free(rad_y); + return u2_cn_cell(u2_nul, u2_cn_cell(u2_nul, map)); + + } + else { + // Compiling the regular expression failed + cre2_opt_delete(opt); + cre2_delete(rex); + free(lub_y); + free(rad_y); + return u2_nul; + } + cre2_delete(rex); + } + cre2_opt_delete(opt); + } + free(lub_y); + free(rad_y); + u2_bl_bail(wir_r, c3__exit); + return u2_nul; + } + + u2_weak // produce + j2_mb(Pt5, rexp)(u2_wire wir_r, + u2_noun cor) // retain + { + u2_noun lub; + u2_noun rad; + + if ( (u2_none == (lub = u2_frag(u2_cv_sam_2, cor))) || + (u2_none == (rad = u2_frag(u2_cv_sam_3, cor))) ) + { + return u2_bl_bail(wir_r, c3__fail); + } else { + return j2_mbc(Pt5, rexp)(wir_r, lub, rad); + } + } + + +/* structures +*/ + u2_ho_jet + j2_mbj(Pt5, rexp)[] = { + { ".2", c3__lite, j2_mb(Pt5, rexp), u2_jet_live | u2_jet_test, u2_none, u2_none }, + { } + }; diff --git a/gen164/watt.c b/gen164/watt.c index 9328eab95..407e514e9 100644 --- a/gen164/watt.c +++ b/gen164/watt.c @@ -87,6 +87,8 @@ extern u2_ho_jet j2_mbj(Pt5, pfix)[]; extern u2_ho_jet j2_mbj(Pt5, plug)[]; extern u2_ho_jet j2_mbj(Pt5, pose)[]; + extern u2_ho_jet j2_mbj(Pt5, repg)[]; + extern u2_ho_jet j2_mbj(Pt5, rexp)[]; extern u2_ho_jet j2_mbj(Pt5, rub)[]; extern u2_ho_jet j2_mbj(Pt5, sfix)[]; extern u2_ho_jet j2_mbj(Pt5, shax)[]; @@ -229,6 +231,8 @@ { j2_sb(Pt5, pfix), j2_mbj(Pt5, pfix), 0, 0, u2_none }, { j2_sb(Pt5, plug), j2_mbj(Pt5, plug), 0, 0, u2_none }, { j2_sb(Pt5, pose), j2_mbj(Pt5, pose), 0, 0, u2_none }, + { j2_sb(Pt5, repg), j2_mbj(Pt5, repg), 0, 0, u2_none }, + { j2_sb(Pt5, rexp), j2_mbj(Pt5, rexp), 0, 0, u2_none }, { j2_sb(Pt5, rub), j2_mbj(Pt5, rub), 0, 0, u2_none }, { j2_sb(Pt5, sfix), j2_mbj(Pt5, sfix), 0, 0, u2_none }, { j2_sb(Pt5, shax), j2_mbj(Pt5, shax), 0, 0, u2_none }, diff --git a/outside/cre2/share/doc/cre2/COPYING b/outside/cre2/share/doc/cre2/COPYING new file mode 100644 index 
000000000..672d3b902 --- /dev/null +++ b/outside/cre2/share/doc/cre2/COPYING @@ -0,0 +1,35 @@ +Copyright (c) 2012 Marco Maggi +Copyright (c) 2011 Keegan McAllister +All rights reserved. + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the +following conditions are met: + +1. Redistributions of source code must retain the above + copyright notice, this list of conditions and the + following disclaimer. + +2. Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the + following disclaimer in the documentation and/or other + materials provided with the distribution. + +3. Neither the name of the author nor the names of his + contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/outside/cre2/share/doc/cre2/LICENSE.re2 b/outside/cre2/share/doc/cre2/LICENSE.re2 new file mode 100644 index 000000000..09e5ec1c7 --- /dev/null +++ b/outside/cre2/share/doc/cre2/LICENSE.re2 @@ -0,0 +1,27 @@ +// Copyright (c) 2009 The RE2 Authors. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/outside/cre2/share/doc/cre2/README b/outside/cre2/share/doc/cre2/README new file mode 100644 index 000000000..d1d254314 --- /dev/null +++ b/outside/cre2/share/doc/cre2/README @@ -0,0 +1,175 @@ + + C wrapper for re2 + ================= + + +Topics +------ + + 1. Introduction + 2. License + 3. Install + 4. Usage + A. Credits + B. Bugs + C. Resources + + +1. 
Introduction +--------------- + +The CRE2 distribution is a C language wrapper for the RE2 +library, which is implemented in C++. RE2 is a fast, safe, +thread-friendly alternative to backtracking regular +expression engines like those used in PCRE, Perl, and +Python. + + This distribution makes use of the GNU Autotools. + + +2. License +---------- + +Copyright (c) 2012, 2013 Marco Maggi +Copyright (c) 2011 Keegan McAllister +All rights reserved. + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the +following conditions are met: + +1. Redistributions of source code must retain the above + copyright notice, this list of conditions and the + following disclaimer. + +2. Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the + following disclaimer in the documentation and/or other + materials provided with the distribution. + +3. Neither the name of the author nor the names of his + contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +3. 
Install +---------- + +To install RE2 follow the instructions in the README file of +the RE2 distribution. To install CRE2 from a proper release +tarball, do this: + + $ cd cre2-0.1.0 + $ mkdir "=build" + $ cd "=build" + +to inspect the available configuration options: + + $ ../configure --help + +then do it: + + $ ../configure [options] + $ make + $ make check + $ make install + + From a repository checkout or snapshot (the ones from the +Github site): we may need to manually run "libtoolize" the +first time, then we must first run the script "autogen.sh" +from the top source directory, to generate the needed files: + + $ sh autogen.sh + +for this we need to have installed the GNU Autotools: +Automake, Autoconf, Libtool. After this: + + $ ../configure --enable-maintainer-mode [options] + $ make + $ make check + $ make install + + The Makefile supports the DESTDIR environment variable to +install files in a temporary location; for example, to see +what will happen: + + $ make -n install DESTDIR=/tmp/marco/cre2 + +to really do it: + + $ make install DESTDIR=/tmp/marco/cre2 + + +4. Usage +-------- + +Read the documentation. + + +A. Credits +---------- + +RE2 is a Google project. CRE2 is based on code by Keegan +McAllister. This distribution was assembled by Marco Maggi. + + +B. Bugs +------- + +Bug reports are appreciated. Register issues at the CRE2 +issue tracker: + + + + +C. 
Resources +------------ + +The GNU Project software can be found here: + + + +RE2 is available at: + + + +development of this package happens at: + + + +and as backup at: + + + +proper release tarballs for this package are in the download +area at: + + + +the documentation is available online: + + + + +### end of file +# Local Variables: +# mode: text +# coding: utf-8-unix +# fill-column: 60 +# paragraph-start: "*" +# End: diff --git a/outside/cre2/share/info/cre2.info b/outside/cre2/share/info/cre2.info new file mode 100644 index 000000000..b0ba9e3ef --- /dev/null +++ b/outside/cre2/share/info/cre2.info @@ -0,0 +1,1489 @@ +This is cre2.info, produced by makeinfo version 5.2 from cre2.texi. + +This document describes version 0.1b5 of CRE2, a C language wrapper for +the C++ library RE2: a fast, safe, thread-friendly alternative to +backtracking regular expression engines like those used in PCRE, Perl, +and Python. + + The package is distributed under the terms of a BSD-like license and +can be downloaded from: + + + +development takes place at: + + + +and as a backup at: + + + +Copyright (C) 2012 by Marco Maggi +Copyright (C) 2011 by Keegan McAllister + + Portions of this document come from the source code of RE2 itself, +see the file 'LICENSE.re2' for the license notice. + + Permission is granted to copy, distribute and/or modify this + document under the terms of the GNU Free Documentation License, + Version 1.3 or any later version published by the Free Software + Foundation; with Invariant Sections being "GNU Free Documentation + License" and "GNU General Public License", no Front-Cover Texts, + and no Back-Cover Texts. A copy of the license is included in the + section entitled "GNU Free Documentation License". +INFO-DIR-SECTION Development +START-INFO-DIR-ENTRY +* cre2: (cre2). C wrapper for RE2. 
+END-INFO-DIR-ENTRY + + +File: cre2.info, Node: Top, Next: overview, Up: (dir) + +C wrapper for RE2 +***************** + +This document describes version 0.1b5 of CRE2, a C language wrapper for +the C++ library RE2: a fast, safe, thread-friendly alternative to +backtracking regular expression engines like those used in PCRE, Perl, +and Python. + + The package is distributed under the terms of a BSD-like license and +can be downloaded from: + + + +development takes place at: + + + +and as a backup at: + + + +Copyright (C) 2012 by Marco Maggi +Copyright (C) 2011 by Keegan McAllister + + Portions of this document come from the source code of RE2 itself, +see the file 'LICENSE.re2' for the license notice. + + Permission is granted to copy, distribute and/or modify this + document under the terms of the GNU Free Documentation License, + Version 1.3 or any later version published by the Free Software + Foundation; with Invariant Sections being "GNU Free Documentation + License" and "GNU General Public License", no Front-Cover Texts, + and no Back-Cover Texts. A copy of the license is included in the + section entitled "GNU Free Documentation License". + +* Menu: + +* overview:: Overview of the package. +* version:: Version functions. +* regexps:: Precompiled regular expressions + construction. +* options:: Matching configuration. +* matching:: Matching regular expressions. +* other:: Other matching functions. +* tips:: Tips for using the regexp syntax. + +Appendices + +* Package License:: Package license. +* Documentation License:: GNU Free Documentation License. +* references:: Bibliography and references. + +Indexes + +* concept index:: An entry for each concept. +* function index:: An entry for each function. +* variable index:: An entry for each variable. +* type index:: An entry for each type. 
+ + +File: cre2.info, Node: overview, Next: version, Prev: Top, Up: Top + +1 Overview of the package +************************* + +CRE2 is a C language wrapper for the C++ library RE2: a fast, safe, +thread-friendly alternative to backtracking regular expression engines +like those used in PCRE, Perl, and Python. CRE2 is based on code by +Keegan McAllister for the 'haskell-re2' binding: + + + + For the supported regular expressions syntax we should refer to the +original documentation: + + + + The C wrapper is meant to make it easier to interface RE2 with other +languages. The exposed API allows searching for substrings of text +matching regular expressions and reporting portions of text matching +parenthetical subexpressions. + + CRE2 installs the single header file 'cre2.h'. All the function +names in the API are prefixed with 'cre2_'; all the constant names are +prefixed with 'CRE2_'; all the type names are prefixed with 'cre2_' and +suffixed with '_t'. + + When searching for the installed libraries with the GNU Autotools, we +can use the following macros in 'configure.ac': + + AC_CHECK_LIB([re2],[main],, + [AC_MSG_FAILURE([test for RE2 library failed])]) + + AC_CHECK_LIB([cre2],[cre2_version_string],, + [AC_MSG_FAILURE([test for CRE2 library failed])]) + AC_CHECK_HEADERS([cre2.h],, + [AC_MSG_ERROR([test for RE2 header failed])]) + +notice that there is no need to check for the header file 're2/re2.h'. + + It is customary for regular expression engines to provide methods to +replace backslash sequences like '\1', '\2', ... in a given string with +portions of text that matched the first, second, ... parenthetical +subexpression; CRE2 does *not* provide such methods in its public API, +because they require interacting with the storage mechanism in the +client code. However, it is not difficult to implement such +substitutions given the results of a regular expression matching +operation. 
+ + Some functions and methods from RE2 requiring memory allocation +handling are unofficially wrapped by CRE2 with unsafe code (execution +will succeed when no memory allocation errors happen). These +"problematic" functions are documented in the header file 'cre2.h' and, +at present, are not considered part of the public API of CRE2. + + It is sometimes useful to try a program in the original C++ to verify +if a problem is caused by CRE2 or is in the original RE2 code; we may +want to start by customising this program: + + /* compile and run with: + + $ g++ -Wall -o proof proof.cpp -lre2 && ./proof + */ + + #include <re2/re2.h> + #include <assert.h> + + static void try_match (RE2::Options& opt, const char * text); + + int + main (int argc, const char *const argv[]) + { + RE2::Options opt; + opt.set_never_nl(true); + try_match(opt, "abcdef"); + return 0; + } + void + try_match (RE2::Options& opt, const char * text) + { + RE2 re("abcdef", opt); + assert(re.ok()); + assert(RE2::FullMatch(text, re)); + //assert(RE2::PartialMatch(text, re)); + } + + +File: cre2.info, Node: version, Next: regexps, Prev: overview, Up: Top + +2 Version functions +******************* + +The installed libraries follow version numbering as established by the +GNU Autotools. For an explanation of interface numbers as managed by +GNU Libtool *Note interface: (libtool)Libtool versioning. + + -- Function: const char * cre2_version_string (void) + Return a pointer to a statically allocated ASCIIZ string + representing the interface version number. + + -- Function: int cre2_version_interface_current (void) + Return an integer representing the library interface current + number. + + -- Function: int cre2_version_interface_revision (void) + Return an integer representing the library interface current + revision number. + + -- Function: int cre2_version_interface_age (void) + Return an integer representing the library interface current age. 
+ + +File: cre2.info, Node: regexps, Next: options, Prev: version, Up: Top + +3 Precompiled regular expressions construction +********************************************** + +Regular expression objects are built and finalised as follows: + + cre2_regexp_t * rex; + cre2_options_t * opt; + + opt = cre2_opt_new(); + if (opt) { + cre2_opt_set_log_errors(opt, 0); + rex = cre2_new("ciao", 4, opt); + if (rex) { + if (!cre2_error_code(rex)) + /* successfully built */ + else + /* an error occurred while compiling rex */ + cre2_delete(rex); + } else { + /* rex memory allocation error */ + } + cre2_opt_delete(opt); + } else { + /* opt memory allocation error */ + } + + -- Opaque Type: cre2_regexp_t + Opaque type for regular expression objects; it is meant to be used + to declare pointers to objects. Instances of this type can be used + for any number of matching operations and are safe for concurrent + use by multiple threads. + + -- Struct Typedef: cre2_string_t + Simple data structure used to reference a portion of another + string. It has the following fields: + + 'const char * data' + Pointer to the first byte in the referenced substring. + + 'int length' + The number of bytes in the referenced substring. + + -- Enumeration Typedef: cre2_error_code_t + Enumeration type for error codes returned by 'cre2_error_code()'. + It contains the following symbols: + + 'CRE2_NO_ERROR' + Defined as '0', represents a successful operation. + + 'CRE2_ERROR_INTERNAL' + Unexpected error. + + 'CRE2_ERROR_BAD_ESCAPE' + Bad escape sequence. + + 'CRE2_ERROR_BAD_CHAR_CLASS' + Bad character class. + + 'CRE2_ERROR_BAD_CHAR_RANGE' + Bad character class range. + + 'CRE2_ERROR_MISSING_BRACKET' + Missing closing ']'. + + 'CRE2_ERROR_MISSING_PAREN' + Missing closing ')'. + + 'CRE2_ERROR_TRAILING_BACKSLASH' + Trailing '\' at end of regexp. + + 'CRE2_ERROR_REPEAT_ARGUMENT' + Repeat argument missing, e.g. '*'. + + 'CRE2_ERROR_REPEAT_SIZE' + Bad repetition argument. 
+ + 'CRE2_ERROR_REPEAT_OP' + Bad repetition operator. + + 'CRE2_ERROR_BAD_PERL_OP' + Bad Perl operator. + + 'CRE2_ERROR_BAD_UTF8' + Invalid UTF-8 in regexp. + + 'CRE2_ERROR_BAD_NAMED_CAPTURE' + Bad named capture group. + + 'CRE2_ERROR_PATTERN_TOO_LARGE' + Pattern too large (compile failed). + + -- Function: cre2_regexp_t * cre2_new (const char * PATTERN, int + PATTERN_LEN, const cre2_options_t * OPT) + Build and return a new regular expression object representing the + PATTERN of length PATTERN_LEN bytes; the object is configured with + the options in OPT. If memory allocation fails: the return value + is a 'NULL' pointer. + + The options object OPT is duplicated in the internal state of the + regular expression instance, so OPT can be safely mutated or + finalised after this call. If OPT is 'NULL': the regular + expression object is built with the default set of options. + + -- Function: void cre2_delete (cre2_regexp_t * REX) + Finalise a regular expression object releasing all the associated + resources. + + -- Function: const char * cre2_pattern (const cre2_regexp_t * REX) + Whether REX is a successfully built regular expression object or + not: return a pointer to the pattern string. The returned pointer + is valid only while REX is alive: if 'cre2_delete()' is applied to + REX the pointer becomes invalid. + + -- Function: int cre2_num_capturing_groups (const cre2_regexp_t * REX) + If REX is a successfully built regular expression object: return a + non-negative integer representing the number of capturing groups + (parenthetical subexpressions) in the pattern. If an error + occurred while building REX: return '-1'. + + -- Function: int cre2_program_size (const cre2_regexp_t * REX) + If REX is a successfully built regular expression object: return a + non-negative integer representing the program size, a very + approximate measure of a regexp's "cost"; larger numbers are more + expensive than smaller numbers. 
If an error occurred while + building REX: return '-1'. + + -- Function: int cre2_error_code (const cre2_regexp_t * REX) + In case an error occurred while building REX: return an integer + representing the associated error code. Return zero if no error + occurred. + + -- Function: const char * cre2_error_string (const cre2_regexp_t * REX) + If an error occurred while building REX: return a pointer to an + ASCIIZ string representing the associated error message. The + returned pointer is valid only while REX is alive: if + 'cre2_delete()' is applied to REX the pointer becomes invalid. + + If REX is a successfully built regular expression object: return a + pointer to an empty string. + + The following code: + + cre2_regexp_t * rex; + + rex = cre2_new("ci(ao", 5, NULL); + { + printf("error: code=%d, msg=\"%s\"\n", + cre2_error_code(rex), + cre2_error_string(rex)); + } + cre2_delete(rex); + + prints: + + error: code=6, msg="missing ): ci(ao" + + -- Function: void cre2_error_arg (const cre2_regexp_t * REX, + cre2_string_t * ARG) + If an error occurred while building REX: fill the structure + referenced by ARG with the interval of bytes representing the + offending portion of the pattern. + + If REX is a successfully built regular expression object: ARG + references an empty string. + + The following code: + + cre2_regexp_t * rex; + cre2_string_t S; + + rex = cre2_new("ci(ao", 5, NULL); + { + cre2_error_arg(rex, &S); + printf("arg: len=%d, data=\"%s\"\n", S.length, S.data); + } + cre2_delete(rex); + + prints: + + arg: len=5, data="ci(ao" + + +File: cre2.info, Node: options, Next: matching, Prev: regexps, Up: Top + +4 Matching configuration +************************ + +Compiled regular expressions can be configured, at construction-time, +with a number of options collected in a 'cre2_options_t' object. 
Notice +that, by default, when attempting to compile an invalid regular +expression pattern, RE2 will print to 'stderr' an error message; usually +we want to avoid this logging by disabling the associated option: + + cre2_options_t * opt; + + opt = cre2_opt_new(); + cre2_opt_set_log_errors(opt, 0); + + -- Opaque Typedef: cre2_options_t + Type of opaque pointers to options objects. Any instance of this + type can be used to configure any number of regular expression + objects. + + -- Enumeration Typedef: cre2_encoding_t + Enumeration type for constants selecting encoding. It contains the + following values: + + CRE2_UNKNOWN + CRE2_UTF8 + CRE2_Latin1 + + The value 'CRE2_UNKNOWN' should never be used: it exists only in + case there is a mismatch between the definitions of RE2 and CRE2. + + -- Function: cre2_options_t * cre2_opt_new (void) + Allocate and return a new options object. If memory allocation + fails: the return value is a 'NULL' pointer. + + -- Function: void cre2_opt_delete (cre2_options_t * OPT) + Finalise an options object releasing all the associated resources. + Compiled regular expressions configured with this object are *not* + affected by its destruction. + + All the following functions are getters and setters for regular +expression options; the FLAG argument to the setter must be false to +disable the option and true to enable it; unless otherwise specified the +'int' return value is true if the option is enabled and false if it is +disabled. + + -- Function: cre2_encoding_t cre2_opt_encoding (cre2_options_t * OPT) + -- Function: void cre2_opt_set_encoding (cre2_options_t * OPT, + cre2_encoding_t ENC) + By default, the regular expression pattern and input text are + interpreted as UTF-8. CRE2_Latin1 encoding causes them to be + interpreted as Latin-1. + + The getter returns 'CRE2_UNKNOWN' if the encoding value returned by + RE2 is unknown. 
+ + -- Function: int cre2_opt_posix_syntax (cre2_options_t * OPT) + -- Function: void cre2_opt_set_posix_syntax (cre2_options_t * OPT, int + FLAG) + Restrict regexps to POSIX egrep syntax. Default is disabled. + + -- Function: int cre2_opt_longest_match (cre2_options_t * OPT) + -- Function: void cre2_opt_set_longest_match (cre2_options_t * OPT, int + FLAG) + Search for longest match, not first match. Default is disabled. + + -- Function: int cre2_opt_log_errors (cre2_options_t * OPT) + -- Function: void cre2_opt_set_log_errors (cre2_options_t * OPT, int + FLAG) + Log syntax and execution errors to 'stderr'. Default is enabled. + + -- Function: int cre2_opt_literal (cre2_options_t * OPT) + -- Function: void cre2_opt_set_literal (cre2_options_t * OPT, int FLAG) + Interpret the pattern string as literal, not as regular expression. + Default is disabled. + + Setting this option is equivalent to quoting all the special + characters defining a regular expression pattern: + + cre2_regexp_t * rex; + cre2_options_t * opt; + const char * pattern = "(ciao) (hello)"; + const char * text = pattern; + int len = strlen(pattern); + + opt = cre2_opt_new(); + cre2_opt_set_literal(opt, 1); + rex = cre2_new(pattern, len, opt); + { + /* successful match */ + cre2_match(rex, text, len, 0, len, + CRE2_UNANCHORED, NULL, 0); + } + cre2_delete(rex); + cre2_opt_delete(opt); + + -- Function: int cre2_opt_never_nl (cre2_options_t * OPT) + -- Function: void cre2_opt_set_never_nl (cre2_options_t * OPT, int + FLAG) + Never match a newline character, even if it is in the regular + expression pattern; default is disabled. Turning on this option + allows us to attempt a partial match, against the beginning of a + multiline text, without using subpatterns to exclude the newline in + the regexp pattern. + + * When set to true: matching always fails if the text or the + regexp contains a newline. + + * When set to false: matching succeeds or fails taking normal + account of newlines. 
+ + * The option does *not* cause newlines to be skipped. + + -- Function: int cre2_opt_case_sensitive (cre2_options_t * OPT) + -- Function: void cre2_opt_set_case_sensitive (cre2_options_t * OPT, + int FLAG) + Match is case-sensitive; the regular expression pattern can + override this setting with '(?i)' unless configured in POSIX syntax + mode. Default is enabled. + + -- Function: int cre2_opt_max_mem (cre2_options_t * OPT) + -- Function: void cre2_opt_set_max_mem (cre2_options_t * OPT, int M) + The max memory option controls how much memory can be used to hold + the compiled form of the regular expression and its cached DFA + graphs. These functions set and get such amount of memory. See + the documentation of RE2 for details. + + The following options are only consulted when POSIX syntax is +enabled; when POSIX syntax is disabled: these features are always +enabled and cannot be turned off. + + -- Function: int cre2_opt_perl_classes (cre2_options_t * OPT) + -- Function: void cre2_opt_set_perl_classes (cre2_options_t * OPT, int + FLAG) + Allow Perl's '\d', '\s', '\w', '\D', '\S', '\W'. Default is + disabled. + + -- Function: int cre2_opt_word_boundary (cre2_options_t * OPT) + -- Function: void cre2_opt_set_word_boundary (cre2_options_t * OPT, int + FLAG) + Allow Perl's '\b', '\B' (word boundary and not). Default is + disabled. + + -- Function: int cre2_opt_one_line (cre2_options_t * OPT) + -- Function: void cre2_opt_set_one_line (cre2_options_t * OPT, int + FLAG) + The patterns '^' and '$' only match at the beginning and end of the + text. Default is disabled. 
+ + +File: cre2.info, Node: matching, Next: other, Prev: options, Up: Top + +5 Matching regular expressions +****************************** + +Basic pattern matching goes as follows (with error checking omitted): + + cre2_regexp_t * rex; + cre2_options_t * opt; + const char * pattern = "(ciao) (hello)"; + + opt = cre2_opt_new(); + cre2_opt_set_posix_syntax(opt, 1); + + rex = cre2_new(pattern, strlen(pattern), opt); + { + const char * text = "ciao hello"; + int text_len = strlen(text); + int nmatch = 3; + cre2_string_t match[nmatch]; + + cre2_match(rex, text, text_len, 0, text_len, CRE2_UNANCHORED, + match, nmatch); + + /* prints: full match: ciao hello */ + printf("full match: "); + fwrite(match[0].data, match[0].length, 1, stdout); + printf("\n"); + + /* prints: first group: ciao */ + printf("first group: "); + fwrite(match[1].data, match[1].length, 1, stdout); + printf("\n"); + + /* prints: second group: hello */ + printf("second group: "); + fwrite(match[2].data, match[2].length, 1, stdout); + printf("\n"); + } + cre2_delete(rex); + cre2_opt_delete(opt); + + -- Enumeration Typedef: cre2_anchor_t + Enumeration type for the anchor point of matching operations. It + contains the following constants: + + CRE2_UNANCHORED + CRE2_ANCHOR_START + CRE2_ANCHOR_BOTH + + -- Function: int cre2_match (const cre2_regexp_t * REX, const char * + TEXT, int TEXT_LEN, int START_POS, int END_POS, cre2_anchor_t + ANCHOR, cre2_string_t * MATCH, int NMATCH) + Match a substring of the text referenced by TEXT and holding + TEXT_LEN bytes against the regular expression object REX. Return + true if the text matched, false otherwise. + + The zero-based indices START_POS (inclusive) and END_POS + (exclusive) select the substring of TEXT to be examined. ANCHOR + selects the anchor point for the matching operation. + + Data about the matching groups is stored in the array MATCH, which + must have at least NMATCH entries; the referenced substrings are + portions of the TEXT buffer. 
If we are only interested in + verifying if the text matches or not (ignoring the matching + portions of text): we can use 'NULL' as MATCH argument and 0 as + NMATCH argument. + + The first element of MATCH (index 0) references the full portion of + the substring of TEXT matching the pattern; the second element of + MATCH (index 1) references the portion of text matching the first + parenthetical subexpression, the third element of MATCH (index 2) + references the portion of text matching the second parenthetical + subexpression; and so on. + + -- Function: int cre2_easy_match (const char * PATTERN, int + PATTERN_LEN, const char * TEXT, int TEXT_LEN, cre2_string_t * + MATCH, int NMATCH) + Like 'cre2_match()' but the pattern is specified as string PATTERN + holding PATTERN_LEN bytes. Also the text is fully matched without + anchoring. + + If the text matches the pattern: the return value is 1. If the + text does not match the pattern: the return value is 0. If the + pattern is invalid: the return value is 2. + + -- Struct Typedef: cre2_range_t + Structure type used to represent a substring of the text to be + matched as starting and ending indices. It has the following + fields: + + 'long start' + Inclusive start byte index. + + 'long past' + Exclusive end byte index. + + -- Function: void cre2_strings_to_ranges (const char * TEXT, + cre2_range_t * RANGES, cre2_string_t * STRINGS, int NMATCH) + Given an array of STRINGS with NMATCH elements being the result of + matching TEXT against a regular expression: fill the array of + RANGES with the index intervals in the TEXT buffer representing the + same results. + + +File: cre2.info, Node: other, Next: tips, Prev: matching, Up: Top + +6 Other matching functions +************************** + +The following functions match a buffer of text against a regular +expression, allowing the extraction of portions of text matching +parenthetical subexpressions. 
All of them show the following behaviour: + + * If the text matches the pattern: the return value is 1; if the text + does not match the pattern: the return value is 0. + + * If the pattern is invalid: the return value is 0; there is no way + to distinguish this case from the case of text not matching other + than looking at what RE2 prints to 'stderr'. + + * It is impossible to turn off logging of error messages to 'stderr' + when the specification of the regular expression is invalid. + + * Data about the matching groups is stored in the array MATCH, which + must have at least NMATCH slots; the referenced substrings are + portions of the TEXT buffer. + + * The array MATCH can have a number of slots between zero (included) + and the number of parenthetical subexpressions in PATTERN + (excluded); if NMATCH is greater than the number of parenthetical + subexpressions: the return value is 0. + + * If we are only interested in verifying if the text matches the + pattern or not: we can use 'NULL' as MATCH argument and 0 as NMATCH + argument. + + * The first slot of MATCH (index 0) references the portion of text + matching the first parenthetical subexpression; the second slot of + MATCH (index 1) references the portion of text matching the second + parenthetical subexpression; and so on. + +see the documentation of each function for the differences. 
+ + The following example is a successful match: + + const char * pattern = "ci.*ut"; + const char * text = "ciao salut"; + cre2_string_t input = { + .data = text, + .length = strlen(text) + }; + int result; + result = cre2_full_match(pattern, &input, NULL, 0); + + result => 1 + +the following example is a successful match in which the parenthetical +subexpression is ignored: + + const char * pattern = "(ciao) salut"; + const char * text = "ciao salut"; + cre2_string_t input = { + .data = text, + .length = strlen(text) + }; + int result; + result = cre2_full_match(pattern, &input, NULL, 0); + + result => 1 + +the following example is a successful match in which the portion of text +matching the parenthetical subexpression is reported: + + const char * pattern = "(ciao) salut"; + const char * text = "ciao salut"; + cre2_string_t input = { + .data = text, + .length = strlen(text) + }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_full_match(pattern, &input, match, nmatch); + + result => 1 + strncmp(text, input.data, input.length) => 0 + strncmp("ciao", match[0].data, match[0].length) => 0 + + -- Function: int cre2_full_match (const char * PATTERN, const + cre2_string_t * TEXT, cre2_string_t * MATCH, int NMATCH) + -- Function: int cre2_full_match_re (cre2_regexp_t * REX, const + cre2_string_t * TEXT, cre2_string_t * MATCH, int NMATCH) + Match the zero-terminated string PATTERN or the precompiled regular + expression REX against the full buffer TEXT. + + For example: the text 'abcdef' matches the pattern 'abcdef' + according to this function, but neither the pattern 'abc' nor the + pattern 'def' will match. 
+ + -- Function: int cre2_partial_match (const char * PATTERN, const + cre2_string_t * TEXT, cre2_string_t * MATCH, int NMATCH) + -- Function: int cre2_partial_match_re (cre2_regexp_t * REX, const + cre2_string_t * TEXT, cre2_string_t * MATCH, int NMATCH) + Match the zero-terminated string PATTERN or the precompiled regular + expression REX against the buffer TEXT, resulting in success if a + substring of TEXT matches; these functions behave like the full + match ones, but the matching text does not need to be anchored to + the beginning and end. + + For example: the text 'abcDEFghi' matches the pattern 'DEF' + according to this function. + + -- Function: int cre2_consume (const char * PATTERN, cre2_string_t * + TEXT, cre2_string_t * MATCH, int NMATCH) + -- Function: int cre2_consume_re (cre2_regexp_t * REX, cre2_string_t * + TEXT, cre2_string_t * MATCH, int NMATCH) + Match the zero-terminated string PATTERN or the precompiled regular + expression REX against the buffer TEXT, resulting in success if the + prefix of TEXT matches. The data structure referenced by TEXT is + mutated to reference text right after the last byte that matched + the pattern. + + For example: the text 'abcDEF' matches the pattern 'abc' according + to this function; after the call TEXT will reference the text + 'DEF'. + + -- Function: int cre2_find_and_consume (const char * PATTERN, + cre2_string_t * TEXT, cre2_string_t * MATCH, int NMATCH) + -- Function: int cre2_find_and_consume_re (cre2_regexp_t * REX, + cre2_string_t * TEXT, cre2_string_t * MATCH, int NMATCH) + Match the zero-terminated string PATTERN or the precompiled regular + expression REX against the buffer TEXT, resulting in success if, + after skipping a non-matching prefix in TEXT, a substring of TEXT + matches. The data structure referenced by TEXT is mutated to + reference text right after the last byte that matched the pattern. 
+ + For example: the text 'abcDEFghi' matches the pattern 'DEF' + according to this function; the prefix 'abc' is skipped; after the + call TEXT will reference the text 'ghi'. + + +File: cre2.info, Node: tips, Next: Package License, Prev: other, Up: Top + +7 Tips for using the regexp syntax +********************************** + +* Menu: + +* tips dot:: Matching newlines with the + '.' subpattern. + + +File: cre2.info, Node: tips dot, Up: tips + +7.1 Matching newlines with the '.' subpattern +============================================= + +By default the dot subpattern '.' matches any character but newlines; to +enable newline matching we have to enable the 's' flag using the special +subpattern '(?)' or '(?:)', where '' is a +sequence of characters, one character for each flag, and '' is a +regexp subpattern. Notice that the parentheses in '(?:)' are +non-capturing. + + So let's consider the text 'ciao\nhello': + + * The regexp 'ciao.hello' does *not* match because 's' is disabled. + + * The regexp '(?s)ciao.hello' matches because the subpattern '(?s)' + has enabled flag 's' for the rest of the pattern, including the + dot. + + * The regexp 'ciao(?s).hello' matches because the subpattern '(?s)' + has enabled flag 's' for the rest of the pattern, including the + dot. + + * The regexp 'ciao(?s:.)hello' matches because the subpattern + '(?s:.)' has enabled flag 's' for the subpattern '.' which is the + dot. + + +File: cre2.info, Node: Package License, Next: Documentation License, Prev: tips, Up: Top + +Appendix A Package license +************************** + +Copyright (C) 2012 Marco Maggi +Copyright (C) 2011 Keegan McAllister +All rights reserved. + + Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + + 3. Neither the name of the author nor the names of his contributors + may be used to endorse or promote products derived from this + software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +File: cre2.info, Node: Documentation License, Next: references, Prev: Package License, Up: Top + +Appendix B GNU Free Documentation License +***************************************** + + Version 1.3, 3 November 2008 + + Copyright (C) 2000, 2001, 2002, 2007, 2008 Free Software Foundation, Inc. + + + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + 0. PREAMBLE + + The purpose of this License is to make a manual, textbook, or other + functional and useful document "free" in the sense of freedom: to + assure everyone the effective freedom to copy and redistribute it, + with or without modifying it, either commercially or + noncommercially. 
Secondarily, this License preserves for the + author and publisher a way to get credit for their work, while not + being considered responsible for modifications made by others. + + This License is a kind of "copyleft", which means that derivative + works of the document must themselves be free in the same sense. + It complements the GNU General Public License, which is a copyleft + license designed for free software. + + We have designed this License in order to use it for manuals for + free software, because free software needs free documentation: a + free program should come with manuals providing the same freedoms + that the software does. But this License is not limited to + software manuals; it can be used for any textual work, regardless + of subject matter or whether it is published as a printed book. We + recommend this License principally for works whose purpose is + instruction or reference. + + 1. APPLICABILITY AND DEFINITIONS + + This License applies to any manual or other work, in any medium, + that contains a notice placed by the copyright holder saying it can + be distributed under the terms of this License. Such a notice + grants a world-wide, royalty-free license, unlimited in duration, + to use that work under the conditions stated herein. The + "Document", below, refers to any such manual or work. Any member + of the public is a licensee, and is addressed as "you". You accept + the license if you copy, modify or distribute the work in a way + requiring permission under copyright law. + + A "Modified Version" of the Document means any work containing the + Document or a portion of it, either copied verbatim, or with + modifications and/or translated into another language. 
+ + A "Secondary Section" is a named appendix or a front-matter section + of the Document that deals exclusively with the relationship of the + publishers or authors of the Document to the Document's overall + subject (or to related matters) and contains nothing that could + fall directly within that overall subject. (Thus, if the Document + is in part a textbook of mathematics, a Secondary Section may not + explain any mathematics.) The relationship could be a matter of + historical connection with the subject or with related matters, or + of legal, commercial, philosophical, ethical or political position + regarding them. + + The "Invariant Sections" are certain Secondary Sections whose + titles are designated, as being those of Invariant Sections, in the + notice that says that the Document is released under this License. + If a section does not fit the above definition of Secondary then it + is not allowed to be designated as Invariant. The Document may + contain zero Invariant Sections. If the Document does not identify + any Invariant Sections then there are none. + + The "Cover Texts" are certain short passages of text that are + listed, as Front-Cover Texts or Back-Cover Texts, in the notice + that says that the Document is released under this License. A + Front-Cover Text may be at most 5 words, and a Back-Cover Text may + be at most 25 words. + + A "Transparent" copy of the Document means a machine-readable copy, + represented in a format whose specification is available to the + general public, that is suitable for revising the document + straightforwardly with generic text editors or (for images composed + of pixels) generic paint programs or (for drawings) some widely + available drawing editor, and that is suitable for input to text + formatters or for automatic translation to a variety of formats + suitable for input to text formatters. 
A copy made in an otherwise + Transparent file format whose markup, or absence of markup, has + been arranged to thwart or discourage subsequent modification by + readers is not Transparent. An image format is not Transparent if + used for any substantial amount of text. A copy that is not + "Transparent" is called "Opaque". + + Examples of suitable formats for Transparent copies include plain + ASCII without markup, Texinfo input format, LaTeX input format, + SGML or XML using a publicly available DTD, and standard-conforming + simple HTML, PostScript or PDF designed for human modification. + Examples of transparent image formats include PNG, XCF and JPG. + Opaque formats include proprietary formats that can be read and + edited only by proprietary word processors, SGML or XML for which + the DTD and/or processing tools are not generally available, and + the machine-generated HTML, PostScript or PDF produced by some word + processors for output purposes only. + + The "Title Page" means, for a printed book, the title page itself, + plus such following pages as are needed to hold, legibly, the + material this License requires to appear in the title page. For + works in formats which do not have any title page as such, "Title + Page" means the text near the most prominent appearance of the + work's title, preceding the beginning of the body of the text. + + The "publisher" means any person or entity that distributes copies + of the Document to the public. + + A section "Entitled XYZ" means a named subunit of the Document + whose title either is precisely XYZ or contains XYZ in parentheses + following text that translates XYZ in another language. (Here XYZ + stands for a specific section name mentioned below, such as + "Acknowledgements", "Dedications", "Endorsements", or "History".) + To "Preserve the Title" of such a section when you modify the + Document means that it remains a section "Entitled XYZ" according + to this definition. 
+ + The Document may include Warranty Disclaimers next to the notice + which states that this License applies to the Document. These + Warranty Disclaimers are considered to be included by reference in + this License, but only as regards disclaiming warranties: any other + implication that these Warranty Disclaimers may have is void and + has no effect on the meaning of this License. + + 2. VERBATIM COPYING + + You may copy and distribute the Document in any medium, either + commercially or noncommercially, provided that this License, the + copyright notices, and the license notice saying this License + applies to the Document are reproduced in all copies, and that you + add no other conditions whatsoever to those of this License. You + may not use technical measures to obstruct or control the reading + or further copying of the copies you make or distribute. However, + you may accept compensation in exchange for copies. If you + distribute a large enough number of copies you must also follow the + conditions in section 3. + + You may also lend copies, under the same conditions stated above, + and you may publicly display copies. + + 3. COPYING IN QUANTITY + + If you publish printed copies (or copies in media that commonly + have printed covers) of the Document, numbering more than 100, and + the Document's license notice requires Cover Texts, you must + enclose the copies in covers that carry, clearly and legibly, all + these Cover Texts: Front-Cover Texts on the front cover, and + Back-Cover Texts on the back cover. Both covers must also clearly + and legibly identify you as the publisher of these copies. The + front cover must present the full title with all words of the title + equally prominent and visible. You may add other material on the + covers in addition. Copying with changes limited to the covers, as + long as they preserve the title of the Document and satisfy these + conditions, can be treated as verbatim copying in other respects. 
+ + If the required texts for either cover are too voluminous to fit + legibly, you should put the first ones listed (as many as fit + reasonably) on the actual cover, and continue the rest onto + adjacent pages. + + If you publish or distribute Opaque copies of the Document + numbering more than 100, you must either include a machine-readable + Transparent copy along with each Opaque copy, or state in or with + each Opaque copy a computer-network location from which the general + network-using public has access to download using public-standard + network protocols a complete Transparent copy of the Document, free + of added material. If you use the latter option, you must take + reasonably prudent steps, when you begin distribution of Opaque + copies in quantity, to ensure that this Transparent copy will + remain thus accessible at the stated location until at least one + year after the last time you distribute an Opaque copy (directly or + through your agents or retailers) of that edition to the public. + + It is requested, but not required, that you contact the authors of + the Document well before redistributing any large number of copies, + to give them a chance to provide you with an updated version of the + Document. + + 4. MODIFICATIONS + + You may copy and distribute a Modified Version of the Document + under the conditions of sections 2 and 3 above, provided that you + release the Modified Version under precisely this License, with the + Modified Version filling the role of the Document, thus licensing + distribution and modification of the Modified Version to whoever + possesses a copy of it. In addition, you must do these things in + the Modified Version: + + A. Use in the Title Page (and on the covers, if any) a title + distinct from that of the Document, and from those of previous + versions (which should, if there were any, be listed in the + History section of the Document). 
You may use the same title + as a previous version if the original publisher of that + version gives permission. + + B. List on the Title Page, as authors, one or more persons or + entities responsible for authorship of the modifications in + the Modified Version, together with at least five of the + principal authors of the Document (all of its principal + authors, if it has fewer than five), unless they release you + from this requirement. + + C. State on the Title page the name of the publisher of the + Modified Version, as the publisher. + + D. Preserve all the copyright notices of the Document. + + E. Add an appropriate copyright notice for your modifications + adjacent to the other copyright notices. + + F. Include, immediately after the copyright notices, a license + notice giving the public permission to use the Modified + Version under the terms of this License, in the form shown in + the Addendum below. + + G. Preserve in that license notice the full lists of Invariant + Sections and required Cover Texts given in the Document's + license notice. + + H. Include an unaltered copy of this License. + + I. Preserve the section Entitled "History", Preserve its Title, + and add to it an item stating at least the title, year, new + authors, and publisher of the Modified Version as given on the + Title Page. If there is no section Entitled "History" in the + Document, create one stating the title, year, authors, and + publisher of the Document as given on its Title Page, then add + an item describing the Modified Version as stated in the + previous sentence. + + J. Preserve the network location, if any, given in the Document + for public access to a Transparent copy of the Document, and + likewise the network locations given in the Document for + previous versions it was based on. These may be placed in the + "History" section. 
You may omit a network location for a work + that was published at least four years before the Document + itself, or if the original publisher of the version it refers + to gives permission. + + K. For any section Entitled "Acknowledgements" or "Dedications", + Preserve the Title of the section, and preserve in the section + all the substance and tone of each of the contributor + acknowledgements and/or dedications given therein. + + L. Preserve all the Invariant Sections of the Document, unaltered + in their text and in their titles. Section numbers or the + equivalent are not considered part of the section titles. + + M. Delete any section Entitled "Endorsements". Such a section + may not be included in the Modified Version. + + N. Do not retitle any existing section to be Entitled + "Endorsements" or to conflict in title with any Invariant + Section. + + O. Preserve any Warranty Disclaimers. + + If the Modified Version includes new front-matter sections or + appendices that qualify as Secondary Sections and contain no + material copied from the Document, you may at your option designate + some or all of these sections as invariant. To do this, add their + titles to the list of Invariant Sections in the Modified Version's + license notice. These titles must be distinct from any other + section titles. + + You may add a section Entitled "Endorsements", provided it contains + nothing but endorsements of your Modified Version by various + parties--for example, statements of peer review or that the text + has been approved by an organization as the authoritative + definition of a standard. + + You may add a passage of up to five words as a Front-Cover Text, + and a passage of up to 25 words as a Back-Cover Text, to the end of + the list of Cover Texts in the Modified Version. Only one passage + of Front-Cover Text and one of Back-Cover Text may be added by (or + through arrangements made by) any one entity. 
If the Document + already includes a cover text for the same cover, previously added + by you or by arrangement made by the same entity you are acting on + behalf of, you may not add another; but you may replace the old + one, on explicit permission from the previous publisher that added + the old one. + + The author(s) and publisher(s) of the Document do not by this + License give permission to use their names for publicity for or to + assert or imply endorsement of any Modified Version. + + 5. COMBINING DOCUMENTS + + You may combine the Document with other documents released under + this License, under the terms defined in section 4 above for + modified versions, provided that you include in the combination all + of the Invariant Sections of all of the original documents, + unmodified, and list them all as Invariant Sections of your + combined work in its license notice, and that you preserve all + their Warranty Disclaimers. + + The combined work need only contain one copy of this License, and + multiple identical Invariant Sections may be replaced with a single + copy. If there are multiple Invariant Sections with the same name + but different contents, make the title of each such section unique + by adding at the end of it, in parentheses, the name of the + original author or publisher of that section if known, or else a + unique number. Make the same adjustment to the section titles in + the list of Invariant Sections in the license notice of the + combined work. + + In the combination, you must combine any sections Entitled + "History" in the various original documents, forming one section + Entitled "History"; likewise combine any sections Entitled + "Acknowledgements", and any sections Entitled "Dedications". You + must delete all sections Entitled "Endorsements." + + 6. 
COLLECTIONS OF DOCUMENTS + + You may make a collection consisting of the Document and other + documents released under this License, and replace the individual + copies of this License in the various documents with a single copy + that is included in the collection, provided that you follow the + rules of this License for verbatim copying of each of the documents + in all other respects. + + You may extract a single document from such a collection, and + distribute it individually under this License, provided you insert + a copy of this License into the extracted document, and follow this + License in all other respects regarding verbatim copying of that + document. + + 7. AGGREGATION WITH INDEPENDENT WORKS + + A compilation of the Document or its derivatives with other + separate and independent documents or works, in or on a volume of a + storage or distribution medium, is called an "aggregate" if the + copyright resulting from the compilation is not used to limit the + legal rights of the compilation's users beyond what the individual + works permit. When the Document is included in an aggregate, this + License does not apply to the other works in the aggregate which + are not themselves derivative works of the Document. + + If the Cover Text requirement of section 3 is applicable to these + copies of the Document, then if the Document is less than one half + of the entire aggregate, the Document's Cover Texts may be placed + on covers that bracket the Document within the aggregate, or the + electronic equivalent of covers if the Document is in electronic + form. Otherwise they must appear on printed covers that bracket + the whole aggregate. + + 8. TRANSLATION + + Translation is considered a kind of modification, so you may + distribute translations of the Document under the terms of section + 4. 
Replacing Invariant Sections with translations requires special + permission from their copyright holders, but you may include + translations of some or all Invariant Sections in addition to the + original versions of these Invariant Sections. You may include a + translation of this License, and all the license notices in the + Document, and any Warranty Disclaimers, provided that you also + include the original English version of this License and the + original versions of those notices and disclaimers. In case of a + disagreement between the translation and the original version of + this License or a notice or disclaimer, the original version will + prevail. + + If a section in the Document is Entitled "Acknowledgements", + "Dedications", or "History", the requirement (section 4) to + Preserve its Title (section 1) will typically require changing the + actual title. + + 9. TERMINATION + + You may not copy, modify, sublicense, or distribute the Document + except as expressly provided under this License. Any attempt + otherwise to copy, modify, sublicense, or distribute it is void, + and will automatically terminate your rights under this License. + + However, if you cease all violation of this License, then your + license from a particular copyright holder is reinstated (a) + provisionally, unless and until the copyright holder explicitly and + finally terminates your license, and (b) permanently, if the + copyright holder fails to notify you of the violation by some + reasonable means prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is + reinstated permanently if the copyright holder notifies you of the + violation by some reasonable means, this is the first time you have + received notice of violation of this License (for any work) from + that copyright holder, and you cure the violation prior to 30 days + after your receipt of the notice. 
+ + Termination of your rights under this section does not terminate + the licenses of parties who have received copies or rights from you + under this License. If your rights have been terminated and not + permanently reinstated, receipt of a copy of some or all of the + same material does not give you any rights to use it. + + 10. FUTURE REVISIONS OF THIS LICENSE + + The Free Software Foundation may publish new, revised versions of + the GNU Free Documentation License from time to time. Such new + versions will be similar in spirit to the present version, but may + differ in detail to address new problems or concerns. See + <http://www.gnu.org/copyleft/>. + + Each version of the License is given a distinguishing version + number. If the Document specifies that a particular numbered + version of this License "or any later version" applies to it, you + have the option of following the terms and conditions either of + that specified version or of any later version that has been + published (not as a draft) by the Free Software Foundation. If the + Document does not specify a version number of this License, you may + choose any version ever published (not as a draft) by the Free + Software Foundation. If the Document specifies that a proxy can + decide which future versions of this License can be used, that + proxy's public statement of acceptance of a version permanently + authorizes you to choose that version for the Document. + + 11. RELICENSING + + "Massive Multiauthor Collaboration Site" (or "MMC Site") means any + World Wide Web server that publishes copyrightable works and also + provides prominent facilities for anybody to edit those works. A + public wiki that anybody can edit is an example of such a server. + A "Massive Multiauthor Collaboration" (or "MMC") contained in the + site means any set of copyrightable works thus published on the MMC + site.
+ + "CC-BY-SA" means the Creative Commons Attribution-Share Alike 3.0 + license published by Creative Commons Corporation, a not-for-profit + corporation with a principal place of business in San Francisco, + California, as well as future copyleft versions of that license + published by that same organization. + + "Incorporate" means to publish or republish a Document, in whole or + in part, as part of another Document. + + An MMC is "eligible for relicensing" if it is licensed under this + License, and if all works that were first published under this + License somewhere other than this MMC, and subsequently + incorporated in whole or in part into the MMC, (1) had no cover + texts or invariant sections, and (2) were thus incorporated prior + to November 1, 2008. + + The operator of an MMC Site may republish an MMC contained in the + site under CC-BY-SA on the same site at any time before August 1, + 2009, provided the MMC is eligible for relicensing. + +ADDENDUM: How to use this License for your documents +==================================================== + +To use this License in a document you have written, include a copy of +the License in the document and put the following copyright and license +notices just after the title page: + + Copyright (C) YEAR YOUR NAME. + Permission is granted to copy, distribute and/or modify this document + under the terms of the GNU Free Documentation License, Version 1.3 + or any later version published by the Free Software Foundation; + with no Invariant Sections, no Front-Cover Texts, and no Back-Cover + Texts. A copy of the license is included in the section entitled ``GNU + Free Documentation License''. + + If you have Invariant Sections, Front-Cover Texts and Back-Cover +Texts, replace the "with...Texts." line with this: + + with the Invariant Sections being LIST THEIR TITLES, with + the Front-Cover Texts being LIST, and with the Back-Cover Texts + being LIST. 
+ + If you have Invariant Sections without Cover Texts, or some other +combination of the three, merge those two alternatives to suit the +situation. + + If your document contains nontrivial examples of program code, we +recommend releasing these examples in parallel under your choice of free +software license, such as the GNU General Public License, to permit +their use in free software. + + +File: cre2.info, Node: references, Next: concept index, Prev: Documentation License, Up: Top + +Appendix C Bibliography and references +************************************** + + +File: cre2.info, Node: concept index, Next: function index, Prev: references, Up: Top + +Appendix D An entry for each concept +************************************ + +[index] +* Menu: + +* 'CRE2_ANCHOR_BOTH': matching. (line 44) +* 'CRE2_ANCHOR_START': matching. (line 44) +* 'CRE2_ERROR_BAD_CHAR_CLASS': regexps. (line 59) +* 'CRE2_ERROR_BAD_CHAR_RANGE': regexps. (line 62) +* 'CRE2_ERROR_BAD_ESCAPE': regexps. (line 56) +* 'CRE2_ERROR_BAD_NAMED_CAPTURE': regexps. (line 89) +* 'CRE2_ERROR_BAD_PERL_OP': regexps. (line 83) +* 'CRE2_ERROR_BAD_UTF8': regexps. (line 86) +* 'CRE2_ERROR_INTERNAL': regexps. (line 53) +* 'CRE2_ERROR_MISSING_BRACKET': regexps. (line 65) +* 'CRE2_ERROR_MISSING_PAREN': regexps. (line 68) +* 'CRE2_ERROR_PATTERN_TOO_LARGE': regexps. (line 92) +* 'CRE2_ERROR_REPEAT_ARGUMENT': regexps. (line 74) +* 'CRE2_ERROR_REPEAT_SIZE': regexps. (line 77) +* 'CRE2_ERROR_REPEAT_OP': regexps. (line 80) +* 'CRE2_ERROR_TRAILING_BACKSLASH': regexps. (line 71) +* 'CRE2_Latin1': options. (line 23) +* 'CRE2_NO_ERROR': regexps. (line 50) +* 'CRE2_UNANCHORED': matching. (line 44) +* 'CRE2_UNKNOWN': options. (line 23) +* 'CRE2_UTF8': options. (line 23) +* FDL, GNU Free Documentation License: Documentation License. + (line 6) + + +File: cre2.info, Node: function index, Next: variable index, Prev: concept index, Up: Top + +Appendix E An entry for each function.
+************************************** + +[index] +* Menu: + +* cre2_consume: other. (line 109) +* cre2_consume_re: other. (line 111) +* cre2_delete: regexps. (line 106) +* cre2_easy_match: matching. (line 76) +* cre2_error_arg: regexps. (line 159) +* cre2_error_code: regexps. (line 129) +* cre2_error_string: regexps. (line 134) +* cre2_find_and_consume: other. (line 123) +* cre2_find_and_consume_re: other. (line 125) +* cre2_full_match: other. (line 85) +* cre2_full_match_re: other. (line 87) +* cre2_match: matching. (line 51) +* cre2_new: regexps. (line 94) +* cre2_num_capturing_groups: regexps. (line 116) +* cre2_opt_case_sensitive: options. (line 115) +* cre2_opt_delete: options. (line 37) +* cre2_opt_encoding: options. (line 48) +* cre2_opt_literal: options. (line 73) +* cre2_opt_log_errors: options. (line 68) +* cre2_opt_longest_match: options. (line 63) +* cre2_opt_max_mem: options. (line 122) +* cre2_opt_never_nl: options. (line 98) +* cre2_opt_new: options. (line 33) +* cre2_opt_one_line: options. (line 145) +* cre2_opt_perl_classes: options. (line 133) +* cre2_opt_posix_syntax: options. (line 58) +* cre2_opt_set_case_sensitive: options. (line 116) +* cre2_opt_set_encoding: options. (line 49) +* cre2_opt_set_literal: options. (line 74) +* cre2_opt_set_log_errors: options. (line 69) +* cre2_opt_set_longest_match: options. (line 64) +* cre2_opt_set_max_mem: options. (line 123) +* cre2_opt_set_never_nl: options. (line 99) +* cre2_opt_set_one_line: options. (line 146) +* cre2_opt_set_perl_classes: options. (line 134) +* cre2_opt_set_posix_syntax: options. (line 59) +* cre2_opt_set_word_boundary: options. (line 140) +* cre2_opt_word_boundary: options. (line 139) +* cre2_partial_match: other. (line 96) +* cre2_partial_match_re: other. (line 98) +* cre2_pattern: regexps. (line 110) +* cre2_program_size: regexps. (line 122) +* cre2_strings_to_ranges: matching. (line 98) +* cre2_version_interface_age: version. (line 22) +* cre2_version_interface_current: version. 
(line 14) +* cre2_version_interface_revision: version. (line 18) +* cre2_version_string: version. (line 10) + + +File: cre2.info, Node: variable index, Next: type index, Prev: function index, Up: Top + +Appendix F An entry for each variable. +************************************** + + +File: cre2.info, Node: type index, Prev: variable index, Up: Top + +Appendix G An entry for each type. +********************************** + +[index] +* Menu: + +* cre2_anchor_t: matching. (line 43) +* cre2_encoding_t: options. (line 22) +* cre2_error_code_t: regexps. (line 45) +* cre2_options_t: options. (line 17) +* cre2_range_t: matching. (line 87) +* cre2_regexp_t: regexps. (line 29) +* cre2_string_t: regexps. (line 35) + + + +Tag Table: +Node: Top1464 +Node: overview3738 +Node: version6906 +Node: regexps7796 +Node: options13843 +Node: matching19960 +Node: other23916 +Node: tips29670 +Node: tips dot29938 +Node: Package License31005 +Node: Documentation License32762 +Node: references57906 +Node: concept index58085 +Node: function index59945 +Node: variable index63574 +Node: type index63747 + +End Tag Table diff --git a/outside/cre2/share/info/dir b/outside/cre2/share/info/dir new file mode 100644 index 000000000..8a98a5f75 --- /dev/null +++ b/outside/cre2/share/info/dir @@ -0,0 +1,18 @@ +This is the file .../info/dir, which contains the +topmost node of the Info hierarchy, called (dir)Top. +The first time you invoke Info you start off looking at this node. + +File: dir, Node: Top This is the top of the INFO tree + + This (the Directory node) gives a menu of major topics. + Typing "q" exits, "?" lists all Info commands, "d" returns here, + "h" gives a primer for first-timers, + "mEmacs" visits the Emacs manual, etc. + + In Emacs, you can click mouse button 2 on a menu item or cross reference + to select it. + +* Menu: + +Development +* cre2: (cre2). C wrapper for RE2. 
diff --git a/outside/cre2/src/.gitignore b/outside/cre2/src/.gitignore new file mode 100644 index 000000000..6b70ebd7a --- /dev/null +++ b/outside/cre2/src/.gitignore @@ -0,0 +1,44 @@ +*~ +=* +,,* +*.a +*.bz2 +*.fasl +*.gz +*.html +*.info +*.o +*.out +*.so +*.so.* +*.tgz +*.tmp +.DS_Store +.arch +.deps/ +.emacs.* +.gdb_history +.vimview +Makefile +Makefile.in +aclocal.m4 +ar-lib +autom4te* +compile +config.guess +config.h.in +config.sub +config.cache +configure +depcomp +test-driver +doc/mdate-sh +doc/stamp-vti +doc/texinfo.tex +doc/version.texi +install-sh +missing +mkinstalldirs +ltmain.sh +m4/ +autotools/ diff --git a/outside/cre2/src/COPYING b/outside/cre2/src/COPYING new file mode 100644 index 000000000..672d3b902 --- /dev/null +++ b/outside/cre2/src/COPYING @@ -0,0 +1,35 @@ +Copyright (c) 2012 Marco Maggi +Copyright (c) 2011 Keegan McAllister +All rights reserved. + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the +following conditions are met: + +1. Redistributions of source code must retain the above + copyright notice, this list of conditions and the + following disclaimer. + +2. Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the + following disclaimer in the documentation and/or other + materials provided with the distribution. + +3. Neither the name of the author nor the names of his + contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/outside/cre2/src/INSTALL b/outside/cre2/src/INSTALL new file mode 100644 index 000000000..7d1c323be --- /dev/null +++ b/outside/cre2/src/INSTALL @@ -0,0 +1,365 @@ +Installation Instructions +************************* + +Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005, +2006, 2007, 2008, 2009 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, +are permitted in any medium without royalty provided the copyright +notice and this notice are preserved. This file is offered as-is, +without warranty of any kind. + +Basic Installation +================== + + Briefly, the shell commands `./configure; make; make install' should +configure, build, and install this package. The following +more-detailed instructions are generic; see the `README' file for +instructions specific to this package. Some packages provide this +`INSTALL' file but do not implement all of the features documented +below. The lack of an optional feature in a given package is not +necessarily a bug. More recommendations for GNU packages can be found +in *note Makefile Conventions: (standards)Makefile Conventions. + + The `configure' shell script attempts to guess correct values for +various system-dependent variables used during compilation. It uses +those values to create a `Makefile' in each directory of the package. +It may also create one or more `.h' files containing system-dependent +definitions. 
Finally, it creates a shell script `config.status' that +you can run in the future to recreate the current configuration, and a +file `config.log' containing compiler output (useful mainly for +debugging `configure'). + + It can also use an optional file (typically called `config.cache' +and enabled with `--cache-file=config.cache' or simply `-C') that saves +the results of its tests to speed up reconfiguring. Caching is +disabled by default to prevent problems with accidental use of stale +cache files. + + If you need to do unusual things to compile the package, please try +to figure out how `configure' could check whether to do them, and mail +diffs or instructions to the address given in the `README' so they can +be considered for the next release. If you are using the cache, and at +some point `config.cache' contains results you don't want to keep, you +may remove or edit it. + + The file `configure.ac' (or `configure.in') is used to create +`configure' by a program called `autoconf'. You need `configure.ac' if +you want to change it or regenerate `configure' using a newer version +of `autoconf'. + + The simplest way to compile this package is: + + 1. `cd' to the directory containing the package's source code and type + `./configure' to configure the package for your system. + + Running `configure' might take a while. While running, it prints + some messages telling which features it is checking for. + + 2. Type `make' to compile the package. + + 3. Optionally, type `make check' to run any self-tests that come with + the package, generally using the just-built uninstalled binaries. + + 4. Type `make install' to install the programs and any data files and + documentation. When installing into a prefix owned by root, it is + recommended that the package be configured and built as a regular + user, and only the `make install' phase executed with root + privileges. + + 5. 
Optionally, type `make installcheck' to repeat any self-tests, but + this time using the binaries in their final installed location. + This target does not install anything. Running this target as a + regular user, particularly if the prior `make install' required + root privileges, verifies that the installation completed + correctly. + + 6. You can remove the program binaries and object files from the + source code directory by typing `make clean'. To also remove the + files that `configure' created (so you can compile the package for + a different kind of computer), type `make distclean'. There is + also a `make maintainer-clean' target, but that is intended mainly + for the package's developers. If you use it, you may have to get + all sorts of other programs in order to regenerate files that came + with the distribution. + + 7. Often, you can also type `make uninstall' to remove the installed + files again. In practice, not all packages have tested that + uninstallation works correctly, even though it is required by the + GNU Coding Standards. + + 8. Some packages, particularly those that use Automake, provide `make + distcheck', which can be used by developers to test that all other + targets like `make install' and `make uninstall' work correctly. + This target is generally not run by end users. + +Compilers and Options +===================== + + Some systems require unusual options for compilation or linking that +the `configure' script does not know about. Run `./configure --help' +for details on some of the pertinent environment variables. + + You can give `configure' initial values for configuration parameters +by setting variables in the command line or in the environment. Here +is an example: + + ./configure CC=c99 CFLAGS=-g LIBS=-lposix + + *Note Defining Variables::, for more details.
+ +Compiling For Multiple Architectures +==================================== + + You can compile the package for more than one kind of computer at the +same time, by placing the object files for each architecture in their +own directory. To do this, you can use GNU `make'. `cd' to the +directory where you want the object files and executables to go and run +the `configure' script. `configure' automatically checks for the +source code in the directory that `configure' is in and in `..'. This +is known as a "VPATH" build. + + With a non-GNU `make', it is safer to compile the package for one +architecture at a time in the source code directory. After you have +installed the package for one architecture, use `make distclean' before +reconfiguring for another architecture. + + On MacOS X 10.5 and later systems, you can create libraries and +executables that work on multiple system types--known as "fat" or +"universal" binaries--by specifying multiple `-arch' options to the +compiler but only a single `-arch' option to the preprocessor. Like +this: + + ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ + CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ + CPP="gcc -E" CXXCPP="g++ -E" + + This is not guaranteed to produce working output in all cases, you +may have to build one architecture at a time and combine the results +using the `lipo' tool if you have problems. + +Installation Names +================== + + By default, `make install' installs the package's commands under +`/usr/local/bin', include files under `/usr/local/include', etc. You +can specify an installation prefix other than `/usr/local' by giving +`configure' the option `--prefix=PREFIX', where PREFIX must be an +absolute file name. + + You can specify separate installation prefixes for +architecture-specific files and architecture-independent files. 
If you +pass the option `--exec-prefix=PREFIX' to `configure', the package uses +PREFIX as the prefix for installing programs and libraries. +Documentation and other data files still use the regular prefix. + + In addition, if you use an unusual directory layout you can give +options like `--bindir=DIR' to specify different values for particular +kinds of files. Run `configure --help' for a list of the directories +you can set and what kinds of files go in them. In general, the +default for these options is expressed in terms of `${prefix}', so that +specifying just `--prefix' will affect all of the other directory +specifications that were not explicitly provided. + + The most portable way to affect installation locations is to pass the +correct locations to `configure'; however, many packages provide one or +both of the following shortcuts of passing variable assignments to the +`make install' command line to change installation locations without +having to reconfigure or recompile. + + The first method involves providing an override variable for each +affected directory. For example, `make install +prefix=/alternate/directory' will choose an alternate location for all +directory configuration variables that were expressed in terms of +`${prefix}'. Any directories that were specified during `configure', +but not in terms of `${prefix}', must each be overridden at install +time for the entire installation to be relocated. The approach of +makefile variable overrides for each directory variable is required by +the GNU Coding Standards, and ideally causes no recompilation. +However, some platforms have known limitations with the semantics of +shared libraries that end up requiring recompilation when using this +method, particularly noticeable in packages that use GNU Libtool. + + The second method involves providing the `DESTDIR' variable. For +example, `make install DESTDIR=/alternate/directory' will prepend +`/alternate/directory' before all installation names. 
The approach of +`DESTDIR' overrides is not required by the GNU Coding Standards, and +does not work on platforms that have drive letters. On the other hand, +it does better at avoiding recompilation issues, and works well even +when some directory options were not specified in terms of `${prefix}' +at `configure' time. + +Optional Features +================= + + If the package supports it, you can cause programs to be installed +with an extra prefix or suffix on their names by giving `configure' the +option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. + + Some packages pay attention to `--enable-FEATURE' options to +`configure', where FEATURE indicates an optional part of the package. +They may also pay attention to `--with-PACKAGE' options, where PACKAGE +is something like `gnu-as' or `x' (for the X Window System). The +`README' should mention any `--enable-' and `--with-' options that the +package recognizes. + + For packages that use the X Window System, `configure' can usually +find the X include and library files automatically, but if it doesn't, +you can use the `configure' options `--x-includes=DIR' and +`--x-libraries=DIR' to specify their locations. + + Some packages offer the ability to configure how verbose the +execution of `make' will be. For these packages, running `./configure +--enable-silent-rules' sets the default to minimal output, which can be +overridden with `make V=1'; while running `./configure +--disable-silent-rules' sets the default to verbose, which can be +overridden with `make V=0'. + +Particular systems +================== + + On HP-UX, the default C compiler is not ANSI C compatible. If GNU +CC is not installed, it is recommended to use the following options in +order to use an ANSI C compiler: + + ./configure CC="cc -Ae -D_XOPEN_SOURCE=500" + +and if that doesn't work, install pre-built binaries of GCC for HP-UX. + + On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot +parse its `<wchar.h>' header file.
The option `-nodtk' can be used as +a workaround. If GNU CC is not installed, it is therefore recommended +to try + + ./configure CC="cc" + +and if that doesn't work, try + + ./configure CC="cc -nodtk" + + On Solaris, don't put `/usr/ucb' early in your `PATH'. This +directory contains several dysfunctional programs; working variants of +these programs are available in `/usr/bin'. So, if you need `/usr/ucb' +in your `PATH', put it _after_ `/usr/bin'. + + On Haiku, software installed for all users goes in `/boot/common', +not `/usr/local'. It is recommended to use the following options: + + ./configure --prefix=/boot/common + +Specifying the System Type +========================== + + There may be some features `configure' cannot figure out +automatically, but needs to determine by the type of machine the package +will run on. Usually, assuming the package is built to be run on the +_same_ architectures, `configure' can figure that out, but if it prints +a message saying it cannot guess the machine type, give it the +`--build=TYPE' option. TYPE can either be a short name for the system +type, such as `sun4', or a canonical name which has the form: + + CPU-COMPANY-SYSTEM + +where SYSTEM can have one of these forms: + + OS + KERNEL-OS + + See the file `config.sub' for the possible values of each field. If +`config.sub' isn't included in this package, then this package doesn't +need to know the machine type. + + If you are _building_ compiler tools for cross-compiling, you should +use the option `--target=TYPE' to select the type of system they will +produce code for. + + If you want to _use_ a cross compiler, that generates code for a +platform different from the build platform, you should specify the +"host" platform (i.e., that on which the generated programs will +eventually be run) with `--host=TYPE'. 
+ +Sharing Defaults +================ + + If you want to set default values for `configure' scripts to share, +you can create a site shell script called `config.site' that gives +default values for variables like `CC', `cache_file', and `prefix'. +`configure' looks for `PREFIX/share/config.site' if it exists, then +`PREFIX/etc/config.site' if it exists. Or, you can set the +`CONFIG_SITE' environment variable to the location of the site script. +A warning: not all `configure' scripts look for a site script. + +Defining Variables +================== + + Variables not defined in a site shell script can be set in the +environment passed to `configure'. However, some packages may run +configure again during the build, and the customized values of these +variables may be lost. In order to avoid this problem, you should set +them in the `configure' command line, using `VAR=value'. For example: + + ./configure CC=/usr/local2/bin/gcc + +causes the specified `gcc' to be used as the C compiler (unless it is +overridden in the site shell script). + +Unfortunately, this technique does not work for `CONFIG_SHELL' due to +an Autoconf bug. Until the bug is fixed you can use this workaround: + + CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash + +`configure' Invocation +====================== + + `configure' recognizes the following options to control how it +operates. + +`--help' +`-h' + Print a summary of all of the options to `configure', and exit. + +`--help=short' +`--help=recursive' + Print a summary of the options unique to this package's + `configure', and exit. The `short' variant lists options used + only in the top level, while the `recursive' variant lists options + also present in any nested packages. + +`--version' +`-V' + Print the version of Autoconf used to generate the `configure' + script, and exit. + +`--cache-file=FILE' + Enable the cache: use and save the results of the tests in FILE, + traditionally `config.cache'. 
FILE defaults to `/dev/null' to + disable caching. + +`--config-cache' +`-C' + Alias for `--cache-file=config.cache'. + +`--quiet' +`--silent' +`-q' + Do not print messages saying which checks are being made. To + suppress all normal output, redirect it to `/dev/null' (any error + messages will still be shown). + +`--srcdir=DIR' + Look for the package's source code in directory DIR. Usually + `configure' can determine that directory automatically. + +`--prefix=DIR' + Use DIR as the installation prefix. *note Installation Names:: + for more details, including other options available for fine-tuning + the installation locations. + +`--no-create' +`-n' + Run the configure checks, but stop before creating any output + files. + +`configure' also accepts some other, not widely useful, options. Run +`configure --help' for more details. + diff --git a/outside/cre2/src/LICENSE.re2 b/outside/cre2/src/LICENSE.re2 new file mode 100644 index 000000000..09e5ec1c7 --- /dev/null +++ b/outside/cre2/src/LICENSE.re2 @@ -0,0 +1,27 @@ +// Copyright (c) 2009 The RE2 Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/outside/cre2/src/Makefile.am b/outside/cre2/src/Makefile.am new file mode 100644 index 000000000..4eabb2ef9 --- /dev/null +++ b/outside/cre2/src/Makefile.am @@ -0,0 +1,48 @@ +## Process this file with automake to produce Makefile.in + +ACLOCAL_AMFLAGS = -I autotools +EXTRA_DIST = INSTALL configure.sh prepare.sh +dist_doc_DATA = README COPYING LICENSE.re2 + +## -------------------------------------------------------------------- + +cre2_CURRENT = @cre2_VERSION_INTERFACE_CURRENT@ +cre2_REVISION = @cre2_VERSION_INTERFACE_REVISION@ +cre2_AGE = @cre2_VERSION_INTERFACE_AGE@ + +include_HEADERS = src/cre2.h + +lib_LTLIBRARIES = libcre2.la +libcre2_la_LDFLAGS = -version-info $(cre2_CURRENT):$(cre2_REVISION):$(cre2_AGE) +libcre2_la_SOURCES = src/cre2.cpp + +## -------------------------------------------------------------------- + +AM_MAKEINFOFLAGS = --no-split + +info_TEXINFOS = doc/cre2.texi +doc_cre2_TEXINFOS = doc/fdl-1.3.texi + +## -------------------------------------------------------------------- + +check_PROGRAMS = \ + tests/test-version \ + tests/test-options \ + tests/test-rex-alloc \ + tests/test-matching \ + tests/test-easy-matching \ + tests/test-full-match \ + 
tests/test-partial-match \ + tests/test-consume-match \ + tests/test-find-and-consume-match \ + tests/test-replace \ + tests/test-misc + +AM_CPPFLAGS = -I$(top_srcdir)/src +LDADD = libcre2.la +TESTS = $(check_PROGRAMS) + +installcheck-local: $(check_PROGRAMS) + for f in $(check_PROGRAMS); do $$f; done + +### end of file diff --git a/outside/cre2/src/README b/outside/cre2/src/README new file mode 100644 index 000000000..d1d254314 --- /dev/null +++ b/outside/cre2/src/README @@ -0,0 +1,175 @@ + + C wrapper for re2 + ================= + + +Topics +------ + + 1. Introduction + 2. License + 3. Install + 4. Usage + A. Credits + B. Bugs + C. Resources + + +1. Introduction +--------------- + +The CRE2 distribution is a C language wrapper for the RE2 +library, which is implemented in C++. RE2 is a fast, safe, +thread-friendly alternative to backtracking regular +expression engines like those used in PCRE, Perl, and +Python. + + This distribution makes use of the GNU Autotools. + + +2. License +---------- + +Copyright (c) 2012, 2013 Marco Maggi +Copyright (c) 2011 Keegan McAllister +All rights reserved. + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the +following conditions are met: + +1. Redistributions of source code must retain the above + copyright notice, this list of conditions and the + following disclaimer. + +2. Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the + following disclaimer in the documentation and/or other + materials provided with the distribution. + +3. Neither the name of the author nor the names of his + contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +3. Install +---------- + +To install RE2 follow the instructions in the README file in +RE2's. To install CRE2 from a proper release tarball, do +this: + + $ cd cre2-0.1.0 + $ mkdir "=build" + $ cd "=build" + +to inspect the available configuration options: + + $ ../configure --help + +then do it: + + $ ../configure [options] + $ make + $ make check + $ make install + + From a repository checkout or snapshot (the ones from the +Github site): we may need to manually run "libtoolize" the +first time, then we must first run the script "autogen.sh" +from the top source directory, to generate the needed files: + + $ sh autogen.sh + +for this we need to have installed the GNU Autotools: +Automake, Autoconf, Libtool. After this: + + $ ../configure --enable-maintainer-mode [options] + $ make + $ make check + $ make install + + The Makefile supports the DESTDIR environment variable to +install files in a temporary location, example: to see what +will happen: + + $ make -n install DESTDIR=/tmp/marco/cre2 + +to really do it: + + $ make install DESTDIR=/tmp/marco/cre2 + + +4. Usage +-------- + +Read the documentation. + + +A. Credits +---------- + +RE2 is a Google project. CRE2 is based on code by Keegan +McAllister. 
This distribution was assembled by Marco Maggi. + + +B. Bugs +------- + +Bug reports are appreciated. Register issues at the CRE2 +issue tracker: + + <http://github.com/marcomaggi/cre2/issues> + + +C. Resources +------------ + +The GNU Project software can be found here: + + <http://www.gnu.org/> + +RE2 is available at: + + <http://code.google.com/p/re2/> + +development of this package happens at: + + <http://github.com/marcomaggi/cre2/> + +and as backup at: + + <http://sourceforge.net/projects/cre2/> + +proper release tarballs for this package are in the download +area at: + + <http://sourceforge.net/projects/cre2/files/> + +the documentation is available online: + + <http://marcomaggi.github.com/docs/cre2.html> + + +### end of file +# Local Variables: +# mode: text +# coding: utf-8-unix +# fill-column: 60 +# paragraph-start: "*" +# End: diff --git a/outside/cre2/src/autogen.sh b/outside/cre2/src/autogen.sh new file mode 100644 index 000000000..284a55d60 --- /dev/null +++ b/outside/cre2/src/autogen.sh @@ -0,0 +1,11 @@ +# autogen.sh -- +# +# Run this in the top source directory to rebuild the infrastructure. + +set -xe +test -d autotools || mkdir autotools +test -f autotools/libtool.m4 || libtoolize +autoreconf --warnings=all --install --verbose "$@" + +### end of file + diff --git a/outside/cre2/src/build.sh b/outside/cre2/src/build.sh new file mode 100644 index 000000000..b67d94b53 --- /dev/null +++ b/outside/cre2/src/build.sh @@ -0,0 +1,17 @@ +if [ ! -d "=build" ]; then +mkdir "=build" +libtoolize +sh autogen.sh +cd "=build" +../configure --enable-maintainer-mode LDFLAGS=-pthread +make +cd .. +fi +if [ ! -d "../lib" ]; then +mkdir ../lib +fi +if [ !
-d "../include" ]; then +mkdir ../include +fi +cp \=build/.libs/* ../lib +cp src/cre2.h ../include diff --git a/outside/cre2/src/clean.sh b/outside/cre2/src/clean.sh new file mode 100644 index 000000000..207e3d30d --- /dev/null +++ b/outside/cre2/src/clean.sh @@ -0,0 +1,9 @@ +if [ -d "=build" ]; then + rm -r "=build" +fi +if [ -d "../lib" ]; then +rm -r ../lib +fi +if [ -d "../include" ]; then +rm -r ../include +fi diff --git a/outside/cre2/src/configure.ac b/outside/cre2/src/configure.ac new file mode 100644 index 000000000..4accbb626 --- /dev/null +++ b/outside/cre2/src/configure.ac @@ -0,0 +1,58 @@ +dnl @configure_input@ +dnl + +AC_PREREQ([2.68]) +AC_INIT([CRE2],[0.1b5],[marco.maggi-ipsu@poste.it], + [cre2],[http://github.com/marcomaggi/cre2/]) +AC_CONFIG_SRCDIR([src/]) +AC_CONFIG_MACRO_DIR([autotools]) +AC_CONFIG_AUX_DIR([autotools]) +AC_CANONICAL_BUILD +AC_CANONICAL_HOST +AC_CANONICAL_TARGET +AM_INIT_AUTOMAKE([1.14 foreign subdir-objects dist-xz no-dist-gzip -Wall]) +AM_MAINTAINER_MODE + +AM_PROG_AR +AC_PROG_INSTALL +AC_PROG_LN_S +AC_PROG_MAKE_SET +AC_PROG_MKDIR_P + +LT_PREREQ([2.4]) +LT_INIT + +AC_PROG_CC +AC_PROG_CC_C_O + +AC_LANG([C++]) +AC_PROG_CXX +AC_PROG_CXX_C_O + +AC_CHECK_LIB([re2],[main],,[AC_MSG_FAILURE([test for RE2 library failed])]) +AC_CHECK_HEADERS([re2/re2.h],,[AC_MSG_ERROR([test for RE2 header failed])]) + +cre2_VERSION_INTERFACE_CURRENT=0 +cre2_VERSION_INTERFACE_REVISION=0 +cre2_VERSION_INTERFACE_AGE=0 +AC_DEFINE_UNQUOTED([cre2_VERSION_INTERFACE_CURRENT], + [$cre2_VERSION_INTERFACE_CURRENT], + [current interface number]) +AC_DEFINE_UNQUOTED([cre2_VERSION_INTERFACE_REVISION], + [$cre2_VERSION_INTERFACE_REVISION], + [current interface implementation number]) +AC_DEFINE_UNQUOTED([cre2_VERSION_INTERFACE_AGE], + [$cre2_VERSION_INTERFACE_AGE], + [current interface age number]) +AC_DEFINE_UNQUOTED([cre2_VERSION_INTERFACE_STRING], + ["$cre2_VERSION_INTERFACE_CURRENT.$cre2_VERSION_INTERFACE_REVISION"], + [library interface version]) 
+AC_SUBST([cre2_VERSION_INTERFACE_CURRENT]) +AC_SUBST([cre2_VERSION_INTERFACE_REVISION]) +AC_SUBST([cre2_VERSION_INTERFACE_AGE]) + +AC_CONFIG_HEADERS([config.h]) +AC_CONFIG_FILES([Makefile]) +AC_OUTPUT + +dnl end of file diff --git a/outside/cre2/src/configure.sh b/outside/cre2/src/configure.sh new file mode 100644 index 000000000..c7b936809 --- /dev/null +++ b/outside/cre2/src/configure.sh @@ -0,0 +1,24 @@ +# configure.sh -- +# +# Run this to configure. + +set -xe + +prefix=/usr/local +if test -d /lib64 +then libdir=${prefix}/lib64 +else libdir=${prefix}/lib +fi + +../configure \ + --config-cache \ + --cache-file=../config.cache \ + --enable-maintainer-mode \ + --disable-static --enable-shared \ + --prefix="${prefix}" \ + --libdir="${libdir}" \ + CFLAGS='-O3' \ + LDFLAGS="-L${libdir}" \ + "$@" + +### end of file diff --git a/outside/cre2/src/doc/cre2.texi b/outside/cre2/src/doc/cre2.texi new file mode 100644 index 000000000..86c858dc8 --- /dev/null +++ b/outside/cre2/src/doc/cre2.texi @@ -0,0 +1,1474 @@ +\input texinfo.tex +@c %**start of header +@setfilename cre2.info +@settitle C wrapper for RE2 +@c %**end of header + +@include version.texi + +@c page +@c ------------------------------------------------------------ +@c License macros. +@c ------------------------------------------------------------ + +@macro gnu{} +@acronym{GNU} +@end macro + +@macro gpl{} +@acronym{GPL} +@end macro + +@macro fdl{} +@acronym{FDL} +@end macro + +@macro bsd{} +@acronym{BSD} +@end macro + +@c ------------------------------------------------------------ +@c Software related macros. +@c ------------------------------------------------------------ + +@macro bash{} +@command{bash} +@end macro + +@macro gmp{} +@acronym{GMP} +@end macro + +@macro gcc{} +@acronym{GCC} +@end macro + +@macro glibc{} +@gnu{} C Library +@end macro + +@c ------------------------------------------------------------ +@c Network protocols acronyms. 
+@c ------------------------------------------------------------ + +@macro dns{} +@acronym{DNS} +@end macro + +@macro ftp{} +@acronym{FTP} +@end macro + +@macro dhcp{} +@acronym{DHCP} +@end macro + +@macro icmp{} +@acronym{ICMP} +@end macro + +@macro http{} +@acronym{HTTP} +@end macro + +@macro https{} +@acronym{HTTPS} +@end macro + +@macro imap{} +@acronym{IMAP} +@end macro + +@macro ip{} +@acronym{IP} +@end macro + +@macro tcp{} +@acronym{TCP} +@end macro + +@macro udp{} +@acronym{UDP} +@end macro + +@macro ppp{} +@acronym{PPP} +@end macro + +@macro sftp{} +@acronym{SFTP} +@end macro + +@macro smtp{} +@acronym{SMTP} +@end macro + +@macro snmp{} +@acronym{SNMP} +@end macro + +@macro ssh{} +@acronym{SSH} +@end macro + +@macro ssl{} +@acronym{SSL} +@end macro + +@macro tls{} +@acronym{TLS} +@end macro + +@c ------------------------------------------------------------ +@c Miscellaneous acronyms. +@c ------------------------------------------------------------ + +@macro adsl{} +@acronym{ADSL} +@end macro + +@macro alsa{} +@acronym{ALSA} +@end macro + +@macro ansi{} +@acronym{ANSI} +@end macro + +@macro api{} +@acronym{API} +@end macro + +@macro ascii{} +@acronym{ASCII} +@end macro + +@macro asciiz{} +@acronym{ASCIIZ} +@end macro + +@macro cdrom{} +@acronym{CDROM} +@end macro + +@macro cli{} +@acronym{CLI} +@end macro + +@macro cpu{} +@acronym{CPU} +@end macro + +@macro gui{} +@acronym{GUI} +@end macro + +@macro ieee{} +@acronym{IEEE} +@end macro + +@macro isp{} +@acronym{ISP} +@end macro + +@macro iso{} +@acronym{ISO} +@end macro + +@macro mime{} +@acronym{MIME} +@end macro + +@macro mpeg{} +@acronym{MPEG} +@end macro + +@macro posix{} +@acronym{POSIX} +@end macro + +@macro ram{} +@acronym{RAM} +@end macro + +@macro rfc{} +@acronym{RFC} +@end macro + +@c Remember that @url is already used by Texinfo.
+@macro urla{} +@acronym{URL} +@end macro + +@macro usb{} +@acronym{USB} +@end macro + +@macro utf{} +@acronym{UTF} +@end macro + +@macro uri{} +@acronym{URI} +@end macro + +@macro xmla{} +@acronym{XML} +@end macro + + +@c ------------------------------------------------------------ +@c Arguments macros. +@c ------------------------------------------------------------ + +@macro vari{ARG} +@var{\ARG\1} +@end macro + +@macro varii{ARG} +@var{\ARG\2} +@end macro + +@macro variii{ARG} +@var{\ARG\3} +@end macro + +@macro variv{ARG} +@var{\ARG\4} +@end macro + +@macro varn{ARG} +@var{\ARG\n} +@end macro + +@macro vark{ARG} +@var{\ARG\k} +@end macro + +@macro varj{ARG} +@var{\ARG\j} +@end macro + +@c ------------------------------------------------------------ + +@macro meta{ARG} +<\ARG\> +@end macro + +@macro metai{ARG} +@meta{\ARG\1} +@end macro + +@macro metaii{ARG} +@meta{\ARG\2} +@end macro + +@macro metaiii{ARG} +@meta{\ARG\3} +@end macro + +@macro metaiv{ARG} +@meta{\ARG\4} +@end macro + +@macro metan{ARG} +@meta{\ARG\n} +@end macro + +@macro metak{ARG} +@meta{\ARG\k} +@end macro + +@macro metaj{ARG} +@meta{\ARG\j} +@end macro + +@c ------------------------------------------------------------ +@c C language macros. +@c ------------------------------------------------------------ + +@macro cfunc{NAME} +@code{\NAME\()} +@end macro + +@macro cnull{} +@code{NULL} +@end macro + +@c ------------------------------------------------------------ +@c Scheme language macros. +@c ------------------------------------------------------------ + +@macro clos{} +@acronym{CLOS} +@end macro + +@macro library{NAME} +@code{(\NAME\)} +@end macro + +@macro repl{} +@acronym{REPL} +@end macro + +@macro rnrs{VERSION} +@acronym{R\VERSION\RS} +@end macro + +@macro srfi{} +@acronym{SRFI} +@end macro + +@ignore +Separating the @srfi{} macro from the number with a '--' rather than a +'-' makes the expansion look ugly in menu entries under the Info reader. 
+IMHO this should not happen, but it does; so we live with this, because +the main purpose of this document is to provide an Info version. +@end ignore +@macro ansrfi{NUM} +@srfi{}-\NUM\ +@end macro + +@c ------------------------------------------------------------ + +@macro func{NAME} +@code{@sc{\NAME\}} +@end macro + +@macro nil{} +@code{()} +@end macro + +@macro true{} +@code{#t} +@end macro + +@macro false{} +@code{#f} +@end macro + +@macro keyword{NAME} +@code{#:\NAME\} +@end macro + +@macro class{NAME} +@code{<\NAME\>} +@end macro + +@c ------------------------------------------------------------ +@c TCL macros. +@c ------------------------------------------------------------ + +@ifinfo +@macro tclcmd{NAME} +[\NAME\] +@end macro +@end ifinfo + +@ifnotinfo +@macro tclcmd{NAME} +@code{[\NAME\]} +@end macro +@end ifnotinfo + +@macro tclvar{NAME} +@code{\NAME\} +@end macro + +@macro tclcode{CODE} +@code{[\CODE\]} +@end macro + +@c ------------------------------------------------------------ +@c Macros for references to external documents. +@c ------------------------------------------------------------ + +@macro glibcref{NODE, TITLE} +@ref{\NODE\,\TITLE\,\TITLE\,libc} +@end macro + +@macro rsixref{NODE, TITLE} +@ref{\NODE\,\TITLE\,\TITLE\,r6rs} +@end macro + +@macro rfiveref{NODE, TITLE} +@ref{\NODE\,\TITLE\,\TITLE\,r5rs} +@end macro + +@macro ikarusref{NODE, TITLE} +@ref{\NODE\,\TITLE\,\TITLE\,ikarus} +@end macro + +@macro bibref{TAG} +@code{[\TAG\]} +@end macro + +@c page +@c ------------------------------------------------------------ +@c Values. +@c ------------------------------------------------------------ + +@set TITLE C wrapper for RE2 + +@c To be used as @value{PACKAGE} whenever we need to include the full +@c name of this package. +@set PACKAGE CRE2 + +@c To be used as @value{PACKAGE} whenever we need to include the +@c nickname of the project: the name that is used to compose the +@c distribution tarball or the web address. 
+@set PACKAGE_NICKNAME cre2 + +@c To be used as @value{AUTHOR} whenever we need to include the list of +@c authors of this document. +@set AUTHOR Marco Maggi + +@c To be used as @value{AUTHOR_EMAIL} whenever we need to include the +@c email of the *single* author of this document. +@set AUTHOR_EMAIL @email{marco.maggi-ipsu@@poste.it} + +@set AUTHOR_URL @url{http://github.com/marcomaggi} + +@c To be used as @value{COPYRIGHT_YEARS} whenever we need to include the +@c list of copyright years. +@set COPYRIGHT_YEARS 2012 + +@c page +@c ------------------------------------------------------------ +@c Copyright notice. +@c ------------------------------------------------------------ + +@copying +This document describes version @value{VERSION} of @value{PACKAGE}, a C +language wrapper for the C++ library RE2: a fast, safe, thread--friendly +alternative to backtracking regular expression engines like those used +in PCRE, Perl, and Python. + +The package is distributed under the terms of a @acronym{BSD}--like +license and can be downloaded from: + +@center @url{http://sourceforge.net/projects/cre2/files/} + +@noindent +development takes place at: + +@center @url{http://github.com/marcomaggi/@value{PACKAGE_NICKNAME}} + +@noindent +and as a backup at: + +@center @url{http://sourceforge.net/projects/@value{PACKAGE_NICKNAME}} + +@noindent +Copyright @copyright{} @value{COPYRIGHT_YEARS} by @value{AUTHOR} @value{AUTHOR_URL}@* +Copyright @copyright{} 2011 by Keegan McAllister @url{http://github.com/kmcallister/} + +Portions of this document come from the source code of RE2 itself, see +the file @file{LICENSE.re2} for the license notice. 
+ +@quotation +Permission is granted to copy, distribute and/or modify this document +under the terms of the @gnu{} Free Documentation License, Version 1.3 or +any later version published by the Free Software Foundation; with +Invariant Sections being ``@gnu{} Free Documentation License'' and +``@gnu{} General Public License'', no Front--Cover Texts, and no +Back--Cover Texts. A copy of the license is included in the section +entitled ``@gnu{} Free Documentation License''. +@end quotation +@end copying + +@c page +@c ------------------------------------------------------------ +@c Headers. +@c ------------------------------------------------------------ + +@titlepage +@title @value{TITLE} +@subtitle Revision @value{VERSION} +@author @value{AUTHOR} @value{AUTHOR_EMAIL} +@page +@vskip 0pt plus 1filll +@insertcopying +@end titlepage + +@c ------------------------------------------------------------ + +@ifinfo +@dircategory Development +@direntry +* cre2: (cre2). @value{TITLE}. +@end direntry +@end ifinfo + +@c page +@ifnottex +@node Top +@top @value{TITLE} + +@insertcopying + +@menu +* overview:: Overview of the package. +* version:: Version functions. +* regexps:: Precompiled regular expressions + construction. +* options:: Matching configuration. +* matching:: Matching regular expressions. +* other:: Other matching functions. +* tips:: Tips for using the regexp syntax. + +Appendices + +* Package License:: Package license. +* Documentation License:: GNU Free Documentation License. +* references:: Bibliography and references. + +Indexes + +* concept index:: An entry for each concept. +* function index:: An entry for each function. +* variable index:: An entry for each variable. +* type index:: An entry for each type. 
+@end menu +@end ifnottex + +@c page +@node overview +@chapter Overview of the package + + +@value{PACKAGE} is a C language wrapper for the C++ library RE2: a fast, +safe, thread--friendly alternative to backtracking regular expression +engines like those used in PCRE, Perl, and Python. @value{PACKAGE} is +based on code by Keegan McAllister for the @code{haskell-re2} binding: + +@center @url{http://github.com/kmcallister/haskell-re2} + +For the supported regular expressions syntax we should refer to the +original documentation: + +@center @url{http://code.google.com/p/re2/wiki/Syntax} + +The C wrapper is meant to make it easier to interface RE2 with other +languages. The exposed @api{} allows searching for substrings of text +matching regular expressions and reporting portions of text matching +parenthetical subexpressions. + +@value{PACKAGE} installs the single header file @file{cre2.h}. All the +function names in the @api{} are prefixed with @code{cre2_}; all the +constant names are prefixed with @code{CRE2_}; all the type names are +prefixed with @code{cre2_} and suffixed with @code{_t}. + +When searching for the installed libraries with the @gnu{} Autotools, we +can use the following macros in @file{configure.ac}: + +@example +AC_CHECK_LIB([re2],[main],, + [AC_MSG_FAILURE([test for RE2 library failed])]) + +AC_CHECK_LIB([cre2],[cre2_version_string],, + [AC_MSG_FAILURE([test for CRE2 library failed])]) +AC_CHECK_HEADERS([cre2.h],, + [AC_MSG_ERROR([test for CRE2 header failed])]) +@end example + +@noindent +notice that there is no need to check for the header file +@file{re2/re2.h}.
+ +It is customary for regular expression engines to provide methods to +replace backslash sequences like @code{\1}, @code{\2}, @dots{} in a +given string with portions of text that matched the first, second, +@dots{} parenthetical subexpression; @value{PACKAGE} does @strong{not} +provide such methods in its public @api{}, because they require +interacting with the storage mechanism in the client code. However, it +is not difficult to implement such substitutions given the results of a +regular expression matching operation. + +Some functions and methods from RE2 requiring memory allocation handling +are unofficially wrapped by @value{PACKAGE} with unsafe code (execution +will succeed when no memory allocation errors happen). These +``problematic'' functions are documented in the header file +@file{cre2.h} and, at present, are not considered part of the public +@api{} of @value{PACKAGE}. + +It is sometimes useful to try a program in the original C++ to verify if +a problem is caused by @value{PACKAGE} or is in the original RE2 code; +we may want to start by customising this program: + +@example +/* compile and run with: + + $ g++ -Wall -o proof proof.cpp -lre2 && ./proof +*/ + +#include <re2/re2.h> +#include <assert.h> + +static void try_match (RE2::Options& opt, const char * text); + +int +main (int argc, const char *const argv[]) +@{ + RE2::Options opt; + opt.set_never_nl(true); + try_match(opt, "abcdef"); + return 0; +@} +void +try_match (RE2::Options& opt, const char * text) +@{ + RE2 re("abcdef", opt); + assert(re.ok()); + assert(RE2::FullMatch(text, re)); + //assert(RE2::PartialMatch(text, re)); +@} +@end example + +@c page +@node version +@chapter Version functions + + +The installed libraries follow version numbering as established by the +@gnu{} Autotools. For an explanation of interface numbers as managed by +@gnu{} Libtool @xref{Libtool versioning, interface, Libtool's versioning +system, libtool, Shared library support for @gnu{}}.
+ + +@deftypefun {const char *} cre2_version_string (void) +Return a pointer to a statically allocated @asciiz{} string representing +the interface version number. +@end deftypefun + + +@deftypefun int cre2_version_interface_current (void) +Return an integer representing the library interface current number. +@end deftypefun + + +@deftypefun int cre2_version_interface_revision (void) +Return an integer representing the library interface current revision +number. +@end deftypefun + + +@deftypefun int cre2_version_interface_age (void) +Return an integer representing the library interface current age. +@end deftypefun + +@c page +@node regexps +@chapter Precompiled regular expressions construction + + +Regular expression objects are built and finalised as follows: + +@example +cre2_regexp_t * rex; +cre2_options_t * opt; + +opt = cre2_opt_new(); +if (opt) @{ + cre2_opt_set_log_errors(opt, 0); + rex = cre2_new("ciao", 4, opt); + if (rex) @{ + if (!cre2_error_code(rex)) + /* successfully built */ + else + /* an error occurred while compiling rex */ + cre2_delete(rex); + @} else @{ + /* rex memory allocation error */ + @} + cre2_opt_delete(opt); +@} else @{ + /* opt memory allocation error */ +@} +@end example + + +@deftp {Opaque Type} cre2_regexp_t +Opaque type for regular expression objects; it is meant to be used to +declare pointers to objects. Instances of this type can be used for any +number of matching operations and are safe for concurrent use by +multiple threads. +@end deftp + + +@deftp {Struct Typedef} cre2_string_t +Simple data structure used to reference a portion of another string. It +has the following fields: + +@table @code +@item const char * data +Pointer to the first byte in the referenced substring. + +@item int length +The number of bytes in the referenced substring. +@end table +@end deftp + + +@deftp {Enumeration Typedef} cre2_error_code_t +Enumeration type for error codes returned by @cfunc{cre2_error_code}. 
+It contains the following symbols: + +@table @code +@item CRE2_NO_ERROR +@cindex @code{CRE2_NO_ERROR} +Defined as @code{0}, represents a successful operation. + +@item CRE2_ERROR_INTERNAL +@cindex @code{CRE2_ERROR_INTERNAL} +Unexpected error. + +@item CRE2_ERROR_BAD_ESCAPE +@cindex @code{CRE2_ERROR_BAD_ESCAPE} +Bad escape sequence. + +@item CRE2_ERROR_BAD_CHAR_CLASS +@cindex @code{CRE2_ERROR_BAD_CHAR_CLASS} +Bad character class. + +@item CRE2_ERROR_BAD_CHAR_RANGE +@cindex @code{CRE2_ERROR_BAD_CHAR_RANGE} +Bad character class range. + +@item CRE2_ERROR_MISSING_BRACKET +@cindex @code{CRE2_ERROR_MISSING_BRACKET} +Missing closing @code{]}. + +@item CRE2_ERROR_MISSING_PAREN +@cindex @code{CRE2_ERROR_MISSING_PAREN} +Missing closing @code{)}. + +@item CRE2_ERROR_TRAILING_BACKSLASH +@cindex @code{CRE2_ERROR_TRAILING_BACKSLASH} +Trailing @code{\} at end of regexp. + +@item CRE2_ERROR_REPEAT_ARGUMENT +@cindex @code{CRE2_ERROR_REPEAT_ARGUMENT} +Repeat argument missing, e.g. @code{*}. + +@item CRE2_ERROR_REPEAT_SIZE +@cindex @code{CRE2_ERROR_REPEAT_SIZE} +Bad repetition argument. + +@item CRE2_ERROR_REPEAT_OP +@cindex @code{CRE2_ERROR_REPEAT_OP} +Bad repetition operator. + +@item CRE2_ERROR_BAD_PERL_OP +@cindex @code{CRE2_ERROR_BAD_PERL_OP} +Bad Perl operator. + +@item CRE2_ERROR_BAD_UTF8 +@cindex @code{CRE2_ERROR_BAD_UTF8} +Invalid @utf{}-8 in regexp. + +@item CRE2_ERROR_BAD_NAMED_CAPTURE +@cindex @code{CRE2_ERROR_BAD_NAMED_CAPTURE} +Bad named capture group. + +@item CRE2_ERROR_PATTERN_TOO_LARGE +@cindex @code{CRE2_ERROR_PATTERN_TOO_LARGE} +Pattern too large (compile failed). +@end table +@end deftp + + +@deftypefun {cre2_regexp_t *} cre2_new (const char * @var{pattern}, int @var{pattern_len}, const cre2_options_t * @var{opt}) +Build and return a new regular expression object representing the +@var{pattern} of length @var{pattern_len} bytes; the object is +configured with the options in @var{opt}. If memory allocation fails: +the return value is a @cnull{} pointer.
+ +The options object @var{opt} is duplicated in the internal state of the +regular expression instance, so @var{opt} can be safely mutated or +finalised after this call. If @var{opt} is @cnull{}: the regular +expression object is built with the default set of options. +@end deftypefun + + +@deftypefun void cre2_delete (cre2_regexp_t * @var{rex}) +Finalise a regular expression object releasing all the associated +resources. +@end deftypefun + + +@deftypefun {const char *} cre2_pattern (const cre2_regexp_t * @var{rex}) +Whether @var{rex} is a successfully built regular expression object or +not: return a pointer to the pattern string. The returned pointer is +valid only while @var{rex} is alive: if @cfunc{cre2_delete} is applied +to @var{rex} the pointer becomes invalid. +@end deftypefun + + +@deftypefun int cre2_num_capturing_groups (const cre2_regexp_t * @var{rex}) +If @var{rex} is a successfully built regular expression object: return a +non--negative integer representing the number of capturing groups +(parenthetical subexpressions) in the pattern. If an error occurred +while building @var{rex}: return @code{-1}. +@end deftypefun + + +@deftypefun int cre2_program_size (const cre2_regexp_t * @var{rex}) +If @var{rex} is a successfully built regular expression object: return a +non--negative integer representing the program size, a very approximate +measure of a regexp's ``cost''; larger numbers are more expensive than +smaller numbers. If an error occurred while building @var{rex}: return +@code{-1}. +@end deftypefun + + +@deftypefun int cre2_error_code (const cre2_regexp_t * @var{rex}) +In case an error occurred while building @var{rex}: return an integer +representing the associated error code. Return zero if no error +occurred. +@end deftypefun + + +@deftypefun {const char *} cre2_error_string (const cre2_regexp_t * @var{rex}) +If an error occurred while building @var{rex}: return a pointer to an +@asciiz{} string representing the associated error message. 
The +returned pointer is valid only while @var{rex} is alive: if +@cfunc{cre2_delete} is applied to @var{rex} the pointer becomes invalid. + +If @var{rex} is a successfully built regular expression object: return a +pointer to an empty string. + +The following code: + +@example +cre2_regexp_t * rex; + +rex = cre2_new("ci(ao", 5, NULL); +@{ + printf("error: code=%d, msg=\"%s\"\n", + cre2_error_code(rex), + cre2_error_string(rex)); +@} +cre2_delete(rex); +@end example + +@noindent +prints: + +@example +error: code=6, msg="missing ): ci(ao" +@end example +@end deftypefun + + +@deftypefun void cre2_error_arg (const cre2_regexp_t * @var{rex}, cre2_string_t * @var{arg}) +If an error occurred while building @var{rex}: fill the structure +referenced by @var{arg} with the interval of bytes representing the +offending portion of the pattern. + +If @var{rex} is a successfully built regular expression object: +@var{arg} references an empty string. + +The following code: + +@example +cre2_regexp_t * rex; +cre2_string_t S; + +rex = cre2_new("ci(ao", 5, NULL); +@{ + cre2_error_arg(rex, &S); + printf("arg: len=%d, data=\"%s\"\n", S.length, S.data); +@} +cre2_delete(rex); +@end example + +@noindent +prints: + +@example +arg: len=5, data="ci(ao" +@end example +@end deftypefun + +@c page +@node options +@chapter Matching configuration + + +Compiled regular expressions can be configured, at construction--time, +with a number of options collected in a @code{cre2_options_t} object. +Notice that, by default, when attempting to compile an invalid regular +expression pattern, RE2 will print to @code{stderr} an error message; +usually we want to avoid this logging by disabling the associated +option: + +@example +cre2_options_t * opt; + +opt = cre2_opt_new(); +cre2_opt_set_log_errors(opt, 0); +@end example + + +@deftp {Opaque Typedef} cre2_options_t +Type of opaque pointers to options objects. Any instance of this type +can be used to configure any number of regular expression objects.
+@end deftp + + +@deftp {Enumeration Typedef} cre2_encoding_t +@cindex @code{CRE2_UNKNOWN} +@cindex @code{CRE2_UTF8} +@cindex @code{CRE2_Latin1} +Enumeration type for constants selecting encoding. It contains the +following values: + +@example +CRE2_UNKNOWN +CRE2_UTF8 +CRE2_Latin1 +@end example + +The value @code{CRE2_UNKNOWN} should never be used: it exists only in +case there is a mismatch between the definitions of RE2 and +@value{PACKAGE}. +@end deftp + + +@deftypefun {cre2_options_t *} cre2_opt_new (void) +Allocate and return a new options object. If memory allocation fails: +the return value is a @cnull{} pointer. +@end deftypefun + + +@deftypefun void cre2_opt_delete (cre2_options_t * @var{opt}) +Finalise an options object releasing all the associated resources. +Compiled regular expressions configured with this object are +@strong{not} affected by its destruction. +@end deftypefun + + +All the following functions are getters and setters for regular +expression options; the @var{flag} argument to the setter must be false +to disable the option and true to enable it; unless otherwise specified +the @code{int} return value is true if the option is enabled and false +if it is disabled. + + +@deftypefun cre2_encoding_t cre2_opt_encoding (cre2_options_t * @var{opt}) +@deftypefunx void cre2_opt_set_encoding (cre2_options_t * @var{opt}, cre2_encoding_t @var{enc}) +By default, the regular expression pattern and input text are +interpreted as @utf{}-8. CRE2_Latin1 encoding causes them to be +interpreted as Latin-1. + +The getter returns @code{CRE2_UNKNOWN} if the encoding value returned by +RE2 is unknown. +@end deftypefun + + +@deftypefun int cre2_opt_posix_syntax (cre2_options_t * @var{opt}) +@deftypefunx void cre2_opt_set_posix_syntax (cre2_options_t * @var{opt}, int @var{flag}) +Restrict regexps to @posix{} egrep syntax. Default is disabled. 
+@end deftypefun + + +@deftypefun int cre2_opt_longest_match (cre2_options_t * @var{opt}) +@deftypefunx void cre2_opt_set_longest_match (cre2_options_t * @var{opt}, int @var{flag}) +Search for longest match, not first match. Default is disabled. +@end deftypefun + + +@deftypefun int cre2_opt_log_errors (cre2_options_t * @var{opt}) +@deftypefunx void cre2_opt_set_log_errors (cre2_options_t * @var{opt}, int @var{flag}) +Log syntax and execution errors to @code{stderr}. Default is enabled. +@end deftypefun + + +@deftypefun int cre2_opt_literal (cre2_options_t * @var{opt}) +@deftypefunx void cre2_opt_set_literal (cre2_options_t * @var{opt}, int @var{flag}) +Interpret the pattern string as literal, not as regular expression. +Default is disabled. + +Setting this option is equivalent to quoting all the special characters +defining a regular expression pattern: + +@example +cre2_regexp_t * rex; +cre2_options_t * opt; +const char * pattern = "(ciao) (hello)"; +const char * text = pattern; +int len = strlen(pattern); + +opt = cre2_opt_new(); +cre2_opt_set_literal(opt, 1); +rex = cre2_new(pattern, len, opt); +@{ + /* successful match */ + cre2_match(rex, text, len, 0, len, + CRE2_UNANCHORED, NULL, 0); +@} +cre2_delete(rex); +cre2_opt_delete(opt); +@end example +@end deftypefun + + +@deftypefun int cre2_opt_never_nl (cre2_options_t * @var{opt}) +@deftypefunx void cre2_opt_set_never_nl (cre2_options_t * @var{opt}, int @var{flag}) +Never match a newline character, even if it is in the regular expression +pattern; default is disabled. Turning on this option allows us to +attempt a partial match, against the beginning of a multiline text, +without using subpatterns to exclude the newline in the regexp pattern. + +@itemize +@item +When set to true: matching always fails if the text or the regexp +contains a newline. + +@item +When set to false: matching succeeds or fails taking normal account of +newlines. + +@item +The option does @strong{not} cause newlines to be skipped. 
+@end itemize +@end deftypefun + + +@deftypefun int cre2_opt_case_sensitive (cre2_options_t * @var{opt}) +@deftypefunx void cre2_opt_set_case_sensitive (cre2_options_t * @var{opt}, int @var{flag}) +Match is case--sensitive; the regular expression pattern can override +this setting with @code{(?i)} unless configured in @posix{} syntax +mode. Default is enabled. +@end deftypefun + + +@deftypefun int cre2_opt_max_mem (cre2_options_t * @var{opt}) +@deftypefunx void cre2_opt_set_max_mem (cre2_options_t * @var{opt}, int @var{m}) +The max memory option controls how much memory can be used to hold the +compiled form of the regular expression and its cached @acronym{DFA} +graphs. These functions set and get such amount of memory. See the +documentation of RE2 for details. +@end deftypefun + + +The following options are only consulted when @posix{} syntax is +enabled; when @posix{} syntax is disabled: these features are always +enabled and cannot be turned off. + + +@deftypefun int cre2_opt_perl_classes (cre2_options_t * @var{opt}) +@deftypefunx void cre2_opt_set_perl_classes (cre2_options_t * @var{opt}, int @var{flag}) +Allow Perl's @code{\d}, @code{\s}, @code{\w}, @code{\D}, @code{\S}, +@code{\W}. Default is disabled. +@end deftypefun + + +@deftypefun int cre2_opt_word_boundary (cre2_options_t * @var{opt}) +@deftypefunx void cre2_opt_set_word_boundary (cre2_options_t * @var{opt}, int @var{flag}) +Allow Perl's @code{\b}, @code{\B} (word boundary and not). Default is +disabled. +@end deftypefun + + +@deftypefun int cre2_opt_one_line (cre2_options_t * @var{opt}) +@deftypefunx void cre2_opt_set_one_line (cre2_options_t * @var{opt}, int @var{flag}) +The patterns @code{^} and @code{$} only match at the beginning and end +of the text. Default is disabled. 
+@end deftypefun + +@c page +@node matching +@chapter Matching regular expressions + + +Basic pattern matching goes as follows (with error checking omitted): + +@example +cre2_regexp_t * rex; +cre2_options_t * opt; +const char * pattern = "(ciao) (hello)"; + +opt = cre2_opt_new(); +cre2_opt_set_posix_syntax(opt, 1); + +rex = cre2_new(pattern, strlen(pattern), opt); +@{ + const char * text = "ciao hello"; + int text_len = strlen(text); + int nmatch = 3; + cre2_string_t match[nmatch]; + + cre2_match(rex, text, text_len, 0, text_len, CRE2_UNANCHORED, + match, nmatch); + + /* prints: full match: ciao hello */ + printf("full match: "); + fwrite(match[0].data, match[0].length, 1, stdout); + printf("\n"); + + /* prints: first group: ciao */ + printf("first group: "); + fwrite(match[1].data, match[1].length, 1, stdout); + printf("\n"); + + /* prints: second group: hello */ + printf("second group: "); + fwrite(match[2].data, match[2].length, 1, stdout); + printf("\n"); +@} +cre2_delete(rex); +cre2_opt_delete(opt); +@end example + + +@deftp {Enumeration Typedef} cre2_anchor_t +@cindex @code{CRE2_UNANCHORED} +@cindex @code{CRE2_ANCHOR_START} +@cindex @code{CRE2_ANCHOR_BOTH} +Enumeration type for the anchor point of matching operations. It +contains the following constants: + +@example +CRE2_UNANCHORED +CRE2_ANCHOR_START +CRE2_ANCHOR_BOTH +@end example +@end deftp + + +@deftypefun int cre2_match (const cre2_regexp_t * @var{rex}, const char * @var{text}, int @var{text_len}, int @var{start_pos}, int @var{end_pos}, cre2_anchor_t @var{anchor}, cre2_string_t * @var{match}, int @var{nmatch}) +Match a substring of the text referenced by @var{text} and holding +@var{text_len} bytes against the regular expression object @var{rex}. +Return true if the text matched, false otherwise. + +The zero--based indices @var{start_pos} (inclusive) and @var{end_pos} +(exclusive) select the substring of @var{text} to be examined. +@var{anchor} selects the anchor point for the matching operation. 
+ +Data about the matching groups is stored in the array @var{match}, which +must have at least @var{nmatch} entries; the referenced substrings are +portions of the @var{text} buffer. If we are only interested in +verifying if the text matches or not (ignoring the matching portions of +text): we can use @cnull{} as @var{match} argument and @math{0} as +@var{nmatch} argument. + +The first element of @var{match} (index @math{0}) references the full +portion of the substring of @var{text} matching the pattern; the second +element of @var{match} (index @math{1}) references the portion of text +matching the first parenthetical subexpression, the third element of +@var{match} (index @math{2}) references the portion of text matching the +second parenthetical subexpression; and so on. +@end deftypefun + + +@deftypefun int cre2_easy_match (const char * @var{pattern}, int @var{pattern_len}, const char * @var{text}, int @var{text_len}, cre2_string_t * @var{match}, int @var{nmatch}) +Like @cfunc{cre2_match} but the pattern is specified as string +@var{pattern} holding @var{pattern_len} bytes. Also the whole buffer +@var{text} is searched for a match, without anchoring. + +If the text matches the pattern: the return value is @math{1}. If the +text does not match the pattern: the return value is @math{0}. If the +pattern is invalid: the return value is @math{2}. +@end deftypefun + + +@deftp {Struct Typedef} cre2_range_t +Structure type used to represent a substring of the text to be matched +as starting and ending indices. It has the following fields: + +@table @code +@item long start +Inclusive start byte index. + +@item long past +Exclusive end byte index.
+@end table +@end deftp + + +@deftypefun void cre2_strings_to_ranges (const char * @var{text}, cre2_range_t * @var{ranges}, cre2_string_t * @var{strings}, int @var{nmatch}) +Given an array of @var{strings} with @var{nmatch} elements being the +result of matching @var{text} against a regular expression: fill the +array of @var{ranges} with the index intervals in the @var{text} buffer +representing the same results. +@end deftypefun + +@c page +@node other +@chapter Other matching functions + + +The following functions match a buffer of text against a regular +expression, allowing the extraction of portions of text matching +parenthetical subexpressions. All of them show the following behaviour: + +@itemize +@item +If the text matches the pattern: the return value is @math{1}; if the +text does not match the pattern: the return value is @math{0}. + +@item +If the pattern is invalid: the return value is @math{0}; there is no way +to distinguish this case from the case of text not matching other than +looking at what RE2 prints to @code{stderr}. + +@item +It is impossible to turn off logging of error messages to @code{stderr} +when the specification of the regular expression is invalid. + +@item +Data about the matching groups is stored in the array @var{match}, which +must have at least @var{nmatch} slots; the referenced substrings are +portions of the @var{text} buffer. + +@item +The array @var{match} can have a number of slots between zero (included) +and the number of parenthetical subexpressions in @var{pattern} +(included); if @var{nmatch} is greater than the number of parenthetical +subexpressions: the return value is @math{0}. + +@item +If we are only interested in verifying if the text matches the pattern +or not: we can use @cnull{} as @var{match} argument and @math{0} as +@var{nmatch} argument.
+ +@item +The first slot of @var{match} (index @math{0}) references the portion of +text matching the first parenthetical subexpression; the second slot of +@var{match} (index @math{1}) references the portion of text matching the +second parenthetical subexpression; and so on. +@end itemize + +@noindent +see the documentation of each function for the differences. + +The following example is a successful match: + +@example +const char * pattern = "ci.*ut"; +const char * text = "ciao salut"; +cre2_string_t input = @{ + .data = text, + .length = strlen(text) +@}; +int result; +result = cre2_full_match(pattern, &input, NULL, 0); + +result @result{} 1 +@end example + +@noindent +the following example is a successful match in which the parenthetical +subexpression is ignored: + +@example +const char * pattern = "(ciao) salut"; +const char * text = "ciao salut"; +cre2_string_t input = @{ + .data = text, + .length = strlen(text) +@}; +int result; +result = cre2_full_match(pattern, &input, NULL, 0); + +result @result{} 1 +@end example + +@noindent +the following example is a successful match in which the portion of text +matching the parenthetical subexpression is reported: + +@example +const char * pattern = "(ciao) salut"; +const char * text = "ciao salut"; +cre2_string_t input = @{ + .data = text, + .length = strlen(text) +@}; +int nmatch = 1; +cre2_string_t match[nmatch]; +int result; +result = cre2_full_match(pattern, &input, match, nmatch); + +result @result{} 1 +strncmp(text, input.data, input.length) @result{} 0 +strncmp("ciao", match[0].data, match[0].length) @result{} 0 +@end example + + +@deftypefun int cre2_full_match (const char * @var{pattern}, const cre2_string_t * @var{text}, cre2_string_t * @var{match}, int @var{nmatch}) +@deftypefunx int cre2_full_match_re (cre2_regexp_t * @var{rex}, const cre2_string_t * @var{text}, cre2_string_t * @var{match}, int @var{nmatch}) +Match the zero--terminated string @var{pattern} or the precompiled +regular expression 
@var{rex} against the full buffer @var{text}. + +For example: the text @code{abcdef} matches the pattern @code{abcdef} +according to this function, but neither the pattern @code{abc} nor the +pattern @code{def} will match. +@end deftypefun + + +@deftypefun int cre2_partial_match (const char * @var{pattern}, const cre2_string_t * @var{text}, cre2_string_t * @var{match}, int @var{nmatch}) +@deftypefunx int cre2_partial_match_re (cre2_regexp_t * @var{rex}, const cre2_string_t * @var{text}, cre2_string_t * @var{match}, int @var{nmatch}) +Match the zero--terminated string @var{pattern} or the precompiled +regular expression @var{rex} against the buffer @var{text}, resulting in +success if a substring of @var{text} matches; these functions behave +like the full match ones, but the matching text does not need to be +anchored to the beginning and end. + +For example: the text @code{abcDEFghi} matches the pattern @code{DEF} +according to this function. +@end deftypefun + + +@deftypefun int cre2_consume (const char * @var{pattern}, cre2_string_t * @var{text}, cre2_string_t * @var{match}, int @var{nmatch}) +@deftypefunx int cre2_consume_re (cre2_regexp_t * @var{rex}, cre2_string_t * @var{text}, cre2_string_t * @var{match}, int @var{nmatch}) +Match the zero--terminated string @var{pattern} or the precompiled +regular expression @var{rex} against the buffer @var{text}, resulting in +success if the prefix of @var{text} matches. The data structure +referenced by @var{text} is mutated to reference text right after the +last byte that matched the pattern. + +For example: the text @code{abcDEF} matches the pattern @code{abc} +according to this function; after the call @var{text} will reference the +text @code{DEF}. 
+@end deftypefun + + +@deftypefun int cre2_find_and_consume (const char * @var{pattern}, cre2_string_t * @var{text}, cre2_string_t * @var{match}, int @var{nmatch}) +@deftypefunx int cre2_find_and_consume_re (cre2_regexp_t * @var{rex}, cre2_string_t * @var{text}, cre2_string_t * @var{match}, int @var{nmatch}) +Match the zero--terminated string @var{pattern} or the precompiled +regular expression @var{rex} against the buffer @var{text}, resulting in +success if, after skipping a non--matching prefix in @var{text}, a +substring of @var{text} matches. The data structure referenced by +@var{text} is mutated to reference text right after the last byte that +matched the pattern. + +For example: the text @code{abcDEFghi} matches the pattern @code{DEF} +according to this function; the prefix @code{abc} is skipped; after the +call @var{text} will reference the text @code{ghi}. +@end deftypefun + +@c page +@node tips +@chapter Tips for using the regexp syntax + + +@menu +* tips dot:: Matching newlines with the + @code{.} subpattern. +@end menu + +@c page +@node tips dot +@section Matching newlines with the @code{.} subpattern + + +By default the dot subpattern @code{.} matches any character but +newlines; to enable newline matching we have to enable the @code{s} flag +using the special subpattern @samp{(?<flags>)} or +@samp{(?<flags>:<re>)}, where @code{<flags>} is a sequence of +characters, one character for each flag, and @code{<re>} is a regexp +subpattern. Notice that the parentheses in @code{(?<flags>:<re>)} are +non--capturing. + + So let's consider the text @code{ciao\nhello}: + +@itemize +@item +The regexp @code{ciao.hello} does @strong{not} match because @code{s} is +disabled. + +@item +The regexp @code{(?s)ciao.hello} matches because the subpattern +@code{(?s)} has enabled flag @code{s} for the rest of the pattern, +including the dot. + +@item +The regexp @code{ciao(?s).hello} matches because the subpattern +@code{(?s)} has enabled flag @code{s} for the rest of the pattern, +including the dot.
+ +@item +The regexp @code{ciao(?s:.)hello} matches because the subpattern +@code{(?s:.)} has enabled flag @code{s} for the subpattern @code{.} +which is the dot. +@end itemize + +@c page +@node Package License +@appendix Package license + + +Copyright @copyright{} 2012 Marco Maggi @value{AUTHOR_URL}@* +Copyright @copyright{} 2011 Keegan McAllister @url{http://github.com/kmcallister/}@* +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +@enumerate +@item +Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +@item +Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +@item +Neither the name of the author nor the names of his contributors may be +used to endorse or promote products derived from this software without +specific prior written permission. +@end enumerate + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS +IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +@c page + +@include fdl-1.3.texi + +@c page +@node references +@appendix Bibliography and references + + + +@c page +@node concept index +@appendix An entry for each concept + +@printindex cp + +@node function index +@appendix An entry for each function. + +@printindex fn + +@node variable index +@appendix An entry for each variable. + +@printindex vr + +@node type index +@appendix An entry for each type. + +@printindex tp + +@contents +@bye + +@c end of file diff --git a/outside/cre2/src/doc/fdl-1.3.texi b/outside/cre2/src/doc/fdl-1.3.texi new file mode 100644 index 000000000..1a7835ec5 --- /dev/null +++ b/outside/cre2/src/doc/fdl-1.3.texi @@ -0,0 +1,509 @@ +@node Documentation License +@appendix GNU Free Documentation License + +@cindex FDL, GNU Free Documentation License +@center Version 1.3, 3 November 2008 + +@c This file is intended to be included within another document, +@c hence no sectioning command or @node. + +@display +Copyright @copyright{} 2000, 2001, 2002, 2007, 2008 Free Software Foundation, Inc. +@uref{http://fsf.org/} + +Everyone is permitted to copy and distribute verbatim copies +of this license document, but changing it is not allowed. +@end display + +@enumerate 0 +@item +PREAMBLE + +The purpose of this License is to make a manual, textbook, or other +functional and useful document @dfn{free} in the sense of freedom: to +assure everyone the effective freedom to copy and redistribute it, +with or without modifying it, either commercially or noncommercially. +Secondarily, this License preserves for the author and publisher a way +to get credit for their work, while not being considered responsible +for modifications made by others. + +This License is a kind of ``copyleft'', which means that derivative +works of the document must themselves be free in the same sense. It +complements the GNU General Public License, which is a copyleft +license designed for free software. 
+ +We have designed this License in order to use it for manuals for free +software, because free software needs free documentation: a free +program should come with manuals providing the same freedoms that the +software does. But this License is not limited to software manuals; +it can be used for any textual work, regardless of subject matter or +whether it is published as a printed book. We recommend this License +principally for works whose purpose is instruction or reference. + +@item +APPLICABILITY AND DEFINITIONS + +This License applies to any manual or other work, in any medium, that +contains a notice placed by the copyright holder saying it can be +distributed under the terms of this License. Such a notice grants a +world-wide, royalty-free license, unlimited in duration, to use that +work under the conditions stated herein. The ``Document'', below, +refers to any such manual or work. Any member of the public is a +licensee, and is addressed as ``you''. You accept the license if you +copy, modify or distribute the work in a way requiring permission +under copyright law. + +A ``Modified Version'' of the Document means any work containing the +Document or a portion of it, either copied verbatim, or with +modifications and/or translated into another language. + +A ``Secondary Section'' is a named appendix or a front-matter section +of the Document that deals exclusively with the relationship of the +publishers or authors of the Document to the Document's overall +subject (or to related matters) and contains nothing that could fall +directly within that overall subject. (Thus, if the Document is in +part a textbook of mathematics, a Secondary Section may not explain +any mathematics.) The relationship could be a matter of historical +connection with the subject or with related matters, or of legal, +commercial, philosophical, ethical or political position regarding +them. 
+ +The ``Invariant Sections'' are certain Secondary Sections whose titles +are designated, as being those of Invariant Sections, in the notice +that says that the Document is released under this License. If a +section does not fit the above definition of Secondary then it is not +allowed to be designated as Invariant. The Document may contain zero +Invariant Sections. If the Document does not identify any Invariant +Sections then there are none. + +The ``Cover Texts'' are certain short passages of text that are listed, +as Front-Cover Texts or Back-Cover Texts, in the notice that says that +the Document is released under this License. A Front-Cover Text may +be at most 5 words, and a Back-Cover Text may be at most 25 words. + +A ``Transparent'' copy of the Document means a machine-readable copy, +represented in a format whose specification is available to the +general public, that is suitable for revising the document +straightforwardly with generic text editors or (for images composed of +pixels) generic paint programs or (for drawings) some widely available +drawing editor, and that is suitable for input to text formatters or +for automatic translation to a variety of formats suitable for input +to text formatters. A copy made in an otherwise Transparent file +format whose markup, or absence of markup, has been arranged to thwart +or discourage subsequent modification by readers is not Transparent. +An image format is not Transparent if used for any substantial amount +of text. A copy that is not ``Transparent'' is called ``Opaque''. + +Examples of suitable formats for Transparent copies include plain +@sc{ascii} without markup, Texinfo input format, La@TeX{} input +format, @acronym{SGML} or @acronym{XML} using a publicly available +@acronym{DTD}, and standard-conforming simple @acronym{HTML}, +PostScript or @acronym{PDF} designed for human modification. Examples +of transparent image formats include @acronym{PNG}, @acronym{XCF} and +@acronym{JPG}. 
Opaque formats include proprietary formats that can be +read and edited only by proprietary word processors, @acronym{SGML} or +@acronym{XML} for which the @acronym{DTD} and/or processing tools are +not generally available, and the machine-generated @acronym{HTML}, +PostScript or @acronym{PDF} produced by some word processors for +output purposes only. + +The ``Title Page'' means, for a printed book, the title page itself, +plus such following pages as are needed to hold, legibly, the material +this License requires to appear in the title page. For works in +formats which do not have any title page as such, ``Title Page'' means +the text near the most prominent appearance of the work's title, +preceding the beginning of the body of the text. + +The ``publisher'' means any person or entity that distributes copies +of the Document to the public. + +A section ``Entitled XYZ'' means a named subunit of the Document whose +title either is precisely XYZ or contains XYZ in parentheses following +text that translates XYZ in another language. (Here XYZ stands for a +specific section name mentioned below, such as ``Acknowledgements'', +``Dedications'', ``Endorsements'', or ``History''.) To ``Preserve the Title'' +of such a section when you modify the Document means that it remains a +section ``Entitled XYZ'' according to this definition. + +The Document may include Warranty Disclaimers next to the notice which +states that this License applies to the Document. These Warranty +Disclaimers are considered to be included by reference in this +License, but only as regards disclaiming warranties: any other +implication that these Warranty Disclaimers may have is void and has +no effect on the meaning of this License. 
+ +@item +VERBATIM COPYING + +You may copy and distribute the Document in any medium, either +commercially or noncommercially, provided that this License, the +copyright notices, and the license notice saying this License applies +to the Document are reproduced in all copies, and that you add no other +conditions whatsoever to those of this License. You may not use +technical measures to obstruct or control the reading or further +copying of the copies you make or distribute. However, you may accept +compensation in exchange for copies. If you distribute a large enough +number of copies you must also follow the conditions in section 3. + +You may also lend copies, under the same conditions stated above, and +you may publicly display copies. + +@item +COPYING IN QUANTITY + +If you publish printed copies (or copies in media that commonly have +printed covers) of the Document, numbering more than 100, and the +Document's license notice requires Cover Texts, you must enclose the +copies in covers that carry, clearly and legibly, all these Cover +Texts: Front-Cover Texts on the front cover, and Back-Cover Texts on +the back cover. Both covers must also clearly and legibly identify +you as the publisher of these copies. The front cover must present +the full title with all words of the title equally prominent and +visible. You may add other material on the covers in addition. +Copying with changes limited to the covers, as long as they preserve +the title of the Document and satisfy these conditions, can be treated +as verbatim copying in other respects. + +If the required texts for either cover are too voluminous to fit +legibly, you should put the first ones listed (as many as fit +reasonably) on the actual cover, and continue the rest onto adjacent +pages. 
+ +If you publish or distribute Opaque copies of the Document numbering +more than 100, you must either include a machine-readable Transparent +copy along with each Opaque copy, or state in or with each Opaque copy +a computer-network location from which the general network-using +public has access to download using public-standard network protocols +a complete Transparent copy of the Document, free of added material. +If you use the latter option, you must take reasonably prudent steps, +when you begin distribution of Opaque copies in quantity, to ensure +that this Transparent copy will remain thus accessible at the stated +location until at least one year after the last time you distribute an +Opaque copy (directly or through your agents or retailers) of that +edition to the public. + +It is requested, but not required, that you contact the authors of the +Document well before redistributing any large number of copies, to give +them a chance to provide you with an updated version of the Document. + +@item +MODIFICATIONS + +You may copy and distribute a Modified Version of the Document under +the conditions of sections 2 and 3 above, provided that you release +the Modified Version under precisely this License, with the Modified +Version filling the role of the Document, thus licensing distribution +and modification of the Modified Version to whoever possesses a copy +of it. In addition, you must do these things in the Modified Version: + +@enumerate A +@item +Use in the Title Page (and on the covers, if any) a title distinct +from that of the Document, and from those of previous versions +(which should, if there were any, be listed in the History section +of the Document). You may use the same title as a previous version +if the original publisher of that version gives permission. 
+ +@item +List on the Title Page, as authors, one or more persons or entities +responsible for authorship of the modifications in the Modified +Version, together with at least five of the principal authors of the +Document (all of its principal authors, if it has fewer than five), +unless they release you from this requirement. + +@item +State on the Title page the name of the publisher of the +Modified Version, as the publisher. + +@item +Preserve all the copyright notices of the Document. + +@item +Add an appropriate copyright notice for your modifications +adjacent to the other copyright notices. + +@item +Include, immediately after the copyright notices, a license notice +giving the public permission to use the Modified Version under the +terms of this License, in the form shown in the Addendum below. + +@item +Preserve in that license notice the full lists of Invariant Sections +and required Cover Texts given in the Document's license notice. + +@item +Include an unaltered copy of this License. + +@item +Preserve the section Entitled ``History'', Preserve its Title, and add +to it an item stating at least the title, year, new authors, and +publisher of the Modified Version as given on the Title Page. If +there is no section Entitled ``History'' in the Document, create one +stating the title, year, authors, and publisher of the Document as +given on its Title Page, then add an item describing the Modified +Version as stated in the previous sentence. + +@item +Preserve the network location, if any, given in the Document for +public access to a Transparent copy of the Document, and likewise +the network locations given in the Document for previous versions +it was based on. These may be placed in the ``History'' section. +You may omit a network location for a work that was published at +least four years before the Document itself, or if the original +publisher of the version it refers to gives permission. 
+ +@item +For any section Entitled ``Acknowledgements'' or ``Dedications'', Preserve +the Title of the section, and preserve in the section all the +substance and tone of each of the contributor acknowledgements and/or +dedications given therein. + +@item +Preserve all the Invariant Sections of the Document, +unaltered in their text and in their titles. Section numbers +or the equivalent are not considered part of the section titles. + +@item +Delete any section Entitled ``Endorsements''. Such a section +may not be included in the Modified Version. + +@item +Do not retitle any existing section to be Entitled ``Endorsements'' or +to conflict in title with any Invariant Section. + +@item +Preserve any Warranty Disclaimers. +@end enumerate + +If the Modified Version includes new front-matter sections or +appendices that qualify as Secondary Sections and contain no material +copied from the Document, you may at your option designate some or all +of these sections as invariant. To do this, add their titles to the +list of Invariant Sections in the Modified Version's license notice. +These titles must be distinct from any other section titles. + +You may add a section Entitled ``Endorsements'', provided it contains +nothing but endorsements of your Modified Version by various +parties---for example, statements of peer review or that the text has +been approved by an organization as the authoritative definition of a +standard. + +You may add a passage of up to five words as a Front-Cover Text, and a +passage of up to 25 words as a Back-Cover Text, to the end of the list +of Cover Texts in the Modified Version. Only one passage of +Front-Cover Text and one of Back-Cover Text may be added by (or +through arrangements made by) any one entity. 
If the Document already +includes a cover text for the same cover, previously added by you or +by arrangement made by the same entity you are acting on behalf of, +you may not add another; but you may replace the old one, on explicit +permission from the previous publisher that added the old one. + +The author(s) and publisher(s) of the Document do not by this License +give permission to use their names for publicity for or to assert or +imply endorsement of any Modified Version. + +@item +COMBINING DOCUMENTS + +You may combine the Document with other documents released under this +License, under the terms defined in section 4 above for modified +versions, provided that you include in the combination all of the +Invariant Sections of all of the original documents, unmodified, and +list them all as Invariant Sections of your combined work in its +license notice, and that you preserve all their Warranty Disclaimers. + +The combined work need only contain one copy of this License, and +multiple identical Invariant Sections may be replaced with a single +copy. If there are multiple Invariant Sections with the same name but +different contents, make the title of each such section unique by +adding at the end of it, in parentheses, the name of the original +author or publisher of that section if known, or else a unique number. +Make the same adjustment to the section titles in the list of +Invariant Sections in the license notice of the combined work. + +In the combination, you must combine any sections Entitled ``History'' +in the various original documents, forming one section Entitled +``History''; likewise combine any sections Entitled ``Acknowledgements'', +and any sections Entitled ``Dedications''. 
You must delete all +sections Entitled ``Endorsements.'' + +@item +COLLECTIONS OF DOCUMENTS + +You may make a collection consisting of the Document and other documents +released under this License, and replace the individual copies of this +License in the various documents with a single copy that is included in +the collection, provided that you follow the rules of this License for +verbatim copying of each of the documents in all other respects. + +You may extract a single document from such a collection, and distribute +it individually under this License, provided you insert a copy of this +License into the extracted document, and follow this License in all +other respects regarding verbatim copying of that document. + +@item +AGGREGATION WITH INDEPENDENT WORKS + +A compilation of the Document or its derivatives with other separate +and independent documents or works, in or on a volume of a storage or +distribution medium, is called an ``aggregate'' if the copyright +resulting from the compilation is not used to limit the legal rights +of the compilation's users beyond what the individual works permit. +When the Document is included in an aggregate, this License does not +apply to the other works in the aggregate which are not themselves +derivative works of the Document. + +If the Cover Text requirement of section 3 is applicable to these +copies of the Document, then if the Document is less than one half of +the entire aggregate, the Document's Cover Texts may be placed on +covers that bracket the Document within the aggregate, or the +electronic equivalent of covers if the Document is in electronic form. +Otherwise they must appear on printed covers that bracket the whole +aggregate. + +@item +TRANSLATION + +Translation is considered a kind of modification, so you may +distribute translations of the Document under the terms of section 4. 
+Replacing Invariant Sections with translations requires special +permission from their copyright holders, but you may include +translations of some or all Invariant Sections in addition to the +original versions of these Invariant Sections. You may include a +translation of this License, and all the license notices in the +Document, and any Warranty Disclaimers, provided that you also include +the original English version of this License and the original versions +of those notices and disclaimers. In case of a disagreement between +the translation and the original version of this License or a notice +or disclaimer, the original version will prevail. + +If a section in the Document is Entitled ``Acknowledgements'', +``Dedications'', or ``History'', the requirement (section 4) to Preserve +its Title (section 1) will typically require changing the actual +title. + +@item +TERMINATION + +You may not copy, modify, sublicense, or distribute the Document +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense, or distribute it is void, and +will automatically terminate your rights under this License. + +However, if you cease all violation of this License, then your license +from a particular copyright holder is reinstated (a) provisionally, +unless and until the copyright holder explicitly and finally +terminates your license, and (b) permanently, if the copyright holder +fails to notify you of the violation by some reasonable means prior to +60 days after the cessation. + +Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ +Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, receipt of a copy of some or all of the same material does +not give you any rights to use it. + +@item +FUTURE REVISIONS OF THIS LICENSE + +The Free Software Foundation may publish new, revised versions +of the GNU Free Documentation License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. See +@uref{http://www.gnu.org/copyleft/}. + +Each version of the License is given a distinguishing version number. +If the Document specifies that a particular numbered version of this +License ``or any later version'' applies to it, you have the option of +following the terms and conditions either of that specified version or +of any later version that has been published (not as a draft) by the +Free Software Foundation. If the Document does not specify a version +number of this License, you may choose any version ever published (not +as a draft) by the Free Software Foundation. If the Document +specifies that a proxy can decide which future versions of this +License can be used, that proxy's public statement of acceptance of a +version permanently authorizes you to choose that version for the +Document. + +@item +RELICENSING + +``Massive Multiauthor Collaboration Site'' (or ``MMC Site'') means any +World Wide Web server that publishes copyrightable works and also +provides prominent facilities for anybody to edit those works. A +public wiki that anybody can edit is an example of such a server. A +``Massive Multiauthor Collaboration'' (or ``MMC'') contained in the +site means any set of copyrightable works thus published on the MMC +site. 
+ +``CC-BY-SA'' means the Creative Commons Attribution-Share Alike 3.0 +license published by Creative Commons Corporation, a not-for-profit +corporation with a principal place of business in San Francisco, +California, as well as future copyleft versions of that license +published by that same organization. + +``Incorporate'' means to publish or republish a Document, in whole or +in part, as part of another Document. + +An MMC is ``eligible for relicensing'' if it is licensed under this +License, and if all works that were first published under this License +somewhere other than this MMC, and subsequently incorporated in whole +or in part into the MMC, (1) had no cover texts or invariant sections, +and (2) were thus incorporated prior to November 1, 2008. + +The operator of an MMC Site may republish an MMC contained in the site +under CC-BY-SA on the same site at any time before August 1, 2009, +provided the MMC is eligible for relicensing. + +@end enumerate + +@page +@heading ADDENDUM: How to use this License for your documents + +To use this License in a document you have written, include a copy of +the License in the document and put the following copyright and +license notices just after the title page: + +@smallexample +@group + Copyright (C) @var{year} @var{your name}. + Permission is granted to copy, distribute and/or modify this document + under the terms of the GNU Free Documentation License, Version 1.3 + or any later version published by the Free Software Foundation; + with no Invariant Sections, no Front-Cover Texts, and no Back-Cover + Texts. A copy of the license is included in the section entitled ``GNU + Free Documentation License''. 
+@end group +@end smallexample + +If you have Invariant Sections, Front-Cover Texts and Back-Cover Texts, +replace the ``with@dots{}Texts.'' line with this: + +@smallexample +@group + with the Invariant Sections being @var{list their titles}, with + the Front-Cover Texts being @var{list}, and with the Back-Cover Texts + being @var{list}. +@end group +@end smallexample + +If you have Invariant Sections without Cover Texts, or some other +combination of the three, merge those two alternatives to suit the +situation. + +If your document contains nontrivial examples of program code, we +recommend releasing these examples in parallel under your choice of +free software license, such as the GNU General Public License, +to permit their use in free software. + +@c Local Variables: +@c ispell-local-pdict: "ispell-dict" +@c End: + diff --git a/outside/cre2/src/prepare.sh b/outside/cre2/src/prepare.sh new file mode 100644 index 000000000..74afa769b --- /dev/null +++ b/outside/cre2/src/prepare.sh @@ -0,0 +1,10 @@ +# prepare.sh -- +# +# Run this to rebuild the infrastructure and configure. + +set -xe + +(cd .. && sh autogen.sh) +sh ../configure.sh + +### end of file diff --git a/outside/cre2/src/src/cre2.cpp b/outside/cre2/src/src/cre2.cpp new file mode 100644 index 000000000..e44039b09 --- /dev/null +++ b/outside/cre2/src/src/cre2.cpp @@ -0,0 +1,605 @@ +/* + Source file for CRE2, a C language wrapper for RE2: a regular + expressions library by Google. + + Copyright (c) 2012 Marco Maggi + Copyright (c) 2011 Keegan McAllister + All rights reserved. + + For the license notice see the COPYING file. +*/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include "cre2.h" + +#include +#include + + +/** -------------------------------------------------------------------- + ** Version functions. 
+ ** ----------------------------------------------------------------- */ + +const char * +cre2_version_string (void) +{ + return cre2_VERSION_INTERFACE_STRING; +} +int +cre2_version_interface_current (void) +{ + return cre2_VERSION_INTERFACE_CURRENT; +} +int +cre2_version_interface_revision (void) +{ + return cre2_VERSION_INTERFACE_REVISION; +} +int +cre2_version_interface_age (void) +{ + return cre2_VERSION_INTERFACE_AGE; +} + + +/** -------------------------------------------------------------------- + ** Options objects. + ** ----------------------------------------------------------------- */ + +/* Cast the pointer argument "opt" to a pointer of type + "RE2::Options*". */ +#define TO_OPT(opt) (reinterpret_cast(opt)) + +cre2_options_t * +cre2_opt_new(void) +/* Allocate and return a new options object. */ +{ + // FIXME: is this use of "nothrow" good to avoid raising exceptions + // when memory allocation fails and to return NULL instead? + return reinterpret_cast(new (std::nothrow) RE2::Options()); +} +void +cre2_opt_delete (cre2_options_t *opt) +/* Finalise an options object. */ +{ + delete TO_OPT(opt); +} + +/* Set or unset option flags in an options object. */ +#define OPT_BOOL(name) \ + void cre2_opt_set_##name (cre2_options_t *opt, int flag) \ + { \ + TO_OPT(opt)->set_##name(bool(flag)); \ + } \ + int cre2_opt_##name (cre2_options_t *opt) \ + { \ + return TO_OPT(opt)->name(); \ + } +OPT_BOOL(posix_syntax) +OPT_BOOL(longest_match) +OPT_BOOL(log_errors) +OPT_BOOL(literal) +OPT_BOOL(never_nl) +OPT_BOOL(case_sensitive) +OPT_BOOL(perl_classes) +OPT_BOOL(word_boundary) +OPT_BOOL(one_line) +#undef OPT_BOOL + +void +cre2_opt_set_encoding (cre2_options_t *opt, cre2_encoding_t enc) +/* Select the encoding in an options object. 
*/ +{ + switch (enc) { + case CRE2_UTF8: + TO_OPT(opt)->set_encoding(RE2::Options::EncodingUTF8); + break; + case CRE2_Latin1: + TO_OPT(opt)->set_encoding(RE2::Options::EncodingLatin1); + break; + default: + fprintf(stderr, "CRE2: internal error: unknown encoding %d\n", enc); + exit(EXIT_FAILURE); + } +} +cre2_encoding_t +cre2_opt_encoding (cre2_options_t *opt) +{ + RE2::Options::Encoding E = TO_OPT(opt)->encoding(); + switch (E) { + case RE2::Options::EncodingUTF8: + return CRE2_UTF8; + case RE2::Options::EncodingLatin1: + return CRE2_Latin1; + default: + return CRE2_UNKNOWN; + } +} +void +cre2_opt_set_max_mem (cre2_options_t *opt, int m) +/* Configure the maximum amount of memory in an options object. */ +{ + TO_OPT(opt)->set_max_mem(m); +} +int +cre2_opt_max_mem (cre2_options_t *opt) +{ + return TO_OPT(opt)->max_mem(); +} + + +/** -------------------------------------------------------------------- + ** Precompiled regular expressions objects. + ** ----------------------------------------------------------------- */ + +#define TO_RE2(re) (reinterpret_cast(re)) +#define TO_CONST_RE2(re) (reinterpret_cast(re)) + +cre2_regexp_t * +cre2_new (const char *pattern, int pattern_len, const cre2_options_t *opt) +{ + re2::StringPiece pattern_re2(pattern, pattern_len); + if (opt) { + // FIXME: is this use of "nothrow" enough to avoid raising + // exceptions when memory allocation fails and to return NULL + // instead? 
+ return reinterpret_cast + (new (std::nothrow) RE2(pattern_re2, *reinterpret_cast(opt))); + } else { + return reinterpret_cast (new (std::nothrow) RE2(pattern_re2)); + } +} +void +cre2_delete (cre2_regexp_t *re) +{ + delete TO_RE2(re); +} +const char * +cre2_pattern (const cre2_regexp_t *re) +{ + return TO_CONST_RE2(re)->pattern().c_str(); +} +int +cre2_error_code (const cre2_regexp_t *re) +{ + return int(TO_CONST_RE2(re)->error_code()); +} +const char * +cre2_error_string (const cre2_regexp_t *re) +{ + return TO_CONST_RE2(re)->error().c_str(); +} +void +cre2_error_arg (const cre2_regexp_t *re, cre2_string_t *arg) +{ + const std::string &argstr = TO_CONST_RE2(re)->error_arg(); + arg->data = argstr.data(); + arg->length = argstr.length(); +} +int +cre2_num_capturing_groups (const cre2_regexp_t *re) +{ + return TO_CONST_RE2(re)->NumberOfCapturingGroups(); +} +int +cre2_program_size (const cre2_regexp_t *re) +{ + return TO_CONST_RE2(re)->ProgramSize(); +} + + +/** -------------------------------------------------------------------- + ** Matching with precompiled regular expressions objects. 
+ ** ----------------------------------------------------------------- */ + +int +cre2_match (const cre2_regexp_t *re , const char *text, + int textlen, int startpos, int endpos, cre2_anchor_t anchor, + cre2_string_t *match, int nmatch) +{ + re2::StringPiece text_re2(text, textlen); + re2::StringPiece match_re2[nmatch]; + RE2::Anchor anchor_re2 = RE2::UNANCHORED; + bool retval; // 0 for no match + // 1 for successful matching + switch (anchor) { + case CRE2_ANCHOR_START: + anchor_re2 = RE2::ANCHOR_START; + break; + case CRE2_ANCHOR_BOTH: + anchor_re2 = RE2::ANCHOR_BOTH; + break; + case CRE2_UNANCHORED: + break; + } + retval = TO_CONST_RE2(re)->Match(text_re2, startpos, endpos, anchor_re2, match_re2, nmatch); + if (retval) { + for (int i=0; idata, text->length); \ + re2::StringPiece strv[nmatch]; \ + RE2::Arg argv[nmatch]; \ + RE2::Arg * args[nmatch]; \ + bool retval; \ + for (int i=0; idata, text->length); \ + re2::StringPiece strv[nmatch]; \ + RE2::Arg argv[nmatch]; \ + RE2::Arg * args[nmatch]; \ + bool retval; \ + for (int i=0; idata = input.data(); \ + text->length = input.length(); \ + for (int i=0; idata, text->length); \ + re2::StringPiece strv[nmatch]; \ + RE2::Arg argv[nmatch]; \ + RE2::Arg * args[nmatch]; \ + bool retval; \ + for (int i=0; idata, text->length); \ + re2::StringPiece strv[nmatch]; \ + RE2::Arg argv[nmatch]; \ + RE2::Arg * args[nmatch]; \ + bool retval; \ + for (int i=0; idata = input.data(); \ + text->length = input.length(); \ + for (int i=0; idata, text_and_target->length); + re2::StringPiece R(rewrite->data, rewrite->length); + char * buffer; /* this exists to make GCC shut up about const */ + bool retval; + retval = RE2::Replace(&S, pattern, R); + text_and_target->length = S.length(); + buffer = (char *)malloc(1+text_and_target->length); + if (buffer) { + S.copy(buffer, text_and_target->length); + buffer[text_and_target->length] = '\0'; + text_and_target->data = buffer; + } else + return -1; + return int(retval); + } catch(const 
std::exception &e) { + // e.what(); + return -1; + } catch(...) { + return -1; + } +} +int +cre2_replace_re (cre2_regexp_t * rex, cre2_string_t * text_and_target, cre2_string_t * rewrite) +{ + std::string S(text_and_target->data, text_and_target->length); + re2::StringPiece R(rewrite->data, rewrite->length); + char * buffer; /* this exists to make GCC shut up about const */ + bool retval; + retval = RE2::Replace(&S, *TO_RE2(rex), R); + text_and_target->length = S.length(); + buffer = (char *)malloc(1+text_and_target->length); + if (buffer) { + S.copy(buffer, text_and_target->length); + buffer[text_and_target->length] = '\0'; + text_and_target->data = buffer; + } else + return -1; + return int(retval); +} + +/* ------------------------------------------------------------------ */ + +int +cre2_global_replace (const char * pattern, cre2_string_t * text_and_target, cre2_string_t * rewrite) +{ + std::string S(text_and_target->data, text_and_target->length); + re2::StringPiece R(rewrite->data, rewrite->length); + char * buffer; /* this exists to make GCC shut up about const */ + int retval; + retval = RE2::GlobalReplace(&S, pattern, R); + text_and_target->length = S.length(); + buffer = (char *)malloc(1+text_and_target->length); + if (buffer) { + S.copy(buffer, text_and_target->length); + buffer[text_and_target->length] = '\0'; + text_and_target->data = buffer; + } else + return -1; + return int(retval); +} +int +cre2_global_replace_re (cre2_regexp_t * rex, cre2_string_t * text_and_target, cre2_string_t * rewrite) +{ + std::string S(text_and_target->data, text_and_target->length); + re2::StringPiece R(rewrite->data, rewrite->length); + char * buffer; /* this exists to make GCC shut up about const */ + int retval; + retval = RE2::GlobalReplace(&S, *TO_RE2(rex), R); + text_and_target->length = S.length(); + buffer = (char *)malloc(1+text_and_target->length); + if (buffer) { + S.copy(buffer, text_and_target->length); + buffer[text_and_target->length] = '\0'; + 
text_and_target->data = buffer; + } else + return -1; + return retval; +} + +/* ------------------------------------------------------------------ */ + +int +cre2_extract (const char * pattern, cre2_string_t * text, + cre2_string_t * rewrite, cre2_string_t * target) +{ + re2::StringPiece T(text->data, text->length); + re2::StringPiece R(rewrite->data, rewrite->length); + std::string O; + char * buffer; /* this exists to make GCC shut up about const */ + bool retval; + retval = RE2::Extract(T, pattern, R, &O); + target->length = O.length(); + buffer = (char *)malloc(1+target->length); + if (buffer) { + O.copy(buffer, target->length); + buffer[target->length] = '\0'; + target->data = buffer; + } else + return -1; + return int(retval); +} +int +cre2_extract_re (cre2_regexp_t * rex, cre2_string_t * text, + cre2_string_t * rewrite, cre2_string_t * target) +{ + re2::StringPiece T(text->data, text->length); + re2::StringPiece R(rewrite->data, rewrite->length); + std::string O; + char * buffer; /* this exists to make GCC shut up about const */ + bool retval; + retval = RE2::Extract(T, *TO_RE2(rex), R, &O); + target->length = O.length(); + buffer = (char *)malloc(1+target->length); + if (buffer) { + O.copy(buffer, target->length); + buffer[target->length] = '\0'; + target->data = buffer; + } else + return -1; + return int(retval); +} + +/* ------------------------------------------------------------------ */ + +int +cre2_quote_meta (cre2_string_t * quoted, cre2_string_t * original) +{ + re2::StringPiece O(original->data, original->length); + std::string Q; + char * buffer; /* this exists to make GCC shut up about const */ + Q = RE2::QuoteMeta(O); + quoted->length = Q.length(); + buffer = (char *)malloc(1+quoted->length); + if (buffer) { + Q.copy(buffer, quoted->length); + buffer[quoted->length] = '\0'; + quoted->data = buffer; + return 0; + } else + return -1; +} +int +cre2_possible_match_range (cre2_regexp_t * rex, + cre2_string_t * min_, cre2_string_t * max_, int maxlen) 
+{ + std::string MIN, MAX; + cre2_string_t min, max; + char * buffer; /* this exists to make GCC shut up about const */ + bool retval; + retval = TO_RE2(rex)->PossibleMatchRange(&MIN, &MAX, maxlen); + if (retval) { + /* copy MIN */ + min.length = MIN.length(); + buffer = (char *)malloc(1+min.length); + if (buffer) { + MIN.copy(buffer, min.length); + buffer[min.length] = '\0'; + min.data = buffer; + } else + return -1; + /* copy MAX */ + max.length = MAX.length(); + buffer = (char *)malloc(1+max.length); + if (buffer) { + MAX.copy(buffer, max.length); + buffer[max.length] = '\0'; + max.data = buffer; + } else { + free((void *)min.data); + min.data = NULL; + return -1; + } + *min_ = min; + *max_ = max; + return 1; + } else + return 0; +} +int +cre2_check_rewrite_string (cre2_regexp_t * rex, cre2_string_t * rewrite, cre2_string_t * errmsg) +{ + re2::StringPiece R(rewrite->data, rewrite->length); + std::string E; + char * buffer; /* this exists to make GCC shut up about const */ + bool retval; + retval = TO_RE2(rex)->CheckRewriteString(R, &E); + if (retval) { + errmsg->data = NULL; + errmsg->length = 0; + return 1; + } else { + errmsg->length = E.length(); + buffer = (char *)malloc(1+errmsg->length); + if (buffer) { + E.copy(buffer, errmsg->length); + buffer[errmsg->length] = '\0'; + errmsg->data = buffer; + } else + return -1; + return 0; + } +} + +/* end of file */ diff --git a/outside/cre2/src/src/cre2.h b/outside/cre2/src/src/cre2.h new file mode 100644 index 000000000..be17ac1af --- /dev/null +++ b/outside/cre2/src/src/cre2.h @@ -0,0 +1,299 @@ +/* + Header file for CRE2, a C language wrapper for RE2: a regular + expressions library by Google. + + Copyright (c) 2012 Marco Maggi + Copyright (c) 2011 Keegan McAllister + All rights reserved. + + For the license notice see the COPYING file. +*/ + + +/** -------------------------------------------------------------------- + ** Headers. 
+ ** ----------------------------------------------------------------- */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef cre2_decl +# define cre2_decl extern +#endif + + +/** -------------------------------------------------------------------- + ** Version functions. + ** ----------------------------------------------------------------- */ + +cre2_decl const char * cre2_version_string (void); +cre2_decl int cre2_version_interface_current (void); +cre2_decl int cre2_version_interface_revision (void); +cre2_decl int cre2_version_interface_age (void); + + +/** -------------------------------------------------------------------- + ** Regular expressions configuration options. + ** ----------------------------------------------------------------- */ + +typedef void cre2_options_t; + +typedef enum cre2_encoding_t { + CRE2_UNKNOWN = 0, /* should never happen */ + CRE2_UTF8 = 1, + CRE2_Latin1 = 2 +} cre2_encoding_t; + +cre2_decl cre2_options_t *cre2_opt_new (void); +cre2_decl void cre2_opt_delete (cre2_options_t *opt); + +cre2_decl void cre2_opt_set_posix_syntax (cre2_options_t *opt, int flag); +cre2_decl void cre2_opt_set_longest_match (cre2_options_t *opt, int flag); +cre2_decl void cre2_opt_set_log_errors (cre2_options_t *opt, int flag); +cre2_decl void cre2_opt_set_literal (cre2_options_t *opt, int flag); +cre2_decl void cre2_opt_set_never_nl (cre2_options_t *opt, int flag); +cre2_decl void cre2_opt_set_case_sensitive (cre2_options_t *opt, int flag); +cre2_decl void cre2_opt_set_perl_classes (cre2_options_t *opt, int flag); +cre2_decl void cre2_opt_set_word_boundary (cre2_options_t *opt, int flag); +cre2_decl void cre2_opt_set_one_line (cre2_options_t *opt, int flag); +cre2_decl void cre2_opt_set_max_mem (cre2_options_t *opt, int m); +cre2_decl void cre2_opt_set_encoding (cre2_options_t *opt, cre2_encoding_t enc); + +cre2_decl int cre2_opt_posix_syntax (cre2_options_t *opt); +cre2_decl int cre2_opt_longest_match (cre2_options_t *opt); +cre2_decl int 
cre2_opt_log_errors (cre2_options_t *opt); +cre2_decl int cre2_opt_literal (cre2_options_t *opt); +cre2_decl int cre2_opt_never_nl (cre2_options_t *opt); +cre2_decl int cre2_opt_case_sensitive (cre2_options_t *opt); +cre2_decl int cre2_opt_perl_classes (cre2_options_t *opt); +cre2_decl int cre2_opt_word_boundary (cre2_options_t *opt); +cre2_decl int cre2_opt_one_line (cre2_options_t *opt); +cre2_decl int cre2_opt_max_mem (cre2_options_t *opt); +cre2_decl cre2_encoding_t cre2_opt_encoding (cre2_options_t *opt); + + +/** -------------------------------------------------------------------- + ** Precompiled regular expressions. + ** ----------------------------------------------------------------- */ + +typedef struct cre2_string_t { + const char * data; + int length; +} cre2_string_t; + +typedef void cre2_regexp_t; + +/* This definition must be kept in sync with the definition of "enum + ErrorCode" in the file "re2.h" of the original RE2 distribution. */ +typedef enum cre2_error_code_t { + CRE2_NO_ERROR = 0, + CRE2_ERROR_INTERNAL, /* unexpected error */ + /* parse errors */ + CRE2_ERROR_BAD_ESCAPE, /* bad escape sequence */ + CRE2_ERROR_BAD_CHAR_CLASS, /* bad character class */ + CRE2_ERROR_BAD_CHAR_RANGE, /* bad character class range */ + CRE2_ERROR_MISSING_BRACKET, /* missing closing ] */ + CRE2_ERROR_MISSING_PAREN, /* missing closing ) */ + CRE2_ERROR_TRAILING_BACKSLASH,/* trailing \ at end of regexp */ + CRE2_ERROR_REPEAT_ARGUMENT, /* repeat argument missing, e.g. 
"*" */ + CRE2_ERROR_REPEAT_SIZE, /* bad repetition argument */ + CRE2_ERROR_REPEA_TOP, /* bad repetition operator */ + CRE2_ERROR_BAD_PERL_OP, /* bad perl operator */ + CRE2_ERROR_BAD_UTF8, /* invalid UTF-8 in regexp */ + CRE2_ERROR_BAD_NAMED_CAPTURE, /* bad named capture group */ + CRE2_ERROR_PATTERN_TOO_LARGE, /* pattern too large (compile failed) */ +} cre2_error_code_t; + +/* construction and destruction */ +cre2_decl cre2_regexp_t * cre2_new (const char *pattern, int pattern_len, + const cre2_options_t *opt); +cre2_decl void cre2_delete (cre2_regexp_t *re); + +/* regular expression inspection */ +cre2_decl const char * cre2_pattern (const cre2_regexp_t *re); +cre2_decl int cre2_error_code (const cre2_regexp_t *re); +cre2_decl int cre2_num_capturing_groups (const cre2_regexp_t *re); +cre2_decl int cre2_program_size (const cre2_regexp_t *re); + +/* invalidated by further re use */ +cre2_decl const char *cre2_error_string(const cre2_regexp_t *re); +cre2_decl void cre2_error_arg(const cre2_regexp_t *re, cre2_string_t * arg); + + +/** -------------------------------------------------------------------- + ** Main matching functions. 
+ ** ----------------------------------------------------------------- */ + +typedef enum cre2_anchor_t { + CRE2_UNANCHORED = 1, + CRE2_ANCHOR_START = 2, + CRE2_ANCHOR_BOTH = 3 +} cre2_anchor_t; + +typedef struct cre2_range_t { + long start; /* inclusive start index for bytevector */ + long past; /* exclusive end index for bytevector */ +} cre2_range_t; + +cre2_decl int cre2_match (const cre2_regexp_t * re, + const char * text, int textlen, + int startpos, int endpos, cre2_anchor_t anchor, + cre2_string_t * match, int nmatch); + +cre2_decl int cre2_easy_match (const char * pattern, int pattern_len, + const char * text, int text_len, + cre2_string_t * match, int nmatch); + +cre2_decl void cre2_strings_to_ranges (const char * text, cre2_range_t * ranges, + cre2_string_t * strings, int nmatch); + + +/** -------------------------------------------------------------------- + ** Other matching functions. + ** ----------------------------------------------------------------- */ + +typedef int cre2_match_stringz_fun_t (const char * pattern, const cre2_string_t * text, + cre2_string_t * match, int nmatch); + +typedef int cre2_match_stringz2_fun_t (const char * pattern, cre2_string_t * text, + cre2_string_t * match, int nmatch); + +typedef int cre2_match_rex_fun_t (cre2_regexp_t * rex, const cre2_string_t * text, + cre2_string_t * match, int nmatch); + +typedef int cre2_match_rex2_fun_t (cre2_regexp_t * rex, cre2_string_t * text, + cre2_string_t * match, int nmatch); + +cre2_decl cre2_match_stringz_fun_t cre2_full_match; +cre2_decl cre2_match_stringz_fun_t cre2_partial_match; +cre2_decl cre2_match_stringz2_fun_t cre2_consume; +cre2_decl cre2_match_stringz2_fun_t cre2_find_and_consume; + +cre2_decl cre2_match_rex_fun_t cre2_full_match_re; +cre2_decl cre2_match_rex_fun_t cre2_partial_match_re; +cre2_decl cre2_match_rex2_fun_t cre2_consume_re; +cre2_decl cre2_match_rex2_fun_t cre2_find_and_consume_re; + + +/** -------------------------------------------------------------------- 
+ ** Problematic functions. + ** ----------------------------------------------------------------- */ + +/* Match the text in the buffer "text_and_target" against the rex in + "pattern" or "rex". Mutate "text_and_target" so that it references a + malloc'ed buffer holding the original text in which the first, and + only the first, match is substituted with the text in "rewrite". + Numeric backslash sequences (\1 to \9) in "rewrite" are substituted + with the portions of text matching the corresponding parenthetical + subexpressions. + + Return 0 if no match, 1 if successful match, -1 if error allocating + memory. */ +cre2_decl int cre2_replace (const char * pattern, + cre2_string_t * text_and_target, + cre2_string_t * rewrite); +cre2_decl int cre2_replace_re (cre2_regexp_t * rex, + cre2_string_t * text_and_target, + cre2_string_t * rewrite); + +/* Match the text in the buffer "text_and_target" against the rex in + "pattern" or "rex". Mutate "text_and_target" so that it references a + malloc'ed buffer holding the original text in which all the + matching substrings are substituted with the text in "rewrite". + Numeric backslash sequences (\1 to \9) in "rewrite" are substituted + with the portions of text matching the corresponding parenthetical + subexpressions. + + Return 0 if no match, positive integer representing the number of + substitutions performed if successful match, -1 if error allocating + memory. */ +cre2_decl int cre2_global_replace (const char * pattern, + cre2_string_t * text_and_target, + cre2_string_t * rewrite); +cre2_decl int cre2_global_replace_re (cre2_regexp_t * rex, + cre2_string_t * text_and_target, + cre2_string_t * rewrite); + +/* Match the text in the buffer "text" against the rex in "pattern" or + "rex".
Mutate "target" so that it references a malloc'ed buffer + holding a copy of the text in "rewrite"; numeric backslash sequences + (\1 to \9) in "rewrite" are substituted with the portions of text + matching the corresponding parenthetical subexpressions. + + Non-matching text in "text" is ignored. + + Return 0 if no match, 1 if successful match, -1 if error allocating + memory. */ +cre2_decl int cre2_extract (const char * pattern, + cre2_string_t * text, + cre2_string_t * rewrite, + cre2_string_t * target); + +cre2_decl int cre2_extract_re (cre2_regexp_t * rex, + cre2_string_t * text, + cre2_string_t * rewrite, + cre2_string_t * target); + +/* ------------------------------------------------------------------ */ + +/* Allocate a zero-terminated malloc'ed buffer and fill it with the text + from "original" having all the regexp meta characters quoted with + single backslashes. Return 0 if successful, return -1 if an error + allocating memory occurs. */ +cre2_decl int cre2_quote_meta (cre2_string_t * quoted, cre2_string_t * original); + +/* Compute a "minimum" string and a "maximum" string matching the given + regular expression. The min and max can in some cases be arbitrarily + precise, so the caller gets to specify "maxlen" being the maximum + desired length of string returned. + + Assuming the call returns successfully, any string S that is an + anchored match for this regexp satisfies: + + min <= S && S <= max. + + Note that this function will only consider the first copy of an + infinitely repeated element (i.e., any regexp element followed by a + '*' or '+' operator). Regexps with "{N}" constructions are not + affected, as those do not compile down to infinite repetitions. + + "min_" and "max_" are mutated to reference zero-terminated malloc'ed + buffers holding the min and max strings. + + Return 0 if failure, return 1 if successful, return -1 if an error + allocating memory occurs.
*/ +cre2_decl int cre2_possible_match_range (cre2_regexp_t * rex, + cre2_string_t * min_, cre2_string_t * max_, + int maxlen); + +/* Check that the given rewrite string is suitable for use with this + regular expression. It checks that: + + * The regular expression has enough parenthesized subexpressions to + satisfy all of the \N tokens in rewrite + + * The rewrite string doesn't have any syntax errors. E.g., '\' + followed by anything other than a digit or '\'. + + A true return value guarantees that the replace and extract functions + won't fail because of a bad rewrite string. + + In case of error: "errmsg" is mutated to reference a zero-terminated + malloc'ed string describing the problem. + + Return 1 if the string is correct, return 0 if the string is + incorrect, return -1 if an error occurred allocating memory. */ +cre2_decl int cre2_check_rewrite_string (cre2_regexp_t * rex, + cre2_string_t * rewrite, cre2_string_t * errmsg); + + +/** -------------------------------------------------------------------- + ** Done. + ** ----------------------------------------------------------------- */ + +#ifdef __cplusplus +} // extern "C" +#endif + +/* end of file */ diff --git a/outside/cre2/src/tests/test-consume-match.c b/outside/cre2/src/tests/test-consume-match.c new file mode 100644 index 000000000..6d21ce4b3 --- /dev/null +++ b/outside/cre2/src/tests/test-consume-match.c @@ -0,0 +1,335 @@ +/* + Part of: CRE2 + Contents: test for consume match function + Date: Tue Jan 3, 2012 + + Abstract + + Test file for consume match function. + + Copyright (C) 2012 Marco Maggi + + See the COPYING file. +*/ + +#include +#include +#include +#include + +#if 0 +# define PRINTF printf +# define FWRITE fwrite +#else +# define PRINTF(MSG, ...) /* empty string */ +# define FWRITE(BUF, ...) 
/* empty string */ +#endif + +int +main (int argc, const char *const argv[]) +{ + { /* success, no parentheses, full consumed buffer */ + const char * pattern = "ci.*ut"; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + result = cre2_consume(pattern, &input, NULL, 0); + if (! result) + goto error; + if (0 != strncmp("", input.data, input.length)) + goto error; + } + { /* success, no parentheses, partially consumed buffer */ + const char * pattern = "ci.*ut"; + const char * text = "ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + result = cre2_consume(pattern, &input, NULL, 0); + if (! result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, one match entry */ + const char * pattern = "(ciao) salut"; + const char * text = "ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_consume(pattern, &input, match, nmatch); + if (! result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + { /* success, two parenthetical subexpressions, two match entries */ + const char * pattern = "(ciao) (salut)"; + const char * text = "ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + result = cre2_consume(pattern, &input, match, nmatch); + if (! 
result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp("salut", match[1].data, match[1].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + PRINTF("match 1: "); + FWRITE(match[1].data, match[1].length, 1, stdout); + PRINTF("\n"); + } + { /* failure, no parentheses */ + const char * pattern = "ci.*ut"; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + result = cre2_consume(pattern, &input, NULL, 0); + if (result) + goto error; + } + { /* failure, one parenthetical subexpression */ + const char * pattern = "(ciao) salut"; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_consume(pattern, &input, match, nmatch); + if (result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, no match entries */ + const char * pattern = "(ciao) salut"; + const char * text = "ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + result = cre2_consume(pattern, &input, NULL, 0); + if (! 
result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + } + { /* failure, one parenthetical subexpression, two match entries */ + const char * pattern = "(ciao) salut"; + const char * text = "ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + memset(match, '\0', nmatch * sizeof(cre2_string_t)); + result = cre2_consume(pattern, &input, match, nmatch); + if (0 != result) + goto error; + } + { /* success, two parenthetical subexpressions, one match entry */ + const char * pattern = "(ciao) (salut)"; + const char * text = "ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_consume(pattern, &input, match, nmatch); + if (! result) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + { /* wrong regexp specification */ + const char * pattern = "cia(o salut"; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_consume(pattern, &input, match, nmatch); + if (0 != result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + + +/* ------------------------------------------------------------------ */ + + { /* success, no parentheses, full buffer consumed */ + const char * pattern = "ci.*ut"; + cre2_regexp_t * rex; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_consume_re(rex, &input, NULL, 0); + cre2_delete(rex); + if (! 
result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, no parentheses, partial buffer consumed */ + const char * pattern = "ci.*ut"; + cre2_regexp_t * rex; + const char * text = "ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_consume_re(rex, &input, NULL, 0); + cre2_delete(rex); + if (! result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, one match entry */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_consume_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (! result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + { /* success, two parenthetical subexpressions, two match entries */ + const char * pattern = "(ciao) (salut)"; + cre2_regexp_t * rex; + const char * text = "ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_consume_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (! 
result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp("salut", match[1].data, match[1].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + PRINTF("match 1: "); + FWRITE(match[1].data, match[1].length, 1, stdout); + PRINTF("\n"); + } + { /* failure, no parentheses */ + const char * pattern = "ci.*ut"; + cre2_regexp_t * rex; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_consume_re(rex, &input, NULL, 0); + cre2_delete(rex); + if (result) + goto error; + } + { /* failure, one parenthetical subexpression */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_consume_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, no match entries */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_consume_re(rex, &input, NULL, 0); + cre2_delete(rex); + if (! 
result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + } + { /* failure, one parenthetical subexpression, two match entries */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + memset(match, '\0', nmatch * sizeof(cre2_string_t)); + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_consume_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (0 != result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, two parenthetical subexpressions, one match entry */ + const char * pattern = "(ciao) (salut)"; + cre2_regexp_t * rex; + const char * text = "ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_consume_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (! result) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + + exit(EXIT_SUCCESS); + error: + exit(EXIT_FAILURE); +} + +/* end of file */ diff --git a/outside/cre2/src/tests/test-easy-matching.c b/outside/cre2/src/tests/test-easy-matching.c new file mode 100644 index 000000000..34fe1ef11 --- /dev/null +++ b/outside/cre2/src/tests/test-easy-matching.c @@ -0,0 +1,103 @@ +/* + Part of: CRE2 + Contents: test for easy matching + Date: Mon Jan 2, 2012 + + Abstract + + Test file for regular expressions matching. + + Copyright (C) 2012 Marco Maggi + + See the COPYING file. 
+*/ + +#include +#include +#include +#include + +#if 0 +# define PRINTF printf +# define FWRITE fwrite +#else +# define PRINTF(MSG, ...) /* empty string */ +# define FWRITE(BUF, ...) /* empty string */ +#endif + +int +main (int argc, const char *const argv[]) +{ + const char * pattern; + const char * text; + +/* ------------------------------------------------------------------ */ +/* single match */ + + pattern = "ciao"; + text = "ciao"; + { + cre2_string_t match; + int nmatch = 1; + cre2_easy_match(pattern, strlen(pattern), + text, strlen(text), + &match, nmatch); + PRINTF("match: "); + FWRITE(match.data, match.length, 1, stdout); + PRINTF("\n"); + if (0 != strncmp("ciao", match.data, match.length)) + goto error; + } + +/* ------------------------------------------------------------------ */ +/* wrong pattern */ + + pattern = "ci(ao"; + text = "ciao"; + { + cre2_string_t match; + int nmatch = 1; + int retval; + retval = cre2_easy_match(pattern, strlen(pattern), + text, strlen(text), + &match, nmatch); + if (2 != retval) + goto error; + } + +/* ------------------------------------------------------------------ */ +/* two groups */ + + pattern = "(ciao) (hello)"; + text = "ciao hello"; + { + int nmatch = 3; + cre2_string_t match[nmatch]; + cre2_easy_match(pattern, strlen(pattern), + text, strlen(text), + match, nmatch); + PRINTF("full match: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + PRINTF("first group: "); + FWRITE(match[1].data, match[1].length, 1, stdout); + PRINTF("\n"); + PRINTF("second group: "); + FWRITE(match[2].data, match[2].length, 1, stdout); + PRINTF("\n"); + if (0 != strncmp("ciao hello", match[0].data, match[0].length)) + goto error; + if (0 != strncmp("ciao", match[1].data, match[1].length)) + goto error; + if (0 != strncmp("hello", match[2].data, match[2].length)) + goto error; + } + +/* ------------------------------------------------------------------ */ + + exit(EXIT_SUCCESS); + error: + exit(EXIT_FAILURE); +} + 
+/* end of file */ diff --git a/outside/cre2/src/tests/test-find-and-consume-match.c b/outside/cre2/src/tests/test-find-and-consume-match.c new file mode 100644 index 000000000..525bceb53 --- /dev/null +++ b/outside/cre2/src/tests/test-find-and-consume-match.c @@ -0,0 +1,335 @@ +/* + Part of: CRE2 + Contents: test for find and consume match function + Date: Tue Jan 3, 2012 + + Abstract + + Test file for find and consume match function. + + Copyright (C) 2012 Marco Maggi + + See the COPYING file. +*/ + +#include +#include +#include +#include + +#if 0 +# define PRINTF printf +# define FWRITE fwrite +#else +# define PRINTF(MSG, ...) /* empty string */ +# define FWRITE(BUF, ...) /* empty string */ +#endif + +int +main (int argc, const char *const argv[]) +{ + { /* success, no parentheses, full consumed buffer */ + const char * pattern = "ci.*ut"; + const char * text = "prefix ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + result = cre2_find_and_consume(pattern, &input, NULL, 0); + if (! result) + goto error; + if (0 != strncmp("", input.data, input.length)) + goto error; + } + { /* success, no parentheses, partially consumed buffer */ + const char * pattern = "ci.*ut"; + const char * text = "prefix ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + result = cre2_find_and_consume(pattern, &input, NULL, 0); + if (! result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, one match entry */ + const char * pattern = "(ciao) salut"; + const char * text = "prefix ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_find_and_consume(pattern, &input, match, nmatch); + if (! 
result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + { /* success, two parenthetical subexpressions, two match entries */ + const char * pattern = "(ciao) (salut)"; + const char * text = "prefix ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + result = cre2_find_and_consume(pattern, &input, match, nmatch); + if (! result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp("salut", match[1].data, match[1].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + PRINTF("match 1: "); + FWRITE(match[1].data, match[1].length, 1, stdout); + PRINTF("\n"); + } + { /* failure, no parentheses */ + const char * pattern = "ci.*ut"; + const char * text = "prefix ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + result = cre2_find_and_consume(pattern, &input, NULL, 0); + if (result) + goto error; + } + { /* failure, one parenthetical subexpression */ + const char * pattern = "(ciao) salut"; + const char * text = "prefix ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_find_and_consume(pattern, &input, match, nmatch); + if (result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, no match entries */ + const char * pattern = "(ciao) salut"; + const char * text = "prefix ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int 
result; + result = cre2_find_and_consume(pattern, &input, NULL, 0); + if (! result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + } + { /* failure, one parenthetical subexpression, two match entries */ + const char * pattern = "(ciao) salut"; + const char * text = "prefix ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + memset(match, '\0', nmatch * sizeof(cre2_string_t)); + result = cre2_find_and_consume(pattern, &input, match, nmatch); + if (0 != result) + goto error; + } + { /* success, two parenthetical subexpressions, one match entry */ + const char * pattern = "(ciao) (salut)"; + const char * text = "prefix ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_find_and_consume(pattern, &input, match, nmatch); + if (! result) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + { /* wrong regexp specification */ + const char * pattern = "cia(o salut"; + const char * text = "prefix ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_find_and_consume(pattern, &input, match, nmatch); + if (0 != result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + + +/* ------------------------------------------------------------------ */ + + { /* success, no parentheses, full buffer consumed */ + const char * pattern = "ci.*ut"; + cre2_regexp_t * rex; + const char * text = "prefix ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + rex = 
cre2_new(pattern, strlen(pattern), NULL); + result = cre2_find_and_consume_re(rex, &input, NULL, 0); + cre2_delete(rex); + if (! result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, no parentheses, partial buffer consumed */ + const char * pattern = "ci.*ut"; + cre2_regexp_t * rex; + const char * text = "prefix ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_find_and_consume_re(rex, &input, NULL, 0); + cre2_delete(rex); + if (! result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, one match entry */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "prefix ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_find_and_consume_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (! result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + { /* success, two parenthetical subexpressions, two match entries */ + const char * pattern = "(ciao) (salut)"; + cre2_regexp_t * rex; + const char * text = "prefix ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_find_and_consume_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (! 
result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp("salut", match[1].data, match[1].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + PRINTF("match 1: "); + FWRITE(match[1].data, match[1].length, 1, stdout); + PRINTF("\n"); + } + { /* failure, no parentheses */ + const char * pattern = "ci.*ut"; + cre2_regexp_t * rex; + const char * text = "prefix ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_find_and_consume_re(rex, &input, NULL, 0); + cre2_delete(rex); + if (result) + goto error; + } + { /* failure, one parenthetical subexpression */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "prefix ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_find_and_consume_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, no match entries */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "prefix ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_find_and_consume_re(rex, &input, NULL, 0); + cre2_delete(rex); + if (! 
result) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + } + { /* failure, one parenthetical subexpression, two match entries */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "prefix ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + memset(match, '\0', nmatch * sizeof(cre2_string_t)); + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_find_and_consume_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (0 != result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, two parenthetical subexpressions, one match entry */ + const char * pattern = "(ciao) (salut)"; + cre2_regexp_t * rex; + const char * text = "prefix ciao salut hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_find_and_consume_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (! result) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp(" hello", input.data, input.length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + + exit(EXIT_SUCCESS); + error: + exit(EXIT_FAILURE); +} + +/* end of file */ diff --git a/outside/cre2/src/tests/test-full-match.c b/outside/cre2/src/tests/test-full-match.c new file mode 100644 index 000000000..5f3eae10b --- /dev/null +++ b/outside/cre2/src/tests/test-full-match.c @@ -0,0 +1,308 @@ +/* + Part of: CRE2 + Contents: test for full match function + Date: Tue Jan 3, 2012 + + Abstract + + Test file for full match function. + + Copyright (C) 2012 Marco Maggi + + See the COPYING file. 
+*/ + +#include +#include +#include +#include + +#if 0 +# define PRINTF printf +# define FWRITE fwrite +#else +# define PRINTF(MSG, ...) /* empty string */ +# define FWRITE(BUF, ...) /* empty string */ +#endif + +int +main (int argc, const char *const argv[]) +{ + { /* success, no parentheses */ + const char * pattern = "ci.*ut"; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + result = cre2_full_match(pattern, &input, NULL, 0); + if (! result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, one match entry */ + const char * pattern = "(ciao) salut"; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_full_match(pattern, &input, match, nmatch); + if (! result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + { /* success, two parenthetical subexpressions, two match entries */ + const char * pattern = "(ciao) (salut)"; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + result = cre2_full_match(pattern, &input, match, nmatch); + if (! 
result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp("salut", match[1].data, match[1].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + PRINTF("match 1: "); + FWRITE(match[1].data, match[1].length, 1, stdout); + PRINTF("\n"); + } + { /* failure, no parentheses */ + const char * pattern = "ci.*ut"; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + result = cre2_full_match(pattern, &input, NULL, 0); + if (result) + goto error; + } + { /* failure, one parenthetical subexpression */ + const char * pattern = "(ciao) salut"; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_full_match(pattern, &input, match, nmatch); + if (result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, no match entries */ + const char * pattern = "(ciao) salut"; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + result = cre2_full_match(pattern, &input, NULL, 0); + if (! 
result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* failure, one parenthetical subexpression, two match entries */ + const char * pattern = "(ciao) salut"; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + memset(match, '\0', nmatch * sizeof(cre2_string_t)); + result = cre2_full_match(pattern, &input, match, nmatch); + if (0 != result) + goto error; + } + { /* success, two parenthetical subexpressions, one match entry */ + const char * pattern = "(ciao) (salut)"; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_full_match(pattern, &input, match, nmatch); + if (! result) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + { /* wrong regexp specification */ + const char * pattern = "cia(o salut"; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_full_match(pattern, &input, match, nmatch); + if (0 != result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + + +/* ------------------------------------------------------------------ */ + + { /* success, no parentheses */ + const char * pattern = "ci.*ut"; + cre2_regexp_t * rex; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_full_match_re(rex, &input, NULL, 0); + cre2_delete(rex); + if (! 
result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, one match entry */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_full_match_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (! result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + { /* success, two parenthetical subexpressions, two match entries */ + const char * pattern = "(ciao) (salut)"; + cre2_regexp_t * rex; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_full_match_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (! 
result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp("salut", match[1].data, match[1].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + PRINTF("match 1: "); + FWRITE(match[1].data, match[1].length, 1, stdout); + PRINTF("\n"); + } + { /* failure, no parentheses */ + const char * pattern = "ci.*ut"; + cre2_regexp_t * rex; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_full_match_re(rex, &input, NULL, 0); + cre2_delete(rex); + if (result) + goto error; + } + { /* failure, one parenthetical subexpression */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_full_match_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, no match entries */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_full_match_re(rex, &input, NULL, 0); + cre2_delete(rex); + if (! 
result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* failure, one parenthetical subexpression, two match entries */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + memset(match, '\0', nmatch * sizeof(cre2_string_t)); + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_full_match_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (0 != result) + goto error; + } + { /* success, two parenthetical subexpressions, one match entry */ + const char * pattern = "(ciao) (salut)"; + cre2_regexp_t * rex; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_full_match_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (! result) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + + exit(EXIT_SUCCESS); + error: + exit(EXIT_FAILURE); +} + +/* end of file */ diff --git a/outside/cre2/src/tests/test-matching.c b/outside/cre2/src/tests/test-matching.c new file mode 100644 index 000000000..17a3787d1 --- /dev/null +++ b/outside/cre2/src/tests/test-matching.c @@ -0,0 +1,122 @@ +/* + Part of: CRE2 + Contents: test for matching + Date: Mon Jan 2, 2012 + + Abstract + + Test file for regular expressions matching. + + Copyright (C) 2012 Marco Maggi + + See the COPYING file. +*/ + +#include +#include +#include +#include + +#if 0 +# define PRINTF printf +# define FWRITE fwrite +#else +# define PRINTF(MSG, ...) /* empty string */ +# define FWRITE(BUF, ...) 
/* empty string */ +#endif + +int +main (int argc, const char *const argv[]) +{ + cre2_regexp_t * rex; + cre2_options_t * opt; + const char * pattern; + +/* ------------------------------------------------------------------ */ +/* single match */ + + pattern = "ciao"; + opt = cre2_opt_new(); + cre2_opt_set_posix_syntax(opt, 1); + rex = cre2_new(pattern, strlen(pattern), opt); + { + if (cre2_error_code(rex)) + goto error; + cre2_string_t match; + int nmatch = 1; + int e; + const char * text = "ciao"; + int text_len = strlen(text); + + e = cre2_match(rex, text, text_len, 0, text_len, CRE2_UNANCHORED, &match, nmatch); + if (1 != e) + goto error; + PRINTF("match: retval=%d, ", e); + FWRITE(match.data, match.length, 1, stdout); + PRINTF("\n"); + } + cre2_delete(rex); + cre2_opt_delete(opt); + +/* ------------------------------------------------------------------ */ +/* two groups */ + + pattern = "(ciao) (hello)"; + opt = cre2_opt_new(); + rex = cre2_new(pattern, strlen(pattern), opt); + { + if (cre2_error_code(rex)) + goto error; + int nmatch = 3; + cre2_string_t strings[nmatch]; + cre2_range_t ranges[nmatch]; + int e; + const char * text = "ciao hello"; + int text_len = strlen(text); + + e = cre2_match(rex, text, text_len, 0, text_len, CRE2_UNANCHORED, strings, nmatch); + if (1 != e) + goto error; + cre2_strings_to_ranges(text, ranges, strings, nmatch); + PRINTF("full match: "); + FWRITE(text+ranges[0].start, ranges[0].past-ranges[0].start, 1, stdout); + PRINTF("\n"); + PRINTF("first group: "); + FWRITE(text+ranges[1].start, ranges[1].past-ranges[1].start, 1, stdout); + PRINTF("\n"); + PRINTF("second group: "); + FWRITE(text+ranges[2].start, ranges[2].past-ranges[2].start, 1, stdout); + PRINTF("\n"); + } + cre2_delete(rex); + cre2_opt_delete(opt); + +/* ------------------------------------------------------------------ */ +/* test literal option */ + + pattern = "(ciao) (hello)"; + opt = cre2_opt_new(); + cre2_opt_set_literal(opt, 1); + rex = cre2_new(pattern, 
strlen(pattern), opt); + { + if (cre2_error_code(rex)) + goto error; + int nmatch = 0; + int e; + const char * text = "(ciao) (hello)"; + int text_len = strlen(text); + e = cre2_match(rex, text, text_len, 0, text_len, CRE2_UNANCHORED, NULL, nmatch); + if (0 == e) + goto error; + } + cre2_delete(rex); + cre2_opt_delete(opt); + +/* ------------------------------------------------------------------ */ + + exit(EXIT_SUCCESS); + error: + exit(EXIT_FAILURE); +} + +/* end of file */ diff --git a/outside/cre2/src/tests/test-misc.c b/outside/cre2/src/tests/test-misc.c new file mode 100644 index 000000000..d2351612b --- /dev/null +++ b/outside/cre2/src/tests/test-misc.c @@ -0,0 +1,119 @@ +/* + Part of: CRE2 + Contents: test for miscellaneous functions + Date: Wed Jan 4, 2012 + + Abstract + + Test file for miscellaneous functions. + + Copyright (C) 2012 Marco Maggi + + See the COPYING file. +*/ + +#include +#include +#include +#include + +#if 0 +# define PRINTF printf +# define FWRITE fwrite +#else +# define PRINTF(MSG, ...) /* empty string */ +# define FWRITE(BUF, ...) 
/* empty string */ +#endif + +int +main (int argc, const char *const argv[]) +{ + { /* quote meta characters */ + const char * pattern = "1.5-2.0?"; + cre2_string_t original = { + .data = pattern, + .length = strlen(pattern) + }; + cre2_string_t quoted; + int result; + result = cre2_quote_meta(&quoted, &original); + if (0 != result) + goto error; + if (0 != strncmp("1\\.5\\-2\\.0\\?", quoted.data, quoted.length)) + goto error; + free((void *)quoted.data); + } + + /* ------------------------------------------------------------------ */ + + { /* minimum and maximum matching strings */ + const char * pattern = "(?i)ABCdef"; + cre2_regexp_t * rex; + cre2_string_t min, max; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + { + result = cre2_possible_match_range(rex, &min, &max, 1024); + if (1 != result) + goto error; + if (0 != strncmp("ABCDEF", min.data, min.length)) + goto error; + if (0 != strncmp("abcdef", max.data, max.length)) + goto error; + } + cre2_delete(rex); + free((void *)min.data); + free((void *)max.data); + } + + /* ------------------------------------------------------------------ */ + + { /* successfully check rewrite string */ + const char * pattern = "a(b)c"; + const char * subst = "def"; + cre2_string_t rewrite = { + .data = subst, + .length = strlen(subst) + }; + cre2_regexp_t * rex; + cre2_string_t errmsg; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + { + result = cre2_check_rewrite_string(rex, &rewrite, &errmsg); + if (1 != result) + goto error; + } + cre2_delete(rex); + } + { /* failed check rewrite string */ + const char * pattern = "a(b)c"; + const char * subst = "\\1 \\2"; + cre2_string_t rewrite = { + .data = subst, + .length = strlen(subst) + }; + cre2_regexp_t * rex; + cre2_string_t errmsg; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + { + result = cre2_check_rewrite_string(rex, &rewrite, &errmsg); + if (0 != result) + goto error; + PRINTF("error message: "); + FWRITE(errmsg.data, 
errmsg.length, 1, stdout); + PRINTF("\n"); + } + cre2_delete(rex); + free((void *)errmsg.data); + } + +/* ------------------------------------------------------------------ */ + + exit(EXIT_SUCCESS); + error: + exit(EXIT_FAILURE); +} + +/* end of file */ diff --git a/outside/cre2/src/tests/test-options.c b/outside/cre2/src/tests/test-options.c new file mode 100644 index 000000000..c6a47d39d --- /dev/null +++ b/outside/cre2/src/tests/test-options.c @@ -0,0 +1,43 @@ +/* + Part of: CRE2 + Contents: test for options + Date: Mon Jan 2, 2012 + + Abstract + + Test file for options objects. + + Copyright (C) 2012 Marco Maggi + + See the COPYING file. +*/ + +#include +#include +#include + +int +main (int argc, const char *const argv[]) +{ + cre2_options_t * opt; + + opt = cre2_opt_new(); + { + cre2_opt_set_posix_syntax(opt, 1); + cre2_opt_set_longest_match(opt, 1); + cre2_opt_set_log_errors(opt, 1); + cre2_opt_set_literal(opt, 1); + cre2_opt_set_never_nl(opt, 1); + cre2_opt_set_case_sensitive(opt, 1); + cre2_opt_set_perl_classes(opt, 1); + cre2_opt_set_word_boundary(opt, 1); + cre2_opt_set_one_line(opt, 1); + cre2_opt_set_encoding(opt, CRE2_UTF8); + cre2_opt_set_encoding(opt, CRE2_Latin1); + cre2_opt_set_max_mem(opt, 4096); + } + cre2_opt_delete(opt); + exit(EXIT_SUCCESS); +} + +/* end of file */ diff --git a/outside/cre2/src/tests/test-partial-match.c b/outside/cre2/src/tests/test-partial-match.c new file mode 100644 index 000000000..5825789c2 --- /dev/null +++ b/outside/cre2/src/tests/test-partial-match.c @@ -0,0 +1,308 @@ +/* + Part of: CRE2 + Contents: test for partial match function + Date: Tue Jan 3, 2012 + + Abstract + + Test file for partial match function. + + Copyright (C) 2012 Marco Maggi + + See the COPYING file. +*/ + +#include +#include +#include +#include + +#if 0 +# define PRINTF printf +# define FWRITE fwrite +#else +# define PRINTF(MSG, ...) /* empty string */ +# define FWRITE(BUF, ...) 
/* empty string */ +#endif + +int +main (int argc, const char *const argv[]) +{ + { /* success, no parentheses */ + const char * pattern = "ci.*ut"; + const char * text = "pre ciao salut post"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + result = cre2_partial_match(pattern, &input, NULL, 0); + if (! result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, one match entry */ + const char * pattern = "(ciao) salut"; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_partial_match(pattern, &input, match, nmatch); + if (! result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + { /* success, two parenthetical subexpressions, two match entries */ + const char * pattern = "(ciao) (salut)"; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + result = cre2_partial_match(pattern, &input, match, nmatch); + if (! 
result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp("salut", match[1].data, match[1].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + PRINTF("match 1: "); + FWRITE(match[1].data, match[1].length, 1, stdout); + PRINTF("\n"); + } + { /* failure, no parentheses */ + const char * pattern = "ci.*ut"; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + result = cre2_partial_match(pattern, &input, NULL, 0); + if (result) + goto error; + } + { /* failure, one parenthetical subexpression */ + const char * pattern = "(ciao) salut"; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_partial_match(pattern, &input, match, nmatch); + if (result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, no match entries */ + const char * pattern = "(ciao) salut"; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + result = cre2_partial_match(pattern, &input, NULL, 0); + if (! 
result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* failure, one parenthetical subexpression, two match entries */ + const char * pattern = "(ciao) salut"; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + memset(match, '\0', nmatch * sizeof(cre2_string_t)); + result = cre2_partial_match(pattern, &input, match, nmatch); + if (0 != result) + goto error; + } + { /* success, two parenthetical subexpressions, one match entry */ + const char * pattern = "(ciao) (salut)"; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_partial_match(pattern, &input, match, nmatch); + if (! result) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + { /* wrong regexp specification */ + const char * pattern = "cia(o salut"; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + result = cre2_partial_match(pattern, &input, match, nmatch); + if (0 != result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + + +/* ------------------------------------------------------------------ */ + + { /* success, no parentheses */ + const char * pattern = "ci.*ut"; + cre2_regexp_t * rex; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_partial_match_re(rex, &input, NULL, 0); + cre2_delete(rex); + if (! 
result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, one match entry */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_partial_match_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (! result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + { /* success, two parenthetical subexpressions, two match entries */ + const char * pattern = "(ciao) (salut)"; + cre2_regexp_t * rex; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_partial_match_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (! 
result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp("salut", match[1].data, match[1].length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + PRINTF("match 1: "); + FWRITE(match[1].data, match[1].length, 1, stdout); + PRINTF("\n"); + } + { /* failure, no parentheses */ + const char * pattern = "ci.*ut"; + cre2_regexp_t * rex; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_partial_match_re(rex, &input, NULL, 0); + cre2_delete(rex); + if (result) + goto error; + } + { /* failure, one parenthetical subexpression */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "ciao hello"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_partial_match_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* success, one parenthetical subexpression, no match entries */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_partial_match_re(rex, &input, NULL, 0); + cre2_delete(rex); + if (! 
result) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + } + { /* failure, one parenthetical subexpression, two match entries */ + const char * pattern = "(ciao) salut"; + cre2_regexp_t * rex; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 2; + cre2_string_t match[nmatch]; + int result; + memset(match, '\0', nmatch * sizeof(cre2_string_t)); + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_partial_match_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (0 != result) + goto error; + } + { /* success, two parenthetical subexpressions, one match entry */ + const char * pattern = "(ciao) (salut)"; + cre2_regexp_t * rex; + const char * text = "ciao salut"; + cre2_string_t input = { .data = text, .length = strlen(text) }; + int nmatch = 1; + cre2_string_t match[nmatch]; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + result = cre2_partial_match_re(rex, &input, match, nmatch); + cre2_delete(rex); + if (! result) + goto error; + if (0 != strncmp("ciao", match[0].data, match[0].length)) + goto error; + if (0 != strncmp(text, input.data, input.length)) + goto error; + PRINTF("match 0: "); + FWRITE(match[0].data, match[0].length, 1, stdout); + PRINTF("\n"); + } + + exit(EXIT_SUCCESS); + error: + exit(EXIT_FAILURE); +} + +/* end of file */ diff --git a/outside/cre2/src/tests/test-replace.c b/outside/cre2/src/tests/test-replace.c new file mode 100644 index 000000000..e1098cf95 --- /dev/null +++ b/outside/cre2/src/tests/test-replace.c @@ -0,0 +1,257 @@ +/* + Part of: CRE2 + Contents: test for replace + Date: Wed Jan 4, 2012 + + Abstract + + Test file for replacing. + + Copyright (C) 2012 Marco Maggi + + See the COPYING file. +*/ + +#include +#include +#include +#include + +#if 0 +# define PRINTF printf +# define FWRITE fwrite +#else +# define PRINTF(MSG, ...) /* empty string */ +# define FWRITE(BUF, ...) 
/* empty string */ +#endif + +int +main (int argc, const char *const argv[]) +{ + { /* replace all the buffer using the full match */ + cre2_regexp_t * rex; + const char * pattern = "ciao hello salut"; + const char * text = "ciao hello salut"; + const char * replace = "pre \\0 post"; + cre2_string_t target = { + .data = text, + .length = strlen(text) + }; + cre2_string_t rewrite = { + .data = replace, + .length = strlen(replace) + }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + { + result = cre2_replace_re(rex, &target, &rewrite); + if (1 != result) + goto error; + if (0 != strncmp("pre ciao hello salut post", target.data, target.length)) + goto error; + if ('\0' != target.data[target.length]) + goto error; + PRINTF("rewritten to: "); + FWRITE(target.data, target.length, 1, stdout); + PRINTF("\n"); + } + cre2_delete(rex); + free((void *)target.data); + } + { /* replace substring with fixed string */ + cre2_regexp_t * rex; + const char * pattern = "hello"; + const char * text = "ciao hello salut"; + const char * replace = "ohayo"; + cre2_string_t target = { + .data = text, + .length = strlen(text) + }; + cre2_string_t rewrite = { + .data = replace, + .length = strlen(replace) + }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + { + result = cre2_replace_re(rex, &target, &rewrite); + if (1 != result) + goto error; + if (0 != strncmp("ciao ohayo salut", target.data, target.length)) + goto error; + if ('\0' != target.data[target.length]) + goto error; + PRINTF("rewritten to: "); + FWRITE(target.data, target.length, 1, stdout); + PRINTF("\n"); + } + cre2_delete(rex); + free((void *)target.data); + } + + /* ------------------------------------------------------------------ */ + + { /* global replace all the buffer using the full match */ + cre2_regexp_t * rex; + const char * pattern = "ciao hello salut"; + const char * text = "ciao hello salut"; + const char * replace = "pre \\0 post"; + cre2_string_t target = { + .data = text, + 
.length = strlen(text) + }; + cre2_string_t rewrite = { + .data = replace, + .length = strlen(replace) + }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + { + result = cre2_global_replace_re(rex, &target, &rewrite); + if (1 != result) + goto error; + if (0 != strncmp("pre ciao hello salut post", target.data, target.length)) + goto error; + if ('\0' != target.data[target.length]) + goto error; + PRINTF("rewritten to: "); + FWRITE(target.data, target.length, 1, stdout); + PRINTF("\n"); + } + cre2_delete(rex); + free((void *)target.data); + } + { /* global replace substring with fixed string */ + cre2_regexp_t * rex; + const char * pattern = "hello"; + const char * text = "ciao hello salut"; + const char * replace = "ohayo"; + cre2_string_t target = { + .data = text, + .length = strlen(text) + }; + cre2_string_t rewrite = { + .data = replace, + .length = strlen(replace) + }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + { + result = cre2_global_replace_re(rex, &target, &rewrite); + if (1 != result) + goto error; + if (0 != strncmp("ciao ohayo salut", target.data, target.length)) + goto error; + if ('\0' != target.data[target.length]) + goto error; + PRINTF("rewritten to: "); + FWRITE(target.data, target.length, 1, stdout); + PRINTF("\n"); + } + cre2_delete(rex); + free((void *)target.data); + } + { /* global replace multiple substrings with parametrised string */ + cre2_regexp_t * rex; + const char * pattern = "[a-z]+\\(([0-9]+)\\)"; + const char * text = "ciao(1) hello(2) salut(3)"; + const char * replace = "ohayo(\\1)"; + cre2_string_t target = { + .data = text, + .length = strlen(text) + }; + cre2_string_t rewrite = { + .data = replace, + .length = strlen(replace) + }; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + { + result = cre2_global_replace_re(rex, &target, &rewrite); + if (3 != result) /* 3 substitutions */ + goto error; + if (0 != strncmp("ohayo(1) ohayo(2) ohayo(3)", target.data, target.length)) + 
goto error; + if ('\0' != target.data[target.length]) + goto error; + PRINTF("result %d, rewritten to: ", result); + FWRITE(target.data, target.length, 1, stdout); + PRINTF("\n"); + } + cre2_delete(rex); + free((void *)target.data); + } + +/* ------------------------------------------------------------------ */ + + { /* extract all the buffer using the full match */ + cre2_regexp_t * rex; + const char * pattern = "ciao hello salut"; + const char * text = "ciao hello salut"; + const char * replace = "pre \\0 post"; + cre2_string_t input = { + .data = text, + .length = strlen(text) + }; + cre2_string_t rewrite = { + .data = replace, + .length = strlen(replace) + }; + cre2_string_t target; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + { + result = cre2_extract_re(rex, &input, &rewrite, &target); + if (1 != result) + goto error; + if (0 != strncmp("pre ciao hello salut post", target.data, target.length)) + goto error; + if ('\0' != target.data[target.length]) + goto error; + PRINTF("rewritten to: "); + FWRITE(target.data, target.length, 1, stdout); + PRINTF("\n"); + } + cre2_delete(rex); + free((void *)target.data); + } + { /* extract substring with fixed string */ + cre2_regexp_t * rex; + const char * pattern = "hello([0-9]+)"; + const char * text = "ciao hello123 salut"; + const char * replace = "ohayo\\1"; + cre2_string_t input = { + .data = text, + .length = strlen(text) + }; + cre2_string_t rewrite = { + .data = replace, + .length = strlen(replace) + }; + cre2_string_t target; + int result; + rex = cre2_new(pattern, strlen(pattern), NULL); + { + result = cre2_extract_re(rex, &input, &rewrite, &target); + if (1 != result) + goto error; + if (0 != strncmp("ohayo123", target.data, target.length)) + goto error; + if ('\0' != target.data[target.length]) + goto error; + PRINTF("rewritten to: "); + FWRITE(target.data, target.length, 1, stdout); + PRINTF("\n"); + } + cre2_delete(rex); + free((void *)target.data); + } + + /* 
------------------------------------------------------------------ */ + + exit(EXIT_SUCCESS); + error: + exit(EXIT_FAILURE); +} + +/* end of file */ diff --git a/outside/cre2/src/tests/test-rex-alloc.c b/outside/cre2/src/tests/test-rex-alloc.c new file mode 100644 index 000000000..cde19f376 --- /dev/null +++ b/outside/cre2/src/tests/test-rex-alloc.c @@ -0,0 +1,113 @@ +/* + Part of: CRE2 + Contents: test for rex allocation + Date: Mon Jan 2, 2012 + + Abstract + + Test file for regular expressions allocation. + + Copyright (C) 2012 Marco Maggi + + See the COPYING file. +*/ + +#include +#include +#include +#include + +#if 0 +# define PRINTF printf +# define FWRITE fwrite +#else +# define PRINTF(MSG, ...) /* empty string */ +# define FWRITE(BUF, ...) /* empty string */ +#endif + +int +main (int argc, const char *const argv[]) +{ + cre2_regexp_t * rex; + cre2_options_t * opt; + opt = cre2_opt_new(); + cre2_opt_set_posix_syntax(opt, 1); + rex = cre2_new("ciao", 4, opt); + { + cre2_string_t S; + PRINTF("pattern: %s\n", cre2_pattern(rex)); + PRINTF("error code: %d\n", cre2_error_code(rex)); + PRINTF("error string: \"%s\"\n", cre2_error_string(rex)); + PRINTF("number of capturing groups: %d\n", cre2_num_capturing_groups(rex)); + PRINTF("program size: %d\n", cre2_program_size(rex)); + cre2_error_arg(rex, &S); + PRINTF("error arg: len=%d, data=\"%s\"\n", S.length, S.data); + if (cre2_error_code(rex)) + goto error; + if (cre2_num_capturing_groups(rex)) + goto error; + if (cre2_error_code(rex)) + goto error; + if (0 != strlen(cre2_error_string(rex))) + goto error; + if (0 != S.length) + goto error; + } + cre2_delete(rex); + cre2_opt_delete(opt); + +/* ------------------------------------------------------------------ */ +/* no options object */ + + rex = cre2_new("ciao", 4, NULL); + { + if (cre2_error_code(rex)) + goto error; + } + cre2_delete(rex); + +/* ------------------------------------------------------------------ */ + + opt = cre2_opt_new(); + 
cre2_opt_set_posix_syntax(opt, 1); + rex = cre2_new("ci(ao)", 6, opt); + { + PRINTF("error code: %d\n", cre2_error_code(rex)); + PRINTF("number of capturing groups: %d\n", cre2_num_capturing_groups(rex)); + PRINTF("program size: %d\n", cre2_program_size(rex)); + if (cre2_error_code(rex)) + goto error; + if (1 != cre2_num_capturing_groups(rex)) + goto error; + } + cre2_delete(rex); + cre2_opt_delete(opt); + +/* ------------------------------------------------------------------ */ + + opt = cre2_opt_new(); + cre2_opt_set_log_errors(opt, 0); + rex = cre2_new("ci(ao", 5, opt); + { + int code = cre2_error_code(rex); + const char * msg = cre2_error_string(rex); + cre2_string_t S; + cre2_error_arg(rex, &S); + if (CRE2_ERROR_MISSING_PAREN != code) + goto error; + if (! msg) + goto error; + PRINTF("pattern: %s\n", cre2_pattern(rex)); + PRINTF("error: code=%d, msg=\"%s\"\n", code, msg); + PRINTF("error arg: len=%d, data=\"%s\"\n", S.length, S.data); + } + cre2_delete(rex); + cre2_opt_delete(opt); + + exit(EXIT_SUCCESS); + + error: + exit(EXIT_FAILURE); +} + +/* end of file */ diff --git a/outside/cre2/src/tests/test-version.c b/outside/cre2/src/tests/test-version.c new file mode 100644 index 000000000..aa1e5a67f --- /dev/null +++ b/outside/cre2/src/tests/test-version.c @@ -0,0 +1,30 @@ +/* + Part of: CRE2 + Contents: test for version functions + Date: Mon Jan 2, 2012 + + Abstract + + Test file for version functions. + + Copyright (C) 2012 Marco Maggi + + See the COPYING file. 
+*/ + +#include +#include +#include + +int +main (int argc, const char *const argv[]) +{ + printf("version number string: %s\n", cre2_version_string()); + printf("libtool version number: %d:%d:%d\n", + cre2_version_interface_current(), + cre2_version_interface_revision(), + cre2_version_interface_age()); + exit(EXIT_SUCCESS); +} + +/* end of file */ From 6ed3ff3bdafb9d61fd24585347b5dddffd7f2796 Mon Sep 17 00:00:00 2001 From: Gavin Whelan Date: Wed, 9 Apr 2014 14:16:03 -0700 Subject: [PATCH 02/13] Fixing scripts --- Makefile | 2 +- outside/cre2/src/build.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index fa62da733..2bd68d784 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ ifeq ($(OS),osx) OSLIBS=-framework CoreServices -framework CoreFoundation endif ifeq ($(OS),linux) - OSLIBS=-lcrypto -lpthread -lrt -lcurses + OSLIBS=-lcrypto -lpthread -lrt -lcurses -lssl DEFINES=-D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE endif ifeq ($(OS),bsd) diff --git a/outside/cre2/src/build.sh b/outside/cre2/src/build.sh index b67d94b53..fdf2e0110 100644 --- a/outside/cre2/src/build.sh +++ b/outside/cre2/src/build.sh @@ -1,6 +1,6 @@ if [ ! 
-d "=build" ]; then mkdir "=build" -libtoolize +(libtoolize || glibtoolize) sh autogen.sh cd "=build" ../configure --enable-maintainer-mode LDFLAGS=-pthread From 1fe7f81e762f9f46e9ce93b9c08ef9880a153153 Mon Sep 17 00:00:00 2001 From: Gavin Whelan Date: Wed, 9 Apr 2014 14:22:08 -0700 Subject: [PATCH 03/13] Fixed warning --- gen164/5/rexp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gen164/5/rexp.c b/gen164/5/rexp.c index d2363e699..4dfd0e556 100644 --- a/gen164/5/rexp.c +++ b/gen164/5/rexp.c @@ -17,7 +17,7 @@ u2k(lub); int lub_l = u2_ckb_lent(lub); - if (lub_l != strlen(lub_y)) { + if (lub_l != strlen((char *)lub_y)) { free(lub_y); free(rad_y); return u2_nul; From 9dd174adee1fb1dba805fb7e12b5a3e53283fd28 Mon Sep 17 00:00:00 2001 From: Gavin Whelan Date: Wed, 9 Apr 2014 16:04:01 -0700 Subject: [PATCH 04/13] Testing compatibility change --- outside/cre2/src/src/cre2.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/outside/cre2/src/src/cre2.cpp b/outside/cre2/src/src/cre2.cpp index e44039b09..647b948b7 100644 --- a/outside/cre2/src/src/cre2.cpp +++ b/outside/cre2/src/src/cre2.cpp @@ -201,10 +201,11 @@ cre2_match (const cre2_regexp_t *re , const char *text, cre2_string_t *match, int nmatch) { re2::StringPiece text_re2(text, textlen); - re2::StringPiece match_re2[nmatch]; + re2::StringPiece *match_re2; RE2::Anchor anchor_re2 = RE2::UNANCHORED; bool retval; // 0 for no match // 1 for successful matching + match_re2 = (re2::StringPiece *)malloc(sizeof(re2::StringPiece) * nmatch); switch (anchor) { case CRE2_ANCHOR_START: anchor_re2 = RE2::ANCHOR_START; @@ -222,6 +223,7 @@ cre2_match (const cre2_regexp_t *re , const char *text, match[i].length = match_re2[i].length(); } } + free(match_re2); return (retval)? 
1 : 0; } int From 81852e8ab5c9353f35429225fb7e82fc6f92e7f0 Mon Sep 17 00:00:00 2001 From: Gavin Whelan Date: Wed, 9 Apr 2014 16:12:51 -0700 Subject: [PATCH 05/13] More compatibility --- outside/cre2/src/src/cre2.cpp | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/outside/cre2/src/src/cre2.cpp b/outside/cre2/src/src/cre2.cpp index 647b948b7..3bb4e1aec 100644 --- a/outside/cre2/src/src/cre2.cpp +++ b/outside/cre2/src/src/cre2.cpp @@ -274,10 +274,13 @@ cre2_strings_to_ranges (const char * text, cre2_range_t * ranges, cre2_string_t cre2_string_t * match, int nmatch) \ { \ re2::StringPiece input(text->data, text->length); \ - re2::StringPiece strv[nmatch]; \ - RE2::Arg argv[nmatch]; \ - RE2::Arg * args[nmatch]; \ - bool retval; \ + re2::StringPiece *strv; \ + RE2::Arg *argv; \ + RE2::Arg * *args; \ + bool retval; \ + strv = (re2::StringPiece *) (malloc(sizeof(re2::StringPiece) *nmatch)); \ + argv = (RE2::Arg *) (malloc(sizeof(RE2::Arg) *nmatch)); \ + args = (RE2::Arg **) (malloc(sizeof(RE2::Arg *) *nmatch)); \ for (int i=0; idata, text->length); \ - re2::StringPiece strv[nmatch]; \ - RE2::Arg argv[nmatch]; \ - RE2::Arg * args[nmatch]; \ - bool retval; \ + re2::StringPiece *strv; \ + RE2::Arg *argv; \ + RE2::Arg * *args; \ + bool retval; \ + strv = (re2::StringPiece *) (malloc(sizeof(re2::StringPiece) *nmatch)); \ + argv = (RE2::Arg *) (malloc(sizeof(RE2::Arg) *nmatch)); \ + args = (RE2::Arg **) (malloc(sizeof(RE2::Arg *) *nmatch)); \ for (int i=0; i Date: Wed, 9 Apr 2014 16:15:36 -0700 Subject: [PATCH 06/13] Even more compatibility --- outside/cre2/src/src/cre2.cpp | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/outside/cre2/src/src/cre2.cpp b/outside/cre2/src/src/cre2.cpp index 3bb4e1aec..17819f6ad 100644 --- a/outside/cre2/src/src/cre2.cpp +++ b/outside/cre2/src/src/cre2.cpp @@ -349,10 +349,13 @@ DEFINE_MATCH_ZSTRING_FUN2(cre2_find_and_consume,FindAndConsumeN) cre2_string_t 
* match, int nmatch) \ { \ re2::StringPiece input(text->data, text->length); \ - re2::StringPiece strv[nmatch]; \ - RE2::Arg argv[nmatch]; \ - RE2::Arg * args[nmatch]; \ - bool retval; \ + re2::StringPiece *strv; \ + RE2::Arg *argv; \ + RE2::Arg * *args; \ + bool retval; \ + strv = (re2::StringPiece *) (malloc(sizeof(re2::StringPiece) *nmatch)); \ + argv = (RE2::Arg *) (malloc(sizeof(RE2::Arg) *nmatch)); \ + args = (RE2::Arg **) (malloc(sizeof(RE2::Arg *) *nmatch)); \ for (int i=0; idata, text->length); \ - re2::StringPiece strv[nmatch]; \ - RE2::Arg argv[nmatch]; \ - RE2::Arg * args[nmatch]; \ - bool retval; \ + re2::StringPiece *strv; \ + RE2::Arg *argv; \ + RE2::Arg * *args; \ + bool retval; \ + strv = (re2::StringPiece *) (malloc(sizeof(re2::StringPiece) *nmatch)); \ + argv = (RE2::Arg *) (malloc(sizeof(RE2::Arg) *nmatch)); \ + args = (RE2::Arg **) (malloc(sizeof(RE2::Arg *) *nmatch)); \ for (int i=0; i Date: Wed, 9 Apr 2014 16:35:31 -0700 Subject: [PATCH 07/13] Fixing makefile stuff --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 2bd68d784..d68b97a35 100644 --- a/Makefile +++ b/Makefile @@ -43,14 +43,14 @@ ifeq ($(OS),osx) OSLIBS=-framework CoreServices -framework CoreFoundation endif ifeq ($(OS),linux) - OSLIBS=-lcrypto -lpthread -lrt -lcurses -lssl + OSLIBS=-lpthread -lrt -lcurses DEFINES=-D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE endif ifeq ($(OS),bsd) - OSLIBS=-lcrypto -lpthread -lncurses -lkvm + OSLIBS=-lpthread -lncurses -lkvm endif -LIBS=-lgmp -lncurses -lsigsegv $(OSLIBS) -lre2 +LIBS=-lssl -lcrypto -lgmp -lncurses -lsigsegv $(OSLIBS) -lre2 INCLUDE=include GENERATED=generated From 371f67b61362a87363e5cada9f686d0ce6d24dc4 Mon Sep 17 00:00:00 2001 From: Gavin Whelan Date: Wed, 9 Apr 2014 17:14:46 -0700 Subject: [PATCH 08/13] Removing debugging/testing stuff --- gen164/5/repg.c | 4 +--- gen164/5/rexp.c | 5 +---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/gen164/5/repg.c 
b/gen164/5/repg.c index 6982a533b..b31333297 100644 --- a/gen164/5/repg.c +++ b/gen164/5/repg.c @@ -53,8 +53,6 @@ rec++; } - fprintf(stderr, "\r\nrepg: \r\n%s : %s\r\n", lub_y, rad_y); - cre2_regexp_t * rex; cre2_options_t * opt; @@ -150,6 +148,6 @@ */ u2_ho_jet j2_mbj(Pt5, repg)[] = { - { ".2", c3__lite, j2_mb(Pt5, repg), u2_jet_live | u2_jet_test, u2_none, u2_none }, + { ".2", c3__lite, j2_mb(Pt5, repg), Tier5, u2_none, u2_none }, { } }; diff --git a/gen164/5/rexp.c b/gen164/5/rexp.c index 4dfd0e556..1033ece18 100644 --- a/gen164/5/rexp.c +++ b/gen164/5/rexp.c @@ -61,8 +61,6 @@ rec++; } - fprintf(stderr, "\r\n%s : %s\r\n", lub_y, rad_y); - cre2_regexp_t * rex; cre2_options_t * opt; @@ -98,7 +96,6 @@ char * buf = malloc(matches[i].length + 1); memcpy(buf, matches[i].data, matches[i].length); buf[matches[i].length] = 0; - fprintf(stderr, "%d: %s\r\n", i, buf); map = u2_ckd_by_put(map, i, u2_ci_tape(buf)); free(buf); } @@ -149,6 +146,6 @@ */ u2_ho_jet j2_mbj(Pt5, rexp)[] = { - { ".2", c3__lite, j2_mb(Pt5, rexp), u2_jet_live | u2_jet_test, u2_none, u2_none }, + { ".2", c3__lite, j2_mb(Pt5, rexp), Tier5, u2_none, u2_none }, { } }; From 8cfb9cf1ac71b2d831bb982a634897567ea586ee Mon Sep 17 00:00:00 2001 From: Steven Dee Date: Thu, 10 Apr 2014 04:22:55 -0700 Subject: [PATCH 09/13] Ignore the strange earth-husk around cre2 --- Makefile | 15 ++++++++++----- outside/cre2/src/src/cre2.cpp | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index d68b97a35..7146d817f 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,8 @@ LIB=$(PWD)/lib RM=rm -f CC=gcc +CXX=g++ +CXXFLAGS=$(CFLAGS) CLD=g++ -O2 -g -L/usr/local/lib -L/opt/local/lib YACC=bison -v -b$(GENERATED)/y LEX=lex @@ -527,6 +529,9 @@ BASE_OFILES=\ $(F_OFILES) \ $(J164_OFILES) +CRE2_OFILES=\ + outside/cre2/src/src/cre2.o + OUT_OFILES=\ outside/jhttp/http_parser.o @@ -547,20 +552,20 @@ V_OFILES=\ v/walk.o VERE_OFILES=\ - $(V_OFILES) \ $(BASE_OFILES) \ - $(OUT_OFILES) + 
$(CRE2_OFILES) \ + $(OUT_OFILES) \ + $(V_OFILES) LIBUV=outside/libuv/libuv.a -LIBCRE=outside/cre2/lib/libcre2.a all: $(BIN)/vere $(LIBUV): $(MAKE) -C outside/libuv libuv.a -$(LIBCRE): - cd outside/cre2/src && sh build.sh +$(CRE2_OFILES): outside/cre2/src/src/cre2.cpp outside/cre2/src/src/cre2.h + $(CXX) $(CXXFLAGS) -c $< -o $@ $(V_OFILES) f/loom.o f/trac.o: include/v/vere.h diff --git a/outside/cre2/src/src/cre2.cpp b/outside/cre2/src/src/cre2.cpp index 17819f6ad..bb7f4b3c3 100644 --- a/outside/cre2/src/src/cre2.cpp +++ b/outside/cre2/src/src/cre2.cpp @@ -26,22 +26,38 @@ const char * cre2_version_string (void) { +#ifdef HAVE_CONFIG_H return cre2_VERSION_INTERFACE_STRING; +#else + return "0.0"; +#endif } int cre2_version_interface_current (void) { +#ifdef HAVE_CONFIG_H return cre2_VERSION_INTERFACE_CURRENT; +#else + return 0; +#endif } int cre2_version_interface_revision (void) { +#ifdef HAVE_CONFIG_H return cre2_VERSION_INTERFACE_REVISION; +#else + return 0; +#endif } int cre2_version_interface_age (void) { +#ifdef HAVE_CONFIG_H return cre2_VERSION_INTERFACE_AGE; +#else + return 0; +#endif } From e4708058a0595bbff5e590b863c57a70fe62ee1e Mon Sep 17 00:00:00 2001 From: Steven Dee Date: Thu, 10 Apr 2014 04:24:19 -0700 Subject: [PATCH 10/13] Include errno properly on OpenBSD --- v/sist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v/sist.c b/v/sist.c index 640b32268..e4e4848e8 100644 --- a/v/sist.c +++ b/v/sist.c @@ -2,11 +2,11 @@ ** ** This file is in the public domain. 
*/ +#include #include #include #include #include -#include #include #include From ec13f53941805bfb2ab56fbf9216b75875129db0 Mon Sep 17 00:00:00 2001 From: Steve Dee Date: Thu, 10 Apr 2014 11:31:05 -0700 Subject: [PATCH 11/13] Just grab cre2.h from the dir it starts in --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7146d817f..4c88878b9 100644 --- a/Makefile +++ b/Makefile @@ -63,7 +63,7 @@ CFLAGS= -O2 -g \ -I/opt/local/include \ -I$(INCLUDE) \ -Ioutside/libuv/include \ - -Ioutside/cre2/include \ + -Ioutside/cre2/src/src \ -I $(GENERATED) \ $(DEFINES) \ $(MDEFINES) From 27dd121d144d4d021fac869603e17feeebbe7505 Mon Sep 17 00:00:00 2001 From: Steve Dee Date: Thu, 10 Apr 2014 11:36:47 -0700 Subject: [PATCH 12/13] Add 'outside/re2/' from commit '539b44fc4c5a49c3453b80e3af85d297f4cab4bf' git-subtree-dir: outside/re2 git-subtree-mainline: f94738bfd171ae447133e0964843addbb497894f git-subtree-split: 539b44fc4c5a49c3453b80e3af85d297f4cab4bf --- outside/re2/.hgignore | 7 + outside/re2/AUTHORS | 13 + outside/re2/CONTRIBUTORS | 40 + outside/re2/LICENSE | 27 + outside/re2/Makefile | 291 + outside/re2/README | 19 + outside/re2/benchlog/benchlog.c2 | 2211 +++++++ outside/re2/benchlog/benchlog.mini | 582 ++ outside/re2/benchlog/benchlog.r70 | 1475 +++++ outside/re2/benchlog/benchlog.wreck | 1058 ++++ outside/re2/benchlog/mktable | 155 + outside/re2/doc/README.xkcd | 1 + outside/re2/doc/mksyntaxgo | 41 + outside/re2/doc/mksyntaxhtml | 42 + outside/re2/doc/mksyntaxwiki | 36 + outside/re2/doc/syntax.html | 388 ++ outside/re2/doc/syntax.txt | 395 ++ outside/re2/doc/xkcd.png | Bin 0 -> 26496 bytes outside/re2/lib/codereview/codereview.cfg | 1 + outside/re2/lib/codereview/codereview.py | 3591 ++++++++++++ outside/re2/libre2.symbols | 16 + outside/re2/libre2.symbols.darwin | 13 + outside/re2/re2/Makefile | 1 + outside/re2/re2/bitstate.cc | 378 ++ outside/re2/re2/compile.cc | 1140 ++++ outside/re2/re2/dfa.cc | 2115 +++++++ 
outside/re2/re2/filtered_re2.cc | 102 + outside/re2/re2/filtered_re2.h | 101 + outside/re2/re2/make_perl_groups.pl | 110 + outside/re2/re2/make_unicode_casefold.py | 146 + outside/re2/re2/make_unicode_groups.py | 111 + outside/re2/re2/mimics_pcre.cc | 185 + outside/re2/re2/nfa.cc | 709 +++ outside/re2/re2/onepass.cc | 614 ++ outside/re2/re2/parse.cc | 2216 +++++++ outside/re2/re2/perl_groups.cc | 119 + outside/re2/re2/prefilter.cc | 715 +++ outside/re2/re2/prefilter.h | 105 + outside/re2/re2/prefilter_tree.cc | 397 ++ outside/re2/re2/prefilter_tree.h | 131 + outside/re2/re2/prog.cc | 343 ++ outside/re2/re2/prog.h | 376 ++ outside/re2/re2/re2.cc | 1218 ++++ outside/re2/re2/re2.h | 877 +++ outside/re2/re2/regexp.cc | 931 +++ outside/re2/re2/regexp.h | 633 ++ outside/re2/re2/set.cc | 113 + outside/re2/re2/set.h | 55 + outside/re2/re2/simplify.cc | 393 ++ outside/re2/re2/stringpiece.h | 182 + outside/re2/re2/testing/backtrack.cc | 254 + outside/re2/re2/testing/charclass_test.cc | 223 + outside/re2/re2/testing/compile_test.cc | 171 + outside/re2/re2/testing/dfa_test.cc | 344 ++ outside/re2/re2/testing/dump.cc | 164 + outside/re2/re2/testing/exhaustive1_test.cc | 42 + outside/re2/re2/testing/exhaustive2_test.cc | 70 + outside/re2/re2/testing/exhaustive3_test.cc | 94 + outside/re2/re2/testing/exhaustive_test.cc | 38 + outside/re2/re2/testing/exhaustive_tester.cc | 188 + outside/re2/re2/testing/exhaustive_tester.h | 85 + outside/re2/re2/testing/filtered_re2_test.cc | 275 + outside/re2/re2/testing/mimics_pcre_test.cc | 76 + outside/re2/re2/testing/null_walker.cc | 44 + outside/re2/re2/testing/parse_test.cc | 433 ++ .../re2/re2/testing/possible_match_test.cc | 240 + outside/re2/re2/testing/random_test.cc | 95 + outside/re2/re2/testing/re2_arg_test.cc | 133 + outside/re2/re2/testing/re2_test.cc | 1404 +++++ outside/re2/re2/testing/regexp_benchmark.cc | 1461 +++++ outside/re2/re2/testing/regexp_generator.cc | 264 + outside/re2/re2/testing/regexp_generator.h | 70 + 
outside/re2/re2/testing/regexp_test.cc | 81 + .../re2/re2/testing/required_prefix_test.cc | 67 + outside/re2/re2/testing/search_test.cc | 325 ++ outside/re2/re2/testing/set_test.cc | 114 + outside/re2/re2/testing/simplify_test.cc | 167 + outside/re2/re2/testing/string_generator.cc | 113 + outside/re2/re2/testing/string_generator.h | 58 + .../re2/re2/testing/string_generator_test.cc | 109 + outside/re2/re2/testing/tester.cc | 640 +++ outside/re2/re2/testing/tester.h | 121 + outside/re2/re2/testing/unicode_test.py | 207 + outside/re2/re2/tostring.cc | 341 ++ outside/re2/re2/unicode.py | 297 + outside/re2/re2/unicode_casefold.cc | 480 ++ outside/re2/re2/unicode_casefold.h | 75 + outside/re2/re2/unicode_groups.cc | 5078 +++++++++++++++++ outside/re2/re2/unicode_groups.h | 64 + outside/re2/re2/variadic_function.h | 344 ++ outside/re2/re2/walker-inl.h | 244 + outside/re2/runtests | 21 + outside/re2/testinstall.cc | 26 + outside/re2/ucs2.diff | 567 ++ outside/re2/util/arena.cc | 168 + outside/re2/util/arena.h | 103 + outside/re2/util/atomicops.h | 137 + outside/re2/util/benchmark.cc | 153 + outside/re2/util/benchmark.h | 41 + outside/re2/util/flags.h | 27 + outside/re2/util/hash.cc | 231 + outside/re2/util/logging.h | 86 + outside/re2/util/mutex.h | 211 + outside/re2/util/pcre.cc | 961 ++++ outside/re2/util/pcre.h | 679 +++ outside/re2/util/random.cc | 34 + outside/re2/util/random.h | 29 + outside/re2/util/rune.cc | 258 + outside/re2/util/sparse_array.h | 453 ++ outside/re2/util/sparse_array_test.cc | 150 + outside/re2/util/sparse_set.h | 179 + outside/re2/util/stringpiece.cc | 87 + outside/re2/util/stringprintf.cc | 78 + outside/re2/util/strutil.cc | 97 + outside/re2/util/test.cc | 39 + outside/re2/util/test.h | 57 + outside/re2/util/thread.cc | 44 + outside/re2/util/thread.h | 26 + outside/re2/util/utf.h | 43 + outside/re2/util/util.h | 122 + outside/re2/util/valgrind.cc | 24 + outside/re2/util/valgrind.h | 4517 +++++++++++++++ 122 files changed, 49659 insertions(+) 
create mode 100644 outside/re2/.hgignore create mode 100644 outside/re2/AUTHORS create mode 100644 outside/re2/CONTRIBUTORS create mode 100644 outside/re2/LICENSE create mode 100644 outside/re2/Makefile create mode 100644 outside/re2/README create mode 100644 outside/re2/benchlog/benchlog.c2 create mode 100644 outside/re2/benchlog/benchlog.mini create mode 100644 outside/re2/benchlog/benchlog.r70 create mode 100644 outside/re2/benchlog/benchlog.wreck create mode 100755 outside/re2/benchlog/mktable create mode 100644 outside/re2/doc/README.xkcd create mode 100755 outside/re2/doc/mksyntaxgo create mode 100755 outside/re2/doc/mksyntaxhtml create mode 100755 outside/re2/doc/mksyntaxwiki create mode 100644 outside/re2/doc/syntax.html create mode 100644 outside/re2/doc/syntax.txt create mode 100644 outside/re2/doc/xkcd.png create mode 100644 outside/re2/lib/codereview/codereview.cfg create mode 100644 outside/re2/lib/codereview/codereview.py create mode 100644 outside/re2/libre2.symbols create mode 100644 outside/re2/libre2.symbols.darwin create mode 100644 outside/re2/re2/Makefile create mode 100644 outside/re2/re2/bitstate.cc create mode 100644 outside/re2/re2/compile.cc create mode 100644 outside/re2/re2/dfa.cc create mode 100644 outside/re2/re2/filtered_re2.cc create mode 100644 outside/re2/re2/filtered_re2.h create mode 100755 outside/re2/re2/make_perl_groups.pl create mode 100755 outside/re2/re2/make_unicode_casefold.py create mode 100755 outside/re2/re2/make_unicode_groups.py create mode 100644 outside/re2/re2/mimics_pcre.cc create mode 100644 outside/re2/re2/nfa.cc create mode 100644 outside/re2/re2/onepass.cc create mode 100644 outside/re2/re2/parse.cc create mode 100644 outside/re2/re2/perl_groups.cc create mode 100644 outside/re2/re2/prefilter.cc create mode 100644 outside/re2/re2/prefilter.h create mode 100644 outside/re2/re2/prefilter_tree.cc create mode 100644 outside/re2/re2/prefilter_tree.h create mode 100644 outside/re2/re2/prog.cc create mode 100644 
outside/re2/re2/prog.h create mode 100644 outside/re2/re2/re2.cc create mode 100644 outside/re2/re2/re2.h create mode 100644 outside/re2/re2/regexp.cc create mode 100644 outside/re2/re2/regexp.h create mode 100644 outside/re2/re2/set.cc create mode 100644 outside/re2/re2/set.h create mode 100644 outside/re2/re2/simplify.cc create mode 100644 outside/re2/re2/stringpiece.h create mode 100644 outside/re2/re2/testing/backtrack.cc create mode 100644 outside/re2/re2/testing/charclass_test.cc create mode 100644 outside/re2/re2/testing/compile_test.cc create mode 100644 outside/re2/re2/testing/dfa_test.cc create mode 100644 outside/re2/re2/testing/dump.cc create mode 100644 outside/re2/re2/testing/exhaustive1_test.cc create mode 100644 outside/re2/re2/testing/exhaustive2_test.cc create mode 100644 outside/re2/re2/testing/exhaustive3_test.cc create mode 100644 outside/re2/re2/testing/exhaustive_test.cc create mode 100644 outside/re2/re2/testing/exhaustive_tester.cc create mode 100644 outside/re2/re2/testing/exhaustive_tester.h create mode 100644 outside/re2/re2/testing/filtered_re2_test.cc create mode 100644 outside/re2/re2/testing/mimics_pcre_test.cc create mode 100644 outside/re2/re2/testing/null_walker.cc create mode 100644 outside/re2/re2/testing/parse_test.cc create mode 100644 outside/re2/re2/testing/possible_match_test.cc create mode 100644 outside/re2/re2/testing/random_test.cc create mode 100644 outside/re2/re2/testing/re2_arg_test.cc create mode 100644 outside/re2/re2/testing/re2_test.cc create mode 100644 outside/re2/re2/testing/regexp_benchmark.cc create mode 100644 outside/re2/re2/testing/regexp_generator.cc create mode 100644 outside/re2/re2/testing/regexp_generator.h create mode 100644 outside/re2/re2/testing/regexp_test.cc create mode 100644 outside/re2/re2/testing/required_prefix_test.cc create mode 100644 outside/re2/re2/testing/search_test.cc create mode 100644 outside/re2/re2/testing/set_test.cc create mode 100644 outside/re2/re2/testing/simplify_test.cc 
create mode 100644 outside/re2/re2/testing/string_generator.cc create mode 100644 outside/re2/re2/testing/string_generator.h create mode 100644 outside/re2/re2/testing/string_generator_test.cc create mode 100644 outside/re2/re2/testing/tester.cc create mode 100644 outside/re2/re2/testing/tester.h create mode 100755 outside/re2/re2/testing/unicode_test.py create mode 100644 outside/re2/re2/tostring.cc create mode 100644 outside/re2/re2/unicode.py create mode 100644 outside/re2/re2/unicode_casefold.cc create mode 100644 outside/re2/re2/unicode_casefold.h create mode 100644 outside/re2/re2/unicode_groups.cc create mode 100644 outside/re2/re2/unicode_groups.h create mode 100644 outside/re2/re2/variadic_function.h create mode 100644 outside/re2/re2/walker-inl.h create mode 100755 outside/re2/runtests create mode 100644 outside/re2/testinstall.cc create mode 100644 outside/re2/ucs2.diff create mode 100644 outside/re2/util/arena.cc create mode 100644 outside/re2/util/arena.h create mode 100644 outside/re2/util/atomicops.h create mode 100644 outside/re2/util/benchmark.cc create mode 100644 outside/re2/util/benchmark.h create mode 100644 outside/re2/util/flags.h create mode 100644 outside/re2/util/hash.cc create mode 100644 outside/re2/util/logging.h create mode 100644 outside/re2/util/mutex.h create mode 100644 outside/re2/util/pcre.cc create mode 100644 outside/re2/util/pcre.h create mode 100644 outside/re2/util/random.cc create mode 100644 outside/re2/util/random.h create mode 100644 outside/re2/util/rune.cc create mode 100644 outside/re2/util/sparse_array.h create mode 100644 outside/re2/util/sparse_array_test.cc create mode 100644 outside/re2/util/sparse_set.h create mode 100644 outside/re2/util/stringpiece.cc create mode 100644 outside/re2/util/stringprintf.cc create mode 100644 outside/re2/util/strutil.cc create mode 100644 outside/re2/util/test.cc create mode 100644 outside/re2/util/test.h create mode 100644 outside/re2/util/thread.cc create mode 100644 
outside/re2/util/thread.h create mode 100644 outside/re2/util/utf.h create mode 100644 outside/re2/util/util.h create mode 100644 outside/re2/util/valgrind.cc create mode 100644 outside/re2/util/valgrind.h diff --git a/outside/re2/.hgignore b/outside/re2/.hgignore new file mode 100644 index 000000000..bfa7eb715 --- /dev/null +++ b/outside/re2/.hgignore @@ -0,0 +1,7 @@ +syntax:glob +*.pyc +*.orig +core + +syntax:regexp +^obj/ diff --git a/outside/re2/AUTHORS b/outside/re2/AUTHORS new file mode 100644 index 000000000..0754006fe --- /dev/null +++ b/outside/re2/AUTHORS @@ -0,0 +1,13 @@ +# This is the official list of RE2 authors for copyright purposes. +# This file is distinct from the CONTRIBUTORS files. +# See the latter for an explanation. + +# Names should be added to this file as +# Name or Organization +# The email address is not required for organizations. + +# Please keep the list sorted. + +Google Inc. +Samsung Electronics +Stefano Rivera diff --git a/outside/re2/CONTRIBUTORS b/outside/re2/CONTRIBUTORS new file mode 100644 index 000000000..2a694bca6 --- /dev/null +++ b/outside/re2/CONTRIBUTORS @@ -0,0 +1,40 @@ +# This is the official list of people who can contribute +# (and typically have contributed) code to the RE2 repository. +# The AUTHORS file lists the copyright holders; this file +# lists people. For example, Google employees are listed here +# but not in AUTHORS, because Google holds the copyright. +# +# The submission process automatically checks to make sure +# that people submitting code are listed in this file (by email address). +# +# Names should be added to this file only after verifying that +# the individual or the individual's organization has agreed to +# the appropriate Contributor License Agreement, found here: +# +# http://code.google.com/legal/individual-cla-v1.0.html +# http://code.google.com/legal/corporate-cla-v1.0.html +# +# The agreement for individuals can be filled out on the web. 
+# +# When adding J Random Contributor's name to this file, +# either J's name or J's organization's name should be +# added to the AUTHORS file, depending on whether the +# individual or corporate CLA was used. + +# Names should be added to this file like so: +# Name + +# Please keep the list sorted. + +Dominic Battré +Doug Kwan +Dmitriy Vyukov +John Millikin +Mike Nazarewicz +Pawel Hajdan +Rob Pike +Russ Cox +Sanjay Ghemawat +Stefano Rivera +Srinivasan Venkatachary +Viatcheslav Ostapenko diff --git a/outside/re2/LICENSE b/outside/re2/LICENSE new file mode 100644 index 000000000..09e5ec1c7 --- /dev/null +++ b/outside/re2/LICENSE @@ -0,0 +1,27 @@ +// Copyright (c) 2009 The RE2 Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/outside/re2/Makefile b/outside/re2/Makefile new file mode 100644 index 000000000..88bf9430a --- /dev/null +++ b/outside/re2/Makefile @@ -0,0 +1,291 @@ +# Copyright 2009 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +all: obj/libre2.a obj/so/libre2.so + +# to build against PCRE for testing or benchmarking, +# uncomment the next two lines +# CCPCRE=-I/usr/local/include -DUSEPCRE +# LDPCRE=-L/usr/local/lib -lpcre + +CXX?=g++ +CXXFLAGS?=-Wall -O3 -g -pthread # can override +RE2_CXXFLAGS?=-Wno-sign-compare -c -I. $(CCPCRE) # required +LDFLAGS?= +AR?=ar +ARFLAGS?=rsc +NM?=nm +NMFLAGS?=-p + +# Variables mandated by GNU, the arbiter of all good taste on the internet. 
+# http://www.gnu.org/prep/standards/standards.html +prefix=/usr/local +exec_prefix=$(prefix) +bindir=$(exec_prefix)/bin +includedir=$(prefix)/include +libdir=$(exec_prefix)/lib +INSTALL=install +INSTALL_PROGRAM=$(INSTALL) +INSTALL_DATA=$(INSTALL) -m 644 + +# ABI version +# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html +SONAME=0 + +# To rebuild the Tables generated by Perl and Python scripts (requires Internet +# access for Unicode data), uncomment the following line: +# REBUILD_TABLES=1 + +ifeq ($(shell uname),Darwin) +MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib $(LDFLAGS) -exported_symbols_list libre2.symbols.darwin +else +MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.so.$(SONAME),--version-script=libre2.symbols $(LDFLAGS) +endif + +INSTALL_HFILES=\ + re2/filtered_re2.h\ + re2/re2.h\ + re2/set.h\ + re2/stringpiece.h\ + re2/variadic_function.h\ + +HFILES=\ + util/arena.h\ + util/atomicops.h\ + util/benchmark.h\ + util/flags.h\ + util/logging.h\ + util/mutex.h\ + util/pcre.h\ + util/random.h\ + util/sparse_array.h\ + util/sparse_set.h\ + util/test.h\ + util/utf.h\ + util/util.h\ + util/valgrind.h\ + re2/filtered_re2.h\ + re2/prefilter.h\ + re2/prefilter_tree.h\ + re2/prog.h\ + re2/re2.h\ + re2/regexp.h\ + re2/set.h\ + re2/stringpiece.h\ + re2/testing/exhaustive_tester.h\ + re2/testing/regexp_generator.h\ + re2/testing/string_generator.h\ + re2/testing/tester.h\ + re2/unicode_casefold.h\ + re2/unicode_groups.h\ + re2/variadic_function.h\ + re2/walker-inl.h\ + +OFILES=\ + obj/util/arena.o\ + obj/util/hash.o\ + obj/util/rune.o\ + obj/util/stringpiece.o\ + obj/util/stringprintf.o\ + obj/util/strutil.o\ + obj/util/valgrind.o\ + obj/re2/bitstate.o\ + obj/re2/compile.o\ + obj/re2/dfa.o\ + obj/re2/filtered_re2.o\ + obj/re2/mimics_pcre.o\ + obj/re2/nfa.o\ + obj/re2/onepass.o\ + obj/re2/parse.o\ + obj/re2/perl_groups.o\ + obj/re2/prefilter.o\ + obj/re2/prefilter_tree.o\ + obj/re2/prog.o\ + obj/re2/re2.o\ + obj/re2/regexp.o\ + obj/re2/set.o\ + 
obj/re2/simplify.o\ + obj/re2/tostring.o\ + obj/re2/unicode_casefold.o\ + obj/re2/unicode_groups.o\ + +TESTOFILES=\ + obj/util/pcre.o\ + obj/util/random.o\ + obj/util/thread.o\ + obj/re2/testing/backtrack.o\ + obj/re2/testing/dump.o\ + obj/re2/testing/exhaustive_tester.o\ + obj/re2/testing/null_walker.o\ + obj/re2/testing/regexp_generator.o\ + obj/re2/testing/string_generator.o\ + obj/re2/testing/tester.o\ + +TESTS=\ + obj/test/charclass_test\ + obj/test/compile_test\ + obj/test/filtered_re2_test\ + obj/test/mimics_pcre_test\ + obj/test/parse_test\ + obj/test/possible_match_test\ + obj/test/re2_test\ + obj/test/re2_arg_test\ + obj/test/regexp_test\ + obj/test/required_prefix_test\ + obj/test/search_test\ + obj/test/set_test\ + obj/test/simplify_test\ + obj/test/string_generator_test\ + +BIGTESTS=\ + obj/test/dfa_test\ + obj/test/exhaustive1_test\ + obj/test/exhaustive2_test\ + obj/test/exhaustive3_test\ + obj/test/exhaustive_test\ + obj/test/random_test\ + +SOFILES=$(patsubst obj/%,obj/so/%,$(OFILES)) +STESTOFILES=$(patsubst obj/%,obj/so/%,$(TESTOFILES)) +STESTS=$(patsubst obj/%,obj/so/%,$(TESTS)) +SBIGTESTS=$(patsubst obj/%,obj/so/%,$(BIGTESTS)) + +DOFILES=$(patsubst obj/%,obj/dbg/%,$(OFILES)) +DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES)) +DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS)) +DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS)) + +obj/%.o: %.cc $(HFILES) + @mkdir -p $$(dirname $@) + $(CXX) -o $@ $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc + +obj/dbg/%.o: %.cc $(HFILES) + @mkdir -p $$(dirname $@) + $(CXX) -o $@ -fPIC $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) $*.cc + +obj/so/%.o: %.cc $(HFILES) + @mkdir -p $$(dirname $@) + $(CXX) -o $@ -fPIC $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc + +obj/libre2.a: $(OFILES) + @mkdir -p obj + $(AR) $(ARFLAGS) obj/libre2.a $(OFILES) + +obj/dbg/libre2.a: $(DOFILES) + @mkdir -p obj/dbg + $(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES) + +obj/so/libre2.so: $(SOFILES) + @mkdir -p obj/so + 
$(MAKE_SHARED_LIBRARY) -o $@.$(SONAME) $(SOFILES) + ln -sf libre2.so.$(SONAME) $@ + +obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o + @mkdir -p obj/test + $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(LDFLAGS) $(LDPCRE) + +obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o + @mkdir -p obj/dbg/test + $(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(LDFLAGS) $(LDPCRE) + +obj/so/test/%: obj/so/libre2.so obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o + @mkdir -p obj/so/test + $(CXX) -o $@ obj/so/re2/testing/$*.o $(STESTOFILES) obj/so/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE) + +obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o + @mkdir -p obj/test + $(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o obj/libre2.a $(LDFLAGS) $(LDPCRE) + +ifdef REBUILD_TABLES +re2/perl_groups.cc: re2/make_perl_groups.pl + perl $< > $@ + +re2/unicode_%.cc: re2/make_unicode_%.py + python $< > $@ +endif + +distclean: clean + rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc + +clean: + rm -rf obj + rm -f re2/*.pyc + +testofiles: $(TESTOFILES) + +test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test + +debug-test: $(DTESTS) + @echo + @echo Running debug binary tests. + @echo + @./runtests $(DTESTS) + +static-test: $(TESTS) + @echo + @echo Running static binary tests. + @echo + @./runtests $(TESTS) + +shared-test: $(STESTS) + @echo + @echo Running dynamic binary tests. 
+ @echo + @LD_LIBRARY_PATH=obj/so:$(LD_LIBRARY_PATH) ./runtests $(STESTS) + +debug-bigtest: $(DTESTS) $(DBIGTESTS) + @./runtests $(DTESTS) $(DBIGTESTS) + +static-bigtest: $(TESTS) $(BIGTESTS) + @./runtests $(TESTS) $(BIGTESTS) + +shared-bigtest: $(STESTS) $(SBIGTESTS) + @LD_LIBRARY_PATH=obj/so:$(LD_LIBRARY_PATH) ./runtests $(STESTS) $(SBIGTESTS) + +benchmark: obj/test/regexp_benchmark + +install: obj/libre2.a obj/so/libre2.so + mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir) + $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2 + $(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a + $(INSTALL) obj/so/libre2.so $(DESTDIR)$(libdir)/libre2.so.$(SONAME).0.0 + ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so.$(SONAME) + ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so + +testinstall: + @mkdir -p obj + cp testinstall.cc obj + (cd obj && $(CXX) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -pthread -o testinstall) + LD_LIBRARY_PATH=$(DESTDIR)$(libdir) obj/testinstall + +benchlog: obj/test/regexp_benchmark + (echo '==BENCHMARK==' `hostname` `date`; \ + (uname -a; $(CXX) --version; hg identify; file obj/test/regexp_benchmark) | sed 's/^/# /'; \ + echo; \ + ./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//') + +# Keep gmake from deleting intermediate files it creates. +# This makes repeated builds faster and preserves debug info on OS X. 
+ +.PRECIOUS: obj/%.o obj/dbg/%.o obj/so/%.o obj/libre2.a \ + obj/dbg/libre2.a obj/so/libre2.a \ + obj/test/% obj/so/test/% obj/dbg/test/% + +log: + make clean + make CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/exhaustive{,1,2,3}_test + echo '#' RE2 exhaustive tests built by make log >re2-exhaustive.txt + echo '#' $$(date) >>re2-exhaustive.txt + obj/test/exhaustive_test |grep -v '^PASS$$' >>re2-exhaustive.txt + obj/test/exhaustive1_test |grep -v '^PASS$$' >>re2-exhaustive.txt + obj/test/exhaustive2_test |grep -v '^PASS$$' >>re2-exhaustive.txt + obj/test/exhaustive3_test |grep -v '^PASS$$' >>re2-exhaustive.txt + + make CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test + echo '#' RE2 basic search tests built by make $@ >re2-search.txt + echo '#' $$(date) >>re2-search.txt + obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt + +x: x.cc obj/libre2.a + g++ -I. -o x x.cc obj/libre2.a diff --git a/outside/re2/README b/outside/re2/README new file mode 100644 index 000000000..57b31813d --- /dev/null +++ b/outside/re2/README @@ -0,0 +1,19 @@ +This is the source code repository for RE2, a regular expression library. + +For documentation about how to install and use RE2, +visit http://code.google.com/p/re2/. + +The short version is: + +make +make test +make install +make testinstall + +Unless otherwise noted, the RE2 source files are distributed +under the BSD-style license found in the LICENSE file. + +RE2's native language is C++. +An Inferno wrapper is at http://code.google.com/p/inferno-re2/. +A Python wrapper is at http://github.com/facebook/pyre2/. +A Ruby wrapper is at http://github.com/axic/rre2/. 
diff --git a/outside/re2/benchlog/benchlog.c2 b/outside/re2/benchlog/benchlog.c2 new file mode 100644 index 000000000..2c1664c69 --- /dev/null +++ b/outside/re2/benchlog/benchlog.c2 @@ -0,0 +1,2211 @@ +c2=; apt-cache show libpcre3-dev +Package: libpcre3-dev +Priority: optional +Section: libdevel +Installed-Size: 712 +Maintainer: Ubuntu Developers +Original-Maintainer: Mark Baker +Architecture: amd64 +Source: pcre3 +Version: 7.8-3 +Depends: libc6-dev, libpcre3 (= 7.8-3), libpcrecpp0 (= 7.8-3) +Conflicts: libpcre1-dev, libpcre2-dev +Filename: pool/main/p/pcre3/libpcre3-dev_7.8-3_amd64.deb +Size: 263634 +MD5sum: 0a081735710002405b16b9fde7db3f90 +SHA1: 73d6ba90280a6d897f420ac99c62aceed8bc9886 +SHA256: 75a8ac25fba93e72d043f58cd4cde5af0c266a3764527c58b7b059cdc58d7d72 +Description: Perl 5 Compatible Regular Expression Library - development files + This is a library of functions to support regular expressions whose syntax + and semantics are as close as possible to those of the Perl 5 language. + . + This package contains the development files, including headers, static + libraries, and documentation. +Bugs: https://bugs.launchpad.net/ubuntu/+filebug +Origin: Ubuntu + +c2=; + + +==BENCHMARK== c2 Fri Feb 26 11:56:53 PST 2010 +# Linux c2 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64 GNU/Linux +# g++ (Ubuntu 4.4.1-4ubuntu8) 4.4.1 +# Copyright (C) 2009 Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# +# 10870db2d7b5+ tip + +Search_Easy0_CachedPCRE/8 10000000 127 ns/op 62.57 MB/s +Search_Easy0_CachedPCRE/16 10000000 156 ns/op 102.26 MB/s +Search_Easy0_CachedPCRE/32 5000000 213 ns/op 150.12 MB/s +Search_Easy0_CachedPCRE/64 5000000 326 ns/op 195.76 MB/s +Search_Easy0_CachedPCRE/128 5000000 427 ns/op 299.08 MB/s +Search_Easy0_CachedPCRE/256 1000000 1080 ns/op 236.95 MB/s +Search_Easy0_CachedPCRE/512 1000000 1741 ns/op 294.04 MB/s +Search_Easy0_CachedPCRE/1K 500000 3395 ns/op 301.54 MB/s +Search_Easy0_CachedPCRE/2K 200000 5680 ns/op 360.53 MB/s +Search_Easy0_CachedPCRE/4K 100000 10664 ns/op 384.07 MB/s +Search_Easy0_CachedPCRE/8K 50000 21849 ns/op 374.93 MB/s +Search_Easy0_CachedPCRE/16K 50000 42327 ns/op 387.08 MB/s +Search_Easy0_CachedPCRE/32K 20000 85374 ns/op 383.81 MB/s +Search_Easy0_CachedPCRE/64K 10000 169652 ns/op 386.30 MB/s +Search_Easy0_CachedPCRE/128K 5000 340683 ns/op 384.73 MB/s +Search_Easy0_CachedPCRE/256K 2000 679601 ns/op 385.73 MB/s +Search_Easy0_CachedPCRE/512K 1000 1361625 ns/op 385.05 MB/s +Search_Easy0_CachedPCRE/1M 500 2723438 ns/op 385.02 MB/s +Search_Easy0_CachedPCRE/2M 200 5470390 ns/op 383.36 MB/s +Search_Easy0_CachedPCRE/4M 100 11041050 ns/op 379.88 MB/s +Search_Easy0_CachedPCRE/8M 50 22165440 ns/op 378.45 MB/s +Search_Easy0_CachedPCRE/16M 50 44294160 ns/op 378.77 MB/s +Search_Easy0_CachedRE2/8 5000000 316 ns/op 25.25 MB/s +Search_Easy0_CachedRE2/16 5000000 317 ns/op 50.32 MB/s +Search_Easy0_CachedRE2/32 5000000 332 ns/op 96.26 MB/s +Search_Easy0_CachedRE2/64 5000000 334 ns/op 191.58 MB/s +Search_Easy0_CachedRE2/128 5000000 371 ns/op 344.40 MB/s +Search_Easy0_CachedRE2/256 5000000 391 ns/op 653.11 MB/s +Search_Easy0_CachedRE2/512 5000000 465 ns/op 1099.45 MB/s +Search_Easy0_CachedRE2/1K 2000000 664 ns/op 1541.25 MB/s +Search_Easy0_CachedRE2/2K 1000000 1015 ns/op 2015.77 MB/s +Search_Easy0_CachedRE2/4K 1000000 1581 ns/op 2590.41 MB/s +Search_Easy0_CachedRE2/8K 500000 2914 ns/op 2810.63 MB/s +Search_Easy0_CachedRE2/16K 200000 5406 ns/op 
3030.64 MB/s +Search_Easy0_CachedRE2/32K 100000 10992 ns/op 2980.97 MB/s +Search_Easy0_CachedRE2/64K 50000 21829 ns/op 3002.12 MB/s +Search_Easy0_CachedRE2/128K 50000 44263 ns/op 2961.20 MB/s +Search_Easy0_CachedRE2/256K 20000 88222 ns/op 2971.39 MB/s +Search_Easy0_CachedRE2/512K 10000 177626 ns/op 2951.64 MB/s +Search_Easy0_CachedRE2/1M 5000 356519 ns/op 2941.15 MB/s +Search_Easy0_CachedRE2/2M 2000 730121 ns/op 2872.33 MB/s +Search_Easy0_CachedRE2/4M 1000 1522926 ns/op 2754.11 MB/s +Search_Easy0_CachedRE2/8M 500 3093982 ns/op 2711.27 MB/s +Search_Easy0_CachedRE2/16M 200 6173845 ns/op 2717.47 MB/s +Search_Easy1_CachedPCRE/8 10000000 129 ns/op 61.93 MB/s +Search_Easy1_CachedPCRE/16 10000000 156 ns/op 102.24 MB/s +Search_Easy1_CachedPCRE/32 5000000 213 ns/op 150.10 MB/s +Search_Easy1_CachedPCRE/64 5000000 326 ns/op 195.85 MB/s +Search_Easy1_CachedPCRE/128 2000000 648 ns/op 197.35 MB/s +Search_Easy1_CachedPCRE/256 2000000 934 ns/op 273.84 MB/s +Search_Easy1_CachedPCRE/512 1000000 1971 ns/op 259.73 MB/s +Search_Easy1_CachedPCRE/1K 500000 3432 ns/op 298.32 MB/s +Search_Easy1_CachedPCRE/2K 200000 6255 ns/op 327.39 MB/s +Search_Easy1_CachedPCRE/4K 100000 11212 ns/op 365.31 MB/s +Search_Easy1_CachedPCRE/8K 50000 22182 ns/op 369.31 MB/s +Search_Easy1_CachedPCRE/16K 50000 42713 ns/op 383.58 MB/s +Search_Easy1_CachedPCRE/32K 20000 85747 ns/op 382.14 MB/s +Search_Easy1_CachedPCRE/64K 10000 170670 ns/op 383.99 MB/s +Search_Easy1_CachedPCRE/128K 5000 342979 ns/op 382.16 MB/s +Search_Easy1_CachedPCRE/256K 2000 683959 ns/op 383.27 MB/s +Search_Easy1_CachedPCRE/512K 1000 1370065 ns/op 382.67 MB/s +Search_Easy1_CachedPCRE/1M 500 2742576 ns/op 382.33 MB/s +Search_Easy1_CachedPCRE/2M 200 5507205 ns/op 380.80 MB/s +Search_Easy1_CachedPCRE/4M 100 11116960 ns/op 377.29 MB/s +Search_Easy1_CachedPCRE/8M 50 22302540 ns/op 376.13 MB/s +Search_Easy1_CachedPCRE/16M 50 44593400 ns/op 376.23 MB/s +Search_Easy1_CachedRE2/8 5000000 316 ns/op 25.30 MB/s +Search_Easy1_CachedRE2/16 5000000 318 ns/op 
50.29 MB/s +Search_Easy1_CachedRE2/32 5000000 331 ns/op 96.45 MB/s +Search_Easy1_CachedRE2/64 5000000 334 ns/op 191.09 MB/s +Search_Easy1_CachedRE2/128 5000000 367 ns/op 348.34 MB/s +Search_Easy1_CachedRE2/256 5000000 399 ns/op 640.68 MB/s +Search_Easy1_CachedRE2/512 5000000 476 ns/op 1073.44 MB/s +Search_Easy1_CachedRE2/1K 2000000 655 ns/op 1563.28 MB/s +Search_Easy1_CachedRE2/2K 1000000 1002 ns/op 2043.22 MB/s +Search_Easy1_CachedRE2/4K 1000000 1582 ns/op 2588.68 MB/s +Search_Easy1_CachedRE2/8K 500000 2916 ns/op 2808.86 MB/s +Search_Easy1_CachedRE2/16K 200000 5435 ns/op 3014.34 MB/s +Search_Easy1_CachedRE2/32K 100000 10957 ns/op 2990.35 MB/s +Search_Easy1_CachedRE2/64K 50000 21824 ns/op 3002.80 MB/s +Search_Easy1_CachedRE2/128K 50000 44255 ns/op 2961.71 MB/s +Search_Easy1_CachedRE2/256K 20000 88214 ns/op 2971.66 MB/s +Search_Easy1_CachedRE2/512K 10000 177657 ns/op 2951.12 MB/s +Search_Easy1_CachedRE2/1M 5000 356560 ns/op 2940.81 MB/s +Search_Easy1_CachedRE2/2M 2000 730094 ns/op 2872.44 MB/s +Search_Easy1_CachedRE2/4M 1000 1522720 ns/op 2754.48 MB/s +Search_Easy1_CachedRE2/8M 500 3093050 ns/op 2712.08 MB/s +Search_Easy1_CachedRE2/16M 200 6171535 ns/op 2718.48 MB/s +Search_Medium_CachedPCRE/8 10000000 128 ns/op 62.07 MB/s +Search_Medium_CachedPCRE/16 10000000 157 ns/op 101.40 MB/s +Search_Medium_CachedPCRE/32 5000000 214 ns/op 149.24 MB/s +Search_Medium_CachedPCRE/64 5000000 336 ns/op 190.31 MB/s +Search_Medium_CachedPCRE/128 5000000 430 ns/op 297.32 MB/s +Search_Medium_CachedPCRE/256 200000 8892 ns/op 28.79 MB/s +Search_Medium_CachedPCRE/512 50000 21295 ns/op 24.04 MB/s +Search_Medium_CachedPCRE/1K 50000 41581 ns/op 24.63 MB/s +Search_Medium_CachedPCRE/2K 20000 61200 ns/op 33.46 MB/s +Search_Medium_CachedPCRE/4K 10000 173807 ns/op 23.57 MB/s +Search_Medium_CachedPCRE/8K 5000 382058 ns/op 21.44 MB/s +Search_Medium_CachedPCRE/16K 2000 773090 ns/op 21.19 MB/s +Search_Medium_CachedPCRE/32K 1000 1545797 ns/op 21.20 MB/s +Search_Medium_CachedPCRE/64K 500 3076340 ns/op 
21.30 MB/s +Search_Medium_CachedPCRE/128K 200 6134010 ns/op 21.37 MB/s +Search_Medium_CachedPCRE/256K 100 12315460 ns/op 21.29 MB/s +Search_Medium_CachedRE2/8 5000000 338 ns/op 23.62 MB/s +Search_Medium_CachedRE2/16 5000000 363 ns/op 43.99 MB/s +Search_Medium_CachedRE2/32 5000000 413 ns/op 77.32 MB/s +Search_Medium_CachedRE2/64 2000000 515 ns/op 124.15 MB/s +Search_Medium_CachedRE2/128 2000000 722 ns/op 177.20 MB/s +Search_Medium_CachedRE2/256 1000000 1126 ns/op 227.29 MB/s +Search_Medium_CachedRE2/512 1000000 1937 ns/op 264.32 MB/s +Search_Medium_CachedRE2/1K 500000 3553 ns/op 288.18 MB/s +Search_Medium_CachedRE2/2K 200000 6788 ns/op 301.71 MB/s +Search_Medium_CachedRE2/4K 100000 13258 ns/op 308.92 MB/s +Search_Medium_CachedRE2/8K 50000 26198 ns/op 312.69 MB/s +Search_Medium_CachedRE2/16K 20000 52097 ns/op 314.48 MB/s +Search_Medium_CachedRE2/32K 10000 103975 ns/op 315.15 MB/s +Search_Medium_CachedRE2/64K 5000 207487 ns/op 315.86 MB/s +Search_Medium_CachedRE2/128K 5000 414637 ns/op 316.11 MB/s +Search_Medium_CachedRE2/256K 2000 828752 ns/op 316.31 MB/s +Search_Medium_CachedRE2/512K 1000 1657280 ns/op 316.35 MB/s +Search_Medium_CachedRE2/1M 500 3314560 ns/op 316.35 MB/s +Search_Medium_CachedRE2/2M 200 6643535 ns/op 315.67 MB/s +Search_Medium_CachedRE2/4M 100 13338160 ns/op 314.46 MB/s +Search_Medium_CachedRE2/8M 50 26716200 ns/op 313.99 MB/s +Search_Medium_CachedRE2/16M 20 53439850 ns/op 313.95 MB/s +Search_Hard_CachedPCRE/8 10000000 128 ns/op 62.06 MB/s +Search_Hard_CachedPCRE/16 10000000 157 ns/op 101.42 MB/s +Search_Hard_CachedPCRE/32 5000000 214 ns/op 149.37 MB/s +Search_Hard_CachedPCRE/64 5000000 336 ns/op 190.26 MB/s +Search_Hard_CachedPCRE/128 5000000 430 ns/op 297.44 MB/s +Search_Hard_CachedPCRE/256 2000 780527 ns/op 0.33 MB/s +Search_Hard_CachedPCRE/512 500 3210270 ns/op 0.16 MB/s +Search_Hard_CachedPCRE/1K 100 12762760 ns/op 0.08 MB/s +Search_Hard_CachedPCRE/2K 50 46734020 ns/op 0.04 MB/s +Search_Hard_CachedPCRE/4K 5 201439400 ns/op 0.02 MB/s 
+Search_Hard_CachedRE2/8 5000000 338 ns/op 23.65 MB/s +Search_Hard_CachedRE2/16 5000000 363 ns/op 44.05 MB/s +Search_Hard_CachedRE2/32 5000000 411 ns/op 77.70 MB/s +Search_Hard_CachedRE2/64 2000000 512 ns/op 124.89 MB/s +Search_Hard_CachedRE2/128 2000000 721 ns/op 177.47 MB/s +Search_Hard_CachedRE2/256 1000000 1125 ns/op 227.50 MB/s +Search_Hard_CachedRE2/512 1000000 1933 ns/op 264.85 MB/s +Search_Hard_CachedRE2/1K 500000 3550 ns/op 288.42 MB/s +Search_Hard_CachedRE2/2K 200000 6786 ns/op 301.80 MB/s +Search_Hard_CachedRE2/4K 100000 13256 ns/op 308.97 MB/s +Search_Hard_CachedRE2/8K 50000 26197 ns/op 312.71 MB/s +Search_Hard_CachedRE2/16K 20000 52077 ns/op 314.61 MB/s +Search_Hard_CachedRE2/32K 10000 103962 ns/op 315.19 MB/s +Search_Hard_CachedRE2/64K 5000 207496 ns/op 315.84 MB/s +Search_Hard_CachedRE2/128K 5000 414609 ns/op 316.13 MB/s +Search_Hard_CachedRE2/256K 2000 828753 ns/op 316.31 MB/s +Search_Hard_CachedRE2/512K 1000 1657228 ns/op 316.36 MB/s +Search_Hard_CachedRE2/1M 500 3314250 ns/op 316.38 MB/s +Search_Hard_CachedRE2/2M 200 6643040 ns/op 315.69 MB/s +Search_Hard_CachedRE2/4M 100 13337040 ns/op 314.49 MB/s +Search_Hard_CachedRE2/8M 50 26716100 ns/op 313.99 MB/s +Search_Hard_CachedRE2/16M 20 53433550 ns/op 313.98 MB/s +Search_Parens_CachedPCRE/8 5000000 213 ns/op 37.43 MB/s +Search_Parens_CachedRE2/8 5000000 337 ns/op 23.73 MB/s +Search_Parens_CachedRE2/16 5000000 362 ns/op 44.12 MB/s +Search_Parens_CachedRE2/32 5000000 412 ns/op 77.59 MB/s +Search_Parens_CachedRE2/64 2000000 514 ns/op 124.37 MB/s +Search_Parens_CachedRE2/128 2000000 721 ns/op 177.32 MB/s +Search_Parens_CachedRE2/256 1000000 1125 ns/op 227.50 MB/s +Search_Parens_CachedRE2/512 1000000 1932 ns/op 264.92 MB/s +Search_Parens_CachedRE2/1K 500000 3550 ns/op 288.37 MB/s +Search_Parens_CachedRE2/2K 200000 6786 ns/op 301.78 MB/s +Search_Parens_CachedRE2/4K 100000 13258 ns/op 308.94 MB/s +Search_Parens_CachedRE2/8K 50000 26199 ns/op 312.68 MB/s +Search_Parens_CachedRE2/16K 20000 52095 ns/op 314.50 
MB/s +Search_Parens_CachedRE2/32K 10000 103958 ns/op 315.20 MB/s +Search_Parens_CachedRE2/64K 5000 207520 ns/op 315.81 MB/s +Search_Parens_CachedRE2/128K 5000 414602 ns/op 316.14 MB/s +Search_Parens_CachedRE2/256K 2000 828782 ns/op 316.30 MB/s +Search_Parens_CachedRE2/512K 1000 1657076 ns/op 316.39 MB/s +Search_Parens_CachedRE2/1M 500 3314154 ns/op 316.39 MB/s +Search_Parens_CachedRE2/2M 200 6643900 ns/op 315.65 MB/s +Search_Parens_CachedRE2/4M 100 13336670 ns/op 314.49 MB/s +Search_Parens_CachedRE2/8M 50 26714480 ns/op 314.01 MB/s +Search_Parens_CachedRE2/16M 20 53434900 ns/op 313.97 MB/s +Search_BigFixed_CachedPCRE/8 5000000 251 ns/op 31.76 MB/s +Search_BigFixed_CachedPCRE/16 5000000 314 ns/op 50.80 MB/s +Search_BigFixed_CachedPCRE/32 5000000 441 ns/op 72.49 MB/s +Search_BigFixed_CachedPCRE/64 2000000 694 ns/op 92.21 MB/s +Search_BigFixed_CachedPCRE/128 1000000 1066 ns/op 119.99 MB/s +Search_BigFixed_CachedPCRE/256 1000000 1933 ns/op 132.39 MB/s +Search_BigFixed_CachedPCRE/512 500000 3652 ns/op 140.19 MB/s +Search_BigFixed_CachedPCRE/1K 200000 7089 ns/op 144.43 MB/s +Search_BigFixed_CachedPCRE/2K 100000 13964 ns/op 146.66 MB/s +Search_BigFixed_CachedPCRE/4K 50000 27716 ns/op 147.78 MB/s +Search_BigFixed_CachedPCRE/8K 20000 55232 ns/op 148.32 MB/s +Search_BigFixed_CachedPCRE/16K 10000 110321 ns/op 148.51 MB/s +Search_BigFixed_CachedPCRE/32K 5000 220561 ns/op 148.57 MB/s +Search_BigFixed_CachedRE2/8 10000000 131 ns/op 60.69 MB/s +Search_BigFixed_CachedRE2/16 5000000 374 ns/op 42.75 MB/s +Search_BigFixed_CachedRE2/32 5000000 410 ns/op 77.99 MB/s +Search_BigFixed_CachedRE2/64 5000000 489 ns/op 130.84 MB/s +Search_BigFixed_CachedRE2/128 2000000 635 ns/op 201.43 MB/s +Search_BigFixed_CachedRE2/256 2000000 945 ns/op 270.66 MB/s +Search_BigFixed_CachedRE2/512 1000000 1552 ns/op 329.85 MB/s +Search_BigFixed_CachedRE2/1K 500000 2766 ns/op 370.19 MB/s +Search_BigFixed_CachedRE2/2K 200000 5191 ns/op 394.49 MB/s +Search_BigFixed_CachedRE2/4K 100000 10046 ns/op 407.71 MB/s 
+Search_BigFixed_CachedRE2/8K 100000 19752 ns/op 414.74 MB/s +Search_BigFixed_CachedRE2/16K 50000 39168 ns/op 418.30 MB/s +Search_BigFixed_CachedRE2/32K 20000 78114 ns/op 419.49 MB/s +Search_BigFixed_CachedRE2/64K 10000 155895 ns/op 420.38 MB/s +Search_BigFixed_CachedRE2/128K 5000 311573 ns/op 420.68 MB/s +Search_BigFixed_CachedRE2/256K 2000 624241 ns/op 419.94 MB/s +Search_BigFixed_CachedRE2/512K 1000 1253377 ns/op 418.30 MB/s +Search_BigFixed_CachedRE2/1M 500 2530874 ns/op 414.31 MB/s +Search_Success_PCRE/8 1000000 1836 ns/op 4.36 MB/s +Search_Success_PCRE/16 1000000 1880 ns/op 8.51 MB/s +Search_Success_PCRE/32 1000000 1970 ns/op 16.24 MB/s +Search_Success_PCRE/64 500000 2106 ns/op 30.38 MB/s +Search_Success_PCRE/128 500000 2447 ns/op 52.29 MB/s +Search_Success_PCRE/256 500000 3103 ns/op 82.48 MB/s +Search_Success_PCRE/512 500000 4428 ns/op 115.62 MB/s +Search_Success_PCRE/1K 200000 7053 ns/op 145.17 MB/s +Search_Success_PCRE/2K 100000 12308 ns/op 166.39 MB/s +Search_Success_PCRE/4K 50000 22793 ns/op 179.70 MB/s +Search_Success_PCRE/8K 50000 43847 ns/op 186.83 MB/s +Search_Success_PCRE/16K 20000 85952 ns/op 190.62 MB/s +Search_Success_PCRE/32K 10000 170305 ns/op 192.41 MB/s +Search_Success_PCRE/64K 5000 338862 ns/op 193.40 MB/s +Search_Success_PCRE/128K 2000 676940 ns/op 193.62 MB/s +Search_Success_PCRE/256K 1000 1355784 ns/op 193.35 MB/s +Search_Success_PCRE/512K 500 2725254 ns/op 192.38 MB/s +Search_Success_PCRE/1M 200 5542255 ns/op 189.20 MB/s +Search_Success_PCRE/2M 100 11433880 ns/op 183.42 MB/s +Search_Success_PCRE/4M 50 24217120 ns/op 173.20 MB/s +Search_Success_PCRE/8M 20 56016550 ns/op 149.75 MB/s +Search_Success_PCRE/16M 10 137107400 ns/op 122.37 MB/s +Search_Success_RE2/8 200000 8525 ns/op 0.94 MB/s +Search_Success_RE2/16 100000 19567 ns/op 0.82 MB/s +Search_Success_RE2/32 100000 19549 ns/op 1.64 MB/s +Search_Success_RE2/64 100000 19744 ns/op 3.24 MB/s +Search_Success_RE2/128 100000 19919 ns/op 6.43 MB/s +Search_Success_RE2/256 50000 20201 ns/op 12.67 
MB/s +Search_Success_RE2/512 50000 20993 ns/op 24.39 MB/s +Search_Success_RE2/1K 50000 22581 ns/op 45.35 MB/s +Search_Success_RE2/2K 50000 25897 ns/op 79.08 MB/s +Search_Success_RE2/4K 50000 32389 ns/op 126.46 MB/s +Search_Success_RE2/8K 50000 45266 ns/op 180.97 MB/s +Search_Success_RE2/16K 20000 71222 ns/op 230.04 MB/s +Search_Success_RE2/32K 10000 123342 ns/op 265.67 MB/s +Search_Success_RE2/64K 5000 227134 ns/op 288.53 MB/s +Search_Success_RE2/128K 5000 434534 ns/op 301.64 MB/s +Search_Success_RE2/256K 2000 852033 ns/op 307.67 MB/s +Search_Success_RE2/512K 1000 1692057 ns/op 309.85 MB/s +Search_Success_RE2/1M 500 3396306 ns/op 308.74 MB/s +Search_Success_RE2/2M 200 6984505 ns/op 300.26 MB/s +Search_Success_RE2/4M 100 14632000 ns/op 286.65 MB/s +Search_Success_RE2/8M 50 31782800 ns/op 263.94 MB/s +Search_Success_RE2/16M 10 103645400 ns/op 161.87 MB/s +Search_Success_CachedPCRE/8 5000000 257 ns/op 31.04 MB/s +Search_Success_CachedPCRE/16 5000000 308 ns/op 51.88 MB/s +Search_Success_CachedPCRE/32 5000000 409 ns/op 78.14 MB/s +Search_Success_CachedPCRE/64 2000000 611 ns/op 104.66 MB/s +Search_Success_CachedPCRE/128 2000000 889 ns/op 143.85 MB/s +Search_Success_CachedPCRE/256 1000000 1546 ns/op 165.48 MB/s +Search_Success_CachedPCRE/512 500000 2861 ns/op 178.95 MB/s +Search_Success_CachedPCRE/1K 200000 5491 ns/op 186.46 MB/s +Search_Success_CachedPCRE/2K 100000 10746 ns/op 190.57 MB/s +Search_Success_CachedPCRE/4K 50000 21262 ns/op 192.64 MB/s +Search_Success_CachedPCRE/8K 50000 42295 ns/op 193.69 MB/s +Search_Success_CachedPCRE/16K 20000 84375 ns/op 194.18 MB/s +Search_Success_CachedPCRE/32K 10000 168635 ns/op 194.31 MB/s +Search_Success_CachedPCRE/64K 5000 337158 ns/op 194.38 MB/s +Search_Success_CachedPCRE/128K 2000 675199 ns/op 194.12 MB/s +Search_Success_CachedPCRE/256K 1000 1353970 ns/op 193.61 MB/s +Search_Success_CachedPCRE/512K 500 2723300 ns/op 192.52 MB/s +Search_Success_CachedPCRE/1M 200 5539695 ns/op 189.28 MB/s +Search_Success_CachedPCRE/2M 100 11424760 
ns/op 183.56 MB/s +Search_Success_CachedPCRE/4M 50 24204760 ns/op 173.28 MB/s +Search_Success_CachedPCRE/8M 20 55998450 ns/op 149.80 MB/s +Search_Success_CachedPCRE/16M 10 137082500 ns/op 122.39 MB/s +Search_Success_CachedRE2/8 10000000 126 ns/op 63.05 MB/s +Search_Success_CachedRE2/16 5000000 373 ns/op 42.86 MB/s +Search_Success_CachedRE2/32 5000000 423 ns/op 75.51 MB/s +Search_Success_CachedRE2/64 2000000 523 ns/op 122.33 MB/s +Search_Success_CachedRE2/128 2000000 730 ns/op 175.15 MB/s +Search_Success_CachedRE2/256 1000000 1135 ns/op 225.51 MB/s +Search_Success_CachedRE2/512 1000000 1942 ns/op 263.51 MB/s +Search_Success_CachedRE2/1K 500000 3562 ns/op 287.44 MB/s +Search_Success_CachedRE2/2K 200000 6797 ns/op 301.31 MB/s +Search_Success_CachedRE2/4K 100000 13268 ns/op 308.70 MB/s +Search_Success_CachedRE2/8K 50000 26210 ns/op 312.55 MB/s +Search_Success_CachedRE2/16K 20000 52116 ns/op 314.37 MB/s +Search_Success_CachedRE2/32K 10000 104050 ns/op 314.92 MB/s +Search_Success_CachedRE2/64K 5000 207912 ns/op 315.21 MB/s +Search_Success_CachedRE2/128K 5000 415393 ns/op 315.54 MB/s +Search_Success_CachedRE2/256K 2000 832643 ns/op 314.83 MB/s +Search_Success_CachedRE2/512K 1000 1672561 ns/op 313.46 MB/s +Search_Success_CachedRE2/1M 500 3376196 ns/op 310.58 MB/s +Search_Success_CachedRE2/2M 200 6957190 ns/op 301.44 MB/s +Search_Success_CachedRE2/4M 100 14592130 ns/op 287.44 MB/s +Search_Success_CachedRE2/8M 50 31731860 ns/op 264.36 MB/s +Search_Success_CachedRE2/16M 10 103597500 ns/op 161.95 MB/s +Search_Success1_PCRE/8 500000 2053 ns/op 3.90 MB/s +Search_Success1_PCRE/16 500000 2061 ns/op 7.76 MB/s +Search_Success1_PCRE/32 500000 2169 ns/op 14.75 MB/s +Search_Success1_PCRE/64 500000 2310 ns/op 27.70 MB/s +Search_Success1_PCRE/128 500000 2640 ns/op 48.48 MB/s +Search_Success1_PCRE/256 500000 3292 ns/op 77.76 MB/s +Search_Success1_PCRE/512 500000 4593 ns/op 111.47 MB/s +Search_Success1_PCRE/1K 200000 7241 ns/op 141.40 MB/s +Search_Success1_PCRE/2K 100000 12489 ns/op 163.98 
MB/s +Search_Success1_PCRE/4K 50000 22994 ns/op 178.13 MB/s +Search_Success1_PCRE/8K 50000 44014 ns/op 186.12 MB/s +Search_Success1_PCRE/16K 20000 86120 ns/op 190.24 MB/s +Search_Success1_PCRE/32K 10000 170489 ns/op 192.20 MB/s +Search_Success1_PCRE/64K 5000 339029 ns/op 193.30 MB/s +Search_Success1_PCRE/128K 2000 677115 ns/op 193.57 MB/s +Search_Success1_PCRE/256K 1000 1355861 ns/op 193.34 MB/s +Search_Success1_PCRE/512K 500 2725160 ns/op 192.39 MB/s +Search_Success1_PCRE/1M 200 5543665 ns/op 189.15 MB/s +Search_Success1_PCRE/2M 100 11434390 ns/op 183.41 MB/s +Search_Success1_PCRE/4M 50 24215940 ns/op 173.20 MB/s +Search_Success1_PCRE/8M 20 56027250 ns/op 149.72 MB/s +Search_Success1_PCRE/16M 10 137103200 ns/op 122.37 MB/s +Search_Success1_RE2/8 50000 26411 ns/op 0.30 MB/s +Search_Success1_RE2/16 50000 27068 ns/op 0.59 MB/s +Search_Success1_RE2/32 50000 27117 ns/op 1.18 MB/s +Search_Success1_RE2/64 50000 27405 ns/op 2.34 MB/s +Search_Success1_RE2/128 50000 27398 ns/op 4.67 MB/s +Search_Success1_RE2/256 50000 27580 ns/op 9.28 MB/s +Search_Success1_RE2/512 50000 28504 ns/op 17.96 MB/s +Search_Success1_RE2/1K 50000 29993 ns/op 34.14 MB/s +Search_Success1_RE2/2K 50000 33373 ns/op 61.37 MB/s +Search_Success1_RE2/4K 50000 39867 ns/op 102.74 MB/s +Search_Success1_RE2/8K 20000 52940 ns/op 154.74 MB/s +Search_Success1_RE2/16K 20000 78818 ns/op 207.87 MB/s +Search_Success1_RE2/32K 10000 130836 ns/op 250.45 MB/s +Search_Success1_RE2/64K 5000 234725 ns/op 279.20 MB/s +Search_Success1_RE2/128K 5000 442253 ns/op 296.37 MB/s +Search_Success1_RE2/256K 2000 859671 ns/op 304.94 MB/s +Search_Success1_RE2/512K 1000 1699921 ns/op 308.42 MB/s +Search_Success1_RE2/1M 500 3404204 ns/op 308.02 MB/s +Search_Success1_RE2/2M 200 6992400 ns/op 299.92 MB/s +Search_Success1_RE2/4M 100 14641200 ns/op 286.47 MB/s +Search_Success1_RE2/8M 50 31788680 ns/op 263.89 MB/s +Search_Success1_RE2/16M 10 103656000 ns/op 161.85 MB/s +Search_Success1_Cached_PCRE/8 5000000 305 ns/op 26.22 MB/s 
+Search_Success1_Cached_PCRE/16 5000000 355 ns/op 44.96 MB/s +Search_Success1_Cached_PCRE/32 5000000 456 ns/op 70.06 MB/s +Search_Success1_Cached_PCRE/64 2000000 658 ns/op 97.12 MB/s +Search_Success1_Cached_PCRE/128 2000000 936 ns/op 136.62 MB/s +Search_Success1_Cached_PCRE/256 1000000 1593 ns/op 160.62 MB/s +Search_Success1_Cached_PCRE/512 500000 2908 ns/op 176.05 MB/s +Search_Success1_Cached_PCRE/1K 200000 5537 ns/op 184.93 MB/s +Search_Success1_Cached_PCRE/2K 100000 10793 ns/op 189.74 MB/s +Search_Success1_Cached_PCRE/4K 50000 21311 ns/op 192.19 MB/s +Search_Success1_Cached_PCRE/8K 50000 42340 ns/op 193.48 MB/s +Search_Success1_Cached_PCRE/16K 20000 84417 ns/op 194.08 MB/s +Search_Success1_Cached_PCRE/32K 10000 168689 ns/op 194.25 MB/s +Search_Success1_Cached_PCRE/64K 5000 337219 ns/op 194.34 MB/s +Search_Success1_Cached_PCRE/128K 2000 675255 ns/op 194.11 MB/s +Search_Success1_Cached_PCRE/256K 1000 1354027 ns/op 193.60 MB/s +Search_Success1_Cached_PCRE/512K 500 2723352 ns/op 192.52 MB/s +Search_Success1_Cached_PCRE/1M 200 5539800 ns/op 189.28 MB/s +Search_Success1_Cached_PCRE/2M 100 11426990 ns/op 183.53 MB/s +Search_Success1_Cached_PCRE/4M 50 24206500 ns/op 173.27 MB/s +Search_Success1_Cached_PCRE/8M 20 56008200 ns/op 149.77 MB/s +Search_Success1_Cached_PCRE/16M 10 137084600 ns/op 122.39 MB/s +Search_Success1_Cached_RE2/8 5000000 347 ns/op 22.99 MB/s +Search_Success1_Cached_RE2/16 5000000 373 ns/op 42.83 MB/s +Search_Success1_Cached_RE2/32 5000000 421 ns/op 75.97 MB/s +Search_Success1_Cached_RE2/64 2000000 520 ns/op 122.97 MB/s +Search_Success1_Cached_RE2/128 2000000 729 ns/op 175.43 MB/s +Search_Success1_Cached_RE2/256 1000000 1133 ns/op 225.82 MB/s +Search_Success1_Cached_RE2/512 1000000 1945 ns/op 263.23 MB/s +Search_Success1_Cached_RE2/1K 500000 3559 ns/op 287.66 MB/s +Search_Success1_Cached_RE2/2K 200000 6795 ns/op 301.39 MB/s +Search_Success1_Cached_RE2/4K 100000 13266 ns/op 308.74 MB/s +Search_Success1_Cached_RE2/8K 50000 26210 ns/op 312.54 MB/s 
+Search_Success1_Cached_RE2/16K 20000 52116 ns/op 314.37 MB/s +Search_Success1_Cached_RE2/32K 10000 104042 ns/op 314.95 MB/s +Search_Success1_Cached_RE2/64K 5000 207904 ns/op 315.22 MB/s +Search_Success1_Cached_RE2/128K 5000 415336 ns/op 315.58 MB/s +Search_Success1_Cached_RE2/256K 2000 832674 ns/op 314.82 MB/s +Search_Success1_Cached_RE2/512K 1000 1672745 ns/op 313.43 MB/s +Search_Success1_Cached_RE2/1M 500 3376504 ns/op 310.55 MB/s +Search_Success1_Cached_RE2/2M 200 6957405 ns/op 301.43 MB/s +Search_Success1_Cached_RE2/4M 100 14592660 ns/op 287.43 MB/s +Search_Success1_Cached_RE2/8M 50 31728560 ns/op 264.39 MB/s +Search_Success1_Cached_RE2/16M 10 103598300 ns/op 161.94 MB/s +Search_Digits_PCRE 200000 5212 ns/op +Search_Digits_RE2 50000 21686 ns/op +Parse_Digits_PCRE 200000 5229 ns/op +Parse_Digits_RE2 200000 9825 ns/op +Parse_CachedDigits_PCRE 2000000 519 ns/op +Parse_CachedDigits_RE2 5000000 271 ns/op +Parse_DigitDs_PCRE 500000 4224 ns/op +Parse_DigitDs_RE2 200000 9706 ns/op +Parse_CachedDigitDs_PCRE 2000000 505 ns/op +Parse_CachedDigitDs_RE2 5000000 279 ns/op +Parse_Split_PCRE 500000 3533 ns/op +Parse_Split_RE2 100000 11256 ns/op +Parse_CachedSplit_PCRE 5000000 373 ns/op +Parse_CachedSplit_RE2 10000000 167 ns/op +Parse_SplitHard_PCRE 500000 3350 ns/op +Parse_SplitHard_RE2 100000 13959 ns/op +Parse_CachedSplitHard_PCRE 5000000 352 ns/op +Parse_CachedSplitHard_RE2 1000000 1780 ns/op +Parse_CachedSplitBig1_PCRE 500 4902314 ns/op +Parse_CachedSplitBig1_RE2 2000 674772 ns/op +Parse_CachedSplitBig2_PCRE 2000 513858 ns/op +Parse_CachedSplitBig2_RE2 20 52044800 ns/op +BM_PCRE_Compile 500000 3767 ns/op +BM_RE2_Compile 100000 10752 ns/op +SearchPhone_CachedPCRE/8 1000000 1231 ns/op 6.50 MB/s +SearchPhone_CachedPCRE/16 500000 2026 ns/op 7.89 MB/s +SearchPhone_CachedPCRE/32 500000 3623 ns/op 8.83 MB/s +SearchPhone_CachedPCRE/64 200000 6813 ns/op 9.39 MB/s +SearchPhone_CachedPCRE/128 100000 13330 ns/op 9.60 MB/s +SearchPhone_CachedPCRE/256 50000 25832 ns/op 9.91 MB/s 
+SearchPhone_CachedPCRE/512 20000 51132 ns/op 10.01 MB/s +SearchPhone_CachedPCRE/1K 10000 101950 ns/op 10.04 MB/s +SearchPhone_CachedPCRE/2K 10000 199960 ns/op 10.24 MB/s +SearchPhone_CachedPCRE/4K 5000 397105 ns/op 10.31 MB/s +SearchPhone_CachedPCRE/8K 2000 792685 ns/op 10.33 MB/s +SearchPhone_CachedPCRE/16K 1000 1576834 ns/op 10.39 MB/s +SearchPhone_CachedPCRE/32K 500 3152026 ns/op 10.40 MB/s +SearchPhone_CachedPCRE/64K 200 6293925 ns/op 10.41 MB/s +SearchPhone_CachedPCRE/128K 100 12613350 ns/op 10.39 MB/s +SearchPhone_CachedPCRE/256K 50 25253020 ns/op 10.38 MB/s +SearchPhone_CachedPCRE/512K 20 50462800 ns/op 10.39 MB/s +SearchPhone_CachedPCRE/1M 10 101412600 ns/op 10.34 MB/s +SearchPhone_CachedPCRE/2M 5 203302200 ns/op 10.32 MB/s +SearchPhone_CachedPCRE/4M 5 404935400 ns/op 10.36 MB/s +SearchPhone_CachedPCRE/8M 2 810444500 ns/op 10.35 MB/s +SearchPhone_CachedPCRE/16M 1 1615334000 ns/op 10.39 MB/s +SearchPhone_CachedRE2/8 2000000 897 ns/op 8.91 MB/s +SearchPhone_CachedRE2/16 2000000 928 ns/op 17.24 MB/s +SearchPhone_CachedRE2/32 2000000 968 ns/op 33.04 MB/s +SearchPhone_CachedRE2/64 1000000 1069 ns/op 59.84 MB/s +SearchPhone_CachedRE2/128 1000000 1286 ns/op 99.52 MB/s +SearchPhone_CachedRE2/256 1000000 1691 ns/op 151.31 MB/s +SearchPhone_CachedRE2/512 500000 2496 ns/op 205.07 MB/s +SearchPhone_CachedRE2/1K 500000 4107 ns/op 249.27 MB/s +SearchPhone_CachedRE2/2K 200000 7347 ns/op 278.74 MB/s +SearchPhone_CachedRE2/4K 100000 13824 ns/op 296.29 MB/s +SearchPhone_CachedRE2/8K 50000 26758 ns/op 306.15 MB/s +SearchPhone_CachedRE2/16K 20000 52773 ns/op 310.46 MB/s +SearchPhone_CachedRE2/32K 10000 104775 ns/op 312.75 MB/s +SearchPhone_CachedRE2/64K 5000 208321 ns/op 314.59 MB/s +SearchPhone_CachedRE2/128K 5000 415436 ns/op 315.50 MB/s +SearchPhone_CachedRE2/256K 2000 829659 ns/op 315.97 MB/s +SearchPhone_CachedRE2/512K 1000 1658073 ns/op 316.20 MB/s +SearchPhone_CachedRE2/1M 500 3315418 ns/op 316.27 MB/s +SearchPhone_CachedRE2/2M 200 6645570 ns/op 315.57 MB/s 
+SearchPhone_CachedRE2/4M 100 13341780 ns/op 314.37 MB/s +SearchPhone_CachedRE2/8M 50 26722980 ns/op 313.91 MB/s +SearchPhone_CachedRE2/16M 20 53451450 ns/op 313.88 MB/s +EmptyPartialMatchPCRE 10000000 139 ns/op +EmptyPartialMatchRE2 5000000 314 ns/op +SimplePartialMatchPCRE 10000000 195 ns/op +SimplePartialMatchRE2 5000000 352 ns/op +HTTPPartialMatchPCRE 2000000 577 ns/op +HTTPPartialMatchRE2 2000000 624 ns/op +SmallHTTPPartialMatchPCRE 2000000 577 ns/op +SmallHTTPPartialMatchRE2 2000000 622 ns/op +DotMatchPCRE 5000000 455 ns/op +DotMatchRE2 2000000 671 ns/op +ASCIIMatchPCRE 5000000 400 ns/op +ASCIIMatchRE2 2000000 676 ns/op +==BENCHMARK== c2 Fri Feb 26 14:16:33 PST 2010 +# Linux c2 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64 GNU/Linux +# g++ (Ubuntu 4.4.1-4ubuntu8) 4.4.1 +# Copyright (C) 2009 Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# +# c268b421d457+ tip + +Search_Easy0_CachedPCRE/8 10000000 132 ns/op 60.47 MB/s +Search_Easy0_CachedPCRE/16 10000000 160 ns/op 99.62 MB/s +Search_Easy0_CachedPCRE/32 5000000 217 ns/op 147.10 MB/s +Search_Easy0_CachedPCRE/64 5000000 331 ns/op 193.17 MB/s +Search_Easy0_CachedPCRE/128 5000000 431 ns/op 296.60 MB/s +Search_Easy0_CachedPCRE/256 1000000 1081 ns/op 236.78 MB/s +Search_Easy0_CachedPCRE/512 1000000 1741 ns/op 294.06 MB/s +Search_Easy0_CachedPCRE/1K 500000 3393 ns/op 301.76 MB/s +Search_Easy0_CachedPCRE/2K 200000 5681 ns/op 360.45 MB/s +Search_Easy0_CachedPCRE/4K 100000 10623 ns/op 385.57 MB/s +Search_Easy0_CachedPCRE/8K 50000 21776 ns/op 376.18 MB/s +Search_Easy0_CachedPCRE/16K 50000 42162 ns/op 388.59 MB/s +Search_Easy0_CachedPCRE/32K 20000 85140 ns/op 384.87 MB/s +Search_Easy0_CachedPCRE/64K 10000 169773 ns/op 386.02 MB/s +Search_Easy0_CachedPCRE/128K 5000 340985 ns/op 384.39 MB/s +Search_Easy0_CachedPCRE/256K 2000 680440 ns/op 385.26 MB/s +Search_Easy0_CachedPCRE/512K 1000 1362919 ns/op 384.68 MB/s +Search_Easy0_CachedPCRE/1M 500 2726382 ns/op 384.60 MB/s +Search_Easy0_CachedPCRE/2M 200 5456280 ns/op 384.36 MB/s +Search_Easy0_CachedPCRE/4M 100 11055420 ns/op 379.39 MB/s +Search_Easy0_CachedPCRE/8M 50 22173320 ns/op 378.32 MB/s +Search_Easy0_CachedPCRE/16M 50 44321260 ns/op 378.54 MB/s +Search_Easy0_CachedRE2/8 5000000 314 ns/op 25.47 MB/s +Search_Easy0_CachedRE2/16 5000000 315 ns/op 50.72 MB/s +Search_Easy0_CachedRE2/32 5000000 331 ns/op 96.44 MB/s +Search_Easy0_CachedRE2/64 5000000 332 ns/op 192.59 MB/s +Search_Easy0_CachedRE2/128 5000000 363 ns/op 352.00 MB/s +Search_Easy0_CachedRE2/256 5000000 389 ns/op 658.00 MB/s +Search_Easy0_CachedRE2/512 5000000 469 ns/op 1089.76 MB/s +Search_Easy0_CachedRE2/1K 2000000 652 ns/op 1569.80 MB/s +Search_Easy0_CachedRE2/2K 1000000 1013 ns/op 2020.66 MB/s +Search_Easy0_CachedRE2/4K 1000000 1571 ns/op 2606.84 MB/s +Search_Easy0_CachedRE2/8K 500000 2911 ns/op 2814.06 MB/s +Search_Easy0_CachedRE2/16K 200000 5405 ns/op 
3030.77 MB/s +Search_Easy0_CachedRE2/32K 100000 10989 ns/op 2981.79 MB/s +Search_Easy0_CachedRE2/64K 50000 21839 ns/op 3000.77 MB/s +Search_Easy0_CachedRE2/128K 50000 44376 ns/op 2953.66 MB/s +Search_Easy0_CachedRE2/256K 20000 88364 ns/op 2966.64 MB/s +Search_Easy0_CachedRE2/512K 10000 177685 ns/op 2950.64 MB/s +Search_Easy0_CachedRE2/1M 5000 356602 ns/op 2940.46 MB/s +Search_Easy0_CachedRE2/2M 2000 715631 ns/op 2930.49 MB/s +Search_Easy0_CachedRE2/4M 1000 1529594 ns/op 2742.10 MB/s +Search_Easy0_CachedRE2/8M 500 3089266 ns/op 2715.40 MB/s +Search_Easy0_CachedRE2/16M 200 6153925 ns/op 2726.26 MB/s +Search_Easy1_CachedPCRE/8 10000000 132 ns/op 60.48 MB/s +Search_Easy1_CachedPCRE/16 10000000 160 ns/op 99.52 MB/s +Search_Easy1_CachedPCRE/32 5000000 217 ns/op 147.11 MB/s +Search_Easy1_CachedPCRE/64 5000000 331 ns/op 193.20 MB/s +Search_Easy1_CachedPCRE/128 2000000 648 ns/op 197.44 MB/s +Search_Easy1_CachedPCRE/256 2000000 935 ns/op 273.76 MB/s +Search_Easy1_CachedPCRE/512 1000000 1966 ns/op 260.32 MB/s +Search_Easy1_CachedPCRE/1K 500000 3418 ns/op 299.53 MB/s +Search_Easy1_CachedPCRE/2K 200000 6237 ns/op 328.33 MB/s +Search_Easy1_CachedPCRE/4K 100000 11125 ns/op 368.18 MB/s +Search_Easy1_CachedPCRE/8K 50000 22022 ns/op 371.98 MB/s +Search_Easy1_CachedPCRE/16K 50000 42402 ns/op 386.39 MB/s +Search_Easy1_CachedPCRE/32K 20000 85237 ns/op 384.43 MB/s +Search_Easy1_CachedPCRE/64K 10000 170201 ns/op 385.05 MB/s +Search_Easy1_CachedPCRE/128K 5000 342009 ns/op 383.24 MB/s +Search_Easy1_CachedPCRE/256K 2000 682201 ns/op 384.26 MB/s +Search_Easy1_CachedPCRE/512K 1000 1366471 ns/op 383.68 MB/s +Search_Easy1_CachedPCRE/1M 500 2735128 ns/op 383.37 MB/s +Search_Easy1_CachedPCRE/2M 200 5471205 ns/op 383.31 MB/s +Search_Easy1_CachedPCRE/4M 100 11093340 ns/op 378.09 MB/s +Search_Easy1_CachedPCRE/8M 50 22240420 ns/op 377.18 MB/s +Search_Easy1_CachedPCRE/16M 50 44464400 ns/op 377.32 MB/s +Search_Easy1_CachedRE2/8 5000000 316 ns/op 25.27 MB/s +Search_Easy1_CachedRE2/16 5000000 317 ns/op 
50.44 MB/s +Search_Easy1_CachedRE2/32 5000000 330 ns/op 96.79 MB/s +Search_Easy1_CachedRE2/64 5000000 334 ns/op 191.06 MB/s +Search_Easy1_CachedRE2/128 5000000 365 ns/op 350.44 MB/s +Search_Easy1_CachedRE2/256 5000000 400 ns/op 639.30 MB/s +Search_Easy1_CachedRE2/512 5000000 472 ns/op 1083.02 MB/s +Search_Easy1_CachedRE2/1K 2000000 652 ns/op 1570.19 MB/s +Search_Easy1_CachedRE2/2K 1000000 1002 ns/op 2043.19 MB/s +Search_Easy1_CachedRE2/4K 1000000 1576 ns/op 2598.26 MB/s +Search_Easy1_CachedRE2/8K 500000 2924 ns/op 2801.57 MB/s +Search_Easy1_CachedRE2/16K 200000 5449 ns/op 3006.54 MB/s +Search_Easy1_CachedRE2/32K 100000 10985 ns/op 2982.90 MB/s +Search_Easy1_CachedRE2/64K 50000 21837 ns/op 3001.13 MB/s +Search_Easy1_CachedRE2/128K 50000 44336 ns/op 2956.31 MB/s +Search_Easy1_CachedRE2/256K 20000 88350 ns/op 2967.08 MB/s +Search_Easy1_CachedRE2/512K 10000 177698 ns/op 2950.43 MB/s +Search_Easy1_CachedRE2/1M 5000 356645 ns/op 2940.11 MB/s +Search_Easy1_CachedRE2/2M 2000 715710 ns/op 2930.17 MB/s +Search_Easy1_CachedRE2/4M 1000 1529932 ns/op 2741.50 MB/s +Search_Easy1_CachedRE2/8M 500 3087586 ns/op 2716.88 MB/s +Search_Easy1_CachedRE2/16M 200 6155690 ns/op 2725.48 MB/s +Search_Medium_CachedPCRE/8 10000000 133 ns/op 59.81 MB/s +Search_Medium_CachedPCRE/16 10000000 162 ns/op 98.58 MB/s +Search_Medium_CachedPCRE/32 5000000 219 ns/op 145.96 MB/s +Search_Medium_CachedPCRE/64 5000000 340 ns/op 188.00 MB/s +Search_Medium_CachedPCRE/128 5000000 434 ns/op 294.67 MB/s +Search_Medium_CachedPCRE/256 200000 9076 ns/op 28.20 MB/s +Search_Medium_CachedPCRE/512 50000 21579 ns/op 23.73 MB/s +Search_Medium_CachedPCRE/1K 50000 42391 ns/op 24.16 MB/s +Search_Medium_CachedPCRE/2K 20000 62367 ns/op 32.84 MB/s +Search_Medium_CachedPCRE/4K 10000 153667 ns/op 26.66 MB/s +Search_Medium_CachedPCRE/8K 5000 332606 ns/op 24.63 MB/s +Search_Medium_CachedPCRE/16K 2000 677805 ns/op 24.17 MB/s +Search_Medium_CachedPCRE/32K 1000 1355730 ns/op 24.17 MB/s +Search_Medium_CachedPCRE/64K 500 2707474 ns/op 
24.21 MB/s +Search_Medium_CachedPCRE/128K 200 5409525 ns/op 24.23 MB/s +Search_Medium_CachedPCRE/256K 100 10821290 ns/op 24.22 MB/s +Search_Medium_CachedRE2/8 5000000 335 ns/op 23.87 MB/s +Search_Medium_CachedRE2/16 5000000 362 ns/op 44.16 MB/s +Search_Medium_CachedRE2/32 5000000 408 ns/op 78.36 MB/s +Search_Medium_CachedRE2/64 2000000 510 ns/op 125.32 MB/s +Search_Medium_CachedRE2/128 2000000 723 ns/op 176.99 MB/s +Search_Medium_CachedRE2/256 1000000 1125 ns/op 227.47 MB/s +Search_Medium_CachedRE2/512 1000000 1935 ns/op 264.50 MB/s +Search_Medium_CachedRE2/1K 500000 3553 ns/op 288.20 MB/s +Search_Medium_CachedRE2/2K 200000 6794 ns/op 301.41 MB/s +Search_Medium_CachedRE2/4K 100000 13257 ns/op 308.96 MB/s +Search_Medium_CachedRE2/8K 50000 26198 ns/op 312.69 MB/s +Search_Medium_CachedRE2/16K 20000 52087 ns/op 314.55 MB/s +Search_Medium_CachedRE2/32K 10000 103942 ns/op 315.25 MB/s +Search_Medium_CachedRE2/64K 5000 207481 ns/op 315.86 MB/s +Search_Medium_CachedRE2/128K 5000 414561 ns/op 316.17 MB/s +Search_Medium_CachedRE2/256K 2000 828789 ns/op 316.30 MB/s +Search_Medium_CachedRE2/512K 1000 1657133 ns/op 316.38 MB/s +Search_Medium_CachedRE2/1M 500 3314164 ns/op 316.39 MB/s +Search_Medium_CachedRE2/2M 200 6632795 ns/op 316.18 MB/s +Search_Medium_CachedRE2/4M 100 13340680 ns/op 314.40 MB/s +Search_Medium_CachedRE2/8M 50 26721100 ns/op 313.93 MB/s +Search_Medium_CachedRE2/16M 20 53443050 ns/op 313.93 MB/s +Search_Hard_CachedPCRE/8 10000000 133 ns/op 59.77 MB/s +Search_Hard_CachedPCRE/16 10000000 162 ns/op 98.62 MB/s +Search_Hard_CachedPCRE/32 5000000 219 ns/op 145.97 MB/s +Search_Hard_CachedPCRE/64 5000000 340 ns/op 188.06 MB/s +Search_Hard_CachedPCRE/128 5000000 434 ns/op 294.69 MB/s +Search_Hard_CachedPCRE/256 2000 573267 ns/op 0.45 MB/s +Search_Hard_CachedPCRE/512 500 2347118 ns/op 0.22 MB/s +Search_Hard_CachedPCRE/1K 200 9316730 ns/op 0.11 MB/s +Search_Hard_CachedPCRE/2K 50 34064460 ns/op 0.06 MB/s +Search_Hard_CachedPCRE/4K 10 146725200 ns/op 0.03 MB/s 
+Search_Hard_CachedRE2/8 5000000 335 ns/op 23.87 MB/s +Search_Hard_CachedRE2/16 5000000 363 ns/op 44.03 MB/s +Search_Hard_CachedRE2/32 5000000 411 ns/op 77.80 MB/s +Search_Hard_CachedRE2/64 2000000 510 ns/op 125.28 MB/s +Search_Hard_CachedRE2/128 2000000 720 ns/op 177.74 MB/s +Search_Hard_CachedRE2/256 1000000 1125 ns/op 227.38 MB/s +Search_Hard_CachedRE2/512 1000000 1936 ns/op 264.45 MB/s +Search_Hard_CachedRE2/1K 500000 3552 ns/op 288.25 MB/s +Search_Hard_CachedRE2/2K 200000 6794 ns/op 301.41 MB/s +Search_Hard_CachedRE2/4K 100000 13257 ns/op 308.96 MB/s +Search_Hard_CachedRE2/8K 50000 26201 ns/op 312.66 MB/s +Search_Hard_CachedRE2/16K 20000 52089 ns/op 314.53 MB/s +Search_Hard_CachedRE2/32K 10000 103959 ns/op 315.20 MB/s +Search_Hard_CachedRE2/64K 5000 207483 ns/op 315.86 MB/s +Search_Hard_CachedRE2/128K 5000 414583 ns/op 316.15 MB/s +Search_Hard_CachedRE2/256K 2000 828720 ns/op 316.32 MB/s +Search_Hard_CachedRE2/512K 1000 1657121 ns/op 316.38 MB/s +Search_Hard_CachedRE2/1M 500 3314102 ns/op 316.40 MB/s +Search_Hard_CachedRE2/2M 200 6632065 ns/op 316.21 MB/s +Search_Hard_CachedRE2/4M 100 13339990 ns/op 314.42 MB/s +Search_Hard_CachedRE2/8M 50 26721960 ns/op 313.92 MB/s +Search_Hard_CachedRE2/16M 20 53440900 ns/op 313.94 MB/s +Search_Parens_CachedPCRE/8 10000000 197 ns/op 40.42 MB/s +Search_Parens_CachedRE2/8 5000000 334 ns/op 23.90 MB/s +Search_Parens_CachedRE2/16 5000000 359 ns/op 44.46 MB/s +Search_Parens_CachedRE2/32 5000000 413 ns/op 77.42 MB/s +Search_Parens_CachedRE2/64 2000000 511 ns/op 125.07 MB/s +Search_Parens_CachedRE2/128 2000000 722 ns/op 177.10 MB/s +Search_Parens_CachedRE2/256 1000000 1128 ns/op 226.81 MB/s +Search_Parens_CachedRE2/512 1000000 1935 ns/op 264.47 MB/s +Search_Parens_CachedRE2/1K 500000 3561 ns/op 287.49 MB/s +Search_Parens_CachedRE2/2K 200000 6787 ns/op 301.72 MB/s +Search_Parens_CachedRE2/4K 100000 13262 ns/op 308.84 MB/s +Search_Parens_CachedRE2/8K 50000 26204 ns/op 312.61 MB/s +Search_Parens_CachedRE2/16K 20000 52095 ns/op 314.50 
MB/s +Search_Parens_CachedRE2/32K 10000 103945 ns/op 315.24 MB/s +Search_Parens_CachedRE2/64K 5000 207517 ns/op 315.81 MB/s +Search_Parens_CachedRE2/128K 5000 414628 ns/op 316.12 MB/s +Search_Parens_CachedRE2/256K 2000 828799 ns/op 316.29 MB/s +Search_Parens_CachedRE2/512K 1000 1657224 ns/op 316.37 MB/s +Search_Parens_CachedRE2/1M 500 3314264 ns/op 316.38 MB/s +Search_Parens_CachedRE2/2M 200 6633485 ns/op 316.15 MB/s +Search_Parens_CachedRE2/4M 100 13340780 ns/op 314.40 MB/s +Search_Parens_CachedRE2/8M 50 26719280 ns/op 313.95 MB/s +Search_Parens_CachedRE2/16M 20 53447850 ns/op 313.90 MB/s +Search_BigFixed_CachedPCRE/8 5000000 242 ns/op 32.93 MB/s +Search_BigFixed_CachedPCRE/16 5000000 301 ns/op 53.06 MB/s +Search_BigFixed_CachedPCRE/32 5000000 418 ns/op 76.48 MB/s +Search_BigFixed_CachedPCRE/64 2000000 652 ns/op 98.09 MB/s +Search_BigFixed_CachedPCRE/128 2000000 985 ns/op 129.90 MB/s +Search_BigFixed_CachedPCRE/256 1000000 1775 ns/op 144.21 MB/s +Search_BigFixed_CachedPCRE/512 500000 3342 ns/op 153.19 MB/s +Search_BigFixed_CachedPCRE/1K 200000 6476 ns/op 158.12 MB/s +Search_BigFixed_CachedPCRE/2K 100000 12745 ns/op 160.68 MB/s +Search_BigFixed_CachedPCRE/4K 50000 25284 ns/op 162.00 MB/s +Search_BigFixed_CachedPCRE/8K 20000 50366 ns/op 162.65 MB/s +Search_BigFixed_CachedPCRE/16K 10000 100603 ns/op 162.86 MB/s +Search_BigFixed_CachedPCRE/32K 5000 201124 ns/op 162.92 MB/s +Search_BigFixed_CachedRE2/8 10000000 130 ns/op 61.36 MB/s +Search_BigFixed_CachedRE2/16 5000000 375 ns/op 42.64 MB/s +Search_BigFixed_CachedRE2/32 5000000 407 ns/op 78.61 MB/s +Search_BigFixed_CachedRE2/64 5000000 486 ns/op 131.63 MB/s +Search_BigFixed_CachedRE2/128 2000000 630 ns/op 203.08 MB/s +Search_BigFixed_CachedRE2/256 2000000 945 ns/op 270.64 MB/s +Search_BigFixed_CachedRE2/512 1000000 1547 ns/op 330.90 MB/s +Search_BigFixed_CachedRE2/1K 500000 2765 ns/op 370.29 MB/s +Search_BigFixed_CachedRE2/2K 200000 5187 ns/op 394.77 MB/s +Search_BigFixed_CachedRE2/4K 100000 10045 ns/op 407.74 MB/s 
+Search_BigFixed_CachedRE2/8K 100000 19754 ns/op 414.68 MB/s +Search_BigFixed_CachedRE2/16K 50000 39160 ns/op 418.39 MB/s +Search_BigFixed_CachedRE2/32K 20000 78097 ns/op 419.58 MB/s +Search_BigFixed_CachedRE2/64K 10000 155858 ns/op 420.48 MB/s +Search_BigFixed_CachedRE2/128K 5000 311449 ns/op 420.85 MB/s +Search_BigFixed_CachedRE2/256K 2000 623620 ns/op 420.36 MB/s +Search_BigFixed_CachedRE2/512K 1000 1250862 ns/op 419.14 MB/s +Search_BigFixed_CachedRE2/1M 500 2517654 ns/op 416.49 MB/s +Search_Success_PCRE/8 1000000 1812 ns/op 4.41 MB/s +Search_Success_PCRE/16 1000000 1852 ns/op 8.64 MB/s +Search_Success_PCRE/32 1000000 1935 ns/op 16.53 MB/s +Search_Success_PCRE/64 500000 2130 ns/op 30.04 MB/s +Search_Success_PCRE/128 500000 2480 ns/op 51.61 MB/s +Search_Success_PCRE/256 500000 3190 ns/op 80.25 MB/s +Search_Success_PCRE/512 500000 4611 ns/op 111.02 MB/s +Search_Success_PCRE/1K 200000 7430 ns/op 137.80 MB/s +Search_Success_PCRE/2K 100000 13072 ns/op 156.66 MB/s +Search_Success_PCRE/4K 50000 24385 ns/op 167.97 MB/s +Search_Success_PCRE/8K 50000 47046 ns/op 174.13 MB/s +Search_Success_PCRE/16K 20000 92417 ns/op 177.28 MB/s +Search_Success_PCRE/32K 10000 183262 ns/op 178.80 MB/s +Search_Success_PCRE/64K 5000 364683 ns/op 179.71 MB/s +Search_Success_PCRE/128K 2000 728298 ns/op 179.97 MB/s +Search_Success_PCRE/256K 1000 1457823 ns/op 179.82 MB/s +Search_Success_PCRE/512K 500 2926208 ns/op 179.17 MB/s +Search_Success_PCRE/1M 200 5926520 ns/op 176.93 MB/s +Search_Success_PCRE/2M 100 12118480 ns/op 173.05 MB/s +Search_Success_PCRE/4M 50 25402020 ns/op 165.12 MB/s +Search_Success_PCRE/8M 20 56959600 ns/op 147.27 MB/s +Search_Success_PCRE/16M 10 134219200 ns/op 125.00 MB/s +Search_Success_RE2/8 200000 8371 ns/op 0.96 MB/s +Search_Success_RE2/16 100000 19886 ns/op 0.80 MB/s +Search_Success_RE2/32 100000 19774 ns/op 1.62 MB/s +Search_Success_RE2/64 50000 20190 ns/op 3.17 MB/s +Search_Success_RE2/128 50000 20169 ns/op 6.35 MB/s +Search_Success_RE2/256 50000 20632 ns/op 12.41 
MB/s +Search_Success_RE2/512 50000 21598 ns/op 23.71 MB/s +Search_Success_RE2/1K 50000 23051 ns/op 44.42 MB/s +Search_Success_RE2/2K 50000 26258 ns/op 77.99 MB/s +Search_Success_RE2/4K 50000 32804 ns/op 124.86 MB/s +Search_Success_RE2/8K 50000 45835 ns/op 178.73 MB/s +Search_Success_RE2/16K 20000 71685 ns/op 228.55 MB/s +Search_Success_RE2/32K 10000 123817 ns/op 264.65 MB/s +Search_Success_RE2/64K 5000 227706 ns/op 287.81 MB/s +Search_Success_RE2/128K 5000 435094 ns/op 301.25 MB/s +Search_Success_RE2/256K 2000 851813 ns/op 307.75 MB/s +Search_Success_RE2/512K 1000 1689866 ns/op 310.25 MB/s +Search_Success_RE2/1M 500 3385158 ns/op 309.76 MB/s +Search_Success_RE2/2M 200 6914280 ns/op 303.31 MB/s +Search_Success_RE2/4M 100 14404490 ns/op 291.18 MB/s +Search_Success_RE2/8M 50 30838520 ns/op 272.02 MB/s +Search_Success_RE2/16M 10 7977066800 ns/op 2.10 MB/s +Search_Success_CachedPCRE/8 5000000 247 ns/op 32.27 MB/s +Search_Success_CachedPCRE/16 5000000 289 ns/op 55.29 MB/s +Search_Success_CachedPCRE/32 5000000 396 ns/op 80.68 MB/s +Search_Success_CachedPCRE/64 2000000 611 ns/op 104.66 MB/s +Search_Success_CachedPCRE/128 2000000 1760 ns/op 72.71 MB/s +==BENCHMARK== c2 Fri Feb 26 14:31:16 PST 2010 +# Linux c2 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64 GNU/Linux +# g++ (Ubuntu 4.4.1-4ubuntu8) 4.4.1 +# Copyright (C) 2009 Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# +# c268b421d457+ tip + +Search_Easy0_CachedPCRE/8 10000000 131 ns/op 61.07 MB/s +Search_Easy0_CachedPCRE/16 10000000 159 ns/op 100.32 MB/s +Search_Easy0_CachedPCRE/32 5000000 216 ns/op 147.77 MB/s +Search_Easy0_CachedPCRE/64 5000000 330 ns/op 193.83 MB/s +Search_Easy0_CachedPCRE/128 5000000 430 ns/op 297.34 MB/s +Search_Easy0_CachedPCRE/256 1000000 1080 ns/op 236.97 MB/s +Search_Easy0_CachedPCRE/512 1000000 1739 ns/op 294.27 MB/s +Search_Easy0_CachedPCRE/1K 500000 3389 ns/op 302.07 MB/s +Search_Easy0_CachedPCRE/2K 200000 5681 ns/op 360.45 MB/s +Search_Easy0_CachedPCRE/4K 100000 10629 ns/op 385.34 MB/s +Search_Easy0_CachedPCRE/8K 50000 21776 ns/op 376.18 MB/s +Search_Easy0_CachedPCRE/16K 50000 42192 ns/op 388.32 MB/s +Search_Easy0_CachedPCRE/32K 20000 85172 ns/op 384.73 MB/s +Search_Easy0_CachedPCRE/64K 10000 169816 ns/op 385.92 MB/s +Search_Easy0_CachedPCRE/128K 5000 341102 ns/op 384.26 MB/s +Search_Easy0_CachedPCRE/256K 2000 680651 ns/op 385.14 MB/s +Search_Easy0_CachedPCRE/512K 1000 1362954 ns/op 384.67 MB/s +Search_Easy0_CachedPCRE/1M 500 2726140 ns/op 384.64 MB/s +Search_Easy0_CachedPCRE/2M 200 5463185 ns/op 383.87 MB/s +Search_Easy0_CachedPCRE/4M 100 11055500 ns/op 379.39 MB/s +Search_Easy0_CachedPCRE/8M 50 22168840 ns/op 378.40 MB/s +Search_Easy0_CachedPCRE/16M 50 44330340 ns/op 378.46 MB/s +Search_Easy0_CachedRE2/8 5000000 318 ns/op 25.09 MB/s +Search_Easy0_CachedRE2/16 5000000 317 ns/op 50.33 MB/s +Search_Easy0_CachedRE2/32 5000000 341 ns/op 93.71 MB/s +Search_Easy0_CachedRE2/64 5000000 350 ns/op 182.71 MB/s +Search_Easy0_CachedRE2/128 5000000 383 ns/op 333.80 MB/s +Search_Easy0_CachedRE2/256 5000000 401 ns/op 636.89 MB/s +Search_Easy0_CachedRE2/512 5000000 483 ns/op 1058.27 MB/s +Search_Easy0_CachedRE2/1K 2000000 672 ns/op 1523.32 MB/s +Search_Easy0_CachedRE2/2K 1000000 1023 ns/op 2000.24 MB/s +Search_Easy0_CachedRE2/4K 1000000 1597 ns/op 2564.44 MB/s +Search_Easy0_CachedRE2/8K 500000 2918 ns/op 2807.09 MB/s +Search_Easy0_CachedRE2/16K 200000 5429 ns/op 
3017.39 MB/s +Search_Easy0_CachedRE2/32K 100000 11045 ns/op 2966.75 MB/s +Search_Easy0_CachedRE2/64K 50000 21873 ns/op 2996.08 MB/s +Search_Easy0_CachedRE2/128K 50000 44398 ns/op 2952.16 MB/s +Search_Easy0_CachedRE2/256K 20000 88429 ns/op 2964.44 MB/s +Search_Easy0_CachedRE2/512K 10000 177688 ns/op 2950.60 MB/s +Search_Easy0_CachedRE2/1M 5000 356798 ns/op 2938.84 MB/s +Search_Easy0_CachedRE2/2M 2000 721040 ns/op 2908.51 MB/s +Search_Easy0_CachedRE2/4M 1000 1526733 ns/op 2747.24 MB/s +Search_Easy0_CachedRE2/8M 500 3085732 ns/op 2718.51 MB/s +Search_Easy0_CachedRE2/16M 200 6155395 ns/op 2725.61 MB/s +Search_Easy1_CachedPCRE/8 10000000 131 ns/op 60.93 MB/s +Search_Easy1_CachedPCRE/16 10000000 159 ns/op 100.31 MB/s +Search_Easy1_CachedPCRE/32 5000000 216 ns/op 147.91 MB/s +Search_Easy1_CachedPCRE/64 5000000 330 ns/op 193.81 MB/s +Search_Easy1_CachedPCRE/128 2000000 647 ns/op 197.81 MB/s +Search_Easy1_CachedPCRE/256 2000000 933 ns/op 274.18 MB/s +Search_Easy1_CachedPCRE/512 1000000 1969 ns/op 260.02 MB/s +Search_Easy1_CachedPCRE/1K 500000 3440 ns/op 297.67 MB/s +Search_Easy1_CachedPCRE/2K 200000 6230 ns/op 328.73 MB/s +Search_Easy1_CachedPCRE/4K 100000 11116 ns/op 368.45 MB/s +Search_Easy1_CachedPCRE/8K 50000 22010 ns/op 372.19 MB/s +Search_Easy1_CachedPCRE/16K 50000 42395 ns/op 386.45 MB/s +Search_Easy1_CachedPCRE/32K 20000 85210 ns/op 384.55 MB/s +Search_Easy1_CachedPCRE/64K 10000 170224 ns/op 385.00 MB/s +Search_Easy1_CachedPCRE/128K 5000 342017 ns/op 383.23 MB/s +Search_Easy1_CachedPCRE/256K 2000 682168 ns/op 384.28 MB/s +Search_Easy1_CachedPCRE/512K 1000 1366582 ns/op 383.65 MB/s +Search_Easy1_CachedPCRE/1M 500 2735192 ns/op 383.36 MB/s +Search_Easy1_CachedPCRE/2M 200 5480130 ns/op 382.68 MB/s +Search_Easy1_CachedPCRE/4M 100 11087200 ns/op 378.30 MB/s +Search_Easy1_CachedPCRE/8M 50 22238640 ns/op 377.21 MB/s +Search_Easy1_CachedPCRE/16M 50 44462340 ns/op 377.34 MB/s +Search_Easy1_CachedRE2/8 5000000 318 ns/op 25.09 MB/s +Search_Easy1_CachedRE2/16 5000000 317 ns/op 
50.36 MB/s +Search_Easy1_CachedRE2/32 5000000 345 ns/op 92.55 MB/s +Search_Easy1_CachedRE2/64 5000000 350 ns/op 182.79 MB/s +Search_Easy1_CachedRE2/128 5000000 385 ns/op 331.75 MB/s +Search_Easy1_CachedRE2/256 5000000 408 ns/op 626.83 MB/s +Search_Easy1_CachedRE2/512 5000000 484 ns/op 1056.72 MB/s +Search_Easy1_CachedRE2/1K 2000000 676 ns/op 1513.66 MB/s +Search_Easy1_CachedRE2/2K 1000000 1020 ns/op 2007.55 MB/s +Search_Easy1_CachedRE2/4K 1000000 1596 ns/op 2564.98 MB/s +Search_Easy1_CachedRE2/8K 500000 2918 ns/op 2806.79 MB/s +Search_Easy1_CachedRE2/16K 200000 5447 ns/op 3007.74 MB/s +Search_Easy1_CachedRE2/32K 100000 11037 ns/op 2968.84 MB/s +Search_Easy1_CachedRE2/64K 50000 21863 ns/op 2997.48 MB/s +Search_Easy1_CachedRE2/128K 50000 44394 ns/op 2952.41 MB/s +Search_Easy1_CachedRE2/256K 20000 88430 ns/op 2964.42 MB/s +Search_Easy1_CachedRE2/512K 10000 177661 ns/op 2951.06 MB/s +Search_Easy1_CachedRE2/1M 5000 356783 ns/op 2938.97 MB/s +Search_Easy1_CachedRE2/2M 2000 721013 ns/op 2908.62 MB/s +Search_Easy1_CachedRE2/4M 1000 1526313 ns/op 2748.00 MB/s +Search_Easy1_CachedRE2/8M 500 3085670 ns/op 2718.57 MB/s +Search_Easy1_CachedRE2/16M 200 6156380 ns/op 2725.18 MB/s +Search_Medium_CachedPCRE/8 10000000 132 ns/op 60.24 MB/s +Search_Medium_CachedPCRE/16 10000000 161 ns/op 99.22 MB/s +Search_Medium_CachedPCRE/32 5000000 218 ns/op 146.72 MB/s +Search_Medium_CachedPCRE/64 5000000 339 ns/op 188.54 MB/s +Search_Medium_CachedPCRE/128 5000000 433 ns/op 295.45 MB/s +Search_Medium_CachedPCRE/256 200000 9074 ns/op 28.21 MB/s +Search_Medium_CachedPCRE/512 50000 21580 ns/op 23.73 MB/s +Search_Medium_CachedPCRE/1K 50000 45469 ns/op 22.52 MB/s +Search_Medium_CachedPCRE/2K 20000 62384 ns/op 32.83 MB/s +Search_Medium_CachedPCRE/4K 10000 153718 ns/op 26.65 MB/s +Search_Medium_CachedPCRE/8K 5000 332814 ns/op 24.61 MB/s +Search_Medium_CachedPCRE/16K 2000 678531 ns/op 24.15 MB/s +Search_Medium_CachedPCRE/32K 1000 1356201 ns/op 24.16 MB/s +Search_Medium_CachedPCRE/64K 500 2708792 ns/op 
24.19 MB/s +Search_Medium_CachedPCRE/128K 200 5412745 ns/op 24.22 MB/s +Search_Medium_CachedPCRE/256K 100 10830430 ns/op 24.20 MB/s +Search_Medium_CachedRE2/8 5000000 326 ns/op 24.47 MB/s +Search_Medium_CachedRE2/16 5000000 363 ns/op 43.98 MB/s +Search_Medium_CachedRE2/32 5000000 412 ns/op 77.57 MB/s +Search_Medium_CachedRE2/64 2000000 506 ns/op 126.31 MB/s +Search_Medium_CachedRE2/128 2000000 715 ns/op 178.94 MB/s +Search_Medium_CachedRE2/256 1000000 1119 ns/op 228.65 MB/s +Search_Medium_CachedRE2/512 1000000 1928 ns/op 265.47 MB/s +Search_Medium_CachedRE2/1K 500000 3546 ns/op 288.75 MB/s +Search_Medium_CachedRE2/2K 200000 6782 ns/op 301.97 MB/s +Search_Medium_CachedRE2/4K 100000 13257 ns/op 308.95 MB/s +Search_Medium_CachedRE2/8K 50000 26197 ns/op 312.70 MB/s +Search_Medium_CachedRE2/16K 20000 52081 ns/op 314.58 MB/s +Search_Medium_CachedRE2/32K 10000 103926 ns/op 315.30 MB/s +Search_Medium_CachedRE2/64K 5000 207484 ns/op 315.86 MB/s +Search_Medium_CachedRE2/128K 5000 414545 ns/op 316.18 MB/s +Search_Medium_CachedRE2/256K 2000 828791 ns/op 316.30 MB/s +Search_Medium_CachedRE2/512K 1000 1657160 ns/op 316.38 MB/s +Search_Medium_CachedRE2/1M 500 3314254 ns/op 316.38 MB/s +Search_Medium_CachedRE2/2M 200 6636905 ns/op 315.98 MB/s +Search_Medium_CachedRE2/4M 100 13339080 ns/op 314.44 MB/s +Search_Medium_CachedRE2/8M 50 26718900 ns/op 313.96 MB/s +Search_Medium_CachedRE2/16M 20 53442000 ns/op 313.93 MB/s +Search_Hard_CachedPCRE/8 10000000 132 ns/op 60.21 MB/s +Search_Hard_CachedPCRE/16 10000000 161 ns/op 99.25 MB/s +Search_Hard_CachedPCRE/32 5000000 218 ns/op 146.67 MB/s +Search_Hard_CachedPCRE/64 5000000 339 ns/op 188.62 MB/s +Search_Hard_CachedPCRE/128 5000000 433 ns/op 295.34 MB/s +Search_Hard_CachedPCRE/256 2000 573612 ns/op 0.45 MB/s +Search_Hard_CachedPCRE/512 500 2344764 ns/op 0.22 MB/s +Search_Hard_CachedPCRE/1K 200 9311170 ns/op 0.11 MB/s +Search_Hard_CachedPCRE/2K 50 34066500 ns/op 0.06 MB/s +Search_Hard_CachedPCRE/4K 10 146643800 ns/op 0.03 MB/s 
+Search_Hard_CachedRE2/8 5000000 333 ns/op 23.98 MB/s +Search_Hard_CachedRE2/16 5000000 358 ns/op 44.62 MB/s +Search_Hard_CachedRE2/32 5000000 408 ns/op 78.26 MB/s +Search_Hard_CachedRE2/64 2000000 509 ns/op 125.53 MB/s +Search_Hard_CachedRE2/128 2000000 717 ns/op 178.52 MB/s +Search_Hard_CachedRE2/256 1000000 1125 ns/op 227.48 MB/s +Search_Hard_CachedRE2/512 1000000 1929 ns/op 265.34 MB/s +Search_Hard_CachedRE2/1K 500000 3547 ns/op 288.63 MB/s +Search_Hard_CachedRE2/2K 200000 6782 ns/op 301.97 MB/s +Search_Hard_CachedRE2/4K 100000 13254 ns/op 309.02 MB/s +Search_Hard_CachedRE2/8K 50000 26193 ns/op 312.74 MB/s +Search_Hard_CachedRE2/16K 20000 52077 ns/op 314.61 MB/s +Search_Hard_CachedRE2/32K 10000 103944 ns/op 315.25 MB/s +Search_Hard_CachedRE2/64K 5000 207487 ns/op 315.86 MB/s +Search_Hard_CachedRE2/128K 5000 414578 ns/op 316.16 MB/s +Search_Hard_CachedRE2/256K 2000 828793 ns/op 316.30 MB/s +Search_Hard_CachedRE2/512K 1000 1657164 ns/op 316.38 MB/s +Search_Hard_CachedRE2/1M 500 3314178 ns/op 316.39 MB/s +Search_Hard_CachedRE2/2M 200 6636585 ns/op 316.00 MB/s +Search_Hard_CachedRE2/4M 100 13339310 ns/op 314.43 MB/s +Search_Hard_CachedRE2/8M 50 26720420 ns/op 313.94 MB/s +Search_Hard_CachedRE2/16M 20 53443250 ns/op 313.93 MB/s +Search_Parens_CachedPCRE/8 10000000 196 ns/op 40.66 MB/s +Search_Parens_CachedRE2/8 5000000 331 ns/op 24.15 MB/s +Search_Parens_CachedRE2/16 5000000 359 ns/op 44.46 MB/s +Search_Parens_CachedRE2/32 5000000 409 ns/op 78.05 MB/s +Search_Parens_CachedRE2/64 2000000 509 ns/op 125.63 MB/s +Search_Parens_CachedRE2/128 2000000 720 ns/op 177.69 MB/s +Search_Parens_CachedRE2/256 1000000 1127 ns/op 226.97 MB/s +Search_Parens_CachedRE2/512 1000000 1937 ns/op 264.32 MB/s +Search_Parens_CachedRE2/1K 500000 3547 ns/op 288.65 MB/s +Search_Parens_CachedRE2/2K 200000 6784 ns/op 301.88 MB/s +Search_Parens_CachedRE2/4K 100000 13253 ns/op 309.05 MB/s +Search_Parens_CachedRE2/8K 50000 26195 ns/op 312.73 MB/s +Search_Parens_CachedRE2/16K 20000 52085 ns/op 314.56 
MB/s +Search_Parens_CachedRE2/32K 10000 103948 ns/op 315.23 MB/s +Search_Parens_CachedRE2/64K 5000 207519 ns/op 315.81 MB/s +Search_Parens_CachedRE2/128K 5000 414605 ns/op 316.14 MB/s +Search_Parens_CachedRE2/256K 2000 828800 ns/op 316.29 MB/s +Search_Parens_CachedRE2/512K 1000 1657191 ns/op 316.37 MB/s +Search_Parens_CachedRE2/1M 500 3314252 ns/op 316.38 MB/s +Search_Parens_CachedRE2/2M 200 6637005 ns/op 315.98 MB/s +Search_Parens_CachedRE2/4M 100 13338840 ns/op 314.44 MB/s +Search_Parens_CachedRE2/8M 50 26718340 ns/op 313.96 MB/s +Search_Parens_CachedRE2/16M 20 53436450 ns/op 313.97 MB/s +Search_BigFixed_CachedPCRE/8 5000000 242 ns/op 32.94 MB/s +Search_BigFixed_CachedPCRE/16 5000000 301 ns/op 53.07 MB/s +Search_BigFixed_CachedPCRE/32 5000000 418 ns/op 76.50 MB/s +Search_BigFixed_CachedPCRE/64 2000000 652 ns/op 98.14 MB/s +Search_BigFixed_CachedPCRE/128 2000000 985 ns/op 129.90 MB/s +Search_BigFixed_CachedPCRE/256 1000000 1775 ns/op 144.21 MB/s +Search_BigFixed_CachedPCRE/512 500000 3342 ns/op 153.17 MB/s +Search_BigFixed_CachedPCRE/1K 200000 6476 ns/op 158.12 MB/s +Search_BigFixed_CachedPCRE/2K 100000 12746 ns/op 160.68 MB/s +Search_BigFixed_CachedPCRE/4K 50000 25285 ns/op 161.99 MB/s +Search_BigFixed_CachedPCRE/8K 20000 50367 ns/op 162.64 MB/s +Search_BigFixed_CachedPCRE/16K 10000 100611 ns/op 162.84 MB/s +Search_BigFixed_CachedPCRE/32K 5000 201128 ns/op 162.92 MB/s +Search_BigFixed_CachedRE2/8 10000000 130 ns/op 61.50 MB/s +Search_BigFixed_CachedRE2/16 5000000 373 ns/op 42.81 MB/s +Search_BigFixed_CachedRE2/32 5000000 406 ns/op 78.69 MB/s +Search_BigFixed_CachedRE2/64 5000000 485 ns/op 131.89 MB/s +Search_BigFixed_CachedRE2/128 2000000 630 ns/op 203.11 MB/s +Search_BigFixed_CachedRE2/256 2000000 949 ns/op 269.70 MB/s +Search_BigFixed_CachedRE2/512 1000000 1547 ns/op 330.96 MB/s +Search_BigFixed_CachedRE2/1K 500000 2765 ns/op 370.28 MB/s +Search_BigFixed_CachedRE2/2K 200000 5186 ns/op 394.84 MB/s +Search_BigFixed_CachedRE2/4K 100000 10045 ns/op 407.74 MB/s 
+Search_BigFixed_CachedRE2/8K 100000 19751 ns/op 414.75 MB/s +Search_BigFixed_CachedRE2/16K 50000 39158 ns/op 418.41 MB/s +Search_BigFixed_CachedRE2/32K 20000 78112 ns/op 419.50 MB/s +Search_BigFixed_CachedRE2/64K 10000 155876 ns/op 420.44 MB/s +Search_BigFixed_CachedRE2/128K 5000 311462 ns/op 420.83 MB/s +Search_BigFixed_CachedRE2/256K 2000 623684 ns/op 420.32 MB/s +Search_BigFixed_CachedRE2/512K 1000 1251098 ns/op 419.06 MB/s +Search_BigFixed_CachedRE2/1M 500 2517996 ns/op 416.43 MB/s +Search_Success_PCRE/8 1000000 1816 ns/op 4.40 MB/s +Search_Success_PCRE/16 1000000 1862 ns/op 8.59 MB/s +Search_Success_PCRE/32 1000000 1963 ns/op 16.30 MB/s +Search_Success_PCRE/64 500000 2143 ns/op 29.86 MB/s +Search_Success_PCRE/128 500000 2492 ns/op 51.35 MB/s +Search_Success_PCRE/256 500000 3226 ns/op 79.35 MB/s +Search_Success_PCRE/512 500000 4627 ns/op 110.65 MB/s +Search_Success_PCRE/1K 200000 7459 ns/op 137.28 MB/s +Search_Success_PCRE/2K 100000 13114 ns/op 156.16 MB/s +Search_Success_PCRE/4K 50000 24417 ns/op 167.75 MB/s +Search_Success_PCRE/8K 50000 47082 ns/op 173.99 MB/s +Search_Success_PCRE/16K 20000 92415 ns/op 177.29 MB/s +Search_Success_PCRE/32K 10000 183255 ns/op 178.81 MB/s +Search_Success_PCRE/64K 5000 364699 ns/op 179.70 MB/s +Search_Success_PCRE/128K 2000 728375 ns/op 179.95 MB/s +Search_Success_PCRE/256K 1000 1457928 ns/op 179.81 MB/s +Search_Success_PCRE/512K 500 2926398 ns/op 179.16 MB/s +Search_Success_PCRE/1M 200 5926725 ns/op 176.92 MB/s +Search_Success_PCRE/2M 100 12130250 ns/op 172.89 MB/s +Search_Success_PCRE/4M 50 25401120 ns/op 165.12 MB/s +Search_Success_PCRE/8M 20 56961850 ns/op 147.27 MB/s +Search_Success_PCRE/16M 10 134232100 ns/op 124.99 MB/s +Search_Success_RE2/8 200000 8299 ns/op 0.96 MB/s +Search_Success_RE2/16 50000 20306 ns/op 0.79 MB/s +Search_Success_RE2/32 50000 20336 ns/op 1.57 MB/s +Search_Success_RE2/64 50000 20557 ns/op 3.11 MB/s +Search_Success_RE2/128 50000 20586 ns/op 6.22 MB/s +Search_Success_RE2/256 50000 20882 ns/op 12.26 MB/s 
+Search_Success_RE2/512 50000 21673 ns/op 23.62 MB/s +Search_Success_RE2/1K 50000 23408 ns/op 43.75 MB/s +Search_Success_RE2/2K 50000 26992 ns/op 75.87 MB/s +Search_Success_RE2/4K 50000 33213 ns/op 123.33 MB/s +Search_Success_RE2/8K 50000 46189 ns/op 177.36 MB/s +Search_Success_RE2/16K 20000 72241 ns/op 226.79 MB/s +Search_Success_RE2/32K 10000 124254 ns/op 263.72 MB/s +Search_Success_RE2/64K 5000 228106 ns/op 287.30 MB/s +Search_Success_RE2/128K 5000 435538 ns/op 300.94 MB/s +Search_Success_RE2/256K 2000 852223 ns/op 307.60 MB/s +Search_Success_RE2/512K 1000 1690298 ns/op 310.17 MB/s +Search_Success_RE2/1M 500 3385618 ns/op 309.71 MB/s +Search_Success_RE2/2M 200 6919025 ns/op 303.10 MB/s +Search_Success_RE2/4M 100 14401900 ns/op 291.23 MB/s +Search_Success_RE2/8M 50 30840700 ns/op 272.00 MB/s +==BENCHMARK== c2 Fri Feb 26 15:45:38 PST 2010 +# Linux c2 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64 GNU/Linux +# g++ (Ubuntu 4.4.1-4ubuntu8) 4.4.1 +# Copyright (C) 2009 Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# +# fd9366132ce9+ tip +# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.15, not stripped + +Search_Easy0_CachedPCRE/8 10000000 134 ns/op 59.52 MB/s +Search_Easy0_CachedPCRE/16 10000000 161 ns/op 99.25 MB/s +Search_Easy0_CachedPCRE/32 10000000 218 ns/op 146.77 MB/s +Search_Easy0_CachedPCRE/64 5000000 331 ns/op 192.89 MB/s +Search_Easy0_CachedPCRE/128 5000000 432 ns/op 296.26 MB/s +Search_Easy0_CachedPCRE/256 1000000 1081 ns/op 236.79 MB/s +Search_Easy0_CachedPCRE/512 1000000 1741 ns/op 294.05 MB/s +Search_Easy0_CachedPCRE/1K 500000 3390 ns/op 302.01 MB/s +Search_Easy0_CachedPCRE/2K 500000 5686 ns/op 360.17 MB/s +Search_Easy0_CachedPCRE/4K 200000 10629 ns/op 385.34 MB/s +Search_Easy0_CachedPCRE/8K 100000 21787 ns/op 375.99 MB/s +Search_Easy0_CachedPCRE/16K 50000 42183 ns/op 388.40 MB/s +Search_Easy0_CachedPCRE/32K 20000 85149 ns/op 384.83 MB/s +Search_Easy0_CachedPCRE/64K 10000 169790 ns/op 385.98 MB/s +Search_Easy0_CachedPCRE/128K 5000 340958 ns/op 384.42 MB/s +Search_Easy0_CachedPCRE/256K 5000 680879 ns/op 385.01 MB/s +Search_Easy0_CachedPCRE/512K 2000 1364074 ns/op 384.35 MB/s +Search_Easy0_CachedPCRE/1M 1000 2728489 ns/op 384.31 MB/s +Search_Easy0_CachedPCRE/2M 500 5460158 ns/op 384.08 MB/s +Search_Easy0_CachedPCRE/4M 100 11069260 ns/op 378.91 MB/s +Search_Easy0_CachedPCRE/8M 100 22189670 ns/op 378.04 MB/s +Search_Easy0_CachedPCRE/16M 50 44364000 ns/op 378.17 MB/s +Search_Easy0_CachedRE2/8 5000000 317 ns/op 25.22 MB/s +Search_Easy0_CachedRE2/16 5000000 317 ns/op 50.43 MB/s +Search_Easy0_CachedRE2/32 5000000 331 ns/op 96.61 MB/s +Search_Easy0_CachedRE2/64 5000000 334 ns/op 191.62 MB/s +Search_Easy0_CachedRE2/128 5000000 377 ns/op 339.27 MB/s +Search_Easy0_CachedRE2/256 5000000 404 ns/op 632.62 MB/s +Search_Easy0_CachedRE2/512 5000000 483 ns/op 1058.96 MB/s +Search_Easy0_CachedRE2/1K 5000000 664 ns/op 1542.06 MB/s +Search_Easy0_CachedRE2/2K 1000000 1010 ns/op 2027.71 MB/s 
+Search_Easy0_CachedRE2/4K 1000000 1581 ns/op 2590.42 MB/s +Search_Easy0_CachedRE2/8K 1000000 2939 ns/op 2786.56 MB/s +Search_Easy0_CachedRE2/16K 500000 5439 ns/op 3011.88 MB/s +Search_Easy0_CachedRE2/32K 200000 11066 ns/op 2961.13 MB/s +Search_Easy0_CachedRE2/64K 100000 21875 ns/op 2995.87 MB/s +Search_Easy0_CachedRE2/128K 50000 44331 ns/op 2956.66 MB/s +Search_Easy0_CachedRE2/256K 20000 88335 ns/op 2967.61 MB/s +Search_Easy0_CachedRE2/512K 10000 177855 ns/op 2947.84 MB/s +Search_Easy0_CachedRE2/1M 5000 356896 ns/op 2938.04 MB/s +Search_Easy0_CachedRE2/2M 5000 716469 ns/op 2927.07 MB/s +Search_Easy0_CachedRE2/4M 1000 1532367 ns/op 2737.14 MB/s +Search_Easy0_CachedRE2/8M 500 3086890 ns/op 2717.49 MB/s +Search_Easy0_CachedRE2/16M 500 6157146 ns/op 2724.84 MB/s +Search_Easy1_CachedPCRE/8 20000000 133 ns/op 60.08 MB/s +Search_Easy1_CachedPCRE/16 10000000 161 ns/op 99.37 MB/s +Search_Easy1_CachedPCRE/32 10000000 218 ns/op 146.70 MB/s +Search_Easy1_CachedPCRE/64 5000000 331 ns/op 192.79 MB/s +Search_Easy1_CachedPCRE/128 5000000 649 ns/op 197.15 MB/s +Search_Easy1_CachedPCRE/256 2000000 935 ns/op 273.55 MB/s +Search_Easy1_CachedPCRE/512 1000000 1971 ns/op 259.76 MB/s +Search_Easy1_CachedPCRE/1K 500000 3421 ns/op 299.32 MB/s +Search_Easy1_CachedPCRE/2K 500000 6236 ns/op 328.40 MB/s +Search_Easy1_CachedPCRE/4K 200000 11135 ns/op 367.84 MB/s +Search_Easy1_CachedPCRE/8K 100000 22040 ns/op 371.68 MB/s +Search_Easy1_CachedPCRE/16K 50000 42415 ns/op 386.28 MB/s +Search_Easy1_CachedPCRE/32K 20000 85249 ns/op 384.38 MB/s +Search_Easy1_CachedPCRE/64K 10000 170306 ns/op 384.81 MB/s +Search_Easy1_CachedPCRE/128K 5000 342332 ns/op 382.88 MB/s +Search_Easy1_CachedPCRE/256K 5000 682556 ns/op 384.06 MB/s +Search_Easy1_CachedPCRE/512K 2000 1366952 ns/op 383.55 MB/s +Search_Easy1_CachedPCRE/1M 1000 2736532 ns/op 383.18 MB/s +Search_Easy1_CachedPCRE/2M 500 5477062 ns/op 382.90 MB/s +Search_Easy1_CachedPCRE/4M 100 11097300 ns/op 377.96 MB/s +Search_Easy1_CachedPCRE/8M 100 22254540 ns/op 
376.94 MB/s +Search_Easy1_CachedPCRE/16M 50 44510220 ns/op 376.93 MB/s +Search_Easy1_CachedRE2/8 5000000 317 ns/op 25.19 MB/s +Search_Easy1_CachedRE2/16 5000000 317 ns/op 50.42 MB/s +Search_Easy1_CachedRE2/32 5000000 332 ns/op 96.25 MB/s +Search_Easy1_CachedRE2/64 5000000 335 ns/op 190.94 MB/s +Search_Easy1_CachedRE2/128 5000000 376 ns/op 340.07 MB/s +Search_Easy1_CachedRE2/256 5000000 415 ns/op 615.50 MB/s +Search_Easy1_CachedRE2/512 5000000 485 ns/op 1054.85 MB/s +Search_Easy1_CachedRE2/1K 5000000 663 ns/op 1543.27 MB/s +Search_Easy1_CachedRE2/2K 1000000 1009 ns/op 2029.62 MB/s +Search_Easy1_CachedRE2/4K 1000000 1585 ns/op 2582.98 MB/s +Search_Easy1_CachedRE2/8K 1000000 2947 ns/op 2779.39 MB/s +Search_Easy1_CachedRE2/16K 500000 5474 ns/op 2992.97 MB/s +Search_Easy1_CachedRE2/32K 200000 11058 ns/op 2963.24 MB/s +Search_Easy1_CachedRE2/64K 100000 21872 ns/op 2996.27 MB/s +Search_Easy1_CachedRE2/128K 50000 44328 ns/op 2956.85 MB/s +Search_Easy1_CachedRE2/256K 20000 88325 ns/op 2967.95 MB/s +Search_Easy1_CachedRE2/512K 10000 177870 ns/op 2947.58 MB/s +Search_Easy1_CachedRE2/1M 5000 356912 ns/op 2937.91 MB/s +Search_Easy1_CachedRE2/2M 5000 716384 ns/op 2927.41 MB/s +Search_Easy1_CachedRE2/4M 1000 1532077 ns/op 2737.66 MB/s +Search_Easy1_CachedRE2/8M 500 3087256 ns/op 2717.17 MB/s +Search_Easy1_CachedRE2/16M 500 6163142 ns/op 2722.19 MB/s +Search_Medium_CachedPCRE/8 20000000 134 ns/op 59.46 MB/s +Search_Medium_CachedPCRE/16 10000000 162 ns/op 98.36 MB/s +Search_Medium_CachedPCRE/32 10000000 219 ns/op 145.73 MB/s +Search_Medium_CachedPCRE/64 5000000 340 ns/op 187.82 MB/s +Search_Medium_CachedPCRE/128 5000000 434 ns/op 294.39 MB/s +Search_Medium_CachedPCRE/256 200000 9077 ns/op 28.20 MB/s +Search_Medium_CachedPCRE/512 100000 21579 ns/op 23.73 MB/s +Search_Medium_CachedPCRE/1K 50000 42393 ns/op 24.15 MB/s +Search_Medium_CachedPCRE/2K 50000 62381 ns/op 32.83 MB/s +Search_Medium_CachedPCRE/4K 10000 153708 ns/op 26.65 MB/s +Search_Medium_CachedPCRE/8K 5000 332752 ns/op 24.62 
MB/s +Search_Medium_CachedPCRE/16K 5000 678258 ns/op 24.16 MB/s +Search_Medium_CachedPCRE/32K 2000 1355855 ns/op 24.17 MB/s +Search_Medium_CachedPCRE/64K 1000 2707494 ns/op 24.21 MB/s +Search_Medium_CachedPCRE/128K 500 5410032 ns/op 24.23 MB/s +Search_Medium_CachedPCRE/256K 100 10825800 ns/op 24.21 MB/s +Search_Medium_CachedRE2/8 5000000 337 ns/op 23.70 MB/s +Search_Medium_CachedRE2/16 5000000 363 ns/op 44.02 MB/s +Search_Medium_CachedRE2/32 5000000 414 ns/op 77.23 MB/s +Search_Medium_CachedRE2/64 5000000 510 ns/op 125.47 MB/s +Search_Medium_CachedRE2/128 5000000 724 ns/op 176.68 MB/s +Search_Medium_CachedRE2/256 1000000 1124 ns/op 227.62 MB/s +Search_Medium_CachedRE2/512 1000000 1933 ns/op 264.81 MB/s +Search_Medium_CachedRE2/1K 500000 3551 ns/op 288.35 MB/s +Search_Medium_CachedRE2/2K 500000 6786 ns/op 301.77 MB/s +Search_Medium_CachedRE2/4K 200000 13256 ns/op 308.97 MB/s +Search_Medium_CachedRE2/8K 100000 26198 ns/op 312.69 MB/s +Search_Medium_CachedRE2/16K 50000 52085 ns/op 314.56 MB/s +Search_Medium_CachedRE2/32K 10000 103940 ns/op 315.26 MB/s +Search_Medium_CachedRE2/64K 10000 207489 ns/op 315.85 MB/s +Search_Medium_CachedRE2/128K 5000 414571 ns/op 316.16 MB/s +Search_Medium_CachedRE2/256K 2000 828757 ns/op 316.31 MB/s +Search_Medium_CachedRE2/512K 1000 1657123 ns/op 316.38 MB/s +Search_Medium_CachedRE2/1M 500 3314204 ns/op 316.39 MB/s +Search_Medium_CachedRE2/2M 500 6633334 ns/op 316.15 MB/s +Search_Medium_CachedRE2/4M 100 13342170 ns/op 314.36 MB/s +Search_Medium_CachedRE2/8M 100 26718850 ns/op 313.96 MB/s +Search_Medium_CachedRE2/16M 50 53433900 ns/op 313.98 MB/s +Search_Hard_CachedPCRE/8 20000000 134 ns/op 59.43 MB/s +Search_Hard_CachedPCRE/16 10000000 162 ns/op 98.34 MB/s +Search_Hard_CachedPCRE/32 10000000 219 ns/op 145.72 MB/s +Search_Hard_CachedPCRE/64 5000000 340 ns/op 187.84 MB/s +Search_Hard_CachedPCRE/128 5000000 434 ns/op 294.64 MB/s +Search_Hard_CachedPCRE/256 5000 572444 ns/op 0.45 MB/s +Search_Hard_CachedPCRE/512 1000 2345148 ns/op 0.22 MB/s 
+Search_Hard_CachedPCRE/1K 200 9327675 ns/op 0.11 MB/s +Search_Hard_CachedPCRE/2K 50 34095380 ns/op 0.06 MB/s +Search_Hard_CachedPCRE/4K 10 146669300 ns/op 0.03 MB/s +Search_Hard_CachedRE2/8 5000000 338 ns/op 23.62 MB/s +Search_Hard_CachedRE2/16 5000000 366 ns/op 43.65 MB/s +Search_Hard_CachedRE2/32 5000000 422 ns/op 75.77 MB/s +Search_Hard_CachedRE2/64 5000000 513 ns/op 124.66 MB/s +Search_Hard_CachedRE2/128 5000000 721 ns/op 177.41 MB/s +Search_Hard_CachedRE2/256 1000000 1125 ns/op 227.38 MB/s +Search_Hard_CachedRE2/512 1000000 1933 ns/op 264.75 MB/s +Search_Hard_CachedRE2/1K 500000 3551 ns/op 288.31 MB/s +Search_Hard_CachedRE2/2K 500000 6787 ns/op 301.74 MB/s +Search_Hard_CachedRE2/4K 200000 13262 ns/op 308.84 MB/s +Search_Hard_CachedRE2/8K 100000 26203 ns/op 312.63 MB/s +Search_Hard_CachedRE2/16K 50000 52085 ns/op 314.56 MB/s +Search_Hard_CachedRE2/32K 10000 103943 ns/op 315.25 MB/s +Search_Hard_CachedRE2/64K 10000 207492 ns/op 315.85 MB/s +Search_Hard_CachedRE2/128K 5000 414602 ns/op 316.14 MB/s +Search_Hard_CachedRE2/256K 2000 828771 ns/op 316.30 MB/s +Search_Hard_CachedRE2/512K 1000 1657138 ns/op 316.38 MB/s +==BENCHMARK== c2 Fri Feb 26 15:52:36 PST 2010 +# Linux c2 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64 GNU/Linux +# g++ (Ubuntu 4.4.1-4ubuntu8) 4.4.1 +# Copyright (C) 2009 Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# +# fd9366132ce9+ tip +# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.15, not stripped + +Search_Easy0_CachedPCRE/8 10000000 132 ns/op 60.19 MB/s +Search_Easy0_CachedPCRE/16 10000000 160 ns/op 99.51 MB/s +Search_Easy0_CachedPCRE/32 10000000 217 ns/op 147.30 MB/s +Search_Easy0_CachedPCRE/64 5000000 330 ns/op 193.41 MB/s +Search_Easy0_CachedPCRE/128 5000000 431 ns/op 296.81 MB/s +Search_Easy0_CachedPCRE/256 1000000 1082 ns/op 236.52 MB/s +Search_Easy0_CachedPCRE/512 1000000 1742 ns/op 293.84 MB/s +Search_Easy0_CachedPCRE/1K 500000 3391 ns/op 301.97 MB/s +Search_Easy0_CachedPCRE/2K 500000 5680 ns/op 360.53 MB/s +Search_Easy0_CachedPCRE/4K 200000 10620 ns/op 385.66 MB/s +Search_Easy0_CachedPCRE/8K 100000 21760 ns/op 376.47 MB/s +Search_Easy0_CachedPCRE/16K 50000 42151 ns/op 388.70 MB/s +Search_Easy0_CachedPCRE/32K 20000 85091 ns/op 385.09 MB/s +Search_Easy0_CachedPCRE/64K 10000 169811 ns/op 385.93 MB/s +Search_Easy0_CachedPCRE/128K 5000 340974 ns/op 384.40 MB/s +Search_Easy0_CachedPCRE/256K 5000 680322 ns/op 385.32 MB/s +Search_Easy0_CachedPCRE/512K 2000 1362887 ns/op 384.69 MB/s +Search_Easy0_CachedPCRE/1M 1000 2726335 ns/op 384.61 MB/s +Search_Easy0_CachedPCRE/2M 500 5466910 ns/op 383.61 MB/s +Search_Easy0_CachedPCRE/4M 100 11043660 ns/op 379.79 MB/s +Search_Easy0_CachedPCRE/8M 100 22181360 ns/op 378.18 MB/s +Search_Easy0_CachedPCRE/16M 50 44332240 ns/op 378.44 MB/s +Search_Easy0_CachedRE2/8 5000000 319 ns/op 25.07 MB/s +Search_Easy0_CachedRE2/16 5000000 320 ns/op 49.98 MB/s +Search_Easy0_CachedRE2/32 5000000 334 ns/op 95.62 MB/s +Search_Easy0_CachedRE2/64 5000000 336 ns/op 190.00 MB/s +Search_Easy0_CachedRE2/128 5000000 367 ns/op 348.26 MB/s +Search_Easy0_CachedRE2/256 5000000 392 ns/op 651.98 MB/s +Search_Easy0_CachedRE2/512 5000000 472 ns/op 1084.04 MB/s +Search_Easy0_CachedRE2/1K 5000000 652 ns/op 1569.77 MB/s +Search_Easy0_CachedRE2/2K 1000000 1002 ns/op 2041.98 MB/s 
+Search_Easy0_CachedRE2/4K 1000000 1588 ns/op 2579.04 MB/s +Search_Easy0_CachedRE2/8K 1000000 2919 ns/op 2805.50 MB/s +Search_Easy0_CachedRE2/16K 500000 5422 ns/op 3021.44 MB/s +Search_Easy0_CachedRE2/32K 200000 11029 ns/op 2970.83 MB/s +Search_Easy0_CachedRE2/64K 100000 21880 ns/op 2995.23 MB/s +Search_Easy0_CachedRE2/128K 50000 44348 ns/op 2955.51 MB/s +Search_Easy0_CachedRE2/256K 20000 88537 ns/op 2960.84 MB/s +Search_Easy0_CachedRE2/512K 10000 178161 ns/op 2942.77 MB/s +Search_Easy0_CachedRE2/1M 5000 357466 ns/op 2933.35 MB/s +Search_Easy0_CachedRE2/2M 5000 726215 ns/op 2887.78 MB/s +Search_Easy0_CachedRE2/4M 1000 1518925 ns/op 2761.36 MB/s +Search_Easy0_CachedRE2/8M 500 3093556 ns/op 2711.64 MB/s +Search_Easy0_CachedRE2/16M 500 6163216 ns/op 2722.15 MB/s +Search_Easy1_CachedPCRE/8 20000000 133 ns/op 60.01 MB/s +Search_Easy1_CachedPCRE/16 10000000 161 ns/op 99.01 MB/s +Search_Easy1_CachedPCRE/32 10000000 218 ns/op 146.72 MB/s +Search_Easy1_CachedPCRE/64 5000000 331 ns/op 192.81 MB/s +Search_Easy1_CachedPCRE/128 5000000 651 ns/op 196.47 MB/s +Search_Easy1_CachedPCRE/256 2000000 937 ns/op 273.13 MB/s +Search_Easy1_CachedPCRE/512 1000000 1971 ns/op 259.75 MB/s +Search_Easy1_CachedPCRE/1K 500000 3419 ns/op 299.44 MB/s +Search_Easy1_CachedPCRE/2K 500000 6237 ns/op 328.32 MB/s +Search_Easy1_CachedPCRE/4K 200000 11126 ns/op 368.14 MB/s +Search_Easy1_CachedPCRE/8K 100000 22025 ns/op 371.94 MB/s +Search_Easy1_CachedPCRE/16K 50000 42414 ns/op 386.28 MB/s +Search_Easy1_CachedPCRE/32K 20000 85208 ns/op 384.56 MB/s +Search_Easy1_CachedPCRE/64K 10000 170269 ns/op 384.90 MB/s +Search_Easy1_CachedPCRE/128K 5000 342014 ns/op 383.24 MB/s +Search_Easy1_CachedPCRE/256K 5000 682258 ns/op 384.23 MB/s +Search_Easy1_CachedPCRE/512K 2000 1366582 ns/op 383.65 MB/s +Search_Easy1_CachedPCRE/1M 1000 2735046 ns/op 383.39 MB/s +Search_Easy1_CachedPCRE/2M 500 5591430 ns/op 375.07 MB/s +Search_Easy1_CachedPCRE/4M 100 11077680 ns/op 378.63 MB/s +Search_Easy1_CachedPCRE/8M 100 22246570 ns/op 
377.07 MB/s +Search_Easy1_CachedPCRE/16M 50 44470360 ns/op 377.27 MB/s +Search_Easy1_CachedRE2/8 5000000 319 ns/op 25.02 MB/s +Search_Easy1_CachedRE2/16 5000000 326 ns/op 49.00 MB/s +Search_Easy1_CachedRE2/32 5000000 334 ns/op 95.72 MB/s +Search_Easy1_CachedRE2/64 5000000 337 ns/op 189.56 MB/s +Search_Easy1_CachedRE2/128 5000000 365 ns/op 349.81 MB/s +Search_Easy1_CachedRE2/256 5000000 399 ns/op 640.98 MB/s +Search_Easy1_CachedRE2/512 5000000 469 ns/op 1089.44 MB/s +Search_Easy1_CachedRE2/1K 5000000 652 ns/op 1569.62 MB/s +Search_Easy1_CachedRE2/2K 1000000 1004 ns/op 2038.28 MB/s +Search_Easy1_CachedRE2/4K 1000000 1584 ns/op 2584.99 MB/s +Search_Easy1_CachedRE2/8K 1000000 2919 ns/op 2806.40 MB/s +Search_Easy1_CachedRE2/16K 500000 5451 ns/op 3005.49 MB/s +Search_Easy1_CachedRE2/32K 200000 10985 ns/op 2982.94 MB/s +Search_Easy1_CachedRE2/64K 100000 21869 ns/op 2996.70 MB/s +Search_Easy1_CachedRE2/128K 50000 44326 ns/op 2956.95 MB/s +Search_Easy1_CachedRE2/256K 20000 88517 ns/op 2961.50 MB/s +Search_Easy1_CachedRE2/512K 10000 178161 ns/op 2942.76 MB/s +Search_Easy1_CachedRE2/1M 5000 357524 ns/op 2932.88 MB/s +Search_Easy1_CachedRE2/2M 5000 726271 ns/op 2887.56 MB/s +Search_Easy1_CachedRE2/4M 1000 1519940 ns/op 2759.52 MB/s +Search_Easy1_CachedRE2/8M 500 3095036 ns/op 2710.34 MB/s +Search_Easy1_CachedRE2/16M 500 6165230 ns/op 2721.26 MB/s +Search_Medium_CachedPCRE/8 20000000 134 ns/op 59.33 MB/s +Search_Medium_CachedPCRE/16 10000000 162 ns/op 98.23 MB/s +Search_Medium_CachedPCRE/32 10000000 219 ns/op 145.68 MB/s +Search_Medium_CachedPCRE/64 5000000 340 ns/op 187.74 MB/s +Search_Medium_CachedPCRE/128 5000000 434 ns/op 294.36 MB/s +Search_Medium_CachedPCRE/256 200000 9116 ns/op 28.08 MB/s +Search_Medium_CachedPCRE/512 100000 21829 ns/op 23.45 MB/s +Search_Medium_CachedPCRE/1K 50000 42878 ns/op 23.88 MB/s +Search_Medium_CachedPCRE/2K 50000 62528 ns/op 32.75 MB/s +Search_Medium_CachedPCRE/4K 10000 153909 ns/op 26.61 MB/s +Search_Medium_CachedPCRE/8K 5000 333099 ns/op 24.59 
MB/s +Search_Medium_CachedPCRE/16K 5000 678554 ns/op 24.15 MB/s +Search_Medium_CachedPCRE/32K 2000 1354963 ns/op 24.18 MB/s +Search_Medium_CachedPCRE/64K 1000 2705485 ns/op 24.22 MB/s +Search_Medium_CachedPCRE/128K 500 5407590 ns/op 24.24 MB/s +Search_Medium_CachedPCRE/256K 100 10817570 ns/op 24.23 MB/s +Search_Medium_CachedRE2/8 5000000 339 ns/op 23.55 MB/s +Search_Medium_CachedRE2/16 5000000 364 ns/op 43.85 MB/s +Search_Medium_CachedRE2/32 5000000 417 ns/op 76.70 MB/s +Search_Medium_CachedRE2/64 5000000 515 ns/op 124.27 MB/s +Search_Medium_CachedRE2/128 5000000 723 ns/op 176.82 MB/s +Search_Medium_CachedRE2/256 1000000 1127 ns/op 227.13 MB/s +Search_Medium_CachedRE2/512 1000000 1935 ns/op 264.52 MB/s +Search_Medium_CachedRE2/1K 500000 3553 ns/op 288.18 MB/s +Search_Medium_CachedRE2/2K 500000 6794 ns/op 301.41 MB/s +Search_Medium_CachedRE2/4K 200000 13257 ns/op 308.96 MB/s +Search_Medium_CachedRE2/8K 100000 26198 ns/op 312.69 MB/s +Search_Medium_CachedRE2/16K 50000 52083 ns/op 314.57 MB/s +Search_Medium_CachedRE2/32K 10000 103951 ns/op 315.22 MB/s +Search_Medium_CachedRE2/64K 10000 207486 ns/op 315.86 MB/s +Search_Medium_CachedRE2/128K 5000 414561 ns/op 316.17 MB/s +Search_Medium_CachedRE2/256K 2000 828728 ns/op 316.32 MB/s +Search_Medium_CachedRE2/512K 1000 1657039 ns/op 316.40 MB/s +Search_Medium_CachedRE2/1M 500 3314040 ns/op 316.40 MB/s +Search_Medium_CachedRE2/2M 500 6637874 ns/op 315.94 MB/s +Search_Medium_CachedRE2/4M 100 13332420 ns/op 314.59 MB/s +Search_Medium_CachedRE2/8M 100 26715300 ns/op 314.00 MB/s +Search_Medium_CachedRE2/16M 50 53430940 ns/op 314.00 MB/s +Search_Hard_CachedPCRE/8 20000000 134 ns/op 59.35 MB/s +Search_Hard_CachedPCRE/16 10000000 162 ns/op 98.21 MB/s +Search_Hard_CachedPCRE/32 10000000 219 ns/op 145.65 MB/s +Search_Hard_CachedPCRE/64 5000000 340 ns/op 187.74 MB/s +Search_Hard_CachedPCRE/128 5000000 434 ns/op 294.33 MB/s +Search_Hard_CachedPCRE/256 5000 572641 ns/op 0.45 MB/s +Search_Hard_CachedPCRE/512 1000 2348430 ns/op 0.22 MB/s 
+Search_Hard_CachedPCRE/1K 200 9314740 ns/op 0.11 MB/s +Search_Hard_CachedPCRE/2K 50 34077360 ns/op 0.06 MB/s +Search_Hard_CachedPCRE/4K 10 146685100 ns/op 0.03 MB/s +Search_Hard_CachedRE2/8 5000000 339 ns/op 23.56 MB/s +Search_Hard_CachedRE2/16 5000000 364 ns/op 43.87 MB/s +Search_Hard_CachedRE2/32 5000000 416 ns/op 76.91 MB/s +Search_Hard_CachedRE2/64 5000000 514 ns/op 124.39 MB/s +Search_Hard_CachedRE2/128 5000000 723 ns/op 176.97 MB/s +Search_Hard_CachedRE2/256 1000000 1126 ns/op 227.29 MB/s +Search_Hard_CachedRE2/512 1000000 1935 ns/op 264.58 MB/s +Search_Hard_CachedRE2/1K 500000 3552 ns/op 288.24 MB/s +Search_Hard_CachedRE2/2K 500000 6787 ns/op 301.73 MB/s +Search_Hard_CachedRE2/4K 200000 13258 ns/op 308.93 MB/s +Search_Hard_CachedRE2/8K 100000 26198 ns/op 312.69 MB/s +Search_Hard_CachedRE2/16K 50000 52078 ns/op 314.60 MB/s +Search_Hard_CachedRE2/32K 10000 103957 ns/op 315.21 MB/s +Search_Hard_CachedRE2/64K 10000 207490 ns/op 315.85 MB/s +Search_Hard_CachedRE2/128K 5000 414573 ns/op 316.16 MB/s +Search_Hard_CachedRE2/256K 2000 828748 ns/op 316.31 MB/s +Search_Hard_CachedRE2/512K 1000 1657141 ns/op 316.38 MB/s +Search_Hard_CachedRE2/1M 500 3314048 ns/op 316.40 MB/s +Search_Hard_CachedRE2/2M 500 6637896 ns/op 315.94 MB/s +Search_Hard_CachedRE2/4M 100 13331710 ns/op 314.61 MB/s +Search_Hard_CachedRE2/8M 100 26716050 ns/op 313.99 MB/s +Search_Hard_CachedRE2/16M 50 53428900 ns/op 314.01 MB/s +Search_Parens_CachedPCRE/8 10000000 197 ns/op 40.52 MB/s +Search_Parens_CachedRE2/8 5000000 339 ns/op 23.55 MB/s +Search_Parens_CachedRE2/16 5000000 365 ns/op 43.83 MB/s +Search_Parens_CachedRE2/32 5000000 416 ns/op 76.85 MB/s +Search_Parens_CachedRE2/64 5000000 518 ns/op 123.48 MB/s +Search_Parens_CachedRE2/128 5000000 732 ns/op 174.84 MB/s +Search_Parens_CachedRE2/256 1000000 1125 ns/op 227.38 MB/s +Search_Parens_CachedRE2/512 1000000 1935 ns/op 264.54 MB/s +Search_Parens_CachedRE2/1K 500000 3553 ns/op 288.18 MB/s +Search_Parens_CachedRE2/2K 500000 6787 ns/op 301.73 MB/s 
+Search_Parens_CachedRE2/4K 200000 13258 ns/op 308.93 MB/s +Search_Parens_CachedRE2/8K 100000 26198 ns/op 312.68 MB/s +Search_Parens_CachedRE2/16K 50000 52082 ns/op 314.58 MB/s +Search_Parens_CachedRE2/32K 10000 103942 ns/op 315.25 MB/s +Search_Parens_CachedRE2/64K 10000 207482 ns/op 315.86 MB/s +Search_Parens_CachedRE2/128K 5000 414565 ns/op 316.17 MB/s +Search_Parens_CachedRE2/256K 2000 828752 ns/op 316.31 MB/s +Search_Parens_CachedRE2/512K 1000 1657114 ns/op 316.39 MB/s +Search_Parens_CachedRE2/1M 500 3314130 ns/op 316.40 MB/s +Search_Parens_CachedRE2/2M 500 6637822 ns/op 315.94 MB/s +Search_Parens_CachedRE2/4M 100 13333110 ns/op 314.58 MB/s +Search_Parens_CachedRE2/8M 100 26718660 ns/op 313.96 MB/s +Search_Parens_CachedRE2/16M 50 53434420 ns/op 313.98 MB/s +Search_BigFixed_CachedPCRE/8 10000000 245 ns/op 32.58 MB/s +Search_BigFixed_CachedPCRE/16 5000000 302 ns/op 52.87 MB/s +Search_BigFixed_CachedPCRE/32 5000000 419 ns/op 76.34 MB/s +Search_BigFixed_CachedPCRE/64 5000000 657 ns/op 97.37 MB/s +Search_BigFixed_CachedPCRE/128 2000000 986 ns/op 129.75 MB/s +Search_BigFixed_CachedPCRE/256 1000000 1776 ns/op 144.11 MB/s +Search_BigFixed_CachedPCRE/512 500000 3343 ns/op 153.12 MB/s +Search_BigFixed_CachedPCRE/1K 500000 6477 ns/op 158.09 MB/s +Search_BigFixed_CachedPCRE/2K 200000 12745 ns/op 160.68 MB/s +Search_BigFixed_CachedPCRE/4K 100000 25282 ns/op 162.01 MB/s +Search_BigFixed_CachedPCRE/8K 50000 50360 ns/op 162.67 MB/s +Search_BigFixed_CachedPCRE/16K 10000 100599 ns/op 162.86 MB/s +Search_BigFixed_CachedPCRE/32K 10000 201002 ns/op 163.02 MB/s +Search_BigFixed_CachedRE2/8 20000000 130 ns/op 61.10 MB/s +Search_BigFixed_CachedRE2/16 5000000 375 ns/op 42.65 MB/s +Search_BigFixed_CachedRE2/32 5000000 412 ns/op 77.57 MB/s +Search_BigFixed_CachedRE2/64 5000000 488 ns/op 130.92 MB/s +Search_BigFixed_CachedRE2/128 5000000 635 ns/op 201.33 MB/s +Search_BigFixed_CachedRE2/256 2000000 946 ns/op 270.51 MB/s +Search_BigFixed_CachedRE2/512 1000000 1551 ns/op 329.90 MB/s 
+Search_BigFixed_CachedRE2/1K 1000000 2767 ns/op 369.95 MB/s +Search_BigFixed_CachedRE2/2K 500000 5192 ns/op 394.43 MB/s +Search_BigFixed_CachedRE2/4K 200000 10047 ns/op 407.68 MB/s +Search_BigFixed_CachedRE2/8K 100000 19753 ns/op 414.70 MB/s +Search_BigFixed_CachedRE2/16K 50000 39165 ns/op 418.33 MB/s +Search_BigFixed_CachedRE2/32K 20000 78111 ns/op 419.50 MB/s +Search_BigFixed_CachedRE2/64K 10000 155869 ns/op 420.45 MB/s +Search_BigFixed_CachedRE2/128K 5000 311467 ns/op 420.82 MB/s +Search_BigFixed_CachedRE2/256K 5000 622457 ns/op 421.14 MB/s +Search_BigFixed_CachedRE2/512K 2000 1247149 ns/op 420.39 MB/s +Search_BigFixed_CachedRE2/1M 1000 2502506 ns/op 419.01 MB/s +Search_Success_PCRE/8 1000000 1835 ns/op 4.36 MB/s +Search_Success_PCRE/16 1000000 1890 ns/op 8.46 MB/s +Search_Success_PCRE/32 1000000 1981 ns/op 16.15 MB/s +Search_Success_PCRE/64 1000000 2151 ns/op 29.75 MB/s +Search_Success_PCRE/128 1000000 2511 ns/op 50.96 MB/s +Search_Success_PCRE/256 500000 3229 ns/op 79.26 MB/s +Search_Success_PCRE/512 500000 4647 ns/op 110.16 MB/s +Search_Success_PCRE/1K 200000 7500 ns/op 136.52 MB/s +Search_Success_PCRE/2K 200000 13134 ns/op 155.92 MB/s +Search_Success_PCRE/4K 100000 24469 ns/op 167.39 MB/s +Search_Success_PCRE/8K 50000 47127 ns/op 173.83 MB/s +Search_Success_PCRE/16K 20000 92460 ns/op 177.20 MB/s +Search_Success_PCRE/32K 10000 183255 ns/op 178.81 MB/s +Search_Success_PCRE/64K 5000 364664 ns/op 179.72 MB/s +Search_Success_PCRE/128K 2000 728382 ns/op 179.95 MB/s +Search_Success_PCRE/256K 1000 1458071 ns/op 179.79 MB/s +Search_Success_PCRE/512K 500 2927234 ns/op 179.11 MB/s +Search_Success_PCRE/1M 500 5852934 ns/op 179.15 MB/s +Search_Success_PCRE/2M 200 11886620 ns/op 176.43 MB/s +Search_Success_PCRE/4M 100 24402710 ns/op 171.88 MB/s +Search_Success_PCRE/8M 50 50996680 ns/op 164.49 MB/s +Search_Success_PCRE/16M 10 135693000 ns/op 123.64 MB/s +Search_Success_RE2/16M 20 74552300 ns/op 225.04 MB/s +Search_Success_CachedPCRE/8 10000000 236 ns/op 33.88 MB/s 
+Search_Success_CachedPCRE/16 10000000 289 ns/op 55.21 MB/s +Search_Success_CachedPCRE/32 5000000 397 ns/op 80.58 MB/s +Search_Success_CachedPCRE/64 5000000 611 ns/op 104.58 MB/s +Search_Success_CachedPCRE/128 2000000 914 ns/op 139.91 MB/s +Search_Success_CachedPCRE/256 1000000 1622 ns/op 157.81 MB/s +Search_Success_CachedPCRE/512 500000 3037 ns/op 168.54 MB/s +Search_Success_CachedPCRE/1K 500000 5867 ns/op 174.51 MB/s +Search_Success_CachedPCRE/2K 200000 11529 ns/op 177.62 MB/s +Search_Success_CachedPCRE/4K 100000 22852 ns/op 179.23 MB/s +Search_Success_CachedPCRE/8K 50000 46293 ns/op 176.96 MB/s +Search_Success_CachedPCRE/16K 20000 90812 ns/op 180.42 MB/s +Search_Success_CachedPCRE/32K 10000 181517 ns/op 180.52 MB/s +Search_Success_CachedPCRE/64K 5000 362941 ns/op 180.57 MB/s +Search_Success_CachedPCRE/128K 2000 726534 ns/op 180.41 MB/s +Search_Success_CachedPCRE/256K 1000 1456177 ns/op 180.02 MB/s +Search_Success_CachedPCRE/512K 500 2925190 ns/op 179.23 MB/s +Search_Success_CachedPCRE/1M 500 5850306 ns/op 179.23 MB/s +Search_Success_CachedPCRE/2M 200 11879265 ns/op 176.54 MB/s +Search_Success_CachedPCRE/4M 100 24386990 ns/op 171.99 MB/s +Search_Success_CachedPCRE/8M 50 50981240 ns/op 164.54 MB/s +Search_Success_CachedPCRE/16M 10 135670500 ns/op 123.66 MB/s +Search_Success_CachedRE2/8 20000000 130 ns/op 61.27 MB/s +Search_Success_CachedRE2/16 5000000 376 ns/op 42.54 MB/s +Search_Success_CachedRE2/32 5000000 427 ns/op 74.93 MB/s +Search_Success_CachedRE2/64 5000000 526 ns/op 121.48 MB/s +Search_Success_CachedRE2/128 5000000 732 ns/op 174.78 MB/s +Search_Success_CachedRE2/256 1000000 1135 ns/op 225.54 MB/s +Search_Success_CachedRE2/512 1000000 1944 ns/op 263.31 MB/s +Search_Success_CachedRE2/1K 500000 3563 ns/op 287.37 MB/s +Search_Success_CachedRE2/2K 500000 6797 ns/op 301.31 MB/s +Search_Success_CachedRE2/4K 200000 13268 ns/op 308.71 MB/s +Search_Success_CachedRE2/8K 100000 26208 ns/op 312.57 MB/s +Search_Success_CachedRE2/16K 50000 52094 ns/op 314.50 MB/s 
+Search_Success_CachedRE2/32K 10000 104033 ns/op 314.98 MB/s +Search_Success_CachedRE2/64K 10000 207643 ns/op 315.62 MB/s +Search_Success_CachedRE2/128K 5000 415199 ns/op 315.68 MB/s +Search_Success_CachedRE2/256K 2000 831963 ns/op 315.09 MB/s +Search_Success_CachedRE2/512K 1000 1670044 ns/op 313.94 MB/s +Search_Success_CachedRE2/1M 500 3366302 ns/op 311.49 MB/s +Search_Success_CachedRE2/2M 200 6902225 ns/op 303.84 MB/s +Search_Success_CachedRE2/4M 100 14383930 ns/op 291.60 MB/s +Search_Success_CachedRE2/8M 50 30930940 ns/op 271.20 MB/s +Search_Success_CachedRE2/16M 20 74507750 ns/op 225.17 MB/s +Search_Success1_PCRE/8 1000000 1997 ns/op 4.00 MB/s +Search_Success1_PCRE/16 1000000 2038 ns/op 7.85 MB/s +Search_Success1_PCRE/32 1000000 2116 ns/op 15.12 MB/s +Search_Success1_PCRE/64 1000000 2280 ns/op 28.07 MB/s +Search_Success1_PCRE/128 1000000 2636 ns/op 48.56 MB/s +Search_Success1_PCRE/256 500000 3351 ns/op 76.39 MB/s +Search_Success1_PCRE/512 500000 4775 ns/op 107.22 MB/s +Search_Success1_PCRE/1K 200000 7626 ns/op 134.27 MB/s +Search_Success1_PCRE/2K 200000 13284 ns/op 154.17 MB/s +Search_Success1_PCRE/4K 100000 24592 ns/op 166.56 MB/s +Search_Success1_PCRE/8K 50000 47260 ns/op 173.34 MB/s +Search_Success1_PCRE/16K 20000 92599 ns/op 176.93 MB/s +Search_Success1_PCRE/32K 10000 183386 ns/op 178.68 MB/s +Search_Success1_PCRE/64K 5000 364838 ns/op 179.63 MB/s +Search_Success1_PCRE/128K 2000 728548 ns/op 179.91 MB/s +Search_Success1_PCRE/256K 1000 1458127 ns/op 179.78 MB/s +Search_Success1_PCRE/512K 500 2927280 ns/op 179.10 MB/s +Search_Success1_PCRE/1M 500 5853210 ns/op 179.15 MB/s +Search_Success1_PCRE/2M 200 11886770 ns/op 176.43 MB/s +Search_Success1_PCRE/4M 100 24400970 ns/op 171.89 MB/s +Search_Success1_PCRE/8M 50 50996900 ns/op 164.49 MB/s +Search_Success1_PCRE/16M 10 135681900 ns/op 123.65 MB/s +Search_Success1_RE2/8 100000 27018 ns/op 0.30 MB/s +Search_Success1_RE2/16 100000 27002 ns/op 0.59 MB/s +Search_Success1_RE2/32 100000 27343 ns/op 1.17 MB/s 
+Search_Success1_RE2/64 100000 27321 ns/op 2.34 MB/s +Search_Success1_RE2/128 100000 27794 ns/op 4.61 MB/s +Search_Success1_RE2/256 100000 27807 ns/op 9.21 MB/s +Search_Success1_RE2/512 100000 28664 ns/op 17.86 MB/s +Search_Success1_RE2/1K 50000 30116 ns/op 34.00 MB/s +Search_Success1_RE2/2K 50000 33360 ns/op 61.39 MB/s +Search_Success1_RE2/4K 50000 40118 ns/op 102.10 MB/s +Search_Success1_RE2/8K 50000 52901 ns/op 154.85 MB/s +Search_Success1_RE2/16K 20000 78892 ns/op 207.68 MB/s +Search_Success1_RE2/32K 10000 131013 ns/op 250.11 MB/s +Search_Success1_RE2/64K 10000 234601 ns/op 279.35 MB/s +Search_Success1_RE2/128K 5000 442246 ns/op 296.38 MB/s +Search_Success1_RE2/256K 2000 859127 ns/op 305.13 MB/s +Search_Success1_RE2/512K 1000 1697416 ns/op 308.87 MB/s +Search_Success1_RE2/1M 500 3394086 ns/op 308.94 MB/s +Search_Success1_RE2/2M 200 6933840 ns/op 302.45 MB/s +Search_Success1_RE2/4M 100 14439980 ns/op 290.46 MB/s +Search_Success1_RE2/8M 50 30989920 ns/op 270.69 MB/s +Search_Success1_RE2/16M 20 74560700 ns/op 225.01 MB/s +Search_Success1_Cached_PCRE/8 10000000 257 ns/op 31.10 MB/s +Search_Success1_Cached_PCRE/16 5000000 311 ns/op 51.42 MB/s +Search_Success1_Cached_PCRE/32 5000000 418 ns/op 76.48 MB/s +Search_Success1_Cached_PCRE/64 5000000 633 ns/op 101.03 MB/s +Search_Success1_Cached_PCRE/128 2000000 935 ns/op 136.78 MB/s +Search_Success1_Cached_PCRE/256 1000000 1643 ns/op 155.79 MB/s +Search_Success1_Cached_PCRE/512 500000 3058 ns/op 167.41 MB/s +Search_Success1_Cached_PCRE/1K 500000 5888 ns/op 173.89 MB/s +Search_Success1_Cached_PCRE/2K 200000 11550 ns/op 177.30 MB/s +Search_Success1_Cached_PCRE/4K 100000 22873 ns/op 179.07 MB/s +Search_Success1_Cached_PCRE/8K 50000 45522 ns/op 179.95 MB/s +Search_Success1_Cached_PCRE/16K 20000 90830 ns/op 180.38 MB/s +Search_Success1_Cached_PCRE/32K 10000 181547 ns/op 180.49 MB/s +Search_Success1_Cached_PCRE/64K 5000 362960 ns/op 180.56 MB/s +Search_Success1_Cached_PCRE/128K 2000 726612 ns/op 180.39 MB/s 
+Search_Success1_Cached_PCRE/256K 1000 1456167 ns/op 180.02 MB/s +Search_Success1_Cached_PCRE/512K 500 2924960 ns/op 179.25 MB/s +Search_Success1_Cached_PCRE/1M 500 5850124 ns/op 179.24 MB/s +Search_Success1_Cached_PCRE/2M 200 11879665 ns/op 176.53 MB/s +Search_Success1_Cached_PCRE/4M 100 24385800 ns/op 172.00 MB/s +Search_Success1_Cached_PCRE/8M 50 50977600 ns/op 164.55 MB/s +Search_Success1_Cached_PCRE/16M 10 135651600 ns/op 123.68 MB/s +Search_Success1_Cached_RE2/8 5000000 347 ns/op 23.00 MB/s +Search_Success1_Cached_RE2/16 5000000 373 ns/op 42.83 MB/s +Search_Success1_Cached_RE2/32 5000000 423 ns/op 75.55 MB/s +Search_Success1_Cached_RE2/64 5000000 523 ns/op 122.23 MB/s +Search_Success1_Cached_RE2/128 5000000 731 ns/op 174.97 MB/s +Search_Success1_Cached_RE2/256 1000000 1133 ns/op 225.85 MB/s +Search_Success1_Cached_RE2/512 1000000 1942 ns/op 263.56 MB/s +Search_Success1_Cached_RE2/1K 500000 3560 ns/op 287.62 MB/s +Search_Success1_Cached_RE2/2K 500000 6794 ns/op 301.40 MB/s +Search_Success1_Cached_RE2/4K 200000 13267 ns/op 308.73 MB/s +Search_Success1_Cached_RE2/8K 100000 26210 ns/op 312.54 MB/s +Search_Success1_Cached_RE2/16K 50000 52100 ns/op 314.47 MB/s +Search_Success1_Cached_RE2/32K 10000 104040 ns/op 314.95 MB/s +Search_Success1_Cached_RE2/64K 10000 207650 ns/op 315.61 MB/s +Search_Success1_Cached_RE2/128K 5000 415201 ns/op 315.68 MB/s +Search_Success1_Cached_RE2/256K 2000 831979 ns/op 315.08 MB/s +Search_Success1_Cached_RE2/512K 1000 1670071 ns/op 313.93 MB/s +Search_Success1_Cached_RE2/1M 500 3366256 ns/op 311.50 MB/s +Search_Success1_Cached_RE2/2M 200 6902045 ns/op 303.85 MB/s +Search_Success1_Cached_RE2/4M 100 14384020 ns/op 291.59 MB/s +Search_Success1_Cached_RE2/8M 50 30929640 ns/op 271.22 MB/s +Search_Success1_Cached_RE2/16M 20 74502350 ns/op 225.19 MB/s +Search_Digits_PCRE 500000 5023 ns/op +Search_Digits_RE2 100000 21787 ns/op +Parse_Digits_PCRE 500000 5015 ns/op +Parse_Digits_RE2 200000 9912 ns/op +Parse_CachedDigits_PCRE 5000000 448 ns/op 
+Parse_CachedDigits_RE2 10000000 266 ns/op +Parse_DigitDs_PCRE 500000 4128 ns/op +Parse_DigitDs_RE2 200000 9679 ns/op +Parse_CachedDigitDs_PCRE 5000000 459 ns/op +Parse_CachedDigitDs_RE2 10000000 265 ns/op +Parse_Split_PCRE 500000 3122 ns/op +Parse_Split_RE2 200000 11139 ns/op +Parse_CachedSplit_PCRE 5000000 333 ns/op +Parse_CachedSplit_RE2 10000000 170 ns/op +Parse_SplitHard_PCRE 500000 3113 ns/op +Parse_SplitHard_RE2 200000 14117 ns/op +Parse_CachedSplitHard_PCRE 5000000 328 ns/op +Parse_CachedSplitHard_RE2 1000000 2472 ns/op +Parse_CachedSplitBig1_PCRE 500 4502404 ns/op +Parse_CachedSplitBig1_RE2 5000 635120 ns/op +Parse_CachedSplitBig2_PCRE 5000 553267 ns/op +Parse_CachedSplitBig2_RE2 50 51601920 ns/op +BM_PCRE_Compile 500000 3798 ns/op +BM_RE2_Compile 200000 10923 ns/op +SearchPhone_CachedPCRE/8 1000000 1196 ns/op 6.68 MB/s +SearchPhone_CachedPCRE/16 1000000 1969 ns/op 8.12 MB/s +SearchPhone_CachedPCRE/32 500000 3511 ns/op 9.11 MB/s +SearchPhone_CachedPCRE/64 500000 6563 ns/op 9.75 MB/s +SearchPhone_CachedPCRE/128 200000 12796 ns/op 10.00 MB/s +SearchPhone_CachedPCRE/256 100000 25045 ns/op 10.22 MB/s +SearchPhone_CachedPCRE/512 50000 49381 ns/op 10.37 MB/s +SearchPhone_CachedPCRE/1K 20000 98166 ns/op 10.43 MB/s +SearchPhone_CachedPCRE/2K 10000 193434 ns/op 10.59 MB/s +SearchPhone_CachedPCRE/4K 5000 382921 ns/op 10.70 MB/s +SearchPhone_CachedPCRE/8K 2000 765255 ns/op 10.70 MB/s +SearchPhone_CachedPCRE/16K 1000 1524376 ns/op 10.75 MB/s +SearchPhone_CachedPCRE/32K 500 3046932 ns/op 10.75 MB/s +SearchPhone_CachedPCRE/64K 500 6088620 ns/op 10.76 MB/s +SearchPhone_CachedPCRE/128K 100 12170430 ns/op 10.77 MB/s +SearchPhone_CachedPCRE/256K 100 24329780 ns/op 10.77 MB/s +SearchPhone_CachedPCRE/512K 50 48663960 ns/op 10.77 MB/s +SearchPhone_CachedPCRE/1M 20 97341800 ns/op 10.77 MB/s +SearchPhone_CachedPCRE/2M 10 194512900 ns/op 10.78 MB/s +SearchPhone_CachedPCRE/4M 5 389369200 ns/op 10.77 MB/s +SearchPhone_CachedPCRE/8M 5 778852600 ns/op 10.77 MB/s 
+SearchPhone_CachedPCRE/16M 1 1558273000 ns/op 10.77 MB/s +SearchPhone_CachedRE2/8 2000000 884 ns/op 9.05 MB/s +SearchPhone_CachedRE2/16 2000000 913 ns/op 17.52 MB/s +SearchPhone_CachedRE2/32 2000000 965 ns/op 33.14 MB/s +SearchPhone_CachedRE2/64 1000000 1078 ns/op 59.32 MB/s +SearchPhone_CachedRE2/128 1000000 1269 ns/op 100.87 MB/s +SearchPhone_CachedRE2/256 1000000 1678 ns/op 152.50 MB/s +SearchPhone_CachedRE2/512 1000000 2482 ns/op 206.26 MB/s +SearchPhone_CachedRE2/1K 500000 4110 ns/op 249.11 MB/s +SearchPhone_CachedRE2/2K 500000 7347 ns/op 278.74 MB/s +SearchPhone_CachedRE2/4K 200000 13805 ns/op 296.70 MB/s +SearchPhone_CachedRE2/8K 100000 26763 ns/op 306.09 MB/s +SearchPhone_CachedRE2/16K 50000 52718 ns/op 310.78 MB/s +SearchPhone_CachedRE2/32K 10000 104770 ns/op 312.76 MB/s +SearchPhone_CachedRE2/64K 10000 208323 ns/op 314.59 MB/s +SearchPhone_CachedRE2/128K 5000 415437 ns/op 315.50 MB/s +SearchPhone_CachedRE2/256K 2000 829593 ns/op 315.99 MB/s +SearchPhone_CachedRE2/512K 1000 1657998 ns/op 316.22 MB/s +SearchPhone_CachedRE2/1M 500 3314964 ns/op 316.32 MB/s +SearchPhone_CachedRE2/2M 500 6639102 ns/op 315.88 MB/s +SearchPhone_CachedRE2/4M 100 13334810 ns/op 314.54 MB/s +SearchPhone_CachedRE2/8M 100 26721480 ns/op 313.93 MB/s +SearchPhone_CachedRE2/16M 50 53438280 ns/op 313.96 MB/s +EmptyPartialMatchPCRE 20000000 138 ns/op +EmptyPartialMatchRE2 5000000 314 ns/op +SimplePartialMatchPCRE 10000000 193 ns/op +SimplePartialMatchRE2 5000000 344 ns/op +HTTPPartialMatchPCRE 5000000 574 ns/op +HTTPPartialMatchRE2 5000000 621 ns/op +SmallHTTPPartialMatchPCRE 5000000 576 ns/op +SmallHTTPPartialMatchRE2 5000000 625 ns/op +DotMatchPCRE 5000000 414 ns/op +DotMatchRE2 5000000 670 ns/op +ASCIIMatchPCRE 5000000 395 ns/op +ASCIIMatchRE2 5000000 668 ns/op +==BENCHMARK== c2 Fri Feb 26 16:11:53 PST 2010 +# Linux c2 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64 GNU/Linux +# g++ (Ubuntu 4.4.1-4ubuntu8) 4.4.1 +# Copyright (C) 2009 Free Software Foundation, Inc. 
+# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# fd9366132ce9+ tip +# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.15, not stripped + +Search_Easy0_CachedPCRE/8 10000000 132 ns/op 60.22 MB/s +Search_Easy0_CachedPCRE/16 10000000 158 ns/op 100.63 MB/s +Search_Easy0_CachedPCRE/32 10000000 215 ns/op 148.22 MB/s +Search_Easy0_CachedPCRE/64 5000000 329 ns/op 194.12 MB/s +Search_Easy0_CachedPCRE/128 5000000 429 ns/op 297.74 MB/s +Search_Easy0_CachedPCRE/256 1000000 1081 ns/op 236.75 MB/s +Search_Easy0_CachedPCRE/512 1000000 1740 ns/op 294.12 MB/s +Search_Easy0_CachedPCRE/1K 500000 3390 ns/op 302.01 MB/s +Search_Easy0_CachedPCRE/2K 500000 5682 ns/op 360.42 MB/s +Search_Easy0_CachedPCRE/4K 200000 10631 ns/op 385.26 MB/s +Search_Easy0_CachedPCRE/8K 100000 21774 ns/op 376.22 MB/s +Search_Easy0_CachedPCRE/16K 50000 42171 ns/op 388.51 MB/s +Search_Easy0_CachedPCRE/32K 20000 85140 ns/op 384.87 MB/s +Search_Easy0_CachedPCRE/64K 10000 169833 ns/op 385.88 MB/s +Search_Easy0_CachedPCRE/128K 5000 341039 ns/op 384.33 MB/s +Search_Easy0_CachedPCRE/256K 5000 680619 ns/op 385.15 MB/s +Search_Easy0_CachedPCRE/512K 2000 1363481 ns/op 384.52 MB/s +Search_Easy0_CachedPCRE/1M 1000 2726584 ns/op 384.57 MB/s +Search_Easy0_CachedPCRE/2M 500 5460554 ns/op 384.05 MB/s +Search_Easy0_CachedPCRE/4M 100 11058850 ns/op 379.27 MB/s +Search_Easy0_CachedPCRE/8M 100 22178340 ns/op 378.23 MB/s +Search_Easy0_CachedPCRE/16M 50 44339640 ns/op 378.38 MB/s +Search_Easy0_CachedRE2/8 5000000 315 ns/op 25.32 MB/s +Search_Easy0_CachedRE2/16 5000000 317 ns/op 50.40 MB/s +Search_Easy0_CachedRE2/32 5000000 332 ns/op 96.12 MB/s +Search_Easy0_CachedRE2/64 5000000 333 ns/op 191.81 MB/s +Search_Easy0_CachedRE2/128 5000000 365 ns/op 349.86 MB/s +Search_Easy0_CachedRE2/256 5000000 395 ns/op 646.63 MB/s 
+Search_Easy0_CachedRE2/512 5000000 459 ns/op 1114.56 MB/s +Search_Easy0_CachedRE2/1K 5000000 634 ns/op 1613.10 MB/s +Search_Easy0_CachedRE2/2K 2000000 991 ns/op 2065.21 MB/s +Search_Easy0_CachedRE2/4K 1000000 1571 ns/op 2606.83 MB/s +Search_Easy0_CachedRE2/8K 1000000 2919 ns/op 2805.81 MB/s +Search_Easy0_CachedRE2/16K 500000 5406 ns/op 3030.65 MB/s +Search_Easy0_CachedRE2/32K 200000 11015 ns/op 2974.76 MB/s +Search_Easy0_CachedRE2/64K 100000 21911 ns/op 2990.89 MB/s +Search_Easy0_CachedRE2/128K 50000 44356 ns/op 2954.95 MB/s +Search_Easy0_CachedRE2/256K 20000 88544 ns/op 2960.58 MB/s +Search_Easy0_CachedRE2/512K 10000 178349 ns/op 2939.67 MB/s +Search_Easy0_CachedRE2/1M 5000 357706 ns/op 2931.39 MB/s +Search_Easy0_CachedRE2/2M 5000 721832 ns/op 2905.32 MB/s +Search_Easy0_CachedRE2/4M 1000 1529421 ns/op 2742.41 MB/s +Search_Easy0_CachedRE2/8M 500 3092246 ns/op 2712.79 MB/s +Search_Easy0_CachedRE2/16M 500 6166744 ns/op 2720.60 MB/s +Search_Easy1_CachedPCRE/8 20000000 130 ns/op 61.31 MB/s +Search_Easy1_CachedPCRE/16 10000000 158 ns/op 100.72 MB/s +Search_Easy1_CachedPCRE/32 10000000 215 ns/op 148.32 MB/s +Search_Easy1_CachedPCRE/64 5000000 329 ns/op 194.13 MB/s +Search_Easy1_CachedPCRE/128 5000000 647 ns/op 197.60 MB/s +Search_Easy1_CachedPCRE/256 2000000 934 ns/op 273.86 MB/s +Search_Easy1_CachedPCRE/512 1000000 1968 ns/op 260.14 MB/s +Search_Easy1_CachedPCRE/1K 500000 3418 ns/op 299.55 MB/s +Search_Easy1_CachedPCRE/2K 500000 6235 ns/op 328.42 MB/s +Search_Easy1_CachedPCRE/4K 200000 11128 ns/op 368.07 MB/s +Search_Easy1_CachedPCRE/8K 100000 22016 ns/op 372.09 MB/s +Search_Easy1_CachedPCRE/16K 50000 42398 ns/op 386.43 MB/s +Search_Easy1_CachedPCRE/32K 20000 85215 ns/op 384.53 MB/s +Search_Easy1_CachedPCRE/64K 10000 170243 ns/op 384.95 MB/s +Search_Easy1_CachedPCRE/128K 5000 342036 ns/op 383.21 MB/s +Search_Easy1_CachedPCRE/256K 5000 682271 ns/op 384.22 MB/s +Search_Easy1_CachedPCRE/512K 2000 1367025 ns/op 383.52 MB/s +Search_Easy1_CachedPCRE/1M 1000 2735481 ns/op 
383.32 MB/s +Search_Easy1_CachedPCRE/2M 500 5477128 ns/op 382.89 MB/s +==BENCHMARK== c2 Fri Feb 26 16:14:43 PST 2010 +# Linux c2 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64 GNU/Linux +# g++ (Ubuntu 4.4.1-4ubuntu8) 4.4.1 +# Copyright (C) 2009 Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# fd9366132ce9+ tip +# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.15, not stripped + +Search_Easy0_CachedPCRE/8 10000000 131 ns/op 60.99 MB/s +Search_Easy0_CachedPCRE/16 10000000 159 ns/op 100.35 MB/s +Search_Easy0_CachedPCRE/32 10000000 216 ns/op 147.95 MB/s +Search_Easy0_CachedPCRE/64 5000000 330 ns/op 193.87 MB/s +Search_Easy0_CachedPCRE/128 5000000 430 ns/op 297.35 MB/s +Search_Easy0_CachedPCRE/256 1000000 1080 ns/op 236.90 MB/s +Search_Easy0_CachedPCRE/512 1000000 1740 ns/op 294.24 MB/s +Search_Easy0_CachedPCRE/1K 500000 3390 ns/op 302.06 MB/s +Search_Easy0_CachedPCRE/2K 500000 5681 ns/op 360.48 MB/s +Search_Easy0_CachedPCRE/4K 200000 10630 ns/op 385.32 MB/s +Search_Easy0_CachedPCRE/8K 100000 21770 ns/op 376.29 MB/s +Search_Easy0_CachedPCRE/16K 50000 42147 ns/op 388.73 MB/s +Search_Easy0_CachedPCRE/32K 20000 85149 ns/op 384.83 MB/s +Search_Easy0_CachedPCRE/64K 10000 169788 ns/op 385.99 MB/s +Search_Easy0_CachedPCRE/128K 5000 340959 ns/op 384.42 MB/s +Search_Easy0_CachedPCRE/256K 5000 680407 ns/op 385.27 MB/s +Search_Easy0_CachedPCRE/512K 2000 1363245 ns/op 384.59 MB/s +Search_Easy0_CachedPCRE/1M 1000 2726837 ns/op 384.54 MB/s +Search_Easy0_CachedPCRE/2M 500 5462792 ns/op 383.90 MB/s +Search_Easy0_CachedPCRE/4M 100 11055460 ns/op 379.39 MB/s +Search_Easy0_CachedPCRE/8M 100 22174870 ns/op 378.29 MB/s +Search_Easy0_CachedPCRE/16M 50 44348440 ns/op 378.30 MB/s +Search_Easy0_CachedRE2/8 5000000 312 ns/op 25.61 MB/s 
+Search_Easy0_CachedRE2/16 5000000 311 ns/op 51.44 MB/s +Search_Easy0_CachedRE2/32 5000000 329 ns/op 97.01 MB/s +Search_Easy0_CachedRE2/64 5000000 331 ns/op 193.03 MB/s +Search_Easy0_CachedRE2/128 5000000 366 ns/op 349.43 MB/s +Search_Easy0_CachedRE2/256 5000000 382 ns/op 668.48 MB/s +Search_Easy0_CachedRE2/512 5000000 469 ns/op 1091.00 MB/s +Search_Easy0_CachedRE2/1K 5000000 650 ns/op 1574.64 MB/s +Search_Easy0_CachedRE2/2K 1000000 1002 ns/op 2043.38 MB/s +Search_Easy0_CachedRE2/4K 1000000 1577 ns/op 2596.54 MB/s +Search_Easy0_CachedRE2/8K 1000000 2911 ns/op 2813.46 MB/s +Search_Easy0_CachedRE2/16K 500000 5425 ns/op 3019.69 MB/s +Search_Easy0_CachedRE2/32K 200000 11026 ns/op 2971.78 MB/s +Search_Easy0_CachedRE2/64K 100000 21854 ns/op 2998.69 MB/s +Search_Easy0_CachedRE2/128K 50000 44382 ns/op 2953.23 MB/s +Search_Easy0_CachedRE2/256K 20000 88308 ns/op 2968.52 MB/s +Search_Easy0_CachedRE2/512K 10000 177645 ns/op 2951.32 MB/s +Search_Easy0_CachedRE2/1M 5000 356548 ns/op 2940.90 MB/s +Search_Easy0_CachedRE2/2M 5000 720036 ns/op 2912.56 MB/s +Search_Easy0_CachedRE2/4M 1000 1524214 ns/op 2751.78 MB/s +Search_Easy0_CachedRE2/8M 500 3083238 ns/op 2720.71 MB/s +Search_Easy0_CachedRE2/16M 500 6149012 ns/op 2728.44 MB/s +Search_Easy1_CachedPCRE/8 20000000 131 ns/op 60.89 MB/s +Search_Easy1_CachedPCRE/16 10000000 159 ns/op 100.17 MB/s +Search_Easy1_CachedPCRE/32 10000000 216 ns/op 147.73 MB/s +Search_Easy1_CachedPCRE/64 5000000 330 ns/op 193.67 MB/s +Search_Easy1_CachedPCRE/128 5000000 647 ns/op 197.80 MB/s +Search_Easy1_CachedPCRE/256 2000000 933 ns/op 274.19 MB/s +Search_Easy1_CachedPCRE/512 1000000 1963 ns/op 260.71 MB/s +Search_Easy1_CachedPCRE/1K 500000 3417 ns/op 299.65 MB/s +Search_Easy1_CachedPCRE/2K 500000 6237 ns/op 328.32 MB/s +Search_Easy1_CachedPCRE/4K 200000 11124 ns/op 368.19 MB/s +Search_Easy1_CachedPCRE/8K 100000 22020 ns/op 372.02 MB/s +Search_Easy1_CachedPCRE/16K 50000 42400 ns/op 386.41 MB/s +Search_Easy1_CachedPCRE/32K 20000 85208 ns/op 384.56 MB/s 
+Search_Easy1_CachedPCRE/64K 10000 170218 ns/op 385.01 MB/s +Search_Easy1_CachedPCRE/128K 5000 341992 ns/op 383.26 MB/s +Search_Easy1_CachedPCRE/256K 5000 682192 ns/op 384.27 MB/s +Search_Easy1_CachedPCRE/512K 2000 1366643 ns/op 383.63 MB/s +Search_Easy1_CachedPCRE/1M 1000 2735060 ns/op 383.38 MB/s +Search_Easy1_CachedPCRE/2M 500 5477962 ns/op 382.83 MB/s +Search_Easy1_CachedPCRE/4M 100 11090380 ns/op 378.19 MB/s +Search_Easy1_CachedPCRE/8M 100 22241800 ns/op 377.16 MB/s +Search_Easy1_CachedPCRE/16M 50 44479060 ns/op 377.19 MB/s +Search_Easy1_CachedRE2/8 5000000 314 ns/op 25.47 MB/s +Search_Easy1_CachedRE2/16 5000000 316 ns/op 50.60 MB/s +Search_Easy1_CachedRE2/32 5000000 332 ns/op 96.25 MB/s +Search_Easy1_CachedRE2/64 5000000 338 ns/op 189.05 MB/s +Search_Easy1_CachedRE2/128 5000000 367 ns/op 348.49 MB/s +Search_Easy1_CachedRE2/256 5000000 399 ns/op 641.03 MB/s +Search_Easy1_CachedRE2/512 5000000 468 ns/op 1092.75 MB/s +Search_Easy1_CachedRE2/1K 5000000 650 ns/op 1573.57 MB/s +Search_Easy1_CachedRE2/2K 1000000 1002 ns/op 2042.31 MB/s +Search_Easy1_CachedRE2/4K 1000000 1576 ns/op 2598.20 MB/s +Search_Easy1_CachedRE2/8K 1000000 2918 ns/op 2806.71 MB/s +Search_Easy1_CachedRE2/16K 500000 5447 ns/op 3007.80 MB/s +Search_Easy1_CachedRE2/32K 200000 10969 ns/op 2987.17 MB/s +Search_Easy1_CachedRE2/64K 100000 21865 ns/op 2997.18 MB/s +Search_Easy1_CachedRE2/128K 50000 44355 ns/op 2955.06 MB/s +Search_Easy1_CachedRE2/256K 20000 88281 ns/op 2969.41 MB/s +Search_Easy1_CachedRE2/512K 10000 177638 ns/op 2951.44 MB/s +Search_Easy1_CachedRE2/1M 5000 356550 ns/op 2940.89 MB/s +Search_Easy1_CachedRE2/2M 5000 720024 ns/op 2912.61 MB/s +Search_Easy1_CachedRE2/4M 1000 1524169 ns/op 2751.86 MB/s +Search_Easy1_CachedRE2/8M 500 3084670 ns/op 2719.45 MB/s +Search_Easy1_CachedRE2/16M 500 6151972 ns/op 2727.13 MB/s +Search_Medium_CachedPCRE/8 20000000 132 ns/op 60.22 MB/s +Search_Medium_CachedPCRE/16 10000000 161 ns/op 99.16 MB/s +Search_Medium_CachedPCRE/32 10000000 218 ns/op 146.58 MB/s 
+Search_Medium_CachedPCRE/64 5000000 339 ns/op 188.59 MB/s +Search_Medium_CachedPCRE/128 5000000 433 ns/op 295.34 MB/s +Search_Medium_CachedPCRE/256 200000 9075 ns/op 28.21 MB/s +Search_Medium_CachedPCRE/512 100000 21569 ns/op 23.74 MB/s +Search_Medium_CachedPCRE/1K 50000 42379 ns/op 24.16 MB/s +Search_Medium_CachedPCRE/2K 50000 62363 ns/op 32.84 MB/s +Search_Medium_CachedPCRE/4K 10000 153731 ns/op 26.64 MB/s +Search_Medium_CachedPCRE/8K 5000 332686 ns/op 24.62 MB/s +Search_Medium_CachedPCRE/16K 5000 678481 ns/op 24.15 MB/s +Search_Medium_CachedPCRE/32K 2000 1356329 ns/op 24.16 MB/s +Search_Medium_CachedPCRE/64K 1000 2709033 ns/op 24.19 MB/s +Search_Medium_CachedPCRE/128K 500 5413924 ns/op 24.21 MB/s +Search_Medium_CachedPCRE/256K 100 10832790 ns/op 24.20 MB/s +Search_Medium_CachedRE2/8 5000000 332 ns/op 24.08 MB/s +Search_Medium_CachedRE2/16 5000000 358 ns/op 44.58 MB/s +Search_Medium_CachedRE2/32 5000000 407 ns/op 78.49 MB/s +Search_Medium_CachedRE2/64 5000000 508 ns/op 125.89 MB/s +Search_Medium_CachedRE2/128 5000000 719 ns/op 177.95 MB/s +Search_Medium_CachedRE2/256 1000000 1123 ns/op 227.89 MB/s +Search_Medium_CachedRE2/512 1000000 1932 ns/op 264.94 MB/s +Search_Medium_CachedRE2/1K 500000 3550 ns/op 288.40 MB/s +Search_Medium_CachedRE2/2K 500000 6786 ns/op 301.78 MB/s +Search_Medium_CachedRE2/4K 200000 13256 ns/op 308.98 MB/s +Search_Medium_CachedRE2/8K 100000 26195 ns/op 312.72 MB/s +Search_Medium_CachedRE2/16K 50000 52079 ns/op 314.60 MB/s +Search_Medium_CachedRE2/32K 10000 103941 ns/op 315.25 MB/s +Search_Medium_CachedRE2/64K 10000 207495 ns/op 315.84 MB/s +Search_Medium_CachedRE2/128K 5000 414566 ns/op 316.17 MB/s +Search_Medium_CachedRE2/256K 2000 828759 ns/op 316.31 MB/s +Search_Medium_CachedRE2/512K 1000 1657168 ns/op 316.38 MB/s +Search_Medium_CachedRE2/1M 500 3314174 ns/op 316.39 MB/s +Search_Medium_CachedRE2/2M 500 6635590 ns/op 316.05 MB/s +Search_Medium_CachedRE2/4M 100 13336940 ns/op 314.49 MB/s +Search_Medium_CachedRE2/8M 100 26717640 ns/op 
313.97 MB/s +Search_Medium_CachedRE2/16M 50 53430720 ns/op 314.00 MB/s +Search_Hard_CachedPCRE/8 20000000 133 ns/op 60.13 MB/s +Search_Hard_CachedPCRE/16 10000000 161 ns/op 99.09 MB/s +Search_Hard_CachedPCRE/32 10000000 218 ns/op 146.59 MB/s +Search_Hard_CachedPCRE/64 5000000 339 ns/op 188.50 MB/s +Search_Hard_CachedPCRE/128 5000000 433 ns/op 295.22 MB/s +Search_Hard_CachedPCRE/256 5000 572457 ns/op 0.45 MB/s +Search_Hard_CachedPCRE/512 1000 2346699 ns/op 0.22 MB/s +Search_Hard_CachedPCRE/1K 200 9314450 ns/op 0.11 MB/s +Search_Hard_CachedPCRE/2K 50 34065320 ns/op 0.06 MB/s +Search_Hard_CachedPCRE/4K 10 146729800 ns/op 0.03 MB/s +Search_Hard_CachedRE2/8 5000000 330 ns/op 24.19 MB/s +Search_Hard_CachedRE2/16 5000000 358 ns/op 44.66 MB/s +Search_Hard_CachedRE2/32 5000000 412 ns/op 77.51 MB/s +Search_Hard_CachedRE2/64 5000000 507 ns/op 125.99 MB/s +Search_Hard_CachedRE2/128 5000000 719 ns/op 178.01 MB/s +Search_Hard_CachedRE2/256 1000000 1122 ns/op 228.01 MB/s +Search_Hard_CachedRE2/512 1000000 1931 ns/op 265.03 MB/s +Search_Hard_CachedRE2/1K 500000 3550 ns/op 288.44 MB/s +Search_Hard_CachedRE2/2K 500000 6788 ns/op 301.70 MB/s +Search_Hard_CachedRE2/4K 200000 13256 ns/op 308.98 MB/s +Search_Hard_CachedRE2/8K 100000 26200 ns/op 312.67 MB/s +Search_Hard_CachedRE2/16K 50000 52082 ns/op 314.58 MB/s +Search_Hard_CachedRE2/32K 10000 103936 ns/op 315.27 MB/s +Search_Hard_CachedRE2/64K 10000 207497 ns/op 315.84 MB/s +Search_Hard_CachedRE2/128K 5000 414603 ns/op 316.14 MB/s +Search_Hard_CachedRE2/256K 2000 828770 ns/op 316.30 MB/s +Search_Hard_CachedRE2/512K 1000 1657127 ns/op 316.38 MB/s +Search_Hard_CachedRE2/1M 500 3314338 ns/op 316.38 MB/s +Search_Hard_CachedRE2/2M 500 6635802 ns/op 316.04 MB/s +Search_Hard_CachedRE2/4M 100 13338440 ns/op 314.45 MB/s +Search_Hard_CachedRE2/8M 100 26718310 ns/op 313.96 MB/s +Search_Hard_CachedRE2/16M 50 53433380 ns/op 313.98 MB/s +Search_Parens_CachedPCRE/8 10000000 196 ns/op 40.67 MB/s +Search_Parens_CachedRE2/8 5000000 337 ns/op 23.70 MB/s 
+Search_Parens_CachedRE2/16 5000000 365 ns/op 43.77 MB/s +Search_Parens_CachedRE2/32 5000000 412 ns/op 77.62 MB/s +Search_Parens_CachedRE2/64 5000000 517 ns/op 123.72 MB/s +Search_Parens_CachedRE2/128 5000000 722 ns/op 177.17 MB/s +Search_Parens_CachedRE2/256 1000000 1126 ns/op 227.26 MB/s +Search_Parens_CachedRE2/512 1000000 1935 ns/op 264.56 MB/s +Search_Parens_CachedRE2/1K 500000 3550 ns/op 288.41 MB/s +Search_Parens_CachedRE2/2K 500000 6788 ns/op 301.68 MB/s +Search_Parens_CachedRE2/4K 200000 13262 ns/op 308.84 MB/s +Search_Parens_CachedRE2/8K 100000 26202 ns/op 312.65 MB/s +Search_Parens_CachedRE2/16K 50000 52088 ns/op 314.54 MB/s +Search_Parens_CachedRE2/32K 10000 103968 ns/op 315.17 MB/s +Search_Parens_CachedRE2/64K 10000 207504 ns/op 315.83 MB/s +Search_Parens_CachedRE2/128K 5000 414604 ns/op 316.14 MB/s +Search_Parens_CachedRE2/256K 2000 828795 ns/op 316.30 MB/s +Search_Parens_CachedRE2/512K 1000 1657211 ns/op 316.37 MB/s +Search_Parens_CachedRE2/1M 500 3314290 ns/op 316.38 MB/s +Search_Parens_CachedRE2/2M 500 6636392 ns/op 316.01 MB/s +Search_Parens_CachedRE2/4M 100 13338070 ns/op 314.46 MB/s +Search_Parens_CachedRE2/8M 100 26717640 ns/op 313.97 MB/s +Search_Parens_CachedRE2/16M 50 53437080 ns/op 313.96 MB/s +Search_BigFixed_CachedPCRE/8 10000000 242 ns/op 32.95 MB/s +Search_BigFixed_CachedPCRE/16 5000000 301 ns/op 53.10 MB/s +Search_BigFixed_CachedPCRE/32 5000000 418 ns/op 76.52 MB/s +Search_BigFixed_CachedPCRE/64 5000000 654 ns/op 97.84 MB/s +Search_BigFixed_CachedPCRE/128 2000000 985 ns/op 129.93 MB/s +Search_BigFixed_CachedPCRE/256 1000000 1775 ns/op 144.21 MB/s +Search_BigFixed_CachedPCRE/512 500000 3342 ns/op 153.19 MB/s +Search_BigFixed_CachedPCRE/1K 500000 6476 ns/op 158.12 MB/s +Search_BigFixed_CachedPCRE/2K 200000 12744 ns/op 160.69 MB/s +Search_BigFixed_CachedPCRE/4K 100000 25281 ns/op 162.01 MB/s +Search_BigFixed_CachedPCRE/8K 50000 50359 ns/op 162.67 MB/s +Search_BigFixed_CachedPCRE/16K 10000 100607 ns/op 162.85 MB/s 
+Search_BigFixed_CachedPCRE/32K 10000 200995 ns/op 163.03 MB/s +Search_BigFixed_CachedRE2/8 20000000 131 ns/op 61.00 MB/s +Search_BigFixed_CachedRE2/16 5000000 381 ns/op 41.95 MB/s +Search_BigFixed_CachedRE2/32 5000000 412 ns/op 77.51 MB/s +Search_BigFixed_CachedRE2/64 5000000 492 ns/op 129.84 MB/s +Search_BigFixed_CachedRE2/128 5000000 636 ns/op 201.21 MB/s +Search_BigFixed_CachedRE2/256 2000000 952 ns/op 268.71 MB/s +Search_BigFixed_CachedRE2/512 1000000 1552 ns/op 329.79 MB/s +Search_BigFixed_CachedRE2/1K 1000000 2772 ns/op 369.32 MB/s +Search_BigFixed_CachedRE2/2K 500000 5192 ns/op 394.39 MB/s +Search_BigFixed_CachedRE2/4K 200000 10051 ns/op 407.48 MB/s +Search_BigFixed_CachedRE2/8K 100000 19758 ns/op 414.61 MB/s +Search_BigFixed_CachedRE2/16K 50000 39167 ns/op 418.31 MB/s +Search_BigFixed_CachedRE2/32K 20000 78103 ns/op 419.55 MB/s +Search_BigFixed_CachedRE2/64K 10000 155875 ns/op 420.44 MB/s +Search_BigFixed_CachedRE2/128K 5000 311474 ns/op 420.81 MB/s +Search_BigFixed_CachedRE2/256K 5000 622461 ns/op 421.14 MB/s +Search_BigFixed_CachedRE2/512K 2000 1246952 ns/op 420.46 MB/s +Search_BigFixed_CachedRE2/1M 1000 2502325 ns/op 419.04 MB/s +Search_Success_PCRE/8 1000000 1783 ns/op 4.48 MB/s +Search_Success_PCRE/16 1000000 1839 ns/op 8.70 MB/s +Search_Success_PCRE/32 1000000 1934 ns/op 16.54 MB/s +Search_Success_PCRE/64 1000000 2104 ns/op 30.41 MB/s +Search_Success_PCRE/128 1000000 2484 ns/op 51.52 MB/s +Search_Success_PCRE/256 500000 3181 ns/op 80.47 MB/s +Search_Success_PCRE/512 500000 4598 ns/op 111.34 MB/s +Search_Success_PCRE/1K 500000 7463 ns/op 137.20 MB/s +Search_Success_PCRE/2K 200000 13079 ns/op 156.58 MB/s +Search_Success_PCRE/4K 100000 24404 ns/op 167.84 MB/s +Search_Success_PCRE/8K 50000 47074 ns/op 174.02 MB/s +Search_Success_PCRE/16K 20000 92372 ns/op 177.37 MB/s +Search_Success_PCRE/32K 10000 183212 ns/op 178.85 MB/s +Search_Success_PCRE/64K 5000 364671 ns/op 179.71 MB/s +Search_Success_PCRE/128K 2000 728337 ns/op 179.96 MB/s 
+Search_Success_PCRE/256K 1000 1457798 ns/op 179.82 MB/s +Search_Success_PCRE/512K 500 2926292 ns/op 179.16 MB/s +Search_Success_PCRE/1M 500 5851210 ns/op 179.21 MB/s +Search_Success_PCRE/2M 200 11872745 ns/op 176.64 MB/s +Search_Success_PCRE/4M 50 25398520 ns/op 165.14 MB/s +Search_Success_PCRE/8M 20 56956150 ns/op 147.28 MB/s +Search_Success_PCRE/16M 10 134245000 ns/op 124.97 MB/s +Search_Success_RE2/8 200000 8097 ns/op 0.99 MB/s +Search_Success_RE2/16 100000 19992 ns/op 0.80 MB/s +Search_Success_RE2/32 100000 19968 ns/op 1.60 MB/s +Search_Success_RE2/64 100000 20151 ns/op 3.18 MB/s +Search_Success_RE2/128 100000 20319 ns/op 6.30 MB/s +Search_Success_RE2/256 100000 20646 ns/op 12.40 MB/s +Search_Success_RE2/512 100000 21451 ns/op 23.87 MB/s +Search_Success_RE2/1K 100000 23054 ns/op 44.42 MB/s +Search_Success_RE2/2K 100000 26339 ns/op 77.75 MB/s +Search_Success_RE2/4K 50000 32820 ns/op 124.80 MB/s +Search_Success_RE2/8K 50000 45821 ns/op 178.78 MB/s +Search_Success_RE2/16K 50000 71718 ns/op 228.45 MB/s +Search_Success_RE2/32K 10000 123789 ns/op 264.71 MB/s +Search_Success_RE2/64K 10000 227372 ns/op 288.23 MB/s +Search_Success_RE2/128K 5000 435072 ns/op 301.26 MB/s +Search_Success_RE2/256K 2000 851760 ns/op 307.77 MB/s +Search_Success_RE2/512K 1000 1689906 ns/op 310.25 MB/s +Search_Success_RE2/1M 500 3385400 ns/op 309.73 MB/s +Search_Success_RE2/2M 200 6918485 ns/op 303.12 MB/s +Search_Success_RE2/4M 100 14404850 ns/op 291.17 MB/s +Search_Success_RE2/8M 50 30839480 ns/op 272.01 MB/s +Search_Success_RE2/16M 20 73836050 ns/op 227.22 MB/s +Search_Success_CachedPCRE/8 10000000 234 ns/op 34.15 MB/s +Search_Success_CachedPCRE/16 10000000 287 ns/op 55.56 MB/s +Search_Success_CachedPCRE/32 5000000 395 ns/op 80.93 MB/s +Search_Success_CachedPCRE/64 5000000 610 ns/op 104.84 MB/s +Search_Success_CachedPCRE/128 2000000 913 ns/op 140.13 MB/s +Search_Success_CachedPCRE/256 1000000 1620 ns/op 157.98 MB/s +Search_Success_CachedPCRE/512 500000 3036 ns/op 168.63 MB/s 
+Search_Success_CachedPCRE/1K 500000 5866 ns/op 174.55 MB/s +Search_Success_CachedPCRE/2K 200000 11528 ns/op 177.64 MB/s +Search_Success_CachedPCRE/4K 100000 22851 ns/op 179.24 MB/s +Search_Success_CachedPCRE/8K 50000 45501 ns/op 180.04 MB/s +Search_Success_CachedPCRE/16K 20000 90807 ns/op 180.43 MB/s +Search_Success_CachedPCRE/32K 10000 181512 ns/op 180.53 MB/s +Search_Success_CachedPCRE/64K 5000 362934 ns/op 180.57 MB/s +Search_Success_CachedPCRE/128K 2000 726545 ns/op 180.40 MB/s +Search_Success_CachedPCRE/256K 1000 1455974 ns/op 180.05 MB/s +Search_Success_CachedPCRE/512K 500 2924332 ns/op 179.28 MB/s +Search_Success_CachedPCRE/1M 500 5848344 ns/op 179.29 MB/s +Search_Success_CachedPCRE/2M 200 11865095 ns/op 176.75 MB/s +Search_Success_CachedPCRE/4M 50 25384340 ns/op 165.23 MB/s +Search_Success_CachedPCRE/8M 20 56942400 ns/op 147.32 MB/s +Search_Success_CachedPCRE/16M 10 134227100 ns/op 124.99 MB/s +Search_Success_CachedRE2/8 20000000 133 ns/op 59.99 MB/s +Search_Success_CachedRE2/16 5000000 371 ns/op 43.03 MB/s +Search_Success_CachedRE2/32 5000000 417 ns/op 76.69 MB/s +Search_Success_CachedRE2/64 5000000 517 ns/op 123.61 MB/s +Search_Success_CachedRE2/128 5000000 730 ns/op 175.23 MB/s +Search_Success_CachedRE2/256 1000000 1134 ns/op 225.72 MB/s +Search_Success_CachedRE2/512 1000000 1943 ns/op 263.49 MB/s +Search_Success_CachedRE2/1K 500000 3560 ns/op 287.59 MB/s +Search_Success_CachedRE2/2K 500000 6796 ns/op 301.32 MB/s +Search_Success_CachedRE2/4K 200000 13266 ns/op 308.76 MB/s +Search_Success_CachedRE2/8K 100000 26213 ns/op 312.51 MB/s +Search_Success_CachedRE2/16K 50000 52097 ns/op 314.49 MB/s +Search_Success_CachedRE2/32K 10000 104050 ns/op 314.92 MB/s +Search_Success_CachedRE2/64K 10000 207657 ns/op 315.60 MB/s +Search_Success_CachedRE2/128K 5000 415228 ns/op 315.66 MB/s +Search_Success_CachedRE2/256K 2000 831992 ns/op 315.08 MB/s +Search_Success_CachedRE2/512K 1000 1669679 ns/op 314.01 MB/s +Search_Success_CachedRE2/1M 500 3364660 ns/op 311.64 MB/s 
+Search_Success_CachedRE2/2M 200 6892065 ns/op 304.29 MB/s +Search_Success_CachedRE2/4M 100 14355860 ns/op 292.17 MB/s +Search_Success_CachedRE2/8M 50 30788480 ns/op 272.46 MB/s +Search_Success_CachedRE2/16M 20 73781750 ns/op 227.39 MB/s +Search_Success1_PCRE/8 1000000 1945 ns/op 4.11 MB/s +Search_Success1_PCRE/16 1000000 2005 ns/op 7.98 MB/s +Search_Success1_PCRE/32 1000000 2102 ns/op 15.22 MB/s +Search_Success1_PCRE/64 1000000 2277 ns/op 28.10 MB/s +Search_Success1_PCRE/128 1000000 2640 ns/op 48.48 MB/s +Search_Success1_PCRE/256 500000 3321 ns/op 77.07 MB/s +Search_Success1_PCRE/512 500000 4750 ns/op 107.79 MB/s +Search_Success1_PCRE/1K 200000 7579 ns/op 135.11 MB/s +Search_Success1_PCRE/2K 200000 13241 ns/op 154.67 MB/s +Search_Success1_PCRE/4K 100000 24584 ns/op 166.61 MB/s +Search_Success1_PCRE/8K 50000 47274 ns/op 173.29 MB/s +Search_Success1_PCRE/16K 20000 92603 ns/op 176.93 MB/s +Search_Success1_PCRE/32K 10000 183395 ns/op 178.67 MB/s +Search_Success1_PCRE/64K 5000 364841 ns/op 179.63 MB/s +Search_Success1_PCRE/128K 2000 728503 ns/op 179.92 MB/s +Search_Success1_PCRE/256K 1000 1458071 ns/op 179.79 MB/s +Search_Success1_PCRE/512K 500 2926604 ns/op 179.15 MB/s +Search_Success1_PCRE/1M 500 5851218 ns/op 179.21 MB/s +Search_Success1_PCRE/2M 200 11872985 ns/op 176.63 MB/s +Search_Success1_PCRE/4M 50 25401620 ns/op 165.12 MB/s +Search_Success1_PCRE/8M 20 56961950 ns/op 147.27 MB/s +Search_Success1_PCRE/16M 10 134240500 ns/op 124.98 MB/s +Search_Success1_RE2/8 100000 27528 ns/op 0.29 MB/s +Search_Success1_RE2/16 100000 27909 ns/op 0.57 MB/s +Search_Success1_RE2/32 100000 27939 ns/op 1.15 MB/s +Search_Success1_RE2/64 100000 28296 ns/op 2.26 MB/s +Search_Success1_RE2/128 100000 28485 ns/op 4.49 MB/s +Search_Success1_RE2/256 100000 28656 ns/op 8.93 MB/s +Search_Success1_RE2/512 100000 29337 ns/op 17.45 MB/s +Search_Success1_RE2/1K 50000 31020 ns/op 33.01 MB/s +Search_Success1_RE2/2K 50000 34197 ns/op 59.89 MB/s +Search_Success1_RE2/4K 50000 40779 ns/op 100.44 MB/s 
+Search_Success1_RE2/8K 50000 53805 ns/op 152.25 MB/s +Search_Success1_RE2/16K 20000 79804 ns/op 205.30 MB/s +Search_Success1_RE2/32K 10000 131917 ns/op 248.40 MB/s +Search_Success1_RE2/64K 10000 235487 ns/op 278.30 MB/s +Search_Success1_RE2/128K 5000 443078 ns/op 295.82 MB/s +Search_Success1_RE2/256K 2000 859950 ns/op 304.84 MB/s +Search_Success1_RE2/512K 1000 1697973 ns/op 308.77 MB/s +Search_Success1_RE2/1M 500 3393262 ns/op 309.02 MB/s +Search_Success1_RE2/2M 200 6926335 ns/op 302.78 MB/s +Search_Success1_RE2/4M 100 14413600 ns/op 291.00 MB/s +Search_Success1_RE2/8M 50 30850640 ns/op 271.91 MB/s +Search_Success1_RE2/16M 20 73845250 ns/op 227.19 MB/s +Search_Success1_Cached_PCRE/8 10000000 255 ns/op 31.28 MB/s +Search_Success1_Cached_PCRE/16 5000000 309 ns/op 51.71 MB/s +Search_Success1_Cached_PCRE/32 5000000 416 ns/op 76.76 MB/s +Search_Success1_Cached_PCRE/64 5000000 632 ns/op 101.25 MB/s +Search_Success1_Cached_PCRE/128 2000000 935 ns/op 136.88 MB/s +Search_Success1_Cached_PCRE/256 1000000 1641 ns/op 155.95 MB/s +Search_Success1_Cached_PCRE/512 500000 3057 ns/op 167.45 MB/s +Search_Success1_Cached_PCRE/1K 500000 5888 ns/op 173.91 MB/s +Search_Success1_Cached_PCRE/2K 200000 11550 ns/op 177.30 MB/s +Search_Success1_Cached_PCRE/4K 100000 22873 ns/op 179.07 MB/s +Search_Success1_Cached_PCRE/8K 50000 45523 ns/op 179.95 MB/s +Search_Success1_Cached_PCRE/16K 20000 90831 ns/op 180.38 MB/s +Search_Success1_Cached_PCRE/32K 10000 181548 ns/op 180.49 MB/s +Search_Success1_Cached_PCRE/64K 5000 362962 ns/op 180.56 MB/s +Search_Success1_Cached_PCRE/128K 2000 726556 ns/op 180.40 MB/s +Search_Success1_Cached_PCRE/256K 1000 1455905 ns/op 180.06 MB/s +Search_Success1_Cached_PCRE/512K 500 2924290 ns/op 179.29 MB/s +Search_Success1_Cached_PCRE/1M 500 5848600 ns/op 179.29 MB/s +Search_Success1_Cached_PCRE/2M 200 11865335 ns/op 176.75 MB/s +Search_Success1_Cached_PCRE/4M 50 25381500 ns/op 165.25 MB/s +Search_Success1_Cached_PCRE/8M 20 56935900 ns/op 147.33 MB/s 
+Search_Success1_Cached_PCRE/16M 10 134214600 ns/op 125.00 MB/s +Search_Success1_Cached_RE2/8 5000000 343 ns/op 23.27 MB/s +Search_Success1_Cached_RE2/16 5000000 372 ns/op 43.01 MB/s +Search_Success1_Cached_RE2/32 5000000 421 ns/op 75.96 MB/s +Search_Success1_Cached_RE2/64 5000000 518 ns/op 123.53 MB/s +Search_Success1_Cached_RE2/128 5000000 730 ns/op 175.31 MB/s +Search_Success1_Cached_RE2/256 1000000 1133 ns/op 225.77 MB/s +Search_Success1_Cached_RE2/512 1000000 1943 ns/op 263.48 MB/s +Search_Success1_Cached_RE2/1K 500000 3560 ns/op 287.59 MB/s +Search_Success1_Cached_RE2/2K 500000 6796 ns/op 301.33 MB/s +Search_Success1_Cached_RE2/4K 200000 13269 ns/op 308.69 MB/s +Search_Success1_Cached_RE2/8K 100000 26212 ns/op 312.52 MB/s +Search_Success1_Cached_RE2/16K 50000 52104 ns/op 314.45 MB/s +Search_Success1_Cached_RE2/32K 10000 104063 ns/op 314.88 MB/s +Search_Success1_Cached_RE2/64K 10000 207703 ns/op 315.53 MB/s +Search_Success1_Cached_RE2/128K 5000 415264 ns/op 315.64 MB/s +Search_Success1_Cached_RE2/256K 2000 831974 ns/op 315.09 MB/s +Search_Success1_Cached_RE2/512K 1000 1669692 ns/op 314.00 MB/s +Search_Success1_Cached_RE2/1M 500 3364484 ns/op 311.66 MB/s +Search_Success1_Cached_RE2/2M 200 6892295 ns/op 304.27 MB/s +Search_Success1_Cached_RE2/4M 100 14355830 ns/op 292.17 MB/s +Search_Success1_Cached_RE2/8M 50 30788400 ns/op 272.46 MB/s +Search_Success1_Cached_RE2/16M 20 73781700 ns/op 227.39 MB/s +Search_Digits_PCRE 500000 4957 ns/op +Search_Digits_RE2 100000 22155 ns/op +Parse_Digits_PCRE 500000 5045 ns/op +Parse_Digits_RE2 200000 9570 ns/op +Parse_CachedDigits_PCRE 5000000 448 ns/op +Parse_CachedDigits_RE2 5000000 301 ns/op +Parse_DigitDs_PCRE 500000 4075 ns/op +Parse_DigitDs_RE2 200000 9567 ns/op +Parse_CachedDigitDs_PCRE 5000000 453 ns/op +Parse_CachedDigitDs_RE2 5000000 301 ns/op +Parse_Split_PCRE 500000 3055 ns/op +Parse_Split_RE2 200000 10818 ns/op +Parse_CachedSplit_PCRE 5000000 329 ns/op +Parse_CachedSplit_RE2 10000000 172 ns/op +Parse_SplitHard_PCRE 
500000 3069 ns/op +Parse_SplitHard_RE2 200000 13016 ns/op +Parse_CachedSplitHard_PCRE 5000000 325 ns/op +Parse_CachedSplitHard_RE2 1000000 2140 ns/op +Parse_CachedSplitBig1_PCRE 500 4502460 ns/op +Parse_CachedSplitBig1_RE2 5000 674142 ns/op +Parse_CachedSplitBig2_PCRE 5000 553268 ns/op +Parse_CachedSplitBig2_RE2 50 55654780 ns/op +BM_PCRE_Compile 500000 3780 ns/op +BM_RE2_Compile 200000 10409 ns/op +SearchPhone_CachedPCRE/8 1000000 1155 ns/op 6.92 MB/s +SearchPhone_CachedPCRE/16 1000000 1900 ns/op 8.42 MB/s +SearchPhone_CachedPCRE/32 500000 3414 ns/op 9.37 MB/s +SearchPhone_CachedPCRE/64 500000 6265 ns/op 10.21 MB/s +SearchPhone_CachedPCRE/128 200000 12227 ns/op 10.47 MB/s +SearchPhone_CachedPCRE/256 100000 23880 ns/op 10.72 MB/s +SearchPhone_CachedPCRE/512 50000 47672 ns/op 10.74 MB/s +SearchPhone_CachedPCRE/1K 20000 94526 ns/op 10.83 MB/s +SearchPhone_CachedPCRE/2K 10000 186297 ns/op 10.99 MB/s +SearchPhone_CachedPCRE/4K 5000 365404 ns/op 11.21 MB/s +SearchPhone_CachedPCRE/8K 5000 726987 ns/op 11.27 MB/s +SearchPhone_CachedPCRE/16K 2000 1451414 ns/op 11.29 MB/s +SearchPhone_CachedPCRE/32K 1000 2900737 ns/op 11.30 MB/s +SearchPhone_CachedPCRE/64K 500 5795914 ns/op 11.31 MB/s +SearchPhone_CachedPCRE/128K 100 11603080 ns/op 11.30 MB/s +SearchPhone_CachedPCRE/256K 100 23178330 ns/op 11.31 MB/s +SearchPhone_CachedPCRE/512K 50 46345740 ns/op 11.31 MB/s +SearchPhone_CachedPCRE/1M 20 92692000 ns/op 11.31 MB/s +SearchPhone_CachedPCRE/2M 10 185324900 ns/op 11.32 MB/s +SearchPhone_CachedPCRE/4M 5 370957000 ns/op 11.31 MB/s +SearchPhone_CachedPCRE/8M 5 741607400 ns/op 11.31 MB/s +SearchPhone_CachedPCRE/16M 1 1482978000 ns/op 11.31 MB/s +SearchPhone_CachedRE2/8 2000000 860 ns/op 9.30 MB/s +SearchPhone_CachedRE2/16 2000000 903 ns/op 17.71 MB/s +SearchPhone_CachedRE2/32 2000000 944 ns/op 33.87 MB/s +SearchPhone_CachedRE2/64 1000000 1060 ns/op 60.34 MB/s +SearchPhone_CachedRE2/128 1000000 1267 ns/op 100.95 MB/s +SearchPhone_CachedRE2/256 1000000 1674 ns/op 152.88 MB/s 
+SearchPhone_CachedRE2/512 1000000 2476 ns/op 206.78 MB/s +SearchPhone_CachedRE2/1K 500000 4097 ns/op 249.91 MB/s +SearchPhone_CachedRE2/2K 500000 7343 ns/op 278.89 MB/s +SearchPhone_CachedRE2/4K 200000 13823 ns/op 296.31 MB/s +SearchPhone_CachedRE2/8K 100000 26767 ns/op 306.04 MB/s +SearchPhone_CachedRE2/16K 50000 52732 ns/op 310.70 MB/s +SearchPhone_CachedRE2/32K 10000 104785 ns/op 312.72 MB/s +SearchPhone_CachedRE2/64K 10000 208330 ns/op 314.58 MB/s +SearchPhone_CachedRE2/128K 5000 415442 ns/op 315.50 MB/s +SearchPhone_CachedRE2/256K 2000 829700 ns/op 315.95 MB/s +SearchPhone_CachedRE2/512K 1000 1658075 ns/op 316.20 MB/s +SearchPhone_CachedRE2/1M 500 3315348 ns/op 316.28 MB/s +SearchPhone_CachedRE2/2M 500 6637420 ns/op 315.96 MB/s +SearchPhone_CachedRE2/4M 100 13343750 ns/op 314.33 MB/s +SearchPhone_CachedRE2/8M 100 26723120 ns/op 313.91 MB/s +SearchPhone_CachedRE2/16M 50 53440620 ns/op 313.94 MB/s +EmptyPartialMatchPCRE 20000000 137 ns/op +EmptyPartialMatchRE2 5000000 310 ns/op +SimplePartialMatchPCRE 10000000 188 ns/op +SimplePartialMatchRE2 5000000 354 ns/op +HTTPPartialMatchPCRE 5000000 574 ns/op +HTTPPartialMatchRE2 5000000 627 ns/op +SmallHTTPPartialMatchPCRE 5000000 574 ns/op +SmallHTTPPartialMatchRE2 5000000 627 ns/op +DotMatchPCRE 5000000 409 ns/op +DotMatchRE2 5000000 691 ns/op +ASCIIMatchPCRE 5000000 392 ns/op +ASCIIMatchRE2 5000000 686 ns/op diff --git a/outside/re2/benchlog/benchlog.mini b/outside/re2/benchlog/benchlog.mini new file mode 100644 index 000000000..276483d95 --- /dev/null +++ b/outside/re2/benchlog/benchlog.mini @@ -0,0 +1,582 @@ +hw.ncpu: 2 +hw.byteorder: 1234 +hw.memsize: 4294967296 +hw.activecpu: 2 +hw.physicalcpu: 2 +hw.physicalcpu_max: 2 +hw.logicalcpu: 2 +hw.logicalcpu_max: 2 +hw.cputype: 7 +hw.cpusubtype: 4 +hw.cpu64bit_capable: 1 +hw.cpufamily: 1114597871 +hw.cacheconfig: 2 1 2 0 0 0 0 0 0 0 +hw.cachesize: 3221225472 32768 2097152 0 0 0 0 0 0 0 +hw.pagesize: 4096 +hw.busfrequency: 664000000 +hw.busfrequency_min: 664000000 
+hw.busfrequency_max: 664000000 +hw.cpufrequency: 1830000000 +hw.cpufrequency_min: 1830000000 +hw.cpufrequency_max: 1830000000 +hw.cachelinesize: 64 +hw.l1icachesize: 32768 +hw.l1dcachesize: 32768 +hw.l2cachesize: 2097152 +hw.tbfrequency: 1000000000 +hw.packages: 1 +hw.optional.floatingpoint: 1 +hw.optional.mmx: 1 +hw.optional.sse: 1 +hw.optional.sse2: 1 +hw.optional.sse3: 1 +hw.optional.supplementalsse3: 1 +hw.optional.sse4_1: 0 +hw.optional.sse4_2: 0 +hw.optional.x86_64: 1 +hw.machine = i386 +hw.model = Macmini2,1 +hw.ncpu = 2 +hw.byteorder = 1234 +hw.physmem = 2147483648 +hw.usermem = 1849147392 +hw.pagesize = 4096 +hw.epoch = 0 +hw.vectorunit = 1 +hw.busfrequency = 664000000 +hw.cpufrequency = 1830000000 +hw.cachelinesize = 64 +hw.l1icachesize = 32768 +hw.l1dcachesize = 32768 +hw.l2settings = 1 +hw.l2cachesize = 2097152 +hw.tbfrequency = 1000000000 +hw.memsize = 4294967296 +hw.availcpu = 2 + +machdep.cpu.max_basic: 10 +machdep.cpu.max_ext: 2147483656 +machdep.cpu.vendor: GenuineIntel +machdep.cpu.brand_string: Intel(R) Core(TM)2 CPU T5600 @ 1.83GHz +machdep.cpu.family: 6 +machdep.cpu.model: 15 +machdep.cpu.extmodel: 0 +machdep.cpu.extfamily: 0 +machdep.cpu.stepping: 2 +machdep.cpu.feature_bits: 3219913727 58301 +machdep.cpu.extfeature_bits: 537921536 1 +machdep.cpu.signature: 1778 +machdep.cpu.brand: 0 +machdep.cpu.features: FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM SSE3 MON DSCPL VMX EST TM2 SSSE3 CX16 TPR PDCM +machdep.cpu.extfeatures: SYSCALL XD EM64T +machdep.cpu.logical_per_package: 2 +machdep.cpu.cores_per_package: 2 +machdep.cpu.microcode_version: 87 +machdep.cpu.mwait.linesize_min: 64 +machdep.cpu.mwait.linesize_max: 64 +machdep.cpu.mwait.extensions: 3 +machdep.cpu.mwait.sub_Cstates: 139808 +machdep.cpu.thermal.sensor: 1 +machdep.cpu.thermal.dynamic_acceleration: 0 +machdep.cpu.thermal.thresholds: 2 +machdep.cpu.thermal.ACNT_MCNT: 1 +machdep.cpu.arch_perf.version: 2 
+machdep.cpu.arch_perf.number: 2 +machdep.cpu.arch_perf.width: 40 +machdep.cpu.arch_perf.events_number: 7 +machdep.cpu.arch_perf.events: 0 +machdep.cpu.arch_perf.fixed_number: 0 +machdep.cpu.arch_perf.fixed_width: 0 +machdep.cpu.cache.linesize: 64 +machdep.cpu.cache.L2_associativity: 6 +machdep.cpu.cache.size: 2048 +machdep.cpu.tlb.inst.small: 128 +machdep.cpu.tlb.inst.large: 8 +machdep.cpu.tlb.data.small: 16 +machdep.cpu.tlb.data.small_level1: 256 +machdep.cpu.tlb.data.large: 16 +machdep.cpu.tlb.data.large_level1: 32 +machdep.cpu.address_bits.physical: 36 +machdep.cpu.address_bits.virtual: 48 +machdep.cpu.core_count: 2 +machdep.cpu.thread_count: 2 + + +==BENCHMARK== mini.local Fri Feb 26 16:57:10 PST 2010 +# Darwin mini.local 10.2.0 Darwin Kernel Version 10.2.0: Tue Nov 3 10:37:10 PST 2009; root:xnu-1486.2.11~1/RELEASE_I386 i386 +# i686-apple-darwin10-g++-4.2.1 (GCC) 4.2.1 (Apple Inc. build 5646) (dot 1) +# Copyright (C) 2007 Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# +# a94585d91e66+ tip +# obj/test/regexp_benchmark: Mach-O 64-bit executable x86_64 + +Search_Easy0_CachedPCRE/8 10000000 176 ns/op 45.40 MB/s +Search_Easy0_CachedPCRE/16 10000000 209 ns/op 76.41 MB/s +Search_Easy0_CachedPCRE/32 10000000 269 ns/op 118.53 MB/s +Search_Easy0_CachedPCRE/64 5000000 398 ns/op 160.77 MB/s +Search_Easy0_CachedPCRE/128 5000000 536 ns/op 238.69 MB/s +Search_Easy0_CachedPCRE/256 2000000 897 ns/op 285.22 MB/s +Search_Easy0_CachedPCRE/512 1000000 2161 ns/op 236.92 MB/s +Search_Easy0_CachedPCRE/1K 500000 4769 ns/op 214.70 MB/s +Search_Easy0_CachedPCRE/2K 200000 8031 ns/op 255.00 MB/s +Search_Easy0_CachedPCRE/4K 100000 16208 ns/op 252.71 MB/s +Search_Easy0_CachedPCRE/8K 50000 32219 ns/op 254.26 MB/s +Search_Easy0_CachedPCRE/16K 50000 63347 ns/op 258.64 MB/s +Search_Easy0_CachedPCRE/32K 10000 125875 ns/op 260.32 MB/s +Search_Easy0_CachedPCRE/64K 10000 247829 ns/op 264.44 MB/s +Search_Easy0_CachedPCRE/128K 5000 498699 ns/op 262.83 MB/s +Search_Easy0_CachedPCRE/256K 2000 978021 ns/op 268.04 MB/s +Search_Easy0_CachedPCRE/512K 1000 1975059 ns/op 265.45 MB/s +Search_Easy0_CachedPCRE/1M 500 3994258 ns/op 262.52 MB/s +Search_Easy0_CachedPCRE/2M 200 7959640 ns/op 263.47 MB/s +Search_Easy0_CachedPCRE/4M 100 15950300 ns/op 262.96 MB/s +Search_Easy0_CachedPCRE/8M 50 32435540 ns/op 258.62 MB/s +Search_Easy0_CachedPCRE/16M 50 64686180 ns/op 259.36 MB/s +Search_Easy0_CachedRE2/8 5000000 535 ns/op 14.95 MB/s +Search_Easy0_CachedRE2/16 5000000 557 ns/op 28.70 MB/s +Search_Easy0_CachedRE2/32 5000000 595 ns/op 53.75 MB/s +Search_Easy0_CachedRE2/64 5000000 643 ns/op 99.50 MB/s +Search_Easy0_CachedRE2/128 2000000 759 ns/op 168.64 MB/s +Search_Easy0_CachedRE2/256 2000000 972 ns/op 263.30 MB/s +Search_Easy0_CachedRE2/512 1000000 1458 ns/op 351.13 MB/s +Search_Easy0_CachedRE2/1K 1000000 2544 ns/op 402.51 MB/s +Search_Easy0_CachedRE2/2K 500000 4551 ns/op 449.99 MB/s +Search_Easy0_CachedRE2/4K 200000 8677 ns/op 472.01 MB/s +Search_Easy0_CachedRE2/8K 100000 17188 ns/op 
476.59 MB/s +Search_Easy0_CachedRE2/16K 50000 33869 ns/op 483.73 MB/s +Search_Easy0_CachedRE2/32K 50000 67787 ns/op 483.39 MB/s +Search_Easy0_CachedRE2/64K 10000 133362 ns/op 491.41 MB/s +Search_Easy0_CachedRE2/128K 10000 266469 ns/op 491.88 MB/s +Search_Easy0_CachedRE2/256K 5000 536980 ns/op 488.18 MB/s +Search_Easy0_CachedRE2/512K 2000 1050843 ns/op 498.92 MB/s +Search_Easy0_CachedRE2/1M 1000 2120649 ns/op 494.46 MB/s +Search_Easy0_CachedRE2/2M 500 4273918 ns/op 490.69 MB/s +Search_Easy0_CachedRE2/4M 200 8591285 ns/op 488.20 MB/s +Search_Easy0_CachedRE2/8M 100 17197390 ns/op 487.78 MB/s +Search_Easy0_CachedRE2/16M 50 34338780 ns/op 488.58 MB/s +Search_Easy1_CachedPCRE/8 10000000 174 ns/op 45.74 MB/s +Search_Easy1_CachedPCRE/16 10000000 206 ns/op 77.31 MB/s +Search_Easy1_CachedPCRE/32 10000000 270 ns/op 118.43 MB/s +Search_Easy1_CachedPCRE/64 5000000 402 ns/op 159.07 MB/s +Search_Easy1_CachedPCRE/128 5000000 540 ns/op 236.84 MB/s +Search_Easy1_CachedPCRE/256 2000000 909 ns/op 281.34 MB/s +Search_Easy1_CachedPCRE/512 1000000 1852 ns/op 276.34 MB/s +Search_Easy1_CachedPCRE/1K 500000 4318 ns/op 237.12 MB/s +Search_Easy1_CachedPCRE/2K 200000 8346 ns/op 245.37 MB/s +Search_Easy1_CachedPCRE/4K 100000 16214 ns/op 252.62 MB/s +Search_Easy1_CachedPCRE/8K 50000 32438 ns/op 252.54 MB/s +Search_Easy1_CachedPCRE/16K 50000 62914 ns/op 260.42 MB/s +Search_Easy1_CachedPCRE/32K 10000 124792 ns/op 262.58 MB/s +Search_Easy1_CachedPCRE/64K 10000 250941 ns/op 261.16 MB/s +Search_Easy1_CachedPCRE/128K 5000 498405 ns/op 262.98 MB/s +Search_Easy1_CachedPCRE/256K 2000 997305 ns/op 262.85 MB/s +Search_Easy1_CachedPCRE/512K 1000 2023179 ns/op 259.14 MB/s +Search_Easy1_CachedPCRE/1M 500 4005202 ns/op 261.80 MB/s +Search_Easy1_CachedPCRE/2M 200 8116410 ns/op 258.38 MB/s +Search_Easy1_CachedPCRE/4M 100 16145970 ns/op 259.77 MB/s +Search_Easy1_CachedPCRE/8M 50 32471260 ns/op 258.34 MB/s +Search_Easy1_CachedPCRE/16M 50 64734020 ns/op 259.17 MB/s +Search_Easy1_CachedRE2/8 5000000 543 ns/op 14.72 
MB/s +Search_Easy1_CachedRE2/16 5000000 570 ns/op 28.07 MB/s +Search_Easy1_CachedRE2/32 5000000 605 ns/op 52.81 MB/s +Search_Easy1_CachedRE2/64 5000000 643 ns/op 99.39 MB/s +Search_Easy1_CachedRE2/128 2000000 764 ns/op 167.45 MB/s +Search_Easy1_CachedRE2/256 2000000 970 ns/op 263.85 MB/s +Search_Easy1_CachedRE2/512 1000000 1455 ns/op 351.75 MB/s +Search_Easy1_CachedRE2/1K 1000000 2506 ns/op 408.48 MB/s +Search_Easy1_CachedRE2/2K 500000 4571 ns/op 447.97 MB/s +Search_Easy1_CachedRE2/4K 200000 8812 ns/op 464.81 MB/s +Search_Easy1_CachedRE2/8K 100000 17079 ns/op 479.65 MB/s +Search_Easy1_CachedRE2/16K 50000 33802 ns/op 484.70 MB/s +Search_Easy1_CachedRE2/32K 50000 67171 ns/op 487.83 MB/s +Search_Easy1_CachedRE2/64K 10000 131505 ns/op 498.35 MB/s +Search_Easy1_CachedRE2/128K 10000 263228 ns/op 497.94 MB/s +Search_Easy1_CachedRE2/256K 5000 528135 ns/op 496.36 MB/s +Search_Easy1_CachedRE2/512K 2000 1052768 ns/op 498.01 MB/s +Search_Easy1_CachedRE2/1M 1000 2112714 ns/op 496.32 MB/s +Search_Easy1_CachedRE2/2M 500 4289478 ns/op 488.91 MB/s +Search_Easy1_CachedRE2/4M 200 8519430 ns/op 492.32 MB/s +Search_Easy1_CachedRE2/8M 100 17002860 ns/op 493.36 MB/s +Search_Easy1_CachedRE2/16M 50 34341100 ns/op 488.55 MB/s +Search_Medium_CachedPCRE/8 10000000 175 ns/op 45.48 MB/s +Search_Medium_CachedPCRE/16 10000000 206 ns/op 77.31 MB/s +Search_Medium_CachedPCRE/32 10000000 273 ns/op 117.10 MB/s +Search_Medium_CachedPCRE/64 5000000 427 ns/op 149.60 MB/s +Search_Medium_CachedPCRE/128 200000 9382 ns/op 13.64 MB/s +Search_Medium_CachedPCRE/256 100000 15339 ns/op 16.69 MB/s +Search_Medium_CachedPCRE/512 50000 35837 ns/op 14.29 MB/s +Search_Medium_CachedPCRE/1K 50000 71109 ns/op 14.40 MB/s +Search_Medium_CachedPCRE/2K 10000 111371 ns/op 18.39 MB/s +Search_Medium_CachedPCRE/4K 10000 264964 ns/op 15.46 MB/s +Search_Medium_CachedPCRE/8K 5000 554964 ns/op 14.76 MB/s +Search_Medium_CachedPCRE/16K 2000 1122116 ns/op 14.60 MB/s +Search_Medium_CachedPCRE/32K 1000 2305129 ns/op 14.22 MB/s 
+Search_Medium_CachedPCRE/64K 500 4401888 ns/op 14.89 MB/s +Search_Medium_CachedPCRE/128K 200 8591800 ns/op 15.26 MB/s +Search_Medium_CachedPCRE/256K 100 17534580 ns/op 14.95 MB/s +Search_Medium_CachedRE2/8 5000000 531 ns/op 15.06 MB/s +Search_Medium_CachedRE2/16 5000000 579 ns/op 27.60 MB/s +Search_Medium_CachedRE2/32 5000000 666 ns/op 47.99 MB/s +Search_Medium_CachedRE2/64 2000000 817 ns/op 78.31 MB/s +Search_Medium_CachedRE2/128 1000000 1174 ns/op 108.94 MB/s +Search_Medium_CachedRE2/256 1000000 1824 ns/op 140.30 MB/s +Search_Medium_CachedRE2/512 500000 3097 ns/op 165.29 MB/s +Search_Medium_CachedRE2/1K 500000 6101 ns/op 167.84 MB/s +Search_Medium_CachedRE2/2K 200000 12024 ns/op 170.32 MB/s +Search_Medium_CachedRE2/4K 100000 21483 ns/op 190.66 MB/s +Search_Medium_CachedRE2/8K 50000 41321 ns/op 198.25 MB/s +Search_Medium_CachedRE2/16K 20000 82227 ns/op 199.25 MB/s +Search_Medium_CachedRE2/32K 10000 166314 ns/op 197.02 MB/s +Search_Medium_CachedRE2/64K 5000 334190 ns/op 196.10 MB/s +Search_Medium_CachedRE2/128K 5000 672222 ns/op 194.98 MB/s +Search_Medium_CachedRE2/256K 2000 1335691 ns/op 196.26 MB/s +Search_Medium_CachedRE2/512K 1000 2650973 ns/op 197.77 MB/s +Search_Medium_CachedRE2/1M 500 5401168 ns/op 194.14 MB/s +Search_Medium_CachedRE2/2M 100 10724160 ns/op 195.55 MB/s +Search_Medium_CachedRE2/4M 100 21647840 ns/op 193.75 MB/s +Search_Medium_CachedRE2/8M 50 43369000 ns/op 193.42 MB/s +Search_Medium_CachedRE2/16M 20 85095750 ns/op 197.16 MB/s +Search_Hard_CachedPCRE/8 10000000 178 ns/op 44.77 MB/s +Search_Hard_CachedPCRE/16 10000000 211 ns/op 75.54 MB/s +Search_Hard_CachedPCRE/32 10000000 274 ns/op 116.75 MB/s +Search_Hard_CachedPCRE/64 5000000 401 ns/op 159.58 MB/s +Search_Hard_CachedPCRE/128 5000 331833 ns/op 0.39 MB/s +Search_Hard_CachedPCRE/256 2000 1299658 ns/op 0.20 MB/s +Search_Hard_CachedPCRE/512 500 5361070 ns/op 0.10 MB/s +Search_Hard_CachedPCRE/1K 100 20744900 ns/op 0.05 MB/s +Search_Hard_CachedPCRE/2K 20 78382950 ns/op 0.03 MB/s 
+Search_Hard_CachedPCRE/4K 5 335826800 ns/op 0.01 MB/s +Search_Hard_CachedRE2/8 5000000 550 ns/op 14.53 MB/s +Search_Hard_CachedRE2/16 5000000 600 ns/op 26.66 MB/s +Search_Hard_CachedRE2/32 5000000 683 ns/op 46.80 MB/s +Search_Hard_CachedRE2/64 2000000 834 ns/op 76.69 MB/s +Search_Hard_CachedRE2/128 1000000 1168 ns/op 109.57 MB/s +Search_Hard_CachedRE2/256 1000000 1833 ns/op 139.65 MB/s +Search_Hard_CachedRE2/512 500000 3069 ns/op 166.81 MB/s +Search_Hard_CachedRE2/1K 500000 5780 ns/op 177.14 MB/s +Search_Hard_CachedRE2/2K 200000 11060 ns/op 185.17 MB/s +Search_Hard_CachedRE2/4K 100000 21511 ns/op 190.41 MB/s +Search_Hard_CachedRE2/8K 50000 41962 ns/op 195.22 MB/s +Search_Hard_CachedRE2/16K 20000 82460 ns/op 198.69 MB/s +Search_Hard_CachedRE2/32K 10000 164209 ns/op 199.55 MB/s +Search_Hard_CachedRE2/64K 5000 326354 ns/op 200.81 MB/s +Search_Hard_CachedRE2/128K 5000 659142 ns/op 198.85 MB/s +Search_Hard_CachedRE2/256K 2000 1333642 ns/op 196.56 MB/s +Search_Hard_CachedRE2/512K 1000 2687422 ns/op 195.09 MB/s +Search_Hard_CachedRE2/1M 500 5351592 ns/op 195.94 MB/s +Search_Hard_CachedRE2/2M 100 10581690 ns/op 198.19 MB/s +Search_Hard_CachedRE2/4M 100 21324320 ns/op 196.69 MB/s +Search_Hard_CachedRE2/8M 50 41892520 ns/op 200.24 MB/s +Search_Hard_CachedRE2/16M 20 85475700 ns/op 196.28 MB/s +Search_Parens_CachedPCRE/8 10000000 298 ns/op 26.80 MB/s +Search_Parens_CachedRE2/8 5000000 562 ns/op 14.21 MB/s +Search_Parens_CachedRE2/16 5000000 598 ns/op 26.71 MB/s +Search_Parens_CachedRE2/32 5000000 676 ns/op 47.27 MB/s +Search_Parens_CachedRE2/64 2000000 828 ns/op 77.21 MB/s +Search_Parens_CachedRE2/128 1000000 1155 ns/op 110.73 MB/s +Search_Parens_CachedRE2/256 1000000 1788 ns/op 143.13 MB/s +Search_Parens_CachedRE2/512 500000 3064 ns/op 167.09 MB/s +Search_Parens_CachedRE2/1K 500000 5698 ns/op 179.69 MB/s +Search_Parens_CachedRE2/2K 200000 10961 ns/op 186.84 MB/s +Search_Parens_CachedRE2/4K 100000 21527 ns/op 190.27 MB/s +Search_Parens_CachedRE2/8K 50000 41923 ns/op 195.40 
MB/s +Search_Parens_CachedRE2/16K 20000 85505 ns/op 191.61 MB/s +Search_Parens_CachedRE2/32K 10000 164437 ns/op 199.27 MB/s +Search_Parens_CachedRE2/64K 5000 332654 ns/op 197.01 MB/s +Search_Parens_CachedRE2/128K 5000 677745 ns/op 193.39 MB/s +Search_Parens_CachedRE2/256K 2000 1331012 ns/op 196.95 MB/s +Search_Parens_CachedRE2/512K 1000 2692594 ns/op 194.71 MB/s +Search_Parens_CachedRE2/1M 500 5355880 ns/op 195.78 MB/s +Search_Parens_CachedRE2/2M 100 10822340 ns/op 193.78 MB/s +Search_Parens_CachedRE2/4M 100 21464430 ns/op 195.41 MB/s +Search_Parens_CachedRE2/8M 50 42875940 ns/op 195.65 MB/s +Search_Parens_CachedRE2/16M 20 84654300 ns/op 198.19 MB/s +Search_BigFixed_CachedPCRE/8 5000000 360 ns/op 22.21 MB/s +Search_BigFixed_CachedPCRE/16 5000000 442 ns/op 36.15 MB/s +Search_BigFixed_CachedPCRE/32 5000000 606 ns/op 52.73 MB/s +Search_BigFixed_CachedPCRE/64 2000000 935 ns/op 68.39 MB/s +Search_BigFixed_CachedPCRE/128 1000000 1525 ns/op 83.91 MB/s +Search_BigFixed_CachedPCRE/256 1000000 2718 ns/op 94.18 MB/s +Search_BigFixed_CachedPCRE/512 500000 5020 ns/op 101.98 MB/s +Search_BigFixed_CachedPCRE/1K 200000 9761 ns/op 104.90 MB/s +Search_BigFixed_CachedPCRE/2K 100000 19275 ns/op 106.25 MB/s +Search_BigFixed_CachedPCRE/4K 50000 38488 ns/op 106.42 MB/s +Search_BigFixed_CachedPCRE/8K 20000 76229 ns/op 107.46 MB/s +Search_BigFixed_CachedPCRE/16K 10000 155350 ns/op 105.46 MB/s +Search_BigFixed_CachedPCRE/32K 5000 309242 ns/op 105.96 MB/s +Search_BigFixed_CachedRE2/8 10000000 194 ns/op 41.03 MB/s +Search_BigFixed_CachedRE2/16 5000000 589 ns/op 27.15 MB/s +Search_BigFixed_CachedRE2/32 5000000 655 ns/op 48.83 MB/s +Search_BigFixed_CachedRE2/64 5000000 715 ns/op 89.46 MB/s +Search_BigFixed_CachedRE2/128 2000000 882 ns/op 145.09 MB/s +Search_BigFixed_CachedRE2/256 1000000 1293 ns/op 197.97 MB/s +Search_BigFixed_CachedRE2/512 1000000 1924 ns/op 266.06 MB/s +Search_BigFixed_CachedRE2/1K 500000 3294 ns/op 310.79 MB/s +Search_BigFixed_CachedRE2/2K 500000 6057 ns/op 338.10 MB/s 
+Search_BigFixed_CachedRE2/4K 200000 11475 ns/op 356.93 MB/s +Search_BigFixed_CachedRE2/8K 100000 22395 ns/op 365.79 MB/s +Search_BigFixed_CachedRE2/16K 50000 44333 ns/op 369.56 MB/s +Search_BigFixed_CachedRE2/32K 20000 88061 ns/op 372.11 MB/s +Search_BigFixed_CachedRE2/64K 10000 173649 ns/op 377.40 MB/s +Search_BigFixed_CachedRE2/128K 5000 347251 ns/op 377.46 MB/s +Search_BigFixed_CachedRE2/256K 2000 702561 ns/op 373.13 MB/s +Search_BigFixed_CachedRE2/512K 1000 1408041 ns/op 372.35 MB/s +Search_BigFixed_CachedRE2/1M 500 3003070 ns/op 349.17 MB/s +Search_Success_PCRE/8 500000 3891 ns/op 2.06 MB/s +Search_Success_PCRE/16 500000 3865 ns/op 4.14 MB/s +Search_Success_PCRE/32 500000 3861 ns/op 8.29 MB/s +Search_Success_PCRE/64 500000 3921 ns/op 16.32 MB/s +Search_Success_PCRE/128 500000 4677 ns/op 27.37 MB/s +Search_Success_PCRE/256 500000 5362 ns/op 47.73 MB/s +Search_Success_PCRE/512 500000 7125 ns/op 71.85 MB/s +Search_Success_PCRE/1K 200000 10643 ns/op 96.21 MB/s +Search_Success_PCRE/2K 100000 17620 ns/op 116.23 MB/s +Search_Success_PCRE/4K 50000 31657 ns/op 129.39 MB/s +Search_Success_PCRE/8K 50000 59290 ns/op 138.17 MB/s +Search_Success_PCRE/16K 10000 115346 ns/op 142.04 MB/s +Search_Success_PCRE/32K 10000 225258 ns/op 145.47 MB/s +Search_Success_PCRE/64K 5000 452994 ns/op 144.67 MB/s +Search_Success_PCRE/128K 2000 904745 ns/op 144.87 MB/s +Search_Success_PCRE/256K 1000 1786683 ns/op 146.72 MB/s +Search_Success_PCRE/512K 500 3600316 ns/op 145.62 MB/s +Search_Success_PCRE/1M 200 7413055 ns/op 141.45 MB/s +Search_Success_PCRE/2M 100 15261930 ns/op 137.41 MB/s +Search_Success_PCRE/4M 50 32827960 ns/op 127.77 MB/s +Search_Success_PCRE/8M 20 73886450 ns/op 113.53 MB/s +Search_Success_PCRE/16M 5 247881200 ns/op 67.68 MB/s +Search_Success_RE2/8 100000 18948 ns/op 0.42 MB/s +Search_Success_RE2/16 50000 40076 ns/op 0.40 MB/s +Search_Success_RE2/32 50000 40543 ns/op 0.79 MB/s +Search_Success_RE2/64 50000 40520 ns/op 1.58 MB/s +Search_Success_RE2/128 50000 41222 ns/op 3.11 
MB/s +Search_Success_RE2/256 50000 41361 ns/op 6.19 MB/s +Search_Success_RE2/512 50000 42418 ns/op 12.07 MB/s +Search_Success_RE2/1K 50000 45239 ns/op 22.64 MB/s +Search_Success_RE2/2K 50000 50568 ns/op 40.50 MB/s +Search_Success_RE2/4K 50000 60722 ns/op 67.45 MB/s +Search_Success_RE2/8K 20000 82046 ns/op 99.85 MB/s +Search_Success_RE2/16K 10000 125412 ns/op 130.64 MB/s +Search_Success_RE2/32K 10000 211805 ns/op 154.71 MB/s +Search_Success_RE2/64K 5000 373132 ns/op 175.64 MB/s +Search_Success_RE2/128K 2000 710166 ns/op 184.57 MB/s +Search_Success_RE2/256K 2000 1392231 ns/op 188.29 MB/s +Search_Success_RE2/512K 1000 2763051 ns/op 189.75 MB/s +Search_Success_RE2/1M 500 5547628 ns/op 189.01 MB/s +Search_Success_RE2/2M 100 11709090 ns/op 179.10 MB/s +Search_Success_RE2/4M 50 25220160 ns/op 166.31 MB/s +Search_Success_RE2/8M 20 59411600 ns/op 141.19 MB/s +Search_Success_RE2/16M 5 219468600 ns/op 76.44 MB/s +Search_Success_CachedPCRE/8 5000000 328 ns/op 24.35 MB/s +Search_Success_CachedPCRE/16 5000000 389 ns/op 41.06 MB/s +Search_Success_CachedPCRE/32 5000000 507 ns/op 63.11 MB/s +Search_Success_CachedPCRE/64 2000000 754 ns/op 84.80 MB/s +Search_Success_CachedPCRE/128 1000000 1164 ns/op 109.89 MB/s +Search_Success_CachedPCRE/256 1000000 2051 ns/op 124.81 MB/s +Search_Success_CachedPCRE/512 500000 3831 ns/op 133.64 MB/s +Search_Success_CachedPCRE/1K 500000 7280 ns/op 140.66 MB/s +Search_Success_CachedPCRE/2K 200000 14254 ns/op 143.67 MB/s +Search_Success_CachedPCRE/4K 100000 28223 ns/op 145.13 MB/s +Search_Success_CachedPCRE/8K 50000 55445 ns/op 147.75 MB/s +Search_Success_CachedPCRE/16K 10000 112739 ns/op 145.33 MB/s +Search_Success_CachedPCRE/32K 10000 219943 ns/op 148.98 MB/s +Search_Success_CachedPCRE/64K 5000 440884 ns/op 148.65 MB/s +Search_Success_CachedPCRE/128K 2000 898950 ns/op 145.81 MB/s +Search_Success_CachedPCRE/256K 1000 1775905 ns/op 147.61 MB/s +Search_Success_CachedPCRE/512K 500 3579178 ns/op 146.48 MB/s +Search_Success_CachedPCRE/1M 200 7278075 ns/op 
144.07 MB/s +Search_Success_CachedPCRE/2M 100 14954670 ns/op 140.23 MB/s +Search_Success_CachedPCRE/4M 50 31865060 ns/op 131.63 MB/s +Search_Success_CachedPCRE/8M 20 73977900 ns/op 113.39 MB/s +Search_Success_CachedPCRE/16M 5 250587400 ns/op 66.95 MB/s +Search_Success_CachedRE2/8 10000000 206 ns/op 38.80 MB/s +Search_Success_CachedRE2/16 5000000 598 ns/op 26.75 MB/s +Search_Success_CachedRE2/32 5000000 675 ns/op 47.39 MB/s +Search_Success_CachedRE2/64 2000000 847 ns/op 75.52 MB/s +Search_Success_CachedRE2/128 1000000 1211 ns/op 105.65 MB/s +Search_Success_CachedRE2/256 1000000 1886 ns/op 135.73 MB/s +Search_Success_CachedRE2/512 500000 3123 ns/op 163.90 MB/s +Search_Success_CachedRE2/1K 500000 5754 ns/op 177.94 MB/s +Search_Success_CachedRE2/2K 200000 10929 ns/op 187.38 MB/s +Search_Success_CachedRE2/4K 100000 20887 ns/op 196.10 MB/s +Search_Success_CachedRE2/8K 50000 41295 ns/op 198.37 MB/s +Search_Success_CachedRE2/16K 20000 82338 ns/op 198.98 MB/s +Search_Success_CachedRE2/32K 10000 168893 ns/op 194.02 MB/s +Search_Success_CachedRE2/64K 5000 337449 ns/op 194.21 MB/s +Search_Success_CachedRE2/128K 5000 670247 ns/op 195.56 MB/s +Search_Success_CachedRE2/256K 2000 1342666 ns/op 195.24 MB/s +Search_Success_CachedRE2/512K 1000 2711677 ns/op 193.34 MB/s +Search_Success_CachedRE2/1M 500 5403052 ns/op 194.07 MB/s +Search_Success_CachedRE2/2M 100 11697250 ns/op 179.29 MB/s +Search_Success_CachedRE2/4M 50 24796680 ns/op 169.15 MB/s +Search_Success_CachedRE2/8M 20 59587450 ns/op 140.78 MB/s +Search_Success_CachedRE2/16M 5 225415400 ns/op 74.43 MB/s +Search_Success1_PCRE/8 500000 4063 ns/op 1.97 MB/s +Search_Success1_PCRE/16 500000 4104 ns/op 3.90 MB/s +Search_Success1_PCRE/32 500000 4162 ns/op 7.69 MB/s +Search_Success1_PCRE/64 500000 4284 ns/op 14.94 MB/s +Search_Success1_PCRE/128 500000 4857 ns/op 26.35 MB/s +Search_Success1_PCRE/256 500000 5507 ns/op 46.48 MB/s +Search_Success1_PCRE/512 500000 7203 ns/op 71.08 MB/s +Search_Success1_PCRE/1K 200000 10470 ns/op 97.80 MB/s 
+Search_Success1_PCRE/2K 100000 17455 ns/op 117.33 MB/s +Search_Success1_PCRE/4K 50000 31564 ns/op 129.77 MB/s +Search_Success1_PCRE/8K 50000 59112 ns/op 138.58 MB/s +Search_Success1_PCRE/16K 10000 115903 ns/op 141.36 MB/s +Search_Success1_PCRE/32K 10000 223311 ns/op 146.74 MB/s +Search_Success1_PCRE/64K 5000 447509 ns/op 146.45 MB/s +Search_Success1_PCRE/128K 2000 874543 ns/op 149.87 MB/s +Search_Success1_PCRE/256K 1000 1836342 ns/op 142.75 MB/s +Search_Success1_PCRE/512K 500 3636250 ns/op 144.18 MB/s +Search_Success1_PCRE/1M 200 7256345 ns/op 144.50 MB/s +Search_Success1_PCRE/2M 100 15093450 ns/op 138.94 MB/s +Search_Success1_PCRE/4M 50 32167920 ns/op 130.39 MB/s +Search_Success1_PCRE/8M 20 74735800 ns/op 112.24 MB/s +Search_Success1_PCRE/16M 5 252818600 ns/op 66.36 MB/s +Search_Success1_RE2/8 50000 51778 ns/op 0.15 MB/s +Search_Success1_RE2/16 50000 50754 ns/op 0.32 MB/s +Search_Success1_RE2/32 50000 51127 ns/op 0.63 MB/s +Search_Success1_RE2/64 50000 51305 ns/op 1.25 MB/s +Search_Success1_RE2/128 50000 51580 ns/op 2.48 MB/s +Search_Success1_RE2/256 50000 52019 ns/op 4.92 MB/s +Search_Success1_RE2/512 50000 53145 ns/op 9.63 MB/s +Search_Success1_RE2/1K 50000 55871 ns/op 18.33 MB/s +Search_Success1_RE2/2K 50000 61477 ns/op 33.31 MB/s +Search_Success1_RE2/4K 50000 71875 ns/op 56.99 MB/s +Search_Success1_RE2/8K 20000 94822 ns/op 86.39 MB/s +Search_Success1_RE2/16K 10000 137021 ns/op 119.57 MB/s +Search_Success1_RE2/32K 10000 220596 ns/op 148.54 MB/s +Search_Success1_RE2/64K 5000 377808 ns/op 173.46 MB/s +Search_Success1_RE2/128K 5000 707546 ns/op 185.25 MB/s +Search_Success1_RE2/256K 2000 1367308 ns/op 191.72 MB/s +Search_Success1_RE2/512K 1000 2729291 ns/op 192.10 MB/s +Search_Success1_RE2/1M 500 5439634 ns/op 192.77 MB/s +Search_Success1_RE2/2M 100 11626860 ns/op 180.37 MB/s +Search_Success1_RE2/4M 50 24603160 ns/op 170.48 MB/s +Search_Success1_RE2/8M 20 59001300 ns/op 142.18 MB/s +Search_Success1_RE2/16M 5 219520200 ns/op 76.43 MB/s 
+Search_Success1_Cached_PCRE/8 5000000 373 ns/op 21.41 MB/s +Search_Success1_Cached_PCRE/16 5000000 437 ns/op 36.61 MB/s +Search_Success1_Cached_PCRE/32 5000000 543 ns/op 58.84 MB/s +Search_Success1_Cached_PCRE/64 2000000 784 ns/op 81.60 MB/s +Search_Success1_Cached_PCRE/128 1000000 1193 ns/op 107.29 MB/s +Search_Success1_Cached_PCRE/256 1000000 2044 ns/op 125.23 MB/s +Search_Success1_Cached_PCRE/512 500000 3734 ns/op 137.10 MB/s +Search_Success1_Cached_PCRE/1K 500000 7121 ns/op 143.78 MB/s +Search_Success1_Cached_PCRE/2K 200000 13767 ns/op 148.76 MB/s +Search_Success1_Cached_PCRE/4K 100000 27176 ns/op 150.72 MB/s +Search_Success1_Cached_PCRE/8K 50000 54155 ns/op 151.27 MB/s +Search_Success1_Cached_PCRE/16K 10000 109309 ns/op 149.89 MB/s +Search_Success1_Cached_PCRE/32K 10000 215890 ns/op 151.78 MB/s +Search_Success1_Cached_PCRE/64K 5000 432550 ns/op 151.51 MB/s +Search_Success1_Cached_PCRE/128K 2000 870568 ns/op 150.56 MB/s +Search_Success1_Cached_PCRE/256K 1000 1756215 ns/op 149.27 MB/s +Search_Success1_Cached_PCRE/512K 500 3671994 ns/op 142.78 MB/s +Search_Success1_Cached_PCRE/1M 200 7134810 ns/op 146.97 MB/s +Search_Success1_Cached_PCRE/2M 100 14672580 ns/op 142.93 MB/s +Search_Success1_Cached_PCRE/4M 50 31146040 ns/op 134.67 MB/s +Search_Success1_Cached_PCRE/8M 20 72224500 ns/op 116.15 MB/s +Search_Success1_Cached_PCRE/16M 5 243683800 ns/op 68.85 MB/s +Search_Success1_Cached_RE2/8 5000000 544 ns/op 14.69 MB/s +Search_Success1_Cached_RE2/16 5000000 583 ns/op 27.43 MB/s +Search_Success1_Cached_RE2/32 5000000 661 ns/op 48.37 MB/s +Search_Success1_Cached_RE2/64 2000000 818 ns/op 78.23 MB/s +Search_Success1_Cached_RE2/128 1000000 1148 ns/op 111.40 MB/s +Search_Success1_Cached_RE2/256 1000000 1778 ns/op 143.95 MB/s +Search_Success1_Cached_RE2/512 500000 3036 ns/op 168.64 MB/s +Search_Success1_Cached_RE2/1K 500000 5549 ns/op 184.53 MB/s +Search_Success1_Cached_RE2/2K 200000 10580 ns/op 193.56 MB/s +Search_Success1_Cached_RE2/4K 100000 20645 ns/op 198.39 MB/s 
+Search_Success1_Cached_RE2/8K 50000 40775 ns/op 200.90 MB/s +Search_Success1_Cached_RE2/16K 20000 81030 ns/op 202.20 MB/s +Search_Success1_Cached_RE2/32K 10000 162338 ns/op 201.85 MB/s +Search_Success1_Cached_RE2/64K 5000 324387 ns/op 202.03 MB/s +Search_Success1_Cached_RE2/128K 5000 648468 ns/op 202.13 MB/s +Search_Success1_Cached_RE2/256K 2000 1299439 ns/op 201.74 MB/s +Search_Success1_Cached_RE2/512K 1000 2608958 ns/op 200.96 MB/s +Search_Success1_Cached_RE2/1M 500 5263964 ns/op 199.20 MB/s +Search_Success1_Cached_RE2/2M 200 10793175 ns/op 194.30 MB/s +Search_Success1_Cached_RE2/4M 50 24138120 ns/op 173.76 MB/s +Search_Success1_Cached_RE2/8M 20 58223300 ns/op 144.08 MB/s +Search_Success1_Cached_RE2/16M 5 215741400 ns/op 77.77 MB/s +Search_Digits_PCRE 500000 7534 ns/op +Search_Digits_RE2 50000 44162 ns/op +Parse_Digits_PCRE 200000 7664 ns/op +Parse_Digits_RE2 100000 22595 ns/op +Parse_CachedDigits_PCRE 5000000 721 ns/op +Parse_CachedDigits_RE2 5000000 413 ns/op +Parse_DigitDs_PCRE 500000 7095 ns/op +Parse_DigitDs_RE2 100000 22259 ns/op +Parse_CachedDigitDs_PCRE 5000000 704 ns/op +Parse_CachedDigitDs_RE2 5000000 415 ns/op +Parse_Split_PCRE 500000 5540 ns/op +Parse_Split_RE2 100000 23817 ns/op +Parse_CachedSplit_PCRE 5000000 490 ns/op +Parse_CachedSplit_RE2 10000000 251 ns/op +Parse_SplitHard_PCRE 500000 5410 ns/op +Parse_SplitHard_RE2 100000 28518 ns/op +Parse_CachedSplitHard_PCRE 5000000 488 ns/op +Parse_CachedSplitHard_RE2 1000000 2489 ns/op +Parse_CachedSplitBig1_PCRE 500 7171752 ns/op +Parse_CachedSplitBig1_RE2 2000 990722 ns/op +Parse_CachedSplitBig2_PCRE 5000 658331 ns/op +Parse_CachedSplitBig2_RE2 20 81205250 ns/op +BM_PCRE_Compile 500000 6443 ns/op +BM_RE2_Compile 100000 24103 ns/op +SearchPhone_CachedPCRE/8 1000000 2010 ns/op 3.98 MB/s +SearchPhone_CachedPCRE/16 500000 3286 ns/op 4.87 MB/s +SearchPhone_CachedPCRE/32 500000 5953 ns/op 5.37 MB/s +SearchPhone_CachedPCRE/64 200000 11181 ns/op 5.72 MB/s +SearchPhone_CachedPCRE/128 100000 21634 ns/op 5.92 MB/s 
+SearchPhone_CachedPCRE/256 50000 42315 ns/op 6.05 MB/s +SearchPhone_CachedPCRE/512 20000 83969 ns/op 6.10 MB/s +SearchPhone_CachedPCRE/1K 10000 166005 ns/op 6.17 MB/s +SearchPhone_CachedPCRE/2K 5000 327433 ns/op 6.25 MB/s +SearchPhone_CachedPCRE/4K 5000 654794 ns/op 6.26 MB/s +SearchPhone_CachedPCRE/8K 2000 1302747 ns/op 6.29 MB/s +SearchPhone_CachedPCRE/16K 1000 2601137 ns/op 6.30 MB/s +SearchPhone_CachedPCRE/32K 500 5170166 ns/op 6.34 MB/s +SearchPhone_CachedPCRE/64K 100 10378910 ns/op 6.31 MB/s +SearchPhone_CachedPCRE/128K 100 20783360 ns/op 6.31 MB/s +SearchPhone_CachedPCRE/256K 50 41632940 ns/op 6.30 MB/s +SearchPhone_CachedPCRE/512K 20 83663300 ns/op 6.27 MB/s +SearchPhone_CachedPCRE/1M 10 167093400 ns/op 6.28 MB/s +SearchPhone_CachedPCRE/2M 5 335078800 ns/op 6.26 MB/s +SearchPhone_CachedPCRE/4M 5 673405400 ns/op 6.23 MB/s +SearchPhone_CachedPCRE/8M 1 1335761000 ns/op 6.28 MB/s +SearchPhone_CachedPCRE/16M 1 2682908000 ns/op 6.25 MB/s +SearchPhone_CachedRE2/8 1000000 1470 ns/op 5.44 MB/s +SearchPhone_CachedRE2/16 1000000 1496 ns/op 10.69 MB/s +SearchPhone_CachedRE2/32 1000000 1570 ns/op 20.38 MB/s +SearchPhone_CachedRE2/64 1000000 1770 ns/op 36.15 MB/s +SearchPhone_CachedRE2/128 1000000 2082 ns/op 61.46 MB/s +SearchPhone_CachedRE2/256 1000000 2701 ns/op 94.78 MB/s +SearchPhone_CachedRE2/512 500000 3963 ns/op 129.19 MB/s +SearchPhone_CachedRE2/1K 500000 6487 ns/op 157.85 MB/s +SearchPhone_CachedRE2/2K 200000 11527 ns/op 177.67 MB/s +SearchPhone_CachedRE2/4K 100000 21579 ns/op 189.81 MB/s +SearchPhone_CachedRE2/8K 50000 41804 ns/op 195.96 MB/s +SearchPhone_CachedRE2/16K 20000 82228 ns/op 199.25 MB/s +SearchPhone_CachedRE2/32K 10000 163444 ns/op 200.48 MB/s +SearchPhone_CachedRE2/64K 5000 325307 ns/op 201.46 MB/s +SearchPhone_CachedRE2/128K 5000 648559 ns/op 202.10 MB/s +SearchPhone_CachedRE2/256K 2000 1295574 ns/op 202.34 MB/s +SearchPhone_CachedRE2/512K 1000 2591267 ns/op 202.33 MB/s +SearchPhone_CachedRE2/1M 500 5178738 ns/op 202.48 MB/s 
+SearchPhone_CachedRE2/2M 100 10389680 ns/op 201.85 MB/s +SearchPhone_CachedRE2/4M 100 20851510 ns/op 201.15 MB/s +SearchPhone_CachedRE2/8M 50 41763800 ns/op 200.86 MB/s +SearchPhone_CachedRE2/16M 20 83492800 ns/op 200.94 MB/s +EmptyPartialMatchPCRE 10000000 195 ns/op +EmptyPartialMatchRE2 5000000 497 ns/op +SimplePartialMatchPCRE 10000000 276 ns/op +SimplePartialMatchRE2 5000000 548 ns/op +HTTPPartialMatchPCRE 2000000 826 ns/op +HTTPPartialMatchRE2 2000000 894 ns/op +SmallHTTPPartialMatchPCRE 2000000 825 ns/op +SmallHTTPPartialMatchRE2 2000000 895 ns/op +DotMatchPCRE 2000000 810 ns/op +DotMatchRE2 2000000 976 ns/op +ASCIIMatchPCRE 5000000 604 ns/op +ASCIIMatchRE2 2000000 976 ns/op diff --git a/outside/re2/benchlog/benchlog.r70 b/outside/re2/benchlog/benchlog.r70 new file mode 100644 index 000000000..1e4e86b48 --- /dev/null +++ b/outside/re2/benchlog/benchlog.r70 @@ -0,0 +1,1475 @@ +processor : 0 +vendor_id : AuthenticAMD +cpu family : 15 +model : 65 +model name : Dual-Core AMD Opteron(tm) Processor 8214 HE +stepping : 2 +cpu MHz : 2200.000 +cache size : 1024 KB +physical id : 0 +siblings : 2 +core id : 0 +cpu cores : 2 +fpu : yes +fpu_exception : yes +cpuid level : 1 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt rdtscp lm 3dnowext 3dnow rep_good pni cx16 lahf_lm cmp_legacy svm extapic cr8_legacy +bogomips : 4420.36 +TLB size : 1024 4K pages +clflush size : 64 +cache_alignment : 64 +address sizes : 40 bits physical, 48 bits virtual +power management: ts fid vid ttp tm stc + +MemTotal: 8235352 kB +MemFree: 1083816 kB +Buffers: 476688 kB +Cached: 4809208 kB +SwapCached: 88 kB +Active: 2782140 kB +Inactive: 3716900 kB +SwapTotal: 2097136 kB +SwapFree: 2047340 kB +Dirty: 360 kB +Writeback: 0 kB +AnonPages: 1204244 kB +Mapped: 123916 kB +Slab: 583660 kB +SReclaimable: 505492 kB +SUnreclaim: 78168 kB +PageTables: 13084 kB +NFS_Unstable: 0 kB +Bounce: 0 kB +CommitLimit: 
6214812 kB +Committed_AS: 2371464 kB +VmallocTotal: 34359738367 kB +VmallocUsed: 58520 kB +VmallocChunk: 34359678971 kB + + + +==BENCHMARK== r70.mtv.corp.google.com Fri Feb 26 14:10:56 PST 2010 +# Linux r70.mtv.corp.google.com 2.6.24-gg804007-generic #1 SMP Thu Jan 21 11:28:34 PST 2010 x86_64 GNU/Linux +# g++ (GCC) 4.2.4 (Ubuntu 4.2.4-1ubuntu4) +# Copyright (C) 2007 Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# d7671f473f1a+ tip +# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), for GNU/Linux 2.6.8, dynamically linked (uses shared libs), not stripped + +Search_Easy0_CachedPCRE/8 10000000 149 ns/op 53.57 MB/s +Search_Easy0_CachedPCRE/16 10000000 194 ns/op 82.16 MB/s +Search_Easy0_CachedPCRE/32 5000000 291 ns/op 109.70 MB/s +Search_Easy0_CachedPCRE/64 5000000 452 ns/op 141.37 MB/s +Search_Easy0_CachedPCRE/128 2000000 773 ns/op 165.41 MB/s +Search_Easy0_CachedPCRE/256 500000 2135 ns/op 119.88 MB/s +Search_Easy0_CachedPCRE/512 500000 3674 ns/op 139.33 MB/s +Search_Easy0_CachedPCRE/1K 200000 7051 ns/op 145.21 MB/s +Search_Easy0_CachedPCRE/2K 100000 12536 ns/op 163.37 MB/s +Search_Easy0_CachedPCRE/4K 50000 24447 ns/op 167.54 MB/s +Search_Easy0_CachedPCRE/8K 20000 50421 ns/op 162.47 MB/s +Search_Easy0_CachedPCRE/16K 20000 98404 ns/op 166.50 MB/s +Search_Easy0_CachedPCRE/32K 10000 197440 ns/op 165.96 MB/s +Search_Easy0_CachedPCRE/64K 5000 394161 ns/op 166.27 MB/s +Search_Easy0_CachedPCRE/128K 2000 791340 ns/op 165.63 MB/s +Search_Easy0_CachedPCRE/256K 1000 1577534 ns/op 166.17 MB/s +Search_Easy0_CachedPCRE/512K 500 3165770 ns/op 165.61 MB/s +Search_Easy0_CachedPCRE/1M 200 6435865 ns/op 162.93 MB/s +Search_Easy0_CachedPCRE/2M 100 12895230 ns/op 162.63 MB/s +Search_Easy0_CachedPCRE/4M 50 25771800 ns/op 162.75 MB/s +Search_Easy0_CachedPCRE/8M 20 52665900 ns/op 159.28 MB/s +Search_Easy0_CachedPCRE/16M 
10 104448400 ns/op 160.63 MB/s +Search_Easy0_CachedRE2/8 5000000 332 ns/op 24.08 MB/s +Search_Easy0_CachedRE2/16 5000000 339 ns/op 47.16 MB/s +Search_Easy0_CachedRE2/32 5000000 377 ns/op 84.77 MB/s +Search_Easy0_CachedRE2/64 5000000 388 ns/op 164.54 MB/s +Search_Easy0_CachedRE2/128 5000000 443 ns/op 288.35 MB/s +Search_Easy0_CachedRE2/256 2000000 517 ns/op 495.00 MB/s +Search_Easy0_CachedRE2/512 2000000 674 ns/op 759.60 MB/s +Search_Easy0_CachedRE2/1K 1000000 1192 ns/op 858.67 MB/s +Search_Easy0_CachedRE2/2K 500000 2145 ns/op 954.62 MB/s +Search_Easy0_CachedRE2/4K 500000 3711 ns/op 1103.52 MB/s +Search_Easy0_CachedRE2/8K 200000 7176 ns/op 1141.45 MB/s +Search_Easy0_CachedRE2/16K 100000 13930 ns/op 1176.13 MB/s +Search_Easy0_CachedRE2/32K 50000 28054 ns/op 1168.03 MB/s +Search_Easy0_CachedRE2/64K 20000 55914 ns/op 1172.07 MB/s +Search_Easy0_CachedRE2/128K 10000 119719 ns/op 1094.83 MB/s +Search_Easy0_CachedRE2/256K 5000 238165 ns/op 1100.68 MB/s +Search_Easy0_CachedRE2/512K 5000 480109 ns/op 1092.02 MB/s +Search_Easy0_CachedRE2/1M 1000 1024370 ns/op 1023.63 MB/s +Search_Easy0_CachedRE2/2M 500 2052224 ns/op 1021.89 MB/s +Search_Easy0_CachedRE2/4M 500 4122288 ns/op 1017.47 MB/s +Search_Easy0_CachedRE2/8M 200 8271160 ns/op 1014.20 MB/s +Search_Easy0_CachedRE2/16M 100 16825980 ns/op 997.10 MB/s +Search_Easy1_CachedPCRE/8 10000000 153 ns/op 52.14 MB/s +Search_Easy1_CachedPCRE/16 10000000 194 ns/op 82.27 MB/s +Search_Easy1_CachedPCRE/32 5000000 292 ns/op 109.48 MB/s +Search_Easy1_CachedPCRE/64 5000000 451 ns/op 141.88 MB/s +Search_Easy1_CachedPCRE/128 1000000 1148 ns/op 111.46 MB/s +Search_Easy1_CachedPCRE/256 1000000 1857 ns/op 137.80 MB/s +Search_Easy1_CachedPCRE/512 500000 4148 ns/op 123.43 MB/s +Search_Easy1_CachedPCRE/1K 200000 7516 ns/op 136.23 MB/s +Search_Easy1_CachedPCRE/2K 100000 14053 ns/op 145.73 MB/s +Search_Easy1_CachedPCRE/4K 50000 26487 ns/op 154.64 MB/s +Search_Easy1_CachedPCRE/8K 20000 52324 ns/op 156.56 MB/s +Search_Easy1_CachedPCRE/16K 10000 101153 
ns/op 161.97 MB/s +Search_Easy1_CachedPCRE/32K 5000 202395 ns/op 161.90 MB/s +Search_Easy1_CachedPCRE/64K 5000 403530 ns/op 162.41 MB/s +Search_Easy1_CachedPCRE/128K 2000 817517 ns/op 160.33 MB/s +Search_Easy1_CachedPCRE/256K 1000 1628277 ns/op 160.99 MB/s +Search_Easy1_CachedPCRE/512K 500 3252172 ns/op 161.21 MB/s +Search_Easy1_CachedPCRE/1M 200 6555365 ns/op 159.96 MB/s +Search_Easy1_CachedPCRE/2M 100 13116580 ns/op 159.89 MB/s +Search_Easy1_CachedPCRE/4M 50 26249100 ns/op 159.79 MB/s +Search_Easy1_CachedPCRE/8M 20 52633400 ns/op 159.38 MB/s +Search_Easy1_CachedPCRE/16M 10 105218400 ns/op 159.45 MB/s +Search_Easy1_CachedRE2/8 5000000 340 ns/op 23.49 MB/s +Search_Easy1_CachedRE2/16 5000000 341 ns/op 46.81 MB/s +Search_Easy1_CachedRE2/32 5000000 380 ns/op 84.01 MB/s +Search_Easy1_CachedRE2/64 5000000 395 ns/op 161.89 MB/s +Search_Easy1_CachedRE2/128 5000000 465 ns/op 275.05 MB/s +Search_Easy1_CachedRE2/256 2000000 512 ns/op 499.90 MB/s +Search_Easy1_CachedRE2/512 2000000 678 ns/op 754.90 MB/s +Search_Easy1_CachedRE2/1K 1000000 1194 ns/op 857.60 MB/s +Search_Easy1_CachedRE2/2K 500000 2163 ns/op 946.49 MB/s +Search_Easy1_CachedRE2/4K 500000 3722 ns/op 1100.32 MB/s +Search_Easy1_CachedRE2/8K 200000 7134 ns/op 1148.27 MB/s +Search_Easy1_CachedRE2/16K 100000 14008 ns/op 1169.56 MB/s +Search_Easy1_CachedRE2/32K 50000 28535 ns/op 1148.33 MB/s +Search_Easy1_CachedRE2/64K 20000 57155 ns/op 1146.64 MB/s +Search_Easy1_CachedRE2/128K 10000 119610 ns/op 1095.82 MB/s +Search_Easy1_CachedRE2/256K 5000 238525 ns/op 1099.02 MB/s +Search_Easy1_CachedRE2/512K 5000 480327 ns/op 1091.52 MB/s +Search_Easy1_CachedRE2/1M 1000 1026046 ns/op 1021.96 MB/s +Search_Easy1_CachedRE2/2M 500 2035202 ns/op 1030.44 MB/s +Search_Easy1_CachedRE2/4M 500 4095944 ns/op 1024.01 MB/s +Search_Easy1_CachedRE2/8M 200 8295200 ns/op 1011.26 MB/s +Search_Easy1_CachedRE2/16M 100 17081710 ns/op 982.17 MB/s +Search_Medium_CachedPCRE/8 10000000 161 ns/op 49.55 MB/s +Search_Medium_CachedPCRE/16 5000000 212 ns/op 
75.29 MB/s +Search_Medium_CachedPCRE/32 5000000 290 ns/op 110.22 MB/s +Search_Medium_CachedPCRE/64 5000000 450 ns/op 142.01 MB/s +Search_Medium_CachedPCRE/128 2000000 771 ns/op 165.99 MB/s +Search_Medium_CachedPCRE/256 100000 18958 ns/op 13.50 MB/s +Search_Medium_CachedPCRE/512 50000 44112 ns/op 11.61 MB/s +Search_Medium_CachedPCRE/1K 20000 87173 ns/op 11.75 MB/s +Search_Medium_CachedPCRE/2K 10000 129587 ns/op 15.80 MB/s +Search_Medium_CachedPCRE/4K 5000 321362 ns/op 12.75 MB/s +Search_Medium_CachedPCRE/8K 2000 694721 ns/op 11.79 MB/s +Search_Medium_CachedPCRE/16K 1000 1480844 ns/op 11.06 MB/s +Search_Medium_CachedPCRE/32K 500 3018562 ns/op 10.86 MB/s +Search_Medium_CachedPCRE/64K 200 6037290 ns/op 10.86 MB/s +Search_Medium_CachedPCRE/128K 100 12019360 ns/op 10.91 MB/s +Search_Medium_CachedPCRE/256K 50 23983440 ns/op 10.93 MB/s +Search_Medium_CachedRE2/8 5000000 335 ns/op 23.86 MB/s +Search_Medium_CachedRE2/16 5000000 391 ns/op 40.87 MB/s +Search_Medium_CachedRE2/32 5000000 496 ns/op 64.45 MB/s +Search_Medium_CachedRE2/64 2000000 723 ns/op 88.46 MB/s +Search_Medium_CachedRE2/128 1000000 1154 ns/op 110.89 MB/s +Search_Medium_CachedRE2/256 500000 2027 ns/op 126.29 MB/s +Search_Medium_CachedRE2/512 500000 3773 ns/op 135.68 MB/s +Search_Medium_CachedRE2/1K 200000 7258 ns/op 141.08 MB/s +Search_Medium_CachedRE2/2K 100000 14262 ns/op 143.59 MB/s +Search_Medium_CachedRE2/4K 50000 28179 ns/op 145.35 MB/s +Search_Medium_CachedRE2/8K 20000 56070 ns/op 146.10 MB/s +Search_Medium_CachedRE2/16K 10000 111844 ns/op 146.49 MB/s +Search_Medium_CachedRE2/32K 5000 224068 ns/op 146.24 MB/s +Search_Medium_CachedRE2/64K 5000 447358 ns/op 146.50 MB/s +Search_Medium_CachedRE2/128K 2000 901733 ns/op 145.36 MB/s +Search_Medium_CachedRE2/256K 1000 1805851 ns/op 145.16 MB/s +Search_Medium_CachedRE2/512K 500 3612816 ns/op 145.12 MB/s +Search_Medium_CachedRE2/1M 200 7351105 ns/op 142.64 MB/s +Search_Medium_CachedRE2/2M 100 14694290 ns/op 142.72 MB/s +Search_Medium_CachedRE2/4M 50 29395260 ns/op 
142.69 MB/s +Search_Medium_CachedRE2/8M 20 58088750 ns/op 144.41 MB/s +Search_Medium_CachedRE2/16M 10 116312400 ns/op 144.24 MB/s +Search_Hard_CachedPCRE/8 10000000 162 ns/op 49.10 MB/s +Search_Hard_CachedPCRE/16 5000000 209 ns/op 76.28 MB/s +Search_Hard_CachedPCRE/32 5000000 289 ns/op 110.69 MB/s +Search_Hard_CachedPCRE/64 5000000 449 ns/op 142.33 MB/s +Search_Hard_CachedPCRE/128 2000000 769 ns/op 166.34 MB/s +Search_Hard_CachedPCRE/256 1000 1243528 ns/op 0.21 MB/s +Search_Hard_CachedPCRE/512 200 5089915 ns/op 0.10 MB/s +Search_Hard_CachedPCRE/1K 50 20228240 ns/op 0.05 MB/s +Search_Hard_CachedPCRE/2K 20 74096950 ns/op 0.03 MB/s +Search_Hard_CachedPCRE/4K 5 318803000 ns/op 0.01 MB/s +Search_Hard_CachedRE2/8 5000000 332 ns/op 24.03 MB/s +Search_Hard_CachedRE2/16 5000000 385 ns/op 41.47 MB/s +Search_Hard_CachedRE2/32 5000000 497 ns/op 64.38 MB/s +Search_Hard_CachedRE2/64 2000000 716 ns/op 89.38 MB/s +Search_Hard_CachedRE2/128 1000000 1146 ns/op 111.66 MB/s +Search_Hard_CachedRE2/256 500000 2017 ns/op 126.92 MB/s +Search_Hard_CachedRE2/512 500000 3765 ns/op 135.98 MB/s +Search_Hard_CachedRE2/1K 200000 7257 ns/op 141.09 MB/s +Search_Hard_CachedRE2/2K 100000 14209 ns/op 144.13 MB/s +Search_Hard_CachedRE2/4K 50000 28224 ns/op 145.12 MB/s +Search_Hard_CachedRE2/8K 20000 56015 ns/op 146.25 MB/s +Search_Hard_CachedRE2/16K 10000 112066 ns/op 146.20 MB/s +Search_Hard_CachedRE2/32K 5000 223212 ns/op 146.80 MB/s +Search_Hard_CachedRE2/64K 5000 447573 ns/op 146.43 MB/s +Search_Hard_CachedRE2/128K 2000 900290 ns/op 145.59 MB/s +Search_Hard_CachedRE2/256K 1000 1803864 ns/op 145.32 MB/s +Search_Hard_CachedRE2/512K 500 3608078 ns/op 145.31 MB/s +Search_Hard_CachedRE2/1M 200 7270210 ns/op 144.23 MB/s +Search_Hard_CachedRE2/2M 100 14554490 ns/op 144.09 MB/s +Search_Hard_CachedRE2/4M 50 29162380 ns/op 143.83 MB/s +Search_Hard_CachedRE2/8M 20 58978900 ns/op 142.23 MB/s +Search_Hard_CachedRE2/16M 10 116714000 ns/op 143.75 MB/s +Search_Parens_CachedPCRE/8 5000000 251 ns/op 31.75 MB/s 
+Search_Parens_CachedRE2/8 5000000 328 ns/op 24.34 MB/s +Search_Parens_CachedRE2/16 5000000 382 ns/op 41.82 MB/s +Search_Parens_CachedRE2/32 5000000 495 ns/op 64.62 MB/s +Search_Parens_CachedRE2/64 2000000 695 ns/op 92.04 MB/s +Search_Parens_CachedRE2/128 1000000 1107 ns/op 115.62 MB/s +Search_Parens_CachedRE2/256 1000000 2021 ns/op 126.63 MB/s +Search_Parens_CachedRE2/512 500000 3768 ns/op 135.88 MB/s +Search_Parens_CachedRE2/1K 200000 7242 ns/op 141.39 MB/s +Search_Parens_CachedRE2/2K 100000 14241 ns/op 143.81 MB/s +Search_Parens_CachedRE2/4K 50000 28148 ns/op 145.52 MB/s +Search_Parens_CachedRE2/8K 20000 56228 ns/op 145.69 MB/s +Search_Parens_CachedRE2/16K 10000 111761 ns/op 146.60 MB/s +Search_Parens_CachedRE2/32K 5000 223183 ns/op 146.82 MB/s +Search_Parens_CachedRE2/64K 5000 447285 ns/op 146.52 MB/s +Search_Parens_CachedRE2/128K 2000 902123 ns/op 145.29 MB/s +Search_Parens_CachedRE2/256K 1000 1803973 ns/op 145.31 MB/s +Search_Parens_CachedRE2/512K 500 3642388 ns/op 143.94 MB/s +Search_Parens_CachedRE2/1M 200 7339060 ns/op 142.88 MB/s +Search_Parens_CachedRE2/2M 100 14671260 ns/op 142.94 MB/s +Search_Parens_CachedRE2/4M 50 29267200 ns/op 143.31 MB/s +Search_Parens_CachedRE2/8M 20 58361500 ns/op 143.74 MB/s +Search_Parens_CachedRE2/16M 10 116252000 ns/op 144.32 MB/s +Search_BigFixed_CachedPCRE/8 5000000 400 ns/op 19.96 MB/s +Search_BigFixed_CachedPCRE/16 2000000 506 ns/op 31.61 MB/s +Search_BigFixed_CachedPCRE/32 2000000 697 ns/op 45.89 MB/s +Search_BigFixed_CachedPCRE/64 1000000 1069 ns/op 59.84 MB/s +Search_BigFixed_CachedPCRE/128 1000000 1812 ns/op 70.62 MB/s +Search_BigFixed_CachedPCRE/256 500000 3311 ns/op 77.31 MB/s +Search_BigFixed_CachedPCRE/512 200000 6284 ns/op 81.48 MB/s +Search_BigFixed_CachedPCRE/1K 100000 12249 ns/op 83.60 MB/s +Search_BigFixed_CachedPCRE/2K 50000 24210 ns/op 84.59 MB/s +Search_BigFixed_CachedPCRE/4K 50000 48501 ns/op 84.45 MB/s +Search_BigFixed_CachedPCRE/8K 20000 95883 ns/op 85.44 MB/s +Search_BigFixed_CachedPCRE/16K 10000 
191855 ns/op 85.40 MB/s +Search_BigFixed_CachedPCRE/32K 5000 384026 ns/op 85.33 MB/s +Search_BigFixed_CachedRE2/8 10000000 174 ns/op 45.91 MB/s +Search_BigFixed_CachedRE2/16 5000000 357 ns/op 44.76 MB/s +==BENCHMARK== r70.mtv.corp.google.com Fri Feb 26 14:19:30 PST 2010 +# Linux r70.mtv.corp.google.com 2.6.24-gg804007-generic #1 SMP Thu Jan 21 11:28:34 PST 2010 x86_64 GNU/Linux +# g++ (GCC) 4.2.4 (Ubuntu 4.2.4-1ubuntu4) +# Copyright (C) 2007 Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# d7671f473f1a+ tip +# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), for GNU/Linux 2.6.8, dynamically linked (uses shared libs), not stripped + +==BENCHMARK== r70.mtv.corp.google.com Fri Feb 26 14:19:39 PST 2010 +# Linux r70.mtv.corp.google.com 2.6.24-gg804007-generic #1 SMP Thu Jan 21 11:28:34 PST 2010 x86_64 GNU/Linux +# g++ (GCC) 4.2.4 (Ubuntu 4.2.4-1ubuntu4) +# Copyright (C) 2007 Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# +# d7671f473f1a+ tip +# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), for GNU/Linux 2.6.8, dynamically linked (uses shared libs), not stripped + +Search_Easy0_CachedPCRE/8 10000000 193 ns/op 41.33 MB/s +Search_Easy0_CachedPCRE/16 5000000 233 ns/op 68.49 MB/s +Search_Easy0_CachedPCRE/32 5000000 325 ns/op 98.43 MB/s +Search_Easy0_CachedPCRE/64 5000000 471 ns/op 135.63 MB/s +Search_Easy0_CachedPCRE/128 2000000 763 ns/op 167.55 MB/s +Search_Easy0_CachedPCRE/256 500000 2056 ns/op 124.48 MB/s +Search_Easy0_CachedPCRE/512 500000 3584 ns/op 142.84 MB/s +Search_Easy0_CachedPCRE/1K 200000 7100 ns/op 144.21 MB/s +Search_Easy0_CachedPCRE/2K 100000 12051 ns/op 169.93 MB/s +Search_Easy0_CachedPCRE/4K 50000 24050 ns/op 170.31 MB/s +Search_Easy0_CachedPCRE/8K 50000 49373 ns/op 165.92 MB/s +Search_Easy0_CachedPCRE/16K 20000 96773 ns/op 169.30 MB/s +Search_Easy0_CachedPCRE/32K 10000 194165 ns/op 168.76 MB/s +Search_Easy0_CachedPCRE/64K 5000 386741 ns/op 169.46 MB/s +Search_Easy0_CachedPCRE/128K 2000 777607 ns/op 168.56 MB/s +Search_Easy0_CachedPCRE/256K 1000 1552988 ns/op 168.80 MB/s +Search_Easy0_CachedPCRE/512K 500 3110474 ns/op 168.56 MB/s +Search_Easy0_CachedPCRE/1M 200 6364800 ns/op 164.75 MB/s +Search_Easy0_CachedPCRE/2M 100 12764160 ns/op 164.30 MB/s +Search_Easy0_CachedPCRE/4M 50 25694680 ns/op 163.24 MB/s +Search_Easy0_CachedPCRE/8M 20 51243350 ns/op 163.70 MB/s +Search_Easy0_CachedPCRE/16M 10 102468300 ns/op 163.73 MB/s +Search_Easy0_CachedRE2/8 5000000 316 ns/op 25.29 MB/s +Search_Easy0_CachedRE2/16 5000000 325 ns/op 49.12 MB/s +Search_Easy0_CachedRE2/32 5000000 371 ns/op 86.24 MB/s +Search_Easy0_CachedRE2/64 5000000 369 ns/op 173.23 MB/s +Search_Easy0_CachedRE2/128 5000000 451 ns/op 283.59 MB/s +Search_Easy0_CachedRE2/256 2000000 562 ns/op 455.33 MB/s +Search_Easy0_CachedRE2/512 2000000 744 ns/op 687.58 MB/s +Search_Easy0_CachedRE2/1K 1000000 1306 ns/op 783.89 MB/s +Search_Easy0_CachedRE2/2K 500000 2240 ns/op 913.97 MB/s 
+Search_Easy0_CachedRE2/4K 500000 3941 ns/op 1039.23 MB/s +Search_Easy0_CachedRE2/8K 200000 7648 ns/op 1071.01 MB/s +Search_Easy0_CachedRE2/16K 100000 14759 ns/op 1110.05 MB/s +Search_Easy0_CachedRE2/32K 50000 30302 ns/op 1081.38 MB/s +Search_Easy0_CachedRE2/64K 20000 60931 ns/op 1075.56 MB/s +Search_Easy0_CachedRE2/128K 10000 127313 ns/op 1029.52 MB/s +Search_Easy0_CachedRE2/256K 5000 254134 ns/op 1031.52 MB/s +Search_Easy0_CachedRE2/512K 5000 491892 ns/op 1065.86 MB/s +Search_Easy0_CachedRE2/1M 1000 1133898 ns/op 924.75 MB/s +Search_Easy0_CachedRE2/2M 500 2308796 ns/op 908.33 MB/s +Search_Easy0_CachedRE2/4M 500 4548904 ns/op 922.05 MB/s +Search_Easy0_CachedRE2/8M 200 9024270 ns/op 929.56 MB/s +Search_Easy0_CachedRE2/16M 100 18438590 ns/op 909.90 MB/s +Search_Easy1_CachedPCRE/8 10000000 191 ns/op 41.68 MB/s +Search_Easy1_CachedPCRE/16 5000000 229 ns/op 69.63 MB/s +Search_Easy1_CachedPCRE/32 5000000 324 ns/op 98.73 MB/s +Search_Easy1_CachedPCRE/64 5000000 470 ns/op 136.08 MB/s +Search_Easy1_CachedPCRE/128 1000000 1186 ns/op 107.88 MB/s +Search_Easy1_CachedPCRE/256 1000000 1773 ns/op 144.34 MB/s +Search_Easy1_CachedPCRE/512 500000 3948 ns/op 129.66 MB/s +Search_Easy1_CachedPCRE/1K 200000 7218 ns/op 141.85 MB/s +Search_Easy1_CachedPCRE/2K 100000 13609 ns/op 150.49 MB/s +Search_Easy1_CachedPCRE/4K 50000 25314 ns/op 161.80 MB/s +Search_Easy1_CachedPCRE/8K 20000 50313 ns/op 162.82 MB/s +Search_Easy1_CachedPCRE/16K 20000 98301 ns/op 166.67 MB/s +Search_Easy1_CachedPCRE/32K 10000 197269 ns/op 166.11 MB/s +Search_Easy1_CachedPCRE/64K 5000 392344 ns/op 167.04 MB/s +Search_Easy1_CachedPCRE/128K 2000 789037 ns/op 166.12 MB/s +Search_Easy1_CachedPCRE/256K 1000 1572839 ns/op 166.67 MB/s +Search_Easy1_CachedPCRE/512K 500 3152628 ns/op 166.30 MB/s +Search_Easy1_CachedPCRE/1M 200 6467335 ns/op 162.13 MB/s +Search_Easy1_CachedPCRE/2M 100 12945310 ns/op 162.00 MB/s +Search_Easy1_CachedPCRE/4M 50 26107960 ns/op 160.65 MB/s +Search_Easy1_CachedPCRE/8M 20 52019700 ns/op 161.26 MB/s 
+Search_Easy1_CachedPCRE/16M 10 103979700 ns/op 161.35 MB/s +Search_Easy1_CachedRE2/8 5000000 320 ns/op 24.99 MB/s +Search_Easy1_CachedRE2/16 5000000 325 ns/op 49.14 MB/s +Search_Easy1_CachedRE2/32 5000000 362 ns/op 88.30 MB/s +Search_Easy1_CachedRE2/64 5000000 379 ns/op 168.55 MB/s +Search_Easy1_CachedRE2/128 5000000 435 ns/op 294.25 MB/s +Search_Easy1_CachedRE2/256 2000000 530 ns/op 482.85 MB/s +Search_Easy1_CachedRE2/512 2000000 721 ns/op 710.03 MB/s +Search_Easy1_CachedRE2/1K 1000000 1290 ns/op 793.55 MB/s +Search_Easy1_CachedRE2/2K 500000 2223 ns/op 921.20 MB/s +Search_Easy1_CachedRE2/4K 500000 3936 ns/op 1040.51 MB/s +Search_Easy1_CachedRE2/8K 200000 7628 ns/op 1073.85 MB/s +Search_Easy1_CachedRE2/16K 100000 14704 ns/op 1114.25 MB/s +Search_Easy1_CachedRE2/32K 50000 30338 ns/op 1080.08 MB/s +Search_Easy1_CachedRE2/64K 20000 60907 ns/op 1075.99 MB/s +Search_Easy1_CachedRE2/128K 10000 127232 ns/op 1030.17 MB/s +Search_Easy1_CachedRE2/256K 5000 253878 ns/op 1032.56 MB/s +Search_Easy1_CachedRE2/512K 2000 510752 ns/op 1026.50 MB/s +Search_Easy1_CachedRE2/1M 1000 1132564 ns/op 925.84 MB/s +Search_Easy1_CachedRE2/2M 500 2265660 ns/op 925.63 MB/s +Search_Easy1_CachedRE2/4M 500 4524512 ns/op 927.02 MB/s +Search_Easy1_CachedRE2/8M 200 9113050 ns/op 920.50 MB/s +Search_Easy1_CachedRE2/16M 100 18149050 ns/op 924.41 MB/s +Search_Medium_CachedPCRE/8 10000000 196 ns/op 40.61 MB/s +Search_Medium_CachedPCRE/16 5000000 239 ns/op 66.94 MB/s +Search_Medium_CachedPCRE/32 5000000 311 ns/op 102.58 MB/s +Search_Medium_CachedPCRE/64 5000000 458 ns/op 139.44 MB/s +Search_Medium_CachedPCRE/128 2000000 747 ns/op 171.17 MB/s +Search_Medium_CachedPCRE/256 100000 16750 ns/op 15.28 MB/s +Search_Medium_CachedPCRE/512 50000 39824 ns/op 12.86 MB/s +Search_Medium_CachedPCRE/1K 20000 78534 ns/op 13.04 MB/s +Search_Medium_CachedPCRE/2K 10000 116649 ns/op 17.56 MB/s +Search_Medium_CachedPCRE/4K 5000 289351 ns/op 14.16 MB/s +Search_Medium_CachedPCRE/8K 2000 624232 ns/op 13.12 MB/s 
+Search_Medium_CachedPCRE/16K 1000 1273353 ns/op 12.87 MB/s +Search_Medium_CachedPCRE/32K 500 2547042 ns/op 12.87 MB/s +Search_Medium_CachedPCRE/64K 200 5087635 ns/op 12.88 MB/s +Search_Medium_CachedPCRE/128K 100 10218440 ns/op 12.83 MB/s +Search_Medium_CachedPCRE/256K 50 20359020 ns/op 12.88 MB/s +Search_Medium_CachedRE2/8 5000000 335 ns/op 23.85 MB/s +Search_Medium_CachedRE2/16 5000000 387 ns/op 41.31 MB/s +Search_Medium_CachedRE2/32 5000000 497 ns/op 64.31 MB/s +Search_Medium_CachedRE2/64 2000000 698 ns/op 91.65 MB/s +Search_Medium_CachedRE2/128 1000000 1116 ns/op 114.69 MB/s +Search_Medium_CachedRE2/256 1000000 1941 ns/op 131.87 MB/s +Search_Medium_CachedRE2/512 500000 3610 ns/op 141.81 MB/s +Search_Medium_CachedRE2/1K 200000 6924 ns/op 147.87 MB/s +Search_Medium_CachedRE2/2K 100000 13593 ns/op 150.66 MB/s +Search_Medium_CachedRE2/4K 50000 26821 ns/op 152.71 MB/s +Search_Medium_CachedRE2/8K 20000 53355 ns/op 153.54 MB/s +Search_Medium_CachedRE2/16K 10000 106541 ns/op 153.78 MB/s +Search_Medium_CachedRE2/32K 5000 213334 ns/op 153.60 MB/s +Search_Medium_CachedRE2/64K 5000 425884 ns/op 153.88 MB/s +Search_Medium_CachedRE2/128K 2000 861612 ns/op 152.12 MB/s +Search_Medium_CachedRE2/256K 1000 1721932 ns/op 152.24 MB/s +Search_Medium_CachedRE2/512K 500 3436138 ns/op 152.58 MB/s +Search_Medium_CachedRE2/1M 200 6959260 ns/op 150.67 MB/s +Search_Medium_CachedRE2/2M 100 13991020 ns/op 149.89 MB/s +Search_Medium_CachedRE2/4M 50 27927180 ns/op 150.19 MB/s +Search_Medium_CachedRE2/8M 20 56069500 ns/op 149.61 MB/s +Search_Medium_CachedRE2/16M 10 112054200 ns/op 149.72 MB/s +Search_Hard_CachedPCRE/8 10000000 198 ns/op 40.39 MB/s +Search_Hard_CachedPCRE/16 5000000 240 ns/op 66.51 MB/s +Search_Hard_CachedPCRE/32 5000000 312 ns/op 102.38 MB/s +Search_Hard_CachedPCRE/64 5000000 459 ns/op 139.25 MB/s +Search_Hard_CachedPCRE/128 2000000 749 ns/op 170.80 MB/s +Search_Hard_CachedPCRE/256 1000 1035026 ns/op 0.25 MB/s +Search_Hard_CachedPCRE/512 500 4247092 ns/op 0.12 MB/s 
+Search_Hard_CachedPCRE/1K 100 16874720 ns/op 0.06 MB/s +Search_Hard_CachedPCRE/2K 20 61595100 ns/op 0.03 MB/s +Search_Hard_CachedPCRE/4K 5 266182000 ns/op 0.02 MB/s +Search_Hard_CachedRE2/8 5000000 332 ns/op 24.04 MB/s +Search_Hard_CachedRE2/16 5000000 389 ns/op 41.05 MB/s +Search_Hard_CachedRE2/32 5000000 498 ns/op 64.25 MB/s +Search_Hard_CachedRE2/64 2000000 695 ns/op 91.97 MB/s +Search_Hard_CachedRE2/128 1000000 1109 ns/op 115.34 MB/s +Search_Hard_CachedRE2/256 1000000 1944 ns/op 131.66 MB/s +Search_Hard_CachedRE2/512 500000 3603 ns/op 142.09 MB/s +Search_Hard_CachedRE2/1K 200000 6910 ns/op 148.17 MB/s +Search_Hard_CachedRE2/2K 100000 13584 ns/op 150.76 MB/s +Search_Hard_CachedRE2/4K 50000 26804 ns/op 152.81 MB/s +Search_Hard_CachedRE2/8K 20000 53574 ns/op 152.91 MB/s +Search_Hard_CachedRE2/16K 10000 106335 ns/op 154.08 MB/s +Search_Hard_CachedRE2/32K 5000 212875 ns/op 153.93 MB/s +Search_Hard_CachedRE2/64K 5000 426419 ns/op 153.69 MB/s +Search_Hard_CachedRE2/128K 2000 860044 ns/op 152.40 MB/s +Search_Hard_CachedRE2/256K 1000 1717290 ns/op 152.65 MB/s +Search_Hard_CachedRE2/512K 500 3443368 ns/op 152.26 MB/s +Search_Hard_CachedRE2/1M 200 6974745 ns/op 150.34 MB/s +Search_Hard_CachedRE2/2M 100 13946120 ns/op 150.38 MB/s +Search_Hard_CachedRE2/4M 50 27953660 ns/op 150.04 MB/s +Search_Hard_CachedRE2/8M 20 55889600 ns/op 150.09 MB/s +Search_Hard_CachedRE2/16M 10 111632200 ns/op 150.29 MB/s +Search_Parens_CachedPCRE/8 5000000 306 ns/op 26.06 MB/s +Search_Parens_CachedRE2/8 5000000 330 ns/op 24.18 MB/s +Search_Parens_CachedRE2/16 5000000 384 ns/op 41.64 MB/s +Search_Parens_CachedRE2/32 5000000 493 ns/op 64.80 MB/s +Search_Parens_CachedRE2/64 2000000 713 ns/op 89.69 MB/s +Search_Parens_CachedRE2/128 1000000 1148 ns/op 111.47 MB/s +Search_Parens_CachedRE2/256 500000 2027 ns/op 126.29 MB/s +Search_Parens_CachedRE2/512 500000 3767 ns/op 135.91 MB/s +Search_Parens_CachedRE2/1K 200000 7264 ns/op 140.96 MB/s +Search_Parens_CachedRE2/2K 100000 14217 ns/op 144.05 MB/s 
+Search_Parens_CachedRE2/4K 50000 28234 ns/op 145.07 MB/s +Search_Parens_CachedRE2/8K 20000 56090 ns/op 146.05 MB/s +Search_Parens_CachedRE2/16K 10000 112201 ns/op 146.02 MB/s +Search_Parens_CachedRE2/32K 5000 223654 ns/op 146.51 MB/s +Search_Parens_CachedRE2/64K 5000 448713 ns/op 146.05 MB/s +Search_Parens_CachedRE2/128K 2000 903401 ns/op 145.09 MB/s +Search_Parens_CachedRE2/256K 1000 1801568 ns/op 145.51 MB/s +Search_Parens_CachedRE2/512K 500 3611400 ns/op 145.18 MB/s +Search_Parens_CachedRE2/1M 200 7303355 ns/op 143.57 MB/s +Search_Parens_CachedRE2/2M 100 14659380 ns/op 143.06 MB/s +Search_Parens_CachedRE2/4M 50 29371720 ns/op 142.80 MB/s +Search_Parens_CachedRE2/8M 20 58387300 ns/op 143.67 MB/s +Search_Parens_CachedRE2/16M 10 116634700 ns/op 143.84 MB/s +Search_BigFixed_CachedPCRE/8 5000000 384 ns/op 20.82 MB/s +Search_BigFixed_CachedPCRE/16 5000000 476 ns/op 33.60 MB/s +Search_BigFixed_CachedPCRE/32 2000000 641 ns/op 49.87 MB/s +Search_BigFixed_CachedPCRE/64 2000000 969 ns/op 66.03 MB/s +Search_BigFixed_CachedPCRE/128 1000000 1619 ns/op 79.03 MB/s +Search_BigFixed_CachedPCRE/256 500000 2934 ns/op 87.23 MB/s +Search_BigFixed_CachedPCRE/512 200000 5548 ns/op 92.28 MB/s +Search_BigFixed_CachedPCRE/1K 100000 10777 ns/op 95.01 MB/s +Search_BigFixed_CachedPCRE/2K 50000 21301 ns/op 96.14 MB/s +Search_BigFixed_CachedPCRE/4K 50000 42253 ns/op 96.94 MB/s +Search_BigFixed_CachedPCRE/8K 20000 84014 ns/op 97.51 MB/s +Search_BigFixed_CachedPCRE/16K 10000 168135 ns/op 97.45 MB/s +Search_BigFixed_CachedPCRE/32K 5000 336352 ns/op 97.42 MB/s +Search_BigFixed_CachedRE2/8 10000000 173 ns/op 46.12 MB/s +Search_BigFixed_CachedRE2/16 5000000 361 ns/op 44.31 MB/s +Search_BigFixed_CachedRE2/32 5000000 428 ns/op 74.67 MB/s +Search_BigFixed_CachedRE2/64 2000000 542 ns/op 117.92 MB/s +Search_BigFixed_CachedRE2/128 2000000 804 ns/op 159.17 MB/s +Search_BigFixed_CachedRE2/256 1000000 1306 ns/op 195.91 MB/s +Search_BigFixed_CachedRE2/512 500000 2303 ns/op 222.23 MB/s 
+Search_BigFixed_CachedRE2/1K 500000 4140 ns/op 247.33 MB/s +Search_BigFixed_CachedRE2/2K 200000 8305 ns/op 246.59 MB/s +Search_BigFixed_CachedRE2/4K 100000 16335 ns/op 250.74 MB/s +Search_BigFixed_CachedRE2/8K 50000 32402 ns/op 252.82 MB/s +Search_BigFixed_CachedRE2/16K 20000 61654 ns/op 265.74 MB/s +Search_BigFixed_CachedRE2/32K 10000 123161 ns/op 266.06 MB/s +Search_BigFixed_CachedRE2/64K 5000 250635 ns/op 261.48 MB/s +Search_BigFixed_CachedRE2/128K 2000 501539 ns/op 261.34 MB/s +Search_BigFixed_CachedRE2/256K 1000 1029773 ns/op 254.56 MB/s +Search_BigFixed_CachedRE2/512K 500 2088812 ns/op 251.00 MB/s +Search_BigFixed_CachedRE2/1M 500 4367148 ns/op 240.11 MB/s +Search_Success_PCRE/8 500000 3069 ns/op 2.61 MB/s +Search_Success_PCRE/16 500000 3145 ns/op 5.09 MB/s +Search_Success_PCRE/32 500000 3288 ns/op 9.73 MB/s +Search_Success_PCRE/64 500000 3564 ns/op 17.96 MB/s +Search_Success_PCRE/128 500000 4104 ns/op 31.18 MB/s +Search_Success_PCRE/256 200000 5214 ns/op 49.09 MB/s +Search_Success_PCRE/512 200000 7414 ns/op 69.05 MB/s +Search_Success_PCRE/1K 100000 11867 ns/op 86.29 MB/s +Search_Success_PCRE/2K 50000 20669 ns/op 99.08 MB/s +Search_Success_PCRE/4K 50000 38338 ns/op 106.84 MB/s +Search_Success_PCRE/8K 20000 73632 ns/op 111.26 MB/s +Search_Success_PCRE/16K 10000 144640 ns/op 113.27 MB/s +Search_Success_PCRE/32K 5000 286497 ns/op 114.37 MB/s +Search_Success_PCRE/64K 2000 571622 ns/op 114.65 MB/s +Search_Success_PCRE/128K 1000 1141585 ns/op 114.82 MB/s +Search_Success_PCRE/256K 500 2297252 ns/op 114.11 MB/s +Search_Success_PCRE/512K 500 4580748 ns/op 114.45 MB/s +Search_Success_PCRE/1M 200 9388870 ns/op 111.68 MB/s +Search_Success_PCRE/2M 100 19154170 ns/op 109.49 MB/s +Search_Success_PCRE/4M 50 39603180 ns/op 105.91 MB/s +Search_Success_PCRE/8M 10 100235900 ns/op 83.69 MB/s +Search_Success_PCRE/16M 5 249216000 ns/op 67.32 MB/s +Search_Success_RE2/8 100000 10763 ns/op 0.74 MB/s +Search_Success_RE2/16 50000 24745 ns/op 0.65 MB/s +Search_Success_RE2/32 50000 24874 
ns/op 1.29 MB/s +Search_Success_RE2/64 50000 25512 ns/op 2.51 MB/s +Search_Success_RE2/128 50000 25781 ns/op 4.96 MB/s +Search_Success_RE2/256 50000 26515 ns/op 9.65 MB/s +Search_Success_RE2/512 50000 28061 ns/op 18.25 MB/s +Search_Success_RE2/1K 50000 31719 ns/op 32.28 MB/s +Search_Success_RE2/2K 50000 38644 ns/op 53.00 MB/s +Search_Success_RE2/4K 20000 52593 ns/op 77.88 MB/s +Search_Success_RE2/8K 20000 80472 ns/op 101.80 MB/s +Search_Success_RE2/16K 10000 136735 ns/op 119.82 MB/s +Search_Success_RE2/32K 5000 248227 ns/op 132.01 MB/s +Search_Success_RE2/64K 5000 474621 ns/op 138.08 MB/s +Search_Success_RE2/128K 2000 926762 ns/op 141.43 MB/s +Search_Success_RE2/256K 1000 1834769 ns/op 142.88 MB/s +Search_Success_RE2/512K 500 3659356 ns/op 143.27 MB/s +Search_Success_RE2/1M 200 7482580 ns/op 140.14 MB/s +Search_Success_RE2/2M 100 15275510 ns/op 137.29 MB/s +Search_Success_RE2/4M 50 32164720 ns/op 130.40 MB/s +Search_Success_RE2/8M 20 71208250 ns/op 117.80 MB/s +Search_Success_RE2/16M 5 215755600 ns/op 77.76 MB/s +Search_Success_CachedPCRE/8 5000000 397 ns/op 20.12 MB/s +Search_Success_CachedPCRE/16 5000000 466 ns/op 34.27 MB/s +Search_Success_CachedPCRE/32 2000000 602 ns/op 53.10 MB/s +Search_Success_CachedPCRE/64 2000000 881 ns/op 72.63 MB/s +Search_Success_CachedPCRE/128 1000000 1432 ns/op 89.38 MB/s +Search_Success_CachedPCRE/256 500000 2542 ns/op 100.69 MB/s +Search_Success_CachedPCRE/512 500000 4750 ns/op 107.78 MB/s +Search_Success_CachedPCRE/1K 200000 9157 ns/op 111.83 MB/s +Search_Success_CachedPCRE/2K 100000 18016 ns/op 113.67 MB/s +Search_Success_CachedPCRE/4K 50000 35707 ns/op 114.71 MB/s +Search_Success_CachedPCRE/8K 20000 70955 ns/op 115.45 MB/s +Search_Success_CachedPCRE/16K 10000 141912 ns/op 115.45 MB/s +Search_Success_CachedPCRE/32K 5000 284777 ns/op 115.07 MB/s +Search_Success_CachedPCRE/64K 2000 571111 ns/op 114.75 MB/s +Search_Success_CachedPCRE/128K 1000 1142328 ns/op 114.74 MB/s +Search_Success_CachedPCRE/256K 500 2289468 ns/op 114.50 MB/s 
+Search_Success_CachedPCRE/512K 500 4566850 ns/op 114.80 MB/s +Search_Success_CachedPCRE/1M 200 9379830 ns/op 111.79 MB/s +Search_Success_CachedPCRE/2M 100 19115070 ns/op 109.71 MB/s +Search_Success_CachedPCRE/4M 50 39568300 ns/op 106.00 MB/s +Search_Success_CachedPCRE/8M 10 100039600 ns/op 83.85 MB/s +Search_Success_CachedPCRE/16M 5 249181800 ns/op 67.33 MB/s +Search_Success_CachedRE2/8 5000000 201 ns/op 39.73 MB/s +Search_Success_CachedRE2/16 5000000 395 ns/op 40.43 MB/s +Search_Success_CachedRE2/32 2000000 507 ns/op 63.06 MB/s +Search_Success_CachedRE2/64 2000000 723 ns/op 88.42 MB/s +Search_Success_CachedRE2/128 1000000 1157 ns/op 110.59 MB/s +Search_Success_CachedRE2/256 500000 2032 ns/op 125.94 MB/s +Search_Success_CachedRE2/512 500000 3778 ns/op 135.49 MB/s +Search_Success_CachedRE2/1K 200000 7275 ns/op 140.75 MB/s +Search_Success_CachedRE2/2K 100000 14222 ns/op 144.00 MB/s +Search_Success_CachedRE2/4K 50000 28255 ns/op 144.96 MB/s +Search_Success_CachedRE2/8K 20000 56056 ns/op 146.14 MB/s +Search_Success_CachedRE2/16K 10000 112188 ns/op 146.04 MB/s +Search_Success_CachedRE2/32K 5000 223466 ns/op 146.64 MB/s +Search_Success_CachedRE2/64K 5000 448677 ns/op 146.06 MB/s +Search_Success_CachedRE2/128K 2000 901883 ns/op 145.33 MB/s +Search_Success_CachedRE2/256K 1000 1810495 ns/op 144.79 MB/s +Search_Success_CachedRE2/512K 500 3631582 ns/op 144.37 MB/s +Search_Success_CachedRE2/1M 200 7434340 ns/op 141.04 MB/s +Search_Success_CachedRE2/2M 100 15224310 ns/op 137.75 MB/s +Search_Success_CachedRE2/4M 50 31757460 ns/op 132.07 MB/s +Search_Success_CachedRE2/8M 20 70959200 ns/op 118.22 MB/s +Search_Success_CachedRE2/16M 5 215988600 ns/op 77.68 MB/s +Search_Success1_PCRE/8 500000 3292 ns/op 2.43 MB/s +Search_Success1_PCRE/16 500000 3360 ns/op 4.76 MB/s +Search_Success1_PCRE/32 500000 3476 ns/op 9.21 MB/s +Search_Success1_PCRE/64 500000 3775 ns/op 16.95 MB/s +Search_Success1_PCRE/128 500000 4303 ns/op 29.75 MB/s +Search_Success1_PCRE/256 200000 5430 ns/op 47.14 MB/s 
+Search_Success1_PCRE/512 200000 7664 ns/op 66.80 MB/s +Search_Success1_PCRE/1K 100000 12062 ns/op 84.89 MB/s +Search_Success1_PCRE/2K 50000 20956 ns/op 97.73 MB/s +Search_Success1_PCRE/4K 50000 38521 ns/op 106.33 MB/s +Search_Success1_PCRE/8K 20000 73852 ns/op 110.92 MB/s +Search_Success1_PCRE/16K 10000 144900 ns/op 113.07 MB/s +Search_Success1_PCRE/32K 5000 286158 ns/op 114.51 MB/s +Search_Success1_PCRE/64K 2000 569992 ns/op 114.98 MB/s +Search_Success1_PCRE/128K 1000 1144770 ns/op 114.50 MB/s +Search_Success1_PCRE/256K 500 2292086 ns/op 114.37 MB/s +Search_Success1_PCRE/512K 500 4578494 ns/op 114.51 MB/s +Search_Success1_PCRE/1M 200 9410760 ns/op 111.42 MB/s +Search_Success1_PCRE/2M 100 19166460 ns/op 109.42 MB/s +Search_Success1_PCRE/4M 50 39599000 ns/op 105.92 MB/s +Search_Success1_PCRE/8M 10 100725900 ns/op 83.28 MB/s +Search_Success1_PCRE/16M 5 249356000 ns/op 67.28 MB/s +Search_Success1_RE2/8 50000 33188 ns/op 0.24 MB/s +Search_Success1_RE2/16 50000 33012 ns/op 0.48 MB/s +Search_Success1_RE2/32 50000 32845 ns/op 0.97 MB/s +Search_Success1_RE2/64 50000 33133 ns/op 1.93 MB/s +Search_Success1_RE2/128 50000 33536 ns/op 3.82 MB/s +Search_Success1_RE2/256 50000 34548 ns/op 7.41 MB/s +Search_Success1_RE2/512 50000 36303 ns/op 14.10 MB/s +Search_Success1_RE2/1K 50000 39676 ns/op 25.81 MB/s +Search_Success1_RE2/2K 50000 46563 ns/op 43.98 MB/s +Search_Success1_RE2/4K 20000 60801 ns/op 67.37 MB/s +Search_Success1_RE2/8K 20000 88743 ns/op 92.31 MB/s +Search_Success1_RE2/16K 10000 145159 ns/op 112.87 MB/s +Search_Success1_RE2/32K 5000 257245 ns/op 127.38 MB/s +Search_Success1_RE2/64K 5000 482971 ns/op 135.69 MB/s +Search_Success1_RE2/128K 2000 935136 ns/op 140.16 MB/s +Search_Success1_RE2/256K 1000 1844695 ns/op 142.11 MB/s +Search_Success1_RE2/512K 500 3676360 ns/op 142.61 MB/s +Search_Success1_RE2/1M 200 7511915 ns/op 139.59 MB/s +Search_Success1_RE2/2M 100 15301160 ns/op 137.06 MB/s +Search_Success1_RE2/4M 50 31848480 ns/op 131.70 MB/s +Search_Success1_RE2/8M 20 
71078250 ns/op 118.02 MB/s +Search_Success1_RE2/16M 5 215988000 ns/op 77.68 MB/s +Search_Success1_Cached_PCRE/8 5000000 442 ns/op 18.08 MB/s +Search_Success1_Cached_PCRE/16 2000000 511 ns/op 31.31 MB/s +Search_Success1_Cached_PCRE/32 2000000 649 ns/op 49.30 MB/s +Search_Success1_Cached_PCRE/64 2000000 926 ns/op 69.09 MB/s +Search_Success1_Cached_PCRE/128 1000000 1476 ns/op 86.70 MB/s +Search_Success1_Cached_PCRE/256 500000 2584 ns/op 99.04 MB/s +Search_Success1_Cached_PCRE/512 500000 4787 ns/op 106.93 MB/s +Search_Success1_Cached_PCRE/1K 200000 9217 ns/op 111.10 MB/s +Search_Success1_Cached_PCRE/2K 100000 18078 ns/op 113.28 MB/s +Search_Success1_Cached_PCRE/4K 50000 35681 ns/op 114.79 MB/s +Search_Success1_Cached_PCRE/8K 20000 71032 ns/op 115.33 MB/s +Search_Success1_Cached_PCRE/16K 10000 142121 ns/op 115.28 MB/s +Search_Success1_Cached_PCRE/32K 5000 283243 ns/op 115.69 MB/s +Search_Success1_Cached_PCRE/64K 2000 566937 ns/op 115.60 MB/s +Search_Success1_Cached_PCRE/128K 1000 1141044 ns/op 114.87 MB/s +Search_Success1_Cached_PCRE/256K 500 2283570 ns/op 114.80 MB/s +Search_Success1_Cached_PCRE/512K 500 4573362 ns/op 114.64 MB/s +Search_Success1_Cached_PCRE/1M 200 9377975 ns/op 111.81 MB/s +Search_Success1_Cached_PCRE/2M 100 19150760 ns/op 109.51 MB/s +Search_Success1_Cached_PCRE/4M 50 39578540 ns/op 105.97 MB/s +Search_Success1_Cached_PCRE/8M 10 102111900 ns/op 82.15 MB/s +Search_Success1_Cached_PCRE/16M 5 247123000 ns/op 67.89 MB/s +Search_Success1_Cached_RE2/8 5000000 348 ns/op 22.94 MB/s +Search_Success1_Cached_RE2/16 5000000 396 ns/op 40.35 MB/s +Search_Success1_Cached_RE2/32 5000000 492 ns/op 65.01 MB/s +Search_Success1_Cached_RE2/64 2000000 716 ns/op 89.38 MB/s +Search_Success1_Cached_RE2/128 1000000 1131 ns/op 113.09 MB/s +Search_Success1_Cached_RE2/256 1000000 1961 ns/op 130.49 MB/s +Search_Success1_Cached_RE2/512 500000 3626 ns/op 141.17 MB/s +Search_Success1_Cached_RE2/1K 200000 6941 ns/op 147.51 MB/s +Search_Success1_Cached_RE2/2K 100000 13591 ns/op 150.69 
MB/s +Search_Success1_Cached_RE2/4K 50000 26867 ns/op 152.45 MB/s +Search_Success1_Cached_RE2/8K 20000 53455 ns/op 153.25 MB/s +Search_Success1_Cached_RE2/16K 10000 106632 ns/op 153.65 MB/s +Search_Success1_Cached_RE2/32K 5000 213141 ns/op 153.74 MB/s +Search_Success1_Cached_RE2/64K 5000 426628 ns/op 153.61 MB/s +Search_Success1_Cached_RE2/128K 2000 861903 ns/op 152.07 MB/s +Search_Success1_Cached_RE2/256K 1000 1729300 ns/op 151.59 MB/s +Search_Success1_Cached_RE2/512K 500 3470894 ns/op 151.05 MB/s +Search_Success1_Cached_RE2/1M 200 7120350 ns/op 147.26 MB/s +Search_Success1_Cached_RE2/2M 100 14538650 ns/op 144.25 MB/s +Search_Success1_Cached_RE2/4M 50 30323940 ns/op 138.32 MB/s +Search_Success1_Cached_RE2/8M 20 68069300 ns/op 123.24 MB/s +Search_Success1_Cached_RE2/16M 5 211011000 ns/op 79.51 MB/s +Search_Digits_PCRE 200000 7008 ns/op +Search_Digits_RE2 50000 27251 ns/op +Parse_Digits_PCRE 200000 6887 ns/op +Parse_Digits_RE2 100000 13239 ns/op +Parse_CachedDigits_PCRE 2000000 776 ns/op +Parse_CachedDigits_RE2 5000000 451 ns/op +Parse_DigitDs_PCRE 200000 6558 ns/op +Parse_DigitDs_RE2 100000 12946 ns/op +Parse_CachedDigitDs_PCRE 2000000 766 ns/op +Parse_CachedDigitDs_RE2 5000000 445 ns/op +Parse_Split_PCRE 500000 4751 ns/op +Parse_Split_RE2 100000 14060 ns/op +Parse_CachedSplit_PCRE 2000000 568 ns/op +Parse_CachedSplit_RE2 5000000 275 ns/op +Parse_SplitHard_PCRE 500000 4650 ns/op +Parse_SplitHard_RE2 100000 17606 ns/op +Parse_CachedSplitHard_PCRE 2000000 554 ns/op +Parse_CachedSplitHard_RE2 500000 2987 ns/op +Parse_CachedSplitBig1_PCRE 200 8376500 ns/op +Parse_CachedSplitBig1_RE2 1000 1342272 ns/op +Parse_CachedSplitBig2_PCRE 2000 848859 ns/op +Parse_CachedSplitBig2_RE2 10 781553500 ns/op +BM_PCRE_Compile 200000 5582 ns/op +BM_RE2_Compile 100000 13961 ns/op +SearchPhone_CachedPCRE/8 500000 2107 ns/op 3.80 MB/s +SearchPhone_CachedPCRE/16 500000 3526 ns/op 4.54 MB/s +SearchPhone_CachedPCRE/32 200000 6320 ns/op 5.06 MB/s +SearchPhone_CachedPCRE/64 100000 11953 ns/op 
5.35 MB/s +SearchPhone_CachedPCRE/128 50000 23357 ns/op 5.48 MB/s +SearchPhone_CachedPCRE/256 50000 45919 ns/op 5.57 MB/s +SearchPhone_CachedPCRE/512 20000 90828 ns/op 5.64 MB/s +SearchPhone_CachedPCRE/1K 10000 181299 ns/op 5.65 MB/s +SearchPhone_CachedPCRE/2K 5000 358095 ns/op 5.72 MB/s +SearchPhone_CachedPCRE/4K 2000 709670 ns/op 5.77 MB/s +SearchPhone_CachedPCRE/8K 1000 1412480 ns/op 5.80 MB/s +SearchPhone_CachedPCRE/16K 500 2826286 ns/op 5.80 MB/s +SearchPhone_CachedPCRE/32K 200 5643125 ns/op 5.81 MB/s +SearchPhone_CachedPCRE/64K 100 11303300 ns/op 5.80 MB/s +SearchPhone_CachedPCRE/128K 50 22564640 ns/op 5.81 MB/s +SearchPhone_CachedPCRE/256K 50 45145780 ns/op 5.81 MB/s +SearchPhone_CachedPCRE/512K 20 90272200 ns/op 5.81 MB/s +SearchPhone_CachedPCRE/1M 10 180937900 ns/op 5.80 MB/s +SearchPhone_CachedPCRE/2M 5 362303400 ns/op 5.79 MB/s +SearchPhone_CachedPCRE/4M 2 725048500 ns/op 5.78 MB/s +SearchPhone_CachedPCRE/8M 1 1449458000 ns/op 5.79 MB/s +SearchPhone_CachedPCRE/16M 1 2898562000 ns/op 5.79 MB/s +SearchPhone_CachedRE2/8 1000000 1038 ns/op 7.70 MB/s +SearchPhone_CachedRE2/16 1000000 1106 ns/op 14.46 MB/s +SearchPhone_CachedRE2/32 1000000 1210 ns/op 26.44 MB/s +SearchPhone_CachedRE2/64 1000000 1429 ns/op 44.78 MB/s +SearchPhone_CachedRE2/128 1000000 1864 ns/op 68.64 MB/s +SearchPhone_CachedRE2/256 500000 2741 ns/op 93.38 MB/s +SearchPhone_CachedRE2/512 500000 4483 ns/op 114.18 MB/s +SearchPhone_CachedRE2/1K 200000 7984 ns/op 128.24 MB/s +SearchPhone_CachedRE2/2K 100000 14957 ns/op 136.92 MB/s +SearchPhone_CachedRE2/4K 50000 28994 ns/op 141.27 MB/s +SearchPhone_CachedRE2/8K 20000 56950 ns/op 143.85 MB/s +SearchPhone_CachedRE2/16K 10000 112907 ns/op 145.11 MB/s +SearchPhone_CachedRE2/32K 5000 224855 ns/op 145.73 MB/s +SearchPhone_CachedRE2/64K 5000 449976 ns/op 145.64 MB/s +SearchPhone_CachedRE2/128K 2000 899644 ns/op 145.69 MB/s +SearchPhone_CachedRE2/256K 1000 1798122 ns/op 145.79 MB/s +SearchPhone_CachedRE2/512K 500 3597034 ns/op 145.76 MB/s 
+SearchPhone_CachedRE2/1M 200 7261140 ns/op 144.41 MB/s +SearchPhone_CachedRE2/2M 100 14532060 ns/op 144.31 MB/s +SearchPhone_CachedRE2/4M 50 29033780 ns/op 144.46 MB/s +SearchPhone_CachedRE2/8M 20 57850800 ns/op 145.00 MB/s +SearchPhone_CachedRE2/16M 10 115699800 ns/op 145.01 MB/s +EmptyPartialMatchPCRE 10000000 192 ns/op +EmptyPartialMatchRE2 5000000 273 ns/op +SimplePartialMatchPCRE 5000000 263 ns/op +SimplePartialMatchRE2 5000000 347 ns/op +HTTPPartialMatchPCRE 2000000 885 ns/op +HTTPPartialMatchRE2 1000000 1099 ns/op +SmallHTTPPartialMatchPCRE 2000000 890 ns/op +SmallHTTPPartialMatchRE2 1000000 1097 ns/op +DotMatchPCRE 2000000 860 ns/op +DotMatchRE2 1000000 1175 ns/op +ASCIIMatchPCRE 2000000 767 ns/op +ASCIIMatchRE2 1000000 1174 ns/op +==BENCHMARK== r70.mtv.corp.google.com Fri Feb 26 15:25:04 PST 2010 +# Linux r70.mtv.corp.google.com 2.6.24-gg804007-generic #1 SMP Thu Jan 21 11:28:34 PST 2010 x86_64 GNU/Linux +# g++ (GCC) 4.2.4 (Ubuntu 4.2.4-1ubuntu4) +# Copyright (C) 2007 Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# +# d7671f473f1a+ tip +# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), for GNU/Linux 2.6.8, dynamically linked (uses shared libs), not stripped + +Search_Easy0_CachedPCRE/8 10000000 187 ns/op 42.72 MB/s +Search_Easy0_CachedPCRE/16 5000000 225 ns/op 71.05 MB/s +Search_Easy0_CachedPCRE/32 5000000 321 ns/op 99.66 MB/s +Search_Easy0_CachedPCRE/64 5000000 472 ns/op 135.54 MB/s +Search_Easy0_CachedPCRE/128 2000000 768 ns/op 166.47 MB/s +Search_Easy0_CachedPCRE/256 500000 2071 ns/op 123.57 MB/s +Search_Easy0_CachedPCRE/512 500000 3601 ns/op 142.17 MB/s +Search_Easy0_CachedPCRE/1K 200000 7120 ns/op 143.81 MB/s +Search_Easy0_CachedPCRE/2K 100000 12071 ns/op 169.66 MB/s +Search_Easy0_CachedPCRE/4K 50000 24017 ns/op 170.54 MB/s +Search_Easy0_CachedPCRE/8K 50000 49303 ns/op 166.15 MB/s +Search_Easy0_CachedPCRE/16K 20000 96809 ns/op 169.24 MB/s +Search_Easy0_CachedPCRE/32K 10000 194402 ns/op 168.56 MB/s +Search_Easy0_CachedPCRE/64K 5000 387333 ns/op 169.20 MB/s +Search_Easy0_CachedPCRE/128K 2000 785405 ns/op 166.88 MB/s +Search_Easy0_CachedPCRE/256K 1000 1553127 ns/op 168.78 MB/s +Search_Easy0_CachedPCRE/512K 500 3111082 ns/op 168.52 MB/s +Search_Easy0_CachedPCRE/1M 200 6329655 ns/op 165.66 MB/s +Search_Easy0_CachedPCRE/2M 100 12689760 ns/op 165.26 MB/s +Search_Easy0_CachedPCRE/4M 50 25449660 ns/op 164.81 MB/s +Search_Easy0_CachedPCRE/8M 20 50878350 ns/op 164.88 MB/s +Search_Easy0_CachedPCRE/16M 10 101730500 ns/op 164.92 MB/s +Search_Easy0_CachedRE2/8 5000000 293 ns/op 27.22 MB/s +Search_Easy0_CachedRE2/16 5000000 296 ns/op 53.97 MB/s +Search_Easy0_CachedRE2/32 5000000 332 ns/op 96.20 MB/s +Search_Easy0_CachedRE2/64 5000000 344 ns/op 185.55 MB/s +Search_Easy0_CachedRE2/128 5000000 419 ns/op 304.99 MB/s +Search_Easy0_CachedRE2/256 5000000 498 ns/op 513.30 MB/s +Search_Easy0_CachedRE2/512 2000000 693 ns/op 738.23 MB/s +Search_Easy0_CachedRE2/1K 1000000 1178 ns/op 869.24 MB/s +Search_Easy0_CachedRE2/2K 500000 2137 ns/op 958.09 MB/s 
+Search_Easy0_CachedRE2/4K 500000 3802 ns/op 1077.05 MB/s +Search_Easy0_CachedRE2/8K 200000 7338 ns/op 1116.29 MB/s +Search_Easy0_CachedRE2/16K 100000 14223 ns/op 1151.89 MB/s +Search_Easy0_CachedRE2/32K 50000 29212 ns/op 1121.69 MB/s +Search_Easy0_CachedRE2/64K 20000 58571 ns/op 1118.91 MB/s +Search_Easy0_CachedRE2/128K 10000 127652 ns/op 1026.79 MB/s +Search_Easy0_CachedRE2/256K 5000 254173 ns/op 1031.36 MB/s +Search_Easy0_CachedRE2/512K 2000 512878 ns/op 1022.25 MB/s +Search_Easy0_CachedRE2/1M 1000 1025916 ns/op 1022.09 MB/s +Search_Easy0_CachedRE2/2M 500 2025662 ns/op 1035.29 MB/s +Search_Easy0_CachedRE2/4M 500 4102988 ns/op 1022.26 MB/s +Search_Easy0_CachedRE2/8M 200 8215045 ns/op 1021.13 MB/s +Search_Easy0_CachedRE2/16M 100 16434700 ns/op 1020.84 MB/s +Search_Easy1_CachedPCRE/8 10000000 194 ns/op 41.08 MB/s +Search_Easy1_CachedPCRE/16 5000000 234 ns/op 68.20 MB/s +Search_Easy1_CachedPCRE/32 5000000 327 ns/op 97.57 MB/s +Search_Easy1_CachedPCRE/64 5000000 474 ns/op 134.94 MB/s +Search_Easy1_CachedPCRE/128 1000000 1189 ns/op 107.57 MB/s +Search_Easy1_CachedPCRE/256 1000000 1782 ns/op 143.66 MB/s +Search_Easy1_CachedPCRE/512 500000 3957 ns/op 129.38 MB/s +Search_Easy1_CachedPCRE/1K 200000 7231 ns/op 141.60 MB/s +Search_Easy1_CachedPCRE/2K 100000 13633 ns/op 150.22 MB/s +Search_Easy1_CachedPCRE/4K 50000 25380 ns/op 161.39 MB/s +Search_Easy1_CachedPCRE/8K 20000 50466 ns/op 162.32 MB/s +Search_Easy1_CachedPCRE/16K 20000 98483 ns/op 166.36 MB/s +Search_Easy1_CachedPCRE/32K 10000 197239 ns/op 166.13 MB/s +Search_Easy1_CachedPCRE/64K 5000 393513 ns/op 166.54 MB/s +Search_Easy1_CachedPCRE/128K 2000 792420 ns/op 165.41 MB/s +Search_Easy1_CachedPCRE/256K 1000 1577956 ns/op 166.13 MB/s +Search_Easy1_CachedPCRE/512K 500 3162854 ns/op 165.76 MB/s +Search_Easy1_CachedPCRE/1M 200 6433560 ns/op 162.99 MB/s +Search_Easy1_CachedPCRE/2M 100 12888530 ns/op 162.71 MB/s +Search_Easy1_CachedPCRE/4M 50 25851040 ns/op 162.25 MB/s +Search_Easy1_CachedPCRE/8M 20 51705700 ns/op 162.24 
MB/s +Search_Easy1_CachedPCRE/16M 10 103423200 ns/op 162.22 MB/s +Search_Easy1_CachedRE2/8 5000000 292 ns/op 27.34 MB/s +Search_Easy1_CachedRE2/16 5000000 293 ns/op 54.49 MB/s +Search_Easy1_CachedRE2/32 5000000 330 ns/op 96.86 MB/s +Search_Easy1_CachedRE2/64 5000000 343 ns/op 186.51 MB/s +Search_Easy1_CachedRE2/128 5000000 421 ns/op 304.03 MB/s +Search_Easy1_CachedRE2/256 5000000 499 ns/op 512.53 MB/s +Search_Easy1_CachedRE2/512 2000000 697 ns/op 734.27 MB/s +Search_Easy1_CachedRE2/1K 1000000 1180 ns/op 867.12 MB/s +Search_Easy1_CachedRE2/2K 500000 2136 ns/op 958.55 MB/s +Search_Easy1_CachedRE2/4K 500000 3808 ns/op 1075.53 MB/s +Search_Easy1_CachedRE2/8K 200000 7335 ns/op 1116.83 MB/s +Search_Easy1_CachedRE2/16K 100000 14184 ns/op 1155.10 MB/s +Search_Easy1_CachedRE2/32K 50000 29181 ns/op 1122.91 MB/s +Search_Easy1_CachedRE2/64K 20000 58567 ns/op 1118.98 MB/s +Search_Easy1_CachedRE2/128K 10000 127629 ns/op 1026.98 MB/s +Search_Easy1_CachedRE2/256K 5000 254045 ns/op 1031.88 MB/s +Search_Easy1_CachedRE2/512K 5000 494356 ns/op 1060.55 MB/s +Search_Easy1_CachedRE2/1M 1000 1027490 ns/op 1020.52 MB/s +Search_Easy1_CachedRE2/2M 500 2033222 ns/op 1031.44 MB/s +Search_Easy1_CachedRE2/4M 500 4106182 ns/op 1021.46 MB/s +Search_Easy1_CachedRE2/8M 200 8215690 ns/op 1021.05 MB/s +Search_Easy1_CachedRE2/16M 100 16420070 ns/op 1021.75 MB/s +Search_Medium_CachedPCRE/8 10000000 200 ns/op 39.93 MB/s +Search_Medium_CachedPCRE/16 5000000 242 ns/op 66.08 MB/s +Search_Medium_CachedPCRE/32 5000000 315 ns/op 101.47 MB/s +Search_Medium_CachedPCRE/64 5000000 461 ns/op 138.71 MB/s +Search_Medium_CachedPCRE/128 2000000 753 ns/op 169.80 MB/s +Search_Medium_CachedPCRE/256 100000 16809 ns/op 15.23 MB/s +Search_Medium_CachedPCRE/512 50000 39860 ns/op 12.84 MB/s +Search_Medium_CachedPCRE/1K 20000 78547 ns/op 13.04 MB/s +Search_Medium_CachedPCRE/2K 10000 117089 ns/op 17.49 MB/s +Search_Medium_CachedPCRE/4K 5000 289169 ns/op 14.16 MB/s +Search_Medium_CachedPCRE/8K 2000 625908 ns/op 13.09 MB/s 
+Search_Medium_CachedPCRE/16K 1000 1277969 ns/op 12.82 MB/s +Search_Medium_CachedPCRE/32K 500 2554842 ns/op 12.83 MB/s +Search_Medium_CachedPCRE/64K 200 5105160 ns/op 12.84 MB/s +Search_Medium_CachedPCRE/128K 100 10206360 ns/op 12.84 MB/s +Search_Medium_CachedPCRE/256K 50 20440340 ns/op 12.82 MB/s +Search_Medium_CachedRE2/8 5000000 334 ns/op 23.89 MB/s +Search_Medium_CachedRE2/16 5000000 388 ns/op 41.15 MB/s +Search_Medium_CachedRE2/32 5000000 496 ns/op 64.50 MB/s +Search_Medium_CachedRE2/64 2000000 717 ns/op 89.22 MB/s +Search_Medium_CachedRE2/128 1000000 1157 ns/op 110.60 MB/s +Search_Medium_CachedRE2/256 500000 2037 ns/op 125.64 MB/s +Search_Medium_CachedRE2/512 500000 3792 ns/op 135.01 MB/s +Search_Medium_CachedRE2/1K 200000 7288 ns/op 140.50 MB/s +Search_Medium_CachedRE2/2K 100000 14294 ns/op 143.27 MB/s +Search_Medium_CachedRE2/4K 50000 28286 ns/op 144.81 MB/s +Search_Medium_CachedRE2/8K 20000 56393 ns/op 145.27 MB/s +Search_Medium_CachedRE2/16K 10000 112792 ns/op 145.26 MB/s +Search_Medium_CachedRE2/32K 5000 231024 ns/op 141.84 MB/s +Search_Medium_CachedRE2/64K 5000 450957 ns/op 145.33 MB/s +Search_Medium_CachedRE2/128K 2000 906402 ns/op 144.61 MB/s +Search_Medium_CachedRE2/256K 1000 1813827 ns/op 144.53 MB/s +Search_Medium_CachedRE2/512K 500 3619796 ns/op 144.84 MB/s +Search_Medium_CachedRE2/1M 200 7317695 ns/op 143.29 MB/s +Search_Medium_CachedRE2/2M 100 14642030 ns/op 143.23 MB/s +Search_Medium_CachedRE2/4M 50 29237140 ns/op 143.46 MB/s +Search_Medium_CachedRE2/8M 20 58367050 ns/op 143.72 MB/s +Search_Medium_CachedRE2/16M 10 116398000 ns/op 144.14 MB/s +Search_Hard_CachedPCRE/8 10000000 199 ns/op 40.08 MB/s +Search_Hard_CachedPCRE/16 5000000 241 ns/op 66.13 MB/s +Search_Hard_CachedPCRE/32 5000000 316 ns/op 101.13 MB/s +Search_Hard_CachedPCRE/64 5000000 460 ns/op 138.85 MB/s +Search_Hard_CachedPCRE/128 2000000 753 ns/op 169.98 MB/s +Search_Hard_CachedPCRE/256 1000 1038013 ns/op 0.25 MB/s +Search_Hard_CachedPCRE/512 500 4263992 ns/op 0.12 MB/s 
+Search_Hard_CachedPCRE/1K 100 16899150 ns/op 0.06 MB/s +Search_Hard_CachedPCRE/2K 20 61792450 ns/op 0.03 MB/s +Search_Hard_CachedPCRE/4K 5 266424400 ns/op 0.02 MB/s +Search_Hard_CachedRE2/8 5000000 331 ns/op 24.16 MB/s +Search_Hard_CachedRE2/16 5000000 386 ns/op 41.36 MB/s +Search_Hard_CachedRE2/32 5000000 492 ns/op 64.95 MB/s +Search_Hard_CachedRE2/64 2000000 713 ns/op 89.67 MB/s +Search_Hard_CachedRE2/128 1000000 1151 ns/op 111.15 MB/s +Search_Hard_CachedRE2/256 500000 2025 ns/op 126.36 MB/s +Search_Hard_CachedRE2/512 500000 3774 ns/op 135.64 MB/s +Search_Hard_CachedRE2/1K 200000 7271 ns/op 140.82 MB/s +Search_Hard_CachedRE2/2K 100000 14274 ns/op 143.48 MB/s +Search_Hard_CachedRE2/4K 50000 28261 ns/op 144.93 MB/s +Search_Hard_CachedRE2/8K 20000 56253 ns/op 145.63 MB/s +Search_Hard_CachedRE2/16K 10000 112279 ns/op 145.92 MB/s +Search_Hard_CachedRE2/32K 5000 224208 ns/op 146.15 MB/s +Search_Hard_CachedRE2/64K 5000 448835 ns/op 146.01 MB/s +Search_Hard_CachedRE2/128K 2000 906965 ns/op 144.52 MB/s +Search_Hard_CachedRE2/256K 1000 1821843 ns/op 143.89 MB/s +Search_Hard_CachedRE2/512K 500 3616856 ns/op 144.96 MB/s +Search_Hard_CachedRE2/1M 200 7319770 ns/op 143.25 MB/s +Search_Hard_CachedRE2/2M 100 14614680 ns/op 143.50 MB/s +Search_Hard_CachedRE2/4M 50 29189100 ns/op 143.69 MB/s +Search_Hard_CachedRE2/8M 20 58239300 ns/op 144.04 MB/s +Search_Hard_CachedRE2/16M 10 116307800 ns/op 144.25 MB/s +Search_Parens_CachedPCRE/8 5000000 307 ns/op 26.03 MB/s +Search_Parens_CachedRE2/8 5000000 333 ns/op 24.01 MB/s +Search_Parens_CachedRE2/16 5000000 383 ns/op 41.71 MB/s +Search_Parens_CachedRE2/32 5000000 496 ns/op 64.49 MB/s +Search_Parens_CachedRE2/64 2000000 696 ns/op 91.88 MB/s +Search_Parens_CachedRE2/128 1000000 1113 ns/op 114.97 MB/s +Search_Parens_CachedRE2/256 1000000 2025 ns/op 126.38 MB/s +Search_Parens_CachedRE2/512 500000 3776 ns/op 135.58 MB/s +Search_Parens_CachedRE2/1K 200000 7292 ns/op 140.41 MB/s +Search_Parens_CachedRE2/2K 100000 14272 ns/op 143.49 MB/s 
+Search_Parens_CachedRE2/4K 50000 28252 ns/op 144.98 MB/s +Search_Parens_CachedRE2/8K 20000 56203 ns/op 145.76 MB/s +Search_Parens_CachedRE2/16K 10000 112272 ns/op 145.93 MB/s +Search_Parens_CachedRE2/32K 5000 224140 ns/op 146.19 MB/s +Search_Parens_CachedRE2/64K 5000 448597 ns/op 146.09 MB/s +Search_Parens_CachedRE2/128K 2000 903965 ns/op 145.00 MB/s +Search_Parens_CachedRE2/256K 1000 1806597 ns/op 145.10 MB/s +Search_Parens_CachedRE2/512K 500 3614264 ns/op 145.06 MB/s +Search_Parens_CachedRE2/1M 200 7293425 ns/op 143.77 MB/s +Search_Parens_CachedRE2/2M 100 14617970 ns/op 143.46 MB/s +Search_Parens_CachedRE2/4M 50 29199860 ns/op 143.64 MB/s +Search_Parens_CachedRE2/8M 20 58260650 ns/op 143.98 MB/s +Search_Parens_CachedRE2/16M 10 116198600 ns/op 144.38 MB/s +Search_BigFixed_CachedPCRE/8 5000000 387 ns/op 20.67 MB/s +Search_BigFixed_CachedPCRE/16 5000000 475 ns/op 33.67 MB/s +Search_BigFixed_CachedPCRE/32 2000000 637 ns/op 50.21 MB/s +Search_BigFixed_CachedPCRE/64 2000000 965 ns/op 66.28 MB/s +Search_BigFixed_CachedPCRE/128 1000000 1618 ns/op 79.09 MB/s +Search_BigFixed_CachedPCRE/256 500000 2936 ns/op 87.18 MB/s +Search_BigFixed_CachedPCRE/512 200000 5558 ns/op 92.12 MB/s +Search_BigFixed_CachedPCRE/1K 100000 10840 ns/op 94.46 MB/s +Search_BigFixed_CachedPCRE/2K 50000 21291 ns/op 96.19 MB/s +Search_BigFixed_CachedPCRE/4K 50000 42377 ns/op 96.66 MB/s +Search_BigFixed_CachedPCRE/8K 20000 84365 ns/op 97.10 MB/s +Search_BigFixed_CachedPCRE/16K 10000 168595 ns/op 97.18 MB/s +Search_BigFixed_CachedPCRE/32K 5000 337102 ns/op 97.20 MB/s +Search_BigFixed_CachedRE2/8 10000000 174 ns/op 45.85 MB/s +Search_BigFixed_CachedRE2/16 5000000 360 ns/op 44.33 MB/s +Search_BigFixed_CachedRE2/32 5000000 432 ns/op 74.06 MB/s +Search_BigFixed_CachedRE2/64 2000000 556 ns/op 115.06 MB/s +Search_BigFixed_CachedRE2/128 2000000 803 ns/op 159.24 MB/s +Search_BigFixed_CachedRE2/256 1000000 1307 ns/op 195.83 MB/s +Search_BigFixed_CachedRE2/512 500000 2308 ns/op 221.82 MB/s 
+Search_BigFixed_CachedRE2/1K 500000 4321 ns/op 236.95 MB/s +Search_BigFixed_CachedRE2/2K 200000 8334 ns/op 245.73 MB/s +Search_BigFixed_CachedRE2/4K 100000 16361 ns/op 250.34 MB/s +Search_BigFixed_CachedRE2/8K 50000 30995 ns/op 264.30 MB/s +Search_BigFixed_CachedRE2/16K 20000 64632 ns/op 253.49 MB/s +Search_BigFixed_CachedRE2/32K 10000 128875 ns/op 254.26 MB/s +Search_BigFixed_CachedRE2/64K 5000 258009 ns/op 254.01 MB/s +Search_BigFixed_CachedRE2/128K 2000 511023 ns/op 256.49 MB/s +Search_BigFixed_CachedRE2/256K 1000 1031677 ns/op 254.10 MB/s +Search_BigFixed_CachedRE2/512K 500 2124050 ns/op 246.83 MB/s +Search_BigFixed_CachedRE2/1M 500 4316322 ns/op 242.93 MB/s +Search_Success_PCRE/8 500000 3060 ns/op 2.61 MB/s +Search_Success_PCRE/16 500000 3122 ns/op 5.12 MB/s +Search_Success_PCRE/32 500000 3256 ns/op 9.83 MB/s +Search_Success_PCRE/64 500000 3545 ns/op 18.05 MB/s +Search_Success_PCRE/128 500000 4098 ns/op 31.23 MB/s +Search_Success_PCRE/256 200000 5215 ns/op 49.08 MB/s +Search_Success_PCRE/512 200000 7408 ns/op 69.11 MB/s +Search_Success_PCRE/1K 100000 11838 ns/op 86.50 MB/s +Search_Success_PCRE/2K 50000 20731 ns/op 98.79 MB/s +Search_Success_PCRE/4K 50000 38394 ns/op 106.68 MB/s +Search_Success_PCRE/8K 20000 73969 ns/op 110.75 MB/s +Search_Success_PCRE/16K 10000 144799 ns/op 113.15 MB/s +Search_Success_PCRE/32K 5000 286717 ns/op 114.29 MB/s +Search_Success_PCRE/64K 2000 571529 ns/op 114.67 MB/s +Search_Success_PCRE/128K 1000 1144131 ns/op 114.56 MB/s +Search_Success_PCRE/256K 500 2292450 ns/op 114.35 MB/s +Search_Success_PCRE/512K 500 4584198 ns/op 114.37 MB/s +Search_Success_PCRE/1M 200 9385225 ns/op 111.73 MB/s +Search_Success_PCRE/2M 100 19063720 ns/op 110.01 MB/s +Search_Success_PCRE/4M 50 39404920 ns/op 106.44 MB/s +==BENCHMARK== r70.mtv.corp.google.com Fri Feb 26 16:16:46 PST 2010 +# Linux r70.mtv.corp.google.com 2.6.24-gg804007-generic #1 SMP Thu Jan 21 11:28:34 PST 2010 x86_64 GNU/Linux +# g++ (GCC) 4.2.4 (Ubuntu 4.2.4-1ubuntu4) +# Copyright (C) 2007 
Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# a94585d91e66 tip +# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), for GNU/Linux 2.6.8, dynamically linked (uses shared libs), not stripped + +==BENCHMARK== r70.mtv.corp.google.com Fri Feb 26 16:29:12 PST 2010 +# Linux r70.mtv.corp.google.com 2.6.24-gg804007-generic #1 SMP Thu Jan 21 11:28:34 PST 2010 x86_64 GNU/Linux +# g++ (GCC) 4.2.4 (Ubuntu 4.2.4-1ubuntu4) +# Copyright (C) 2007 Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# a94585d91e66 tip +# obj/test/regexp_benchmark: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), for GNU/Linux 2.6.8, dynamically linked (uses shared libs), not stripped + +Search_Easy0_CachedPCRE/8 10000000 186 ns/op 42.88 MB/s +Search_Easy0_CachedPCRE/16 10000000 225 ns/op 70.90 MB/s +Search_Easy0_CachedPCRE/32 5000000 319 ns/op 100.14 MB/s +Search_Easy0_CachedPCRE/64 5000000 461 ns/op 138.59 MB/s +Search_Easy0_CachedPCRE/128 2000000 752 ns/op 170.02 MB/s +Search_Easy0_CachedPCRE/256 1000000 2054 ns/op 124.63 MB/s +Search_Easy0_CachedPCRE/512 500000 3573 ns/op 143.26 MB/s +Search_Easy0_CachedPCRE/1K 500000 7078 ns/op 144.66 MB/s +Search_Easy0_CachedPCRE/2K 200000 12015 ns/op 170.44 MB/s +Search_Easy0_CachedPCRE/4K 100000 23912 ns/op 171.29 MB/s +Search_Easy0_CachedPCRE/8K 50000 49095 ns/op 166.86 MB/s +Search_Easy0_CachedPCRE/16K 20000 96360 ns/op 170.03 MB/s +Search_Easy0_CachedPCRE/32K 10000 193533 ns/op 169.31 MB/s +Search_Easy0_CachedPCRE/64K 5000 385817 ns/op 169.86 MB/s +Search_Easy0_CachedPCRE/128K 2000 774428 ns/op 169.25 MB/s +Search_Easy0_CachedPCRE/256K 1000 1548917 ns/op 169.24 MB/s +Search_Easy0_CachedPCRE/512K 500 3100914 ns/op 169.08 MB/s 
+Search_Easy0_CachedPCRE/1M 500 6304122 ns/op 166.33 MB/s +Search_Easy0_CachedPCRE/2M 100 12604920 ns/op 166.38 MB/s +Search_Easy0_CachedPCRE/4M 100 25405120 ns/op 165.10 MB/s +Search_Easy0_CachedPCRE/8M 50 50940620 ns/op 164.67 MB/s +Search_Easy0_CachedPCRE/16M 20 102598300 ns/op 163.52 MB/s +Search_Easy0_CachedRE2/8 10000000 302 ns/op 26.41 MB/s +Search_Easy0_CachedRE2/16 5000000 314 ns/op 50.87 MB/s +Search_Easy0_CachedRE2/32 5000000 349 ns/op 91.67 MB/s +Search_Easy0_CachedRE2/64 5000000 349 ns/op 183.19 MB/s +Search_Easy0_CachedRE2/128 5000000 415 ns/op 308.21 MB/s +Search_Easy0_CachedRE2/256 5000000 486 ns/op 526.44 MB/s +Search_Easy0_CachedRE2/512 5000000 644 ns/op 794.39 MB/s +Search_Easy0_CachedRE2/1K 1000000 1143 ns/op 895.18 MB/s +Search_Easy0_CachedRE2/2K 1000000 2099 ns/op 975.57 MB/s +Search_Easy0_CachedRE2/4K 500000 3655 ns/op 1120.64 MB/s +Search_Easy0_CachedRE2/8K 500000 7055 ns/op 1161.07 MB/s +Search_Easy0_CachedRE2/16K 200000 13913 ns/op 1177.56 MB/s +Search_Easy0_CachedRE2/32K 100000 28452 ns/op 1151.67 MB/s +Search_Easy0_CachedRE2/64K 50000 56987 ns/op 1150.02 MB/s +Search_Easy0_CachedRE2/128K 10000 119200 ns/op 1099.60 MB/s +Search_Easy0_CachedRE2/256K 10000 237730 ns/op 1102.70 MB/s +Search_Easy0_CachedRE2/512K 5000 468114 ns/op 1120.00 MB/s +Search_Easy0_CachedRE2/1M 2000 1021877 ns/op 1026.13 MB/s +Search_Easy0_CachedRE2/2M 1000 2011649 ns/op 1042.50 MB/s +Search_Easy0_CachedRE2/4M 500 4080682 ns/op 1027.84 MB/s +Search_Easy0_CachedRE2/8M 200 8181740 ns/op 1025.28 MB/s +Search_Easy0_CachedRE2/16M 100 16334010 ns/op 1027.13 MB/s +Search_Easy1_CachedPCRE/8 10000000 192 ns/op 41.54 MB/s +Search_Easy1_CachedPCRE/16 10000000 235 ns/op 67.96 MB/s +Search_Easy1_CachedPCRE/32 5000000 322 ns/op 99.33 MB/s +Search_Easy1_CachedPCRE/64 5000000 464 ns/op 137.87 MB/s +Search_Easy1_CachedPCRE/128 1000000 1181 ns/op 108.34 MB/s +Search_Easy1_CachedPCRE/256 1000000 1777 ns/op 144.06 MB/s +Search_Easy1_CachedPCRE/512 500000 3954 ns/op 129.48 MB/s 
+Search_Easy1_CachedPCRE/1K 500000 7239 ns/op 141.44 MB/s +Search_Easy1_CachedPCRE/2K 200000 13617 ns/op 150.39 MB/s +Search_Easy1_CachedPCRE/4K 100000 25383 ns/op 161.37 MB/s +Search_Easy1_CachedPCRE/8K 50000 50456 ns/op 162.36 MB/s +Search_Easy1_CachedPCRE/16K 20000 98470 ns/op 166.38 MB/s +Search_Easy1_CachedPCRE/32K 10000 197261 ns/op 166.11 MB/s +Search_Easy1_CachedPCRE/64K 5000 393359 ns/op 166.61 MB/s +Search_Easy1_CachedPCRE/128K 2000 791563 ns/op 165.59 MB/s +Search_Easy1_CachedPCRE/256K 1000 1584273 ns/op 165.47 MB/s +Search_Easy1_CachedPCRE/512K 500 3164934 ns/op 165.66 MB/s +Search_Easy1_CachedPCRE/1M 500 6457384 ns/op 162.38 MB/s +Search_Easy1_CachedPCRE/2M 100 13022700 ns/op 161.04 MB/s +Search_Easy1_CachedPCRE/4M 100 26111890 ns/op 160.63 MB/s +Search_Easy1_CachedPCRE/8M 50 52238340 ns/op 160.58 MB/s +Search_Easy1_CachedPCRE/16M 20 104536750 ns/op 160.49 MB/s +Search_Easy1_CachedRE2/8 10000000 295 ns/op 27.08 MB/s +Search_Easy1_CachedRE2/16 5000000 302 ns/op 52.81 MB/s +Search_Easy1_CachedRE2/32 5000000 335 ns/op 95.38 MB/s +Search_Easy1_CachedRE2/64 5000000 344 ns/op 185.80 MB/s +Search_Easy1_CachedRE2/128 5000000 421 ns/op 303.85 MB/s +Search_Easy1_CachedRE2/256 5000000 503 ns/op 508.89 MB/s +Search_Easy1_CachedRE2/512 5000000 694 ns/op 736.74 MB/s +Search_Easy1_CachedRE2/1K 1000000 1176 ns/op 870.59 MB/s +Search_Easy1_CachedRE2/2K 1000000 2139 ns/op 957.38 MB/s +Search_Easy1_CachedRE2/4K 500000 3803 ns/op 1076.84 MB/s +Search_Easy1_CachedRE2/8K 500000 7336 ns/op 1116.54 MB/s +Search_Easy1_CachedRE2/16K 200000 14191 ns/op 1154.47 MB/s +Search_Easy1_CachedRE2/32K 100000 29177 ns/op 1123.07 MB/s +Search_Easy1_CachedRE2/64K 50000 58598 ns/op 1118.38 MB/s +Search_Easy1_CachedRE2/128K 10000 127625 ns/op 1027.01 MB/s +Search_Easy1_CachedRE2/256K 10000 254186 ns/op 1031.30 MB/s +Search_Easy1_CachedRE2/512K 5000 493326 ns/op 1062.76 MB/s +Search_Easy1_CachedRE2/1M 2000 1135745 ns/op 923.25 MB/s +Search_Easy1_CachedRE2/2M 1000 2250206 ns/op 931.98 MB/s 
+Search_Easy1_CachedRE2/4M 500 4513804 ns/op 929.22 MB/s +Search_Easy1_CachedRE2/8M 200 9019710 ns/op 930.03 MB/s +Search_Easy1_CachedRE2/16M 100 18027570 ns/op 930.64 MB/s +Search_Medium_CachedPCRE/8 10000000 172 ns/op 46.39 MB/s +Search_Medium_CachedPCRE/16 10000000 215 ns/op 74.33 MB/s +Search_Medium_CachedPCRE/32 5000000 298 ns/op 107.26 MB/s +Search_Medium_CachedPCRE/64 5000000 441 ns/op 144.98 MB/s +Search_Medium_CachedPCRE/128 5000000 729 ns/op 175.45 MB/s +Search_Medium_CachedPCRE/256 100000 16796 ns/op 15.24 MB/s +Search_Medium_CachedPCRE/512 50000 40007 ns/op 12.80 MB/s +Search_Medium_CachedPCRE/1K 20000 78764 ns/op 13.00 MB/s +Search_Medium_CachedPCRE/2K 10000 116986 ns/op 17.51 MB/s +Search_Medium_CachedPCRE/4K 10000 289854 ns/op 14.13 MB/s +Search_Medium_CachedPCRE/8K 5000 627300 ns/op 13.06 MB/s +Search_Medium_CachedPCRE/16K 2000 1277751 ns/op 12.82 MB/s +Search_Medium_CachedPCRE/32K 1000 2555076 ns/op 12.82 MB/s +Search_Medium_CachedPCRE/64K 500 5106302 ns/op 12.83 MB/s +Search_Medium_CachedPCRE/128K 100 10204640 ns/op 12.84 MB/s +Search_Medium_CachedPCRE/256K 100 20416970 ns/op 12.84 MB/s +Search_Medium_CachedRE2/8 5000000 333 ns/op 24.02 MB/s +Search_Medium_CachedRE2/16 5000000 389 ns/op 41.12 MB/s +Search_Medium_CachedRE2/32 5000000 498 ns/op 64.23 MB/s +Search_Medium_CachedRE2/64 5000000 716 ns/op 89.35 MB/s +Search_Medium_CachedRE2/128 1000000 1152 ns/op 111.08 MB/s +Search_Medium_CachedRE2/256 1000000 2027 ns/op 126.29 MB/s +Search_Medium_CachedRE2/512 500000 3772 ns/op 135.70 MB/s +Search_Medium_CachedRE2/1K 500000 7264 ns/op 140.95 MB/s +Search_Medium_CachedRE2/2K 200000 14266 ns/op 143.56 MB/s +Search_Medium_CachedRE2/4K 100000 28230 ns/op 145.09 MB/s +Search_Medium_CachedRE2/8K 50000 56221 ns/op 145.71 MB/s +Search_Medium_CachedRE2/16K 10000 112045 ns/op 146.23 MB/s +Search_Medium_CachedRE2/32K 10000 223917 ns/op 146.34 MB/s +Search_Medium_CachedRE2/64K 5000 448381 ns/op 146.16 MB/s +Search_Medium_CachedRE2/128K 2000 903067 ns/op 145.14 
MB/s +Search_Medium_CachedRE2/256K 1000 1804888 ns/op 145.24 MB/s +Search_Medium_CachedRE2/512K 500 3621616 ns/op 144.77 MB/s +Search_Medium_CachedRE2/1M 500 7316090 ns/op 143.32 MB/s +Search_Medium_CachedRE2/2M 100 14672140 ns/op 142.93 MB/s +Search_Medium_CachedRE2/4M 100 29322600 ns/op 143.04 MB/s +Search_Medium_CachedRE2/8M 50 58591820 ns/op 143.17 MB/s +Search_Medium_CachedRE2/16M 20 117035300 ns/op 143.35 MB/s +Search_Hard_CachedPCRE/8 10000000 189 ns/op 42.19 MB/s +Search_Hard_CachedPCRE/16 10000000 232 ns/op 68.88 MB/s +Search_Hard_CachedPCRE/32 5000000 308 ns/op 103.56 MB/s +Search_Hard_CachedPCRE/64 5000000 459 ns/op 139.43 MB/s +Search_Hard_CachedPCRE/128 2000000 752 ns/op 170.21 MB/s +Search_Hard_CachedPCRE/256 2000 1039441 ns/op 0.25 MB/s +Search_Hard_CachedPCRE/512 500 4261278 ns/op 0.12 MB/s +Search_Hard_CachedPCRE/1K 100 16900780 ns/op 0.06 MB/s +Search_Hard_CachedPCRE/2K 50 61840340 ns/op 0.03 MB/s +Search_Hard_CachedPCRE/4K 5 266433000 ns/op 0.02 MB/s +Search_Hard_CachedRE2/8 5000000 333 ns/op 24.01 MB/s +Search_Hard_CachedRE2/16 5000000 386 ns/op 41.42 MB/s +Search_Hard_CachedRE2/32 5000000 498 ns/op 64.13 MB/s +Search_Hard_CachedRE2/64 5000000 719 ns/op 88.97 MB/s +Search_Hard_CachedRE2/128 1000000 1153 ns/op 110.93 MB/s +Search_Hard_CachedRE2/256 1000000 2029 ns/op 126.12 MB/s +Search_Hard_CachedRE2/512 500000 3765 ns/op 135.98 MB/s +Search_Hard_CachedRE2/1K 500000 7257 ns/op 141.10 MB/s +Search_Hard_CachedRE2/2K 200000 14263 ns/op 143.58 MB/s +Search_Hard_CachedRE2/4K 100000 28235 ns/op 145.07 MB/s +Search_Hard_CachedRE2/8K 50000 56166 ns/op 145.85 MB/s +Search_Hard_CachedRE2/16K 10000 111887 ns/op 146.43 MB/s +Search_Hard_CachedRE2/32K 10000 224057 ns/op 146.25 MB/s +Search_Hard_CachedRE2/64K 5000 447562 ns/op 146.43 MB/s +Search_Hard_CachedRE2/128K 2000 902071 ns/op 145.30 MB/s +Search_Hard_CachedRE2/256K 1000 1804780 ns/op 145.25 MB/s +Search_Hard_CachedRE2/512K 500 3601118 ns/op 145.59 MB/s +Search_Hard_CachedRE2/1M 500 7287856 ns/op 
143.88 MB/s +Search_Hard_CachedRE2/2M 100 14713470 ns/op 142.53 MB/s +Search_Hard_CachedRE2/4M 100 29151470 ns/op 143.88 MB/s +Search_Hard_CachedRE2/8M 50 58191300 ns/op 144.16 MB/s +Search_Hard_CachedRE2/16M 20 116104850 ns/op 144.50 MB/s +Search_Parens_CachedPCRE/8 5000000 305 ns/op 26.22 MB/s +Search_Parens_CachedRE2/8 5000000 329 ns/op 24.27 MB/s +Search_Parens_CachedRE2/16 5000000 386 ns/op 41.35 MB/s +Search_Parens_CachedRE2/32 5000000 494 ns/op 64.69 MB/s +Search_Parens_CachedRE2/64 5000000 711 ns/op 89.92 MB/s +Search_Parens_CachedRE2/128 1000000 1150 ns/op 111.21 MB/s +Search_Parens_CachedRE2/256 1000000 2018 ns/op 126.81 MB/s +Search_Parens_CachedRE2/512 500000 3767 ns/op 135.88 MB/s +Search_Parens_CachedRE2/1K 500000 7254 ns/op 141.15 MB/s +Search_Parens_CachedRE2/2K 200000 14250 ns/op 143.71 MB/s +Search_Parens_CachedRE2/4K 100000 28199 ns/op 145.25 MB/s +Search_Parens_CachedRE2/8K 50000 56158 ns/op 145.87 MB/s +Search_Parens_CachedRE2/16K 10000 112139 ns/op 146.10 MB/s +Search_Parens_CachedRE2/32K 10000 223758 ns/op 146.44 MB/s +Search_Parens_CachedRE2/64K 5000 447242 ns/op 146.53 MB/s +Search_Parens_CachedRE2/128K 2000 902342 ns/op 145.26 MB/s +Search_Parens_CachedRE2/256K 1000 1804484 ns/op 145.27 MB/s +Search_Parens_CachedRE2/512K 500 3603350 ns/op 145.50 MB/s +Search_Parens_CachedRE2/1M 500 7275228 ns/op 144.13 MB/s +Search_Parens_CachedRE2/2M 100 14546350 ns/op 144.17 MB/s +Search_Parens_CachedRE2/4M 100 29132730 ns/op 143.97 MB/s +Search_Parens_CachedRE2/8M 50 58143420 ns/op 144.27 MB/s +Search_Parens_CachedRE2/16M 20 116224000 ns/op 144.35 MB/s +Search_BigFixed_CachedPCRE/8 5000000 386 ns/op 20.73 MB/s +Search_BigFixed_CachedPCRE/16 5000000 475 ns/op 33.64 MB/s +Search_BigFixed_CachedPCRE/32 5000000 639 ns/op 50.07 MB/s +Search_BigFixed_CachedPCRE/64 2000000 966 ns/op 66.19 MB/s +Search_BigFixed_CachedPCRE/128 1000000 1619 ns/op 79.02 MB/s +Search_BigFixed_CachedPCRE/256 1000000 2927 ns/op 87.43 MB/s +Search_BigFixed_CachedPCRE/512 500000 5547 
ns/op 92.29 MB/s +Search_BigFixed_CachedPCRE/1K 200000 10789 ns/op 94.91 MB/s +Search_BigFixed_CachedPCRE/2K 100000 21254 ns/op 96.36 MB/s +Search_BigFixed_CachedPCRE/4K 50000 42248 ns/op 96.95 MB/s +Search_BigFixed_CachedPCRE/8K 20000 85732 ns/op 95.55 MB/s +Search_BigFixed_CachedPCRE/16K 10000 169041 ns/op 96.92 MB/s +Search_BigFixed_CachedPCRE/32K 5000 336530 ns/op 97.37 MB/s +Search_BigFixed_CachedRE2/8 10000000 173 ns/op 46.13 MB/s +Search_BigFixed_CachedRE2/16 5000000 358 ns/op 44.63 MB/s +Search_BigFixed_CachedRE2/32 5000000 428 ns/op 74.60 MB/s +Search_BigFixed_CachedRE2/64 5000000 552 ns/op 115.91 MB/s +Search_BigFixed_CachedRE2/128 2000000 786 ns/op 162.81 MB/s +Search_BigFixed_CachedRE2/256 1000000 1261 ns/op 202.95 MB/s +Search_BigFixed_CachedRE2/512 1000000 2226 ns/op 229.95 MB/s +Search_BigFixed_CachedRE2/1K 500000 4306 ns/op 237.77 MB/s +Search_BigFixed_CachedRE2/2K 200000 8298 ns/op 246.80 MB/s +Search_BigFixed_CachedRE2/4K 100000 15641 ns/op 261.87 MB/s +Search_BigFixed_CachedRE2/8K 50000 32298 ns/op 253.63 MB/s +Search_BigFixed_CachedRE2/16K 50000 64673 ns/op 253.33 MB/s +Search_BigFixed_CachedRE2/32K 10000 128773 ns/op 254.46 MB/s +Search_BigFixed_CachedRE2/64K 5000 260717 ns/op 251.37 MB/s +Search_BigFixed_CachedRE2/128K 5000 511763 ns/op 256.12 MB/s +Search_BigFixed_CachedRE2/256K 2000 1010685 ns/op 259.37 MB/s +Search_BigFixed_CachedRE2/512K 1000 2045435 ns/op 256.32 MB/s +Search_BigFixed_CachedRE2/1M 500 4194192 ns/op 250.01 MB/s +Search_Success_PCRE/8 500000 3180 ns/op 2.52 MB/s +Search_Success_PCRE/16 500000 3257 ns/op 4.91 MB/s +Search_Success_PCRE/32 500000 3398 ns/op 9.42 MB/s +Search_Success_PCRE/64 500000 3667 ns/op 17.45 MB/s +Search_Success_PCRE/128 500000 4217 ns/op 30.35 MB/s +Search_Success_PCRE/256 500000 5323 ns/op 48.09 MB/s +Search_Success_PCRE/512 200000 7548 ns/op 67.82 MB/s +Search_Success_PCRE/1K 200000 11978 ns/op 85.48 MB/s +Search_Success_PCRE/2K 100000 20952 ns/op 97.74 MB/s +Search_Success_PCRE/4K 50000 38810 ns/op 
105.54 MB/s +Search_Success_PCRE/8K 50000 74005 ns/op 110.69 MB/s +Search_Success_PCRE/16K 10000 145100 ns/op 112.91 MB/s +Search_Success_PCRE/32K 10000 286997 ns/op 114.18 MB/s +Search_Success_PCRE/64K 5000 570876 ns/op 114.80 MB/s +Search_Success_PCRE/128K 2000 1145287 ns/op 114.44 MB/s +Search_Success_PCRE/256K 1000 2293161 ns/op 114.32 MB/s +Search_Success_PCRE/512K 500 4615962 ns/op 113.58 MB/s +Search_Success_PCRE/1M 200 9465575 ns/op 110.78 MB/s +Search_Success_PCRE/2M 100 19204210 ns/op 109.20 MB/s +Search_Success_PCRE/4M 50 39546740 ns/op 106.06 MB/s +Search_Success_PCRE/8M 20 86620850 ns/op 96.84 MB/s +Search_Success_PCRE/16M 5 249759000 ns/op 67.17 MB/s +Search_Success_RE2/8 200000 11045 ns/op 0.72 MB/s +Search_Success_RE2/16 100000 24945 ns/op 0.64 MB/s +Search_Success_RE2/32 100000 25051 ns/op 1.28 MB/s +Search_Success_RE2/64 100000 25231 ns/op 2.54 MB/s +Search_Success_RE2/128 100000 25674 ns/op 4.99 MB/s +Search_Success_RE2/256 100000 26494 ns/op 9.66 MB/s +Search_Success_RE2/512 100000 28177 ns/op 18.17 MB/s +Search_Success_RE2/1K 50000 31724 ns/op 32.28 MB/s +Search_Success_RE2/2K 50000 38681 ns/op 52.95 MB/s +Search_Success_RE2/4K 50000 52757 ns/op 77.64 MB/s +Search_Success_RE2/8K 20000 81316 ns/op 100.74 MB/s +Search_Success_RE2/16K 10000 137268 ns/op 119.36 MB/s +Search_Success_RE2/32K 10000 250210 ns/op 130.96 MB/s +Search_Success_RE2/64K 5000 475959 ns/op 137.69 MB/s +Search_Success_RE2/128K 2000 932651 ns/op 140.54 MB/s +Search_Success_RE2/256K 1000 1834279 ns/op 142.91 MB/s +Search_Success_RE2/512K 500 3667904 ns/op 142.94 MB/s +Search_Success_RE2/1M 200 7492295 ns/op 139.95 MB/s +Search_Success_RE2/2M 100 15393340 ns/op 136.24 MB/s +Search_Success_RE2/4M 50 31713440 ns/op 132.26 MB/s +Search_Success_RE2/8M 20 70783000 ns/op 118.51 MB/s +Search_Success_RE2/16M 5 214766800 ns/op 78.12 MB/s +Search_Success_CachedPCRE/8 5000000 398 ns/op 20.07 MB/s +Search_Success_CachedPCRE/16 5000000 467 ns/op 34.21 MB/s +Search_Success_CachedPCRE/32 5000000 
606 ns/op 52.78 MB/s +Search_Success_CachedPCRE/64 2000000 889 ns/op 71.92 MB/s +Search_Success_CachedPCRE/128 1000000 1435 ns/op 89.15 MB/s +Search_Success_CachedPCRE/256 1000000 2548 ns/op 100.46 MB/s +Search_Success_CachedPCRE/512 500000 4759 ns/op 107.58 MB/s +Search_Success_CachedPCRE/1K 200000 9196 ns/op 111.34 MB/s +Search_Success_CachedPCRE/2K 100000 18028 ns/op 113.60 MB/s +Search_Success_CachedPCRE/4K 50000 35661 ns/op 114.86 MB/s +Search_Success_CachedPCRE/8K 50000 71119 ns/op 115.19 MB/s +Search_Success_CachedPCRE/16K 10000 141806 ns/op 115.54 MB/s +Search_Success_CachedPCRE/32K 10000 283456 ns/op 115.60 MB/s +Search_Success_CachedPCRE/64K 5000 567732 ns/op 115.43 MB/s +Search_Success_CachedPCRE/128K 2000 1138747 ns/op 115.10 MB/s +Search_Success_CachedPCRE/256K 1000 2313186 ns/op 113.33 MB/s +Search_Success_CachedPCRE/512K 500 4577496 ns/op 114.54 MB/s +Search_Success_CachedPCRE/1M 200 9356010 ns/op 112.08 MB/s +Search_Success_CachedPCRE/2M 100 19004790 ns/op 110.35 MB/s +Search_Success_CachedPCRE/4M 50 39343000 ns/op 106.61 MB/s +Search_Success_CachedPCRE/8M 20 86153650 ns/op 97.37 MB/s +Search_Success_CachedPCRE/16M 5 246868000 ns/op 67.96 MB/s +Search_Success_CachedRE2/8 10000000 194 ns/op 41.10 MB/s +Search_Success_CachedRE2/16 5000000 398 ns/op 40.20 MB/s +Search_Success_CachedRE2/32 5000000 503 ns/op 63.59 MB/s +Search_Success_CachedRE2/64 5000000 723 ns/op 88.49 MB/s +Search_Success_CachedRE2/128 1000000 1158 ns/op 110.49 MB/s +Search_Success_CachedRE2/256 1000000 2033 ns/op 125.88 MB/s +Search_Success_CachedRE2/512 500000 3778 ns/op 135.49 MB/s +Search_Success_CachedRE2/1K 500000 7267 ns/op 140.91 MB/s +Search_Success_CachedRE2/2K 200000 14244 ns/op 143.77 MB/s +Search_Success_CachedRE2/4K 100000 28205 ns/op 145.22 MB/s +Search_Success_CachedRE2/8K 50000 56127 ns/op 145.95 MB/s +Search_Success_CachedRE2/16K 10000 111843 ns/op 146.49 MB/s +Search_Success_CachedRE2/32K 10000 223998 ns/op 146.29 MB/s +Search_Success_CachedRE2/64K 5000 448512 ns/op 
146.12 MB/s +Search_Success_CachedRE2/128K 2000 901455 ns/op 145.40 MB/s +Search_Success_CachedRE2/256K 1000 1806001 ns/op 145.15 MB/s +Search_Success_CachedRE2/512K 500 3657618 ns/op 143.34 MB/s +Search_Success_CachedRE2/1M 200 7519345 ns/op 139.45 MB/s +Search_Success_CachedRE2/2M 100 15277030 ns/op 137.27 MB/s +Search_Success_CachedRE2/4M 50 31999980 ns/op 131.07 MB/s +Search_Success_CachedRE2/8M 20 70956150 ns/op 118.22 MB/s +Search_Success_CachedRE2/16M 5 216152800 ns/op 77.62 MB/s +Search_Success1_PCRE/8 500000 3423 ns/op 2.34 MB/s +Search_Success1_PCRE/16 500000 3479 ns/op 4.60 MB/s +Search_Success1_PCRE/32 500000 3569 ns/op 8.97 MB/s +Search_Success1_PCRE/64 500000 3861 ns/op 16.57 MB/s +Search_Success1_PCRE/128 500000 4451 ns/op 28.76 MB/s +Search_Success1_PCRE/256 500000 5540 ns/op 46.21 MB/s +Search_Success1_PCRE/512 200000 7746 ns/op 66.09 MB/s +Search_Success1_PCRE/1K 200000 12197 ns/op 83.95 MB/s +Search_Success1_PCRE/2K 100000 21043 ns/op 97.32 MB/s +Search_Success1_PCRE/4K 50000 38724 ns/op 105.77 MB/s +Search_Success1_PCRE/8K 50000 74377 ns/op 110.14 MB/s +Search_Success1_PCRE/16K 10000 145584 ns/op 112.54 MB/s +Search_Success1_PCRE/32K 10000 287938 ns/op 113.80 MB/s +Search_Success1_PCRE/64K 5000 573818 ns/op 114.21 MB/s +Search_Success1_PCRE/128K 2000 1143687 ns/op 114.60 MB/s +Search_Success1_PCRE/256K 1000 2289906 ns/op 114.48 MB/s +Search_Success1_PCRE/512K 500 4585568 ns/op 114.33 MB/s +Search_Success1_PCRE/1M 200 9418160 ns/op 111.34 MB/s +Search_Success1_PCRE/2M 100 19084930 ns/op 109.89 MB/s +Search_Success1_PCRE/4M 50 39363100 ns/op 106.55 MB/s +Search_Success1_PCRE/8M 20 86060150 ns/op 97.47 MB/s +Search_Success1_PCRE/16M 5 250110600 ns/op 67.08 MB/s +Search_Success1_RE2/8 50000 33378 ns/op 0.24 MB/s +Search_Success1_RE2/16 50000 33315 ns/op 0.48 MB/s +Search_Success1_RE2/32 50000 33282 ns/op 0.96 MB/s +Search_Success1_RE2/64 50000 33648 ns/op 1.90 MB/s +Search_Success1_RE2/128 50000 34114 ns/op 3.75 MB/s +Search_Success1_RE2/256 50000 
35068 ns/op 7.30 MB/s +Search_Success1_RE2/512 50000 36888 ns/op 13.88 MB/s +Search_Success1_RE2/1K 50000 40304 ns/op 25.41 MB/s +Search_Success1_RE2/2K 50000 47214 ns/op 43.38 MB/s +Search_Success1_RE2/4K 50000 61269 ns/op 66.85 MB/s +Search_Success1_RE2/8K 20000 89250 ns/op 91.79 MB/s +Search_Success1_RE2/16K 10000 146292 ns/op 111.99 MB/s +Search_Success1_RE2/32K 10000 258737 ns/op 126.65 MB/s +Search_Success1_RE2/64K 5000 484877 ns/op 135.16 MB/s +Search_Success1_RE2/128K 2000 943913 ns/op 138.86 MB/s +Search_Success1_RE2/256K 1000 1873214 ns/op 139.94 MB/s +Search_Success1_RE2/512K 500 3705398 ns/op 141.49 MB/s +Search_Success1_RE2/1M 200 7572110 ns/op 138.48 MB/s +Search_Success1_RE2/2M 100 15408090 ns/op 136.11 MB/s +Search_Success1_RE2/4M 50 31925020 ns/op 131.38 MB/s +Search_Success1_RE2/8M 20 71334800 ns/op 117.59 MB/s +Search_Success1_RE2/16M 5 215033000 ns/op 78.02 MB/s +Search_Success1_Cached_PCRE/8 5000000 444 ns/op 18.02 MB/s +Search_Success1_Cached_PCRE/16 5000000 512 ns/op 31.25 MB/s +Search_Success1_Cached_PCRE/32 5000000 648 ns/op 49.31 MB/s +Search_Success1_Cached_PCRE/64 2000000 924 ns/op 69.23 MB/s +Search_Success1_Cached_PCRE/128 1000000 1479 ns/op 86.50 MB/s +Search_Success1_Cached_PCRE/256 1000000 2583 ns/op 99.09 MB/s +Search_Success1_Cached_PCRE/512 500000 4820 ns/op 106.21 MB/s +Search_Success1_Cached_PCRE/1K 200000 9312 ns/op 109.95 MB/s +Search_Success1_Cached_PCRE/2K 100000 18101 ns/op 113.14 MB/s +Search_Success1_Cached_PCRE/4K 50000 35873 ns/op 114.18 MB/s +Search_Success1_Cached_PCRE/8K 50000 71355 ns/op 114.81 MB/s +Search_Success1_Cached_PCRE/16K 10000 142622 ns/op 114.88 MB/s +Search_Success1_Cached_PCRE/32K 10000 284619 ns/op 115.13 MB/s +Search_Success1_Cached_PCRE/64K 5000 569459 ns/op 115.08 MB/s +Search_Success1_Cached_PCRE/128K 2000 1141538 ns/op 114.82 MB/s +Search_Success1_Cached_PCRE/256K 1000 2284009 ns/op 114.77 MB/s +Search_Success1_Cached_PCRE/512K 500 4600102 ns/op 113.97 MB/s +Search_Success1_Cached_PCRE/1M 200 
9412150 ns/op 111.41 MB/s +Search_Success1_Cached_PCRE/2M 100 19149300 ns/op 109.52 MB/s +Search_Success1_Cached_PCRE/4M 50 39554360 ns/op 106.04 MB/s +Search_Success1_Cached_PCRE/8M 20 86455700 ns/op 97.03 MB/s +Search_Success1_Cached_PCRE/16M 5 247629000 ns/op 67.75 MB/s +Search_Success1_Cached_RE2/8 5000000 342 ns/op 23.34 MB/s +Search_Success1_Cached_RE2/16 5000000 393 ns/op 40.65 MB/s +Search_Success1_Cached_RE2/32 5000000 491 ns/op 65.09 MB/s +Search_Success1_Cached_RE2/64 5000000 722 ns/op 88.62 MB/s +Search_Success1_Cached_RE2/128 1000000 1157 ns/op 110.54 MB/s +Search_Success1_Cached_RE2/256 1000000 2032 ns/op 125.94 MB/s +Search_Success1_Cached_RE2/512 500000 3783 ns/op 135.32 MB/s +Search_Success1_Cached_RE2/1K 500000 7283 ns/op 140.59 MB/s +Search_Success1_Cached_RE2/2K 200000 14272 ns/op 143.49 MB/s +Search_Success1_Cached_RE2/4K 100000 28247 ns/op 145.00 MB/s +Search_Success1_Cached_RE2/8K 50000 56279 ns/op 145.56 MB/s +Search_Success1_Cached_RE2/16K 10000 112283 ns/op 145.92 MB/s +Search_Success1_Cached_RE2/32K 10000 224269 ns/op 146.11 MB/s +Search_Success1_Cached_RE2/64K 5000 448363 ns/op 146.17 MB/s +Search_Success1_Cached_RE2/128K 2000 903637 ns/op 145.05 MB/s +Search_Success1_Cached_RE2/256K 1000 1811174 ns/op 144.74 MB/s +Search_Success1_Cached_RE2/512K 500 3637266 ns/op 144.14 MB/s +Search_Success1_Cached_RE2/1M 200 7452810 ns/op 140.70 MB/s +Search_Success1_Cached_RE2/2M 100 15218540 ns/op 137.80 MB/s +Search_Success1_Cached_RE2/4M 50 31624240 ns/op 132.63 MB/s +Search_Success1_Cached_RE2/8M 20 70441100 ns/op 119.09 MB/s +Search_Success1_Cached_RE2/16M 5 214653600 ns/op 78.16 MB/s +Search_Digits_PCRE 500000 7117 ns/op +Search_Digits_RE2 100000 27121 ns/op +Parse_Digits_PCRE 500000 7214 ns/op +Parse_Digits_RE2 200000 13193 ns/op +Parse_CachedDigits_PCRE 2000000 771 ns/op +Parse_CachedDigits_RE2 5000000 452 ns/op +Parse_DigitDs_PCRE 500000 6655 ns/op +Parse_DigitDs_RE2 200000 12935 ns/op +Parse_CachedDigitDs_PCRE 2000000 761 ns/op 
+Parse_CachedDigitDs_RE2 5000000 452 ns/op +Parse_Split_PCRE 500000 4849 ns/op +Parse_Split_RE2 200000 14149 ns/op +Parse_CachedSplit_PCRE 5000000 572 ns/op +Parse_CachedSplit_RE2 10000000 278 ns/op +Parse_SplitHard_PCRE 500000 4695 ns/op +Parse_SplitHard_RE2 100000 17776 ns/op +Parse_CachedSplitHard_PCRE 5000000 558 ns/op +Parse_CachedSplitHard_RE2 500000 2925 ns/op +Parse_CachedSplitBig1_PCRE 200 8378325 ns/op +Parse_CachedSplitBig1_RE2 2000 1296256 ns/op +Parse_CachedSplitBig2_PCRE 2000 849668 ns/op +Parse_CachedSplitBig2_RE2 20 93559400 ns/op +BM_PCRE_Compile 500000 5773 ns/op +BM_RE2_Compile 200000 14117 ns/op +SearchPhone_CachedPCRE/8 1000000 2107 ns/op 3.80 MB/s +SearchPhone_CachedPCRE/16 500000 3511 ns/op 4.56 MB/s +SearchPhone_CachedPCRE/32 500000 6303 ns/op 5.08 MB/s +SearchPhone_CachedPCRE/64 200000 11898 ns/op 5.38 MB/s +SearchPhone_CachedPCRE/128 100000 23242 ns/op 5.51 MB/s +SearchPhone_CachedPCRE/256 50000 45867 ns/op 5.58 MB/s +SearchPhone_CachedPCRE/512 20000 90764 ns/op 5.64 MB/s +SearchPhone_CachedPCRE/1K 10000 180150 ns/op 5.68 MB/s +SearchPhone_CachedPCRE/2K 5000 356942 ns/op 5.74 MB/s +SearchPhone_CachedPCRE/4K 5000 707356 ns/op 5.79 MB/s +SearchPhone_CachedPCRE/8K 2000 1408777 ns/op 5.81 MB/s +SearchPhone_CachedPCRE/16K 1000 2816931 ns/op 5.82 MB/s +SearchPhone_CachedPCRE/32K 500 5630556 ns/op 5.82 MB/s +SearchPhone_CachedPCRE/64K 100 11257450 ns/op 5.82 MB/s +SearchPhone_CachedPCRE/128K 100 22480780 ns/op 5.83 MB/s +SearchPhone_CachedPCRE/256K 50 44877320 ns/op 5.84 MB/s +SearchPhone_CachedPCRE/512K 20 90030600 ns/op 5.82 MB/s +SearchPhone_CachedPCRE/1M 10 180520400 ns/op 5.81 MB/s +SearchPhone_CachedPCRE/2M 5 360229400 ns/op 5.82 MB/s +SearchPhone_CachedPCRE/4M 5 720922200 ns/op 5.82 MB/s +SearchPhone_CachedPCRE/8M 1 1443346000 ns/op 5.81 MB/s +SearchPhone_CachedPCRE/16M 1 2885907000 ns/op 5.81 MB/s +SearchPhone_CachedRE2/8 1000000 1035 ns/op 7.73 MB/s +SearchPhone_CachedRE2/16 1000000 1096 ns/op 14.59 MB/s +SearchPhone_CachedRE2/32 1000000 
1206 ns/op 26.53 MB/s +SearchPhone_CachedRE2/64 1000000 1421 ns/op 45.01 MB/s +SearchPhone_CachedRE2/128 1000000 1868 ns/op 68.49 MB/s +SearchPhone_CachedRE2/256 1000000 2742 ns/op 93.35 MB/s +SearchPhone_CachedRE2/512 500000 4488 ns/op 114.06 MB/s +SearchPhone_CachedRE2/1K 200000 7960 ns/op 128.63 MB/s +SearchPhone_CachedRE2/2K 200000 14980 ns/op 136.71 MB/s +SearchPhone_CachedRE2/4K 100000 28984 ns/op 141.32 MB/s +SearchPhone_CachedRE2/8K 50000 56914 ns/op 143.93 MB/s +SearchPhone_CachedRE2/16K 10000 113004 ns/op 144.99 MB/s +SearchPhone_CachedRE2/32K 10000 224690 ns/op 145.84 MB/s +SearchPhone_CachedRE2/64K 5000 449388 ns/op 145.83 MB/s +SearchPhone_CachedRE2/128K 2000 898866 ns/op 145.82 MB/s +SearchPhone_CachedRE2/256K 1000 1796509 ns/op 145.92 MB/s +SearchPhone_CachedRE2/512K 500 3590754 ns/op 146.01 MB/s +SearchPhone_CachedRE2/1M 500 7255254 ns/op 144.53 MB/s +SearchPhone_CachedRE2/2M 100 14476190 ns/op 144.87 MB/s +SearchPhone_CachedRE2/4M 100 28990300 ns/op 144.68 MB/s +SearchPhone_CachedRE2/8M 50 57857200 ns/op 144.99 MB/s +SearchPhone_CachedRE2/16M 20 115874300 ns/op 144.79 MB/s +EmptyPartialMatchPCRE 10000000 190 ns/op +EmptyPartialMatchRE2 10000000 272 ns/op +SimplePartialMatchPCRE 10000000 271 ns/op +SimplePartialMatchRE2 5000000 334 ns/op +HTTPPartialMatchPCRE 2000000 896 ns/op +HTTPPartialMatchRE2 1000000 1089 ns/op +SmallHTTPPartialMatchPCRE 2000000 895 ns/op +SmallHTTPPartialMatchRE2 1000000 1080 ns/op +DotMatchPCRE 2000000 863 ns/op +DotMatchRE2 1000000 1080 ns/op +ASCIIMatchPCRE 2000000 780 ns/op +ASCIIMatchRE2 1000000 1079 ns/op diff --git a/outside/re2/benchlog/benchlog.wreck b/outside/re2/benchlog/benchlog.wreck new file mode 100644 index 000000000..073ec4c6b --- /dev/null +++ b/outside/re2/benchlog/benchlog.wreck @@ -0,0 +1,1058 @@ +hw.machine = i386 +hw.model = MacPro1,1 +hw.ncpu = 4 +hw.byteorder = 1234 +hw.physmem = 2147483648 +hw.usermem = 1477443584 +hw.pagesize = 4096 +hw.epoch = 0 +hw.vectorunit = 1 +hw.busfrequency = 1332000000 
+hw.cpufrequency = 2660000000 +hw.cachelinesize = 64 +hw.l1icachesize = 32768 +hw.l1dcachesize = 32768 +hw.l2settings = 1 +hw.l2cachesize = 4194304 +hw.tbfrequency = 1000000000 +hw.memsize = 4294967296 +hw.availcpu = 4 +net.link.ether.inet.apple_hwcksum_rx: 1 +net.link.ether.inet.apple_hwcksum_tx: 1 +hw.ncpu: 4 +hw.byteorder: 1234 +hw.memsize: 4294967296 +hw.activecpu: 4 +hw.optional.x86_64: 1 +hw.optional.sse4_2: 0 +hw.optional.sse4_1: 0 +hw.optional.supplementalsse3: 1 +hw.optional.sse3: 1 +hw.optional.sse2: 1 +hw.optional.sse: 1 +hw.optional.mmx: 1 +hw.optional.floatingpoint: 1 +hw.packages: 2 +hw.tbfrequency: 1000000000 +hw.l2cachesize: 4194304 +hw.l1dcachesize: 32768 +hw.l1icachesize: 32768 +hw.cachelinesize: 64 +hw.cpufrequency_max: 2660000000 +hw.cpufrequency_min: 2660000000 +hw.cpufrequency: 2660000000 +hw.busfrequency_max: 1332000000 +hw.busfrequency_min: 1332000000 +hw.busfrequency: 1332000000 +hw.pagesize: 4096 +hw.cachesize: 4294967296 32768 4194304 0 0 0 0 0 0 0 +hw.cacheconfig: 4 1 2 0 0 0 0 0 0 0 +hw.cpufamily: 1114597871 +hw.cpu64bit_capable: 1 +hw.cpusubtype: 4 +hw.cputype: 7 +hw.logicalcpu_max: 4 +hw.logicalcpu: 4 +hw.physicalcpu_max: 4 +hw.physicalcpu: 4 +machdep.pmap.hashwalks: 1141082341 + +machdep.cpu.thread_count: 2 +machdep.cpu.core_count: 2 +machdep.cpu.address_bits.virtual: 48 +machdep.cpu.address_bits.physical: 36 +machdep.cpu.tlb.data_large: 32 +machdep.cpu.tlb.inst_large: 8 +machdep.cpu.tlb.data_small: 256 +machdep.cpu.tlb.inst_small: 128 +machdep.cpu.cache.size: 4096 +machdep.cpu.cache.L2_associativity: 8 +machdep.cpu.cache.linesize: 64 +machdep.cpu.arch_perf.fixed_width: 0 +machdep.cpu.arch_perf.fixed_number: 0 +machdep.cpu.arch_perf.events: 0 +machdep.cpu.arch_perf.events_number: 7 +machdep.cpu.arch_perf.width: 40 +machdep.cpu.arch_perf.number: 2 +machdep.cpu.arch_perf.version: 2 +machdep.cpu.thermal.ACNT_MCNT: 1 +machdep.cpu.thermal.thresholds: 2 +machdep.cpu.thermal.dynamic_acceleration: 0 +machdep.cpu.thermal.sensor: 1 
+machdep.cpu.mwait.sub_Cstates: 32 +machdep.cpu.mwait.extensions: 3 +machdep.cpu.mwait.linesize_max: 64 +machdep.cpu.mwait.linesize_min: 64 +machdep.cpu.microcode_version: 68 +machdep.cpu.cores_per_package: 2 +machdep.cpu.logical_per_package: 2 +machdep.cpu.extfeatures: XD EM64T +machdep.cpu.features: FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM SSE3 MON DSCPL VMX EST TM2 SSSE3 CX16 TPR PDCM +machdep.cpu.brand: 0 +machdep.cpu.signature: 1782 +machdep.cpu.extfeature_bits: 537919488 1 +machdep.cpu.feature_bits: -1075053569 320445 +machdep.cpu.stepping: 6 +machdep.cpu.extfamily: 0 +machdep.cpu.extmodel: 0 +machdep.cpu.model: 15 +machdep.cpu.family: 6 +machdep.cpu.brand_string: Intel(R) Xeon(R) CPU 5150 @ 2.66GHz +machdep.cpu.vendor: GenuineIntel + +==BENCHMARK== wreck.mtv.corp.google.com Fri Feb 26 13:45:06 PST 2010 +# Darwin wreck.mtv.corp.google.com 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:55:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_I386 i386 +# i686-apple-darwin9-g++-4.0.1 (GCC) 4.0.1 (Apple Inc. build 5484) +# Copyright (C) 2005 Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# +# d7671f473f1a+ tip +obj/test/regexp_benchmark: Mach-O executable i386 + +Search_Easy0_CachedPCRE/8 10000000 142 ns/op 56.08 MB/s +Search_Easy0_CachedPCRE/16 10000000 184 ns/op 86.70 MB/s +Search_Easy0_CachedPCRE/32 5000000 266 ns/op 120.15 MB/s +Search_Easy0_CachedPCRE/64 5000000 433 ns/op 147.54 MB/s +Search_Easy0_CachedPCRE/128 2000000 782 ns/op 163.61 MB/s +Search_Easy0_CachedPCRE/256 1000000 1435 ns/op 178.37 MB/s +Search_Easy0_CachedPCRE/512 500000 3151 ns/op 162.46 MB/s +Search_Easy0_CachedPCRE/1K 200000 6522 ns/op 156.99 MB/s +Search_Easy0_CachedPCRE/2K 100000 12024 ns/op 170.32 MB/s +Search_Easy0_CachedPCRE/4K 50000 24372 ns/op 168.06 MB/s +Search_Easy0_CachedPCRE/8K 50000 48326 ns/op 169.51 MB/s +Search_Easy0_CachedPCRE/16K 20000 96331 ns/op 170.08 MB/s +Search_Easy0_CachedPCRE/32K 10000 189172 ns/op 173.22 MB/s +Search_Easy0_CachedPCRE/64K 5000 380022 ns/op 172.45 MB/s +Search_Easy0_CachedPCRE/128K 2000 759526 ns/op 172.57 MB/s +Search_Easy0_CachedPCRE/256K 1000 1514090 ns/op 173.14 MB/s +Search_Easy0_CachedPCRE/512K 500 3039238 ns/op 172.51 MB/s +Search_Easy0_CachedPCRE/1M 200 6089745 ns/op 172.19 MB/s +Search_Easy0_CachedPCRE/2M 100 12326550 ns/op 170.13 MB/s +Search_Easy0_CachedPCRE/4M 50 24663580 ns/op 170.06 MB/s +Search_Easy0_CachedPCRE/8M 50 49853660 ns/op 168.26 MB/s +Search_Easy0_CachedPCRE/16M 10 100141300 ns/op 167.54 MB/s +Search_Easy0_CachedRE2/8 5000000 441 ns/op 18.13 MB/s +Search_Easy0_CachedRE2/16 5000000 451 ns/op 35.44 MB/s +Search_Easy0_CachedRE2/32 5000000 477 ns/op 66.96 MB/s +Search_Easy0_CachedRE2/64 2000000 527 ns/op 121.31 MB/s +Search_Easy0_CachedRE2/128 2000000 601 ns/op 212.78 MB/s +Search_Easy0_CachedRE2/256 2000000 800 ns/op 319.67 MB/s +Search_Easy0_CachedRE2/512 1000000 1189 ns/op 430.48 MB/s +Search_Easy0_CachedRE2/1K 1000000 2010 ns/op 509.44 MB/s +Search_Easy0_CachedRE2/2K 500000 3660 ns/op 559.55 MB/s +Search_Easy0_CachedRE2/4K 200000 7036 ns/op 582.08 MB/s +Search_Easy0_CachedRE2/8K 100000 13675 ns/op 599.04 MB/s 
+Search_Easy0_CachedRE2/16K 50000 27108 ns/op 604.38 MB/s +Search_Easy0_CachedRE2/32K 20000 53246 ns/op 615.40 MB/s +Search_Easy0_CachedRE2/64K 10000 105293 ns/op 622.41 MB/s +Search_Easy0_CachedRE2/128K 5000 210763 ns/op 621.89 MB/s +Search_Easy0_CachedRE2/256K 5000 418416 ns/op 626.51 MB/s +Search_Easy0_CachedRE2/512K 2000 840122 ns/op 624.06 MB/s +Search_Easy0_CachedRE2/1M 1000 1686321 ns/op 621.81 MB/s +Search_Easy0_CachedRE2/2M 500 3394234 ns/op 617.86 MB/s +Search_Easy0_CachedRE2/4M 200 6926710 ns/op 605.53 MB/s +Search_Easy0_CachedRE2/8M 100 13850090 ns/op 605.67 MB/s +Search_Easy0_CachedRE2/16M 50 27810480 ns/op 603.27 MB/s +Search_Easy1_CachedPCRE/8 10000000 145 ns/op 55.08 MB/s +Search_Easy1_CachedPCRE/16 10000000 186 ns/op 85.59 MB/s +Search_Easy1_CachedPCRE/32 5000000 268 ns/op 119.32 MB/s +Search_Easy1_CachedPCRE/64 5000000 436 ns/op 146.76 MB/s +Search_Easy1_CachedPCRE/128 2000000 786 ns/op 162.78 MB/s +Search_Easy1_CachedPCRE/256 1000000 1446 ns/op 176.97 MB/s +Search_Easy1_CachedPCRE/512 500000 2947 ns/op 173.70 MB/s +Search_Easy1_CachedPCRE/1K 200000 6212 ns/op 164.82 MB/s +Search_Easy1_CachedPCRE/2K 100000 12544 ns/op 163.25 MB/s +Search_Easy1_CachedPCRE/4K 50000 24997 ns/op 163.85 MB/s +Search_Easy1_CachedPCRE/8K 50000 49945 ns/op 164.02 MB/s +Search_Easy1_CachedPCRE/16K 20000 98856 ns/op 165.73 MB/s +Search_Easy1_CachedPCRE/32K 10000 196635 ns/op 166.64 MB/s +Search_Easy1_CachedPCRE/64K 5000 392336 ns/op 167.04 MB/s +Search_Easy1_CachedPCRE/128K 2000 781551 ns/op 167.71 MB/s +Search_Easy1_CachedPCRE/256K 1000 1572536 ns/op 166.70 MB/s +Search_Easy1_CachedPCRE/512K 500 3133634 ns/op 167.31 MB/s +Search_Easy1_CachedPCRE/1M 200 6268370 ns/op 167.28 MB/s +Search_Easy1_CachedPCRE/2M 100 12629380 ns/op 166.05 MB/s +Search_Easy1_CachedPCRE/4M 50 25311280 ns/op 165.71 MB/s +Search_Easy1_CachedPCRE/8M 20 50747250 ns/op 165.30 MB/s +Search_Easy1_CachedPCRE/16M 10 102157400 ns/op 164.23 MB/s +Search_Easy1_CachedRE2/8 5000000 431 ns/op 18.53 MB/s 
+Search_Easy1_CachedRE2/16 5000000 448 ns/op 35.70 MB/s +Search_Easy1_CachedRE2/32 5000000 475 ns/op 67.36 MB/s +Search_Easy1_CachedRE2/64 2000000 526 ns/op 121.54 MB/s +Search_Easy1_CachedRE2/128 2000000 603 ns/op 212.23 MB/s +Search_Easy1_CachedRE2/256 2000000 799 ns/op 320.12 MB/s +Search_Easy1_CachedRE2/512 1000000 1182 ns/op 433.15 MB/s +Search_Easy1_CachedRE2/1K 1000000 2001 ns/op 511.61 MB/s +Search_Easy1_CachedRE2/2K 500000 3639 ns/op 562.68 MB/s +Search_Easy1_CachedRE2/4K 200000 7020 ns/op 583.43 MB/s +Search_Easy1_CachedRE2/8K 100000 13720 ns/op 597.04 MB/s +Search_Easy1_CachedRE2/16K 50000 27091 ns/op 604.76 MB/s +Search_Easy1_CachedRE2/32K 20000 53363 ns/op 614.06 MB/s +Search_Easy1_CachedRE2/64K 10000 104803 ns/op 625.32 MB/s +Search_Easy1_CachedRE2/128K 5000 210012 ns/op 624.11 MB/s +Search_Easy1_CachedRE2/256K 5000 416117 ns/op 629.98 MB/s +Search_Easy1_CachedRE2/512K 2000 832909 ns/op 629.47 MB/s +Search_Easy1_CachedRE2/1M 1000 1685969 ns/op 621.94 MB/s +Search_Easy1_CachedRE2/2M 500 3388716 ns/op 618.86 MB/s +Search_Easy1_CachedRE2/4M 200 6872645 ns/op 610.29 MB/s +Search_Easy1_CachedRE2/8M 100 13975650 ns/op 600.23 MB/s +Search_Easy1_CachedRE2/16M 50 27882420 ns/op 601.71 MB/s +Search_Medium_CachedPCRE/8 10000000 144 ns/op 55.25 MB/s +Search_Medium_CachedPCRE/16 10000000 192 ns/op 83.21 MB/s +Search_Medium_CachedPCRE/32 5000000 280 ns/op 114.08 MB/s +Search_Medium_CachedPCRE/64 5000000 452 ns/op 141.46 MB/s +Search_Medium_CachedPCRE/128 200000 6086 ns/op 21.03 MB/s +Search_Medium_CachedPCRE/256 100000 11456 ns/op 22.35 MB/s +Search_Medium_CachedPCRE/512 50000 27208 ns/op 18.82 MB/s +Search_Medium_CachedPCRE/1K 20000 53266 ns/op 19.22 MB/s +Search_Medium_CachedPCRE/2K 20000 84985 ns/op 24.10 MB/s +Search_Medium_CachedPCRE/4K 5000 205715 ns/op 19.91 MB/s +Search_Medium_CachedPCRE/8K 5000 421092 ns/op 19.45 MB/s +Search_Medium_CachedPCRE/16K 2000 847861 ns/op 19.32 MB/s +Search_Medium_CachedPCRE/32K 1000 1688903 ns/op 19.40 MB/s 
+Search_Medium_CachedPCRE/64K 500 3374828 ns/op 19.42 MB/s +Search_Medium_CachedPCRE/128K 200 6737375 ns/op 19.45 MB/s +Search_Medium_CachedPCRE/256K 100 13497210 ns/op 19.42 MB/s +Search_Medium_CachedRE2/8 5000000 456 ns/op 17.53 MB/s +Search_Medium_CachedRE2/16 5000000 499 ns/op 32.05 MB/s +Search_Medium_CachedRE2/32 2000000 575 ns/op 55.62 MB/s +Search_Medium_CachedRE2/64 2000000 730 ns/op 87.61 MB/s +Search_Medium_CachedRE2/128 1000000 1051 ns/op 121.72 MB/s +Search_Medium_CachedRE2/256 1000000 1695 ns/op 150.98 MB/s +Search_Medium_CachedRE2/512 500000 2947 ns/op 173.73 MB/s +Search_Medium_CachedRE2/1K 200000 5474 ns/op 187.04 MB/s +Search_Medium_CachedRE2/2K 100000 10384 ns/op 197.21 MB/s +Search_Medium_CachedRE2/4K 50000 20546 ns/op 199.35 MB/s +Search_Medium_CachedRE2/8K 50000 39540 ns/op 207.18 MB/s +Search_Medium_CachedRE2/16K 20000 77860 ns/op 210.43 MB/s +Search_Medium_CachedRE2/32K 10000 154440 ns/op 212.17 MB/s +Search_Medium_CachedRE2/64K 5000 306800 ns/op 213.61 MB/s +Search_Medium_CachedRE2/128K 2000 627489 ns/op 208.88 MB/s +Search_Medium_CachedRE2/256K 1000 1232221 ns/op 212.74 MB/s +Search_Medium_CachedRE2/512K 500 2473372 ns/op 211.97 MB/s +Search_Medium_CachedRE2/1M 500 4963800 ns/op 211.24 MB/s +Search_Medium_CachedRE2/2M 200 10010555 ns/op 209.49 MB/s +Search_Medium_CachedRE2/4M 50 20355180 ns/op 206.06 MB/s +Search_Medium_CachedRE2/8M 50 40085120 ns/op 209.27 MB/s +Search_Medium_CachedRE2/16M 20 81232650 ns/op 206.53 MB/s +Search_Hard_CachedPCRE/8 10000000 145 ns/op 54.95 MB/s +Search_Hard_CachedPCRE/16 10000000 191 ns/op 83.60 MB/s +Search_Hard_CachedPCRE/32 5000000 279 ns/op 114.53 MB/s +Search_Hard_CachedPCRE/64 5000000 463 ns/op 137.99 MB/s +Search_Hard_CachedPCRE/128 5000 235508 ns/op 0.54 MB/s +Search_Hard_CachedPCRE/256 2000 885356 ns/op 0.29 MB/s +Search_Hard_CachedPCRE/512 500 3682430 ns/op 0.14 MB/s +Search_Hard_CachedPCRE/1K 100 14493660 ns/op 0.07 MB/s +Search_Hard_CachedPCRE/2K 20 54810600 ns/op 0.04 MB/s 
+Search_Hard_CachedPCRE/4K 5 236421800 ns/op 0.02 MB/s +Search_Hard_CachedRE2/8 5000000 460 ns/op 17.39 MB/s +Search_Hard_CachedRE2/16 5000000 498 ns/op 32.11 MB/s +Search_Hard_CachedRE2/32 2000000 570 ns/op 56.05 MB/s +Search_Hard_CachedRE2/64 2000000 726 ns/op 88.08 MB/s +Search_Hard_CachedRE2/128 1000000 1044 ns/op 122.53 MB/s +Search_Hard_CachedRE2/256 1000000 1669 ns/op 153.37 MB/s +Search_Hard_CachedRE2/512 500000 2910 ns/op 175.92 MB/s +Search_Hard_CachedRE2/1K 200000 5380 ns/op 190.32 MB/s +Search_Hard_CachedRE2/2K 100000 10730 ns/op 190.86 MB/s +Search_Hard_CachedRE2/4K 50000 20827 ns/op 196.66 MB/s +Search_Hard_CachedRE2/8K 50000 39641 ns/op 206.65 MB/s +Search_Hard_CachedRE2/16K 20000 78174 ns/op 209.58 MB/s +Search_Hard_CachedRE2/32K 10000 154236 ns/op 212.45 MB/s +Search_Hard_CachedRE2/64K 5000 307131 ns/op 213.38 MB/s +Search_Hard_CachedRE2/128K 2000 617929 ns/op 212.11 MB/s +Search_Hard_CachedRE2/256K 1000 1235441 ns/op 212.19 MB/s +Search_Hard_CachedRE2/512K 500 2465954 ns/op 212.61 MB/s +Search_Hard_CachedRE2/1M 500 4943778 ns/op 212.10 MB/s +Search_Hard_CachedRE2/2M 200 9957805 ns/op 210.60 MB/s +Search_Hard_CachedRE2/4M 50 20109920 ns/op 208.57 MB/s +Search_Hard_CachedRE2/8M 50 40249680 ns/op 208.41 MB/s +Search_Hard_CachedRE2/16M 20 79626800 ns/op 210.70 MB/s +Search_Parens_CachedPCRE/8 5000000 207 ns/op 38.46 MB/s +Search_Parens_CachedRE2/8 5000000 460 ns/op 17.35 MB/s +Search_Parens_CachedRE2/16 5000000 499 ns/op 32.01 MB/s +Search_Parens_CachedRE2/32 2000000 566 ns/op 56.44 MB/s +Search_Parens_CachedRE2/64 2000000 731 ns/op 87.44 MB/s +Search_Parens_CachedRE2/128 1000000 1046 ns/op 122.35 MB/s +Search_Parens_CachedRE2/256 1000000 1674 ns/op 152.87 MB/s +Search_Parens_CachedRE2/512 500000 2889 ns/op 177.21 MB/s +Search_Parens_CachedRE2/1K 200000 5456 ns/op 187.68 MB/s +Search_Parens_CachedRE2/2K 100000 10527 ns/op 194.54 MB/s +Search_Parens_CachedRE2/4K 50000 20632 ns/op 198.52 MB/s +Search_Parens_CachedRE2/8K 50000 39791 ns/op 205.87 MB/s 
+Search_Parens_CachedRE2/16K 20000 77748 ns/op 210.73 MB/s +Search_Parens_CachedRE2/32K 10000 154317 ns/op 212.34 MB/s +Search_Parens_CachedRE2/64K 5000 306631 ns/op 213.73 MB/s +Search_Parens_CachedRE2/128K 2000 618071 ns/op 212.07 MB/s +Search_Parens_CachedRE2/256K 1000 1231452 ns/op 212.87 MB/s +Search_Parens_CachedRE2/512K 500 2463338 ns/op 212.84 MB/s +Search_Parens_CachedRE2/1M 500 4945594 ns/op 212.02 MB/s +Search_Parens_CachedRE2/2M 100 10028120 ns/op 209.13 MB/s +Search_Parens_CachedRE2/4M 50 20201820 ns/op 207.62 MB/s +Search_Parens_CachedRE2/8M 50 40668120 ns/op 206.27 MB/s +Search_Parens_CachedRE2/16M 20 80655350 ns/op 208.01 MB/s +Search_BigFixed_CachedPCRE/8 5000000 285 ns/op 28.06 MB/s +Search_BigFixed_CachedPCRE/16 5000000 371 ns/op 43.10 MB/s +Search_BigFixed_CachedPCRE/32 2000000 544 ns/op 58.77 MB/s +Search_BigFixed_CachedPCRE/64 2000000 891 ns/op 71.75 MB/s +Search_BigFixed_CachedPCRE/128 1000000 1599 ns/op 80.04 MB/s +Search_BigFixed_CachedPCRE/256 500000 2995 ns/op 85.46 MB/s +Search_BigFixed_CachedPCRE/512 200000 5724 ns/op 89.44 MB/s +Search_BigFixed_CachedPCRE/1K 100000 11311 ns/op 90.53 MB/s +Search_BigFixed_CachedPCRE/2K 50000 22347 ns/op 91.65 MB/s +Search_BigFixed_CachedPCRE/4K 50000 44379 ns/op 92.29 MB/s +Search_BigFixed_CachedPCRE/8K 20000 87509 ns/op 93.61 MB/s +Search_BigFixed_CachedPCRE/16K 10000 175594 ns/op 93.31 MB/s +Search_BigFixed_CachedPCRE/32K 5000 352953 ns/op 92.84 MB/s +Search_BigFixed_CachedRE2/8 10000000 164 ns/op 48.68 MB/s +Search_BigFixed_CachedRE2/16 5000000 487 ns/op 32.84 MB/s +Search_BigFixed_CachedRE2/32 2000000 539 ns/op 59.28 MB/s +Search_BigFixed_CachedRE2/64 2000000 612 ns/op 104.53 MB/s +Search_BigFixed_CachedRE2/128 2000000 781 ns/op 163.87 MB/s +Search_BigFixed_CachedRE2/256 1000000 1082 ns/op 236.58 MB/s +Search_BigFixed_CachedRE2/512 1000000 1689 ns/op 303.00 MB/s +Search_BigFixed_CachedRE2/1K 500000 2924 ns/op 350.17 MB/s +Search_BigFixed_CachedRE2/2K 200000 5753 ns/op 355.99 MB/s 
+Search_BigFixed_CachedRE2/4K 100000 10436 ns/op 392.46 MB/s +Search_BigFixed_CachedRE2/8K 50000 20223 ns/op 405.08 MB/s +Search_BigFixed_CachedRE2/16K 50000 40733 ns/op 402.23 MB/s +Search_BigFixed_CachedRE2/32K 20000 80342 ns/op 407.86 MB/s +Search_BigFixed_CachedRE2/64K 10000 159585 ns/op 410.66 MB/s +Search_BigFixed_CachedRE2/128K 5000 320376 ns/op 409.12 MB/s +Search_BigFixed_CachedRE2/256K 2000 641718 ns/op 408.50 MB/s +Search_BigFixed_CachedRE2/512K 1000 1290373 ns/op 406.31 MB/s +Search_BigFixed_CachedRE2/1M 500 2638566 ns/op 397.40 MB/s +Search_Success_PCRE/8 500000 3393 ns/op 2.36 MB/s +Search_Success_PCRE/16 500000 3469 ns/op 4.61 MB/s +Search_Success_PCRE/32 500000 3499 ns/op 9.15 MB/s +Search_Success_PCRE/64 500000 3848 ns/op 16.63 MB/s +Search_Success_PCRE/128 500000 4582 ns/op 27.93 MB/s +Search_Success_PCRE/256 200000 5678 ns/op 45.08 MB/s +Search_Success_PCRE/512 200000 8267 ns/op 61.93 MB/s +Search_Success_PCRE/1K 100000 13341 ns/op 76.75 MB/s +Search_Success_PCRE/2K 50000 23974 ns/op 85.42 MB/s +Search_Success_PCRE/4K 50000 44459 ns/op 92.13 MB/s +Search_Success_PCRE/8K 20000 87665 ns/op 93.45 MB/s +Search_Success_PCRE/16K 10000 174412 ns/op 93.94 MB/s +Search_Success_PCRE/32K 5000 348685 ns/op 93.98 MB/s +Search_Success_PCRE/64K 2000 695853 ns/op 94.18 MB/s +Search_Success_PCRE/128K 1000 1382530 ns/op 94.81 MB/s +Search_Success_PCRE/256K 500 2777966 ns/op 94.37 MB/s +Search_Success_PCRE/512K 200 5622585 ns/op 93.25 MB/s +Search_Success_PCRE/1M 100 11355970 ns/op 92.34 MB/s +Search_Success_PCRE/2M 50 23359260 ns/op 89.78 MB/s +Search_Success_PCRE/4M 20 50359900 ns/op 83.29 MB/s +Search_Success_PCRE/8M 10 111431900 ns/op 75.28 MB/s +Search_Success_PCRE/16M 5 265918600 ns/op 63.09 MB/s +Search_Success_RE2/8 100000 16060 ns/op 0.50 MB/s +Search_Success_RE2/16 50000 34580 ns/op 0.46 MB/s +Search_Success_RE2/32 50000 35094 ns/op 0.91 MB/s +Search_Success_RE2/64 50000 35110 ns/op 1.82 MB/s +Search_Success_RE2/128 50000 35001 ns/op 3.66 MB/s 
+Search_Success_RE2/256 50000 35354 ns/op 7.24 MB/s +Search_Success_RE2/512 50000 36899 ns/op 13.88 MB/s +Search_Success_RE2/1K 50000 39012 ns/op 26.25 MB/s +Search_Success_RE2/2K 50000 42906 ns/op 47.73 MB/s +Search_Success_RE2/4K 20000 53136 ns/op 77.08 MB/s +Search_Success_RE2/8K 20000 72624 ns/op 112.80 MB/s +Search_Success_RE2/16K 10000 112251 ns/op 145.96 MB/s +Search_Success_RE2/32K 10000 189404 ns/op 173.01 MB/s +Search_Success_RE2/64K 5000 345391 ns/op 189.74 MB/s +Search_Success_RE2/128K 2000 651836 ns/op 201.08 MB/s +Search_Success_RE2/256K 1000 1265262 ns/op 207.19 MB/s +Search_Success_RE2/512K 500 2516902 ns/op 208.31 MB/s +Search_Success_RE2/1M 200 5097685 ns/op 205.70 MB/s +Search_Success_RE2/2M 100 10551640 ns/op 198.75 MB/s +Search_Success_RE2/4M 50 22130760 ns/op 189.52 MB/s +Search_Success_RE2/8M 20 51212750 ns/op 163.80 MB/s +Search_Success_RE2/16M 10 125281500 ns/op 133.92 MB/s +Search_Success_CachedPCRE/8 5000000 276 ns/op 28.97 MB/s +Search_Success_CachedPCRE/16 5000000 354 ns/op 45.15 MB/s +Search_Success_CachedPCRE/32 2000000 515 ns/op 62.03 MB/s +Search_Success_CachedPCRE/64 2000000 823 ns/op 77.73 MB/s +Search_Success_CachedPCRE/128 1000000 1470 ns/op 87.05 MB/s +Search_Success_CachedPCRE/256 500000 2739 ns/op 93.46 MB/s +Search_Success_CachedPCRE/512 200000 5254 ns/op 97.44 MB/s +Search_Success_CachedPCRE/1K 100000 10228 ns/op 100.11 MB/s +Search_Success_CachedPCRE/2K 50000 20449 ns/op 100.15 MB/s +Search_Success_CachedPCRE/4K 50000 41084 ns/op 99.70 MB/s +Search_Success_CachedPCRE/8K 20000 84617 ns/op 96.81 MB/s +Search_Success_CachedPCRE/16K 10000 168594 ns/op 97.18 MB/s +Search_Success_CachedPCRE/32K 5000 339675 ns/op 96.47 MB/s +Search_Success_CachedPCRE/64K 2000 682138 ns/op 96.07 MB/s +Search_Success_CachedPCRE/128K 1000 1373131 ns/op 95.45 MB/s +Search_Success_CachedPCRE/256K 500 2767366 ns/op 94.73 MB/s +Search_Success_CachedPCRE/512K 200 5562225 ns/op 94.26 MB/s +Search_Success_CachedPCRE/1M 100 11188570 ns/op 93.72 MB/s 
+Search_Success_CachedPCRE/2M 50 23191460 ns/op 90.43 MB/s +Search_Success_CachedPCRE/4M 20 50011200 ns/op 83.87 MB/s +Search_Success_CachedPCRE/8M 10 111201800 ns/op 75.44 MB/s +Search_Success_CachedPCRE/16M 5 266875000 ns/op 62.87 MB/s +Search_Success_CachedRE2/8 10000000 183 ns/op 43.67 MB/s +Search_Success_CachedRE2/16 5000000 491 ns/op 32.56 MB/s +Search_Success_CachedRE2/32 2000000 582 ns/op 54.98 MB/s +Search_Success_CachedRE2/64 2000000 738 ns/op 86.61 MB/s +Search_Success_CachedRE2/128 1000000 1043 ns/op 122.69 MB/s +Search_Success_CachedRE2/256 1000000 1623 ns/op 157.70 MB/s +Search_Success_CachedRE2/512 500000 2854 ns/op 179.39 MB/s +Search_Success_CachedRE2/1K 200000 5165 ns/op 198.23 MB/s +Search_Success_CachedRE2/2K 100000 10648 ns/op 192.32 MB/s +Search_Success_CachedRE2/4K 50000 20892 ns/op 196.05 MB/s +Search_Success_CachedRE2/8K 50000 38909 ns/op 210.54 MB/s +Search_Success_CachedRE2/16K 20000 76762 ns/op 213.44 MB/s +Search_Success_CachedRE2/32K 10000 153917 ns/op 212.89 MB/s +Search_Success_CachedRE2/64K 5000 307908 ns/op 212.84 MB/s +Search_Success_CachedRE2/128K 2000 610789 ns/op 214.59 MB/s +Search_Success_CachedRE2/256K 1000 1228572 ns/op 213.37 MB/s +Search_Success_CachedRE2/512K 500 2467884 ns/op 212.44 MB/s +Search_Success_CachedRE2/1M 200 5100045 ns/op 205.60 MB/s +Search_Success_CachedRE2/2M 100 10388080 ns/op 201.88 MB/s +Search_Success_CachedRE2/4M 50 22091760 ns/op 189.86 MB/s +Search_Success_CachedRE2/8M 20 51066600 ns/op 164.27 MB/s +Search_Success_CachedRE2/16M 10 124756300 ns/op 134.48 MB/s +Search_Success1_PCRE/8 500000 3329 ns/op 2.40 MB/s +Search_Success1_PCRE/16 500000 3422 ns/op 4.68 MB/s +Search_Success1_PCRE/32 500000 3562 ns/op 8.98 MB/s +Search_Success1_PCRE/64 500000 3875 ns/op 16.51 MB/s +Search_Success1_PCRE/128 500000 4487 ns/op 28.52 MB/s +Search_Success1_PCRE/256 200000 5781 ns/op 44.28 MB/s +Search_Success1_PCRE/512 200000 8232 ns/op 62.20 MB/s +Search_Success1_PCRE/1K 100000 13396 ns/op 76.44 MB/s 
+Search_Success1_PCRE/2K 50000 24063 ns/op 85.11 MB/s +Search_Success1_PCRE/4K 50000 44662 ns/op 91.71 MB/s +Search_Success1_PCRE/8K 20000 87800 ns/op 93.30 MB/s +Search_Success1_PCRE/16K 10000 173248 ns/op 94.57 MB/s +Search_Success1_PCRE/32K 5000 345953 ns/op 94.72 MB/s +Search_Success1_PCRE/64K 2000 690898 ns/op 94.86 MB/s +Search_Success1_PCRE/128K 1000 1380064 ns/op 94.98 MB/s +Search_Success1_PCRE/256K 500 2756944 ns/op 95.08 MB/s +Search_Success1_PCRE/512K 200 5554180 ns/op 94.40 MB/s +Search_Success1_PCRE/1M 100 11227360 ns/op 93.39 MB/s +Search_Success1_PCRE/2M 50 23068500 ns/op 90.91 MB/s +Search_Success1_PCRE/4M 50 46455720 ns/op 90.29 MB/s +Search_Success1_PCRE/8M 10 112184900 ns/op 74.77 MB/s +Search_Success1_PCRE/16M 5 267271800 ns/op 62.77 MB/s +Search_Success1_RE2/8 50000 47078 ns/op 0.17 MB/s +Search_Success1_RE2/16 50000 46927 ns/op 0.34 MB/s +Search_Success1_RE2/32 50000 46852 ns/op 0.68 MB/s +Search_Success1_RE2/64 50000 47478 ns/op 1.35 MB/s +Search_Success1_RE2/128 50000 47471 ns/op 2.70 MB/s +Search_Success1_RE2/256 50000 47911 ns/op 5.34 MB/s +Search_Success1_RE2/512 50000 48982 ns/op 10.45 MB/s +Search_Success1_RE2/1K 20000 50955 ns/op 20.10 MB/s +Search_Success1_RE2/2K 20000 55280 ns/op 37.05 MB/s +Search_Success1_RE2/4K 20000 65176 ns/op 62.84 MB/s +Search_Success1_RE2/8K 20000 84613 ns/op 96.82 MB/s +Search_Success1_RE2/16K 10000 125384 ns/op 130.67 MB/s +Search_Success1_RE2/32K 5000 200634 ns/op 163.32 MB/s +Search_Success1_RE2/64K 5000 352274 ns/op 186.04 MB/s +Search_Success1_RE2/128K 2000 655683 ns/op 199.90 MB/s +Search_Success1_RE2/256K 1000 1289421 ns/op 203.30 MB/s +Search_Success1_RE2/512K 500 2514970 ns/op 208.47 MB/s +Search_Success1_RE2/1M 200 5109155 ns/op 205.23 MB/s +Search_Success1_RE2/2M 100 10655670 ns/op 196.81 MB/s +Search_Success1_RE2/4M 50 22707220 ns/op 184.71 MB/s +Search_Success1_RE2/8M 20 50906850 ns/op 164.78 MB/s +Search_Success1_RE2/16M 10 125901300 ns/op 133.26 MB/s +Search_Success1_Cached_PCRE/8 5000000 308 
ns/op 25.89 MB/s +Search_Success1_Cached_PCRE/16 5000000 390 ns/op 40.98 MB/s +Search_Success1_Cached_PCRE/32 2000000 556 ns/op 57.51 MB/s +Search_Success1_Cached_PCRE/64 2000000 862 ns/op 74.24 MB/s +Search_Success1_Cached_PCRE/128 1000000 1585 ns/op 80.72 MB/s +Search_Success1_Cached_PCRE/256 500000 2772 ns/op 92.34 MB/s +Search_Success1_Cached_PCRE/512 200000 5261 ns/op 97.31 MB/s +Search_Success1_Cached_PCRE/1K 100000 10302 ns/op 99.40 MB/s +Search_Success1_Cached_PCRE/2K 50000 20828 ns/op 98.33 MB/s +Search_Success1_Cached_PCRE/4K 50000 41370 ns/op 99.01 MB/s +Search_Success1_Cached_PCRE/8K 20000 84354 ns/op 97.11 MB/s +Search_Success1_Cached_PCRE/16K 10000 170170 ns/op 96.28 MB/s +Search_Success1_Cached_PCRE/32K 5000 342755 ns/op 95.60 MB/s +Search_Success1_Cached_PCRE/64K 2000 688438 ns/op 95.20 MB/s +Search_Success1_Cached_PCRE/128K 1000 1372324 ns/op 95.51 MB/s +Search_Success1_Cached_PCRE/256K 500 2771422 ns/op 94.59 MB/s +Search_Success1_Cached_PCRE/512K 200 5608635 ns/op 93.48 MB/s +Search_Success1_Cached_PCRE/1M 100 11354700 ns/op 92.35 MB/s +Search_Success1_Cached_PCRE/2M 50 23295740 ns/op 90.02 MB/s +Search_Success1_Cached_PCRE/4M 20 50142650 ns/op 83.65 MB/s +Search_Success1_Cached_PCRE/8M 10 111720200 ns/op 75.09 MB/s +Search_Success1_Cached_PCRE/16M 5 269077800 ns/op 62.35 MB/s +Search_Success1_Cached_RE2/8 5000000 461 ns/op 17.35 MB/s +Search_Success1_Cached_RE2/16 2000000 503 ns/op 31.76 MB/s +Search_Success1_Cached_RE2/32 2000000 579 ns/op 55.25 MB/s +Search_Success1_Cached_RE2/64 2000000 739 ns/op 86.50 MB/s +Search_Success1_Cached_RE2/128 1000000 1033 ns/op 123.83 MB/s +Search_Success1_Cached_RE2/256 1000000 1643 ns/op 155.77 MB/s +Search_Success1_Cached_RE2/512 500000 2869 ns/op 178.40 MB/s +Search_Success1_Cached_RE2/1K 200000 5099 ns/op 200.79 MB/s +Search_Success1_Cached_RE2/2K 100000 10309 ns/op 198.64 MB/s +Search_Success1_Cached_RE2/4K 100000 19360 ns/op 211.57 MB/s +Search_Success1_Cached_RE2/8K 50000 38961 ns/op 210.26 MB/s 
+Search_Success1_Cached_RE2/16K 20000 78081 ns/op 209.83 MB/s +Search_Success1_Cached_RE2/32K 10000 154337 ns/op 212.31 MB/s +Search_Success1_Cached_RE2/64K 5000 306992 ns/op 213.48 MB/s +Search_Success1_Cached_RE2/128K 2000 609073 ns/op 215.20 MB/s +Search_Success1_Cached_RE2/256K 1000 1226916 ns/op 213.66 MB/s +Search_Success1_Cached_RE2/512K 500 2486650 ns/op 210.84 MB/s +Search_Success1_Cached_RE2/1M 200 5026605 ns/op 208.61 MB/s +Search_Success1_Cached_RE2/2M 100 10540280 ns/op 198.97 MB/s +Search_Success1_Cached_RE2/4M 50 22296140 ns/op 188.12 MB/s +Search_Success1_Cached_RE2/8M 20 51183250 ns/op 163.89 MB/s +Search_Success1_Cached_RE2/16M 10 125691100 ns/op 133.48 MB/s +Search_Digits_PCRE 200000 7096 ns/op +Search_Digits_RE2 50000 37491 ns/op +Parse_Digits_PCRE 200000 7325 ns/op +Parse_Digits_RE2 100000 19423 ns/op +Parse_CachedDigits_PCRE 2000000 596 ns/op +Parse_CachedDigits_RE2 5000000 325 ns/op +Parse_DigitDs_PCRE 200000 6459 ns/op +Parse_DigitDs_RE2 100000 19040 ns/op +Parse_CachedDigitDs_PCRE 2000000 591 ns/op +Parse_CachedDigitDs_RE2 5000000 334 ns/op +Parse_Split_PCRE 500000 4865 ns/op +Parse_Split_RE2 50000 20898 ns/op +Parse_CachedSplit_PCRE 5000000 424 ns/op +Parse_CachedSplit_RE2 5000000 237 ns/op +Parse_SplitHard_PCRE 500000 4821 ns/op +Parse_SplitHard_RE2 50000 25920 ns/op +Parse_CachedSplitHard_PCRE 5000000 422 ns/op +Parse_CachedSplitHard_RE2 500000 2340 ns/op +Parse_CachedSplitBig1_PCRE 200 5460640 ns/op +Parse_CachedSplitBig1_RE2 2000 935880 ns/op +Parse_CachedSplitBig2_PCRE 1000 1050260 ns/op +Parse_CachedSplitBig2_RE2 10 100186200 ns/op +BM_PCRE_Compile 200000 5937 ns/op +BM_RE2_Compile 50000 22091 ns/op +SearchPhone_CachedPCRE/8 1000000 1520 ns/op 5.26 MB/s +SearchPhone_CachedPCRE/16 500000 2461 ns/op 6.50 MB/s +SearchPhone_CachedPCRE/32 500000 4142 ns/op 7.72 MB/s +SearchPhone_CachedPCRE/64 200000 7477 ns/op 8.56 MB/s +SearchPhone_CachedPCRE/128 100000 14151 ns/op 9.04 MB/s +SearchPhone_CachedPCRE/256 50000 27740 ns/op 9.23 MB/s 
+SearchPhone_CachedPCRE/512 20000 55556 ns/op 9.22 MB/s +SearchPhone_CachedPCRE/1K 10000 109542 ns/op 9.35 MB/s +SearchPhone_CachedPCRE/2K 5000 213707 ns/op 9.58 MB/s +SearchPhone_CachedPCRE/4K 5000 423086 ns/op 9.68 MB/s +SearchPhone_CachedPCRE/8K 2000 854898 ns/op 9.58 MB/s +SearchPhone_CachedPCRE/16K 1000 1699907 ns/op 9.64 MB/s +SearchPhone_CachedPCRE/32K 500 3411732 ns/op 9.60 MB/s +SearchPhone_CachedPCRE/64K 200 6718010 ns/op 9.76 MB/s +SearchPhone_CachedPCRE/128K 100 13504430 ns/op 9.71 MB/s +SearchPhone_CachedPCRE/256K 50 27150480 ns/op 9.66 MB/s +SearchPhone_CachedPCRE/512K 20 54088550 ns/op 9.69 MB/s +SearchPhone_CachedPCRE/1M 10 107855400 ns/op 9.72 MB/s +SearchPhone_CachedPCRE/2M 5 216948400 ns/op 9.67 MB/s +SearchPhone_CachedPCRE/4M 5 432028400 ns/op 9.71 MB/s +SearchPhone_CachedPCRE/8M 2 867550000 ns/op 9.67 MB/s +SearchPhone_CachedPCRE/16M 1 1732859000 ns/op 9.68 MB/s +SearchPhone_CachedRE2/8 1000000 1253 ns/op 6.38 MB/s +SearchPhone_CachedRE2/16 1000000 1300 ns/op 12.30 MB/s +SearchPhone_CachedRE2/32 1000000 1379 ns/op 23.20 MB/s +SearchPhone_CachedRE2/64 1000000 1569 ns/op 40.77 MB/s +SearchPhone_CachedRE2/128 1000000 1875 ns/op 68.24 MB/s +SearchPhone_CachedRE2/256 500000 2460 ns/op 104.05 MB/s +SearchPhone_CachedRE2/512 500000 3629 ns/op 141.08 MB/s +SearchPhone_CachedRE2/1K 200000 5971 ns/op 171.49 MB/s +SearchPhone_CachedRE2/2K 100000 10981 ns/op 186.50 MB/s +SearchPhone_CachedRE2/4K 50000 20502 ns/op 199.78 MB/s +SearchPhone_CachedRE2/8K 50000 39182 ns/op 209.07 MB/s +SearchPhone_CachedRE2/16K 20000 77462 ns/op 211.51 MB/s +SearchPhone_CachedRE2/32K 10000 154502 ns/op 212.09 MB/s +SearchPhone_CachedRE2/64K 5000 307476 ns/op 213.14 MB/s +SearchPhone_CachedRE2/128K 2000 611231 ns/op 214.44 MB/s +SearchPhone_CachedRE2/256K 1000 1224134 ns/op 214.15 MB/s +SearchPhone_CachedRE2/512K 500 2450828 ns/op 213.92 MB/s +SearchPhone_CachedRE2/1M 500 4939050 ns/op 212.30 MB/s +SearchPhone_CachedRE2/2M 200 9875035 ns/op 212.37 MB/s +SearchPhone_CachedRE2/4M 
50 20061240 ns/op 209.08 MB/s +SearchPhone_CachedRE2/8M 50 39959540 ns/op 209.93 MB/s +SearchPhone_CachedRE2/16M 20 79246550 ns/op 211.71 MB/s +EmptyPartialMatchPCRE 10000000 139 ns/op +EmptyPartialMatchRE2 5000000 423 ns/op +SimplePartialMatchPCRE 10000000 201 ns/op +SimplePartialMatchRE2 5000000 464 ns/op +HTTPPartialMatchPCRE 2000000 640 ns/op +HTTPPartialMatchRE2 1000000 1026 ns/op +SmallHTTPPartialMatchPCRE 2000000 636 ns/op +SmallHTTPPartialMatchRE2 1000000 1023 ns/op +DotMatchPCRE 2000000 847 ns/op +DotMatchRE2 1000000 1055 ns/op +ASCIIMatchPCRE 5000000 470 ns/op +ASCIIMatchRE2 1000000 1051 ns/op +==BENCHMARK== wreck.mtv.corp.google.com Fri Feb 26 16:59:13 PST 2010 +# Darwin wreck.mtv.corp.google.com 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:55:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_I386 i386 +# i686-apple-darwin9-g++-4.0.1 (GCC) 4.0.1 (Apple Inc. build 5484) +# Copyright (C) 2005 Free Software Foundation, Inc. +# This is free software; see the source for copying conditions. There is NO +# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# +# a94585d91e66 tip +# obj/test/regexp_benchmark: Mach-O executable i386 + +Search_Easy0_CachedPCRE/8 20000000 143 ns/op 55.84 MB/s +Search_Easy0_CachedPCRE/16 10000000 185 ns/op 86.27 MB/s +Search_Easy0_CachedPCRE/32 10000000 263 ns/op 121.33 MB/s +Search_Easy0_CachedPCRE/64 5000000 425 ns/op 150.51 MB/s +Search_Easy0_CachedPCRE/128 2000000 770 ns/op 166.11 MB/s +Search_Easy0_CachedPCRE/256 1000000 1415 ns/op 180.86 MB/s +Search_Easy0_CachedPCRE/512 500000 3141 ns/op 162.97 MB/s +Search_Easy0_CachedPCRE/1K 500000 6420 ns/op 159.49 MB/s +Search_Easy0_CachedPCRE/2K 200000 11854 ns/op 172.76 MB/s +Search_Easy0_CachedPCRE/4K 100000 24056 ns/op 170.27 MB/s +Search_Easy0_CachedPCRE/8K 50000 47702 ns/op 171.73 MB/s +Search_Easy0_CachedPCRE/16K 20000 94800 ns/op 172.83 MB/s +Search_Easy0_CachedPCRE/32K 10000 188696 ns/op 173.65 MB/s +Search_Easy0_CachedPCRE/64K 5000 375648 ns/op 174.46 MB/s +Search_Easy0_CachedPCRE/128K 2000 750820 ns/op 174.57 MB/s +Search_Easy0_CachedPCRE/256K 1000 1501631 ns/op 174.57 MB/s +Search_Easy0_CachedPCRE/512K 500 3041566 ns/op 172.37 MB/s +Search_Easy0_CachedPCRE/1M 500 5961312 ns/op 175.90 MB/s +Search_Easy0_CachedPCRE/2M 100 12012730 ns/op 174.58 MB/s +Search_Easy0_CachedPCRE/4M 100 24195970 ns/op 173.35 MB/s +Search_Easy0_CachedPCRE/8M 50 48470420 ns/op 173.07 MB/s +Search_Easy0_CachedPCRE/16M 20 97183200 ns/op 172.63 MB/s +Search_Easy0_CachedRE2/8 5000000 430 ns/op 18.58 MB/s +Search_Easy0_CachedRE2/16 5000000 443 ns/op 36.04 MB/s +Search_Easy0_CachedRE2/32 5000000 467 ns/op 68.43 MB/s +Search_Easy0_CachedRE2/64 5000000 523 ns/op 122.32 MB/s +Search_Easy0_CachedRE2/128 5000000 604 ns/op 211.72 MB/s +Search_Easy0_CachedRE2/256 2000000 789 ns/op 324.09 MB/s +Search_Easy0_CachedRE2/512 1000000 1166 ns/op 438.91 MB/s +Search_Easy0_CachedRE2/1K 1000000 1984 ns/op 515.90 MB/s +Search_Easy0_CachedRE2/2K 500000 3565 ns/op 574.46 MB/s +Search_Easy0_CachedRE2/4K 500000 6845 ns/op 598.33 MB/s +Search_Easy0_CachedRE2/8K 200000 13387 ns/op 611.90 
MB/s +Search_Easy0_CachedRE2/16K 100000 26446 ns/op 619.52 MB/s +Search_Easy0_CachedRE2/32K 50000 51345 ns/op 638.19 MB/s +Search_Easy0_CachedRE2/64K 10000 102368 ns/op 640.20 MB/s +Search_Easy0_CachedRE2/128K 10000 203304 ns/op 644.71 MB/s +Search_Easy0_CachedRE2/256K 5000 405765 ns/op 646.05 MB/s +Search_Easy0_CachedRE2/512K 2000 810785 ns/op 646.64 MB/s +Search_Easy0_CachedRE2/1M 1000 1649854 ns/op 635.56 MB/s +Search_Easy0_CachedRE2/2M 500 3268662 ns/op 641.59 MB/s +Search_Easy0_CachedRE2/4M 500 6628094 ns/op 632.81 MB/s +Search_Easy0_CachedRE2/8M 100 13442320 ns/op 624.04 MB/s +Search_Easy0_CachedRE2/16M 50 27306780 ns/op 614.40 MB/s +Search_Easy1_CachedPCRE/8 20000000 143 ns/op 55.63 MB/s +Search_Easy1_CachedPCRE/16 10000000 182 ns/op 87.52 MB/s +Search_Easy1_CachedPCRE/32 10000000 265 ns/op 120.60 MB/s +Search_Easy1_CachedPCRE/64 5000000 426 ns/op 150.14 MB/s +Search_Easy1_CachedPCRE/128 2000000 776 ns/op 164.74 MB/s +Search_Easy1_CachedPCRE/256 1000000 1414 ns/op 180.99 MB/s +Search_Easy1_CachedPCRE/512 1000000 2889 ns/op 177.17 MB/s +Search_Easy1_CachedPCRE/1K 500000 6111 ns/op 167.55 MB/s +Search_Easy1_CachedPCRE/2K 200000 12463 ns/op 164.32 MB/s +Search_Easy1_CachedPCRE/4K 100000 24610 ns/op 166.43 MB/s +Search_Easy1_CachedPCRE/8K 50000 49456 ns/op 165.64 MB/s +Search_Easy1_CachedPCRE/16K 20000 97720 ns/op 167.66 MB/s +Search_Easy1_CachedPCRE/32K 10000 196508 ns/op 166.75 MB/s +Search_Easy1_CachedPCRE/64K 5000 385132 ns/op 170.16 MB/s +Search_Easy1_CachedPCRE/128K 2000 771133 ns/op 169.97 MB/s +Search_Easy1_CachedPCRE/256K 1000 1547561 ns/op 169.39 MB/s +Search_Easy1_CachedPCRE/512K 500 3083398 ns/op 170.04 MB/s +Search_Easy1_CachedPCRE/1M 500 6178714 ns/op 169.71 MB/s +Search_Easy1_CachedPCRE/2M 100 12357130 ns/op 169.71 MB/s +Search_Easy1_CachedPCRE/4M 100 24767250 ns/op 169.35 MB/s +Search_Easy1_CachedPCRE/8M 50 50543820 ns/op 165.97 MB/s +Search_Easy1_CachedPCRE/16M 20 100643550 ns/op 166.70 MB/s +Search_Easy1_CachedRE2/8 5000000 439 ns/op 18.18 MB/s 
+Search_Easy1_CachedRE2/16 5000000 446 ns/op 35.87 MB/s +Search_Easy1_CachedRE2/32 5000000 468 ns/op 68.24 MB/s +Search_Easy1_CachedRE2/64 5000000 519 ns/op 123.23 MB/s +Search_Easy1_CachedRE2/128 5000000 611 ns/op 209.37 MB/s +Search_Easy1_CachedRE2/256 2000000 787 ns/op 324.89 MB/s +Search_Easy1_CachedRE2/512 1000000 1176 ns/op 435.25 MB/s +Search_Easy1_CachedRE2/1K 1000000 1969 ns/op 519.86 MB/s +Search_Easy1_CachedRE2/2K 500000 3572 ns/op 573.31 MB/s +Search_Easy1_CachedRE2/4K 500000 6911 ns/op 592.63 MB/s +Search_Easy1_CachedRE2/8K 200000 13437 ns/op 609.63 MB/s +Search_Easy1_CachedRE2/16K 100000 26382 ns/op 621.02 MB/s +Search_Easy1_CachedRE2/32K 50000 52112 ns/op 628.80 MB/s +Search_Easy1_CachedRE2/64K 10000 102128 ns/op 641.70 MB/s +Search_Easy1_CachedRE2/128K 10000 203580 ns/op 643.84 MB/s +Search_Easy1_CachedRE2/256K 5000 408200 ns/op 642.19 MB/s +Search_Easy1_CachedRE2/512K 2000 816006 ns/op 642.51 MB/s +Search_Easy1_CachedRE2/1M 1000 1630582 ns/op 643.07 MB/s +Search_Easy1_CachedRE2/2M 500 3315480 ns/op 632.53 MB/s +Search_Easy1_CachedRE2/4M 500 6623626 ns/op 633.23 MB/s +Search_Easy1_CachedRE2/8M 100 13362480 ns/op 627.77 MB/s +Search_Easy1_CachedRE2/16M 100 26699900 ns/op 628.36 MB/s +Search_Medium_CachedPCRE/8 20000000 144 ns/op 55.23 MB/s +Search_Medium_CachedPCRE/16 10000000 188 ns/op 85.00 MB/s +Search_Medium_CachedPCRE/32 10000000 274 ns/op 116.45 MB/s +Search_Medium_CachedPCRE/64 5000000 446 ns/op 143.18 MB/s +Search_Medium_CachedPCRE/128 500000 5989 ns/op 21.37 MB/s +Search_Medium_CachedPCRE/256 200000 11152 ns/op 22.96 MB/s +Search_Medium_CachedPCRE/512 100000 26444 ns/op 19.36 MB/s +Search_Medium_CachedPCRE/1K 50000 51772 ns/op 19.78 MB/s +Search_Medium_CachedPCRE/2K 20000 83901 ns/op 24.41 MB/s +Search_Medium_CachedPCRE/4K 10000 201033 ns/op 20.37 MB/s +Search_Medium_CachedPCRE/8K 5000 410276 ns/op 19.97 MB/s +Search_Medium_CachedPCRE/16K 2000 824703 ns/op 19.87 MB/s +Search_Medium_CachedPCRE/32K 1000 1654099 ns/op 19.81 MB/s 
+Search_Medium_CachedPCRE/64K 500 3345594 ns/op 19.59 MB/s +Search_Medium_CachedPCRE/128K 500 6597588 ns/op 19.87 MB/s +Search_Medium_CachedPCRE/256K 100 13204280 ns/op 19.85 MB/s +Search_Medium_CachedRE2/8 5000000 447 ns/op 17.88 MB/s +Search_Medium_CachedRE2/16 5000000 488 ns/op 32.78 MB/s +Search_Medium_CachedRE2/32 5000000 565 ns/op 56.60 MB/s +Search_Medium_CachedRE2/64 5000000 711 ns/op 90.00 MB/s +Search_Medium_CachedRE2/128 1000000 1027 ns/op 124.61 MB/s +Search_Medium_CachedRE2/256 1000000 1632 ns/op 156.77 MB/s +Search_Medium_CachedRE2/512 1000000 2826 ns/op 181.15 MB/s +Search_Medium_CachedRE2/1K 500000 5336 ns/op 191.89 MB/s +Search_Medium_CachedRE2/2K 200000 10524 ns/op 194.59 MB/s +Search_Medium_CachedRE2/4K 100000 20398 ns/op 200.80 MB/s +Search_Medium_CachedRE2/8K 50000 38371 ns/op 213.49 MB/s +Search_Medium_CachedRE2/16K 20000 75467 ns/op 217.10 MB/s +Search_Medium_CachedRE2/32K 10000 150407 ns/op 217.86 MB/s +Search_Medium_CachedRE2/64K 5000 300663 ns/op 217.97 MB/s +Search_Medium_CachedRE2/128K 5000 600814 ns/op 218.16 MB/s +Search_Medium_CachedRE2/256K 2000 1212538 ns/op 216.19 MB/s +Search_Medium_CachedRE2/512K 1000 2408767 ns/op 217.66 MB/s +Search_Medium_CachedRE2/1M 500 4816914 ns/op 217.69 MB/s +Search_Medium_CachedRE2/2M 200 9658095 ns/op 217.14 MB/s +Search_Medium_CachedRE2/4M 100 19816050 ns/op 211.66 MB/s +Search_Medium_CachedRE2/8M 50 39373200 ns/op 213.05 MB/s +Search_Medium_CachedRE2/16M 20 78759400 ns/op 213.02 MB/s +Search_Hard_CachedPCRE/8 20000000 143 ns/op 55.68 MB/s +Search_Hard_CachedPCRE/16 10000000 188 ns/op 84.70 MB/s +Search_Hard_CachedPCRE/32 10000000 276 ns/op 115.88 MB/s +Search_Hard_CachedPCRE/64 5000000 447 ns/op 143.08 MB/s +Search_Hard_CachedPCRE/128 10000 225891 ns/op 0.57 MB/s +Search_Hard_CachedPCRE/256 2000 869631 ns/op 0.29 MB/s +Search_Hard_CachedPCRE/512 500 3629904 ns/op 0.14 MB/s +Search_Hard_CachedPCRE/1K 100 14249010 ns/op 0.07 MB/s +Search_Hard_CachedPCRE/2K 50 53816760 ns/op 0.04 MB/s 
+Search_Hard_CachedPCRE/4K 10 227514600 ns/op 0.02 MB/s +Search_Hard_CachedRE2/8 5000000 448 ns/op 17.83 MB/s +Search_Hard_CachedRE2/16 5000000 487 ns/op 32.85 MB/s +Search_Hard_CachedRE2/32 5000000 557 ns/op 57.41 MB/s +Search_Hard_CachedRE2/64 5000000 699 ns/op 91.50 MB/s +Search_Hard_CachedRE2/128 1000000 1009 ns/op 126.74 MB/s +Search_Hard_CachedRE2/256 1000000 1604 ns/op 159.57 MB/s +Search_Hard_CachedRE2/512 1000000 2810 ns/op 182.14 MB/s +Search_Hard_CachedRE2/1K 500000 5294 ns/op 193.41 MB/s +Search_Hard_CachedRE2/2K 200000 10504 ns/op 194.97 MB/s +Search_Hard_CachedRE2/4K 100000 20510 ns/op 199.70 MB/s +Search_Hard_CachedRE2/8K 50000 38946 ns/op 210.34 MB/s +Search_Hard_CachedRE2/16K 20000 76344 ns/op 214.61 MB/s +Search_Hard_CachedRE2/32K 10000 150705 ns/op 217.43 MB/s +Search_Hard_CachedRE2/64K 5000 300904 ns/op 217.80 MB/s +Search_Hard_CachedRE2/128K 5000 600464 ns/op 218.28 MB/s +Search_Hard_CachedRE2/256K 2000 1210236 ns/op 216.61 MB/s +Search_Hard_CachedRE2/512K 1000 2405366 ns/op 217.97 MB/s +Search_Hard_CachedRE2/1M 500 4806626 ns/op 218.15 MB/s +Search_Hard_CachedRE2/2M 200 9610875 ns/op 218.21 MB/s +Search_Hard_CachedRE2/4M 100 19793040 ns/op 211.91 MB/s +Search_Hard_CachedRE2/8M 50 39302500 ns/op 213.44 MB/s +Search_Hard_CachedRE2/16M 20 78721650 ns/op 213.12 MB/s +Search_Parens_CachedPCRE/8 10000000 204 ns/op 39.08 MB/s +Search_Parens_CachedRE2/8 5000000 451 ns/op 17.70 MB/s +Search_Parens_CachedRE2/16 5000000 483 ns/op 33.10 MB/s +Search_Parens_CachedRE2/32 5000000 558 ns/op 57.28 MB/s +Search_Parens_CachedRE2/64 5000000 707 ns/op 90.46 MB/s +Search_Parens_CachedRE2/128 1000000 1044 ns/op 122.53 MB/s +Search_Parens_CachedRE2/256 1000000 1624 ns/op 157.57 MB/s +Search_Parens_CachedRE2/512 1000000 2806 ns/op 182.41 MB/s +Search_Parens_CachedRE2/1K 500000 5191 ns/op 197.26 MB/s +Search_Parens_CachedRE2/2K 200000 10005 ns/op 204.68 MB/s +Search_Parens_CachedRE2/4K 100000 20406 ns/op 200.72 MB/s +Search_Parens_CachedRE2/8K 50000 38039 ns/op 215.36 
MB/s +Search_Parens_CachedRE2/16K 50000 75328 ns/op 217.50 MB/s +Search_Parens_CachedRE2/32K 10000 150731 ns/op 217.39 MB/s +Search_Parens_CachedRE2/64K 5000 300916 ns/op 217.79 MB/s +Search_Parens_CachedRE2/128K 5000 600672 ns/op 218.21 MB/s +Search_Parens_CachedRE2/256K 2000 1200385 ns/op 218.38 MB/s +Search_Parens_CachedRE2/512K 1000 2405773 ns/op 217.93 MB/s +Search_Parens_CachedRE2/1M 500 4857044 ns/op 215.89 MB/s +Search_Parens_CachedRE2/2M 200 9654535 ns/op 217.22 MB/s +Search_Parens_CachedRE2/4M 100 19599170 ns/op 214.00 MB/s +Search_Parens_CachedRE2/8M 50 39356100 ns/op 213.15 MB/s +Search_Parens_CachedRE2/16M 20 78612450 ns/op 213.42 MB/s +Search_BigFixed_CachedPCRE/8 10000000 268 ns/op 29.77 MB/s +Search_BigFixed_CachedPCRE/16 5000000 358 ns/op 44.64 MB/s +Search_BigFixed_CachedPCRE/32 5000000 524 ns/op 60.96 MB/s +Search_BigFixed_CachedPCRE/64 2000000 866 ns/op 73.85 MB/s +Search_BigFixed_CachedPCRE/128 1000000 1573 ns/op 81.36 MB/s +Search_BigFixed_CachedPCRE/256 1000000 2932 ns/op 87.29 MB/s +Search_BigFixed_CachedPCRE/512 500000 5603 ns/op 91.37 MB/s +Search_BigFixed_CachedPCRE/1K 200000 10992 ns/op 93.16 MB/s +Search_BigFixed_CachedPCRE/2K 100000 21994 ns/op 93.11 MB/s +Search_BigFixed_CachedPCRE/4K 50000 43354 ns/op 94.48 MB/s +Search_BigFixed_CachedPCRE/8K 20000 85192 ns/op 96.16 MB/s +Search_BigFixed_CachedPCRE/16K 10000 173058 ns/op 94.67 MB/s +Search_BigFixed_CachedPCRE/32K 5000 346039 ns/op 94.69 MB/s +Search_BigFixed_CachedRE2/8 10000000 161 ns/op 49.53 MB/s +Search_BigFixed_CachedRE2/16 5000000 477 ns/op 33.53 MB/s +Search_BigFixed_CachedRE2/32 5000000 523 ns/op 61.15 MB/s +Search_BigFixed_CachedRE2/64 5000000 600 ns/op 106.60 MB/s +Search_BigFixed_CachedRE2/128 2000000 767 ns/op 166.75 MB/s +Search_BigFixed_CachedRE2/256 1000000 1080 ns/op 236.97 MB/s +Search_BigFixed_CachedRE2/512 1000000 1682 ns/op 304.28 MB/s +Search_BigFixed_CachedRE2/1K 1000000 2848 ns/op 359.43 MB/s +Search_BigFixed_CachedRE2/2K 500000 5376 ns/op 380.95 MB/s 
+Search_BigFixed_CachedRE2/4K 200000 10112 ns/op 405.06 MB/s +Search_BigFixed_CachedRE2/8K 100000 20308 ns/op 403.37 MB/s +Search_BigFixed_CachedRE2/16K 50000 40343 ns/op 406.11 MB/s +Search_BigFixed_CachedRE2/32K 20000 78888 ns/op 415.37 MB/s +Search_BigFixed_CachedRE2/64K 10000 156583 ns/op 418.54 MB/s +Search_BigFixed_CachedRE2/128K 5000 308819 ns/op 424.43 MB/s +Search_BigFixed_CachedRE2/256K 5000 626294 ns/op 418.56 MB/s +Search_BigFixed_CachedRE2/512K 2000 1242990 ns/op 421.80 MB/s +Search_BigFixed_CachedRE2/1M 500 2551348 ns/op 410.99 MB/s +Search_Success_PCRE/8 500000 3284 ns/op 2.44 MB/s +Search_Success_PCRE/16 500000 3343 ns/op 4.79 MB/s +Search_Success_PCRE/32 500000 3425 ns/op 9.34 MB/s +Search_Success_PCRE/64 500000 3673 ns/op 17.42 MB/s +Search_Success_PCRE/128 500000 4401 ns/op 29.08 MB/s +Search_Success_PCRE/256 500000 5526 ns/op 46.32 MB/s +Search_Success_PCRE/512 200000 8015 ns/op 63.87 MB/s +Search_Success_PCRE/1K 200000 13062 ns/op 78.39 MB/s +Search_Success_PCRE/2K 100000 23200 ns/op 88.27 MB/s +Search_Success_PCRE/4K 50000 43223 ns/op 94.76 MB/s +Search_Success_PCRE/8K 20000 85092 ns/op 96.27 MB/s +Search_Success_PCRE/16K 10000 169823 ns/op 96.48 MB/s +Search_Success_PCRE/32K 5000 343536 ns/op 95.38 MB/s +Search_Success_PCRE/64K 5000 677599 ns/op 96.72 MB/s +Search_Success_PCRE/128K 2000 1350767 ns/op 97.04 MB/s +Search_Success_PCRE/256K 1000 2702077 ns/op 97.02 MB/s +Search_Success_PCRE/512K 500 5452538 ns/op 96.15 MB/s +Search_Success_PCRE/1M 200 10893210 ns/op 96.26 MB/s +Search_Success_PCRE/2M 100 22137760 ns/op 94.73 MB/s +Search_Success_PCRE/4M 50 45563840 ns/op 92.05 MB/s +Search_Success_PCRE/8M 10 108622300 ns/op 77.23 MB/s +Search_Success_PCRE/16M 5 259894000 ns/op 64.55 MB/s +Search_Success_RE2/8 100000 15751 ns/op 0.51 MB/s +Search_Success_RE2/16 50000 33455 ns/op 0.48 MB/s +Search_Success_RE2/32 50000 33825 ns/op 0.95 MB/s +Search_Success_RE2/64 50000 34252 ns/op 1.87 MB/s +Search_Success_RE2/128 50000 34026 ns/op 3.76 MB/s 
+Search_Success_RE2/256 50000 34117 ns/op 7.50 MB/s +Search_Success_RE2/512 50000 35615 ns/op 14.38 MB/s +Search_Success_RE2/1K 50000 38105 ns/op 26.87 MB/s +Search_Success_RE2/2K 50000 42071 ns/op 48.68 MB/s +Search_Success_RE2/4K 50000 52244 ns/op 78.40 MB/s +Search_Success_RE2/8K 50000 70924 ns/op 115.50 MB/s +Search_Success_RE2/16K 10000 110263 ns/op 148.59 MB/s +Search_Success_RE2/32K 10000 185668 ns/op 176.49 MB/s +Search_Success_RE2/64K 5000 340829 ns/op 192.28 MB/s +Search_Success_RE2/128K 5000 637700 ns/op 205.54 MB/s +Search_Success_RE2/256K 2000 1244739 ns/op 210.60 MB/s +Search_Success_RE2/512K 1000 2455934 ns/op 213.48 MB/s +Search_Success_RE2/1M 500 4916210 ns/op 213.29 MB/s +Search_Success_RE2/2M 200 9864960 ns/op 212.59 MB/s +Search_Success_RE2/4M 50 21928160 ns/op 191.27 MB/s +Search_Success_RE2/8M 20 50505050 ns/op 166.09 MB/s +Search_Success_RE2/16M 10 123615800 ns/op 135.72 MB/s +Search_Success_CachedPCRE/8 10000000 269 ns/op 29.68 MB/s +Search_Success_CachedPCRE/16 5000000 345 ns/op 46.27 MB/s +Search_Success_CachedPCRE/32 5000000 500 ns/op 63.90 MB/s +Search_Success_CachedPCRE/64 2000000 810 ns/op 79.00 MB/s +Search_Success_CachedPCRE/128 1000000 1513 ns/op 84.56 MB/s +Search_Success_CachedPCRE/256 1000000 2844 ns/op 90.01 MB/s +Search_Success_CachedPCRE/512 500000 5152 ns/op 99.37 MB/s +Search_Success_CachedPCRE/1K 200000 10063 ns/op 101.76 MB/s +Search_Success_CachedPCRE/2K 100000 20455 ns/op 100.12 MB/s +Search_Success_CachedPCRE/4K 50000 40840 ns/op 100.29 MB/s +Search_Success_CachedPCRE/8K 20000 82378 ns/op 99.44 MB/s +Search_Success_CachedPCRE/16K 10000 167041 ns/op 98.08 MB/s +Search_Success_CachedPCRE/32K 5000 335674 ns/op 97.62 MB/s +Search_Success_CachedPCRE/64K 5000 671790 ns/op 97.55 MB/s +Search_Success_CachedPCRE/128K 2000 1359318 ns/op 96.42 MB/s +Search_Success_CachedPCRE/256K 1000 2694557 ns/op 97.29 MB/s +Search_Success_CachedPCRE/512K 500 5414676 ns/op 96.83 MB/s +Search_Success_CachedPCRE/1M 200 10888010 ns/op 96.31 MB/s 
+Search_Success_CachedPCRE/2M 100 22137680 ns/op 94.73 MB/s +Search_Success_CachedPCRE/4M 50 45685360 ns/op 91.81 MB/s +Search_Success_CachedPCRE/8M 10 108998100 ns/op 76.96 MB/s +Search_Success_CachedPCRE/16M 5 261873000 ns/op 64.07 MB/s +Search_Success_CachedRE2/8 10000000 184 ns/op 43.45 MB/s +Search_Success_CachedRE2/16 5000000 493 ns/op 32.45 MB/s +Search_Success_CachedRE2/32 5000000 564 ns/op 56.65 MB/s +Search_Success_CachedRE2/64 5000000 719 ns/op 88.90 MB/s +Search_Success_CachedRE2/128 2000000 986 ns/op 129.71 MB/s +Search_Success_CachedRE2/256 1000000 1515 ns/op 168.96 MB/s +Search_Success_CachedRE2/512 1000000 2755 ns/op 185.79 MB/s +Search_Success_CachedRE2/1K 500000 5393 ns/op 189.85 MB/s +Search_Success_CachedRE2/2K 200000 10600 ns/op 193.19 MB/s +Search_Success_CachedRE2/4K 100000 20483 ns/op 199.96 MB/s +Search_Success_CachedRE2/8K 50000 38668 ns/op 211.85 MB/s +Search_Success_CachedRE2/16K 20000 76366 ns/op 214.55 MB/s +Search_Success_CachedRE2/32K 10000 150929 ns/op 217.11 MB/s +Search_Success_CachedRE2/64K 5000 305399 ns/op 214.59 MB/s +Search_Success_CachedRE2/128K 5000 602232 ns/op 217.64 MB/s +Search_Success_CachedRE2/256K 2000 1205052 ns/op 217.54 MB/s +Search_Success_CachedRE2/512K 1000 2422666 ns/op 216.41 MB/s +Search_Success_CachedRE2/1M 500 4914886 ns/op 213.35 MB/s +Search_Success_CachedRE2/2M 200 9935245 ns/op 211.08 MB/s +Search_Success_CachedRE2/4M 50 21790440 ns/op 192.48 MB/s +Search_Success_CachedRE2/8M 20 50113100 ns/op 167.39 MB/s +Search_Success_CachedRE2/16M 10 123046100 ns/op 136.35 MB/s +Search_Success1_PCRE/8 500000 3366 ns/op 2.38 MB/s +Search_Success1_PCRE/16 500000 3454 ns/op 4.63 MB/s +Search_Success1_PCRE/32 500000 3660 ns/op 8.74 MB/s +Search_Success1_PCRE/64 500000 3867 ns/op 16.55 MB/s +Search_Success1_PCRE/128 500000 4565 ns/op 28.04 MB/s +Search_Success1_PCRE/256 500000 5799 ns/op 44.14 MB/s +Search_Success1_PCRE/512 200000 8419 ns/op 60.81 MB/s +Search_Success1_PCRE/1K 200000 13336 ns/op 76.78 MB/s 
+Search_Success1_PCRE/2K 100000 23535 ns/op 87.02 MB/s +Search_Success1_PCRE/4K 50000 43661 ns/op 93.81 MB/s +Search_Success1_PCRE/8K 20000 86796 ns/op 94.38 MB/s +Search_Success1_PCRE/16K 10000 168549 ns/op 97.21 MB/s +Search_Success1_PCRE/32K 5000 335853 ns/op 97.57 MB/s +Search_Success1_PCRE/64K 5000 677253 ns/op 96.77 MB/s +Search_Success1_PCRE/128K 2000 1353762 ns/op 96.82 MB/s +Search_Success1_PCRE/256K 1000 2736863 ns/op 95.78 MB/s +Search_Success1_PCRE/512K 500 5461592 ns/op 96.00 MB/s +Search_Success1_PCRE/1M 200 10982585 ns/op 95.48 MB/s +Search_Success1_PCRE/2M 100 22383350 ns/op 93.69 MB/s +Search_Success1_PCRE/4M 50 46209500 ns/op 90.77 MB/s +Search_Success1_PCRE/8M 10 110218000 ns/op 76.11 MB/s +Search_Success1_PCRE/16M 5 264726600 ns/op 63.38 MB/s +Search_Success1_RE2/8 50000 46109 ns/op 0.17 MB/s +Search_Success1_RE2/16 50000 46782 ns/op 0.34 MB/s +Search_Success1_RE2/32 50000 46352 ns/op 0.69 MB/s +Search_Success1_RE2/64 50000 46245 ns/op 1.38 MB/s +Search_Success1_RE2/128 50000 46455 ns/op 2.76 MB/s +Search_Success1_RE2/256 50000 47186 ns/op 5.43 MB/s +Search_Success1_RE2/512 50000 48004 ns/op 10.67 MB/s +Search_Success1_RE2/1K 50000 50252 ns/op 20.38 MB/s +Search_Success1_RE2/2K 50000 54161 ns/op 37.81 MB/s +Search_Success1_RE2/4K 50000 64963 ns/op 63.05 MB/s +Search_Success1_RE2/8K 20000 82940 ns/op 98.77 MB/s +Search_Success1_RE2/16K 10000 122743 ns/op 133.48 MB/s +Search_Success1_RE2/32K 10000 197762 ns/op 165.69 MB/s +Search_Success1_RE2/64K 5000 352522 ns/op 185.91 MB/s +Search_Success1_RE2/128K 5000 658216 ns/op 199.13 MB/s +Search_Success1_RE2/256K 2000 1258225 ns/op 208.34 MB/s +Search_Success1_RE2/512K 1000 2478527 ns/op 211.53 MB/s +Search_Success1_RE2/1M 500 4926770 ns/op 212.83 MB/s +Search_Success1_RE2/2M 200 10027130 ns/op 209.15 MB/s +Search_Success1_RE2/4M 50 21907720 ns/op 191.45 MB/s +Search_Success1_RE2/8M 20 50590450 ns/op 165.81 MB/s +Search_Success1_RE2/16M 10 122882000 ns/op 136.53 MB/s +Search_Success1_Cached_PCRE/8 
10000000 298 ns/op 26.77 MB/s +Search_Success1_Cached_PCRE/16 5000000 372 ns/op 42.92 MB/s +Search_Success1_Cached_PCRE/32 5000000 525 ns/op 60.90 MB/s +Search_Success1_Cached_PCRE/64 2000000 837 ns/op 76.39 MB/s +Search_Success1_Cached_PCRE/128 1000000 1472 ns/op 86.94 MB/s +Search_Success1_Cached_PCRE/256 1000000 2741 ns/op 93.36 MB/s +Search_Success1_Cached_PCRE/512 500000 5211 ns/op 98.24 MB/s +Search_Success1_Cached_PCRE/1K 200000 10138 ns/op 101.00 MB/s +Search_Success1_Cached_PCRE/2K 100000 20494 ns/op 99.93 MB/s +Search_Success1_Cached_PCRE/4K 50000 41028 ns/op 99.83 MB/s +Search_Success1_Cached_PCRE/8K 20000 83370 ns/op 98.26 MB/s +Search_Success1_Cached_PCRE/16K 10000 169360 ns/op 96.74 MB/s +Search_Success1_Cached_PCRE/32K 5000 335152 ns/op 97.77 MB/s +Search_Success1_Cached_PCRE/64K 5000 672917 ns/op 97.39 MB/s +Search_Success1_Cached_PCRE/128K 2000 1357874 ns/op 96.53 MB/s +Search_Success1_Cached_PCRE/256K 1000 2691864 ns/op 97.38 MB/s +Search_Success1_Cached_PCRE/512K 500 5409458 ns/op 96.92 MB/s +Search_Success1_Cached_PCRE/1M 200 10914605 ns/op 96.07 MB/s +Search_Success1_Cached_PCRE/2M 100 22352650 ns/op 93.82 MB/s +Search_Success1_Cached_PCRE/4M 50 45584220 ns/op 92.01 MB/s +Search_Success1_Cached_PCRE/8M 10 109049200 ns/op 76.92 MB/s +Search_Success1_Cached_PCRE/16M 5 262203600 ns/op 63.99 MB/s +Search_Success1_Cached_RE2/8 5000000 456 ns/op 17.54 MB/s +Search_Success1_Cached_RE2/16 5000000 485 ns/op 32.98 MB/s +Search_Success1_Cached_RE2/32 5000000 567 ns/op 56.44 MB/s +Search_Success1_Cached_RE2/64 5000000 721 ns/op 88.72 MB/s +Search_Success1_Cached_RE2/128 1000000 1008 ns/op 126.93 MB/s +Search_Success1_Cached_RE2/256 1000000 1564 ns/op 163.65 MB/s +Search_Success1_Cached_RE2/512 1000000 2669 ns/op 191.81 MB/s +Search_Success1_Cached_RE2/1K 500000 5409 ns/op 189.28 MB/s +Search_Success1_Cached_RE2/2K 200000 10523 ns/op 194.61 MB/s +Search_Success1_Cached_RE2/4K 100000 20564 ns/op 199.18 MB/s +Search_Success1_Cached_RE2/8K 50000 38430 ns/op 
213.16 MB/s +Search_Success1_Cached_RE2/16K 20000 76032 ns/op 215.49 MB/s +Search_Success1_Cached_RE2/32K 10000 151271 ns/op 216.62 MB/s +Search_Success1_Cached_RE2/64K 5000 302063 ns/op 216.96 MB/s +Search_Success1_Cached_RE2/128K 5000 605221 ns/op 216.57 MB/s +Search_Success1_Cached_RE2/256K 2000 1205637 ns/op 217.43 MB/s +Search_Success1_Cached_RE2/512K 1000 2421347 ns/op 216.53 MB/s +Search_Success1_Cached_RE2/1M 500 4865300 ns/op 215.52 MB/s +Search_Success1_Cached_RE2/2M 200 10079725 ns/op 208.06 MB/s +Search_Success1_Cached_RE2/4M 50 21765520 ns/op 192.70 MB/s +Search_Success1_Cached_RE2/8M 20 50470050 ns/op 166.21 MB/s +Search_Success1_Cached_RE2/16M 10 122714000 ns/op 136.72 MB/s +Search_Digits_PCRE 500000 6942 ns/op +Search_Digits_RE2 50000 36247 ns/op +Parse_Digits_PCRE 500000 7096 ns/op +Parse_Digits_RE2 100000 18800 ns/op +Parse_CachedDigits_PCRE 5000000 566 ns/op +Parse_CachedDigits_RE2 5000000 340 ns/op +Parse_DigitDs_PCRE 500000 6292 ns/op +Parse_DigitDs_RE2 100000 18679 ns/op +Parse_CachedDigitDs_PCRE 5000000 569 ns/op +Parse_CachedDigitDs_RE2 5000000 335 ns/op +Parse_Split_PCRE 500000 4704 ns/op +Parse_Split_RE2 100000 20487 ns/op +Parse_CachedSplit_PCRE 5000000 422 ns/op +Parse_CachedSplit_RE2 10000000 231 ns/op +Parse_SplitHard_PCRE 500000 4807 ns/op +Parse_SplitHard_RE2 100000 25767 ns/op +Parse_CachedSplitHard_PCRE 5000000 426 ns/op +Parse_CachedSplitHard_RE2 1000000 2295 ns/op +Parse_CachedSplitBig1_PCRE 500 5471602 ns/op +Parse_CachedSplitBig1_RE2 2000 922666 ns/op +Parse_CachedSplitBig2_PCRE 2000 1036110 ns/op +Parse_CachedSplitBig2_RE2 20 95396100 ns/op +BM_PCRE_Compile 500000 5864 ns/op +BM_RE2_Compile 100000 21683 ns/op +SearchPhone_CachedPCRE/8 1000000 1487 ns/op 5.38 MB/s +SearchPhone_CachedPCRE/16 1000000 2368 ns/op 6.75 MB/s +SearchPhone_CachedPCRE/32 500000 4068 ns/op 7.87 MB/s +SearchPhone_CachedPCRE/64 500000 7319 ns/op 8.74 MB/s +SearchPhone_CachedPCRE/128 200000 14025 ns/op 9.13 MB/s +SearchPhone_CachedPCRE/256 100000 27296 
ns/op 9.38 MB/s +SearchPhone_CachedPCRE/512 50000 53753 ns/op 9.52 MB/s +SearchPhone_CachedPCRE/1K 10000 106767 ns/op 9.59 MB/s +SearchPhone_CachedPCRE/2K 10000 213088 ns/op 9.61 MB/s +SearchPhone_CachedPCRE/4K 5000 418855 ns/op 9.78 MB/s +SearchPhone_CachedPCRE/8K 2000 838067 ns/op 9.77 MB/s +SearchPhone_CachedPCRE/16K 1000 1680195 ns/op 9.75 MB/s +SearchPhone_CachedPCRE/32K 500 3348730 ns/op 9.79 MB/s +SearchPhone_CachedPCRE/64K 500 6741460 ns/op 9.72 MB/s +SearchPhone_CachedPCRE/128K 100 13386160 ns/op 9.79 MB/s +SearchPhone_CachedPCRE/256K 100 26777290 ns/op 9.79 MB/s +SearchPhone_CachedPCRE/512K 50 53539340 ns/op 9.79 MB/s +SearchPhone_CachedPCRE/1M 20 107442600 ns/op 9.76 MB/s +SearchPhone_CachedPCRE/2M 10 215474400 ns/op 9.73 MB/s +SearchPhone_CachedPCRE/4M 5 429385600 ns/op 9.77 MB/s +SearchPhone_CachedPCRE/8M 5 858351200 ns/op 9.77 MB/s +SearchPhone_CachedPCRE/16M 1 1728512000 ns/op 9.71 MB/s +SearchPhone_CachedRE2/8 1000000 1229 ns/op 6.51 MB/s +SearchPhone_CachedRE2/16 1000000 1267 ns/op 12.62 MB/s +SearchPhone_CachedRE2/32 1000000 1347 ns/op 23.74 MB/s +SearchPhone_CachedRE2/64 1000000 1534 ns/op 41.71 MB/s +SearchPhone_CachedRE2/128 1000000 1835 ns/op 69.73 MB/s +SearchPhone_CachedRE2/256 1000000 2481 ns/op 103.16 MB/s +SearchPhone_CachedRE2/512 500000 3680 ns/op 139.11 MB/s +SearchPhone_CachedRE2/1K 500000 5979 ns/op 171.26 MB/s +SearchPhone_CachedRE2/2K 200000 11101 ns/op 184.48 MB/s +SearchPhone_CachedRE2/4K 100000 20468 ns/op 200.11 MB/s +SearchPhone_CachedRE2/8K 50000 39643 ns/op 206.64 MB/s +SearchPhone_CachedRE2/16K 20000 76829 ns/op 213.25 MB/s +SearchPhone_CachedRE2/32K 10000 151593 ns/op 216.16 MB/s +SearchPhone_CachedRE2/64K 5000 301378 ns/op 217.45 MB/s +SearchPhone_CachedRE2/128K 5000 601951 ns/op 217.75 MB/s +SearchPhone_CachedRE2/256K 2000 1216569 ns/op 215.48 MB/s +SearchPhone_CachedRE2/512K 1000 2408186 ns/op 217.71 MB/s +SearchPhone_CachedRE2/1M 500 4819808 ns/op 217.56 MB/s +SearchPhone_CachedRE2/2M 200 9686115 ns/op 216.51 MB/s 
+SearchPhone_CachedRE2/4M 100 19783390 ns/op 212.01 MB/s +SearchPhone_CachedRE2/8M 50 39521640 ns/op 212.25 MB/s +SearchPhone_CachedRE2/16M 20 78231500 ns/op 214.46 MB/s +EmptyPartialMatchPCRE 20000000 137 ns/op +EmptyPartialMatchRE2 5000000 413 ns/op +SimplePartialMatchPCRE 10000000 205 ns/op +SimplePartialMatchRE2 5000000 457 ns/op +HTTPPartialMatchPCRE 5000000 636 ns/op +HTTPPartialMatchRE2 1000000 1005 ns/op +SmallHTTPPartialMatchPCRE 5000000 634 ns/op +SmallHTTPPartialMatchRE2 2000000 1009 ns/op +DotMatchPCRE 2000000 837 ns/op +DotMatchRE2 1000000 1043 ns/op +ASCIIMatchPCRE 5000000 468 ns/op +ASCIIMatchRE2 1000000 1040 ns/op diff --git a/outside/re2/benchlog/mktable b/outside/re2/benchlog/mktable new file mode 100755 index 000000000..da0659820 --- /dev/null +++ b/outside/re2/benchlog/mktable @@ -0,0 +1,155 @@ +#!/usr/bin/perl +# XXX + +sub table() { + my ($name) = @_; + print <<'EOF'; + + +EOF + foreach my $sys (@sys) { + my $ns_pcre = $data{$sys}->{sprintf($name, "PCRE")}->{'ns/op'}; + my $ns_re2 = $data{$sys}->{sprintf($name, "RE2")}->{'ns/op'}; + printf "\n", $sysname{$sys}, $ns_pcre/1000., $ns_re2/1000.; + } + print <<'EOF'; + +
SystemPCRERE2
%s%.1f µs%.1f µs
+EOF +} + +@sizes = ( + "8", "16", "32", "64", "128", "256", "512", + "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", + "1M", "2M", "4M", "8M", "16M" +); + +%color = ( + "PCRE" => "0.7 0 0", + "RE2" => "0 0 1", +); + +$ngraph = 0; + +sub graph() { + my ($name) = @_; + + my $sys = "wreck"; + my $base = sprintf("regexp3g%d", ++$ngraph); + + open(JGR, ">$base.jgr") || die "open >$base.jgr: $!"; + printf JGR "bbox -20 -12 392 95\n"; + printf JGR "newgraph clip x_translate 0.25 y_translate 0.25\n"; + $ymax = 0; + %lastx = (); + %lasty = (); + foreach my $who ("PCRE", "RE2") { + printf JGR "newcurve pts\n"; + for(my $i=0; $i<@sizes; $i++) { + my $key = sprintf("%s%s/%s", $name, $who, $sizes[$i]); + my $val = $data{$sys}->{$key}->{'MB/s'}; + next if !defined($val); + if($val > $ymax) { + $ymax = $val; + } + $lastx{$who} = $i; + $lasty{$who} = $val; + printf JGR "$i %f (* %s *)\n", $val, $key; + } + my $color = $color{$who}; + printf JGR "marktype none color $color linethickness 2 linetype solid label : $who\n"; + } + my $n = @sizes; + printf JGR "xaxis min -1 max $n size 5 label : text size (bytes)\n"; + printf JGR " no_auto_hash_marks hash_labels fontsize 9\n"; + for($i=0; $i<@sizes; $i+=3) { + printf JGR " hash_at $i hash_label at $i : $sizes[$i]\n"; + } + my $y = 1; + while(10*$y <= $ymax) { + $y = 10*$y; + } + for($i=2; $i<=10; $i++) { + if($i*$y > $ymax) { + $y = $i*$y; + last; + } + } + foreach my $who ("PCRE", "RE2") { + $x1 = $lastx{$who}; + $y1 = $lasty{$who}; + $x1 *= 1.01; + my $v = "vjc"; + if($y1 < 0.05 * $y) { + $v = "vjb"; + $y1 = 0.05 * $y; + } + printf JGR "newstring x $x1 y $y1 hjl $v : $who\n"; + } + printf JGR "yaxis min 0 max $y size 1 label : speed (MB/s)\n"; + printf JGR " hash_labels fontsize 9\n"; + # printf JGR "legend defaults font Times-Roman fontsize 10 x 0 y $y hjl vjt\n"; + + system("jgraph $base.jgr >$base.eps"); # die "system: $!"; + system("gs -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dEPSCrop -sDEVICE=png16m -r100 
-sOutputFile=$base.png -dBATCH -dQUIT -dQUIET -dNOPAUSE $base.eps"); + + printf "\n" + +} + +sub skip() { + while(<>) { + if(/^/) { + print; + last; + } + } +} + +@sys = ("r70", "c2", "wreck", "mini"); +%sysname = ( + "r70" => "AMD Opteron 8214 HE, 2.2 GHz", + "c2" => "Intel Core2 Duo E7200, 2.53 GHz", + "wreck" => "Intel Xeon 5150, 2.66 GHz (Mac Pro)", + "mini" => "Intel Core2 T5600, 1.83 GHz (Mac Mini)", +); + +%func = ( + "table" => \&table, + "graph" => \&graph, + +); + +foreach my $sys (@sys) { + open(F, "benchlog.$sys") || die "open benchlog.$sys: $!"; + my %sysdat; + while() { + if(/^([A-Za-z0-9_\/]+)\s+(\d+)\s+(\d+) ns\/op/) { + my %row; + $row{"name"} = $1; + $row{"iter"} = $2; + $row{"ns/op"} = $3; + if(/([\d.]+) MB\/s/){ + $row{"MB/s"} = $1; + } + $sysdat{$row{"name"}} = \%row; + } + } + close F; + $data{$sys} = \%sysdat; +} + +while(<>) { + print; + if(/^/) { + $func{$1}(); + skip(); + next; + } + if(/^/) { + $func{$1}($2); + skip(); + next; + } +} + diff --git a/outside/re2/doc/README.xkcd b/outside/re2/doc/README.xkcd new file mode 100644 index 000000000..b50a579a5 --- /dev/null +++ b/outside/re2/doc/README.xkcd @@ -0,0 +1 @@ +xkcd.png is a cropped version of http://xkcd.com/208/ diff --git a/outside/re2/doc/mksyntaxgo b/outside/re2/doc/mksyntaxgo new file mode 100755 index 000000000..42e87d62e --- /dev/null +++ b/outside/re2/doc/mksyntaxgo @@ -0,0 +1,41 @@ +#!/bin/sh + +set -e +out=$GOROOT/src/pkg/regexp/syntax/doc.go +cp syntax.txt $out +sam -d $out <<'!' +,x g/NOT SUPPORTED/d +/^Unicode character class/,$d +,s/[«»]//g +,x g/^Possessive repetitions:/d +,x g/\\C/d +,x g/Flag syntax/d +,s/.=(true|false)/flag &/g +,s/^Flags:/ Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:\n/ +,s/\n\n\n+/\n\n/g +,x/(^.* .*\n)+/ | awk -F' ' '{printf(" %-14s %s\n", $1, $2)}' +1,2c +// Copyright 2012 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution. + +/* +Package syntax parses regular expressions into parse trees and compiles +parse trees into programs. Most clients of regular expressions will use the +facilities of package regexp (such as Compile and Match) instead of this package. + +Syntax + +The regular expression syntax understood by this package when parsing with the Perl flag is as follows. +Parts of the syntax can be disabled by passing alternate flags to Parse. + +. +$a +*/ +package syntax +. +w +q +! diff --git a/outside/re2/doc/mksyntaxhtml b/outside/re2/doc/mksyntaxhtml new file mode 100755 index 000000000..0292ea00a --- /dev/null +++ b/outside/re2/doc/mksyntaxhtml @@ -0,0 +1,42 @@ +#!/bin/sh + +cp syntax.txt syntax.html +sam -d syntax.html <<'!' +,s/\&/\&/g +,s//\>/g +,s!== (([^()]|\([^()]*\))*)!≡ \1!g +,s!«!!g +,s!»!!g +,s! vim$! VIM!g +,s! pcre$! PCRE!g +,s! perl$! PERL!g +,x g/NOT SUPPORTED/ s!^[^ ]+!&! +,s!NOT SUPPORTED!!g +,s!(^[^ ]+) (.*)\n!\1\2\n!g +,s!.*:$!&!g +,s!^$!!g +,x v// s!.*!&! +1,2c + + + + +RE2 regular expression syntax reference + + +

RE2 regular expression syntax reference

+ + + + + +. +$a +
This page lists the regular expression syntax accepted by RE2.
It also lists syntax accepted by PCRE, PERL, and VIM.
Grayed out expressions are not supported by RE2.
+ + +. +w +q +! diff --git a/outside/re2/doc/mksyntaxwiki b/outside/re2/doc/mksyntaxwiki new file mode 100755 index 000000000..930b3896e --- /dev/null +++ b/outside/re2/doc/mksyntaxwiki @@ -0,0 +1,36 @@ +#!/bin/sh + +cp syntax.txt syntax.wiki +sam -d syntax.wiki <<'!' +,s!`!`````!g +,s!== (([^()]|\([^()]*\))*)!≡ `\1`!g +,s!«!`!g +,s!»!`!g +,s! vim$! VIM!g +,s! pcre$! PCRE!g +,s! perl$! PERL!g +,s!(^[^ ]+) (.*)\n!`\1` \2\n!g +,x g/NOT SUPPORTED/ s!^[^ ]+!&! +,s!NOT SUPPORTED!(&)!g +,s!(^[^ ]+) (.*)\n!\1\2\n!g +,s!.*:$!&!g +,s!^$!!g +,x v// s!.*!&! +1,2c +#summary I define UNIX as “30 definitions of regular expressions living under one roof.” —Don Knuth + + +GENERATED BY mksyntaxwiki. DO NOT EDIT + + + + + + +. +$a +
This page lists the regular expression syntax accepted by RE2.
It also lists syntax accepted by PCRE, PERL, and VIM.
Grayed out expressions are not supported by RE2.
+. +w +q +! diff --git a/outside/re2/doc/syntax.html b/outside/re2/doc/syntax.html new file mode 100644 index 000000000..7f5e15a19 --- /dev/null +++ b/outside/re2/doc/syntax.html @@ -0,0 +1,388 @@ + + + + +RE2 regular expression syntax reference + + +

RE2 regular expression syntax reference

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
This page lists the regular expression syntax accepted by RE2.
It also lists syntax accepted by PCRE, PERL, and VIM.
Grayed out expressions are not supported by RE2.
See http://go/re2 and http://go/re2quick.
Single characters:
.any character, including newline (s=true)
[xyz]character class
[^xyz]negated character class
\dPerl character class
\Dnegated Perl character class
[:alpha:]ASCII character class
[:^alpha:]negated ASCII character class
\pNUnicode character class (one-letter name)
\p{Greek}Unicode character class
\PNnegated Unicode character class (one-letter name)
\P{Greek}negated Unicode character class
Composites:
xyx followed by y
x|yx or y (prefer x)
Repetitions:
x*zero or more x, prefer more
x+one or more x, prefer more
x?zero or one x, prefer one
x{n,m}n or n+1 or ... or m x, prefer more
x{n,}n or more x, prefer more
x{n}exactly n x
x*?zero or more x, prefer fewer
x+?one or more x, prefer fewer
x??zero or one x, prefer zero
x{n,m}?n or n+1 or ... or m x, prefer fewer
x{n,}?n or more x, prefer fewer
x{n}?exactly n x
x{}(≡ x*) VIM
x{-}(≡ x*?) VIM
x{-n}(≡ x{n}?) VIM
x=(≡ x?) VIM
Possessive repetitions:
x*+zero or more x, possessive
x++one or more x, possessive
x?+zero or one x, possessive
x{n,m}+n or ... or m x, possessive
x{n,}+n or more x, possessive
x{n}+exactly n x, possessive
Grouping:
(re)numbered capturing group
(?P<name>re)named & numbered capturing group
(?<name>re)named & numbered capturing group
(?'name're)named & numbered capturing group
(?:re)non-capturing group
(?flags)set flags until outer paren closes; non-capturing
(?flags:re)set flags during re; non-capturing
(?#text)comment
(?|x|y|z)branch numbering reset
(?>re)possessive match of re
re@>possessive match of re VIM
%(re)non-capturing group VIM
Flags:
icase-insensitive (default false)
mmulti-line mode (default false)
slet . match \n (default false)
Uungreedy: swap meaning of x* and x*?, x+ and x+?, etc (default false)
Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z).
Empty strings:
^at beginning of text or line (m=true)
$at end of text or line (m=true)
\Aat beginning of text
\bat word boundary (\w to left and \W to right or vice versa)
\Bnot a word boundary
\Gat beginning of subtext being searched PCRE
\Gat end of last match PERL
\Zat end of text, or before newline at end of text
\zat end of text
(?=re)before text matching re
(?!re)before text not matching re
(?<=re)after text matching re
(?<!re)after text not matching re
re&before text matching re VIM
re@=before text matching re VIM
re@!before text not matching re VIM
re@<=after text matching re VIM
re@<!after text not matching re VIM
\zssets start of match (= \K) VIM
\zesets end of match VIM
\%^beginning of file VIM
\%$end of file VIM
\%Von screen VIM
\%#cursor position VIM
\%'mmark m position VIM
\%23lin line 23 VIM
\%23cin column 23 VIM
\%23vin virtual column 23 VIM
Escape sequences:
\abell (≡ \007)
\fform feed (≡ \014)
\thorizontal tab (≡ \011)
\nnewline (≡ \012)
\rcarriage return (≡ \015)
\vvertical tab character (≡ \013)
\*literal *, for any punctuation character *
\123octal character code (up to three digits)
\x7Fhex character code (exactly two digits)
\x{10FFFF}hex character code
\Cmatch a single byte even in UTF-8 mode
\Q...\Eliteral text ... even if ... has punctuation
\1backreference
\bbackspace (use \010)
\cKcontrol char ^K (use \001 etc)
\eescape (use \033)
\g1backreference
\g{1}backreference
\g{+1}backreference
\g{-1}backreference
\g{name}named backreference
\g<name>subroutine call
\g'name'subroutine call
\k<name>named backreference
\k'name'named backreference
\lXlowercase X
\uxuppercase x
\L...\Elowercase text ...
\Kreset beginning of $0
\N{name}named Unicode character
\Rline break
\U...\Eupper case text ...
\Xextended Unicode sequence
\%d123decimal character 123 VIM
\%xFFhex character FF VIM
\%o123octal character 123 VIM
\%u1234Unicode character 0x1234 VIM
\%U12345678Unicode character 0x12345678 VIM
Character class elements:
xsingle character
A-Zcharacter range (inclusive)
\dPerl character class
[:foo:]ASCII character class foo
\p{Foo}Unicode character class Foo
\pFUnicode character class F (one-letter name)
Named character classes as character class elements:
[\d]digits (≡ \d)
[^\d]not digits (≡ \D)
[\D]not digits (≡ \D)
[^\D]not not digits (≡ \d)
[[:name:]]named ASCII class inside character class (≡ [:name:])
[^[:name:]]named ASCII class inside negated character class (≡ [:^name:])
[\p{Name}]named Unicode property inside character class (≡ \p{Name})
[^\p{Name}]named Unicode property inside negated character class (≡ \P{Name})
Perl character classes:
\ddigits (≡ [0-9])
\Dnot digits (≡ [^0-9])
\swhitespace (≡ [\t\n\f\r ])
\Snot whitespace (≡ [^\t\n\f\r ])
\wword characters (≡ [0-9A-Za-z_])
\Wnot word characters (≡ [^0-9A-Za-z_])
\hhorizontal space
\Hnot horizontal space
\vvertical space
\Vnot vertical space
ASCII character classes:
[:alnum:]alphanumeric (≡ [0-9A-Za-z])
[:alpha:]alphabetic (≡ [A-Za-z])
[:ascii:]ASCII (≡ [\x00-\x7F])
[:blank:]blank (≡ [\t ])
[:cntrl:]control (≡ [\x00-\x1F\x7F])
[:digit:]digits (≡ [0-9])
[:graph:]graphical (≡ [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
[:lower:]lower case (≡ [a-z])
[:print:]printable (≡ [ -~] == [ [:graph:]])
[:punct:]punctuation (≡ [!-/:-@[-`{-~])
[:space:]whitespace (≡ [\t\n\v\f\r ])
[:upper:]upper case (≡ [A-Z])
[:word:]word characters (≡ [0-9A-Za-z_])
[:xdigit:]hex digit (≡ [0-9A-Fa-f])
Unicode character class names--general category:
Cother
Cccontrol
Cfformat
Cnunassigned code points
Coprivate use
Cssurrogate
Lletter
LCcased letter
L&cased letter
Lllowercase letter
Lmmodifier letter
Loother letter
Lttitlecase letter
Luuppercase letter
Mmark
Mcspacing mark
Meenclosing mark
Mnnon-spacing mark
Nnumber
Nddecimal number
Nlletter number
Noother number
Ppunctuation
Pcconnector punctuation
Pddash punctuation
Peclose punctuation
Pffinal punctuation
Piinitial punctuation
Poother punctuation
Psopen punctuation
Ssymbol
Sccurrency symbol
Skmodifier symbol
Smmath symbol
Soother symbol
Zseparator
Zlline separator
Zpparagraph separator
Zsspace separator
Unicode character class names--scripts:
ArabicArabic
ArmenianArmenian
BalineseBalinese
BengaliBengali
BopomofoBopomofo
BrailleBraille
BugineseBuginese
BuhidBuhid
Canadian_AboriginalCanadian Aboriginal
CarianCarian
ChamCham
CherokeeCherokee
Commoncharacters not specific to one script
CopticCoptic
CuneiformCuneiform
CypriotCypriot
CyrillicCyrillic
DeseretDeseret
DevanagariDevanagari
EthiopicEthiopic
GeorgianGeorgian
GlagoliticGlagolitic
GothicGothic
GreekGreek
GujaratiGujarati
GurmukhiGurmukhi
HanHan
HangulHangul
HanunooHanunoo
HebrewHebrew
HiraganaHiragana
Inheritedinherit script from previous character
KannadaKannada
KatakanaKatakana
Kayah_LiKayah Li
KharoshthiKharoshthi
KhmerKhmer
LaoLao
LatinLatin
LepchaLepcha
LimbuLimbu
Linear_BLinear B
LycianLycian
LydianLydian
MalayalamMalayalam
MongolianMongolian
MyanmarMyanmar
New_Tai_LueNew Tai Lue (aka Simplified Tai Lue)
NkoNko
OghamOgham
Ol_ChikiOl Chiki
Old_ItalicOld Italic
Old_PersianOld Persian
OriyaOriya
OsmanyaOsmanya
Phags_Pa'Phags Pa
PhoenicianPhoenician
RejangRejang
RunicRunic
SaurashtraSaurashtra
ShavianShavian
SinhalaSinhala
SundaneseSundanese
Syloti_NagriSyloti Nagri
SyriacSyriac
TagalogTagalog
TagbanwaTagbanwa
Tai_LeTai Le
TamilTamil
TeluguTelugu
ThaanaThaana
ThaiThai
TibetanTibetan
TifinaghTifinagh
UgariticUgaritic
VaiVai
YiYi
Vim character classes:
\iidentifier character VIM
\I\i except digits VIM
\kkeyword character VIM
\K\k except digits VIM
\ffile name character VIM
\F\f except digits VIM
\pprintable character VIM
\P\p except digits VIM
\swhitespace character (≡ [ \t]) VIM
\Snon-white space character (≡ [^ \t]) VIM
\ddigits (≡ [0-9]) VIM
\Dnot \d VIM
\xhex digits (≡ [0-9A-Fa-f]) VIM
\Xnot \x VIM
\ooctal digits (≡ [0-7]) VIM
\Onot \o VIM
\wword character VIM
\Wnot \w VIM
\hhead of word character VIM
\Hnot \h VIM
\aalphabetic VIM
\Anot \a VIM
\llowercase VIM
\Lnot lowercase VIM
\uuppercase VIM
\Unot uppercase VIM
\_x\x plus newline, for any x VIM
Vim flags:
\cignore case VIM
\Cmatch case VIM
\mmagic VIM
\Mnomagic VIM
\vverymagic VIM
\Vverynomagic VIM
\Zignore differences in Unicode combining characters VIM
Magic:
(?{code})arbitrary Perl code PERL
(??{code})postponed arbitrary Perl code PERL
(?n)recursive call to regexp capturing group n
(?+n)recursive call to relative group +n
(?-n)recursive call to relative group -n
(?C)PCRE callout PCRE
(?R)recursive call to entire regexp (≡ (?0))
(?&name)recursive call to named group
(?P=name)named backreference
(?P>name)recursive call to named group
(?(cond)true|false)conditional branch
(?(cond)true)conditional branch
(*ACCEPT)make regexps more like Prolog
(*COMMIT)
(*F)
(*FAIL)
(*MARK)
(*PRUNE)
(*SKIP)
(*THEN)
(*ANY)set newline convention
(*ANYCRLF)
(*CR)
(*CRLF)
(*LF)
(*BSR_ANYCRLF)set \R convention PCRE
(*BSR_UNICODE) PCRE
+ + diff --git a/outside/re2/doc/syntax.txt b/outside/re2/doc/syntax.txt new file mode 100644 index 000000000..d9a3dbc0a --- /dev/null +++ b/outside/re2/doc/syntax.txt @@ -0,0 +1,395 @@ +RE2 regular expression syntax reference +-------------------------­-------­----- + +Single characters: +. any character, possibly including newline (s=true) +[xyz] character class +[^xyz] negated character class +\d Perl character class +\D negated Perl character class +[:alpha:] ASCII character class +[:^alpha:] negated ASCII character class +\pN Unicode character class (one-letter name) +\p{Greek} Unicode character class +\PN negated Unicode character class (one-letter name) +\P{Greek} negated Unicode character class + +Composites: +xy «x» followed by «y» +x|y «x» or «y» (prefer «x») + +Repetitions: +x* zero or more «x», prefer more +x+ one or more «x», prefer more +x? zero or one «x», prefer one +x{n,m} «n» or «n»+1 or ... or «m» «x», prefer more +x{n,} «n» or more «x», prefer more +x{n} exactly «n» «x» +x*? zero or more «x», prefer fewer +x+? one or more «x», prefer fewer +x?? zero or one «x», prefer zero +x{n,m}? «n» or «n»+1 or ... or «m» «x», prefer fewer +x{n,}? «n» or more «x», prefer fewer +x{n}? exactly «n» «x» +x{} (== x*) NOT SUPPORTED vim +x{-} (== x*?) NOT SUPPORTED vim +x{-n} (== x{n}?) NOT SUPPORTED vim +x= (== x?) NOT SUPPORTED vim + +Possessive repetitions: +x*+ zero or more «x», possessive NOT SUPPORTED +x++ one or more «x», possessive NOT SUPPORTED +x?+ zero or one «x», possessive NOT SUPPORTED +x{n,m}+ «n» or ... 
or «m» «x», possessive NOT SUPPORTED +x{n,}+ «n» or more «x», possessive NOT SUPPORTED +x{n}+ exactly «n» «x», possessive NOT SUPPORTED + +Grouping: +(re) numbered capturing group +(?Pre) named & numbered capturing group +(?re) named & numbered capturing group NOT SUPPORTED +(?'name're) named & numbered capturing group NOT SUPPORTED +(?:re) non-capturing group +(?flags) set flags within current group; non-capturing +(?flags:re) set flags during re; non-capturing +(?#text) comment NOT SUPPORTED +(?|x|y|z) branch numbering reset NOT SUPPORTED +(?>re) possessive match of «re» NOT SUPPORTED +re@> possessive match of «re» NOT SUPPORTED vim +%(re) non-capturing group NOT SUPPORTED vim + +Flags: +i case-insensitive (default false) +m multi-line mode: «^» and «$» match begin/end line in addition to begin/end text (default false) +s let «.» match «\n» (default false) +U ungreedy: swap meaning of «x*» and «x*?», «x+» and «x+?», etc (default false) +Flag syntax is «xyz» (set) or «-xyz» (clear) or «xy-z» (set «xy», clear «z»). + +Empty strings: +^ at beginning of text or line («m»=true) +$ at end of text (like «\z» not «\Z») or line («m»=true) +\A at beginning of text +\b at word boundary («\w» on one side and «\W», «\A», or «\z» on the other) +\B not a word boundary +\G at beginning of subtext being searched NOT SUPPORTED pcre +\G at end of last match NOT SUPPORTED perl +\Z at end of text, or before newline at end of text NOT SUPPORTED +\z at end of text +(?=re) before text matching «re» NOT SUPPORTED +(?!re) before text not matching «re» NOT SUPPORTED +(?<=re) after text matching «re» NOT SUPPORTED +(? 
subroutine call NOT SUPPORTED +\g'name' subroutine call NOT SUPPORTED +\k named backreference NOT SUPPORTED +\k'name' named backreference NOT SUPPORTED +\lX lowercase «X» NOT SUPPORTED +\ux uppercase «x» NOT SUPPORTED +\L...\E lowercase text «...» NOT SUPPORTED +\K reset beginning of «$0» NOT SUPPORTED +\N{name} named Unicode character NOT SUPPORTED +\R line break NOT SUPPORTED +\U...\E upper case text «...» NOT SUPPORTED +\X extended Unicode sequence NOT SUPPORTED + +\%d123 decimal character 123 NOT SUPPORTED vim +\%xFF hex character FF NOT SUPPORTED vim +\%o123 octal character 123 NOT SUPPORTED vim +\%u1234 Unicode character 0x1234 NOT SUPPORTED vim +\%U12345678 Unicode character 0x12345678 NOT SUPPORTED vim + +Character class elements: +x single character +A-Z character range (inclusive) +\d Perl character class +[:foo:] ASCII character class «foo» +\p{Foo} Unicode character class «Foo» +\pF Unicode character class «F» (one-letter name) + +Named character classes as character class elements: +[\d] digits (== \d) +[^\d] not digits (== \D) +[\D] not digits (== \D) +[^\D] not not digits (== \d) +[[:name:]] named ASCII class inside character class (== [:name:]) +[^[:name:]] named ASCII class inside negated character class (== [:^name:]) +[\p{Name}] named Unicode property inside character class (== \p{Name}) +[^\p{Name}] named Unicode property inside negated character class (== \P{Name}) + +Perl character classes: +\d digits (== [0-9]) +\D not digits (== [^0-9]) +\s whitespace (== [\t\n\f\r ]) +\S not whitespace (== [^\t\n\f\r ]) +\w word characters (== [0-9A-Za-z_]) +\W not word characters (== [^0-9A-Za-z_]) + +\h horizontal space NOT SUPPORTED +\H not horizontal space NOT SUPPORTED +\v vertical space NOT SUPPORTED +\V not vertical space NOT SUPPORTED + +ASCII character classes: +[:alnum:] alphanumeric (== [0-9A-Za-z]) +[:alpha:] alphabetic (== [A-Za-z]) +[:ascii:] ASCII (== [\x00-\x7F]) +[:blank:] blank (== [\t ]) +[:cntrl:] control (== [\x00-\x1F\x7F]) +[:digit:] 
digits (== [0-9]) +[:graph:] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]) +[:lower:] lower case (== [a-z]) +[:print:] printable (== [ -~] == [ [:graph:]]) +[:punct:] punctuation (== [!-/:-@[-`{-~]) +[:space:] whitespace (== [\t\n\v\f\r ]) +[:upper:] upper case (== [A-Z]) +[:word:] word characters (== [0-9A-Za-z_]) +[:xdigit:] hex digit (== [0-9A-Fa-f]) + +Unicode character class names--general category: +C other +Cc control +Cf format +Cn unassigned code points NOT SUPPORTED +Co private use +Cs surrogate +L letter +LC cased letter NOT SUPPORTED +L& cased letter NOT SUPPORTED +Ll lowercase letter +Lm modifier letter +Lo other letter +Lt titlecase letter +Lu uppercase letter +M mark +Mc spacing mark +Me enclosing mark +Mn non-spacing mark +N number +Nd decimal number +Nl letter number +No other number +P punctuation +Pc connector punctuation +Pd dash punctuation +Pe close punctuation +Pf final punctuation +Pi initial punctuation +Po other punctuation +Ps open punctuation +S symbol +Sc currency symbol +Sk modifier symbol +Sm math symbol +So other symbol +Z separator +Zl line separator +Zp paragraph separator +Zs space separator + +Unicode character class names--scripts: +Arabic Arabic +Armenian Armenian +Balinese Balinese +Bamum Bamum +Batak Batak +Bengali Bengali +Bopomofo Bopomofo +Brahmi Brahmi +Braille Braille +Buginese Buginese +Buhid Buhid +Canadian_Aboriginal Canadian Aboriginal +Carian Carian +Chakma Chakma +Cham Cham +Cherokee Cherokee +Common characters not specific to one script +Coptic Coptic +Cuneiform Cuneiform +Cypriot Cypriot +Cyrillic Cyrillic +Deseret Deseret +Devanagari Devanagari +Egyptian_Hieroglyphs Egyptian Hieroglyphs +Ethiopic Ethiopic +Georgian Georgian +Glagolitic Glagolitic +Gothic Gothic +Greek Greek +Gujarati Gujarati +Gurmukhi Gurmukhi +Han Han +Hangul Hangul +Hanunoo Hanunoo +Hebrew Hebrew +Hiragana Hiragana +Imperial_Aramaic Imperial Aramaic +Inherited inherit script from previous character 
+Inscriptional_Pahlavi Inscriptional Pahlavi +Inscriptional_Parthian Inscriptional Parthian +Javanese Javanese +Kaithi Kaithi +Kannada Kannada +Katakana Katakana +Kayah_Li Kayah Li +Kharoshthi Kharoshthi +Khmer Khmer +Lao Lao +Latin Latin +Lepcha Lepcha +Limbu Limbu +Linear_B Linear B +Lycian Lycian +Lydian Lydian +Malayalam Malayalam +Mandaic Mandaic +Meetei_Mayek Meetei Mayek +Meroitic_Cursive Meroitic Cursive +Meroitic_Hieroglyphs Meroitic Hieroglyphs +Miao Miao +Mongolian Mongolian +Myanmar Myanmar +New_Tai_Lue New Tai Lue (aka Simplified Tai Lue) +Nko Nko +Ogham Ogham +Ol_Chiki Ol Chiki +Old_Italic Old Italic +Old_Persian Old Persian +Old_South_Arabian Old South Arabian +Old_Turkic Old Turkic +Oriya Oriya +Osmanya Osmanya +Phags_Pa 'Phags Pa +Phoenician Phoenician +Rejang Rejang +Runic Runic +Saurashtra Saurashtra +Sharada Sharada +Shavian Shavian +Sinhala Sinhala +Sora_Sompeng Sora Sompeng +Sundanese Sundanese +Syloti_Nagri Syloti Nagri +Syriac Syriac +Tagalog Tagalog +Tagbanwa Tagbanwa +Tai_Le Tai Le +Tai_Tham Tai Tham +Tai_Viet Tai Viet +Takri Takri +Tamil Tamil +Telugu Telugu +Thaana Thaana +Thai Thai +Tibetan Tibetan +Tifinagh Tifinagh +Ugaritic Ugaritic +Vai Vai +Yi Yi + +Vim character classes: +\i identifier character NOT SUPPORTED vim +\I «\i» except digits NOT SUPPORTED vim +\k keyword character NOT SUPPORTED vim +\K «\k» except digits NOT SUPPORTED vim +\f file name character NOT SUPPORTED vim +\F «\f» except digits NOT SUPPORTED vim +\p printable character NOT SUPPORTED vim +\P «\p» except digits NOT SUPPORTED vim +\s whitespace character (== [ \t]) NOT SUPPORTED vim +\S non-white space character (== [^ \t]) NOT SUPPORTED vim +\d digits (== [0-9]) vim +\D not «\d» vim +\x hex digits (== [0-9A-Fa-f]) NOT SUPPORTED vim +\X not «\x» NOT SUPPORTED vim +\o octal digits (== [0-7]) NOT SUPPORTED vim +\O not «\o» NOT SUPPORTED vim +\w word character vim +\W not «\w» vim +\h head of word character NOT SUPPORTED vim +\H not «\h» NOT SUPPORTED vim +\a 
alphabetic NOT SUPPORTED vim +\A not «\a» NOT SUPPORTED vim +\l lowercase NOT SUPPORTED vim +\L not lowercase NOT SUPPORTED vim +\u uppercase NOT SUPPORTED vim +\U not uppercase NOT SUPPORTED vim +\_x «\x» plus newline, for any «x» NOT SUPPORTED vim + +Vim flags: +\c ignore case NOT SUPPORTED vim +\C match case NOT SUPPORTED vim +\m magic NOT SUPPORTED vim +\M nomagic NOT SUPPORTED vim +\v verymagic NOT SUPPORTED vim +\V verynomagic NOT SUPPORTED vim +\Z ignore differences in Unicode combining characters NOT SUPPORTED vim + +Magic: +(?{code}) arbitrary Perl code NOT SUPPORTED perl +(??{code}) postponed arbitrary Perl code NOT SUPPORTED perl +(?n) recursive call to regexp capturing group «n» NOT SUPPORTED +(?+n) recursive call to relative group «+n» NOT SUPPORTED +(?-n) recursive call to relative group «-n» NOT SUPPORTED +(?C) PCRE callout NOT SUPPORTED pcre +(?R) recursive call to entire regexp (== (?0)) NOT SUPPORTED +(?&name) recursive call to named group NOT SUPPORTED +(?P=name) named backreference NOT SUPPORTED +(?P>name) recursive call to named group NOT SUPPORTED +(?(cond)true|false) conditional branch NOT SUPPORTED +(?(cond)true) conditional branch NOT SUPPORTED +(*ACCEPT) make regexps more like Prolog NOT SUPPORTED +(*COMMIT) NOT SUPPORTED +(*F) NOT SUPPORTED +(*FAIL) NOT SUPPORTED +(*MARK) NOT SUPPORTED +(*PRUNE) NOT SUPPORTED +(*SKIP) NOT SUPPORTED +(*THEN) NOT SUPPORTED +(*ANY) set newline convention NOT SUPPORTED +(*ANYCRLF) NOT SUPPORTED +(*CR) NOT SUPPORTED +(*CRLF) NOT SUPPORTED +(*LF) NOT SUPPORTED +(*BSR_ANYCRLF) set \R convention NOT SUPPORTED pcre +(*BSR_UNICODE) NOT SUPPORTED pcre + diff --git a/outside/re2/doc/xkcd.png b/outside/re2/doc/xkcd.png new file mode 100644 index 0000000000000000000000000000000000000000..6249e8e0d325415ec447d16c0cb201515b834b63 GIT binary patch literal 26496 zcmY(KRajfW+NcxUU4py26)0NV-QC@a6)5iR?h+h|ySo-BF2$Wf(PBkTy7&Gs&PDPh zlUcLYd^$7v-Y69%X;dUaBme+_Dk~$Q1^_^rLq1O-KtbMXeV_vY0N{$PxVVa}xH!3r 
ztFx7@gCzjq1q@48q=zvDjlDNvVNy&fI-?0A)wVQGn68;wE>J%yeV>VMX(2QB^gZmC z{p_YXD9b+%3(o)#FCC5J`bB$kWXx;Jd!u#nTltoEz-__%tsvK|fM~w$VqrtM#2m;1gs3!O9NxK;P0&`mJQ z+I9>Vh6fJIRKz0OyPzSWxfpSKqzJ0#F6q)J)L`=x42k>i3GK~C?wHR|3}ocuFCxtL zPu;h@H$a>0 zth>SF>uFf2{!ZW5>%8KRf3{`RKj1ah8?x@~Ac2gagPSkstLoq;wE=6=o#^ptU3XY+ z(CF5pR-e``PHn9HY4A`9qAP!#OYmU^qJ)zhZeA5aU$`Gf1cQnZ2FN*!29(Bi84R!A zHy6`Sh=BoW#vh84bW=PQ$SND&Lf7Yqf3jxqenI_w|5N=h+~V48-*4A%uiu|x&EOKc zcp8tkA(}>Xk}jq~Qg;*o1F)m-L2rso{X(@sx~H#S&Am zSAmg~1S8K&zH0GDHUGG4`#iVuGArwS!|I6R{UP9;S6Ns#2rQ~b&X9;BgVbEqZ@w{X?f$>7 z-0C|IeebUY?=QzkUkC)^ab&>6V|Cena!Ay$&k8{jlfu|u?a|6$;%}EpwX%@L@B*mY z8G%b^bV2r)i)p+WJXq2c&&f4P%k&h;=n4q1%sKgE!XIej5TAB#xycVh(1rOJdS?A; zVUwX8>8~EgMBX?OycPZ&7QEfjU8%%PxTu*JFkFSw2^Oulz<~{124Km2PKp*;G$vAa z!Tq5s0K7q&sbN)Vl9oxv>Qp0rjAaA2??les90QGW)X@9Vz@fiU%D%-6E(?mhnqMCt z&k_)}eQ&=j``@JC>zD*_>$YLhnI}W-D+29@^S(e_!?A7i>t)YU{$9Rw`)+vGW$)MB zFnp!@8a3LXeF@mlyF5ElIfisY&oyM$_QG`TfPXVc5e-W4J)jb}XjW$; ze!F~syPTJ2^LskXP=>;6Ou&{8VrgC73&mEd@aS8kG;C5{yg8VKh~DI!Au*Q2Dhalnm6$MrLKEhh;K`9a3N z)!~j>WhbTJS{)e!oZ+f9ho zJa%orIw?KpNIOm=t~Jf?{u|20Kf_@uRBZ?H({G3o`d*LV|9Z#Hzk<=0bZS}BaubVL ze4hFLdsD0J=x;wS3v6A-=Q#KBe|-Ou;P0N}*qS9FhSEJ7@OE3Uss8#U;NeSxp93Kn zPU%&(f<0n{-nZ0Yzg>L?J>W4UXW97mOdCHF?5CJXylP!;r<6$yYO$^^{^vCIbv^eH zA}_mZjP>IDQK=-6DRs$EA%*>@yg#qXczaGEw&2pFzxdEAo&RYu&qGiQD3djRfDi}E zby(9hqi0{+G9&cxlg;l`9@@s6@(>W2t4R%;&2ZHZjeZU>O?kGSyOtHBu4&FGiQ=Az ztsK#{Tnh6Ps1!w@`qABja=D+bbI+gYZ0+tn+%aOeX@E?!=Ohq=zgh-~TP5(f%@8VY z_uahw`%}TYcLa$i-(DmY_1<99e!Rd3@Jz)&%gp!o-1ht8Mor&vQXFICIzglCccu<6 znwiP~FM3cz$#6xF^b_JPUB5ejK_N2(Xd#mSqE43Qb5_~5<-?R`KX_0=pC*zF6;KLj zyRytMZ0$ejd;8Vrzl&@PLQ$|2n@kOOMSBWRD}2X!JXH?xnU|&WJ2xlmI?n%^u49H6 zT_W;!oiN4L_u?GzcVj<6#Q*j??_~gXV>q}!QZ-OBnT)SGl&ZS;^!A~BsePO-i}CcN{#zue0IDaP*^zayYlsN%Mq{IFeKZqwB$m=NHr)_9v2@4v~OL1TPgs`HoBe48?UQ+tGE^BP!y zgwLm=9OWFEE7;4%#TBUK_ov$Ubq8K#yekdH`zXuW|Mi#g z`&@+Z@Q2e#X)viMw4Elk+A;!XNdd>4wT6ThDKgU4TEjN-Em2O7mXWt5?F4?`;|QMC z&ui`88{DbGW0Ci#B`tkN=1zfjC3f8)N~=wQ-|f)u6#KLO_gn}PM&(lt$-xylPMuM` 
zzX@i&;D~)r<&BYY3$v-bC<|fGub^fA#qmSe^`W-DS4UD!Iekce0In`+8((x&gJ`K} zIu^cE3pIP1MN%YcYwN06-Pg@uvGa67r?PJCshh4-ED^-6lV$Ft!6SbCEmpW(-oq)DG3VQ@IR<+LnkzD12dH}7H9932Zh^7-}83hym^ z-KT8Tv2b#(HM0W0dtmftTbL~95Qt&|O|&x%tv6{bxfF;YighW9e2uBqbEa9BOi~=E z$&gl-o5+b6gC+RTF$*g)31iESIPNXy8Hup;t$i>JIQeOUvTW{ylCB!P|_`g0sKv? z^DU)_b|0s_tl`v zfH{q%X`+-85cWv|SCmEa$UFJ z4vST#(o~A2FpMj(${Vm-C77iA&KjcAEpz+*nD;m8T7`m0<$4^e=4Obk+7xs6 zA|PA#Ks*E)W+Ni@z)6&45IzIeEKLs~$JAb_?}WV3ASzJ6AvcS{D2r>ohK+H=X<@qG ziXlokUQOSoD9oaP(x%>Ze?OM%98r}AIg!TX&yx0~ zfl03;7!Z30x@dNxE*j;=w5=eD9ug!J;P%%t8W=bEks6LRV5(egkrGUO%<;Vz3+`!@ z$k}%%2wrbn_$mlpp}JxwN;epi_iiAeunB72_i6xspN&PpCHs;{@qAvYS)+t+H(I}8#YQx_6 zub%iad8--cqPPLW5II{i%fL;oj;#fZpxj(Inc$5J_Z&bDSMn3j{%Fv;8F>M$o?L9E z%u8W~~V6R?9e2(LXtADbndVQo^+oN6(x&u#J)Sct2dTOd6M@a61Pt4j{a^lXh9N*L7h(5?OA+6FF4&ARvXLxblvAx!4;d zV4~&&F(?YYd}>~x$?xw)oAeNa{0vMj4!gs-IIi-$R+uc=$JI|{n{6K+rr<=l6usZ~$ebhwlVU#vQKWOrK?SDDxPt3-dLc z?&0^6;6vtf=@H=@=;lYV=FC}z>(-wwWegl=NGDaT(qLQ)FpUkE!{4qcH6ivTLLWAo zdO;64;)kqUcpa1!du~Wsp$5@QD0f3|{d4HO4u7BuAymap8sc%PtpH3*f@q>rfAygC zqi%j=hI~^|wpZ;R_3oBB3${~OsM;cCumoNcvfrH;w8anh3w*p1DUsJOg=|999f6_8 z0ksoB%W6i3IR}e`A~c$-&sdeqORP2Y!qosZ&$$>CY8gFFbffZv=Uw!GNU}CnEMU-~ zQ`ZHY+|&k&LIxBy3e^HkjqJfdKHFx31%euv;5EzyPIvW*}(O zWq|2iXlCShcV=KejTIxB#=*H#Hoc^y!J7m}t!y~mFtDxoOgaycM}}&hg(;vYFzv!2 z0$VOS&~PPcb)aR<=#?Uqr})G=!D$~jqbpt*r?*ynzgH0tCbM;(P{t5ScC-coxyD+o zVWfY#ifU&#w(@yqxoRb6`OlSVrOFH%OF)5wMzv(|QV4i=Dskr@n>+?HKtOh@mU+@~ zxpbaA78gFh_ELguiAf3lLdw8na-<=>EqD})#CgGo^a|A5M8h21QqgO<+lTG9i*062 zBxUOYRSbKp)B=shX8-NAmt;)$BW6a&{zQEW z=ojyC$H-dXZ&dV{>L~d6}0EEVhHK{w@<$bWVi^(H1AG5pFGwO8yPYn zJdXe9gArgWliHzW7c^#)#w#}rLYE5LwdPh!ybAW5cP5CV(Khd=9?T`UU&IAT*aqKn z%~17ts_E_7Xac9L5X|EoF>H(a33A4(>@&*T1V3-Pi+@Z_#oDMEkc1yHo_K&Q?q%TV zI&2H5*zU;7+tSmuOIU2b)H?vZH=n^0-?SEUbLEY;H|_u6fpT)F3YpAQ$L2dQ-;O6< z^CMf)F-I-&`&HZKG%a@d+A-@l^GtLbVe`1$)|)0VuLldmQhl+p{pKkgN+lG%#~wQC zd&H&LNX~Q2=^1Kg(KkfxKRhXr@*R5 zpyK#VF{?ehU%RAY$PGuI>InKigL~DUwGQ|airwwu%@YyCr`@<5=xx{cdS^-t!+@IM 
zv8*c{KtZE?avC3^HIM2RtBR}|-Vqk!Of=EIQpduvY7}YH!*m9duh>e@tz;gkI1^bp zld^>*@Mtvc6E_r+dNl;!xafIO;!&W6Q!$6?DRYEtp+>mR-Hd$&qmNzYlZ;Isg(X_p zwlk(YwuYR-kW~yDSxY&@tTr#r8t|PH3B~#;9i&2$VJ^QUc0G=ET)rG0OKb2dI}Z5G_jNKz~>=;_R4A-}et2CstI* zYAnVehZL<-TRrb#y~&$}wtB;=)wGtx3dKk^M4N3M_CAe7P;I#kVS~`aKt>AgF{EAGi_gLt%=+aS9 zYbs5oToO3xdHNLKF@7kYRGnFXY`wtx`Q&)Q&wXjJYWr)u!EH-837?^U?8TzFm0gQik0arX?P z&WTNaikY(8ws20+w4Nk#>6Ekosim!;kJAk^=8n}Uv1rCT2%Nz^+}NTQb)?@U=nfu(Q8>fyYl2vvWFKGx(&zY!mqUo>&Kh?v3L9(LVB@;tTe)>9k8~XFQ#sty`+T zN&^0*w3dA&!yP|%U8++CrB%j(MtShyKq`(;X2jF$6b{SeVy9Z2{q`7BPyzDglpc0gAmtSc{ z3?pCzH(I^37jllMD<8`~ghm!;dAscLK1?UPTWc%TTdA9gVzwpwY&O{ZVIykDh0-V5 zd54WE?ra(<0g5Uno?Fkd3U;zo6B8TtTdyPPuf|P{P6zGcdn#>v!rKL5V$>mg?d==k zKNl`#!)VvZ6!@ea*>LwcSxcFEAVALaBiBoi^B2fY@yi^Wu0duyGth=6<}m7e<6WE9 zjm$eS=>-$su$uhiiGngG*=0W3wvwlF3@3(|26F6o^kHVFX<^nSD4vkO3D{faLs~I# zxyZ0y67w29O<_NdbH(jebW}(H7Fp})xS0tP@PO!I?+HV;egKZ>(N>eeXMNs_L9^kk zs#?m^ZTbAej2{kD)b4R$xz`~ZH2=;bDB29_a}>Vl$K>L`7O}XZtzp!D2Nexwk{Djf@##OC7dd zIr#8uxAqI(*yyae!cHHUW1SkiLii3B&WDG}Yd=&=5!=*1e`I9Jm5ewYL^aK6$O05N zs^!o|N)jF|0PR@Qj&WX;YkTpwe%>8y*(InnXx1tXq(!^2e5cQ#@Zz1-aNaFwTX3 zHA+LPjgl6!@=8o*Q|6VK=nI$71Qe3Ia@254XX+t6%Esy!$6>g9tASzlLS&&3&>(}k^=)gTqoK}6^cTM89x?{0sv~1S&Vppd zWVDn#u7T5y^)VTP4IPQeI+ebpqBNvsc5p4_pdozZI=l3}Ddq8#1%OyTzF~POTEtzzq533GDSt;p;wLFuwYRu+lKv8EvmsuDX{3VX zX$nychDbk(0)n43G@e)+2nAqxtKmn`SAI5EcQGN>aGrL3##gn zOwki9OWLGP3p!JKl>Ye4Ih8dkTyHX`Wn(f9TJLIV610a^yA*?-f$YrNFytSM5Z_HR zDn#GqJ5$SUydDf{!7)o`=YVBsxF5(+aY=)Fz!0UZfu`7^rUUkm&xe!B5v4Ihk}mtS zbsTIb^dA|)@Uc`GTP1>MQcJ=oE0xp=o>dtV@)0uDT= z2(C-%Mb6=AZpS=z2GI_*dJIjPdq7R>DRO(j!or9$^k_^hQ52TXFGOUigc)0>G%oLo z_!deTEkH>|r(tH8blA|PyH|WsGNMs?3p;l}97PJT5zwWZ*A4J9pSQlS*Jb+7R2N-B zmGMCY6}Dt;uBH^dGSZc84;3GQx(B|GVV>Ol_f3k^W7L-O z5?cY9G*CTI1+$NuYuEc3R(!|#$3?0c>>fgFFzfZ?G{^4eJVIF%RMt-`IFVt)LE*%R z)1jIgixSUa9yEXD5KCS1juI-x-r`fk(pVk)ed&un4q4)h(-7GP6S_-uUZ$854)5!w zCL~00+kJ}I84eKvydZHDlxyH;q&LHzv^+9}UyL0~ZR{S0>3{v9KYc}Rd18(-K0^Ye zUfyF6NxU|}G4(Usg#R|$k-V=EqQU})9X`er3W(~{v>&wHbtt00L 
z(tSwSY7WlKoqUjaVwlpN6>tN|5U5YISt z1IQ#=eJUURbrqKR)9-Y20O!jm9R;ecq3LPSja^)19(&817Im~1@!=5h6qa<8d)%af z6W9PGL7Xt};hy>6?6{loJ)8}MO>z6-dh2AE;}flHwUQ`mzdsb{Jkg(Mb1Kb(d0h9( zn7o+qFhB4&5q@LR-O!-Z-YEc~w1b|B1!+B4g|O5mG&y2+yi7TDYiaNh#MlMkF)@zO z5Ng#sl_Q9Siep=#!aWggVB2PX?`>$dSYuT~X|kvdYy% z={J<;|0WUv&j8(~^D27zxv3VCAZN8frryy0nz77VAy zc)Teyok9Lf3D38NX)CZ$Qf?}j9ws&_IN8$JPS&!85>BPDpDHCO$cZ8DkB2;L(b2ca8!3wf551gd6 znm_c{pOTQ&kVdG^ONcg2rG~h)PjQ-40VSI85OS&sG28?cIFGEoIM`J&_U~e>Yn+Oq zyHO75b{qAm@(F*Dz@~e}5T>QB*(Ws;p0UkoX43$)F@J{{=_0r^ewGVdQZ&~)ZvDd) zdC5kY&e(})B3ysguwF$Npdu{IyEex5{fe9>97ROh!hyOm$@nN$cONVS(8jVfof1Vr1Q4aXbEAv+kW+*0fw!{HFzRhNUV1U5C^|7A7%M?oi_GCu z6efE?txq?I@4Nzvsn9d%k*OcOI9qs-_$85&59=55$L2CyeC+tScGsKg(hkGMi6}Bs zFm)y9IJBN}LbDIb5Fzn)DKH%vNc5`^l=@>QR7z8xg0_+;hQZ2Bb*I1}Q_Q!#r5G+0 z3&c{iU)frP7p~TrA>&vN2HorFw-6Gpk_O@*5&aU?Q47@NMN%-&v|38t-NTqXs85#Y zfgH`evNPep5rz=GSpN8#>gkCsO0X9ih0P*epk?N%K2u*R3x3(a=NJpt1aCd*pO=W^ z8Q}fp#8!ws<8x_jy&gERu?bed)^wVuxEL#m@z&v`wDc-8VS#2tbfFDF6nE5DtTSDP z?N_H%WNDJL-%W81eu5o6D>jF5UDwT(JQXTzDT6ack(1{qfeI`*%M_%mJENrGQXsc1 zEUB6rqQ^=qGNMI3Lx3}$;>fhBnNXi(_`{eOV*kV-`WO;WTL92Efx<5A8BzJgUZ>*gPSC;YwflykPoSycVdbRArOE+WAN6Ll_@v&tSZWCa zhgrCMw2*QeF$^nsNJZ?Pbretp8L?|Vz@qf7In=hsO^*$)hM=i89BC{8WS@t9sxS-j zPCw|83Ro-nE-9{MB#VX&t!?=>>wHctR%LRQ7J|fw3O0~F4k2u+%-A}a(T9Y|`xUaH z=P}%y6su}v3|PJ4>|(~KaSYw)Jc^$_I;AW^#OC%sa!SWUAN@ zEe#dn+)KY+${O=C*#o|l5?qk53VTn=Ccuh-dp5`0D zLq_H=V+OwrNXCzM@a_$D11&+8ACCz;+M-euo-KMm!V~=Zs;gXGH}up5t)})NIrqv; zj@~mBB%bcT8Rr;GT~skxQDtXB|6zn(MpR48G>rtil@%u|HZsasoq2}PRWYN&k)#kfQh-19{lSSm-pMU3m`C0#+k)+C`qwP@# zX&Ofgk2T`n(nCgMzK$^IW|!v+Cm9_3vs5ufZO=$&8UhTnydZo1jo#$vmKyw23WdOQ z6IW`#Jb@*P=5eWeKJ0$XnC8a($mTud;orrP6eR2K02F$o9ci7Fy)<_9=$0H4u#}vk z$tms*(EL0&G3^MCbL1FB1gB0%jp7qf$uS){TDsxl?PHxTk-=b%j#))|#1fD(1#Op} z<=YZt?o0^=0;BG@sgWf0QZ+X2s?Dnf+g-+^jEqNYdujz;cu$HB!RdZoXS0EvGLO+^ zMfu~L)a4}8f+k3@o}5~~V0K{cKn6xNrg+c9h#O}}&wLw=VwEAo!oDTq){mhU#-=*d z-XpDzAX6ME6RKB;a}I787ne>wD9;Y((gr$IdWgGzL3iaRDY?z 
z!iSUo_DdzvC^RBq{(zrdtO94E*8UeUex$BK-3sIjn#jo*V&+K7YJGf8FOK}#oswzvQBiiv-%#W^BzU*7FP9V2z)Z|BdIwpjN zVSP`7o;gPU7t9iFN>1((HY(v*l=nwFTN(#M4AwU-8ewQRCE!#W#c<@-Di$%(7xvqg zHZJvPDD2_P0g3Rtg-rWg%1yUH+uTp(OIf14-;D!maY}J*lvSVqQD&${lvL z4dqqCpp3HS0`ek>bbvS&Gc`z3vNoi>AI$t&!)QVuiag@1@x=kZeh%YN#zaMtY`~>5 zo>hbDPaYGhQ(qcOuK-x5b0)l6Srg@FV>#ZlRlItL1D+?$eiP)JE~69X13F&C~B9aNzG zwLuv{lm?NWF17BSu}hI8CiC|{a@;E{*^MHI8&?B`m*#jLi58lVqCqSaySuB+%j?0H z;7|*#e&yQ^b++BFMrA20M_Qpk7!-m%wTo+jsmBs(2ckG<-DJ@#XRnDUnK7 z9HzSzr-6*>C%M_Bx#Fk6`2jM@JJW7d;^$9Zu%j4xQ4xGjo-x+HY}3ZcZ2h$ZGU=vq zE*vzwCEv1ewltB$DO8!x?UzEwQ=gsGVd6D2Y{~UylQ{O}fXmD=NnxYqb$Jo&-wc2E z8V^yai{aa;ZI3_LIRs=D#FTi*4hgelz|&E45^qLn4HiWV!?@k-Z5obf8yRZyIv=@J zAUe}?>E6jYKiUqKZpwjmpvX`~1mndkWeTq9xJ3h3^5KSMp_U#I)W7-2v&R*fLYaUQ z{1Tt1CRu^QX#iro@cWQ0J5>q06Idt7aJo=uZAEsb-Bma^AM4WQT{f_4S#EjDrts_m zdFk&II`_4qBkL5J@nP|nWEyj+45%WRHUquDTo|jkrr*QTUvx?jz1+K&6Vf4R$+e$$ z15-Cp%UrpnlI*b`@<;q&)0ISsrhAj{7vf2|sk2`K-^tl!rLNS^lM)$5p-Hp7;8peq z$pi*gN96H3b|{CyMY!MWh}~I8P~N5R;Z-S3h=VcVT^X9Hic>RqLoOC283=O%5zCh1 z`9Shb16&iF;njUh-<1ryCKf(lA%xFvtVd}t&HJGJ%1E|N{>p0F%=v-xa9faPG9$A& zFa>mx#7=f@l)zzXMR@oqfP}$JbO8^SxaRv(yG-?VQ7c`ai1r=H-qE9RfaJ^1T`IjF z{b0=~O!}6#}(T z$pU?VZH3zaHL#&V$W~$^9xk>`B_anUDXD}k4YECI<0M_VYQbSGT_X4G!O;{P11Vgq1YNA<2I>F3Kb#@@Z6dtf(xOz;rh9 zhe9J>w0C;j(2FxIdOoZTHz^XNE8Mc$Of2h zuU`uKK6=g*Ayg4+Kx%p*sqlf$c|@MTSyeVL5`9-Cxd`Dr%vMhD<{4>c&S!vZOj;G+ zrsjv%(~aEp*R{?G`;4baHO^hGKB>~(m(!sqh2FZ)^Q(g(6fmoP9HMXiBnk_I?jI2H zgzt2ED~eS2wF&WhB%}rxyBf*5&+nK1_Fl5w9e3`DIR(?oXQzwsZw69zSO{SXQDxl? 
zj$lmj{YW=Gt%&&sAG<2YJI~d~Shv&Kb=vK|ET-JJJ0;ZSi}Sa}+)4P~9t;LS z77E~(Dd}b@A|nY@bf%tC{$?5r9OyjTO*B9BzkS|j47_tx<(gZ+#kta302*G091UUL z3HFy3IU}jI7Dw)*QTFK#ROP|Ev)_fdu;H#MxvsC8Xu~U^NT*rH2*;g9cxS!+?PE`o zNHxGnmU5LkmS@O(E10l$&=^S5@p2xDK}R};gfDY&%j6RzIxABsEdmiTBI(zvp(4Z+%~BZH1d~s$xRy?6 zRv1QWD)BfZ4eoQ{%hrin;ZC=BNFe$%>ZSxkscO8D_#^E!qp*`LsyH~*HZX2c|5<4Z zr);qRQJp6}ecg+qb5+*NhLH;H!?`3K8tOm#u`iru63<`~m3E}TAa`+~2SpiGD-nDk zD#sy}UEb}A!CyzVC0hU8{EqM#X`sM#xKan!y^1d>>xkCFQ8{(~yfs)2r|(|D9^Ed{ z4;O84BGqM60xdJPiX;Ch$;i^ ztv8*F5nRGnO6@Br+(cMW|vplo2 z5H+`FRqrHM|5Gc*4#ODccBSxO3f?LTFFD{p*HyHsqG<5fMc~dH3iji(suMoL^rayW z|AuJF0&Eq-g6EmQt_ioTR3}wjk_n>bftiGd?VN{j{7HbDui+PZ4JTzNE^97Ov;$!$ylg$MeLO-|jsmY8d# zV)RRA=@^Jhg~p1iW=YdOTNqBzHyWrLPP45oL*T6Gcp`=Oy9H~7@lSMJVyibS+i5dp zgO8$TUa=F;kGsxAd!sr2ufv%tHe!?RA2VwwJ`c#q^4A8=$dM_GHgm?S?-}TJK4+ID zruD=QCYRsx>(4|Fg(kO&%|k_}3i_K-0Hf0K#7S^MfsZ`53+)+L%j@|LH~8Ka8u00S zzR>br0}5}P(3Ld1(H68szH!kzovA9iyC$UDs9St!0E&LAw-b#>aI~eIs({Q8o=w^W zO8>Mc7I-ktq-7AH8pqYXzD3+)GD9K6{FLG1oH$Y2o~Na`&+U0#KdxrvVD@t<0D%MN z)Im>`x-(W87hsr58E0FegX(o6#=L(zIlfn3X^pbcErY5qgAc8*YK4@s-mcBMm5mUH zx-AtyjFG`iV zPfXjAX`nZG)8Yj)Y(Hc8_uY3aagHi)4sY9sd|05Kg<<&q*-}rIkSRM=X1o3VmgzJEx_0+l|5% zDr&AMgMPfOU#ed5u(BX9p!QW;INsy!4D745rzCzZ*sUS@%hy4Y)&@1Bpi&aCqqEo0 zt*7aa^jN7*PC%-?Uyh4*yhVNrQMt{Y)b`3x8qQ*4uADra6MR_6$*3tcd;N8Tp|*Gq zQu?@pO0ES#aQ16z;~X@u%@$s{)vhzLUJPtMztdD{)|4_*YNE*i{DqC7-GE`)N|~!`iTz^IxqlrH|HWi}Z?ucE;gtl9I}$zWj<+PIn=+u@qKa8g=1($y2TrQkPY!?q0cK1M z1Jx@#wv?t(YCap20CIbDw??naFE+nSYf^RtAxLomPe_b$Wd%oJv5Q;SBnY-Ah6>Y* z-&S-2N<|(YiAiOv6>d+*P-8^4xtC-rQwppgU`bFw@Q*YJkAd2KH_*CY9tE-1xqiWDl&Lc z>BIWEsE=($@TqMGc@peaO6_Sifv~QE{#4~iC~vXQho$`Gd4<7QiAmYV$+aU7 zNSo&^Mcs4Lq7M{>V2g|hPyv^=8gZTFxo9daxhN@X*%NaP6!jV>+LD_;Bp9{h!_&14 zH4~h-q`euD*WAw^$&n^li{xte>kR`%!lT^lc(8*^?P^bBueoq0YV5BggnJ3tJ~vb| z31#R%8?6pQr)yK2-y;7ySdtSdMOflRY|+D2gY1kcw?dFkCGaB)8t2{f@06~!wb71D z#?_S**!RDr4u)FbukR3?;QP4s7wN|gw8o_FAiLyka{Gdf&UVZn-y6H<#oUJTHbzmz zFBsI(hP1~&{DX16 zt7wTX1Ba~k*5x4dOWoh?SQ2s^T;QN}l5b|lanh>y8+;i0Ph8BND2;T8qdQbq_z=)Y 
z0cf4|v1+td&APy^6*Uz?fwU3Pn2MVhKzlvLw93zA+S4WhUER|P<1;F&E*8jBp6nkG zM96_jFIIUwnEIr78j>njoS}|r3#wh+t)V_lg@+Bh$$4o4)oxg#R3ao-YwI&IR53Rf zSVd*&`DZ$-PT0~0lDiF4lDqy@8mjC%T*P45z?M&Jp(F|Y^9*r!tSkd%&C7gQ98sw$ zPn%6E5J0LJAbCQQVKB(FbdLyc-arxz6Vr9LRjKAF9Ej%vm2Lt+#}NsFxvB?>6EJy;QYtym;>Yx*bHRw>fxcnXtY|i&ooQ= zuIjjS`N=9*{8E5=aZw}-JQ+?Q?(G#RTR@_Es&mcJn6m)aae{5Zf31Y)elzwBS448k_RQUkeN*#L89z{;}^F% zp9K|_7qp-qa28`_0`5T2J8%E&2bLH5G&!D*D>2Xu?(Bm^_{z6Yz8);VK8k3`P4%C% zjQnWo*hvI+e#Bp?+=D032_%E?iZo44Tp%dRGuk9oJUVK6U1&iQoz-|Xk{=~@k_1u^ z;*1#XA5dna(vRhAUSOC2ra$^f4phx`MRARE7Y#m0&^OcKSbr`u|9n^BASh!tu9cTiX;BRL>xS*0Dk z{GIxP9@tg((J-NP<3Y;p1WF^ z4jHb1>&P8d%%nCAX)41(f|Gox1gzt(*>r}KfAHKtV3M8U9}Jg>p0}y+KlY83AyHbR3W~($xX+C7RV`GSsFzc~aw7JsqLTSr|A>v z(5GYlsggH1Oq|L*V!_z5#2cmo&Dc^}I;OA@+TOWJoRqKQR{tLdvKf7yK|APgh@IxjQ z@dEw5**m`Lr&$aO@s-!}X?ezN4~`uCR|ue00Kvb$**p%Q`|lV*?LsIS|F%uf4G3^l zT3|bpCGwv^Vv!+(j2+6io<=aako;DtsTl+}D=8Rx?S`QXUWuCG<92xOyxh);K$x_9 z2yz3#ww$gWrrK=%rRe@Mg^qGjpS$T--V-{fQA_ovwa2qV`1d{$k3(Nr=7+X`W$_92vj;GyMVuKR(9 z`PY_Er@{WEZ^;S%cO@~*jR=e9wDM*D#W2kkf);zxHpghBS$pp0|FJiA?3G1*e>X29 zGv}rZFQm!*?jQyRWow?&U1mau;BCO@AiAIVeriLubL*brV}7M@y>bh48yJC zf7}P+Ak=DTKOSBul>J?ob*(+N3*kw<$m7v$rp5diw<s2gtoA)9iG`V58W=zeSyZG`iqlb?= zUQ3$Vn)~uzrDD8YO9aTllTGU07wMo$gi!ZX)&1Oz{||HZ_i6NEtzVdxSd3@&<{?MG=N z2o#LP+<8)*mvXrA-ylAU0}=-}v+eQJd82+&>D2mGq>`dO{THJ6A#{>qQ;7dwKcS@I(g0!UMQW7HJe^!6@gZt@Rd(E6ZbI$qX z`!zkafphhLE)MPh>81|-z92q2YPw*EG^NbMj8AB?96CD>0Dt($1^5G~Mqmb@_=ZZLj-Z2q zWp9v3eAm6d0mkEuKYUzRArLHi2T1<)jUYH#V~Zgwtn!NjZMEa{Xvht~LB{I{S3!cP`R@tE?Iy{ z^>-$f@dJ~dM*B*j2-eQHWmggn`s>S9y%(O_8zX4k+QVFN_e9JuR%FA}RLHx-nn?+{d z#ETX7U%+WcMIJO zii>@5x=(&g@_x<0AtA_y3@)oRG-X6(8>i9Ei0o5wb+tO?`>Z(NMm?^4>-Cx99ThI( z5*!qUY_{(t9@qL%eDgRJp`jVR(Nq5c$65SE3=k~#LVA9S1DaURTbkGa7LlnR=!OPA z9&l(fPgM?_l#32r@Ebi^o*Nrp35XhRV~zp>Uof8kH%QoH`9j*e_U_(2PFKrT4op6t zE#d>awE2iM)-EX#Vlv&F)CA7csoX_Dv~uC-)whA0=V1=M<_q~y1hblg8UM%?AZNm#JSe&7Ibco%y!)kRh7SMc(+CU72BBx|FHyVp(lFW1WoczYGm`{p z0PFTN(#mLHmc^>Ey>vm>ua3J>KXYaRqnfIT0v+qhueHAy#8k%#TS!El6t5VG)}GpI 
z2WiqWgQb+G`O=UW6hpP-l?RelKAdEo%^$y%#EvBF%aB%VbC0cA0deBg_T?2P0UlMq>dJRuIT5RtT?VeDl2i~uKPb6;A{WbPE^o`r!0UxNsZ``v2 z_&PD>)6DCD`dx--wWE52aQICLw1CdCqB$>FIrFn?f%l?29cdv&R)X{gQ2dcO(*C)K zM*q0y%1?lj@WF_n;=Q5Bd_hUFRRhHms?1xHsy&4RM^ke3|GC{o(x!bvGG3vqF5_;1 zuSx8}=5vCZ901srsTIWn4B~YussI!v6&JTsQ?H}ENg=X3Y3uAg;nsG`a5g?s^%26I z1;CW5)q9rD0Ir>&{zn`e^ILVr!K+CvPh24ulxTk+W^=5LN*}nF2^jCWPtxuWG7F#V znIr8t{#g+_Vls)+2%0wu)8(O#(x*48uKR76e6W)9p?%9xKYK^FyY->o0rJKhue%?%EUe5UrS~ zU?PJPp1%v3h$ByB(wa9c*M$4Tj4};V?niW~E=`{W0k;+ki^Uxh0e}ZmHw{UzN|-VY z1_-vWBjjcPkc*~xDPvL?QOvQN$7y36lK#j0gX4NycJ@MoIXgx4=F7{-^!McSOzV7g z-$H`Fh755%$fQ+`x7IyQ4Ezmx`d@2VTqR!ur+_G-82thv-ra-Ob_dzfQ2iFK#E4M+ z@2>oL>q2^yD{Z7or8N`}cXU2p;_|p8<9K5!YI!D3CIsdC3{)sy%$X>g_SlR*xz0aDWSee zu15q<0jCd@%`Q@qS%WqqIU*XJZjMzu62ZYQgPLO`Nj_k7zfz168dKB>Ly^;<&`-J-wLrF~ znWuY&Wk#sFggJk67YSNc+={ABZRq#AGG15Xj_`g}{|{+om%QQ6c8+0Zt7O$O zIaRO{`zk+#urY&(Ov6r>U^`|`Etr5ah497YKd)LB3rY=Hk$8JGrzk7QTFD})p8llS zlj(+updMzC&qkZdV`Sz{j! zrzOAD3^Ji61jo8+B1QFo?^jVpjxMK;`^@og1wi4O1e0?trMaXg=}mMm4p|8WDm85c z?6ZceQmD_X^WUK;Rx!{a#p&wix?Q#?)eUyd%#OK4B}=5rZmWJe5~e*it<<(>CA70h zQxDeGRQP5jlTxk0gY+&KbVAkiWJnYimY2M-YZb1-HlZJDf|@3Ji^ zn=%ik6(Xkt;eR0z9&B!vg(gbc>rQ>cvnBIa!l1>TjN*60^ul4M0AA!r_7`LbK*T6y zEOr45f3efB@$0vh#&e3oR(5S-hDQIah&L8=*TF}Q9l@^8y0DUFK(A6lHmvl4QEsK; zRqM0pB1zrarFOPdrBv3bjw}Hv9V^!dI$S>8o7oTmbtL_X zN)3U~6L;L3->2ikezsNFFl!TflB*30Rd_lqwB2Ol50Trd*a*uuOvho0*$$Orm6lsxu1ntFs3wqCnHzqLOyqQe({{tJg@=mMmDWX_l-{@Gkw6)(B=bY$~uab zx!s!ii^Ds1%n*&>LFDHU$c>M-(f@+$G}+J;Ib}3)^U7fG^Hr8A(V0)!7&lrhP1VN| zXKyvWj+Im_YNJgi|0$15-?ct%jF>wD;UJFCiF#f!eIU{e{xwL0Ti#?inXFA)b1l0u z_Qnii!SnaOj;6Vx!)S5_px^o~ja7X9!ynbJ!11ecMdnsfrLrAmX&q7}cYDJ9Zu(wx zYlq@67;v4ih*>tuGL0r?sBiMr4Fj0rDfm<*1~p)a6T(atjy=btLKuDTgVNVNs8 zkvyLO6Lroa*%2IG5+UNMt>04(+vcB|#u9GV{lr1p{z{E_ukir;TrG$-)%S1LaDaGH z!Q{~h!tpJ2g%MvI%qLwr#kpIj!Mp56(u;AzP%s5YT5Mm~`H*+>OdJIKsU zM8~v)B&U%|OJRjZT}A;>|I9(9Ch!jUY$h64i#3|5>kCGr@Y@1s6Gf9-uUMfe`sVMv zuZw!KdaugVj|>4neh{aE(S_8FcJAdj6Kk5WR2446dEo#c;ODv|;w=Ev_HQjZv<3Na 
zS7pxg!3kyn>M9$FAvXOaeHthqJ|VumChm;!BR$^$81eK2+;=+! z73$hwwa=6edEo}ajWRRbvt090p7^36Rp2Oe)yb&-CKP#lzc4rmS8{7d8)Ek_HdPfy zRUib3G)c$jUb{S(vUyV$vvX|Murw{-kwkHG2Edq@$T@bsWI+>3*HHfH%HJ!=6^O=4 z%XFK`OF&#{5|*2#X~1>+JvjL3m5pIE;gVEzY2PEv^^;pP($9qmP1;*+J*j5x^6%Fl zeLHSk0NzV+sZx(hhHx4_wL~c+CU5Ph*d_M)q$HFEst)yLiqiXIAm!k8Se6BMK|vC6 zgZq+~DM{ThXIBQ|_kbU~r+bU&ufM7hTiVdZ$ajh6QB7)EojS;IWP()J-gnx2f687{ zFLS$RKjB{1A5x1atD>-WL`nTt^7@0uayr!%)6>Eb)rz3~pj`n{NpyMyW3OO?w$Qj2c42mZ>beRHknzg)P zv;Ief4a;E@dhGs;1{=<*D;03-et1 zo=uomMgIYvEtS}8S;oix8Cn0l+Km6&1S6956CKJjBGdDA1`n2ub5i2(h-fUL@{S0a z_A-spAdN%kRRv00IYJ;!Rs>wr3!kUE{j6+4x36Dm)hNq091-jaJtjcJRoSj+q#yAZ z<#D*G3ILHA41kCAI{~Rbnp!(zGw%gmQd2Lja-0q9b@P3FcE;dDd{hOEdy4&Z+|5dn z75y2LcPed=)o%@3QIrbE+r*;$o>8y{arYOHF@T6=6o-^!Cn*KOZD8UoVOe!z) zhRw7@A_q>No9G4AikmhJwv5vxpClfP+}PMaER2ZjAG3l|xv>`_e+sliJwgp_Qsj{s z?lSqE-B|PVa+>n|0A`=O680-!*Ww0%#3cF@e?O2;K`6?(ixx1{Z zIVua#soX+vKAX($n2fWEoHw>SN2Bd`{pNj2=D;-lYZ&l(q67@B!S=q3!-wwf9T%HY z9V>>4HO%aF-5xDISP&^tyxsgVz&^Fvs8*mRUigtL7jOg}UeI}XL$OF=UEI5cg?Cd6 zdav}NZ7v3doT@I!sa4=ESQIR|U%xlHSgNauD>=2=?1NP@qQZ+6{pZ9~PAz8pd7f=s zN@U-}^v%7=eIPH&J|1&cx9MbdZ*B6+=Cq_|hK`paghD-Athw4DZ4iW8CCVYZ#p)cM zz*;6>8qaU0z8%dKrTgT)8~(>B;cd%ylqo>nLDKc7#ptW2pYLzQxuh1F7*!M|aRc|T z1oU$&=2Nk;L*OsY{lLlG68em4UjLIHg86?sh*J^Oiugn9MA_^dXqk!4jPLajCbkc1 zQ4^r33FO+98C;38d3xhH!XSL&&0@~}p$*}YrKeb{{#aH!foaUSd}9cgRDV)uM2J{Y zg27z&{aIgOp}L8O`nl~iu}1XTicaMDqva)fX00o~-PGTp0g2{QWw1GYRW{M6K2%Vf z-l0+JB%Wb9;3g!&sWsEg|4?DxQu0mB)r3#m2nu-)ads-wsY`guNnr7QT^URC1p-Da z=SRg_CbIsSDh;Z$HKPQ56Zgn$0Ej2J4M*k?u_uXNjse-I+9S#>&L~~QiOf_Zkvq@7 zPaj`j6FRu0NZ9Dj-1%mkPbAyJz;v-v7r4i+lw6zOo0!CJQBXzlyLu6&z?w=}T6@T; z&$Mx=cbT8Cja_huB^n7=QT%4fx_}y+uNZ@dJeqHQbR<^X#e6@7R17qp30J(56$Nv2tJh6BslQ2%fz^QAatoYz{K&nf~ z=wl@M!N2$JhZ6@2LmE;6$if~Y^AzS9EcT(=&6Fp$NWmJ@u0C9OQ%|;gN6}hm6=L-# zRqHcm3`vpa9e<#4Ca#I~XD7UfpV>s_5{t^daN<~=@7jRSk_EE3XgxONY|uUI+gaBi zyQSqRN#nm!ms9?#vqvsTXT(j$Ua+2Wo*o^PRi~~P)p_dmi_r8NWerjgIjx$j3dFjB z08@_C%+Io%*#ki+8W;3a7gCgh)D84@_b2_5fAj<(KRhUA!H?HI7f1tRGTSz_Lv1i1 
zr?f>>4^Rt7az<{~JT>}K2*toss@8JtH92GsPW{Caj4xe0Vwp1gB|YN3MMKxQ{SlSn ziD_Z}juc#S5g>iQTPjeXDkYX(pQcyR&p=h81-H4^G+KPdxYs&B%2FG0&HGr4;sPIkjv(HJQUGOF{&1=r( z+!CeadG>GoA#Zz})G~qY-%x@k?fHOged6%+h{tv46fhd|1d z*_7v#NCN9HDjoyOmXfzd>{h-Fl^tovKmy45T^_GeC@yio^(V?TGL--F5u*Jhd14QT zc0_xpc_rjTyVyV$n7MJOz5S(yfN`xzOArat*DyZOw8pCwt|1{sq31p>2{(R;8I9F( zM?Wm#)so67CLcCbLt5*>Y4}8?j|I(E#RNm`((32X-}y*E%!U3F5L`e!SfkQ2aUULO zW11J_7Atu{P!Hrkvc{$nb8W^(!6Idg6omX!?Ezv%sK+%e=OQKTxoHS^YIA&ebXh$y>*Fo6L@J#Q-z`L94c_m zLpHgD_ZIS+rR}%`(_Cb=})q?7Z8oEF0BccHB-bo%pA1cG2OVmuoFrc7XW(0Y3&ULCp)PVEo+=SQgFvY~@(bukR_3gN2ufvNa zRm)tq*k^6MbKU@%Br9r(hMd&9p1M+dv0{z^BvPS{UcCyp`zcx?gWUNYW+SVY1obMS z2GP@>OI~kFd@AG-&k$uvY;&gw$k(4&`>mQrjDb~ung<{ja+T2>=Nplnb&>5Fh(wB`G~ zc~xMxS7eOMsT8H-2fr*pH}7}!XvWdr6gW5=j9>#Xu3ZIGwsdJrBt4+s@0>%E6PxY_ z%PahJtn(NBTR!;GkWpX?>>Oo2Pwar3y;UgQ|cHveu5+SrhkS z7N85^l#@pDLC)H4u2k)blt9P`M_}l_YV>U2kG5?XogBfwfCZ+(m9r)Ql%zQQWW5*PIf zZzPh)JY8wCrf`ZQfc34P{K6}-z?llU%(FHiq4}XZISEnW9M(}X4gj*9-jXb(rpSCj zyYO|?0n~rax!cYpaaU+jjV2N-?0o2HV`m>qA9$}mi*m5JPdLXHyzp%ISHXMrPW<2i zg~~RSuOLc1duDtM?ALN(ZM1M$lW3f=8O%LQcvUe%dsb@E|bSPz5FqQ zS%L5ldZBv6S5dAKmmU7&#EV~Ds0VduS^9*uxMbk9Rf)(6+$C3}a))!hb+LwqW#mY; zcRFO(B-zpBT4dQ&DO_>|J z5pL`^cWsE+RYx2pMKDTf)fhV^u8!ILBa_bPQdMna^k*P_oLKJ*q{k83pQ`!%@ z07P<$W~yQQ#f~GrG`R_C4B;HZBHV_?xmYjdFtVxntLxO8*MmTlNX_$TOH2CUAu|GW zW-8w`_Xa7p)H@Y7aiCUo*bo6@yAblKPZywiwiKe8%vx$Q-fsZ0&`C)Zwyo(hTlh}R zLEqwf*lWl?3}b%|3Y@hNiKB5?5N1w=#VTy^G;(8_H(Z2`_<&sbG5i(iu{u zN&7HAIA%)Ug&nc6us3d6ekMYWoS>$i=w30Y6%&p+FjPOzocDF2d6?hod@nJGeDOP@%wFVU#c z@>gc7W3h}eBS^Ys;GzxZ<8@qB(4i@j#n^JYpolsG2gc|(~; z=X%o5t#(pXA#1$DyNEvric(e~*v0ir<`nT(qI-vfX(5)WHw!5e;SM&|)_fl7@Mael zn;GRaq#a;XpdHEN!y6vekVi>;1?k938i&~g#&pXRFS>TPgI_pFTOlBz zvcta-&^e8Mn(sBQ^qX35>il~w5q%dOxSSjhXq-tL^_?;Ux^cshZsa`ZkJaucd@Au? 
zh<~-Kj{aI<;^%k8I;+4J4C8guZ$I8Y8UOJ_@e#YarZF!4t+>CKM}kuz{a$;}@`*mg zVwQLlwv8fZzjpOQ?zxY$Sn2(W3$p^>e`=b0F^U-LTRLM1^3h@|9_5y_Y0=^-U*Fch zbGCmjsRIYYSIwj(C}CkgLluoNgymxFzH!HubI2-}?Hn^C(m&WB1?PFBO-dtBR z@TwaFS}ax#`QLtIvv?2_4qL*vz3_I5KP26cnNS$=LegLz+?!t)hfhZ;z z01XB;*()IZ>1~9}QwuQO5=F0pw2Ba`ViV`2s z(zVsyD;EKJFGl5>#AcxW1*%%v)$7;bv@PgLK$qALY-`5lHa3Qd1^~a#XAIBsx!{6O zg-vpI*v;1#cLJz>v*pD5D{o^1ePs&Q0$mqRZ172Bg&;s=X#b--)BLSuH|b`d0HbN4 z?x<1V6WZ2tRsQkc$9C+D+Y@%Pi#Vh0<==<-h|zgsJRN#Plmvn+uNep#|Ge9 z#}SBFtrk3;MmgKo=xNJLpJ$o*J2+UaIqb3S^uFiE2Ej?fBZD}tD~{~yq79~eMteNq zrDZ>A#y=ZC-LKT1`}_PcHWUkZj^dt*tr$?hmJS6d`JPDzbMkUyT zl?G_FYy2^Q22ha*8m7E-Vlrne;qp;5xhj1&wLF5=So4!$u4)m0sC~7lR_T%8)7Eso zqXJ*qkFCp1rTtV|7CE+=?ptX5W@1ndm>J_I75a1T>yTPo5FXC znQ%rr^PUOSze4&WcFz|s&qNH=0imwNB|9-DyfdIzbE5$wk^fq^n)dG(Eg0kwC4~W6 zzK}q>caK|Y$bHqClr6f=M{I^GwWbg2+Wm>lZKDFELfcMBrH2uV(jTD8SgP2lA@p@$ zk0v(zJ3OXVhmz^UL}Q5&f*>sys.stderr, "This is a Mercurial extension and should not be invoked directly." + sys.exit(2) + +# We require Python 2.6 for the json package. +if sys.version < '2.6': + print >>sys.stderr, "The codereview extension requires Python 2.6 or newer." + print >>sys.stderr, "You are running Python " + sys.version + sys.exit(2) + +import json +import os +import re +import stat +import subprocess +import threading +import time + +from mercurial import commands as hg_commands +from mercurial import util as hg_util + +defaultcc = None +codereview_disabled = None +real_rollback = None +releaseBranch = None +server = "codereview.appspot.com" +server_url_base = None + +####################################################################### +# Normally I would split this into multiple files, but it simplifies +# import path headaches to keep it all in one file. Sorry. +# The different parts of the file are separated by banners like this one. 
#######################################################################
# Helpers

def RelativePath(path, cwd):
    """Return path relative to cwd if path lies strictly below cwd.

    path is returned unchanged when it does not start with cwd followed
    by a '/' separator (including when path is exactly cwd).
    """
    n = len(cwd)
    # Slice rather than index: path[n] raised IndexError when path == cwd.
    if path.startswith(cwd) and path[n:n+1] == '/':
        return path[n+1:]
    return path

def Sub(l1, l2):
    """Return the elements of l1 that are not in l2, preserving l1's order."""
    return [l for l in l1 if l not in l2]

def Add(l1, l2):
    """Return a new sorted list holding l1 plus the elements of l2 missing from l1."""
    l = l1 + Sub(l2, l1)
    l.sort()
    return l

def Intersect(l1, l2):
    """Return the elements of l1 that are also in l2, preserving l1's order."""
    return [l for l in l1 if l in l2]

#######################################################################
# RE: UNICODE STRING HANDLING
#
# Python distinguishes between the str (string of bytes)
# and unicode (string of code points) types.  Most operations
# work on either one just fine, but some (like regexp matching)
# require unicode, and others (like write) require str.
#
# As befits the language, Python hides the distinction between
# unicode and str by converting between them silently, but
# *only* if all the bytes/code points involved are 7-bit ASCII.
# This means that if you're not careful, your program works
# fine on "hello, world" and fails on "hello, 世界".  And of course,
# the obvious way to be careful - use static types - is unavailable.
# So the only way is trial and error to find where to put explicit
# conversions.
#
# Because more functions do implicit conversion to str (string of bytes)
# than do implicit conversion to unicode (string of code points),
# the convention in this module is to represent all text as str,
# converting to unicode only when calling a unicode-only function
# and then converting back to str as soon as possible.

def typecheck(s, t):
    """Abort (hg_util.Abort) unless the exact type of s is t; subclasses do not pass."""
    if type(s) != t:
        raise hg_util.Abort("type check failed: %s has type %s != %s" % (repr(s), type(s), t))

# If we have to pass unicode instead of str, ustr does that conversion clearly.
def ustr(s):
    """Decode the UTF-8 byte string s into a unicode object."""
    typecheck(s, str)
    return s.decode("utf-8")

# Even with those, Mercurial still sometimes turns unicode into str
# and then tries to use it as ascii.  Change Mercurial's default.
def set_mercurial_encoding_to_utf8():
    # Point Mercurial's module-level encoding setting at UTF-8.
    from mercurial import encoding as hg_encoding
    hg_encoding.encoding = 'utf-8'

set_mercurial_encoding_to_utf8()

# Even with those we still run into problems.
# I tried to do things by the book but could not convince
# Mercurial to let me check in a change with UTF-8 in the
# CL description or author field, no matter how many conversions
# between str and unicode I inserted and despite changing the
# default encoding.  I'm tired of this game, so set the default
# encoding for all of Python to 'utf-8', not 'ascii'.
def default_to_utf8():
    import sys
    # Remember the current streams; they are put back after the reload below.
    saved_stdout = sys.stdout
    saved_orig_stdout = sys.__stdout__
    reload(sys)  # site.py deleted setdefaultencoding; get it back
    sys.stdout = saved_stdout
    sys.__stdout__ = saved_orig_stdout
    sys.setdefaultencoding('utf-8')

default_to_utf8()

#######################################################################
# Status printer for long-running commands

global_status = None

def set_status(s):
    # Record s as the current status; echo it immediately when verbose.
    global global_status
    if verbosity > 0:
        print >>sys.stderr, time.asctime(), s
    global_status = s

class StatusThread(threading.Thread):
    # Background thread that periodically prints global_status to stderr.
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # pause a reasonable amount of time before
        # starting to display status messages, so that
        # most hg commands won't ever see them.
        time.sleep(30)

        # now show status every 15 seconds
        while True:
            time.sleep(15 - time.time() % 15)
            current = global_status
            if current is not None:
                if current == "":
                    current = "(unknown status)"
                print >>sys.stderr, time.asctime(), current

def start_status_thread():
    # Launch the status printer as a daemon thread.
    printer = StatusThread()
    printer.daemon = True  # allowed to exit if printer is still running
    printer.start()

#######################################################################
# Change list parsing.
#
# Change lists are stored in .hg/codereview/cl.nnnnnn
# where nnnnnn is the number assigned by the code review server.
# Most data about a change list is stored on the code review server
# too: the description, reviewer, and cc list are all stored there.
# The only thing in the cl.nnnnnn file is the list of relevant files.
# Also, the existence of the cl.nnnnnn file marks this repository
# as the one where the change list lives.

emptydiff = """Index: ~rietveld~placeholder~
===================================================================
diff --git a/~rietveld~placeholder~ b/~rietveld~placeholder~
new file mode 100644
"""

class CL(object):
    """In-memory form of one change list (CL).

    name is the issue number as a str, or "new" for a CL that has not
    been uploaded yet.
    """
    def __init__(self, name):
        typecheck(name, str)
        self.name = name
        self.desc = ''
        self.files = []
        self.reviewer = []
        self.cc = []
        self.url = ''
        self.local = False
        self.web = False
        self.copied_from = None  # None means current user
        self.mailed = False
        self.private = False
        self.lgtm = []

    def DiskText(self):
        """Return the text stored in the on-disk cl.nnnnnn file."""
        cl = self
        s = ""
        if cl.copied_from:
            s += "Author: " + cl.copied_from + "\n\n"
        if cl.private:
            s += "Private: " + str(self.private) + "\n"
        s += "Mailed: " + str(self.mailed) + "\n"
        s += "Description:\n"
        s += Indent(cl.desc, "\t")
        s += "Files:\n"
        for f in cl.files:
            s += "\t" + f + "\n"
        typecheck(s, str)
        return s

    def EditorText(self):
        """Return the text presented to the user for editing the CL."""
        cl = self
        s = _change_prolog
        s += "\n"
        if cl.copied_from:
            s += "Author: " + cl.copied_from + "\n"
        if cl.url != '':
            s += 'URL: ' + cl.url + ' # cannot edit\n\n'
        if cl.private:
            s += "Private: True\n"
        s += "Reviewer: " + JoinComma(cl.reviewer) + "\n"
        s += "CC: " + JoinComma(cl.cc) + "\n"
        s += "\n"
        s += "Description:\n"
        if cl.desc == '':
            # NOTE(review): placeholder text reconstructed (the angle-bracket
            # span was lost in this copy); ParseCL strips the same string.
            s += "\t<enter description here>\n"
        else:
            s += Indent(cl.desc, "\t")
        s += "\n"
        if cl.local or cl.name == "new":
            s += "Files:\n"
            for f in cl.files:
                s += "\t" + f + "\n"
            s += "\n"
        typecheck(s, str)
        return s

    def PendingText(self, quick=False):
        """Return a human-readable summary; quick=True omits reviewer, LGTM and CC lines."""
        cl = self
        s = cl.name + ":" + "\n"
        s += Indent(cl.desc, "\t")
        s += "\n"
        if cl.copied_from:
            s += "\tAuthor: " + cl.copied_from + "\n"
        if not quick:
            s += "\tReviewer: " + JoinComma(cl.reviewer) + "\n"
            for (who, line) in cl.lgtm:
                s += "\t\t" + who + ": " + line + "\n"
            s += "\tCC: " + JoinComma(cl.cc) + "\n"
        s += "\tFiles:\n"
        for f in cl.files:
            s += "\t\t" + f + "\n"
        typecheck(s, str)
        return s

    def Flush(self, ui, repo):
        """Write the CL to disk (uploading first if it is still "new") and,
        for web-backed CLs owned by the current user, push the metadata
        to the server via EditDesc."""
        if self.name == "new":
            self.Upload(ui, repo, gofmt_just_warn=True, creating=True)
        dir = CodeReviewDir(ui, repo)
        path = dir + '/cl.' + self.name
        # Write to a temporary name and rename into place.
        f = open(path+'!', "w")
        try:
            f.write(self.DiskText())
        finally:
            # Close even if DiskText or the write fails.
            f.close()
        if sys.platform == "win32" and os.path.isfile(path):
            os.remove(path)
        os.rename(path+'!', path)
        if self.web and not self.copied_from:
            EditDesc(self.name, desc=self.desc,
                reviewers=JoinComma(self.reviewer), cc=JoinComma(self.cc),
                private=self.private)

    def Delete(self, ui, repo):
        """Remove the on-disk cl.nnnnnn file."""
        dir = CodeReviewDir(ui, repo)
        os.unlink(dir + "/cl." + self.name)

    def Subject(self):
        """Return a mail subject built from the first description line,
        truncated to 55 characters plus "..." when longer than 60."""
        s = line1(self.desc)
        if len(s) > 60:
            s = s[0:55] + "..."
        if self.name != "new":
            s = "code review %s: %s" % (self.name, s)
        typecheck(s, str)
        return s

    def Upload(self, ui, repo, send_mail=False, gofmt=True, gofmt_just_warn=False, creating=False, quiet=False):
        """Upload CL metadata and diffs to the code review server, then Flush.

        Raises hg_util.Abort when the server response does not start with
        "Issue created." or "Issue updated.".
        """
        if not self.files and not creating:
            ui.warn("no files in change list\n")
        if ui.configbool("codereview", "force_gofmt", True) and gofmt:
            CheckFormat(ui, repo, self.files, just_warn=gofmt_just_warn)
        set_status("uploading CL metadata + diffs")
        os.chdir(repo.root)
        form_fields = [
            ("content_upload", "1"),
            ("reviewers", JoinComma(self.reviewer)),
            ("cc", JoinComma(self.cc)),
            ("description", self.desc),
            ("base_hashes", ""),
        ]

        if self.name != "new":
            form_fields.append(("issue", self.name))
        vcs = None
        # We do not include files when creating the issue,
        # because we want the patch sets to record the repository
        # and base revision they are diffs against.  We use the patch
        # set message for that purpose, but there is no message with
        # the first patch set.  Instead the message gets used as the
        # new CL's overall subject.  So omit the diffs when creating
        # and then we'll run an immediate upload.
        # This has the effect that every CL begins with an empty "Patch set 1".
        if self.files and not creating:
            vcs = MercurialVCS(upload_options, ui, repo)
            data = vcs.GenerateDiff(self.files)
            files = vcs.GetBaseFiles(data)
            if len(data) > MAX_UPLOAD_SIZE:
                uploaded_diff_file = []
                form_fields.append(("separate_patches", "1"))
            else:
                uploaded_diff_file = [("data", "data.diff", data)]
        else:
            uploaded_diff_file = [("data", "data.diff", emptydiff)]

        if vcs and self.name != "new":
            form_fields.append(("subject", "diff -r " + vcs.base_rev + " " + ui.expandpath("default")))
        else:
            # First upload sets the subject for the CL itself.
            form_fields.append(("subject", self.Subject()))
        ctype, body = EncodeMultipartFormData(form_fields, uploaded_diff_file)
        response_body = MySend("/upload", body, content_type=ctype)
        patchset = None
        # Default so the base-file upload below never sees an unbound name
        # when the response has fewer than two lines.
        patches = []
        msg = response_body
        lines = msg.splitlines()
        if len(lines) >= 2:
            msg = lines[0]
            patchset = lines[1].strip()
            patches = [x.split(" ", 1) for x in lines[2:]]
        if response_body.startswith("Issue updated.") and quiet:
            pass
        else:
            ui.status(msg + "\n")
        set_status("uploaded CL metadata + diffs")
        if not response_body.startswith("Issue created.") and not response_body.startswith("Issue updated."):
            raise hg_util.Abort("failed to update issue: " + response_body)
        # The issue number is whatever follows the last '/' of the first response line.
        issue = msg[msg.rfind("/")+1:]
        self.name = issue
        if not self.url:
            self.url = server_url_base + self.name
        if not uploaded_diff_file:
            set_status("uploading patches")
            patches = UploadSeparatePatches(issue, rpc, patchset, data, upload_options)
        if vcs:
            set_status("uploading base files")
            vcs.UploadBaseFiles(issue, rpc, patches, patchset, upload_options, files)
        if send_mail:
            set_status("sending mail")
            MySend("/" + issue + "/mail", payload="")
        self.web = True
        set_status("flushing changes to disk")
        self.Flush(ui, repo)
        return

    def Mail(self, ui, repo):
        """Post a review request (or a "take another look" note if already
        mailed), mark the CL as mailed and Flush it."""
        pmsg = "Hello " + JoinComma(self.reviewer)
        if self.cc:
            pmsg += " (cc: %s)" % (', '.join(self.cc),)
        pmsg += ",\n"
        pmsg += "\n"
        repourl = ui.expandpath("default")
        if not self.mailed:
            pmsg += "I'd like you to review this change to\n" + repourl + "\n"
        else:
            pmsg += "Please take another look.\n"
        typecheck(pmsg, str)
        PostMessage(ui, self.name, pmsg, subject=self.Subject())
        self.mailed = True
        self.Flush(ui, repo)

def GoodCLName(name):
    """Return a match object if name consists only of decimal digits, else None."""
    typecheck(name, str)
    return re.match("^[0-9]+$", name)

def ParseCL(text, name):
    """Parse CL text (disk or editor form) into a CL object.

    Returns (cl, 0, '') on success or (None, lineno, message) on error.
    """
    typecheck(text, str)
    typecheck(name, str)
    sname = None
    lineno = 0
    sections = {
        'Author': '',
        'Description': '',
        'Files': '',
        'URL': '',
        'Reviewer': '',
        'CC': '',
        'Mailed': '',
        'Private': '',
    }
    for line in text.split('\n'):
        lineno += 1
        line = line.rstrip()
        if line != '' and line[0] == '#':
            continue
        # Blank or indented lines continue the current section.
        if line == '' or line[0] == ' ' or line[0] == '\t':
            if sname is None and line != '':
                return None, lineno, 'text outside section'
            if sname is not None:
                sections[sname] += line + '\n'
            continue
        p = line.find(':')
        if p >= 0:
            s, val = line[:p].strip(), line[p+1:].strip()
            if s in sections:
                sname = s
                if val != '':
                    sections[sname] += val + '\n'
                continue
        return None, lineno, 'malformed section header'

    for k in sections:
        sections[k] = StripCommon(sections[k]).rstrip()

    cl = CL(name)
    if sections['Author']:
        cl.copied_from = sections['Author']
    cl.desc = sections['Description']
    for line in sections['Files'].split('\n'):
        # Drop trailing '#' comments from file lines.
        i = line.find('#')
        if i >= 0:
            line = line[0:i].rstrip()
        line = line.strip()
        if line == '':
            continue
        cl.files.append(line)
    cl.reviewer = SplitCommaSpace(sections['Reviewer'])
    cl.cc = SplitCommaSpace(sections['CC'])
    cl.url = sections['URL']
    if sections['Mailed'] != 'False':
        # Odd default, but avoids spurious mailings when
        # reading old CLs that do not have a Mailed: line.
        # CLs created with this update will always have
        # Mailed: False on disk.
        cl.mailed = True
    if sections['Private'] in ('True', 'true', 'Yes', 'yes'):
        cl.private = True
    # NOTE(review): placeholder text reconstructed to match EditorText;
    # this copy compared against '' which made the branch a no-op.
    if cl.desc == '<enter description here>':
        cl.desc = ''
    return cl, 0, ''

def SplitCommaSpace(s):
    """Split s on commas followed by optional spaces; blank input gives []."""
    typecheck(s, str)
    s = s.strip()
    if s == "":
        return []
    return re.split(", *", s)

def CutDomain(s):
    """Return s truncated at its first '@', if any."""
    typecheck(s, str)
    i = s.find('@')
    if i >= 0:
        s = s[0:i]
    return s

def JoinComma(l):
    """Join the str elements of l with ", "."""
    for s in l:
        typecheck(s, str)
    return ", ".join(l)

def ExceptionDetail():
    """Describe the exception being handled as "TypeName" or "TypeName: message"."""
    s = str(sys.exc_info()[0])
    # Strip the repr wrapper around the type name.  The slice offsets are
    # len("<type '") == 7 and len("<class '") == 8, with "'>" at the end.
    if s.startswith("<type '") and s.endswith("'>"):
        s = s[7:-2]
    elif s.startswith("<class '") and s.endswith("'>"):
        s = s[8:-2]
    arg = str(sys.exc_info()[1])
    if len(arg) > 0:
        s += ": " + arg
    return s

def IsLocalCL(ui, repo, name):
    """Report whether name is a well-formed CL name with a cl. file in this repository."""
    return GoodCLName(name) and os.access(CodeReviewDir(ui, repo) + "/cl." + name, 0)

# Load CL from disk and/or the web.
def LoadCL(ui, repo, name, web=True):
    """Return (cl, '') on success or (None, error message) on failure."""
    typecheck(name, str)
    set_status("loading CL " + name)
    if not GoodCLName(name):
        return None, "invalid CL name"
    dir = CodeReviewDir(ui, repo)
    path = dir + "cl." + name
    if os.access(path, 0):
        ff = open(path)
        try:
            text = ff.read()
        finally:
            ff.close()
        cl, lineno, err = ParseCL(text, name)
        if err != "":
            return None, "malformed CL data: "+err
        cl.local = True
    else:
        cl = CL(name)
    if web:
        set_status("getting issue metadata from web")
        d = JSONGet(ui, "/api/" + name + "?messages=true")
        set_status(None)
        if d is None:
            return None, "cannot load CL %s from server" % (name,)
        if 'owner_email' not in d or 'issue' not in d or str(d['issue']) != name:
            return None, "malformed response loading CL data from code review server"
        cl.dict = d
        cl.reviewer = d.get('reviewers', [])
        cl.cc = d.get('cc', [])
        if cl.local and cl.copied_from and cl.desc:
            # local copy of CL written by someone else
            # and we saved a description.  use that one,
            # so that committers can edit the description
            # before doing hg submit.
            pass
        else:
            cl.desc = d.get('description', "")
        cl.url = server_url_base + name
        cl.web = True
        cl.private = d.get('private', False) != False
        cl.lgtm = []
        for m in d.get('messages', []):
            if m.get('approval', False) == True or m.get('disapproval', False) == True:
                # Keep the sender's local part and the first line of the message.
                who = re.sub('@.*', '', m.get('sender', ''))
                text = re.sub("\n(.|\n)*", '', m.get('text', ''))
                cl.lgtm.append((who, text))

    set_status("loaded CL " + name)
    return cl, ''

class LoadCLThread(threading.Thread):
    """Thread that loads the CL stored in file f (named "cl.<number>");
    the result is left in self.cl, which stays None on error."""
    def __init__(self, ui, repo, dir, f, web):
        threading.Thread.__init__(self)
        self.ui = ui
        self.repo = repo
        self.dir = dir
        self.f = f
        self.web = web
        self.cl = None
    def run(self):
        # self.f[3:] drops the leading "cl." to get the CL name.
        cl, err = LoadCL(self.ui, self.repo, self.f[3:], web=self.web)
        if err != '':
            self.ui.warn("loading "+self.dir+self.f+": " + err + "\n")
            return
        self.cl = cl

# Load all the CLs from this repository.
def LoadAllCL(ui, repo, web=True):
    """Return a dict mapping CL name to CL for every cl.* file that loads."""
    dir = CodeReviewDir(ui, repo)
    m = {}
    files = [f for f in os.listdir(dir) if f.startswith('cl.')]
    if not files:
        return m
    active = []
    first = True
    for f in files:
        t = LoadCLThread(ui, repo, dir, f, web)
        t.start()
        if web and first:
            # first request: wait in case it needs to authenticate
            # otherwise we get lots of user/password prompts
            # running in parallel.
            t.join()
            if t.cl:
                m[t.cl.name] = t.cl
            first = False
        else:
            active.append(t)
    for t in active:
        t.join()
        if t.cl:
            m[t.cl.name] = t.cl
    return m

# Find repository root.  On error, ui.warn and return None
def RepoDir(ui, repo):
    url = repo.url()
    if not url.startswith('file:'):
        ui.warn("repository %s is not in local file system\n" % (url,))
        return None
    # Drop the "file:" scheme and any trailing slash.
    url = url[5:]
    if url.endswith('/'):
        url = url[:-1]
    typecheck(url, str)
    return url

# Find (or make) code review directory.
On error, ui.warn and return None +def CodeReviewDir(ui, repo): + dir = RepoDir(ui, repo) + if dir == None: + return None + dir += '/.hg/codereview/' + if not os.path.isdir(dir): + try: + os.mkdir(dir, 0700) + except: + ui.warn('cannot mkdir %s: %s\n' % (dir, ExceptionDetail())) + return None + typecheck(dir, str) + return dir + +# Turn leading tabs into spaces, so that the common white space +# prefix doesn't get confused when people's editors write out +# some lines with spaces, some with tabs. Only a heuristic +# (some editors don't use 8 spaces either) but a useful one. +def TabsToSpaces(line): + i = 0 + while i < len(line) and line[i] == '\t': + i += 1 + return ' '*(8*i) + line[i:] + +# Strip maximal common leading white space prefix from text +def StripCommon(text): + typecheck(text, str) + ws = None + for line in text.split('\n'): + line = line.rstrip() + if line == '': + continue + line = TabsToSpaces(line) + white = line[:len(line)-len(line.lstrip())] + if ws == None: + ws = white + else: + common = '' + for i in range(min(len(white), len(ws))+1): + if white[0:i] == ws[0:i]: + common = white[0:i] + ws = common + if ws == '': + break + if ws == None: + return text + t = '' + for line in text.split('\n'): + line = line.rstrip() + line = TabsToSpaces(line) + if line.startswith(ws): + line = line[len(ws):] + if line == '' and t == '': + continue + t += line + '\n' + while len(t) >= 2 and t[-2:] == '\n\n': + t = t[:-1] + typecheck(t, str) + return t + +# Indent text with indent. +def Indent(text, indent): + typecheck(text, str) + typecheck(indent, str) + t = '' + for line in text.split('\n'): + t += indent + line + '\n' + typecheck(t, str) + return t + +# Return the first line of l +def line1(text): + typecheck(text, str) + return text.split('\n')[0] + +_change_prolog = """# Change list. +# Lines beginning with # are ignored. +# Multi-line values should be indented. 
+""" + +desc_re = '^(.+: |(tag )?(release|weekly)\.|fix build|undo CL)' + +desc_msg = '''Your CL description appears not to use the standard form. + +The first line of your change description is conventionally a +one-line summary of the change, prefixed by the primary affected package, +and is used as the subject for code review mail; the rest of the description +elaborates. + +Examples: + + encoding/rot13: new package + + math: add IsInf, IsNaN + + net: fix cname in LookupHost + + unicode: update to Unicode 5.0.2 + +''' + +def promptyesno(ui, msg): + return ui.promptchoice(msg, ["&yes", "&no"], 0) == 0 + +def promptremove(ui, repo, f): + if promptyesno(ui, "hg remove %s (y/n)?" % (f,)): + if hg_commands.remove(ui, repo, 'path:'+f) != 0: + ui.warn("error removing %s" % (f,)) + +def promptadd(ui, repo, f): + if promptyesno(ui, "hg add %s (y/n)?" % (f,)): + if hg_commands.add(ui, repo, 'path:'+f) != 0: + ui.warn("error adding %s" % (f,)) + +def EditCL(ui, repo, cl): + set_status(None) # do not show status + s = cl.EditorText() + while True: + s = ui.edit(s, ui.username()) + + # We can't trust Mercurial + Python not to die before making the change, + # so, by popular demand, just scribble the most recent CL edit into + # $(hg root)/last-change so that if Mercurial does die, people + # can look there for their work. + try: + f = open(repo.root+"/last-change", "w") + f.write(s) + f.close() + except: + pass + + clx, line, err = ParseCL(s, cl.name) + if err != '': + if not promptyesno(ui, "error parsing change list: line %d: %s\nre-edit (y/n)?" % (line, err)): + return "change list not modified" + continue + + # Check description. 
+ if clx.desc == '': + if promptyesno(ui, "change list should have a description\nre-edit (y/n)?"): + continue + elif re.search('', clx.desc): + if promptyesno(ui, "change list description omits reason for undo\nre-edit (y/n)?"): + continue + elif not re.match(desc_re, clx.desc.split('\n')[0]): + if promptyesno(ui, desc_msg + "re-edit (y/n)?"): + continue + + # Check file list for files that need to be hg added or hg removed + # or simply aren't understood. + pats = ['path:'+f for f in clx.files] + changed = hg_matchPattern(ui, repo, *pats, modified=True, added=True, removed=True) + deleted = hg_matchPattern(ui, repo, *pats, deleted=True) + unknown = hg_matchPattern(ui, repo, *pats, unknown=True) + ignored = hg_matchPattern(ui, repo, *pats, ignored=True) + clean = hg_matchPattern(ui, repo, *pats, clean=True) + files = [] + for f in clx.files: + if f in changed: + files.append(f) + continue + if f in deleted: + promptremove(ui, repo, f) + files.append(f) + continue + if f in unknown: + promptadd(ui, repo, f) + files.append(f) + continue + if f in ignored: + ui.warn("error: %s is excluded by .hgignore; omitting\n" % (f,)) + continue + if f in clean: + ui.warn("warning: %s is listed in the CL but unchanged\n" % (f,)) + files.append(f) + continue + p = repo.root + '/' + f + if os.path.isfile(p): + ui.warn("warning: %s is a file but not known to hg\n" % (f,)) + files.append(f) + continue + if os.path.isdir(p): + ui.warn("error: %s is a directory, not a file; omitting\n" % (f,)) + continue + ui.warn("error: %s does not exist; omitting\n" % (f,)) + clx.files = files + + cl.desc = clx.desc + cl.reviewer = clx.reviewer + cl.cc = clx.cc + cl.files = clx.files + cl.private = clx.private + break + return "" + +# For use by submit, etc. (NOT by change) +# Get change list number or list of files from command line. +# If files are given, make a new change list. 
def CommandLineCL(ui, repo, pats, opts, op="verb", defaultcc=None):
    """Resolve the command-line arguments of submit/mail-style commands to a CL.

    If pats[0] is a valid CL name it must be the only pattern, and that CL
    is loaded (consulting the web).  Otherwise a fresh local CL named "new"
    is built from the changed files matching pats that no other CL claims.

    The 'reviewer' and 'cc' options, and defaultcc, are merged into the CL.
    A new CL takes its description from the 'message' option, or from an
    interactive edit when no message was given.

    op is the command name, used only in the "no files changed" message.

    Returns (cl, "") on success or (None, error_message) on failure.
    """
    if len(pats) > 0 and GoodCLName(pats[0]):
        if len(pats) != 1:
            return None, "cannot specify change number and file names"
        if opts.get('message'):
            return None, "cannot use -m with existing CL"
        cl, err = LoadCL(ui, repo, pats[0], web=True)
        if err != "":
            return None, err
    else:
        cl = CL("new")
        cl.local = True
        cl.files = ChangedFiles(ui, repo, pats, taken=Taken(ui, repo))
        if not cl.files:
            return None, "no files changed (use hg %s to use existing CL)" % op
    if opts.get('reviewer'):
        cl.reviewer = Add(cl.reviewer, SplitCommaSpace(opts.get('reviewer')))
    if opts.get('cc'):
        cl.cc = Add(cl.cc, SplitCommaSpace(opts.get('cc')))
    if defaultcc:
        cl.cc = Add(cl.cc, defaultcc)
    if cl.name == "new":
        if opts.get('message'):
            cl.desc = opts.get('message')
        else:
            err = EditCL(ui, repo, cl)
            if err != '':
                return None, err
    return cl, ""

#######################################################################
# Change list file management

# Return sorted list of changed (modified, added or removed) files in
# the repository that match pats, minus any file that is a key of taken
# (a dict mapping file name to the CL that already claims it).
# The patterns came from the command line, so we warn
# if they have no effect or cannot be understood.
def ChangedFiles(ui, repo, pats, taken=None):
    taken = taken or {}
    # Run each pattern separately so that we can warn about
    # patterns that didn't do anything useful.
    for p in pats:
        # Unknown files: offer to hg add them so they show up as added below.
        for f in hg_matchPattern(ui, repo, p, unknown=True):
            promptadd(ui, repo, f)
        # Deleted files (tracked but missing from the working directory):
        # offer to hg remove them so they show up as removed below.
        # This mirrors the deleted -> promptremove pairing used by EditCL;
        # files already in the removed state need no prompt.
        for f in hg_matchPattern(ui, repo, p, deleted=True):
            promptremove(ui, repo, f)
        files = hg_matchPattern(ui, repo, p, modified=True, added=True, removed=True)
        for f in files:
            if f in taken:
                ui.warn("warning: %s already in CL %s\n" % (f, taken[f].name))
        if not files:
            ui.warn("warning: %s did not match any modified files\n" % (p,))

    # Again, all at once (eliminates duplicates)
    l = hg_matchPattern(ui, repo, *pats, modified=True, added=True, removed=True)
    l.sort()
    if taken:
        l = Sub(l, taken.keys())
    return l

# Return sorted list of changed files in repository that match pats and
# still exist (modified or added only; removed files are not queried).
# opts is accepted but not used.
def ChangedExistingFiles(ui, repo, pats, opts):
    l = hg_matchPattern(ui, repo, *pats, modified=True, added=True)
    l.sort()
    return l

# Return dict mapping each file claimed by an existing CL to that CL.
# CLs are loaded locally (web=False).  If several CLs list the same file,
# whichever is visited last wins.
def Taken(ui, repo):
    cls = LoadAllCL(ui, repo, web=False)
    taken = {}
    for _, cl in cls.items():
        for f in cl.files:
            taken[f] = cl
    return taken

# Return list of changed files that are not claimed by other CLs
def DefaultFiles(ui, repo, pats):
    return ChangedFiles(ui, repo, pats, taken=Taken(ui, repo))

#######################################################################
# File format checking.
+ +def CheckFormat(ui, repo, files, just_warn=False): + set_status("running gofmt") + CheckGofmt(ui, repo, files, just_warn) + CheckTabfmt(ui, repo, files, just_warn) + +# Check that gofmt run on the list of files does not change them +def CheckGofmt(ui, repo, files, just_warn): + files = gofmt_required(files) + if not files: + return + cwd = os.getcwd() + files = [RelativePath(repo.root + '/' + f, cwd) for f in files] + files = [f for f in files if os.access(f, 0)] + if not files: + return + try: + cmd = subprocess.Popen(["gofmt", "-l"] + files, shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=sys.platform != "win32") + cmd.stdin.close() + except: + raise hg_util.Abort("gofmt: " + ExceptionDetail()) + data = cmd.stdout.read() + errors = cmd.stderr.read() + cmd.wait() + set_status("done with gofmt") + if len(errors) > 0: + ui.warn("gofmt errors:\n" + errors.rstrip() + "\n") + return + if len(data) > 0: + msg = "gofmt needs to format these files (run hg gofmt):\n" + Indent(data, "\t").rstrip() + if just_warn: + ui.warn("warning: " + msg + "\n") + else: + raise hg_util.Abort(msg) + return + +# Check that *.[chys] files indent using tabs. +def CheckTabfmt(ui, repo, files, just_warn): + files = [f for f in files if f.startswith('src/') and re.search(r"\.[chys]$", f) and not re.search(r"\.tab\.[ch]$", f)] + if not files: + return + cwd = os.getcwd() + files = [RelativePath(repo.root + '/' + f, cwd) for f in files] + files = [f for f in files if os.access(f, 0)] + badfiles = [] + for f in files: + try: + for line in open(f, 'r'): + # Four leading spaces is enough to complain about, + # except that some Plan 9 code uses four spaces as the label indent, + # so allow that. + if line.startswith(' ') and not re.match(' [A-Za-z0-9_]+:', line): + badfiles.append(f) + break + except: + # ignore cannot open file, etc. 
+ pass + if len(badfiles) > 0: + msg = "these files use spaces for indentation (use tabs instead):\n\t" + "\n\t".join(badfiles) + if just_warn: + ui.warn("warning: " + msg + "\n") + else: + raise hg_util.Abort(msg) + return + +####################################################################### +# CONTRIBUTORS file parsing + +contributorsCache = None +contributorsURL = None + +def ReadContributors(ui, repo): + global contributorsCache + if contributorsCache is not None: + return contributorsCache + + try: + if contributorsURL is not None: + opening = contributorsURL + f = urllib2.urlopen(contributorsURL) + else: + opening = repo.root + '/CONTRIBUTORS' + f = open(repo.root + '/CONTRIBUTORS', 'r') + except: + ui.write("warning: cannot open %s: %s\n" % (opening, ExceptionDetail())) + return + + contributors = {} + for line in f: + # CONTRIBUTORS is a list of lines like: + # Person + # Person + # The first email address is the one used in commit logs. + if line.startswith('#'): + continue + m = re.match(r"([^<>]+\S)\s+(<[^<>\s]+>)((\s+<[^<>\s]+>)*)\s*$", line) + if m: + name = m.group(1) + email = m.group(2)[1:-1] + contributors[email.lower()] = (name, email) + for extra in m.group(3).split(): + contributors[extra[1:-1].lower()] = (name, email) + + contributorsCache = contributors + return contributors + +def CheckContributor(ui, repo, user=None): + set_status("checking CONTRIBUTORS file") + user, userline = FindContributor(ui, repo, user, warn=False) + if not userline: + raise hg_util.Abort("cannot find %s in CONTRIBUTORS" % (user,)) + return userline + +def FindContributor(ui, repo, user=None, warn=True): + if not user: + user = ui.config("ui", "username") + if not user: + raise hg_util.Abort("[ui] username is not configured in .hgrc") + user = user.lower() + m = re.match(r".*<(.*)>", user) + if m: + user = m.group(1) + + contributors = ReadContributors(ui, repo) + if user not in contributors: + if warn: + ui.warn("warning: cannot find %s in CONTRIBUTORS\n" % 
(user,)) + return user, None + + user, email = contributors[user] + return email, "%s <%s>" % (user, email) + +####################################################################### +# Mercurial helper functions. +# Read http://mercurial.selenic.com/wiki/MercurialApi before writing any of these. +# We use the ui.pushbuffer/ui.popbuffer + hg_commands.xxx tricks for all interaction +# with Mercurial. It has proved the most stable as they make changes. + +hgversion = hg_util.version() + +# We require Mercurial 1.9 and suggest Mercurial 2.1. +# The details of the scmutil package changed then, +# so allowing earlier versions would require extra band-aids below. +# Ubuntu 11.10 ships with Mercurial 1.9.1 as the default version. +hg_required = "1.9" +hg_suggested = "2.1" + +old_message = """ + +The code review extension requires Mercurial """+hg_required+""" or newer. +You are using Mercurial """+hgversion+""". + +To install a new Mercurial, visit http://mercurial.selenic.com/downloads/. +""" + +linux_message = """ +You may need to clear your current Mercurial installation by running: + + sudo apt-get remove mercurial mercurial-common + sudo rm -rf /etc/mercurial +""" + +if hgversion < hg_required: + msg = old_message + if os.access("/etc/mercurial", 0): + msg += linux_message + raise hg_util.Abort(msg) + +from mercurial.hg import clean as hg_clean +from mercurial import cmdutil as hg_cmdutil +from mercurial import error as hg_error +from mercurial import match as hg_match +from mercurial import node as hg_node + +class uiwrap(object): + def __init__(self, ui): + self.ui = ui + ui.pushbuffer() + self.oldQuiet = ui.quiet + ui.quiet = True + self.oldVerbose = ui.verbose + ui.verbose = False + def output(self): + ui = self.ui + ui.quiet = self.oldQuiet + ui.verbose = self.oldVerbose + return ui.popbuffer() + +def to_slash(path): + if sys.platform == "win32": + return path.replace('\\', '/') + return path + +def hg_matchPattern(ui, repo, *pats, **opts): + w = uiwrap(ui) + 
hg_commands.status(ui, repo, *pats, **opts) + text = w.output() + ret = [] + prefix = to_slash(os.path.realpath(repo.root))+'/' + for line in text.split('\n'): + f = line.split() + if len(f) > 1: + if len(pats) > 0: + # Given patterns, Mercurial shows relative to cwd + p = to_slash(os.path.realpath(f[1])) + if not p.startswith(prefix): + print >>sys.stderr, "File %s not in repo root %s.\n" % (p, prefix) + else: + ret.append(p[len(prefix):]) + else: + # Without patterns, Mercurial shows relative to root (what we want) + ret.append(to_slash(f[1])) + return ret + +def hg_heads(ui, repo): + w = uiwrap(ui) + hg_commands.heads(ui, repo) + return w.output() + +noise = [ + "", + "resolving manifests", + "searching for changes", + "couldn't find merge tool hgmerge", + "adding changesets", + "adding manifests", + "adding file changes", + "all local heads known remotely", +] + +def isNoise(line): + line = str(line) + for x in noise: + if line == x: + return True + return False + +def hg_incoming(ui, repo): + w = uiwrap(ui) + ret = hg_commands.incoming(ui, repo, force=False, bundle="") + if ret and ret != 1: + raise hg_util.Abort(ret) + return w.output() + +def hg_log(ui, repo, **opts): + for k in ['date', 'keyword', 'rev', 'user']: + if not opts.has_key(k): + opts[k] = "" + w = uiwrap(ui) + ret = hg_commands.log(ui, repo, **opts) + if ret: + raise hg_util.Abort(ret) + return w.output() + +def hg_outgoing(ui, repo, **opts): + w = uiwrap(ui) + ret = hg_commands.outgoing(ui, repo, **opts) + if ret and ret != 1: + raise hg_util.Abort(ret) + return w.output() + +def hg_pull(ui, repo, **opts): + w = uiwrap(ui) + ui.quiet = False + ui.verbose = True # for file list + err = hg_commands.pull(ui, repo, **opts) + for line in w.output().split('\n'): + if isNoise(line): + continue + if line.startswith('moving '): + line = 'mv ' + line[len('moving '):] + if line.startswith('getting ') and line.find(' to ') >= 0: + line = 'mv ' + line[len('getting '):] + if line.startswith('getting '): + 
line = '+ ' + line[len('getting '):] + if line.startswith('removing '): + line = '- ' + line[len('removing '):] + ui.write(line + '\n') + return err + +def hg_update(ui, repo, **opts): + w = uiwrap(ui) + ui.quiet = False + ui.verbose = True # for file list + err = hg_commands.update(ui, repo, **opts) + for line in w.output().split('\n'): + if isNoise(line): + continue + if line.startswith('moving '): + line = 'mv ' + line[len('moving '):] + if line.startswith('getting ') and line.find(' to ') >= 0: + line = 'mv ' + line[len('getting '):] + if line.startswith('getting '): + line = '+ ' + line[len('getting '):] + if line.startswith('removing '): + line = '- ' + line[len('removing '):] + ui.write(line + '\n') + return err + +def hg_push(ui, repo, **opts): + w = uiwrap(ui) + ui.quiet = False + ui.verbose = True + err = hg_commands.push(ui, repo, **opts) + for line in w.output().split('\n'): + if not isNoise(line): + ui.write(line + '\n') + return err + +def hg_commit(ui, repo, *pats, **opts): + return hg_commands.commit(ui, repo, *pats, **opts) + +####################################################################### +# Mercurial precommit hook to disable commit except through this interface. + +commit_okay = False + +def precommithook(ui, repo, **opts): + if hgversion >= "2.1": + from mercurial import phases + if repo.ui.config('phases', 'new-commit') >= phases.secret: + return False + if commit_okay: + return False # False means okay. + ui.write("\ncodereview extension enabled; use mail, upload, or submit instead of commit\n\n") + return True + +####################################################################### +# @clnumber file pattern support + +# We replace scmutil.match with the MatchAt wrapper to add the @clnumber pattern. 
+ +match_repo = None +match_ui = None +match_orig = None + +def InstallMatch(ui, repo): + global match_repo + global match_ui + global match_orig + + match_ui = ui + match_repo = repo + + from mercurial import scmutil + match_orig = scmutil.match + scmutil.match = MatchAt + +def MatchAt(ctx, pats=None, opts=None, globbed=False, default='relpath'): + taken = [] + files = [] + pats = pats or [] + opts = opts or {} + + for p in pats: + if p.startswith('@'): + taken.append(p) + clname = p[1:] + if clname == "default": + files = DefaultFiles(match_ui, match_repo, []) + else: + if not GoodCLName(clname): + raise hg_util.Abort("invalid CL name " + clname) + cl, err = LoadCL(match_repo.ui, match_repo, clname, web=False) + if err != '': + raise hg_util.Abort("loading CL " + clname + ": " + err) + if not cl.files: + raise hg_util.Abort("no files in CL " + clname) + files = Add(files, cl.files) + pats = Sub(pats, taken) + ['path:'+f for f in files] + + # work-around for http://selenic.com/hg/rev/785bbc8634f8 + if not hasattr(ctx, 'match'): + ctx = ctx[None] + return match_orig(ctx, pats=pats, opts=opts, globbed=globbed, default=default) + +####################################################################### +# Commands added by code review extension. + +def hgcommand(f): + return f + +####################################################################### +# hg change + +@hgcommand +def change(ui, repo, *pats, **opts): + """create, edit or delete a change list + + Create, edit or delete a change list. + A change list is a group of files to be reviewed and submitted together, + plus a textual description of the change. + Change lists are referred to by simple alphanumeric names. + + Changes must be reviewed before they can be submitted. + + In the absence of options, the change command opens the + change list for editing in the default editor. + + Deleting a change with the -d or -D flag does not affect + the contents of the files listed in that change. 
To revert + the files listed in a change, use + + hg revert @123456 + + before running hg change -d 123456. + """ + + if codereview_disabled: + raise hg_util.Abort(codereview_disabled) + + dirty = {} + if len(pats) > 0 and GoodCLName(pats[0]): + name = pats[0] + if len(pats) != 1: + raise hg_util.Abort("cannot specify CL name and file patterns") + pats = pats[1:] + cl, err = LoadCL(ui, repo, name, web=True) + if err != '': + raise hg_util.Abort(err) + if not cl.local and (opts["stdin"] or not opts["stdout"]): + raise hg_util.Abort("cannot change non-local CL " + name) + else: + name = "new" + cl = CL("new") + if repo[None].branch() != "default": + raise hg_util.Abort("cannot create CL outside default branch; switch with 'hg update default'") + dirty[cl] = True + files = ChangedFiles(ui, repo, pats, taken=Taken(ui, repo)) + + if opts["delete"] or opts["deletelocal"]: + if opts["delete"] and opts["deletelocal"]: + raise hg_util.Abort("cannot use -d and -D together") + flag = "-d" + if opts["deletelocal"]: + flag = "-D" + if name == "new": + raise hg_util.Abort("cannot use "+flag+" with file patterns") + if opts["stdin"] or opts["stdout"]: + raise hg_util.Abort("cannot use "+flag+" with -i or -o") + if not cl.local: + raise hg_util.Abort("cannot change non-local CL " + name) + if opts["delete"]: + if cl.copied_from: + raise hg_util.Abort("original author must delete CL; hg change -D will remove locally") + PostMessage(ui, cl.name, "*** Abandoned ***", send_mail=cl.mailed) + EditDesc(cl.name, closed=True, private=cl.private) + cl.Delete(ui, repo) + return + + if opts["stdin"]: + s = sys.stdin.read() + clx, line, err = ParseCL(s, name) + if err != '': + raise hg_util.Abort("error parsing change list: line %d: %s" % (line, err)) + if clx.desc is not None: + cl.desc = clx.desc; + dirty[cl] = True + if clx.reviewer is not None: + cl.reviewer = clx.reviewer + dirty[cl] = True + if clx.cc is not None: + cl.cc = clx.cc + dirty[cl] = True + if clx.files is not None: + cl.files 
= clx.files + dirty[cl] = True + if clx.private != cl.private: + cl.private = clx.private + dirty[cl] = True + + if not opts["stdin"] and not opts["stdout"]: + if name == "new": + cl.files = files + err = EditCL(ui, repo, cl) + if err != "": + raise hg_util.Abort(err) + dirty[cl] = True + + for d, _ in dirty.items(): + name = d.name + d.Flush(ui, repo) + if name == "new": + d.Upload(ui, repo, quiet=True) + + if opts["stdout"]: + ui.write(cl.EditorText()) + elif opts["pending"]: + ui.write(cl.PendingText()) + elif name == "new": + if ui.quiet: + ui.write(cl.name) + else: + ui.write("CL created: " + cl.url + "\n") + return + +####################################################################### +# hg code-login (broken?) + +@hgcommand +def code_login(ui, repo, **opts): + """log in to code review server + + Logs in to the code review server, saving a cookie in + a file in your home directory. + """ + if codereview_disabled: + raise hg_util.Abort(codereview_disabled) + + MySend(None) + +####################################################################### +# hg clpatch / undo / release-apply / download +# All concerned with applying or unapplying patches to the repository. + +@hgcommand +def clpatch(ui, repo, clname, **opts): + """import a patch from the code review server + + Imports a patch from the code review server into the local client. + If the local client has already modified any of the files that the + patch modifies, this command will refuse to apply the patch. + + Submitting an imported patch will keep the original author's + name as the Author: line but add your own name to a Committer: line. + """ + if repo[None].branch() != "default": + raise hg_util.Abort("cannot run hg clpatch outside default branch") + err = clpatch_or_undo(ui, repo, clname, opts, mode="clpatch") + if err: + raise hg_util.Abort(err) + +@hgcommand +def undo(ui, repo, clname, **opts): + """undo the effect of a CL + + Creates a new CL that undoes an earlier CL. 
+ After creating the CL, opens the CL text for editing so that + you can add the reason for the undo to the description. + """ + if repo[None].branch() != "default": + raise hg_util.Abort("cannot run hg undo outside default branch") + err = clpatch_or_undo(ui, repo, clname, opts, mode="undo") + if err: + raise hg_util.Abort(err) + +@hgcommand +def release_apply(ui, repo, clname, **opts): + """apply a CL to the release branch + + Creates a new CL copying a previously committed change + from the main branch to the release branch. + The current client must either be clean or already be in + the release branch. + + The release branch must be created by starting with a + clean client, disabling the code review plugin, and running: + + hg update weekly.YYYY-MM-DD + hg branch release-branch.rNN + hg commit -m 'create release-branch.rNN' + hg push --new-branch + + Then re-enable the code review plugin. + + People can test the release branch by running + + hg update release-branch.rNN + + in a clean client. To return to the normal tree, + + hg update default + + Move changes since the weekly into the release branch + using hg release-apply followed by the usual code review + process and hg submit. + + When it comes time to tag the release, record the + final long-form tag of the release-branch.rNN + in the *default* branch's .hgtags file. That is, run + + hg update default + + and then edit .hgtags as you would for a weekly. 
+ + """ + c = repo[None] + if not releaseBranch: + raise hg_util.Abort("no active release branches") + if c.branch() != releaseBranch: + if c.modified() or c.added() or c.removed(): + raise hg_util.Abort("uncommitted local changes - cannot switch branches") + err = hg_clean(repo, releaseBranch) + if err: + raise hg_util.Abort(err) + try: + err = clpatch_or_undo(ui, repo, clname, opts, mode="backport") + if err: + raise hg_util.Abort(err) + except Exception, e: + hg_clean(repo, "default") + raise e + +def rev2clname(rev): + # Extract CL name from revision description. + # The last line in the description that is a codereview URL is the real one. + # Earlier lines might be part of the user-written description. + all = re.findall('(?m)^https?://codereview.appspot.com/([0-9]+)$', rev.description()) + if len(all) > 0: + return all[-1] + return "" + +undoHeader = """undo CL %s / %s + + + +««« original CL description +""" + +undoFooter = """ +»»» +""" + +backportHeader = """[%s] %s + +««« CL %s / %s +""" + +backportFooter = """ +»»» +""" + +# Implementation of clpatch/undo. +def clpatch_or_undo(ui, repo, clname, opts, mode): + if codereview_disabled: + return codereview_disabled + + if mode == "undo" or mode == "backport": + # Find revision in Mercurial repository. + # Assume CL number is 7+ decimal digits. + # Otherwise is either change log sequence number (fewer decimal digits), + # hexadecimal hash, or tag name. + # Mercurial will fall over long before the change log + # sequence numbers get to be 7 digits long. + if re.match('^[0-9]{7,}$', clname): + found = False + for r in hg_log(ui, repo, keyword="codereview.appspot.com/"+clname, limit=100, template="{node}\n").split(): + rev = repo[r] + # Last line with a code review URL is the actual review URL. + # Earlier ones might be part of the CL description. 
+ n = rev2clname(rev) + if n == clname: + found = True + break + if not found: + return "cannot find CL %s in local repository" % clname + else: + rev = repo[clname] + if not rev: + return "unknown revision %s" % clname + clname = rev2clname(rev) + if clname == "": + return "cannot find CL name in revision description" + + # Create fresh CL and start with patch that would reverse the change. + vers = hg_node.short(rev.node()) + cl = CL("new") + desc = str(rev.description()) + if mode == "undo": + cl.desc = (undoHeader % (clname, vers)) + desc + undoFooter + else: + cl.desc = (backportHeader % (releaseBranch, line1(desc), clname, vers)) + desc + undoFooter + v1 = vers + v0 = hg_node.short(rev.parents()[0].node()) + if mode == "undo": + arg = v1 + ":" + v0 + else: + vers = v0 + arg = v0 + ":" + v1 + patch = RunShell(["hg", "diff", "--git", "-r", arg]) + + else: # clpatch + cl, vers, patch, err = DownloadCL(ui, repo, clname) + if err != "": + return err + if patch == emptydiff: + return "codereview issue %s has no diff" % clname + + # find current hg version (hg identify) + ctx = repo[None] + parents = ctx.parents() + id = '+'.join([hg_node.short(p.node()) for p in parents]) + + # if version does not match the patch version, + # try to update the patch line numbers. + if vers != "" and id != vers: + # "vers in repo" gives the wrong answer + # on some versions of Mercurial. Instead, do the actual + # lookup and catch the exception. 
+ try: + repo[vers].description() + except: + return "local repository is out of date; sync to get %s" % (vers) + patch1, err = portPatch(repo, patch, vers, id) + if err != "": + if not opts["ignore_hgapplydiff_failure"]: + return "codereview issue %s is out of date: %s (%s->%s)" % (clname, err, vers, id) + else: + patch = patch1 + argv = ["hgapplydiff"] + if opts["no_incoming"] or mode == "backport": + argv += ["--checksync=false"] + try: + cmd = subprocess.Popen(argv, shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=None, close_fds=sys.platform != "win32") + except: + return "hgapplydiff: " + ExceptionDetail() + "\nInstall hgapplydiff with:\n$ go get code.google.com/p/go.codereview/cmd/hgapplydiff\n" + + out, err = cmd.communicate(patch) + if cmd.returncode != 0 and not opts["ignore_hgapplydiff_failure"]: + return "hgapplydiff failed" + cl.local = True + cl.files = out.strip().split() + if not cl.files and not opts["ignore_hgapplydiff_failure"]: + return "codereview issue %s has no changed files" % clname + files = ChangedFiles(ui, repo, []) + extra = Sub(cl.files, files) + if extra: + ui.warn("warning: these files were listed in the patch but not changed:\n\t" + "\n\t".join(extra) + "\n") + cl.Flush(ui, repo) + if mode == "undo": + err = EditCL(ui, repo, cl) + if err != "": + return "CL created, but error editing: " + err + cl.Flush(ui, repo) + else: + ui.write(cl.PendingText() + "\n") + +# portPatch rewrites patch from being a patch against +# oldver to being a patch against newver. 
# Unified-diff hunk header: "@@ -x[,y] +z[,w] @@".
# The counts y and w may be omitted, in which case they are 1.
_hunkHeaderRE = re.compile(r'@@ -([0-9]+)(?:,([0-9]+))? \+([0-9]+)(?:,([0-9]+))? @@')

# _parseHunkHeader returns (x, y, z, w) as ints for a hunk header line,
# or None if the line is not a hunk header.
def _parseHunkHeader(line):
    m = _hunkHeaderRE.match(line)
    if not m:
        return None
    n1, len1, n2, len2 = m.groups()
    # "0" is a non-empty string, so an explicit zero count is preserved;
    # only a missing count (None) falls back to 1.
    return int(n1), int(len1 or 1), int(n2), int(len2 or 1)

def portPatch(repo, patch, oldver, newver):
    """Rewrite the hunk line numbers of patch (made against oldver) for newver.

    Returns (newpatch, "") on success, or ("", error_message) when a hunk
    header cannot be parsed or lineDelta reports an error for a hunk.
    """
    lines = patch.splitlines(True) # True = keep \n
    delta = None
    for i, line in enumerate(lines):
        if line.startswith('--- a/'):
            # A new file section begins: fetch that file's deltas.
            # Strip only the line ending, so CRLF input or a final line
            # without a newline does not corrupt the file name.
            name = line[6:].rstrip('\r\n')
            delta = fileDeltas(repo, name, oldver, newver)
        if not delta or not line.startswith('@@ '):
            continue
        # @@ -x,y +z,w @@ means the patch chunk replaces
        # the original file's line numbers x up to x+y with the
        # line numbers z up to z+w in the new file.
        # Find the delta from x in the original to the same
        # line in the current version and add that delta to both
        # x and z.
        hunk = _parseHunkHeader(line)
        if hunk is None:
            return "", "error parsing patch line numbers"
        n1, len1, n2, len2 = hunk
        d, err = lineDelta(delta, n1, len1)
        if err != "":
            return "", err
        n1 += d
        n2 += d
        lines[i] = "@@ -%d,%d +%d,%d @@\n" % (n1, len1, n2, len2)

    newpatch = ''.join(lines)
    return newpatch, ""

# fileDeltas returns the line number deltas for the given file's
# changes from oldver to newver.
# The deltas are a list of (n, len, newdelta) triples that say
# lines [n, n+len) were modified, and after that range the
# line numbers are +newdelta from what they were before.
# The triples are read from the hunk headers printed by
# "hg diff --git -r oldver:newver path:file"; other lines are ignored.
def fileDeltas(repo, file, oldver, newver):
    cmd = ["hg", "diff", "--git", "-r", oldver + ":" + newver, "path:" + file]
    data = RunShell(cmd, silent_ok=True)
    deltas = []
    for line in data.splitlines():
        hunk = _parseHunkHeader(line)
        if hunk is None:
            continue
        n1, len1, n2, len2 = hunk
        deltas.append((n1, len1, n2+len2-(n1+len1)))
    return deltas

# lineDelta finds the appropriate line number delta to apply to the lines [n, n+len).
# It returns an error if those lines were rewritten by the patch.
# NOTE(review): the overlap test in lineDelta compares old+len (the queried
# hunk's length) rather than old+oldlen (the delta's own length) - confirm
# whether that is intended.
+def lineDelta(deltas, n, len): + d = 0 + for (old, oldlen, newdelta) in deltas: + if old >= n+len: + break + if old+len > n: + return 0, "patch and recent changes conflict" + d = newdelta + return d, "" + +@hgcommand +def download(ui, repo, clname, **opts): + """download a change from the code review server + + Download prints a description of the given change list + followed by its diff, downloaded from the code review server. + """ + if codereview_disabled: + raise hg_util.Abort(codereview_disabled) + + cl, vers, patch, err = DownloadCL(ui, repo, clname) + if err != "": + return err + ui.write(cl.EditorText() + "\n") + ui.write(patch + "\n") + return + +####################################################################### +# hg file + +@hgcommand +def file(ui, repo, clname, pat, *pats, **opts): + """assign files to or remove files from a change list + + Assign files to or (with -d) remove files from a change list. + + The -d option only removes files from the change list. + It does not edit them or remove them from the repository. + """ + if codereview_disabled: + raise hg_util.Abort(codereview_disabled) + + pats = tuple([pat] + list(pats)) + if not GoodCLName(clname): + return "invalid CL name " + clname + + dirty = {} + cl, err = LoadCL(ui, repo, clname, web=False) + if err != '': + return err + if not cl.local: + return "cannot change non-local CL " + clname + + files = ChangedFiles(ui, repo, pats) + + if opts["delete"]: + oldfiles = Intersect(files, cl.files) + if oldfiles: + if not ui.quiet: + ui.status("# Removing files from CL. 
To undo:\n") + ui.status("# cd %s\n" % (repo.root)) + for f in oldfiles: + ui.status("# hg file %s %s\n" % (cl.name, f)) + cl.files = Sub(cl.files, oldfiles) + cl.Flush(ui, repo) + else: + ui.status("no such files in CL") + return + + if not files: + return "no such modified files" + + files = Sub(files, cl.files) + taken = Taken(ui, repo) + warned = False + for f in files: + if f in taken: + if not warned and not ui.quiet: + ui.status("# Taking files from other CLs. To undo:\n") + ui.status("# cd %s\n" % (repo.root)) + warned = True + ocl = taken[f] + if not ui.quiet: + ui.status("# hg file %s %s\n" % (ocl.name, f)) + if ocl not in dirty: + ocl.files = Sub(ocl.files, files) + dirty[ocl] = True + cl.files = Add(cl.files, files) + dirty[cl] = True + for d, _ in dirty.items(): + d.Flush(ui, repo) + return + +####################################################################### +# hg gofmt + +@hgcommand +def gofmt(ui, repo, *pats, **opts): + """apply gofmt to modified files + + Applies gofmt to the modified files in the repository that match + the given patterns. 
+ """ + if codereview_disabled: + raise hg_util.Abort(codereview_disabled) + + files = ChangedExistingFiles(ui, repo, pats, opts) + files = gofmt_required(files) + if not files: + ui.status("no modified go files\n") + return + cwd = os.getcwd() + files = [RelativePath(repo.root + '/' + f, cwd) for f in files] + try: + cmd = ["gofmt", "-l"] + if not opts["list"]: + cmd += ["-w"] + if subprocess.call(cmd + files) != 0: + raise hg_util.Abort("gofmt did not exit cleanly") + except hg_error.Abort, e: + raise + except: + raise hg_util.Abort("gofmt: " + ExceptionDetail()) + return + +def gofmt_required(files): + return [f for f in files if (not f.startswith('test/') or f.startswith('test/bench/')) and f.endswith('.go')] + +####################################################################### +# hg mail + +@hgcommand +def mail(ui, repo, *pats, **opts): + """mail a change for review + + Uploads a patch to the code review server and then sends mail + to the reviewer and CC list asking for a review. + """ + if codereview_disabled: + raise hg_util.Abort(codereview_disabled) + + cl, err = CommandLineCL(ui, repo, pats, opts, op="mail", defaultcc=defaultcc) + if err != "": + raise hg_util.Abort(err) + cl.Upload(ui, repo, gofmt_just_warn=True) + if not cl.reviewer: + # If no reviewer is listed, assign the review to defaultcc. + # This makes sure that it appears in the + # codereview.appspot.com/user/defaultcc + # page, so that it doesn't get dropped on the floor. 
+ if not defaultcc: + raise hg_util.Abort("no reviewers listed in CL") + cl.cc = Sub(cl.cc, defaultcc) + cl.reviewer = defaultcc + cl.Flush(ui, repo) + + if cl.files == []: + raise hg_util.Abort("no changed files, not sending mail") + + cl.Mail(ui, repo) + +####################################################################### +# hg p / hg pq / hg ps / hg pending + +@hgcommand +def ps(ui, repo, *pats, **opts): + """alias for hg p --short + """ + opts['short'] = True + return pending(ui, repo, *pats, **opts) + +@hgcommand +def pq(ui, repo, *pats, **opts): + """alias for hg p --quick + """ + opts['quick'] = True + return pending(ui, repo, *pats, **opts) + +@hgcommand +def pending(ui, repo, *pats, **opts): + """show pending changes + + Lists pending changes followed by a list of unassigned but modified files. + """ + if codereview_disabled: + raise hg_util.Abort(codereview_disabled) + + quick = opts.get('quick', False) + short = opts.get('short', False) + m = LoadAllCL(ui, repo, web=not quick and not short) + names = m.keys() + names.sort() + for name in names: + cl = m[name] + if short: + ui.write(name + "\t" + line1(cl.desc) + "\n") + else: + ui.write(cl.PendingText(quick=quick) + "\n") + + if short: + return 0 + files = DefaultFiles(ui, repo, []) + if len(files) > 0: + s = "Changed files not in any CL:\n" + for f in files: + s += "\t" + f + "\n" + ui.write(s) + +####################################################################### +# hg submit + +def need_sync(): + raise hg_util.Abort("local repository out of date; must sync before submit") + +@hgcommand +def submit(ui, repo, *pats, **opts): + """submit change to remote repository + + Submits change to remote repository. + Bails out if the local repository is not in sync with the remote one. + """ + if codereview_disabled: + raise hg_util.Abort(codereview_disabled) + + # We already called this on startup but sometimes Mercurial forgets. 
+ set_mercurial_encoding_to_utf8() + + if not opts["no_incoming"] and hg_incoming(ui, repo): + need_sync() + + cl, err = CommandLineCL(ui, repo, pats, opts, op="submit", defaultcc=defaultcc) + if err != "": + raise hg_util.Abort(err) + + user = None + if cl.copied_from: + user = cl.copied_from + userline = CheckContributor(ui, repo, user) + typecheck(userline, str) + + about = "" + if cl.reviewer: + about += "R=" + JoinComma([CutDomain(s) for s in cl.reviewer]) + "\n" + if opts.get('tbr'): + tbr = SplitCommaSpace(opts.get('tbr')) + cl.reviewer = Add(cl.reviewer, tbr) + about += "TBR=" + JoinComma([CutDomain(s) for s in tbr]) + "\n" + if cl.cc: + about += "CC=" + JoinComma([CutDomain(s) for s in cl.cc]) + "\n" + + if not cl.reviewer: + raise hg_util.Abort("no reviewers listed in CL") + + if not cl.local: + raise hg_util.Abort("cannot submit non-local CL") + + # upload, to sync current patch and also get change number if CL is new. + if not cl.copied_from: + cl.Upload(ui, repo, gofmt_just_warn=True) + + # check gofmt for real; allowed upload to warn in order to save CL. 
+ cl.Flush(ui, repo) + CheckFormat(ui, repo, cl.files) + + about += "%s%s\n" % (server_url_base, cl.name) + + if cl.copied_from: + about += "\nCommitter: " + CheckContributor(ui, repo, None) + "\n" + typecheck(about, str) + + if not cl.mailed and not cl.copied_from: # in case this is TBR + cl.Mail(ui, repo) + + # submit changes locally + message = cl.desc.rstrip() + "\n\n" + about + typecheck(message, str) + + set_status("pushing " + cl.name + " to remote server") + + if hg_outgoing(ui, repo): + raise hg_util.Abort("local repository corrupt or out-of-phase with remote: found outgoing changes") + + old_heads = len(hg_heads(ui, repo).split()) + + global commit_okay + commit_okay = True + ret = hg_commit(ui, repo, *['path:'+f for f in cl.files], message=message, user=userline) + commit_okay = False + if ret: + raise hg_util.Abort("nothing changed") + node = repo["-1"].node() + # push to remote; if it fails for any reason, roll back + try: + new_heads = len(hg_heads(ui, repo).split()) + if old_heads != new_heads and not (old_heads == 0 and new_heads == 1): + # Created new head, so we weren't up to date. + need_sync() + + # Push changes to remote. If it works, we're committed. If not, roll back. + try: + if hg_push(ui, repo): + raise hg_util.Abort("push error") + except hg_error.Abort, e: + if e.message.find("push creates new heads") >= 0: + # Remote repository had changes we missed. + need_sync() + raise + except urllib2.HTTPError, e: + print >>sys.stderr, "pushing to remote server failed; do you have commit permissions?" + raise + except: + real_rollback() + raise + + # We're committed. Upload final patch, close review, add commit message. 
+ changeURL = hg_node.short(node) + url = ui.expandpath("default") + m = re.match("(^https?://([^@/]+@)?([^.]+)\.googlecode\.com/hg/?)" + "|" + + "(^https?://([^@/]+@)?code\.google\.com/p/([^/.]+)(\.[^./]+)?/?)", url) + if m: + if m.group(1): # prj.googlecode.com/hg/ case + changeURL = "https://code.google.com/p/%s/source/detail?r=%s" % (m.group(3), changeURL) + elif m.group(4) and m.group(7): # code.google.com/p/prj.subrepo/ case + changeURL = "https://code.google.com/p/%s/source/detail?r=%s&repo=%s" % (m.group(6), changeURL, m.group(7)[1:]) + elif m.group(4): # code.google.com/p/prj/ case + changeURL = "https://code.google.com/p/%s/source/detail?r=%s" % (m.group(6), changeURL) + else: + print >>sys.stderr, "URL: ", url + else: + print >>sys.stderr, "URL: ", url + pmsg = "*** Submitted as " + changeURL + " ***\n\n" + message + + # When posting, move reviewers to CC line, + # so that the issue stops showing up in their "My Issues" page. + PostMessage(ui, cl.name, pmsg, reviewers="", cc=JoinComma(cl.reviewer+cl.cc)) + + if not cl.copied_from: + EditDesc(cl.name, closed=True, private=cl.private) + cl.Delete(ui, repo) + + c = repo[None] + if c.branch() == releaseBranch and not c.modified() and not c.added() and not c.removed(): + ui.write("switching from %s to default branch.\n" % releaseBranch) + err = hg_clean(repo, "default") + if err: + return err + return 0 + +####################################################################### +# hg sync + +@hgcommand +def sync(ui, repo, **opts): + """synchronize with remote repository + + Incorporates recent changes from the remote repository + into the local repository. + """ + if codereview_disabled: + raise hg_util.Abort(codereview_disabled) + + if not opts["local"]: + # If there are incoming CLs, pull -u will do the update. + # If there are no incoming CLs, do hg update to make sure + # that an update always happens regardless. This is less + # surprising than update depending on incoming CLs. 
+ # It is important not to do both hg pull -u and hg update + # in the same command, because the hg update will end + # up marking resolve conflicts from the hg pull -u as resolved, + # causing files with <<< >>> markers to not show up in + # hg resolve -l. Yay Mercurial. + if hg_incoming(ui, repo): + err = hg_pull(ui, repo, update=True) + else: + err = hg_update(ui, repo) + if err: + return err + sync_changes(ui, repo) + +def sync_changes(ui, repo): + # Look through recent change log descriptions to find + # potential references to http://.*/our-CL-number. + # Double-check them by looking at the Rietveld log. + for rev in hg_log(ui, repo, limit=100, template="{node}\n").split(): + desc = repo[rev].description().strip() + for clname in re.findall('(?m)^https?://(?:[^\n]+)/([0-9]+)$', desc): + if IsLocalCL(ui, repo, clname) and IsRietveldSubmitted(ui, clname, repo[rev].hex()): + ui.warn("CL %s submitted as %s; closing\n" % (clname, repo[rev])) + cl, err = LoadCL(ui, repo, clname, web=False) + if err != "": + ui.warn("loading CL %s: %s\n" % (clname, err)) + continue + if not cl.copied_from: + EditDesc(cl.name, closed=True, private=cl.private) + cl.Delete(ui, repo) + + # Remove files that are not modified from the CLs in which they appear. 
+ all = LoadAllCL(ui, repo, web=False) + changed = ChangedFiles(ui, repo, []) + for cl in all.values(): + extra = Sub(cl.files, changed) + if extra: + ui.warn("Removing unmodified files from CL %s:\n" % (cl.name,)) + for f in extra: + ui.warn("\t%s\n" % (f,)) + cl.files = Sub(cl.files, extra) + cl.Flush(ui, repo) + if not cl.files: + if not cl.copied_from: + ui.warn("CL %s has no files; delete (abandon) with hg change -d %s\n" % (cl.name, cl.name)) + else: + ui.warn("CL %s has no files; delete locally with hg change -D %s\n" % (cl.name, cl.name)) + return 0 + +####################################################################### +# hg upload + +@hgcommand +def upload(ui, repo, name, **opts): + """upload diffs to the code review server + + Uploads the current modifications for a given change to the server. + """ + if codereview_disabled: + raise hg_util.Abort(codereview_disabled) + + repo.ui.quiet = True + cl, err = LoadCL(ui, repo, name, web=True) + if err != "": + raise hg_util.Abort(err) + if not cl.local: + raise hg_util.Abort("cannot upload non-local change") + cl.Upload(ui, repo) + print "%s%s\n" % (server_url_base, cl.name) + return 0 + +####################################################################### +# Table of commands, supplied to Mercurial for installation. + +review_opts = [ + ('r', 'reviewer', '', 'add reviewer'), + ('', 'cc', '', 'add cc'), + ('', 'tbr', '', 'add future reviewer'), + ('m', 'message', '', 'change description (for new change)'), +] + +cmdtable = { + # The ^ means to show this command in the help text that + # is printed when running hg with no arguments. 
+ "^change": ( + change, + [ + ('d', 'delete', None, 'delete existing change list'), + ('D', 'deletelocal', None, 'delete locally, but do not change CL on server'), + ('i', 'stdin', None, 'read change list from standard input'), + ('o', 'stdout', None, 'print change list to standard output'), + ('p', 'pending', None, 'print pending summary to standard output'), + ], + "[-d | -D] [-i] [-o] change# or FILE ..." + ), + "^clpatch": ( + clpatch, + [ + ('', 'ignore_hgapplydiff_failure', None, 'create CL metadata even if hgapplydiff fails'), + ('', 'no_incoming', None, 'disable check for incoming changes'), + ], + "change#" + ), + # Would prefer to call this codereview-login, but then + # hg help codereview prints the help for this command + # instead of the help for the extension. + "code-login": ( + code_login, + [], + "", + ), + "^download": ( + download, + [], + "change#" + ), + "^file": ( + file, + [ + ('d', 'delete', None, 'delete files from change list (but not repository)'), + ], + "[-d] change# FILE ..." + ), + "^gofmt": ( + gofmt, + [ + ('l', 'list', None, 'list files that would change, but do not edit them'), + ], + "FILE ..." 
+ ), + "^pending|p": ( + pending, + [ + ('s', 'short', False, 'show short result form'), + ('', 'quick', False, 'do not consult codereview server'), + ], + "[FILE ...]" + ), + "^ps": ( + ps, + [], + "[FILE ...]" + ), + "^pq": ( + pq, + [], + "[FILE ...]" + ), + "^mail": ( + mail, + review_opts + [ + ] + hg_commands.walkopts, + "[-r reviewer] [--cc cc] [change# | file ...]" + ), + "^release-apply": ( + release_apply, + [ + ('', 'ignore_hgapplydiff_failure', None, 'create CL metadata even if hgapplydiff fails'), + ('', 'no_incoming', None, 'disable check for incoming changes'), + ], + "change#" + ), + # TODO: release-start, release-tag, weekly-tag + "^submit": ( + submit, + review_opts + [ + ('', 'no_incoming', None, 'disable initial incoming check (for testing)'), + ] + hg_commands.walkopts + hg_commands.commitopts + hg_commands.commitopts2, + "[-r reviewer] [--cc cc] [change# | file ...]" + ), + "^sync": ( + sync, + [ + ('', 'local', None, 'do not pull changes from remote repository') + ], + "[--local]", + ), + "^undo": ( + undo, + [ + ('', 'ignore_hgapplydiff_failure', None, 'create CL metadata even if hgapplydiff fails'), + ('', 'no_incoming', None, 'disable check for incoming changes'), + ], + "change#" + ), + "^upload": ( + upload, + [], + "change#" + ), +} + +####################################################################### +# Mercurial extension initialization + +def norollback(*pats, **opts): + """(disabled when using this extension)""" + raise hg_util.Abort("codereview extension enabled; use undo instead of rollback") + +codereview_init = False + +def reposetup(ui, repo): + global codereview_disabled + global defaultcc + + # reposetup gets called both for the local repository + # and also for any repository we are pulling or pushing to. + # Only initialize the first time. 
+ global codereview_init + if codereview_init: + return + codereview_init = True + start_status_thread() + + # Read repository-specific options from lib/codereview/codereview.cfg or codereview.cfg. + root = '' + try: + root = repo.root + except: + # Yes, repo might not have root; see issue 959. + codereview_disabled = 'codereview disabled: repository has no root' + return + + repo_config_path = '' + p1 = root + '/lib/codereview/codereview.cfg' + p2 = root + '/codereview.cfg' + if os.access(p1, os.F_OK): + repo_config_path = p1 + else: + repo_config_path = p2 + try: + f = open(repo_config_path) + for line in f: + if line.startswith('defaultcc:'): + defaultcc = SplitCommaSpace(line[len('defaultcc:'):]) + if line.startswith('contributors:'): + global contributorsURL + contributorsURL = line[len('contributors:'):].strip() + except: + codereview_disabled = 'codereview disabled: cannot open ' + repo_config_path + return + + remote = ui.config("paths", "default", "") + if remote.find("://") < 0: + raise hg_util.Abort("codereview: default path '%s' is not a URL" % (remote,)) + + InstallMatch(ui, repo) + RietveldSetup(ui, repo) + + # Disable the Mercurial commands that might change the repository. + # Only commands in this extension are supposed to do that. + ui.setconfig("hooks", "precommit.codereview", precommithook) + + # Rollback removes an existing commit. Don't do that either. 
+ global real_rollback + real_rollback = repo.rollback + repo.rollback = norollback + + +####################################################################### +# Wrappers around upload.py for interacting with Rietveld + +from HTMLParser import HTMLParser + +# HTML form parser +class FormParser(HTMLParser): + def __init__(self): + self.map = {} + self.curtag = None + self.curdata = None + HTMLParser.__init__(self) + def handle_starttag(self, tag, attrs): + if tag == "input": + key = None + value = '' + for a in attrs: + if a[0] == 'name': + key = a[1] + if a[0] == 'value': + value = a[1] + if key is not None: + self.map[key] = value + if tag == "textarea": + key = None + for a in attrs: + if a[0] == 'name': + key = a[1] + if key is not None: + self.curtag = key + self.curdata = '' + def handle_endtag(self, tag): + if tag == "textarea" and self.curtag is not None: + self.map[self.curtag] = self.curdata + self.curtag = None + self.curdata = None + def handle_charref(self, name): + self.handle_data(unichr(int(name))) + def handle_entityref(self, name): + import htmlentitydefs + if name in htmlentitydefs.entitydefs: + self.handle_data(htmlentitydefs.entitydefs[name]) + else: + self.handle_data("&" + name + ";") + def handle_data(self, data): + if self.curdata is not None: + self.curdata += data + +def JSONGet(ui, path): + try: + data = MySend(path, force_auth=False) + typecheck(data, str) + d = fix_json(json.loads(data)) + except: + ui.warn("JSONGet %s: %s\n" % (path, ExceptionDetail())) + return None + return d + +# Clean up json parser output to match our expectations: +# * all strings are UTF-8-encoded str, not unicode. +# * missing fields are missing, not None, +# so that d.get("foo", defaultvalue) works. 
+def fix_json(x): + if type(x) in [str, int, float, bool, type(None)]: + pass + elif type(x) is unicode: + x = x.encode("utf-8") + elif type(x) is list: + for i in range(len(x)): + x[i] = fix_json(x[i]) + elif type(x) is dict: + todel = [] + for k in x: + if x[k] is None: + todel.append(k) + else: + x[k] = fix_json(x[k]) + for k in todel: + del x[k] + else: + raise hg_util.Abort("unknown type " + str(type(x)) + " in fix_json") + if type(x) is str: + x = x.replace('\r\n', '\n') + return x + +def IsRietveldSubmitted(ui, clname, hex): + dict = JSONGet(ui, "/api/" + clname + "?messages=true") + if dict is None: + return False + for msg in dict.get("messages", []): + text = msg.get("text", "") + m = re.match('\*\*\* Submitted as [^*]*?([0-9a-f]+) \*\*\*', text) + if m is not None and len(m.group(1)) >= 8 and hex.startswith(m.group(1)): + return True + return False + +def IsRietveldMailed(cl): + for msg in cl.dict.get("messages", []): + if msg.get("text", "").find("I'd like you to review this change") >= 0: + return True + return False + +def DownloadCL(ui, repo, clname): + set_status("downloading CL " + clname) + cl, err = LoadCL(ui, repo, clname, web=True) + if err != "": + return None, None, None, "error loading CL %s: %s" % (clname, err) + + # Find most recent diff + diffs = cl.dict.get("patchsets", []) + if not diffs: + return None, None, None, "CL has no patch sets" + patchid = diffs[-1] + + patchset = JSONGet(ui, "/api/" + clname + "/" + str(patchid)) + if patchset is None: + return None, None, None, "error loading CL patchset %s/%d" % (clname, patchid) + if patchset.get("patchset", 0) != patchid: + return None, None, None, "malformed patchset information" + + vers = "" + msg = patchset.get("message", "").split() + if len(msg) >= 3 and msg[0] == "diff" and msg[1] == "-r": + vers = msg[2] + diff = "/download/issue" + clname + "_" + str(patchid) + ".diff" + + diffdata = MySend(diff, force_auth=False) + + # Print warning if email is not in CONTRIBUTORS file. 
+ email = cl.dict.get("owner_email", "") + if not email: + return None, None, None, "cannot find owner for %s" % (clname) + him = FindContributor(ui, repo, email) + me = FindContributor(ui, repo, None) + if him == me: + cl.mailed = IsRietveldMailed(cl) + else: + cl.copied_from = email + + return cl, vers, diffdata, "" + +def MySend(request_path, payload=None, + content_type="application/octet-stream", + timeout=None, force_auth=True, + **kwargs): + """Run MySend1 maybe twice, because Rietveld is unreliable.""" + try: + return MySend1(request_path, payload, content_type, timeout, force_auth, **kwargs) + except Exception, e: + if type(e) != urllib2.HTTPError or e.code != 500: # only retry on HTTP 500 error + raise + print >>sys.stderr, "Loading "+request_path+": "+ExceptionDetail()+"; trying again in 2 seconds." + time.sleep(2) + return MySend1(request_path, payload, content_type, timeout, force_auth, **kwargs) + +# Like upload.py Send but only authenticates when the +# redirect is to www.google.com/accounts. This keeps +# unnecessary redirects from happening during testing. +def MySend1(request_path, payload=None, + content_type="application/octet-stream", + timeout=None, force_auth=True, + **kwargs): + """Sends an RPC and returns the response. + + Args: + request_path: The path to send the request to, eg /api/appversion/create. + payload: The body of the request, or None to send an empty request. + content_type: The Content-Type header to use. + timeout: timeout in seconds; default None i.e. no timeout. + (Note: for large requests on OS X, the timeout doesn't work right.) + kwargs: Any keyword arguments are converted into query string parameters. + + Returns: + The response body, as a string. + """ + # TODO: Don't require authentication. Let the server say + # whether it is necessary. 
+ global rpc + if rpc == None: + rpc = GetRpcServer(upload_options) + self = rpc + if not self.authenticated and force_auth: + self._Authenticate() + if request_path is None: + return + if timeout is None: + timeout = 30 # seconds + + old_timeout = socket.getdefaulttimeout() + socket.setdefaulttimeout(timeout) + try: + tries = 0 + while True: + tries += 1 + args = dict(kwargs) + url = "https://%s%s" % (self.host, request_path) + if args: + url += "?" + urllib.urlencode(args) + req = self._CreateRequest(url=url, data=payload) + req.add_header("Content-Type", content_type) + try: + f = self.opener.open(req) + response = f.read() + f.close() + # Translate \r\n into \n, because Rietveld doesn't. + response = response.replace('\r\n', '\n') + # who knows what urllib will give us + if type(response) == unicode: + response = response.encode("utf-8") + typecheck(response, str) + return response + except urllib2.HTTPError, e: + if tries > 3: + raise + elif e.code == 401: + self._Authenticate() + elif e.code == 302: + loc = e.info()["location"] + if not loc.startswith('https://www.google.com/a') or loc.find('/ServiceLogin') < 0: + return '' + self._Authenticate() + else: + raise + finally: + socket.setdefaulttimeout(old_timeout) + +def GetForm(url): + f = FormParser() + f.feed(ustr(MySend(url))) # f.feed wants unicode + f.close() + # convert back to utf-8 to restore sanity + m = {} + for k,v in f.map.items(): + m[k.encode("utf-8")] = v.replace("\r\n", "\n").encode("utf-8") + return m + +def EditDesc(issue, subject=None, desc=None, reviewers=None, cc=None, closed=False, private=False): + set_status("uploading change to description") + form_fields = GetForm("/" + issue + "/edit") + if subject is not None: + form_fields['subject'] = subject + if desc is not None: + form_fields['description'] = desc + if reviewers is not None: + form_fields['reviewers'] = reviewers + if cc is not None: + form_fields['cc'] = cc + if closed: + form_fields['closed'] = "checked" + if private: + 
form_fields['private'] = "checked" + ctype, body = EncodeMultipartFormData(form_fields.items(), []) + response = MySend("/" + issue + "/edit", body, content_type=ctype) + if response != "": + print >>sys.stderr, "Error editing description:\n" + "Sent form: \n", form_fields, "\n", response + sys.exit(2) + +def PostMessage(ui, issue, message, reviewers=None, cc=None, send_mail=True, subject=None): + set_status("uploading message") + form_fields = GetForm("/" + issue + "/publish") + if reviewers is not None: + form_fields['reviewers'] = reviewers + if cc is not None: + form_fields['cc'] = cc + if send_mail: + form_fields['send_mail'] = "checked" + else: + del form_fields['send_mail'] + if subject is not None: + form_fields['subject'] = subject + form_fields['message'] = message + + form_fields['message_only'] = '1' # Don't include draft comments + if reviewers is not None or cc is not None: + form_fields['message_only'] = '' # Must set '' in order to override cc/reviewer + ctype = "applications/x-www-form-urlencoded" + body = urllib.urlencode(form_fields) + response = MySend("/" + issue + "/publish", body, content_type=ctype) + if response != "": + print response + sys.exit(2) + +class opt(object): + pass + +def RietveldSetup(ui, repo): + global force_google_account + global rpc + global server + global server_url_base + global upload_options + global verbosity + + if not ui.verbose: + verbosity = 0 + + # Config options. + x = ui.config("codereview", "server") + if x is not None: + server = x + + # TODO(rsc): Take from ui.username? 
+ email = None + x = ui.config("codereview", "email") + if x is not None: + email = x + + server_url_base = "https://" + server + "/" + + testing = ui.config("codereview", "testing") + force_google_account = ui.configbool("codereview", "force_google_account", False) + + upload_options = opt() + upload_options.email = email + upload_options.host = None + upload_options.verbose = 0 + upload_options.description = None + upload_options.description_file = None + upload_options.reviewers = None + upload_options.cc = None + upload_options.message = None + upload_options.issue = None + upload_options.download_base = False + upload_options.revision = None + upload_options.send_mail = False + upload_options.vcs = None + upload_options.server = server + upload_options.save_cookies = True + + if testing: + upload_options.save_cookies = False + upload_options.email = "test@example.com" + + rpc = None + + global releaseBranch + tags = repo.branchtags().keys() + if 'release-branch.go10' in tags: + # NOTE(rsc): This tags.sort is going to get the wrong + # answer when comparing release-branch.go9 with + # release-branch.go10. It will be a while before we care. + raise hg_util.Abort('tags.sort needs to be fixed for release-branch.go10') + tags.sort() + for t in tags: + if t.startswith('release-branch.go'): + releaseBranch = t + +####################################################################### +# http://codereview.appspot.com/static/upload.py, heavily edited. + +#!/usr/bin/env python +# +# Copyright 2007 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tool for uploading diffs from a version control system to the codereview app. + +Usage summary: upload.py [options] [-- diff_options] + +Diff options are passed to the diff command of the underlying system. + +Supported version control systems: + Git + Mercurial + Subversion + +It is important for Git/Mercurial users to specify a tree/node/branch to diff +against by using the '--rev' option. +""" +# This code is derived from appcfg.py in the App Engine SDK (open source), +# and from ASPN recipe #146306. + +import cookielib +import getpass +import logging +import mimetypes +import optparse +import os +import re +import socket +import subprocess +import sys +import urllib +import urllib2 +import urlparse + +# The md5 module was deprecated in Python 2.5. +try: + from hashlib import md5 +except ImportError: + from md5 import md5 + +try: + import readline +except ImportError: + pass + +# The logging verbosity: +# 0: Errors only. +# 1: Status messages. +# 2: Info logs. +# 3: Debug logs. +verbosity = 1 + +# Max size of patch or base file. +MAX_UPLOAD_SIZE = 900 * 1024 + +# whitelist for non-binary filetypes which do not start with "text/" +# .mm (Objective-C) shows up as application/x-freemind on my Linux box. +TEXT_MIMETYPES = [ + 'application/javascript', + 'application/x-javascript', + 'application/x-freemind' +] + +def GetEmail(prompt): + """Prompts the user for their email address and returns it. + + The last used email address is saved to a file and offered up as a suggestion + to the user. If the user presses enter without typing in anything the last + used email address is used. If the user enters a new address, it is saved + for next time we prompt. 
+ + """ + last_email_file_name = os.path.expanduser("~/.last_codereview_email_address") + last_email = "" + if os.path.exists(last_email_file_name): + try: + last_email_file = open(last_email_file_name, "r") + last_email = last_email_file.readline().strip("\n") + last_email_file.close() + prompt += " [%s]" % last_email + except IOError, e: + pass + email = raw_input(prompt + ": ").strip() + if email: + try: + last_email_file = open(last_email_file_name, "w") + last_email_file.write(email) + last_email_file.close() + except IOError, e: + pass + else: + email = last_email + return email + + +def StatusUpdate(msg): + """Print a status message to stdout. + + If 'verbosity' is greater than 0, print the message. + + Args: + msg: The string to print. + """ + if verbosity > 0: + print msg + + +def ErrorExit(msg): + """Print an error message to stderr and exit.""" + print >>sys.stderr, msg + sys.exit(1) + + +class ClientLoginError(urllib2.HTTPError): + """Raised to indicate there was an error authenticating with ClientLogin.""" + + def __init__(self, url, code, msg, headers, args): + urllib2.HTTPError.__init__(self, url, code, msg, headers, None) + self.args = args + # .reason is now a read-only property based on .msg + # this means we ignore 'msg', but that seems to work fine. + self.msg = args["Error"] + + +class AbstractRpcServer(object): + """Provides a common interface for a simple RPC server.""" + + def __init__(self, host, auth_function, host_override=None, extra_headers={}, save_cookies=False): + """Creates a new HttpRpcServer. + + Args: + host: The host to send requests to. + auth_function: A function that takes no arguments and returns an + (email, password) tuple when called. Will be called if authentication + is required. + host_override: The host header to send to the server (defaults to host). + extra_headers: A dict of extra headers to append to every request. + save_cookies: If True, save the authentication cookies to local disk. 
+ If False, use an in-memory cookiejar instead. Subclasses must + implement this functionality. Defaults to False. + """ + self.host = host + self.host_override = host_override + self.auth_function = auth_function + self.authenticated = False + self.extra_headers = extra_headers + self.save_cookies = save_cookies + self.opener = self._GetOpener() + if self.host_override: + logging.info("Server: %s; Host: %s", self.host, self.host_override) + else: + logging.info("Server: %s", self.host) + + def _GetOpener(self): + """Returns an OpenerDirector for making HTTP requests. + + Returns: + A urllib2.OpenerDirector object. + """ + raise NotImplementedError() + + def _CreateRequest(self, url, data=None): + """Creates a new urllib request.""" + logging.debug("Creating request for: '%s' with payload:\n%s", url, data) + req = urllib2.Request(url, data=data) + if self.host_override: + req.add_header("Host", self.host_override) + for key, value in self.extra_headers.iteritems(): + req.add_header(key, value) + return req + + def _GetAuthToken(self, email, password): + """Uses ClientLogin to authenticate the user, returning an auth token. + + Args: + email: The user's email address + password: The user's password + + Raises: + ClientLoginError: If there was an error authenticating with ClientLogin. + HTTPError: If there was some other form of HTTP error. + + Returns: + The authentication token returned by ClientLogin. + """ + account_type = "GOOGLE" + if self.host.endswith(".google.com") and not force_google_account: + # Needed for use inside Google. 
+ account_type = "HOSTED" + req = self._CreateRequest( + url="https://www.google.com/accounts/ClientLogin", + data=urllib.urlencode({ + "Email": email, + "Passwd": password, + "service": "ah", + "source": "rietveld-codereview-upload", + "accountType": account_type, + }), + ) + try: + response = self.opener.open(req) + response_body = response.read() + response_dict = dict(x.split("=") for x in response_body.split("\n") if x) + return response_dict["Auth"] + except urllib2.HTTPError, e: + if e.code == 403: + body = e.read() + response_dict = dict(x.split("=", 1) for x in body.split("\n") if x) + raise ClientLoginError(req.get_full_url(), e.code, e.msg, e.headers, response_dict) + else: + raise + + def _GetAuthCookie(self, auth_token): + """Fetches authentication cookies for an authentication token. + + Args: + auth_token: The authentication token returned by ClientLogin. + + Raises: + HTTPError: If there was an error fetching the authentication cookies. + """ + # This is a dummy value to allow us to identify when we're successful. + continue_location = "http://localhost/" + args = {"continue": continue_location, "auth": auth_token} + req = self._CreateRequest("https://%s/_ah/login?%s" % (self.host, urllib.urlencode(args))) + try: + response = self.opener.open(req) + except urllib2.HTTPError, e: + response = e + if (response.code != 302 or + response.info()["location"] != continue_location): + raise urllib2.HTTPError(req.get_full_url(), response.code, response.msg, response.headers, response.fp) + self.authenticated = True + + def _Authenticate(self): + """Authenticates the user. + + The authentication process works as follows: + 1) We get a username and password from the user + 2) We use ClientLogin to obtain an AUTH token for the user + (see http://code.google.com/apis/accounts/AuthForInstalledApps.html). + 3) We pass the auth token to /_ah/login on the server to obtain an + authentication cookie. 
If login was successful, it tries to redirect + us to the URL we provided. + + If we attempt to access the upload API without first obtaining an + authentication cookie, it returns a 401 response (or a 302) and + directs us to authenticate ourselves with ClientLogin. + """ + for i in range(3): + credentials = self.auth_function() + try: + auth_token = self._GetAuthToken(credentials[0], credentials[1]) + except ClientLoginError, e: + if e.msg == "BadAuthentication": + print >>sys.stderr, "Invalid username or password." + continue + if e.msg == "CaptchaRequired": + print >>sys.stderr, ( + "Please go to\n" + "https://www.google.com/accounts/DisplayUnlockCaptcha\n" + "and verify you are a human. Then try again.") + break + if e.msg == "NotVerified": + print >>sys.stderr, "Account not verified." + break + if e.msg == "TermsNotAgreed": + print >>sys.stderr, "User has not agreed to TOS." + break + if e.msg == "AccountDeleted": + print >>sys.stderr, "The user account has been deleted." + break + if e.msg == "AccountDisabled": + print >>sys.stderr, "The user account has been disabled." + break + if e.msg == "ServiceDisabled": + print >>sys.stderr, "The user's access to the service has been disabled." + break + if e.msg == "ServiceUnavailable": + print >>sys.stderr, "The service is not available; try again later." + break + raise + self._GetAuthCookie(auth_token) + return + + def Send(self, request_path, payload=None, + content_type="application/octet-stream", + timeout=None, + **kwargs): + """Sends an RPC and returns the response. + + Args: + request_path: The path to send the request to, eg /api/appversion/create. + payload: The body of the request, or None to send an empty request. + content_type: The Content-Type header to use. + timeout: timeout in seconds; default None i.e. no timeout. + (Note: for large requests on OS X, the timeout doesn't work right.) + kwargs: Any keyword arguments are converted into query string parameters. 
+ + Returns: + The response body, as a string. + """ + # TODO: Don't require authentication. Let the server say + # whether it is necessary. + if not self.authenticated: + self._Authenticate() + + old_timeout = socket.getdefaulttimeout() + socket.setdefaulttimeout(timeout) + try: + tries = 0 + while True: + tries += 1 + args = dict(kwargs) + url = "https://%s%s" % (self.host, request_path) + if args: + url += "?" + urllib.urlencode(args) + req = self._CreateRequest(url=url, data=payload) + req.add_header("Content-Type", content_type) + try: + f = self.opener.open(req) + response = f.read() + f.close() + return response + except urllib2.HTTPError, e: + if tries > 3: + raise + elif e.code == 401 or e.code == 302: + self._Authenticate() + else: + raise + finally: + socket.setdefaulttimeout(old_timeout) + + +class HttpRpcServer(AbstractRpcServer): + """Provides a simplified RPC-style interface for HTTP requests.""" + + def _Authenticate(self): + """Save the cookie jar after authentication.""" + super(HttpRpcServer, self)._Authenticate() + if self.save_cookies: + StatusUpdate("Saving authentication cookies to %s" % self.cookie_file) + self.cookie_jar.save() + + def _GetOpener(self): + """Returns an OpenerDirector that supports cookies and ignores redirects. + + Returns: + A urllib2.OpenerDirector object. 
+ """ + opener = urllib2.OpenerDirector() + opener.add_handler(urllib2.ProxyHandler()) + opener.add_handler(urllib2.UnknownHandler()) + opener.add_handler(urllib2.HTTPHandler()) + opener.add_handler(urllib2.HTTPDefaultErrorHandler()) + opener.add_handler(urllib2.HTTPSHandler()) + opener.add_handler(urllib2.HTTPErrorProcessor()) + if self.save_cookies: + self.cookie_file = os.path.expanduser("~/.codereview_upload_cookies_" + server) + self.cookie_jar = cookielib.MozillaCookieJar(self.cookie_file) + if os.path.exists(self.cookie_file): + try: + self.cookie_jar.load() + self.authenticated = True + StatusUpdate("Loaded authentication cookies from %s" % self.cookie_file) + except (cookielib.LoadError, IOError): + # Failed to load cookies - just ignore them. + pass + else: + # Create an empty cookie file with mode 600 + fd = os.open(self.cookie_file, os.O_CREAT, 0600) + os.close(fd) + # Always chmod the cookie file + os.chmod(self.cookie_file, 0600) + else: + # Don't save cookies across runs of update.py. + self.cookie_jar = cookielib.CookieJar() + opener.add_handler(urllib2.HTTPCookieProcessor(self.cookie_jar)) + return opener + + +def GetRpcServer(options): + """Returns an instance of an AbstractRpcServer. + + Returns: + A new AbstractRpcServer, on which RPC calls can be made. + """ + + rpc_server_class = HttpRpcServer + + def GetUserCredentials(): + """Prompts the user for a username and password.""" + # Disable status prints so they don't obscure the password prompt. + global global_status + st = global_status + global_status = None + + email = options.email + if email is None: + email = GetEmail("Email (login for uploading to %s)" % options.server) + password = getpass.getpass("Password for %s: " % email) + + # Put status back. + global_status = st + return (email, password) + + # If this is the dev_appserver, use fake authentication. 
+ host = (options.host or options.server).lower() + if host == "localhost" or host.startswith("localhost:"): + email = options.email + if email is None: + email = "test@example.com" + logging.info("Using debug user %s. Override with --email" % email) + server = rpc_server_class( + options.server, + lambda: (email, "password"), + host_override=options.host, + extra_headers={"Cookie": 'dev_appserver_login="%s:False"' % email}, + save_cookies=options.save_cookies) + # Don't try to talk to ClientLogin. + server.authenticated = True + return server + + return rpc_server_class(options.server, GetUserCredentials, + host_override=options.host, save_cookies=options.save_cookies) + + +def EncodeMultipartFormData(fields, files): + """Encode form fields for multipart/form-data. + + Args: + fields: A sequence of (name, value) elements for regular form fields. + files: A sequence of (name, filename, value) elements for data to be + uploaded as files. + Returns: + (content_type, body) ready for httplib.HTTP instance. 
+ + Source: + http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306 + """ + BOUNDARY = '-M-A-G-I-C---B-O-U-N-D-A-R-Y-' + CRLF = '\r\n' + lines = [] + for (key, value) in fields: + typecheck(key, str) + typecheck(value, str) + lines.append('--' + BOUNDARY) + lines.append('Content-Disposition: form-data; name="%s"' % key) + lines.append('') + lines.append(value) + for (key, filename, value) in files: + typecheck(key, str) + typecheck(filename, str) + typecheck(value, str) + lines.append('--' + BOUNDARY) + lines.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)) + lines.append('Content-Type: %s' % GetContentType(filename)) + lines.append('') + lines.append(value) + lines.append('--' + BOUNDARY + '--') + lines.append('') + body = CRLF.join(lines) + content_type = 'multipart/form-data; boundary=%s' % BOUNDARY + return content_type, body + + +def GetContentType(filename): + """Helper to guess the content-type from the filename.""" + return mimetypes.guess_type(filename)[0] or 'application/octet-stream' + + +# Use a shell for subcommands on Windows to get a PATH search. +use_shell = sys.platform.startswith("win") + +def RunShellWithReturnCode(command, print_output=False, + universal_newlines=True, env=os.environ): + """Executes a command and returns the output from stdout and the return code. + + Args: + command: Command to execute. + print_output: If True, the output is printed to stdout. + If False, both stdout and stderr are ignored. + universal_newlines: Use universal_newlines flag (default: True). 
+ + Returns: + Tuple (output, return code) + """ + logging.info("Running %s", command) + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + shell=use_shell, universal_newlines=universal_newlines, env=env) + if print_output: + output_array = [] + while True: + line = p.stdout.readline() + if not line: + break + print line.strip("\n") + output_array.append(line) + output = "".join(output_array) + else: + output = p.stdout.read() + p.wait() + errout = p.stderr.read() + if print_output and errout: + print >>sys.stderr, errout + p.stdout.close() + p.stderr.close() + return output, p.returncode + + +def RunShell(command, silent_ok=False, universal_newlines=True, + print_output=False, env=os.environ): + data, retcode = RunShellWithReturnCode(command, print_output, universal_newlines, env) + if retcode: + ErrorExit("Got error status from %s:\n%s" % (command, data)) + if not silent_ok and not data: + ErrorExit("No output from %s" % command) + return data + + +class VersionControlSystem(object): + """Abstract base class providing an interface to the VCS.""" + + def __init__(self, options): + """Constructor. + + Args: + options: Command line options. + """ + self.options = options + + def GenerateDiff(self, args): + """Return the current diff as a string. + + Args: + args: Extra arguments to pass to the diff command. + """ + raise NotImplementedError( + "abstract method -- subclass %s must override" % self.__class__) + + def GetUnknownFiles(self): + """Return a list of files unknown to the VCS.""" + raise NotImplementedError( + "abstract method -- subclass %s must override" % self.__class__) + + def CheckForUnknownFiles(self): + """Show an "are you sure?" 
prompt if there are unknown files.""" + unknown_files = self.GetUnknownFiles() + if unknown_files: + print "The following files are not added to version control:" + for line in unknown_files: + print line + prompt = "Are you sure to continue?(y/N) " + answer = raw_input(prompt).strip() + if answer != "y": + ErrorExit("User aborted") + + def GetBaseFile(self, filename): + """Get the content of the upstream version of a file. + + Returns: + A tuple (base_content, new_content, is_binary, status) + base_content: The contents of the base file. + new_content: For text files, this is empty. For binary files, this is + the contents of the new file, since the diff output won't contain + information to reconstruct the current file. + is_binary: True iff the file is binary. + status: The status of the file. + """ + + raise NotImplementedError( + "abstract method -- subclass %s must override" % self.__class__) + + + def GetBaseFiles(self, diff): + """Helper that calls GetBase file for each file in the patch. + + Returns: + A dictionary that maps from filename to GetBaseFile's tuple. Filenames + are retrieved based on lines that start with "Index:" or + "Property changes on:". + """ + files = {} + for line in diff.splitlines(True): + if line.startswith('Index:') or line.startswith('Property changes on:'): + unused, filename = line.split(':', 1) + # On Windows if a file has property changes its filename uses '\' + # instead of '/'. 
+ filename = to_slash(filename.strip()) + files[filename] = self.GetBaseFile(filename) + return files + + + def UploadBaseFiles(self, issue, rpc_server, patch_list, patchset, options, + files): + """Uploads the base files (and if necessary, the current ones as well).""" + + def UploadFile(filename, file_id, content, is_binary, status, is_base): + """Uploads a file to the server.""" + set_status("uploading " + filename) + file_too_large = False + if is_base: + type = "base" + else: + type = "current" + if len(content) > MAX_UPLOAD_SIZE: + print ("Not uploading the %s file for %s because it's too large." % + (type, filename)) + file_too_large = True + content = "" + checksum = md5(content).hexdigest() + if options.verbose > 0 and not file_too_large: + print "Uploading %s file for %s" % (type, filename) + url = "/%d/upload_content/%d/%d" % (int(issue), int(patchset), file_id) + form_fields = [ + ("filename", filename), + ("status", status), + ("checksum", checksum), + ("is_binary", str(is_binary)), + ("is_current", str(not is_base)), + ] + if file_too_large: + form_fields.append(("file_too_large", "1")) + if options.email: + form_fields.append(("user", options.email)) + ctype, body = EncodeMultipartFormData(form_fields, [("data", filename, content)]) + response_body = rpc_server.Send(url, body, content_type=ctype) + if not response_body.startswith("OK"): + StatusUpdate(" --> %s" % response_body) + sys.exit(1) + + # Don't want to spawn too many threads, nor do we want to + # hit Rietveld too hard, or it will start serving 500 errors. + # When 8 works, it's no better than 4, and sometimes 8 is + # too many for Rietveld to handle. 
+ MAX_PARALLEL_UPLOADS = 4 + + sema = threading.BoundedSemaphore(MAX_PARALLEL_UPLOADS) + upload_threads = [] + finished_upload_threads = [] + + class UploadFileThread(threading.Thread): + def __init__(self, args): + threading.Thread.__init__(self) + self.args = args + def run(self): + UploadFile(*self.args) + finished_upload_threads.append(self) + sema.release() + + def StartUploadFile(*args): + sema.acquire() + while len(finished_upload_threads) > 0: + t = finished_upload_threads.pop() + upload_threads.remove(t) + t.join() + t = UploadFileThread(args) + upload_threads.append(t) + t.start() + + def WaitForUploads(): + for t in upload_threads: + t.join() + + patches = dict() + [patches.setdefault(v, k) for k, v in patch_list] + for filename in patches.keys(): + base_content, new_content, is_binary, status = files[filename] + file_id_str = patches.get(filename) + if file_id_str.find("nobase") != -1: + base_content = None + file_id_str = file_id_str[file_id_str.rfind("_") + 1:] + file_id = int(file_id_str) + if base_content != None: + StartUploadFile(filename, file_id, base_content, is_binary, status, True) + if new_content != None: + StartUploadFile(filename, file_id, new_content, is_binary, status, False) + WaitForUploads() + + def IsImage(self, filename): + """Returns true if the filename has an image extension.""" + mimetype = mimetypes.guess_type(filename)[0] + if not mimetype: + return False + return mimetype.startswith("image/") + + def IsBinary(self, filename): + """Returns true if the guessed mimetyped isnt't in text group.""" + mimetype = mimetypes.guess_type(filename)[0] + if not mimetype: + return False # e.g. 
README, "real" binaries usually have an extension + # special case for text files which don't start with text/ + if mimetype in TEXT_MIMETYPES: + return False + return not mimetype.startswith("text/") + + +class FakeMercurialUI(object): + def __init__(self): + self.quiet = True + self.output = '' + + def write(self, *args, **opts): + self.output += ' '.join(args) + def copy(self): + return self + def status(self, *args, **opts): + pass + + def formatter(self, topic, opts): + from mercurial.formatter import plainformatter + return plainformatter(self, topic, opts) + + def readconfig(self, *args, **opts): + pass + def expandpath(self, *args, **opts): + return global_ui.expandpath(*args, **opts) + def configitems(self, *args, **opts): + return global_ui.configitems(*args, **opts) + def config(self, *args, **opts): + return global_ui.config(*args, **opts) + +use_hg_shell = False # set to True to shell out to hg always; slower + +class MercurialVCS(VersionControlSystem): + """Implementation of the VersionControlSystem interface for Mercurial.""" + + def __init__(self, options, ui, repo): + super(MercurialVCS, self).__init__(options) + self.ui = ui + self.repo = repo + self.status = None + # Absolute path to repository (we can be in a subdir) + self.repo_dir = os.path.normpath(repo.root) + # Compute the subdir + cwd = os.path.normpath(os.getcwd()) + assert cwd.startswith(self.repo_dir) + self.subdir = cwd[len(self.repo_dir):].lstrip(r"\/") + if self.options.revision: + self.base_rev = self.options.revision + else: + mqparent, err = RunShellWithReturnCode(['hg', 'log', '--rev', 'qparent', '--template={node}']) + if not err and mqparent != "": + self.base_rev = mqparent + else: + out = RunShell(["hg", "parents", "-q"], silent_ok=True).strip() + if not out: + # No revisions; use 0 to mean a repository with nothing. 
+ out = "0:0" + self.base_rev = out.split(':')[1].strip() + def _GetRelPath(self, filename): + """Get relative path of a file according to the current directory, + given its logical path in the repo.""" + assert filename.startswith(self.subdir), (filename, self.subdir) + return filename[len(self.subdir):].lstrip(r"\/") + + def GenerateDiff(self, extra_args): + # If no file specified, restrict to the current subdir + extra_args = extra_args or ["."] + cmd = ["hg", "diff", "--git", "-r", self.base_rev] + extra_args + data = RunShell(cmd, silent_ok=True) + svndiff = [] + filecount = 0 + for line in data.splitlines(): + m = re.match("diff --git a/(\S+) b/(\S+)", line) + if m: + # Modify line to make it look like as it comes from svn diff. + # With this modification no changes on the server side are required + # to make upload.py work with Mercurial repos. + # NOTE: for proper handling of moved/copied files, we have to use + # the second filename. + filename = m.group(2) + svndiff.append("Index: %s" % filename) + svndiff.append("=" * 67) + filecount += 1 + logging.info(line) + else: + svndiff.append(line) + if not filecount: + ErrorExit("No valid patches found in output from hg diff") + return "\n".join(svndiff) + "\n" + + def GetUnknownFiles(self): + """Return a list of files unknown to the VCS.""" + args = [] + status = RunShell(["hg", "status", "--rev", self.base_rev, "-u", "."], + silent_ok=True) + unknown_files = [] + for line in status.splitlines(): + st, fn = line.split(" ", 1) + if st == "?": + unknown_files.append(fn) + return unknown_files + + def get_hg_status(self, rev, path): + # We'd like to use 'hg status -C path', but that is buggy + # (see http://mercurial.selenic.com/bts/issue3023). + # Instead, run 'hg status -C' without a path + # and skim the output for the path we want. 
+ if self.status is None: + if use_hg_shell: + out = RunShell(["hg", "status", "-C", "--rev", rev]) + else: + fui = FakeMercurialUI() + ret = hg_commands.status(fui, self.repo, *[], **{'rev': [rev], 'copies': True}) + if ret: + raise hg_util.Abort(ret) + out = fui.output + self.status = out.splitlines() + for i in range(len(self.status)): + # line is + # A path + # M path + # etc + line = to_slash(self.status[i]) + if line[2:] == path: + if i+1 < len(self.status) and self.status[i+1][:2] == ' ': + return self.status[i:i+2] + return self.status[i:i+1] + raise hg_util.Abort("no status for " + path) + + def GetBaseFile(self, filename): + set_status("inspecting " + filename) + # "hg status" and "hg cat" both take a path relative to the current subdir + # rather than to the repo root, but "hg diff" has given us the full path + # to the repo root. + base_content = "" + new_content = None + is_binary = False + oldrelpath = relpath = self._GetRelPath(filename) + out = self.get_hg_status(self.base_rev, relpath) + status, what = out[0].split(' ', 1) + if len(out) > 1 and status == "A" and what == relpath: + oldrelpath = out[1].strip() + status = "M" + if ":" in self.base_rev: + base_rev = self.base_rev.split(":", 1)[0] + else: + base_rev = self.base_rev + if status != "A": + if use_hg_shell: + base_content = RunShell(["hg", "cat", "-r", base_rev, oldrelpath], silent_ok=True) + else: + base_content = str(self.repo[base_rev][oldrelpath].data()) + is_binary = "\0" in base_content # Mercurial's heuristic + if status != "R": + new_content = open(relpath, "rb").read() + is_binary = is_binary or "\0" in new_content + if is_binary and base_content and use_hg_shell: + # Fetch again without converting newlines + base_content = RunShell(["hg", "cat", "-r", base_rev, oldrelpath], + silent_ok=True, universal_newlines=False) + if not is_binary or not self.IsImage(relpath): + new_content = None + return base_content, new_content, is_binary, status + + +# NOTE: The SplitPatch function is 
duplicated in engine.py, keep them in sync. +def SplitPatch(data): + """Splits a patch into separate pieces for each file. + + Args: + data: A string containing the output of svn diff. + + Returns: + A list of 2-tuple (filename, text) where text is the svn diff output + pertaining to filename. + """ + patches = [] + filename = None + diff = [] + for line in data.splitlines(True): + new_filename = None + if line.startswith('Index:'): + unused, new_filename = line.split(':', 1) + new_filename = new_filename.strip() + elif line.startswith('Property changes on:'): + unused, temp_filename = line.split(':', 1) + # When a file is modified, paths use '/' between directories, however + # when a property is modified '\' is used on Windows. Make them the same + # otherwise the file shows up twice. + temp_filename = to_slash(temp_filename.strip()) + if temp_filename != filename: + # File has property changes but no modifications, create a new diff. + new_filename = temp_filename + if new_filename: + if filename and diff: + patches.append((filename, ''.join(diff))) + filename = new_filename + diff = [line] + continue + if diff is not None: + diff.append(line) + if filename and diff: + patches.append((filename, ''.join(diff))) + return patches + + +def UploadSeparatePatches(issue, rpc_server, patchset, data, options): + """Uploads a separate patch for each file in the diff output. + + Returns a list of [patch_key, filename] for each file. 
+ """ + patches = SplitPatch(data) + rv = [] + for patch in patches: + set_status("uploading patch for " + patch[0]) + if len(patch[1]) > MAX_UPLOAD_SIZE: + print ("Not uploading the patch for " + patch[0] + + " because the file is too large.") + continue + form_fields = [("filename", patch[0])] + if not options.download_base: + form_fields.append(("content_upload", "1")) + files = [("data", "data.diff", patch[1])] + ctype, body = EncodeMultipartFormData(form_fields, files) + url = "/%d/upload_patch/%d" % (int(issue), int(patchset)) + print "Uploading patch for " + patch[0] + response_body = rpc_server.Send(url, body, content_type=ctype) + lines = response_body.splitlines() + if not lines or lines[0] != "OK": + StatusUpdate(" --> %s" % response_body) + sys.exit(1) + rv.append([lines[1], patch[0]]) + return rv diff --git a/outside/re2/libre2.symbols b/outside/re2/libre2.symbols new file mode 100644 index 000000000..fa066ae30 --- /dev/null +++ b/outside/re2/libre2.symbols @@ -0,0 +1,16 @@ +{ + global: + # re2::RE2* + _ZN3re23RE2*; + _ZNK3re23RE2*; + # re2::StringPiece* + _ZN3re211StringPiece*; + _ZNK3re211StringPiece*; + # operator<<(std::ostream&, re2::StringPiece const&) + _ZlsRSoRKN3re211StringPieceE; + # re2::FilteredRE2* + _ZN3re211FilteredRE2*; + _ZNK3re211FilteredRE210AllMatches*; + local: + *; +}; diff --git a/outside/re2/libre2.symbols.darwin b/outside/re2/libre2.symbols.darwin new file mode 100644 index 000000000..7fe74a023 --- /dev/null +++ b/outside/re2/libre2.symbols.darwin @@ -0,0 +1,13 @@ +# Linker doesn't like these unmangled: +# re2::RE2* +__ZN3re23RE2* +__ZNK3re23RE2* +# re2::StringPiece* +__ZN3re211StringPiece* +__ZNK3re211StringPiece* +# operator<<(std::ostream&, re2::StringPiece const&) +__ZlsRNSt3__113basic_ostreamIcNS_11char_traitsIcEEEERKN3re211StringPieceE +# re2::FilteredRE2* +__ZN3re211FilteredRE2* +__ZNK3re211FilteredRE210AllMatches* + diff --git a/outside/re2/re2/Makefile b/outside/re2/re2/Makefile new file mode 100644 index 
000000000..8b1378917 --- /dev/null +++ b/outside/re2/re2/Makefile @@ -0,0 +1 @@ + diff --git a/outside/re2/re2/bitstate.cc b/outside/re2/re2/bitstate.cc new file mode 100644 index 000000000..518d64201 --- /dev/null +++ b/outside/re2/re2/bitstate.cc @@ -0,0 +1,378 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Tested by search_test.cc, exhaustive_test.cc, tester.cc + +// Prog::SearchBitState is a regular expression search with submatch +// tracking for small regular expressions and texts. Like +// testing/backtrack.cc, it allocates a bit vector with (length of +// text) * (length of prog) bits, to make sure it never explores the +// same (character position, instruction) state multiple times. This +// limits the search to run in time linear in the length of the text. +// +// Unlike testing/backtrack.cc, SearchBitState is not recursive +// on the text. +// +// SearchBitState is a fast replacement for the NFA code on small +// regexps and texts when SearchOnePass cannot be used. + +#include "re2/prog.h" +#include "re2/regexp.h" + +namespace re2 { + +struct Job { + int id; + int arg; + const char* p; +}; + +class BitState { + public: + explicit BitState(Prog* prog); + ~BitState(); + + // The usual Search prototype. + // Can only call Search once per BitState. 
+ bool Search(const StringPiece& text, const StringPiece& context, + bool anchored, bool longest, + StringPiece* submatch, int nsubmatch); + + private: + inline bool ShouldVisit(int id, const char* p); + void Push(int id, const char* p, int arg); + bool GrowStack(); + bool TrySearch(int id, const char* p); + + // Search parameters + Prog* prog_; // program being run + StringPiece text_; // text being searched + StringPiece context_; // greater context of text being searched + bool anchored_; // whether search is anchored at text.begin() + bool longest_; // whether search wants leftmost-longest match + bool endmatch_; // whether match must end at text.end() + StringPiece *submatch_; // submatches to fill in + int nsubmatch_; // # of submatches to fill in + + // Search state + const char** cap_; // capture registers + int ncap_; + + static const int VisitedBits = 32; + uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked + int nvisited_; // # of words in bitmap + + Job *job_; // stack of text positions to explore + int njob_; + int maxjob_; +}; + +BitState::BitState(Prog* prog) + : prog_(prog), + anchored_(false), + longest_(false), + endmatch_(false), + submatch_(NULL), + nsubmatch_(0), + cap_(NULL), + ncap_(0), + visited_(NULL), + nvisited_(0), + job_(NULL), + njob_(0), + maxjob_(0) { +} + +BitState::~BitState() { + delete[] visited_; + delete[] job_; + delete[] cap_; +} + +// Should the search visit the pair ip, p? +// If so, remember that it was visited so that the next time, +// we don't repeat the visit. +bool BitState::ShouldVisit(int id, const char* p) { + uint n = id * (text_.size() + 1) + (p - text_.begin()); + if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1)))) + return false; + visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1)); + return true; +} + +// Grow the stack. 
+bool BitState::GrowStack() { + // VLOG(0) << "Reallocate."; + maxjob_ *= 2; + Job* newjob = new Job[maxjob_]; + memmove(newjob, job_, njob_*sizeof job_[0]); + delete[] job_; + job_ = newjob; + if (njob_ >= maxjob_) { + LOG(DFATAL) << "Job stack overflow."; + return false; + } + return true; +} + +// Push the triple (id, p, arg) onto the stack, growing it if necessary. +void BitState::Push(int id, const char* p, int arg) { + if (njob_ >= maxjob_) { + if (!GrowStack()) + return; + } + int op = prog_->inst(id)->opcode(); + if (op == kInstFail) + return; + + // Only check ShouldVisit when arg == 0. + // When arg > 0, we are continuing a previous visit. + if (arg == 0 && !ShouldVisit(id, p)) + return; + + Job* j = &job_[njob_++]; + j->id = id; + j->p = p; + j->arg = arg; +} + +// Try a search from instruction id0 in state p0. +// Return whether it succeeded. +bool BitState::TrySearch(int id0, const char* p0) { + bool matched = false; + const char* end = text_.end(); + njob_ = 0; + Push(id0, p0, 0); + while (njob_ > 0) { + // Pop job off stack. + --njob_; + int id = job_[njob_].id; + const char* p = job_[njob_].p; + int arg = job_[njob_].arg; + + // Optimization: rather than push and pop, + // code that is going to Push and continue + // the loop simply updates ip, p, and arg + // and jumps to CheckAndLoop. We have to + // do the ShouldVisit check that Push + // would have, but we avoid the stack + // manipulation. + if (0) { + CheckAndLoop: + if (!ShouldVisit(id, p)) + continue; + } + + // Visit ip, p. 
+ // VLOG(0) << "Job: " << ip->id() << " " + // << (p - text_.begin()) << " " << arg; + Prog::Inst* ip = prog_->inst(id); + switch (ip->opcode()) { + case kInstFail: + default: + LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg; + return false; + + case kInstAlt: + // Cannot just + // Push(ip->out1(), p, 0); + // Push(ip->out(), p, 0); + // If, during the processing of ip->out(), we encounter + // ip->out1() via another path, we want to process it then. + // Pushing it here will inhibit that. Instead, re-push + // ip with arg==1 as a reminder to push ip->out1() later. + switch (arg) { + case 0: + Push(id, p, 1); // come back when we're done + id = ip->out(); + goto CheckAndLoop; + + case 1: + // Finished ip->out(); try ip->out1(). + arg = 0; + id = ip->out1(); + goto CheckAndLoop; + } + LOG(DFATAL) << "Bad arg in kInstCapture: " << arg; + continue; + + case kInstAltMatch: + // One opcode is byte range; the other leads to match. + if (ip->greedy(prog_)) { + // out1 is the match + Push(ip->out1(), p, 0); + id = ip->out1(); + p = end; + goto CheckAndLoop; + } + // out is the match - non-greedy + Push(ip->out(), end, 0); + id = ip->out(); + goto CheckAndLoop; + + case kInstByteRange: { + int c = -1; + if (p < end) + c = *p & 0xFF; + if (ip->Matches(c)) { + id = ip->out(); + p++; + goto CheckAndLoop; + } + continue; + } + + case kInstCapture: + switch (arg) { + case 0: + if (0 <= ip->cap() && ip->cap() < ncap_) { + // Capture p to register, but save old value. + Push(id, cap_[ip->cap()], 1); // come back when we're done + cap_[ip->cap()] = p; + } + // Continue on. + id = ip->out(); + goto CheckAndLoop; + case 1: + // Finished ip->out(); restore the old value. 
+ cap_[ip->cap()] = p; + continue; + } + LOG(DFATAL) << "Bad arg in kInstCapture: " << arg; + continue; + + case kInstEmptyWidth: + if (ip->empty() & ~Prog::EmptyFlags(context_, p)) + continue; + id = ip->out(); + goto CheckAndLoop; + + case kInstNop: + id = ip->out(); + goto CheckAndLoop; + + case kInstMatch: { + if (endmatch_ && p != text_.end()) + continue; + + // VLOG(0) << "Found match."; + // We found a match. If the caller doesn't care + // where the match is, no point going further. + if (nsubmatch_ == 0) + return true; + + // Record best match so far. + // Only need to check end point, because this entire + // call is only considering one start position. + matched = true; + cap_[1] = p; + if (submatch_[0].data() == NULL || + (longest_ && p > submatch_[0].end())) { + for (int i = 0; i < nsubmatch_; i++) + submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]); + } + + // If going for first match, we're done. + if (!longest_) + return true; + + // If we used the entire text, no longer match is possible. + if (p == text_.end()) + return true; + + // Otherwise, continue on in hope of a longer match. + continue; + } + } + } + return matched; +} + +// Search text (within context) for prog_. +bool BitState::Search(const StringPiece& text, const StringPiece& context, + bool anchored, bool longest, + StringPiece* submatch, int nsubmatch) { + // Search parameters. + text_ = text; + context_ = context; + if (context_.begin() == NULL) + context_ = text; + if (prog_->anchor_start() && context_.begin() != text.begin()) + return false; + if (prog_->anchor_end() && context_.end() != text.end()) + return false; + anchored_ = anchored || prog_->anchor_start(); + longest_ = longest || prog_->anchor_end(); + endmatch_ = prog_->anchor_end(); + submatch_ = submatch; + nsubmatch_ = nsubmatch; + for (int i = 0; i < nsubmatch_; i++) + submatch_[i] = NULL; + + // Allocate scratch space. 
+ nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits; + visited_ = new uint32[nvisited_]; + memset(visited_, 0, nvisited_*sizeof visited_[0]); + // VLOG(0) << "nvisited_ = " << nvisited_; + + ncap_ = 2*nsubmatch; + if (ncap_ < 2) + ncap_ = 2; + cap_ = new const char*[ncap_]; + memset(cap_, 0, ncap_*sizeof cap_[0]); + + maxjob_ = 256; + job_ = new Job[maxjob_]; + + // Anchored search must start at text.begin(). + if (anchored_) { + cap_[0] = text.begin(); + return TrySearch(prog_->start(), text.begin()); + } + + // Unanchored search, starting from each possible text position. + // Notice that we have to try the empty string at the end of + // the text, so the loop condition is p <= text.end(), not p < text.end(). + // This looks like it's quadratic in the size of the text, + // but we are not clearing visited_ between calls to TrySearch, + // so no work is duplicated and it ends up still being linear. + for (const char* p = text.begin(); p <= text.end(); p++) { + cap_[0] = p; + if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. + return true; + } + return false; +} + +// Bit-state search. +bool Prog::SearchBitState(const StringPiece& text, + const StringPiece& context, + Anchor anchor, + MatchKind kind, + StringPiece* match, + int nmatch) { + // If full match, we ask for an anchored longest match + // and then check that match[0] == text. + // So make sure match[0] exists. + StringPiece sp0; + if (kind == kFullMatch) { + anchor = kAnchored; + if (nmatch < 1) { + match = &sp0; + nmatch = 1; + } + } + + // Run the search. 
+ BitState b(this); + bool anchored = anchor == kAnchored; + bool longest = kind != kFirstMatch; + if (!b.Search(text, context, anchored, longest, match, nmatch)) + return false; + if (kind == kFullMatch && match[0].end() != text.end()) + return false; + return true; +} + +} // namespace re2 diff --git a/outside/re2/re2/compile.cc b/outside/re2/re2/compile.cc new file mode 100644 index 000000000..08cb6fa8f --- /dev/null +++ b/outside/re2/re2/compile.cc @@ -0,0 +1,1140 @@ +// Copyright 2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Compile regular expression to Prog. +// +// Prog and Inst are defined in prog.h. +// This file's external interface is just Regexp::CompileToProg. +// The Compiler class defined in this file is private. + +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" +#include "re2/walker-inl.h" + +namespace re2 { + +// List of pointers to Inst* that need to be filled in (patched). +// Because the Inst* haven't been filled in yet, +// we can use the Inst* word to hold the list's "next" pointer. +// It's kind of sleazy, but it works well in practice. +// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. +// +// Because the out and out1 fields in Inst are no longer pointers, +// we can't use pointers directly here either. Instead, p refers +// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1). +// p == 0 represents the NULL list. This is okay because instruction #0 +// is always the fail instruction, which never appears on a list. + +struct PatchList { + uint32 p; + + // Returns patch list containing just p. + static PatchList Mk(uint32 p); + + // Patches all the entries on l to have value v. + // Caller must not ever use patch list again. + static void Patch(Prog::Inst *inst0, PatchList l, uint32 v); + + // Deref returns the next pointer pointed at by p. 
+ static PatchList Deref(Prog::Inst *inst0, PatchList l); + + // Appends two patch lists and returns result. + static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); +}; + +static PatchList nullPatchList = { 0 }; + +// Returns patch list containing just p. +PatchList PatchList::Mk(uint32 p) { + PatchList l; + l.p = p; + return l; +} + +// Returns the next pointer pointed at by l. +PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { + Prog::Inst* ip = &inst0[l.p>>1]; + if (l.p&1) + l.p = ip->out1(); + else + l.p = ip->out(); + return l; +} + +// Patches all the entries on l to have value v. +void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) { + while (l.p != 0) { + Prog::Inst* ip = &inst0[l.p>>1]; + if (l.p&1) { + l.p = ip->out1(); + ip->out1_ = val; + } else { + l.p = ip->out(); + ip->set_out(val); + } + } +} + +// Appends two patch lists and returns result. +PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { + if (l1.p == 0) + return l2; + if (l2.p == 0) + return l1; + + PatchList l = l1; + for (;;) { + PatchList next = PatchList::Deref(inst0, l); + if (next.p == 0) + break; + l = next; + } + + Prog::Inst* ip = &inst0[l.p>>1]; + if (l.p&1) + ip->out1_ = l2.p; + else + ip->set_out(l2.p); + + return l1; +} + +// Compiled program fragment. +struct Frag { + uint32 begin; + PatchList end; + + Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector + Frag(uint32 begin, PatchList end) : begin(begin), end(end) {} +}; + +// Input encodings. +enum Encoding { + kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) + kEncodingLatin1, // Latin1 (0-FF) +}; + +class Compiler : public Regexp::Walker { + public: + explicit Compiler(); + ~Compiler(); + + // Compiles Regexp to a new Prog. + // Caller is responsible for deleting Prog when finished with it. + // If reversed is true, compiles for walking over the input + // string backward (reverses all concatenations). 
+ static Prog *Compile(Regexp* re, bool reversed, int64 max_mem); + + // Compiles alternation of all the re to a new Prog. + // Each re has a match with an id equal to its index in the vector. + static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, + Regexp* re); + + // Interface for Regexp::Walker, which helps traverse the Regexp. + // The walk is purely post-recursive: given the machines for the + // children, PostVisit combines them to create the machine for + // the current node. The child_args are Frags. + // The Compiler traverses the Regexp parse tree, visiting + // each node in depth-first order. It invokes PreVisit before + // visiting the node's children and PostVisit after visiting + // the children. + Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); + Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args, + int nchild_args); + Frag ShortVisit(Regexp* re, Frag parent_arg); + Frag Copy(Frag arg); + + // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? + Frag Plus(Frag a, bool nongreedy); + Frag Star(Frag a, bool nongreedy); + Frag Quest(Frag a, bool nongreedy); + + // Given fragment a, returns (a) capturing as \n. + Frag Capture(Frag a, int n); + + // Given fragments a and b, returns ab; a|b + Frag Cat(Frag a, Frag b); + Frag Alt(Frag a, Frag b); + + // Returns a fragment that can't match anything. + Frag NoMatch(); + + // Returns a fragment that matches the empty string. + Frag Match(int32 id); + + // Returns a no-op fragment. + Frag Nop(); + + // Returns a fragment matching the byte range lo-hi. + Frag ByteRange(int lo, int hi, bool foldcase); + + // Returns a fragment matching an empty-width special op. + Frag EmptyWidth(EmptyOp op); + + // Adds n instructions to the program. + // Returns the index of the first one. + // Returns -1 if no more instructions are available. + int AllocInst(int n); + + // Deletes unused instructions. + void Trim(); + + // Rune range compiler. 
+ + // Begins a new alternation. + void BeginRange(); + + // Adds a fragment matching the rune range lo-hi. + void AddRuneRange(Rune lo, Rune hi, bool foldcase); + void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); + void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); + void Add_80_10ffff(); + + // New suffix that matches the byte range lo-hi, then goes to next. + int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); + int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); + + // Adds a suffix to alternation. + void AddSuffix(int id); + + // Returns the alternation of all the added suffixes. + Frag EndRange(); + + // Single rune. + Frag Literal(Rune r, bool foldcase); + + void Setup(Regexp::ParseFlags, int64, RE2::Anchor); + Prog* Finish(); + + // Returns .* where dot = any byte + Frag DotStar(); + + private: + Prog* prog_; // Program being built. + bool failed_; // Did we give up compiling? + Encoding encoding_; // Input encoding + bool reversed_; // Should program run backward over text? + + int max_inst_; // Maximum number of instructions. + + Prog::Inst* inst_; // Pointer to first instruction. + int inst_len_; // Number of instructions used. + int inst_cap_; // Number of instructions allocated. + + int64 max_mem_; // Total memory budget. 
+ + map rune_cache_; + Frag rune_range_; + + RE2::Anchor anchor_; // anchor mode for RE2::Set + + DISALLOW_EVIL_CONSTRUCTORS(Compiler); +}; + +Compiler::Compiler() { + prog_ = new Prog(); + failed_ = false; + encoding_ = kEncodingUTF8; + reversed_ = false; + inst_ = NULL; + inst_len_ = 0; + inst_cap_ = 0; + max_inst_ = 1; // make AllocInst for fail instruction okay + max_mem_ = 0; + int fail = AllocInst(1); + inst_[fail].InitFail(); + max_inst_ = 0; // Caller must change +} + +Compiler::~Compiler() { + delete prog_; + delete[] inst_; +} + +int Compiler::AllocInst(int n) { + if (failed_ || inst_len_ + n > max_inst_) { + failed_ = true; + return -1; + } + + if (inst_len_ + n > inst_cap_) { + if (inst_cap_ == 0) + inst_cap_ = 8; + while (inst_len_ + n > inst_cap_) + inst_cap_ *= 2; + Prog::Inst* ip = new Prog::Inst[inst_cap_]; + memmove(ip, inst_, inst_len_ * sizeof ip[0]); + memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]); + delete[] inst_; + inst_ = ip; + } + int id = inst_len_; + inst_len_ += n; + return id; +} + +void Compiler::Trim() { + if (inst_len_ < inst_cap_) { + Prog::Inst* ip = new Prog::Inst[inst_len_]; + memmove(ip, inst_, inst_len_ * sizeof ip[0]); + delete[] inst_; + inst_ = ip; + inst_cap_ = inst_len_; + } +} + +// These routines are somewhat hard to visualize in text -- +// see http://swtch.com/~rsc/regexp/regexp1.html for +// pictures explaining what is going on here. + +// Returns an unmatchable fragment. +Frag Compiler::NoMatch() { + return Frag(0, nullPatchList); +} + +// Is a an unmatchable fragment? +static bool IsNoMatch(Frag a) { + return a.begin == 0; +} + +// Given fragments a and b, returns fragment for ab. +Frag Compiler::Cat(Frag a, Frag b) { + if (IsNoMatch(a) || IsNoMatch(b)) + return NoMatch(); + + // Elide no-op. 
+ Prog::Inst* begin = &inst_[a.begin]; + if (begin->opcode() == kInstNop && + a.end.p == (a.begin << 1) && + begin->out() == 0) { + PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere + return b; + } + + // To run backward over string, reverse all concatenations. + if (reversed_) { + PatchList::Patch(inst_, b.end, a.begin); + return Frag(b.begin, a.end); + } + + PatchList::Patch(inst_, a.end, b.begin); + return Frag(a.begin, b.end); +} + +// Given fragments for a and b, returns fragment for a|b. +Frag Compiler::Alt(Frag a, Frag b) { + // Special case for convenience in loops. + if (IsNoMatch(a)) + return b; + if (IsNoMatch(b)) + return a; + + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + + inst_[id].InitAlt(a.begin, b.begin); + return Frag(id, PatchList::Append(inst_, a.end, b.end)); +} + +// When capturing submatches in like-Perl mode, a kOpAlt Inst +// treats out_ as the first choice, out1_ as the second. +// +// For *, +, and ?, if out_ causes another repetition, +// then the operator is greedy. If out1_ is the repetition +// (and out_ moves forward), then the operator is non-greedy. + +// Given a fragment a, returns a fragment for a* or a*? (if nongreedy) +Frag Compiler::Star(Frag a, bool nongreedy) { + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + inst_[id].InitAlt(0, 0); + PatchList::Patch(inst_, a.end, id); + if (nongreedy) { + inst_[id].out1_ = a.begin; + return Frag(id, PatchList::Mk(id << 1)); + } else { + inst_[id].set_out(a.begin); + return Frag(id, PatchList::Mk((id << 1) | 1)); + } +} + +// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) +Frag Compiler::Plus(Frag a, bool nongreedy) { + // a+ is just a* with a different entry point. + Frag f = Star(a, nongreedy); + return Frag(a.begin, f.end); +} + +// Given a fragment for a, returns a fragment for a? or a?? 
(if nongreedy) +Frag Compiler::Quest(Frag a, bool nongreedy) { + if (IsNoMatch(a)) + return Nop(); + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + PatchList pl; + if (nongreedy) { + inst_[id].InitAlt(0, a.begin); + pl = PatchList::Mk(id << 1); + } else { + inst_[id].InitAlt(a.begin, 0); + pl = PatchList::Mk((id << 1) | 1); + } + return Frag(id, PatchList::Append(inst_, pl, a.end)); +} + +// Returns a fragment for the byte range lo-hi. +Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + inst_[id].InitByteRange(lo, hi, foldcase, 0); + prog_->byte_inst_count_++; + prog_->MarkByteRange(lo, hi); + if (foldcase && lo <= 'z' && hi >= 'a') { + if (lo < 'a') + lo = 'a'; + if (hi > 'z') + hi = 'z'; + if (lo <= hi) + prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a'); + } + return Frag(id, PatchList::Mk(id << 1)); +} + +// Returns a no-op fragment. Sometimes unavoidable. +Frag Compiler::Nop() { + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + inst_[id].InitNop(0); + return Frag(id, PatchList::Mk(id << 1)); +} + +// Returns a fragment that signals a match. +Frag Compiler::Match(int32 match_id) { + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + inst_[id].InitMatch(match_id); + return Frag(id, nullPatchList); +} + +// Returns a fragment matching a particular empty-width op (like ^ or $) +Frag Compiler::EmptyWidth(EmptyOp empty) { + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + inst_[id].InitEmptyWidth(empty, 0); + if (empty & (kEmptyBeginLine|kEmptyEndLine)) + prog_->MarkByteRange('\n', '\n'); + if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { + int j; + for (int i = 0; i < 256; i = j) { + for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++) + ; + prog_->MarkByteRange(i, j-1); + } + } + return Frag(id, PatchList::Mk(id << 1)); +} + +// Given a fragment a, returns a fragment with capturing parens around a. 
+Frag Compiler::Capture(Frag a, int n) { + if (IsNoMatch(a)) + return NoMatch(); + int id = AllocInst(2); + if (id < 0) + return NoMatch(); + inst_[id].InitCapture(2*n, a.begin); + inst_[id+1].InitCapture(2*n+1, 0); + PatchList::Patch(inst_, a.end, id+1); + + return Frag(id, PatchList::Mk((id+1) << 1)); +} + +// A Rune is a name for a Unicode code point. +// Returns maximum rune encoded by UTF-8 sequence of length len. +static int MaxRune(int len) { + int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax) + if (len == 1) + b = 7; + else + b = 8-(len+1) + 6*(len-1); + return (1<::iterator it = rune_cache_.find(key); + if (it != rune_cache_.end()) + return it->second; + int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); + rune_cache_[key] = id; + return id; +} + +void Compiler::AddSuffix(int id) { + if (rune_range_.begin == 0) { + rune_range_.begin = id; + return; + } + + int alt = AllocInst(1); + if (alt < 0) { + rune_range_.begin = 0; + return; + } + inst_[alt].InitAlt(rune_range_.begin, id); + rune_range_.begin = alt; +} + +Frag Compiler::EndRange() { + return rune_range_; +} + +// Converts rune range lo-hi into a fragment that recognizes +// the bytes that would make up those runes in the current +// encoding (Latin 1 or UTF-8). +// This lets the machine work byte-by-byte even when +// using multibyte encodings. + +void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { + switch (encoding_) { + default: + case kEncodingUTF8: + AddRuneRangeUTF8(lo, hi, foldcase); + break; + case kEncodingLatin1: + AddRuneRangeLatin1(lo, hi, foldcase); + break; + } +} + +void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { + // Latin1 is easy: runes *are* bytes. + if (lo > hi || lo > 0xFF) + return; + if (hi > 0xFF) + hi = 0xFF; + AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); +} + +// Table describing how to make a UTF-8 matching machine +// for the rune range 80-10FFFF (Runeself-Runemax). 
+// This range happens frequently enough (for example /./ and /[^a-z]/) +// and the rune_cache_ map is slow enough that this is worth +// special handling. Makes compilation of a small expression +// with a dot in it about 10% faster. +// The * in the comments below mark whole sequences. +static struct ByteRangeProg { + int next; + int lo; + int hi; +} prog_80_10ffff[] = { + // Two-byte + { -1, 0x80, 0xBF, }, // 0: 80-BF + { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF* + + // Three-byte + { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF + { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF* + { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF + { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF* + + // Four-byte + { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF + { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF* + { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF + { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF* + { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF + { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF* +}; + +void Compiler::Add_80_10ffff() { + int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning + for (int i = 0; i < arraysize(prog_80_10ffff); i++) { + const ByteRangeProg& p = prog_80_10ffff[i]; + int next = 0; + if (p.next >= 0) + next = inst[p.next]; + inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next); + if ((p.lo & 0xC0) != 0x80) + AddSuffix(inst[i]); + } +} + +void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { + if (lo > hi) + return; + + // Pick off 80-10FFFF as a common special case + // that can bypass the slow rune_cache_. + if (lo == 0x80 && hi == 0x10ffff && !reversed_) { + Add_80_10ffff(); + return; + } + + // Split range into same-length sized ranges. + for (int i = 1; i < UTFmax; i++) { + Rune max = MaxRune(i); + if (lo <= max && max < hi) { + AddRuneRangeUTF8(lo, max, foldcase); + AddRuneRangeUTF8(max+1, hi, foldcase); + return; + } + } + + // ASCII range is always a special case. 
+ if (hi < Runeself) { + AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); + return; + } + + // Split range into sections that agree on leading bytes. + for (int i = 1; i < UTFmax; i++) { + uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence + if ((lo & ~m) != (hi & ~m)) { + if ((lo & m) != 0) { + AddRuneRangeUTF8(lo, lo|m, foldcase); + AddRuneRangeUTF8((lo|m)+1, hi, foldcase); + return; + } + if ((hi & m) != m) { + AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase); + AddRuneRangeUTF8(hi&~m, hi, foldcase); + return; + } + } + } + + // Finally. Generate byte matching equivalent for lo-hi. + uint8 ulo[UTFmax], uhi[UTFmax]; + int n = runetochar(reinterpret_cast(ulo), &lo); + int m = runetochar(reinterpret_cast(uhi), &hi); + (void)m; // USED(m) + DCHECK_EQ(n, m); + + int id = 0; + if (reversed_) { + for (int i = 0; i < n; i++) + id = RuneByteSuffix(ulo[i], uhi[i], false, id); + } else { + for (int i = n-1; i >= 0; i--) + id = RuneByteSuffix(ulo[i], uhi[i], false, id); + } + AddSuffix(id); +} + +// Should not be called. +Frag Compiler::Copy(Frag arg) { + // We're using WalkExponential; there should be no copying. + LOG(DFATAL) << "Compiler::Copy called!"; + failed_ = true; + return NoMatch(); +} + +// Visits a node quickly; called once WalkExponential has +// decided to cut this walk short. +Frag Compiler::ShortVisit(Regexp* re, Frag) { + failed_ = true; + return NoMatch(); +} + +// Called before traversing a node's children during the walk. +Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) { + // Cut off walk if we've already failed. + if (failed_) + *stop = true; + + return Frag(); // not used by caller +} + +Frag Compiler::Literal(Rune r, bool foldcase) { + switch (encoding_) { + default: + return Frag(); + + case kEncodingLatin1: + return ByteRange(r, r, foldcase); + + case kEncodingUTF8: { + if (r < Runeself) // Make common case fast. 
+ return ByteRange(r, r, foldcase); + uint8 buf[UTFmax]; + int n = runetochar(reinterpret_cast(buf), &r); + Frag f = ByteRange((uint8)buf[0], buf[0], false); + for (int i = 1; i < n; i++) + f = Cat(f, ByteRange((uint8)buf[i], buf[i], false)); + return f; + } + } +} + +// Called after traversing the node's children during the walk. +// Given their frags, build and return the frag for this re. +Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, + int nchild_frags) { + // If a child failed, don't bother going forward, especially + // since the child_frags might contain Frags with NULLs in them. + if (failed_) + return NoMatch(); + + // Given the child fragments, return the fragment for this node. + switch (re->op()) { + case kRegexpRepeat: + // Should not see; code at bottom of function will print error + break; + + case kRegexpNoMatch: + return NoMatch(); + + case kRegexpEmptyMatch: + return Nop(); + + case kRegexpHaveMatch: { + Frag f = Match(re->match_id()); + // Remember unanchored match to end of string. + if (anchor_ != RE2::ANCHOR_BOTH) + f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f)); + return f; + } + + case kRegexpConcat: { + Frag f = child_frags[0]; + for (int i = 1; i < nchild_frags; i++) + f = Cat(f, child_frags[i]); + return f; + } + + case kRegexpAlternate: { + Frag f = child_frags[0]; + for (int i = 1; i < nchild_frags; i++) + f = Alt(f, child_frags[i]); + return f; + } + + case kRegexpStar: + return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + + case kRegexpPlus: + return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + + case kRegexpQuest: + return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + + case kRegexpLiteral: + return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase); + + case kRegexpLiteralString: { + // Concatenation of literals. 
+ if (re->nrunes() == 0) + return Nop(); + Frag f; + for (int i = 0; i < re->nrunes(); i++) { + Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase); + if (i == 0) + f = f1; + else + f = Cat(f, f1); + } + return f; + } + + case kRegexpAnyChar: + BeginRange(); + AddRuneRange(0, Runemax, false); + return EndRange(); + + case kRegexpAnyByte: + return ByteRange(0x00, 0xFF, false); + + case kRegexpCharClass: { + CharClass* cc = re->cc(); + if (cc->empty()) { + // This can't happen. + LOG(DFATAL) << "No ranges in char class"; + failed_ = true; + return NoMatch(); + } + + // ASCII case-folding optimization: if the char class + // behaves the same on A-Z as it does on a-z, + // discard any ranges wholly contained in A-Z + // and mark the other ranges as foldascii. + // This reduces the size of a program for + // (?i)abc from 3 insts per letter to 1 per letter. + bool foldascii = cc->FoldsASCII(); + + // Character class is just a big OR of the different + // character ranges in the class. + BeginRange(); + for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { + // ASCII case-folding optimization (see above). + if (foldascii && 'A' <= i->lo && i->hi <= 'Z') + continue; + + // If this range contains all of A-Za-z or none of it, + // the fold flag is unnecessary; don't bother. + bool fold = foldascii; + if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo) + fold = false; + + AddRuneRange(i->lo, i->hi, fold); + } + return EndRange(); + } + + case kRegexpCapture: + // If this is a non-capturing parenthesis -- (?:foo) -- + // just use the inner expression. + if (re->cap() < 0) + return child_frags[0]; + return Capture(child_frags[0], re->cap()); + + case kRegexpBeginLine: + return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); + + case kRegexpEndLine: + return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); + + case kRegexpBeginText: + return EmptyWidth(reversed_ ? 
kEmptyEndText : kEmptyBeginText); + + case kRegexpEndText: + return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); + + case kRegexpWordBoundary: + return EmptyWidth(kEmptyWordBoundary); + + case kRegexpNoWordBoundary: + return EmptyWidth(kEmptyNonWordBoundary); + } + LOG(DFATAL) << "Missing case in Compiler: " << re->op(); + failed_ = true; + return NoMatch(); +} + +// Is this regexp required to start at the beginning of the text? +// Only approximate; can return false for complicated regexps like (\Aa|\Ab), +// but handles (\A(a|b)). Could use the Walker to write a more exact one. +static bool IsAnchorStart(Regexp** pre, int depth) { + Regexp* re = *pre; + Regexp* sub; + // The depth limit makes sure that we don't overflow + // the stack on a deeply nested regexp. As the comment + // above says, IsAnchorStart is conservative, so returning + // a false negative is okay. The exact limit is somewhat arbitrary. + if (re == NULL || depth >= 4) + return false; + switch (re->op()) { + default: + break; + case kRegexpConcat: + if (re->nsub() > 0) { + sub = re->sub()[0]->Incref(); + if (IsAnchorStart(&sub, depth+1)) { + Regexp** subcopy = new Regexp*[re->nsub()]; + subcopy[0] = sub; // already have reference + for (int i = 1; i < re->nsub(); i++) + subcopy[i] = re->sub()[i]->Incref(); + *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); + delete[] subcopy; + re->Decref(); + return true; + } + sub->Decref(); + } + break; + case kRegexpCapture: + sub = re->sub()[0]->Incref(); + if (IsAnchorStart(&sub, depth+1)) { + *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); + re->Decref(); + return true; + } + sub->Decref(); + break; + case kRegexpBeginText: + *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); + re->Decref(); + return true; + } + return false; +} + +// Is this regexp required to start at the end of the text? +// Only approximate; can return false for complicated regexps like (a\z|b\z), +// but handles ((a|b)\z). 
Could use the Walker to write a more exact one. +static bool IsAnchorEnd(Regexp** pre, int depth) { + Regexp* re = *pre; + Regexp* sub; + // The depth limit makes sure that we don't overflow + // the stack on a deeply nested regexp. As the comment + // above says, IsAnchorEnd is conservative, so returning + // a false negative is okay. The exact limit is somewhat arbitrary. + if (re == NULL || depth >= 4) + return false; + switch (re->op()) { + default: + break; + case kRegexpConcat: + if (re->nsub() > 0) { + sub = re->sub()[re->nsub() - 1]->Incref(); + if (IsAnchorEnd(&sub, depth+1)) { + Regexp** subcopy = new Regexp*[re->nsub()]; + subcopy[re->nsub() - 1] = sub; // already have reference + for (int i = 0; i < re->nsub() - 1; i++) + subcopy[i] = re->sub()[i]->Incref(); + *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); + delete[] subcopy; + re->Decref(); + return true; + } + sub->Decref(); + } + break; + case kRegexpCapture: + sub = re->sub()[0]->Incref(); + if (IsAnchorEnd(&sub, depth+1)) { + *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); + re->Decref(); + return true; + } + sub->Decref(); + break; + case kRegexpEndText: + *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); + re->Decref(); + return true; + } + return false; +} + +void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, + RE2::Anchor anchor) { + prog_->set_flags(flags); + + if (flags & Regexp::Latin1) + encoding_ = kEncodingLatin1; + max_mem_ = max_mem; + if (max_mem <= 0) { + max_inst_ = 100000; // more than enough + } else if (max_mem <= sizeof(Prog)) { + // No room for anything. + max_inst_ = 0; + } else { + int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); + // Limit instruction count so that inst->id() fits nicely in an int. + // SparseArray also assumes that the indices (inst->id()) are ints. + // The call to WalkExponential uses 2*max_inst_ below, + // and other places in the code use 2 or 3 * prog->size(). 
+ // Limiting to 2^24 should avoid overflow in those places. + // (The point of allowing more than 32 bits of memory is to + // have plenty of room for the DFA states, not to use it up + // on the program.) + if (m >= 1<<24) + m = 1<<24; + + // Inst imposes its own limit (currently bigger than 2^24 but be safe). + if (m > Prog::Inst::kMaxInst) + m = Prog::Inst::kMaxInst; + + max_inst_ = m; + } + + anchor_ = anchor; +} + +// Compiles re, returning program. +// Caller is responsible for deleting prog_. +// If reversed is true, compiles a program that expects +// to run over the input string backward (reverses all concatenations). +// The reversed flag is also recorded in the returned program. +Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { + Compiler c; + + c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */); + c.reversed_ = reversed; + + // Simplify to remove things like counted repetitions + // and character classes like \d. + Regexp* sre = re->Simplify(); + if (sre == NULL) + return NULL; + + // Record whether prog is anchored, removing the anchors. + // (They get in the way of other optimizations.) + bool is_anchor_start = IsAnchorStart(&sre, 0); + bool is_anchor_end = IsAnchorEnd(&sre, 0); + + // Generate fragment for entire regexp. + Frag f = c.WalkExponential(sre, Frag(), 2*c.max_inst_); + sre->Decref(); + if (c.failed_) + return NULL; + + // Success! Finish by putting Match node at end, and record start. + // Turn off c.reversed_ (if it is set) to force the remaining concatenations + // to behave normally. + c.reversed_ = false; + Frag all = c.Cat(f, c.Match(0)); + c.prog_->set_start(all.begin); + + if (reversed) { + c.prog_->set_anchor_start(is_anchor_end); + c.prog_->set_anchor_end(is_anchor_start); + } else { + c.prog_->set_anchor_start(is_anchor_start); + c.prog_->set_anchor_end(is_anchor_end); + } + + // Also create unanchored version, which starts with a .*? loop. 
+ if (c.prog_->anchor_start()) { + c.prog_->set_start_unanchored(c.prog_->start()); + } else { + Frag unanchored = c.Cat(c.DotStar(), all); + c.prog_->set_start_unanchored(unanchored.begin); + } + + c.prog_->set_reversed(reversed); + + // Hand ownership of prog_ to caller. + return c.Finish(); +} + +Prog* Compiler::Finish() { + if (failed_) + return NULL; + + if (prog_->start() == 0 && prog_->start_unanchored() == 0) { + // No possible matches; keep Fail instruction only. + inst_len_ = 1; + } + + // Trim instruction to minimum array and transfer to Prog. + Trim(); + prog_->inst_ = inst_; + prog_->size_ = inst_len_; + inst_ = NULL; + + // Compute byte map. + prog_->ComputeByteMap(); + + prog_->Optimize(); + + // Record remaining memory for DFA. + if (max_mem_ <= 0) { + prog_->set_dfa_mem(1<<20); + } else { + int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst); + if (m < 0) + m = 0; + prog_->set_dfa_mem(m); + } + + Prog* p = prog_; + prog_ = NULL; + return p; +} + +// Converts Regexp to Prog. +Prog* Regexp::CompileToProg(int64 max_mem) { + return Compiler::Compile(this, false, max_mem); +} + +Prog* Regexp::CompileToReverseProg(int64 max_mem) { + return Compiler::Compile(this, true, max_mem); +} + +Frag Compiler::DotStar() { + return Star(ByteRange(0x00, 0xff, false), true); +} + +// Compiles RE set to Prog. +Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, + Regexp* re) { + Compiler c; + + Regexp::ParseFlags pf = static_cast(options.ParseFlags()); + c.Setup(pf, options.max_mem(), anchor); + + // Compile alternation of fragments. + Frag all = c.WalkExponential(re, Frag(), 2*c.max_inst_); + re->Decref(); + if (c.failed_) + return NULL; + + if (anchor == RE2::UNANCHORED) { + // The trailing .* was added while handling kRegexpHaveMatch. + // We just have to add the leading one. 
+ all = c.Cat(c.DotStar(), all); + } + + c.prog_->set_start(all.begin); + c.prog_->set_start_unanchored(all.begin); + c.prog_->set_anchor_start(true); + c.prog_->set_anchor_end(true); + + Prog* prog = c.Finish(); + if (prog == NULL) + return NULL; + + // Make sure DFA has enough memory to operate, + // since we're not going to fall back to the NFA. + bool failed; + StringPiece sp = "hello, world"; + prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, + NULL, &failed, NULL); + if (failed) { + delete prog; + return NULL; + } + + return prog; +} + +Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, + Regexp* re) { + return Compiler::CompileSet(options, anchor, re); +} + +} // namespace re2 diff --git a/outside/re2/re2/dfa.cc b/outside/re2/re2/dfa.cc new file mode 100644 index 000000000..1cfde4c97 --- /dev/null +++ b/outside/re2/re2/dfa.cc @@ -0,0 +1,2115 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// A DFA (deterministic finite automaton)-based regular expression search. +// +// The DFA search has two main parts: the construction of the automaton, +// which is represented by a graph of State structures, and the execution +// of the automaton over a given input string. +// +// The basic idea is that the State graph is constructed so that the +// execution can simply start with a state s, and then for each byte c in +// the input string, execute "s = s->next[c]", checking at each point whether +// the current s represents a matching state. +// +// The simple explanation just given does convey the essence of this code, +// but it omits the details of how the State graph gets constructed as well +// as some performance-driven optimizations to the execution of the automaton. +// All these details are explained in the comments for the code following +// the definition of class DFA. 
+// +// See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent. + +#include "re2/prog.h" +#include "re2/stringpiece.h" +#include "util/atomicops.h" +#include "util/flags.h" +#include "util/sparse_set.h" + +DEFINE_bool(re2_dfa_bail_when_slow, true, + "Whether the RE2 DFA should bail out early " + "if the NFA would be faster (for testing)."); + +namespace re2 { + +#if !defined(__linux__) /* only Linux seems to have memrchr */ +static void* memrchr(const void* s, int c, size_t n) { + const unsigned char* p = (const unsigned char*)s; + for (p += n; n > 0; n--) + if (*--p == c) + return (void*)p; + + return NULL; +} +#endif + +// Changing this to true compiles in prints that trace execution of the DFA. +// Generates a lot of output -- only useful for debugging. +static const bool DebugDFA = false; + +// A DFA implementation of a regular expression program. +// Since this is entirely a forward declaration mandated by C++, +// some of the comments here are better understood after reading +// the comments in the sections that follow the DFA definition. +class DFA { + public: + DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem); + ~DFA(); + bool ok() const { return !init_failed_; } + Prog::MatchKind kind() { return kind_; } + + // Searches for the regular expression in text, which is considered + // as a subsection of context for the purposes of interpreting flags + // like ^ and $ and \A and \z. + // Returns whether a match was found. + // If a match is found, sets *ep to the end point of the best match in text. + // If "anchored", the match must begin at the start of text. + // If "want_earliest_match", the match that ends first is used, not + // necessarily the best one. + // If "run_forward" is true, the DFA runs from text.begin() to text.end(). + // If it is false, the DFA runs from text.end() to text.begin(), + // returning the leftmost end of the match instead of the rightmost one. 
+ // If the DFA cannot complete the search (for example, if it is out of + // memory), it sets *failed and returns false. + bool Search(const StringPiece& text, const StringPiece& context, + bool anchored, bool want_earliest_match, bool run_forward, + bool* failed, const char** ep, vector* matches); + + // Builds out all states for the entire DFA. FOR TESTING ONLY + // Returns number of states. + int BuildAllStates(); + + // Computes min and max for matching strings. Won't return strings + // bigger than maxlen. + bool PossibleMatchRange(string* min, string* max, int maxlen); + + // These data structures are logically private, but C++ makes it too + // difficult to mark them as such. + class Workq; + class RWLocker; + class StateSaver; + + // A single DFA state. The DFA is represented as a graph of these + // States, linked by the next_ pointers. If in state s and reading + // byte c, the next state should be s->next_[c]. + struct State { + inline bool IsMatch() const { return flag_ & kFlagMatch; } + void SaveMatch(vector* v); + + int* inst_; // Instruction pointers in the state. + int ninst_; // # of inst_ pointers. + uint flag_; // Empty string bitfield flags in effect on the way + // into this state, along with kFlagMatch if this + // is a matching state. + State** next_; // Outgoing arrows from State, + // one per input byte class + }; + + enum { + kByteEndText = 256, // imaginary byte at end of text + + kFlagEmptyMask = 0xFFF, // State.flag_: bits holding kEmptyXXX flags + kFlagMatch = 0x1000, // State.flag_: this is a matching state + kFlagLastWord = 0x2000, // State.flag_: last byte was a word char + kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left + }; + +#ifndef STL_MSVC + // STL function structures for use with unordered_set. 
+ struct StateEqual { + bool operator()(const State* a, const State* b) const { + if (a == b) + return true; + if (a == NULL || b == NULL) + return false; + if (a->ninst_ != b->ninst_) + return false; + if (a->flag_ != b->flag_) + return false; + for (int i = 0; i < a->ninst_; i++) + if (a->inst_[i] != b->inst_[i]) + return false; + return true; // they're equal + } + }; +#endif // STL_MSVC + struct StateHash { + size_t operator()(const State* a) const { + if (a == NULL) + return 0; + const char* s = reinterpret_cast(a->inst_); + int len = a->ninst_ * sizeof a->inst_[0]; + if (sizeof(size_t) == sizeof(uint32)) + return Hash32StringWithSeed(s, len, a->flag_); + else + return Hash64StringWithSeed(s, len, a->flag_); + } +#ifdef STL_MSVC + // Less than operator. + bool operator()(const State* a, const State* b) const { + if (a == b) + return false; + if (a == NULL || b == NULL) + return a == NULL; + if (a->ninst_ != b->ninst_) + return a->ninst_ < b->ninst_; + if (a->flag_ != b->flag_) + return a->flag_ < b->flag_; + for (int i = 0; i < a->ninst_; ++i) + if (a->inst_[i] != b->inst_[i]) + return a->inst_[i] < b->inst_[i]; + return false; // they're equal + } + // The two public members are required by msvc. 4 and 8 are default values. + // Reference: http://msdn.microsoft.com/en-us/library/1s1byw77.aspx + static const size_t bucket_size = 4; + static const size_t min_buckets = 8; +#endif // STL_MSVC + }; + +#ifdef STL_MSVC + typedef unordered_set StateSet; +#else // !STL_MSVC + typedef unordered_set StateSet; +#endif // STL_MSVC + + + private: + // Special "firstbyte" values for a state. (Values >= 0 denote actual bytes.) + enum { + kFbUnknown = -1, // No analysis has been performed. + kFbMany = -2, // Many bytes will lead out of this state. + kFbNone = -3, // No bytes lead out of this state. + }; + + enum { + // Indices into start_ for unanchored searches. + // Add kStartAnchored for anchored searches. 
+ kStartBeginText = 0, // text at beginning of context + kStartBeginLine = 2, // text at beginning of line + kStartAfterWordChar = 4, // text follows a word character + kStartAfterNonWordChar = 6, // text follows non-word character + kMaxStart = 8, + + kStartAnchored = 1, + }; + + // Resets the DFA State cache, flushing all saved State* information. + // Releases and reacquires cache_mutex_ via cache_lock, so any + // State* existing before the call are not valid after the call. + // Use a StateSaver to preserve important states across the call. + // cache_mutex_.r <= L < mutex_ + // After: cache_mutex_.w <= L < mutex_ + void ResetCache(RWLocker* cache_lock); + + // Looks up and returns the State corresponding to a Workq. + // L >= mutex_ + State* WorkqToCachedState(Workq* q, uint flag); + + // Looks up and returns a State matching the inst, ninst, and flag. + // L >= mutex_ + State* CachedState(int* inst, int ninst, uint flag); + + // Clear the cache entirely. + // Must hold cache_mutex_.w or be in destructor. + void ClearCache(); + + // Converts a State into a Workq: the opposite of WorkqToCachedState. + // L >= mutex_ + static void StateToWorkq(State* s, Workq* q); + + // Runs a State on a given byte, returning the next state. + State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_ + State* RunStateOnByte(State*, int); // L >= mutex_ + + // Runs a Workq on a given byte followed by a set of empty-string flags, + // producing a new Workq in nq. If a match instruction is encountered, + // sets *ismatch to true. + // L >= mutex_ + void RunWorkqOnByte(Workq* q, Workq* nq, + int c, uint flag, bool* ismatch, + Prog::MatchKind kind, + int new_byte_loop); + + // Runs a Workq on a set of empty-string flags, producing a new Workq in nq. + // L >= mutex_ + void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint flag); + + // Adds the instruction id to the Workq, following empty arrows + // according to flag. 
+ // L >= mutex_ + void AddToQueue(Workq* q, int id, uint flag); + + // For debugging, returns a text representation of State. + static string DumpState(State* state); + + // For debugging, returns a text representation of a Workq. + static string DumpWorkq(Workq* q); + + // Search parameters + struct SearchParams { + SearchParams(const StringPiece& text, const StringPiece& context, + RWLocker* cache_lock) + : text(text), context(context), + anchored(false), + want_earliest_match(false), + run_forward(false), + start(NULL), + firstbyte(kFbUnknown), + cache_lock(cache_lock), + failed(false), + ep(NULL), + matches(NULL) { } + + StringPiece text; + StringPiece context; + bool anchored; + bool want_earliest_match; + bool run_forward; + State* start; + int firstbyte; + RWLocker *cache_lock; + bool failed; // "out" parameter: whether search gave up + const char* ep; // "out" parameter: end pointer for match + vector* matches; + + private: + DISALLOW_EVIL_CONSTRUCTORS(SearchParams); + }; + + // Before each search, the parameters to Search are analyzed by + // AnalyzeSearch to determine the state in which to start and the + // "firstbyte" for that state, if any. + struct StartInfo { + StartInfo() : start(NULL), firstbyte(kFbUnknown) { } + State* start; + volatile int firstbyte; + }; + + // Fills in params->start and params->firstbyte using + // the other search parameters. Returns true on success, + // false on failure. + // cache_mutex_.r <= L < mutex_ + bool AnalyzeSearch(SearchParams* params); + bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint flags); + + // The generic search loop, inlined to create specialized versions. + // cache_mutex_.r <= L < mutex_ + // Might unlock and relock cache_mutex_ via params->cache_lock. + inline bool InlinedSearchLoop(SearchParams* params, + bool have_firstbyte, + bool want_earliest_match, + bool run_forward); + + // The specialized versions of InlinedSearchLoop. 
The three letters + // at the ends of the name denote the true/false values used as the + // last three parameters of InlinedSearchLoop. + // cache_mutex_.r <= L < mutex_ + // Might unlock and relock cache_mutex_ via params->cache_lock. + bool SearchFFF(SearchParams* params); + bool SearchFFT(SearchParams* params); + bool SearchFTF(SearchParams* params); + bool SearchFTT(SearchParams* params); + bool SearchTFF(SearchParams* params); + bool SearchTFT(SearchParams* params); + bool SearchTTF(SearchParams* params); + bool SearchTTT(SearchParams* params); + + // The main search loop: calls an appropriate specialized version of + // InlinedSearchLoop. + // cache_mutex_.r <= L < mutex_ + // Might unlock and relock cache_mutex_ via params->cache_lock. + bool FastSearchLoop(SearchParams* params); + + // For debugging, a slow search loop that calls InlinedSearchLoop + // directly -- because the booleans passed are not constants, the + // loop is not specialized like the SearchFFF etc. versions, so it + // runs much more slowly. Useful only for debugging. + // cache_mutex_.r <= L < mutex_ + // Might unlock and relock cache_mutex_ via params->cache_lock. + bool SlowSearchLoop(SearchParams* params); + + // Looks up bytes in bytemap_ but handles case c == kByteEndText too. + int ByteMap(int c) { + if (c == kByteEndText) + return prog_->bytemap_range(); + return prog_->bytemap()[c]; + } + + // Constant after initialization. + Prog* prog_; // The regular expression program to run. + Prog::MatchKind kind_; // The kind of DFA. + int start_unanchored_; // start of unanchored program + bool init_failed_; // initialization failed (out of memory) + + Mutex mutex_; // mutex_ >= cache_mutex_.r + + // Scratch areas, protected by mutex_. + Workq* q0_; // Two pre-allocated work queues. + Workq* q1_; + int* astack_; // Pre-allocated stack for AddToQueue + int nastack_; + + // State* cache. 
Many threads use and add to the cache simultaneously, + // holding cache_mutex_ for reading and mutex_ (above) when adding. + // If the cache fills and needs to be discarded, the discarding is done + // while holding cache_mutex_ for writing, to avoid interrupting other + // readers. Any State* pointers are only valid while cache_mutex_ + // is held. + Mutex cache_mutex_; + int64 mem_budget_; // Total memory budget for all States. + int64 state_budget_; // Amount of memory remaining for new States. + StateSet state_cache_; // All States computed so far. + StartInfo start_[kMaxStart]; + bool cache_warned_; // have printed to LOG(INFO) about the cache +}; + +// Shorthand for casting to uint8*. +static inline const uint8* BytePtr(const void* v) { + return reinterpret_cast(v); +} + +// Work queues + +// Marks separate thread groups of different priority +// in the work queue when in leftmost-longest matching mode. +#define Mark (-1) + +// Internally, the DFA uses a sparse array of +// program instruction pointers as a work queue. +// In leftmost longest mode, marks separate sections +// of workq that started executing at different +// locations in the string (earlier locations first). +class DFA::Workq : public SparseSet { + public: + // Constructor: n is number of normal slots, maxmark number of mark slots. 
+ Workq(int n, int maxmark) : + SparseSet(n+maxmark), + n_(n), + maxmark_(maxmark), + nextmark_(n), + last_was_mark_(true) { + } + + bool is_mark(int i) { return i >= n_; } + + int maxmark() { return maxmark_; } + + void clear() { + SparseSet::clear(); + nextmark_ = n_; + } + + void mark() { + if (last_was_mark_) + return; + last_was_mark_ = false; + SparseSet::insert_new(nextmark_++); + } + + int size() { + return n_ + maxmark_; + } + + void insert(int id) { + if (contains(id)) + return; + insert_new(id); + } + + void insert_new(int id) { + last_was_mark_ = false; + SparseSet::insert_new(id); + } + + private: + int n_; // size excluding marks + int maxmark_; // maximum number of marks + int nextmark_; // id of next mark + bool last_was_mark_; // last inserted was mark + DISALLOW_EVIL_CONSTRUCTORS(Workq); +}; + +DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem) + : prog_(prog), + kind_(kind), + init_failed_(false), + q0_(NULL), + q1_(NULL), + astack_(NULL), + mem_budget_(max_mem), + cache_warned_(false) { + if (DebugDFA) + fprintf(stderr, "\nkind %d\n%s\n", (int)kind_, prog_->DumpUnanchored().c_str()); + int nmark = 0; + start_unanchored_ = 0; + if (kind_ == Prog::kLongestMatch) { + nmark = prog->size(); + start_unanchored_ = prog->start_unanchored(); + } + nastack_ = 2 * prog->size() + nmark; + + // Account for space needed for DFA, q0, q1, astack. + mem_budget_ -= sizeof(DFA); + mem_budget_ -= (prog_->size() + nmark) * + (sizeof(int)+sizeof(int)) * 2; // q0, q1 + mem_budget_ -= nastack_ * sizeof(int); // astack + if (mem_budget_ < 0) { // fixed overhead alone exceeds max_mem + LOG(INFO) << StringPrintf("DFA out of memory: prog size %d mem %lld", + prog_->size(), max_mem); // int, int64 + init_failed_ = true; + return; + } + + state_budget_ = mem_budget_; + + // Make sure there is a reasonable amount of working room left. + // At minimum, the search requires room for two states in order + // to limp along, restarting frequently.
We'll get better performance + // if there is room for a larger number of states, say 20. + int64 one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(int) + + (prog_->bytemap_range()+1)*sizeof(State*); + if (state_budget_ < 20*one_state) { + LOG(INFO) << StringPrintf("DFA out of memory: prog size %d mem %lld", + prog_->size(), max_mem); // int, int64 + init_failed_ = true; + return; + } + + q0_ = new Workq(prog->size(), nmark); + q1_ = new Workq(prog->size(), nmark); + astack_ = new int[nastack_]; +} + +DFA::~DFA() { + delete q0_; + delete q1_; + delete[] astack_; + ClearCache(); +} + +// In the DFA state graph, s->next[c] == NULL means that the +// state has not yet been computed and needs to be. We need +// a different special value to signal that s->next[c] is a +// state that can never lead to a match (and thus the search +// can be called off). Hence DeadState. +#define DeadState reinterpret_cast(1) + +// Signals that the rest of the string matches no matter what it is. +#define FullMatchState reinterpret_cast(2) + +#define SpecialStateMax FullMatchState + +// Debugging printouts + +// For debugging, returns a string representation of the work queue. +string DFA::DumpWorkq(Workq* q) { + string s; + const char* sep = ""; + for (DFA::Workq::iterator it = q->begin(); it != q->end(); ++it) { + if (q->is_mark(*it)) { + StringAppendF(&s, "|"); + sep = ""; + } else { + StringAppendF(&s, "%s%d", sep, *it); + sep = ","; + } + } + return s; +} + +// For debugging, returns a string representation of the state.
+string DFA::DumpState(State* state) { + if (state == NULL) + return "_"; + if (state == DeadState) + return "X"; + if (state == FullMatchState) + return "*"; + string s; + const char* sep = ""; + StringAppendF(&s, "(%p)", state); + for (int i = 0; i < state->ninst_; i++) { + if (state->inst_[i] == Mark) { + StringAppendF(&s, "|"); + sep = ""; + } else { + StringAppendF(&s, "%s%d", sep, state->inst_[i]); + sep = ","; + } + } + StringAppendF(&s, " flag=%#x", state->flag_); + return s; +} + +////////////////////////////////////////////////////////////////////// +// +// DFA state graph construction. +// +// The DFA state graph is a heavily-linked collection of State* structures. +// The state_cache_ is a set of all the State structures ever allocated, +// so that if the same state is reached by two different paths, +// the same State structure can be used. This reduces allocation +// requirements and also avoids duplication of effort across the two +// identical states. +// +// A State is defined by an ordered list of instruction ids and a flag word. +// +// The choice of an ordered list of instructions differs from a typical +// textbook DFA implementation, which would use an unordered set. +// Textbook descriptions, however, only care about whether +// the DFA matches, not where it matches in the text. To decide where the +// DFA matches, we need to mimic the behavior of the dominant backtracking +// implementations like PCRE, which try one possible regular expression +// execution, then another, then another, stopping when one of them succeeds. +// The DFA execution tries these many executions in parallel, representing +// each by an instruction id. These pointers are ordered in the State.inst_ +// list in the same order that the executions would happen in a backtracking +// search: if a match is found during execution of inst_[2], inst_[i] for i>=3 +// can be discarded. 
+// +// Textbooks also typically do not consider context-aware empty string operators +// like ^ or $. These are handled by the flag word, which specifies the set +// of empty-string operators that should be matched when executing at the +// current text position. These flag bits are defined in prog.h. +// The flag word also contains two DFA-specific bits: kFlagMatch if the state +// is a matching state (one that reached a kInstMatch in the program) +// and kFlagLastWord if the last processed byte was a word character, for the +// implementation of \B and \b. +// +// The flag word also contains, shifted up 16 bits, the bits looked for by +// any kInstEmptyWidth instructions in the state. These provide a useful +// summary indicating when new flags might be useful. +// +// The permanent representation of a State's instruction ids is just an array, +// but while a state is being analyzed, these instruction ids are represented +// as a Workq, which is an array that allows iteration in insertion order. + +// NOTE(rsc): The choice of State construction determines whether the DFA +// mimics backtracking implementations (so-called leftmost first matching) or +// traditional DFA implementations (so-called leftmost longest matching as +// prescribed by POSIX). This implementation chooses to mimic the +// backtracking implementations, because we want to replace PCRE. To get +// POSIX behavior, the states would need to be considered not as a simple +// ordered list of instruction ids, but as a list of unordered sets of instruction +// ids. A match by a state in one set would inhibit the running of sets +// farther down the list but not other instruction ids in the same set. Each +// set would correspond to matches beginning at a given point in the string. +// This is implemented by separating different sets with Mark pointers. + +// Looks in the State cache for a State matching q, flag. +// If one is found, returns it. 
If one is not found, allocates one, +// inserts it in the cache, and returns it. +DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) { + if (DEBUG_MODE) + mutex_.AssertHeld(); + + // Construct array of instruction ids for the new state. + // Only ByteRange, EmptyWidth, and Match instructions are useful to keep: + // those are the only operators with any effect in + // RunWorkqOnEmptyString or RunWorkqOnByte. + int* inst = new int[q->size()]; + int n = 0; + uint needflags = 0; // flags needed by kInstEmptyWidth instructions + bool sawmatch = false; // whether queue contains guaranteed kInstMatch + bool sawmark = false; // whether queue contains a Mark + if (DebugDFA) + fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag); + for (Workq::iterator it = q->begin(); it != q->end(); ++it) { + int id = *it; + if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id))) + break; + if (q->is_mark(id)) { + if (n > 0 && inst[n-1] != Mark) { + sawmark = true; + inst[n++] = Mark; + } + continue; + } + Prog::Inst* ip = prog_->inst(id); + switch (ip->opcode()) { + case kInstAltMatch: + // This state will continue to a match no matter what + // the rest of the input is. If it is the highest priority match + // being considered, return the special FullMatchState + // to indicate that it's all matches from here out. + if (kind_ != Prog::kManyMatch && + (kind_ != Prog::kFirstMatch || + (it == q->begin() && ip->greedy(prog_))) && + (kind_ != Prog::kLongestMatch || !sawmark) && + (flag & kFlagMatch)) { + delete[] inst; + if (DebugDFA) + fprintf(stderr, " -> FullMatchState\n"); + return FullMatchState; + } + // Fall through. + case kInstByteRange: // These are useful. 
+ case kInstEmptyWidth: + case kInstMatch: + case kInstAlt: // Not useful, but necessary [*] + inst[n++] = *it; + if (ip->opcode() == kInstEmptyWidth) + needflags |= ip->empty(); + if (ip->opcode() == kInstMatch && !prog_->anchor_end()) + sawmatch = true; + break; + + default: // The rest are not. + break; + } + + // [*] kInstAlt would seem useless to record in a state, since + // we've already followed both its arrows and saved all the + // interesting states we can reach from there. The problem + // is that one of the empty-width instructions might lead + // back to the same kInstAlt (if an empty-width operator is starred), + // producing a different evaluation order depending on whether + // we keep the kInstAlt to begin with. Sigh. + // A specific case that this affects is /(^|a)+/ matching "a". + // If we don't save the kInstAlt, we will match the whole "a" (0,1) + // but in fact the correct leftmost-first match is the leading "" (0,0). + } + DCHECK_LE(n, q->size()); + if (n > 0 && inst[n-1] == Mark) + n--; + + // If there are no empty-width instructions waiting to execute, + // then the extra flag bits will not be used, so there is no + // point in saving them. (Discarding them reduces the number + // of distinct states.) + if (needflags == 0) + flag &= kFlagMatch; + + // NOTE(rsc): The code above cannot do flag &= needflags, + // because if the right flags were present to pass the current + // kInstEmptyWidth instructions, new kInstEmptyWidth instructions + // might be reached that in turn need different flags. + // The only sure thing is that if there are no kInstEmptyWidth + // instructions at all, no flags will be needed. + // We could do the extra work to figure out the full set of + // possibly needed flags by exploring past the kInstEmptyWidth + // instructions, but the check above -- are any flags needed + // at all? -- handles the most common case. 
More fine-grained + // analysis can only be justified by measurements showing that + // too many redundant states are being allocated. + + // If there are no Insts in the list, it's a dead state, + // which is useful to signal with a special pointer so that + // the execution loop can stop early. This is only okay + // if the state is *not* a matching state. + if (n == 0 && flag == 0) { + delete[] inst; + if (DebugDFA) + fprintf(stderr, " -> DeadState\n"); + return DeadState; + } + + // If we're in longest match mode, the state is a sequence of + // unordered state sets separated by Marks. Sort each set + // to canonicalize, to reduce the number of distinct sets stored. + if (kind_ == Prog::kLongestMatch) { + int* ip = inst; + int* ep = ip + n; + while (ip < ep) { + int* markp = ip; + while (markp < ep && *markp != Mark) + markp++; + sort(ip, markp); + if (markp < ep) + markp++; + ip = markp; + } + } + + // Save the needed empty-width flags in the top bits for use later. + flag |= needflags << kFlagNeedShift; + + State* state = CachedState(inst, n, flag); + delete[] inst; + return state; +} + +// Looks in the State cache for a State matching inst, ninst, flag. +// If one is found, returns it. If one is not found, allocates one, +// inserts it in the cache, and returns it. +DFA::State* DFA::CachedState(int* inst, int ninst, uint flag) { + if (DEBUG_MODE) + mutex_.AssertHeld(); + + // Look in the cache for a pre-existing state. + State state = { inst, ninst, flag, NULL }; + StateSet::iterator it = state_cache_.find(&state); + if (it != state_cache_.end()) { + if (DebugDFA) + fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str()); + return *it; + } + + // Must have enough memory for new state. + // In addition to what we're going to allocate, + // the state cache hash table seems to incur about 32 bytes per + // State*, empirically. 
+ const int kStateCacheOverhead = 32; + int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot + int mem = sizeof(State) + nnext*sizeof(State*) + ninst*sizeof(int); + if (mem_budget_ < mem + kStateCacheOverhead) { + mem_budget_ = -1; + return NULL; + } + mem_budget_ -= mem + kStateCacheOverhead; + + // Allocate new state, along with room for next and inst. + char* space = new char[mem]; + State* s = reinterpret_cast(space); + s->next_ = reinterpret_cast(s + 1); + s->inst_ = reinterpret_cast(s->next_ + nnext); + memset(s->next_, 0, nnext*sizeof s->next_[0]); + memmove(s->inst_, inst, ninst*sizeof s->inst_[0]); + s->ninst_ = ninst; + s->flag_ = flag; + if (DebugDFA) + fprintf(stderr, " -> %s\n", DumpState(s).c_str()); + + // Put state in cache and return it. + state_cache_.insert(s); + return s; +} + +// Clear the cache. Must hold cache_mutex_.w or be in destructor. +void DFA::ClearCache() { + // In case state_cache_ doesn't support deleting entries + // during iteration, copy into a vector and then delete. + vector v; + v.reserve(state_cache_.size()); + for (StateSet::iterator it = state_cache_.begin(); + it != state_cache_.end(); ++it) + v.push_back(*it); + state_cache_.clear(); + for (int i = 0; i < v.size(); i++) + delete[] reinterpret_cast(v[i]); +} + +// Copies insts in state s to the work queue q. +void DFA::StateToWorkq(State* s, Workq* q) { + q->clear(); + for (int i = 0; i < s->ninst_; i++) { + if (s->inst_[i] == Mark) + q->mark(); + else + q->insert_new(s->inst_[i]); + } +} + +// Adds ip to the work queue, following empty arrows according to flag +// and expanding kInstAlt instructions (two-target gotos). +void DFA::AddToQueue(Workq* q, int id, uint flag) { + + // Use astack_ to hold our stack of states yet to process. 
+ // It is sized to have room for nastack_ == 2*prog->size() + nmark + // instructions, which is enough: each instruction can be + // processed by the switch below only once, and the processing + // pushes at most two instructions plus maybe a mark. + // (If we're using marks, nmark == prog->size(); otherwise nmark == 0.) + int* stk = astack_; + int nstk = 0; + + stk[nstk++] = id; + while (nstk > 0) { + DCHECK_LE(nstk, nastack_); + id = stk[--nstk]; + + if (id == Mark) { + q->mark(); + continue; + } + + if (id == 0) + continue; + + // If ip is already on the queue, nothing to do. + // Otherwise add it. We don't actually keep all the ones + // that get added -- for example, kInstAlt is ignored + // when on a work queue -- but adding all ip's here + // increases the likelihood of q->contains(id), + // reducing the amount of duplicated work. + if (q->contains(id)) + continue; + q->insert_new(id); + + // Process instruction. + Prog::Inst* ip = prog_->inst(id); + switch (ip->opcode()) { + case kInstFail: // can't happen: discarded above + break; + + case kInstByteRange: // just save these on the queue + case kInstMatch: + break; + + case kInstCapture: // DFA treats captures as no-ops. + case kInstNop: + stk[nstk++] = ip->out(); + break; + + case kInstAlt: // two choices: expand both, in order + case kInstAltMatch: + // Want to visit out then out1, so push on stack in reverse order. + // If this instruction is the [00-FF]* loop at the beginning of + // a leftmost-longest unanchored search, separate out from out1 + // with a Mark, so that out1's threads (which will start farther + // to the right in the string being searched) are lower priority + // than the current ones.
+ stk[nstk++] = ip->out1(); + if (q->maxmark() > 0 && + id == prog_->start_unanchored() && id != prog_->start()) + stk[nstk++] = Mark; + stk[nstk++] = ip->out(); + break; + + case kInstEmptyWidth: + if ((ip->empty() & flag) == ip->empty()) + stk[nstk++] = ip->out(); + break; + } + } +} + +// Running of work queues. In the work queue, order matters: +// the queue is sorted in priority order. If instruction i comes before j, +// then the instructions that i produces during the run must come before +// the ones that j produces. In order to keep this invariant, all the +// work queue runners have to take an old queue to process and then +// also a new queue to fill in. It's not acceptable to add to the end of +// an existing queue, because new instructions will not end up in the +// correct position. + +// Runs the work queue, processing the empty strings indicated by flag. +// For example, flag == kEmptyBeginLine|kEmptyEndLine means to match +// both ^ and $. It is important that callers pass all flags at once: +// processing both ^ and $ is not the same as first processing only ^ +// and then processing only $. Doing the two-step sequence won't match +// ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior +// exhibited by existing implementations). +void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint flag) { + newq->clear(); + for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) { + if (oldq->is_mark(*i)) + AddToQueue(newq, Mark, flag); + else + AddToQueue(newq, *i, flag); + } +} + +// Runs the work queue, processing the single byte c followed by any empty +// strings indicated by flag. For example, c == 'a' and flag == kEmptyEndLine, +// means to match c$. Sets the bool *ismatch to true if the end of the +// regular expression program has been reached (the regexp has matched). 
+void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
+                         int c, uint flag, bool* ismatch,
+                         Prog::MatchKind kind,
+                         int new_byte_loop) {
+  // Note: new_byte_loop is part of the signature but is not consulted
+  // anywhere in this function body.
+  if (DEBUG_MODE)
+    mutex_.AssertHeld();
+
+  newq->clear();
+  for (Workq::iterator it = oldq->begin(); it != oldq->end(); ++it) {
+    if (oldq->is_mark(*it)) {
+      // Carry the mark over to newq, unless a match has already been
+      // recorded, in which case the rest of the queue is not processed.
+      if (*ismatch)
+        return;
+      newq->mark();
+      continue;
+    }
+
+    Prog::Inst* inst = prog_->inst(*it);
+    switch (inst->opcode()) {
+      case kInstByteRange:   // can follow if c is in range
+        if (inst->Matches(c))
+          AddToQueue(newq, inst->out(), flag);
+        break;
+
+      case kInstMatch:
+        // When the program is anchored at the end, a match instruction
+        // only counts on the fake end-of-text byte.
+        if (prog_->anchor_end() && c != kByteEndText)
+          break;
+        *ismatch = true;
+        // Can stop processing work queue since we found a match.
+        if (kind == Prog::kFirstMatch)
+          return;
+        break;
+
+      case kInstFail:        // never succeeds
+      case kInstCapture:     // already followed
+      case kInstNop:         // already followed
+      case kInstAlt:         // already followed
+      case kInstAltMatch:    // already followed
+      case kInstEmptyWidth:  // already followed
+        break;
+    }
+  }
+
+  if (DebugDFA)
+    fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n", DumpWorkq(oldq).c_str(),
+            c, flag, DumpWorkq(newq).c_str(), *ismatch);
+}
+
+// Processes input byte c in state, returning new state.
+// Caller does not hold mutex.
+DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) {
+  // Take mutex_ for the duration of the call so that only one
+  // RunStateOnByte is in progress at a time, even if the DFA
+  // is being run by multiple threads.
+  MutexLock l(&mutex_);
+  return RunStateOnByte(state, c);
+}
+
+// Processes input byte c in state, returning new state.
+// Returns NULL (after logging) for DeadState, NULL, or any other
+// unexpected special state; FullMatchState maps to itself.
+DFA::State* DFA::RunStateOnByte(State* state, int c) {
+  if (DEBUG_MODE)
+    mutex_.AssertHeld();
+
+  if (state <= SpecialStateMax) {
+    // It is convenient for routines like PossibleMatchRange
+    // if we implement RunStateOnByte for FullMatchState:
+    // once you get into this state you never get out,
+    // so it's pretty easy.
+    if (state == FullMatchState)
+      return FullMatchState;
+
+    // Every other special state is a caller error: log it and give up.
+    if (state == DeadState) {
+      LOG(DFATAL) << "DeadState in RunStateOnByte";
+    } else if (state == NULL) {
+      LOG(DFATAL) << "NULL state in RunStateOnByte";
+    } else {
+      LOG(DFATAL) << "Unexpected special state in RunStateOnByte";
+    }
+    return NULL;
+  }
+
+  // If someone else already computed this, return it.
+  State* next;
+  ATOMIC_LOAD_CONSUME(next, &state->next_[ByteMap(c)]);
+  if (next != NULL)
+    return next;
+
+  // Convert state into Workq.
+  StateToWorkq(state, q0_);
+
+  // Flags marking the kinds of empty-width things (^ $ etc)
+  // around this byte.  Before the byte we have the flags recorded
+  // in the State structure itself.  After the byte we have
+  // nothing yet (but that will change: read on).
+  const uint needflag = state->flag_ >> kFlagNeedShift;
+  const uint oldbeforeflag = state->flag_ & kFlagEmptyMask;
+  uint beforeflag = oldbeforeflag;
+  uint afterflag = 0;
+
+  if (c == '\n') {
+    // Insert implicit $ and ^ around \n
+    beforeflag |= kEmptyEndLine;
+    afterflag |= kEmptyBeginLine;
+  }
+
+  if (c == kByteEndText) {
+    // Insert implicit $ and \z before the fake "end text" byte.
+    beforeflag |= kEmptyEndLine | kEmptyEndText;
+  }
+
+  // The state flag kFlagLastWord says whether the last
+  // byte processed was a word character.  Use that info to
+  // insert empty-width (non-)word boundaries.
+  const bool prev_is_word = (state->flag_ & kFlagLastWord) != 0;
+  const bool cur_is_word = (c != kByteEndText && Prog::IsWordChar(c));
+  if (cur_is_word != prev_is_word)
+    beforeflag |= kEmptyWordBoundary;
+  else
+    beforeflag |= kEmptyNonWordBoundary;
+
+  // Okay, finally ready to run.
+  // Only useful to rerun on empty string if there are new, useful flags.
+  if (beforeflag & ~oldbeforeflag & needflag) {
+    RunWorkqOnEmptyString(q0_, q1_, beforeflag);
+    swap(q0_, q1_);
+  }
+  bool ismatch = false;
+  RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch, kind_, start_unanchored_);
+
+  // Most of the time, we build the state from the output of
+  // RunWorkqOnByte, so swap q0_ and q1_ here.  However, so that
+  // RE2::Set can tell exactly which match instructions
+  // contributed to the match, don't swap if c is kByteEndText.
+  // The resulting state wouldn't be correct for further processing
+  // of the string, but we're at the end of the text so that's okay.
+  // Leaving q0_ alone preserves the match instructions that led to
+  // the current setting of ismatch.
+  const bool keep_q0 = (c == kByteEndText && kind_ == Prog::kManyMatch);
+  if (!keep_q0)
+    swap(q0_, q1_);
+
+  // Save afterflag along with ismatch and isword in new state.
+  uint flag = afterflag;
+  if (ismatch)
+    flag |= kFlagMatch;
+  if (cur_is_word)
+    flag |= kFlagLastWord;
+
+  next = WorkqToCachedState(q0_, flag);
+
+  // Flush next before linking to it.
+  // Write barrier before updating state->next_ so that the
+  // main search loop can proceed without any locking, for speed.
+  // (Otherwise it would need one mutex operation per input byte.)
+  ATOMIC_STORE_RELEASE(&state->next_[ByteMap(c)], next);
+  return next;
+}
+
+
+//////////////////////////////////////////////////////////////////////
+// DFA cache reset.
+
+// Reader-writer lock helper.
+//
+// The DFA uses a reader-writer mutex to protect the state graph itself.
+// Traversing the state graph requires holding the mutex for reading,
+// and discarding the state graph and starting over requires holding the
+// lock for writing.  If a search needs to expand the graph but is out
+// of memory, it will need to drop its read lock and then acquire the
+// write lock.  Since it cannot then atomically downgrade from write lock
+// to read lock, it runs the rest of the search holding the write lock.
+// (This probably helps avoid repeated contention, but really the decision
+// is forced by the Mutex interface.)  It's a bit complicated to keep
+// track of whether the lock is held for reading or writing and thread
+// that through the search, so instead we encapsulate it in the RWLocker
+// and pass that around.
+
+class DFA::RWLocker {
+ public:
+  // Acquires mu for reading.
+  explicit RWLocker(Mutex* mu);
+
+  // Releases mu, as a writer if LockForWriting was called,
+  // otherwise as a reader.
+  ~RWLocker();
+
+  // If the lock is only held for reading right now,
+  // drop the read lock and re-acquire for writing.
+  // Subsequent calls to LockForWriting are no-ops.
+  // Notice that the lock is *released* temporarily.
+  void LockForWriting();
+
+  // Returns whether the lock is already held for writing.
+  bool IsLockedForWriting() {
+    return writing_;
+  }
+
+ private:
+  Mutex* mu_;     // the mutex being managed
+  bool writing_;  // true once the lock has been upgraded to a write lock
+
+  DISALLOW_EVIL_CONSTRUCTORS(RWLocker);
+};
+
+DFA::RWLocker::RWLocker(Mutex* mu)
+  : mu_(mu),
+    writing_(false) {
+  mu_->ReaderLock();
+}
+
+// This function is marked as NO_THREAD_SAFETY_ANALYSIS because the
+// annotations do not support lock upgrade.
+void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS {
+  if (writing_)
+    return;  // already upgraded
+
+  // There is no atomic upgrade: the read lock is released before the
+  // write lock is taken.
+  mu_->ReaderUnlock();
+  mu_->Lock();
+  writing_ = true;
+}
+
+DFA::RWLocker::~RWLocker() {
+  if (!writing_) {
+    mu_->ReaderUnlock();
+  } else {
+    mu_->WriterUnlock();
+  }
+}
+
+
+// When the DFA's State cache fills, we discard all the states in the
+// cache and start over.  Many threads can be using and adding to the
+// cache at the same time, so we synchronize using the cache_mutex_
+// to keep from stepping on other threads.  Specifically, all the
+// threads using the current cache hold cache_mutex_ for reading.
+// When a thread decides to flush the cache, it drops cache_mutex_
+// and then re-acquires it for writing.  That ensures there are no
+// other threads accessing the cache anymore.  The rest of the search
+// runs holding cache_mutex_ for writing, avoiding any contention
+// with or cache pollution caused by other threads.
+
+void DFA::ResetCache(RWLocker* cache_lock) {
+  // Remember whether this search had already upgraded to the write lock,
+  // then re-acquire the cache_mutex_ for writing (exclusive use).
+  const bool already_writing = cache_lock->IsLockedForWriting();
+  cache_lock->LockForWriting();
+
+  // If we already held cache_mutex_ for writing, it means
+  // this invocation of Search() has reset the cache before.
+  // That's a pretty clear indication that the cache is too small.
+  // Warn about that, once.
+  // TODO(rsc): Only warn if state_cache_.size() < some threshold.
+  if (already_writing && !cache_warned_) {
+    LOG(INFO) << "DFA memory cache could be too small: "
+              << "only room for " << state_cache_.size() << " states.";
+    cache_warned_ = true;
+  }
+
+  // Forget every cached start state, since they point into the cache
+  // that is about to be discarded.
+  for (int slot = 0; slot < kMaxStart; slot++) {
+    start_[slot].start = NULL;
+    start_[slot].firstbyte = kFbUnknown;
+  }
+
+  // Clear the cache, reset the memory budget.
+  ClearCache();
+  mem_budget_ = state_budget_;
+}
+
+// Typically, a couple States do need to be preserved across a cache
+// reset, like the State at the current point in the search.
+// The StateSaver class helps keep States across cache resets.
+// It makes a copy of the state's guts outside the cache (before the reset)
+// and then can be asked, after the reset, to recreate the State
+// in the new cache.  For example, in a DFA method ("this" is a DFA):
+//
+//   StateSaver saver(this, s);
+//   ResetCache(cache_lock);
+//   s = saver.Restore();
+//
+// The saver should always have room in the cache to re-create the state,
+// because resetting the cache locks out all other threads, and the cache
+// is known to have room for at least a couple states (otherwise the DFA
+// constructor fails).
+
+class DFA::StateSaver {
+ public:
+  // Copies the instruction list and flag out of state
+  // (or just remembers the pointer if state is a special state).
+  explicit StateSaver(DFA* dfa, State* state);
+  ~StateSaver();
+
+  // Recreates and returns a state equivalent to the
+  // original state passed to the constructor.
+  // Returns NULL if the cache has filled, but
+  // since the DFA guarantees to have room in the cache
+  // for a couple states, should never return NULL
+  // if used right after ResetCache.
+  State* Restore();
+
+ private:
+  DFA* dfa_;         // the DFA to use
+  int* inst_;        // saved info from State
+  int ninst_;
+  uint flag_;
+  bool is_special_;  // whether original state was special
+  State* special_;   // if is_special_, the original state
+
+  DISALLOW_EVIL_CONSTRUCTORS(StateSaver);
+};
+
+DFA::StateSaver::StateSaver(DFA* dfa, State* state) {
+  dfa_ = dfa;
+  is_special_ = (state <= SpecialStateMax);
+
+  if (is_special_) {
+    // Special states are compared by pointer value only;
+    // there is nothing to copy.
+    special_ = state;
+    inst_ = NULL;
+    ninst_ = 0;
+    flag_ = 0;
+    return;
+  }
+
+  // Ordinary state: take a private copy of its instruction list and flag.
+  special_ = NULL;
+  flag_ = state->flag_;
+  ninst_ = state->ninst_;
+  inst_ = new int[ninst_];
+  memmove(inst_, state->inst_, ninst_*sizeof inst_[0]);
+}
+
+DFA::StateSaver::~StateSaver() {
+  // inst_ was only allocated for non-special states.
+  if (!is_special_)
+    delete[] inst_;
+}
+
+DFA::State* DFA::StateSaver::Restore() {
+  if (is_special_)
+    return special_;
+
+  // CachedState is called with the DFA's mutex_ held.
+  MutexLock l(&dfa_->mutex_);
+  State* restored = dfa_->CachedState(inst_, ninst_, flag_);
+  if (restored == NULL)
+    LOG(DFATAL) << "StateSaver failed to restore state.";
+  return restored;
+}
+
+
+//////////////////////////////////////////////////////////////////////
+//
+// DFA execution.
+//
+// The basic search loop is easy: start in a state s and then for each
+// byte c in the input, s = s->next[c].
+//
+// This simple description omits a few efficiency-driven complications.
+//
+// First, the State graph is constructed incrementally: it is possible
+// that s->next[c] is null, indicating that that state has not been
+// fully explored.  In this case, RunStateOnByte must be invoked to
+// determine the next state, which is cached in s->next[c] to save
+// future effort.  An alternative reason for s->next[c] to be null is
+// that the DFA has reached a so-called "dead state", in which any match
+// is no longer possible.
+// In this case RunStateOnByte will return NULL
+// and the processing of the string can stop early.
+//
+// Second, a 256-element pointer array for s->next_ makes each State
+// quite large (2kB on 64-bit machines).  Instead, dfa->bytemap_[]
+// maps from bytes to "byte classes" and then next_ only needs to have
+// as many pointers as there are byte classes.  A byte class is simply a
+// range of bytes that the regexp never distinguishes between.
+// A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1,
+// 'a', 'b' to 'c', and 'c'+1 to 0xFF.  The bytemap slows us a little bit
+// but in exchange we typically cut the size of a State (and thus our
+// memory footprint) by about 5-10x.  The comments still refer to
+// s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]].
+//
+// Third, it is common for a DFA for an unanchored match to begin in a
+// state in which only one particular byte value can take the DFA to a
+// different state.  That is, s->next[c] != s for only one c.  In this
+// situation, the DFA can do better than executing the simple loop.
+// Instead, it can call memchr to search very quickly for the byte c.
+// Whether the start state has this property is determined during a
+// pre-compilation pass, and if so, the byte b is passed to the search
+// loop as the "firstbyte" argument, along with a boolean "have_firstbyte".
+//
+// Fourth, the desired behavior is to search for the leftmost-best match
+// (approximately, the same one that Perl would find), which is not
+// necessarily the match ending earliest in the string.  Each time a
+// match is found, it must be noted, but the DFA must continue on in
+// hope of finding a higher-priority match.  In some cases, the caller only
+// cares whether there is any match at all, not which one is found.
+// The "want_earliest_match" flag causes the search to stop at the first
+// match found.
+//
+// Fifth, one algorithm that uses the DFA needs it to run over the
+// input string backward, beginning at the end and ending at the beginning.
+// Passing false for the "run_forward" flag causes the DFA to run backward.
+//
+// The checks for these last three cases, which in a naive implementation
+// would be performed once per input byte, slow the general loop enough
+// to merit specialized versions of the search loop for each of the
+// eight possible settings of the three booleans.  Rather than write
+// eight different functions, we write one general implementation and then
+// inline it to create the specialized ones.
+//
+// Note that matches are delayed by one byte, to make it easier to
+// accommodate match conditions depending on the next input byte (like $ and \b).
+// When s->next[c]->IsMatch(), it means that there is a match ending just
+// *before* byte c.
+
+// The generic search loop.  Searches text for a match, returning
+// true if one is found and storing the pointer to the end of the
+// chosen match in params->ep.
+// The bools are equal to the same-named variables in params, but
+// making them function arguments lets the inliner specialize
+// this function to each combination (see two paragraphs above).
+inline bool DFA::InlinedSearchLoop(SearchParams* params,
+                                   bool have_firstbyte,
+                                   bool want_earliest_match,
+                                   bool run_forward) {
+  // Outputs, all written through params:
+  //   params->ep      end of the chosen match (may be NULL when no match)
+  //   params->failed  set to true when the search had to give up
+  //   params->matches filled in at end of text for Prog::kManyMatch
+  // The return value says whether a match was found.
+  State* start = params->start;
+  const uint8* bp = BytePtr(params->text.begin());  // start of text
+  const uint8* p = bp;                              // text scanning point
+  const uint8* ep = BytePtr(params->text.end());    // end of text
+  const uint8* resetp = NULL;                       // p at last cache reset
+  if (!run_forward)
+    swap(p, ep);  // backward: scan from the end of text toward the start
+
+  const uint8* bytemap = prog_->bytemap();
+  const uint8* lastmatch = NULL;   // most recent matching position in text
+  bool matched = false;
+  State* s = start;
+
+  if (s->IsMatch()) {
+    matched = true;
+    lastmatch = p;
+    if (want_earliest_match) {
+      params->ep = reinterpret_cast<const char*>(lastmatch);
+      return true;
+    }
+  }
+
+  while (p != ep) {
+    if (DebugDFA)
+      fprintf(stderr, "@%d: %s\n", static_cast<int>(p - bp),
+              DumpState(s).c_str());
+    if (have_firstbyte && s == start) {
+      // In start state, only way out is to find firstbyte,
+      // so use optimized assembly in memchr to skip ahead.
+      // If firstbyte isn't found, we can skip to the end
+      // of the string.
+      if (run_forward) {
+        if ((p = BytePtr(memchr(p, params->firstbyte, ep - p))) == NULL) {
+          p = ep;
+          break;
+        }
+      } else {
+        if ((p = BytePtr(memrchr(ep, params->firstbyte, p - ep))) == NULL) {
+          p = ep;
+          break;
+        }
+        // Step past the byte just found so that the backward read
+        // below (*--p) consumes it.
+        p++;
+      }
+    }
+
+    int c;
+    if (run_forward)
+      c = *p++;
+    else
+      c = *--p;
+
+    // Note that multiple threads might be consulting
+    // s->next_[bytemap[c]] simultaneously.
+    // RunStateOnByte takes care of the appropriate locking,
+    // including a memory barrier so that the unlocked access
+    // (sometimes known as "double-checked locking") is safe.
+    // The alternative would be either one DFA per thread
+    // or one mutex operation per input byte.
+    //
+    // ns == DeadState means the state is known to be dead
+    // (no more matches are possible).
+    // ns == NULL means the state has not yet been computed
+    // (need to call RunStateOnByteUnlocked).
+    // RunStateOnByte returns ns == NULL if it is out of memory.
+    // ns == FullMatchState means the rest of the string matches.
+    //
+    // Okay to use bytemap[] not ByteMap() here, because
+    // c is known to be an actual byte and not kByteEndText.
+
+    State* ns;
+    ATOMIC_LOAD_CONSUME(ns, &s->next_[bytemap[c]]);
+    if (ns == NULL) {
+      ns = RunStateOnByteUnlocked(s, c);
+      if (ns == NULL) {
+        // After we reset the cache, we hold cache_mutex exclusively,
+        // so if resetp != NULL, it means we filled the DFA state
+        // cache with this search alone (without any other threads).
+        // Benchmarks show that doing a state computation on every
+        // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the
+        // same at about 2 MB/s.  Unless we're processing an average
+        // of 10 bytes per state computation, fail so that RE2 can
+        // fall back to the NFA.
+        if (FLAGS_re2_dfa_bail_when_slow && resetp != NULL &&
+            (p - resetp) < 10*state_cache_.size()) {
+          params->failed = true;
+          return false;
+        }
+        resetp = p;
+
+        // Prepare to save start and s across the reset.
+        StateSaver save_start(this, start);
+        StateSaver save_s(this, s);
+
+        // Discard all the States in the cache.
+        ResetCache(params->cache_lock);
+
+        // Restore start and s so we can continue.
+        if ((start = save_start.Restore()) == NULL ||
+            (s = save_s.Restore()) == NULL) {
+          // Restore already did LOG(DFATAL).
+          params->failed = true;
+          return false;
+        }
+        ns = RunStateOnByteUnlocked(s, c);
+        if (ns == NULL) {
+          LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache";
+          params->failed = true;
+          return false;
+        }
+      }
+    }
+    if (ns <= SpecialStateMax) {
+      if (ns == DeadState) {
+        params->ep = reinterpret_cast<const char*>(lastmatch);
+        return matched;
+      }
+      // FullMatchState
+      params->ep = reinterpret_cast<const char*>(ep);
+      return true;
+    }
+    s = ns;
+
+    if (s->IsMatch()) {
+      matched = true;
+      // The DFA notices the match one byte late,
+      // so adjust p before using it in the match.
+      if (run_forward)
+        lastmatch = p - 1;
+      else
+        lastmatch = p + 1;
+      if (DebugDFA)
+        fprintf(stderr, "match @%d! [%s]\n",
+                static_cast<int>(lastmatch - bp),
+                DumpState(s).c_str());
+
+      if (want_earliest_match) {
+        params->ep = reinterpret_cast<const char*>(lastmatch);
+        return true;
+      }
+    }
+  }
+
+  // Process one more byte to see if it triggers a match.
+  // (Remember, matches are delayed one byte.)
+  // The extra byte is the one just outside text in the scan direction,
+  // or kByteEndText when text reaches the edge of context.
+  int lastbyte;
+  if (run_forward) {
+    if (params->text.end() == params->context.end())
+      lastbyte = kByteEndText;
+    else
+      lastbyte = params->text.end()[0] & 0xFF;
+  } else {
+    if (params->text.begin() == params->context.begin())
+      lastbyte = kByteEndText;
+    else
+      lastbyte = params->text.begin()[-1] & 0xFF;
+  }
+
+  State* ns;
+  ATOMIC_LOAD_CONSUME(ns, &s->next_[ByteMap(lastbyte)]);
+  if (ns == NULL) {
+    ns = RunStateOnByteUnlocked(s, lastbyte);
+    if (ns == NULL) {
+      // Out of memory: reset the cache once, keeping s alive across it.
+      StateSaver save_s(this, s);
+      ResetCache(params->cache_lock);
+      if ((s = save_s.Restore()) == NULL) {
+        params->failed = true;
+        return false;
+      }
+      ns = RunStateOnByteUnlocked(s, lastbyte);
+      if (ns == NULL) {
+        LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset";
+        params->failed = true;
+        return false;
+      }
+    }
+  }
+  s = ns;
+  if (DebugDFA)
+    fprintf(stderr, "@_: %s\n", DumpState(s).c_str());
+  if (s == FullMatchState) {
+    params->ep = reinterpret_cast<const char*>(ep);
+    return true;
+  }
+  if (s > SpecialStateMax && s->IsMatch()) {
+    matched = true;
+    lastmatch = p;
+    if (params->matches && kind_ == Prog::kManyMatch) {
+      // Report the id of every match instruction in the final state.
+      vector<int>* v = params->matches;
+      v->clear();
+      for (int i = 0; i < s->ninst_; i++) {
+        Prog::Inst* ip = prog_->inst(s->inst_[i]);
+        if (ip->opcode() == kInstMatch)
+          v->push_back(ip->match_id());
+      }
+    }
+    if (DebugDFA)
+      fprintf(stderr, "match @%d! [%s]\n", static_cast<int>(lastmatch - bp),
+              DumpState(s).c_str());
+  }
+  params->ep = reinterpret_cast<const char*>(lastmatch);
+  return matched;
+}
+
+// Inline specializations of the general loop.
+// The three letters in each name give, in order, the values of
+// have_firstbyte, want_earliest_match and run_forward (F = false, T = true).
+bool DFA::SearchFFF(SearchParams* params) {
+  return InlinedSearchLoop(params, false, false, false);
+}
+bool DFA::SearchFFT(SearchParams* params) {
+  return InlinedSearchLoop(params, false, false, true);
+}
+bool DFA::SearchFTF(SearchParams* params) {
+  return InlinedSearchLoop(params, false, true, false);
+}
+bool DFA::SearchFTT(SearchParams* params) {
+  return InlinedSearchLoop(params, false, true, true);
+}
+bool DFA::SearchTFF(SearchParams* params) {
+  return InlinedSearchLoop(params, true, false, false);
+}
+bool DFA::SearchTFT(SearchParams* params) {
+  return InlinedSearchLoop(params, true, false, true);
+}
+bool DFA::SearchTTF(SearchParams* params) {
+  return InlinedSearchLoop(params, true, true, false);
+}
+bool DFA::SearchTTT(SearchParams* params) {
+  return InlinedSearchLoop(params, true, true, true);
+}
+
+// For debugging, calls the general code directly.
+bool DFA::SlowSearchLoop(SearchParams* params) {
+  const bool have_firstbyte = (params->firstbyte >= 0);
+  return InlinedSearchLoop(params,
+                           have_firstbyte,
+                           params->want_earliest_match,
+                           params->run_forward);
+}
+
+// For performance, calls the appropriate specialized version
+// of InlinedSearchLoop.
+bool DFA::FastSearchLoop(SearchParams* params) {
+  // Because the methods are private, the table of specializations
+  // cannot be declared at top level.  Entries are ordered so that the
+  // index is the 3-bit number (have_firstbyte, want_earliest_match,
+  // run_forward).
+  static bool (DFA::*kSpecialized[])(SearchParams*) = {
+    &DFA::SearchFFF,
+    &DFA::SearchFFT,
+    &DFA::SearchFTF,
+    &DFA::SearchFTT,
+    &DFA::SearchTFF,
+    &DFA::SearchTFT,
+    &DFA::SearchTTF,
+    &DFA::SearchTTT,
+  };
+
+  bool have_firstbyte = (params->firstbyte >= 0);
+  int which = 1 * params->run_forward;
+  which += 2 * params->want_earliest_match;
+  which += 4 * have_firstbyte;
+  return (this->*kSpecialized[which])(params);
+}
+
+
+// The discussion of DFA execution above ignored the question of how
+// to determine the initial state for the search loop.  There are two
+// factors that influence the choice of start state.
+//
+// The first factor is whether the search is anchored or not.
+// The regexp program (Prog*) itself has
+// two different entry points: one for anchored searches and one for
+// unanchored searches.  (The unanchored version starts with a leading ".*?"
+// and then jumps to the anchored one.)
+//
+// The second factor is where text appears in the larger context, which
+// determines which empty-string operators can be matched at the beginning
+// of execution.  If text is at the very beginning of context, \A and ^ match.
+// Otherwise if text is at the beginning of a line, then ^ matches.
+// Otherwise it matters whether the character before text is a word character
+// or a non-word character.
+//
+// The two cases (unanchored vs not) and four cases (empty-string flags)
+// combine to make the eight cases recorded in the DFA's start_[] array
+// of cached StartInfos, indexed by kStartBeginText, kStartBeginLine,
+// kStartAfterWordChar or kStartAfterNonWordChar, OR'ed with kStartAnchored
+// for anchored searches.  The start state for each is filled in the
+// first time it is used for an actual search.
+
+// Examines text, context, and anchored to determine the right start
+// state for the DFA search loop.  Fills in params and returns true on success.
+// Returns false on failure.
+bool DFA::AnalyzeSearch(SearchParams* params) {
+  const StringPiece& text = params->text;
+  const StringPiece& context = params->context;
+
+  // Sanity check: make sure that text lies within context.
+  if (text.begin() < context.begin() || text.end() > context.end()) {
+    LOG(DFATAL) << "Text is not inside context.";
+    params->start = DeadState;
+    return true;
+  }
+
+  // Determine correct search type.  A forward search looks at what
+  // precedes text; a backward search looks at what follows it.
+  const bool forward = params->run_forward;
+  const bool at_context_edge = forward ? text.begin() == context.begin()
+                                       : text.end() == context.end();
+  int start;
+  uint flags;
+  if (at_context_edge) {
+    start = kStartBeginText;
+    flags = kEmptyBeginText|kEmptyBeginLine;
+  } else {
+    // Only read the neighboring byte once we know it lies inside context.
+    const int neighbor = forward ? text.begin()[-1] : text.end()[0];
+    if (neighbor == '\n') {
+      start = kStartBeginLine;
+      flags = kEmptyBeginLine;
+    } else if (Prog::IsWordChar(neighbor & 0xFF)) {
+      start = kStartAfterWordChar;
+      flags = kFlagLastWord;
+    } else {
+      start = kStartAfterNonWordChar;
+      flags = 0;
+    }
+  }
+  if (params->anchored || prog_->anchor_start())
+    start |= kStartAnchored;
+  StartInfo* info = &start_[start];
+
+  // Try once without cache_lock for writing.
+  // Try again after resetting the cache
+  // (ResetCache will relock cache_lock for writing).
+  bool analyzed = AnalyzeSearchHelper(params, info, flags);
+  if (!analyzed) {
+    ResetCache(params->cache_lock);
+    analyzed = AnalyzeSearchHelper(params, info, flags);
+  }
+  if (!analyzed) {
+    LOG(DFATAL) << "Failed to analyze start state.";
+    params->failed = true;
+    return false;
+  }
+
+  if (DebugDFA) {
+    int fb;
+    ATOMIC_LOAD_RELAXED(fb, &info->firstbyte);
+    fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s firstbyte=%d\n",
+            params->anchored, params->run_forward, flags,
+            DumpState(info->start).c_str(), fb);
+  }
+
+  params->start = info->start;
+  ATOMIC_LOAD_ACQUIRE(params->firstbyte, &info->firstbyte);
+
+  return true;
+}
+
+// Fills in info if needed.  Returns true on success, false on failure.
+bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
+                              uint flags) {
+  // Quick check, without the mutex: a firstbyte other than kFbUnknown
+  // means info has already been filled in.
+  int cached_fb;
+  ATOMIC_LOAD_ACQUIRE(cached_fb, &info->firstbyte);
+  if (cached_fb != kFbUnknown)
+    return true;
+
+  // Check again under the mutex before doing the work.
+  MutexLock l(&mutex_);
+  if (info->firstbyte != kFbUnknown)
+    return true;
+
+  // Build the start state from the program's entry point.
+  q0_->clear();
+  AddToQueue(q0_,
+             params->anchored ? prog_->start() : prog_->start_unanchored(),
+             flags);
+  info->start = WorkqToCachedState(q0_, flags);
+  if (info->start == NULL)
+    return false;
+
+  if (info->start == DeadState || info->start == FullMatchState) {
+    // Synchronize with "quick check" above.
+    // (For FullMatchState the stored value will be ignored.)
+    ATOMIC_STORE_RELEASE(&info->firstbyte, kFbNone);
+    return true;
+  }
+
+  // Compute info->firstbyte by running state on all
+  // possible byte values, looking for a single one that
+  // leads to a different state.
+  int firstbyte = kFbNone;
+  for (int b = 0; b < 256; b++) {
+    State* next = RunStateOnByte(info->start, b);
+    if (next == NULL) {
+      // Synchronize with "quick check" above.
+      ATOMIC_STORE_RELEASE(&info->firstbyte, firstbyte);
+      return false;
+    }
+    if (next == info->start)
+      continue;
+
+    // Goes to new state...
+    if (firstbyte != kFbNone) {
+      firstbyte = kFbMany;  // ... too many
+      break;
+    }
+    firstbyte = b;          // ... first one
+  }
+
+  // Synchronize with "quick check" above.
+  ATOMIC_STORE_RELEASE(&info->firstbyte, firstbyte);
+  return true;
+}
+
+// The actual DFA search: calls AnalyzeSearch and then FastSearchLoop.
+bool DFA::Search(const StringPiece& text, + const StringPiece& context, + bool anchored, + bool want_earliest_match, + bool run_forward, + bool* failed, + const char** epp, + vector* matches) { + *epp = NULL; + if (!ok()) { + *failed = true; + return false; + } + *failed = false; + + if (DebugDFA) { + fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str()); + fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n", + text.as_string().c_str(), anchored, want_earliest_match, + run_forward, kind_); + } + + RWLocker l(&cache_mutex_); + SearchParams params(text, context, &l); + params.anchored = anchored; + params.want_earliest_match = want_earliest_match; + params.run_forward = run_forward; + params.matches = matches; + + if (!AnalyzeSearch(¶ms)) { + *failed = true; + return false; + } + if (params.start == DeadState) + return false; + if (params.start == FullMatchState) { + if (run_forward == want_earliest_match) + *epp = text.begin(); + else + *epp = text.end(); + return true; + } + if (DebugDFA) + fprintf(stderr, "start %s\n", DumpState(params.start).c_str()); + bool ret = FastSearchLoop(¶ms); + if (params.failed) { + *failed = true; + return false; + } + *epp = params.ep; + return ret; +} + +// Deletes dfa. +// +// This is a separate function so that +// prog.h can be used without moving the definition of +// class DFA out of this file. If you set +// prog->dfa_ = dfa; +// then you also have to set +// prog->delete_dfa_ = DeleteDFA; +// so that ~Prog can delete the dfa. +static void DeleteDFA(DFA* dfa) { + delete dfa; +} + +DFA* Prog::GetDFA(MatchKind kind) { + DFA*volatile* pdfa; + if (kind == kFirstMatch || kind == kManyMatch) { + pdfa = &dfa_first_; + } else { + kind = kLongestMatch; + pdfa = &dfa_longest_; + } + + // Quick check. 
+ DFA *dfa; + ATOMIC_LOAD_ACQUIRE(dfa, pdfa); + if (dfa != NULL) + return dfa; + + MutexLock l(&dfa_mutex_); + dfa = *pdfa; + if (dfa != NULL) + return dfa; + + // For a forward DFA, half the memory goes to each DFA. + // For a reverse DFA, all the memory goes to the + // "longest match" DFA, because RE2 never does reverse + // "first match" searches. + int64 m = dfa_mem_/2; + if (reversed_) { + if (kind == kLongestMatch || kind == kManyMatch) + m = dfa_mem_; + else + m = 0; + } + dfa = new DFA(this, kind, m); + delete_dfa_ = DeleteDFA; + + // Synchronize with "quick check" above. + ATOMIC_STORE_RELEASE(pdfa, dfa); + + return dfa; +} + + +// Executes the regexp program to search in text, +// which itself is inside the larger context. (As a convenience, +// passing a NULL context is equivalent to passing text.) +// Returns true if a match is found, false if not. +// If a match is found, fills in match0->end() to point at the end of the match +// and sets match0->begin() to text.begin(), since the DFA can't track +// where the match actually began. +// +// This is the only external interface (class DFA only exists in this file). +// +bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, + Anchor anchor, MatchKind kind, + StringPiece* match0, bool* failed, vector* matches) { + *failed = false; + + StringPiece context = const_context; + if (context.begin() == NULL) + context = text; + bool carat = anchor_start(); + bool dollar = anchor_end(); + if (reversed_) { + bool t = carat; + carat = dollar; + dollar = t; + } + if (carat && context.begin() != text.begin()) + return false; + if (dollar && context.end() != text.end()) + return false; + + // Handle full match by running an anchored longest match + // and then checking if it covers all of text. 
+ bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch; + bool endmatch = false; + if (kind == kManyMatch) { + endmatch = true; + } else if (kind == kFullMatch || anchor_end()) { + endmatch = true; + kind = kLongestMatch; + } + + // If the caller doesn't care where the match is (just whether one exists), + // then we can stop at the very first match we find, the so-called + // "shortest match". + bool want_shortest_match = false; + if (match0 == NULL && !endmatch) { + want_shortest_match = true; + kind = kLongestMatch; + } + + DFA* dfa = GetDFA(kind); + const char* ep; + bool matched = dfa->Search(text, context, anchored, + want_shortest_match, !reversed_, + failed, &ep, matches); + if (*failed) + return false; + if (!matched) + return false; + if (endmatch && ep != (reversed_ ? text.begin() : text.end())) + return false; + + // If caller cares, record the boundary of the match. + // We only know where it ends, so use the boundary of text + // as the beginning. + if (match0) { + if (reversed_) + *match0 = StringPiece(ep, text.end() - ep); + else + *match0 = StringPiece(text.begin(), ep - text.begin()); + } + return true; +} + +// Build out all states in DFA. Returns number of states. +int DFA::BuildAllStates() { + if (!ok()) + return 0; + + // Pick out start state for unanchored search + // at beginning of text. + RWLocker l(&cache_mutex_); + SearchParams params(NULL, NULL, &l); + params.anchored = false; + if (!AnalyzeSearch(¶ms) || params.start <= SpecialStateMax) + return 0; + + // Add start state to work queue. + StateSet queued; + vector q; + queued.insert(params.start); + q.push_back(params.start); + + // Flood to expand every state. 
+ for (int i = 0; i < q.size(); i++) { + State* s = q[i]; + for (int c = 0; c < 257; c++) { + State* ns = RunStateOnByteUnlocked(s, c); + if (ns > SpecialStateMax && queued.find(ns) == queued.end()) { + queued.insert(ns); + q.push_back(ns); + } + } + } + + return q.size(); +} + +// Build out all states in DFA for kind. Returns number of states. +int Prog::BuildEntireDFA(MatchKind kind) { + //LOG(ERROR) << "BuildEntireDFA is only for testing."; + return GetDFA(kind)->BuildAllStates(); +} + +// Computes min and max for matching string. +// Won't return strings bigger than maxlen. +bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { + if (!ok()) + return false; + + // NOTE: if future users of PossibleMatchRange want more precision when + // presented with infinitely repeated elements, consider making this a + // parameter to PossibleMatchRange. + static int kMaxEltRepetitions = 0; + + // Keep track of the number of times we've visited states previously. We only + // revisit a given state if it's part of a repeated group, so if the value + // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set + // |*max| to |PrefixSuccessor(*max)|. + // + // Also note that previously_visited_states[UnseenStatePtr] will, in the STL + // tradition, implicitly insert a '0' value at first use. We take advantage + // of that property below. + map previously_visited_states; + + // Pick out start state for anchored search at beginning of text. + RWLocker l(&cache_mutex_); + SearchParams params(NULL, NULL, &l); + params.anchored = true; + if (!AnalyzeSearch(¶ms)) + return false; + if (params.start == DeadState) { // No matching strings + *min = ""; + *max = ""; + return true; + } + if (params.start == FullMatchState) // Every string matches: no max + return false; + + // The DFA is essentially a big graph rooted at params.start, + // and paths in the graph correspond to accepted strings. 
+ // Each node in the graph has potentially 256+1 arrows + // coming out, one for each byte plus the magic end of + // text character kByteEndText. + + // To find the smallest possible prefix of an accepted + // string, we just walk the graph preferring to follow + // arrows with the lowest bytes possible. To find the + // largest possible prefix, we follow the largest bytes + // possible. + + // The test for whether there is an arrow from s on byte j is + // ns = RunStateOnByteUnlocked(s, j); + // if (ns == NULL) + // return false; + // if (ns != DeadState && ns->ninst > 0) + // The RunStateOnByteUnlocked call asks the DFA to build out the graph. + // It returns NULL only if the DFA has run out of memory, + // in which case we can't be sure of anything. + // The second check sees whether there was graph built + // and whether it is interesting graph. Nodes might have + // ns->ninst == 0 if they exist only to represent the fact + // that a match was found on the previous byte. + + // Build minimum prefix. + State* s = params.start; + min->clear(); + for (int i = 0; i < maxlen; i++) { + if (previously_visited_states[s] > kMaxEltRepetitions) { + VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions + << " for state s=" << s << " and min=" << CEscape(*min); + break; + } + previously_visited_states[s]++; + + // Stop if min is a match. + State* ns = RunStateOnByteUnlocked(s, kByteEndText); + if (ns == NULL) // DFA out of memory + return false; + if (ns != DeadState && (ns == FullMatchState || ns->IsMatch())) + break; + + // Try to extend the string with low bytes. + bool extended = false; + for (int j = 0; j < 256; j++) { + ns = RunStateOnByteUnlocked(s, j); + if (ns == NULL) // DFA out of memory + return false; + if (ns == FullMatchState || + (ns > SpecialStateMax && ns->ninst_ > 0)) { + extended = true; + min->append(1, j); + s = ns; + break; + } + } + if (!extended) + break; + } + + // Build maximum prefix. 
+ previously_visited_states.clear(); + s = params.start; + max->clear(); + for (int i = 0; i < maxlen; i++) { + if (previously_visited_states[s] > kMaxEltRepetitions) { + VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions + << " for state s=" << s << " and max=" << CEscape(*max); + break; + } + previously_visited_states[s] += 1; + + // Try to extend the string with high bytes. + bool extended = false; + for (int j = 255; j >= 0; j--) { + State* ns = RunStateOnByteUnlocked(s, j); + if (ns == NULL) + return false; + if (ns == FullMatchState || + (ns > SpecialStateMax && ns->ninst_ > 0)) { + extended = true; + max->append(1, j); + s = ns; + break; + } + } + if (!extended) { + // Done, no need for PrefixSuccessor. + return true; + } + } + + // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b + *max = PrefixSuccessor(*max); + + // If there are no bytes left, we have no way to say "there is no maximum + // string". We could make the interface more complicated and be able to + // return "there is no maximum but here is a minimum", but that seems like + // overkill -- the most common no-max case is all possible strings, so not + // telling the caller that the empty string is the minimum match isn't a + // great loss. + if (max->empty()) + return false; + + return true; +} + +// PossibleMatchRange for a Prog. +bool Prog::PossibleMatchRange(string* min, string* max, int maxlen) { + DFA* dfa = NULL; + { + MutexLock l(&dfa_mutex_); + // Have to use dfa_longest_ to get all strings for full matches. + // For example, (a|aa) never matches aa in first-match mode. 
+ dfa = dfa_longest_; + if (dfa == NULL) { + dfa = new DFA(this, Prog::kLongestMatch, dfa_mem_/2); + ATOMIC_STORE_RELEASE(&dfa_longest_, dfa); + delete_dfa_ = DeleteDFA; + } + } + return dfa->PossibleMatchRange(min, max, maxlen); +} + +} // namespace re2 diff --git a/outside/re2/re2/filtered_re2.cc b/outside/re2/re2/filtered_re2.cc new file mode 100644 index 000000000..f57625895 --- /dev/null +++ b/outside/re2/re2/filtered_re2.cc @@ -0,0 +1,102 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include "util/util.h" +#include "re2/filtered_re2.h" +#include "re2/prefilter.h" +#include "re2/prefilter_tree.h" + +namespace re2 { + +FilteredRE2::FilteredRE2() + : compiled_(false), + prefilter_tree_(new PrefilterTree()) { +} + +FilteredRE2::~FilteredRE2() { + for (int i = 0; i < re2_vec_.size(); i++) + delete re2_vec_[i]; + delete prefilter_tree_; +} + +RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, + const RE2::Options& options, int* id) { + RE2* re = new RE2(pattern, options); + RE2::ErrorCode code = re->error_code(); + + if (!re->ok()) { + if (options.log_errors()) { + LOG(ERROR) << "Couldn't compile regular expression, skipping: " + << re << " due to error " << re->error(); + } + delete re; + } else { + *id = re2_vec_.size(); + re2_vec_.push_back(re); + } + + return code; +} + +void FilteredRE2::Compile(vector* atoms) { + if (compiled_ || re2_vec_.size() == 0) { + LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size(); + return; + } + + for (int i = 0; i < re2_vec_.size(); i++) { + Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]); + prefilter_tree_->Add(prefilter); + } + atoms->clear(); + prefilter_tree_->Compile(atoms); + compiled_ = true; +} + +int FilteredRE2::SlowFirstMatch(const StringPiece& text) const { + for (int i = 0; i < re2_vec_.size(); i++) + if (RE2::PartialMatch(text, *re2_vec_[i])) + return i; + 
return -1; +} + +int FilteredRE2::FirstMatch(const StringPiece& text, + const vector& atoms) const { + if (!compiled_) { + LOG(DFATAL) << "FirstMatch called before Compile"; + return -1; + } + vector regexps; + prefilter_tree_->RegexpsGivenStrings(atoms, ®exps); + for (int i = 0; i < regexps.size(); i++) + if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) + return regexps[i]; + return -1; +} + +bool FilteredRE2::AllMatches( + const StringPiece& text, + const vector& atoms, + vector* matching_regexps) const { + matching_regexps->clear(); + vector regexps; + prefilter_tree_->RegexpsGivenStrings(atoms, ®exps); + for (int i = 0; i < regexps.size(); i++) + if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) + matching_regexps->push_back(regexps[i]); + return !matching_regexps->empty(); +} + +void FilteredRE2::RegexpsGivenStrings(const vector& matched_atoms, + vector* passed_regexps) { + prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps); +} + + +void FilteredRE2::PrintPrefilter(int regexpid) { + prefilter_tree_->PrintPrefilter(regexpid); +} + +} // namespace re2 diff --git a/outside/re2/re2/filtered_re2.h b/outside/re2/re2/filtered_re2.h new file mode 100644 index 000000000..64b35be6c --- /dev/null +++ b/outside/re2/re2/filtered_re2.h @@ -0,0 +1,101 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps. +// It provides a prefilter mechanism that helps in cutting down the +// number of regexps that need to be actually searched. +// +// By design, it does not include a string matching engine. This is to +// allow the user of the class to use their favorite string match +// engine. The overall flow is: Add all the regexps using Add, then +// Compile the FilteredRE2. The compile returns strings that need to +// be matched. Note that all returned strings are lowercase. 
For +// applying regexps to a search text, the caller does the string +// matching using the strings returned. When doing the string match, +// note that the caller has to do that on lower cased version of the +// search text. Then call FirstMatch or AllMatches with a vector of +// indices of strings that were found in the text to get the actual +// regexp matches. + +#ifndef RE2_FILTERED_RE2_H_ +#define RE2_FILTERED_RE2_H_ + +#include +#include "re2/re2.h" + +namespace re2 { +using std::vector; + +class PrefilterTree; + +class FilteredRE2 { + public: + FilteredRE2(); + ~FilteredRE2(); + + // Uses RE2 constructor to create a RE2 object (re). Returns + // re->error_code(). If error_code is other than NoError, then re is + // deleted and not added to re2_vec_. + RE2::ErrorCode Add(const StringPiece& pattern, + const RE2::Options& options, + int *id); + + // Prepares the regexps added by Add for filtering. Returns a set + // of strings that the caller should check for in candidate texts. + // The returned strings are lowercased. When doing string matching, + // the search text should be lowercased first to find matching + // strings from the set of strings returned by Compile. Call after + // all Add calls are done. + void Compile(vector* strings_to_match); + + // Returns the index of the first matching regexp. + // Returns -1 on no match. Can be called prior to Compile. + // Does not do any filtering: simply tries to Match the + // regexps in a loop. + int SlowFirstMatch(const StringPiece& text) const; + + // Returns the index of the first matching regexp. + // Returns -1 on no match. Compile has to be called before + // calling this. + int FirstMatch(const StringPiece& text, + const vector& atoms) const; + + // Returns the indices of all matching regexps, after first clearing + // matched_regexps. + bool AllMatches(const StringPiece& text, + const vector& atoms, + vector* matching_regexps) const; + + // The number of regexps added. 
+ int NumRegexps() const { return re2_vec_.size(); } + + private: + + // Get the individual RE2 objects. Useful for testing. + RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; } + + // Print prefilter. + void PrintPrefilter(int regexpid); + + // Useful for testing and debugging. + void RegexpsGivenStrings(const vector& matched_atoms, + vector* passed_regexps); + + // All the regexps in the FilteredRE2. + vector re2_vec_; + + // Has the FilteredRE2 been compiled using Compile() + bool compiled_; + + // An AND-OR tree of string atoms used for filtering regexps. + PrefilterTree* prefilter_tree_; + + //DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2); + FilteredRE2(const FilteredRE2&); + void operator=(const FilteredRE2&); +}; + +} // namespace re2 + +#endif // RE2_FILTERED_RE2_H_ diff --git a/outside/re2/re2/make_perl_groups.pl b/outside/re2/re2/make_perl_groups.pl new file mode 100755 index 000000000..8c1f4f6ff --- /dev/null +++ b/outside/re2/re2/make_perl_groups.pl @@ -0,0 +1,110 @@ +#!/usr/bin/perl +# Copyright 2008 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Generate table entries giving character ranges +# for POSIX/Perl character classes. Rather than +# figure out what the definition is, it is easier to ask +# Perl about each letter from 0-128 and write down +# its answer. 
+ +@posixclasses = ( + "[:alnum:]", + "[:alpha:]", + "[:ascii:]", + "[:blank:]", + "[:cntrl:]", + "[:digit:]", + "[:graph:]", + "[:lower:]", + "[:print:]", + "[:punct:]", + "[:space:]", + "[:upper:]", + "[:word:]", + "[:xdigit:]", +); + +@perlclasses = ( + "\\d", + "\\s", + "\\w", +); + +sub ComputeClass($) { + my @ranges; + my ($class) = @_; + my $regexp = "[$class]"; + my $start = -1; + for (my $i=0; $i<=129; $i++) { + if ($i == 129) { $i = 256; } + if ($i <= 128 && chr($i) =~ $regexp) { + if ($start < 0) { + $start = $i; + } + } else { + if ($start >= 0) { + push @ranges, [$start, $i-1]; + } + $start = -1; + } + } + return @ranges; +} + +sub PrintClass($$@) { + my ($cname, $name, @ranges) = @_; + print "static const URange16 code${cname}[] = { /* $name */\n"; + for (my $i=0; $i<@ranges; $i++) { + my @a = @{$ranges[$i]}; + printf "\t{ 0x%x, 0x%x },\n", $a[0], $a[1]; + } + print "};\n"; + my $n = @ranges; + my $escname = $name; + $escname =~ s/\\/\\\\/g; + $negname = $escname; + if ($negname =~ /:/) { + $negname =~ s/:/:^/; + } else { + $negname =~ y/a-z/A-Z/; + } + return "{ \"$escname\", +1, code$cname, $n }", "{ \"$negname\", -1, code$cname, $n }"; +} + +my $gen = 0; + +sub PrintClasses($@) { + my ($cname, @classes) = @_; + my @entries; + foreach my $cl (@classes) { + my @ranges = ComputeClass($cl); + push @entries, PrintClass(++$gen, $cl, @ranges); + } + print "const UGroup ${cname}_groups[] = {\n"; + foreach my $e (@entries) { + print "\t$e,\n"; + } + print "};\n"; + my $count = @entries; + print "const int num_${cname}_groups = $count;\n"; +} + +print <perl_groups.cc + +#include "re2/unicode_groups.h" + +namespace re2 { + +EOF + +PrintClasses("perl", @perlclasses); +PrintClasses("posix", @posixclasses); + +print <unicode_casefold.cc + +#include "re2/unicode_casefold.h" + +namespace re2 { + +""" + +_trailer = """ + +} // namespace re2 + +""" + +def _Delta(a, b): + """Compute the delta for b - a. 
Even/odd and odd/even + are handled specially, as described above.""" + if a+1 == b: + if a%2 == 0: + return 'EvenOdd' + else: + return 'OddEven' + if a == b+1: + if a%2 == 0: + return 'OddEven' + else: + return 'EvenOdd' + return b - a + +def _AddDelta(a, delta): + """Return a + delta, handling EvenOdd and OddEven specially.""" + if type(delta) == int: + return a+delta + if delta == 'EvenOdd': + if a%2 == 0: + return a+1 + else: + return a-1 + if delta == 'OddEven': + if a%2 == 1: + return a+1 + else: + return a-1 + print >>sys.stderr, "Bad Delta: ", delta + raise "Bad Delta" + +def _MakeRanges(pairs): + """Turn a list like [(65,97), (66, 98), ..., (90,122)] + into [(65, 90, +32)].""" + ranges = [] + last = -100 + + def evenodd(last, a, b, r): + if a != last+1 or b != _AddDelta(a, r[2]): + return False + r[1] = a + return True + + def evenoddpair(last, a, b, r): + if a != last+2: + return False + delta = r[2] + d = delta + if type(delta) is not str: + return False + if delta.endswith('Skip'): + d = delta[:-4] + else: + delta = d + 'Skip' + if b != _AddDelta(a, d): + return False + r[1] = a + r[2] = delta + return True + + for a, b in pairs: + if ranges and evenodd(last, a, b, ranges[-1]): + pass + elif ranges and evenoddpair(last, a, b, ranges[-1]): + pass + else: + ranges.append([a, a, _Delta(a, b)]) + last = a + return ranges + +# The maximum size of a case-folding group. +# Case folding is implemented in parse.cc by a recursive process +# with a recursion depth equal to the size of the largest +# case-folding group, so it is important that this bound be small. +# The current tables have no group bigger than 4. +# If there are ever groups bigger than 10 or so, it will be +# time to rework the code in parse.cc. 
+MaxCasefoldGroup = 4 + +def main(): + lowergroups, casegroups = unicode.CaseGroups() + foldpairs = [] + seen = {} + for c in casegroups: + if len(c) > MaxCasefoldGroup: + raise unicode.Error("casefold group too long: %s" % (c,)) + for i in range(len(c)): + if c[i-1] in seen: + raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i])) + seen[c[i-1]] = True + foldpairs.append([c[i-1], c[i]]) + + lowerpairs = [] + for lower, group in lowergroups.iteritems(): + for g in group: + if g != lower: + lowerpairs.append([g, lower]) + + def printpairs(name, foldpairs): + foldpairs.sort() + foldranges = _MakeRanges(foldpairs) + print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges)) + print "const CaseFold unicode_%s[] = {" % (name,) + for lo, hi, delta in foldranges: + print "\t{ %d, %d, %s }," % (lo, hi, delta) + print "};" + print "const int num_unicode_%s = %d;" % (name, len(foldranges),) + print "" + + print _header + printpairs("casefold", foldpairs) + printpairs("tolower", lowerpairs) + print _trailer + +if __name__ == '__main__': + main() diff --git a/outside/re2/re2/make_unicode_groups.py b/outside/re2/re2/make_unicode_groups.py new file mode 100755 index 000000000..8499793fa --- /dev/null +++ b/outside/re2/re2/make_unicode_groups.py @@ -0,0 +1,111 @@ +#!/usr/bin/python +# Copyright 2008 The RE2 Authors. All Rights Reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +"""Generate C++ tables for Unicode Script and Category groups.""" + +import sys +import unicode + +_header = """ +// GENERATED BY make_unicode_groups.py; DO NOT EDIT. 
+// make_unicode_groups.py >unicode_groups.cc + +#include "re2/unicode_groups.h" + +namespace re2 { + +""" + +_trailer = """ + +} // namespace re2 + +""" + +n16 = 0 +n32 = 0 + +def MakeRanges(codes): + """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]]""" + ranges = [] + last = -100 + for c in codes: + if c == last+1: + ranges[-1][1] = c + else: + ranges.append([c, c]) + last = c + return ranges + +def PrintRanges(type, name, ranges): + """Print the ranges as an array of type named name.""" + print "static const %s %s[] = {" % (type, name,) + for lo, hi in ranges: + print "\t{ %d, %d }," % (lo, hi) + print "};" + +# def PrintCodes(type, name, codes): +# """Print the codes as an array of type named name.""" +# print "static %s %s[] = {" % (type, name,) +# for c in codes: +# print "\t%d," % (c,) +# print "};" + +def PrintGroup(name, codes): + """Print the data structures for the group of codes. + Return a UGroup literal for the group.""" + + # See unicode_groups.h for a description of the data structure. + + # Split codes into 16-bit ranges and 32-bit ranges. + range16 = MakeRanges([c for c in codes if c < 65536]) + range32 = MakeRanges([c for c in codes if c >= 65536]) + + # Pull singleton ranges out of range16. 
+ # code16 = [lo for lo, hi in range16 if lo == hi] + # range16 = [[lo, hi] for lo, hi in range16 if lo != hi] + + global n16 + global n32 + n16 += len(range16) + n32 += len(range32) + + ugroup = "{ \"%s\", +1" % (name,) + # if len(code16) > 0: + # PrintCodes("uint16", name+"_code16", code16) + # ugroup += ", %s_code16, %d" % (name, len(code16)) + # else: + # ugroup += ", 0, 0" + if len(range16) > 0: + PrintRanges("URange16", name+"_range16", range16) + ugroup += ", %s_range16, %d" % (name, len(range16)) + else: + ugroup += ", 0, 0" + if len(range32) > 0: + PrintRanges("URange32", name+"_range32", range32) + ugroup += ", %s_range32, %d" % (name, len(range32)) + else: + ugroup += ", 0, 0" + ugroup += " }" + return ugroup + +def main(): + print _header + ugroups = [] + for name, codes in unicode.Categories().iteritems(): + ugroups.append(PrintGroup(name, codes)) + for name, codes in unicode.Scripts().iteritems(): + ugroups.append(PrintGroup(name, codes)) + print "// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32) + print "const UGroup unicode_groups[] = {"; + ugroups.sort() + for ug in ugroups: + print "\t%s," % (ug,) + print "};" + print "const int num_unicode_groups = %d;" % (len(ugroups),) + print _trailer + +if __name__ == '__main__': + main() diff --git a/outside/re2/re2/mimics_pcre.cc b/outside/re2/re2/mimics_pcre.cc new file mode 100644 index 000000000..fc6dd4ad5 --- /dev/null +++ b/outside/re2/re2/mimics_pcre.cc @@ -0,0 +1,185 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Determine whether this library should match PCRE exactly +// for a particular Regexp. (If so, the testing framework can +// check that it does.) +// +// This library matches PCRE except in these cases: +// * the regexp contains a repetition of an empty string, +// like (a*)* or (a*)+. 
In this case, PCRE will treat +// the repetition sequence as ending with an empty string, +// while this library does not. +// * Perl and PCRE differ on whether \v matches \n. +// For historical reasons, this library implements the Perl behavior. +// * Perl and PCRE allow $ in one-line mode to match either the very +// end of the text or just before a \n at the end of the text. +// This library requires it to match only the end of the text. +// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to +// match the end of the text if the last character is a \n. +// This library does allow it. +// +// Regexp::MimicsPCRE checks for any of these conditions. + +#include "util/util.h" +#include "re2/regexp.h" +#include "re2/walker-inl.h" + +namespace re2 { + +// Returns whether re might match an empty string. +static bool CanBeEmptyString(Regexp *re); + +// Walker class to compute whether library handles a regexp +// exactly as PCRE would. See comment at top for conditions. + +class PCREWalker : public Regexp::Walker { + public: + PCREWalker() {} + bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args, + int nchild_args); + + bool ShortVisit(Regexp* re, bool a) { + // Should never be called: we use Walk not WalkExponential. + LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; + return a; + } +}; + +// Called after visiting each of re's children and accumulating +// the return values in child_args. So child_args contains whether +// this library mimics PCRE for those subexpressions. +bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args) { + // If children failed, so do we. + for (int i = 0; i < nchild_args; i++) + if (!child_args[i]) + return false; + + // Otherwise look for other reasons to fail. + switch (re->op()) { + // Look for repeated empty string. 
+ case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + if (CanBeEmptyString(re->sub()[0])) + return false; + break; + case kRegexpRepeat: + if (re->max() == -1 && CanBeEmptyString(re->sub()[0])) + return false; + break; + + // Look for \v + case kRegexpLiteral: + if (re->rune() == '\v') + return false; + break; + + // Look for $ in single-line mode. + case kRegexpEndText: + case kRegexpEmptyMatch: + if (re->parse_flags() & Regexp::WasDollar) + return false; + break; + + // Look for ^ in multi-line mode. + case kRegexpBeginLine: + // No condition: in single-line mode ^ becomes kRegexpBeginText. + return false; + + default: + break; + } + + // Not proven guilty. + return true; +} + +// Returns whether this regexp's behavior will mimic PCRE's exactly. +bool Regexp::MimicsPCRE() { + PCREWalker w; + return w.Walk(this, true); +} + + +// Walker class to compute whether a Regexp can match an empty string. +// It is okay to overestimate. For example, \b\B cannot match an empty +// string, because \b and \B are mutually exclusive, but this isn't +// that smart and will say it can. Spurious empty strings +// will reduce the number of regexps we sanity check against PCRE, +// but they won't break anything. + +class EmptyStringWalker : public Regexp::Walker { + public: + EmptyStringWalker() { } + bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args); + + bool ShortVisit(Regexp* re, bool a) { + // Should never be called: we use Walk not WalkExponential. + LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; + return a; + } + + private: + DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker); +}; + +// Called after visiting re's children. child_args contains the return +// value from each of the children's PostVisits (i.e., whether each child +// can match an empty string). Returns whether this clause can match an +// empty string. 
+bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args) { + switch (re->op()) { + case kRegexpNoMatch: // never empty + case kRegexpLiteral: + case kRegexpAnyChar: + case kRegexpAnyByte: + case kRegexpCharClass: + case kRegexpLiteralString: + return false; + + case kRegexpEmptyMatch: // always empty + case kRegexpBeginLine: // always empty, when they match + case kRegexpEndLine: + case kRegexpNoWordBoundary: + case kRegexpWordBoundary: + case kRegexpBeginText: + case kRegexpEndText: + case kRegexpStar: // can always be empty + case kRegexpQuest: + case kRegexpHaveMatch: + return true; + + case kRegexpConcat: // can be empty if all children can + for (int i = 0; i < nchild_args; i++) + if (!child_args[i]) + return false; + return true; + + case kRegexpAlternate: // can be empty if any child can + for (int i = 0; i < nchild_args; i++) + if (child_args[i]) + return true; + return false; + + case kRegexpPlus: // can be empty if the child can + case kRegexpCapture: + return child_args[0]; + + case kRegexpRepeat: // can be empty if child can or is x{0} + return child_args[0] || re->min() == 0; + } + return false; +} + +// Returns whether re can match an empty string. +static bool CanBeEmptyString(Regexp* re) { + EmptyStringWalker w; + return w.Walk(re, true); +} + +} // namespace re2 diff --git a/outside/re2/re2/nfa.cc b/outside/re2/re2/nfa.cc new file mode 100644 index 000000000..8c4f76136 --- /dev/null +++ b/outside/re2/re2/nfa.cc @@ -0,0 +1,709 @@ +// Copyright 2006-2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Tested by search_test.cc. +// +// Prog::SearchNFA, an NFA search. +// This is an actual NFA like the theorists talk about, +// not the pseudo-NFA found in backtracking regexp implementations. 
+// +// IMPLEMENTATION +// +// This algorithm is a variant of one that appeared in Rob Pike's sam editor, +// which is a variant of the one described in Thompson's 1968 CACM paper. +// See http://swtch.com/~rsc/regexp/ for various history. The main feature +// over the DFA implementation is that it tracks submatch boundaries. +// +// When the choice of submatch boundaries is ambiguous, this particular +// implementation makes the same choices that traditional backtracking +// implementations (in particular, Perl and PCRE) do. +// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential +// time in the length of the input. +// +// Like Thompson's original machine and like the DFA implementation, this +// implementation notices a match only once it is one byte past it. + +#include "re2/prog.h" +#include "re2/regexp.h" +#include "util/sparse_array.h" +#include "util/sparse_set.h" + +namespace re2 { + +class NFA { + public: + NFA(Prog* prog); + ~NFA(); + + // Searches for a matching string. + // * If anchored is true, only considers matches starting at offset. + // Otherwise finds lefmost match at or after offset. + // * If longest is true, returns the longest match starting + // at the chosen start point. Otherwise returns the so-called + // left-biased match, the one traditional backtracking engines + // (like Perl and PCRE) find. + // Records submatch boundaries in submatch[1..nsubmatch-1]. + // Submatch[0] is the entire match. When there is a choice in + // which text matches each subexpression, the submatch boundaries + // are chosen to match what a backtracking implementation would choose. + bool Search(const StringPiece& text, const StringPiece& context, + bool anchored, bool longest, + StringPiece* submatch, int nsubmatch); + + static const int Debug = 0; + + private: + struct Thread { + union { + int id; + Thread* next; // when on free list + }; + const char** capture; + }; + + // State for explicit stack in AddToThreadq. 
+ struct AddState { + int id; // Inst to process + int j; + const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip + + AddState() + : id(0), j(-1), cap_j(NULL) {} + explicit AddState(int id) + : id(id), j(-1), cap_j(NULL) {} + AddState(int id, const char* cap_j, int j) + : id(id), j(j), cap_j(cap_j) {} + }; + + // Threadq is a list of threads. The list is sorted by the order + // in which Perl would explore that particular state -- the earlier + // choices appear earlier in the list. + typedef SparseArray Threadq; + + inline Thread* AllocThread(); + inline void FreeThread(Thread*); + + // Add id (or its children, following unlabeled arrows) + // to the workqueue q with associated capture info. + void AddToThreadq(Threadq* q, int id, int flag, + const char* p, const char** capture); + + // Run runq on byte c, appending new states to nextq. + // Updates matched_ and match_ as new, better matches are found. + // p is position of the next byte (the one after c) + // in the input string, used when processing capturing parens. + // flag is the bitwise or of Bol, Eol, etc., specifying whether + // ^, $ and \b match the current input point (after c). + inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p); + + // Returns text version of capture information, for debugging. + string FormatCapture(const char** capture); + + inline void CopyCapture(const char** dst, const char** src); + + // Computes whether all matches must begin with the same first + // byte, and if so, returns that byte. If not, returns -1. 
+ int ComputeFirstByte(); + + Prog* prog_; // underlying program + int start_; // start instruction in program + int ncapture_; // number of submatches to track + bool longest_; // whether searching for longest match + bool endmatch_; // whether match must end at text.end() + const char* btext_; // beginning of text being matched (for FormatSubmatch) + const char* etext_; // end of text being matched (for endmatch_) + Threadq q0_, q1_; // pre-allocated for Search. + const char** match_; // best match so far + bool matched_; // any match so far? + AddState* astack_; // pre-allocated for AddToThreadq + int nastack_; + int first_byte_; // required first byte for match, or -1 if none + + Thread* free_threads_; // free list + + DISALLOW_EVIL_CONSTRUCTORS(NFA); +}; + +NFA::NFA(Prog* prog) { + prog_ = prog; + start_ = prog->start(); + ncapture_ = 0; + longest_ = false; + endmatch_ = false; + btext_ = NULL; + etext_ = NULL; + q0_.resize(prog_->size()); + q1_.resize(prog_->size()); + nastack_ = 2*prog_->size(); + astack_ = new AddState[nastack_]; + match_ = NULL; + matched_ = false; + free_threads_ = NULL; + first_byte_ = ComputeFirstByte(); +} + +NFA::~NFA() { + delete[] match_; + delete[] astack_; + Thread* next; + for (Thread* t = free_threads_; t; t = next) { + next = t->next; + delete[] t->capture; + delete t; + } +} + +void NFA::FreeThread(Thread *t) { + if (t == NULL) + return; + t->next = free_threads_; + free_threads_ = t; +} + +NFA::Thread* NFA::AllocThread() { + Thread* t = free_threads_; + if (t == NULL) { + t = new Thread; + t->capture = new const char*[ncapture_]; + return t; + } + free_threads_ = t->next; + return t; +} + +void NFA::CopyCapture(const char** dst, const char** src) { + for (int i = 0; i < ncapture_; i+=2) { + dst[i] = src[i]; + dst[i+1] = src[i+1]; + } +} + +// Follows all empty arrows from id0 and enqueues all the states reached. +// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match. 
+// The pointer p is the current input position, and capture is the +// current set of match boundaries. +void NFA::AddToThreadq(Threadq* q, int id0, int flag, + const char* p, const char** capture) { + if (id0 == 0) + return; + + // Astack_ is pre-allocated to avoid resize operations. + // It has room for 2*prog_->size() entries, which is enough: + // Each inst in prog can be processed at most once, + // pushing at most two entries on stk. + + int nstk = 0; + AddState* stk = astack_; + stk[nstk++] = AddState(id0); + + while (nstk > 0) { + DCHECK_LE(nstk, nastack_); + const AddState& a = stk[--nstk]; + if (a.j >= 0) + capture[a.j] = a.cap_j; + + int id = a.id; + if (id == 0) + continue; + if (q->has_index(id)) { + if (Debug) + fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str()); + continue; + } + + // Create entry in q no matter what. We might fill it in below, + // or we might not. Even if not, it is necessary to have it, + // so that we don't revisit id0 during the recursion. + q->set_new(id, NULL); + + Thread** tp = &q->find(id)->second; + int j; + Thread* t; + Prog::Inst* ip = prog_->inst(id); + switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq"; + break; + + case kInstFail: + break; + + case kInstAltMatch: + // Save state; will pick up at next byte. + t = AllocThread(); + t->id = id; + CopyCapture(t->capture, capture); + *tp = t; + // fall through + + case kInstAlt: + // Explore alternatives. + stk[nstk++] = AddState(ip->out1()); + stk[nstk++] = AddState(ip->out()); + break; + + case kInstNop: + // Continue on. + stk[nstk++] = AddState(ip->out()); + break; + + case kInstCapture: + if ((j=ip->cap()) < ncapture_) { + // Push a dummy whose only job is to restore capture[j] + // once we finish exploring this possibility. + stk[nstk++] = AddState(0, capture[j], j); + + // Record capture.
+ capture[j] = p; + } + stk[nstk++] = AddState(ip->out()); + break; + + case kInstMatch: + case kInstByteRange: + // Save state; will pick up at next byte. + t = AllocThread(); + t->id = id; + CopyCapture(t->capture, capture); + *tp = t; + if (Debug) + fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t); + break; + + case kInstEmptyWidth: + // Continue on if we have all the right flag bits. + if (ip->empty() & ~flag) + break; + stk[nstk++] = AddState(ip->out()); + break; + } + } +} + +// Run runq on byte c, appending new states to nextq. +// Updates match as new, better matches are found. +// p is position of the byte c in the input string, +// used when processing capturing parens. +// flag is the bitwise or of Bol, Eol, etc., specifying whether +// ^, $ and \b match the current input point (after c). +// Frees all the threads on runq. +// If there is a shortcut to the end, returns that shortcut. +int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) { + nextq->clear(); + + for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { + Thread* t = i->second; + if (t == NULL) + continue; + + if (longest_) { + // Can skip any threads started after our current best match. + if (matched_ && match_[0] < t->capture[0]) { + FreeThread(t); + continue; + } + } + + int id = t->id; + Prog::Inst* ip = prog_->inst(id); + + switch (ip->opcode()) { + default: + // Should only see the values handled below. + LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step"; + break; + + case kInstByteRange: + if (ip->Matches(c)) + AddToThreadq(nextq, ip->out(), flag, p+1, t->capture); + break; + + case kInstAltMatch: + if (i != runq->begin()) + break; + // The match is ours if we want it.
+ if (ip->greedy(prog_) || longest_) { + CopyCapture((const char**)match_, t->capture); + FreeThread(t); + for (++i; i != runq->end(); ++i) + FreeThread(i->second); + runq->clear(); + matched_ = true; + if (ip->greedy(prog_)) + return ip->out1(); + return ip->out(); + } + break; + + case kInstMatch: + if (endmatch_ && p != etext_) + break; + + const char* old = t->capture[1]; // previous end pointer + t->capture[1] = p; + if (longest_) { + // Leftmost-longest mode: save this match only if + // it is either farther to the left or at the same + // point but longer than an existing match. + if (!matched_ || t->capture[0] < match_[0] || + (t->capture[0] == match_[0] && t->capture[1] > match_[1])) + CopyCapture((const char**)match_, t->capture); + } else { + // Leftmost-biased mode: this match is by definition + // better than what we've already found (see next line). + CopyCapture((const char**)match_, t->capture); + + // Cut off the threads that can only find matches + // worse than the one we just found: don't run the + // rest of the current Threadq. + // Put back the end pointer saved in old; it belongs in slot 1, + // the slot it was read from, not in the start slot. + t->capture[1] = old; + FreeThread(t); + for (++i; i != runq->end(); ++i) + FreeThread(i->second); + runq->clear(); + matched_ = true; + return 0; + } + // Put back the end pointer saved in old (slot 1) before the + // thread is freed below. + t->capture[1] = old; + matched_ = true; + break; + } + FreeThread(t); + } + runq->clear(); + return 0; +} + +// Formats the capture registers as a run of "(start,end)" pairs, +// printed as offsets from btext_, with '?' for registers still NULL. +string NFA::FormatCapture(const char** capture) { + string s; + + for (int i = 0; i < ncapture_; i+=2) { + if (capture[i] == NULL) + StringAppendF(&s, "(?,?)"); + else if (capture[i+1] == NULL) + StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_)); + else + StringAppendF(&s, "(%d,%d)", + (int)(capture[i] - btext_), + (int)(capture[i+1] - btext_)); + } + return s; +} + +// Returns whether haystack contains needle's memory.
+static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) { + return haystack.begin() <= needle.begin() && + haystack.end() >= needle.end(); +} + +bool NFA::Search(const StringPiece& text, const StringPiece& const_context, + bool anchored, bool longest, + StringPiece* submatch, int nsubmatch) { + if (start_ == 0) + return false; + + StringPiece context = const_context; + if (context.begin() == NULL) + context = text; + + if (!StringPieceContains(context, text)) { + LOG(FATAL) << "Bad args: context does not contain text " + << reinterpret_cast(context.begin()) + << "+" << context.size() << " " + << reinterpret_cast(text.begin()) + << "+" << text.size(); + return false; + } + + if (prog_->anchor_start() && context.begin() != text.begin()) + return false; + if (prog_->anchor_end() && context.end() != text.end()) + return false; + anchored |= prog_->anchor_start(); + if (prog_->anchor_end()) { + longest = true; + endmatch_ = true; + etext_ = text.end(); + } + + if (nsubmatch < 0) { + LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch; + return false; + } + + // Save search parameters. + ncapture_ = 2*nsubmatch; + longest_ = longest; + + if (nsubmatch == 0) { + // We need to maintain match[0], both to distinguish the + // longest match (if longest is true) and also to tell + // whether we've seen any matches at all. + ncapture_ = 2; + } + + match_ = new const char*[ncapture_]; + matched_ = false; + memset(match_, 0, ncapture_*sizeof match_[0]); + + // For debugging prints. + btext_ = context.begin(); + + if (Debug) { + fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n", + text.as_string().c_str(), context.as_string().c_str(), anchored, + longest); + } + + // Set up search. 
+ Threadq* runq = &q0_; + Threadq* nextq = &q1_; + runq->clear(); + nextq->clear(); + memset(&match_[0], 0, ncapture_*sizeof match_[0]); + const char* bp = context.begin(); + int c = -1; + int wasword = 0; + + if (text.begin() > context.begin()) { + c = text.begin()[-1] & 0xFF; + wasword = Prog::IsWordChar(c); + } + + // Loop over the text, stepping the machine. + for (const char* p = text.begin();; p++) { + // Check for empty-width specials. + int flag = 0; + + // ^ and \A + if (p == context.begin()) + flag |= kEmptyBeginText | kEmptyBeginLine; + else if (p <= context.end() && p[-1] == '\n') + flag |= kEmptyBeginLine; + + // $ and \z + if (p == context.end()) + flag |= kEmptyEndText | kEmptyEndLine; + else if (p < context.end() && p[0] == '\n') + flag |= kEmptyEndLine; + + // \b and \B + int isword = 0; + if (p < context.end()) + isword = Prog::IsWordChar(p[0] & 0xFF); + + if (isword != wasword) + flag |= kEmptyWordBoundary; + else + flag |= kEmptyNonWordBoundary; + + if (Debug) { + fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword); + for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { + Thread* t = i->second; + if (t == NULL) + continue; + fprintf(stderr, " %d%s", t->id, + FormatCapture((const char**)t->capture).c_str()); + } + fprintf(stderr, "\n"); + } + + // Process previous character (waited until now to avoid + // repeating the flag computation above). + // This is a no-op the first time around the loop, because + // runq is empty. + int id = Step(runq, nextq, c, flag, p-1); + DCHECK_EQ(runq->size(), 0); + swap(nextq, runq); + nextq->clear(); + if (id != 0) { + // We're done: full match ahead. 
+ p = text.end(); + for (;;) { + Prog::Inst* ip = prog_->inst(id); + switch (ip->opcode()) { + default: + LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode(); + break; + + case kInstCapture: + match_[ip->cap()] = p; + id = ip->out(); + continue; + + case kInstNop: + id = ip->out(); + continue; + + case kInstMatch: + match_[1] = p; + matched_ = true; + break; + + case kInstEmptyWidth: + if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) { + LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty(); + break; + } + id = ip->out(); + continue; + } + break; + } + break; + } + + if (p > text.end()) + break; + + // Start a new thread if there have not been any matches. + // (No point in starting a new thread if there have been + // matches, since it would be to the right of the match + // we already found.) + if (!matched_ && (!anchored || p == text.begin())) { + // If there's a required first byte for an unanchored search + // and we're not in the middle of any possible matches, + // use memchr to search for the byte quickly. + if (!anchored && first_byte_ >= 0 && runq->size() == 0 && + p < text.end() && (p[0] & 0xFF) != first_byte_) { + p = reinterpret_cast(memchr(p, first_byte_, + text.end() - p)); + if (p == NULL) { + p = text.end(); + isword = 0; + } else { + isword = Prog::IsWordChar(p[0] & 0xFF); + } + flag = Prog::EmptyFlags(context, p); + } + + // Steal match storage (cleared but unused as of yet) + // temporarily to hold match boundaries for new thread. + match_[0] = p; + AddToThreadq(runq, start_, flag, p, match_); + match_[0] = NULL; + } + + // If all the threads have died, stop early. + if (runq->size() == 0) { + if (Debug) + fprintf(stderr, "dead\n"); + break; + } + + if (p == text.end()) + c = 0; + else + c = *p & 0xFF; + wasword = isword; + + // Will run step(runq, nextq, c, ...) on next iteration. See above. 
+ } + + for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) + FreeThread(i->second); + + if (matched_) { + for (int i = 0; i < nsubmatch; i++) + submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]); + if (Debug) + fprintf(stderr, "match (%d,%d)\n", + static_cast(match_[0] - btext_), + static_cast(match_[1] - btext_)); + return true; + } + VLOG(1) << "No matches found"; + return false; +} + +// Computes whether all successful matches have a common first byte, +// and if so, returns that byte. If not, returns -1. +int NFA::ComputeFirstByte() { + if (start_ == 0) + return -1; + + int b = -1; // first byte, not yet computed + + typedef SparseSet Workq; + Workq q(prog_->size()); + q.insert(start_); + for (Workq::iterator it = q.begin(); it != q.end(); ++it) { + int id = *it; + Prog::Inst* ip = prog_->inst(id); + switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte"; + break; + + case kInstMatch: + // The empty string matches: no first byte. + return -1; + + case kInstByteRange: + // Must match only a single byte + if (ip->lo() != ip->hi()) + return -1; + if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z') + return -1; + // If we haven't seen any bytes yet, record it; + // otherwise must match the one we saw before. + if (b == -1) + b = ip->lo(); + else if (b != ip->lo()) + return -1; + break; + + case kInstNop: + case kInstCapture: + case kInstEmptyWidth: + // Continue on. + // Ignore ip->empty() flags for kInstEmptyWidth + // in order to be as conservative as possible + // (assume all possible empty-width flags are true). + if (ip->out()) + q.insert(ip->out()); + break; + + case kInstAlt: + case kInstAltMatch: + // Explore alternatives. 
+ if (ip->out()) + q.insert(ip->out()); + if (ip->out1()) + q.insert(ip->out1()); + break; + + case kInstFail: + break; + } + } + return b; +} + +bool +Prog::SearchNFA(const StringPiece& text, const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch) { + if (NFA::Debug) + Dump(); + + NFA nfa(this); + StringPiece sp; + if (kind == kFullMatch) { + anchor = kAnchored; + if (nmatch == 0) { + match = &sp; + nmatch = 1; + } + } + if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch)) + return false; + if (kind == kFullMatch && match[0].end() != text.end()) + return false; + return true; +} + +} // namespace re2 + diff --git a/outside/re2/re2/onepass.cc b/outside/re2/re2/onepass.cc new file mode 100644 index 000000000..1c4998828 --- /dev/null +++ b/outside/re2/re2/onepass.cc @@ -0,0 +1,614 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Tested by search_test.cc. +// +// Prog::SearchOnePass is an efficient implementation of +// regular expression search with submatch tracking for +// what I call "one-pass regular expressions". (An alternate +// name might be "backtracking-free regular expressions".) +// +// One-pass regular expressions have the property that +// at each input byte during an anchored match, there may be +// multiple alternatives but only one can proceed for any +// given input byte. +// +// For example, the regexp /x*yx*/ is one-pass: you read +// x's until a y, then you read the y, then you keep reading x's. +// At no point do you have to guess what to do or back up +// and try a different guess. +// +// On the other hand, /x*x/ is not one-pass: when you're +// looking at an input "x", it's not clear whether you should +// use it to extend the x* or as the final x. +// +// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not. 
+// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not. +// +// A simple intuition for identifying one-pass regular expressions +// is that it's always immediately obvious when a repetition ends. +// It must also be immediately obvious which branch of an | to take: +// +// /x(y|z)/ is one-pass, but /(xy|xz)/ is not. +// +// The NFA-based search in nfa.cc does some bookkeeping to +// avoid the need for backtracking and its associated exponential blowup. +// But if we have a one-pass regular expression, there is no +// possibility of backtracking, so there is no need for the +// extra bookkeeping. Hence, this code. +// +// On a one-pass regular expression, the NFA code in nfa.cc +// runs at about 1/20 of the backtracking-based PCRE speed. +// In contrast, the code in this file runs at about the same +// speed as PCRE. +// +// One-pass regular expressions get used a lot when RE is +// used for parsing simple strings, so it pays off to +// notice them and handle them efficiently. +// +// See also Anne Brüggemann-Klein and Derick Wood, +// "One-unambiguous regular languages", Information and Computation 142(2). + +#include +#include +#include "util/util.h" +#include "util/arena.h" +#include "util/sparse_set.h" +#include "re2/prog.h" +#include "re2/stringpiece.h" + +namespace re2 { + +static const int Debug = 0; + +// The key insight behind this implementation is that the +// non-determinism in an NFA for a one-pass regular expression +// is contained. To explain what that means, first a +// refresher about what regular expression programs look like +// and how the usual NFA execution runs. +// +// In a regular expression program, only the kInstByteRange +// instruction processes an input byte c and moves on to the +// next byte in the string (it does so if c is in the given range). +// The kInstByteRange instructions correspond to literal characters +// and character classes in the regular expression. 
+// +// The kInstAlt instructions are used as wiring to connect the +// kInstByteRange instructions together in interesting ways when +// implementing | + and *. +// The kInstAlt instruction forks execution, like a goto that +// jumps to ip->out() and ip->out1() in parallel. Each of the +// resulting computation paths is called a thread. +// +// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture -- +// are interesting in their own right but like kInstAlt they don't +// advance the input pointer. Only kInstByteRange does. +// +// The automaton execution in nfa.cc runs all the possible +// threads of execution in lock-step over the input. To process +// a particular byte, each thread gets run until it either dies +// or finds a kInstByteRange instruction matching the byte. +// If the latter happens, the thread stops just past the +// kInstByteRange instruction (at ip->out()) and waits for +// the other threads to finish processing the input byte. +// Then, once all the threads have processed that input byte, +// the whole process repeats. The kInstAlt state instruction +// might create new threads during input processing, but no +// matter what, all the threads stop after a kInstByteRange +// and wait for the other threads to "catch up". +// Running in lock step like this ensures that the NFA reads +// the input string only once. +// +// Each thread maintains its own set of capture registers +// (the string positions at which it executed the kInstCapture +// instructions corresponding to capturing parentheses in the +// regular expression). Repeated copying of the capture registers +// is the main performance bottleneck in the NFA implementation. +// +// A regular expression program is "one-pass" if, no matter what +// the input string, there is only one thread that makes it +// past a kInstByteRange instruction at each input byte. This means +// that there is in some sense only one active thread throughout +// the execution. 
Other threads might be created during the +// processing of an input byte, but they are ephemeral: only one +// thread is left to start processing the next input byte. +// This is what I meant above when I said the non-determinism +// was "contained". +// +// To execute a one-pass regular expression program, we can build +// a DFA (no non-determinism) that has at most as many states as +// the NFA (compare this to the possibly exponential number of states +// in the general case). Each state records, for each possible +// input byte, the next state along with the conditions required +// before entering that state -- empty-width flags that must be true +// and capture operations that must be performed. It also records +// the set of conditions required to finish a match at that +// point in the input rather than process the next byte. + +// A state in the one-pass NFA (aka DFA) - just an array of actions. +struct OneState; + +// A state in the one-pass NFA - just an array of actions indexed +// by the bytemap_[] of the next input byte. (The bytemap +// maps next input bytes into equivalence classes, to reduce +// the memory footprint.) +struct OneState { + uint32 matchcond; // conditions to match right now. + uint32 action[1]; // first action slot; states are allocated with room for bytemap_range_ entries. +}; + +// The uint32 conditions in the action are a combination of +// condition and capture bits and the next state. The bottom 16 bits +// are the condition and capture bits, and the top 16 are the index of +// the next state. +// +// Bits 0-5 are the empty-width flags from prog.h. +// Bit 6 is kMatchWins, which means the match takes +// priority over moving to next in a first-match search. +// The remaining bits mark capture registers that should +// be set to the current input position. The capture bits +// start at index 2, since the search loop can take care of +// cap[0], cap[1] (the overall match position). +// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
+// No input position can satisfy both kEmptyWordBoundary +// and kEmptyNonWordBoundary, so we can use that as a sentinel +// instead of needing an extra bit. + +static const int kIndexShift = 16; // number of bits below index +static const int kEmptyShift = 6; // number of empty flags in prog.h +static const int kRealCapShift = kEmptyShift + 1; +static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2; + +// Parameters used to skip over cap[0], cap[1]. +static const int kCapShift = kRealCapShift - 2; +static const int kMaxCap = kRealMaxCap + 2; + +static const uint32 kMatchWins = 1 << kEmptyShift; +static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift; + +static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary; + +// Check, at compile time, that prog.h agrees with math above. +// This function is never called. +void OnePass_Checks() { + COMPILE_ASSERT((1<( + const_cast(nodes + statesize*nodeindex)); +} + +bool Prog::SearchOnePass(const StringPiece& text, + const StringPiece& const_context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch) { + if (anchor != kAnchored && kind != kFullMatch) { + LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches."; + return false; + } + + // Make sure we have at least cap[1], + // because we use it to tell if we matched. + int ncap = 2*nmatch; + if (ncap < 2) + ncap = 2; + + const char* cap[kMaxCap]; + for (int i = 0; i < ncap; i++) + cap[i] = NULL; + + const char* matchcap[kMaxCap]; + for (int i = 0; i < ncap; i++) + matchcap[i] = NULL; + + StringPiece context = const_context; + if (context.begin() == NULL) + context = text; + if (anchor_start() && context.begin() != text.begin()) + return false; + if (anchor_end() && context.end() != text.end()) + return false; + if (anchor_end()) + kind = kFullMatch; + + // State and act are marked volatile to + // keep the compiler from re-ordering the + // memory accesses walking over the NFA. 
+ // This is worth about 5%. + volatile OneState* state = onepass_start_; + volatile uint8* nodes = onepass_nodes_; + volatile uint32 statesize = onepass_statesize_; + uint8* bytemap = bytemap_; + const char* bp = text.begin(); + const char* ep = text.end(); + const char* p; + bool matched = false; + matchcap[0] = bp; + cap[0] = bp; + uint32 nextmatchcond = state->matchcond; + for (p = bp; p < ep; p++) { + int c = bytemap[*p & 0xFF]; + uint32 matchcond = nextmatchcond; + uint32 cond = state->action[c]; + + // Determine whether we can reach act->next. + // If so, advance state and nextmatchcond. + if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) { + uint32 nextindex = cond >> kIndexShift; + state = IndexToNode(nodes, statesize, nextindex); + nextmatchcond = state->matchcond; + } else { + state = NULL; + nextmatchcond = kImpossible; + } + + // This code section is carefully tuned. + // The goto sequence is about 10% faster than the + // obvious rewrite as a large if statement in the + // ASCIIMatchRE2 and DotMatchRE2 benchmarks. + + // Saving the match capture registers is expensive. + // Is this intermediate match worth thinking about? + + // Not if we want a full match. + if (kind == kFullMatch) + goto skipmatch; + + // Not if it's impossible. + if (matchcond == kImpossible) + goto skipmatch; + + // Not if the possible match is beaten by the certain + // match at the next byte. When this test is useless + // (e.g., HTTPPartialMatchRE2) it slows the loop by + // about 10%, but when it avoids work (e.g., DotMatchRE2), + // it cuts the loop execution by about 45%. + if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0) + goto skipmatch; + + // Finally, the match conditions must be satisfied. 
+ if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) { + for (int i = 2; i < 2*nmatch; i++) + matchcap[i] = cap[i]; + if (nmatch > 1 && (matchcond & kCapMask)) + ApplyCaptures(matchcond, p, matchcap, ncap); + matchcap[1] = p; + matched = true; + + // If we're in longest match mode, we have to keep + // going and see if we find a longer match. + // In first match mode, we can stop if the match + // takes priority over the next state for this input byte. + // That bit is per-input byte and thus in cond, not matchcond. + if (kind == kFirstMatch && (cond & kMatchWins)) + goto done; + } + + skipmatch: + if (state == NULL) + goto done; + if ((cond & kCapMask) && nmatch > 1) + ApplyCaptures(cond, p, cap, ncap); + } + + // Look for match at end of input. + { + uint32 matchcond = state->matchcond; + if (matchcond != kImpossible && + ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) { + if (nmatch > 1 && (matchcond & kCapMask)) + ApplyCaptures(matchcond, p, cap, ncap); + for (int i = 2; i < ncap; i++) + matchcap[i] = cap[i]; + matchcap[1] = p; + matched = true; + } + } + +done: + if (!matched) + return false; + for (int i = 0; i < nmatch; i++) + match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]); + return true; +} + + +// Analysis to determine whether a given regexp program is one-pass. + +// If ip is not on workq, adds ip to work queue and returns true. +// If ip is already on work queue, does nothing and returns false. +// If ip is NULL, does nothing and returns true (pretends to add it). +typedef SparseSet Instq; +static bool AddQ(Instq *q, int id) { + if (id == 0) + return true; + if (q->contains(id)) + return false; + q->insert(id); + return true; +} + +struct InstCond { + int id; + uint32 cond; +}; + +// Returns whether this is a one-pass program; that is, +// returns whether it is safe to use SearchOnePass on this program. 
+// These conditions must be true for any instruction ip: +// +// (1) for any other Inst nip, there is at most one input-free +// path from ip to nip. +// (2) there is at most one kInstByte instruction reachable from +// ip that matches any particular byte c. +// (3) there is at most one input-free path from ip to a kInstMatch +// instruction. +// +// This is actually just a conservative approximation: it might +// return false when the answer is true, when kInstEmptyWidth +// instructions are involved. +// Constructs and saves corresponding one-pass NFA on success. +bool Prog::IsOnePass() { + if (did_onepass_) + return onepass_start_ != NULL; + did_onepass_ = true; + + if (start() == 0) // no match + return false; + + // Steal memory for the one-pass NFA from the overall DFA budget. + // Willing to use at most 1/4 of the DFA budget (heuristic). + // Limit max node count to 65000 as a conservative estimate to + // avoid overflowing 16-bit node index in encoding. + int maxnodes = 2 + byte_inst_count_; + int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32); + if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes) + return false; + + // Flood the graph starting at the start state, and check + // that in each reachable state, each possible byte leads + // to a unique next state. + int size = this->size(); + InstCond *stack = new InstCond[size]; + + int* nodebyid = new int[size]; // indexed by ip + memset(nodebyid, 0xFF, size*sizeof nodebyid[0]); + + uint8* nodes = new uint8[maxnodes*statesize]; + uint8* nodep = nodes; + + Instq tovisit(size), workq(size); + AddQ(&tovisit, start()); + nodebyid[start()] = 0; + nodep += statesize; + int nalloc = 1; + for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { + int id = *it; + int nodeindex = nodebyid[id]; + OneState* node = IndexToNode(nodes, statesize, nodeindex); + + // Flood graph using manual stack, filling in actions as found. + // Default is none. 
+ for (int b = 0; b < bytemap_range_; b++) + node->action[b] = kImpossible; + node->matchcond = kImpossible; + + workq.clear(); + bool matched = false; + int nstack = 0; + stack[nstack].id = id; + stack[nstack++].cond = 0; + while (nstack > 0) { + int id = stack[--nstack].id; + Prog::Inst* ip = inst(id); + uint32 cond = stack[nstack].cond; + switch (ip->opcode()) { + case kInstAltMatch: + // TODO(rsc): Ignoring kInstAltMatch optimization. + // Should implement it in this engine, but it's subtle. + // Fall through. + case kInstAlt: + // If already on work queue, (1) is violated: bail out. + if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1())) + goto fail; + stack[nstack].id = ip->out1(); + stack[nstack++].cond = cond; + stack[nstack].id = ip->out(); + stack[nstack++].cond = cond; + break; + + case kInstByteRange: { + int nextindex = nodebyid[ip->out()]; + if (nextindex == -1) { + if (nalloc >= maxnodes) { + if (Debug) + LOG(ERROR) + << StringPrintf("Not OnePass: hit node limit %d > %d", + nalloc, maxnodes); + goto fail; + } + nextindex = nalloc; + nodep += statesize; + nodebyid[ip->out()] = nextindex; + nalloc++; + AddQ(&tovisit, ip->out()); + } + if (matched) + cond |= kMatchWins; + for (int c = ip->lo(); c <= ip->hi(); c++) { + int b = bytemap_[c]; + c = unbytemap_[b]; // last c in byte class + uint32 act = node->action[b]; + uint32 newact = (nextindex << kIndexShift) | cond; + if ((act & kImpossible) == kImpossible) { + node->action[b] = newact; + } else if (act != newact) { + if (Debug) { + LOG(ERROR) + << StringPrintf("Not OnePass: conflict on byte " + "%#x at state %d", + c, *it); + } + goto fail; + } + } + if (ip->foldcase()) { + Rune lo = max(ip->lo(), 'a') + 'A' - 'a'; + Rune hi = min(ip->hi(), 'z') + 'A' - 'a'; + for (int c = lo; c <= hi; c++) { + int b = bytemap_[c]; + c = unbytemap_[b]; // last c in class + uint32 act = node->action[b]; + uint32 newact = (nextindex << kIndexShift) | cond; + if ((act & kImpossible) == kImpossible) { + 
node->action[b] = newact; + } else if (act != newact) { + if (Debug) { + LOG(ERROR) + << StringPrintf("Not OnePass: conflict on byte " + "%#x at state %d", + c, *it); + } + goto fail; + } + } + } + break; + } + + case kInstCapture: + if (ip->cap() < kMaxCap) + cond |= (1 << kCapShift) << ip->cap(); + goto QueueEmpty; + + case kInstEmptyWidth: + cond |= ip->empty(); + goto QueueEmpty; + + case kInstNop: + QueueEmpty: + // kInstCapture and kInstNop always proceed to ip->out(). + // kInstEmptyWidth only sometimes proceeds to ip->out(), + // but as a conservative approximation we assume it always does. + // We could be a little more precise by looking at what c + // is, but that seems like overkill. + + // If already on work queue, (1) is violated: bail out. + if (!AddQ(&workq, ip->out())) { + if (Debug) { + LOG(ERROR) << StringPrintf("Not OnePass: multiple paths" + " %d -> %d\n", + *it, ip->out()); + } + goto fail; + } + stack[nstack].id = ip->out(); + stack[nstack++].cond = cond; + break; + + case kInstMatch: + if (matched) { + // (3) is violated + if (Debug) { + LOG(ERROR) << StringPrintf("Not OnePass: multiple matches" + " from %d\n", *it); + } + goto fail; + } + matched = true; + node->matchcond = cond; + break; + + case kInstFail: + break; + } + } + } + + if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR). 
+ string dump = "prog dump:\n" + Dump() + "node dump\n"; + map idmap; + for (int i = 0; i < size; i++) + if (nodebyid[i] != -1) + idmap[nodebyid[i]] = i; + + StringAppendF(&dump, "byte ranges:\n"); + int i = 0; + for (int b = 0; b < bytemap_range_; b++) { + int lo = i; + while (bytemap_[i] == b) + i++; + StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1); + } + + for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { + int id = *it; + int nodeindex = nodebyid[id]; + if (nodeindex == -1) + continue; + OneState* node = IndexToNode(nodes, statesize, nodeindex); + string s; + StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n", + nodeindex, id, node->matchcond); + for (int i = 0; i < bytemap_range_; i++) { + if ((node->action[i] & kImpossible) == kImpossible) + continue; + StringAppendF(&dump, " %d cond %#x -> %d id=%d\n", + i, node->action[i] & 0xFFFF, + node->action[i] >> kIndexShift, + idmap[node->action[i] >> kIndexShift]); + } + } + LOG(ERROR) << dump; + } + + // Overallocated earlier; cut down to actual size. + nodep = new uint8[nalloc*statesize]; + memmove(nodep, nodes, nalloc*statesize); + delete[] nodes; + nodes = nodep; + + onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]); + onepass_nodes_ = nodes; + onepass_statesize_ = statesize; + dfa_mem_ -= nalloc*statesize; + + delete[] stack; + delete[] nodebyid; + return true; + +fail: + delete[] stack; + delete[] nodebyid; + delete[] nodes; + return false; +} + +} // namespace re2 diff --git a/outside/re2/re2/parse.cc b/outside/re2/re2/parse.cc new file mode 100644 index 000000000..d7e170c14 --- /dev/null +++ b/outside/re2/re2/parse.cc @@ -0,0 +1,2216 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression parser. + +// The parser is a simple precedence-based parser with a +// manual stack. 
The parsing work is done by the methods +// of the ParseState class. The Regexp::Parse function is +// essentially just a lexer that calls the ParseState method +// for each token. + +// The parser recognizes POSIX extended regular expressions +// excluding backreferences, collating elements, and collating +// classes. It also allows the empty string as a regular expression +// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W. +// See regexp.h for rationale. + +#include "util/util.h" +#include "re2/regexp.h" +#include "re2/stringpiece.h" +#include "re2/unicode_casefold.h" +#include "re2/unicode_groups.h" + +namespace re2 { + +// Regular expression parse state. +// The list of parsed regexps so far is maintained as a vector of +// Regexp pointers called the stack. Left parenthesis and vertical +// bar markers are also placed on the stack, as Regexps with +// non-standard opcodes. +// Scanning a left parenthesis causes the parser to push a left parenthesis +// marker on the stack. +// Scanning a vertical bar causes the parser to pop the stack until it finds a +// vertical bar or left parenthesis marker (not popping the marker), +// concatenate all the popped results, and push them back on +// the stack (DoConcatenation). +// Scanning a right parenthesis causes the parser to act as though it +// has seen a vertical bar, which then leaves the top of the stack in the +// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar. +// The parser pops all this off the stack and creates an alternation of the +// regexps (DoAlternation). + +class Regexp::ParseState { + public: + ParseState(ParseFlags flags, const StringPiece& whole_regexp, + RegexpStatus* status); + ~ParseState(); + + ParseFlags flags() { return flags_; } + int rune_max() { return rune_max_; } + + // Parse methods. All public methods return a bool saying + // whether parsing should continue.
If a method returns + // false, it has set fields in *status_, and the parser + // should return NULL. + + // Pushes the given regular expression onto the stack. + // Could check for too much memory used here. + bool PushRegexp(Regexp* re); + + // Pushes the literal rune r onto the stack. + bool PushLiteral(Rune r); + + // Pushes a regexp with the given op (and no args) onto the stack. + bool PushSimpleOp(RegexpOp op); + + // Pushes a ^ onto the stack. + bool PushCarat(); + + // Pushes a \b (word == true) or \B (word == false) onto the stack. + bool PushWordBoundary(bool word); + + // Pushes a $ onto the stack. + bool PushDollar(); + + // Pushes a . onto the stack + bool PushDot(); + + // Pushes a repeat operator regexp onto the stack. + // A valid argument for the operator must already be on the stack. + // s is the name of the operator, for use in error messages. + bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy); + + // Pushes a repetition regexp onto the stack. + // A valid argument for the operator must already be on the stack. + bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy); + + // Checks whether a particular regexp op is a marker. + bool IsMarker(RegexpOp op); + + // Processes a left parenthesis in the input. + // Pushes a marker onto the stack. + bool DoLeftParen(const StringPiece& name); + bool DoLeftParenNoCapture(); + + // Processes a vertical bar in the input. + bool DoVerticalBar(); + + // Processes a right parenthesis in the input. + bool DoRightParen(); + + // Processes the end of input, returning the final regexp. + Regexp* DoFinish(); + + // Finishes the regexp if necessary, preparing it for use + // in a more complicated expression. + // If it is a CharClassBuilder, converts into a CharClass. + Regexp* FinishRegexp(Regexp*); + + // These routines don't manipulate the parse stack + // directly, but they do need to look at flags_.
+ // ParseCharClass also manipulates the internals of Regexp + // while creating *out_re. + + // Parse a character class into *out_re. + // Removes parsed text from s. + bool ParseCharClass(StringPiece* s, Regexp** out_re, + RegexpStatus* status); + + // Parse a character class character into *rp. + // Removes parsed text from s. + bool ParseCCCharacter(StringPiece* s, Rune *rp, + const StringPiece& whole_class, + RegexpStatus* status); + + // Parse a character class range into rr. + // Removes parsed text from s. + bool ParseCCRange(StringPiece* s, RuneRange* rr, + const StringPiece& whole_class, + RegexpStatus* status); + + // Parse a Perl flag set or non-capturing group from s. + bool ParsePerlFlags(StringPiece* s); + + + // Finishes the current concatenation, + // collapsing it into a single regexp on the stack. + void DoConcatenation(); + + // Finishes the current alternation, + // collapsing it to a single regexp on the stack. + void DoAlternation(); + + // Generalized DoAlternation/DoConcatenation. + void DoCollapse(RegexpOp op); + + // Maybe concatenate Literals into LiteralString. + bool MaybeConcatString(int r, ParseFlags flags); + +private: + ParseFlags flags_; // parse flags supplied to the constructor + StringPiece whole_regexp_; // the regexp text supplied to the constructor + RegexpStatus* status_; // where failing parse methods record the error + Regexp* stacktop_; // top of the parse stack; entries are linked through down_ + int ncap_; // number of capturing parens seen + int rune_max_; // maximum char value for this encoding + + DISALLOW_EVIL_CONSTRUCTORS(ParseState); +}; + +// Pseudo-operators - only on parse stack. +const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1); +const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2); + +// Starts with an empty stack and no captures seen; rune_max_ is 0xFF +// when the Latin1 flag is set and Runemax otherwise. +Regexp::ParseState::ParseState(ParseFlags flags, + const StringPiece& whole_regexp, + RegexpStatus* status) + : flags_(flags), whole_regexp_(whole_regexp), + status_(status), stacktop_(NULL), ncap_(0) { + if (flags_ & Latin1) + rune_max_ = 0xFF; + else + rune_max_ = Runemax; +} + +// Cleans up by freeing all the regexps on the stack.
+Regexp::ParseState::~ParseState() { + Regexp* next; + for (Regexp* re = stacktop_; re != NULL; re = next) { + next = re->down_; + re->down_ = NULL; + if (re->op() == kLeftParen) + delete re->name_; + re->Decref(); + } +} + +// Finishes the regexp if necessary, preparing it for use in +// a more complex expression. +// If it is a CharClassBuilder, converts into a CharClass. +Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) { + if (re == NULL) + return NULL; + re->down_ = NULL; + + if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { + CharClassBuilder* ccb = re->ccb_; + re->ccb_ = NULL; + re->cc_ = ccb->GetCharClass(); + delete ccb; + } + + return re; +} + +// Pushes the given regular expression onto the stack. +// Could check for too much memory used here. +bool Regexp::ParseState::PushRegexp(Regexp* re) { + MaybeConcatString(-1, NoParseFlags); + + // Special case: a character class of one character is just + // a literal. This is a common idiom for escaping + // single characters (e.g., [.] instead of \.), and some + // analysis does better with fewer character classes. + // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding. + if (re->op_ == kRegexpCharClass) { + if (re->ccb_->size() == 1) { + Rune r = re->ccb_->begin()->lo; + re->Decref(); + re = new Regexp(kRegexpLiteral, flags_); + re->rune_ = r; + } else if (re->ccb_->size() == 2) { + Rune r = re->ccb_->begin()->lo; + if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) { + re->Decref(); + re = new Regexp(kRegexpLiteral, flags_ | FoldCase); + re->rune_ = r + 'a' - 'A'; + } + } + } + + if (!IsMarker(re->op())) + re->simple_ = re->ComputeSimple(); + re->down_ = stacktop_; + stacktop_ = re; + return true; +} + +// Searches the case folding tables and returns the CaseFold* that contains r. +// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r. +// If there isn't one, returns NULL. 
+const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) { + const CaseFold* ef = f + n; + + // Binary search for entry containing r. + while (n > 0) { + int m = n/2; + if (f[m].lo <= r && r <= f[m].hi) + return &f[m]; + if (r < f[m].lo) { + n = m; + } else { + f += m+1; + n -= m+1; + } + } + + // There is no entry that contains r, but f points + // where it would have been. Unless f points at + // the end of the array, it points at the next entry + // after r. + if (f < ef) + return f; + + // No entry contains r; no entry contains runes > r. + return NULL; +} + +// Returns the result of applying the fold f to the rune r. +Rune ApplyFold(const CaseFold *f, Rune r) { + switch (f->delta) { + default: + return r + f->delta; + + case EvenOddSkip: // even <-> odd but only applies to every other + if ((r - f->lo) % 2) + return r; + // fall through + case EvenOdd: // even <-> odd + if (r%2 == 0) + return r + 1; + return r - 1; + + case OddEvenSkip: // odd <-> even but only applies to every other + if ((r - f->lo) % 2) + return r; + // fall through + case OddEven: // odd <-> even + if (r%2 == 1) + return r + 1; + return r - 1; + } +} + +// Returns the next Rune in r's folding cycle (see unicode_casefold.h). +// Examples: +// CycleFoldRune('A') = 'a' +// CycleFoldRune('a') = 'A' +// +// CycleFoldRune('K') = 'k' +// CycleFoldRune('k') = 0x212A (Kelvin) +// CycleFoldRune(0x212A) = 'K' +// +// CycleFoldRune('?') = '?' +Rune CycleFoldRune(Rune r) { + const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r); + if (f == NULL || r < f->lo) + return r; + return ApplyFold(f, r); +} + +// Add lo-hi to the class, along with their fold-equivalent characters. +// If lo-hi is already in the class, assume that the fold-equivalent +// chars are there too, so there's no work to do. +static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { + // AddFoldedRange calls itself recursively for each rune in the fold cycle. 
+ // Most folding cycles are small: there aren't any bigger than four in the + // current Unicode tables. make_unicode_casefold.py checks that + // the cycles are not too long, and we double-check here using depth. + if (depth > 10) { + LOG(DFATAL) << "AddFoldedRange recurses too much."; + return; + } + + if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done + return; + + while (lo <= hi) { + const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo); + if (f == NULL) // lo has no fold, nor does anything above lo + break; + if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo + lo = f->lo; + continue; + } + + // Add in the result of folding the range lo - f->hi + // and that range's fold, recursively. + Rune lo1 = lo; + Rune hi1 = min(hi, f->hi); + switch (f->delta) { + default: + lo1 += f->delta; + hi1 += f->delta; + break; + case EvenOdd: + if (lo1%2 == 1) + lo1--; + if (hi1%2 == 0) + hi1++; + break; + case OddEven: + if (lo1%2 == 0) + lo1--; + if (hi1%2 == 1) + hi1++; + break; + } + AddFoldedRange(cc, lo1, hi1, depth+1); + + // Pick up where this fold left off. + lo = f->hi + 1; + } +} + +// Pushes the literal rune r onto the stack. +bool Regexp::ParseState::PushLiteral(Rune r) { + // Do case folding if needed. + if ((flags_ & FoldCase) && CycleFoldRune(r) != r) { + Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); + re->ccb_ = new CharClassBuilder; + Rune r1 = r; + do { + if (!(flags_ & NeverNL) || r != '\n') { + re->ccb_->AddRange(r, r); + } + r = CycleFoldRune(r); + } while (r != r1); + re->ccb_->RemoveAbove(rune_max_); + return PushRegexp(re); + } + + // Exclude newline if applicable. + if ((flags_ & NeverNL) && r == '\n') + return PushRegexp(new Regexp(kRegexpNoMatch, flags_)); + + // No fancy stuff worked. Ordinary literal. 
+ if (MaybeConcatString(r, flags_)) + return true; + + Regexp* re = new Regexp(kRegexpLiteral, flags_); + re->rune_ = r; + return PushRegexp(re); +} + +// Pushes a ^ onto the stack. +bool Regexp::ParseState::PushCarat() { + if (flags_ & OneLine) { + return PushSimpleOp(kRegexpBeginText); + } + return PushSimpleOp(kRegexpBeginLine); +} + +// Pushes a \b or \B onto the stack. +bool Regexp::ParseState::PushWordBoundary(bool word) { + if (word) + return PushSimpleOp(kRegexpWordBoundary); + return PushSimpleOp(kRegexpNoWordBoundary); +} + +// Pushes a $ onto the stack. +bool Regexp::ParseState::PushDollar() { + if (flags_ & OneLine) { + // Clumsy marker so that MimicsPCRE() can tell whether + // this kRegexpEndText was a $ and not a \z. + Regexp::ParseFlags oflags = flags_; + flags_ = flags_ | WasDollar; + bool ret = PushSimpleOp(kRegexpEndText); + flags_ = oflags; + return ret; + } + return PushSimpleOp(kRegexpEndLine); +} + +// Pushes a . onto the stack. +bool Regexp::ParseState::PushDot() { + if ((flags_ & DotNL) && !(flags_ & NeverNL)) + return PushSimpleOp(kRegexpAnyChar); + // Rewrite . into [^\n] + Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); + re->ccb_ = new CharClassBuilder; + re->ccb_->AddRange(0, '\n' - 1); + re->ccb_->AddRange('\n' + 1, rune_max_); + return PushRegexp(re); +} + +// Pushes a regexp with the given op (and no args) onto the stack. +bool Regexp::ParseState::PushSimpleOp(RegexpOp op) { + Regexp* re = new Regexp(op, flags_); + return PushRegexp(re); +} + +// Pushes a repeat operator regexp onto the stack. +// A valid argument for the operator must already be on the stack. +// The char c is the name of the operator, for use in error messages. 
+bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, + bool nongreedy) { + if (stacktop_ == NULL || IsMarker(stacktop_->op())) { + status_->set_code(kRegexpRepeatArgument); + status_->set_error_arg(s); + return false; + } + Regexp::ParseFlags fl = flags_; + if (nongreedy) + fl = fl ^ NonGreedy; + Regexp* re = new Regexp(op, fl); + re->AllocSub(1); + re->down_ = stacktop_->down_; + re->sub()[0] = FinishRegexp(stacktop_); + re->simple_ = re->ComputeSimple(); + stacktop_ = re; + return true; +} + +// Pushes a repetition regexp onto the stack. +// A valid argument for the operator must already be on the stack. +bool Regexp::ParseState::PushRepetition(int min, int max, + const StringPiece& s, + bool nongreedy) { + if ((max != -1 && max < min) || min > 1000 || max > 1000) { + status_->set_code(kRegexpRepeatSize); + status_->set_error_arg(s); + return false; + } + if (stacktop_ == NULL || IsMarker(stacktop_->op())) { + status_->set_code(kRegexpRepeatArgument); + status_->set_error_arg(s); + return false; + } + Regexp::ParseFlags fl = flags_; + if (nongreedy) + fl = fl ^ NonGreedy; + Regexp* re = new Regexp(kRegexpRepeat, fl); + re->min_ = min; + re->max_ = max; + re->AllocSub(1); + re->down_ = stacktop_->down_; + re->sub()[0] = FinishRegexp(stacktop_); + re->simple_ = re->ComputeSimple(); + + stacktop_ = re; + return true; +} + +// Checks whether a particular regexp op is a marker. +bool Regexp::ParseState::IsMarker(RegexpOp op) { + return op >= kLeftParen; +} + +// Processes a left parenthesis in the input. +// Pushes a marker onto the stack. +bool Regexp::ParseState::DoLeftParen(const StringPiece& name) { + Regexp* re = new Regexp(kLeftParen, flags_); + re->cap_ = ++ncap_; + if (name.data() != NULL) + re->name_ = new string(name.as_string()); + return PushRegexp(re); +} + +// Pushes a non-capturing marker onto the stack. 
+bool Regexp::ParseState::DoLeftParenNoCapture() { + Regexp* re = new Regexp(kLeftParen, flags_); + re->cap_ = -1; + return PushRegexp(re); +} + +// Adds r to cc, along with r's upper case if foldascii is set. +static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) { + cc->AddRange(r, r); + if (foldascii && 'a' <= r && r <= 'z') + cc->AddRange(r + 'A' - 'a', r + 'A' - 'a'); +} + +// Processes a vertical bar in the input. +bool Regexp::ParseState::DoVerticalBar() { + MaybeConcatString(-1, NoParseFlags); + DoConcatenation(); + + // Below the vertical bar is a list to alternate. + // Above the vertical bar is a list to concatenate. + // We just did the concatenation, so either swap + // the result below the vertical bar or push a new + // vertical bar on the stack. + Regexp* r1; + Regexp* r2; + if ((r1 = stacktop_) != NULL && + (r2 = stacktop_->down_) != NULL && + r2->op() == kVerticalBar) { + // If above and below vertical bar are literal or char class, + // can merge into a single char class. + Regexp* r3; + if ((r1->op() == kRegexpLiteral || + r1->op() == kRegexpCharClass || + r1->op() == kRegexpAnyChar) && + (r3 = r2->down_) != NULL) { + Rune rune; + switch (r3->op()) { + case kRegexpLiteral: // convert to char class + rune = r3->rune_; + r3->op_ = kRegexpCharClass; + r3->cc_ = NULL; + r3->ccb_ = new CharClassBuilder; + AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase); + // fall through + case kRegexpCharClass: + if (r1->op() == kRegexpLiteral) + AddLiteral(r3->ccb_, r1->rune_, + r1->parse_flags_ & Regexp::FoldCase); + else if (r1->op() == kRegexpCharClass) + r3->ccb_->AddCharClass(r1->ccb_); + if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) { + delete r3->ccb_; + r3->ccb_ = NULL; + r3->op_ = kRegexpAnyChar; + } + // fall through + case kRegexpAnyChar: + // pop r1 + stacktop_ = r2; + r1->Decref(); + return true; + default: + break; + } + } + + // Swap r1 below vertical bar (r2). 
+ r1->down_ = r2->down_; + r2->down_ = r1; + stacktop_ = r2; + return true; + } + return PushSimpleOp(kVerticalBar); +} + +// Processes a right parenthesis in the input. +bool Regexp::ParseState::DoRightParen() { + // Finish the current concatenation and alternation. + DoAlternation(); + + // The stack should be: LeftParen regexp + // Remove the LeftParen, leaving the regexp, + // parenthesized. + Regexp* r1; + Regexp* r2; + if ((r1 = stacktop_) == NULL || + (r2 = r1->down_) == NULL || + r2->op() != kLeftParen) { + status_->set_code(kRegexpMissingParen); + status_->set_error_arg(whole_regexp_); + return false; + } + + // Pop off r1, r2. Will Decref or reuse below. + stacktop_ = r2->down_; + + // Restore flags from when paren opened. + Regexp* re = r2; + flags_ = re->parse_flags(); + + // Rewrite LeftParen as capture if needed. + if (re->cap_ > 0) { + re->op_ = kRegexpCapture; + // re->cap_ is already set + re->AllocSub(1); + re->sub()[0] = FinishRegexp(r1); + re->simple_ = re->ComputeSimple(); + } else { + re->Decref(); + re = r1; + } + return PushRegexp(re); +} + +// Processes the end of input, returning the final regexp. +Regexp* Regexp::ParseState::DoFinish() { + DoAlternation(); + Regexp* re = stacktop_; + if (re != NULL && re->down_ != NULL) { + status_->set_code(kRegexpMissingParen); + status_->set_error_arg(whole_regexp_); + return NULL; + } + stacktop_ = NULL; + return FinishRegexp(re); +} + +// Returns the leading regexp that re starts with. +// The returned Regexp* points into a piece of re, +// so it must not be used after the caller calls re->Decref(). +Regexp* Regexp::LeadingRegexp(Regexp* re) { + if (re->op() == kRegexpEmptyMatch) + return NULL; + if (re->op() == kRegexpConcat && re->nsub() >= 2) { + Regexp** sub = re->sub(); + if (sub[0]->op() == kRegexpEmptyMatch) + return NULL; + return sub[0]; + } + return re; +} + +// Removes LeadingRegexp(re) from re and returns what's left. +// Consumes the reference to re and may edit it in place. 
+// If caller wants to hold on to LeadingRegexp(re), +// must have already Incref'ed it. +Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) { + if (re->op() == kRegexpEmptyMatch) + return re; + if (re->op() == kRegexpConcat && re->nsub() >= 2) { + Regexp** sub = re->sub(); + if (sub[0]->op() == kRegexpEmptyMatch) + return re; + sub[0]->Decref(); + sub[0] = NULL; + if (re->nsub() == 2) { + // Collapse concatenation to single regexp. + Regexp* nre = sub[1]; + sub[1] = NULL; + re->Decref(); + return nre; + } + // 3 or more -> 2 or more. + re->nsub_--; + memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); + return re; + } + Regexp::ParseFlags pf = re->parse_flags(); + re->Decref(); + return new Regexp(kRegexpEmptyMatch, pf); +} + +// Returns the leading string that re starts with. +// The returned Rune* points into a piece of re, +// so it must not be used after the caller calls re->Decref(). +Rune* Regexp::LeadingString(Regexp* re, int *nrune, + Regexp::ParseFlags *flags) { + while (re->op() == kRegexpConcat && re->nsub() > 0) + re = re->sub()[0]; + + *flags = static_cast(re->parse_flags_ & Regexp::FoldCase); + + if (re->op() == kRegexpLiteral) { + *nrune = 1; + return &re->rune_; + } + + if (re->op() == kRegexpLiteralString) { + *nrune = re->nrunes_; + return re->runes_; + } + + *nrune = 0; + return NULL; +} + +// Removes the first n leading runes from the beginning of re. +// Edits re in place. +void Regexp::RemoveLeadingString(Regexp* re, int n) { + // Chase down concats to find first string. + // For regexps generated by parser, nested concats are + // flattened except when doing so would overflow the 16-bit + // limit on the size of a concatenation, so we should never + // see more than two here. + Regexp* stk[4]; + int d = 0; + while (re->op() == kRegexpConcat) { + if (d < arraysize(stk)) + stk[d++] = re; + re = re->sub()[0]; + } + + // Remove leading string from re. 
+ if (re->op() == kRegexpLiteral) { + re->rune_ = 0; + re->op_ = kRegexpEmptyMatch; + } else if (re->op() == kRegexpLiteralString) { + if (n >= re->nrunes_) { + delete[] re->runes_; + re->runes_ = NULL; + re->nrunes_ = 0; + re->op_ = kRegexpEmptyMatch; + } else if (n == re->nrunes_ - 1) { + Rune rune = re->runes_[re->nrunes_ - 1]; + delete[] re->runes_; + re->runes_ = NULL; + re->nrunes_ = 0; + re->rune_ = rune; + re->op_ = kRegexpLiteral; + } else { + re->nrunes_ -= n; + memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]); + } + } + + // If re is now empty, concatenations might simplify too. + while (d-- > 0) { + re = stk[d]; + Regexp** sub = re->sub(); + if (sub[0]->op() == kRegexpEmptyMatch) { + sub[0]->Decref(); + sub[0] = NULL; + // Delete first element of concat. + switch (re->nsub()) { + case 0: + case 1: + // Impossible. + LOG(DFATAL) << "Concat of " << re->nsub(); + re->submany_ = NULL; + re->op_ = kRegexpEmptyMatch; + break; + + case 2: { + // Replace re with sub[1]. + Regexp* old = sub[1]; + sub[1] = NULL; + re->Swap(old); + old->Decref(); + break; + } + + default: + // Slide down. + re->nsub_--; + memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); + break; + } + } + } +} + +// Factors common prefixes from alternation. +// For example, +// ABC|ABD|AEF|BCX|BCY +// simplifies to +// A(B(C|D)|EF)|BC(X|Y) +// which the normal parse state routines will further simplify to +// A(B[CD]|EF)|BC[XY] +// +// Rewrites sub to contain simplified list to alternate and returns +// the new length of sub. Adjusts reference counts accordingly +// (incoming sub[i] decremented, outgoing sub[i] incremented). + +// It's too much of a pain to write this code with an explicit stack, +// so instead we let the caller specify a maximum depth and +// don't simplify beyond that. 
There are around 15 words of local +// variables and parameters in the frame, so allowing 8 levels +// on a 64-bit machine is still less than a kilobyte of stack and +// probably enough benefit for practical uses. +const int kFactorAlternationMaxDepth = 8; + +int Regexp::FactorAlternation( + Regexp** sub, int n, + Regexp::ParseFlags altflags) { + return FactorAlternationRecursive(sub, n, altflags, + kFactorAlternationMaxDepth); +} + +int Regexp::FactorAlternationRecursive( + Regexp** sub, int n, + Regexp::ParseFlags altflags, + int maxdepth) { + + if (maxdepth <= 0) + return n; + + // Round 1: Factor out common literal prefixes. + Rune *rune = NULL; + int nrune = 0; + Regexp::ParseFlags runeflags = Regexp::NoParseFlags; + int start = 0; + int out = 0; + for (int i = 0; i <= n; i++) { + // Invariant: what was in sub[0:start] has been Decref'ed + // and that space has been reused for sub[0:out] (out <= start). + // + // Invariant: sub[start:i] consists of regexps that all begin + // with the string rune[0:nrune]. + + Rune* rune_i = NULL; + int nrune_i = 0; + Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags; + if (i < n) { + rune_i = LeadingString(sub[i], &nrune_i, &runeflags_i); + if (runeflags_i == runeflags) { + int same = 0; + while (same < nrune && same < nrune_i && rune[same] == rune_i[same]) + same++; + if (same > 0) { + // Matches at least one rune in current range. Keep going around. + nrune = same; + continue; + } + } + } + + // Found end of a run with common leading literal string: + // sub[start:i] all begin with rune[0:nrune] but sub[i] + // does not even begin with rune[0]. + // + // Factor out common string and append factored expression to sub[0:out]. + if (i == start) { + // Nothing to do - first iteration. + } else if (i == start+1) { + // Just one: don't bother factoring. + sub[out++] = sub[start]; + } else { + // Construct factored form: prefix(suffix1|suffix2|...) + Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|... 
+ x[0] = LiteralString(rune, nrune, runeflags); + for (int j = start; j < i; j++) + RemoveLeadingString(sub[j], nrune); + int nn = FactorAlternationRecursive(sub + start, i - start, altflags, + maxdepth - 1); + x[1] = AlternateNoFactor(sub + start, nn, altflags); + sub[out++] = Concat(x, 2, altflags); + } + + // Prepare for next round (if there is one). + if (i < n) { + start = i; + rune = rune_i; + nrune = nrune_i; + runeflags = runeflags_i; + } + } + n = out; + + // Round 2: Factor out common complex prefixes, + // just the first piece of each concatenation, + // whatever it is. This is good enough a lot of the time. + start = 0; + out = 0; + Regexp* first = NULL; + for (int i = 0; i <= n; i++) { + // Invariant: what was in sub[0:start] has been Decref'ed + // and that space has been reused for sub[0:out] (out <= start). + // + // Invariant: sub[start:i] consists of regexps that all begin with first. + + Regexp* first_i = NULL; + if (i < n) { + first_i = LeadingRegexp(sub[i]); + if (first != NULL && Regexp::Equal(first, first_i)) { + continue; + } + } + + // Found end of a run with common leading regexp: + // sub[start:i] all begin with first but sub[i] does not. + // + // Factor out common regexp and append factored expression to sub[0:out]. + if (i == start) { + // Nothing to do - first iteration. + } else if (i == start+1) { + // Just one: don't bother factoring. + sub[out++] = sub[start]; + } else { + // Construct factored form: prefix(suffix1|suffix2|...) + Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|... + x[0] = first->Incref(); + for (int j = start; j < i; j++) + sub[j] = RemoveLeadingRegexp(sub[j]); + int nn = FactorAlternationRecursive(sub + start, i - start, altflags, + maxdepth - 1); + x[1] = AlternateNoFactor(sub + start, nn, altflags); + sub[out++] = Concat(x, 2, altflags); + } + + // Prepare for next round (if there is one). 
+ if (i < n) { + start = i; + first = first_i; + } + } + n = out; + + // Round 3: Collapse runs of single literals into character classes. + start = 0; + out = 0; + for (int i = 0; i <= n; i++) { + // Invariant: what was in sub[0:start] has been Decref'ed + // and that space has been reused for sub[0:out] (out <= start). + // + // Invariant: sub[start:i] consists of regexps that are either + // literal runes or character classes. + + if (i < n && + (sub[i]->op() == kRegexpLiteral || + sub[i]->op() == kRegexpCharClass)) + continue; + + // sub[i] is not a char or char class; + // emit char class for sub[start:i]... + if (i == start) { + // Nothing to do. + } else if (i == start+1) { + sub[out++] = sub[start]; + } else { + // Make new char class. + CharClassBuilder ccb; + for (int j = start; j < i; j++) { + Regexp* re = sub[j]; + if (re->op() == kRegexpCharClass) { + CharClass* cc = re->cc(); + for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it) + ccb.AddRange(it->lo, it->hi); + } else if (re->op() == kRegexpLiteral) { + ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags()); + } else { + LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " " + << re->ToString(); + } + re->Decref(); + } + sub[out++] = NewCharClass(ccb.GetCharClass(), altflags); + } + + // ... and then emit sub[i]. + if (i < n) + sub[out++] = sub[i]; + start = i+1; + } + n = out; + + // Round 4: Collapse runs of empty matches into single empty match. + start = 0; + out = 0; + for (int i = 0; i < n; i++) { + if (i + 1 < n && + sub[i]->op() == kRegexpEmptyMatch && + sub[i+1]->op() == kRegexpEmptyMatch) { + sub[i]->Decref(); + continue; + } + sub[out++] = sub[i]; + } + n = out; + + return n; +} + +// Collapse the regexps on top of the stack, down to the +// first marker, into a new op node (op == kRegexpAlternate +// or op == kRegexpConcat). +void Regexp::ParseState::DoCollapse(RegexpOp op) { + // Scan backward to marker, counting children of composite. 
+ int n = 0; + Regexp* next = NULL; + Regexp* sub; + for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { + next = sub->down_; + if (sub->op_ == op) + n += sub->nsub_; + else + n++; + } + + // If there's just one child, leave it alone. + // (Concat of one thing is that one thing; alternate of one thing is same.) + if (stacktop_ != NULL && stacktop_->down_ == next) + return; + + // Construct op (alternation or concatenation), flattening op of op. + Regexp** subs = new Regexp*[n]; + next = NULL; + int i = n; + for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { + next = sub->down_; + if (sub->op_ == op) { + Regexp** sub_subs = sub->sub(); + for (int k = sub->nsub_ - 1; k >= 0; k--) + subs[--i] = sub_subs[k]->Incref(); + sub->Decref(); + } else { + subs[--i] = FinishRegexp(sub); + } + } + + Regexp* re = ConcatOrAlternate(op, subs, n, flags_, true); + delete[] subs; + re->simple_ = re->ComputeSimple(); + re->down_ = next; + stacktop_ = re; +} + +// Finishes the current concatenation, +// collapsing it into a single regexp on the stack. +void Regexp::ParseState::DoConcatenation() { + Regexp* r1 = stacktop_; + if (r1 == NULL || IsMarker(r1->op())) { + // empty concatenation is special case + Regexp* re = new Regexp(kRegexpEmptyMatch, flags_); + PushRegexp(re); + } + DoCollapse(kRegexpConcat); +} + +// Finishes the current alternation, +// collapsing it to a single regexp on the stack. +void Regexp::ParseState::DoAlternation() { + DoVerticalBar(); + // Now stack top is kVerticalBar. + Regexp* r1 = stacktop_; + stacktop_ = r1->down_; + r1->Decref(); + DoCollapse(kRegexpAlternate); +} + +// Incremental conversion of concatenated literals into strings. +// If top two elements on stack are both literal or string, +// collapse into single string. +// Don't walk down the stack -- the parser calls this frequently +// enough that below the bottom two is known to be collapsed. 
+// Only called when another regexp is about to be pushed +// on the stack, so that the topmost literal is not being considered. +// (Otherwise ab* would turn into (ab)*.) +// If r >= 0, consider pushing a literal r on the stack. +// Return whether that happened. +bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { + Regexp* re1; + Regexp* re2; + if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL) + return false; + + if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString) + return false; + if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString) + return false; + if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase)) + return false; + + if (re2->op_ == kRegexpLiteral) { + // convert into string + Rune rune = re2->rune_; + re2->op_ = kRegexpLiteralString; + re2->nrunes_ = 0; + re2->runes_ = NULL; + re2->AddRuneToString(rune); + } + + // push re1 into re2. + if (re1->op_ == kRegexpLiteral) { + re2->AddRuneToString(re1->rune_); + } else { + for (int i = 0; i < re1->nrunes_; i++) + re2->AddRuneToString(re1->runes_[i]); + re1->nrunes_ = 0; + delete[] re1->runes_; + re1->runes_ = NULL; + } + + // reuse re1 if possible + if (r >= 0) { + re1->op_ = kRegexpLiteral; + re1->rune_ = r; + re1->parse_flags_ = flags; + return true; + } + + stacktop_ = re2; + re1->Decref(); + return false; +} + +// Lexing routines. + +// Parses a decimal integer, storing it in *n. +// Sets *s to span the remainder of the string. +// Sets *out_re to the regexp for the class. +static bool ParseInteger(StringPiece* s, int* np) { + if (s->size() == 0 || !isdigit((*s)[0] & 0xFF)) + return false; + // Disallow leading zeros. + if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF)) + return false; + int n = 0; + int c; + while (s->size() > 0 && isdigit(c = (*s)[0] & 0xFF)) { + // Avoid overflow. 
+ if (n >= 100000000) + return false; + n = n*10 + c - '0'; + s->remove_prefix(1); // digit + } + *np = n; + return true; +} + +// Parses a repetition suffix like {1,2} or {2} or {2,}. +// Sets *s to span the remainder of the string on success. +// Sets *lo and *hi to the given range. +// In the case of {2,}, the high number is unbounded; +// sets *hi to -1 to signify this. +// {,2} is NOT a valid suffix. +// The Maybe in the name signifies that the regexp parse +// doesn't fail even if ParseRepetition does, so the StringPiece +// s must NOT be edited unless MaybeParseRepetition returns true. +static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { + StringPiece s = *sp; + if (s.size() == 0 || s[0] != '{') + return false; + s.remove_prefix(1); // '{' + if (!ParseInteger(&s, lo)) + return false; + if (s.size() == 0) + return false; + if (s[0] == ',') { + s.remove_prefix(1); // ',' + if (s.size() == 0) + return false; + if (s[0] == '}') { + // {2,} means at least 2 + *hi = -1; + } else { + // {2,4} means 2, 3, or 4. + if (!ParseInteger(&s, hi)) + return false; + } + } else { + // {2} means exactly two + *hi = *lo; + } + if (s.size() == 0 || s[0] != '}') + return false; + s.remove_prefix(1); // '}' + *sp = s; + return true; +} + +// Removes the next Rune from the StringPiece and stores it in *r. +// Returns number of bytes removed from sp. +// Behaves as though there is a terminating NUL at the end of sp. +// Argument order is backwards from usual Google style +// but consistent with chartorune. +static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { + int n; + if (fullrune(sp->data(), sp->size())) { + n = chartorune(r, sp->data()); + if (!(n == 1 && *r == Runeerror)) { // no decoding error + sp->remove_prefix(n); + return n; + } + } + + status->set_code(kRegexpBadUTF8); + status->set_error_arg(NULL); + return -1; +} + +// Return whether name is valid UTF-8. +// If not, set status to kRegexpBadUTF8. 
+static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) { + StringPiece t = s; + Rune r; + while (t.size() > 0) { + if (StringPieceToRune(&r, &t, status) < 0) + return false; + } + return true; +} + +// Is c a hex digit? +static int IsHex(int c) { + return ('0' <= c && c <= '9') || + ('A' <= c && c <= 'F') || + ('a' <= c && c <= 'f'); +} + +// Convert hex digit to value. +static int UnHex(int c) { + if ('0' <= c && c <= '9') + return c - '0'; + if ('A' <= c && c <= 'F') + return c - 'A' + 10; + if ('a' <= c && c <= 'f') + return c - 'a' + 10; + LOG(DFATAL) << "Bad hex digit " << c; + return 0; +} + +// Parse an escape sequence (e.g., \n, \{). +// Sets *s to span the remainder of the string. +// Sets *rp to the named character. +static bool ParseEscape(StringPiece* s, Rune* rp, + RegexpStatus* status, int rune_max) { + const char* begin = s->begin(); + if (s->size() < 1 || (*s)[0] != '\\') { + // Should not happen - caller always checks. + status->set_code(kRegexpInternalError); + status->set_error_arg(NULL); + return false; + } + if (s->size() < 2) { + status->set_code(kRegexpTrailingBackslash); + status->set_error_arg(NULL); + return false; + } + Rune c, c1; + s->remove_prefix(1); // backslash + if (StringPieceToRune(&c, s, status) < 0) + return false; + int code; + switch (c) { + default: + if (c < Runeself && !isalpha(c) && !isdigit(c)) { + // Escaped non-word characters are always themselves. + // PCRE is not quite so rigorous: it accepts things like + // \q, but we don't. We once rejected \_, but too many + // programs and people insist on using it, so allow \_. + *rp = c; + return true; + } + goto BadEscape; + + // Octal escapes. + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + // Single non-zero octal digit is a backreference; not supported. + if (s->size() == 0 || (*s)[0] < '0' || (*s)[0] > '7') + goto BadEscape; + // fall through + case '0': + // consume up to three octal digits; already have one. 
+ code = c - '0'; + if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') { + code = code * 8 + c - '0'; + s->remove_prefix(1); // digit + if (s->size() > 0) { + c = (*s)[0]; + if ('0' <= c && c <= '7') { + code = code * 8 + c - '0'; + s->remove_prefix(1); // digit + } + } + } + if (code > rune_max) + goto BadEscape; + *rp = code; + return true; + + // Hexadecimal escapes + case 'x': + if (s->size() == 0) + goto BadEscape; + if (StringPieceToRune(&c, s, status) < 0) + return false; + if (c == '{') { + // Any number of digits in braces. + // Update n as we consume the string, so that + // the whole thing gets shown in the error message. + // Perl accepts any text at all; it ignores all text + // after the first non-hex digit. We require only hex digits, + // and at least one. + if (StringPieceToRune(&c, s, status) < 0) + return false; + int nhex = 0; + code = 0; + while (IsHex(c)) { + nhex++; + code = code * 16 + UnHex(c); + if (code > rune_max) + goto BadEscape; + if (s->size() == 0) + goto BadEscape; + if (StringPieceToRune(&c, s, status) < 0) + return false; + } + if (c != '}' || nhex == 0) + goto BadEscape; + *rp = code; + return true; + } + // Easy case: two hex digits. + if (s->size() == 0) + goto BadEscape; + if (StringPieceToRune(&c1, s, status) < 0) + return false; + if (!IsHex(c) || !IsHex(c1)) + goto BadEscape; + *rp = UnHex(c) * 16 + UnHex(c1); + return true; + + // C escapes. + case 'n': + *rp = '\n'; + return true; + case 'r': + *rp = '\r'; + return true; + case 't': + *rp = '\t'; + return true; + + // Less common C escapes. + case 'a': + *rp = '\a'; + return true; + case 'f': + *rp = '\f'; + return true; + case 'v': + *rp = '\v'; + return true; + + // This code is disabled to avoid misparsing + // the Perl word-boundary \b as a backspace + // when in POSIX regexp mode. Surprisingly, + // in Perl, \b means word-boundary but [\b] + // means backspace. 
We don't support that: + // if you want a backspace embed a literal + // backspace character or use \x08. + // + // case 'b': + // *rp = '\b'; + // return true; + } + + LOG(DFATAL) << "Not reached in ParseEscape."; + +BadEscape: + // Unrecognized escape sequence. + status->set_code(kRegexpBadEscape); + status->set_error_arg(StringPiece(begin, s->data() - begin)); + return false; +} + +// Add a range to the character class, but exclude newline if asked. +// Also handle case folding. +void CharClassBuilder::AddRangeFlags( + Rune lo, Rune hi, Regexp::ParseFlags parse_flags) { + + // Take out \n if the flags say so. + bool cutnl = !(parse_flags & Regexp::ClassNL) || + (parse_flags & Regexp::NeverNL); + if (cutnl && lo <= '\n' && '\n' <= hi) { + if (lo < '\n') + AddRangeFlags(lo, '\n' - 1, parse_flags); + if (hi > '\n') + AddRangeFlags('\n' + 1, hi, parse_flags); + return; + } + + // If folding case, add fold-equivalent characters too. + if (parse_flags & Regexp::FoldCase) + AddFoldedRange(this, lo, hi, 0); + else + AddRange(lo, hi); +} + +// Look for a group with the given name. +static const UGroup* LookupGroup(const StringPiece& name, + const UGroup *groups, int ngroups) { + // Simple name lookup. 
+ for (int i = 0; i < ngroups; i++) + if (StringPiece(groups[i].name) == name) + return &groups[i]; + return NULL; +} + +// Fake UGroup containing all Runes +static URange16 any16[] = { { 0, 65535 } }; +static URange32 any32[] = { { 65536, Runemax } }; +static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 }; + +// Look for a POSIX group with the given name (e.g., "[:^alpha:]") +static const UGroup* LookupPosixGroup(const StringPiece& name) { + return LookupGroup(name, posix_groups, num_posix_groups); +} + +static const UGroup* LookupPerlGroup(const StringPiece& name) { + return LookupGroup(name, perl_groups, num_perl_groups); +} + +// Look for a Unicode group with the given name (e.g., "Han") +static const UGroup* LookupUnicodeGroup(const StringPiece& name) { + // Special case: "Any" means any. + if (name == StringPiece("Any")) + return &anygroup; + return LookupGroup(name, unicode_groups, num_unicode_groups); +} + +// Add a UGroup or its negation to the character class. +static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign, + Regexp::ParseFlags parse_flags) { + if (sign == +1) { + for (int i = 0; i < g->nr16; i++) { + cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags); + } + for (int i = 0; i < g->nr32; i++) { + cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags); + } + } else { + if (parse_flags & Regexp::FoldCase) { + // Normally adding a case-folded group means + // adding all the extra fold-equivalent runes too. + // But if we're adding the negation of the group, + // we have to exclude all the runes that are fold-equivalent + // to what's already missing. Too hard, so do in two steps. + CharClassBuilder ccb1; + AddUGroup(&ccb1, g, +1, parse_flags); + // If the flags say to take out \n, put it in, so that negating will take it out. + // Normally AddRangeFlags does this, but we're bypassing AddRangeFlags. 
+ bool cutnl = !(parse_flags & Regexp::ClassNL) || + (parse_flags & Regexp::NeverNL); + if (cutnl) { + ccb1.AddRange('\n', '\n'); + } + ccb1.Negate(); + cc->AddCharClass(&ccb1); + return; + } + int next = 0; + for (int i = 0; i < g->nr16; i++) { + if (next < g->r16[i].lo) + cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags); + next = g->r16[i].hi + 1; + } + for (int i = 0; i < g->nr32; i++) { + if (next < g->r32[i].lo) + cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags); + next = g->r32[i].hi + 1; + } + if (next <= Runemax) + cc->AddRangeFlags(next, Runemax, parse_flags); + } +} + +// Maybe parse a Perl character class escape sequence. +// Only recognizes the Perl character classes (\d \s \w \D \S \W), +// not the Perl empty-string classes (\b \B \A \Z \z). +// On success, sets *s to span the remainder of the string +// and returns the corresponding UGroup. +// The StringPiece must *NOT* be edited unless the call succeeds. +const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) { + if (!(parse_flags & Regexp::PerlClasses)) + return NULL; + if (s->size() < 2 || (*s)[0] != '\\') + return NULL; + // Could use StringPieceToRune, but there aren't + // any non-ASCII Perl group names. + StringPiece name(s->begin(), 2); + const UGroup *g = LookupPerlGroup(name); + if (g == NULL) + return NULL; + s->remove_prefix(name.size()); + return g; +} + +enum ParseStatus { + kParseOk, // Did some parsing. + kParseError, // Found an error. + kParseNothing, // Decided not to parse. +}; + +// Maybe parses a Unicode character group like \p{Han} or \P{Han} +// (the latter is a negated group). +ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, + CharClassBuilder *cc, + RegexpStatus* status) { + // Decide whether to parse. 
+ if (!(parse_flags & Regexp::UnicodeGroups)) + return kParseNothing; + if (s->size() < 2 || (*s)[0] != '\\') + return kParseNothing; + Rune c = (*s)[1]; + if (c != 'p' && c != 'P') + return kParseNothing; + + // Committed to parse. Results: + int sign = +1; // -1 = negated char class + if (c == 'P') + sign = -1; + StringPiece seq = *s; // \p{Han} or \pL + StringPiece name; // Han or L + s->remove_prefix(2); // '\\', 'p' + + if (!StringPieceToRune(&c, s, status)) + return kParseError; + if (c != '{') { + // Name is the bit of string we just skipped over for c. + const char* p = seq.begin() + 2; + name = StringPiece(p, s->begin() - p); + } else { + // Name is in braces. Look for closing } + int end = s->find('}', 0); + if (end == s->npos) { + if (!IsValidUTF8(seq, status)) + return kParseError; + status->set_code(kRegexpBadCharRange); + status->set_error_arg(seq); + return kParseError; + } + name = StringPiece(s->begin(), end); // without '}' + s->remove_prefix(end + 1); // with '}' + if (!IsValidUTF8(name, status)) + return kParseError; + } + + // Chop seq where s now begins. + seq = StringPiece(seq.begin(), s->begin() - seq.begin()); + + // Look up group + if (name.size() > 0 && name[0] == '^') { + sign = -sign; + name.remove_prefix(1); // '^' + } + const UGroup *g = LookupUnicodeGroup(name); + if (g == NULL) { + status->set_code(kRegexpBadCharRange); + status->set_error_arg(seq); + return kParseError; + } + + AddUGroup(cc, g, sign, parse_flags); + return kParseOk; +} + +// Parses a character class name like [:alnum:]. +// Sets *s to span the remainder of the string. +// Adds the ranges corresponding to the class to ranges. +static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, + CharClassBuilder *cc, + RegexpStatus* status) { + // Check begins with [: + const char* p = s->data(); + const char* ep = s->data() + s->size(); + if (ep - p < 2 || p[0] != '[' || p[1] != ':') + return kParseNothing; + + // Look for closing :]. 
+ const char* q; + for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++) + ; + + // If no closing :], then ignore. + if (q > ep-2) + return kParseNothing; + + // Got it. Check that it's valid. + q += 2; + StringPiece name(p, q-p); + + const UGroup *g = LookupPosixGroup(name); + if (g == NULL) { + status->set_code(kRegexpBadCharRange); + status->set_error_arg(name); + return kParseError; + } + + s->remove_prefix(name.size()); + AddUGroup(cc, g, g->sign, parse_flags); + return kParseOk; +} + +// Parses a character inside a character class. +// There are fewer special characters here than in the rest of the regexp. +// Sets *s to span the remainder of the string. +// Sets *rp to the character. +bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, + const StringPiece& whole_class, + RegexpStatus* status) { + if (s->size() == 0) { + status->set_code(kRegexpMissingBracket); + status->set_error_arg(whole_class); + return false; + } + + // Allow regular escape sequences even though + // many need not be escaped in this context. + if (s->size() >= 1 && (*s)[0] == '\\') + return ParseEscape(s, rp, status, rune_max_); + + // Otherwise take the next rune. + return StringPieceToRune(rp, s, status) >= 0; +} + +// Parses a character class character, or, if the character +// is followed by a hyphen, parses a character class range. +// For single characters, rr->lo == rr->hi. +// Sets *s to span the remainder of the string. +// Sets *rp to the character. +bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, + const StringPiece& whole_class, + RegexpStatus* status) { + StringPiece os = *s; + if (!ParseCCCharacter(s, &rr->lo, whole_class, status)) + return false; + // [a-] means (a|-), so check for final ]. 
// Parses a possibly-negated character class expression like [^abx-z[:digit:]].
// Sets *s to span the remainder of the string.
// Sets *out_re to the regexp for the class.
//
// Returns true on success.  On failure returns false with status
// filled in; the partially built Regexp is released with Decref()
// on every error path after it has been allocated, and *out_re is
// not written.
bool Regexp::ParseState::ParseCharClass(StringPiece* s,
                                        Regexp** out_re,
                                        RegexpStatus* status) {
  // Remember the full text for "missing ]" error messages.
  StringPiece whole_class = *s;
  if (s->size() == 0 || (*s)[0] != '[') {
    // Caller checked this.
    status->set_code(kRegexpInternalError);
    status->set_error_arg(NULL);
    return false;
  }
  bool negated = false;
  // The class node itself is created without FoldCase; folding is
  // applied while ranges are added (flags_ is passed to the helpers).
  Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
  re->ccb_ = new CharClassBuilder;
  s->remove_prefix(1);  // '['
  if (s->size() > 0 && (*s)[0] == '^') {
    s->remove_prefix(1);  // '^'
    negated = true;
    if (!(flags_ & ClassNL) || (flags_ & NeverNL)) {
      // If NL can't match implicitly, then pretend
      // negated classes include a leading \n.
      // (The final Negate() below then removes it.)
      re->ccb_->AddRange('\n', '\n');
    }
  }
  bool first = true;  // ] is okay as first char in class
  while (s->size() > 0 && ((*s)[0] != ']' || first)) {
    // - is only okay unescaped as first or last in class.
    // Except that Perl allows - anywhere.
    if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&
        (s->size() == 1 || (*s)[1] != ']')) {
      // Decode the rune after '-' on a copy, only so the error
      // argument can cover "-" plus that rune.
      StringPiece t = *s;
      t.remove_prefix(1);  // '-'
      Rune r;
      int n = StringPieceToRune(&r, &t, status);
      if (n < 0) {
        // status was already set by StringPieceToRune.
        re->Decref();
        return false;
      }
      status->set_code(kRegexpBadCharRange);
      status->set_error_arg(StringPiece(s->data(), 1+n));
      re->Decref();
      return false;
    }
    first = false;

    // Look for [:alnum:] etc.
    if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') {
      switch (ParseCCName(s, flags_, re->ccb_, status)) {
        case kParseOk:
          continue;  // next class element
        case kParseError:
          re->Decref();
          return false;
        case kParseNothing:
          break;  // not a [:name:]; fall through to the other forms
      }
    }

    // Look for Unicode character group like \p{Han}
    if (s->size() > 2 &&
        (*s)[0] == '\\' &&
        ((*s)[1] == 'p' || (*s)[1] == 'P')) {
      switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {
        case kParseOk:
          continue;  // next class element
        case kParseError:
          re->Decref();
          return false;
        case kParseNothing:
          break;  // Unicode groups disabled; try the remaining forms
      }
    }

    // Look for Perl character class symbols (extension).
    const UGroup *g = MaybeParsePerlCCEscape(s, flags_);
    if (g != NULL) {
      AddUGroup(re->ccb_, g, g->sign, flags_);
      continue;
    }

    // Otherwise assume single character or simple range.
    RuneRange rr;
    if (!ParseCCRange(s, &rr, whole_class, status)) {
      re->Decref();
      return false;
    }
    // AddRangeFlags is usually called in response to a class like
    // \p{Foo} or [[:foo:]]; for those, it filters \n out unless
    // Regexp::ClassNL is set.  In an explicit range or singleton
    // like we just parsed, we do not filter \n out, so set ClassNL
    // in the flags.
    re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL);
  }
  // The loop ends either at the closing ']' or at end of input.
  if (s->size() == 0) {
    status->set_code(kRegexpMissingBracket);
    status->set_error_arg(whole_class);
    re->Decref();
    return false;
  }
  s->remove_prefix(1);  // ']'

  if (negated)
    re->ccb_->Negate();
  // Drop anything above the largest rune this parse allows.
  re->ccb_->RemoveAbove(rune_max_);

  *out_re = re;
  return true;
}
+static bool IsValidCaptureName(const StringPiece& name) { + if (name.size() == 0) + return false; + for (int i = 0; i < name.size(); i++) { + int c = name[i]; + if (('0' <= c && c <= '9') || + ('a' <= c && c <= 'z') || + ('A' <= c && c <= 'Z') || + c == '_') + continue; + return false; + } + return true; +} + +// Parses a Perl flag setting or non-capturing group or both, +// like (?i) or (?: or (?i:. Removes from s, updates parse state. +// The caller must check that s begins with "(?". +// Returns true on success. If the Perl flag is not +// well-formed or not supported, sets status_ and returns false. +bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { + StringPiece t = *s; + + // Caller is supposed to check this. + if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') { + LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags"; + status_->set_code(kRegexpInternalError); + return false; + } + + t.remove_prefix(2); // "(?" + + // Check for named captures, first introduced in Python's regexp library. + // As usual, there are three slightly different syntaxes: + // + // (?Pexpr) the original, introduced by Python + // (?expr) the .NET alteration, adopted by Perl 5.10 + // (?'name'expr) another .NET alteration, adopted by Perl 5.10 + // + // Perl 5.10 gave in and implemented the Python version too, + // but they claim that the last two are the preferred forms. + // PCRE and languages based on it (specifically, PHP and Ruby) + // support all three as well. EcmaScript 4 uses only the Python form. + // + // In both the open source world (via Code Search) and the + // Google source tree, (?Pname) is the dominant form, + // so that's the one we implement. One is enough. + if (t.size() > 2 && t[0] == 'P' && t[1] == '<') { + // Pull out name. 
+ int end = t.find('>', 2); + if (end == t.npos) { + if (!IsValidUTF8(*s, status_)) + return false; + status_->set_code(kRegexpBadNamedCapture); + status_->set_error_arg(*s); + return false; + } + + // t is "P...", t[end] == '>' + StringPiece capture(t.begin()-2, end+3); // "(?P" + StringPiece name(t.begin()+2, end-2); // "name" + if (!IsValidUTF8(name, status_)) + return false; + if (!IsValidCaptureName(name)) { + status_->set_code(kRegexpBadNamedCapture); + status_->set_error_arg(capture); + return false; + } + + if (!DoLeftParen(name)) { + // DoLeftParen's failure set status_. + return false; + } + + s->remove_prefix(capture.end() - s->begin()); + return true; + } + + bool negated = false; + bool sawflags = false; + int nflags = flags_; + Rune c; + for (bool done = false; !done; ) { + if (t.size() == 0) + goto BadPerlOp; + if (StringPieceToRune(&c, &t, status_) < 0) + return false; + switch (c) { + default: + goto BadPerlOp; + + // Parse flags. + case 'i': + sawflags = true; + if (negated) + nflags &= ~FoldCase; + else + nflags |= FoldCase; + break; + + case 'm': // opposite of our OneLine + sawflags = true; + if (negated) + nflags |= OneLine; + else + nflags &= ~OneLine; + break; + + case 's': + sawflags = true; + if (negated) + nflags &= ~DotNL; + else + nflags |= DotNL; + break; + + case 'U': + sawflags = true; + if (negated) + nflags &= ~NonGreedy; + else + nflags |= NonGreedy; + break; + + // Negation + case '-': + if (negated) + goto BadPerlOp; + negated = true; + sawflags = false; + break; + + // Open new group. + case ':': + if (!DoLeftParenNoCapture()) { + // DoLeftParenNoCapture's failure set status_. + return false; + } + done = true; + break; + + // Finish flags. 
+ case ')': + done = true; + break; + } + } + + if (negated && !sawflags) + goto BadPerlOp; + + flags_ = static_cast(nflags); + *s = t; + return true; + +BadPerlOp: + status_->set_code(kRegexpBadPerlOp); + status_->set_error_arg(StringPiece(s->begin(), t.begin() - s->begin())); + return false; +} + +// Converts latin1 (assumed to be encoded as Latin1 bytes) +// into UTF8 encoding in string. +// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is +// deprecated and because it rejects code points 0x80-0x9F. +void ConvertLatin1ToUTF8(const StringPiece& latin1, string* utf) { + char buf[UTFmax]; + + utf->clear(); + for (int i = 0; i < latin1.size(); i++) { + Rune r = latin1[i] & 0xFF; + int n = runetochar(buf, &r); + utf->append(buf, n); + } +} + +// Parses the regular expression given by s, +// returning the corresponding Regexp tree. +// The caller must Decref the return value when done with it. +// Returns NULL on error. +Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, + RegexpStatus* status) { + // Make status non-NULL (easier on everyone else). + RegexpStatus xstatus; + if (status == NULL) + status = &xstatus; + + ParseState ps(global_flags, s, status); + StringPiece t = s; + + // Convert regexp to UTF-8 (easier on the rest of the parser). + if (global_flags & Latin1) { + string* tmp = new string; + ConvertLatin1ToUTF8(t, tmp); + status->set_tmp(tmp); + t = *tmp; + } + + if (global_flags & Literal) { + // Special parse loop for literal string. + while (t.size() > 0) { + Rune r; + if (StringPieceToRune(&r, &t, status) < 0) + return NULL; + if (!ps.PushLiteral(r)) + return NULL; + } + return ps.DoFinish(); + } + + StringPiece lastunary = NULL; + while (t.size() > 0) { + StringPiece isunary = NULL; + switch (t[0]) { + default: { + Rune r; + if (StringPieceToRune(&r, &t, status) < 0) + return NULL; + if (!ps.PushLiteral(r)) + return NULL; + break; + } + + case '(': + // "(?" introduces Perl escape. 
+ if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) { + // Flag changes and non-capturing groups. + if (!ps.ParsePerlFlags(&t)) + return NULL; + break; + } + if (ps.flags() & NeverCapture) { + if (!ps.DoLeftParenNoCapture()) + return NULL; + } else { + if (!ps.DoLeftParen(NULL)) + return NULL; + } + t.remove_prefix(1); // '(' + break; + + case '|': + if (!ps.DoVerticalBar()) + return NULL; + t.remove_prefix(1); // '|' + break; + + case ')': + if (!ps.DoRightParen()) + return NULL; + t.remove_prefix(1); // ')' + break; + + case '^': // Beginning of line. + if (!ps.PushCarat()) + return NULL; + t.remove_prefix(1); // '^' + break; + + case '$': // End of line. + if (!ps.PushDollar()) + return NULL; + t.remove_prefix(1); // '$' + break; + + case '.': // Any character (possibly except newline). + if (!ps.PushDot()) + return NULL; + t.remove_prefix(1); // '.' + break; + + case '[': { // Character class. + Regexp* re; + if (!ps.ParseCharClass(&t, &re, status)) + return NULL; + if (!ps.PushRegexp(re)) + return NULL; + break; + } + + case '*': { // Zero or more. + RegexpOp op; + op = kRegexpStar; + goto Rep; + case '+': // One or more. + op = kRegexpPlus; + goto Rep; + case '?': // Zero or one. + op = kRegexpQuest; + goto Rep; + Rep: + StringPiece opstr = t; + bool nongreedy = false; + t.remove_prefix(1); // '*' or '+' or '?' + if (ps.flags() & PerlX) { + if (t.size() > 0 && t[0] == '?') { + nongreedy = true; + t.remove_prefix(1); // '?' + } + if (lastunary.size() > 0) { + // In Perl it is not allowed to stack repetition operators: + // a** is a syntax error, not a double-star. + // (and a++ means something else entirely, which we don't support!) 
+ status->set_code(kRegexpRepeatOp); + status->set_error_arg(StringPiece(lastunary.begin(), + t.begin() - lastunary.begin())); + return NULL; + } + } + opstr.set(opstr.data(), t.data() - opstr.data()); + if (!ps.PushRepeatOp(op, opstr, nongreedy)) + return NULL; + isunary = opstr; + break; + } + + case '{': { // Counted repetition. + int lo, hi; + StringPiece opstr = t; + if (!MaybeParseRepetition(&t, &lo, &hi)) { + // Treat like a literal. + if (!ps.PushLiteral('{')) + return NULL; + t.remove_prefix(1); // '{' + break; + } + bool nongreedy = false; + if (ps.flags() & PerlX) { + if (t.size() > 0 && t[0] == '?') { + nongreedy = true; + t.remove_prefix(1); // '?' + } + if (lastunary.size() > 0) { + // Not allowed to stack repetition operators. + status->set_code(kRegexpRepeatOp); + status->set_error_arg(StringPiece(lastunary.begin(), + t.begin() - lastunary.begin())); + return NULL; + } + } + opstr.set(opstr.data(), t.data() - opstr.data()); + if (!ps.PushRepetition(lo, hi, opstr, nongreedy)) + return NULL; + isunary = opstr; + break; + } + + case '\\': { // Escaped character or Perl sequence. + // \b and \B: word boundary or not + if ((ps.flags() & Regexp::PerlB) && + t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) { + if (!ps.PushWordBoundary(t[1] == 'b')) + return NULL; + t.remove_prefix(2); // '\\', 'b' + break; + } + + if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) { + if (t[1] == 'A') { + if (!ps.PushSimpleOp(kRegexpBeginText)) + return NULL; + t.remove_prefix(2); // '\\', 'A' + break; + } + if (t[1] == 'z') { + if (!ps.PushSimpleOp(kRegexpEndText)) + return NULL; + t.remove_prefix(2); // '\\', 'z' + break; + } + // Do not recognize \Z, because this library can't + // implement the exact Perl/PCRE semantics. + // (This library treats "(?-m)$" as \z, even though + // in Perl and PCRE it is equivalent to \Z.) 
+ + if (t[1] == 'C') { // \C: any byte [sic] + if (!ps.PushSimpleOp(kRegexpAnyByte)) + return NULL; + t.remove_prefix(2); // '\\', 'C' + break; + } + + if (t[1] == 'Q') { // \Q ... \E: the ... is always literals + t.remove_prefix(2); // '\\', 'Q' + while (t.size() > 0) { + if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') { + t.remove_prefix(2); // '\\', 'E' + break; + } + Rune r; + if (StringPieceToRune(&r, &t, status) < 0) + return NULL; + if (!ps.PushLiteral(r)) + return NULL; + } + break; + } + } + + if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) { + Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); + re->ccb_ = new CharClassBuilder; + switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) { + case kParseOk: + if (!ps.PushRegexp(re)) + return NULL; + goto Break2; + case kParseError: + re->Decref(); + return NULL; + case kParseNothing: + re->Decref(); + break; + } + } + + const UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags()); + if (g != NULL) { + Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); + re->ccb_ = new CharClassBuilder; + AddUGroup(re->ccb_, g, g->sign, ps.flags()); + if (!ps.PushRegexp(re)) + return NULL; + break; + } + + Rune r; + if (!ParseEscape(&t, &r, status, ps.rune_max())) + return NULL; + if (!ps.PushLiteral(r)) + return NULL; + break; + } + } + Break2: + lastunary = isunary; + } + return ps.DoFinish(); +} + +} // namespace re2 diff --git a/outside/re2/re2/perl_groups.cc b/outside/re2/re2/perl_groups.cc new file mode 100644 index 000000000..422b3882d --- /dev/null +++ b/outside/re2/re2/perl_groups.cc @@ -0,0 +1,119 @@ +// GENERATED BY make_perl_groups.pl; DO NOT EDIT. 
+// make_perl_groups.pl >perl_groups.cc + +#include "re2/unicode_groups.h" + +namespace re2 { + +static const URange16 code1[] = { /* \d */ + { 0x30, 0x39 }, +}; +static const URange16 code2[] = { /* \s */ + { 0x9, 0xa }, + { 0xc, 0xd }, + { 0x20, 0x20 }, +}; +static const URange16 code3[] = { /* \w */ + { 0x30, 0x39 }, + { 0x41, 0x5a }, + { 0x5f, 0x5f }, + { 0x61, 0x7a }, +}; +const UGroup perl_groups[] = { + { "\\d", +1, code1, 1 }, + { "\\D", -1, code1, 1 }, + { "\\s", +1, code2, 3 }, + { "\\S", -1, code2, 3 }, + { "\\w", +1, code3, 4 }, + { "\\W", -1, code3, 4 }, +}; +const int num_perl_groups = 6; +static const URange16 code4[] = { /* [:alnum:] */ + { 0x30, 0x39 }, + { 0x41, 0x5a }, + { 0x61, 0x7a }, +}; +static const URange16 code5[] = { /* [:alpha:] */ + { 0x41, 0x5a }, + { 0x61, 0x7a }, +}; +static const URange16 code6[] = { /* [:ascii:] */ + { 0x0, 0x7f }, +}; +static const URange16 code7[] = { /* [:blank:] */ + { 0x9, 0x9 }, + { 0x20, 0x20 }, +}; +static const URange16 code8[] = { /* [:cntrl:] */ + { 0x0, 0x1f }, + { 0x7f, 0x7f }, +}; +static const URange16 code9[] = { /* [:digit:] */ + { 0x30, 0x39 }, +}; +static const URange16 code10[] = { /* [:graph:] */ + { 0x21, 0x7e }, +}; +static const URange16 code11[] = { /* [:lower:] */ + { 0x61, 0x7a }, +}; +static const URange16 code12[] = { /* [:print:] */ + { 0x20, 0x7e }, +}; +static const URange16 code13[] = { /* [:punct:] */ + { 0x21, 0x2f }, + { 0x3a, 0x40 }, + { 0x5b, 0x60 }, + { 0x7b, 0x7e }, +}; +static const URange16 code14[] = { /* [:space:] */ + { 0x9, 0xd }, + { 0x20, 0x20 }, +}; +static const URange16 code15[] = { /* [:upper:] */ + { 0x41, 0x5a }, +}; +static const URange16 code16[] = { /* [:word:] */ + { 0x30, 0x39 }, + { 0x41, 0x5a }, + { 0x5f, 0x5f }, + { 0x61, 0x7a }, +}; +static const URange16 code17[] = { /* [:xdigit:] */ + { 0x30, 0x39 }, + { 0x41, 0x46 }, + { 0x61, 0x66 }, +}; +const UGroup posix_groups[] = { + { "[:alnum:]", +1, code4, 3 }, + { "[:^alnum:]", -1, code4, 3 }, + { 
"[:alpha:]", +1, code5, 2 }, + { "[:^alpha:]", -1, code5, 2 }, + { "[:ascii:]", +1, code6, 1 }, + { "[:^ascii:]", -1, code6, 1 }, + { "[:blank:]", +1, code7, 2 }, + { "[:^blank:]", -1, code7, 2 }, + { "[:cntrl:]", +1, code8, 2 }, + { "[:^cntrl:]", -1, code8, 2 }, + { "[:digit:]", +1, code9, 1 }, + { "[:^digit:]", -1, code9, 1 }, + { "[:graph:]", +1, code10, 1 }, + { "[:^graph:]", -1, code10, 1 }, + { "[:lower:]", +1, code11, 1 }, + { "[:^lower:]", -1, code11, 1 }, + { "[:print:]", +1, code12, 1 }, + { "[:^print:]", -1, code12, 1 }, + { "[:punct:]", +1, code13, 4 }, + { "[:^punct:]", -1, code13, 4 }, + { "[:space:]", +1, code14, 2 }, + { "[:^space:]", -1, code14, 2 }, + { "[:upper:]", +1, code15, 1 }, + { "[:^upper:]", -1, code15, 1 }, + { "[:word:]", +1, code16, 4 }, + { "[:^word:]", -1, code16, 4 }, + { "[:xdigit:]", +1, code17, 3 }, + { "[:^xdigit:]", -1, code17, 3 }, +}; +const int num_posix_groups = 28; + +} // namespace re2 diff --git a/outside/re2/re2/prefilter.cc b/outside/re2/re2/prefilter.cc new file mode 100644 index 000000000..1b1245875 --- /dev/null +++ b/outside/re2/re2/prefilter.cc @@ -0,0 +1,715 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "util/util.h" +#include "re2/prefilter.h" +#include "re2/re2.h" +#include "re2/unicode_casefold.h" +#include "re2/walker-inl.h" + +namespace re2 { + +static const int Trace = false; + +typedef set::iterator SSIter; +typedef set::const_iterator ConstSSIter; + +static int alloc_id = 100000; // Used for debugging. +// Initializes a Prefilter, allocating subs_ as necessary. +Prefilter::Prefilter(Op op) { + op_ = op; + subs_ = NULL; + if (op_ == AND || op_ == OR) + subs_ = new vector; + + alloc_id_ = alloc_id++; + VLOG(10) << "alloc_id: " << alloc_id_; +} + +// Destroys a Prefilter. 
+Prefilter::~Prefilter() { + VLOG(10) << "Deleted: " << alloc_id_; + if (subs_) { + for (int i = 0; i < subs_->size(); i++) + delete (*subs_)[i]; + delete subs_; + subs_ = NULL; + } +} + +// Simplify if the node is an empty Or or And. +Prefilter* Prefilter::Simplify() { + if (op_ != AND && op_ != OR) { + return this; + } + + // Nothing left in the AND/OR. + if (subs_->size() == 0) { + if (op_ == AND) + op_ = ALL; // AND of nothing is true + else + op_ = NONE; // OR of nothing is false + + return this; + } + + // Just one subnode: throw away wrapper. + if (subs_->size() == 1) { + Prefilter* a = (*subs_)[0]; + subs_->clear(); + delete this; + return a->Simplify(); + } + + return this; +} + +// Combines two Prefilters together to create an "op" (AND or OR). +// The passed Prefilters will be part of the returned Prefilter or deleted. +// Does lots of work to avoid creating unnecessarily complicated structures. +Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) { + // If a, b can be rewritten as op, do so. + a = a->Simplify(); + b = b->Simplify(); + + // Canonicalize: a->op <= b->op. + if (a->op() > b->op()) { + Prefilter* t = a; + a = b; + b = t; + } + + // Trivial cases. + // ALL AND b = b + // NONE OR b = b + // ALL OR b = ALL + // NONE AND b = NONE + // Don't need to look at b, because of canonicalization above. + // ALL and NONE are smallest opcodes. + if (a->op() == ALL || a->op() == NONE) { + if ((a->op() == ALL && op == AND) || + (a->op() == NONE && op == OR)) { + delete a; + return b; + } else { + delete b; + return a; + } + } + + // If a and b match op, merge their contents. + if (a->op() == op && b->op() == op) { + for (int i = 0; i < b->subs()->size(); i++) { + Prefilter* bb = (*b->subs())[i]; + a->subs()->push_back(bb); + } + b->subs()->clear(); + delete b; + return a; + } + + // If a already has the same op as the op that is under construction + // add in b (similarly if b already has the same op, add in a). 
// Removes redundant strings from *ss in place.
//
// For example, if we know "ab" is a required string, then it doesn't
// help at all to know that "abc" is also a required string, so
// "abc" is deleted.  This is because, when we are performing a
// string search to filter regexps, matching ab will already allow
// this regexp to be a candidate for match, so further matching abc
// is redundant.
//
// Each string is compared only against the strings that follow it
// in the set's iteration order.
static void SimplifyStringSet(set<string>* ss) {
  for (set<string>::iterator i = ss->begin(); i != ss->end(); ++i) {
    set<string>::iterator j = i;
    ++j;
    while (j != ss->end()) {
      // Increment j early so that we can erase the element it points to.
      set<string>::iterator old_j = j;
      ++j;
      if (old_j->find(*i) != string::npos)
        ss->erase(old_j);
    }
  }
}
When it is no longer an exact + // set of strings that match this RE, then is_exact_ + // is false and the match_ contains the required match + // criteria. + bool is_exact_; + + // Accumulated Prefilter query that any + // match for this regexp is guaranteed to match. + Prefilter* match_; +}; + + +Prefilter::Info::Info() + : is_exact_(false), + match_(NULL) { +} + +Prefilter::Info::~Info() { + delete match_; +} + +Prefilter* Prefilter::Info::TakeMatch() { + if (is_exact_) { + match_ = Prefilter::OrStrings(&exact_); + is_exact_ = false; + } + Prefilter* m = match_; + match_ = NULL; + return m; +} + +// Format a Info in string form. +string Prefilter::Info::ToString() { + if (this == NULL) { + // Sometimes when iterating on children of a node, + // some children might have NULL Info. Adding + // the check here for NULL to take care of cases where + // the caller is not checking. + return ""; + } + + if (is_exact_) { + int n = 0; + string s; + for (set::iterator i = exact_.begin(); i != exact_.end(); ++i) { + if (n++ > 0) + s += ","; + s += *i; + } + return s; + } + + if (match_) + return match_->DebugString(); + + return ""; +} + +// Add the strings from src to dst. +static void CopyIn(const set& src, set* dst) { + for (ConstSSIter i = src.begin(); i != src.end(); ++i) + dst->insert(*i); +} + +// Add the cross-product of a and b to dst. +// (For each string i in a and j in b, add i+j.) +static void CrossProduct(const set& a, + const set& b, + set* dst) { + for (ConstSSIter i = a.begin(); i != a.end(); ++i) + for (ConstSSIter j = b.begin(); j != b.end(); ++j) + dst->insert(*i + *j); +} + +// Concats a and b. Requires that both are exact sets. +// Forms an exact set that is a crossproduct of a and b. 
+Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {  // deletes a and b
+  if (a == NULL)  // nothing accumulated yet: b is returned unchanged
+    return b;
+  DCHECK(a->is_exact_);
+  DCHECK(b && b->is_exact_);
+  Info *ab = new Info();
+  // ab->exact_ receives i+j for every string i in a and j in b.
+  CrossProduct(a->exact_, b->exact_, &ab->exact_);
+  ab->is_exact_ = true;
+
+  delete a;
+  delete b;
+  return ab;
+}
+
+// Constructs an inexact Info for ab given a and b.
+// Used only when a or b is not exact or when the
+// exact cross product is likely to be too big.
+// A NULL argument is treated as "nothing to add": the other one is returned.
+Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
+  if (a == NULL)
+    return b;
+  if (b == NULL)
+    return a;
+
+  Info *ab = new Info();
+  // TakeMatch() hands over each side's Prefilter; an exact set is first turned into an OR of strings.
+  ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
+  ab->is_exact_ = false;
+  delete a;
+  delete b;
+  return ab;
+}
+
+// Constructs Info for a|b given a and b.  Deletes a and b; neither may be NULL.
+Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
+  Info *ab = new Info();
+
+  if (a->is_exact_ && b->is_exact_) {  // union of the two exact sets stays exact
+    CopyIn(a->exact_, &ab->exact_);
+    CopyIn(b->exact_, &ab->exact_);
+    ab->is_exact_ = true;
+  } else {
+    // Either a or b has is_exact_ = false. If the other
+    // one has is_exact_ = true, we move it to match_ and
+    // then create a OR of a,b. The resulting Info has
+    // is_exact_ = false.
+    ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
+    ab->is_exact_ = false;
+  }
+
+  delete a;
+  delete b;
+  return ab;
+}
+
+// Constructs Info for a? given a.  Deletes a; the result is an inexact ALL match.
+Prefilter::Info* Prefilter::Info::Quest(Info *a) {
+  Info *ab = new Info();
+
+  ab->is_exact_ = false;
+  ab->match_ = new Prefilter(ALL);  // a may be absent, so nothing can be required
+  delete a;
+  return ab;
+}
+
+// Constructs Info for a* given a.
+// Same as a? -- not much to do.
+Prefilter::Info* Prefilter::Info::Star(Info *a) {
+  return Quest(a);
+}
+
+// Constructs Info for a+ given a. If a was exact set, it isn't
+// anymore.
+Prefilter::Info* Prefilter::Info::Plus(Info *a) {  // deletes a
+  Info *ab = new Info();
+  // Keep a's requirement (as a Prefilter) but drop exactness.
+  ab->match_ = a->TakeMatch();
+  ab->is_exact_ = false;
+
+  delete a;
+  return ab;
+}
+// Returns the bytes runetochar() writes for r; its return value n is used as the length.
+static string RuneToString(Rune r) {
+  char buf[UTFmax];
+  int n = runetochar(buf, &r);
+  return string(buf, n);
+}
+// Returns a one-byte string holding the low 8 bits of r.
+static string RuneToStringLatin1(Rune r) {
+  char c = r & 0xff;
+  return string(&c, 1);
+}
+
+// Constructs Info for literal rune.  The rune is lower-cased via ToLowerRune first.
+Prefilter::Info* Prefilter::Info::Literal(Rune r) {
+  Info* info = new Info();
+  info->exact_.insert(RuneToString(ToLowerRune(r)));
+  info->is_exact_ = true;
+  return info;
+}
+
+// Constructs Info for literal rune for Latin1 encoded string.
+Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
+  Info* info = new Info();
+  info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));  // lower-cases A-Z only
+  info->is_exact_ = true;
+  return info;
+}
+
+// Constructs Info for dot (any character).  Inexact, with an ALL match.
+Prefilter::Info* Prefilter::Info::AnyChar() {
+  Prefilter::Info* info = new Prefilter::Info();
+  info->match_ = new Prefilter(ALL);
+  return info;
+}
+
+// Constructs Prefilter::Info for no possible match.  Inexact, with a NONE match.
+Prefilter::Info* Prefilter::Info::NoMatch() {
+  Prefilter::Info* info = new Prefilter::Info();
+  info->match_ = new Prefilter(NONE);
+  return info;
+}
+
+// Constructs Prefilter::Info for any possible match.
+// This Prefilter::Info is valid for any regular expression,
+// since it makes no assertions whatsoever about the
+// strings being matched.
+Prefilter::Info* Prefilter::Info::AnyMatch() {
+  Prefilter::Info *info = new Prefilter::Info();
+  info->match_ = new Prefilter(ALL);  // same construction as AnyChar()
+  return info;
+}
+
+// Constructs Prefilter::Info for just the empty string.
+Prefilter::Info* Prefilter::Info::EmptyString() {
+  Prefilter::Info* info = new Prefilter::Info();
+  info->is_exact_ = true;
+  info->exact_.insert("");  // exact set containing only ""
+  return info;
+}
+
+// Constructs Prefilter::Info for a character class.
+typedef CharClass::iterator CCIter;
+Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
+                                         bool latin1) {
+  if (Trace) {
+    VLOG(0) << "CharClassInfo:";
+    for (CCIter i = cc->begin(); i != cc->end(); ++i)
+      VLOG(0) << " " << i->lo << "-" << i->hi;
+  }
+
+  // If the class is too large, it's okay to overestimate.
+  if (cc->size() > 10)
+    return AnyChar();
+  // Otherwise enumerate every rune of every range, lower-cased, into an exact set.
+  Prefilter::Info *a = new Prefilter::Info();
+  for (CCIter i = cc->begin(); i != cc->end(); ++i)
+    for (Rune r = i->lo; r <= i->hi; r++) {
+      if (latin1) {
+        a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
+      } else {
+        a->exact_.insert(RuneToString(ToLowerRune(r)));
+      }
+    }
+
+
+  a->is_exact_ = true;
+
+  if (Trace) {
+    VLOG(0) << " = " << a->ToString();
+  }
+
+  return a;
+}
+// Regexp walker that computes an Info* per node; latin1 selects the Latin-1 literal helpers.
+class Prefilter::Info::Walker : public Regexp::Walker<Info*> {
+ public:
+  Walker(bool latin1) : latin1_(latin1) {}
+
+  virtual Info* PostVisit(
+      Regexp* re, Info* parent_arg,
+      Info* pre_arg,
+      Info** child_args, int nchild_args);
+
+  virtual Info* ShortVisit(
+      Regexp* re,
+      Info* parent_arg);
+
+  bool latin1() { return latin1_; }
+ private:
+  bool latin1_;
+  DISALLOW_EVIL_CONSTRUCTORS(Walker);
+};
+// Walks re and returns its Info (caller owns it), or NULL if the walk stopped early.
+Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
+  if (Trace) {
+    LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
+  }
+
+  bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
+  Prefilter::Info::Walker w(latin1);
+  Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
+
+  if (w.stopped_early()) {  // a partial result is discarded rather than returned
+    delete info;
+    return NULL;
+  }
+
+  return info;
+}
+// Result used for a node the walker does not fully visit: claim nothing (AnyMatch).
+Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
+    Regexp* re, Prefilter::Info* parent_arg) {
+  return AnyMatch();
+}
+
+// Constructs the Prefilter::Info for the given regular expression.
+// Assumes re is simplified.
+Prefilter::Info* Prefilter::Info::Walker::PostVisit( + Regexp* re, Prefilter::Info* parent_arg, + Prefilter::Info* pre_arg, Prefilter::Info** child_args, + int nchild_args) { + Prefilter::Info *info; + switch (re->op()) { + default: + case kRegexpRepeat: + LOG(DFATAL) << "Bad regexp op " << re->op(); + info = EmptyString(); + break; + + case kRegexpNoMatch: + info = NoMatch(); + break; + + // These ops match the empty string: + case kRegexpEmptyMatch: // anywhere + case kRegexpBeginLine: // at beginning of line + case kRegexpEndLine: // at end of line + case kRegexpBeginText: // at beginning of text + case kRegexpEndText: // at end of text + case kRegexpWordBoundary: // at word boundary + case kRegexpNoWordBoundary: // not at word boundary + info = EmptyString(); + break; + + case kRegexpLiteral: + if (latin1()) { + info = LiteralLatin1(re->rune()); + } + else { + info = Literal(re->rune()); + } + break; + + case kRegexpLiteralString: + if (re->nrunes() == 0) { + info = NoMatch(); + break; + } + if (latin1()) { + info = LiteralLatin1(re->runes()[0]); + for (int i = 1; i < re->nrunes(); i++) { + info = Concat(info, LiteralLatin1(re->runes()[i])); + } + } else { + info = Literal(re->runes()[0]); + for (int i = 1; i < re->nrunes(); i++) { + info = Concat(info, Literal(re->runes()[i])); + } + } + break; + + case kRegexpConcat: { + // Accumulate in info. + // Exact is concat of recent contiguous exact nodes. + info = NULL; + Info* exact = NULL; + for (int i = 0; i < nchild_args; i++) { + Info* ci = child_args[i]; // child info + if (!ci->is_exact() || + (exact && ci->exact().size() * exact->exact().size() > 16)) { + // Exact run is over. + info = And(info, exact); + exact = NULL; + // Add this child's info. + info = And(info, ci); + } else { + // Append to exact run. 
+ exact = Concat(exact, ci); + } + } + info = And(info, exact); + } + break; + + case kRegexpAlternate: + info = child_args[0]; + for (int i = 1; i < nchild_args; i++) + info = Alt(info, child_args[i]); + VLOG(10) << "Alt: " << info->ToString(); + break; + + case kRegexpStar: + info = Star(child_args[0]); + break; + + case kRegexpQuest: + info = Quest(child_args[0]); + break; + + case kRegexpPlus: + info = Plus(child_args[0]); + break; + + case kRegexpAnyChar: + // Claim nothing, except that it's not empty. + info = AnyChar(); + break; + + case kRegexpCharClass: + info = CClass(re->cc(), latin1()); + break; + + case kRegexpCapture: + // These don't affect the set of matching strings. + info = child_args[0]; + break; + } + + if (Trace) { + VLOG(0) << "BuildInfo " << re->ToString() + << ": " << info->ToString(); + } + + return info; +} + + +Prefilter* Prefilter::FromRegexp(Regexp* re) { + if (re == NULL) + return NULL; + + Regexp* simple = re->Simplify(); + Prefilter::Info *info = BuildInfo(simple); + + simple->Decref(); + if (info == NULL) + return NULL; + + Prefilter* m = info->TakeMatch(); + + delete info; + return m; +} + +string Prefilter::DebugString() const { + if (this == NULL) + return ""; + + switch (op_) { + default: + LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_; + return StringPrintf("op%d", op_); + case NONE: + return "*no-matches*"; + case ATOM: + return atom_; + case ALL: + return ""; + case AND: { + string s = ""; + for (int i = 0; i < subs_->size(); i++) { + if (i > 0) + s += " "; + s += (*subs_)[i]->DebugString(); + } + return s; + } + case OR: { + string s = "("; + for (int i = 0; i < subs_->size(); i++) { + if (i > 0) + s += "|"; + s += (*subs_)[i]->DebugString(); + } + s += ")"; + return s; + } + } +} + +Prefilter* Prefilter::FromRE2(const RE2* re2) { + if (re2 == NULL) + return NULL; + + Regexp* regexp = re2->Regexp(); + if (regexp == NULL) + return NULL; + + return FromRegexp(regexp); +} + + +} // namespace re2 diff --git 
a/outside/re2/re2/prefilter.h b/outside/re2/re2/prefilter.h new file mode 100644 index 000000000..c2f9dddd8 --- /dev/null +++ b/outside/re2/re2/prefilter.h @@ -0,0 +1,105 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Prefilter is the class used to extract string guards from regexps. +// Rather than using Prefilter class directly, use FilteredRE2. +// See filtered_re2.h + +#ifndef RE2_PREFILTER_H_ +#define RE2_PREFILTER_H_ + +#include "util/util.h" + +namespace re2 { + +class RE2; + +class Regexp; + +class Prefilter { + // Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h + public: + enum Op { + ALL = 0, // Everything matches + NONE, // Nothing matches + ATOM, // The string atom() must match + AND, // All in subs() must match + OR, // One of subs() must match + }; + + explicit Prefilter(Op op); + ~Prefilter(); + + Op op() { return op_; } + const string& atom() const { return atom_; } + void set_unique_id(int id) { unique_id_ = id; } + int unique_id() const { return unique_id_; } + + // The children of the Prefilter node. + vector* subs() { + CHECK(op_ == AND || op_ == OR); + return subs_; + } + + // Set the children vector. Prefilter takes ownership of subs and + // subs_ will be deleted when Prefilter is deleted. + void set_subs(vector* subs) { subs_ = subs; } + + // Given a RE2, return a Prefilter. The caller takes ownership of + // the Prefilter and should deallocate it. Returns NULL if Prefilter + // cannot be formed. + static Prefilter* FromRE2(const RE2* re2); + + // Returns a readable debug string of the prefilter. + string DebugString() const; + + private: + class Info; + + // Combines two prefilters together to create an AND. The passed + // Prefilters will be part of the returned Prefilter or deleted. + static Prefilter* And(Prefilter* a, Prefilter* b); + + // Combines two prefilters together to create an OR. 
The passed + // Prefilters will be part of the returned Prefilter or deleted. + static Prefilter* Or(Prefilter* a, Prefilter* b); + + // Generalized And/Or + static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b); + + static Prefilter* FromRegexp(Regexp* a); + + static Prefilter* FromString(const string& str); + + static Prefilter* OrStrings(set* ss); + + static Info* BuildInfo(Regexp* re); + + Prefilter* Simplify(); + + // Kind of Prefilter. + Op op_; + + // Sub-matches for AND or OR Prefilter. + vector* subs_; + + // Actual string to match in leaf node. + string atom_; + + // If different prefilters have the same string atom, or if they are + // structurally the same (e.g., OR of same atom strings) they are + // considered the same unique nodes. This is the id for each unique + // node. This field is populated with a unique id for every node, + // and -1 for duplicate nodes. + int unique_id_; + + // Used for debugging, helps in tracking memory leaks. + int alloc_id_; + + DISALLOW_EVIL_CONSTRUCTORS(Prefilter); +}; + +} // namespace re2 + +#endif // RE2_PREFILTER_H_ diff --git a/outside/re2/re2/prefilter_tree.cc b/outside/re2/re2/prefilter_tree.cc new file mode 100644 index 000000000..62ede5a52 --- /dev/null +++ b/outside/re2/re2/prefilter_tree.cc @@ -0,0 +1,397 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "util/util.h" +#include "util/flags.h" +#include "re2/prefilter.h" +#include "re2/prefilter_tree.h" +#include "re2/re2.h" + +DEFINE_int32(filtered_re2_min_atom_len, + 3, + "Strings less than this length are not stored as atoms"); + +namespace re2 { + +PrefilterTree::PrefilterTree() + : compiled_(false) { +} + +PrefilterTree::~PrefilterTree() { + for (int i = 0; i < prefilter_vec_.size(); i++) + delete prefilter_vec_[i]; + + for (int i = 0; i < entries_.size(); i++) + delete entries_[i].parents; +} + +// Functions used for adding and Compiling prefilters to the +// PrefilterTree. +static bool KeepPart(Prefilter* prefilter, int level) { + if (prefilter == NULL) + return false; + + switch (prefilter->op()) { + default: + LOG(DFATAL) << "Unexpected op in KeepPart: " + << prefilter->op(); + return false; + + case Prefilter::ALL: + return false; + + case Prefilter::ATOM: + return prefilter->atom().size() >= + FLAGS_filtered_re2_min_atom_len; + + case Prefilter::AND: { + int j = 0; + vector* subs = prefilter->subs(); + for (int i = 0; i < subs->size(); i++) + if (KeepPart((*subs)[i], level + 1)) + (*subs)[j++] = (*subs)[i]; + else + delete (*subs)[i]; + + subs->resize(j); + return j > 0; + } + + case Prefilter::OR: + for (int i = 0; i < prefilter->subs()->size(); i++) + if (!KeepPart((*prefilter->subs())[i], level + 1)) + return false; + return true; + } +} + +void PrefilterTree::Add(Prefilter *f) { + if (compiled_) { + LOG(DFATAL) << "Add after Compile."; + return; + } + if (f != NULL && !KeepPart(f, 0)) { + delete f; + f = NULL; + } + + prefilter_vec_.push_back(f); +} + +void PrefilterTree::Compile(vector* atom_vec) { + if (compiled_) { + LOG(DFATAL) << "Compile after Compile."; + return; + } + + // We do this check to support some legacy uses of + // PrefilterTree that call Compile before adding any regexps, + // and expect Compile not to have effect. 
+ if (prefilter_vec_.empty()) + return; + + compiled_ = true; + + AssignUniqueIds(atom_vec); + + // Identify nodes that are too common among prefilters and are + // triggering too many parents. Then get rid of them if possible. + // Note that getting rid of a prefilter node simply means they are + // no longer necessary for their parent to trigger; that is, we do + // not miss out on any regexps triggering by getting rid of a + // prefilter node. + for (int i = 0; i < entries_.size(); i++) { + StdIntMap* parents = entries_[i].parents; + if (parents->size() > 8) { + // This one triggers too many things. If all the parents are AND + // nodes and have other things guarding them, then get rid of + // this trigger. TODO(vsri): Adjust the threshold appropriately, + // make it a function of total number of nodes? + bool have_other_guard = true; + for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it) + have_other_guard = have_other_guard && + (entries_[it->first].propagate_up_at_count > 1); + + if (have_other_guard) { + for (StdIntMap::iterator it = parents->begin(); + it != parents->end(); ++it) + entries_[it->first].propagate_up_at_count -= 1; + + parents->clear(); // Forget the parents + } + } + } + + PrintDebugInfo(); +} + +Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) { + string node_string = NodeString(node); + map::iterator iter = node_map_.find(node_string); + if (iter == node_map_.end()) + return NULL; + return (*iter).second; +} + +static string Itoa(int n) { + char buf[100]; + snprintf(buf, sizeof buf, "%d", n); + return string(buf); +} + +string PrefilterTree::NodeString(Prefilter* node) const { + // Adding the operation disambiguates AND/OR/atom nodes. 
+ string s = Itoa(node->op()) + ":"; + if (node->op() == Prefilter::ATOM) { + s += node->atom(); + } else { + for (int i = 0; i < node->subs()->size() ; i++) { + if (i > 0) + s += ','; + s += Itoa((*node->subs())[i]->unique_id()); + } + } + return s; +} + +void PrefilterTree::AssignUniqueIds(vector* atom_vec) { + atom_vec->clear(); + + // Build vector of all filter nodes, sorted topologically + // from top to bottom in v. + vector v; + + // Add the top level nodes of each regexp prefilter. + for (int i = 0; i < prefilter_vec_.size(); i++) { + Prefilter* f = prefilter_vec_[i]; + if (f == NULL) + unfiltered_.push_back(i); + + // We push NULL also on to v, so that we maintain the + // mapping of index==regexpid for level=0 prefilter nodes. + v.push_back(f); + } + + // Now add all the descendant nodes. + for (int i = 0; i < v.size(); i++) { + Prefilter* f = v[i]; + if (f == NULL) + continue; + if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { + const vector& subs = *f->subs(); + for (int j = 0; j < subs.size(); j++) + v.push_back(subs[j]); + } + } + + // Identify unique nodes. + int unique_id = 0; + for (int i = v.size() - 1; i >= 0; i--) { + Prefilter *node = v[i]; + if (node == NULL) + continue; + node->set_unique_id(-1); + Prefilter* canonical = CanonicalNode(node); + if (canonical == NULL) { + // Any further nodes that have the same node string + // will find this node as the canonical node. + node_map_[NodeString(node)] = node; + if (node->op() == Prefilter::ATOM) { + atom_vec->push_back(node->atom()); + atom_index_to_id_.push_back(unique_id); + } + node->set_unique_id(unique_id++); + } else { + node->set_unique_id(canonical->unique_id()); + } + } + entries_.resize(node_map_.size()); + + // Create parent IntMap for the entries. 
+ for (int i = v.size() - 1; i >= 0; i--) { + Prefilter* prefilter = v[i]; + if (prefilter == NULL) + continue; + + if (CanonicalNode(prefilter) != prefilter) + continue; + + Entry* entry = &entries_[prefilter->unique_id()]; + entry->parents = new StdIntMap(); + } + + // Fill the entries. + for (int i = v.size() - 1; i >= 0; i--) { + Prefilter* prefilter = v[i]; + if (prefilter == NULL) + continue; + + if (CanonicalNode(prefilter) != prefilter) + continue; + + Entry* entry = &entries_[prefilter->unique_id()]; + + switch (prefilter->op()) { + default: + case Prefilter::ALL: + LOG(DFATAL) << "Unexpected op: " << prefilter->op(); + return; + + case Prefilter::ATOM: + entry->propagate_up_at_count = 1; + break; + + case Prefilter::OR: + case Prefilter::AND: { + set uniq_child; + for (int j = 0; j < prefilter->subs()->size() ; j++) { + Prefilter* child = (*prefilter->subs())[j]; + Prefilter* canonical = CanonicalNode(child); + if (canonical == NULL) { + LOG(DFATAL) << "Null canonical node"; + return; + } + int child_id = canonical->unique_id(); + uniq_child.insert(child_id); + // To the child, we want to add to parent indices. + Entry* child_entry = &entries_[child_id]; + if (child_entry->parents->find(prefilter->unique_id()) == child_entry->parents->end()) + (*child_entry->parents)[prefilter->unique_id()] = 1; + } + entry->propagate_up_at_count = + prefilter->op() == Prefilter::AND ? uniq_child.size() : 1; + + break; + } + } + } + + // For top level nodes, populate regexp id. + for (int i = 0; i < prefilter_vec_.size(); i++) { + if (prefilter_vec_[i] == NULL) + continue; + int id = CanonicalNode(prefilter_vec_[i])->unique_id(); + DCHECK_LE(0, id); + Entry* entry = &entries_[id]; + entry->regexps.push_back(i); + } +} + +// Functions for triggering during search. 
+void PrefilterTree::RegexpsGivenStrings( + const vector& matched_atoms, + vector* regexps) const { + regexps->clear(); + if (!compiled_) { + LOG(WARNING) << "Compile() not called"; + for (int i = 0; i < prefilter_vec_.size(); ++i) + regexps->push_back(i); + } else { + if (!prefilter_vec_.empty()) { + IntMap regexps_map(prefilter_vec_.size()); + vector matched_atom_ids; + for (int j = 0; j < matched_atoms.size(); j++) { + matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); + VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]]; + } + PropagateMatch(matched_atom_ids, ®exps_map); + for (IntMap::iterator it = regexps_map.begin(); + it != regexps_map.end(); + ++it) + regexps->push_back(it->index()); + + regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); + } + } + sort(regexps->begin(), regexps->end()); +} + +void PrefilterTree::PropagateMatch(const vector& atom_ids, + IntMap* regexps) const { + IntMap count(entries_.size()); + IntMap work(entries_.size()); + for (int i = 0; i < atom_ids.size(); i++) + work.set(atom_ids[i], 1); + for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { + const Entry& entry = entries_[it->index()]; + VLOG(10) << "Processing: " << it->index(); + // Record regexps triggered. + for (int i = 0; i < entry.regexps.size(); i++) { + VLOG(10) << "Regexp triggered: " << entry.regexps[i]; + regexps->set(entry.regexps[i], 1); + } + int c; + // Pass trigger up to parents. + for (StdIntMap::iterator it = entry.parents->begin(); + it != entry.parents->end(); + ++it) { + int j = it->first; + const Entry& parent = entries_[j]; + VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count; + // Delay until all the children have succeeded. 
+ if (parent.propagate_up_at_count > 1) { + if (count.has_index(j)) { + c = count.get_existing(j) + 1; + count.set_existing(j, c); + } else { + c = 1; + count.set_new(j, c); + } + if (c < parent.propagate_up_at_count) + continue; + } + VLOG(10) << "Triggering: " << j; + // Trigger the parent. + work.set(j, 1); + } + } +} + +// Debugging help. +void PrefilterTree::PrintPrefilter(int regexpid) { + LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]); +} + +void PrefilterTree::PrintDebugInfo() { + VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size(); + VLOG(10) << "#Unique Nodes: " << entries_.size(); + + for (int i = 0; i < entries_.size(); ++i) { + StdIntMap* parents = entries_[i].parents; + const vector& regexps = entries_[i].regexps; + VLOG(10) << "EntryId: " << i + << " N: " << parents->size() << " R: " << regexps.size(); + for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it) + VLOG(10) << it->first; + } + VLOG(10) << "Map:"; + for (map::const_iterator iter = node_map_.begin(); + iter != node_map_.end(); ++iter) + VLOG(10) << "NodeId: " << (*iter).second->unique_id() + << " Str: " << (*iter).first; +} + +string PrefilterTree::DebugNodeString(Prefilter* node) const { + string node_string = ""; + + if (node->op() == Prefilter::ATOM) { + DCHECK(!node->atom().empty()); + node_string += node->atom(); + } else { + // Adding the operation disambiguates AND and OR nodes. + node_string += node->op() == Prefilter::AND ? 
"AND" : "OR"; + node_string += "("; + for (int i = 0; i < node->subs()->size() ; i++) { + if (i > 0) + node_string += ','; + node_string += Itoa((*node->subs())[i]->unique_id()); + node_string += ":"; + node_string += DebugNodeString((*node->subs())[i]); + } + node_string += ")"; + } + return node_string; +} + +} // namespace re2 diff --git a/outside/re2/re2/prefilter_tree.h b/outside/re2/re2/prefilter_tree.h new file mode 100644 index 000000000..e1d3e5f9b --- /dev/null +++ b/outside/re2/re2/prefilter_tree.h @@ -0,0 +1,131 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The PrefilterTree class is used to form an AND-OR tree of strings +// that would trigger each regexp. The 'prefilter' of each regexp is +// added tp PrefilterTree, and then PrefilterTree is used to find all +// the unique strings across the prefilters. During search, by using +// matches from a string matching engine, PrefilterTree deduces the +// set of regexps that are to be triggered. The 'string matching +// engine' itself is outside of this class, and the caller can use any +// favorite engine. PrefilterTree provides a set of strings (called +// atoms) that the user of this class should use to do the string +// matching. +// +#ifndef RE2_PREFILTER_TREE_H_ +#define RE2_PREFILTER_TREE_H_ + +#include "util/util.h" +#include "util/sparse_array.h" + +namespace re2 { + +typedef SparseArray IntMap; +typedef map StdIntMap; + +class Prefilter; + +class PrefilterTree { + public: + PrefilterTree(); + ~PrefilterTree(); + + // Adds the prefilter for the next regexp. Note that we assume that + // Add called sequentially for all regexps. All Add calls + // must precede Compile. + void Add(Prefilter* prefilter); + + // The Compile returns a vector of string in atom_vec. + // Call this after all the prefilters are added through Add. + // No calls to Add after Compile are allowed. 
+ // The caller should use the returned set of strings to do string matching. + // Each time a string matches, the corresponding index then has to be + // and passed to RegexpsGivenStrings below. + void Compile(vector* atom_vec); + + // Given the indices of the atoms that matched, returns the indexes + // of regexps that should be searched. The matched_atoms should + // contain all the ids of string atoms that were found to match the + // content. The caller can use any string match engine to perform + // this function. This function is thread safe. + void RegexpsGivenStrings(const vector& matched_atoms, + vector* regexps) const; + + // Print debug prefilter. Also prints unique ids associated with + // nodes of the prefilter of the regexp. + void PrintPrefilter(int regexpid); + + + // Each unique node has a corresponding Entry that helps in + // passing the matching trigger information along the tree. + struct Entry { + public: + // How many children should match before this node triggers the + // parent. For an atom and an OR node, this is 1 and for an AND + // node, it is the number of unique children. + int propagate_up_at_count; + + // When this node is ready to trigger the parent, what are the indices + // of the parent nodes to trigger. The reason there may be more than + // one is because of sharing. For example (abc | def) and (xyz | def) + // are two different nodes, but they share the atom 'def'. So when + // 'def' matches, it triggers two parents, corresponding to the two + // different OR nodes. + StdIntMap* parents; + + // When this node is ready to trigger the parent, what are the + // regexps that are triggered. + vector regexps; + }; + + private: + // This function assigns unique ids to various parts of the + // prefilter, by looking at if these nodes are already in the + // PrefilterTree. + void AssignUniqueIds(vector* atom_vec); + + // Given the matching atoms, find the regexps to be triggered. 
+ void PropagateMatch(const vector& atom_ids, + IntMap* regexps) const; + + // Returns the prefilter node that has the same NodeString as this + // node. For the canonical node, returns node. + Prefilter* CanonicalNode(Prefilter* node); + + // A string that uniquely identifies the node. Assumes that the + // children of node has already been assigned unique ids. + string NodeString(Prefilter* node) const; + + // Recursively constructs a readable prefilter string. + string DebugNodeString(Prefilter* node) const; + + // Used for debugging. + void PrintDebugInfo(); + + // These are all the nodes formed by Compile. Essentially, there is + // one node for each unique atom and each unique AND/OR node. + vector entries_; + + // Map node string to canonical Prefilter node. + map node_map_; + + // indices of regexps that always pass through the filter (since we + // found no required literals in these regexps). + vector unfiltered_; + + // vector of Prefilter for all regexps. + vector prefilter_vec_; + + // Atom index in returned strings to entry id mapping. + vector atom_index_to_id_; + + // Has the prefilter tree been compiled. + bool compiled_; + + DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree); +}; + +} // namespace + +#endif // RE2_PREFILTER_TREE_H_ diff --git a/outside/re2/re2/prog.cc b/outside/re2/re2/prog.cc new file mode 100644 index 000000000..f326ffdd1 --- /dev/null +++ b/outside/re2/re2/prog.cc @@ -0,0 +1,343 @@ +// Copyright 2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Compiled regular expression representation. 
+// Tested by compile_test.cc + +#include "util/util.h" +#include "util/sparse_set.h" +#include "re2/prog.h" +#include "re2/stringpiece.h" + +namespace re2 { + +// Constructors per Inst opcode + +void Prog::Inst::InitAlt(uint32 out, uint32 out1) { + DCHECK_EQ(out_opcode_, 0); + set_out_opcode(out, kInstAlt); + out1_ = out1; +} + +void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) { + DCHECK_EQ(out_opcode_, 0); + set_out_opcode(out, kInstByteRange); + lo_ = lo & 0xFF; + hi_ = hi & 0xFF; + foldcase_ = foldcase; +} + +void Prog::Inst::InitCapture(int cap, uint32 out) { + DCHECK_EQ(out_opcode_, 0); + set_out_opcode(out, kInstCapture); + cap_ = cap; +} + +void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) { + DCHECK_EQ(out_opcode_, 0); + set_out_opcode(out, kInstEmptyWidth); + empty_ = empty; +} + +void Prog::Inst::InitMatch(int32 id) { + DCHECK_EQ(out_opcode_, 0); + set_opcode(kInstMatch); + match_id_ = id; +} + +void Prog::Inst::InitNop(uint32 out) { + DCHECK_EQ(out_opcode_, 0); + set_opcode(kInstNop); +} + +void Prog::Inst::InitFail() { + DCHECK_EQ(out_opcode_, 0); + set_opcode(kInstFail); +} + +string Prog::Inst::Dump() { + switch (opcode()) { + default: + return StringPrintf("opcode %d", static_cast(opcode())); + + case kInstAlt: + return StringPrintf("alt -> %d | %d", out(), out1_); + + case kInstAltMatch: + return StringPrintf("altmatch -> %d | %d", out(), out1_); + + case kInstByteRange: + return StringPrintf("byte%s [%02x-%02x] -> %d", + foldcase_ ? "/i" : "", + lo_, hi_, out()); + + case kInstCapture: + return StringPrintf("capture %d -> %d", cap_, out()); + + case kInstEmptyWidth: + return StringPrintf("emptywidth %#x -> %d", + static_cast(empty_), out()); + + case kInstMatch: + return StringPrintf("match! 
%d", match_id()); + + case kInstNop: + return StringPrintf("nop -> %d", out()); + + case kInstFail: + return StringPrintf("fail"); + } +} + +Prog::Prog() + : anchor_start_(false), + anchor_end_(false), + reversed_(false), + did_onepass_(false), + start_(0), + start_unanchored_(0), + size_(0), + byte_inst_count_(0), + bytemap_range_(0), + flags_(0), + onepass_statesize_(0), + inst_(NULL), + dfa_first_(NULL), + dfa_longest_(NULL), + dfa_mem_(0), + delete_dfa_(NULL), + unbytemap_(NULL), + onepass_nodes_(NULL), + onepass_start_(NULL) { +} + +Prog::~Prog() { + if (delete_dfa_) { + if (dfa_first_) + delete_dfa_(dfa_first_); + if (dfa_longest_) + delete_dfa_(dfa_longest_); + } + delete[] onepass_nodes_; + delete[] inst_; + delete[] unbytemap_; +} + +typedef SparseSet Workq; + +static inline void AddToQueue(Workq* q, int id) { + if (id != 0) + q->insert(id); +} + +static string ProgToString(Prog* prog, Workq* q) { + string s; + + for (Workq::iterator i = q->begin(); i != q->end(); ++i) { + int id = *i; + Prog::Inst* ip = prog->inst(id); + StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str()); + AddToQueue(q, ip->out()); + if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch) + AddToQueue(q, ip->out1()); + } + return s; +} + +string Prog::Dump() { + string map; + if (false) { // Debugging + int lo = 0; + StringAppendF(&map, "byte map:\n"); + for (int i = 0; i < bytemap_range_; i++) { + StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]); + lo = unbytemap_[i] + 1; + } + StringAppendF(&map, "\n"); + } + + Workq q(size_); + AddToQueue(&q, start_); + return map + ProgToString(this, &q); +} + +string Prog::DumpUnanchored() { + Workq q(size_); + AddToQueue(&q, start_unanchored_); + return ProgToString(this, &q); +} + +static bool IsMatch(Prog*, Prog::Inst*); + +// Peep-hole optimizer. +void Prog::Optimize() { + Workq q(size_); + + // Eliminate nops. Most are taken out during compilation + // but a few are hard to avoid. 
+ q.clear(); + AddToQueue(&q, start_); + for (Workq::iterator i = q.begin(); i != q.end(); ++i) { + int id = *i; + + Inst* ip = inst(id); + int j = ip->out(); + Inst* jp; + while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { + j = jp->out(); + } + ip->set_out(j); + AddToQueue(&q, ip->out()); + + if (ip->opcode() == kInstAlt) { + j = ip->out1(); + while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { + j = jp->out(); + } + ip->out1_ = j; + AddToQueue(&q, ip->out1()); + } + } + + // Insert kInstAltMatch instructions + // Look for + // ip: Alt -> j | k + // j: ByteRange [00-FF] -> ip + // k: Match + // or the reverse (the above is the greedy one). + // Rewrite Alt to AltMatch. + q.clear(); + AddToQueue(&q, start_); + for (Workq::iterator i = q.begin(); i != q.end(); ++i) { + int id = *i; + Inst* ip = inst(id); + AddToQueue(&q, ip->out()); + if (ip->opcode() == kInstAlt) + AddToQueue(&q, ip->out1()); + + if (ip->opcode() == kInstAlt) { + Inst* j = inst(ip->out()); + Inst* k = inst(ip->out1()); + if (j->opcode() == kInstByteRange && j->out() == id && + j->lo() == 0x00 && j->hi() == 0xFF && + IsMatch(this, k)) { + ip->set_opcode(kInstAltMatch); + continue; + } + if (IsMatch(this, j) && + k->opcode() == kInstByteRange && k->out() == id && + k->lo() == 0x00 && k->hi() == 0xFF) { + ip->set_opcode(kInstAltMatch); + } + } + } +} + +// Is ip a guaranteed match at end of text, perhaps after some capturing? 
+static bool IsMatch(Prog* prog, Prog::Inst* ip) { + for (;;) { + switch (ip->opcode()) { + default: + LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); + return false; + + case kInstAlt: + case kInstAltMatch: + case kInstByteRange: + case kInstFail: + case kInstEmptyWidth: + return false; + + case kInstCapture: + case kInstNop: + ip = prog->inst(ip->out()); + break; + + case kInstMatch: + return true; + } + } +} + +uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) { + int flags = 0; + + // ^ and \A + if (p == text.begin()) + flags |= kEmptyBeginText | kEmptyBeginLine; + else if (p[-1] == '\n') + flags |= kEmptyBeginLine; + + // $ and \z + if (p == text.end()) + flags |= kEmptyEndText | kEmptyEndLine; + else if (p < text.end() && p[0] == '\n') + flags |= kEmptyEndLine; + + // \b and \B + if (p == text.begin() && p == text.end()) { + // no word boundary here + } else if (p == text.begin()) { + if (IsWordChar(p[0])) + flags |= kEmptyWordBoundary; + } else if (p == text.end()) { + if (IsWordChar(p[-1])) + flags |= kEmptyWordBoundary; + } else { + if (IsWordChar(p[-1]) != IsWordChar(p[0])) + flags |= kEmptyWordBoundary; + } + if (!(flags & kEmptyWordBoundary)) + flags |= kEmptyNonWordBoundary; + + return flags; +} + +void Prog::MarkByteRange(int lo, int hi) { + DCHECK_GE(lo, 0); + DCHECK_GE(hi, 0); + DCHECK_LE(lo, 255); + DCHECK_LE(hi, 255); + DCHECK_LE(lo, hi); + if (0 < lo && lo <= 255) + byterange_.Set(lo - 1); + if (0 <= hi && hi <= 255) + byterange_.Set(hi); +} + +void Prog::ComputeByteMap() { + // Fill in bytemap with byte classes for prog_. + // Ranges of bytes that are treated as indistinguishable + // by the regexp program are mapped to a single byte class. + // The vector prog_->byterange() marks the end of each + // such range. 
+ const Bitmap<256>& v = byterange(); + + COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize); + uint8 n = 0; + uint32 bits = 0; + for (int i = 0; i < 256; i++) { + if ((i&31) == 0) + bits = v.Word(i >> 5); + bytemap_[i] = n; + n += bits & 1; + bits >>= 1; + } + bytemap_range_ = bytemap_[255] + 1; + unbytemap_ = new uint8[bytemap_range_]; + for (int i = 0; i < 256; i++) + unbytemap_[bytemap_[i]] = i; + + if (0) { // For debugging: use trivial byte map. + for (int i = 0; i < 256; i++) { + bytemap_[i] = i; + unbytemap_[i] = i; + } + bytemap_range_ = 256; + LOG(INFO) << "Using trivial bytemap."; + } +} + +} // namespace re2 + diff --git a/outside/re2/re2/prog.h b/outside/re2/re2/prog.h new file mode 100644 index 000000000..2cf65bc76 --- /dev/null +++ b/outside/re2/re2/prog.h @@ -0,0 +1,376 @@ +// Copyright 2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Compiled representation of regular expressions. +// See regexp.h for the Regexp class, which represents a regular +// expression symbolically. + +#ifndef RE2_PROG_H__ +#define RE2_PROG_H__ + +#include "util/util.h" +#include "re2/re2.h" + +namespace re2 { + +// Simple fixed-size bitmap of Bits bits packed into 32-bit words; masks are built from 1U so selecting bit 31 never shifts into the sign bit of a signed int. +template +class Bitmap { + public: + Bitmap() { Reset(); } + int Size() { return Bits; } + + void Reset() { + for (int i = 0; i < Words; i++) + w_[i] = 0; + } + bool Get(int k) const { + return w_[k >> WordLog] & (1U<<(k & 31)); + } + void Set(int k) { + w_[k >> WordLog] |= 1U<<(k & 31); + } + void Clear(int k) { + w_[k >> WordLog] &= ~(1U<<(k & 31)); + } + uint32 Word(int i) const { + return w_[i]; + } + + private: + static const int WordLog = 5; + static const int Words = (Bits+31)/32; + uint32 w_[Words]; + DISALLOW_EVIL_CONSTRUCTORS(Bitmap); +}; + + +// Opcodes for Inst +enum InstOp { + kInstAlt = 0, // choose between out_ and out1_ + kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
+ kInstByteRange, // next (possibly case-folded) byte must be in [lo_, hi_] + kInstCapture, // capturing parenthesis number cap_ + kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_ + kInstMatch, // found a match! + kInstNop, // no-op; occasionally unavoidable + kInstFail, // never match; occasionally unavoidable +}; + +// Bit flags for empty-width specials +enum EmptyOp { + kEmptyBeginLine = 1<<0, // ^ - beginning of line + kEmptyEndLine = 1<<1, // $ - end of line + kEmptyBeginText = 1<<2, // \A - beginning of text + kEmptyEndText = 1<<3, // \z - end of text + kEmptyWordBoundary = 1<<4, // \b - word boundary + kEmptyNonWordBoundary = 1<<5, // \B - not \b + kEmptyAllFlags = (1<<6)-1, +}; + +class Regexp; + +class DFA; +struct OneState; + +// Compiled form of regexp program. +class Prog { + public: + Prog(); + ~Prog(); + + // Single instruction in regexp program. + class Inst { + public: + Inst() : out_opcode_(0), out1_(0) { } + + // Constructors per opcode + void InitAlt(uint32 out, uint32 out1); + void InitByteRange(int lo, int hi, int foldcase, uint32 out); + void InitCapture(int cap, uint32 out); + void InitEmptyWidth(EmptyOp empty, uint32 out); + void InitMatch(int id); + void InitNop(uint32 out); + void InitFail(); + + // Getters + int id(Prog* p) { return this - p->inst_; } + InstOp opcode() { return static_cast(out_opcode_&7); } + int out() { return out_opcode_>>3; } + int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } + int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } + int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } + int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } + int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; } + int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } + EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } + bool greedy(Prog *p) { + DCHECK_EQ(opcode(), kInstAltMatch); + return 
p->inst(out())->opcode() == kInstByteRange; + } + + // Does this inst (a kInstByteRange) match c? + inline bool Matches(int c) { + DCHECK_EQ(opcode(), kInstByteRange); + if (foldcase_ && 'A' <= c && c <= 'Z') + c += 'a' - 'A'; + return lo_ <= c && c <= hi_; + } + + // Returns string representation for debugging. + string Dump(); + + // Maximum instruction id. + // (Must fit in out_opcode_, and PatchList steals another bit.) + static const int kMaxInst = (1<<28) - 1; + + private: + void set_opcode(InstOp opcode) { + out_opcode_ = (out()<<3) | opcode; + } + + void set_out(int out) { + out_opcode_ = (out<<3) | opcode(); + } + + void set_out_opcode(int out, InstOp opcode) { + out_opcode_ = (out<<3) | opcode; + } + + uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode + union { // additional instruction arguments: + uint32 out1_; // opcode == kInstAlt or kInstAltMatch + // alternate next instruction + + int32 cap_; // opcode == kInstCapture + // Index of capture register (holds text + // position recorded by capturing parentheses). + // For \n (the submatch for the nth parentheses), + // the left parenthesis captures into register 2*n + // and the right one captures into register 2*n+1. + + int32 match_id_; // opcode == kInstMatch + // Match ID to identify this match (for re2::Set). + + struct { // opcode == kInstByteRange + uint8 lo_; // byte range is lo_-hi_ inclusive + uint8 hi_; // + uint8 foldcase_; // convert A-Z to a-z before checking range. + }; + + EmptyOp empty_; // opcode == kInstEmptyWidth + // empty_ is bitwise OR of kEmpty* flags above. + }; + + friend class Compiler; + friend struct PatchList; + friend class Prog; + + DISALLOW_EVIL_CONSTRUCTORS(Inst); + }; + + // Whether to anchor the search.
+ enum Anchor { + kUnanchored, // match anywhere + kAnchored, // match only starting at beginning of text + }; + + // Kind of match to look for (for anchor != kFullMatch) + // + // kLongestMatch mode finds the overall longest + // match but still makes its submatch choices the way + // Perl would, not in the way prescribed by POSIX. + // The POSIX rules are much more expensive to implement, + // and no one has needed them. + // + // kFullMatch is not strictly necessary -- we could use + // kLongestMatch and then check the length of the match -- but + // the matching code can run faster if it knows to consider only + // full matches. + enum MatchKind { + kFirstMatch, // like Perl, PCRE + kLongestMatch, // like egrep or POSIX + kFullMatch, // match only entire text; implies anchor==kAnchored + kManyMatch // for SearchDFA, records set of matches + }; + + Inst *inst(int id) { return &inst_[id]; } + int start() { return start_; } + int start_unanchored() { return start_unanchored_; } + void set_start(int start) { start_ = start; } + void set_start_unanchored(int start) { start_unanchored_ = start; } + int64 size() { return size_; } + bool reversed() { return reversed_; } + void set_reversed(bool reversed) { reversed_ = reversed; } + int64 byte_inst_count() { return byte_inst_count_; } + const Bitmap<256>& byterange() { return byterange_; } + void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; } + int64 dfa_mem() { return dfa_mem_; } + int flags() { return flags_; } + void set_flags(int flags) { flags_ = flags; } + bool anchor_start() { return anchor_start_; } + void set_anchor_start(bool b) { anchor_start_ = b; } + bool anchor_end() { return anchor_end_; } + void set_anchor_end(bool b) { anchor_end_ = b; } + int bytemap_range() { return bytemap_range_; } + const uint8* bytemap() { return bytemap_; } + + // Returns string representation of program for debugging. 
+ string Dump(); + string DumpUnanchored(); + + // Record that at some point in the prog, the bytes in the range + // lo-hi (inclusive) are treated as different from bytes outside the range. + // Tracking this lets the DFA collapse commonly-treated byte ranges + // when recording state pointers, greatly reducing its memory footprint. + void MarkByteRange(int lo, int hi); + + // Returns the set of kEmpty flags that are in effect at + // position p within context. + static uint32 EmptyFlags(const StringPiece& context, const char* p); + + // Returns whether byte c is a word character: ASCII only. + // Used by the implementation of \b and \B. + // This is not right for Unicode, but: + // - it's hard to get right in a byte-at-a-time matching world + // (the DFA has only one-byte lookahead). + // - even if the lookahead were possible, the Progs would be huge. + // This crude approximation is the same one PCRE uses. + static bool IsWordChar(uint8 c) { + return ('A' <= c && c <= 'Z') || + ('a' <= c && c <= 'z') || + ('0' <= c && c <= '9') || + c == '_'; + } + + // Execution engines. They all search for the regexp (run the prog) + // in text, which is in the larger context (used for ^ $ \b etc). + // Anchor and kind control the kind of search. + // Returns true if match found, false if not. + // If match found, fills match[0..nmatch-1] with submatch info. + // match[0] is overall match, match[1] is first set of parens, etc. + // If a particular submatch is not matched during the regexp match, + // it is set to NULL. + // + // Matching text == StringPiece(NULL, 0) is treated as any other empty + // string, but note that on return, it will not be possible to distinguish + // submatches that matched that empty string from submatches that didn't + // match anything. Either way, match[i] == NULL. + + // Search using NFA: can find submatches but kind of slow. 
+ bool SearchNFA(const StringPiece& text, const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch); + + // Search using DFA: much faster than NFA but only finds + // end of match and can use a lot more memory. + // Returns whether a match was found. + // If the DFA runs out of memory, sets *failed to true and returns false. + // If matches != NULL and kind == kManyMatch and there is a match, + // SearchDFA fills matches with the match IDs of the final matching state. + bool SearchDFA(const StringPiece& text, const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match0, bool* failed, + vector* matches); + + // Build the entire DFA for the given match kind. FOR TESTING ONLY. + // Usually the DFA is built out incrementally, as needed, which + // avoids lots of unnecessary work. This function is useful only + // for testing purposes. Returns number of states. + int BuildEntireDFA(MatchKind kind); + + // Compute byte map. + void ComputeByteMap(); + + // Run peep-hole optimizer on program. + void Optimize(); + + // One-pass NFA: only correct if IsOnePass() is true, + // but much faster than NFA (competitive with PCRE) + // for those expressions. + bool IsOnePass(); + bool SearchOnePass(const StringPiece& text, const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch); + + // Bit-state backtracking. Fast on small cases but uses memory + // proportional to the product of the program size and the text size. + bool SearchBitState(const StringPiece& text, const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch); + + static const int kMaxOnePassCapture = 5; // $0 through $4 + + // Backtracking search: the gold standard against which the other + // implementations are checked. FOR TESTING ONLY. + // It allocates a ton of memory to avoid running forever. + // It is also recursive, so can't use in production (will overflow stacks). 
+ // The name "Unsafe" here is supposed to be a flag that + // you should not be using this function. + bool UnsafeSearchBacktrack(const StringPiece& text, + const StringPiece& context, + Anchor anchor, MatchKind kind, + StringPiece* match, int nmatch); + + // Computes range for any strings matching regexp. The min and max can in + // some cases be arbitrarily precise, so the caller gets to specify the + // maximum desired length of string returned. + // + // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any + // string s that is an anchored match for this regexp satisfies + // min <= s && s <= max. + // + // Note that PossibleMatchRange() will only consider the first copy of an + // infinitely repeated element (i.e., any regexp element followed by a '*' or + // '+' operator). Regexps with "{N}" constructions are not affected, as those + // do not compile down to infinite repetitions. + // + // Returns true on success, false on error. + bool PossibleMatchRange(string* min, string* max, int maxlen); + + // Compiles a collection of regexps to Prog. Each regexp will have + // its own Match instruction recording the index in the vector. + static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, + Regexp* re); + + private: + friend class Compiler; + + DFA* GetDFA(MatchKind kind); + + bool anchor_start_; // regexp has explicit start anchor + bool anchor_end_; // regexp has explicit end anchor + bool reversed_; // whether program runs backward over input + bool did_onepass_; // has IsOnePass been called? 
+ + int start_; // entry point for program + int start_unanchored_; // unanchored entry point for program + int size_; // number of instructions + int byte_inst_count_; // number of kInstByteRange instructions + int bytemap_range_; // bytemap_[x] < bytemap_range_ + int flags_; // regexp parse flags + int onepass_statesize_; // byte size of each OneState* node + + Inst* inst_; // pointer to instruction array + + Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_ + DFA* volatile dfa_first_; // DFA cached for kFirstMatch + DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch + int64 dfa_mem_; // Maximum memory for DFAs. + void (*delete_dfa_)(DFA* dfa); + + Bitmap<256> byterange_; // byterange.Get(x) true if x ends a + // commonly-treated byte range. + uint8 bytemap_[256]; // map from input bytes to byte classes + uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x + + uint8* onepass_nodes_; // data for OnePass nodes + OneState* onepass_start_; // start node for OnePass program + + DISALLOW_EVIL_CONSTRUCTORS(Prog); +}; + +} // namespace re2 + +#endif // RE2_PROG_H__ diff --git a/outside/re2/re2/re2.cc b/outside/re2/re2/re2.cc new file mode 100644 index 000000000..d67ef45c1 --- /dev/null +++ b/outside/re2/re2/re2.cc @@ -0,0 +1,1218 @@ +// Copyright 2003-2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression interface RE2. +// +// Originally the PCRE C++ wrapper, but adapted to use +// the new automata-based regular expression engines. 
+ +#include "re2/re2.h" + +#include +#include +#include +#include +#include "util/atomicops.h" +#include "util/util.h" +#include "util/flags.h" +#include "re2/prog.h" +#include "re2/regexp.h" + +DEFINE_bool(trace_re2, false, "trace RE2 execution"); + +namespace re2 { + +// Maximum number of args we can set +static const int kMaxArgs = 16; +static const int kVecSize = 1+kMaxArgs; + +const VariadicFunction2 RE2::FullMatch = {}; +const VariadicFunction2 RE2::PartialMatch = {}; +const VariadicFunction2 RE2::Consume = {}; +const VariadicFunction2 RE2::FindAndConsume = {}; + +// This will trigger LNK2005 error in MSVC. +#ifndef COMPILER_MSVC +const int RE2::Options::kDefaultMaxMem; // initialized in re2.h +#endif // COMPILER_MSVC + +RE2::Options::Options(RE2::CannedOptions opt) + : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), + posix_syntax_(opt == RE2::POSIX), + longest_match_(opt == RE2::POSIX), + log_errors_(opt != RE2::Quiet), + max_mem_(kDefaultMaxMem), + literal_(false), + never_nl_(false), + dot_nl_(false), + never_capture_(false), + case_sensitive_(true), + perl_classes_(false), + word_boundary_(false), + one_line_(false) { +} + +// static empty things for use as const references. +// To avoid global constructors, initialized on demand. +GLOBAL_MUTEX(empty_mutex); +static const string *empty_string; +static const map *empty_named_groups; +static const map *empty_group_names; + +static void InitEmpty() { + GLOBAL_MUTEX_LOCK(empty_mutex); + if (empty_string == NULL) { + empty_string = new string; + empty_named_groups = new map; + empty_group_names = new map; + } + GLOBAL_MUTEX_UNLOCK(empty_mutex); +} + +// Converts from Regexp error code to RE2 error code. +// Maybe some day they will diverge. In any event, this +// hides the existence of Regexp from RE2 users. 
+static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { + switch (code) { + case re2::kRegexpSuccess: + return RE2::NoError; + case re2::kRegexpInternalError: + return RE2::ErrorInternal; + case re2::kRegexpBadEscape: + return RE2::ErrorBadEscape; + case re2::kRegexpBadCharClass: + return RE2::ErrorBadCharClass; + case re2::kRegexpBadCharRange: + return RE2::ErrorBadCharRange; + case re2::kRegexpMissingBracket: + return RE2::ErrorMissingBracket; + case re2::kRegexpMissingParen: + return RE2::ErrorMissingParen; + case re2::kRegexpTrailingBackslash: + return RE2::ErrorTrailingBackslash; + case re2::kRegexpRepeatArgument: + return RE2::ErrorRepeatArgument; + case re2::kRegexpRepeatSize: + return RE2::ErrorRepeatSize; + case re2::kRegexpRepeatOp: + return RE2::ErrorRepeatOp; + case re2::kRegexpBadPerlOp: + return RE2::ErrorBadPerlOp; + case re2::kRegexpBadUTF8: + return RE2::ErrorBadUTF8; + case re2::kRegexpBadNamedCapture: + return RE2::ErrorBadNamedCapture; + } + return RE2::ErrorInternal; +} + +static string trunc(const StringPiece& pattern) { + if (pattern.size() < 100) + return pattern.as_string(); + return pattern.substr(0, 100).as_string() + "..."; +} + + +RE2::RE2(const char* pattern) { + Init(pattern, DefaultOptions); +} + +RE2::RE2(const string& pattern) { + Init(pattern, DefaultOptions); +} + +RE2::RE2(const StringPiece& pattern) { + Init(pattern, DefaultOptions); +} + +RE2::RE2(const StringPiece& pattern, const Options& options) { + Init(pattern, options); +} + +int RE2::Options::ParseFlags() const { + int flags = Regexp::ClassNL; + switch (encoding()) { + default: + if (log_errors()) + LOG(ERROR) << "Unknown encoding " << encoding(); + break; + case RE2::Options::EncodingUTF8: + break; + case RE2::Options::EncodingLatin1: + flags |= Regexp::Latin1; + break; + } + + if (!posix_syntax()) + flags |= Regexp::LikePerl; + + if (literal()) + flags |= Regexp::Literal; + + if (never_nl()) + flags |= Regexp::NeverNL; + + if (dot_nl()) + flags |= 
Regexp::DotNL; + + if (never_capture()) + flags |= Regexp::NeverCapture; + + if (!case_sensitive()) + flags |= Regexp::FoldCase; + + if (perl_classes()) + flags |= Regexp::PerlClasses; + + if (word_boundary()) + flags |= Regexp::PerlB; + + if (one_line()) + flags |= Regexp::OneLine; + + return flags; +} + +void RE2::Init(const StringPiece& pattern, const Options& options) { + mutex_ = new Mutex; + pattern_ = pattern.as_string(); + options_.Copy(options); + InitEmpty(); + error_ = empty_string; + error_code_ = NoError; + suffix_regexp_ = NULL; + entire_regexp_ = NULL; + prog_ = NULL; + rprog_ = NULL; + named_groups_ = NULL; + group_names_ = NULL; + num_captures_ = -1; + + RegexpStatus status; + entire_regexp_ = Regexp::Parse( + pattern_, + static_cast(options_.ParseFlags()), + &status); + if (entire_regexp_ == NULL) { + if (error_ == empty_string) + error_ = new string(status.Text()); + if (options_.log_errors()) { + LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " + << status.Text(); + } + error_arg_ = status.error_arg().as_string(); + error_code_ = RegexpErrorToRE2(status.code()); + return; + } + + prefix_.clear(); + prefix_foldcase_ = false; + re2::Regexp* suffix; + if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix)) + suffix_regexp_ = suffix; + else + suffix_regexp_ = entire_regexp_->Incref(); + + // Two thirds of the memory goes to the forward Prog, + // one third to the reverse prog, because the forward + // Prog has two DFAs but the reverse prog has one. 
+ prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3); + if (prog_ == NULL) { + if (options_.log_errors()) + LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'"; + error_ = new string("pattern too large - compile failed"); + error_code_ = RE2::ErrorPatternTooLarge; + return; + } + + // Could delay this until the first match call that + // cares about submatch information, but the one-pass + // machine's memory gets cut from the DFA memory budget, + // and that is harder to do if the DFA has already + // been built. + is_one_pass_ = prog_->IsOnePass(); +} + +// Returns rprog_, computing it if needed. +re2::Prog* RE2::ReverseProg() const { + MutexLock l(mutex_); + if (rprog_ == NULL && error_ == empty_string) { + rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3); + if (rprog_ == NULL) { + if (options_.log_errors()) + LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'"; + error_ = new string("pattern too large - reverse compile failed"); + error_code_ = RE2::ErrorPatternTooLarge; + return NULL; + } + } + return rprog_; +} + +RE2::~RE2() { + if (suffix_regexp_) + suffix_regexp_->Decref(); + if (entire_regexp_) + entire_regexp_->Decref(); + delete mutex_; + delete prog_; + delete rprog_; + if (error_ != empty_string) + delete error_; + if (named_groups_ != NULL && named_groups_ != empty_named_groups) + delete named_groups_; + if (group_names_ != NULL && group_names_ != empty_group_names) + delete group_names_; +} + +int RE2::ProgramSize() const { + if (prog_ == NULL) + return -1; + return prog_->size(); +} + +// Returns named_groups_, computing it if needed. +const map& RE2::NamedCapturingGroups() const { + MutexLock l(mutex_); + if (!ok()) + return *empty_named_groups; + if (named_groups_ == NULL) { + named_groups_ = suffix_regexp_->NamedCaptures(); + if (named_groups_ == NULL) + named_groups_ = empty_named_groups; + } + return *named_groups_; +} + +// Returns group_names_, computing it if needed. 
+const map& RE2::CapturingGroupNames() const { + MutexLock l(mutex_); + if (!ok()) + return *empty_group_names; + if (group_names_ == NULL) { + group_names_ = suffix_regexp_->CaptureNames(); + if (group_names_ == NULL) + group_names_ = empty_group_names; + } + return *group_names_; +} + +/***** Convenience interfaces *****/ + +bool RE2::FullMatchN(const StringPiece& text, const RE2& re, + const Arg* const args[], int n) { + return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n); +} + +bool RE2::PartialMatchN(const StringPiece& text, const RE2& re, + const Arg* const args[], int n) { + return re.DoMatch(text, UNANCHORED, NULL, args, n); +} + +bool RE2::ConsumeN(StringPiece* input, const RE2& re, + const Arg* const args[], int n) { + int consumed; + if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, + const Arg* const args[], int n) { + int consumed; + if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +// Returns the maximum submatch needed for the rewrite to be done by Replace(). +// E.g. if rewrite == "foo \\2,\\1", returns 2. A backslash followed by anything other than an ASCII digit (or by end of input) contributes nothing. +int RE2::MaxSubmatch(const StringPiece& rewrite) { + int max = 0; + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + if (*s == '\\') { + s++; + int c = (s < end) ? 
*s : -1; + if ('0' <= c && c <= '9') { // explicit range instead of isdigit(): c comes from a plain char and may be negative, which <ctype.h> does not allow + int n = (c - '0'); + if (n > max) + max = n; + } + } + } + return max; +} + +bool RE2::Replace(string *str, + const RE2& re, + const StringPiece& rewrite) { + StringPiece vec[kVecSize]; + int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > arraysize(vec)) + return false; + if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) + return false; + + string s; + if (!re.Rewrite(&s, rewrite, vec, nvec)) + return false; + + assert(vec[0].begin() >= str->data()); + assert(vec[0].end() <= str->data()+str->size()); + str->replace(vec[0].data() - str->data(), vec[0].size(), s); + return true; +} + +int RE2::GlobalReplace(string *str, + const RE2& re, + const StringPiece& rewrite) { + StringPiece vec[kVecSize]; + int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > arraysize(vec)) + return false; + + const char* p = str->data(); + const char* ep = p + str->size(); + const char* lastend = NULL; + string out; + int count = 0; + while (p <= ep) { + if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec)) + break; + if (p < vec[0].begin()) + out.append(p, vec[0].begin() - p); + if (vec[0].begin() == lastend && vec[0].size() == 0) { + // Disallow empty match at end of last match: skip ahead.
+ if (p < ep) + out.append(p, 1); + p++; + continue; + } + re.Rewrite(&out, rewrite, vec, nvec); + p = vec[0].end(); + lastend = p; + count++; + } + + if (count == 0) + return 0; + + if (p < ep) + out.append(p, ep - p); + swap(out, *str); + return count; +} + +bool RE2::Extract(const StringPiece &text, + const RE2& re, + const StringPiece &rewrite, + string *out) { + StringPiece vec[kVecSize]; + int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > arraysize(vec)) + return false; + + if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) + return false; + + out->clear(); + return re.Rewrite(out, rewrite, vec, nvec); +} + +string RE2::QuoteMeta(const StringPiece& unquoted) { + string result; + result.reserve(unquoted.size() << 1); + + // Escape any ascii character not in [A-Za-z_0-9]. + // + // Note that it's legal to escape a character even if it has no + // special meaning in a regular expression -- so this function does + // that. (This also makes it identical to the perl function of the + // same name except for the null-character special case; + // see `perldoc -f quotemeta`.) + for (int ii = 0; ii < unquoted.length(); ++ii) { + // Note that using 'isalnum' here raises the benchmark time from + // 32ns to 58ns: + if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && + (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && + (unquoted[ii] < '0' || unquoted[ii] > '9') && + unquoted[ii] != '_' && + // If this is the part of a UTF8 or Latin1 character, we need + // to copy this byte without escaping. Experimentally this is + // what works correctly with the regexp library. + !(unquoted[ii] & 128)) { + if (unquoted[ii] == '\0') { // Special handling for null chars. + // Note that this special handling is not strictly required for RE2, + // but this quoting is required for other regexp libraries such as + // PCRE. + // Can't use "\\0" since the next character might be a digit. 
+ result += "\\x00"; + continue; + } + result += '\\'; + } + result += unquoted[ii]; + } + + return result; +} + +bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { + if (prog_ == NULL) + return false; + + int n = prefix_.size(); + if (n > maxlen) + n = maxlen; + + // Determine initial min max from prefix_ literal. + string pmin, pmax; + pmin = prefix_.substr(0, n); + pmax = prefix_.substr(0, n); + if (prefix_foldcase_) { + // prefix is ASCII lowercase; change pmin to uppercase. + for (int i = 0; i < n; i++) { + if ('a' <= pmin[i] && pmin[i] <= 'z') + pmin[i] += 'A' - 'a'; + } + } + + // Add to prefix min max using PossibleMatchRange on regexp. + string dmin, dmax; + maxlen -= n; + if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { + pmin += dmin; + pmax += dmax; + } else if (pmax.size() > 0) { + // prog_->PossibleMatchRange has failed us, + // but we still have useful information from prefix_. + // Round up pmax to allow any possible suffix. + pmax = PrefixSuccessor(pmax); + } else { + // Nothing useful. + *min = ""; + *max = ""; + return false; + } + + *min = pmin; + *max = pmax; + return true; +} + +// Avoid possible locale nonsense in standard strcasecmp. +// The string a is known to be all lowercase. 
+static int ascii_strcasecmp(const char* a, const char* b, int len) { + const char *ae = a + len; + + for (; a < ae; a++, b++) { + uint8 x = *a; + uint8 y = *b; + if ('A' <= y && y <= 'Z') + y += 'a' - 'A'; + if (x != y) + return x - y; + } + return 0; +} + + +/***** Actual matching and rewriting code *****/ + +bool RE2::Match(const StringPiece& text, + int startpos, + int endpos, + Anchor re_anchor, + StringPiece* submatch, + int nsubmatch) const { + if (!ok() || suffix_regexp_ == NULL) { + if (options_.log_errors()) + LOG(ERROR) << "Invalid RE2: " << *error_; + return false; + } + + if (startpos < 0 || startpos > endpos || endpos > text.size()) { + if (options_.log_errors()) + LOG(ERROR) << "RE2: invalid startpos, endpos pair."; + return false; + } + + StringPiece subtext = text; + subtext.remove_prefix(startpos); + subtext.remove_suffix(text.size() - endpos); + + // Use DFAs to find exact location of match, filter out non-matches. + + // Don't ask for the location if we won't use it. + // SearchDFA can do extra optimizations in that case. + StringPiece match; + StringPiece* matchp = &match; + if (nsubmatch == 0) + matchp = NULL; + + int ncap = 1 + NumberOfCapturingGroups(); + if (ncap > nsubmatch) + ncap = nsubmatch; + + // If the regexp is anchored explicitly, must not be in middle of text. + if (prog_->anchor_start() && startpos != 0) + return false; + + // If the regexp is anchored explicitly, update re_anchor + // so that we can potentially fall into a faster case below. + if (prog_->anchor_start() && prog_->anchor_end()) + re_anchor = ANCHOR_BOTH; + else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) + re_anchor = ANCHOR_START; + + // Check for the required prefix, if any. 
+ int prefixlen = 0; + if (!prefix_.empty()) { + if (startpos != 0) + return false; + prefixlen = prefix_.size(); + if (prefixlen > subtext.size()) + return false; + if (prefix_foldcase_) { + if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) + return false; + } else { + if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) + return false; + } + subtext.remove_prefix(prefixlen); + // If there is a required prefix, the anchor must be at least ANCHOR_START. + if (re_anchor != ANCHOR_BOTH) + re_anchor = ANCHOR_START; + } + + Prog::Anchor anchor = Prog::kUnanchored; + Prog::MatchKind kind = Prog::kFirstMatch; + if (options_.longest_match()) + kind = Prog::kLongestMatch; + bool skipped_test = false; + + bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture); + + // SearchBitState allocates a bit vector of size prog_->size() * text.size(). + // It also allocates a stack of 3-word structures which could potentially + // grow as large as prog_->size() * text.size() but in practice is much + // smaller. + // Conditions for using SearchBitState: + const int MaxBitStateProg = 500; // prog_->size() <= Max. + const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits) + bool can_bit_state = prog_->size() <= MaxBitStateProg; + int bit_state_text_max = MaxBitStateVector / prog_->size(); + + bool dfa_failed = false; + switch (re_anchor) { + default: + case UNANCHORED: { + if (!prog_->SearchDFA(subtext, text, anchor, kind, + matchp, &dfa_failed, NULL)) { + if (dfa_failed) { + // Fall back to NFA below. 
+ skipped_test = true; + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " DFA failed."; + break; + } + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " used DFA - no match."; + return false; + } + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " used DFA - match"; + if (matchp == NULL) // Matched. Don't care where + return true; + // SearchDFA set match[0].end() but didn't know where the + // match started. Run the regexp backward from match[0].end() + // to find the longest possible match -- that's where it started. + Prog* prog = ReverseProg(); + if (prog == NULL) + return false; + if (!prog->SearchDFA(match, text, Prog::kAnchored, + Prog::kLongestMatch, &match, &dfa_failed, NULL)) { + if (dfa_failed) { + // Fall back to NFA below. + skipped_test = true; + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " reverse DFA failed."; + break; + } + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " DFA inconsistency."; + if (options_.log_errors()) + LOG(ERROR) << "DFA inconsistency"; + return false; + } + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " used reverse DFA."; + break; + } + + case ANCHOR_BOTH: + case ANCHOR_START: + if (re_anchor == ANCHOR_BOTH) + kind = Prog::kFullMatch; + anchor = Prog::kAnchored; + + // If only a small amount of text and need submatch + // information anyway and we're going to use OnePass or BitState + // to get it, we might as well not even bother with the DFA: + // OnePass or BitState will be fast enough. + // On tiny texts, OnePass outruns even the DFA, and + // it doesn't have the shared state and occasional mutex that + // the DFA does. 
+ if (can_one_pass && text.size() <= 4096 && + (ncap > 1 || text.size() <= 8)) { + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " skipping DFA for OnePass."; + skipped_test = true; + break; + } + if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) { + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " skipping DFA for BitState."; + skipped_test = true; + break; + } + if (!prog_->SearchDFA(subtext, text, anchor, kind, + &match, &dfa_failed, NULL)) { + if (dfa_failed) { + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " DFA failed."; + skipped_test = true; + break; + } + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " used DFA - no match."; + return false; + } + break; + } + + if (!skipped_test && ncap <= 1) { + // We know exactly where it matches. That's enough. + if (ncap == 1) + submatch[0] = match; + } else { + StringPiece subtext1; + if (skipped_test) { + // DFA ran out of memory or was skipped: + // need to search in entire original text. + subtext1 = subtext; + } else { + // DFA found the exact match location: + // let NFA run an anchored, full match search + // to find submatch locations. 
+ subtext1 = match; + anchor = Prog::kAnchored; + kind = Prog::kFullMatch; + } + + if (can_one_pass && anchor != Prog::kUnanchored) { + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " using OnePass."; + if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) { + if (!skipped_test && options_.log_errors()) + LOG(ERROR) << "SearchOnePass inconsistency"; + return false; + } + } else if (can_bit_state && subtext1.size() <= bit_state_text_max) { + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " using BitState."; + if (!prog_->SearchBitState(subtext1, text, anchor, + kind, submatch, ncap)) { + if (!skipped_test && options_.log_errors()) + LOG(ERROR) << "SearchBitState inconsistency"; + return false; + } + } else { + if (FLAGS_trace_re2) + LOG(INFO) << "Match " << trunc(pattern_) + << " [" << CEscape(subtext) << "]" + << " using NFA."; + if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) { + if (!skipped_test && options_.log_errors()) + LOG(ERROR) << "SearchNFA inconsistency"; + return false; + } + } + } + + // Adjust overall match for required prefix that we stripped off. + if (prefixlen > 0 && nsubmatch > 0) + submatch[0] = StringPiece(submatch[0].begin() - prefixlen, + submatch[0].size() + prefixlen); + + // Zero submatches that don't exist in the regexp. + for (int i = ncap; i < nsubmatch; i++) + submatch[i] = NULL; + return true; +} + +// Internal matcher - like Match() but takes Args not StringPieces. +bool RE2::DoMatch(const StringPiece& text, + Anchor anchor, + int* consumed, + const Arg* const* args, + int n) const { + if (!ok()) { + if (options_.log_errors()) + LOG(ERROR) << "Invalid RE2: " << *error_; + return false; + } + + // Count number of capture groups needed. 
+ int nvec; + if (n == 0 && consumed == NULL) + nvec = 0; + else + nvec = n+1; + + StringPiece* vec; + StringPiece stkvec[kVecSize]; + StringPiece* heapvec = NULL; + + if (nvec <= arraysize(stkvec)) { + vec = stkvec; + } else { + vec = new StringPiece[nvec]; + heapvec = vec; + } + + if (!Match(text, 0, text.size(), anchor, vec, nvec)) { + delete[] heapvec; + return false; + } + + if(consumed != NULL) + *consumed = vec[0].end() - text.begin(); + + if (n == 0 || args == NULL) { + // We are not interested in results + delete[] heapvec; + return true; + } + + int ncap = NumberOfCapturingGroups(); + if (ncap < n) { + // RE has fewer capturing groups than number of arg pointers passed in + VLOG(1) << "Asked for " << n << " but only have " << ncap; + delete[] heapvec; + return false; + } + + // If we got here, we must have matched the whole pattern. + for (int i = 0; i < n; i++) { + const StringPiece& s = vec[i+1]; + if (!args[i]->Parse(s.data(), s.size())) { + // TODO: Should we indicate what the error was? + VLOG(1) << "Parse error on #" << i << " " << s << " " + << (void*)s.data() << "/" << s.size(); + delete[] heapvec; + return false; + } + } + + delete[] heapvec; + return true; +} + +// Append the "rewrite" string, with backslash subsitutions from "vec", +// to string "out". +bool RE2::Rewrite(string *out, const StringPiece &rewrite, + const StringPiece *vec, int veclen) const { + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c == '\\') { + s++; + c = (s < end) ? 
*s : -1; + if (isdigit(c)) { + int n = (c - '0'); + if (n >= veclen) { + if (options_.log_errors()) { + LOG(ERROR) << "requested group " << n + << " in regexp " << rewrite.data(); + } + return false; + } + StringPiece snip = vec[n]; + if (snip.size() > 0) + out->append(snip.data(), snip.size()); + } else if (c == '\\') { + out->push_back('\\'); + } else { + if (options_.log_errors()) + LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); + return false; + } + } else { + out->push_back(c); + } + } + return true; +} + +// Return the number of capturing subpatterns, or -1 if the +// regexp wasn't valid on construction. +int RE2::NumberOfCapturingGroups() const { + if (suffix_regexp_ == NULL) + return -1; + int n; + ATOMIC_LOAD_RELAXED(n, &num_captures_); + if (n == -1) { + n = suffix_regexp_->NumCaptures(); + ATOMIC_STORE_RELAXED(&num_captures_, n); + } + return n; +} + +// Checks that the rewrite string is well-formed with respect to this +// regular expression. +bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { + int max_token = -1; + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c != '\\') { + continue; + } + if (++s == end) { + *error = "Rewrite schema error: '\\' not allowed at end."; + return false; + } + c = *s; + if (c == '\\') { + continue; + } + if (!isdigit(c)) { + *error = "Rewrite schema error: " + "'\\' must be followed by a digit or '\\'."; + return false; + } + int n = (c - '0'); + if (max_token < n) { + max_token = n; + } + } + + if (max_token > NumberOfCapturingGroups()) { + SStringPrintf(error, "Rewrite schema requests %d matches, " + "but the regexp only has %d parenthesized subexpressions.", + max_token, NumberOfCapturingGroups()); + return false; + } + return true; +} + +/***** Parsers for various types *****/ + +bool RE2::Arg::parse_null(const char* str, int n, void* dest) { + // We fail if somebody asked us to store into a non-NULL void* pointer + 
return (dest == NULL); +} + +bool RE2::Arg::parse_string(const char* str, int n, void* dest) { + if (dest == NULL) return true; + reinterpret_cast(dest)->assign(str, n); + return true; +} + +bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) { + if (dest == NULL) return true; + reinterpret_cast(dest)->set(str, n); + return true; +} + +bool RE2::Arg::parse_char(const char* str, int n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = str[0]; + return true; +} + +bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = str[0]; + return true; +} + +// Largest number spec that we are willing to parse +static const int kMaxNumberLength = 32; + +// REQUIRES "buf" must have length at least kMaxNumberLength+1 +// Copies "str" into "buf" and null-terminates. +// Overwrites *np with the new length. +static const char* TerminateNumber(char* buf, const char* str, int* np) { + int n = *np; + if (n <= 0) return ""; + if (n > 0 && isspace(*str)) { + // We are less forgiving than the strtoxxx() routines and do not + // allow leading spaces. + return ""; + } + + // Although buf has a fixed maximum size, we can still handle + // arbitrarily large integers correctly by omitting leading zeros. + // (Numbers that are still too long will be out of range.) + // Before deciding whether str is too long, + // remove leading zeros with s/000+/00/. + // Leaving the leading two zeros in place means that + // we don't change 0000x123 (invalid) into 0x123 (valid). + // Skip over leading - before replacing. 
+ bool neg = false; + if (n >= 1 && str[0] == '-') { + neg = true; + n--; + str++; + } + + if (n >= 3 && str[0] == '0' && str[1] == '0') { + while (n >= 3 && str[2] == '0') { + n--; + str++; + } + } + + if (neg) { // make room in buf for - + n++; + str--; + } + + if (n > kMaxNumberLength) return ""; + + memmove(buf, str, n); + if (neg) { + buf[0] = '-'; + } + buf[n] = '\0'; + *np = n; + return buf; +} + +bool RE2::Arg::parse_long_radix(const char* str, + int n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, &n); + char* end; + errno = 0; + long r = strtol(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool RE2::Arg::parse_ulong_radix(const char* str, + int n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, &n); + if (str[0] == '-') { + // strtoul() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. 
+ return false; + } + + char* end; + errno = 0; + unsigned long r = strtoul(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool RE2::Arg::parse_short_radix(const char* str, + int n, + void* dest, + int radix) { + long r; + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((short)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool RE2::Arg::parse_ushort_radix(const char* str, + int n, + void* dest, + int radix) { + unsigned long r; + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((ushort)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool RE2::Arg::parse_int_radix(const char* str, + int n, + void* dest, + int radix) { + long r; + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((int)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool RE2::Arg::parse_uint_radix(const char* str, + int n, + void* dest, + int radix) { + unsigned long r; + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((uint)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +#ifdef RE2_HAVE_LONGLONG +bool RE2::Arg::parse_longlong_radix(const char* str, + int n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, &n); + char* end; + errno = 0; + int64 r = strtoll(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool 
RE2::Arg::parse_ulonglong_radix(const char* str, + int n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, &n); + if (str[0] == '-') { + // strtoull() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. + return false; + } + char* end; + errno = 0; + uint64 r = strtoull(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} +#endif + +static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) { + if (n == 0) return false; + static const int kMaxLength = 200; + char buf[kMaxLength]; + if (n >= kMaxLength) return false; + memcpy(buf, str, n); + buf[n] = '\0'; + errno = 0; + char* end; + double r; + if (isfloat) { + r = strtof(buf, &end); + } else { + r = strtod(buf, &end); + } + if (end != buf + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + if (isfloat) { + *(reinterpret_cast(dest)) = r; + } else { + *(reinterpret_cast(dest)) = r; + } + return true; +} + +bool RE2::Arg::parse_double(const char* str, int n, void* dest) { + return parse_double_float(str, n, false, dest); +} + +bool RE2::Arg::parse_float(const char* str, int n, void* dest) { + return parse_double_float(str, n, true, dest); +} + + +#define DEFINE_INTEGER_PARSERS(name) \ + bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 10); \ + } \ + bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 16); \ + } \ + bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 8); \ + } \ + bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ + return parse_##name##_radix(str, n, 
dest, 0); \ + } + +DEFINE_INTEGER_PARSERS(short); +DEFINE_INTEGER_PARSERS(ushort); +DEFINE_INTEGER_PARSERS(int); +DEFINE_INTEGER_PARSERS(uint); +DEFINE_INTEGER_PARSERS(long); +DEFINE_INTEGER_PARSERS(ulong); +DEFINE_INTEGER_PARSERS(longlong); +DEFINE_INTEGER_PARSERS(ulonglong); + +#undef DEFINE_INTEGER_PARSERS + +} // namespace re2 diff --git a/outside/re2/re2/re2.h b/outside/re2/re2/re2.h new file mode 100644 index 000000000..1aabcbc4f --- /dev/null +++ b/outside/re2/re2/re2.h @@ -0,0 +1,877 @@ +// Copyright 2003-2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef RE2_RE2_H +#define RE2_RE2_H + +// C++ interface to the re2 regular-expression library. +// RE2 supports Perl-style regular expressions (with extensions like +// \d, \w, \s, ...). +// +// ----------------------------------------------------------------------- +// REGEXP SYNTAX: +// +// This module uses the re2 library and hence supports +// its syntax for regular expressions, which is similar to Perl's with +// some of the more complicated things thrown away. In particular, +// backreferences and generalized assertions are not available, nor is \Z. +// +// See http://code.google.com/p/re2/wiki/Syntax for the syntax +// supported by RE2, and a comparison with PCRE and PERL regexps. +// +// For those not familiar with Perl's regular expressions, +// here are some examples of the most commonly used extensions: +// +// "hello (\\w+) world" -- \w matches a "word" character +// "version (\\d+)" -- \d matches a digit +// "hello\\s+world" -- \s matches any whitespace character +// "\\b(\\w+)\\b" -- \b matches non-empty string at word boundary +// "(?i)hello" -- (?i) turns on case-insensitive matching +// "/\\*(.*?)\\*/" -- .*? matches . minimum no. 
of times possible +// +// ----------------------------------------------------------------------- +// MATCHING INTERFACE: +// +// The "FullMatch" operation checks that supplied text matches a +// supplied pattern exactly. +// +// Example: successful match +// CHECK(RE2::FullMatch("hello", "h.*o")); +// +// Example: unsuccessful match (requires full match): +// CHECK(!RE2::FullMatch("hello", "e")); +// +// ----------------------------------------------------------------------- +// UTF-8 AND THE MATCHING INTERFACE: +// +// By default, the pattern and input text are interpreted as UTF-8. +// The RE2::Latin1 option causes them to be interpreted as Latin-1. +// +// Example: +// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern))); +// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1))); +// +// ----------------------------------------------------------------------- +// MATCHING WITH SUB-STRING EXTRACTION: +// +// You can supply extra pointer arguments to extract matched subpieces. +// +// Example: extracts "ruby" into "s" and 1234 into "i" +// int i; +// string s; +// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); +// +// Example: fails because string cannot be stored in integer +// CHECK(!RE2::FullMatch("ruby", "(.*)", &i)); +// +// Example: fails because there aren't enough sub-patterns: +// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s)); +// +// Example: does not try to extract any extra sub-patterns +// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s)); +// +// Example: does not try to extract into NULL +// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i)); +// +// Example: integer overflow causes failure +// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i)); +// +// NOTE(rsc): Asking for substrings slows successful matches quite a bit. +// This may get a little faster in the future, but right now is slower +// than PCRE. 
On the other hand, failed matches run *very* fast (faster +// than PCRE), as do matches without substring extraction. +// +// ----------------------------------------------------------------------- +// PARTIAL MATCHES +// +// You can use the "PartialMatch" operation when you want the pattern +// to match any substring of the text. +// +// Example: simple search for a string: +// CHECK(RE2::PartialMatch("hello", "ell")); +// +// Example: find first number in a string +// int number; +// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number)); +// CHECK_EQ(number, 100); +// +// ----------------------------------------------------------------------- +// PRE-COMPILED REGULAR EXPRESSIONS +// +// RE2 makes it easy to use any string as a regular expression, without +// requiring a separate compilation step. +// +// If speed is of the essence, you can create a pre-compiled "RE2" +// object from the pattern and use it multiple times. If you do so, +// you can typically parse text faster than with sscanf. +// +// Example: precompile pattern for faster matching: +// RE2 pattern("h.*o"); +// while (ReadLine(&str)) { +// if (RE2::FullMatch(str, pattern)) ...; +// } +// +// ----------------------------------------------------------------------- +// SCANNING TEXT INCREMENTALLY +// +// The "Consume" operation may be useful if you want to repeatedly +// match regular expressions at the front of a string and skip over +// them as they match. This requires use of the "StringPiece" type, +// which represents a sub-range of a real string. +// +// Example: read lines of the form "var = value" from a string. +// string contents = ...; // Fill string somehow +// StringPiece input(contents); // Wrap a StringPiece around it +// +// string var; +// int value; +// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) { +// ...; +// } +// +// Each successful call to "Consume" will set "var/value", and also +// advance "input" so it points past the matched text. 
Note that if the +// regular expression matches an empty string, input will advance +// by 0 bytes. If the regular expression being used might match +// an empty string, the loop body must check for this case and either +// advance the string or break out of the loop. +// +// The "FindAndConsume" operation is similar to "Consume" but does not +// anchor your match at the beginning of the string. For example, you +// could extract all words from a string by repeatedly calling +// RE2::FindAndConsume(&input, "(\\w+)", &word) +// +// ----------------------------------------------------------------------- +// USING VARIABLE NUMBER OF ARGUMENTS +// +// The above operations require you to know the number of arguments +// when you write the code. This is not always possible or easy (for +// example, the regular expression may be calculated at run time). +// You can use the "N" version of the operations when the number of +// match arguments are determined at run time. +// +// Example: +// const RE2::Arg* args[10]; +// int n; +// // ... populate args with pointers to RE2::Arg values ... +// // ... set n to the number of RE2::Arg objects ... +// bool match = RE2::FullMatchN(input, pattern, args, n); +// +// The last statement is equivalent to +// +// bool match = RE2::FullMatch(input, pattern, +// *args[0], *args[1], ..., *args[n - 1]); +// +// ----------------------------------------------------------------------- +// PARSING HEX/OCTAL/C-RADIX NUMBERS +// +// By default, if you pass a pointer to a numeric value, the +// corresponding text is interpreted as a base-10 number. You can +// instead wrap the pointer with a call to one of the operators Hex(), +// Octal(), or CRadix() to interpret the text in another base. The +// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16) +// prefixes, but defaults to base-10. 
+// +// Example: +// int a, b, c, d; +// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)", +// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)); +// will leave 64 in a, b, c, and d. + +#include +#include +#include +#include "re2/stringpiece.h" +#include "re2/variadic_function.h" + +#ifndef RE2_HAVE_LONGLONG +#define RE2_HAVE_LONGLONG 1 +#endif + +namespace re2 { + +using std::string; +using std::map; +class Mutex; +class Prog; +class Regexp; + +// The following enum should be used only as a constructor argument to indicate +// that the variable has static storage class, and that the constructor should +// do nothing to its state. It indicates to the reader that it is legal to +// declare a static instance of the class, provided the constructor is given +// the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a +// static variable that has a constructor or a destructor because invocation +// order is undefined. However, IF the type can be initialized by filling with +// zeroes (which the loader does for static variables), AND the type's +// destructor does nothing to the storage, then a constructor for static +// initialization can be declared as +// explicit MyClass(LinkerInitialized x) {} +// and invoked as +// static MyClass my_variable_name(LINKER_INITIALIZED); +enum LinkerInitialized { LINKER_INITIALIZED }; + +// Interface for regular expression matching. Also corresponds to a +// pre-compiled regular expression. An "RE2" object is safe for +// concurrent use by multiple threads. +class RE2 { + public: + // We convert user-passed pointers into special Arg objects + class Arg; + class Options; + + // Defined in set.h. 
+ class Set; + + enum ErrorCode { + NoError = 0, + + // Unexpected error + ErrorInternal, + + // Parse errors + ErrorBadEscape, // bad escape sequence + ErrorBadCharClass, // bad character class + ErrorBadCharRange, // bad character class range + ErrorMissingBracket, // missing closing ] + ErrorMissingParen, // missing closing ) + ErrorTrailingBackslash, // trailing \ at end of regexp + ErrorRepeatArgument, // repeat argument missing, e.g. "*" + ErrorRepeatSize, // bad repetition argument + ErrorRepeatOp, // bad repetition operator + ErrorBadPerlOp, // bad perl operator + ErrorBadUTF8, // invalid UTF-8 in regexp + ErrorBadNamedCapture, // bad named capture group + ErrorPatternTooLarge // pattern too large (compile failed) + }; + + // Predefined common options. + // If you need more complicated things, instantiate + // an Option class, possibly passing one of these to + // the Option constructor, change the settings, and pass that + // Option class to the RE2 constructor. + enum CannedOptions { + DefaultOptions = 0, + Latin1, // treat input as Latin-1 (default UTF-8) + POSIX, // POSIX syntax, leftmost-longest match + Quiet // do not log about regexp parse errors + }; + + // Need to have the const char* and const string& forms for implicit + // conversions when passing string literals to FullMatch and PartialMatch. + // Otherwise the StringPiece form would be sufficient. +#ifndef SWIG + RE2(const char* pattern); + RE2(const string& pattern); +#endif + RE2(const StringPiece& pattern); + RE2(const StringPiece& pattern, const Options& option); + ~RE2(); + + // Returns whether RE2 was created properly. + bool ok() const { return error_code() == NoError; } + + // The string specification for this RE2. E.g. + // RE2 re("ab*c?d+"); + // re.pattern(); // "ab*c?d+" + const string& pattern() const { return pattern_; } + + // If RE2 could not be created properly, returns an error string. + // Else returns the empty string. 
+ const string& error() const { return *error_; } + + // If RE2 could not be created properly, returns an error code. + // Else returns RE2::NoError (== 0). + ErrorCode error_code() const { return error_code_; } + + // If RE2 could not be created properly, returns the offending + // portion of the regexp. + const string& error_arg() const { return error_arg_; } + + // Returns the program size, a very approximate measure of a regexp's "cost". + // Larger numbers are more expensive than smaller numbers. + int ProgramSize() const; + + // Returns the underlying Regexp; not for general use. + // Returns entire_regexp_ so that callers don't need + // to know about prefix_ and prefix_foldcase_. + re2::Regexp* Regexp() const { return entire_regexp_; } + + /***** The useful part: the matching interface *****/ + + // Matches "text" against "pattern". If pointer arguments are + // supplied, copies matched sub-patterns into them. + // + // You can pass in a "const char*" or a "string" for "text". + // You can pass in a "const char*" or a "string" or a "RE2" for "pattern". + // + // The provided pointer arguments can be pointers to any scalar numeric + // type, or one of: + // string (matched piece is copied to string) + // StringPiece (StringPiece is mutated to point to matched piece) + // T (where "bool T::ParseFrom(const char*, int)" exists) + // (void*)NULL (the corresponding matched sub-pattern is not copied) + // + // Returns true iff all of the following conditions are satisfied: + // a. "text" matches "pattern" exactly + // b. The number of matched sub-patterns is >= number of supplied pointers + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, "i"th captured sub-pattern is + // ignored. + // + // CAVEAT: An optional sub-pattern that does not exist in the + // matched string is assigned the empty string. 
Therefore, the + // following will return false (because the empty string is not a + // valid number): + // int number; + // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number); + static bool FullMatchN(const StringPiece& text, const RE2& re, + const Arg* const args[], int argc); + static const VariadicFunction2< + bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch; + + // Exactly like FullMatch(), except that "pattern" is allowed to match + // a substring of "text". + static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args + const Arg* const args[], int argc); + static const VariadicFunction2< + bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch; + + // Like FullMatch() and PartialMatch(), except that pattern has to + // match a prefix of "text", and "input" is advanced past the matched + // text. Note: "input" is modified iff this routine returns true. + static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args + const Arg* const args[], int argc); + static const VariadicFunction2< + bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume; + + // Like Consume(..), but does not anchor the match at the beginning of the + // string. That is, "pattern" need not start its match at the beginning of + // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next + // word in "s" and stores it in "word". + static bool FindAndConsumeN(StringPiece* input, const RE2& pattern, + const Arg* const args[], int argc); + static const VariadicFunction2< + bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume; + + // Replace the first match of "pattern" in "str" with "rewrite". + // Within "rewrite", backslash-escaped digits (\1 to \9) can be + // used to insert text matching corresponding parenthesized group + // from the pattern. \0 in "rewrite" refers to the entire matching + // text. 
E.g., + // + // string s = "yabba dabba doo"; + // CHECK(RE2::Replace(&s, "b+", "d")); + // + // will leave "s" containing "yada dabba doo" + // + // Returns true if the pattern matches and a replacement occurs, + // false otherwise. + static bool Replace(string *str, + const RE2& pattern, + const StringPiece& rewrite); + + // Like Replace(), except replaces successive non-overlapping occurrences + // of the pattern in the string with the rewrite. E.g. + // + // string s = "yabba dabba doo"; + // CHECK(RE2::GlobalReplace(&s, "b+", "d")); + // + // will leave "s" containing "yada dada doo" + // Replacements are not subject to re-matching. + // + // Because GlobalReplace only replaces non-overlapping matches, + // replacing "ana" within "banana" makes only one replacement, not two. + // + // Returns the number of replacements made. + static int GlobalReplace(string *str, + const RE2& pattern, + const StringPiece& rewrite); + + // Like Replace, except that if the pattern matches, "rewrite" + // is copied into "out" with substitutions. The non-matching + // portions of "text" are ignored. + // + // Returns true iff a match occurred and the extraction happened + // successfully; if no match occurs, the string is left unaffected. + static bool Extract(const StringPiece &text, + const RE2& pattern, + const StringPiece &rewrite, + string *out); + + // Escapes all potentially meaningful regexp characters in + // 'unquoted'. The returned string, used as a regular expression, + // will exactly match the original string. For example, + // 1.5-2.0? + // may become: + // 1\.5\-2\.0\? + static string QuoteMeta(const StringPiece& unquoted); + + // Computes range for any strings matching regexp. The min and max can in + // some cases be arbitrarily precise, so the caller gets to specify the + // maximum desired length of string returned. 
+ // + // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any + // string s that is an anchored match for this regexp satisfies + // min <= s && s <= max. + // + // Note that PossibleMatchRange() will only consider the first copy of an + // infinitely repeated element (i.e., any regexp element followed by a '*' or + // '+' operator). Regexps with "{N}" constructions are not affected, as those + // do not compile down to infinite repetitions. + // + // Returns true on success, false on error. + bool PossibleMatchRange(string* min, string* max, int maxlen) const; + + // Generic matching interface + + // Type of match. + enum Anchor { + UNANCHORED, // No anchoring + ANCHOR_START, // Anchor at start only + ANCHOR_BOTH // Anchor at start and end + }; + + // Return the number of capturing subpatterns, or -1 if the + // regexp wasn't valid on construction. The overall match ($0) + // does not count: if the regexp is "(a)(b)", returns 2. + int NumberOfCapturingGroups() const; + + + // Return a map from names to capturing indices. + // The map records the index of the leftmost group + // with the given name. + // Only valid until the re is deleted. + const map& NamedCapturingGroups() const; + + // Return a map from capturing indices to names. + // The map has no entries for unnamed groups. + // Only valid until the re is deleted. + const map& CapturingGroupNames() const; + + // General matching routine. + // Match against text starting at offset startpos + // and stopping the search at offset endpos. + // Returns true if match found, false if not. + // On a successful match, fills in match[] (up to nmatch entries) + // with information about submatches. + // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, + // setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar", + // match[3] = NULL, ..., up to match[nmatch-1] = NULL. 
+ // + // Don't ask for more match information than you will use: + // runs much faster with nmatch == 1 than nmatch > 1, and + // runs even faster if nmatch == 0. + // Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(), + // but will be handled correctly. + // + // Passing text == StringPiece(NULL, 0) will be handled like any other + // empty string, but note that on return, it will not be possible to tell + // whether submatch i matched the empty string or did not match: + // either way, match[i] == NULL. + bool Match(const StringPiece& text, + int startpos, + int endpos, + Anchor anchor, + StringPiece *match, + int nmatch) const; + + // Check that the given rewrite string is suitable for use with this + // regular expression. It checks that: + // * The regular expression has enough parenthesized subexpressions + // to satisfy all of the \N tokens in rewrite + // * The rewrite string doesn't have any syntax errors. E.g., + // '\' followed by anything other than a digit or '\'. + // A true return value guarantees that Replace() and Extract() won't + // fail because of a bad rewrite string. + bool CheckRewriteString(const StringPiece& rewrite, string* error) const; + + // Returns the maximum submatch needed for the rewrite to be done by + // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2. + static int MaxSubmatch(const StringPiece& rewrite); + + // Append the "rewrite" string, with backslash substitutions from "vec", + // to string "out". + // Returns true on success. This method can fail because of a malformed + // rewrite string. CheckRewriteString guarantees that the rewrite will + // be successful.
+ bool Rewrite(string *out, + const StringPiece &rewrite, + const StringPiece* vec, + int veclen) const; + + // Constructor options + class Options { + public: + // The options are (defaults in parentheses): + // + // utf8 (true) text and pattern are UTF-8; otherwise Latin-1 + // posix_syntax (false) restrict regexps to POSIX egrep syntax + // longest_match (false) search for longest match, not first match + // log_errors (true) log syntax and execution errors to ERROR + // max_mem (see below) approx. max memory footprint of RE2 + // literal (false) interpret string as literal, not regexp + // never_nl (false) never match \n, even if it is in regexp + // dot_nl (false) dot matches everything including new line + // never_capture (false) parse all parens as non-capturing + // case_sensitive (true) match is case-sensitive (regexp can override + // with (?i) unless in posix_syntax mode) + // + // The following options are only consulted when posix_syntax == true. + // (When posix_syntax == false these features are always enabled and + // cannot be turned off.) + // perl_classes (false) allow Perl's \d \s \w \D \S \W + // word_boundary (false) allow Perl's \b \B (word boundary and not) + // one_line (false) ^ and $ only match beginning and end of text + // + // The max_mem option controls how much memory can be used + // to hold the compiled form of the regexp (the Prog) and + // its cached DFA graphs. Code Search placed limits on the number + // of Prog instructions and DFA states: 10,000 for both. + // In RE2, those limits would translate to about 240 KB per Prog + // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a + // better job of keeping them small than Code Search did). + // Each RE2 has two Progs (one forward, one reverse), and each Prog + // can have two DFAs (one first match, one longest match). 
+ // That makes 4 DFAs: + // + // forward, first-match - used for UNANCHORED or ANCHOR_START searches + // if opt.longest_match() == false + // forward, longest-match - used for all ANCHOR_BOTH searches, + // and the other two kinds if + // opt.longest_match() == true + // reverse, first-match - never used + // reverse, longest-match - used as second phase for unanchored searches + // + // The RE2 memory budget is statically divided between the two + // Progs and then the DFAs: two thirds to the forward Prog + // and one third to the reverse Prog. The forward Prog gives half + // of what it has left over to each of its DFAs. The reverse Prog + // gives it all to its longest-match DFA. + // + // Once a DFA fills its budget, it flushes its cache and starts over. + // If this happens too often, RE2 falls back on the NFA implementation. + + // For now, make the default budget something close to Code Search. + static const int kDefaultMaxMem = 8<<20; + + enum Encoding { + EncodingUTF8 = 1, + EncodingLatin1 + }; + + Options() : + encoding_(EncodingUTF8), + posix_syntax_(false), + longest_match_(false), + log_errors_(true), + max_mem_(kDefaultMaxMem), + literal_(false), + never_nl_(false), + dot_nl_(false), + never_capture_(false), + case_sensitive_(true), + perl_classes_(false), + word_boundary_(false), + one_line_(false) { + } + + /*implicit*/ Options(CannedOptions); + + Encoding encoding() const { return encoding_; } + void set_encoding(Encoding encoding) { encoding_ = encoding; } + + // Legacy interface to encoding. + // TODO(rsc): Remove once clients have been converted.
+ bool utf8() const { return encoding_ == EncodingUTF8; } + void set_utf8(bool b) { + if (b) { + encoding_ = EncodingUTF8; + } else { + encoding_ = EncodingLatin1; + } + } + + bool posix_syntax() const { return posix_syntax_; } + void set_posix_syntax(bool b) { posix_syntax_ = b; } + + bool longest_match() const { return longest_match_; } + void set_longest_match(bool b) { longest_match_ = b; } + + bool log_errors() const { return log_errors_; } + void set_log_errors(bool b) { log_errors_ = b; } + + int64_t max_mem() const { return max_mem_; } + void set_max_mem(int64_t m) { max_mem_ = m; } + + bool literal() const { return literal_; } + void set_literal(bool b) { literal_ = b; } + + bool never_nl() const { return never_nl_; } + void set_never_nl(bool b) { never_nl_ = b; } + + bool dot_nl() const { return dot_nl_; } + void set_dot_nl(bool b) { dot_nl_ = b; } + + bool never_capture() const { return never_capture_; } + void set_never_capture(bool b) { never_capture_ = b; } + + bool case_sensitive() const { return case_sensitive_; } + void set_case_sensitive(bool b) { case_sensitive_ = b; } + + bool perl_classes() const { return perl_classes_; } + void set_perl_classes(bool b) { perl_classes_ = b; } + + bool word_boundary() const { return word_boundary_; } + void set_word_boundary(bool b) { word_boundary_ = b; } + + bool one_line() const { return one_line_; } + void set_one_line(bool b) { one_line_ = b; } + + void Copy(const Options& src) { + encoding_ = src.encoding_; + posix_syntax_ = src.posix_syntax_; + longest_match_ = src.longest_match_; + log_errors_ = src.log_errors_; + max_mem_ = src.max_mem_; + literal_ = src.literal_; + never_nl_ = src.never_nl_; + dot_nl_ = src.dot_nl_; + never_capture_ = src.never_capture_; + case_sensitive_ = src.case_sensitive_; + perl_classes_ = src.perl_classes_; + word_boundary_ = src.word_boundary_; + one_line_ = src.one_line_; + } + + int ParseFlags() const; + + private: + Encoding encoding_; + bool posix_syntax_; + bool 
longest_match_; + bool log_errors_; + int64_t max_mem_; + bool literal_; + bool never_nl_; + bool dot_nl_; + bool never_capture_; + bool case_sensitive_; + bool perl_classes_; + bool word_boundary_; + bool one_line_; + + //DISALLOW_EVIL_CONSTRUCTORS(Options); + Options(const Options&); + void operator=(const Options&); + }; + + // Returns the options set in the constructor. + const Options& options() const { return options_; }; + + // Argument converters; see below. + static inline Arg CRadix(short* x); + static inline Arg CRadix(unsigned short* x); + static inline Arg CRadix(int* x); + static inline Arg CRadix(unsigned int* x); + static inline Arg CRadix(long* x); + static inline Arg CRadix(unsigned long* x); + #ifdef RE2_HAVE_LONGLONG + static inline Arg CRadix(long long* x); + static inline Arg CRadix(unsigned long long* x); + #endif + + static inline Arg Hex(short* x); + static inline Arg Hex(unsigned short* x); + static inline Arg Hex(int* x); + static inline Arg Hex(unsigned int* x); + static inline Arg Hex(long* x); + static inline Arg Hex(unsigned long* x); + #ifdef RE2_HAVE_LONGLONG + static inline Arg Hex(long long* x); + static inline Arg Hex(unsigned long long* x); + #endif + + static inline Arg Octal(short* x); + static inline Arg Octal(unsigned short* x); + static inline Arg Octal(int* x); + static inline Arg Octal(unsigned int* x); + static inline Arg Octal(long* x); + static inline Arg Octal(unsigned long* x); + #ifdef RE2_HAVE_LONGLONG + static inline Arg Octal(long long* x); + static inline Arg Octal(unsigned long long* x); + #endif + + private: + void Init(const StringPiece& pattern, const Options& options); + + bool DoMatch(const StringPiece& text, + Anchor anchor, + int* consumed, + const Arg* const args[], + int n) const; + + re2::Prog* ReverseProg() const; + + mutable Mutex* mutex_; + string pattern_; // string regular expression + Options options_; // option flags + string prefix_; // required prefix (before regexp_) + bool prefix_foldcase_; 
// prefix is ASCII case-insensitive + re2::Regexp* entire_regexp_; // parsed regular expression + re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed + re2::Prog* prog_; // compiled program for regexp + mutable re2::Prog* rprog_; // reverse program for regexp + bool is_one_pass_; // can use prog_->SearchOnePass? + mutable const string* error_; // Error indicator + // (or points to empty string) + mutable ErrorCode error_code_; // Error code + mutable string error_arg_; // Fragment of regexp showing error + mutable int num_captures_; // Number of capturing groups + + // Map from capture names to indices + mutable const map* named_groups_; + + // Map from capture indices to names + mutable const map* group_names_; + + //DISALLOW_EVIL_CONSTRUCTORS(RE2); + RE2(const RE2&); + void operator=(const RE2&); +}; + +/***** Implementation details *****/ + +// Hex/Octal/Binary? + +// Special class for parsing into objects that define a ParseFrom() method +template +class _RE2_MatchObject { + public: + static inline bool Parse(const char* str, int n, void* dest) { + if (dest == NULL) return true; + T* object = reinterpret_cast(dest); + return object->ParseFrom(str, n); + } +}; + +class RE2::Arg { + public: + // Empty constructor so we can declare arrays of RE2::Arg + Arg(); + + // Constructor specially designed for NULL arguments + Arg(void*); + + typedef bool (*Parser)(const char* str, int n, void* dest); + +// Type-specific parsers +#define MAKE_PARSER(type,name) \ + Arg(type* p) : arg_(p), parser_(name) { } \ + Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \ + + + MAKE_PARSER(char, parse_char); + MAKE_PARSER(signed char, parse_char); + MAKE_PARSER(unsigned char, parse_uchar); + MAKE_PARSER(short, parse_short); + MAKE_PARSER(unsigned short, parse_ushort); + MAKE_PARSER(int, parse_int); + MAKE_PARSER(unsigned int, parse_uint); + MAKE_PARSER(long, parse_long); + MAKE_PARSER(unsigned long, parse_ulong); + #ifdef RE2_HAVE_LONGLONG + 
MAKE_PARSER(long long, parse_longlong); + MAKE_PARSER(unsigned long long, parse_ulonglong); + #endif + MAKE_PARSER(float, parse_float); + MAKE_PARSER(double, parse_double); + MAKE_PARSER(string, parse_string); + MAKE_PARSER(StringPiece, parse_stringpiece); + +#undef MAKE_PARSER + + // Generic constructor + template Arg(T*, Parser parser); + // Generic constructor template + template Arg(T* p) + : arg_(p), parser_(_RE2_MatchObject::Parse) { + } + + // Parse the data + bool Parse(const char* str, int n) const; + + private: + void* arg_; + Parser parser_; + + static bool parse_null (const char* str, int n, void* dest); + static bool parse_char (const char* str, int n, void* dest); + static bool parse_uchar (const char* str, int n, void* dest); + static bool parse_float (const char* str, int n, void* dest); + static bool parse_double (const char* str, int n, void* dest); + static bool parse_string (const char* str, int n, void* dest); + static bool parse_stringpiece (const char* str, int n, void* dest); + +#define DECLARE_INTEGER_PARSER(name) \ + private: \ + static bool parse_ ## name(const char* str, int n, void* dest); \ + static bool parse_ ## name ## _radix( \ + const char* str, int n, void* dest, int radix); \ + public: \ + static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \ + static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \ + static bool parse_ ## name ## _cradix(const char* str, int n, void* dest) + + DECLARE_INTEGER_PARSER(short); + DECLARE_INTEGER_PARSER(ushort); + DECLARE_INTEGER_PARSER(int); + DECLARE_INTEGER_PARSER(uint); + DECLARE_INTEGER_PARSER(long); + DECLARE_INTEGER_PARSER(ulong); + #ifdef RE2_HAVE_LONGLONG + DECLARE_INTEGER_PARSER(longlong); + DECLARE_INTEGER_PARSER(ulonglong); + #endif + +#undef DECLARE_INTEGER_PARSER +}; + +inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { } +inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } + +inline bool RE2::Arg::Parse(const char* str, int 
n) const { + return (*parser_)(str, n, arg_); +} + +// This part of the parser, appropriate only for ints, deals with bases +#define MAKE_INTEGER_PARSER(type, name) \ + inline RE2::Arg RE2::Hex(type* ptr) { \ + return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \ + inline RE2::Arg RE2::Octal(type* ptr) { \ + return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \ + inline RE2::Arg RE2::CRadix(type* ptr) { \ + return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); } + +MAKE_INTEGER_PARSER(short, short) +MAKE_INTEGER_PARSER(unsigned short, ushort) +MAKE_INTEGER_PARSER(int, int) +MAKE_INTEGER_PARSER(unsigned int, uint) +MAKE_INTEGER_PARSER(long, long) +MAKE_INTEGER_PARSER(unsigned long, ulong) +#ifdef RE2_HAVE_LONGLONG +MAKE_INTEGER_PARSER(long long, longlong) +MAKE_INTEGER_PARSER(unsigned long long, ulonglong) +#endif + +#undef MAKE_INTEGER_PARSER + +} // namespace re2 + +using re2::RE2; + +#endif /* RE2_RE2_H */ diff --git a/outside/re2/re2/regexp.cc b/outside/re2/re2/regexp.cc new file mode 100644 index 000000000..a74ceec82 --- /dev/null +++ b/outside/re2/re2/regexp.cc @@ -0,0 +1,931 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression representation. +// Tested by parse_test.cc + +#include "util/util.h" +#include "re2/regexp.h" +#include "re2/stringpiece.h" +#include "re2/walker-inl.h" + +namespace re2 { + +// Constructor. Allocates vectors as appropriate for operator. +Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) + : op_(op), + simple_(false), + parse_flags_(static_cast(parse_flags)), + ref_(1), + nsub_(0), + down_(NULL) { + subone_ = NULL; + memset(the_union_, 0, sizeof the_union_); +} + +// Destructor. Assumes already cleaned up children. +// Private: use Decref() instead of delete to destroy Regexps. 
+// Can't call Decref on the sub-Regexps here because +// that could cause arbitrarily deep recursion, so +// we require Decref() to have handled them for us. +Regexp::~Regexp() { + if (nsub_ > 0) + LOG(DFATAL) << "Regexp not destroyed."; + + switch (op_) { + default: + break; + case kRegexpCapture: + delete name_; + break; + case kRegexpLiteralString: + delete[] runes_; + break; + case kRegexpCharClass: + if (cc_) cc_->Delete(); // NOTE(review): guard assumes cc_ can be NULL when only ccb_ is set + delete ccb_; + break; + } +} + +// If it's possible to destroy this regexp without recurring, +// do so and return true. Else return false. +bool Regexp::QuickDestroy() { + if (nsub_ == 0) { + delete this; + return true; + } + return false; +} + +static map<Regexp*, int> *ref_map; +GLOBAL_MUTEX(ref_mutex); + +int Regexp::Ref() { + if (ref_ < kMaxRef) + return ref_; + + GLOBAL_MUTEX_LOCK(ref_mutex); + int r = 0; + if (ref_map != NULL) { + r = (*ref_map)[this]; + } + GLOBAL_MUTEX_UNLOCK(ref_mutex); + return r; +} + +// Increments reference count, returns object as convenience. +Regexp* Regexp::Incref() { + if (ref_ >= kMaxRef-1) { + // Store ref count in overflow map. + GLOBAL_MUTEX_LOCK(ref_mutex); + if (ref_map == NULL) { + ref_map = new map<Regexp*, int>; + } + if (ref_ == kMaxRef) { + // already overflowed + (*ref_map)[this]++; + } else { + // overflowing now + (*ref_map)[this] = kMaxRef; + ref_ = kMaxRef; + } + GLOBAL_MUTEX_UNLOCK(ref_mutex); + return this; + } + + ref_++; + return this; +} + +// Decrements reference count and deletes this object if count reaches 0. +void Regexp::Decref() { + if (ref_ == kMaxRef) { + // Ref count is stored in overflow map. + GLOBAL_MUTEX_LOCK(ref_mutex); + int r = (*ref_map)[this] - 1; + if (r < kMaxRef) { + ref_ = r; + ref_map->erase(this); + } else { + (*ref_map)[this] = r; + } + GLOBAL_MUTEX_UNLOCK(ref_mutex); + return; + } + ref_--; + if (ref_ == 0) + Destroy(); +} + +// Deletes this object; ref count has reached 0.
+void Regexp::Destroy() { + if (QuickDestroy()) + return; + + // Handle recursive Destroy with explicit stack + // to avoid arbitrarily deep recursion on process stack [sigh]. + down_ = NULL; + Regexp* stack = this; + while (stack != NULL) { + Regexp* re = stack; + stack = re->down_; + if (re->ref_ != 0) + LOG(DFATAL) << "Bad reference count " << re->ref_; + if (re->nsub_ > 0) { + Regexp** subs = re->sub(); + for (int i = 0; i < re->nsub_; i++) { + Regexp* sub = subs[i]; + if (sub == NULL) + continue; + if (sub->ref_ == kMaxRef) + sub->Decref(); + else + --sub->ref_; + if (sub->ref_ == 0 && !sub->QuickDestroy()) { + sub->down_ = stack; + stack = sub; + } + } + if (re->nsub_ > 1) + delete[] subs; + re->nsub_ = 0; + } + delete re; + } +} + +void Regexp::AddRuneToString(Rune r) { + DCHECK(op_ == kRegexpLiteralString); + if (nrunes_ == 0) { + // start with 8 + runes_ = new Rune[8]; + } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) { + // double on powers of two + Rune *old = runes_; + runes_ = new Rune[nrunes_ * 2]; + for (int i = 0; i < nrunes_; i++) + runes_[i] = old[i]; + delete[] old; + } + + runes_[nrunes_++] = r; +} + +Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { + Regexp* re = new Regexp(kRegexpHaveMatch, flags); + re->match_id_ = match_id; + return re; +} + +Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { + if (sub->op() == kRegexpPlus && sub->parse_flags() == flags) + return sub; + Regexp* re = new Regexp(kRegexpPlus, flags); + re->AllocSub(1); + re->sub()[0] = sub; + return re; +} + +Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { + if (sub->op() == kRegexpStar && sub->parse_flags() == flags) + return sub; + Regexp* re = new Regexp(kRegexpStar, flags); + re->AllocSub(1); + re->sub()[0] = sub; + return re; +} + +Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { + if (sub->op() == kRegexpQuest && sub->parse_flags() == flags) + return sub; + Regexp* re = new Regexp(kRegexpQuest, flags); + re->AllocSub(1); + re->sub()[0]
= sub; + return re; +} + +Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, + ParseFlags flags, bool can_factor) { + if (nsub == 1) + return sub[0]; + + Regexp** subcopy = NULL; + if (op == kRegexpAlternate && can_factor) { + // Going to edit sub; make a copy so we don't step on caller. + subcopy = new Regexp*[nsub]; + memmove(subcopy, sub, nsub * sizeof sub[0]); + sub = subcopy; + nsub = FactorAlternation(sub, nsub, flags); + if (nsub == 1) { + Regexp* re = sub[0]; + delete[] subcopy; + return re; + } + } + + if (nsub > kMaxNsub) { + // Too many subexpressions to fit in a single Regexp. + // Make a two-level tree. Two levels gets us to 65535^2. + int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub; + Regexp* re = new Regexp(op, flags); + re->AllocSub(nbigsub); + Regexp** subs = re->sub(); + for (int i = 0; i < nbigsub - 1; i++) + subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false); + subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, + nsub - (nbigsub-1)*kMaxNsub, flags, + false); + delete[] subcopy; + return re; + } + + Regexp* re = new Regexp(op, flags); + re->AllocSub(nsub); + Regexp** subs = re->sub(); + for (int i = 0; i < nsub; i++) + subs[i] = sub[i]; + + delete[] subcopy; + return re; +} + +Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) { + return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false); +} + +Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) { + return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true); +} + +Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) { + return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false); +} + +Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) { + Regexp* re = new Regexp(kRegexpCapture, flags); + re->AllocSub(1); + re->sub()[0] = sub; + re->cap_ = cap; + return re; +} + +Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) { + Regexp* re = new
Regexp(kRegexpRepeat, flags); + re->AllocSub(1); + re->sub()[0] = sub; + re->min_ = min; + re->max_ = max; + return re; +} + +Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) { + Regexp* re = new Regexp(kRegexpLiteral, flags); + re->rune_ = rune; + return re; +} + +Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) { + if (nrunes <= 0) + return new Regexp(kRegexpEmptyMatch, flags); + if (nrunes == 1) + return NewLiteral(runes[0], flags); + Regexp* re = new Regexp(kRegexpLiteralString, flags); + for (int i = 0; i < nrunes; i++) + re->AddRuneToString(runes[i]); + return re; +} + +Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { + Regexp* re = new Regexp(kRegexpCharClass, flags); + re->cc_ = cc; + return re; +} + +// Swaps this and that in place. +void Regexp::Swap(Regexp* that) { + // Can use memmove because Regexp is just a struct (no vtable). + char tmp[sizeof *this]; + memmove(tmp, this, sizeof tmp); + memmove(this, that, sizeof tmp); + memmove(that, tmp, sizeof tmp); +} + +// Tests equality of all top-level structure but not subregexps. +static bool TopEqual(Regexp* a, Regexp* b) { + if (a->op() != b->op()) + return false; + + switch (a->op()) { + case kRegexpNoMatch: + case kRegexpEmptyMatch: + case kRegexpAnyChar: + case kRegexpAnyByte: + case kRegexpBeginLine: + case kRegexpEndLine: + case kRegexpWordBoundary: + case kRegexpNoWordBoundary: + case kRegexpBeginText: + return true; + + case kRegexpEndText: + // The parse flags remember whether it's \z or (?-m:$), + // which matters when testing against PCRE.
+ return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0; + + case kRegexpLiteral: + return a->rune() == b->rune() && + ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0; + + case kRegexpLiteralString: + return a->nrunes() == b->nrunes() && + ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 && + memcmp(a->runes(), b->runes(), + a->nrunes() * sizeof a->runes()[0]) == 0; + + case kRegexpAlternate: + case kRegexpConcat: + return a->nsub() == b->nsub(); + + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0; + + case kRegexpRepeat: + return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && + a->min() == b->min() && + a->max() == b->max(); + + case kRegexpCapture: // names are compared by value; a NULL name equals only NULL + return a->cap() == b->cap() && (a->name() == b->name() || (a->name() != NULL && b->name() != NULL && *a->name() == *b->name())); + + case kRegexpHaveMatch: + return a->match_id() == b->match_id(); + + case kRegexpCharClass: { + CharClass* acc = a->cc(); + CharClass* bcc = b->cc(); + return acc->size() == bcc->size() && + acc->end() - acc->begin() == bcc->end() - bcc->begin() && + memcmp(acc->begin(), bcc->begin(), + (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0; + } + } + + LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op(); + return false; +} + +bool Regexp::Equal(Regexp* a, Regexp* b) { + if (a == NULL || b == NULL) + return a == b; + + if (!TopEqual(a, b)) + return false; + + // Fast path: + // return without allocating vector if there are no subregexps. + switch (a->op()) { + case kRegexpAlternate: + case kRegexpConcat: + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + case kRegexpRepeat: + case kRegexpCapture: + break; + + default: + return true; + } + + // Committed to doing real work. + // The stack (vector) has pairs of regexps waiting to + // be compared. The regexps are only equal if + // all the pairs end up being equal. + vector<Regexp*> stk; + + for (;;) { + // Invariant: TopEqual(a, b) == true.
+ Regexp* a2; + Regexp* b2; + switch (a->op()) { + default: + break; + case kRegexpAlternate: + case kRegexpConcat: + for (int i = 0; i < a->nsub(); i++) { + a2 = a->sub()[i]; + b2 = b->sub()[i]; + if (!TopEqual(a2, b2)) + return false; + stk.push_back(a2); + stk.push_back(b2); + } + break; + + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + case kRegexpRepeat: + case kRegexpCapture: + a2 = a->sub()[0]; + b2 = b->sub()[0]; + if (!TopEqual(a2, b2)) + return false; + // Really: + // stk.push_back(a2); + // stk.push_back(b2); + // break; + // but faster to assign directly and loop. + a = a2; + b = b2; + continue; + } + + int n = stk.size(); + if (n == 0) + break; + + a = stk[n-2]; + b = stk[n-1]; + stk.resize(n-2); + } + + return true; +} + +// Keep in sync with enum RegexpStatusCode in regexp.h +static const char *kErrorStrings[] = { + "no error", + "unexpected error", + "invalid escape sequence", + "invalid character class", + "invalid character class range", + "missing ]", + "missing )", + "trailing \\", + "no argument for repetition operator", + "invalid repetition size", + "bad repetition operator", + "invalid perl operator", + "invalid UTF-8", + "invalid named capture group", +}; + +string RegexpStatus::CodeText(enum RegexpStatusCode code) { + if (code < 0 || code >= arraysize(kErrorStrings)) + code = kRegexpInternalError; + return kErrorStrings[code]; +} + +string RegexpStatus::Text() const { + if (error_arg_.empty()) + return CodeText(code_); + string s; + s.append(CodeText(code_)); + s.append(": "); + s.append(error_arg_.data(), error_arg_.size()); + return s; +} + +void RegexpStatus::Copy(const RegexpStatus& status) { + code_ = status.code_; + error_arg_ = status.error_arg_; +} + +typedef int Ignored; // Walker<void> doesn't exist + +// Walker subclass to count capturing parens in regexp.
+class NumCapturesWalker : public Regexp::Walker<Ignored> { + public: + NumCapturesWalker() : ncapture_(0) {} + int ncapture() { return ncapture_; } + + virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { + if (re->op() == kRegexpCapture) + ncapture_++; + return ignored; + } + virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { + // Should never be called: we use Walk not WalkExponential. + LOG(DFATAL) << "NumCapturesWalker::ShortVisit called"; + return ignored; + } + + private: + int ncapture_; + DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker); +}; + +int Regexp::NumCaptures() { + NumCapturesWalker w; + w.Walk(this, 0); + return w.ncapture(); +} + +// Walker class to build map of named capture groups and their indices. +class NamedCapturesWalker : public Regexp::Walker<Ignored> { + public: + NamedCapturesWalker() : map_(NULL) {} + ~NamedCapturesWalker() { delete map_; } + + map<string, int>* TakeMap() { + map<string, int>* m = map_; + map_ = NULL; + return m; + } + + Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { + if (re->op() == kRegexpCapture && re->name() != NULL) { + // Allocate map once we find a name. + if (map_ == NULL) + map_ = new map<string, int>; + + // Record first occurrence of each name. + // (The rule is that if you have the same name + // multiple times, only the leftmost one counts.) + if (map_->find(*re->name()) == map_->end()) + (*map_)[*re->name()] = re->cap(); + } + return ignored; + } + + virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { + // Should never be called: we use Walk not WalkExponential. + LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called"; + return ignored; + } + + private: + map<string, int>* map_; + DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker); +}; + +map<string, int>* Regexp::NamedCaptures() { + NamedCapturesWalker w; + w.Walk(this, 0); + return w.TakeMap(); +} + +// Walker class to build map from capture group indices to their names.
+class CaptureNamesWalker : public Regexp::Walker<Ignored> {
+ public:
+  CaptureNamesWalker() : map_(NULL) {}
+  ~CaptureNamesWalker() { delete map_; }
+
+  // Hands the map (NULL if no named capture was seen) to the caller
+  // and forgets it, so the destructor will not delete it.
+  map<int, string>* TakeMap() {
+    map<int, string>* m = map_;
+    map_ = NULL;
+    return m;
+  }
+
+  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
+    if (re->op() == kRegexpCapture && re->name() != NULL) {
+      // Allocate map once we find a name.
+      if (map_ == NULL)
+        map_ = new map<int, string>;
+
+      (*map_)[re->cap()] = *re->name();
+    }
+    return ignored;
+  }
+
+  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
+    // Should never be called: we use Walk not WalkExponential.
+    LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
+    return ignored;
+  }
+
+ private:
+  map<int, string>* map_;
+  DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
+};
+
+// Returns a map from capture index to capture name, or NULL when the
+// walk finds no named capture.  The walker gives up ownership of the map.
+map<int, string>* Regexp::CaptureNames() {
+  CaptureNamesWalker w;
+  w.Walk(this, 0);
+  return w.TakeMap();
+}
+
+// Determines whether regexp matches must be anchored
+// with a fixed string prefix. If so, returns the prefix and
+// the regexp that remains after the prefix. The prefix might
+// be ASCII case-insensitive.
+bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
+  // No need for a walker: the regexp must be of the form
+  // 1. some number of ^ anchors
+  // 2. a literal char or string
+  // 3. the rest
+  prefix->clear();
+  *foldcase = false;
+  *suffix = NULL;
+  if (op_ != kRegexpConcat)
+    return false;
+
+  // Some number of anchors, then a literal or concatenation.
+  int i = 0;
+  Regexp** sub = this->sub();
+  while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
+    i++;
+  if (i == 0 || i >= nsub_)
+    return false;
+
+  Regexp* re = sub[i];
+  switch (re->op_) {
+    default:
+      return false;
+
+    case kRegexpLiteralString:
+      // Convert to string in proper encoding.
+      if (re->parse_flags() & Latin1) {
+        prefix->resize(re->nrunes_);
+        for (int j = 0; j < re->nrunes_; j++)
+          (*prefix)[j] = re->runes_[j];
+      } else {
+        // Convert to UTF-8 in place.
+        // Assume worst-case space and then trim.
+        prefix->resize(re->nrunes_ * UTFmax);
+        char *p = &(*prefix)[0];
+        for (int j = 0; j < re->nrunes_; j++) {
+          Rune r = re->runes_[j];
+          if (r < Runeself)
+            *p++ = r;
+          else
+            p += runetochar(p, &r);
+        }
+        prefix->resize(p - &(*prefix)[0]);
+      }
+      break;
+
+    case kRegexpLiteral:
+      if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
+        prefix->append(1, re->rune_);
+      } else {
+        char buf[UTFmax];
+        prefix->append(buf, runetochar(buf, &re->rune_));
+      }
+      break;
+  }
+  *foldcase = (sub[i]->parse_flags() & FoldCase);
+  i++;
+
+  // The rest.
+  if (i < nsub_) {
+    // Concat consumes a reference to each sub it is handed,
+    // so take one on behalf of the new suffix.
+    for (int j = i; j < nsub_; j++)
+      sub[j]->Incref();
+    re = Concat(sub + i, nsub_ - i, parse_flags());
+  } else {
+    re = new Regexp(kRegexpEmptyMatch, parse_flags());
+  }
+  *suffix = re;
+  return true;
+}
+
+// Character class builder is a balanced binary tree (STL set)
+// containing non-overlapping, non-abutting RuneRanges.
+// The less-than operator used in the tree treats two
+// ranges as equal if they overlap at all, so that
+// lookups for a particular Rune are possible.
+
+CharClassBuilder::CharClassBuilder() {
+  nrunes_ = 0;
+  upper_ = 0;
+  lower_ = 0;
+}
+
+// Add lo-hi to the class; return whether class got bigger.
+bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
+  if (hi < lo)
+    return false;
+
+  if (lo <= 'z' && hi >= 'A') {
+    // Overlaps some alpha, maybe not all.
+    // Update bitmaps telling which ASCII letters are in the set.
+    // (max/min need the explicit <Rune>: the arguments are Rune and char.)
+    Rune lo1 = max<Rune>(lo, 'A');
+    Rune hi1 = min<Rune>(hi, 'Z');
+    if (lo1 <= hi1)
+      upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
+
+    lo1 = max<Rune>(lo, 'a');
+    hi1 = min<Rune>(hi, 'z');
+    if (lo1 <= hi1)
+      lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
+  }
+
+  {  // Check whether lo, hi is already in the class.
+    iterator it = ranges_.find(RuneRange(lo, lo));
+    if (it != end() && it->lo <= lo && hi <= it->hi)
+      return false;
+  }
+
+  // Look for a range abutting lo on the left.
+  // If it exists, take it out and increase our range.
+  if (lo > 0) {
+    iterator it = ranges_.find(RuneRange(lo-1, lo-1));
+    if (it != end()) {
+      lo = it->lo;
+      if (it->hi > hi)
+        hi = it->hi;
+      nrunes_ -= it->hi - it->lo + 1;
+      ranges_.erase(it);
+    }
+  }
+
+  // Look for a range abutting hi on the right.
+  // If it exists, take it out and increase our range.
+  if (hi < Runemax) {
+    iterator it = ranges_.find(RuneRange(hi+1, hi+1));
+    if (it != end()) {
+      hi = it->hi;
+      nrunes_ -= it->hi - it->lo + 1;
+      ranges_.erase(it);
+    }
+  }
+
+  // Look for ranges between lo and hi. Take them out.
+  // This is only safe because the set has no overlapping ranges.
+  // We've already removed any ranges abutting lo and hi, so
+  // any that overlap [lo, hi] must be contained within it.
+  for (;;) {
+    iterator it = ranges_.find(RuneRange(lo, hi));
+    if (it == end())
+      break;
+    nrunes_ -= it->hi - it->lo + 1;
+    ranges_.erase(it);
+  }
+
+  // Finally, add [lo, hi].
+  nrunes_ += hi - lo + 1;
+  ranges_.insert(RuneRange(lo, hi));
+  return true;
+}
+
+// Adds every range of cc to this class.
+void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
+  for (iterator it = cc->begin(); it != cc->end(); ++it)
+    AddRange(it->lo, it->hi);
+}
+
+// Reports whether some stored range covers r.
+bool CharClassBuilder::Contains(Rune r) {
+  return ranges_.find(RuneRange(r, r)) != end();
+}
+
+// Does the character class behave the same on A-Z as on a-z?
+bool CharClassBuilder::FoldsASCII() {
+  return ((upper_ ^ lower_) & AlphaMask) == 0;
+}
+
+// Returns a newly allocated builder holding the same ranges,
+// ASCII bitmaps and rune count as this one.
+CharClassBuilder* CharClassBuilder::Copy() {
+  CharClassBuilder* cc = new CharClassBuilder;
+  for (iterator it = begin(); it != end(); ++it)
+    cc->ranges_.insert(RuneRange(it->lo, it->hi));
+  cc->upper_ = upper_;
+  cc->lower_ = lower_;
+  cc->nrunes_ = nrunes_;
+  return cc;
+}
+
+
+
+// Removes every rune greater than r from the class.
+void CharClassBuilder::RemoveAbove(Rune r) {
+  if (r >= Runemax)
+    return;
+
+  // Clear the ASCII-letter bitmap bits for letters above r.
+  if (r < 'z') {
+    if (r < 'a')
+      lower_ = 0;
+    else
+      lower_ &= AlphaMask >> ('z' - r);
+  }
+
+  if (r < 'Z') {
+    if (r < 'A')
+      upper_ = 0;
+    else
+      upper_ &= AlphaMask >> ('Z' - r);
+  }
+
+  // Repeatedly pull out a range overlapping [r+1, Runemax];
+  // if it starts at or below r, put back the part that survives.
+  for (;;) {
+
+    iterator it = ranges_.find(RuneRange(r + 1, Runemax));
+    if (it == end())
+      break;
+    RuneRange rr = *it;
+    ranges_.erase(it);
+    nrunes_ -= rr.hi - rr.lo + 1;
+    if (rr.lo <= r) {
+      rr.hi = r;
+      ranges_.insert(rr);
+      nrunes_ += rr.hi - rr.lo + 1;
+    }
+  }
+}
+
+// Replaces the class with its complement over [0, Runemax].
+void CharClassBuilder::Negate() {
+  // Build up negation and then copy in.
+  // Could edit ranges in place, but C++ won't let me.
+  vector<RuneRange> v;
+  v.reserve(ranges_.size() + 1);
+
+  // In negation, first range begins at 0, unless
+  // the current class begins at 0.
+  iterator it = begin();
+  if (it == end()) {
+    v.push_back(RuneRange(0, Runemax));
+  } else {
+    int nextlo = 0;
+    if (it->lo == 0) {
+      nextlo = it->hi + 1;
+      ++it;
+    }
+    for (; it != end(); ++it) {
+      v.push_back(RuneRange(nextlo, it->lo - 1));
+      nextlo = it->hi + 1;
+    }
+    if (nextlo <= Runemax)
+      v.push_back(RuneRange(nextlo, Runemax));
+  }
+
+  ranges_.clear();
+  for (size_t i = 0; i < v.size(); i++)
+    ranges_.insert(v[i]);
+
+  upper_ = AlphaMask & ~upper_;
+  lower_ = AlphaMask & ~lower_;
+  nrunes_ = Runemax+1 - nrunes_;
+}
+
+// Character class is a sorted list of ranges.
+// The ranges are allocated in the same block as the header,
+// necessitating a special allocator and Delete method.
+
+// Allocates one byte block holding the CharClass header followed by
+// room for maxranges RuneRanges, and points ranges_ just past the header.
+CharClass* CharClass::New(int maxranges) {
+  CharClass* cc;
+  uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
+  cc = reinterpret_cast<CharClass*>(data);
+  cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
+  cc->nranges_ = 0;
+  cc->folds_ascii_ = false;
+  cc->nrunes_ = 0;
+  return cc;
+}
+
+// Releases the byte block obtained in New.
+// NOTE(review): calling a member function through a null pointer is
+// undefined behavior, so the NULL test below cannot be relied on;
+// callers should check the pointer themselves - confirm call sites.
+void CharClass::Delete() {
+  if (this == NULL)
+    return;
+  uint8 *data = reinterpret_cast<uint8*>(this);
+  delete[] data;
+}
+
+// Returns a newly allocated class holding the complement of this one
+// over [0, Runemax].
+CharClass* CharClass::Negate() {
+  CharClass* cc = CharClass::New(nranges_+1);
+  cc->folds_ascii_ = folds_ascii_;
+  cc->nrunes_ = Runemax + 1 - nrunes_;
+  int n = 0;
+  int nextlo = 0;
+  for (CharClass::iterator it = begin(); it != end(); ++it) {
+    if (it->lo == nextlo) {
+      nextlo = it->hi + 1;
+    } else {
+      cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
+      nextlo = it->hi + 1;
+    }
+  }
+  if (nextlo <= Runemax)
+    cc->ranges_[n++] = RuneRange(nextlo, Runemax);
+  cc->nranges_ = n;
+  return cc;
+}
+
+// Binary search over the sorted ranges for one that covers r.
+bool CharClass::Contains(Rune r) {
+  RuneRange* rr = ranges_;
+  int n = nranges_;
+  while (n > 0) {
+    int m = n/2;
+    if (rr[m].hi < r) {
+      rr += m+1;
+      n -= m+1;
+    } else if (r < rr[m].lo) {
+      n = m;
+    } else {  // rr[m].lo <= r && r <= rr[m].hi
+      return true;
+    }
+  }
+  return false;
+}
+
+// Copies the builder's ranges, rune count and FoldsASCII() result
+// into a newly allocated CharClass.
+CharClass* CharClassBuilder::GetCharClass() {
+  CharClass* cc = CharClass::New(ranges_.size());
+  int n = 0;
+  for (iterator it = begin(); it != end(); ++it)
+    cc->ranges_[n++] = *it;
+  cc->nranges_ = n;
+  DCHECK_LE(n, ranges_.size());
+  cc->nrunes_ = nrunes_;
+  cc->folds_ascii_ = FoldsASCII();
+  return cc;
+}
+
+}  // namespace re2
diff --git a/outside/re2/re2/regexp.h b/outside/re2/re2/regexp.h
new file mode 100644
index 000000000..331c01767
--- /dev/null
+++ b/outside/re2/re2/regexp.h
@@ -0,0 +1,633 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// --- SPONSORED LINK --------------------------------------------------
+// If you want to use this library for regular expression matching,
+// you should use re2/re2.h, which provides a class RE2 that
+// mimics the PCRE interface provided by PCRE's C++ wrappers.
+// This header describes the low-level interface used to implement RE2
+// and may change in backwards-incompatible ways from time to time.
+// In contrast, RE2's interface will not.
+// ---------------------------------------------------------------------
+
+// Regular expression library: parsing, execution, and manipulation
+// of regular expressions.
+//
+// Any operation that traverses the Regexp structures should be written
+// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
+// regular expressions such as x++++++++++++++++++++... might cause recursive
+// traversals to overflow the stack.
+//
+// It is the caller's responsibility to provide appropriate mutual exclusion
+// around manipulation of the regexps. RE2 does this.
+//
+// PARSING
+//
+// Regexp::Parse parses regular expressions encoded in UTF-8.
+// The default syntax is POSIX extended regular expressions,
+// with the following changes:
+//
+//   1. Backreferences (optional in POSIX EREs) are not supported.
+//      (Supporting them precludes the use of DFA-based
+//      matching engines.)
+//
+//   2. Collating elements and collation classes are not supported.
+//      (No one has needed or wanted them.)
+//
+// The exact syntax accepted can be modified by passing flags to
+// Regexp::Parse. In particular, many of the basic Perl additions
+// are available. The flags are documented below (search for LikePerl).
+//
+// If parsed with the flag Regexp::Latin1, both the regular expression
+// and the input to the matching routines are assumed to be encoded in
+// Latin-1, not UTF-8.
+//
+// EXECUTION
+//
+// Once Regexp has parsed a regular expression, it provides methods
+// to search text using that regular expression. These methods are
+// implemented via calling out to other regular expression libraries.
+// (Let's call them the sublibraries.)
+//
+// To call a sublibrary, Regexp does not simply prepare a
+// string version of the regular expression and hand it to the
+// sublibrary. Instead, Regexp prepares, from its own parsed form, the
+// corresponding internal representation used by the sublibrary.
+// This has the drawback of needing to know the internal representation
+// used by the sublibrary, but it has two important benefits:
+//
+//   1. The syntax and meaning of regular expressions is guaranteed
+//      to be that used by Regexp's parser, not the syntax expected
+//      by the sublibrary. Regexp might accept a restricted or
+//      expanded syntax for regular expressions as compared with
+//      the sublibrary. As long as Regexp can translate from its
+//      internal form into the sublibrary's, clients need not know
+//      exactly which sublibrary they are using.
+//
+//   2. The sublibrary parsers are bypassed. For whatever reason,
+//      sublibrary regular expression parsers often have security
+//      problems. For example, plan9grep's regular expression parser
+//      has a buffer overflow in its handling of large character
+//      classes, and PCRE's parser has had buffer overflow problems
+//      in the past. Security-team requires sandboxing of sublibrary
+//      regular expression parsers. Avoiding the sublibrary parsers
+//      avoids the sandbox.
+//
+// The execution methods we use now are provided by the compiled form,
+// Prog, described in prog.h
+//
+// MANIPULATION
+//
+// Unlike other regular expression libraries, Regexp makes its parsed
+// form accessible to clients, so that client code can analyze the
+// parsed regular expressions.
+
+#ifndef RE2_REGEXP_H__
+#define RE2_REGEXP_H__
+
+#include "util/util.h"
+#include "re2/stringpiece.h"
+
+namespace re2 {
+
+// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
+enum RegexpOp {
+  // Matches no strings.
+  kRegexpNoMatch = 1,
+
+  // Matches empty string.
+  kRegexpEmptyMatch,
+
+  // Matches rune_.
+  kRegexpLiteral,
+
+  // Matches runes_.
+  kRegexpLiteralString,
+
+  // Matches concatenation of sub_[0..nsub-1].
+  kRegexpConcat,
+  // Matches union of sub_[0..nsub-1].
+  kRegexpAlternate,
+
+  // Matches sub_[0] zero or more times.
+  kRegexpStar,
+  // Matches sub_[0] one or more times.
+  kRegexpPlus,
+  // Matches sub_[0] zero or one times.
+  kRegexpQuest,
+
+  // Matches sub_[0] at least min_ times, at most max_ times.
+  // max_ == -1 means no upper limit.
+  kRegexpRepeat,
+
+  // Parenthesized (capturing) subexpression. Index is cap_.
+  // Optionally, capturing name is name_.
+  kRegexpCapture,
+
+  // Matches any character.
+  kRegexpAnyChar,
+
+  // Matches any byte [sic].
+  kRegexpAnyByte,
+
+  // Matches empty string at beginning of line.
+  kRegexpBeginLine,
+  // Matches empty string at end of line.
+  kRegexpEndLine,
+
+  // Matches word boundary "\b".
+  kRegexpWordBoundary,
+  // Matches not-a-word boundary "\B".
+  kRegexpNoWordBoundary,
+
+  // Matches empty string at beginning of text.
+  kRegexpBeginText,
+  // Matches empty string at end of text.
+  kRegexpEndText,
+
+  // Matches character class given by cc_.
+  kRegexpCharClass,
+
+  // Forces match of entire expression right now,
+  // with match ID match_id_ (used by RE2::Set).
+  kRegexpHaveMatch,
+
+  kMaxRegexpOp = kRegexpHaveMatch,
+};
+
+// Keep in sync with string list in regexp.cc
+enum RegexpStatusCode {
+  // No error
+  kRegexpSuccess = 0,
+
+  // Unexpected error
+  kRegexpInternalError,
+
+  // Parse errors
+  kRegexpBadEscape,          // bad escape sequence
+  kRegexpBadCharClass,       // bad character class
+  kRegexpBadCharRange,       // bad character class range
+  kRegexpMissingBracket,     // missing closing ]
+  kRegexpMissingParen,       // missing closing )
+  kRegexpTrailingBackslash,  // at end of regexp
+  kRegexpRepeatArgument,     // repeat argument missing, e.g. "*"
+  kRegexpRepeatSize,         // bad repetition argument
+  kRegexpRepeatOp,           // bad repetition operator
+  kRegexpBadPerlOp,          // bad perl operator
+  kRegexpBadUTF8,            // invalid UTF-8 in regexp
+  kRegexpBadNamedCapture,    // bad named capture
+};
+
+// Error status for certain operations.
+class RegexpStatus {
+ public:
+  RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
+  ~RegexpStatus() { delete tmp_; }
+
+  void set_code(enum RegexpStatusCode code) { code_ = code; }
+  void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
+  void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
+  enum RegexpStatusCode code() const { return code_; }
+  const StringPiece& error_arg() const { return error_arg_; }
+  bool ok() const { return code() == kRegexpSuccess; }
+
+  // Copies state from status.
+  void Copy(const RegexpStatus& status);
+
+  // Returns text equivalent of code, e.g.:
+  //   "Bad character class"
+  static string CodeText(enum RegexpStatusCode code);
+
+  // Returns text describing error, e.g.:
+  //   "Bad character class: [z-a]"
+  string Text() const;
+
+ private:
+  enum RegexpStatusCode code_;  // Kind of error
+  StringPiece error_arg_;       // Piece of regexp containing syntax error.
+  string* tmp_;                 // Temporary storage, possibly where error_arg_ is.
+
+  DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus);
+};
+
+// Walker to implement Simplify.
+class SimplifyWalker;
+
+// Compiled form; see prog.h
+class Prog;
+
+struct RuneRange {
+  RuneRange() : lo(0), hi(0) { }
+  RuneRange(int l, int h) : lo(l), hi(h) { }
+  Rune lo;
+  Rune hi;
+};
+
+// Less-than on RuneRanges treats a == b if they overlap at all.
+// This lets us look in a set to find the range covering a particular Rune.
+struct RuneRangeLess {
+  bool operator()(const RuneRange& a, const RuneRange& b) const {
+    return a.hi < b.lo;
+  }
+};
+
+class CharClassBuilder;
+
+class CharClass {
+ public:
+  void Delete();
+
+  typedef RuneRange* iterator;
+  iterator begin() { return ranges_; }
+  iterator end() { return ranges_ + nranges_; }
+
+  int size() { return nrunes_; }
+  bool empty() { return nrunes_ == 0; }
+  bool full() { return nrunes_ == Runemax+1; }
+  bool FoldsASCII() { return folds_ascii_; }
+
+  bool Contains(Rune r);
+  CharClass* Negate();
+
+ private:
+  CharClass();  // not implemented
+  ~CharClass();  // not implemented
+  static CharClass* New(int maxranges);
+
+  friend class CharClassBuilder;
+
+  bool folds_ascii_;
+  int nrunes_;
+  RuneRange *ranges_;
+  int nranges_;
+  DISALLOW_EVIL_CONSTRUCTORS(CharClass);
+};
+
+class Regexp {
+ public:
+
+  // Flags for parsing. Can be ORed together.
+  enum ParseFlags {
+    NoParseFlags = 0,
+    FoldCase     = 1<<0,   // Fold case during matching (case-insensitive).
+    Literal      = 1<<1,   // Treat s as literal string instead of a regexp.
+    ClassNL      = 1<<2,   // Allow char classes like [^a-z] and \D and \s
+                           // and [[:space:]] to match newline.
+    DotNL        = 1<<3,   // Allow . to match newline.
+    MatchNL      = ClassNL | DotNL,
+    OneLine      = 1<<4,   // Treat ^ and $ as only matching at beginning and
+                           // end of text, not around embedded newlines.
+                           // (Perl's default)
+    Latin1       = 1<<5,   // Regexp and text are in Latin1, not UTF-8.
+    NonGreedy    = 1<<6,   // Repetition operators are non-greedy by default.
+    PerlClasses  = 1<<7,   // Allow Perl character classes like \d.
+    PerlB        = 1<<8,   // Allow Perl's \b and \B.
+    PerlX        = 1<<9,   // Perl extensions:
+                           //   non-capturing parens - (?: )
+                           //   non-greedy operators - *? +? ?? {}?
+                           //   flag edits - (?i) (?-i) (?i: )
+                           //     i - FoldCase
+                           //     m - !OneLine
+                           //     s - DotNL
+                           //     U - NonGreedy
+                           //   line ends: \A \z
+                           //   \Q and \E to disable/enable metacharacters
+                           //   (?P<name>expr) for named captures
+                           //   \C to match any single byte
+    UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
+                           //   and \P{Han} for its negation.
+    NeverNL      = 1<<11,  // Never match NL, even if the regexp mentions
+                           //   it explicitly.
+    NeverCapture = 1<<12,  // Parse all parens as non-capturing.
+
+    // As close to Perl as we can get.
+    LikePerl     = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
+                   UnicodeGroups,
+
+    // Internal use only.
+    WasDollar    = 1<<15,  // on kRegexpEndText: was $ in regexp text
+  };
+
+  // Get. No set, Regexps are logically immutable once created.
+  RegexpOp op() { return static_cast<RegexpOp>(op_); }
+  int nsub() { return nsub_; }
+  bool simple() { return simple_; }
+  enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
+  int Ref();  // For testing.
+
+  Regexp** sub() {
+    if(nsub_ <= 1)
+      return &subone_;
+    else
+      return submany_;
+  }
+
+  int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
+  int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
+  Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
+  CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
+  int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
+  const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
+  Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
+  int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
+  int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
+
+  // Increments reference count, returns object as convenience.
+  Regexp* Incref();
+
+  // Decrements reference count and deletes this object if count reaches 0.
+  void Decref();
+
+  // Parses string s to produce regular expression, returned.
+  // Caller must release return value with re->Decref().
+  // On failure, sets *status (if status != NULL) and returns NULL.
+  static Regexp* Parse(const StringPiece& s, ParseFlags flags,
+                       RegexpStatus* status);
+
+  // Returns a _new_ simplified version of the current regexp.
+  // Does not edit the current regexp.
+  // Caller must release return value with re->Decref().
+  // Simplified means that counted repetition has been rewritten
+  // into simpler terms and all Perl/POSIX features have been
+  // removed. The result will capture exactly the same
+  // subexpressions the original did, unless formatted with ToString.
+  Regexp* Simplify();
+  friend class SimplifyWalker;
+
+  // Parses the regexp src and then simplifies it and sets *dst to the
+  // string representation of the simplified form. Returns true on success.
+  // Returns false and sets *status (if status != NULL) on parse error.
+  static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
+                             string* dst,
+                             RegexpStatus* status);
+
+  // Returns the number of capturing groups in the regexp.
+  int NumCaptures();
+  friend class NumCapturesWalker;
+
+  // Returns a map from names to capturing group indices,
+  // or NULL if the regexp contains no named capture groups.
+  // The caller is responsible for deleting the map.
+  map<string, int>* NamedCaptures();
+
+  // Returns a map from capturing group indices to capturing group
+  // names or NULL if the regexp contains no named capture groups. The
+  // caller is responsible for deleting the map.
+  map<int, string>* CaptureNames();
+
+  // Returns a string representation of the current regexp,
+  // using as few parentheses as possible.
+  string ToString();
+
+  // Convenience functions. They consume the passed reference,
+  // so in many cases you should use, e.g., Plus(re->Incref(), flags).
+  // They do not consume allocated arrays like subs or runes.
+  static Regexp* Plus(Regexp* sub, ParseFlags flags);
+  static Regexp* Star(Regexp* sub, ParseFlags flags);
+  static Regexp* Quest(Regexp* sub, ParseFlags flags);
+  static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
+  static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
+  static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
+  static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
+  static Regexp* NewLiteral(Rune rune, ParseFlags flags);
+  static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
+  static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
+  static Regexp* HaveMatch(int match_id, ParseFlags flags);
+
+  // Like Alternate but does not factor out common prefixes.
+  static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
+
+  // Debugging function. Returns string format for regexp
+  // that makes structure clear. Does NOT use regexp syntax.
+  string Dump();
+
+  // Helper traversal class, defined fully in walker-inl.h.
+  template<typename T> class Walker;
+
+  // Compile to Prog. See prog.h
+  // Reverse prog expects to be run over text backward.
+  // Construction and execution of prog will
+  // stay within approximately max_mem bytes of memory.
+  // If max_mem <= 0, a reasonable default is used.
+  Prog* CompileToProg(int64 max_mem);
+  Prog* CompileToReverseProg(int64 max_mem);
+
+  // Whether to expect this library to find exactly the same answer as PCRE
+  // when running this regexp. Most regexps do mimic PCRE exactly, but a few
+  // obscure cases behave differently. Technically this is more a property
+  // of the Prog than the Regexp, but the computation is much easier to do
+  // on the Regexp. See mimics_pcre.cc for the exact conditions.
+  bool MimicsPCRE();
+
+  // Benchmarking function.
+  void NullWalk();
+
+  // Whether every match of this regexp must be anchored and
+  // begin with a non-empty fixed string (perhaps after ASCII
+  // case-folding). If so, returns the prefix and the sub-regexp that
+  // follows it.
+  bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix);
+
+ private:
+  // Constructor allocates vectors as appropriate for operator.
+  explicit Regexp(RegexpOp op, ParseFlags parse_flags);
+
+  // Use Decref() instead of delete to release Regexps.
+  // This is private to catch deletes at compile time.
+  ~Regexp();
+  void Destroy();
+  bool QuickDestroy();
+
+  // Helpers for Parse. Listed here so they can edit Regexps.
+  class ParseState;
+  friend class ParseState;
+  friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
+                             RegexpStatus* status);
+
+  // Helper for testing [sic].
+  friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
+
+  // Computes whether Regexp is already simple.
+  bool ComputeSimple();
+
+  // Constructor that generates a concatenation or alternation,
+  // enforcing the limit on the number of subexpressions for
+  // a particular Regexp.
+  static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
+                                   ParseFlags flags, bool can_factor);
+
+  // Returns the leading string that re starts with.
+  // The returned Rune* points into a piece of re,
+  // so it must not be used after the caller calls re->Decref().
+  static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
+
+  // Removes the first n leading runes from the beginning of re.
+  // Edits re in place.
+  static void RemoveLeadingString(Regexp* re, int n);
+
+  // Returns the leading regexp in re's top-level concatenation.
+  // The returned Regexp* points at re or a sub-expression of re,
+  // so it must not be used after the caller calls re->Decref().
+  static Regexp* LeadingRegexp(Regexp* re);
+
+  // Removes LeadingRegexp(re) from re and returns the remainder.
+  // Might edit re in place.
+  static Regexp* RemoveLeadingRegexp(Regexp* re);
+
+  // Simplifies an alternation of literal strings by factoring out
+  // common prefixes.
+  static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
+  static int FactorAlternationRecursive(Regexp** sub, int nsub,
+                                        ParseFlags flags, int maxdepth);
+
+  // Is a == b? Only efficient on regexps that have not been through
+  // Simplify yet - the expansion of a kRegexpRepeat will make this
+  // take a long time. Do not call on such regexps, hence private.
+  static bool Equal(Regexp* a, Regexp* b);
+
+  // Allocate space for n sub-regexps.
+  void AllocSub(int n) {
+    if (n < 0 || static_cast<uint16>(n) != n)
+      LOG(FATAL) << "Cannot AllocSub " << n;
+    if (n > 1)
+      submany_ = new Regexp*[n];
+    nsub_ = n;
+  }
+
+  // Add Rune to LiteralString
+  void AddRuneToString(Rune r);
+
+  // Swaps this with that, in place.
+  void Swap(Regexp *that);
+
+  // Operator. See description of operators above.
+  // uint8 instead of RegexpOp to control space usage.
+  uint8 op_;
+
+  // Is this regexp structure already simple
+  // (has it been returned by Simplify)?
+  // uint8 instead of bool to control space usage.
+  uint8 simple_;
+
+  // Flags saved from parsing and used during execution.
+  // (Only FoldCase is used.)
+  // uint16 instead of ParseFlags to control space usage.
+  uint16 parse_flags_;
+
+  // Reference count. Exists so that SimplifyRegexp can build
+  // regexp structures that are dags rather than trees to avoid
+  // exponential blowup in space requirements.
+  // uint16 to control space usage.
+  // The standard regexp routines will never generate a
+  // ref greater than the maximum repeat count (100),
+  // but even so, Incref and Decref consult an overflow map
+  // when ref_ reaches kMaxRef.
+  uint16 ref_;
+  static const uint16 kMaxRef = 0xffff;
+
+  // Subexpressions.
+  // uint16 to control space usage.
+  // Concat and Alternate handle larger numbers of subexpressions
+  // by building concatenation or alternation trees.
+  // Other routines should call Concat or Alternate instead of
+  // filling in sub() by hand.
+  uint16 nsub_;
+  static const uint16 kMaxNsub = 0xffff;
+  union {
+    Regexp** submany_;  // if nsub_ > 1
+    Regexp* subone_;  // if nsub_ == 1
+  };
+
+  // Extra space for parse and teardown stacks.
+  Regexp* down_;
+
+  // Arguments to operator. See description of operators above.
+  union {
+    struct {  // Repeat
+      int max_;
+      int min_;
+    };
+    struct {  // Capture
+      int cap_;
+      string* name_;
+    };
+    struct {  // LiteralString
+      int nrunes_;
+      Rune* runes_;
+    };
+    struct {  // CharClass
+      // These two could be in separate union members,
+      // but it wouldn't save any space (there are other two-word structs)
+      // and keeping them separate avoids confusion during parsing.
+      CharClass* cc_;
+      CharClassBuilder* ccb_;
+    };
+    Rune rune_;  // Literal
+    int match_id_;  // HaveMatch
+    void *the_union_[2];  // as big as any other element, for memset
+  };
+
+  DISALLOW_EVIL_CONSTRUCTORS(Regexp);
+};
+
+// Character class set: contains non-overlapping, non-abutting RuneRanges.
+typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
+
+class CharClassBuilder {
+ public:
+  CharClassBuilder();
+
+  typedef RuneRangeSet::iterator iterator;
+  iterator begin() { return ranges_.begin(); }
+  iterator end() { return ranges_.end(); }
+
+  int size() { return nrunes_; }
+  bool empty() { return nrunes_ == 0; }
+  bool full() { return nrunes_ == Runemax+1; }
+
+  bool Contains(Rune r);
+  bool FoldsASCII();
+  bool AddRange(Rune lo, Rune hi);  // returns whether class changed
+  CharClassBuilder* Copy();
+  void AddCharClass(CharClassBuilder* cc);
+  void Negate();
+  void RemoveAbove(Rune r);
+  CharClass* GetCharClass();
+  void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
+
+ private:
+  static const uint32 AlphaMask = (1<<26) - 1;
+  uint32 upper_;  // bitmap of A-Z
+  uint32 lower_;  // bitmap of a-z
+  int nrunes_;
+  RuneRangeSet ranges_;
+  DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
+};
+
+// Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
+inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
+{
+  return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(b));
+}
+
+inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b)
+{
+  return static_cast<Regexp::ParseFlags>(static_cast<int>(a) ^ static_cast<int>(b));
+}
+
+inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b)
+{
+  return static_cast<Regexp::ParseFlags>(static_cast<int>(a) & static_cast<int>(b));
+}
+
+inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
+{
+  return static_cast<Regexp::ParseFlags>(~static_cast<int>(a));
+}
+
+
+
+}  // namespace re2
+
+#endif  // RE2_REGEXP_H__
diff --git a/outside/re2/re2/set.cc b/outside/re2/re2/set.cc
new file mode 100644
index 000000000..2bcd30acb
--- /dev/null
+++ b/outside/re2/re2/set.cc
@@ -0,0 +1,113 @@
+// Copyright 2010 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "re2/set.h"
+
+#include "util/util.h"
+#include "re2/stringpiece.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+
+using namespace re2;
+
+RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) {
+  options_.Copy(options);
+  anchor_ = anchor;
+  prog_ = NULL;
+  compiled_ = false;
+}
+
+RE2::Set::~Set() {
+  for (size_t i = 0; i < re_.size(); i++)
+    re_[i]->Decref();
+  delete prog_;
+}
+
+int RE2::Set::Add(const StringPiece& pattern, string* error) {
+  if (compiled_) {
+    LOG(DFATAL) << "RE2::Set::Add after Compile";
+    return -1;
+  }
+
+  Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
+    options_.ParseFlags());
+
+  RegexpStatus status;
+  re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
+  if (re == NULL) {
+    if (error != NULL)
+      *error = status.Text();
+    if (options_.log_errors())
+      LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
+    return -1;
+  }
+
+  // Concatenate with match index and push on vector.
+  int n = re_.size();
+  re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
+  if (re->op() == kRegexpConcat) {
+    int nsub = re->nsub();
+    re2::Regexp** sub = new re2::Regexp*[nsub + 1];
+    for (int i = 0; i < nsub; i++)
+      sub[i] = re->sub()[i]->Incref();
+    sub[nsub] = m;
+    re->Decref();
+    re = re2::Regexp::Concat(sub, nsub + 1, pf);
+    delete[] sub;
+  } else {
+    re2::Regexp* sub[2];
+    sub[0] = re;
+    sub[1] = m;
+    re = re2::Regexp::Concat(sub, 2, pf);
+  }
+  re_.push_back(re);
+  return n;
+}
+
+bool RE2::Set::Compile() {
+  if (compiled_) {
+    LOG(DFATAL) << "RE2::Set::Compile multiple times";
+    return false;
+  }
+  compiled_ = true;
+
+  Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
+    options_.ParseFlags());
+  re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]),
+                                           re_.size(), pf);
+  re_.clear();
+  re2::Regexp* sre = re->Simplify();
+  re->Decref();
+  re = sre;
+  if (re == NULL) {
+    if (options_.log_errors())
+      LOG(ERROR) << "Error simplifying during Compile.";
+    return false;
+  }
+
+  prog_ = Prog::CompileSet(options_, anchor_, re);
+  return prog_ != NULL;
+}
+
+bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
+  if (!compiled_) {
+    LOG(DFATAL) << "RE2::Set::Match without Compile";
+    return false;
+  }
+  v->clear();
+  bool failed;
+  bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
+                              Prog::kManyMatch, NULL, &failed, v);
+  if (failed)
+    LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
+
+  if (ret == false)
+    return false;
+  if (v->size() == 0) {
+    LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
+    return false;
+  }
+  return true;
+}
diff --git a/outside/re2/re2/set.h b/outside/re2/re2/set.h
new file mode 100644
index 000000000..d7164257f
--- /dev/null
+++ b/outside/re2/re2/set.h
@@ -0,0 +1,55 @@
+// Copyright 2010 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_SET_H
+#define RE2_SET_H
+
+#include <utility>
+#include <vector>
+
+#include "re2/re2.h"
+
+namespace re2 {
+using std::vector;
+
+// An RE2::Set represents a collection of regexps that can
+// be searched for simultaneously.
+class RE2::Set {
+ public:
+  Set(const RE2::Options& options, RE2::Anchor anchor);
+  ~Set();
+
+  // Add adds regexp pattern to the set, interpreted using the RE2 options.
+  // (The RE2 constructor's default options parameter is RE2::UTF8.)
+  // Add returns the regexp index that will be used to identify
+  // it in the result of Match, or -1 if the regexp cannot be parsed.
+  // Indices are assigned in sequential order starting from 0.
+  // Error returns do not increment the index.
+  // If an error occurs and error != NULL, *error will hold an error message.
+  int Add(const StringPiece& pattern, string* error);
+
+  // Compile prepares the Set for matching.
+  // Add must not be called again after Compile.
+  // Compile must be called before Match.
+  // Compile may return false if it runs out of memory.
+  bool Compile();
+
+  // Match returns true if text matches any of the regexps in the set.
+  // If so, it fills v with the indices of the matching regexps.
+  bool Match(const StringPiece& text, vector<int>* v) const;
+
+ private:
+  RE2::Options options_;
+  RE2::Anchor anchor_;
+  vector<re2::Regexp*> re_;
+  re2::Prog* prog_;
+  bool compiled_;
+  //DISALLOW_EVIL_CONSTRUCTORS(Set);
+  Set(const Set&);
+  void operator=(const Set&);
+};
+
+}  // namespace re2
+
+#endif  // RE2_SET_H
diff --git a/outside/re2/re2/simplify.cc b/outside/re2/re2/simplify.cc
new file mode 100644
index 000000000..faf32084e
--- /dev/null
+++ b/outside/re2/re2/simplify.cc
@@ -0,0 +1,393 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Rewrite POSIX and other features in re
+// to use simple extended regular expression features.
+// Also sort and simplify character classes.
+
+#include "util/util.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Parses the regexp src and then simplifies it and sets *dst to the
+// string representation of the simplified form. Returns true on success.
+// Returns false and sets *status (if status != NULL) on error.
+bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
+                            string* dst,
+                            RegexpStatus* status) {
+  Regexp* re = Parse(src, flags, status);
+  if (re == NULL)
+    return false;
+  Regexp* sre = re->Simplify();
+  re->Decref();
+  if (sre == NULL) {
+    // Should not happen, since Simplify never fails.
+    LOG(ERROR) << "Simplify failed on " << src;
+    if (status) {
+      status->set_code(kRegexpInternalError);
+      status->set_error_arg(src);
+    }
+    return false;
+  }
+  *dst = sre->ToString();
+  sre->Decref();
+  return true;
+}
+
+// Assuming the simple_ flags on the children are accurate,
+// is this Regexp* simple?
+bool Regexp::ComputeSimple() {
+  Regexp** subs;
+  switch (op_) {
+    case kRegexpNoMatch:
+    case kRegexpEmptyMatch:
+    case kRegexpLiteral:
+    case kRegexpLiteralString:
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpBeginText:
+    case kRegexpWordBoundary:
+    case kRegexpNoWordBoundary:
+    case kRegexpEndText:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+    case kRegexpHaveMatch:
+      return true;
+    case kRegexpConcat:
+    case kRegexpAlternate:
+      // These are simple as long as the subpieces are simple.
+      subs = sub();
+      for (int i = 0; i < nsub_; i++)
+        if (!subs[i]->simple_)
+          return false;
+      return true;
+    case kRegexpCharClass:
+      // Simple as long as the char class is not empty, not full.
+      if (ccb_ != NULL)
+        return !ccb_->empty() && !ccb_->full();
+      return !cc_->empty() && !cc_->full();
+    case kRegexpCapture:
+      subs = sub();
+      return subs[0]->simple_;
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest:
+      subs = sub();
+      if (!subs[0]->simple_)
+        return false;
+      switch (subs[0]->op_) {
+        case kRegexpStar:
+        case kRegexpPlus:
+        case kRegexpQuest:
+        case kRegexpEmptyMatch:
+        case kRegexpNoMatch:
+          return false;
+        default:
+          break;
+      }
+      return true;
+    case kRegexpRepeat:
+      return false;
+  }
+  LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
+  return false;
+}
+
+// Walker subclass used by Simplify.
+// The simplify walk is purely post-recursive: given the simplified children,
+// PostVisit creates the simplified result.
+// The child_args are simplified Regexp*s.
+class SimplifyWalker : public Regexp::Walker<Regexp*> {
+ public:
+  SimplifyWalker() {}
+  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
+  virtual Regexp* PostVisit(Regexp* re,
+                            Regexp* parent_arg,
+                            Regexp* pre_arg,
+                            Regexp** child_args, int nchild_args);
+  virtual Regexp* Copy(Regexp* re);
+  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
+
+ private:
+  // These functions are declared inside SimplifyWalker so that
+  // they can edit the private fields of the Regexps they construct.
+
+  // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
+  // Caller must Decref return value when done with it.
+  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
+
+  // Simplifies the expression re{min,max} in terms of *, +, and ?.
+  // Returns a new regexp. Does not edit re. Does not consume reference to re.
+  // Caller must Decref return value when done with it.
+  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
+                                Regexp::ParseFlags parse_flags);
+
+  // Simplifies a character class by expanding any named classes
+  // into rune ranges. Does not edit re. Does not consume ref to re.
+  // Caller must Decref return value when done with it.
+  static Regexp* SimplifyCharClass(Regexp* re);
+
+  DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
+};
+
+// Simplifies a regular expression, returning a new regexp.
+// The new regexp uses traditional Unix egrep features only,
+// plus the Perl (?:) non-capturing parentheses.
+// Otherwise, no POSIX or Perl additions. The new regexp
+// captures exactly the same subexpressions (with the same indices)
+// as the original.
+// Does not edit current object.
+// Caller must Decref() return value when done with it.
+
+Regexp* Regexp::Simplify() {
+  if (simple_)
+    return Incref();
+  SimplifyWalker w;
+  return w.Walk(this, NULL);
+}
+
+#define Simplify DontCallSimplify  // Avoid accidental recursion
+
+Regexp* SimplifyWalker::Copy(Regexp* re) {
+  return re->Incref();
+}
+
+Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
+  // This should never be called, since we use Walk and not
+  // WalkExponential.
+  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
+  return re->Incref();
+}
+
+Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
+  if (re->simple_) {
+    *stop = true;
+    return re->Incref();
+  }
+  return NULL;
+}
+
+Regexp* SimplifyWalker::PostVisit(Regexp* re,
+                                  Regexp* parent_arg,
+                                  Regexp* pre_arg,
+                                  Regexp** child_args,
+                                  int nchild_args) {
+  switch (re->op()) {
+    case kRegexpNoMatch:
+    case kRegexpEmptyMatch:
+    case kRegexpLiteral:
+    case kRegexpLiteralString:
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpBeginText:
+    case kRegexpWordBoundary:
+    case kRegexpNoWordBoundary:
+    case kRegexpEndText:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+    case kRegexpHaveMatch:
+      // All these are always simple.
+      re->simple_ = true;
+      return re->Incref();
+
+    case kRegexpConcat:
+    case kRegexpAlternate: {
+      // These are simple as long as the subpieces are simple.
+      // Two passes to avoid allocation in the common case.
+      bool changed = false;
+      Regexp** subs = re->sub();
+      for (int i = 0; i < re->nsub_; i++) {
+        Regexp* sub = subs[i];
+        Regexp* newsub = child_args[i];
+        if (newsub != sub) {
+          changed = true;
+          break;
+        }
+      }
+      if (!changed) {
+        for (int i = 0; i < re->nsub_; i++) {
+          Regexp* newsub = child_args[i];
+          newsub->Decref();
+        }
+        re->simple_ = true;
+        return re->Incref();
+      }
+      Regexp* nre = new Regexp(re->op(), re->parse_flags());
+      nre->AllocSub(re->nsub_);
+      Regexp** nre_subs = nre->sub();
+      for (int i = 0; i < re->nsub_; i++)
+        nre_subs[i] = child_args[i];
+      nre->simple_ = true;
+      return nre;
+    }
+
+    case kRegexpCapture: {
+      Regexp* newsub = child_args[0];
+      if (newsub == re->sub()[0]) {
+        newsub->Decref();
+        re->simple_ = true;
+        return re->Incref();
+      }
+      Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
+      nre->AllocSub(1);
+      nre->sub()[0] = newsub;
+      nre->cap_ = re->cap_;
+      nre->simple_ = true;
+      return nre;
+    }
+
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest: {
+      Regexp* newsub = child_args[0];
+      // Special case: repeat the empty string as much as
+      // you want, but it's still the empty string.
+      if (newsub->op() == kRegexpEmptyMatch)
+        return newsub;
+
+      // These are simple as long as the subpiece is simple.
+      if (newsub == re->sub()[0]) {
+        newsub->Decref();
+        re->simple_ = true;
+        return re->Incref();
+      }
+
+      // These are also idempotent if flags are constant.
+      if (re->op() == newsub->op() &&
+          re->parse_flags() == newsub->parse_flags())
+        return newsub;
+
+      Regexp* nre = new Regexp(re->op(), re->parse_flags());
+      nre->AllocSub(1);
+      nre->sub()[0] = newsub;
+      nre->simple_ = true;
+      return nre;
+    }
+
+    case kRegexpRepeat: {
+      Regexp* newsub = child_args[0];
+      // Special case: repeat the empty string as much as
+      // you want, but it's still the empty string.
+      if (newsub->op() == kRegexpEmptyMatch)
+        return newsub;
+
+      Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
+                                   re->parse_flags());
+      newsub->Decref();
+      nre->simple_ = true;
+      return nre;
+    }
+
+    case kRegexpCharClass: {
+      Regexp* nre = SimplifyCharClass(re);
+      nre->simple_ = true;
+      return nre;
+    }
+  }
+
+  LOG(ERROR) << "Simplify case not handled: " << re->op();
+  return re->Incref();
+}
+
+// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
+// Returns a new Regexp, handing the ref to the caller.
+Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
+                                Regexp::ParseFlags parse_flags) {
+  Regexp* re = new Regexp(kRegexpConcat, parse_flags);
+  re->AllocSub(2);
+  Regexp** subs = re->sub();
+  subs[0] = re1;
+  subs[1] = re2;
+  return re;
+}
+
+// Simplifies the expression re{min,max} in terms of *, +, and ?.
+// Returns a new regexp. Does not edit re. Does not consume reference to re.
+// Caller must Decref return value when done with it.
+// The result will *not* necessarily have the right capturing parens
+// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
+// but in the Regexp* representation, both (x) are marked as $1.
+Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
+                                       Regexp::ParseFlags f) {
+  // x{n,} means at least n matches of x.
+  if (max == -1) {
+    // Special case: x{0,} is x*
+    if (min == 0)
+      return Regexp::Star(re->Incref(), f);
+
+    // Special case: x{1,} is x+
+    if (min == 1)
+      return Regexp::Plus(re->Incref(), f);
+
+    // General case: x{4,} is xxxx+
+    Regexp* nre = new Regexp(kRegexpConcat, f);
+    nre->AllocSub(min);
+    VLOG(1) << "Simplify " << min;
+    Regexp** nre_subs = nre->sub();
+    for (int i = 0; i < min-1; i++)
+      nre_subs[i] = re->Incref();
+    nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
+    return nre;
+  }
+
+  // Special case: (x){0} matches only empty string.
+  if (min == 0 && max == 0)
+    return new Regexp(kRegexpEmptyMatch, f);
+
+  // Special case: x{1} is just x.
+  if (min == 1 && max == 1)
+    return re->Incref();
+
+  // General case: x{n,m} means n copies of x and m-n copies of x?.
+  // The machine will do less work if we nest the final m-n copies,
+  // so that x{2,5} = xx(x(x(x)?)?)?
+
+  // Build leading prefix: xx. Capturing only on the last one.
+  Regexp* nre = NULL;
+  if (min > 0) {
+    nre = new Regexp(kRegexpConcat, f);
+    nre->AllocSub(min);
+    Regexp** nre_subs = nre->sub();
+    for (int i = 0; i < min; i++)
+      nre_subs[i] = re->Incref();
+  }
+
+  // Build and attach suffix: (x(x(x)?)?)?
+  if (max > min) {
+    Regexp* suf = Regexp::Quest(re->Incref(), f);
+    for (int i = min+1; i < max; i++)
+      suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
+    if (nre == NULL)
+      nre = suf;
+    else
+      nre = Concat2(nre, suf, f);
+  }
+
+  if (nre == NULL) {
+    // Some degenerate case, like min > max, or min < max < 0.
+    // This shouldn't happen, because the parser rejects such regexps.
+    LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
+    return new Regexp(kRegexpNoMatch, f);
+  }
+
+  return nre;
+}
+
+// Simplifies a character class.
+// Caller must Decref return value when done with it.
+Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
+  CharClass* cc = re->cc();
+
+  // Special cases
+  if (cc->empty())
+    return new Regexp(kRegexpNoMatch, re->parse_flags());
+  if (cc->full())
+    return new Regexp(kRegexpAnyChar, re->parse_flags());
+
+  return re->Incref();
+}
+
+}  // namespace re2
diff --git a/outside/re2/re2/stringpiece.h b/outside/re2/re2/stringpiece.h
new file mode 100644
index 000000000..ab9297c6d
--- /dev/null
+++ b/outside/re2/re2/stringpiece.h
@@ -0,0 +1,182 @@
+// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// A string-like object that points to a sized piece of memory.
+//
+// Functions or methods may use const StringPiece& parameters to accept either
+// a "const char*" or a "string" value that will be implicitly converted to
+// a StringPiece. The implicit conversion means that it is often appropriate
+// to include this .h file in other files rather than forward-declaring
+// StringPiece as would be appropriate for most other Google classes.
+//
+// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
+// conversions from "const char*" to "string" and back again.
+//
+//
+// Arghh! I wish C++ literals were "string".
+
+#ifndef STRINGS_STRINGPIECE_H__
+#define STRINGS_STRINGPIECE_H__
+
+#include <string.h>
+#include <cstddef>
+#include <iosfwd>
+#include <string>
+
+namespace re2 {
+
+class StringPiece {
+ private:
+  const char* ptr_;
+  int length_;
+
+ public:
+  // We provide non-explicit singleton constructors so users can pass
+  // in a "const char*" or a "string" wherever a "StringPiece" is
+  // expected.
+  StringPiece() : ptr_(NULL), length_(0) { }
+  StringPiece(const char* str)
+    : ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
+  StringPiece(const std::string& str)
+    : ptr_(str.data()), length_(static_cast<int>(str.size())) { }
+  StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }
+
+  // data() may return a pointer to a buffer with embedded NULs, and the
+  // returned buffer may or may not be null terminated. Therefore it is
+  // typically a mistake to pass data() to a routine that expects a NUL
+  // terminated string.
+  const char* data() const { return ptr_; }
+  int size() const { return length_; }
+  int length() const { return length_; }
+  bool empty() const { return length_ == 0; }
+
+  void clear() { ptr_ = NULL; length_ = 0; }
+  void set(const char* data, int len) { ptr_ = data; length_ = len; }
+  void set(const char* str) {
+    ptr_ = str;
+    if (str != NULL)
+      length_ = static_cast<int>(strlen(str));
+    else
+      length_ = 0;
+  }
+  void set(const void* data, int len) {
+    ptr_ = reinterpret_cast<const char*>(data);
+    length_ = len;
+  }
+
+  char operator[](int i) const { return ptr_[i]; }
+
+  void remove_prefix(int n) {
+    ptr_ += n;
+    length_ -= n;
+  }
+
+  void remove_suffix(int n) {
+    length_ -= n;
+  }
+
+  int compare(const StringPiece& x) const {
+    int r = memcmp(ptr_, x.ptr_, std::min(length_, x.length_));
+    if (r == 0) {
+      if (length_ < x.length_) r = -1;
+      else if (length_ > x.length_) r = +1;
+    }
+    return r;
+  }
+
+  std::string as_string() const {
+    return std::string(data(), size());
+  }
+  // We also define ToString() here, since many other string-like
+  // interfaces name the routine that converts to a C++ string
+  // "ToString", and it's confusing to have the method that does that
+  // for a StringPiece be called "as_string()". We also leave the
+  // "as_string()" method defined here for existing code.
+  std::string ToString() const {
+    return std::string(data(), size());
+  }
+
+  void CopyToString(std::string* target) const;
+  void AppendToString(std::string* target) const;
+
+  // Does "this" start with "x"
+  bool starts_with(const StringPiece& x) const {
+    return ((length_ >= x.length_) &&
+            (memcmp(ptr_, x.ptr_, x.length_) == 0));
+  }
+
+  // Does "this" end with "x"
+  bool ends_with(const StringPiece& x) const {
+    return ((length_ >= x.length_) &&
+            (memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
+  }
+
+  // standard STL container boilerplate
+  typedef char value_type;
+  typedef const char* pointer;
+  typedef const char& reference;
+  typedef const char& const_reference;
+  typedef size_t size_type;
+  typedef ptrdiff_t difference_type;
+  static const size_type npos;
+  typedef const char* const_iterator;
+  typedef const char* iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+  iterator begin() const { return ptr_; }
+  iterator end() const { return ptr_ + length_; }
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(ptr_ + length_);
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(ptr_);
+  }
+  // STLS says return size_type, but Google says return int
+  int max_size() const { return length_; }
+  int capacity() const { return length_; }
+
+  int copy(char* buf, size_type n, size_type pos = 0) const;
+
+  int find(const StringPiece& s, size_type pos = 0) const;
+  int find(char c, size_type pos = 0) const;
+  int rfind(const StringPiece& s, size_type pos = npos) const;
+  int rfind(char c, size_type pos = npos) const;
+
+  StringPiece substr(size_type pos, size_type n = npos) const;
+
+  static bool _equal(const StringPiece&, const StringPiece&);
+};
+
+inline bool operator==(const StringPiece& x, const StringPiece& y) {
+  return StringPiece::_equal(x, y);
+}
+
+inline bool operator!=(const StringPiece& x, const StringPiece& y) {
+  return !(x == y);
+}
+
+inline bool operator<(const StringPiece& x, const StringPiece& y) {
+  const int r = memcmp(x.data(), y.data(),
+                       std::min(x.size(), y.size()));
+  return ((r < 0) || ((r == 0) && (x.size() < y.size())));
+}
+
+inline bool operator>(const StringPiece& x, const StringPiece& y) {
+  return y < x;
+}
+
+inline bool operator<=(const StringPiece& x, const StringPiece& y) {
+  return !(x > y);
+}
+
+inline bool operator>=(const StringPiece& x, const StringPiece& y) {
+  return !(x < y);
+}
+
+}  // namespace re2
+
+// allow StringPiece to be logged
+extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
+
+#endif  // STRINGS_STRINGPIECE_H__
diff --git a/outside/re2/re2/testing/backtrack.cc b/outside/re2/re2/testing/backtrack.cc
new file mode 100644
index 000000000..b2dd6dbad
--- /dev/null
+++ b/outside/re2/re2/testing/backtrack.cc
@@ -0,0 +1,254 @@
+// Copyright 2008 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tested by search_test.cc, exhaustive_test.cc, tester.cc
+//
+// Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
+// except that it remembers where it has been, trading a lot of
+// memory for a lot of time. It exists only for testing purposes.
+//
+// Let me repeat that.
+//
+// THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
+//   - It uses a ton of memory.
+//   - It uses a ton of stack.
+//   - It uses CHECK and LOG(FATAL).
+//   - It implements unanchored search by repeated anchored search.
+//
+// On the other hand, it is very simple and a good reference
+// implementation for the more complicated regexp packages.
+//
+// In BUILD, this file is linked into the ":testing" library,
+// not the main library, in order to make it harder to pick up
+// accidentally.
+
+#include "util/util.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+// Backtracker holds the state for a backtracking search.
+//
+// Excluding the search parameters, the main search state
+// is just the "capture registers", which record, for the
+// current execution, the string position at which each
+// parenthesis was passed. cap_[0] and cap_[1] are the
+// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
+//
+// To avoid infinite loops during backtracking on expressions
+// like (a*)*, the visited_[] bitmap marks the (state, string-position)
+// pairs that have already been explored and are thus not worth
+// re-exploring if we get there via another path. Modern backtracking
+// libraries engineer their program representation differently, to make
+// such infinite loops possible to avoid without keeping a giant visited_
+// bitmap, but visited_ works fine for a reference implementation
+// and it has the nice benefit of making the search run in linear time.
+class Backtracker {
+ public:
+  explicit Backtracker(Prog* prog);
+  ~Backtracker();
+
+  bool Search(const StringPiece& text, const StringPiece& context,
+              bool anchored, bool longest,
+              StringPiece* submatch, int nsubmatch);
+
+ private:
+  // Explores from instruction id at string position p looking for a match.
+  // Returns true if found (so that caller can stop trying other possibilities).
+  bool Visit(int id, const char* p);
+
+  // Search parameters
+  Prog* prog_;              // program being run
+  StringPiece text_;        // text being searched
+  StringPiece context_;     // greater context of text being searched
+  bool anchored_;           // whether search is anchored at text.begin()
+  bool longest_;            // whether search wants leftmost-longest match
+  bool endmatch_;           // whether search must end at text.end()
+  StringPiece *submatch_;   // submatches to fill in
+  int nsubmatch_;           // # of submatches to fill in
+
+  // Search state
+  const char* cap_[64];     // capture registers
+  uint32 *visited_;         // bitmap: (Inst*, char*) pairs already backtracked
+  int nvisited_;            // # of words in bitmap
+};
+
+Backtracker::Backtracker(Prog* prog)
+  : prog_(prog),
+    anchored_(false),
+    longest_(false),
+    endmatch_(false),
+    submatch_(NULL),
+    nsubmatch_(0),
+    visited_(NULL),
+    nvisited_(0) {
+}
+
+Backtracker::~Backtracker() {
+  delete[] visited_;
+}
+
+// Runs a backtracking search.
+bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
+                         bool anchored, bool longest,
+                         StringPiece* submatch, int nsubmatch) {
+  text_ = text;
+  context_ = context;
+  if (context_.begin() == NULL)
+    context_ = text;
+  if (prog_->anchor_start() && text.begin() > context_.begin())
+    return false;
+  if (prog_->anchor_end() && text.end() < context_.end())
+    return false;
+  anchored_ = anchored | prog_->anchor_start();
+  longest_ = longest | prog_->anchor_end();
+  endmatch_ = prog_->anchor_end();
+  submatch_ = submatch;
+  nsubmatch_ = nsubmatch;
+  CHECK(2*nsubmatch_ < arraysize(cap_));
+  memset(cap_, 0, sizeof cap_);
+
+  // We use submatch_[0] for our own bookkeeping,
+  // so it had better exist.
+  StringPiece sp0;
+  if (nsubmatch < 1) {
+    submatch_ = &sp0;
+    nsubmatch_ = 1;
+  }
+  submatch_[0] = NULL;
+
+  // Allocate new visited_ bitmap -- size is proportional
+  // to text, so have to reallocate on each call to Search.
+  delete[] visited_;
+  nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
+  visited_ = new uint32[nvisited_];
+  memset(visited_, 0, nvisited_*sizeof visited_[0]);
+
+  // Anchored search must start at text.begin().
+  if (anchored_) {
+    cap_[0] = text.begin();
+    return Visit(prog_->start(), text.begin());
+  }
+
+  // Unanchored search, starting from each possible text position.
+  // Notice that we have to try the empty string at the end of
+  // the text, so the loop condition is p <= text.end(), not p < text.end().
+  for (const char* p = text.begin(); p <= text.end(); p++) {
+    cap_[0] = p;
+    if (Visit(prog_->start(), p))  // Match must be leftmost; done.
+      return true;
+  }
+  return false;
+}
+
+// Explores from instruction id at string position p looking for a match.
+// Return true if found (so that caller can stop trying other possibilities).
+bool Backtracker::Visit(int id, const char* p) {
+  // Check bitmap. If we've already explored from here,
+  // either it didn't match or it did but we're hoping for a better match.
+  // Either way, don't go down that road again.
+  CHECK(p <= text_.end());
+  int n = id*(text_.size()+1) + (p - text_.begin());
+  CHECK_LT(n/32, nvisited_);
+  if (visited_[n/32] & (1U << (n&31)))
+    return false;
+  visited_[n/32] |= 1U << (n&31);
+
+  // Pick out byte at current position. If at end of string,
+  // have to explore in hope of finishing a match. Use impossible byte -1.
+  int c = -1;
+  if (p < text_.end())
+    c = *p & 0xFF;
+
+  Prog::Inst* ip = prog_->inst(id);
+  switch (ip->opcode()) {
+    default:
+      LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
+      return false;  // not reached
+
+    case kInstAlt:
+    case kInstAltMatch:
+      // Try both possible next states: out is preferred to out1.
+      if (Visit(ip->out(), p)) {
+        if (longest_)
+          Visit(ip->out1(), p);
+        return true;
+      }
+      return Visit(ip->out1(), p);
+
+    case kInstByteRange:
+      if (ip->Matches(c))
+        return Visit(ip->out(), p+1);
+      return false;
+
+    case kInstCapture:
+      if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) {
+        // Capture p to register, but save old value.
+        const char* q = cap_[ip->cap()];
+        cap_[ip->cap()] = p;
+        bool ret = Visit(ip->out(), p);
+        // Restore old value as we backtrack.
+        cap_[ip->cap()] = q;
+        return ret;
+      }
+      return Visit(ip->out(), p);
+
+    case kInstEmptyWidth:
+      if (ip->empty() & ~Prog::EmptyFlags(context_, p))
+        return false;
+      return Visit(ip->out(), p);
+
+    case kInstNop:
+      return Visit(ip->out(), p);
+
+    case kInstMatch:
+      // We found a match. If it's the best so far, record the
+      // parameters in the caller's submatch_ array.
+      if (endmatch_ && p != context_.end())
+        return false;
+      cap_[1] = p;
+      if (submatch_[0].data() == NULL ||           // First match so far ...
+          (longest_ && p > submatch_[0].end())) {  // ... or better match
+        for (int i = 0; i < nsubmatch_; i++)
+          submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
+      }
+      return true;
+
+    case kInstFail:
+      return false;
+  }
+}
+
+// Runs a backtracking search.
+bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
+                                 const StringPiece& context,
+                                 Anchor anchor,
+                                 MatchKind kind,
+                                 StringPiece* match,
+                                 int nmatch) {
+  // If full match, we ask for an anchored longest match
+  // and then check that match[0] == text.
+  // So make sure match[0] exists.
+  StringPiece sp0;
+  if (kind == kFullMatch) {
+    anchor = kAnchored;
+    if (nmatch < 1) {
+      match = &sp0;
+      nmatch = 1;
+    }
+  }
+
+  // Run the search.
+  Backtracker b(this);
+  bool anchored = anchor == kAnchored;
+  bool longest = kind != kFirstMatch;
+  if (!b.Search(text, context, anchored, longest, match, nmatch))
+    return false;
+  if (kind == kFullMatch && match[0].end() != text.end())
+    return false;
+  return true;
+}
+
+}  // namespace re2
diff --git a/outside/re2/re2/testing/charclass_test.cc b/outside/re2/re2/testing/charclass_test.cc
new file mode 100644
index 000000000..a3764d4be
--- /dev/null
+++ b/outside/re2/re2/testing/charclass_test.cc
@@ -0,0 +1,223 @@
+// Copyright 2006 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test character class manipulations.
+
+#include "util/test.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+struct CCTest {
+  struct {
+    Rune lo;
+    Rune hi;
+  } add[10];
+  int remove;
+  struct {
+    Rune lo;
+    Rune hi;
+  } final[10];
+};
+
+static CCTest tests[] = {
+  { { { 10, 20 }, {-1} }, -1,
+    { { 10, 20 }, {-1} } },
+
+  { { { 10, 20 }, { 20, 30 }, {-1} }, -1,
+    { { 10, 30 }, {-1} } },
+
+  { { { 10, 20 }, { 30, 40 }, { 20, 30 }, {-1} }, -1,
+    { { 10, 40 }, {-1} } },
+
+  { { { 0, 50 }, { 20, 30 }, {-1} }, -1,
+    { { 0, 50 }, {-1} } },
+
+  { { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }, -1,
+    { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
+
+  { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
+    { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
+
+  { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
+    { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
+
+  { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 5, 25 }, {-1} }, -1,
+    { { 5, 25 }, {-1} } },
+
+  { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 12, 21 }, {-1} }, -1,
+    { { 10, 23 }, {-1} } },
+
+  // These check boundary cases during negation.
+  { { { 0, Runemax }, {-1} }, -1,
+    { { 0, Runemax }, {-1} } },
+
+  { { { 0, 50 }, {-1} }, -1,
+    { { 0, 50 }, {-1} } },
+
+  { { { 50, Runemax }, {-1} }, -1,
+    { { 50, Runemax }, {-1} } },
+
+  // Check RemoveAbove.
+  { { { 50, Runemax }, {-1} }, 255,
+    { { 50, 255 }, {-1} } },
+
+  { { { 50, Runemax }, {-1} }, 65535,
+    { { 50, 65535 }, {-1} } },
+
+  { { { 50, Runemax }, {-1} }, Runemax,
+    { { 50, Runemax }, {-1} } },
+
+  { { { 50, 60 }, { 250, 260 }, { 350, 360 }, {-1} }, 255,
+    { { 50, 60 }, { 250, 255 }, {-1} } },
+
+  { { { 50, 60 }, {-1} }, 255,
+    { { 50, 60 }, {-1} } },
+
+  { { { 350, 360 }, {-1} }, 255,
+    { {-1} } },
+
+  { { {-1} }, 255,
+    { {-1} } },
+};
+
+template<class CharClass>
+static void Broke(const char *desc, const CCTest* t, CharClass* cc) {
+  if (t == NULL) {
+    printf("\t%s:", desc);
+  } else {
+    printf("\n");
+    printf("CharClass added: [%s]", desc);
+    for (int k = 0; t->add[k].lo >= 0; k++)
+      printf(" %d-%d", t->add[k].lo, t->add[k].hi);
+    printf("\n");
+    if (t->remove >= 0)
+      printf("Removed > %d\n", t->remove);
+    printf("\twant:");
+    for (int k = 0; t->final[k].lo >= 0; k++)
+      printf(" %d-%d", t->final[k].lo, t->final[k].hi);
+    printf("\n");
+    printf("\thave:");
+  }
+
+  for (typename CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
+    printf(" %d-%d", it->lo, it->hi);
+  printf("\n");
+}
+
+bool ShouldContain(CCTest *t, int x) {
+  for (int j = 0; t->final[j].lo >= 0; j++)
+    if (t->final[j].lo <= x && x <= t->final[j].hi)
+      return true;
+  return false;
+}
+
+// Helpers to make templated CorrectCC work with both CharClass and CharClassBuilder.
+
+CharClass* Negate(CharClass *cc) {
+  return cc->Negate();
+}
+
+void Delete(CharClass* cc) {
+  cc->Delete();
+}
+
+CharClassBuilder* Negate(CharClassBuilder* cc) {
+  CharClassBuilder* ncc = cc->Copy();
+  ncc->Negate();
+  return ncc;
+}
+
+void Delete(CharClassBuilder* cc) {
+  delete cc;
+}
+
+template<class CharClass>
+bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
+  typename CharClass::iterator it = cc->begin();
+  int size = 0;
+  for (int j = 0; t->final[j].lo >= 0; j++, ++it) {
+    if (it == cc->end() ||
+        it->lo != t->final[j].lo ||
+        it->hi != t->final[j].hi) {
+      Broke(desc, t, cc);
+      return false;
+    }
+    size += it->hi - it->lo + 1;
+  }
+  if (it != cc->end()) {
+    Broke(desc, t, cc);
+    return false;
+  }
+  if (cc->size() != size) {
+    Broke(desc, t, cc);
+    printf("wrong size: want %d have %d\n", size, cc->size());
+    return false;
+  }
+
+  for (int j = 0; j < 101; j++) {
+    if (j == 100)
+      j = Runemax;
+    if (ShouldContain(t, j) != cc->Contains(j)) {
+      Broke(desc, t, cc);
+      printf("want contains(%d)=%d, got %d\n",
+             j, ShouldContain(t, j), cc->Contains(j));
+      return false;
+    }
+  }
+
+  CharClass* ncc = Negate(cc);
+  for (int j = 0; j < 101; j++) {
+    if (j == 100)
+      j = Runemax;
+    if (ShouldContain(t, j) == ncc->Contains(j)) {
+      Broke(desc, t, cc);
+      Broke("ncc", NULL, ncc);
+      printf("want ncc contains(%d)!=%d, got %d\n",
+             j, ShouldContain(t, j), ncc->Contains(j));
+      Delete(ncc);
+      return false;
+    }
+    if (ncc->size() != Runemax+1 - cc->size()) {
+      Broke(desc, t, cc);
+      Broke("ncc", NULL, ncc);
+      printf("ncc size should be %d is %d\n",
+             Runemax+1 - cc->size(), ncc->size());
+      Delete(ncc);
+      return false;
+    }
+  }
+  Delete(ncc);
+  return true;
+}
+
+TEST(TestCharClassBuilder, Adds) {
+  int nfail = 0;
+  for (int i = 0; i < arraysize(tests); i++) {
+    CharClassBuilder ccb;
+    CCTest* t = &tests[i];
+    for (int j = 0; t->add[j].lo >= 0; j++)
+      ccb.AddRange(t->add[j].lo, t->add[j].hi);
+    if (t->remove >= 0)
+      ccb.RemoveAbove(t->remove);
+    if (!CorrectCC(&ccb, t, "before copy (CharClassBuilder)"))
+      nfail++;
+    CharClass* cc = ccb.GetCharClass();
+    if (!CorrectCC(cc, t, "before copy (CharClass)"))
+      nfail++;
+    cc->Delete();
+
+    CharClassBuilder *ccb1 = ccb.Copy();
+    if (!CorrectCC(ccb1, t, "after copy (CharClassBuilder)"))
+      nfail++;
+    cc = ccb.GetCharClass();
+    if (!CorrectCC(cc, t, "after copy (CharClass)"))
+      nfail++;
+    cc->Delete();
+    delete ccb1;
+  }
+  EXPECT_EQ(nfail, 0);
+}
+
+}  // namespace re2
diff --git a/outside/re2/re2/testing/compile_test.cc b/outside/re2/re2/testing/compile_test.cc
new file mode 100644
index 000000000..8d92105e2
--- /dev/null
+++ b/outside/re2/re2/testing/compile_test.cc
@@ -0,0 +1,171 @@
+// Copyright 2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test prog.cc, compile.cc
+
+#include <string>
+#include <vector>
+#include "util/test.h"
+#include "re2/regexp.h"
+#include "re2/prog.h"
+
+DEFINE_string(show, "", "regular expression to compile and dump");
+
+namespace re2 {
+
+// Simple input/output tests checking that
+// the regexp compiles to the expected code.
+// These are just to sanity check the basic implementation.
+// The real confidence tests happen by testing the NFA/DFA
+// that run the compiled code.
+
+struct Test {
+  const char* regexp;
+  const char* code;
+};
+
+static Test tests[] = {
+  { "a",
+    "1. byte [61-61] -> 2\n"
+    "2. match! 0\n" },
+  { "ab",
+    "1. byte [61-61] -> 2\n"
+    "2. byte [62-62] -> 3\n"
+    "3. match! 0\n" },
+  { "a|c",
+    "3. alt -> 1 | 2\n"
+    "1. byte [61-61] -> 4\n"
+    "2. byte [63-63] -> 4\n"
+    "4. match! 0\n" },
+  { "a|b",
+    "1. byte [61-62] -> 2\n"
+    "2. match! 0\n" },
+  { "[ab]",
+    "1. byte [61-62] -> 2\n"
+    "2. match! 0\n" },
+  { "a+",
+    "1. byte [61-61] -> 2\n"
+    "2. alt -> 1 | 3\n"
+    "3. match! 0\n" },
+  { "a+?",
+    "1. byte [61-61] -> 2\n"
+    "2. alt -> 3 | 1\n"
+    "3. match! 0\n" },
+  { "a*",
+    "2. alt -> 1 | 3\n"
+    "1. byte [61-61] -> 2\n"
+    "3. match! 0\n" },
+  { "a*?",
+    "2. alt -> 3 | 1\n"
+    "3. match! 0\n"
+    "1. byte [61-61] -> 2\n" },
+  { "a?",
+    "2. alt -> 1 | 3\n"
+    "1. byte [61-61] -> 3\n"
+    "3. match! 0\n" },
+  { "a??",
+    "2. alt -> 3 | 1\n"
+    "3. match! 0\n"
+    "1. byte [61-61] -> 3\n" },
+  { "a{4}",
+    "1. byte [61-61] -> 2\n"
+    "2. byte [61-61] -> 3\n"
+    "3. byte [61-61] -> 4\n"
+    "4. byte [61-61] -> 5\n"
+    "5. match! 0\n" },
+  { "(a)",
+    "2. capture 2 -> 1\n"
+    "1. byte [61-61] -> 3\n"
+    "3. capture 3 -> 4\n"
+    "4. match! 0\n" },
+  { "(?:a)",
+    "1. byte [61-61] -> 2\n"
+    "2. match! 0\n" },
+  { "",
+    "2. match! 0\n" },
+  { ".",
+    "3. alt -> 1 | 2\n"
+    "1. byte [00-09] -> 4\n"
+    "2. byte [0b-ff] -> 4\n"
+    "4. match! 0\n" },
+  { "[^ab]",
+    "5. alt -> 3 | 4\n"
+    "3. alt -> 1 | 2\n"
+    "4. byte [63-ff] -> 6\n"
+    "1. byte [00-09] -> 6\n"
+    "2. byte [0b-60] -> 6\n"
+    "6. match! 0\n" },
+  { "[Aa]",
+    "1. byte/i [61-61] -> 2\n"
+    "2. match! 0\n" },
+};
+
+TEST(TestRegexpCompileToProg, Simple) {
+  int failed = 0;
+  for (int i = 0; i < arraysize(tests); i++) {
+    const re2::Test& t = tests[i];
+    Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
+    if (re == NULL) {
+      LOG(ERROR) << "Cannot parse: " << t.regexp;
+      failed++;
+      continue;
+    }
+    Prog* prog = re->CompileToProg(0);
+    if (prog == NULL) {
+      LOG(ERROR) << "Cannot compile: " << t.regexp;
+      re->Decref();
+      failed++;
+      continue;
+    }
+    CHECK(re->CompileToProg(1) == NULL);
+    string s = prog->Dump();
+    if (s != t.code) {
+      LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
+      LOG(ERROR) << "Want:\n" << t.code;
+      LOG(ERROR) << "Got:\n" << s;
+      failed++;
+    }
+    delete prog;
+    re->Decref();
+  }
+  EXPECT_EQ(failed, 0);
+}
+
+// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
+// Once, erroneously split between 0x3f and 0x40 because it is
+// a 6-bit boundary.
+static struct UTF8ByteRange { + int lo; + int hi; +} utf8ranges[] = { + { 0x00, 0x09 }, + { 0x0A, 0x0A }, + { 0x10, 0x7F }, + { 0x80, 0x8F }, + { 0x90, 0x9F }, + { 0xA0, 0xBF }, + { 0xC0, 0xC1 }, + { 0xC2, 0xDF }, + { 0xE0, 0xE0 }, + { 0xE1, 0xEF }, + { 0xF0, 0xF0 }, + { 0xF1, 0xF3 }, + { 0xF4, 0xF4 }, + { 0xF5, 0xFF }, +}; + +TEST(TestCompile, ByteRanges) { + Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL); + EXPECT_TRUE(re != NULL); + Prog* prog = re->CompileToProg(0); + EXPECT_TRUE(prog != NULL); + EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges)); + for (int i = 0; i < arraysize(utf8ranges); i++) + for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++) + EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j; + delete prog; + re->Decref(); +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/dfa_test.cc b/outside/re2/re2/testing/dfa_test.cc new file mode 100644 index 000000000..8e95ae4b7 --- /dev/null +++ b/outside/re2/re2/testing/dfa_test.cc @@ -0,0 +1,344 @@ +// Copyright 2006-2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "util/test.h" +#include "util/thread.h" +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" +#include "re2/testing/regexp_generator.h" +#include "re2/testing/string_generator.h" + +DECLARE_bool(re2_dfa_bail_when_slow); + +DEFINE_int32(size, 8, "log2(number of DFA nodes)"); +DEFINE_int32(repeat, 2, "Repetition count."); +DEFINE_int32(threads, 4, "number of threads"); + +namespace re2 { + +// Check that multithreaded access to DFA class works. + +// Helper thread: builds entire DFA for prog. +class BuildThread : public Thread { + public: + BuildThread(Prog* prog) : prog_(prog) {} + virtual void Run() { + CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch)); + } + + private: + Prog* prog_; +}; + +TEST(Multithreaded, BuildEntireDFA) { + // Create regexp with 2^FLAGS_size states in DFA. 
+ string s = "a"; + for (int i = 0; i < FLAGS_size; i++) + s += "[ab]"; + s += "b"; + + // Check that single-threaded code works. + { + //LOG(INFO) << s; + Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + BuildThread* t = new BuildThread(prog); + t->SetJoinable(true); + t->Start(); + t->Join(); + delete t; + delete prog; + re->Decref(); + } + + // Build the DFA simultaneously in a bunch of threads. + for (int i = 0; i < FLAGS_repeat; i++) { + Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + + vector threads; + for (int j = 0; j < FLAGS_threads; j++) { + BuildThread *t = new BuildThread(prog); + t->SetJoinable(true); + threads.push_back(t); + } + for (int j = 0; j < FLAGS_threads; j++) + threads[j]->Start(); + for (int j = 0; j < FLAGS_threads; j++) { + threads[j]->Join(); + delete threads[j]; + } + + // One more compile, to make sure everything is okay. + prog->BuildEntireDFA(Prog::kFirstMatch); + delete prog; + re->Decref(); + } +} + +// Check that DFA size requirements are followed. +// BuildEntireDFA will, like SearchDFA, stop building out +// the DFA once the memory limits are reached. +TEST(SingleThreaded, BuildEntireDFA) { + // Create regexp with 2^30 states in DFA. 
+ string s = "a"; + for (int i = 0; i < 30; i++) + s += "[ab]"; + s += "b"; + + //LOG(INFO) << s; + Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL); + CHECK(re); + int max = 24; + for (int i = 17; i < max; i++) { + int limit = 1<CompileToProg(limit); + CHECK(prog); + //progusage = m.HeapGrowth(); + //dfamem = prog->dfa_mem(); + prog->BuildEntireDFA(Prog::kFirstMatch); + prog->BuildEntireDFA(Prog::kLongestMatch); + usage = m.HeapGrowth(); + delete prog; + } + if (!UsingMallocCounter) + continue; + //LOG(INFO) << StringPrintf("Limit %d: prog used %d, DFA budget %d, total %d\n", + // limit, progusage, dfamem, usage); + CHECK_GT(usage, limit*9/10); + CHECK_LT(usage, limit + (16<<10)); // 16kB of slop okay + } + re->Decref(); +} + +// Generates and returns a string over binary alphabet {0,1} that contains +// all possible binary sequences of length n as subsequences. The obvious +// brute force method would generate a string of length n * 2^n, but this +// generates a string of length n + 2^n - 1 called a De Bruijn cycle. +// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17. +// Such a string is useful for testing a DFA. If you have a DFA +// where distinct last n bytes implies distinct states, then running on a +// DeBruijn string causes the DFA to need to create a new state at every +// position in the input, never reusing any states until it gets to the +// end of the string. This is the worst possible case for DFA execution. 
+static string DeBruijnString(int n) { + CHECK_LT(n, 8*sizeof(int)); + CHECK_GT(n, 0); + + vector did(1<CompileToProg(1<SearchDFA(match, NULL, + Prog::kUnanchored, Prog::kFirstMatch, + NULL, &failed, NULL); + CHECK(!failed); + CHECK(matched); + matched = prog->SearchDFA(no_match, NULL, + Prog::kUnanchored, Prog::kFirstMatch, + NULL, &failed, NULL); + CHECK(!failed); + CHECK(!matched); + } + usage = m.HeapGrowth(); + peak_usage = m.PeakHeapGrowth(); + delete prog; + } + re->Decref(); + + if (!UsingMallocCounter) + return; + //LOG(INFO) << "usage " << usage << " " << peak_usage; + CHECK_LT(usage, 1<SearchDFA(match_, NULL, + Prog::kUnanchored, Prog::kFirstMatch, + NULL, &failed, NULL); + CHECK(!failed); + CHECK(matched); + matched = prog_->SearchDFA(no_match_, NULL, + Prog::kUnanchored, Prog::kFirstMatch, + NULL, &failed, NULL); + CHECK(!failed); + CHECK(!matched); + } + } + + private: + Prog* prog_; + StringPiece match_; + StringPiece no_match_; +}; + +TEST(Multithreaded, SearchDFA) { + // Same as single-threaded test above. + const int n = 18; + Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n), + Regexp::LikePerl, NULL); + CHECK(re); + string no_match = DeBruijnString(n); + string match = no_match + "0"; + FLAGS_re2_dfa_bail_when_slow = false; + + // Check that single-threaded code works. + { + Prog* prog = re->CompileToProg(1<SetJoinable(true); + t->Start(); + t->Join(); + delete t; + delete prog; + } + + // Run the search simultaneously in a bunch of threads. + // Reuse same flags for Multithreaded.BuildDFA above. 
+ for (int i = 0; i < FLAGS_repeat; i++) { + //LOG(INFO) << "Search " << i; + Prog* prog = re->CompileToProg(1< threads; + for (int j = 0; j < FLAGS_threads; j++) { + SearchThread *t = new SearchThread(prog, match, no_match); + t->SetJoinable(true); + threads.push_back(t); + } + for (int j = 0; j < FLAGS_threads; j++) + threads[j]->Start(); + for (int j = 0; j < FLAGS_threads; j++) { + threads[j]->Join(); + delete threads[j]; + } + delete prog; + } + re->Decref(); +} + +struct ReverseTest { + const char *regexp; + const char *text; + bool match; +}; + +// Test that reverse DFA handles anchored/unanchored correctly. +// It's in the DFA interface but not used by RE2. +ReverseTest reverse_tests[] = { + { "\\A(a|b)", "abc", true }, + { "(a|b)\\z", "cba", true }, + { "\\A(a|b)", "cba", false }, + { "(a|b)\\z", "abc", false }, +}; + +TEST(DFA, ReverseMatch) { + int nfail = 0; + for (int i = 0; i < arraysize(reverse_tests); i++) { + const ReverseTest& t = reverse_tests[i]; + Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog *prog = re->CompileToReverseProg(0); + CHECK(prog); + bool failed = false; + bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored, Prog::kFirstMatch, NULL, &failed, NULL); + if (matched != t.match) { + LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match; + nfail++; + } + delete prog; + re->Decref(); + } + EXPECT_EQ(nfail, 0); +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/dump.cc b/outside/re2/re2/testing/dump.cc new file mode 100644 index 000000000..4bdf71465 --- /dev/null +++ b/outside/re2/re2/testing/dump.cc @@ -0,0 +1,164 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Dump the regexp into a string showing structure. 
+// Tested by parse_unittest.cc + +// This function traverses the regexp recursively, +// meaning that on inputs like Regexp::Simplify of +// a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100}, +// it takes time and space exponential in the size of the +// original regular expression. It can also use stack space +// linear in the size of the regular expression for inputs +// like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*. +// IT IS NOT SAFE TO CALL FROM PRODUCTION CODE. +// As a result, Dump is provided only in the testing +// library (see BUILD). + +#include +#include +#include "util/test.h" +#include "re2/stringpiece.h" +#include "re2/regexp.h" + +// Cause a link error if this file is used outside of testing. +DECLARE_string(test_tmpdir); + +namespace re2 { + +static const char* kOpcodeNames[] = { + "bad", + "no", + "emp", + "lit", + "str", + "cat", + "alt", + "star", + "plus", + "que", + "rep", + "cap", + "dot", + "byte", + "bol", + "eol", + "wb", // kRegexpWordBoundary + "nwb", // kRegexpNoWordBoundary + "bot", + "eot", + "cc", + "match", +}; + +// Create string representation of regexp with explicit structure. +// Nothing pretty, just for testing. 
+static void DumpRegexpAppending(Regexp* re, string* s) { + if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) { + StringAppendF(s, "op%d", re->op()); + } else { + switch (re->op()) { + default: + break; + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + case kRegexpRepeat: + if (re->parse_flags() & Regexp::NonGreedy) + s->append("n"); + break; + } + s->append(kOpcodeNames[re->op()]); + if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) { + Rune r = re->rune(); + if ('a' <= r && r <= 'z') + s->append("fold"); + } + if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) { + for (int i = 0; i < re->nrunes(); i++) { + Rune r = re->runes()[i]; + if ('a' <= r && r <= 'z') { + s->append("fold"); + break; + } + } + } + } + s->append("{"); + switch (re->op()) { + default: + break; + case kRegexpEndText: + if (!(re->parse_flags() & Regexp::WasDollar)) { + s->append("\\z"); + } + break; + case kRegexpLiteral: { + Rune r = re->rune(); + char buf[UTFmax+1]; + buf[runetochar(buf, &r)] = 0; + s->append(buf); + break; + } + case kRegexpLiteralString: + for (int i = 0; i < re->nrunes(); i++) { + Rune r = re->runes()[i]; + char buf[UTFmax+1]; + buf[runetochar(buf, &r)] = 0; + s->append(buf); + } + break; + case kRegexpConcat: + case kRegexpAlternate: + for (int i = 0; i < re->nsub(); i++) + DumpRegexpAppending(re->sub()[i], s); + break; + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + DumpRegexpAppending(re->sub()[0], s); + break; + case kRegexpCapture: + if (re->name()) { + s->append(*re->name()); + s->append(":"); + } + DumpRegexpAppending(re->sub()[0], s); + break; + case kRegexpRepeat: + s->append(StringPrintf("%d,%d ", re->min(), re->max())); + DumpRegexpAppending(re->sub()[0], s); + break; + case kRegexpCharClass: { + string sep; + for (CharClass::iterator it = re->cc()->begin(); + it != re->cc()->end(); ++it) { + RuneRange rr = *it; + s->append(sep); + if (rr.lo == rr.hi) + 
s->append(StringPrintf("%#x", rr.lo)); + else + s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi)); + sep = " "; + } + break; + } + } + s->append("}"); +} + +string Regexp::Dump() { + string s; + + // Make sure being called from a unit test. + if (FLAGS_test_tmpdir.empty()) { + LOG(ERROR) << "Cannot use except for testing."; + return s; + } + + DumpRegexpAppending(this, &s); + return s; +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/exhaustive1_test.cc b/outside/re2/re2/testing/exhaustive1_test.cc new file mode 100644 index 000000000..9e057cc56 --- /dev/null +++ b/outside/re2/re2/testing/exhaustive1_test.cc @@ -0,0 +1,42 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Exhaustive testing of regular expression matching. + +#include "util/test.h" +#include "re2/testing/exhaustive_tester.h" + +DECLARE_string(regexp_engines); + +namespace re2 { + +// Test simple repetition operators +TEST(Repetition, Simple) { + vector ops = Split(" ", + "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} " + "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} " + "%s* %s+ %s? %s*? %s+? %s??"); + ExhaustiveTest(3, 2, Explode("abc."), ops, + 6, Explode("ab"), "(?:%s)", ""); + ExhaustiveTest(3, 2, Explode("abc."), ops, + 40, Explode("a"), "(?:%s)", ""); +} + +// Test capturing parens -- (a) -- inside repetition operators +TEST(Repetition, Capturing) { + vector ops = Split(" ", + "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} " + "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} " + "%s* %s+ %s? %s*? %s+? %s??"); + ExhaustiveTest(3, 2, Split(" ", "a (a) b"), ops, + 7, Explode("ab"), "(?:%s)", ""); + + // This would be a great test, but it runs forever when PCRE is enabled. 
+ if (strstr("PCRE", FLAGS_regexp_engines.c_str()) == NULL) + ExhaustiveTest(4, 3, Split(" ", "a (a)"), ops, + 100, Explode("a"), "(?:%s)", ""); +} + +} // namespace re2 + diff --git a/outside/re2/re2/testing/exhaustive2_test.cc b/outside/re2/re2/testing/exhaustive2_test.cc new file mode 100644 index 000000000..c5fec5b3e --- /dev/null +++ b/outside/re2/re2/testing/exhaustive2_test.cc @@ -0,0 +1,70 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Exhaustive testing of regular expression matching. + +#include "util/test.h" +#include "re2/re2.h" +#include "re2/testing/exhaustive_tester.h" + +DECLARE_string(regexp_engines); + +namespace re2 { + +// Test empty string matches (aka "(?:)") +TEST(EmptyString, Exhaustive) { + ExhaustiveTest(2, 2, Split(" ", "(?:) a"), + RegexpGenerator::EgrepOps(), + 5, Split("", "ab"), "", ""); +} + +// Test escaped versions of regexp syntax. +TEST(Punctuation, Literals) { + vector alphabet = Explode("()*+?{}[]\\^$."); + vector escaped = alphabet; + for (int i = 0; i < escaped.size(); i++) + escaped[i] = "\\" + escaped[i]; + ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(), + 2, alphabet, "", ""); +} + +// Test ^ $ . \A \z in presence of line endings. +// Have to wrap the empty-width ones in (?:) so that +// they can be repeated -- PCRE rejects ^* but allows (?:^)* +TEST(LineEnds, Exhaustive) { + ExhaustiveTest(2, 2, Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)"), + RegexpGenerator::EgrepOps(), + 4, Explode("ab\n"), "", ""); +} + +// Test what does and does not match \n. +// This would be a good test, except that PCRE seems to have a bug: +// in single-byte character set mode (the default), +// [^a] matches \n, but in UTF-8 mode it does not. +// So when we run the test, the tester complains that +// we don't agree with PCRE, but it's PCRE that is at fault. 
+// For what it's worth, Perl gets this right (matches +// regardless of whether UTF-8 input is selected): +// +// #!/usr/bin/perl +// use POSIX qw(locale_h); +// print "matches in latin1\n" if "\n" =~ /[^a]/; +// setlocale("en_US.utf8"); +// print "matches in utf8\n" if "\n" =~ /[^a]/; +// +// The rule chosen for RE2 is that by default, like Perl, +// dot does not match \n but negated character classes [^a] do. +// (?s) will allow dot to match \n; there is no way in RE2 +// to stop [^a] from matching \n, though the underlying library +// provides a mechanism, and RE2 could add new syntax if needed. +// +// TEST(Newlines, Exhaustive) { +// vector empty_vector; +// ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"), +// RegexpGenerator::EgrepOps(), +// 4, Explode("a\n"), ""); +// } + +} // namespace re2 + diff --git a/outside/re2/re2/testing/exhaustive3_test.cc b/outside/re2/re2/testing/exhaustive3_test.cc new file mode 100644 index 000000000..5613fcbe8 --- /dev/null +++ b/outside/re2/re2/testing/exhaustive3_test.cc @@ -0,0 +1,94 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Exhaustive testing of regular expression matching. + +#include "util/test.h" +#include "re2/testing/exhaustive_tester.h" + +namespace re2 { + +// Test simple character classes by themselves. +TEST(CharacterClasses, Exhaustive) { + vector atoms = Split(" ", + "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b ."); + ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(), + 5, Explode("ab"), "", ""); +} + +// Test simple character classes inside a___b (for example, a[a]b). 
+TEST(CharacterClasses, ExhaustiveAB) { + vector atoms = Split(" ", + "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b ."); + ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(), + 5, Explode("ab"), "a%sb", ""); +} + +// Returns UTF8 for Rune r +static string UTF8(Rune r) { + char buf[UTFmax+1]; + buf[runetochar(buf, &r)] = 0; + return string(buf); +} + +// Returns a vector of "interesting" UTF8 characters. +// Unicode is now too big to just return all of them, +// so UTF8Characters return a set likely to be good test cases. +static const vector& InterestingUTF8() { + static bool init; + static vector v; + + if (init) + return v; + + init = true; + // All the Latin1 equivalents are interesting. + for (int i = 1; i < 256; i++) + v.push_back(UTF8(i)); + + // After that, the codes near bit boundaries are + // interesting, because they span byte sequence lengths. + for (int j = 0; j < 8; j++) + v.push_back(UTF8(256 + j)); + for (int i = 512; i < Runemax; i <<= 1) + for (int j = -8; j < 8; j++) + v.push_back(UTF8(i + j)); + + // The codes near Runemax, including Runemax itself, are interesting. + for (int j = -8; j <= 0; j++) + v.push_back(UTF8(Runemax + j)); + + return v; +} + +// Test interesting UTF-8 characters against character classes. +TEST(InterestingUTF8, SingleOps) { + vector atoms = Split(" ", + ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B " + "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] " + "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] " + "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]"); + vector ops; // no ops + ExhaustiveTest(1, 0, atoms, ops, + 1, InterestingUTF8(), "", ""); +} + +// Test interesting UTF-8 characters against character classes, +// but wrap everything inside AB. +TEST(InterestingUTF8, AB) { + vector atoms = Split(" ", + ". 
^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B " + "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] " + "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] " + "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]"); + vector ops; // no ops + vector alpha = InterestingUTF8(); + for (int i = 0; i < alpha.size(); i++) + alpha[i] = "a" + alpha[i] + "b"; + ExhaustiveTest(1, 0, atoms, ops, + 1, alpha, "a%sb", ""); +} + +} // namespace re2 + diff --git a/outside/re2/re2/testing/exhaustive_test.cc b/outside/re2/re2/testing/exhaustive_test.cc new file mode 100644 index 000000000..fc40dee88 --- /dev/null +++ b/outside/re2/re2/testing/exhaustive_test.cc @@ -0,0 +1,38 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Exhaustive testing of regular expression matching. + +#include "util/test.h" +#include "re2/testing/exhaustive_tester.h" + +namespace re2 { + +DECLARE_string(regexp_engines); + +// Test very simple expressions. +TEST(EgrepLiterals, Lowercase) { + EgrepTest(3, 2, "abc.", 3, "abc", ""); +} + +// Test mixed-case expressions. +TEST(EgrepLiterals, MixedCase) { + EgrepTest(3, 2, "AaBb.", 2, "AaBb", ""); +} + +// Test mixed-case in case-insensitive mode. +TEST(EgrepLiterals, FoldCase) { + // The punctuation characters surround A-Z and a-z + // in the ASCII table. This looks for bugs in the + // bytemap range code in the DFA. + EgrepTest(3, 2, "abAB.", 2, "aBc@_~", "(?i:%s)"); +} + +// Test very simple expressions. +TEST(EgrepLiterals, UTF8) { + EgrepTest(3, 2, "ab.", 4, "a\xE2\x98\xBA", ""); +} + +} // namespace re2 + diff --git a/outside/re2/re2/testing/exhaustive_tester.cc b/outside/re2/re2/testing/exhaustive_tester.cc new file mode 100644 index 000000000..54de85748 --- /dev/null +++ b/outside/re2/re2/testing/exhaustive_tester.cc @@ -0,0 +1,188 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Exhaustive testing of regular expression matching. + +// Each test picks an alphabet (e.g., "abc"), a maximum string length, +// a maximum regular expression length, and a maximum number of letters +// that can appear in the regular expression. Given these parameters, +// it tries every possible regular expression and string, verifying that +// the NFA, DFA, and a trivial backtracking implementation agree about +// the location of the match. + +#include +#include + +#ifndef LOGGING +#define LOGGING 0 +#endif + +#include "util/test.h" +#include "re2/testing/exhaustive_tester.h" +#include "re2/testing/tester.h" + +DEFINE_bool(show_regexps, false, "show regexps during testing"); + +DEFINE_int32(max_bad_regexp_inputs, 1, + "Stop testing a regular expression after finding this many " + "strings that break it."); + +// Compiled in debug mode, the usual tests run for over an hour. +// Have to cut it down to make the unit test machines happy. 
+DEFINE_bool(quick_debug_mode, true, "Run fewer tests in debug mode."); + +namespace re2 { + +static char* escape(const StringPiece& sp) { + static char buf[512]; + char* p = buf; + *p++ = '\"'; + for (int i = 0; i < sp.size(); i++) { + if(p+5 >= buf+sizeof buf) + LOG(FATAL) << "ExhaustiveTester escape: too long"; + if(sp[i] == '\\' || sp[i] == '\"') { + *p++ = '\\'; + *p++ = sp[i]; + } else if(sp[i] == '\n') { + *p++ = '\\'; + *p++ = 'n'; + } else { + *p++ = sp[i]; + } + } + *p++ = '\"'; + *p = '\0'; + return buf; +} + +static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) { + if (!re.Match(input, 0, input.size(), anchor, m, n)) { + printf("-"); + return; + } + for (int i = 0; i < n; i++) { + if (i > 0) + printf(" "); + if (m[i].begin() == NULL) + printf("-"); + else + printf("%d-%d", static_cast(m[i].begin() - input.begin()), static_cast(m[i].end() - input.begin())); + } +} + +// Processes a single generated regexp. +// Compiles it using Regexp interface and PCRE, and then +// checks that NFA, DFA, and PCRE all return the same results. +void ExhaustiveTester::HandleRegexp(const string& const_regexp) { + regexps_++; + string regexp = const_regexp; + if (!topwrapper_.empty()) + regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str()); + + if (FLAGS_show_regexps) { + printf("\r%s", regexp.c_str()); + fflush(stdout); + } + + if (LOGGING) { + // Write out test cases and answers for use in testing + // other implementations, such as Go's regexp package. 
+ if (randomstrings_) + LOG(ERROR) << "Cannot log with random strings."; + if (regexps_ == 1) { // first + printf("strings\n"); + strgen_.Reset(); + while (strgen_.HasNext()) + printf("%s\n", escape(strgen_.Next())); + printf("regexps\n"); + } + printf("%s\n", escape(regexp)); + + RE2 re(regexp); + RE2::Options longest; + longest.set_longest_match(true); + RE2 relongest(regexp, longest); + int ngroup = re.NumberOfCapturingGroups()+1; + StringPiece* group = new StringPiece[ngroup]; + + strgen_.Reset(); + while (strgen_.HasNext()) { + StringPiece input = strgen_.Next(); + PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup); + printf(";"); + PrintResult(re, input, RE2::UNANCHORED, group, ngroup); + printf(";"); + PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup); + printf(";"); + PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup); + printf("\n"); + } + delete[] group; + return; + } + + Tester tester(regexp); + if (tester.error()) + return; + + strgen_.Reset(); + strgen_.GenerateNULL(); + if (randomstrings_) + strgen_.Random(stringseed_, stringcount_); + int bad_inputs = 0; + while (strgen_.HasNext()) { + tests_++; + if (!tester.TestInput(strgen_.Next())) { + failures_++; + if (++bad_inputs >= FLAGS_max_bad_regexp_inputs) + break; + } + } +} + +// Runs an exhaustive test on the given parameters. 
+void ExhaustiveTest(int maxatoms, int maxops, + const vector& alphabet, + const vector& ops, + int maxstrlen, const vector& stralphabet, + const string& wrapper, + const string& topwrapper) { + if (DEBUG_MODE && FLAGS_quick_debug_mode) { + if (maxatoms > 1) + maxatoms--; + if (maxops > 1) + maxops--; + if (maxstrlen > 1) + maxstrlen--; + } + ExhaustiveTester t(maxatoms, maxops, alphabet, ops, + maxstrlen, stralphabet, wrapper, + topwrapper); + t.Generate(); + if (!LOGGING) { + printf("%d regexps, %d tests, %d failures [%d/%d str]\n", + t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size()); + } + EXPECT_EQ(0, t.failures()); +} + +// Runs an exhaustive test using the given parameters and +// the basic egrep operators. +void EgrepTest(int maxatoms, int maxops, const string& alphabet, + int maxstrlen, const string& stralphabet, + const string& wrapper) { + const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" }; + + for (int i = 0; i < arraysize(tops); i++) { + ExhaustiveTest(maxatoms, maxops, + Split("", alphabet), + RegexpGenerator::EgrepOps(), + maxstrlen, + Split("", stralphabet), + wrapper, + tops[i]); + } +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/exhaustive_tester.h b/outside/re2/re2/testing/exhaustive_tester.h new file mode 100644 index 000000000..38a139f58 --- /dev/null +++ b/outside/re2/re2/testing/exhaustive_tester.h @@ -0,0 +1,85 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H__ +#define RE2_TESTING_EXHAUSTIVE_TESTER_H__ + +#include +#include +#include "util/util.h" +#include "re2/testing/regexp_generator.h" +#include "re2/testing/string_generator.h" + +namespace re2 { + +// Exhaustive regular expression test: generate all regexps within parameters, +// then generate all strings of a given length over a given alphabet, +// then check that NFA, DFA, and PCRE agree about whether each regexp matches +// each possible string, and if so, where the match is. +// +// Can also be used in a "random" mode that generates a given number +// of random regexp and strings, allowing testing of larger expressions +// and inputs. +class ExhaustiveTester : public RegexpGenerator { + public: + ExhaustiveTester(int maxatoms, + int maxops, + const vector& alphabet, + const vector& ops, + int maxstrlen, + const vector& stralphabet, + const string& wrapper, + const string& topwrapper) + : RegexpGenerator(maxatoms, maxops, alphabet, ops), + strgen_(maxstrlen, stralphabet), + wrapper_(wrapper), + topwrapper_(topwrapper), + regexps_(0), tests_(0), failures_(0), + randomstrings_(0), stringseed_(0), stringcount_(0) { } + + int regexps() { return regexps_; } + int tests() { return tests_; } + int failures() { return failures_; } + + // Needed for RegexpGenerator interface. + void HandleRegexp(const string& regexp); + + // Causes testing to generate random input strings. + void RandomStrings(int32 seed, int32 count) { + randomstrings_ = true; + stringseed_ = seed; + stringcount_ = count; + } + + private: + StringGenerator strgen_; + string wrapper_; // Regexp wrapper - either empty or has one %s. + string topwrapper_; // Regexp top-level wrapper. + int regexps_; // Number of HandleRegexp calls + int tests_; // Number of regexp tests. + int failures_; // Number of tests failed. + + bool randomstrings_; // Whether to use random strings + int32 stringseed_; // If so, the seed. + int stringcount_; // If so, how many to generate. 
+ DISALLOW_EVIL_CONSTRUCTORS(ExhaustiveTester); +}; + +// Runs an exhaustive test on the given parameters. +void ExhaustiveTest(int maxatoms, int maxops, + const vector& alphabet, + const vector& ops, + int maxstrlen, const vector& stralphabet, + const string& wrapper, + const string& topwrapper); + +// Runs an exhaustive test using the given parameters and +// the basic egrep operators. +void EgrepTest(int maxatoms, int maxops, const string& alphabet, + int maxstrlen, const string& stralphabet, + const string& wrapper); + +} // namespace re2 + +#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H__ diff --git a/outside/re2/re2/testing/filtered_re2_test.cc b/outside/re2/re2/testing/filtered_re2_test.cc new file mode 100644 index 000000000..e3a0dd137 --- /dev/null +++ b/outside/re2/re2/testing/filtered_re2_test.cc @@ -0,0 +1,275 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "util/test.h" +#include "re2/filtered_re2.h" +#include "re2/re2.h" + +DECLARE_int32(filtered_re2_min_atom_len); // From prefilter_tree.cc + +namespace re2 { + +struct FilterTestVars { + vector atoms; + vector atom_indices; + vector matches; + RE2::Options opts; + FilteredRE2 f; +}; + +TEST(FilteredRE2Test, EmptyTest) { + FilterTestVars v; + v.f.AllMatches("foo", v.atom_indices, &v.matches); + EXPECT_EQ(0, v.matches.size()); +} + +TEST(FilteredRE2Test, SmallOrTest) { + FLAGS_filtered_re2_min_atom_len = 4; + + FilterTestVars v; + int id; + v.f.Add("(foo|bar)", v.opts, &id); + + v.f.Compile(&v.atoms); + EXPECT_EQ(0, v.atoms.size()); + + v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches); + EXPECT_EQ(1, v.matches.size()); + EXPECT_EQ(id, v.matches[0]); +} + +TEST(FilteredRE2Test, SmallLatinTest) { + FLAGS_filtered_re2_min_atom_len = 3; + FilterTestVars v; + int id; + + v.opts.set_utf8(false); + v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id); + v.f.Compile(&v.atoms); + EXPECT_EQ(1, 
v.atoms.size()); + EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef"); + + v.atom_indices.push_back(0); + v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches); + EXPECT_EQ(1, v.matches.size()); + EXPECT_EQ(id, v.matches[0]); +} + +struct AtomTest { + const char* testname; + // If any test needs more than this many regexps or atoms, increase + // the size of the corresponding array. + const char* regexps[20]; + const char* atoms[20]; +}; + +AtomTest atom_tests[] = { + { + // This test checks to make sure empty patterns are allowed. + "CheckEmptyPattern", + {""}, + {} + }, { + // This test checks that all atoms of length greater than min length + // are found, and no atoms that are of smaller length are found. + "AllAtomsGtMinLengthFound", { + "(abc123|def456|ghi789).*mnop[x-z]+", + "abc..yyy..zz", + "mnmnpp[a-z]+PPP" + }, { + "abc123", + "def456", + "ghi789", + "mnop", + "abc", + "yyy", + "mnmnpp", + "ppp" + } + }, { + // Test to make sure that any atoms that have another atom as a + // substring in an OR are removed; that is, only the shortest + // substring is kept. + "SubstrAtomRemovesSuperStrInOr", { + "(abc123|abc|ghi789|abc1234).*[x-z]+", + "abcd..yyy..yyyzzz", + "mnmnpp[a-z]+PPP" + }, { + "abc", + "ghi789", + "abcd", + "yyy", + "yyyzzz", + "mnmnpp", + "ppp" + } + }, { + // Test character class expansion. + "CharClassExpansion", { + "m[a-c][d-f]n.*[x-z]+", + "[x-y]bcde[ab]" + }, { + "madn", "maen", "mafn", + "mbdn", "mben", "mbfn", + "mcdn", "mcen", "mcfn", + "xbcdea", "xbcdeb", + "ybcdea", "ybcdeb" + } + }, { + // Test upper/lower of non-ASCII. 
+ "UnicodeLower", { + "(?i)ΔδΠϖπΣςσ", + "ΛΜΝΟΠ", + "ψρστυ", + }, { + "δδπππσσσ", + "λμνοπ", + "ψρστυ", + }, + }, +}; + +void AddRegexpsAndCompile(const char* regexps[], + int n, + struct FilterTestVars* v) { + for (int i = 0; i < n; i++) { + int id; + v->f.Add(regexps[i], v->opts, &id); + } + v->f.Compile(&v->atoms); +} + +bool CheckExpectedAtoms(const char* atoms[], + int n, + const char* testname, + struct FilterTestVars* v) { + vector expected; + for (int i = 0; i < n; i++) + expected.push_back(atoms[i]); + + bool pass = expected.size() == v->atoms.size(); + + sort(v->atoms.begin(), v->atoms.end()); + sort(expected.begin(), expected.end()); + for (int i = 0; pass && i < n; i++) + pass = pass && expected[i] == v->atoms[i]; + + if (!pass) { + LOG(WARNING) << "Failed " << testname; + LOG(WARNING) << "Expected #atoms = " << expected.size(); + for (int i = 0; i < expected.size(); i++) + LOG(WARNING) << expected[i]; + LOG(WARNING) << "Found #atoms = " << v->atoms.size(); + for (int i = 0; i < v->atoms.size(); i++) + LOG(WARNING) << v->atoms[i]; + } + + return pass; +} + +TEST(FilteredRE2Test, AtomTests) { + FLAGS_filtered_re2_min_atom_len = 3; + + int nfail = 0; + for (int i = 0; i < arraysize(atom_tests); i++) { + FilterTestVars v; + AtomTest* t = &atom_tests[i]; + int natom, nregexp; + for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + for (natom = 0; natom < arraysize(t->atoms); natom++) + if (t->atoms[natom] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v)) + nfail++; + } + EXPECT_EQ(0, nfail); +} + +void FindAtomIndices(const vector atoms, + const vector matched_atoms, + vector* atom_indices) { + atom_indices->clear(); + for (int i = 0; i < matched_atoms.size(); i++) { + int j = 0; + for (; j < atoms.size(); j++) { + if (matched_atoms[i] == atoms[j]) { + atom_indices->push_back(j); + break; + } + EXPECT_LT(j, atoms.size()); + } + 
} +} + +TEST(FilteredRE2Test, MatchEmptyPattern) { + FLAGS_filtered_re2_min_atom_len = 3; + + FilterTestVars v; + AtomTest* t = &atom_tests[0]; + // We are using the regexps used in one of the atom tests + // for this test. Adding the EXPECT here to make sure + // the index we use for the test is for the correct test. + EXPECT_EQ("CheckEmptyPattern", string(t->testname)); + int nregexp; + for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + string text = "0123"; + vector atom_ids; + vector matching_regexps; + EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids)); +} + +TEST(FilteredRE2Test, MatchTests) { + FLAGS_filtered_re2_min_atom_len = 3; + + FilterTestVars v; + AtomTest* t = &atom_tests[2]; + // We are using the regexps used in one of the atom tests + // for this test. + EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", string(t->testname)); + int nregexp; + for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++) + if (t->regexps[nregexp] == NULL) + break; + AddRegexpsAndCompile(t->regexps, nregexp, &v); + + string text = "abc121212xyz"; + // atoms = abc + vector atom_ids; + vector atoms; + atoms.push_back("abc"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + vector matching_regexps; + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(1, matching_regexps.size()); + + text = "abc12312yyyzzz"; + atoms.clear(); + atoms.push_back("abc"); + atoms.push_back("yyy"); + atoms.push_back("yyyzzz"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(1, matching_regexps.size()); + + text = "abcd12yyy32yyyzzz"; + atoms.clear(); + atoms.push_back("abc"); + atoms.push_back("abcd"); + atoms.push_back("yyy"); + atoms.push_back("yyyzzz"); + FindAtomIndices(v.atoms, atoms, &atom_ids); + LOG(INFO) << "S: " << atom_ids.size(); + for (int i = 0; i < atom_ids.size(); i++) + LOG(INFO) << "i: " << i << " : " << 
atom_ids[i]; + v.f.AllMatches(text, atom_ids, &matching_regexps); + EXPECT_EQ(2, matching_regexps.size()); +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/mimics_pcre_test.cc b/outside/re2/re2/testing/mimics_pcre_test.cc new file mode 100644 index 000000000..f96509298 --- /dev/null +++ b/outside/re2/re2/testing/mimics_pcre_test.cc @@ -0,0 +1,76 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "util/test.h" +#include "re2/prog.h" +#include "re2/regexp.h" + +namespace re2 { + +struct PCRETest { + const char* regexp; + bool should_match; +}; + +static PCRETest tests[] = { + // Most things should behave exactly. + { "abc", true }, + { "(a|b)c", true }, + { "(a*|b)c", true }, + { "(a|b*)c", true }, + { "a(b|c)d", true }, + { "a(()|())c", true }, + { "ab*c", true }, + { "ab+c", true }, + { "a(b*|c*)d", true }, + { "\\W", true }, + { "\\W{1,2}", true }, + { "\\d", true }, + + // Check that repeated empty strings do not. + { "(a*)*", false }, + { "x(a*)*y", false }, + { "(a*)+", false }, + { "(a+)*", true }, + { "(a+)+", true }, + { "(a+)+", true }, + + // \v is the only character class that shouldn't. + { "\\b", true }, + { "\\v", false }, + { "\\d", true }, + + // The handling of ^ in multi-line mode is different, as is + // the handling of $ in single-line mode. (Both involve + // boundary cases if the string ends with \n.) 
+ { "\\A", true }, + { "\\z", true }, + { "(?m)^", false }, + { "(?m)$", true }, + { "(?-m)^", true }, + { "(?-m)$", false }, // In PCRE, == \Z + { "(?m)\\A", true }, + { "(?m)\\z", true }, + { "(?-m)\\A", true }, + { "(?-m)\\z", true }, +}; + +TEST(MimicsPCRE, SimpleTests) { + for (int i = 0; i < arraysize(tests); i++) { + const PCRETest& t = tests[i]; + for (int j = 0; j < 2; j++) { + Regexp::ParseFlags flags = Regexp::LikePerl; + if (j == 0) + flags = flags | Regexp::Latin1; + Regexp* re = Regexp::Parse(t.regexp, flags, NULL); + CHECK(re) << " " << t.regexp; + CHECK_EQ(t.should_match, re->MimicsPCRE()) + << " " << t.regexp << " " + << (j==0 ? "latin1" : "utf"); + re->Decref(); + } + } +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/null_walker.cc b/outside/re2/re2/testing/null_walker.cc new file mode 100644 index 000000000..09b53cbea --- /dev/null +++ b/outside/re2/re2/testing/null_walker.cc @@ -0,0 +1,44 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "util/test.h" +#include "re2/regexp.h" +#include "re2/walker-inl.h" + +namespace re2 { + +// Null walker. For benchmarking the walker itself. + +class NullWalker : public Regexp::Walker { + public: + NullWalker() { } + bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args); + + bool ShortVisit(Regexp* re, bool a) { + // Should never be called: we use Walk not WalkExponential. + LOG(DFATAL) << "NullWalker::ShortVisit called"; + return a; + } + + private: + DISALLOW_EVIL_CONSTRUCTORS(NullWalker); +}; + +// Called after visiting re's children. child_args contains the return +// value from each of the children's PostVisits. Because this walker exists +// only to benchmark the walk itself, every argument is ignored and the +// result is always false.
+bool NullWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, + bool* child_args, int nchild_args) { + return false; +} + +// Walks this regexp with a NullWalker and discards the result. +void Regexp::NullWalk() { + NullWalker w; + w.Walk(this, false); +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/parse_test.cc b/outside/re2/re2/testing/parse_test.cc new file mode 100644 index 000000000..f67b477bf --- /dev/null +++ b/outside/re2/re2/testing/parse_test.cc @@ -0,0 +1,433 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test parse.cc, dump.cc, and tostring.cc. + +#include +#include +#include "util/test.h" +#include "re2/regexp.h" + +namespace re2 { + +static const Regexp::ParseFlags TestZeroFlags = Regexp::ParseFlags(1<<30); + +struct Test { + const char* regexp; + const char* parse; + Regexp::ParseFlags flags; +}; + +static Regexp::ParseFlags kTestFlags = Regexp::MatchNL | + Regexp::PerlX | + Regexp::PerlClasses | + Regexp::UnicodeGroups; + +static Test tests[] = { + // Base cases + { "a", "lit{a}" }, + { "a.", "cat{lit{a}dot{}}" }, + { "a.b", "cat{lit{a}dot{}lit{b}}" }, + { "ab", "str{ab}" }, + { "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" }, + { "abc", "str{abc}" }, + { "a|^", "alt{lit{a}bol{}}" }, + { "a|b", "cc{0x61-0x62}" }, + { "(a)", "cap{lit{a}}" }, + { "(a)|b", "alt{cap{lit{a}}lit{b}}" }, + { "a*", "star{lit{a}}" }, + { "a+", "plus{lit{a}}" }, + { "a?", "que{lit{a}}" }, + { "a{2}", "rep{2,2 lit{a}}" }, + { "a{2,3}", "rep{2,3 lit{a}}" }, + { "a{2,}", "rep{2,-1 lit{a}}" }, + { "a*?", "nstar{lit{a}}" }, + { "a+?", "nplus{lit{a}}" }, + { "a??", "nque{lit{a}}" }, + { "a{2}?", "nrep{2,2 lit{a}}" }, + { "a{2,3}?", "nrep{2,3 lit{a}}" }, + { "a{2,}?", "nrep{2,-1 lit{a}}" }, + { "", "emp{}" }, + { "|", "emp{}" }, // alt{emp{}emp{}} but got factored + { "|x|", "alt{emp{}lit{x}emp{}}" }, + { ".", "dot{}" }, + { "^", "bol{}" }, + { "$",
"eol{}" }, + { "\\|", "lit{|}" }, + { "\\(", "lit{(}" }, + { "\\)", "lit{)}" }, + { "\\*", "lit{*}" }, + { "\\+", "lit{+}" }, + { "\\?", "lit{?}" }, + { "{", "lit{{}" }, + { "}", "lit{}}" }, + { "\\.", "lit{.}" }, + { "\\^", "lit{^}" }, + { "\\$", "lit{$}" }, + { "\\\\", "lit{\\}" }, + { "[ace]", "cc{0x61 0x63 0x65}" }, + { "[abc]", "cc{0x61-0x63}" }, + { "[a-z]", "cc{0x61-0x7a}" }, + { "[a]", "lit{a}" }, + { "\\-", "lit{-}" }, + { "-", "lit{-}" }, + { "\\_", "lit{_}" }, + + // Posix and Perl extensions + { "[[:lower:]]", "cc{0x61-0x7a}" }, + { "[a-z]", "cc{0x61-0x7a}" }, + { "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" }, + { "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" }, + { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, + { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, + { "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, + { "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, + { "\\d", "cc{0x30-0x39}" }, + { "\\D", "cc{0-0x2f 0x3a-0x10ffff}" }, + { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" }, + { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" }, + { "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" }, + { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" }, + { "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, + { "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" }, + { "\\C", "byte{}" }, + + // Unicode, negatives, and a double negative. + { "\\p{Braille}", "cc{0x2800-0x28ff}" }, + { "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" }, + { "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" }, + { "\\P{^Braille}", "cc{0x2800-0x28ff}" }, + + // More interesting regular expressions. 
+ { "a{,2}", "str{a{,2}}" }, + { "\\.\\^\\$\\\\", "str{.^$\\}" }, + { "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" }, + { "[^a]", "cc{0-0x60 0x62-0x10ffff}" }, + { "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8 + { "a*{", "cat{star{lit{a}}lit{{}}" }, + + // Test precedences + { "(?:ab)*", "star{str{ab}}" }, + { "(ab)*", "star{cap{str{ab}}}" }, + { "ab|cd", "alt{str{ab}str{cd}}" }, + { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" }, + + // Test flattening. + { "(?:a)", "lit{a}" }, + { "(?:ab)(?:cd)", "str{abcd}" }, + { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" }, + { "a|.", "dot{}" }, + { ".|a", "dot{}" }, + + // Test Perl quoted literals + { "\\Q+|*?{[\\E", "str{+|*?{[}" }, + { "\\Q+\\E+", "plus{lit{+}}" }, + { "\\Q\\\\E", "lit{\\}" }, + { "\\Q\\\\\\E", "str{\\\\}" }, + + // Test Perl \A and \z + { "(?m)^", "bol{}" }, + { "(?m)$", "eol{}" }, + { "(?-m)^", "bot{}" }, + { "(?-m)$", "eot{}" }, + { "(?m)\\A", "bot{}" }, + { "(?m)\\z", "eot{\\z}" }, + { "(?-m)\\A", "bot{}" }, + { "(?-m)\\z", "eot{\\z}" }, + + // Test named captures + { "(?Pa)", "cap{name:lit{a}}" }, + + // Case-folded literals + { "[Aa]", "litfold{a}" }, + + // Strings + { "abcde", "str{abcde}" }, + { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" }, + + // Reported bug involving \n leaking in despite use of NeverNL. 
+ { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags }, + { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase }, + { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | 
Regexp::FoldCase }, + { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL }, + { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase }, + { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses }, + { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::FoldCase }, + { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::NeverNL }, + { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::FoldCase }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::NeverNL }, + { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", + Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase }, +}; + +bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) { + return Regexp::Equal(a, b); +} + +void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags, + const string& title) { + Regexp** re = new Regexp*[ntests]; + for (int i = 0; i < ntests; i++) { + RegexpStatus status; + Regexp::ParseFlags f = flags; + if (tests[i].flags != 0) { + f = tests[i].flags & ~TestZeroFlags; + } + re[i] = Regexp::Parse(tests[i].regexp, f, &status); + CHECK(re[i] != NULL) << " " << tests[i].regexp << " " + << status.Text(); + string s = re[i]->Dump(); + EXPECT_EQ(string(tests[i].parse), s) << "Regexp: " << tests[i].regexp + << "\nparse: " << tests[i].parse << " s: " << s << " flag=" << f; + } + + for (int i = 0; i < ntests; i++) { + for (int j = 0; j < ntests; j++) { + EXPECT_EQ(string(tests[i].parse) == tests[j].parse, + RegexpEqualTestingOnly(re[i], re[j])) + << "Regexp: " << tests[i].regexp << " " << tests[j].regexp; + } + } + + for (int i = 0; i < ntests; i++) + re[i]->Decref(); + delete[] 
re; +} + +// Test that regexps parse to expected structures. +TEST(TestParse, SimpleRegexps) { + TestParse(tests, arraysize(tests), kTestFlags, "simple"); +} + +Test foldcase_tests[] = { + { "AbCdE", "strfold{abcde}" }, + { "[Aa]", "litfold{a}" }, + { "a", "litfold{a}" }, + + // 0x17F is an old English long s (looks like an f) and folds to s. + // 0x212A is the Kelvin symbol and folds to k. + { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" }, // [Aa][A-z...] + { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, + { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, +}; + +// Test that parsing with FoldCase works. +TEST(TestParse, FoldCase) { + TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase"); +} + +Test literal_tests[] = { + { "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" }, +}; + +// Test that parsing with Literal works. +TEST(TestParse, Literal) { + TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal"); +} + +Test matchnl_tests[] = { + { ".", "dot{}" }, + { "\n", "lit{\n}" }, + { "[^a]", "cc{0-0x60 0x62-0x10ffff}" }, + { "[a\\n]", "cc{0xa 0x61}" }, +}; + +// Test that parsing with MatchNL works. +// (Also tested above during simple cases.) +TEST(TestParse, MatchNL) { + TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL"); +} + +Test nomatchnl_tests[] = { + { ".", "cc{0-0x9 0xb-0x10ffff}" }, + { "\n", "lit{\n}" }, + { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" }, + { "[a\\n]", "cc{0xa 0x61}" }, +}; + +// Test that parsing without MatchNL works. 
+TEST(TestParse, NoMatchNL) { + TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL"); +} + +Test prefix_tests[] = { + { "abc|abd", "cat{str{ab}cc{0x63-0x64}}" }, + { "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" }, + { "abc|abd|aef|bcx|bcy", + "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}" + "cat{str{bc}cc{0x78-0x79}}}" }, + { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" }, + { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" }, + { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" }, + { "(?:xx|yy)c|(?:xx|yy)d", + "cat{alt{str{xx}str{yy}}cc{0x63-0x64}}" }, + { "x{2}|x{2}[0-9]", + "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" }, + { "x{2}y|x{2}[0-9]y", + "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" }, +}; + +// Test that prefix factoring works. +TEST(TestParse, Prefix) { + TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix"); +} + +// Invalid regular expressions +const char* badtests[] = { + "(", + ")", + "(a", + "(a|b|", + "(a|b", + "[a-z", + "([a-z)", + "x{1001}", + "\xff", // Invalid UTF-8 + "[\xff]", + "[\\\xff]", + "\\\xff", + "(?Pa", + "(?P", + "(?Pa)", + "(?P<>a)", + "[a-Z]", + "(?i)[a-Z]", + "a{100000}", + "a{100000,}", +}; + +// Valid in Perl, bad in POSIX +const char* only_perl[] = { + "[a-b-c]", + "\\Qabc\\E", + "\\Q*+?{[\\E", + "\\Q\\\\E", + "\\Q\\\\\\E", + "\\Q\\\\\\\\E", + "\\Q\\\\\\\\\\E", + "(?:a)", + "(?Pa)", +}; + +// Valid in POSIX, bad in Perl. +const char* only_posix[] = { + "a++", + "a**", + "a?*", + "a+*", + "a{1}*", +}; + +// Test that parser rejects bad regexps. 
+TEST(TestParse, InvalidRegexps) { + for (int i = 0; i < arraysize(badtests); i++) { + CHECK(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL) + << " " << badtests[i]; + CHECK(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL) + << " " << badtests[i]; + } + for (int i = 0; i < arraysize(only_posix); i++) { + CHECK(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL) + << " " << only_posix[i]; + Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL); + CHECK(re) << " " << only_posix[i]; + re->Decref(); + } + for (int i = 0; i < arraysize(only_perl); i++) { + CHECK(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL) + << " " << only_perl[i]; + Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL); + CHECK(re) << " " << only_perl[i]; + re->Decref(); + } +} + +// Test that ToString produces original regexp or equivalent one. +TEST(TestToString, EquivalentParse) { + for (int i = 0; i < arraysize(tests); i++) { + RegexpStatus status; + Regexp::ParseFlags f = kTestFlags; + if (tests[i].flags != 0) { + f = tests[i].flags & ~TestZeroFlags; + } + Regexp* re = Regexp::Parse(tests[i].regexp, f, &status); + CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text(); + string s = re->Dump(); + EXPECT_EQ(string(tests[i].parse), s) << " " << tests[i].regexp << " " << string(tests[i].parse) << " " << s; + string t = re->ToString(); + if (t != tests[i].regexp) { + // If ToString didn't return the original regexp, + // it must have found one with fewer parens. + // Unfortunately we can't check the length here, because + // ToString produces "\\{" for a literal brace, + // but "{" is a shorter equivalent. + // CHECK_LT(t.size(), strlen(tests[i].regexp)) + // << " t=" << t << " regexp=" << tests[i].regexp; + + // Test that if we parse the new regexp we get the same structure. 
+ Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status); + CHECK(nre != NULL) << " reparse " << t << " " << status.Text(); + string ss = nre->Dump(); + string tt = nre->ToString(); + if (s != ss || t != tt) + LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t; + EXPECT_EQ(s, ss); + EXPECT_EQ(t, tt); + nre->Decref(); + } + re->Decref(); + } +} + +// Test that capture error args are correct. +TEST(NamedCaptures, ErrorArgs) { + RegexpStatus status; + Regexp* re; + + re = Regexp::Parse("test(?Pz)", Regexp::LikePerl, &status); + EXPECT_TRUE(re == NULL); + EXPECT_EQ(status.code(), kRegexpBadNamedCapture); + EXPECT_EQ(status.error_arg(), "(?P"); +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/possible_match_test.cc b/outside/re2/re2/testing/possible_match_test.cc new file mode 100644 index 000000000..7c2400eb5 --- /dev/null +++ b/outside/re2/re2/testing/possible_match_test.cc @@ -0,0 +1,240 @@ +// Copyright 2006-2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include "util/test.h" +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" +#include "re2/testing/regexp_generator.h" +#include "re2/testing/string_generator.h" + +namespace re2 { + +// Test that C++ strings are compared as uint8s, not int8s. +// PossibleMatchRange doesn't depend on this, but callers probably will. 
+TEST(CplusplusStrings, EightBit) { + string s = "\x70"; + string t = "\xA0"; + EXPECT_LT(s, t); +} + +struct PrefixTest { + const char* regexp; + int maxlen; + const char* min; + const char* max; +}; + +static PrefixTest tests[] = { + { "", 10, "", "", }, + { "Abcdef", 10, "Abcdef", "Abcdef" }, + { "abc(def|ghi)", 10, "abcdef", "abcghi" }, + { "a+hello", 10, "aa", "ahello" }, + { "a*hello", 10, "a", "hello" }, + { "def|abc", 10, "abc", "def" }, + { "a(b)(c)[d]", 10, "abcd", "abcd" }, + { "ab(cab|cat)", 10, "abcab", "abcat" }, + { "ab(cab|ca)x", 10, "abcabx", "abcax" }, + { "(ab|x)(c|de)", 10, "abc", "xde" }, + { "(ab|x)?(c|z)?", 10, "", "z" }, + { "[^\\s\\S]", 10, "", "" }, + { "(abc)+", 5, "abc", "abcac" }, + { "(abc)+", 2, "ab", "ac" }, + { "(abc)+", 1, "a", "b" }, + { "[a\xC3\xA1]", 4, "a", "\xC3\xA1" }, + { "a*", 10, "", "ab" }, + + { "(?i)Abcdef", 10, "ABCDEF", "abcdef" }, + { "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" }, + { "(?i)a+hello", 10, "AA", "ahello" }, + { "(?i)a*hello", 10, "A", "hello" }, + { "(?i)def|abc", 10, "ABC", "def" }, + { "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" }, + { "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" }, + { "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" }, + { "(?i)(ab|x)(c|de)", 10, "ABC", "xde" }, + { "(?i)(ab|x)?(c|z)?", 10, "", "z" }, + { "(?i)[^\\s\\S]", 10, "", "" }, + { "(?i)(abc)+", 5, "ABC", "abcac" }, + { "(?i)(abc)+", 2, "AB", "ac" }, + { "(?i)(abc)+", 1, "A", "b" }, + { "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" }, + { "(?i)a*", 10, "", "ab" }, + { "(?i)A*", 10, "", "ab" }, + + { "\\AAbcdef", 10, "Abcdef", "Abcdef" }, + { "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" }, + { "\\Aa+hello", 10, "aa", "ahello" }, + { "\\Aa*hello", 10, "a", "hello" }, + { "\\Adef|abc", 10, "abc", "def" }, + { "\\Aa(b)(c)[d]", 10, "abcd", "abcd" }, + { "\\Aab(cab|cat)", 10, "abcab", "abcat" }, + { "\\Aab(cab|ca)x", 10, "abcabx", "abcax" }, + { "\\A(ab|x)(c|de)", 10, "abc", "xde" }, + { "\\A(ab|x)?(c|z)?", 10, "", "z" }, + { "\\A[^\\s\\S]", 10, "", "" }, + { 
"\\A(abc)+", 5, "abc", "abcac" }, + { "\\A(abc)+", 2, "ab", "ac" }, + { "\\A(abc)+", 1, "a", "b" }, + { "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" }, + { "\\Aa*", 10, "", "ab" }, + + { "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" }, + { "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" }, + { "(?i)\\Aa+hello", 10, "AA", "ahello" }, + { "(?i)\\Aa*hello", 10, "A", "hello" }, + { "(?i)\\Adef|abc", 10, "ABC", "def" }, + { "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" }, + { "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" }, + { "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" }, + { "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" }, + { "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" }, + { "(?i)\\A[^\\s\\S]", 10, "", "" }, + { "(?i)\\A(abc)+", 5, "ABC", "abcac" }, + { "(?i)\\A(abc)+", 2, "AB", "ac" }, + { "(?i)\\A(abc)+", 1, "A", "b" }, + { "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" }, + { "(?i)\\Aa*", 10, "", "ab" }, + { "(?i)\\AA*", 10, "", "ab" }, +}; + +TEST(PossibleMatchRange, HandWritten) { + for (int i = 0; i < arraysize(tests); i++) { + for (int j = 0; j < 2; j++) { + const PrefixTest& t = tests[i]; + string min, max; + if (j == 0) { + LOG(INFO) << "Checking regexp=" << CEscape(t.regexp); + Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->PossibleMatchRange(&min, &max, t.maxlen)) + << " " << t.regexp; + delete prog; + re->Decref(); + } else { + CHECK(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen)); + } + EXPECT_EQ(t.min, min) << t.regexp; + EXPECT_EQ(t.max, max) << t.regexp; + } + } +} + +// Test cases where PossibleMatchRange should return false. +TEST(PossibleMatchRange, Failures) { + string min, max; + + // Fails because no room to write max. + EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0)); + + // Fails because there is no max -- any non-empty string matches + // or begins a match. Have to use Latin-1 input, because there + // are no valid UTF-8 strings beginning with byte 0xFF. 
+ EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2(".+hello", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2(".*hello", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2(".*", RE2::Latin1). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + EXPECT_FALSE(RE2("\\C*"). + PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); + + // Fails because it's a malformed regexp. + EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10)) + << "min=" << CEscape(min) << ", max=" << CEscape(max); +} + +// Exhaustive test: generate all regexps within parameters, +// then generate all strings of a given length over a given alphabet, +// then check that the prefix information agrees with whether +// the regexp matches each of the strings. +class PossibleMatchTester : public RegexpGenerator { + public: + PossibleMatchTester(int maxatoms, + int maxops, + const vector& alphabet, + const vector& ops, + int maxstrlen, + const vector& stralphabet) + : RegexpGenerator(maxatoms, maxops, alphabet, ops), + strgen_(maxstrlen, stralphabet), + regexps_(0), tests_(0) { } + + int regexps() { return regexps_; } + int tests() { return tests_; } + + // Needed for RegexpGenerator interface. + void HandleRegexp(const string& regexp); + + private: + StringGenerator strgen_; + + int regexps_; // Number of HandleRegexp calls + int tests_; // Number of regexp tests. + + DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester); +}; + +// Processes a single generated regexp. 
+// Checks that all accepted strings agree with the prefix range. +void PossibleMatchTester::HandleRegexp(const string& regexp) { + regexps_++; + + VLOG(3) << CEscape(regexp); + + RE2 re(regexp, RE2::Latin1); + CHECK_EQ(re.error(), ""); + + string min, max; + if(!re.PossibleMatchRange(&min, &max, 10)) { + // There's no good max for "\\C*". Can't use strcmp + // because sometimes it gets embedded in more + // complicated expressions. + if(strstr(regexp.c_str(), "\\C*")) + return; + LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp); + } + + strgen_.Reset(); + while (strgen_.HasNext()) { + const StringPiece& s = strgen_.Next(); + tests_++; + if (!RE2::FullMatch(s, re)) + continue; + CHECK_GE(s, min) << " regexp: " << regexp << " max: " << max; + CHECK_LE(s, max) << " regexp: " << regexp << " min: " << min; + } +} + +TEST(PossibleMatchRange, Exhaustive) { + int natom = 3; + int noperator = 3; + int stringlen = 5; + if (DEBUG_MODE) { + natom = 2; + noperator = 3; + stringlen = 3; + } + PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"), + RegexpGenerator::EgrepOps(), + stringlen, Explode("ab4")); + t.Generate(); + LOG(INFO) << t.regexps() << " regexps, " + << t.tests() << " tests"; +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/random_test.cc b/outside/re2/re2/testing/random_test.cc new file mode 100644 index 000000000..91d2b3277 --- /dev/null +++ b/outside/re2/re2/testing/random_test.cc @@ -0,0 +1,95 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Random testing of regular expression matching. 
+ +#include +#include "util/test.h" +#include "re2/testing/exhaustive_tester.h" + +DEFINE_int32(regexpseed, 404, "Random regexp seed."); +DEFINE_int32(regexpcount, 100, "How many random regexps to generate."); +DEFINE_int32(stringseed, 200, "Random string seed."); +DEFINE_int32(stringcount, 100, "How many random strings to generate."); + +namespace re2 { + +// Runs a random test on the given parameters. +// (Always uses the same random seeds for reproducibility. +// Can give different seeds on command line.) +static void RandomTest(int maxatoms, int maxops, + const vector& alphabet, + const vector& ops, + int maxstrlen, const vector& stralphabet, + const string& wrapper) { + // Limit to smaller test cases in debug mode, + // because everything is so much slower. + if (DEBUG_MODE) { + maxatoms--; + maxops--; + maxstrlen /= 2; + } + + ExhaustiveTester t(maxatoms, maxops, alphabet, ops, + maxstrlen, stralphabet, wrapper, ""); + t.RandomStrings(FLAGS_stringseed, FLAGS_stringcount); + t.GenerateRandom(FLAGS_regexpseed, FLAGS_regexpcount); + printf("%d regexps, %d tests, %d failures [%d/%d str]\n", + t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size()); + EXPECT_EQ(0, t.failures()); +} + +// Tests random small regexps involving literals and egrep operators. +TEST(Random, SmallEgrepLiterals) { + RandomTest(5, 5, Explode("abc."), RegexpGenerator::EgrepOps(), + 15, Explode("abc"), + ""); +} + +// Tests random bigger regexps involving literals and egrep operators. +TEST(Random, BigEgrepLiterals) { + RandomTest(10, 10, Explode("abc."), RegexpGenerator::EgrepOps(), + 15, Explode("abc"), + ""); +} + +// Tests random small regexps involving literals, capturing parens, +// and egrep operators. +TEST(Random, SmallEgrepCaptures) { + RandomTest(5, 5, Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(), + 15, Explode("abc"), + ""); +} + +// Tests random bigger regexps involving literals, capturing parens, +// and egrep operators. 
+TEST(Random, BigEgrepCaptures) { + RandomTest(10, 10, Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(), + 15, Explode("abc"), + ""); +} + +// Tests random large complicated expressions, using all the possible +// operators, some literals, some parenthesized literals, and predefined +// character classes like \d. (Adding larger character classes would +// make for too many possibilities.) +TEST(Random, Complicated) { + vector ops = Split(" ", + "%s%s %s|%s %s* %s*? %s+ %s+? %s? %s?? " + "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} %s{1,2} " + "%s{2} %s{2,} %s{3,4} %s{4,5}"); + + // Use (?:\b) and (?:\B) instead of \b and \B, + // because PCRE rejects \b* but accepts (?:\b)*. + // Ditto ^ and $. + vector atoms = Split(" ", + ". (?:^) (?:$) \\a \\f \\n \\r \\t \\v " + "\\d \\D \\s \\S \\w \\W (?:\\b) (?:\\B) " + "a (a) b c - \\\\"); + vector alphabet = Explode("abc123\001\002\003\t\r\n\v\f\a"); + RandomTest(10, 10, atoms, ops, 20, alphabet, ""); +} + +} // namespace re2 + diff --git a/outside/re2/re2/testing/re2_arg_test.cc b/outside/re2/re2/testing/re2_arg_test.cc new file mode 100644 index 000000000..ae7a7b0dc --- /dev/null +++ b/outside/re2/re2/testing/re2_arg_test.cc @@ -0,0 +1,133 @@ +// Copyright 2005 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This tests to make sure numbers are parsed from strings +// correctly. +// Todo: Expand the test to validate strings parsed to the other types +// supported by RE2::Arg class + +#include "util/test.h" +#include "re2/re2.h" + +namespace re2 { + +struct SuccessTable { + const char * value_string; + int64 value; + bool success[6]; +}; + +// Test boundary cases for different integral sizes. +// Specifically I want to make sure that values outside the boundries +// of an integral type will fail and that negative numbers will fail +// for unsigned types. 
The following table contains the boundaries for +// the various integral types and has entries for whether or not each +// type can contain the given value. +const SuccessTable kSuccessTable[] = { +// string integer value short ushort int uint int64 uint64 +// 0 to 2^7-1 +{ "0", 0, { true, true, true, true, true, true }}, +{ "127", 127, { true, true, true, true, true, true }}, + +// -1 to -2^7 +{ "-1", -1, { true, false, true, false, true, false }}, +{ "-128", -128, { true, false, true, false, true, false }}, + +// 2^7 to 2^8-1 +{ "128", 128, { true, true, true, true, true, true }}, +{ "255", 255, { true, true, true, true, true, true }}, + +// 2^8 to 2^15-1 +{ "256", 256, { true, true, true, true, true, true }}, +{ "32767", 32767, { true, true, true, true, true, true }}, + +// -2^7-1 to -2^15 +{ "-129", -129, { true, false, true, false, true, false }}, +{ "-32768", -32768, { true, false, true, false, true, false }}, + +// 2^15 to 2^16-1 +{ "32768", 32768, { false, true, true, true, true, true }}, +{ "65535", 65535, { false, true, true, true, true, true }}, + +// 2^16 to 2^31-1 +{ "65536", 65536, { false, false, true, true, true, true }}, +{ "2147483647", 2147483647, { false, false, true, true, true, true }}, + +// -2^15-1 to -2^31 +{ "-32769", -32769, { false, false, true, false, true, false }}, +{ "-2147483648", + static_cast(0xFFFFFFFF80000000LL), +{ false, false, true, false, true, false }}, + +// 2^31 to 2^32-1 +{ "2147483648", 2147483648U, { false, false, false, true, true, true }}, +{ "4294967295", 4294967295U, { false, false, false, true, true, true }}, + +// 2^32 to 2^63-1 +{ "4294967296", 4294967296LL, { false, false, false, false, true, true }}, +{ "9223372036854775807", + 9223372036854775807LL, { false, false, false, false, true, true }}, + +// -2^31-1 to -2^63 +{ "-2147483649", -2147483649LL, { false, false, false, false, true, false }}, +{ "-9223372036854775808", static_cast(0x8000000000000000LL), + { false, false, false, false, true, false }}, + +// 
2^63 to 2^64-1 +{ "9223372036854775808", static_cast(9223372036854775808ULL), + { false, false, false, false, false, true }}, +{ "18446744073709551615", static_cast(18446744073709551615ULL), + { false, false, false, false, false, true }}, + +// >= 2^64 +{ "18446744073709551616", 0, { false, false, false, false, false, false }}, +}; + +const int kNumStrings = ARRAYSIZE(kSuccessTable); + +// It's ugly to use a macro, but we apparently can't use the ASSERT_TRUE_M +// macro outside of a TEST block and this seems to be the only way to +// avoid code duplication. I can also pull off a couple nice tricks +// using concatenation for the type I'm checking against. +#define PARSE_FOR_TYPE(type, column) { \ + type r; \ + for ( int i = 0; i < kNumStrings; ++i ) { \ + RE2::Arg arg(&r); \ + const char* const p = kSuccessTable[i].value_string; \ + bool retval = arg.Parse(p, strlen(p)); \ + bool success = kSuccessTable[i].success[column]; \ + ASSERT_TRUE_M(retval == success, \ + StringPrintf("Parsing '%s' for type " #type " should return %d", \ + p, success).c_str()); \ + if ( success ) { \ + ASSERT_EQUALS(r, kSuccessTable[i].value); \ + } \ + } \ +} + +TEST(REArgTest, Int16Test) { + PARSE_FOR_TYPE(int16, 0); +} + +TEST(REArgTest, Uint16Test) { + PARSE_FOR_TYPE(uint16, 1); +} + +TEST(REArgTest, IntTest) { + PARSE_FOR_TYPE(int, 2); +} + +TEST(REArgTest, UInt32Test) { + PARSE_FOR_TYPE(uint32, 3); +} + +TEST(REArgTest, Iint64Test) { + PARSE_FOR_TYPE(int64, 4); +} + +TEST(REArgTest, Uint64Test) { + PARSE_FOR_TYPE(uint64, 5); +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/re2_test.cc b/outside/re2/re2/testing/re2_test.cc new file mode 100644 index 000000000..e2f97755d --- /dev/null +++ b/outside/re2/re2/testing/re2_test.cc @@ -0,0 +1,1404 @@ +// -*- coding: utf-8 -*- +// Copyright 2002-2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// TODO: Test extractions for PartialMatch/Consume + +#include +#include +#include +#include +#include +#include "util/test.h" +#include "re2/re2.h" +#include "re2/regexp.h" + +DECLARE_bool(logtostderr); + +namespace re2 { + +TEST(RE2, HexTests) { + + VLOG(1) << "hex tests"; + +#define CHECK_HEX(type, value) \ + do { \ + type v; \ + CHECK(RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*", RE2::Hex(&v))); \ + CHECK_EQ(v, 0x ## value); \ + CHECK(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \ + CHECK_EQ(v, 0x ## value); \ + } while(0) + + CHECK_HEX(short, 2bad); + CHECK_HEX(unsigned short, 2badU); + CHECK_HEX(int, dead); + CHECK_HEX(unsigned int, deadU); + CHECK_HEX(long, 7eadbeefL); + CHECK_HEX(unsigned long, deadbeefUL); + CHECK_HEX(long long, 12345678deadbeefLL); + CHECK_HEX(unsigned long long, cafebabedeadbeefULL); + +#undef CHECK_HEX +} + +TEST(RE2, OctalTests) { + VLOG(1) << "octal tests"; + +#define CHECK_OCTAL(type, value) \ + do { \ + type v; \ + CHECK(RE2::FullMatch(#value, "([0-7]+)[uUlL]*", RE2::Octal(&v))); \ + CHECK_EQ(v, 0 ## value); \ + CHECK(RE2::FullMatch("0" #value, "([0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \ + CHECK_EQ(v, 0 ## value); \ + } while(0) + + CHECK_OCTAL(short, 77777); + CHECK_OCTAL(unsigned short, 177777U); + CHECK_OCTAL(int, 17777777777); + CHECK_OCTAL(unsigned int, 37777777777U); + CHECK_OCTAL(long, 17777777777L); + CHECK_OCTAL(unsigned long, 37777777777UL); + CHECK_OCTAL(long long, 777777777777777777777LL); + CHECK_OCTAL(unsigned long long, 1777777777777777777777ULL); + +#undef CHECK_OCTAL +} + +TEST(RE2, DecimalTests) { + VLOG(1) << "decimal tests"; + +#define CHECK_DECIMAL(type, value) \ + do { \ + type v; \ + CHECK(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &v)); \ + CHECK_EQ(v, value); \ + CHECK(RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \ + CHECK_EQ(v, value); \ + } while(0) + + CHECK_DECIMAL(short, -1); + CHECK_DECIMAL(unsigned short, 9999); + CHECK_DECIMAL(int, 
-1000); + CHECK_DECIMAL(unsigned int, 12345U); + CHECK_DECIMAL(long, -10000000L); + CHECK_DECIMAL(unsigned long, 3083324652U); + CHECK_DECIMAL(long long, -100000000000000LL); + CHECK_DECIMAL(unsigned long long, 1234567890987654321ULL); + +#undef CHECK_DECIMAL +} + +TEST(RE2, Replace) { + VLOG(1) << "TestReplace"; + + struct ReplaceTest { + const char *regexp; + const char *rewrite; + const char *original; + const char *single; + const char *global; + int greplace_count; + }; + static const ReplaceTest tests[] = { + { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)", + "\\2\\1ay", + "the quick brown fox jumps over the lazy dogs.", + "ethay quick brown fox jumps over the lazy dogs.", + "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.", + 9 }, + { "\\w+", + "\\0-NOSPAM", + "abcd.efghi@google.com", + "abcd-NOSPAM.efghi@google.com", + "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM", + 4 }, + { "^", + "(START)", + "foo", + "(START)foo", + "(START)foo", + 1 }, + { "^", + "(START)", + "", + "(START)", + "(START)", + 1 }, + { "$", + "(END)", + "", + "(END)", + "(END)", + 1 }, + { "b", + "bb", + "ababababab", + "abbabababab", + "abbabbabbabbabb", + 5 }, + { "b", + "bb", + "bbbbbb", + "bbbbbbb", + "bbbbbbbbbbbb", + 6 }, + { "b+", + "bb", + "bbbbbb", + "bb", + "bb", + 1 }, + { "b*", + "bb", + "bbbbbb", + "bb", + "bb", + 1 }, + { "b*", + "bb", + "aaaaa", + "bbaaaaa", + "bbabbabbabbabbabb", + 6 }, + // Check newline handling + { "a.*a", + "(\\0)", + "aba\naba", + "(aba)\naba", + "(aba)\n(aba)", + 2 }, + { "", NULL, NULL, NULL, NULL, 0 } + }; + + for (const ReplaceTest *t = tests; t->original != NULL; ++t) { + VLOG(1) << StringPrintf("\"%s\" =~ s/%s/%s/g", t->original, t->regexp, t->rewrite); + string one(t->original); + CHECK(RE2::Replace(&one, t->regexp, t->rewrite)); + CHECK_EQ(one, t->single); + string all(t->original); + CHECK_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count) + << "Got: " << all; + CHECK_EQ(all, t->global); + } +} + +static void 
TestCheckRewriteString(const char* regexp, const char* rewrite, + bool expect_ok) { + string error; + RE2 exp(regexp); + bool actual_ok = exp.CheckRewriteString(rewrite, &error); + EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error; +} + +TEST(CheckRewriteString, all) { + TestCheckRewriteString("abc", "foo", true); + TestCheckRewriteString("abc", "foo\\", false); + TestCheckRewriteString("abc", "foo\\0bar", true); + + TestCheckRewriteString("a(b)c", "foo", true); + TestCheckRewriteString("a(b)c", "foo\\0bar", true); + TestCheckRewriteString("a(b)c", "foo\\1bar", true); + TestCheckRewriteString("a(b)c", "foo\\2bar", false); + TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true); + + TestCheckRewriteString("a(b)(c)", "foo\\12", true); + TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true); + TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false); +} + +TEST(RE2, Extract) { + VLOG(1) << "TestExtract"; + + string s; + + CHECK(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s)); + CHECK_EQ(s, "kremvax!boris"); + + CHECK(RE2::Extract("foo", ".*", "'\\0'", &s)); + CHECK_EQ(s, "'foo'"); + // check that false match doesn't overwrite + CHECK(!RE2::Extract("baz", "bar", "'\\0'", &s)); + CHECK_EQ(s, "'foo'"); +} + +TEST(RE2, Consume) { + VLOG(1) << "TestConsume"; + + RE2 r("\\s*(\\w+)"); // matches a word, possibly preceded by whitespace + string word; + + string s(" aaa b!@#$@#$cccc"); + StringPiece input(s); + + CHECK(RE2::Consume(&input, r, &word)); + CHECK_EQ(word, "aaa") << " input: " << input; + CHECK(RE2::Consume(&input, r, &word)); + CHECK_EQ(word, "b") << " input: " << input; + CHECK(! RE2::Consume(&input, r, &word)) << " input: " << input; +} + +TEST(RE2, ConsumeN) { + const string s(" one two three 4"); + StringPiece input(s); + + RE2::Arg argv[2]; + const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; + + // 0 arg + EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 0)); // Skips "one".
+ + // 1 arg + string word; + argv[0] = &word; + EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 1)); + EXPECT_EQ("two", word); + + // Multi-args + int n; + argv[1] = &n; + EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)\\s*(\\d+)", args, 2)); + EXPECT_EQ("three", word); + EXPECT_EQ(4, n); +} + +TEST(RE2, FindAndConsume) { + VLOG(1) << "TestFindAndConsume"; + + RE2 r("(\\w+)"); // matches a word + string word; + + string s(" aaa b!@#$@#$cccc"); + StringPiece input(s); + + CHECK(RE2::FindAndConsume(&input, r, &word)); + CHECK_EQ(word, "aaa"); + CHECK(RE2::FindAndConsume(&input, r, &word)); + CHECK_EQ(word, "b"); + CHECK(RE2::FindAndConsume(&input, r, &word)); + CHECK_EQ(word, "cccc"); + CHECK(! RE2::FindAndConsume(&input, r, &word)); + + // Check that FindAndConsume works without any submatches. + // Earlier version used uninitialized data for + // length to consume. + input = "aaa"; + CHECK(RE2::FindAndConsume(&input, "aaa")); + CHECK_EQ(input, ""); +} + +TEST(RE2, FindAndConsumeN) { + const string s(" one two three 4"); + StringPiece input(s); + + RE2::Arg argv[2]; + const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; + + // 0 arg + EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 0)); // Skips "one". 
+ + // 1 arg + string word; + argv[0] = &word; + EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 1)); + EXPECT_EQ("two", word); + + // Multi-args + int n; + argv[1] = &n; + EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)\\s*(\\d+)", args, 2)); + EXPECT_EQ("three", word); + EXPECT_EQ(4, n); +} + +TEST(RE2, MatchNumberPeculiarity) { + VLOG(1) << "TestMatchNumberPeculiarity"; + + RE2 r("(foo)|(bar)|(baz)"); + string word1; + string word2; + string word3; + + CHECK(RE2::PartialMatch("foo", r, &word1, &word2, &word3)); + CHECK_EQ(word1, "foo"); + CHECK_EQ(word2, ""); + CHECK_EQ(word3, ""); + CHECK(RE2::PartialMatch("bar", r, &word1, &word2, &word3)); + CHECK_EQ(word1, ""); + CHECK_EQ(word2, "bar"); + CHECK_EQ(word3, ""); + CHECK(RE2::PartialMatch("baz", r, &word1, &word2, &word3)); + CHECK_EQ(word1, ""); + CHECK_EQ(word2, ""); + CHECK_EQ(word3, "baz"); + CHECK(!RE2::PartialMatch("f", r, &word1, &word2, &word3)); + + string a; + CHECK(RE2::FullMatch("hello", "(foo)|hello", &a)); + CHECK_EQ(a, ""); +} + +TEST(RE2, Match) { + RE2 re("((\\w+):([0-9]+))"); // extracts host and port + StringPiece group[4]; + + // No match. + StringPiece s = "zyzzyva"; + CHECK(!re.Match(s, 0, s.size(), RE2::UNANCHORED, + group, arraysize(group))); + + // Matches and extracts. 
+ s = "a chrisr:9000 here"; + CHECK(re.Match(s, 0, s.size(), RE2::UNANCHORED, + group, arraysize(group))); + CHECK_EQ(group[0], "chrisr:9000"); + CHECK_EQ(group[1], "chrisr:9000"); + CHECK_EQ(group[2], "chrisr"); + CHECK_EQ(group[3], "9000"); + + string all, host; + int port; + CHECK(RE2::PartialMatch("a chrisr:9000 here", re, &all, &host, &port)); + CHECK_EQ(all, "chrisr:9000"); + CHECK_EQ(host, "chrisr"); + CHECK_EQ(port, 9000); +} + +static void TestRecursion(int size, const char *pattern) { + // Fill up a string repeating the pattern given + string domain; + domain.resize(size); + int patlen = strlen(pattern); + for (int i = 0; i < size; ++i) { + domain[i] = pattern[i % patlen]; + } + // Just make sure it doesn't crash due to too much recursion. + RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet); + RE2::FullMatch(domain, re); +} + +// A meta-quoted string, interpreted as a pattern, should always match +// the original unquoted string. +static void TestQuoteMeta(string unquoted, + const RE2::Options& options = RE2::DefaultOptions) { + string quoted = RE2::QuoteMeta(unquoted); + RE2 re(quoted, options); + EXPECT_TRUE_M(RE2::FullMatch(unquoted, re), + "Unquoted='" + unquoted + "', quoted='" + quoted + "'."); +} + +// A meta-quoted string, interpreted as a pattern, should not match +// the given should_not_match string. +static void NegativeTestQuoteMeta(string unquoted, string should_not_match, + const RE2::Options& options = RE2::DefaultOptions) { + string quoted = RE2::QuoteMeta(unquoted); + RE2 re(quoted, options); + EXPECT_FALSE_M(RE2::FullMatch(should_not_match, re), + "Unquoted='" + unquoted + "', quoted='" + quoted + "'."); +} + +// Tests that quoted meta characters match their original strings, +// and that a few things that shouldn't match indeed do not.
+TEST(QuoteMeta, Simple) { + TestQuoteMeta("foo"); + TestQuoteMeta("foo.bar"); + TestQuoteMeta("foo\\.bar"); + TestQuoteMeta("[1-9]"); + TestQuoteMeta("1.5-2.0?"); + TestQuoteMeta("\\d"); + TestQuoteMeta("Who doesn't like ice cream?"); + TestQuoteMeta("((a|b)c?d*e+[f-h]i)"); + TestQuoteMeta("((?!)xxx).*yyy"); + TestQuoteMeta("(["); +} +TEST(QuoteMeta, SimpleNegative) { + NegativeTestQuoteMeta("foo", "bar"); + NegativeTestQuoteMeta("...", "bar"); + NegativeTestQuoteMeta("\\.", "."); + NegativeTestQuoteMeta("\\.", ".."); + NegativeTestQuoteMeta("(a)", "a"); + NegativeTestQuoteMeta("(a|b)", "a"); + NegativeTestQuoteMeta("(a|b)", "(a)"); + NegativeTestQuoteMeta("(a|b)", "a|b"); + NegativeTestQuoteMeta("[0-9]", "0"); + NegativeTestQuoteMeta("[0-9]", "0-9"); + NegativeTestQuoteMeta("[0-9]", "[9]"); + NegativeTestQuoteMeta("((?!)xxx)", "xxx"); +} + +TEST(QuoteMeta, Latin1) { + TestQuoteMeta("3\xb2 = 9", RE2::Latin1); +} + +TEST(QuoteMeta, UTF8) { + TestQuoteMeta("Plácido Domingo"); + TestQuoteMeta("xyz"); // No fancy utf8. + TestQuoteMeta("\xc2\xb0"); // 2-byte utf8 -- a degree symbol. + TestQuoteMeta("27\xc2\xb0 degrees"); // As a middle character. + TestQuoteMeta("\xe2\x80\xb3"); // 3-byte utf8 -- a double prime. + TestQuoteMeta("\xf0\x9d\x85\x9f"); // 4-byte utf8 -- a music note. + TestQuoteMeta("27\xc2\xb0"); // Interpreted as Latin-1, this should + // still work. + NegativeTestQuoteMeta("27\xc2\xb0", + "27\\\xc2\\\xb0"); // 2-byte utf8 -- a degree symbol. +} + +TEST(QuoteMeta, HasNull) { + string has_null; + + // string with one null character + has_null += '\0'; + TestQuoteMeta(has_null); + NegativeTestQuoteMeta(has_null, ""); + + // Don't want null-followed-by-'1' to be interpreted as '\01'. 
+ has_null += '1'; + TestQuoteMeta(has_null); + NegativeTestQuoteMeta(has_null, "\1"); +} + +TEST(ProgramSize, BigProgram) { + RE2 re_simple("simple regexp"); + RE2 re_medium("medium.*regexp"); + RE2 re_complex("hard.{1,128}regexp"); + + CHECK_GT(re_simple.ProgramSize(), 0); + CHECK_GT(re_medium.ProgramSize(), re_simple.ProgramSize()); + CHECK_GT(re_complex.ProgramSize(), re_medium.ProgramSize()); +} + +// Issue 956519: handling empty character sets was +// causing NULL dereference. This tests a few empty character sets. +// (The way to get an empty character set is to negate a full one.) +TEST(EmptyCharset, Fuzz) { + static const char *empties[] = { + "[^\\S\\s]", + "[^\\S[:space:]]", + "[^\\D\\d]", + "[^\\D[:digit:]]" + }; + for (int i = 0; i < arraysize(empties); i++) + CHECK(!RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0)); +} + +// Bitstate assumes that kInstFail instructions in +// alternations or capture groups have been "compiled away". +TEST(EmptyCharset, BitstateAssumptions) { + // Captures trigger use of Bitstate. + static const char *nop_empties[] = { + "((((()))))" "[^\\S\\s]?", + "((((()))))" "([^\\S\\s])?", + "((((()))))" "([^\\S\\s]|[^\\S\\s])?", + "((((()))))" "(([^\\S\\s]|[^\\S\\s])|)" + }; + StringPiece group[6]; + for (int i = 0; i < arraysize(nop_empties); i++) + CHECK(RE2(nop_empties[i]).Match("", 0, 0, RE2::UNANCHORED, group, 6)); +} + +// Test that named groups work correctly. 
+TEST(Capture, NamedGroups) { + { + RE2 re("(hello world)"); + CHECK_EQ(re.NumberOfCapturingGroups(), 1); + const map<string, int>& m = re.NamedCapturingGroups(); + CHECK_EQ(m.size(), 0); + } + + { + RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))"); + CHECK_EQ(re.NumberOfCapturingGroups(), 6); + const map<string, int>& m = re.NamedCapturingGroups(); + CHECK_EQ(m.size(), 4); + CHECK_EQ(m.find("A")->second, 1); + CHECK_EQ(m.find("B")->second, 2); + CHECK_EQ(m.find("C")->second, 3); + CHECK_EQ(m.find("D")->second, 6); // $4 and $5 are anonymous + } +} + +TEST(RE2, FullMatchWithNoArgs) { + CHECK(RE2::FullMatch("h", "h")); + CHECK(RE2::FullMatch("hello", "hello")); + CHECK(RE2::FullMatch("hello", "h.*o")); + CHECK(!RE2::FullMatch("othello", "h.*o")); // Must be anchored at front + CHECK(!RE2::FullMatch("hello!", "h.*o")); // Must be anchored at end +} + +TEST(RE2, PartialMatch) { + CHECK(RE2::PartialMatch("x", "x")); + CHECK(RE2::PartialMatch("hello", "h.*o")); + CHECK(RE2::PartialMatch("othello", "h.*o")); + CHECK(RE2::PartialMatch("hello!", "h.*o")); + CHECK(RE2::PartialMatch("x", "((((((((((((((((((((x))))))))))))))))))))")); +} + +TEST(RE2, PartialMatchN) { + RE2::Arg argv[2]; + const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; + + // 0 arg + EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", args, 0)); + EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", args, 0)); + + // 1 arg + int i; + argv[0] = &i; + EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", args, 1)); + EXPECT_EQ(1001, i); + EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", args, 1)); + + // Multi-arg + string s; + argv[1] = &s; + EXPECT_TRUE(RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", args, 2)); + EXPECT_EQ(42, i); + EXPECT_EQ("life", s); + EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", args, 2)); +} + +TEST(RE2, FullMatchZeroArg) { + // Zero-arg + CHECK(RE2::FullMatch("1001", "\\d+")); +} + +TEST(RE2, FullMatchOneArg) { + int i; + + // Single-arg + CHECK(RE2::FullMatch("1001", "(\\d+)", &i)); +
CHECK_EQ(i, 1001); + CHECK(RE2::FullMatch("-123", "(-?\\d+)", &i)); + CHECK_EQ(i, -123); + CHECK(!RE2::FullMatch("10", "()\\d+", &i)); + CHECK(!RE2::FullMatch("1234567890123456789012345678901234567890", + "(\\d+)", &i)); +} + +TEST(RE2, FullMatchIntegerArg) { + int i; + + // Digits surrounding integer-arg + CHECK(RE2::FullMatch("1234", "1(\\d*)4", &i)); + CHECK_EQ(i, 23); + CHECK(RE2::FullMatch("1234", "(\\d)\\d+", &i)); + CHECK_EQ(i, 1); + CHECK(RE2::FullMatch("-1234", "(-\\d)\\d+", &i)); + CHECK_EQ(i, -1); + CHECK(RE2::PartialMatch("1234", "(\\d)", &i)); + CHECK_EQ(i, 1); + CHECK(RE2::PartialMatch("-1234", "(-\\d)", &i)); + CHECK_EQ(i, -1); +} + +TEST(RE2, FullMatchStringArg) { + string s; + // String-arg + CHECK(RE2::FullMatch("hello", "h(.*)o", &s)); + CHECK_EQ(s, string("ell")); +} + +TEST(RE2, FullMatchStringPieceArg) { + int i; + // StringPiece-arg + StringPiece sp; + CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &sp, &i)); + CHECK_EQ(sp.size(), 4); + CHECK(memcmp(sp.data(), "ruby", 4) == 0); + CHECK_EQ(i, 1234); +} + +TEST(RE2, FullMatchMultiArg) { + int i; + string s; + // Multi-arg + CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); + CHECK_EQ(s, string("ruby")); + CHECK_EQ(i, 1234); +} + +TEST(RE2, FullMatchN) { + RE2::Arg argv[2]; + const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; + + // 0 arg + EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", args, 0)); + EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", args, 0)); + + // 1 arg + int i; + argv[0] = &i; + EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", args, 1)); + EXPECT_EQ(1001, i); + EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", args, 1)); + + // Multi-arg + string s; + argv[1] = &s; + EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", args, 2)); + EXPECT_EQ(42, i); + EXPECT_EQ("life", s); + EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", args, 2)); +} + +TEST(RE2, FullMatchIgnoredArg) { + int i; + string s; + // Ignored arg + CHECK(RE2::FullMatch("ruby:1234", 
"(\\w+)(:)(\\d+)", &s, (void*)NULL, &i)); + CHECK_EQ(s, string("ruby")); + CHECK_EQ(i, 1234); +} + +TEST(RE2, FullMatchTypedNullArg) { + string s; + + // Ignore non-void* NULL arg + CHECK(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL)); + CHECK(RE2::FullMatch("hello", "h(.*)o", (string*)NULL)); + CHECK(RE2::FullMatch("hello", "h(.*)o", (StringPiece*)NULL)); + CHECK(RE2::FullMatch("1234", "(.*)", (int*)NULL)); + CHECK(RE2::FullMatch("1234567890123456", "(.*)", (long long*)NULL)); + CHECK(RE2::FullMatch("123.4567890123456", "(.*)", (double*)NULL)); + CHECK(RE2::FullMatch("123.4567890123456", "(.*)", (float*)NULL)); + + // Fail on non-void* NULL arg if the match doesn't parse for the given type. + CHECK(!RE2::FullMatch("hello", "h(.*)lo", &s, (char*)NULL)); + CHECK(!RE2::FullMatch("hello", "(.*)", (int*)NULL)); + CHECK(!RE2::FullMatch("1234567890123456", "(.*)", (int*)NULL)); + CHECK(!RE2::FullMatch("hello", "(.*)", (double*)NULL)); + CHECK(!RE2::FullMatch("hello", "(.*)", (float*)NULL)); +} + +// Check that numeric parsing code does not read past the end of +// the number being parsed. 
+TEST(RE2, NULTerminated) { + char *v; + int x; + long pagesize = sysconf(_SC_PAGE_SIZE); + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0)); + CHECK(v != reinterpret_cast<char*>(-1)); + LOG(INFO) << "Memory at " << (void*)v; + CHECK_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno; + v[pagesize - 1] = '1'; + + x = 0; + CHECK(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x)); + CHECK_EQ(x, 1); +} + +TEST(RE2, FullMatchTypeTests) { + // Type tests + string zeros(100, '0'); + { + char c; + CHECK(RE2::FullMatch("Hello", "(H)ello", &c)); + CHECK_EQ(c, 'H'); + } + { + unsigned char c; + CHECK(RE2::FullMatch("Hello", "(H)ello", &c)); + CHECK_EQ(c, static_cast<unsigned char>('H')); + } + { + int16 v; + CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100); + CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100); + CHECK(RE2::FullMatch("32767", "(-?\\d+)", &v)); CHECK_EQ(v, 32767); + CHECK(RE2::FullMatch("-32768", "(-?\\d+)", &v)); CHECK_EQ(v, -32768); + CHECK(!RE2::FullMatch("-32769", "(-?\\d+)", &v)); + CHECK(!RE2::FullMatch("32768", "(-?\\d+)", &v)); + } + { + uint16 v; + CHECK(RE2::FullMatch("100", "(\\d+)", &v)); CHECK_EQ(v, 100); + CHECK(RE2::FullMatch("32767", "(\\d+)", &v)); CHECK_EQ(v, 32767); + CHECK(RE2::FullMatch("65535", "(\\d+)", &v)); CHECK_EQ(v, 65535); + CHECK(!RE2::FullMatch("65536", "(\\d+)", &v)); + } + { + int32 v; + static const int32 max = 0x7fffffff; + static const int32 min = -max - 1; + CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100); + CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100); + CHECK(RE2::FullMatch("2147483647", "(-?\\d+)", &v)); CHECK_EQ(v, max); + CHECK(RE2::FullMatch("-2147483648", "(-?\\d+)", &v)); CHECK_EQ(v, min); + CHECK(!RE2::FullMatch("-2147483649", "(-?\\d+)", &v)); + CHECK(!RE2::FullMatch("2147483648", "(-?\\d+)", &v)); + + CHECK(RE2::FullMatch(zeros +
"2147483647", "(-?\\d+)", &v)); + CHECK_EQ(v, max); + CHECK(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v)); + CHECK_EQ(v, min); + + CHECK(!RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v)); + CHECK(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v))); + CHECK_EQ(v, max); + CHECK(!RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v))); + } + { + uint32 v; + static const uint32 max = 0xfffffffful; + CHECK(RE2::FullMatch("100", "(\\d+)", &v)); CHECK_EQ(v, 100); + CHECK(RE2::FullMatch("4294967295", "(\\d+)", &v)); CHECK_EQ(v, max); + CHECK(!RE2::FullMatch("4294967296", "(\\d+)", &v)); + CHECK(!RE2::FullMatch("-1", "(\\d+)", &v)); + + CHECK(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); CHECK_EQ(v, max); + } + { + int64 v; + static const int64 max = 0x7fffffffffffffffull; + static const int64 min = -max - 1; + char buf[32]; + + CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100); + CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100); + + snprintf(buf, sizeof(buf), "%lld", (long long int)max); + CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, max); + + snprintf(buf, sizeof(buf), "%lld", (long long int)min); + CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, min); + + snprintf(buf, sizeof(buf), "%lld", (long long int)max); + assert(buf[strlen(buf)-1] != '9'); + buf[strlen(buf)-1]++; + CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v)); + + snprintf(buf, sizeof(buf), "%lld", (long long int)min); + assert(buf[strlen(buf)-1] != '9'); + buf[strlen(buf)-1]++; + CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v)); + } + { + uint64 v; + int64 v2; + static const uint64 max = 0xffffffffffffffffull; + char buf[32]; + + CHECK(RE2::FullMatch("100", "(-?\\d+)", &v)); CHECK_EQ(v, 100); + CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v2)); CHECK_EQ(v2, -100); + + snprintf(buf, sizeof(buf), "%llu", (long long unsigned)max); + CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v)); CHECK_EQ(v, max); + + assert(buf[strlen(buf)-1] != '9'); + 
buf[strlen(buf)-1]++; + CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v)); + } +} + +TEST(RE2, FloatingPointFullMatchTypes) { + string zeros(100, '0'); + { + float v; + CHECK(RE2::FullMatch("100", "(.*)", &v)); CHECK_EQ(v, 100); + CHECK(RE2::FullMatch("-100.", "(.*)", &v)); CHECK_EQ(v, -100); + CHECK(RE2::FullMatch("1e23", "(.*)", &v)); CHECK_EQ(v, float(1e23)); + + CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &v)); + CHECK_EQ(v, float(1e23)); + + // 6700000000081920.1 is an edge case. + // 6700000000081920 is exactly halfway between + // two float32s, so the .1 should make it round up. + // However, the .1 is outside the precision possible with + // a float64: the nearest float64 is 6700000000081920. + // So if the code uses strtod and then converts to float32, + // round-to-even will make it round down instead of up. + // To pass the test, the parser must call strtof directly. + // This test case is carefully chosen to use only a 17-digit + // number, since C does not guarantee to get the correctly + // rounded answer for strtod and strtof unless the input is + // short. 
+ CHECK(RE2::FullMatch("0.1", "(.*)", &v)); + CHECK_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f); + CHECK(RE2::FullMatch("6700000000081920.1", "(.*)", &v)); + CHECK_EQ(v, 6700000000081920.1f) + << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f); + } + { + double v; + CHECK(RE2::FullMatch("100", "(.*)", &v)); CHECK_EQ(v, 100); + CHECK(RE2::FullMatch("-100.", "(.*)", &v)); CHECK_EQ(v, -100); + CHECK(RE2::FullMatch("1e23", "(.*)", &v)); CHECK_EQ(v, 1e23); + CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &v)); + CHECK_EQ(v, double(1e23)); + + CHECK(RE2::FullMatch("0.1", "(.*)", &v)); + CHECK_EQ(v, 0.1) << StringPrintf("%.17g != %.17g", v, 0.1); + CHECK(RE2::FullMatch("1.00000005960464485", "(.*)", &v)); + CHECK_EQ(v, 1.0000000596046448) + << StringPrintf("%.17g != %.17g", v, 1.0000000596046448); + } +} + +TEST(RE2, FullMatchAnchored) { + int i; + // Check that matching is fully anchored + CHECK(!RE2::FullMatch("x1001", "(\\d+)", &i)); + CHECK(!RE2::FullMatch("1001x", "(\\d+)", &i)); + CHECK(RE2::FullMatch("x1001", "x(\\d+)", &i)); CHECK_EQ(i, 1001); + CHECK(RE2::FullMatch("1001x", "(\\d+)x", &i)); CHECK_EQ(i, 1001); +} + +TEST(RE2, FullMatchBraces) { + // Braces + CHECK(RE2::FullMatch("0abcd", "[0-9a-f+.-]{5,}")); + CHECK(RE2::FullMatch("0abcde", "[0-9a-f+.-]{5,}")); + CHECK(!RE2::FullMatch("0abc", "[0-9a-f+.-]{5,}")); +} + +TEST(RE2, Complicated) { + // Complicated RE2 + CHECK(RE2::FullMatch("foo", "foo|bar|[A-Z]")); + CHECK(RE2::FullMatch("bar", "foo|bar|[A-Z]")); + CHECK(RE2::FullMatch("X", "foo|bar|[A-Z]")); + CHECK(!RE2::FullMatch("XY", "foo|bar|[A-Z]")); +} + +TEST(RE2, FullMatchEnd) { + // Check full-match handling (needs '$' tacked on internally) + CHECK(RE2::FullMatch("fo", "fo|foo")); + CHECK(RE2::FullMatch("foo", "fo|foo")); + CHECK(RE2::FullMatch("fo", "fo|foo$")); + CHECK(RE2::FullMatch("foo", "fo|foo$")); + CHECK(RE2::FullMatch("foo", "foo$")); + CHECK(!RE2::FullMatch("foo$bar", "foo\\$")); + CHECK(!RE2::FullMatch("fox", "fo|bar")); + + // 
Uncomment the following if we change the handling of '$' to + // prevent it from matching a trailing newline + if (false) { + // Check that we don't get bitten by pcre's special handling of a + // '\n' at the end of the string matching '$' + CHECK(!RE2::PartialMatch("foo\n", "foo$")); + } +} + +TEST(RE2, FullMatchArgCount) { + // Number of args: 0 through 7 captures, then 16. The whole array is re-zeroed (sizeof(a), not sizeof(0)) before each case so a stale value cannot satisfy a later CHECK_EQ. + int a[16]; + CHECK(RE2::FullMatch("", "")); + + memset(a, 0, sizeof(a)); + CHECK(RE2::FullMatch("1", + "(\\d){1}", + &a[0])); + CHECK_EQ(a[0], 1); + + memset(a, 0, sizeof(a)); + CHECK(RE2::FullMatch("12", + "(\\d)(\\d)", + &a[0], &a[1])); + CHECK_EQ(a[0], 1); + CHECK_EQ(a[1], 2); + + memset(a, 0, sizeof(a)); + CHECK(RE2::FullMatch("123", + "(\\d)(\\d)(\\d)", + &a[0], &a[1], &a[2])); + CHECK_EQ(a[0], 1); + CHECK_EQ(a[1], 2); + CHECK_EQ(a[2], 3); + + memset(a, 0, sizeof(a)); + CHECK(RE2::FullMatch("1234", + "(\\d)(\\d)(\\d)(\\d)", + &a[0], &a[1], &a[2], &a[3])); + CHECK_EQ(a[0], 1); + CHECK_EQ(a[1], 2); + CHECK_EQ(a[2], 3); + CHECK_EQ(a[3], 4); + + memset(a, 0, sizeof(a)); + CHECK(RE2::FullMatch("12345", + "(\\d)(\\d)(\\d)(\\d)(\\d)", + &a[0], &a[1], &a[2], &a[3], + &a[4])); + CHECK_EQ(a[0], 1); + CHECK_EQ(a[1], 2); + CHECK_EQ(a[2], 3); + CHECK_EQ(a[3], 4); + CHECK_EQ(a[4], 5); + + memset(a, 0, sizeof(a)); + CHECK(RE2::FullMatch("123456", + "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", + &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5])); + CHECK_EQ(a[0], 1); + CHECK_EQ(a[1], 2); + CHECK_EQ(a[2], 3); + CHECK_EQ(a[3], 4); + CHECK_EQ(a[4], 5); + CHECK_EQ(a[5], 6); + + memset(a, 0, sizeof(a)); + CHECK(RE2::FullMatch("1234567", + "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", + &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6])); + CHECK_EQ(a[0], 1); + CHECK_EQ(a[1], 2); + CHECK_EQ(a[2], 3); + CHECK_EQ(a[3], 4); + CHECK_EQ(a[4], 5); + CHECK_EQ(a[5], 6); + CHECK_EQ(a[6], 7); + + memset(a, 0, sizeof(a)); + CHECK(RE2::FullMatch("1234567890123456", + "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)" + "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", + &a[0], &a[1], 
&a[2], &a[3], + &a[4], &a[5], &a[6], &a[7], + &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15])); + CHECK_EQ(a[0], 1); + CHECK_EQ(a[1], 2); + CHECK_EQ(a[2], 3); + CHECK_EQ(a[3], 4); + CHECK_EQ(a[4], 5); + CHECK_EQ(a[5], 6); + CHECK_EQ(a[6], 7); + CHECK_EQ(a[7], 8); + CHECK_EQ(a[8], 9); + CHECK_EQ(a[9], 0); + CHECK_EQ(a[10], 1); + CHECK_EQ(a[11], 2); + CHECK_EQ(a[12], 3); + CHECK_EQ(a[13], 4); + CHECK_EQ(a[14], 5); + CHECK_EQ(a[15], 6); +} + +TEST(RE2, Accessors) { + // Check the pattern() accessor + { + const string kPattern = "http://([^/]+)/.*"; + const RE2 re(kPattern); + CHECK_EQ(kPattern, re.pattern()); + } + + // Check RE2 error field. + { + RE2 re("foo"); + CHECK(re.error().empty()); // Must have no error + CHECK(re.ok()); + CHECK(re.error_code() == RE2::NoError); + } +} + +TEST(RE2, UTF8) { + // Check UTF-8 handling + // Three Japanese characters (nihongo) + const char utf8_string[] = { + (char)0xe6, (char)0x97, (char)0xa5, // 65e5 + (char)0xe6, (char)0x9c, (char)0xac, // 672c + (char)0xe8, (char)0xaa, (char)0x9e, // 8a9e + 0 + }; + const char utf8_pattern[] = { + '.', + (char)0xe6, (char)0x9c, (char)0xac, // 672c + '.', + 0 + }; + + // Both should match in either mode, bytes or UTF-8 + RE2 re_test1(".........", RE2::Latin1); + CHECK(RE2::FullMatch(utf8_string, re_test1)); + RE2 re_test2("..."); + CHECK(RE2::FullMatch(utf8_string, re_test2)); + + // Check that '.' matches one byte or UTF-8 character + // according to the mode. 
+ string s; + RE2 re_test3("(.)", RE2::Latin1); + CHECK(RE2::PartialMatch(utf8_string, re_test3, &s)); + CHECK_EQ(s, string("\xe6")); + RE2 re_test4("(.)"); + CHECK(RE2::PartialMatch(utf8_string, re_test4, &s)); + CHECK_EQ(s, string("\xe6\x97\xa5")); + + // Check that string matches itself in either mode + RE2 re_test5(utf8_string, RE2::Latin1); + CHECK(RE2::FullMatch(utf8_string, re_test5)); + RE2 re_test6(utf8_string); + CHECK(RE2::FullMatch(utf8_string, re_test6)); + + // Check that pattern matches string only in UTF8 mode + RE2 re_test7(utf8_pattern, RE2::Latin1); + CHECK(!RE2::FullMatch(utf8_string, re_test7)); + RE2 re_test8(utf8_pattern); + CHECK(RE2::FullMatch(utf8_string, re_test8)); +} + +TEST(RE2, UngreedyUTF8) { + // Check that ungreedy, UTF8 regular expressions don't match when they + // oughtn't -- see bug 82246. + { + // This code always worked. + const char* pattern = "\\w+X"; + const string target = "a aX"; + RE2 match_sentence(pattern, RE2::Latin1); + RE2 match_sentence_re(pattern); + + CHECK(!RE2::FullMatch(target, match_sentence)); + CHECK(!RE2::FullMatch(target, match_sentence_re)); + } + { + const char* pattern = "(?U)\\w+X"; + const string target = "a aX"; + RE2 match_sentence(pattern, RE2::Latin1); + CHECK_EQ(match_sentence.error(), ""); + RE2 match_sentence_re(pattern); + + CHECK(!RE2::FullMatch(target, match_sentence)); + CHECK(!RE2::FullMatch(target, match_sentence_re)); + } +} + +TEST(RE2, Rejects) { + { RE2 re("a\\1", RE2::Quiet); CHECK(!re.ok()); } + { + RE2 re("a[x", RE2::Quiet); + CHECK(!re.ok()); + } + { + RE2 re("a[z-a]", RE2::Quiet); + CHECK(!re.ok()); + } + { + RE2 re("a[[:foobar:]]", RE2::Quiet); + CHECK(!re.ok()); + } + { + RE2 re("a(b", RE2::Quiet); + CHECK(!re.ok()); + } + { + RE2 re("a\\", RE2::Quiet); + CHECK(!re.ok()); + } +} + +TEST(RE2, NoCrash) { + // Test that using a bad regexp doesn't crash. 
+ { + RE2 re("a\\", RE2::Quiet); + CHECK(!re.ok()); + CHECK(!RE2::PartialMatch("a\\b", re)); + } + + // Test that using an enormous regexp doesn't crash + { + RE2 re("(((.{100}){100}){100}){100}", RE2::Quiet); + CHECK(!re.ok()); + CHECK(!RE2::PartialMatch("aaa", re)); + } + + // Test that a crazy regexp still compiles and runs. + { + RE2 re(".{512}x", RE2::Quiet); + CHECK(re.ok()); + string s; + s.append(515, 'c'); + s.append("x"); + CHECK(RE2::PartialMatch(s, re)); + } +} + +TEST(RE2, Recursion) { + // Test that recursion is stopped. + // This test is PCRE-legacy -- there's no recursion in RE2. + int bytes = 15 * 1024; // enough to crash PCRE + TestRecursion(bytes, "."); + TestRecursion(bytes, "a"); + TestRecursion(bytes, "a."); + TestRecursion(bytes, "ab."); + TestRecursion(bytes, "abc."); +} + +TEST(RE2, BigCountedRepetition) { + // Test that counted repetition works, given tons of memory. + RE2::Options opt; + opt.set_max_mem(256<<20); + + RE2 re(".{512}x", opt); + CHECK(re.ok()); + string s; + s.append(515, 'c'); + s.append("x"); + CHECK(RE2::PartialMatch(s, re)); +} + +TEST(RE2, DeepRecursion) { + // Test for deep stack recursion. This would fail with a + // segmentation violation due to stack overflow before pcre was + // patched. + // Again, a PCRE legacy test. RE2 doesn't recurse. + string comment("x*"); + string a(131072, 'a'); + comment += a; + comment += "*x"; + RE2 re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)"); + CHECK(RE2::FullMatch(comment, re)); +} + +// Suggested by Josh Hyman. Failed when SearchOnePass was +// not implementing case-folding. +TEST(CaseInsensitive, MatchAndConsume) { + string result; + string text = "A fish named *Wanda*"; + StringPiece sp(text); + + EXPECT_TRUE(RE2::PartialMatch(sp, "(?i)([wand]{5})", &result)); + EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result)); +} + +// RE2 should permit implicit conversions from string, StringPiece, const char*, +// and C string literals. 
+TEST(RE2, ImplicitConversions) { + string re_string("."); + StringPiece re_stringpiece("."); + const char* re_cstring = "."; + EXPECT_TRUE(RE2::PartialMatch("e", re_string)); + EXPECT_TRUE(RE2::PartialMatch("e", re_stringpiece)); + EXPECT_TRUE(RE2::PartialMatch("e", re_cstring)); + EXPECT_TRUE(RE2::PartialMatch("e", ".")); +} + +// Bugs introduced by 8622304 +TEST(RE2, CL8622304) { + // reported by ingow + string dir; + EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])")); // ok + EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &dir)); // fails + + // reported by jacobsa + string key, val; + EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true", + "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?", + &key, + &val)); + EXPECT_EQ(key, "bar"); + EXPECT_EQ(val, "1,0x2F,030,4,5"); +} + + +// Check that RE2 returns correct regexp pieces on error. +// In particular, make sure it returns whole runes +// and that it always reports invalid UTF-8. +// Also check that Perl error flag piece is big enough. +static struct ErrorTest { + const char *regexp; + const char *error; +} error_tests[] = { + { "ab\\αcd", "\\α" }, + { "ef\\x☺01", "\\x☺0" }, + { "gh\\x1☺01", "\\x1☺" }, + { "ij\\x1", "\\x1" }, + { "kl\\x", "\\x" }, + { "uv\\x{0000☺}", "\\x{0000☺" }, + { "wx\\p{ABC", "\\p{ABC" }, + { "yz(?smiUX:abc)", "(?smiUX" }, // used to return (?s but the error is X + { "aa(?sm☺i", "(?sm☺" }, + { "bb[abc", "[abc" }, + + { "mn\\x1\377", "" }, // no argument string returned for invalid UTF-8 + { "op\377qr", "" }, + { "st\\x{00000\377", "" }, + { "zz\\p{\377}", "" }, + { "zz\\x{00\377}", "" }, + { "zz(?Pabc)", "" }, +}; +TEST(RE2, ErrorArgs) { + for (int i = 0; i < arraysize(error_tests); i++) { + RE2 re(error_tests[i].regexp, RE2::Quiet); + EXPECT_FALSE(re.ok()); + EXPECT_EQ(re.error_arg(), error_tests[i].error) << re.error(); + } +} + +// Check that "never match \n" mode never matches \n. 
+static struct NeverTest { + const char* regexp; + const char* text; + const char* match; +} never_tests[] = { + { "(.*)", "abc\ndef\nghi\n", "abc" }, + { "(?s)(abc.*def)", "abc\ndef\n", NULL }, + { "(abc(.|\n)*def)", "abc\ndef\n", NULL }, + { "(abc[^x]*def)", "abc\ndef\n", NULL }, + { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" }, +}; +TEST(RE2, NeverNewline) { + RE2::Options opt; + opt.set_never_nl(true); + for (int i = 0; i < arraysize(never_tests); i++) { + const NeverTest& t = never_tests[i]; + RE2 re(t.regexp, opt); + if (t.match == NULL) { + EXPECT_FALSE(re.PartialMatch(t.text, re)); + } else { + StringPiece m; + EXPECT_TRUE(re.PartialMatch(t.text, re, &m)); + EXPECT_EQ(m, t.match); + } + } +} + +// Check that dot_nl option works. +TEST(RE2, DotNL) { + RE2::Options opt; + opt.set_dot_nl(true); + EXPECT_TRUE(RE2::PartialMatch("\n", RE2(".", opt))); + EXPECT_FALSE(RE2::PartialMatch("\n", RE2("(?-s).", opt))); + opt.set_never_nl(true); + EXPECT_FALSE(RE2::PartialMatch("\n", RE2(".", opt))); +} + +// Check that there are no capturing groups in "never capture" mode. +TEST(RE2, NeverCapture) { + RE2::Options opt; + opt.set_never_capture(true); + RE2 re("(r)(e)", opt); + EXPECT_EQ(0, re.NumberOfCapturingGroups()); +} + +// Bitstate bug was looking at submatch[0] even if nsubmatch == 0. +// Triggered by a failed DFA search falling back to Bitstate when +// using Match with a NULL submatch set. Bitstate tried to read +// the submatch[0] entry even if nsubmatch was 0. +TEST(RE2, BitstateCaptureBug) { + RE2::Options opt; + opt.set_max_mem(20000); + RE2 re("(_________$)", opt); + StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x"; + EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0)); +} + +// C++ version of bug 609710. 
+TEST(RE2, UnicodeClasses) { + const string str = "ABCDEFGHI譚永鋒"; + string a, b, c; + + EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}")); + EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}")); + EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}")); + EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}")); + EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}")); + EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}")); + + EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}")); + EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}")); + EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}")); + EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}")); + EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}")); + EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}")); + + EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}")); + EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}")); + EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}")); + EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}")); + EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}")); + EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}")); + + EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}")); + EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}")); + EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}")); + EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}")); + EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}")); + EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}")); + + EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c)); + EXPECT_EQ("A", a); + EXPECT_EQ("B", b); + EXPECT_EQ("C", c); + + EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c)); + EXPECT_EQ("A", a); + EXPECT_EQ("B", b); + EXPECT_EQ("C", c); + + EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}")); + + EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c)); + EXPECT_EQ("A", a); + EXPECT_EQ("B", b); + EXPECT_EQ("C", c); + + EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]")); + + EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c)); + EXPECT_EQ("譚", a); + EXPECT_EQ("永", b); + EXPECT_EQ("鋒", c); +} + +// Bug reported by saito. 
2009/02/17 +TEST(RE2, NullVsEmptyString) { + RE2 re2(".*"); + StringPiece v1(""); + EXPECT_TRUE(RE2::FullMatch(v1, re2)); + + StringPiece v2; + EXPECT_TRUE(RE2::FullMatch(v2, re2)); +} + +// Issue 1816809 +TEST(RE2, Bug1816809) { + RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))"); + StringPiece piece("llx-3;llx4"); + string x; + EXPECT_TRUE(RE2::Consume(&piece, re, &x)); +} + +// Issue 3061120 +TEST(RE2, Bug3061120) { + RE2 re("(?i)\\W"); + EXPECT_FALSE(RE2::PartialMatch("x", re)); // always worked + EXPECT_FALSE(RE2::PartialMatch("k", re)); // broke because of kelvin + EXPECT_FALSE(RE2::PartialMatch("s", re)); // broke because of latin long s +} + +TEST(RE2, CapturingGroupNames) { + // Opening parentheses annotated with group IDs (groups 3 and 6 are named G2, group 7 is named G1): + // 12 3 45 6 7 + RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))"); + EXPECT_TRUE(re.ok()); + const map<int, string>& have = re.CapturingGroupNames(); + map<int, string> want; + want[3] = "G2"; + want[6] = "G2"; + want[7] = "G1"; + EXPECT_EQ(want, have); +} + +TEST(RE2, RegexpToStringLossOfAnchor) { + EXPECT_EQ(RE2("^[a-c]at", RE2::POSIX).Regexp()->ToString(), "^[a-c]at"); + EXPECT_EQ(RE2("^[a-c]at").Regexp()->ToString(), "(?-m:^)[a-c]at"); + EXPECT_EQ(RE2("ca[t-z]$", RE2::POSIX).Regexp()->ToString(), "ca[t-z]$"); + EXPECT_EQ(RE2("ca[t-z]$").Regexp()->ToString(), "ca[t-z](?-m:$)"); +} + +// Issue 10131674 +TEST(RE2, Bug10131674) { + // Some of these escapes describe values that do not fit in a byte. + RE2 re("\\140\\440\\174\\271\\150\\656\\106\\201\\004\\332", RE2::Latin1); + EXPECT_FALSE(re.ok()); + EXPECT_FALSE(RE2::FullMatch("hello world", re)); +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/regexp_benchmark.cc b/outside/re2/re2/testing/regexp_benchmark.cc new file mode 100644 index 000000000..ca7627f5d --- /dev/null +++ b/outside/re2/re2/testing/regexp_benchmark.cc @@ -0,0 +1,1461 @@ +// Copyright 2006-2008 The RE2 Authors. All Rights Reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Benchmarks for regular expression implementations. + +#include "util/test.h" +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" +#include "util/pcre.h" +#include "util/benchmark.h" + +namespace re2 { +void Test(); +void MemoryUsage(); +} // namespace re2 + +typedef testing::MallocCounter MallocCounter; + +namespace re2 { + +void Test() { + Regexp* re = Regexp::Parse("(\\d+)-(\\d+)-(\\d+)", Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + const char* text = "650-253-0001"; + StringPiece sp[4]; + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + CHECK_EQ(sp[0], "650-253-0001"); + CHECK_EQ(sp[1], "650"); + CHECK_EQ(sp[2], "253"); + CHECK_EQ(sp[3], "0001"); + delete prog; + re->Decref(); + LOG(INFO) << "test passed\n"; +} + +void MemoryUsage() { + const char* regexp = "(\\d+)-(\\d+)-(\\d+)"; + const char* text = "650-253-0001"; + { + MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + // Can't pass mc.HeapGrowth() and mc.PeakHeapGrowth() to LOG(INFO) directly, + // because LOG(INFO) might do a big allocation before they get evaluated. 
+ fprintf(stderr, "Regexp: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + mc.Reset(); + + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + fprintf(stderr, "Prog: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + mc.Reset(); + + StringPiece sp[4]; + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + fprintf(stderr, "Search: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + delete prog; + re->Decref(); + } + + { + MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); + + PCRE re(regexp, PCRE::UTF8); + fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + PCRE::FullMatch(text, re); + fprintf(stderr, "RE: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + } + + { + MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); + + PCRE* re = new PCRE(regexp, PCRE::UTF8); + fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + PCRE::FullMatch(text, *re); + fprintf(stderr, "PCRE*: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + delete re; + } + + { + MallocCounter mc(MallocCounter::THIS_THREAD_ONLY); + + RE2 re(regexp); + fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + RE2::FullMatch(text, re); + fprintf(stderr, "RE2: %7lld bytes (peak=%lld)\n", mc.HeapGrowth(), mc.PeakHeapGrowth()); + } + + fprintf(stderr, "sizeof: PCRE=%d RE2=%d Prog=%d Inst=%d\n", + static_cast(sizeof(PCRE)), + static_cast(sizeof(RE2)), + static_cast(sizeof(Prog)), + static_cast(sizeof(Prog::Inst))); +} + +// Regular expression implementation wrappers. +// Defined at bottom of file, but they are repetitive +// and not interesting. 
+ +typedef void SearchImpl(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match); + +SearchImpl SearchDFA, SearchNFA, SearchOnePass, SearchBitState, + SearchPCRE, SearchRE2, + SearchCachedDFA, SearchCachedNFA, SearchCachedOnePass, SearchCachedBitState, + SearchCachedPCRE, SearchCachedRE2; + +typedef void ParseImpl(int iters, const char* regexp, const StringPiece& text); + +ParseImpl Parse1NFA, Parse1OnePass, Parse1BitState, + Parse1PCRE, Parse1RE2, + Parse1Backtrack, + Parse1CachedNFA, Parse1CachedOnePass, Parse1CachedBitState, + Parse1CachedPCRE, Parse1CachedRE2, + Parse1CachedBacktrack; + +ParseImpl Parse3NFA, Parse3OnePass, Parse3BitState, + Parse3PCRE, Parse3RE2, + Parse3Backtrack, + Parse3CachedNFA, Parse3CachedOnePass, Parse3CachedBitState, + Parse3CachedPCRE, Parse3CachedRE2, + Parse3CachedBacktrack; + +ParseImpl SearchParse2CachedPCRE, SearchParse2CachedRE2; + +ParseImpl SearchParse1CachedPCRE, SearchParse1CachedRE2; + +// Benchmark: failed search for regexp in random text. + +// Generate random text that won't contain the search string, +// to test worst-case search behavior. NOTE(review): `!rand()%30` parses as `(!rand())%30`, so the '\n' branch runs only when rand() returns 0. Left unchanged on purpose: SearchSuccess matches ".*$" against this text and expects success, and newlines may change that -- confirm before "fixing". +void MakeText(string* text, int nbytes) { + text->resize(nbytes); + srand(0); + for (int i = 0; i < nbytes; i++) { + if (!rand()%30) + (*text)[i] = '\n'; + else + (*text)[i] = rand()%(0x7E + 1 - 0x20)+0x20; + } +} + +// Makes text of size nbytes, then calls search to look for +// regexp in the text iters times; text generation is excluded from the timing. +void Search(int iters, int nbytes, const char* regexp, SearchImpl* search) { + StopBenchmarkTiming(); + string s; + MakeText(&s, nbytes); + BenchmarkMemoryUsage(); + StartBenchmarkTiming(); + search(iters, regexp, s, Prog::kUnanchored, false); + SetBenchmarkBytesProcessed(static_cast<int64>(iters)*nbytes); +} + +// These two are easy because they start with an A, +// giving the search loop something to memchr for. 
+#define EASY0 "ABCDEFGHIJKLMNOPQRSTUVWXYZ$" +#define EASY1 "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$" + +// This is a little harder, since it starts with a character class +// and thus can't be memchr'ed. Could look for ABC and work backward, +// but no one does that. +#define MEDIUM "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$" + +// This is a fair amount harder, because of the leading [ -~]*. +// A bad backtracking implementation will take O(text^2) time to +// figure out there's no match. +#define HARD "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$" + +// This stresses engines that are trying to track parentheses. +#define PARENS "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" \ + "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$" + +void Search_Easy0_CachedDFA(int i, int n) { Search(i, n, EASY0, SearchCachedDFA); } +void Search_Easy0_CachedNFA(int i, int n) { Search(i, n, EASY0, SearchCachedNFA); } +void Search_Easy0_CachedPCRE(int i, int n) { Search(i, n, EASY0, SearchCachedPCRE); } +void Search_Easy0_CachedRE2(int i, int n) { Search(i, n, EASY0, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Easy0_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Easy0_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Easy0_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Easy0_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +void Search_Easy1_CachedDFA(int i, int n) { Search(i, n, EASY1, SearchCachedDFA); } +void Search_Easy1_CachedNFA(int i, int n) { Search(i, n, EASY1, SearchCachedNFA); } +void Search_Easy1_CachedPCRE(int i, int n) { Search(i, n, EASY1, SearchCachedPCRE); } +void Search_Easy1_CachedRE2(int i, int n) { Search(i, n, EASY1, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Easy1_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Easy1_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Easy1_CachedPCRE, 8, 
16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Easy1_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +void Search_Medium_CachedDFA(int i, int n) { Search(i, n, MEDIUM, SearchCachedDFA); } +void Search_Medium_CachedNFA(int i, int n) { Search(i, n, MEDIUM, SearchCachedNFA); } +void Search_Medium_CachedPCRE(int i, int n) { Search(i, n, MEDIUM, SearchCachedPCRE); } +void Search_Medium_CachedRE2(int i, int n) { Search(i, n, MEDIUM, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Medium_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Medium_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Medium_CachedPCRE, 8, 256<<10)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Medium_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +void Search_Hard_CachedDFA(int i, int n) { Search(i, n, HARD, SearchCachedDFA); } +void Search_Hard_CachedNFA(int i, int n) { Search(i, n, HARD, SearchCachedNFA); } +void Search_Hard_CachedPCRE(int i, int n) { Search(i, n, HARD, SearchCachedPCRE); } +void Search_Hard_CachedRE2(int i, int n) { Search(i, n, HARD, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Hard_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Hard_CachedNFA, 8, 256<<10)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Hard_CachedPCRE, 8, 4<<10)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Hard_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +void Search_Parens_CachedDFA(int i, int n) { Search(i, n, PARENS, SearchCachedDFA); } +void Search_Parens_CachedNFA(int i, int n) { Search(i, n, PARENS, SearchCachedNFA); } +void Search_Parens_CachedPCRE(int i, int n) { Search(i, n, PARENS, SearchCachedPCRE); } +void Search_Parens_CachedRE2(int i, int n) { Search(i, n, PARENS, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Parens_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Parens_CachedNFA, 8, 
256<<10)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Parens_CachedPCRE, 8, 8)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Parens_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +void SearchBigFixed(int iters, int nbytes, SearchImpl* search) { + StopBenchmarkTiming(); + string s; + s.append(nbytes/2, 'x'); + string regexp = "^" + s + ".*$"; + string t; + MakeText(&t, nbytes/2); + s += t; + BenchmarkMemoryUsage(); + StartBenchmarkTiming(); + search(iters, regexp.c_str(), s, Prog::kUnanchored, true); + SetBenchmarkBytesProcessed(static_cast<int64>(iters)*nbytes); +} + +void Search_BigFixed_CachedDFA(int i, int n) { SearchBigFixed(i, n, SearchCachedDFA); } +void Search_BigFixed_CachedNFA(int i, int n) { SearchBigFixed(i, n, SearchCachedNFA); } +void Search_BigFixed_CachedPCRE(int i, int n) { SearchBigFixed(i, n, SearchCachedPCRE); } +void Search_BigFixed_CachedRE2(int i, int n) { SearchBigFixed(i, n, SearchCachedRE2); } + +BENCHMARK_RANGE(Search_BigFixed_CachedDFA, 8, 1<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_BigFixed_CachedNFA, 8, 32<<10)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_BigFixed_CachedPCRE, 8, 32<<10)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_BigFixed_CachedRE2, 8, 1<<20)->ThreadRange(1, NumCPUs()); + +// Benchmark: FindAndConsume on random text with "Hello World" appended at the end. +void FindAndConsume(int iters, int nbytes) { + StopBenchmarkTiming(); + string s; + MakeText(&s, nbytes); + s.append("Hello World"); + StartBenchmarkTiming(); + RE2 re("((Hello World))"); + for (int i = 0; i < iters; i++) { + StringPiece t = s; + StringPiece u; + CHECK(RE2::FindAndConsume(&t, re, &u)); + CHECK_EQ(u, "Hello World"); + } + SetBenchmarkBytesProcessed(static_cast<int64>(iters)*nbytes); +} + +BENCHMARK_RANGE(FindAndConsume, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +// Benchmark: successful anchored search. 
+ +void SearchSuccess(int iters, int nbytes, const char* regexp, SearchImpl* search) { + string s; + MakeText(&s, nbytes); + BenchmarkMemoryUsage(); + search(iters, regexp, s, Prog::kAnchored, true); + SetBenchmarkBytesProcessed(static_cast<int64>(iters)*nbytes); +} + +// Unambiguous search (RE2 can use OnePass). + +void Search_Success_DFA(int i, int n) { SearchSuccess(i, n, ".*$", SearchDFA); } +void Search_Success_OnePass(int i, int n) { SearchSuccess(i, n, ".*$", SearchOnePass); } +void Search_Success_PCRE(int i, int n) { SearchSuccess(i, n, ".*$", SearchPCRE); } +void Search_Success_RE2(int i, int n) { SearchSuccess(i, n, ".*$", SearchRE2); } + +BENCHMARK_RANGE(Search_Success_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Success_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Success_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Success_OnePass, 8, 2<<20)->ThreadRange(1, NumCPUs()); + +void Search_Success_CachedDFA(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedDFA); } +void Search_Success_CachedOnePass(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedOnePass); } +void Search_Success_CachedPCRE(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedPCRE); } +void Search_Success_CachedRE2(int i, int n) { SearchSuccess(i, n, ".*$", SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Success_CachedDFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Success_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Success_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Success_CachedOnePass, 8, 2<<20)->ThreadRange(1, NumCPUs()); + +// Ambiguous search (RE2 cannot use OnePass). 
+ +void Search_Success1_DFA(int i, int n) { SearchSuccess(i, n, ".*.$", SearchDFA); } +void Search_Success1_PCRE(int i, int n) { SearchSuccess(i, n, ".*.$", SearchPCRE); } +void Search_Success1_RE2(int i, int n) { SearchSuccess(i, n, ".*.$", SearchRE2); } +void Search_Success1_BitState(int i, int n) { SearchSuccess(i, n, ".*.$", SearchBitState); } + +BENCHMARK_RANGE(Search_Success1_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Success1_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Success1_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); +BENCHMARK_RANGE(Search_Success1_BitState, 8, 2<<20)->ThreadRange(1, NumCPUs()); + +void Search_Success1_Cached_DFA(int i, int n) { SearchSuccess(i, n, ".*.$", SearchCachedDFA); } +void Search_Success1_Cached_PCRE(int i, int n) { SearchSuccess(i, n, ".*.$", SearchCachedPCRE); } +void Search_Success1_Cached_RE2(int i, int n) { SearchSuccess(i, n, ".*.$", SearchCachedRE2); } + +BENCHMARK_RANGE(Search_Success1_Cached_DFA, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK_RANGE(Search_Success1_Cached_PCRE, 8, 16<<20)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK_RANGE(Search_Success1_Cached_RE2, 8, 16<<20)->ThreadRange(1, NumCPUs()); + +// Benchmark: use regexp to find phone number. 
+ +void SearchDigits(int iters, SearchImpl* search) { + const char *text = "650-253-0001"; + int len = strlen(text); + BenchmarkMemoryUsage(); + search(iters, "([0-9]+)-([0-9]+)-([0-9]+)", + StringPiece(text, len), Prog::kAnchored, true); + SetBenchmarkItemsProcessed(iters); +} + +void Search_Digits_DFA(int i) { SearchDigits(i, SearchDFA); } +void Search_Digits_NFA(int i) { SearchDigits(i, SearchNFA); } +void Search_Digits_OnePass(int i) { SearchDigits(i, SearchOnePass); } +void Search_Digits_PCRE(int i) { SearchDigits(i, SearchPCRE); } +void Search_Digits_RE2(int i) { SearchDigits(i, SearchRE2); } +void Search_Digits_BitState(int i) { SearchDigits(i, SearchBitState); } + +BENCHMARK(Search_Digits_DFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Search_Digits_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Search_Digits_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Search_Digits_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Search_Digits_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Search_Digits_BitState)->ThreadRange(1, NumCPUs()); + +// Benchmark: use regexp to parse digit fields in phone number. 
+ +void Parse3Digits(int iters, + void (*parse3)(int, const char*, const StringPiece&)) { + BenchmarkMemoryUsage(); + parse3(iters, "([0-9]+)-([0-9]+)-([0-9]+)", "650-253-0001"); + SetBenchmarkItemsProcessed(iters); +} + +void Parse_Digits_NFA(int i) { Parse3Digits(i, Parse3NFA); } +void Parse_Digits_OnePass(int i) { Parse3Digits(i, Parse3OnePass); } +void Parse_Digits_PCRE(int i) { Parse3Digits(i, Parse3PCRE); } +void Parse_Digits_RE2(int i) { Parse3Digits(i, Parse3RE2); } +void Parse_Digits_Backtrack(int i) { Parse3Digits(i, Parse3Backtrack); } +void Parse_Digits_BitState(int i) { Parse3Digits(i, Parse3BitState); } + +BENCHMARK(Parse_Digits_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Digits_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_Digits_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_Digits_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Digits_Backtrack)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Digits_BitState)->ThreadRange(1, NumCPUs()); + +void Parse_CachedDigits_NFA(int i) { Parse3Digits(i, Parse3CachedNFA); } +void Parse_CachedDigits_OnePass(int i) { Parse3Digits(i, Parse3CachedOnePass); } +void Parse_CachedDigits_PCRE(int i) { Parse3Digits(i, Parse3CachedPCRE); } +void Parse_CachedDigits_RE2(int i) { Parse3Digits(i, Parse3CachedRE2); } +void Parse_CachedDigits_Backtrack(int i) { Parse3Digits(i, Parse3CachedBacktrack); } +void Parse_CachedDigits_BitState(int i) { Parse3Digits(i, Parse3CachedBitState); } + +BENCHMARK(Parse_CachedDigits_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigits_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_CachedDigits_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedDigits_Backtrack)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigits_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigits_BitState)->ThreadRange(1, NumCPUs()); + +void Parse3DigitDs(int iters, + void (*parse3)(int, const char*, const StringPiece&)) { + 
BenchmarkMemoryUsage(); + parse3(iters, "(\\d+)-(\\d+)-(\\d+)", "650-253-0001"); + SetBenchmarkItemsProcessed(iters); +} + +void Parse_DigitDs_NFA(int i) { Parse3DigitDs(i, Parse3NFA); } +void Parse_DigitDs_OnePass(int i) { Parse3DigitDs(i, Parse3OnePass); } +void Parse_DigitDs_PCRE(int i) { Parse3DigitDs(i, Parse3PCRE); } +void Parse_DigitDs_RE2(int i) { Parse3DigitDs(i, Parse3RE2); } +void Parse_DigitDs_Backtrack(int i) { Parse3DigitDs(i, Parse3CachedBacktrack); } +void Parse_DigitDs_BitState(int i) { Parse3DigitDs(i, Parse3CachedBitState); } + +BENCHMARK(Parse_DigitDs_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_DigitDs_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_DigitDs_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_DigitDs_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_DigitDs_Backtrack)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_DigitDs_BitState)->ThreadRange(1, NumCPUs()); + +void Parse_CachedDigitDs_NFA(int i) { Parse3DigitDs(i, Parse3CachedNFA); } +void Parse_CachedDigitDs_OnePass(int i) { Parse3DigitDs(i, Parse3CachedOnePass); } +void Parse_CachedDigitDs_PCRE(int i) { Parse3DigitDs(i, Parse3CachedPCRE); } +void Parse_CachedDigitDs_RE2(int i) { Parse3DigitDs(i, Parse3CachedRE2); } +void Parse_CachedDigitDs_Backtrack(int i) { Parse3DigitDs(i, Parse3CachedBacktrack); } +void Parse_CachedDigitDs_BitState(int i) { Parse3DigitDs(i, Parse3CachedBitState); } + +BENCHMARK(Parse_CachedDigitDs_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigitDs_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_CachedDigitDs_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedDigitDs_Backtrack)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigitDs_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedDigitDs_BitState)->ThreadRange(1, NumCPUs()); + +// Benchmark: splitting off leading number field. 
+ +void Parse1Split(int iters, + void (*parse1)(int, const char*, const StringPiece&)) { + BenchmarkMemoryUsage(); + parse1(iters, "[0-9]+-(.*)", "650-253-0001"); + SetBenchmarkItemsProcessed(iters); +} + +void Parse_Split_NFA(int i) { Parse1Split(i, Parse1NFA); } +void Parse_Split_OnePass(int i) { Parse1Split(i, Parse1OnePass); } +void Parse_Split_PCRE(int i) { Parse1Split(i, Parse1PCRE); } +void Parse_Split_RE2(int i) { Parse1Split(i, Parse1RE2); } +void Parse_Split_BitState(int i) { Parse1Split(i, Parse1BitState); } + +BENCHMARK(Parse_Split_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Split_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_Split_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_Split_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_Split_BitState)->ThreadRange(1, NumCPUs()); + +void Parse_CachedSplit_NFA(int i) { Parse1Split(i, Parse1CachedNFA); } +void Parse_CachedSplit_OnePass(int i) { Parse1Split(i, Parse1CachedOnePass); } +void Parse_CachedSplit_PCRE(int i) { Parse1Split(i, Parse1CachedPCRE); } +void Parse_CachedSplit_RE2(int i) { Parse1Split(i, Parse1CachedRE2); } +void Parse_CachedSplit_BitState(int i) { Parse1Split(i, Parse1CachedBitState); } + +BENCHMARK(Parse_CachedSplit_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedSplit_OnePass)->ThreadRange(1, NumCPUs()); +#ifdef USEPCRE +BENCHMARK(Parse_CachedSplit_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedSplit_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedSplit_BitState)->ThreadRange(1, NumCPUs()); + +// Benchmark: splitting off leading number field but harder (ambiguous regexp). 
+ +void Parse1SplitHard(int iters, + void (*run)(int, const char*, const StringPiece&)) { + BenchmarkMemoryUsage(); + run(iters, "[0-9]+.(.*)", "650-253-0001"); + SetBenchmarkItemsProcessed(iters); +} + +void Parse_SplitHard_NFA(int i) { Parse1SplitHard(i, Parse1NFA); } +void Parse_SplitHard_PCRE(int i) { Parse1SplitHard(i, Parse1PCRE); } +void Parse_SplitHard_RE2(int i) { Parse1SplitHard(i, Parse1RE2); } +void Parse_SplitHard_BitState(int i) { Parse1SplitHard(i, Parse1BitState); } + +#ifdef USEPCRE +BENCHMARK(Parse_SplitHard_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_SplitHard_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_SplitHard_BitState)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_SplitHard_NFA)->ThreadRange(1, NumCPUs()); + +void Parse_CachedSplitHard_NFA(int i) { Parse1SplitHard(i, Parse1CachedNFA); } +void Parse_CachedSplitHard_PCRE(int i) { Parse1SplitHard(i, Parse1CachedPCRE); } +void Parse_CachedSplitHard_RE2(int i) { Parse1SplitHard(i, Parse1CachedRE2); } +void Parse_CachedSplitHard_BitState(int i) { Parse1SplitHard(i, Parse1CachedBitState); } +void Parse_CachedSplitHard_Backtrack(int i) { Parse1SplitHard(i, Parse1CachedBacktrack); } + +#ifdef USEPCRE +BENCHMARK(Parse_CachedSplitHard_PCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(Parse_CachedSplitHard_RE2)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedSplitHard_BitState)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedSplitHard_NFA)->ThreadRange(1, NumCPUs()); +BENCHMARK(Parse_CachedSplitHard_Backtrack)->ThreadRange(1, NumCPUs()); + +// Benchmark: Parse1SplitHard, big text, small match. 
+
+// The text is 100000 'x' characters followed by the phone number, so the
+// part the regexp can match sits at the very end.
+void Parse1SplitBig1(int iters,
+                  void (*run)(int, const char*, const StringPiece&)) {
+  string haystack(100000, 'x');
+  haystack += "650-253-0001";
+  BenchmarkMemoryUsage();
+  run(iters, "[0-9]+.(.*)", haystack);
+  SetBenchmarkItemsProcessed(iters);
+}
+
+void Parse_CachedSplitBig1_PCRE(int i) {
+  Parse1SplitBig1(i, SearchParse1CachedPCRE);
+}
+void Parse_CachedSplitBig1_RE2(int i) {
+  Parse1SplitBig1(i, SearchParse1CachedRE2);
+}
+
+#ifdef USEPCRE
+BENCHMARK(Parse_CachedSplitBig1_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_CachedSplitBig1_RE2)->ThreadRange(1, NumCPUs());
+
+// Benchmark: Parse1SplitHard, big text, big match.
+
+// The text is "650-253-" followed by 100000 '0' characters, so the match
+// starts at the beginning and the capture covers the long tail.
+void Parse1SplitBig2(int iters,
+                  void (*run)(int, const char*, const StringPiece&)) {
+  string haystack("650-253-");
+  haystack.append(100000, '0');
+  BenchmarkMemoryUsage();
+  run(iters, "[0-9]+.(.*)", haystack);
+  SetBenchmarkItemsProcessed(iters);
+}
+
+void Parse_CachedSplitBig2_PCRE(int i) {
+  Parse1SplitBig2(i, SearchParse1CachedPCRE);
+}
+void Parse_CachedSplitBig2_RE2(int i) {
+  Parse1SplitBig2(i, SearchParse1CachedRE2);
+}
+
+#ifdef USEPCRE
+BENCHMARK(Parse_CachedSplitBig2_PCRE)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK(Parse_CachedSplitBig2_RE2)->ThreadRange(1, NumCPUs());
+
+// Benchmark: measure time required to parse (but not execute)
+// a simple regular expression.
+ +void ParseRegexp(int iters, const string& regexp) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + re->Decref(); + } +} + +void SimplifyRegexp(int iters, const string& regexp) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Regexp* sre = re->Simplify(); + CHECK(sre); + sre->Decref(); + re->Decref(); + } +} + +void NullWalkRegexp(int iters, const string& regexp) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + for (int i = 0; i < iters; i++) { + re->NullWalk(); + } + re->Decref(); +} + +void SimplifyCompileRegexp(int iters, const string& regexp) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Regexp* sre = re->Simplify(); + CHECK(sre); + Prog* prog = sre->CompileToProg(0); + CHECK(prog); + delete prog; + sre->Decref(); + re->Decref(); + } +} + +void CompileRegexp(int iters, const string& regexp) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + delete prog; + re->Decref(); + } +} + +void CompileToProg(int iters, const string& regexp) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + for (int i = 0; i < iters; i++) { + Prog* prog = re->CompileToProg(0); + CHECK(prog); + delete prog; + } + re->Decref(); +} + +void CompileByteMap(int iters, const string& regexp) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + for (int i = 0; i < iters; i++) { + prog->ComputeByteMap(); + } + delete prog; + re->Decref(); +} + +void CompilePCRE(int iters, const string& regexp) { + for (int i = 0; i < iters; i++) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + } +} + +void CompileRE2(int iters, const string& regexp) { + 
for (int i = 0; i < iters; i++) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + } +} + +void RunBuild(int iters, const string& regexp, void (*run)(int, const string&)) { + run(iters, regexp); + SetBenchmarkItemsProcessed(iters); +} + +} // namespace re2 + +DEFINE_string(compile_regexp, "(.*)-(\\d+)-of-(\\d+)", "regexp for compile benchmarks"); + +namespace re2 { + +void BM_PCRE_Compile(int i) { RunBuild(i, FLAGS_compile_regexp, CompilePCRE); } +void BM_Regexp_Parse(int i) { RunBuild(i, FLAGS_compile_regexp, ParseRegexp); } +void BM_Regexp_Simplify(int i) { RunBuild(i, FLAGS_compile_regexp, SimplifyRegexp); } +void BM_CompileToProg(int i) { RunBuild(i, FLAGS_compile_regexp, CompileToProg); } +void BM_CompileByteMap(int i) { RunBuild(i, FLAGS_compile_regexp, CompileByteMap); } +void BM_Regexp_Compile(int i) { RunBuild(i, FLAGS_compile_regexp, CompileRegexp); } +void BM_Regexp_SimplifyCompile(int i) { RunBuild(i, FLAGS_compile_regexp, SimplifyCompileRegexp); } +void BM_Regexp_NullWalk(int i) { RunBuild(i, FLAGS_compile_regexp, NullWalkRegexp); } +void BM_RE2_Compile(int i) { RunBuild(i, FLAGS_compile_regexp, CompileRE2); } + +#ifdef USEPCRE +BENCHMARK(BM_PCRE_Compile)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(BM_Regexp_Parse)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_Regexp_Simplify)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_CompileToProg)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_CompileByteMap)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_Regexp_Compile)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_Regexp_SimplifyCompile)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_Regexp_NullWalk)->ThreadRange(1, NumCPUs()); +BENCHMARK(BM_RE2_Compile)->ThreadRange(1, NumCPUs()); + + +// Makes text of size nbytes, then calls run to search +// the text for regexp iters times. 
+void SearchPhone(int iters, int nbytes, ParseImpl* search) {
+  // Building the text is kept outside the timed region.
+  StopBenchmarkTiming();
+  string s;
+  MakeText(&s, nbytes);
+  // The phone number goes after the generated text.
+  s.append("(650) 253-0001");
+  BenchmarkMemoryUsage();
+  StartBenchmarkTiming();
+  search(iters, "(\\d{3}-|\\(\\d{3}\\)\\s+)(\\d{3}-\\d{4})", s);
+  // Widen before multiplying: iters*nbytes can exceed the range of int.
+  SetBenchmarkBytesProcessed(static_cast<int64>(iters)*nbytes);
+}
+
+void SearchPhone_CachedPCRE(int i, int n) {
+  SearchPhone(i, n, SearchParse2CachedPCRE);
+}
+void SearchPhone_CachedRE2(int i, int n) {
+  SearchPhone(i, n, SearchParse2CachedRE2);
+}
+
+#ifdef USEPCRE
+BENCHMARK_RANGE(SearchPhone_CachedPCRE, 8, 16<<20)->ThreadRange(1, NumCPUs());
+#endif
+BENCHMARK_RANGE(SearchPhone_CachedRE2, 8, 16<<20)->ThreadRange(1, NumCPUs());
+
+/*
+TODO(rsc): Make this work again.
+
+// Generates and returns a string over binary alphabet {0,1} that contains
+// all possible binary sequences of length n as subsequences. The obvious
+// brute force method would generate a string of length n * 2^n, but this
+// generates a string of length n + 2^n - 1 called a De Bruijn cycle.
+// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
+static string DeBruijnString(int n) {
+  CHECK_LT(n, 8*sizeof(int));
+  CHECK_GT(n, 0);
+
+  // NOTE(review): the next line is truncated in this copy. The text between
+  // "1<" and "(iters)" is missing -- apparently the rest of DeBruijnString
+  // and the head of the CacheFill function that the wrappers below call.
+  // Restore it from the upstream file before re-enabling this section.
+  vector did(1<(iters)*s.size());
+}
+
+void CacheFillPCRE(int i, int n) { CacheFill(i, n, SearchCachedPCRE); }
+void CacheFillRE2(int i, int n) { CacheFill(i, n, SearchCachedRE2); }
+void CacheFillNFA(int i, int n) { CacheFill(i, n, SearchCachedNFA); }
+void CacheFillDFA(int i, int n) { CacheFill(i, n, SearchCachedDFA); }
+
+// BENCHMARK_WITH_ARG uses __LINE__ to generate distinct identifiers
+// for the static BenchmarkRegisterer, which makes it unusable inside
+// a macro like DO24 below. MY_BENCHMARK_WITH_ARG uses the argument a
+// to make the identifiers distinct (only possible when 'a' is a simple
+// expression like 2, not like 1+1).
+#define MY_BENCHMARK_WITH_ARG(n, a) \
+  bool __benchmark_ ## n ## a = \
+    (new ::testing::Benchmark(#n, NewPermanentCallback(&n)))->ThreadRange(1, NumCPUs());
+
+#define DO24(A, B) \
+  A(B, 1); A(B, 2); A(B, 3); A(B, 4); A(B, 5); A(B, 6); \
+  A(B, 7); A(B, 8); A(B, 9); A(B, 10); A(B, 11); A(B, 12); \
+  A(B, 13); A(B, 14); A(B, 15); A(B, 16); A(B, 17); A(B, 18); \
+  A(B, 19); A(B, 20); A(B, 21); A(B, 22); A(B, 23); A(B, 24);
+
+DO24(MY_BENCHMARK_WITH_ARG, CacheFillPCRE)
+DO24(MY_BENCHMARK_WITH_ARG, CacheFillNFA)
+DO24(MY_BENCHMARK_WITH_ARG, CacheFillRE2)
+DO24(MY_BENCHMARK_WITH_ARG, CacheFillDFA)
+
+#undef DO24
+#undef MY_BENCHMARK_WITH_ARG
+*/
+
+////////////////////////////////////////////////////////////////////////
+//
+// Implementation routines. Sad that there are so many,
+// but all the interfaces are slightly different.
+
+// Runs implementation to search for regexp in text, iters times.
+// Expect_match says whether the regexp should be found.
+// Anchored says whether to run an anchored search.
+ +void SearchDFA(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + bool failed = false; + CHECK_EQ(prog->SearchDFA(text, NULL, anchor, Prog::kFirstMatch, + NULL, &failed, NULL), + expect_match); + CHECK(!failed); + delete prog; + re->Decref(); + } +} + +void SearchNFA(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK_EQ(prog->SearchNFA(text, NULL, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + delete prog; + re->Decref(); + } +} + +void SearchOnePass(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + CHECK_EQ(prog->SearchOnePass(text, text, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + delete prog; + re->Decref(); + } +} + +void SearchBitState(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK_EQ(prog->SearchBitState(text, text, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + delete prog; + re->Decref(); + } +} + +void SearchPCRE(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + for (int i = 0; i < iters; i++) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + if (anchor == Prog::kAnchored) + 
CHECK_EQ(PCRE::FullMatch(text, re), expect_match); + else + CHECK_EQ(PCRE::PartialMatch(text, re), expect_match); + } +} + +void SearchRE2(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + for (int i = 0; i < iters; i++) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + if (anchor == Prog::kAnchored) + CHECK_EQ(RE2::FullMatch(text, re), expect_match); + else + CHECK_EQ(RE2::PartialMatch(text, re), expect_match); + } +} + +// SearchCachedXXX is like SearchXXX but only does the +// regexp parsing and compiling once. This lets us measure +// search time without the per-regexp overhead. + +void SearchCachedDFA(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(1LL<<31); + CHECK(prog); + for (int i = 0; i < iters; i++) { + bool failed = false; + CHECK_EQ(prog->SearchDFA(text, NULL, anchor, + Prog::kFirstMatch, NULL, &failed, NULL), + expect_match); + CHECK(!failed); + } + delete prog; + re->Decref(); +} + +void SearchCachedNFA(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + for (int i = 0; i < iters; i++) { + CHECK_EQ(prog->SearchNFA(text, NULL, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + } + delete prog; + re->Decref(); +} + +void SearchCachedOnePass(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + for (int i = 0; i < iters; i++) + CHECK_EQ(prog->SearchOnePass(text, text, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + delete prog; + re->Decref(); +} + +void 
SearchCachedBitState(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + for (int i = 0; i < iters; i++) + CHECK_EQ(prog->SearchBitState(text, text, anchor, Prog::kFirstMatch, NULL, 0), + expect_match); + delete prog; + re->Decref(); +} + +void SearchCachedPCRE(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + for (int i = 0; i < iters; i++) { + if (anchor == Prog::kAnchored) + CHECK_EQ(PCRE::FullMatch(text, re), expect_match); + else + CHECK_EQ(PCRE::PartialMatch(text, re), expect_match); + } +} + +void SearchCachedRE2(int iters, const char* regexp, const StringPiece& text, + Prog::Anchor anchor, bool expect_match) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + for (int i = 0; i < iters; i++) { + if (anchor == Prog::kAnchored) + CHECK_EQ(RE2::FullMatch(text, re), expect_match); + else + CHECK_EQ(RE2::PartialMatch(text, re), expect_match); + } +} + + +// Runs implementation to full match regexp against text, +// extracting three submatches. Expects match always. + +void Parse3NFA(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[4]; // 4 because sp[0] is whole match. 
+ CHECK(prog->SearchNFA(text, NULL, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); + } +} + +void Parse3OnePass(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + StringPiece sp[4]; // 4 because sp[0] is whole match. + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); + } +} + +void Parse3BitState(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[4]; // 4 because sp[0] is whole match. + CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); + } +} + +void Parse3Backtrack(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[4]; // 4 because sp[0] is whole match. 
+ CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); + } +} + +void Parse3PCRE(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + StringPiece sp1, sp2, sp3; + CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3)); + } +} + +void Parse3RE2(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + StringPiece sp1, sp2, sp3; + CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3)); + } +} + +void Parse3CachedNFA(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[4]; // 4 because sp[0] is whole match. + for (int i = 0; i < iters; i++) { + CHECK(prog->SearchNFA(text, NULL, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + } + delete prog; + re->Decref(); +} + +void Parse3CachedOnePass(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + StringPiece sp[4]; // 4 because sp[0] is whole match. + for (int i = 0; i < iters; i++) + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); +} + +void Parse3CachedBitState(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[4]; // 4 because sp[0] is whole match. 
+ for (int i = 0; i < iters; i++) + CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); +} + +void Parse3CachedBacktrack(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[4]; // 4 because sp[0] is whole match. + for (int i = 0; i < iters; i++) + CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 4)); + delete prog; + re->Decref(); +} + +void Parse3CachedPCRE(int iters, const char* regexp, const StringPiece& text) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + StringPiece sp1, sp2, sp3; + for (int i = 0; i < iters; i++) { + CHECK(PCRE::FullMatch(text, re, &sp1, &sp2, &sp3)); + } +} + +void Parse3CachedRE2(int iters, const char* regexp, const StringPiece& text) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + StringPiece sp1, sp2, sp3; + for (int i = 0; i < iters; i++) { + CHECK(RE2::FullMatch(text, re, &sp1, &sp2, &sp3)); + } +} + + +// Runs implementation to full match regexp against text, +// extracting three submatches. Expects match always. + +void Parse1NFA(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[2]; // 2 because sp[0] is whole match. + CHECK(prog->SearchNFA(text, NULL, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); + } +} + +void Parse1OnePass(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + StringPiece sp[2]; // 2 because sp[0] is whole match. 
+ CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); + } +} + +void Parse1BitState(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[2]; // 2 because sp[0] is whole match. + CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); + } +} + +void Parse1PCRE(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + StringPiece sp1; + CHECK(PCRE::FullMatch(text, re, &sp1)); + } +} + +void Parse1RE2(int iters, const char* regexp, const StringPiece& text) { + for (int i = 0; i < iters; i++) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + StringPiece sp1; + CHECK(RE2::FullMatch(text, re, &sp1)); + } +} + +void Parse1CachedNFA(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[2]; // 2 because sp[0] is whole match. + for (int i = 0; i < iters; i++) { + CHECK(prog->SearchNFA(text, NULL, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + } + delete prog; + re->Decref(); +} + +void Parse1CachedOnePass(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + CHECK(prog->IsOnePass()); + StringPiece sp[2]; // 2 because sp[0] is whole match. 
+ for (int i = 0; i < iters; i++) + CHECK(prog->SearchOnePass(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); +} + +void Parse1CachedBitState(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[2]; // 2 because sp[0] is whole match. + for (int i = 0; i < iters; i++) + CHECK(prog->SearchBitState(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); +} + +void Parse1CachedBacktrack(int iters, const char* regexp, const StringPiece& text) { + Regexp* re = Regexp::Parse(regexp, Regexp::LikePerl, NULL); + CHECK(re); + Prog* prog = re->CompileToProg(0); + CHECK(prog); + StringPiece sp[2]; // 2 because sp[0] is whole match. + for (int i = 0; i < iters; i++) + CHECK(prog->UnsafeSearchBacktrack(text, text, Prog::kAnchored, Prog::kFullMatch, sp, 2)); + delete prog; + re->Decref(); +} + +void Parse1CachedPCRE(int iters, const char* regexp, const StringPiece& text) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + StringPiece sp1; + for (int i = 0; i < iters; i++) { + CHECK(PCRE::FullMatch(text, re, &sp1)); + } +} + +void Parse1CachedRE2(int iters, const char* regexp, const StringPiece& text) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + StringPiece sp1; + for (int i = 0; i < iters; i++) { + CHECK(RE2::FullMatch(text, re, &sp1)); + } +} + +void SearchParse2CachedPCRE(int iters, const char* regexp, + const StringPiece& text) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + for (int i = 0; i < iters; i++) { + StringPiece sp1, sp2; + CHECK(PCRE::PartialMatch(text, re, &sp1, &sp2)); + } +} + +void SearchParse2CachedRE2(int iters, const char* regexp, + const StringPiece& text) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + for (int i = 0; i < iters; i++) { + StringPiece sp1, sp2; + CHECK(RE2::PartialMatch(text, re, &sp1, &sp2)); + 
} +} + +void SearchParse1CachedPCRE(int iters, const char* regexp, + const StringPiece& text) { + PCRE re(regexp, PCRE::UTF8); + CHECK_EQ(re.error(), ""); + for (int i = 0; i < iters; i++) { + StringPiece sp1; + CHECK(PCRE::PartialMatch(text, re, &sp1)); + } +} + +void SearchParse1CachedRE2(int iters, const char* regexp, + const StringPiece& text) { + RE2 re(regexp); + CHECK_EQ(re.error(), ""); + for (int i = 0; i < iters; i++) { + StringPiece sp1; + CHECK(RE2::PartialMatch(text, re, &sp1)); + } +} + +void EmptyPartialMatchPCRE(int n) { + PCRE re(""); + for (int i = 0; i < n; i++) { + PCRE::PartialMatch("", re); + } +} + +void EmptyPartialMatchRE2(int n) { + RE2 re(""); + for (int i = 0; i < n; i++) { + RE2::PartialMatch("", re); + } +} +#ifdef USEPCRE +BENCHMARK(EmptyPartialMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(EmptyPartialMatchRE2)->ThreadRange(1, NumCPUs()); + +void SimplePartialMatchPCRE(int n) { + PCRE re("abcdefg"); + for (int i = 0; i < n; i++) { + PCRE::PartialMatch("abcdefg", re); + } +} + +void SimplePartialMatchRE2(int n) { + RE2 re("abcdefg"); + for (int i = 0; i < n; i++) { + RE2::PartialMatch("abcdefg", re); + } +} +#ifdef USEPCRE +BENCHMARK(SimplePartialMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(SimplePartialMatchRE2)->ThreadRange(1, NumCPUs()); + +static string http_text = + "GET /asdfhjasdhfasdlfhasdflkjasdfkljasdhflaskdjhf" + "alksdjfhasdlkfhasdlkjfhasdljkfhadsjklf HTTP/1.1"; + +void HTTPPartialMatchPCRE(int n) { + StringPiece a; + PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); + for (int i = 0; i < n; i++) { + PCRE::PartialMatch(http_text, re, &a); + } +} + +void HTTPPartialMatchRE2(int n) { + StringPiece a; + RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); + for (int i = 0; i < n; i++) { + RE2::PartialMatch(http_text, re, &a); + } +} + +#ifdef USEPCRE +BENCHMARK(HTTPPartialMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(HTTPPartialMatchRE2)->ThreadRange(1, NumCPUs()); + +static string http_smalltext = + 
"GET /abc HTTP/1.1"; + +void SmallHTTPPartialMatchPCRE(int n) { + StringPiece a; + PCRE re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); + for (int i = 0; i < n; i++) { + PCRE::PartialMatch(http_text, re, &a); + } +} + +void SmallHTTPPartialMatchRE2(int n) { + StringPiece a; + RE2 re("(?-s)^(?:GET|POST) +([^ ]+) HTTP"); + for (int i = 0; i < n; i++) { + RE2::PartialMatch(http_text, re, &a); + } +} + +#ifdef USEPCRE +BENCHMARK(SmallHTTPPartialMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(SmallHTTPPartialMatchRE2)->ThreadRange(1, NumCPUs()); + +void DotMatchPCRE(int n) { + StringPiece a; + PCRE re("(?-s)^(.+)"); + for (int i = 0; i < n; i++) { + PCRE::PartialMatch(http_text, re, &a); + } +} + +void DotMatchRE2(int n) { + StringPiece a; + RE2 re("(?-s)^(.+)"); + for (int i = 0; i < n; i++) { + RE2::PartialMatch(http_text, re, &a); + } +} + +#ifdef USEPCRE +BENCHMARK(DotMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(DotMatchRE2)->ThreadRange(1, NumCPUs()); + +void ASCIIMatchPCRE(int n) { + StringPiece a; + PCRE re("(?-s)^([ -~]+)"); + for (int i = 0; i < n; i++) { + PCRE::PartialMatch(http_text, re, &a); + } +} + +void ASCIIMatchRE2(int n) { + StringPiece a; + RE2 re("(?-s)^([ -~]+)"); + for (int i = 0; i < n; i++) { + RE2::PartialMatch(http_text, re, &a); + } +} + +#ifdef USEPCRE +BENCHMARK(ASCIIMatchPCRE)->ThreadRange(1, NumCPUs()); +#endif +BENCHMARK(ASCIIMatchRE2)->ThreadRange(1, NumCPUs()); + +void FullMatchPCRE(int iter, int n, const char *regexp) { + StopBenchmarkTiming(); + string s; + MakeText(&s, n); + s += "ABCDEFGHIJ"; + BenchmarkMemoryUsage(); + PCRE re(regexp); + StartBenchmarkTiming(); + for (int i = 0; i < iter; i++) + CHECK(PCRE::FullMatch(s, re)); + SetBenchmarkBytesProcessed(static_cast(iter)*n); +} + +void FullMatchRE2(int iter, int n, const char *regexp) { + StopBenchmarkTiming(); + string s; + MakeText(&s, n); + s += "ABCDEFGHIJ"; + BenchmarkMemoryUsage(); + RE2 re(regexp, RE2::Latin1); + StartBenchmarkTiming(); + for (int i = 0; i < 
iter; i++) + CHECK(RE2::FullMatch(s, re)); + SetBenchmarkBytesProcessed(static_cast(iter)*n); +} + +void FullMatch_DotStar_CachedPCRE(int i, int n) { FullMatchPCRE(i, n, "(?s).*"); } +void FullMatch_DotStar_CachedRE2(int i, int n) { FullMatchRE2(i, n, "(?s).*"); } + +void FullMatch_DotStarDollar_CachedPCRE(int i, int n) { FullMatchPCRE(i, n, "(?s).*$"); } +void FullMatch_DotStarDollar_CachedRE2(int i, int n) { FullMatchRE2(i, n, "(?s).*$"); } + +void FullMatch_DotStarCapture_CachedPCRE(int i, int n) { FullMatchPCRE(i, n, "(?s)((.*)()()($))"); } +void FullMatch_DotStarCapture_CachedRE2(int i, int n) { FullMatchRE2(i, n, "(?s)((.*)()()($))"); } + +#ifdef USEPCRE +BENCHMARK_RANGE(FullMatch_DotStar_CachedPCRE, 8, 2<<20); +#endif +BENCHMARK_RANGE(FullMatch_DotStar_CachedRE2, 8, 2<<20); + +#ifdef USEPCRE +BENCHMARK_RANGE(FullMatch_DotStarDollar_CachedPCRE, 8, 2<<20); +#endif +BENCHMARK_RANGE(FullMatch_DotStarDollar_CachedRE2, 8, 2<<20); + +#ifdef USEPCRE +BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedPCRE, 8, 2<<20); +#endif +BENCHMARK_RANGE(FullMatch_DotStarCapture_CachedRE2, 8, 2<<20); + +} // namespace re2 diff --git a/outside/re2/re2/testing/regexp_generator.cc b/outside/re2/re2/testing/regexp_generator.cc new file mode 100644 index 000000000..cf2db1185 --- /dev/null +++ b/outside/re2/re2/testing/regexp_generator.cc @@ -0,0 +1,264 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression generator: generates all possible +// regular expressions within parameters (see regexp_generator.h for details). + +// The regexp generator first generates a sequence of commands in a simple +// postfix language. Each command in the language is a string, +// like "a" or "%s*" or "%s|%s". +// +// To evaluate a command, enough arguments are popped from the value stack to +// plug into the %s slots. Then the result is pushed onto the stack. 
+// For example, the command sequence +// a b %s%s c +// results in the stack +// ab c +// +// GeneratePostfix generates all possible command sequences. +// Then RunPostfix turns each sequence into a regular expression +// and passes the regexp to HandleRegexp. + +#include +#include +#include +#include +#include "util/test.h" +#include "re2/testing/regexp_generator.h" + +namespace re2 { + +// Returns a vector of the egrep regexp operators. +const vector& RegexpGenerator::EgrepOps() { + static const char *ops[] = { + "%s%s", + "%s|%s", + "%s*", + "%s+", + "%s?", + "%s\\C*", + }; + static vector v(ops, ops + arraysize(ops)); + return v; +} + +RegexpGenerator::RegexpGenerator(int maxatoms, int maxops, + const vector& atoms, + const vector& ops) + : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) { + // Degenerate case. + if (atoms_.size() == 0) + maxatoms_ = 0; + if (ops_.size() == 0) + maxops_ = 0; +} + +// Generates all possible regular expressions (within the parameters), +// calling HandleRegexp for each one. +void RegexpGenerator::Generate() { + vector postfix; + GeneratePostfix(&postfix, 0, 0, 0); +} + +// Generates random regular expressions, calling HandleRegexp for each one. +void RegexpGenerator::GenerateRandom(int32 seed, int n) { + ACMRandom acm(seed); + acm_ = &acm; + + for (int i = 0; i < n; i++) { + vector postfix; + GenerateRandomPostfix(&postfix, 0, 0, 0); + } + + acm_ = NULL; +} + +// Counts and returns the number of occurrences of "%s" in s. +static int CountArgs(const string& s) { + const char *p = s.c_str(); + int n = 0; + while ((p = strstr(p, "%s")) != NULL) { + p += 2; + n++; + } + return n; +} + +// Generates all possible postfix command sequences. +// Each sequence is handed off to RunPostfix to generate a regular expression. 
+// The arguments are: +// post: the current postfix sequence +// nstk: the number of elements that would be on the stack after executing +// the sequence +// ops: the number of operators used in the sequence +// atoms: the number of atoms used in the sequence +// For example, if post were ["a", "b", "%s%s", "c"], +// then nstk = 2, ops = 1, atoms = 3. +// +// The initial call should be GeneratePostfix([empty vector], 0, 0, 0). +// +void RegexpGenerator::GeneratePostfix(vector* post, int nstk, + int ops, int atoms) { + if (nstk == 1) + RunPostfix(*post); + + // Early out: if used too many operators or can't + // get back down to a single expression on the stack + // using binary operators, give up. + if (ops + nstk - 1 > maxops_) + return; + + // Add atoms if there is room. + if (atoms < maxatoms_) { + for (int i = 0; i < atoms_.size(); i++) { + post->push_back(atoms_[i]); + GeneratePostfix(post, nstk + 1, ops, atoms + 1); + post->pop_back(); + } + } + + // Add operators if there are enough arguments. + if (ops < maxops_) { + for (int i = 0; i < ops_.size(); i++) { + const string& fmt = ops_[i]; + int nargs = CountArgs(fmt); + if (nargs <= nstk) { + post->push_back(fmt); + GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms); + post->pop_back(); + } + } + } +} + +// Generates a random postfix command sequence. +// Stops and returns true once a single sequence has been generated. +bool RegexpGenerator::GenerateRandomPostfix(vector *post, int nstk, + int ops, int atoms) { + for (;;) { + // Stop if we get to a single element, but only sometimes. + if (nstk == 1 && acm_->Uniform(maxatoms_ + 1 - atoms) == 0) { + RunPostfix(*post); + return true; + } + + // Early out: if used too many operators or can't + // get back down to a single expression on the stack + // using binary operators, give up. + if (ops + nstk - 1 > maxops_) + return false; + + // Add operators if there are enough arguments. 
+ if (ops < maxops_ && acm_->Uniform(2) == 0) { + const string& fmt = ops_[acm_->Uniform(ops_.size())]; + int nargs = CountArgs(fmt); + if (nargs <= nstk) { + post->push_back(fmt); + bool ret = GenerateRandomPostfix(post, nstk - nargs + 1, + ops + 1, atoms); + post->pop_back(); + if (ret) + return true; + } + } + + // Add atoms if there is room. + if (atoms < maxatoms_ && acm_->Uniform(2) == 0) { + post->push_back(atoms_[acm_->Uniform(atoms_.size())]); + bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1); + post->pop_back(); + if (ret) + return true; + } + } +} + +// Interprets the postfix command sequence to create a regular expression +// passed to HandleRegexp. The results of operators like %s|%s are wrapped +// in (?: ) to avoid needing to maintain a precedence table. +void RegexpGenerator::RunPostfix(const vector& post) { + stack regexps; + for (int i = 0; i < post.size(); i++) { + switch (CountArgs(post[i])) { + default: + LOG(FATAL) << "Bad operator: " << post[i]; + case 0: + regexps.push(post[i]); + break; + case 1: { + string a = regexps.top(); + regexps.pop(); + regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")"); + break; + } + case 2: { + string b = regexps.top(); + regexps.pop(); + string a = regexps.top(); + regexps.pop(); + regexps.push("(?:" + + StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) + + ")"); + break; + } + } + } + + if (regexps.size() != 1) { + // Internal error - should never happen. 
+ printf("Bad regexp program:\n"); + for (int i = 0; i < post.size(); i++) { + printf(" %s\n", CEscape(post[i]).c_str()); + } + printf("Stack after running program:\n"); + while (!regexps.empty()) { + printf(" %s\n", CEscape(regexps.top()).c_str()); + regexps.pop(); + } + LOG(FATAL) << "Bad regexp program."; + } + + HandleRegexp(regexps.top()); + HandleRegexp("^(?:" + regexps.top() + ")$"); + HandleRegexp("^(?:" + regexps.top() + ")"); + HandleRegexp("(?:" + regexps.top() + ")$"); +} + +// Split s into an vector of strings, one for each UTF-8 character. +vector Explode(const StringPiece& s) { + vector v; + + for (const char *q = s.begin(); q < s.end(); ) { + const char* p = q; + Rune r; + q += chartorune(&r, q); + v.push_back(string(p, q - p)); + } + + return v; +} + +// Split string everywhere a substring is found, returning +// vector of pieces. +vector Split(const StringPiece& sep, const StringPiece& s) { + vector v; + + if (sep.size() == 0) + return Explode(s); + + const char *p = s.begin(); + for (const char *q = s.begin(); q + sep.size() <= s.end(); q++) { + if (StringPiece(q, sep.size()) == sep) { + v.push_back(string(p, q - p)); + p = q + sep.size(); + q = p - 1; // -1 for ++ in loop + continue; + } + } + if (p < s.end()) + v.push_back(string(p, s.end() - p)); + return v; +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/regexp_generator.h b/outside/re2/re2/testing/regexp_generator.h new file mode 100644 index 000000000..b4506f2a5 --- /dev/null +++ b/outside/re2/re2/testing/regexp_generator.h @@ -0,0 +1,70 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression generator: generates all possible +// regular expressions within given parameters (see below for details). 
+ +#ifndef RE2_TESTING_REGEXP_GENERATOR_H__ +#define RE2_TESTING_REGEXP_GENERATOR_H__ + +#include +#include +#include "util/random.h" +#include "util/util.h" +#include "re2/stringpiece.h" + +namespace re2 { + +// Regular expression generator. +// +// Given a set of atom expressions like "a", "b", or "." +// and operators like "%s*", generates all possible regular expressions +// using at most maxbases base expressions and maxops operators. +// For each such expression re, calls HandleRegexp(re). +// +// Callers are expected to subclass RegexpGenerator and provide HandleRegexp. +// +class RegexpGenerator { + public: + RegexpGenerator(int maxatoms, int maxops, const vector& atoms, + const vector& ops); + virtual ~RegexpGenerator() {} + + // Generates all the regular expressions, calling HandleRegexp(re) for each. + void Generate(); + + // Generates n random regular expressions, calling HandleRegexp(re) for each. + void GenerateRandom(int32 seed, int n); + + // Handles a regular expression. Must be provided by subclass. + virtual void HandleRegexp(const string& regexp) = 0; + + // The egrep regexp operators: * + ? | and concatenation. + static const vector& EgrepOps(); + + private: + void RunPostfix(const vector& post); + void GeneratePostfix(vector* post, int nstk, int ops, int lits); + bool GenerateRandomPostfix(vector* post, int nstk, int ops, int lits); + + int maxatoms_; // Maximum number of atoms allowed in expr. + int maxops_; // Maximum number of ops allowed in expr. + vector atoms_; // Possible atoms. + vector ops_; // Possible ops. + ACMRandom* acm_; // Random generator. + DISALLOW_EVIL_CONSTRUCTORS(RegexpGenerator); +}; + +// Helpers for preparing arguments to RegexpGenerator constructor. + +// Returns one string for each character in s. +vector Explode(const StringPiece& s); + +// Splits string everywhere sep is found, returning +// vector of pieces. 
+vector Split(const StringPiece& sep, const StringPiece& s); + +} // namespace re2 + +#endif // RE2_TESTING_REGEXP_GENERATOR_H__ diff --git a/outside/re2/re2/testing/regexp_test.cc b/outside/re2/re2/testing/regexp_test.cc new file mode 100644 index 000000000..f317cbca8 --- /dev/null +++ b/outside/re2/re2/testing/regexp_test.cc @@ -0,0 +1,81 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test parse.cc, dump.cc, and tostring.cc. + +#include +#include +#include "util/test.h" +#include "re2/regexp.h" + +namespace re2 { + +// Test that overflowed ref counts work. +TEST(Regexp, BigRef) { + Regexp* re; + re = Regexp::Parse("x", Regexp::NoParseFlags, NULL); + for (int i = 0; i < 100000; i++) + re->Incref(); + for (int i = 0; i < 100000; i++) + re->Decref(); + CHECK_EQ(re->Ref(), 1); + re->Decref(); +} + +// Test that very large Concats work. +// Depends on overflowed ref counts working. +TEST(Regexp, BigConcat) { + Regexp* x; + x = Regexp::Parse("x", Regexp::NoParseFlags, NULL); + vector v(90000, x); // ToString bails out at 100000 + for (int i = 0; i < v.size(); i++) + x->Incref(); + CHECK_EQ(x->Ref(), 1 + v.size()) << x->Ref(); + Regexp* re = Regexp::Concat(&v[0], v.size(), Regexp::NoParseFlags); + CHECK_EQ(re->ToString(), string(v.size(), 'x')); + re->Decref(); + CHECK_EQ(x->Ref(), 1) << x->Ref(); + x->Decref(); +} + +TEST(Regexp, NamedCaptures) { + Regexp* x; + RegexpStatus status; + x = Regexp::Parse( + "(?Pa+)|(e)(?Pw*)+(?Pb+)", Regexp::PerlX, &status); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(4, x->NumCaptures()); + const map* have = x->NamedCaptures(); + EXPECT_TRUE(have != NULL); + EXPECT_EQ(2, have->size()); // there are only two named groups in + // the regexp: 'g1' and 'g2'. 
+ map want; + want["g1"] = 1; + want["g2"] = 3; + EXPECT_EQ(want, *have); + x->Decref(); + delete have; +} + +TEST(Regexp, CaptureNames) { + Regexp* x; + RegexpStatus status; + x = Regexp::Parse( + "(?Pa+)|(e)(?Pw*)+(?Pb+)", Regexp::PerlX, &status); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(4, x->NumCaptures()); + const map* have = x->CaptureNames(); + EXPECT_TRUE(have != NULL); + EXPECT_EQ(3, have->size()); + map want; + want[1] = "g1"; + want[3] = "g2"; + want[4] = "g1"; + + EXPECT_EQ(want, *have); + x->Decref(); + delete have; +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/required_prefix_test.cc b/outside/re2/re2/testing/required_prefix_test.cc new file mode 100644 index 000000000..1f0b216aa --- /dev/null +++ b/outside/re2/re2/testing/required_prefix_test.cc @@ -0,0 +1,67 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "util/test.h" +#include "re2/regexp.h" + +namespace re2 { + +struct PrefixTest { + const char* regexp; + bool return_value; + const char* prefix; + bool foldcase; + const char* suffix; +}; + +static PrefixTest tests[] = { + // If the regexp is missing a ^, there's no required prefix. + { "abc", false }, + { "", false }, + { "(?m)^", false }, + + // If the regexp immediately goes into + // something not a literal match, there's no required prefix. + { "^(abc)", false }, + { "^a*", false }, + + // Otherwise, it should work. 
+ { "^abc$", true, "abc", false, "(?-m:$)" }, + { "^abc", true, "abc", false, "" }, + { "^(?i)abc", true, "abc", true, "" }, + { "^abcd*", true, "abc", false, "d*" }, + { "^[Aa][Bb]cd*", true, "ab", true, "cd*" }, + { "^ab[Cc]d*", true, "ab", false, "[Cc]d*" }, + { "^☺abc", true, "☺abc", false, "" }, +}; + +TEST(RequiredPrefix, SimpleTests) { + for (int i = 0; i < arraysize(tests); i++) { + const PrefixTest& t = tests[i]; + for (int j = 0; j < 2; j++) { + Regexp::ParseFlags flags = Regexp::LikePerl; + if (j == 0) + flags = flags | Regexp::Latin1; + Regexp* re = Regexp::Parse(t.regexp, flags, NULL); + CHECK(re) << " " << t.regexp; + string p; + bool f = false; + Regexp* s = NULL; + CHECK_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s)) + << " " << t.regexp << " " << (j==0 ? "latin1" : "utf") << " " << re->Dump(); + if (t.return_value) { + CHECK_EQ(p, string(t.prefix)) + << " " << t.regexp << " " << (j==0 ? "latin1" : "utf"); + CHECK_EQ(f, t.foldcase) + << " " << t.regexp << " " << (j==0 ? "latin1" : "utf"); + CHECK_EQ(s->ToString(), string(t.suffix)) + << " " << t.regexp << " " << (j==0 ? "latin1" : "utf"); + s->Decref(); + } + re->Decref(); + } + } +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/search_test.cc b/outside/re2/re2/testing/search_test.cc new file mode 100644 index 000000000..3ab2ae3bf --- /dev/null +++ b/outside/re2/re2/testing/search_test.cc @@ -0,0 +1,325 @@ +// Copyright 2006-2007 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file.
+ +#include +#include +#include "util/test.h" +#include "re2/prog.h" +#include "re2/regexp.h" +#include "re2/testing/tester.h" +#include "re2/testing/exhaustive_tester.h" + +namespace re2 { + +struct RegexpTest { + const char* regexp; + const char* text; +}; + +RegexpTest simple_tests[] = { + { "a", "a" }, + { "a", "zyzzyva" }, + { "a+", "aa" }, + { "(a+|b)+", "ab" }, + { "ab|cd", "xabcdx" }, + { "h.*od?", "hello\ngoodbye\n" }, + { "h.*o", "hello\ngoodbye\n" }, + { "h.*o", "goodbye\nhello\n" }, + { "h.*o", "hello world" }, + { "h.*o", "othello, world" }, + { "[^\\s\\S]", "aaaaaaa" }, + { "a", "aaaaaaa" }, + { "a*", "aaaaaaa" }, + { "a*", "" }, + { "a*", NULL }, + { "ab|cd", "xabcdx" }, + { "a", "cab" }, + { "a*b", "cab" }, + { "((((((((((((((((((((x))))))))))))))))))))", "x" }, + { "[abcd]", "xxxabcdxxx" }, + { "[^x]", "xxxabcdxxx" }, + { "[abcd]+", "xxxabcdxxx" }, + { "[^x]+", "xxxabcdxxx" }, + { "(fo|foo)", "fo" }, + { "(foo|fo)", "foo" }, + + { "aa", "aA" }, + { "a", "Aa" }, + { "a", "A" }, + { "ABC", "abc" }, + { "abc", "XABCY" }, + { "ABC", "xabcy" }, + + // Make sure ^ and $ work. + // The pathological cases didn't work + // in the original grep code. + { "foo|bar|[A-Z]", "foo" }, + { "^(foo|bar|[A-Z])", "foo" }, + { "(foo|bar|[A-Z])$", "foo\n" }, + { "(foo|bar|[A-Z])$", "foo" }, + { "^(foo|bar|[A-Z])$", "foo\n" }, + { "^(foo|bar|[A-Z])$", "foo" }, + { "^(foo|bar|[A-Z])$", "bar" }, + { "^(foo|bar|[A-Z])$", "X" }, + { "^(foo|bar|[A-Z])$", "XY" }, + { "^(fo|foo)$", "fo" }, + { "^(fo|foo)$", "foo" }, + { "^^(fo|foo)$", "fo" }, + { "^^(fo|foo)$", "foo" }, + { "^$", "" }, + { "^$", "x" }, + { "^^$", "" }, + { "^$$", "" }, + { "^^$", "x" }, + { "^$$", "x" }, + { "^^$$", "" }, + { "^^$$", "x" }, + { "^^^^^^^^$$$$$$$$", "" }, + { "^", "x" }, + { "$", "x" }, + + // Word boundaries. 
+ { "\\bfoo\\b", "nofoo foo that" }, + { "a\\b", "faoa x" }, + { "\\bbar", "bar x" }, + { "\\bbar", "foo\nbar x" }, + { "bar\\b", "foobar" }, + { "bar\\b", "foobar\nxxx" }, + { "(foo|bar|[A-Z])\\b", "foo" }, + { "(foo|bar|[A-Z])\\b", "foo\n" }, + { "\\b", "" }, + { "\\b", "x" }, + { "\\b(foo|bar|[A-Z])", "foo" }, + { "\\b(foo|bar|[A-Z])\\b", "X" }, + { "\\b(foo|bar|[A-Z])\\b", "XY" }, + { "\\b(foo|bar|[A-Z])\\b", "bar" }, + { "\\b(foo|bar|[A-Z])\\b", "foo" }, + { "\\b(foo|bar|[A-Z])\\b", "foo\n" }, + { "\\b(foo|bar|[A-Z])\\b", "ffoo bbar N x" }, + { "\\b(fo|foo)\\b", "fo" }, + { "\\b(fo|foo)\\b", "foo" }, + { "\\b\\b", "" }, + { "\\b\\b", "x" }, + { "\\b$", "" }, + { "\\b$", "x" }, + { "\\b$", "y x" }, + { "\\b.$", "x" }, + { "^\\b(fo|foo)\\b", "fo" }, + { "^\\b(fo|foo)\\b", "foo" }, + { "^\\b", "" }, + { "^\\b", "x" }, + { "^\\b\\b", "" }, + { "^\\b\\b", "x" }, + { "^\\b$", "" }, + { "^\\b$", "x" }, + { "^\\b.$", "x" }, + { "^\\b.\\b$", "x" }, + { "^^^^^^^^\\b$$$$$$$", "" }, + { "^^^^^^^^\\b.$$$$$$", "x" }, + { "^^^^^^^^\\b$$$$$$$", "x" }, + + // Non-word boundaries. 
+ { "\\Bfoo\\B", "n foo xfoox that" }, + { "a\\B", "faoa x" }, + { "\\Bbar", "bar x" }, + { "\\Bbar", "foo\nbar x" }, + { "bar\\B", "foobar" }, + { "bar\\B", "foobar\nxxx" }, + { "(foo|bar|[A-Z])\\B", "foox" }, + { "(foo|bar|[A-Z])\\B", "foo\n" }, + { "\\B", "" }, + { "\\B", "x" }, + { "\\B(foo|bar|[A-Z])", "foo" }, + { "\\B(foo|bar|[A-Z])\\B", "xXy" }, + { "\\B(foo|bar|[A-Z])\\B", "XY" }, + { "\\B(foo|bar|[A-Z])\\B", "XYZ" }, + { "\\B(foo|bar|[A-Z])\\B", "abara" }, + { "\\B(foo|bar|[A-Z])\\B", "xfoo_" }, + { "\\B(foo|bar|[A-Z])\\B", "xfoo\n" }, + { "\\B(foo|bar|[A-Z])\\B", "foo bar vNx" }, + { "\\B(fo|foo)\\B", "xfoo" }, + { "\\B(foo|fo)\\B", "xfooo" }, + { "\\B\\B", "" }, + { "\\B\\B", "x" }, + { "\\B$", "" }, + { "\\B$", "x" }, + { "\\B$", "y x" }, + { "\\B.$", "x" }, + { "^\\B(fo|foo)\\B", "fo" }, + { "^\\B(fo|foo)\\B", "foo" }, + { "^\\B", "" }, + { "^\\B", "x" }, + { "^\\B\\B", "" }, + { "^\\B\\B", "x" }, + { "^\\B$", "" }, + { "^\\B$", "x" }, + { "^\\B.$", "x" }, + { "^\\B.\\B$", "x" }, + { "^^^^^^^^\\B$$$$$$$", "" }, + { "^^^^^^^^\\B.$$$$$$", "x" }, + { "^^^^^^^^\\B$$$$$$$", "x" }, + + // PCRE uses only ASCII for \b computation. + // All non-ASCII are *not* word characters. + { "\\bx\\b", "x" }, + { "\\bx\\b", "x>" }, + { "\\bx\\b", "" }, + { "\\bx\\b", "ax" }, + { "\\bx\\b", "xb" }, + { "\\bx\\b", "axb" }, + { "\\bx\\b", "«x" }, + { "\\bx\\b", "x»" }, + { "\\bx\\b", "«x»" }, + { "\\bx\\b", "axb" }, + { "\\bx\\b", "áxβ" }, + { "\\Bx\\B", "axb" }, + { "\\Bx\\B", "áxβ" }, + + // Weird boundary cases. 
+ { "^$^$", "" }, + { "^$^", "" }, + { "$^$", "" }, + + { "^$^$", "x" }, + { "^$^", "x" }, + { "$^$", "x" }, + + { "^$^$", "x\ny" }, + { "^$^", "x\ny" }, + { "$^$", "x\ny" }, + + { "^$^$", "x\n\ny" }, + { "^$^", "x\n\ny" }, + { "$^$", "x\n\ny" }, + + { "^(foo\\$)$", "foo$bar" }, + { "(foo\\$)", "foo$bar" }, + { "^...$", "abc" }, + + // UTF-8 + { "^\xe6\x9c\xac$", "\xe6\x9c\xac" }, + { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, + { "^...$", ".\xe6\x9c\xac." }, + + { "^\\C\\C\\C$", "\xe6\x9c\xac" }, + { "^\\C$", "\xe6\x9c\xac" }, + { "^\\C\\C\\C$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, + + // Latin1 + { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, + { "^.........$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" }, + { "^...$", ".\xe6\x9c\xac." }, + { "^.....$", ".\xe6\x9c\xac." }, + + // Perl v Posix + { "\\B(fo|foo)\\B", "xfooo" }, + { "(fo|foo)", "foo" }, + + // Octal escapes. + { "\\141", "a" }, + { "\\060", "0" }, + { "\\0600", "00" }, + { "\\608", "08" }, + { "\\01", "\01" }, + { "\\018", "\01" "8" }, + + // Hexadecimal escapes + { "\\x{61}", "a" }, + { "\\x61", "a" }, + { "\\x{00000061}", "a" }, + + // Unicode scripts. + { "\\p{Greek}+", "aαβb" }, + { "\\P{Greek}+", "aαβb" }, + { "\\p{^Greek}+", "aαβb" }, + { "\\P{^Greek}+", "aαβb" }, + + // Unicode properties. Nd is decimal number. N is any number. + { "[^0-9]+", "abc123" }, + { "\\p{Nd}+", "abc123²³¼½¾₀₉" }, + { "\\p{^Nd}+", "abc123²³¼½¾₀₉" }, + { "\\P{Nd}+", "abc123²³¼½¾₀₉" }, + { "\\P{^Nd}+", "abc123²³¼½¾₀₉" }, + { "\\pN+", "abc123²³¼½¾₀₉" }, + { "\\p{N}+", "abc123²³¼½¾₀₉" }, + { "\\p{^N}+", "abc123²³¼½¾₀₉" }, + + { "\\p{Any}+", "abc123" }, + + // Character classes & case folding. + { "(?i)[@-A]+", "@AaB" }, // matches @Aa but not B + { "(?i)[A-Z]+", "aAzZ" }, + { "(?i)[^\\\\]+", "Aa\\" }, // \\ is between A-Z and a-z - + // splits the ranges in an interesting way. 
+ + // would like to use, but PCRE mishandles in full-match, non-greedy mode + // { "(?i)[\\\\]+", "Aa" }, + + { "(?i)[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" }, + + // Character classes & case folding. + { "[@-A]+", "@AaB" }, + { "[A-Z]+", "aAzZ" }, + { "[^\\\\]+", "Aa\\" }, + { "[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" }, + + // Anchoring. (^abc in aabcdef was a former bug) + // The tester checks for a match in the text and + // subpieces of the text with a byte removed on either side. + { "^abc", "abcdef" }, + { "^abc", "aabcdef" }, + { "^[ay]*[bx]+c", "abcdef" }, + { "^[ay]*[bx]+c", "aabcdef" }, + { "def$", "abcdef" }, + { "def$", "abcdeff" }, + { "d[ex][fy]$", "abcdef" }, + { "d[ex][fy]$", "abcdeff" }, + { "[dz][ex][fy]$", "abcdef" }, + { "[dz][ex][fy]$", "abcdeff" }, + { "(?m)^abc", "abcdef" }, + { "(?m)^abc", "aabcdef" }, + { "(?m)^[ay]*[bx]+c", "abcdef" }, + { "(?m)^[ay]*[bx]+c", "aabcdef" }, + { "(?m)def$", "abcdef" }, + { "(?m)def$", "abcdeff" }, + { "(?m)d[ex][fy]$", "abcdef" }, + { "(?m)d[ex][fy]$", "abcdeff" }, + { "(?m)[dz][ex][fy]$", "abcdef" }, + { "(?m)[dz][ex][fy]$", "abcdeff" }, + { "^", "a" }, + { "^^", "a" }, + + // Context. + // The tester checks for a match in the text and + // subpieces of the text with a byte removed on either side. + { "a", "a" }, + { "ab*", "a" }, + { "a\\C*", "a" }, + + // Former bugs. + { "a\\C*|ba\\C", "baba" }, +}; + +TEST(Regexp, SearchTests) { + int failures = 0; + for (int i = 0; i < arraysize(simple_tests); i++) { + const RegexpTest& t = simple_tests[i]; + if (!TestRegexpOnText(t.regexp, t.text)) + failures++; + +#ifdef LOGGING + // Build a dummy ExhaustiveTest call that will trigger just + // this one test, so that we log the test case. 
+ vector atom, alpha, ops; + atom.push_back(StringPiece(t.regexp).as_string()); + alpha.push_back(StringPiece(t.text).as_string()); + ExhaustiveTest(1, 0, atom, ops, 1, alpha, "", ""); +#endif + + } + EXPECT_EQ(failures, 0); +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/set_test.cc b/outside/re2/re2/testing/set_test.cc new file mode 100644 index 000000000..74058a47d --- /dev/null +++ b/outside/re2/re2/testing/set_test.cc @@ -0,0 +1,114 @@ +// Copyright 2010 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include + +#include "util/test.h" +#include "re2/re2.h" +#include "re2/set.h" + +namespace re2 { + +TEST(Set, Unanchored) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + CHECK_EQ(s.Add("foo", NULL), 0); + CHECK_EQ(s.Add("(", NULL), -1); + CHECK_EQ(s.Add("bar", NULL), 1); + + CHECK_EQ(s.Compile(), true); + + vector v; + CHECK_EQ(s.Match("foobar", &v), true); + CHECK_EQ(v.size(), 2); + CHECK_EQ(v[0], 0); + CHECK_EQ(v[1], 1); + + v.clear(); + CHECK_EQ(s.Match("fooba", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 0); + + v.clear(); + CHECK_EQ(s.Match("oobar", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 1); +} + +TEST(Set, UnanchoredFactored) { + RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED); + + CHECK_EQ(s.Add("foo", NULL), 0); + CHECK_EQ(s.Add("(", NULL), -1); + CHECK_EQ(s.Add("foobar", NULL), 1); + + CHECK_EQ(s.Compile(), true); + + vector v; + CHECK_EQ(s.Match("foobar", &v), true); + CHECK_EQ(v.size(), 2); + CHECK_EQ(v[0], 0); + CHECK_EQ(v[1], 1); + + v.clear(); + CHECK_EQ(s.Match("obarfoobaroo", &v), true); + CHECK_EQ(v.size(), 2); + CHECK_EQ(v[0], 0); + CHECK_EQ(v[1], 1); + + v.clear(); + CHECK_EQ(s.Match("fooba", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 0); + + v.clear(); + CHECK_EQ(s.Match("oobar", &v), false); + CHECK_EQ(v.size(), 0); +} + +TEST(Set, UnanchoredDollar) { + RE2::Set 
s(RE2::DefaultOptions, RE2::UNANCHORED); + + CHECK_EQ(s.Add("foo$", NULL), 0); + CHECK_EQ(s.Compile(), true); + + vector v; + CHECK_EQ(s.Match("foo", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 0); +} + +TEST(Set, Anchored) { + RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH); + + CHECK_EQ(s.Add("foo", NULL), 0); + CHECK_EQ(s.Add("(", NULL), -1); + CHECK_EQ(s.Add("bar", NULL), 1); + + CHECK_EQ(s.Compile(), true); + + vector v; + CHECK_EQ(s.Match("foobar", &v), false); + CHECK_EQ(v.size(), 0); + + CHECK_EQ(s.Match("fooba", &v), false); + CHECK_EQ(v.size(), 0); + + CHECK_EQ(s.Match("oobar", &v), false); + CHECK_EQ(v.size(), 0); + + CHECK_EQ(s.Match("foo", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 0); + + CHECK_EQ(s.Match("bar", &v), true); + CHECK_EQ(v.size(), 1); + CHECK_EQ(v[0], 1); + +} + +} // namespace re2 + diff --git a/outside/re2/re2/testing/simplify_test.cc b/outside/re2/re2/testing/simplify_test.cc new file mode 100644 index 000000000..d54837c95 --- /dev/null +++ b/outside/re2/re2/testing/simplify_test.cc @@ -0,0 +1,167 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test simplify.cc. + +#include +#include +#include "util/test.h" +#include "re2/regexp.h" + +namespace re2 { + +struct Test { + const char* regexp; + const char* simplified; +}; + +static Test tests[] = { + // Already-simple constructs + { "a", "a" }, + { "ab", "ab" }, + { "a|b", "[a-b]" }, + { "ab|cd", "ab|cd" }, + { "(ab)*", "(ab)*" }, + { "(ab)+", "(ab)+" }, + { "(ab)?", "(ab)?" }, + { ".", "." 
}, + { "^", "^" }, + { "$", "$" }, + { "[ac]", "[ac]" }, + { "[^ac]", "[^ac]" }, + + // Posix character classes + { "[[:alnum:]]", "[0-9A-Za-z]" }, + { "[[:alpha:]]", "[A-Za-z]" }, + { "[[:blank:]]", "[\\t ]" }, + { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" }, + { "[[:digit:]]", "[0-9]" }, + { "[[:graph:]]", "[!-~]" }, + { "[[:lower:]]", "[a-z]" }, + { "[[:print:]]", "[ -~]" }, + { "[[:punct:]]", "[!-/:-@\\[-`{-~]" }, + { "[[:space:]]" , "[\\t-\\r ]" }, + { "[[:upper:]]", "[A-Z]" }, + { "[[:xdigit:]]", "[0-9A-Fa-f]" }, + + // Perl character classes + { "\\d", "[0-9]" }, + { "\\s", "[\\t-\\n\\f-\\r ]" }, + { "\\w", "[0-9A-Z_a-z]" }, + { "\\D", "[^0-9]" }, + { "\\S", "[^\\t-\\n\\f-\\r ]" }, + { "\\W", "[^0-9A-Z_a-z]" }, + { "[\\d]", "[0-9]" }, + { "[\\s]", "[\\t-\\n\\f-\\r ]" }, + { "[\\w]", "[0-9A-Z_a-z]" }, + { "[\\D]", "[^0-9]" }, + { "[\\S]", "[^\\t-\\n\\f-\\r ]" }, + { "[\\W]", "[^0-9A-Z_a-z]" }, + + // Posix repetitions + { "a{1}", "a" }, + { "a{2}", "aa" }, + { "a{5}", "aaaaa" }, + { "a{0,1}", "a?" }, + // The next three are illegible because Simplify inserts (?:) + // parens instead of () parens to avoid creating extra + // captured subexpressions. The comments show a version fewer parens. + { "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)? + { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)? + { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)? + { "a{0,2}", "(?:aa?)?" }, // (aa?)? + { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)? + { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)? + { "a{0,}", "a*" }, + { "a{1,}", "a+" }, + { "a{2,}", "aa+" }, + { "a{5,}", "aaaaa+" }, + + // Test that operators simplify their arguments. + // (Simplify used to not simplify arguments to a {} repeat.) + { "(?:a{1,}){1,}", "a+" }, + { "(a{1,}b{1,})", "(a+b+)" }, + { "a{1,}|b{1,}", "a+|b+" }, + { "(?:a{1,})*", "(?:a+)*" }, + { "(?:a{1,})+", "a+" }, + { "(?:a{1,})?", "(?:a+)?" 
}, + { "a{0}", "" }, + + // Character class simplification + { "[ab]", "[a-b]" }, + { "[a-za-za-z]", "[a-z]" }, + { "[A-Za-zA-Za-z]", "[A-Za-z]" }, + { "[ABCDEFGH]", "[A-H]" }, + { "[AB-CD-EF-GH]", "[A-H]" }, + { "[W-ZP-XE-R]", "[E-Z]" }, + { "[a-ee-gg-m]", "[a-m]" }, + { "[a-ea-ha-m]", "[a-m]" }, + { "[a-ma-ha-e]", "[a-m]" }, + { "[a-zA-Z0-9 -~]", "[ -~]" }, + + // Empty character classes + { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" }, + + // Full character classes + { "[[:cntrl:][:^cntrl:]]", "." }, + + // Unicode case folding. + { "(?i)A", "[Aa]" }, + { "(?i)a", "[Aa]" }, + { "(?i)K", "[Kk\\x{212a}]" }, + { "(?i)k", "[Kk\\x{212a}]" }, + { "(?i)\\x{212a}", "[Kk\\x{212a}]" }, + { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" }, + { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" }, + { "(?i)[\\x00-\\x{10ffff}]", "." }, + + // Empty string as a regular expression. + // Empty string must be preserved inside parens in order + // to make submatches work right, so these are less + // interesting than they used to be. ToString inserts + // explicit (?:) in place of non-parenthesized empty strings, + // to make them easier to spot for other parsers. + { "(a|b|)", "([a-b]|(?:))" }, + { "(|)", "()" }, + { "a()", "a()" }, + { "(()|())", "(()|())" }, + { "(a|)", "(a|(?:))" }, + { "ab()cd()", "ab()cd()" }, + { "()", "()" }, + { "()*", "()*" }, + { "()+", "()+" }, + { "()?" , "()?" }, + { "(){0}", "" }, + { "(){1}", "()" }, + { "(){1,}", "()+" }, + { "(){0,2}", "(?:()()?)?" }, +}; + +TEST(TestSimplify, SimpleRegexps) { + for (int i = 0; i < arraysize(tests); i++) { + RegexpStatus status; + VLOG(1) << "Testing " << tests[i].regexp; + Regexp* re = Regexp::Parse(tests[i].regexp, + Regexp::MatchNL | (Regexp::LikePerl & + ~Regexp::OneLine), + &status); + CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text(); + Regexp* sre = re->Simplify(); + CHECK(sre != NULL); + + // Check that already-simple regexps don't allocate new ones. 
+ if (strcmp(tests[i].regexp, tests[i].simplified) == 0) { + CHECK(re == sre) << " " << tests[i].regexp + << " " << re->ToString() << " " << sre->ToString(); + } + + EXPECT_EQ(tests[i].simplified, sre->ToString()) + << " " << tests[i].regexp << " " << sre->Dump(); + + re->Decref(); + sre->Decref(); + } +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/string_generator.cc b/outside/re2/re2/testing/string_generator.cc new file mode 100644 index 000000000..5be6d3eb5 --- /dev/null +++ b/outside/re2/re2/testing/string_generator.cc @@ -0,0 +1,113 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// String generator: generates all possible strings of up to +// maxlen letters using the set of letters in alpha. +// Fetch strings using a Java-like Next()/HasNext() interface. + +#include +#include +#include "util/test.h" +#include "re2/testing/string_generator.h" + +namespace re2 { + +StringGenerator::StringGenerator(int maxlen, const vector& alphabet) + : maxlen_(maxlen), alphabet_(alphabet), + generate_null_(false), + random_(false), nrandom_(0), acm_(NULL) { + + // Degenerate case: no letters, no non-empty strings. + if (alphabet_.size() == 0) + maxlen_ = 0; + + // Next() will return empty string (digits_ is empty). + hasnext_ = true; +} + +StringGenerator::~StringGenerator() { + delete acm_; +} + +// Resets the string generator state to the beginning. +void StringGenerator::Reset() { + digits_.clear(); + hasnext_ = true; + random_ = false; + nrandom_ = 0; + generate_null_ = false; +} + +// Increments the big number in digits_, returning true if successful. +// Returns false if all the numbers have been used. +bool StringGenerator::IncrementDigits() { + // First try to increment the current number. 
+ for (int i = digits_.size() - 1; i >= 0; i--) { + if (++digits_[i] < alphabet_.size()) + return true; + digits_[i] = 0; + } + + // If that failed, make a longer number. + if (digits_.size() < maxlen_) { + digits_.push_back(0); + return true; + } + + return false; +} + +// Generates random digits_, return true if successful. +// Returns false if the random sequence is over. +bool StringGenerator::RandomDigits() { + if (--nrandom_ <= 0) + return false; + + // Pick length. + int len = acm_->Uniform(maxlen_+1); + digits_.resize(len); + for (int i = 0; i < len; i++) + digits_[i] = acm_->Uniform(alphabet_.size()); + return true; +} + +// Returns the next string in the iteration, which is the one +// currently described by digits_. Calls IncrementDigits +// after computing the string, so that it knows the answer +// for subsequent HasNext() calls. +const StringPiece& StringGenerator::Next() { + CHECK(hasnext_); + if (generate_null_) { + generate_null_ = false; + sp_ = NULL; + return sp_; + } + s_.clear(); + for (int i = 0; i < digits_.size(); i++) { + s_ += alphabet_[digits_[i]]; + } + hasnext_ = random_ ? RandomDigits() : IncrementDigits(); + sp_ = s_; + return sp_; +} + +// Sets generator up to return n random strings. +void StringGenerator::Random(int32 seed, int n) { + if (acm_ == NULL) + acm_ = new ACMRandom(seed); + else + acm_->Reset(seed); + + random_ = true; + nrandom_ = n; + hasnext_ = nrandom_ > 0; +} + +void StringGenerator::GenerateNULL() { + generate_null_ = true; + hasnext_ = true; +} + +} // namespace re2 + diff --git a/outside/re2/re2/testing/string_generator.h b/outside/re2/re2/testing/string_generator.h new file mode 100644 index 000000000..6a9ef42a5 --- /dev/null +++ b/outside/re2/re2/testing/string_generator.h @@ -0,0 +1,58 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// String generator: generates all possible strings of up to +// maxlen letters using the set of letters in alpha. +// Fetch strings using a Java-like Next()/HasNext() interface. + +#ifndef RE2_TESTING_STRING_GENERATOR_H__ +#define RE2_TESTING_STRING_GENERATOR_H__ + +#include +#include +#include "util/util.h" +#include "util/random.h" +#include "re2/stringpiece.h" + +namespace re2 { + +class StringGenerator { + public: + StringGenerator(int maxlen, const vector& alphabet); + ~StringGenerator(); + const StringPiece& Next(); + bool HasNext() { return hasnext_; } + + // Resets generator to start sequence over. + void Reset(); + + // Causes generator to emit random strings for next n calls to Next(). + void Random(int32 seed, int n); + + // Causes generator to emit a NULL as the next call. + void GenerateNULL(); + + private: + bool IncrementDigits(); + bool RandomDigits(); + + // Global state. + int maxlen_; // Maximum length string to generate. + vector alphabet_; // Alphabet, one string per letter. + + // Iteration state. + StringPiece sp_; // Last StringPiece returned by Next(). + string s_; // String data in last StringPiece returned by Next(). + bool hasnext_; // Whether Next() can be called again. + vector digits_; // Alphabet indices for next string. + bool generate_null_; // Whether to generate a NULL StringPiece next. + bool random_; // Whether generated strings are random. + int nrandom_; // Number of random strings left to generate. + ACMRandom* acm_; // Random number generator + DISALLOW_EVIL_CONSTRUCTORS(StringGenerator); +}; + +} // namespace re2 + +#endif // RE2_TESTING_STRING_GENERATOR_H__ diff --git a/outside/re2/re2/testing/string_generator_test.cc b/outside/re2/re2/testing/string_generator_test.cc new file mode 100644 index 000000000..d13401a57 --- /dev/null +++ b/outside/re2/re2/testing/string_generator_test.cc @@ -0,0 +1,109 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Test StringGenerator. + +#include +#include +#include +#include "util/test.h" +#include "re2/testing/string_generator.h" +#include "re2/testing/regexp_generator.h" + +namespace re2 { + +// Returns i to the e. +static int64 IntegerPower(int i, int e) { + int64 p = 1; + while (e-- > 0) + p *= i; + return p; +} + +// Checks that for given settings of the string generator: +// * it generates strings that are non-decreasing in length. +// * strings of the same length are sorted in alphabet order. +// * it doesn't generate the same string twice. +// * it generates the right number of strings. +// +// If all of these hold, the StringGenerator is behaving. +// Assumes that the alphabet is sorted, so that the generated +// strings can just be compared lexicographically. +static void RunTest(int len, string alphabet, bool donull) { + StringGenerator g(len, Explode(alphabet)); + + int n = 0; + int last_l = -1; + string last_s; + + if (donull) { + g.GenerateNULL(); + EXPECT_TRUE(g.HasNext()); + StringPiece sp = g.Next(); + EXPECT_EQ(sp.data(), static_cast(NULL)); + EXPECT_EQ(sp.size(), 0); + } + + while (g.HasNext()) { + string s = g.Next().as_string(); + n++; + + // Check that all characters in s appear in alphabet. + for (const char *p = s.c_str(); *p != '\0'; ) { + Rune r; + p += chartorune(&r, p); + EXPECT_TRUE(utfrune(alphabet.c_str(), r) != NULL); + } + + // Check that string is properly ordered w.r.t. previous string. + int l = utflen(s.c_str()); + EXPECT_LE(l, len); + if (last_l < l) { + last_l = l; + } else { + EXPECT_EQ(last_l, l); + EXPECT_LT(last_s, s); + } + last_s = s; + } + + // Check total string count. + int64 m = 0; + int alpha = utflen(alphabet.c_str()); + if (alpha == 0) // Degenerate case. 
+ len = 0; + for (int i = 0; i <= len; i++) + m += IntegerPower(alpha, i); + EXPECT_EQ(n, m); +} + +TEST(StringGenerator, NoLength) { + RunTest(0, "abc", false); +} + +TEST(StringGenerator, NoLengthNoAlphabet) { + RunTest(0, "", false); +} + +TEST(StringGenerator, NoAlphabet) { + RunTest(5, "", false); +} + +TEST(StringGenerator, Simple) { + RunTest(3, "abc", false); +} + +TEST(StringGenerator, UTF8) { + RunTest(4, "abc\xE2\x98\xBA", false); +} + +TEST(StringGenerator, GenNULL) { + RunTest(0, "abc", true); + RunTest(0, "", true); + RunTest(5, "", true); + RunTest(3, "abc", true); + RunTest(4, "abc\xE2\x98\xBA", true); +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/tester.cc b/outside/re2/re2/testing/tester.cc new file mode 100644 index 000000000..003dc5add --- /dev/null +++ b/outside/re2/re2/testing/tester.cc @@ -0,0 +1,640 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression engine tester -- test all the implementations against each other. + +#include "util/util.h" +#include "util/flags.h" +#include "re2/testing/tester.h" +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" + +DEFINE_bool(dump_prog, false, "dump regexp program"); +DEFINE_bool(log_okay, false, "log successful runs"); +DEFINE_bool(dump_rprog, false, "dump reversed regexp program"); + +DEFINE_int32(max_regexp_failures, 100, + "maximum number of regexp test failures (-1 = unlimited)"); + +DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test"); + +namespace re2 { + +enum { + kMaxSubmatch = 1+16, // $0...$16 +}; + +const char* engine_types[kEngineMax] = { + "Backtrack", + "NFA", + "DFA", + "DFA1", + "OnePass", + "BitState", + "RE2", + "RE2a", + "RE2b", + "PCRE", +}; + +// Returns the name string for the type t. 
+static string EngineString(Engine t) { + if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) { + return StringPrintf("type%d", static_cast(t)); + } + return engine_types[t]; +} + +// Returns bit mask of engines to use. +static uint32 Engines() { + static uint32 cached_engines; + static bool did_parse; + + if (did_parse) + return cached_engines; + + if (FLAGS_regexp_engines.empty()) { + cached_engines = ~0; + } else { + for (Engine i = static_cast(0); i < kEngineMax; i++) + if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str())) + cached_engines |= 1<(0); i < kEngineMax; i++) { + if (cached_engines & (1<(s.begin() - text.begin()), + static_cast(s.end() - text.begin())); +} + +// Returns whether text contains non-ASCII (>= 0x80) bytes. +static bool NonASCII(const StringPiece& text) { + for (int i = 0; i < text.size(); i++) + if ((uint8)text[i] >= 0x80) + return true; + return false; +} + +// Returns string representation of match kind. +static string FormatKind(Prog::MatchKind kind) { + switch (kind) { + case Prog::kFullMatch: + return "full match"; + case Prog::kLongestMatch: + return "longest match"; + case Prog::kFirstMatch: + return "first match"; + case Prog::kManyMatch: + return "many match"; + } + return "???"; +} + +// Returns string representation of anchor kind. 
+static string FormatAnchor(Prog::Anchor anchor) { + switch (anchor) { + case Prog::kAnchored: + return "anchored"; + case Prog::kUnanchored: + return "unanchored"; + } + return "???"; +} + +struct ParseMode { + Regexp::ParseFlags parse_flags; + string desc; +}; + +static const Regexp::ParseFlags single_line = + Regexp::LikePerl; +static const Regexp::ParseFlags multi_line = + static_cast(Regexp::LikePerl & ~Regexp::OneLine); + +static ParseMode parse_modes[] = { + { single_line, "single-line" }, + { single_line|Regexp::Latin1, "single-line, latin1" }, + { multi_line, "multiline" }, + { multi_line|Regexp::NonGreedy, "multiline, nongreedy" }, + { multi_line|Regexp::Latin1, "multiline, latin1" }, +}; + +static string FormatMode(Regexp::ParseFlags flags) { + for (int i = 0; i < arraysize(parse_modes); i++) + if (parse_modes[i].parse_flags == flags) + return parse_modes[i].desc; + return StringPrintf("%#x", static_cast(flags)); +} + +// Constructs and saves all the matching engines that +// will be required for the given tests. +TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, + Regexp::ParseFlags flags) + : regexp_str_(regexp_str), + kind_(kind), + flags_(flags), + error_(false), + regexp_(NULL), + num_captures_(0), + prog_(NULL), + rprog_(NULL), + re_(NULL), + re2_(NULL) { + + VLOG(1) << CEscape(regexp_str); + + // Compile regexp to prog. + // Always required - needed for backtracking (reference implementation). 
+ RegexpStatus status; + regexp_ = Regexp::Parse(regexp_str, flags, &status); + if (regexp_ == NULL) { + LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) + << " mode: " << FormatMode(flags); + error_ = true; + return; + } + num_captures_ = regexp_->NumCaptures(); + prog_ = regexp_->CompileToProg(0); + if (prog_ == NULL) { + LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_); + error_ = true; + return; + } + if (FLAGS_dump_prog) { + LOG(INFO) << "Prog for " + << " regexp " + << CEscape(regexp_str_) + << " (" << FormatKind(kind_) + << ", " << FormatMode(flags_) + << ")\n" + << prog_->Dump(); + } + + // Compile regexp to reversed prog. Only needed for DFA engines. + if (Engines() & ((1<CompileToReverseProg(0); + if (rprog_ == NULL) { + LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_); + error_ = true; + return; + } + if (FLAGS_dump_rprog) + LOG(INFO) << rprog_->Dump(); + } + + // Create re string that will be used for RE and RE2. + string re = regexp_str.as_string(); + // Accomodate flags. + // Regexp::Latin1 will be accomodated below. + if (!(flags & Regexp::OneLine)) + re = "(?m)" + re; + if (flags & Regexp::NonGreedy) + re = "(?U)" + re; + if (flags & Regexp::DotNL) + re = "(?s)" + re; + + // Compile regexp to RE2. + if (Engines() & ((1<error().empty()) { + LOG(INFO) << "Cannot RE2: " << CEscape(re); + error_ = true; + return; + } + } + + // Compile regexp to RE. + // PCRE as exposed by the RE interface isn't always usable. + // 1. It disagrees about handling of empty-string reptitions + // like matching (a*)* against "b". PCRE treats the (a*) as + // occurring once, while we treat it as occurring not at all. + // 2. It treats $ as this weird thing meaning end of string + // or before the \n at the end of the string. + // 3. It doesn't implement POSIX leftmost-longest matching. + // MimicsPCRE() detects 1 and 2. 
+ if ((Engines() & (1<MimicsPCRE() && + kind_ != Prog::kLongestMatch) { + PCRE_Options o; + o.set_option(PCRE::UTF8); + if (flags & Regexp::Latin1) + o.set_option(PCRE::None); + // PCRE has interface bug keeping us from finding $0, so + // add one more layer of parens. + re_ = new PCRE("("+re+")", o); + if (!re_->error().empty()) { + LOG(INFO) << "Cannot PCRE: " << CEscape(re); + error_ = true; + return; + } + } +} + +TestInstance::~TestInstance() { + if (regexp_) + regexp_->Decref(); + delete prog_; + delete rprog_; + delete re_; + delete re2_; +} + +// Runs a single search using the named engine type. +// This interface hides all the irregularities of the various +// engine interfaces from the rest of this file. +void TestInstance::RunSearch(Engine type, + const StringPiece& orig_text, + const StringPiece& orig_context, + Prog::Anchor anchor, + Result *result) { + memset(result, 0, sizeof *result); + if (regexp_ == NULL) { + result->skipped = true; + return; + } + int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0 + if (nsubmatch > kMaxSubmatch) + nsubmatch = kMaxSubmatch; + + StringPiece text = orig_text; + StringPiece context = orig_context; + + switch (type) { + default: + LOG(FATAL) << "Bad RunSearch type: " << (int)type; + + case kEngineBacktrack: + if (prog_ == NULL) { + result->skipped = true; + break; + } + result->matched = + prog_->UnsafeSearchBacktrack(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineNFA: + if (prog_ == NULL) { + result->skipped = true; + break; + } + result->matched = + prog_->SearchNFA(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineDFA: + if (prog_ == NULL) { + result->skipped = true; + break; + } + result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL, + &result->skipped, NULL); + break; + + case kEngineDFA1: + if (prog_ == NULL || rprog_ == NULL) { + 
result->skipped = true; + break; + } + result->matched = + prog_->SearchDFA(text, context, anchor, kind_, result->submatch, + &result->skipped, NULL); + // If anchored, no need for second run, + // but do it anyway to find more bugs. + if (result->matched) { + if (!rprog_->SearchDFA(result->submatch[0], context, + Prog::kAnchored, Prog::kLongestMatch, + result->submatch, + &result->skipped, NULL)) { + LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_) + << " on " << CEscape(text); + result->matched = false; + } + } + result->have_submatch0 = true; + break; + + case kEngineOnePass: + if (prog_ == NULL || + anchor == Prog::kUnanchored || + !prog_->IsOnePass() || + nsubmatch > Prog::kMaxOnePassCapture) { + result->skipped = true; + break; + } + result->matched = prog_->SearchOnePass(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineBitState: + if (prog_ == NULL) { + result->skipped = true; + break; + } + result->matched = prog_->SearchBitState(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineRE2: + case kEngineRE2a: + case kEngineRE2b: { + if (!re2_ || text.end() != context.end()) { + result->skipped = true; + break; + } + + RE2::Anchor re_anchor; + if (anchor == Prog::kAnchored) + re_anchor = RE2::ANCHOR_START; + else + re_anchor = RE2::UNANCHORED; + if (kind_ == Prog::kFullMatch) + re_anchor = RE2::ANCHOR_BOTH; + + result->matched = re2_->Match(context, + text.begin() - context.begin(), + text.end() - context.begin(), + re_anchor, result->submatch, nsubmatch); + result->have_submatch = nsubmatch > 0; + break; + } + + case kEnginePCRE: { + if (!re_ || text.begin() != context.begin() || + text.end() != context.end()) { + result->skipped = true; + break; + } + + const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch]; + PCRE::Arg *a = new PCRE::Arg[nsubmatch]; + for (int i = 0; i < nsubmatch; i++) { + a[i] = 
PCRE::Arg(&result->submatch[i]); + argptr[i] = &a[i]; + } + int consumed; + PCRE::Anchor pcre_anchor; + if (anchor == Prog::kAnchored) + pcre_anchor = PCRE::ANCHOR_START; + else + pcre_anchor = PCRE::UNANCHORED; + if (kind_ == Prog::kFullMatch) + pcre_anchor = PCRE::ANCHOR_BOTH; + re_->ClearHitLimit(); + result->matched = + re_->DoMatch(text, + pcre_anchor, + &consumed, + argptr, nsubmatch); + if (re_->HitLimit()) { + result->untrusted = true; + delete[] argptr; + delete[] a; + break; + } + result->have_submatch = true; + + // Work around RE interface bug: PCRE returns -1 as the + // offsets for an unmatched subexpression, and RE should + // turn that into StringPiece(NULL) but in fact it uses + // StringPiece(text.begin() - 1, 0). Oops. + for (int i = 0; i < nsubmatch; i++) + if (result->submatch[i].begin() == text.begin() - 1) + result->submatch[i] = NULL; + delete[] argptr; + delete[] a; + break; + } + } + + if (!result->matched) + memset(result->submatch, 0, sizeof result->submatch); +} + +// Checks whether r is okay given that correct is the right answer. +// Specifically, r's answers have to match (but it doesn't have to +// claim to have all the answers). +static bool ResultOkay(const Result& r, const Result& correct) { + if (r.skipped) + return true; + if (r.matched != correct.matched) + return false; + if (r.have_submatch || r.have_submatch0) { + for (int i = 0; i < kMaxSubmatch; i++) { + if (correct.submatch[i].begin() != r.submatch[i].begin() || + correct.submatch[i].size() != r.submatch[i].size()) + return false; + if (!r.have_submatch) + break; + } + } + return true; +} + +// Runs a single test. +bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor) { + // Backtracking is the gold standard. + Result correct; + RunSearch(kEngineBacktrack, text, context, anchor, &correct); + if (correct.skipped) { + if (regexp_ == NULL) + return true; + LOG(ERROR) << "Skipped backtracking! 
" << CEscape(regexp_str_) + << " " << FormatMode(flags_); + return false; + } + VLOG(1) << "Try: regexp " << CEscape(regexp_str_) + << " text " << CEscape(text) + << " (" << FormatKind(kind_) + << ", " << FormatAnchor(anchor) + << ", " << FormatMode(flags_) + << ")"; + + // Compare the others. + bool all_okay = true; + for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) { + if (!(Engines() & (1< 0 && --FLAGS_max_regexp_failures == 0) + LOG(QFATAL) << "Too many regexp failures."; + } + + return all_okay; +} + +void TestInstance::LogMatch(const char* prefix, Engine e, + const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor) { + LOG(INFO) << prefix + << EngineString(e) + << " regexp " + << CEscape(regexp_str_) + << " " + << CEscape(regexp_->ToString()) + << " text " + << CEscape(text) + << " (" + << text.begin() - context.begin() + << "," + << text.end() - context.begin() + << ") of context " + << CEscape(context) + << " (" << FormatKind(kind_) + << ", " << FormatAnchor(anchor) + << ", " << FormatMode(flags_) + << ")"; +} + +static Prog::MatchKind kinds[] = { + Prog::kFirstMatch, + Prog::kLongestMatch, + Prog::kFullMatch, +}; + +// Test all possible match kinds and parse modes. 
+Tester::Tester(const StringPiece& regexp) { + error_ = false; + for (int i = 0; i < arraysize(kinds); i++) { + for (int j = 0; j < arraysize(parse_modes); j++) { + TestInstance* t = new TestInstance(regexp, kinds[i], + parse_modes[j].parse_flags); + error_ |= t->error(); + v_.push_back(t); + } + } +} + +Tester::~Tester() { + for (int i = 0; i < v_.size(); i++) + delete v_[i]; +} + +bool Tester::TestCase(const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor) { + bool okay = true; + for (int i = 0; i < v_.size(); i++) + okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor)); + return okay; +} + +static Prog::Anchor anchors[] = { + Prog::kAnchored, + Prog::kUnanchored +}; + +bool Tester::TestInput(const StringPiece& text) { + bool okay = TestInputInContext(text, text); + if (text.size() > 0) { + StringPiece sp; + sp = text; + sp.remove_prefix(1); + okay &= TestInputInContext(sp, text); + sp = text; + sp.remove_suffix(1); + okay &= TestInputInContext(sp, text); + } + return okay; +} + +bool Tester::TestInputInContext(const StringPiece& text, + const StringPiece& context) { + bool okay = true; + for (int i = 0; i < arraysize(anchors); i++) + okay &= TestCase(text, context, anchors[i]); + return okay; +} + +bool TestRegexpOnText(const StringPiece& regexp, + const StringPiece& text) { + Tester t(regexp); + return t.TestInput(text); +} + +} // namespace re2 diff --git a/outside/re2/re2/testing/tester.h b/outside/re2/re2/testing/tester.h new file mode 100644 index 000000000..6e16e7757 --- /dev/null +++ b/outside/re2/re2/testing/tester.h @@ -0,0 +1,121 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Comparative tester for regular expression matching. +// Checks all implementations against each other. 
+
+#ifndef RE2_TESTING_TESTER_H__
+#define RE2_TESTING_TESTER_H__
+
+#include "re2/stringpiece.h"
+#include "re2/prog.h"
+#include "re2/regexp.h"
+#include "re2/re2.h"
+#include "util/pcre.h"
+
+namespace re2 {
+
+class Regexp;
+
+// All the supported regexp engines.
+enum Engine {
+  kEngineBacktrack = 0,  // Prog::UnsafeSearchBacktrack (reference answer)
+  kEngineNFA,            // Prog::SearchNFA
+  kEngineDFA,            // Prog::SearchDFA, only ask whether it matched
+  kEngineDFA1,           // Prog::SearchDFA, ask for match[0]
+  kEngineOnePass,        // Prog::SearchOnePass, if applicable
+  kEngineBitState,       // Prog::SearchBitState
+  kEngineRE2,            // RE2, all submatches
+  kEngineRE2a,           // RE2, only ask for match[0]
+  kEngineRE2b,           // RE2, only ask whether it matched
+  kEnginePCRE,           // PCRE (util/pcre.h)
+
+  kEngineMax,
+};
+
+// Make normal math on the enum preserve the type.
+// By default, C++ doesn't define ++ on enum, and e+1 has type int.
+static inline void operator++(Engine& e, int unused) {
+  e = static_cast<Engine>(e+1);
+}
+
+static inline Engine operator+(Engine e, int i) {
+  return static_cast<Engine>(static_cast<int>(e)+i);
+}
+
+// A TestInstance caches per-regexp state for a given
+// regular expression in a given configuration
+// (UTF-8 vs Latin1, longest vs first match, etc.).
+class TestInstance {
+ public:
+  struct Result;
+
+  TestInstance(const StringPiece& regexp, Prog::MatchKind kind,
+               Regexp::ParseFlags flags);
+  ~TestInstance();
+  Regexp::ParseFlags flags() { return flags_; }
+  bool error() { return error_; }
+
+  // Runs a single test case: search in text, which is in context,
+  // using the given anchoring.
+  bool RunCase(const StringPiece& text, const StringPiece& context,
+               Prog::Anchor anchor);
+
+ private:
+  // Runs a single search using the named engine type.
+  void RunSearch(Engine type,
+                 const StringPiece& text, const StringPiece& context,
+                 Prog::Anchor anchor,
+                 Result *result);
+
+  void LogMatch(const char* prefix, Engine e, const StringPiece& text,
+                const StringPiece& context, Prog::Anchor anchor);
+
+  const StringPiece regexp_str_;    // regexp being tested (own copy of the view; a reference could dangle)
+  Prog::MatchKind kind_;            // kind of match
+  Regexp::ParseFlags flags_;        // flags for parsing regexp_str_
+  bool error_;                      // error during constructor?
+
+  Regexp* regexp_;                  // parsed regexp
+  int num_captures_;                // regexp_->NumCaptures() cached
+  Prog* prog_;                      // compiled program
+  Prog* rprog_;                     // compiled reverse program
+  PCRE* re_;                        // PCRE implementation
+  RE2* re2_;                        // RE2 implementation
+
+  DISALLOW_EVIL_CONSTRUCTORS(TestInstance);
+};
+
+// A group of TestInstances for all possible configurations.
+class Tester {
+ public:
+  explicit Tester(const StringPiece& regexp);
+  ~Tester();
+
+  bool error() { return error_; }
+
+  // Runs a single test case: search in text, which is in context,
+  // using the given anchoring.
+  bool TestCase(const StringPiece& text, const StringPiece& context,
+                Prog::Anchor anchor);
+
+  // Run TestCase(text, text, anchor) for all anchoring modes.
+  bool TestInput(const StringPiece& text);
+
+  // Run TestCase(text, context, anchor) for all anchoring modes.
+  bool TestInputInContext(const StringPiece& text, const StringPiece& context);
+
+ private:
+  bool error_;
+  vector<TestInstance*> v_;
+
+  DISALLOW_EVIL_CONSTRUCTORS(Tester);
+};
+
+// Run all possible tests using regexp and text.
+bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text);
+
+}  // namespace re2
+
+#endif  // RE2_TESTING_TESTER_H__
diff --git a/outside/re2/re2/testing/unicode_test.py b/outside/re2/re2/testing/unicode_test.py
new file mode 100755
index 000000000..a88a3ad5a
--- /dev/null
+++ b/outside/re2/re2/testing/unicode_test.py
@@ -0,0 +1,207 @@
+#!/usr/bin/python2.4
+#
+# Copyright 2008 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +"""Unittest for the util/regexp/re2/unicode.py module.""" + +import os +import StringIO +from google3.pyglib import flags +from google3.testing.pybase import googletest +from google3.util.regexp.re2 import unicode + +_UNICODE_DIR = os.path.join(flags.FLAGS.test_srcdir, "google3", "third_party", + "unicode", "ucd-5.1.0") + + +class ConvertTest(googletest.TestCase): + """Test the conversion functions.""" + + def testUInt(self): + self.assertEquals(0x0000, unicode._UInt("0000")) + self.assertEquals(0x263A, unicode._UInt("263A")) + self.assertEquals(0x10FFFF, unicode._UInt("10FFFF")) + self.assertRaises(unicode.InputError, unicode._UInt, "263") + self.assertRaises(unicode.InputError, unicode._UInt, "263AAAA") + self.assertRaises(unicode.InputError, unicode._UInt, "110000") + + def testURange(self): + self.assertEquals([1, 2, 3], unicode._URange("0001..0003")) + self.assertEquals([1], unicode._URange("0001")) + self.assertRaises(unicode.InputError, unicode._URange, "0001..0003..0005") + self.assertRaises(unicode.InputError, unicode._URange, "0003..0001") + self.assertRaises(unicode.InputError, unicode._URange, "0001..0001") + + def testUStr(self): + self.assertEquals("0x263A", unicode._UStr(0x263a)) + self.assertEquals("0x10FFFF", unicode._UStr(0x10FFFF)) + self.assertRaises(unicode.InputError, unicode._UStr, 0x110000) + self.assertRaises(unicode.InputError, unicode._UStr, -1) + + +_UNICODE_TABLE = """# Commented line, should be ignored. +# The next line is blank and should be ignored. 
+ +0041;Capital A;Line 1 +0061..007A;Lowercase;Line 2 +1F00;;Ignored +1FFE;;Line 3 +10FFFF;Runemax;Line 4 +0000;Zero;Line 5 +""" + +_BAD_TABLE1 = """ +111111;Not a code point; +""" + +_BAD_TABLE2 = """ +0000;;Missing +""" + +_BAD_TABLE3 = """ +0010..0001;Bad range; +""" + + +class AbortError(Exception): + """Function should not have been called.""" + + +def Abort(): + raise AbortError("Abort") + + +def StringTable(s, n, f): + unicode.ReadUnicodeTable(StringIO.StringIO(s), n, f) + + +class ReadUnicodeTableTest(googletest.TestCase): + """Test the ReadUnicodeTable function.""" + + def testSimpleTable(self): + + ncall = [0] # can't assign to ordinary int in DoLine + + def DoLine(codes, fields): + self.assertEquals(3, len(fields)) + ncall[0] += 1 + self.assertEquals("Line %d" % (ncall[0],), fields[2]) + if ncall[0] == 1: + self.assertEquals([0x0041], codes) + self.assertEquals("0041", fields[0]) + self.assertEquals("Capital A", fields[1]) + elif ncall[0] == 2: + self.assertEquals(range(0x0061, 0x007A + 1), codes) + self.assertEquals("0061..007A", fields[0]) + self.assertEquals("Lowercase", fields[1]) + elif ncall[0] == 3: + self.assertEquals(range(0x1F00, 0x1FFE + 1), codes) + self.assertEquals("1F00..1FFE", fields[0]) + self.assertEquals("Greek", fields[1]) + elif ncall[0] == 4: + self.assertEquals([0x10FFFF], codes) + self.assertEquals("10FFFF", fields[0]) + self.assertEquals("Runemax", fields[1]) + elif ncall[0] == 5: + self.assertEquals([0x0000], codes) + self.assertEquals("0000", fields[0]) + self.assertEquals("Zero", fields[1]) + + StringTable(_UNICODE_TABLE, 3, DoLine) + self.assertEquals(5, ncall[0]) + + def testErrorTables(self): + self.assertRaises(unicode.InputError, StringTable, _UNICODE_TABLE, 4, Abort) + self.assertRaises(unicode.InputError, StringTable, _UNICODE_TABLE, 2, Abort) + self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE1, 3, Abort) + self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE2, 3, Abort) + 
self.assertRaises(unicode.InputError, StringTable, _BAD_TABLE3, 3, Abort) + + +class ParseContinueTest(googletest.TestCase): + """Test the ParseContinue function.""" + + def testParseContinue(self): + self.assertEquals(("Private Use", "First"), + unicode._ParseContinue("")) + self.assertEquals(("Private Use", "Last"), + unicode._ParseContinue("")) + self.assertEquals(("", None), + unicode._ParseContinue("")) + + +class CaseGroupsTest(googletest.TestCase): + """Test the CaseGroups function (and the CaseFoldingReader).""" + + def FindGroup(self, c): + if type(c) == str: + c = ord(c) + for g in self.groups: + if c in g: + return g + return None + + def testCaseGroups(self): + self.groups = unicode.CaseGroups(unicode_dir=_UNICODE_DIR) + self.assertEquals([ord("A"), ord("a")], self.FindGroup("a")) + self.assertEquals(None, self.FindGroup("0")) + + +class ScriptsTest(googletest.TestCase): + """Test the Scripts function (and the ScriptsReader).""" + + def FindScript(self, c): + if type(c) == str: + c = ord(c) + for script, codes in self.scripts.items(): + for code in codes: + if c == code: + return script + return None + + def testScripts(self): + self.scripts = unicode.Scripts(unicode_dir=_UNICODE_DIR) + self.assertEquals("Latin", self.FindScript("a")) + self.assertEquals("Common", self.FindScript("0")) + self.assertEquals(None, self.FindScript(0xFFFE)) + + +class CategoriesTest(googletest.TestCase): + """Test the Categories function (and the UnicodeDataReader).""" + + def FindCategory(self, c): + if type(c) == str: + c = ord(c) + short = None + for category, codes in self.categories.items(): + for code in codes: + if code == c: + # prefer category Nd over N + if len(category) > 1: + return category + if short == None: + short = category + return short + + def testCategories(self): + self.categories = unicode.Categories(unicode_dir=_UNICODE_DIR) + self.assertEquals("Ll", self.FindCategory("a")) + self.assertEquals("Nd", self.FindCategory("0")) + self.assertEquals("Lo", 
self.FindCategory(0xAD00)) # in First, Last range + self.assertEquals(None, self.FindCategory(0xFFFE)) + self.assertEquals("Lo", self.FindCategory(0x8B5A)) + self.assertEquals("Lo", self.FindCategory(0x6C38)) + self.assertEquals("Lo", self.FindCategory(0x92D2)) + self.assertTrue(ord("a") in self.categories["L"]) + self.assertTrue(ord("0") in self.categories["N"]) + self.assertTrue(0x8B5A in self.categories["L"]) + self.assertTrue(0x6C38 in self.categories["L"]) + self.assertTrue(0x92D2 in self.categories["L"]) + +def main(): + googletest.main() + +if __name__ == "__main__": + main() diff --git a/outside/re2/re2/tostring.cc b/outside/re2/re2/tostring.cc new file mode 100644 index 000000000..555524f29 --- /dev/null +++ b/outside/re2/re2/tostring.cc @@ -0,0 +1,341 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Format a regular expression structure as a string. +// Tested by parse_test.cc + +#include "util/util.h" +#include "re2/regexp.h" +#include "re2/walker-inl.h" + +namespace re2 { + +enum { + PrecAtom, + PrecUnary, + PrecConcat, + PrecAlternate, + PrecEmpty, + PrecParen, + PrecToplevel, +}; + +// Helper function. See description below. +static void AppendCCRange(string* t, Rune lo, Rune hi); + +// Walker to generate string in s_. +// The arg pointers are actually integers giving the +// context precedence. +// The child_args are always NULL. +class ToStringWalker : public Regexp::Walker { + public: + explicit ToStringWalker(string* t) : t_(t) {} + + virtual int PreVisit(Regexp* re, int parent_arg, bool* stop); + virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg, + int* child_args, int nchild_args); + virtual int ShortVisit(Regexp* re, int parent_arg) { + return 0; + } + + private: + string* t_; // The string the walker appends to. 
+ + DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker); +}; + +string Regexp::ToString() { + string t; + ToStringWalker w(&t); + w.WalkExponential(this, PrecToplevel, 100000); + if (w.stopped_early()) + t += " [truncated]"; + return t; +} + +#define ToString DontCallToString // Avoid accidental recursion. + +// Visits re before children are processed. +// Appends ( if needed and passes new precedence to children. +int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { + int prec = parent_arg; + int nprec = PrecAtom; + + switch (re->op()) { + case kRegexpNoMatch: + case kRegexpEmptyMatch: + case kRegexpLiteral: + case kRegexpAnyChar: + case kRegexpAnyByte: + case kRegexpBeginLine: + case kRegexpEndLine: + case kRegexpBeginText: + case kRegexpEndText: + case kRegexpWordBoundary: + case kRegexpNoWordBoundary: + case kRegexpCharClass: + case kRegexpHaveMatch: + nprec = PrecAtom; + break; + + case kRegexpConcat: + case kRegexpLiteralString: + if (prec < PrecConcat) + t_->append("(?:"); + nprec = PrecConcat; + break; + + case kRegexpAlternate: + if (prec < PrecAlternate) + t_->append("(?:"); + nprec = PrecAlternate; + break; + + case kRegexpCapture: + t_->append("("); + if (re->name()) { + t_->append("?P<"); + t_->append(*re->name()); + t_->append(">"); + } + nprec = PrecParen; + break; + + case kRegexpStar: + case kRegexpPlus: + case kRegexpQuest: + case kRegexpRepeat: + if (prec < PrecUnary) + t_->append("(?:"); + // The subprecedence here is PrecAtom instead of PrecUnary + // because PCRE treats two unary ops in a row as a parse error. 
+      nprec = PrecAtom;
+      break;
+  }
+
+  return nprec;
+}
+
+static void AppendLiteral(string *t, Rune r, bool foldcase) {
+  if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
+    t->append(1, '\\');
+    t->append(1, r);
+  } else if (foldcase && 'a' <= r && r <= 'z') {
+    if ('a' <= r && r <= 'z')
+      r += 'A' - 'a';
+    t->append(1, '[');
+    t->append(1, r);
+    t->append(1, r + 'a' - 'A');
+    t->append(1, ']');
+  } else {
+    AppendCCRange(t, r, r);
+  }
+}
+
+// Visits re after children are processed.
+// For childless regexps, all the work is done here.
+// For regexps with children, append any unary suffixes or ).
+int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
+                              int* child_args, int nchild_args) {
+  int prec = parent_arg;
+  switch (re->op()) {
+    case kRegexpNoMatch:
+      // There's no simple symbol for "no match", but
+      // [^0-Runemax] excludes everything.
+      t_->append("[^\\x00-\\x{10ffff}]");
+      break;
+
+    case kRegexpEmptyMatch:
+      // Append (?:) to make empty string visible,
+      // unless this is already being parenthesized.
+      if (prec < PrecEmpty)
+        t_->append("(?:)");
+      break;
+
+    case kRegexpLiteral:
+      AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
+      break;
+
+    case kRegexpLiteralString:
+      for (int i = 0; i < re->nrunes(); i++)
+        AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
+      if (prec < PrecConcat)
+        t_->append(")");
+      break;
+
+    case kRegexpConcat:
+      if (prec < PrecConcat)
+        t_->append(")");
+      break;
+
+    case kRegexpAlternate:
+      // Clumsy but workable: the children all appended |
+      // at the end of their strings, so just remove the last one.
+      if ((*t_)[t_->size()-1] == '|')
+        t_->erase(t_->size()-1);
+      else
+        LOG(DFATAL) << "Bad final char: " << *t_;  // log the text, not the pointer
+      if (prec < PrecAlternate)
+        t_->append(")");
+      break;
+
+    case kRegexpStar:
+      t_->append("*");
+      if (re->parse_flags() & Regexp::NonGreedy)
+        t_->append("?");
+      if (prec < PrecUnary)
+        t_->append(")");
+      break;
+
+    case kRegexpPlus:
+      t_->append("+");
+      if (re->parse_flags() & Regexp::NonGreedy)
+        t_->append("?");
+      if (prec < PrecUnary)
+        t_->append(")");
+      break;
+
+    case kRegexpQuest:
+      t_->append("?");
+      if (re->parse_flags() & Regexp::NonGreedy)
+        t_->append("?");
+      if (prec < PrecUnary)
+        t_->append(")");
+      break;
+
+    case kRegexpRepeat:
+      if (re->max() == -1)
+        t_->append(StringPrintf("{%d,}", re->min()));
+      else if (re->min() == re->max())
+        t_->append(StringPrintf("{%d}", re->min()));
+      else
+        t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
+      if (re->parse_flags() & Regexp::NonGreedy)
+        t_->append("?");
+      if (prec < PrecUnary)
+        t_->append(")");
+      break;
+
+    case kRegexpAnyChar:
+      t_->append(".");
+      break;
+
+    case kRegexpAnyByte:
+      t_->append("\\C");
+      break;
+
+    case kRegexpBeginLine:
+      t_->append("^");
+      break;
+
+    case kRegexpEndLine:
+      t_->append("$");
+      break;
+
+    case kRegexpBeginText:
+      t_->append("(?-m:^)");
+      break;
+
+    case kRegexpEndText:
+      if (re->parse_flags() & Regexp::WasDollar)
+        t_->append("(?-m:$)");
+      else
+        t_->append("\\z");
+      break;
+
+    case kRegexpWordBoundary:
+      t_->append("\\b");
+      break;
+
+    case kRegexpNoWordBoundary:
+      t_->append("\\B");
+      break;
+
+    case kRegexpCharClass: {
+      if (re->cc()->size() == 0) {
+        t_->append("[^\\x00-\\x{10ffff}]");
+        break;
+      }
+      t_->append("[");
+      // Heuristic: show class as negated if it contains the
+      // non-character 0xFFFE.
+      CharClass* cc = re->cc();
+      if (cc->Contains(0xFFFE)) {
+        cc = cc->Negate();
+        t_->append("^");
+      }
+      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
+        AppendCCRange(t_, i->lo, i->hi);
+      if (cc != re->cc())
+        cc->Delete();
+      t_->append("]");
+      break;
+    }
+
+    case kRegexpCapture:
+      t_->append(")");
+      break;
+
+    case kRegexpHaveMatch:
+      // There's no syntax accepted by the parser to generate
+      // this node (it is generated by RE2::Set) so make something
+      // up that is readable but won't compile; format the id via StringPrintf.
+      t_->append(StringPrintf("(?HaveMatch:%d)", re->match_id()));
+      break;
+  }
+
+  // If the parent is an alternation, append the | for it.
+  if (prec == PrecAlternate)
+    t_->append("|");
+
+  return 0;
+}
+
+// Appends a rune for use in a character class to the string t.
+static void AppendCCChar(string* t, Rune r) {
+  if (0x20 <= r && r <= 0x7E) {
+    if (strchr("[]^-\\", r))
+      t->append("\\");
+    t->append(1, r);
+    return;
+  }
+  switch (r) {
+    default:
+      break;
+
+    case '\r':
+      t->append("\\r");
+      return;
+
+    case '\t':
+      t->append("\\t");
+      return;
+
+    case '\n':
+      t->append("\\n");
+      return;
+
+    case '\f':
+      t->append("\\f");
+      return;
+  }
+
+  if (r < 0x100) {
+    StringAppendF(t, "\\x%02x", static_cast<int>(r));
+    return;
+  }
+  StringAppendF(t, "\\x{%x}", static_cast<int>(r));
+}
+
+static void AppendCCRange(string* t, Rune lo, Rune hi) {
+  if (lo > hi)
+    return;
+  AppendCCChar(t, lo);
+  if (lo < hi) {
+    t->append("-");
+    AppendCCChar(t, hi);
+  }
+}
+
+}  // namespace re2
diff --git a/outside/re2/re2/unicode.py b/outside/re2/re2/unicode.py
new file mode 100644
index 000000000..6dfe87bbc
--- /dev/null
+++ b/outside/re2/re2/unicode.py
@@ -0,0 +1,297 @@
+# Copyright 2008 The RE2 Authors. All Rights Reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+"""Parser for Unicode data files (as distributed by unicode.org)."""
+
+import os
+import re
+import urllib2
+
+# Directory or URL where Unicode tables reside.
+_UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd" + +# Largest valid Unicode code value. +_RUNE_MAX = 0x10FFFF + + +class Error(Exception): + """Unicode error base class.""" + + +class InputError(Error): + """Unicode input error class. Raised on invalid input.""" + + +def _UInt(s): + """Converts string to Unicode code point ('263A' => 0x263a). + + Args: + s: string to convert + + Returns: + Unicode code point + + Raises: + InputError: the string is not a valid Unicode value. + """ + + try: + v = int(s, 16) + except ValueError: + v = -1 + if len(s) < 4 or len(s) > 6 or v < 0 or v > _RUNE_MAX: + raise InputError("invalid Unicode value %s" % (s,)) + return v + + +def _URange(s): + """Converts string to Unicode range. + + '0001..0003' => [1, 2, 3]. + '0001' => [1]. + + Args: + s: string to convert + + Returns: + Unicode range + + Raises: + InputError: the string is not a valid Unicode range. + """ + a = s.split("..") + if len(a) == 1: + return [_UInt(a[0])] + if len(a) == 2: + lo = _UInt(a[0]) + hi = _UInt(a[1]) + if lo < hi: + return range(lo, hi + 1) + raise InputError("invalid Unicode range %s" % (s,)) + + +def _UStr(v): + """Converts Unicode code point to hex string. + + 0x263a => '0x263A'. + + Args: + v: code point to convert + + Returns: + Unicode string + + Raises: + InputError: the argument is not a valid Unicode value. + """ + if v < 0 or v > _RUNE_MAX: + raise InputError("invalid Unicode value %s" % (v,)) + return "0x%04X" % (v,) + + +def _ParseContinue(s): + """Parses a Unicode continuation field. + + These are of the form '' or ''. + Instead of giving an explicit range in a single table entry, + some Unicode tables use two entries, one for the first + code value in the range and one for the last. + The first entry's description is '' instead of 'Name' + and the second is ''. 
+ + '' => ('Name', 'First') + '' => ('Name', 'Last') + 'Anything else' => ('Anything else', None) + + Args: + s: continuation field string + + Returns: + pair: name and ('First', 'Last', or None) + """ + + match = re.match("<(.*), (First|Last)>", s) + if match is not None: + return match.groups() + return (s, None) + + +def ReadUnicodeTable(filename, nfields, doline): + """Generic Unicode table text file reader. + + The reader takes care of stripping out comments and also + parsing the two different ways that the Unicode tables specify + code ranges (using the .. notation and splitting the range across + multiple lines). + + Each non-comment line in the table is expected to have the given + number of fields. The first field is known to be the Unicode value + and the second field its description. + + The reader calls doline(codes, fields) for each entry in the table. + If fn raises an exception, the reader prints that exception, + prefixed with the file name and line number, and continues + processing the file. When done with the file, the reader re-raises + the first exception encountered during the file. + + Arguments: + filename: the Unicode data file to read, or a file-like object. + nfields: the number of expected fields per line in that file. + doline: the function to call for each table entry. + + Raises: + InputError: nfields is invalid (must be >= 2). + """ + + if nfields < 2: + raise InputError("invalid number of fields %d" % (nfields,)) + + if type(filename) == str: + if filename.startswith("http://"): + fil = urllib2.urlopen(filename) + else: + fil = open(filename, "r") + else: + fil = filename + + first = None # first code in multiline range + expect_last = None # tag expected for "Last" line in multiline range + lineno = 0 # current line number + for line in fil: + lineno += 1 + try: + # Chop # comments and white space; ignore empty lines. 
+ sharp = line.find("#") + if sharp >= 0: + line = line[:sharp] + line = line.strip() + if not line: + continue + + # Split fields on ";", chop more white space. + # Must have the expected number of fields. + fields = [s.strip() for s in line.split(";")] + if len(fields) != nfields: + raise InputError("wrong number of fields %d %d - %s" % + (len(fields), nfields, line)) + + # The Unicode text files have two different ways + # to list a Unicode range. Either the first field is + # itself a range (0000..FFFF), or the range is split + # across two lines, with the second field noting + # the continuation. + codes = _URange(fields[0]) + (name, cont) = _ParseContinue(fields[1]) + + if expect_last is not None: + # If the last line gave the First code in a range, + # this one had better give the Last one. + if (len(codes) != 1 or codes[0] <= first or + cont != "Last" or name != expect_last): + raise InputError("expected Last line for %s" % + (expect_last,)) + codes = range(first, codes[0] + 1) + first = None + expect_last = None + fields[0] = "%04X..%04X" % (codes[0], codes[-1]) + fields[1] = name + elif cont == "First": + # Otherwise, if this is the First code in a range, + # remember it and go to the next line. + if len(codes) != 1: + raise InputError("bad First line: range given") + expect_last = name + first = codes[0] + continue + + doline(codes, fields) + + except Exception, e: + print "%s:%d: %s" % (filename, lineno, e) + raise + + if expect_last is not None: + raise InputError("expected Last line for %s; got EOF" % + (expect_last,)) + + +def CaseGroups(unicode_dir=_UNICODE_DIR): + """Returns list of Unicode code groups equivalent under case folding. + + Each group is a sorted list of code points, + and the list of groups is sorted by first code point + in the group. + + Args: + unicode_dir: Unicode data directory + + Returns: + list of Unicode code groups + """ + + # Dict mapping lowercase code point to fold-equivalent group. 
+ togroup = {} + + def DoLine(codes, fields): + """Process single CaseFolding.txt line, updating togroup.""" + (_, foldtype, lower, _) = fields + if foldtype not in ("C", "S"): + return + lower = _UInt(lower) + togroup.setdefault(lower, [lower]).extend(codes) + + ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine) + + groups = togroup.values() + for g in groups: + g.sort() + groups.sort() + return togroup, groups + + +def Scripts(unicode_dir=_UNICODE_DIR): + """Returns dict mapping script names to code lists. + + Args: + unicode_dir: Unicode data directory + + Returns: + dict mapping script names to code lists + """ + + scripts = {} + + def DoLine(codes, fields): + """Process single Scripts.txt line, updating scripts.""" + (_, name) = fields + scripts.setdefault(name, []).extend(codes) + + ReadUnicodeTable(unicode_dir+"/Scripts.txt", 2, DoLine) + return scripts + + +def Categories(unicode_dir=_UNICODE_DIR): + """Returns dict mapping category names to code lists. + + Args: + unicode_dir: Unicode data directory + + Returns: + dict mapping category names to code lists + """ + + categories = {} + + def DoLine(codes, fields): + """Process single UnicodeData.txt line, updating categories.""" + category = fields[2] + categories.setdefault(category, []).extend(codes) + # Add codes from Lu into L, etc. + if len(category) > 1: + short = category[0] + categories.setdefault(short, []).extend(codes) + + ReadUnicodeTable(unicode_dir+"/UnicodeData.txt", 15, DoLine) + return categories + diff --git a/outside/re2/re2/unicode_casefold.cc b/outside/re2/re2/unicode_casefold.cc new file mode 100644 index 000000000..2293cc75c --- /dev/null +++ b/outside/re2/re2/unicode_casefold.cc @@ -0,0 +1,480 @@ + +// GENERATED BY make_unicode_casefold.py; DO NOT EDIT. 
+// make_unicode_casefold.py >unicode_casefold.cc + +#include "re2/unicode_casefold.h" + +namespace re2 { + + +// 1034 groups, 2089 pairs, 289 ranges +const CaseFold unicode_casefold[] = { + { 65, 90, 32 }, + { 97, 106, -32 }, + { 107, 107, 8383 }, + { 108, 114, -32 }, + { 115, 115, 268 }, + { 116, 122, -32 }, + { 181, 181, 743 }, + { 192, 214, 32 }, + { 216, 222, 32 }, + { 223, 223, 7615 }, + { 224, 228, -32 }, + { 229, 229, 8262 }, + { 230, 246, -32 }, + { 248, 254, -32 }, + { 255, 255, 121 }, + { 256, 303, EvenOdd }, + { 306, 311, EvenOdd }, + { 313, 328, OddEven }, + { 330, 375, EvenOdd }, + { 376, 376, -121 }, + { 377, 382, OddEven }, + { 383, 383, -300 }, + { 384, 384, 195 }, + { 385, 385, 210 }, + { 386, 389, EvenOdd }, + { 390, 390, 206 }, + { 391, 392, OddEven }, + { 393, 394, 205 }, + { 395, 396, OddEven }, + { 398, 398, 79 }, + { 399, 399, 202 }, + { 400, 400, 203 }, + { 401, 402, OddEven }, + { 403, 403, 205 }, + { 404, 404, 207 }, + { 405, 405, 97 }, + { 406, 406, 211 }, + { 407, 407, 209 }, + { 408, 409, EvenOdd }, + { 410, 410, 163 }, + { 412, 412, 211 }, + { 413, 413, 213 }, + { 414, 414, 130 }, + { 415, 415, 214 }, + { 416, 421, EvenOdd }, + { 422, 422, 218 }, + { 423, 424, OddEven }, + { 425, 425, 218 }, + { 428, 429, EvenOdd }, + { 430, 430, 218 }, + { 431, 432, OddEven }, + { 433, 434, 217 }, + { 435, 438, OddEven }, + { 439, 439, 219 }, + { 440, 441, EvenOdd }, + { 444, 445, EvenOdd }, + { 447, 447, 56 }, + { 452, 452, EvenOdd }, + { 453, 453, OddEven }, + { 454, 454, -2 }, + { 455, 455, OddEven }, + { 456, 456, EvenOdd }, + { 457, 457, -2 }, + { 458, 458, EvenOdd }, + { 459, 459, OddEven }, + { 460, 460, -2 }, + { 461, 476, OddEven }, + { 477, 477, -79 }, + { 478, 495, EvenOdd }, + { 497, 497, OddEven }, + { 498, 498, EvenOdd }, + { 499, 499, -2 }, + { 500, 501, EvenOdd }, + { 502, 502, -97 }, + { 503, 503, -56 }, + { 504, 543, EvenOdd }, + { 544, 544, -130 }, + { 546, 563, EvenOdd }, + { 570, 570, 10795 }, + { 571, 572, OddEven }, + { 573, 
573, -163 }, + { 574, 574, 10792 }, + { 575, 576, 10815 }, + { 577, 578, OddEven }, + { 579, 579, -195 }, + { 580, 580, 69 }, + { 581, 581, 71 }, + { 582, 591, EvenOdd }, + { 592, 592, 10783 }, + { 593, 593, 10780 }, + { 594, 594, 10782 }, + { 595, 595, -210 }, + { 596, 596, -206 }, + { 598, 599, -205 }, + { 601, 601, -202 }, + { 603, 603, -203 }, + { 608, 608, -205 }, + { 611, 611, -207 }, + { 613, 613, 42280 }, + { 614, 614, 42308 }, + { 616, 616, -209 }, + { 617, 617, -211 }, + { 619, 619, 10743 }, + { 623, 623, -211 }, + { 625, 625, 10749 }, + { 626, 626, -213 }, + { 629, 629, -214 }, + { 637, 637, 10727 }, + { 640, 640, -218 }, + { 643, 643, -218 }, + { 648, 648, -218 }, + { 649, 649, -69 }, + { 650, 651, -217 }, + { 652, 652, -71 }, + { 658, 658, -219 }, + { 837, 837, 84 }, + { 880, 883, EvenOdd }, + { 886, 887, EvenOdd }, + { 891, 893, 130 }, + { 902, 902, 38 }, + { 904, 906, 37 }, + { 908, 908, 64 }, + { 910, 911, 63 }, + { 913, 929, 32 }, + { 931, 931, 31 }, + { 932, 939, 32 }, + { 940, 940, -38 }, + { 941, 943, -37 }, + { 945, 945, -32 }, + { 946, 946, 30 }, + { 947, 948, -32 }, + { 949, 949, 64 }, + { 950, 951, -32 }, + { 952, 952, 25 }, + { 953, 953, 7173 }, + { 954, 954, 54 }, + { 955, 955, -32 }, + { 956, 956, -775 }, + { 957, 959, -32 }, + { 960, 960, 22 }, + { 961, 961, 48 }, + { 962, 962, EvenOdd }, + { 963, 965, -32 }, + { 966, 966, 15 }, + { 967, 968, -32 }, + { 969, 969, 7517 }, + { 970, 971, -32 }, + { 972, 972, -64 }, + { 973, 974, -63 }, + { 975, 975, 8 }, + { 976, 976, -62 }, + { 977, 977, 35 }, + { 981, 981, -47 }, + { 982, 982, -54 }, + { 983, 983, -8 }, + { 984, 1007, EvenOdd }, + { 1008, 1008, -86 }, + { 1009, 1009, -80 }, + { 1010, 1010, 7 }, + { 1012, 1012, -92 }, + { 1013, 1013, -96 }, + { 1015, 1016, OddEven }, + { 1017, 1017, -7 }, + { 1018, 1019, EvenOdd }, + { 1021, 1023, -130 }, + { 1024, 1039, 80 }, + { 1040, 1071, 32 }, + { 1072, 1103, -32 }, + { 1104, 1119, -80 }, + { 1120, 1153, EvenOdd }, + { 1162, 1215, EvenOdd }, + { 1216, 
1216, 15 }, + { 1217, 1230, OddEven }, + { 1231, 1231, -15 }, + { 1232, 1319, EvenOdd }, + { 1329, 1366, 48 }, + { 1377, 1414, -48 }, + { 4256, 4293, 7264 }, + { 4295, 4295, 7264 }, + { 4301, 4301, 7264 }, + { 7545, 7545, 35332 }, + { 7549, 7549, 3814 }, + { 7680, 7776, EvenOdd }, + { 7777, 7777, 58 }, + { 7778, 7829, EvenOdd }, + { 7835, 7835, -59 }, + { 7838, 7838, -7615 }, + { 7840, 7935, EvenOdd }, + { 7936, 7943, 8 }, + { 7944, 7951, -8 }, + { 7952, 7957, 8 }, + { 7960, 7965, -8 }, + { 7968, 7975, 8 }, + { 7976, 7983, -8 }, + { 7984, 7991, 8 }, + { 7992, 7999, -8 }, + { 8000, 8005, 8 }, + { 8008, 8013, -8 }, + { 8017, 8017, 8 }, + { 8019, 8019, 8 }, + { 8021, 8021, 8 }, + { 8023, 8023, 8 }, + { 8025, 8025, -8 }, + { 8027, 8027, -8 }, + { 8029, 8029, -8 }, + { 8031, 8031, -8 }, + { 8032, 8039, 8 }, + { 8040, 8047, -8 }, + { 8048, 8049, 74 }, + { 8050, 8053, 86 }, + { 8054, 8055, 100 }, + { 8056, 8057, 128 }, + { 8058, 8059, 112 }, + { 8060, 8061, 126 }, + { 8064, 8071, 8 }, + { 8072, 8079, -8 }, + { 8080, 8087, 8 }, + { 8088, 8095, -8 }, + { 8096, 8103, 8 }, + { 8104, 8111, -8 }, + { 8112, 8113, 8 }, + { 8115, 8115, 9 }, + { 8120, 8121, -8 }, + { 8122, 8123, -74 }, + { 8124, 8124, -9 }, + { 8126, 8126, -7289 }, + { 8131, 8131, 9 }, + { 8136, 8139, -86 }, + { 8140, 8140, -9 }, + { 8144, 8145, 8 }, + { 8152, 8153, -8 }, + { 8154, 8155, -100 }, + { 8160, 8161, 8 }, + { 8165, 8165, 7 }, + { 8168, 8169, -8 }, + { 8170, 8171, -112 }, + { 8172, 8172, -7 }, + { 8179, 8179, 9 }, + { 8184, 8185, -128 }, + { 8186, 8187, -126 }, + { 8188, 8188, -9 }, + { 8486, 8486, -7549 }, + { 8490, 8490, -8415 }, + { 8491, 8491, -8294 }, + { 8498, 8498, 28 }, + { 8526, 8526, -28 }, + { 8544, 8559, 16 }, + { 8560, 8575, -16 }, + { 8579, 8580, OddEven }, + { 9398, 9423, 26 }, + { 9424, 9449, -26 }, + { 11264, 11310, 48 }, + { 11312, 11358, -48 }, + { 11360, 11361, EvenOdd }, + { 11362, 11362, -10743 }, + { 11363, 11363, -3814 }, + { 11364, 11364, -10727 }, + { 11365, 11365, -10795 }, + { 
11366, 11366, -10792 }, + { 11367, 11372, OddEven }, + { 11373, 11373, -10780 }, + { 11374, 11374, -10749 }, + { 11375, 11375, -10783 }, + { 11376, 11376, -10782 }, + { 11378, 11379, EvenOdd }, + { 11381, 11382, OddEven }, + { 11390, 11391, -10815 }, + { 11392, 11491, EvenOdd }, + { 11499, 11502, OddEven }, + { 11506, 11507, EvenOdd }, + { 11520, 11557, -7264 }, + { 11559, 11559, -7264 }, + { 11565, 11565, -7264 }, + { 42560, 42605, EvenOdd }, + { 42624, 42647, EvenOdd }, + { 42786, 42799, EvenOdd }, + { 42802, 42863, EvenOdd }, + { 42873, 42876, OddEven }, + { 42877, 42877, -35332 }, + { 42878, 42887, EvenOdd }, + { 42891, 42892, OddEven }, + { 42893, 42893, -42280 }, + { 42896, 42899, EvenOdd }, + { 42912, 42921, EvenOdd }, + { 42922, 42922, -42308 }, + { 65313, 65338, 32 }, + { 65345, 65370, -32 }, + { 66560, 66599, 40 }, + { 66600, 66639, -40 }, +}; +const int num_unicode_casefold = 289; + +// 1034 groups, 1055 pairs, 167 ranges +const CaseFold unicode_tolower[] = { + { 65, 90, 32 }, + { 181, 181, 775 }, + { 192, 214, 32 }, + { 216, 222, 32 }, + { 256, 302, EvenOddSkip }, + { 306, 310, EvenOddSkip }, + { 313, 327, OddEvenSkip }, + { 330, 374, EvenOddSkip }, + { 376, 376, -121 }, + { 377, 381, OddEvenSkip }, + { 383, 383, -268 }, + { 385, 385, 210 }, + { 386, 388, EvenOddSkip }, + { 390, 390, 206 }, + { 391, 391, OddEven }, + { 393, 394, 205 }, + { 395, 395, OddEven }, + { 398, 398, 79 }, + { 399, 399, 202 }, + { 400, 400, 203 }, + { 401, 401, OddEven }, + { 403, 403, 205 }, + { 404, 404, 207 }, + { 406, 406, 211 }, + { 407, 407, 209 }, + { 408, 408, EvenOdd }, + { 412, 412, 211 }, + { 413, 413, 213 }, + { 415, 415, 214 }, + { 416, 420, EvenOddSkip }, + { 422, 422, 218 }, + { 423, 423, OddEven }, + { 425, 425, 218 }, + { 428, 428, EvenOdd }, + { 430, 430, 218 }, + { 431, 431, OddEven }, + { 433, 434, 217 }, + { 435, 437, OddEvenSkip }, + { 439, 439, 219 }, + { 440, 440, EvenOdd }, + { 444, 444, EvenOdd }, + { 452, 452, 2 }, + { 453, 453, OddEven }, + { 455, 455, 
2 }, + { 456, 456, EvenOdd }, + { 458, 458, 2 }, + { 459, 475, OddEvenSkip }, + { 478, 494, EvenOddSkip }, + { 497, 497, 2 }, + { 498, 500, EvenOddSkip }, + { 502, 502, -97 }, + { 503, 503, -56 }, + { 504, 542, EvenOddSkip }, + { 544, 544, -130 }, + { 546, 562, EvenOddSkip }, + { 570, 570, 10795 }, + { 571, 571, OddEven }, + { 573, 573, -163 }, + { 574, 574, 10792 }, + { 577, 577, OddEven }, + { 579, 579, -195 }, + { 580, 580, 69 }, + { 581, 581, 71 }, + { 582, 590, EvenOddSkip }, + { 837, 837, 116 }, + { 880, 882, EvenOddSkip }, + { 886, 886, EvenOdd }, + { 902, 902, 38 }, + { 904, 906, 37 }, + { 908, 908, 64 }, + { 910, 911, 63 }, + { 913, 929, 32 }, + { 931, 939, 32 }, + { 962, 962, EvenOdd }, + { 975, 975, 8 }, + { 976, 976, -30 }, + { 977, 977, -25 }, + { 981, 981, -15 }, + { 982, 982, -22 }, + { 984, 1006, EvenOddSkip }, + { 1008, 1008, -54 }, + { 1009, 1009, -48 }, + { 1012, 1012, -60 }, + { 1013, 1013, -64 }, + { 1015, 1015, OddEven }, + { 1017, 1017, -7 }, + { 1018, 1018, EvenOdd }, + { 1021, 1023, -130 }, + { 1024, 1039, 80 }, + { 1040, 1071, 32 }, + { 1120, 1152, EvenOddSkip }, + { 1162, 1214, EvenOddSkip }, + { 1216, 1216, 15 }, + { 1217, 1229, OddEvenSkip }, + { 1232, 1318, EvenOddSkip }, + { 1329, 1366, 48 }, + { 4256, 4293, 7264 }, + { 4295, 4295, 7264 }, + { 4301, 4301, 7264 }, + { 7680, 7828, EvenOddSkip }, + { 7835, 7835, -58 }, + { 7838, 7838, -7615 }, + { 7840, 7934, EvenOddSkip }, + { 7944, 7951, -8 }, + { 7960, 7965, -8 }, + { 7976, 7983, -8 }, + { 7992, 7999, -8 }, + { 8008, 8013, -8 }, + { 8025, 8025, -8 }, + { 8027, 8027, -8 }, + { 8029, 8029, -8 }, + { 8031, 8031, -8 }, + { 8040, 8047, -8 }, + { 8072, 8079, -8 }, + { 8088, 8095, -8 }, + { 8104, 8111, -8 }, + { 8120, 8121, -8 }, + { 8122, 8123, -74 }, + { 8124, 8124, -9 }, + { 8126, 8126, -7173 }, + { 8136, 8139, -86 }, + { 8140, 8140, -9 }, + { 8152, 8153, -8 }, + { 8154, 8155, -100 }, + { 8168, 8169, -8 }, + { 8170, 8171, -112 }, + { 8172, 8172, -7 }, + { 8184, 8185, -128 }, + { 8186, 
8187, -126 }, + { 8188, 8188, -9 }, + { 8486, 8486, -7517 }, + { 8490, 8490, -8383 }, + { 8491, 8491, -8262 }, + { 8498, 8498, 28 }, + { 8544, 8559, 16 }, + { 8579, 8579, OddEven }, + { 9398, 9423, 26 }, + { 11264, 11310, 48 }, + { 11360, 11360, EvenOdd }, + { 11362, 11362, -10743 }, + { 11363, 11363, -3814 }, + { 11364, 11364, -10727 }, + { 11367, 11371, OddEvenSkip }, + { 11373, 11373, -10780 }, + { 11374, 11374, -10749 }, + { 11375, 11375, -10783 }, + { 11376, 11376, -10782 }, + { 11378, 11378, EvenOdd }, + { 11381, 11381, OddEven }, + { 11390, 11391, -10815 }, + { 11392, 11490, EvenOddSkip }, + { 11499, 11501, OddEvenSkip }, + { 11506, 11506, EvenOdd }, + { 42560, 42604, EvenOddSkip }, + { 42624, 42646, EvenOddSkip }, + { 42786, 42798, EvenOddSkip }, + { 42802, 42862, EvenOddSkip }, + { 42873, 42875, OddEvenSkip }, + { 42877, 42877, -35332 }, + { 42878, 42886, EvenOddSkip }, + { 42891, 42891, OddEven }, + { 42893, 42893, -42280 }, + { 42896, 42898, EvenOddSkip }, + { 42912, 42920, EvenOddSkip }, + { 42922, 42922, -42308 }, + { 65313, 65338, 32 }, + { 66560, 66599, 40 }, +}; +const int num_unicode_tolower = 167; + + + +} // namespace re2 + + diff --git a/outside/re2/re2/unicode_casefold.h b/outside/re2/re2/unicode_casefold.h new file mode 100644 index 000000000..7f438aabf --- /dev/null +++ b/outside/re2/re2/unicode_casefold.h @@ -0,0 +1,75 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Unicode case folding tables. + +// The Unicode case folding tables encode the mapping from one Unicode point +// to the next largest Unicode point with equivalent folding. The largest +// point wraps back to the first. For example, the tables map: +// +// 'A' -> 'a' +// 'a' -> 'A' +// +// 'K' -> 'k' +// 'k' -> 'K' (Kelvin symbol) +// 'K' -> 'K' +// +// Like everything Unicode, these tables are big. 
If we represent the table
+// as a sorted list of uint32 pairs, it has roughly 2000 entries (2089 pairs
+// in the generated unicode_casefold.cc) and is about 16 kB.
+// Most table entries look like the ones around them:
+// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
+// Instead of listing all the pairs explicitly, we make a list of ranges
+// and deltas, so that the table entries for 'A' through 'Z' can be represented
+// as a single entry { 'A', 'Z', +32 }.
+//
+// In addition to blocks that map to each other (A-Z mapping to a-z)
+// there are blocks of pairs that individually map to each other
+// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
+// For those, the special delta value EvenOdd marks even/odd pairs
+// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
+//
+// In this form, the table has a few hundred entries (289 ranges in the
+// generated unicode_casefold.cc), about 3kB. If we were to split
+// the table into one for 16-bit codes and an overflow table for larger ones,
+// we could get it down to about 1.5kB, but that's not worth the complexity.
+//
+// The grouped form also allows for efficient fold range calculations
+// rather than looping one character at a time.
+
+#ifndef RE2_UNICODE_CASEFOLD_H__
+#define RE2_UNICODE_CASEFOLD_H__
+
+#include "util/util.h"
+
+namespace re2 {
+
+// Special values for CaseFold::delta.  EvenOdd and OddEven are described
+// in the comment above.  EvenOddSkip and OddEvenSkip are used by the
+// unicode_tolower table (for example { 256, 302, EvenOddSkip }).
+// OddEvenSkip takes the value following EvenOddSkip, i.e. (1<<30) + 1.
+// NOTE(review): presumably the Skip variants apply the even/odd mapping to
+// every other code point in the range -- confirm against ApplyFold.
+enum {
+  EvenOdd = 1,
+  OddEven = -1,
+  EvenOddSkip = 1<<30,
+  OddEvenSkip,
+};
+
+// One table entry: the code points lo through hi (inclusive, as in
+// { 'A', 'Z', +32 }) all use the same delta.  delta is either a plain
+// offset or one of the special enum values above.
+struct CaseFold {
+  uint32 lo;
+  uint32 hi;
+  int32 delta;
+};
+
+// Case folding table and its number of entries
+// (defined in the generated unicode_casefold.cc).
+extern const CaseFold unicode_casefold[];
+extern const int num_unicode_casefold;
+
+// To-lower table and its number of entries
+// (defined in the generated unicode_casefold.cc).
+extern const CaseFold unicode_tolower[];
+extern const int num_unicode_tolower;
+
+// Returns the CaseFold* in the tables that contains rune.
+// If rune is not in the tables, returns the first CaseFold* after rune.
+// If rune is larger than any value in the tables, returns NULL.
+extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune);
+
+// Returns the result of applying the fold f to the rune r.
+extern Rune ApplyFold(const CaseFold *f, Rune r); + +} // namespace re2 + +#endif // RE2_UNICODE_CASEFOLD_H__ diff --git a/outside/re2/re2/unicode_groups.cc b/outside/re2/re2/unicode_groups.cc new file mode 100644 index 000000000..0df585e35 --- /dev/null +++ b/outside/re2/re2/unicode_groups.cc @@ -0,0 +1,5078 @@ + +// GENERATED BY make_unicode_groups.py; DO NOT EDIT. +// make_unicode_groups.py >unicode_groups.cc + +#include "re2/unicode_groups.h" + +namespace re2 { + + +static const URange16 Ps_range16[] = { + { 40, 40 }, + { 91, 91 }, + { 123, 123 }, + { 3898, 3898 }, + { 3900, 3900 }, + { 5787, 5787 }, + { 8218, 8218 }, + { 8222, 8222 }, + { 8261, 8261 }, + { 8317, 8317 }, + { 8333, 8333 }, + { 8968, 8968 }, + { 8970, 8970 }, + { 9001, 9001 }, + { 10088, 10088 }, + { 10090, 10090 }, + { 10092, 10092 }, + { 10094, 10094 }, + { 10096, 10096 }, + { 10098, 10098 }, + { 10100, 10100 }, + { 10181, 10181 }, + { 10214, 10214 }, + { 10216, 10216 }, + { 10218, 10218 }, + { 10220, 10220 }, + { 10222, 10222 }, + { 10627, 10627 }, + { 10629, 10629 }, + { 10631, 10631 }, + { 10633, 10633 }, + { 10635, 10635 }, + { 10637, 10637 }, + { 10639, 10639 }, + { 10641, 10641 }, + { 10643, 10643 }, + { 10645, 10645 }, + { 10647, 10647 }, + { 10712, 10712 }, + { 10714, 10714 }, + { 10748, 10748 }, + { 11810, 11810 }, + { 11812, 11812 }, + { 11814, 11814 }, + { 11816, 11816 }, + { 12296, 12296 }, + { 12298, 12298 }, + { 12300, 12300 }, + { 12302, 12302 }, + { 12304, 12304 }, + { 12308, 12308 }, + { 12310, 12310 }, + { 12312, 12312 }, + { 12314, 12314 }, + { 12317, 12317 }, + { 64830, 64830 }, + { 65047, 65047 }, + { 65077, 65077 }, + { 65079, 65079 }, + { 65081, 65081 }, + { 65083, 65083 }, + { 65085, 65085 }, + { 65087, 65087 }, + { 65089, 65089 }, + { 65091, 65091 }, + { 65095, 65095 }, + { 65113, 65113 }, + { 65115, 65115 }, + { 65117, 65117 }, + { 65288, 65288 }, + { 65339, 65339 }, + { 65371, 65371 }, + { 65375, 65375 }, + { 65378, 65378 }, +}; +static const URange16 Nl_range16[] = 
{ + { 5870, 5872 }, + { 8544, 8578 }, + { 8581, 8584 }, + { 12295, 12295 }, + { 12321, 12329 }, + { 12344, 12346 }, + { 42726, 42735 }, +}; +static const URange32 Nl_range32[] = { + { 65856, 65908 }, + { 66369, 66369 }, + { 66378, 66378 }, + { 66513, 66517 }, + { 74752, 74850 }, +}; +static const URange16 No_range16[] = { + { 178, 179 }, + { 185, 185 }, + { 188, 190 }, + { 2548, 2553 }, + { 2930, 2935 }, + { 3056, 3058 }, + { 3192, 3198 }, + { 3440, 3445 }, + { 3882, 3891 }, + { 4969, 4988 }, + { 6128, 6137 }, + { 6618, 6618 }, + { 8304, 8304 }, + { 8308, 8313 }, + { 8320, 8329 }, + { 8528, 8543 }, + { 8585, 8585 }, + { 9312, 9371 }, + { 9450, 9471 }, + { 10102, 10131 }, + { 11517, 11517 }, + { 12690, 12693 }, + { 12832, 12841 }, + { 12872, 12879 }, + { 12881, 12895 }, + { 12928, 12937 }, + { 12977, 12991 }, + { 43056, 43061 }, +}; +static const URange32 No_range32[] = { + { 65799, 65843 }, + { 65909, 65912 }, + { 65930, 65930 }, + { 66336, 66339 }, + { 67672, 67679 }, + { 67862, 67867 }, + { 68160, 68167 }, + { 68221, 68222 }, + { 68440, 68447 }, + { 68472, 68479 }, + { 69216, 69246 }, + { 69714, 69733 }, + { 119648, 119665 }, + { 127232, 127242 }, +}; +static const URange16 Lo_range16[] = { + { 170, 170 }, + { 186, 186 }, + { 443, 443 }, + { 448, 451 }, + { 660, 660 }, + { 1488, 1514 }, + { 1520, 1522 }, + { 1568, 1599 }, + { 1601, 1610 }, + { 1646, 1647 }, + { 1649, 1747 }, + { 1749, 1749 }, + { 1774, 1775 }, + { 1786, 1788 }, + { 1791, 1791 }, + { 1808, 1808 }, + { 1810, 1839 }, + { 1869, 1957 }, + { 1969, 1969 }, + { 1994, 2026 }, + { 2048, 2069 }, + { 2112, 2136 }, + { 2208, 2208 }, + { 2210, 2220 }, + { 2308, 2361 }, + { 2365, 2365 }, + { 2384, 2384 }, + { 2392, 2401 }, + { 2418, 2423 }, + { 2425, 2431 }, + { 2437, 2444 }, + { 2447, 2448 }, + { 2451, 2472 }, + { 2474, 2480 }, + { 2482, 2482 }, + { 2486, 2489 }, + { 2493, 2493 }, + { 2510, 2510 }, + { 2524, 2525 }, + { 2527, 2529 }, + { 2544, 2545 }, + { 2565, 2570 }, + { 2575, 2576 }, + { 2579, 2600 }, + { 
2602, 2608 }, + { 2610, 2611 }, + { 2613, 2614 }, + { 2616, 2617 }, + { 2649, 2652 }, + { 2654, 2654 }, + { 2674, 2676 }, + { 2693, 2701 }, + { 2703, 2705 }, + { 2707, 2728 }, + { 2730, 2736 }, + { 2738, 2739 }, + { 2741, 2745 }, + { 2749, 2749 }, + { 2768, 2768 }, + { 2784, 2785 }, + { 2821, 2828 }, + { 2831, 2832 }, + { 2835, 2856 }, + { 2858, 2864 }, + { 2866, 2867 }, + { 2869, 2873 }, + { 2877, 2877 }, + { 2908, 2909 }, + { 2911, 2913 }, + { 2929, 2929 }, + { 2947, 2947 }, + { 2949, 2954 }, + { 2958, 2960 }, + { 2962, 2965 }, + { 2969, 2970 }, + { 2972, 2972 }, + { 2974, 2975 }, + { 2979, 2980 }, + { 2984, 2986 }, + { 2990, 3001 }, + { 3024, 3024 }, + { 3077, 3084 }, + { 3086, 3088 }, + { 3090, 3112 }, + { 3114, 3123 }, + { 3125, 3129 }, + { 3133, 3133 }, + { 3160, 3161 }, + { 3168, 3169 }, + { 3205, 3212 }, + { 3214, 3216 }, + { 3218, 3240 }, + { 3242, 3251 }, + { 3253, 3257 }, + { 3261, 3261 }, + { 3294, 3294 }, + { 3296, 3297 }, + { 3313, 3314 }, + { 3333, 3340 }, + { 3342, 3344 }, + { 3346, 3386 }, + { 3389, 3389 }, + { 3406, 3406 }, + { 3424, 3425 }, + { 3450, 3455 }, + { 3461, 3478 }, + { 3482, 3505 }, + { 3507, 3515 }, + { 3517, 3517 }, + { 3520, 3526 }, + { 3585, 3632 }, + { 3634, 3635 }, + { 3648, 3653 }, + { 3713, 3714 }, + { 3716, 3716 }, + { 3719, 3720 }, + { 3722, 3722 }, + { 3725, 3725 }, + { 3732, 3735 }, + { 3737, 3743 }, + { 3745, 3747 }, + { 3749, 3749 }, + { 3751, 3751 }, + { 3754, 3755 }, + { 3757, 3760 }, + { 3762, 3763 }, + { 3773, 3773 }, + { 3776, 3780 }, + { 3804, 3807 }, + { 3840, 3840 }, + { 3904, 3911 }, + { 3913, 3948 }, + { 3976, 3980 }, + { 4096, 4138 }, + { 4159, 4159 }, + { 4176, 4181 }, + { 4186, 4189 }, + { 4193, 4193 }, + { 4197, 4198 }, + { 4206, 4208 }, + { 4213, 4225 }, + { 4238, 4238 }, + { 4304, 4346 }, + { 4349, 4680 }, + { 4682, 4685 }, + { 4688, 4694 }, + { 4696, 4696 }, + { 4698, 4701 }, + { 4704, 4744 }, + { 4746, 4749 }, + { 4752, 4784 }, + { 4786, 4789 }, + { 4792, 4798 }, + { 4800, 4800 }, + { 4802, 4805 }, + { 
4808, 4822 }, + { 4824, 4880 }, + { 4882, 4885 }, + { 4888, 4954 }, + { 4992, 5007 }, + { 5024, 5108 }, + { 5121, 5740 }, + { 5743, 5759 }, + { 5761, 5786 }, + { 5792, 5866 }, + { 5888, 5900 }, + { 5902, 5905 }, + { 5920, 5937 }, + { 5952, 5969 }, + { 5984, 5996 }, + { 5998, 6000 }, + { 6016, 6067 }, + { 6108, 6108 }, + { 6176, 6210 }, + { 6212, 6263 }, + { 6272, 6312 }, + { 6314, 6314 }, + { 6320, 6389 }, + { 6400, 6428 }, + { 6480, 6509 }, + { 6512, 6516 }, + { 6528, 6571 }, + { 6593, 6599 }, + { 6656, 6678 }, + { 6688, 6740 }, + { 6917, 6963 }, + { 6981, 6987 }, + { 7043, 7072 }, + { 7086, 7087 }, + { 7098, 7141 }, + { 7168, 7203 }, + { 7245, 7247 }, + { 7258, 7287 }, + { 7401, 7404 }, + { 7406, 7409 }, + { 7413, 7414 }, + { 8501, 8504 }, + { 11568, 11623 }, + { 11648, 11670 }, + { 11680, 11686 }, + { 11688, 11694 }, + { 11696, 11702 }, + { 11704, 11710 }, + { 11712, 11718 }, + { 11720, 11726 }, + { 11728, 11734 }, + { 11736, 11742 }, + { 12294, 12294 }, + { 12348, 12348 }, + { 12353, 12438 }, + { 12447, 12447 }, + { 12449, 12538 }, + { 12543, 12543 }, + { 12549, 12589 }, + { 12593, 12686 }, + { 12704, 12730 }, + { 12784, 12799 }, + { 13312, 19893 }, + { 19968, 40908 }, + { 40960, 40980 }, + { 40982, 42124 }, + { 42192, 42231 }, + { 42240, 42507 }, + { 42512, 42527 }, + { 42538, 42539 }, + { 42606, 42606 }, + { 42656, 42725 }, + { 43003, 43009 }, + { 43011, 43013 }, + { 43015, 43018 }, + { 43020, 43042 }, + { 43072, 43123 }, + { 43138, 43187 }, + { 43250, 43255 }, + { 43259, 43259 }, + { 43274, 43301 }, + { 43312, 43334 }, + { 43360, 43388 }, + { 43396, 43442 }, + { 43520, 43560 }, + { 43584, 43586 }, + { 43588, 43595 }, + { 43616, 43631 }, + { 43633, 43638 }, + { 43642, 43642 }, + { 43648, 43695 }, + { 43697, 43697 }, + { 43701, 43702 }, + { 43705, 43709 }, + { 43712, 43712 }, + { 43714, 43714 }, + { 43739, 43740 }, + { 43744, 43754 }, + { 43762, 43762 }, + { 43777, 43782 }, + { 43785, 43790 }, + { 43793, 43798 }, + { 43808, 43814 }, + { 43816, 43822 }, + { 
43968, 44002 }, + { 44032, 55203 }, + { 55216, 55238 }, + { 55243, 55291 }, + { 63744, 64109 }, + { 64112, 64217 }, + { 64285, 64285 }, + { 64287, 64296 }, + { 64298, 64310 }, + { 64312, 64316 }, + { 64318, 64318 }, + { 64320, 64321 }, + { 64323, 64324 }, + { 64326, 64433 }, + { 64467, 64829 }, + { 64848, 64911 }, + { 64914, 64967 }, + { 65008, 65019 }, + { 65136, 65140 }, + { 65142, 65276 }, + { 65382, 65391 }, + { 65393, 65437 }, + { 65440, 65470 }, + { 65474, 65479 }, + { 65482, 65487 }, + { 65490, 65495 }, + { 65498, 65500 }, +}; +static const URange32 Lo_range32[] = { + { 65536, 65547 }, + { 65549, 65574 }, + { 65576, 65594 }, + { 65596, 65597 }, + { 65599, 65613 }, + { 65616, 65629 }, + { 65664, 65786 }, + { 66176, 66204 }, + { 66208, 66256 }, + { 66304, 66334 }, + { 66352, 66368 }, + { 66370, 66377 }, + { 66432, 66461 }, + { 66464, 66499 }, + { 66504, 66511 }, + { 66640, 66717 }, + { 67584, 67589 }, + { 67592, 67592 }, + { 67594, 67637 }, + { 67639, 67640 }, + { 67644, 67644 }, + { 67647, 67669 }, + { 67840, 67861 }, + { 67872, 67897 }, + { 67968, 68023 }, + { 68030, 68031 }, + { 68096, 68096 }, + { 68112, 68115 }, + { 68117, 68119 }, + { 68121, 68147 }, + { 68192, 68220 }, + { 68352, 68405 }, + { 68416, 68437 }, + { 68448, 68466 }, + { 68608, 68680 }, + { 69635, 69687 }, + { 69763, 69807 }, + { 69840, 69864 }, + { 69891, 69926 }, + { 70019, 70066 }, + { 70081, 70084 }, + { 71296, 71338 }, + { 73728, 74606 }, + { 77824, 78894 }, + { 92160, 92728 }, + { 93952, 94020 }, + { 94032, 94032 }, + { 110592, 110593 }, + { 126464, 126467 }, + { 126469, 126495 }, + { 126497, 126498 }, + { 126500, 126500 }, + { 126503, 126503 }, + { 126505, 126514 }, + { 126516, 126519 }, + { 126521, 126521 }, + { 126523, 126523 }, + { 126530, 126530 }, + { 126535, 126535 }, + { 126537, 126537 }, + { 126539, 126539 }, + { 126541, 126543 }, + { 126545, 126546 }, + { 126548, 126548 }, + { 126551, 126551 }, + { 126553, 126553 }, + { 126555, 126555 }, + { 126557, 126557 }, + { 126559, 
126559 }, + { 126561, 126562 }, + { 126564, 126564 }, + { 126567, 126570 }, + { 126572, 126578 }, + { 126580, 126583 }, + { 126585, 126588 }, + { 126590, 126590 }, + { 126592, 126601 }, + { 126603, 126619 }, + { 126625, 126627 }, + { 126629, 126633 }, + { 126635, 126651 }, + { 131072, 173782 }, + { 173824, 177972 }, + { 177984, 178205 }, + { 194560, 195101 }, +}; +static const URange16 Ll_range16[] = { + { 97, 122 }, + { 181, 181 }, + { 223, 246 }, + { 248, 255 }, + { 257, 257 }, + { 259, 259 }, + { 261, 261 }, + { 263, 263 }, + { 265, 265 }, + { 267, 267 }, + { 269, 269 }, + { 271, 271 }, + { 273, 273 }, + { 275, 275 }, + { 277, 277 }, + { 279, 279 }, + { 281, 281 }, + { 283, 283 }, + { 285, 285 }, + { 287, 287 }, + { 289, 289 }, + { 291, 291 }, + { 293, 293 }, + { 295, 295 }, + { 297, 297 }, + { 299, 299 }, + { 301, 301 }, + { 303, 303 }, + { 305, 305 }, + { 307, 307 }, + { 309, 309 }, + { 311, 312 }, + { 314, 314 }, + { 316, 316 }, + { 318, 318 }, + { 320, 320 }, + { 322, 322 }, + { 324, 324 }, + { 326, 326 }, + { 328, 329 }, + { 331, 331 }, + { 333, 333 }, + { 335, 335 }, + { 337, 337 }, + { 339, 339 }, + { 341, 341 }, + { 343, 343 }, + { 345, 345 }, + { 347, 347 }, + { 349, 349 }, + { 351, 351 }, + { 353, 353 }, + { 355, 355 }, + { 357, 357 }, + { 359, 359 }, + { 361, 361 }, + { 363, 363 }, + { 365, 365 }, + { 367, 367 }, + { 369, 369 }, + { 371, 371 }, + { 373, 373 }, + { 375, 375 }, + { 378, 378 }, + { 380, 380 }, + { 382, 384 }, + { 387, 387 }, + { 389, 389 }, + { 392, 392 }, + { 396, 397 }, + { 402, 402 }, + { 405, 405 }, + { 409, 411 }, + { 414, 414 }, + { 417, 417 }, + { 419, 419 }, + { 421, 421 }, + { 424, 424 }, + { 426, 427 }, + { 429, 429 }, + { 432, 432 }, + { 436, 436 }, + { 438, 438 }, + { 441, 442 }, + { 445, 447 }, + { 454, 454 }, + { 457, 457 }, + { 460, 460 }, + { 462, 462 }, + { 464, 464 }, + { 466, 466 }, + { 468, 468 }, + { 470, 470 }, + { 472, 472 }, + { 474, 474 }, + { 476, 477 }, + { 479, 479 }, + { 481, 481 }, + { 483, 483 }, + { 485, 
485 }, + { 487, 487 }, + { 489, 489 }, + { 491, 491 }, + { 493, 493 }, + { 495, 496 }, + { 499, 499 }, + { 501, 501 }, + { 505, 505 }, + { 507, 507 }, + { 509, 509 }, + { 511, 511 }, + { 513, 513 }, + { 515, 515 }, + { 517, 517 }, + { 519, 519 }, + { 521, 521 }, + { 523, 523 }, + { 525, 525 }, + { 527, 527 }, + { 529, 529 }, + { 531, 531 }, + { 533, 533 }, + { 535, 535 }, + { 537, 537 }, + { 539, 539 }, + { 541, 541 }, + { 543, 543 }, + { 545, 545 }, + { 547, 547 }, + { 549, 549 }, + { 551, 551 }, + { 553, 553 }, + { 555, 555 }, + { 557, 557 }, + { 559, 559 }, + { 561, 561 }, + { 563, 569 }, + { 572, 572 }, + { 575, 576 }, + { 578, 578 }, + { 583, 583 }, + { 585, 585 }, + { 587, 587 }, + { 589, 589 }, + { 591, 659 }, + { 661, 687 }, + { 881, 881 }, + { 883, 883 }, + { 887, 887 }, + { 891, 893 }, + { 912, 912 }, + { 940, 974 }, + { 976, 977 }, + { 981, 983 }, + { 985, 985 }, + { 987, 987 }, + { 989, 989 }, + { 991, 991 }, + { 993, 993 }, + { 995, 995 }, + { 997, 997 }, + { 999, 999 }, + { 1001, 1001 }, + { 1003, 1003 }, + { 1005, 1005 }, + { 1007, 1011 }, + { 1013, 1013 }, + { 1016, 1016 }, + { 1019, 1020 }, + { 1072, 1119 }, + { 1121, 1121 }, + { 1123, 1123 }, + { 1125, 1125 }, + { 1127, 1127 }, + { 1129, 1129 }, + { 1131, 1131 }, + { 1133, 1133 }, + { 1135, 1135 }, + { 1137, 1137 }, + { 1139, 1139 }, + { 1141, 1141 }, + { 1143, 1143 }, + { 1145, 1145 }, + { 1147, 1147 }, + { 1149, 1149 }, + { 1151, 1151 }, + { 1153, 1153 }, + { 1163, 1163 }, + { 1165, 1165 }, + { 1167, 1167 }, + { 1169, 1169 }, + { 1171, 1171 }, + { 1173, 1173 }, + { 1175, 1175 }, + { 1177, 1177 }, + { 1179, 1179 }, + { 1181, 1181 }, + { 1183, 1183 }, + { 1185, 1185 }, + { 1187, 1187 }, + { 1189, 1189 }, + { 1191, 1191 }, + { 1193, 1193 }, + { 1195, 1195 }, + { 1197, 1197 }, + { 1199, 1199 }, + { 1201, 1201 }, + { 1203, 1203 }, + { 1205, 1205 }, + { 1207, 1207 }, + { 1209, 1209 }, + { 1211, 1211 }, + { 1213, 1213 }, + { 1215, 1215 }, + { 1218, 1218 }, + { 1220, 1220 }, + { 1222, 1222 }, + { 1224, 
1224 }, + { 1226, 1226 }, + { 1228, 1228 }, + { 1230, 1231 }, + { 1233, 1233 }, + { 1235, 1235 }, + { 1237, 1237 }, + { 1239, 1239 }, + { 1241, 1241 }, + { 1243, 1243 }, + { 1245, 1245 }, + { 1247, 1247 }, + { 1249, 1249 }, + { 1251, 1251 }, + { 1253, 1253 }, + { 1255, 1255 }, + { 1257, 1257 }, + { 1259, 1259 }, + { 1261, 1261 }, + { 1263, 1263 }, + { 1265, 1265 }, + { 1267, 1267 }, + { 1269, 1269 }, + { 1271, 1271 }, + { 1273, 1273 }, + { 1275, 1275 }, + { 1277, 1277 }, + { 1279, 1279 }, + { 1281, 1281 }, + { 1283, 1283 }, + { 1285, 1285 }, + { 1287, 1287 }, + { 1289, 1289 }, + { 1291, 1291 }, + { 1293, 1293 }, + { 1295, 1295 }, + { 1297, 1297 }, + { 1299, 1299 }, + { 1301, 1301 }, + { 1303, 1303 }, + { 1305, 1305 }, + { 1307, 1307 }, + { 1309, 1309 }, + { 1311, 1311 }, + { 1313, 1313 }, + { 1315, 1315 }, + { 1317, 1317 }, + { 1319, 1319 }, + { 1377, 1415 }, + { 7424, 7467 }, + { 7531, 7543 }, + { 7545, 7578 }, + { 7681, 7681 }, + { 7683, 7683 }, + { 7685, 7685 }, + { 7687, 7687 }, + { 7689, 7689 }, + { 7691, 7691 }, + { 7693, 7693 }, + { 7695, 7695 }, + { 7697, 7697 }, + { 7699, 7699 }, + { 7701, 7701 }, + { 7703, 7703 }, + { 7705, 7705 }, + { 7707, 7707 }, + { 7709, 7709 }, + { 7711, 7711 }, + { 7713, 7713 }, + { 7715, 7715 }, + { 7717, 7717 }, + { 7719, 7719 }, + { 7721, 7721 }, + { 7723, 7723 }, + { 7725, 7725 }, + { 7727, 7727 }, + { 7729, 7729 }, + { 7731, 7731 }, + { 7733, 7733 }, + { 7735, 7735 }, + { 7737, 7737 }, + { 7739, 7739 }, + { 7741, 7741 }, + { 7743, 7743 }, + { 7745, 7745 }, + { 7747, 7747 }, + { 7749, 7749 }, + { 7751, 7751 }, + { 7753, 7753 }, + { 7755, 7755 }, + { 7757, 7757 }, + { 7759, 7759 }, + { 7761, 7761 }, + { 7763, 7763 }, + { 7765, 7765 }, + { 7767, 7767 }, + { 7769, 7769 }, + { 7771, 7771 }, + { 7773, 7773 }, + { 7775, 7775 }, + { 7777, 7777 }, + { 7779, 7779 }, + { 7781, 7781 }, + { 7783, 7783 }, + { 7785, 7785 }, + { 7787, 7787 }, + { 7789, 7789 }, + { 7791, 7791 }, + { 7793, 7793 }, + { 7795, 7795 }, + { 7797, 7797 }, + { 7799, 
7799 }, + { 7801, 7801 }, + { 7803, 7803 }, + { 7805, 7805 }, + { 7807, 7807 }, + { 7809, 7809 }, + { 7811, 7811 }, + { 7813, 7813 }, + { 7815, 7815 }, + { 7817, 7817 }, + { 7819, 7819 }, + { 7821, 7821 }, + { 7823, 7823 }, + { 7825, 7825 }, + { 7827, 7827 }, + { 7829, 7837 }, + { 7839, 7839 }, + { 7841, 7841 }, + { 7843, 7843 }, + { 7845, 7845 }, + { 7847, 7847 }, + { 7849, 7849 }, + { 7851, 7851 }, + { 7853, 7853 }, + { 7855, 7855 }, + { 7857, 7857 }, + { 7859, 7859 }, + { 7861, 7861 }, + { 7863, 7863 }, + { 7865, 7865 }, + { 7867, 7867 }, + { 7869, 7869 }, + { 7871, 7871 }, + { 7873, 7873 }, + { 7875, 7875 }, + { 7877, 7877 }, + { 7879, 7879 }, + { 7881, 7881 }, + { 7883, 7883 }, + { 7885, 7885 }, + { 7887, 7887 }, + { 7889, 7889 }, + { 7891, 7891 }, + { 7893, 7893 }, + { 7895, 7895 }, + { 7897, 7897 }, + { 7899, 7899 }, + { 7901, 7901 }, + { 7903, 7903 }, + { 7905, 7905 }, + { 7907, 7907 }, + { 7909, 7909 }, + { 7911, 7911 }, + { 7913, 7913 }, + { 7915, 7915 }, + { 7917, 7917 }, + { 7919, 7919 }, + { 7921, 7921 }, + { 7923, 7923 }, + { 7925, 7925 }, + { 7927, 7927 }, + { 7929, 7929 }, + { 7931, 7931 }, + { 7933, 7933 }, + { 7935, 7943 }, + { 7952, 7957 }, + { 7968, 7975 }, + { 7984, 7991 }, + { 8000, 8005 }, + { 8016, 8023 }, + { 8032, 8039 }, + { 8048, 8061 }, + { 8064, 8071 }, + { 8080, 8087 }, + { 8096, 8103 }, + { 8112, 8116 }, + { 8118, 8119 }, + { 8126, 8126 }, + { 8130, 8132 }, + { 8134, 8135 }, + { 8144, 8147 }, + { 8150, 8151 }, + { 8160, 8167 }, + { 8178, 8180 }, + { 8182, 8183 }, + { 8458, 8458 }, + { 8462, 8463 }, + { 8467, 8467 }, + { 8495, 8495 }, + { 8500, 8500 }, + { 8505, 8505 }, + { 8508, 8509 }, + { 8518, 8521 }, + { 8526, 8526 }, + { 8580, 8580 }, + { 11312, 11358 }, + { 11361, 11361 }, + { 11365, 11366 }, + { 11368, 11368 }, + { 11370, 11370 }, + { 11372, 11372 }, + { 11377, 11377 }, + { 11379, 11380 }, + { 11382, 11387 }, + { 11393, 11393 }, + { 11395, 11395 }, + { 11397, 11397 }, + { 11399, 11399 }, + { 11401, 11401 }, + { 11403, 11403 }, 
+ { 11405, 11405 }, + { 11407, 11407 }, + { 11409, 11409 }, + { 11411, 11411 }, + { 11413, 11413 }, + { 11415, 11415 }, + { 11417, 11417 }, + { 11419, 11419 }, + { 11421, 11421 }, + { 11423, 11423 }, + { 11425, 11425 }, + { 11427, 11427 }, + { 11429, 11429 }, + { 11431, 11431 }, + { 11433, 11433 }, + { 11435, 11435 }, + { 11437, 11437 }, + { 11439, 11439 }, + { 11441, 11441 }, + { 11443, 11443 }, + { 11445, 11445 }, + { 11447, 11447 }, + { 11449, 11449 }, + { 11451, 11451 }, + { 11453, 11453 }, + { 11455, 11455 }, + { 11457, 11457 }, + { 11459, 11459 }, + { 11461, 11461 }, + { 11463, 11463 }, + { 11465, 11465 }, + { 11467, 11467 }, + { 11469, 11469 }, + { 11471, 11471 }, + { 11473, 11473 }, + { 11475, 11475 }, + { 11477, 11477 }, + { 11479, 11479 }, + { 11481, 11481 }, + { 11483, 11483 }, + { 11485, 11485 }, + { 11487, 11487 }, + { 11489, 11489 }, + { 11491, 11492 }, + { 11500, 11500 }, + { 11502, 11502 }, + { 11507, 11507 }, + { 11520, 11557 }, + { 11559, 11559 }, + { 11565, 11565 }, + { 42561, 42561 }, + { 42563, 42563 }, + { 42565, 42565 }, + { 42567, 42567 }, + { 42569, 42569 }, + { 42571, 42571 }, + { 42573, 42573 }, + { 42575, 42575 }, + { 42577, 42577 }, + { 42579, 42579 }, + { 42581, 42581 }, + { 42583, 42583 }, + { 42585, 42585 }, + { 42587, 42587 }, + { 42589, 42589 }, + { 42591, 42591 }, + { 42593, 42593 }, + { 42595, 42595 }, + { 42597, 42597 }, + { 42599, 42599 }, + { 42601, 42601 }, + { 42603, 42603 }, + { 42605, 42605 }, + { 42625, 42625 }, + { 42627, 42627 }, + { 42629, 42629 }, + { 42631, 42631 }, + { 42633, 42633 }, + { 42635, 42635 }, + { 42637, 42637 }, + { 42639, 42639 }, + { 42641, 42641 }, + { 42643, 42643 }, + { 42645, 42645 }, + { 42647, 42647 }, + { 42787, 42787 }, + { 42789, 42789 }, + { 42791, 42791 }, + { 42793, 42793 }, + { 42795, 42795 }, + { 42797, 42797 }, + { 42799, 42801 }, + { 42803, 42803 }, + { 42805, 42805 }, + { 42807, 42807 }, + { 42809, 42809 }, + { 42811, 42811 }, + { 42813, 42813 }, + { 42815, 42815 }, + { 42817, 42817 }, 
+ { 42819, 42819 }, + { 42821, 42821 }, + { 42823, 42823 }, + { 42825, 42825 }, + { 42827, 42827 }, + { 42829, 42829 }, + { 42831, 42831 }, + { 42833, 42833 }, + { 42835, 42835 }, + { 42837, 42837 }, + { 42839, 42839 }, + { 42841, 42841 }, + { 42843, 42843 }, + { 42845, 42845 }, + { 42847, 42847 }, + { 42849, 42849 }, + { 42851, 42851 }, + { 42853, 42853 }, + { 42855, 42855 }, + { 42857, 42857 }, + { 42859, 42859 }, + { 42861, 42861 }, + { 42863, 42863 }, + { 42865, 42872 }, + { 42874, 42874 }, + { 42876, 42876 }, + { 42879, 42879 }, + { 42881, 42881 }, + { 42883, 42883 }, + { 42885, 42885 }, + { 42887, 42887 }, + { 42892, 42892 }, + { 42894, 42894 }, + { 42897, 42897 }, + { 42899, 42899 }, + { 42913, 42913 }, + { 42915, 42915 }, + { 42917, 42917 }, + { 42919, 42919 }, + { 42921, 42921 }, + { 43002, 43002 }, + { 64256, 64262 }, + { 64275, 64279 }, + { 65345, 65370 }, +}; +static const URange32 Ll_range32[] = { + { 66600, 66639 }, + { 119834, 119859 }, + { 119886, 119892 }, + { 119894, 119911 }, + { 119938, 119963 }, + { 119990, 119993 }, + { 119995, 119995 }, + { 119997, 120003 }, + { 120005, 120015 }, + { 120042, 120067 }, + { 120094, 120119 }, + { 120146, 120171 }, + { 120198, 120223 }, + { 120250, 120275 }, + { 120302, 120327 }, + { 120354, 120379 }, + { 120406, 120431 }, + { 120458, 120485 }, + { 120514, 120538 }, + { 120540, 120545 }, + { 120572, 120596 }, + { 120598, 120603 }, + { 120630, 120654 }, + { 120656, 120661 }, + { 120688, 120712 }, + { 120714, 120719 }, + { 120746, 120770 }, + { 120772, 120777 }, + { 120779, 120779 }, +}; +static const URange16 Lm_range16[] = { + { 688, 705 }, + { 710, 721 }, + { 736, 740 }, + { 748, 748 }, + { 750, 750 }, + { 884, 884 }, + { 890, 890 }, + { 1369, 1369 }, + { 1600, 1600 }, + { 1765, 1766 }, + { 2036, 2037 }, + { 2042, 2042 }, + { 2074, 2074 }, + { 2084, 2084 }, + { 2088, 2088 }, + { 2417, 2417 }, + { 3654, 3654 }, + { 3782, 3782 }, + { 4348, 4348 }, + { 6103, 6103 }, + { 6211, 6211 }, + { 6823, 6823 }, + { 7288, 
7293 }, + { 7468, 7530 }, + { 7544, 7544 }, + { 7579, 7615 }, + { 8305, 8305 }, + { 8319, 8319 }, + { 8336, 8348 }, + { 11388, 11389 }, + { 11631, 11631 }, + { 11823, 11823 }, + { 12293, 12293 }, + { 12337, 12341 }, + { 12347, 12347 }, + { 12445, 12446 }, + { 12540, 12542 }, + { 40981, 40981 }, + { 42232, 42237 }, + { 42508, 42508 }, + { 42623, 42623 }, + { 42775, 42783 }, + { 42864, 42864 }, + { 42888, 42888 }, + { 43000, 43001 }, + { 43471, 43471 }, + { 43632, 43632 }, + { 43741, 43741 }, + { 43763, 43764 }, + { 65392, 65392 }, + { 65438, 65439 }, +}; +static const URange32 Lm_range32[] = { + { 94099, 94111 }, +}; +static const URange16 Nd_range16[] = { + { 48, 57 }, + { 1632, 1641 }, + { 1776, 1785 }, + { 1984, 1993 }, + { 2406, 2415 }, + { 2534, 2543 }, + { 2662, 2671 }, + { 2790, 2799 }, + { 2918, 2927 }, + { 3046, 3055 }, + { 3174, 3183 }, + { 3302, 3311 }, + { 3430, 3439 }, + { 3664, 3673 }, + { 3792, 3801 }, + { 3872, 3881 }, + { 4160, 4169 }, + { 4240, 4249 }, + { 6112, 6121 }, + { 6160, 6169 }, + { 6470, 6479 }, + { 6608, 6617 }, + { 6784, 6793 }, + { 6800, 6809 }, + { 6992, 7001 }, + { 7088, 7097 }, + { 7232, 7241 }, + { 7248, 7257 }, + { 42528, 42537 }, + { 43216, 43225 }, + { 43264, 43273 }, + { 43472, 43481 }, + { 43600, 43609 }, + { 44016, 44025 }, + { 65296, 65305 }, +}; +static const URange32 Nd_range32[] = { + { 66720, 66729 }, + { 69734, 69743 }, + { 69872, 69881 }, + { 69942, 69951 }, + { 70096, 70105 }, + { 71360, 71369 }, + { 120782, 120831 }, +}; +static const URange16 Pc_range16[] = { + { 95, 95 }, + { 8255, 8256 }, + { 8276, 8276 }, + { 65075, 65076 }, + { 65101, 65103 }, + { 65343, 65343 }, +}; +static const URange16 Lt_range16[] = { + { 453, 453 }, + { 456, 456 }, + { 459, 459 }, + { 498, 498 }, + { 8072, 8079 }, + { 8088, 8095 }, + { 8104, 8111 }, + { 8124, 8124 }, + { 8140, 8140 }, + { 8188, 8188 }, +}; +static const URange16 Lu_range16[] = { + { 65, 90 }, + { 192, 214 }, + { 216, 222 }, + { 256, 256 }, + { 258, 258 }, + { 260, 260 }, + 
{ 262, 262 }, + { 264, 264 }, + { 266, 266 }, + { 268, 268 }, + { 270, 270 }, + { 272, 272 }, + { 274, 274 }, + { 276, 276 }, + { 278, 278 }, + { 280, 280 }, + { 282, 282 }, + { 284, 284 }, + { 286, 286 }, + { 288, 288 }, + { 290, 290 }, + { 292, 292 }, + { 294, 294 }, + { 296, 296 }, + { 298, 298 }, + { 300, 300 }, + { 302, 302 }, + { 304, 304 }, + { 306, 306 }, + { 308, 308 }, + { 310, 310 }, + { 313, 313 }, + { 315, 315 }, + { 317, 317 }, + { 319, 319 }, + { 321, 321 }, + { 323, 323 }, + { 325, 325 }, + { 327, 327 }, + { 330, 330 }, + { 332, 332 }, + { 334, 334 }, + { 336, 336 }, + { 338, 338 }, + { 340, 340 }, + { 342, 342 }, + { 344, 344 }, + { 346, 346 }, + { 348, 348 }, + { 350, 350 }, + { 352, 352 }, + { 354, 354 }, + { 356, 356 }, + { 358, 358 }, + { 360, 360 }, + { 362, 362 }, + { 364, 364 }, + { 366, 366 }, + { 368, 368 }, + { 370, 370 }, + { 372, 372 }, + { 374, 374 }, + { 376, 377 }, + { 379, 379 }, + { 381, 381 }, + { 385, 386 }, + { 388, 388 }, + { 390, 391 }, + { 393, 395 }, + { 398, 401 }, + { 403, 404 }, + { 406, 408 }, + { 412, 413 }, + { 415, 416 }, + { 418, 418 }, + { 420, 420 }, + { 422, 423 }, + { 425, 425 }, + { 428, 428 }, + { 430, 431 }, + { 433, 435 }, + { 437, 437 }, + { 439, 440 }, + { 444, 444 }, + { 452, 452 }, + { 455, 455 }, + { 458, 458 }, + { 461, 461 }, + { 463, 463 }, + { 465, 465 }, + { 467, 467 }, + { 469, 469 }, + { 471, 471 }, + { 473, 473 }, + { 475, 475 }, + { 478, 478 }, + { 480, 480 }, + { 482, 482 }, + { 484, 484 }, + { 486, 486 }, + { 488, 488 }, + { 490, 490 }, + { 492, 492 }, + { 494, 494 }, + { 497, 497 }, + { 500, 500 }, + { 502, 504 }, + { 506, 506 }, + { 508, 508 }, + { 510, 510 }, + { 512, 512 }, + { 514, 514 }, + { 516, 516 }, + { 518, 518 }, + { 520, 520 }, + { 522, 522 }, + { 524, 524 }, + { 526, 526 }, + { 528, 528 }, + { 530, 530 }, + { 532, 532 }, + { 534, 534 }, + { 536, 536 }, + { 538, 538 }, + { 540, 540 }, + { 542, 542 }, + { 544, 544 }, + { 546, 546 }, + { 548, 548 }, + { 550, 550 }, + { 552, 552 }, + 
{ 554, 554 }, + { 556, 556 }, + { 558, 558 }, + { 560, 560 }, + { 562, 562 }, + { 570, 571 }, + { 573, 574 }, + { 577, 577 }, + { 579, 582 }, + { 584, 584 }, + { 586, 586 }, + { 588, 588 }, + { 590, 590 }, + { 880, 880 }, + { 882, 882 }, + { 886, 886 }, + { 902, 902 }, + { 904, 906 }, + { 908, 908 }, + { 910, 911 }, + { 913, 929 }, + { 931, 939 }, + { 975, 975 }, + { 978, 980 }, + { 984, 984 }, + { 986, 986 }, + { 988, 988 }, + { 990, 990 }, + { 992, 992 }, + { 994, 994 }, + { 996, 996 }, + { 998, 998 }, + { 1000, 1000 }, + { 1002, 1002 }, + { 1004, 1004 }, + { 1006, 1006 }, + { 1012, 1012 }, + { 1015, 1015 }, + { 1017, 1018 }, + { 1021, 1071 }, + { 1120, 1120 }, + { 1122, 1122 }, + { 1124, 1124 }, + { 1126, 1126 }, + { 1128, 1128 }, + { 1130, 1130 }, + { 1132, 1132 }, + { 1134, 1134 }, + { 1136, 1136 }, + { 1138, 1138 }, + { 1140, 1140 }, + { 1142, 1142 }, + { 1144, 1144 }, + { 1146, 1146 }, + { 1148, 1148 }, + { 1150, 1150 }, + { 1152, 1152 }, + { 1162, 1162 }, + { 1164, 1164 }, + { 1166, 1166 }, + { 1168, 1168 }, + { 1170, 1170 }, + { 1172, 1172 }, + { 1174, 1174 }, + { 1176, 1176 }, + { 1178, 1178 }, + { 1180, 1180 }, + { 1182, 1182 }, + { 1184, 1184 }, + { 1186, 1186 }, + { 1188, 1188 }, + { 1190, 1190 }, + { 1192, 1192 }, + { 1194, 1194 }, + { 1196, 1196 }, + { 1198, 1198 }, + { 1200, 1200 }, + { 1202, 1202 }, + { 1204, 1204 }, + { 1206, 1206 }, + { 1208, 1208 }, + { 1210, 1210 }, + { 1212, 1212 }, + { 1214, 1214 }, + { 1216, 1217 }, + { 1219, 1219 }, + { 1221, 1221 }, + { 1223, 1223 }, + { 1225, 1225 }, + { 1227, 1227 }, + { 1229, 1229 }, + { 1232, 1232 }, + { 1234, 1234 }, + { 1236, 1236 }, + { 1238, 1238 }, + { 1240, 1240 }, + { 1242, 1242 }, + { 1244, 1244 }, + { 1246, 1246 }, + { 1248, 1248 }, + { 1250, 1250 }, + { 1252, 1252 }, + { 1254, 1254 }, + { 1256, 1256 }, + { 1258, 1258 }, + { 1260, 1260 }, + { 1262, 1262 }, + { 1264, 1264 }, + { 1266, 1266 }, + { 1268, 1268 }, + { 1270, 1270 }, + { 1272, 1272 }, + { 1274, 1274 }, + { 1276, 1276 }, + { 1278, 
1278 }, + { 1280, 1280 }, + { 1282, 1282 }, + { 1284, 1284 }, + { 1286, 1286 }, + { 1288, 1288 }, + { 1290, 1290 }, + { 1292, 1292 }, + { 1294, 1294 }, + { 1296, 1296 }, + { 1298, 1298 }, + { 1300, 1300 }, + { 1302, 1302 }, + { 1304, 1304 }, + { 1306, 1306 }, + { 1308, 1308 }, + { 1310, 1310 }, + { 1312, 1312 }, + { 1314, 1314 }, + { 1316, 1316 }, + { 1318, 1318 }, + { 1329, 1366 }, + { 4256, 4293 }, + { 4295, 4295 }, + { 4301, 4301 }, + { 7680, 7680 }, + { 7682, 7682 }, + { 7684, 7684 }, + { 7686, 7686 }, + { 7688, 7688 }, + { 7690, 7690 }, + { 7692, 7692 }, + { 7694, 7694 }, + { 7696, 7696 }, + { 7698, 7698 }, + { 7700, 7700 }, + { 7702, 7702 }, + { 7704, 7704 }, + { 7706, 7706 }, + { 7708, 7708 }, + { 7710, 7710 }, + { 7712, 7712 }, + { 7714, 7714 }, + { 7716, 7716 }, + { 7718, 7718 }, + { 7720, 7720 }, + { 7722, 7722 }, + { 7724, 7724 }, + { 7726, 7726 }, + { 7728, 7728 }, + { 7730, 7730 }, + { 7732, 7732 }, + { 7734, 7734 }, + { 7736, 7736 }, + { 7738, 7738 }, + { 7740, 7740 }, + { 7742, 7742 }, + { 7744, 7744 }, + { 7746, 7746 }, + { 7748, 7748 }, + { 7750, 7750 }, + { 7752, 7752 }, + { 7754, 7754 }, + { 7756, 7756 }, + { 7758, 7758 }, + { 7760, 7760 }, + { 7762, 7762 }, + { 7764, 7764 }, + { 7766, 7766 }, + { 7768, 7768 }, + { 7770, 7770 }, + { 7772, 7772 }, + { 7774, 7774 }, + { 7776, 7776 }, + { 7778, 7778 }, + { 7780, 7780 }, + { 7782, 7782 }, + { 7784, 7784 }, + { 7786, 7786 }, + { 7788, 7788 }, + { 7790, 7790 }, + { 7792, 7792 }, + { 7794, 7794 }, + { 7796, 7796 }, + { 7798, 7798 }, + { 7800, 7800 }, + { 7802, 7802 }, + { 7804, 7804 }, + { 7806, 7806 }, + { 7808, 7808 }, + { 7810, 7810 }, + { 7812, 7812 }, + { 7814, 7814 }, + { 7816, 7816 }, + { 7818, 7818 }, + { 7820, 7820 }, + { 7822, 7822 }, + { 7824, 7824 }, + { 7826, 7826 }, + { 7828, 7828 }, + { 7838, 7838 }, + { 7840, 7840 }, + { 7842, 7842 }, + { 7844, 7844 }, + { 7846, 7846 }, + { 7848, 7848 }, + { 7850, 7850 }, + { 7852, 7852 }, + { 7854, 7854 }, + { 7856, 7856 }, + { 7858, 7858 }, + { 7860, 
7860 }, + { 7862, 7862 }, + { 7864, 7864 }, + { 7866, 7866 }, + { 7868, 7868 }, + { 7870, 7870 }, + { 7872, 7872 }, + { 7874, 7874 }, + { 7876, 7876 }, + { 7878, 7878 }, + { 7880, 7880 }, + { 7882, 7882 }, + { 7884, 7884 }, + { 7886, 7886 }, + { 7888, 7888 }, + { 7890, 7890 }, + { 7892, 7892 }, + { 7894, 7894 }, + { 7896, 7896 }, + { 7898, 7898 }, + { 7900, 7900 }, + { 7902, 7902 }, + { 7904, 7904 }, + { 7906, 7906 }, + { 7908, 7908 }, + { 7910, 7910 }, + { 7912, 7912 }, + { 7914, 7914 }, + { 7916, 7916 }, + { 7918, 7918 }, + { 7920, 7920 }, + { 7922, 7922 }, + { 7924, 7924 }, + { 7926, 7926 }, + { 7928, 7928 }, + { 7930, 7930 }, + { 7932, 7932 }, + { 7934, 7934 }, + { 7944, 7951 }, + { 7960, 7965 }, + { 7976, 7983 }, + { 7992, 7999 }, + { 8008, 8013 }, + { 8025, 8025 }, + { 8027, 8027 }, + { 8029, 8029 }, + { 8031, 8031 }, + { 8040, 8047 }, + { 8120, 8123 }, + { 8136, 8139 }, + { 8152, 8155 }, + { 8168, 8172 }, + { 8184, 8187 }, + { 8450, 8450 }, + { 8455, 8455 }, + { 8459, 8461 }, + { 8464, 8466 }, + { 8469, 8469 }, + { 8473, 8477 }, + { 8484, 8484 }, + { 8486, 8486 }, + { 8488, 8488 }, + { 8490, 8493 }, + { 8496, 8499 }, + { 8510, 8511 }, + { 8517, 8517 }, + { 8579, 8579 }, + { 11264, 11310 }, + { 11360, 11360 }, + { 11362, 11364 }, + { 11367, 11367 }, + { 11369, 11369 }, + { 11371, 11371 }, + { 11373, 11376 }, + { 11378, 11378 }, + { 11381, 11381 }, + { 11390, 11392 }, + { 11394, 11394 }, + { 11396, 11396 }, + { 11398, 11398 }, + { 11400, 11400 }, + { 11402, 11402 }, + { 11404, 11404 }, + { 11406, 11406 }, + { 11408, 11408 }, + { 11410, 11410 }, + { 11412, 11412 }, + { 11414, 11414 }, + { 11416, 11416 }, + { 11418, 11418 }, + { 11420, 11420 }, + { 11422, 11422 }, + { 11424, 11424 }, + { 11426, 11426 }, + { 11428, 11428 }, + { 11430, 11430 }, + { 11432, 11432 }, + { 11434, 11434 }, + { 11436, 11436 }, + { 11438, 11438 }, + { 11440, 11440 }, + { 11442, 11442 }, + { 11444, 11444 }, + { 11446, 11446 }, + { 11448, 11448 }, + { 11450, 11450 }, + { 11452, 11452 }, + { 
11454, 11454 }, + { 11456, 11456 }, + { 11458, 11458 }, + { 11460, 11460 }, + { 11462, 11462 }, + { 11464, 11464 }, + { 11466, 11466 }, + { 11468, 11468 }, + { 11470, 11470 }, + { 11472, 11472 }, + { 11474, 11474 }, + { 11476, 11476 }, + { 11478, 11478 }, + { 11480, 11480 }, + { 11482, 11482 }, + { 11484, 11484 }, + { 11486, 11486 }, + { 11488, 11488 }, + { 11490, 11490 }, + { 11499, 11499 }, + { 11501, 11501 }, + { 11506, 11506 }, + { 42560, 42560 }, + { 42562, 42562 }, + { 42564, 42564 }, + { 42566, 42566 }, + { 42568, 42568 }, + { 42570, 42570 }, + { 42572, 42572 }, + { 42574, 42574 }, + { 42576, 42576 }, + { 42578, 42578 }, + { 42580, 42580 }, + { 42582, 42582 }, + { 42584, 42584 }, + { 42586, 42586 }, + { 42588, 42588 }, + { 42590, 42590 }, + { 42592, 42592 }, + { 42594, 42594 }, + { 42596, 42596 }, + { 42598, 42598 }, + { 42600, 42600 }, + { 42602, 42602 }, + { 42604, 42604 }, + { 42624, 42624 }, + { 42626, 42626 }, + { 42628, 42628 }, + { 42630, 42630 }, + { 42632, 42632 }, + { 42634, 42634 }, + { 42636, 42636 }, + { 42638, 42638 }, + { 42640, 42640 }, + { 42642, 42642 }, + { 42644, 42644 }, + { 42646, 42646 }, + { 42786, 42786 }, + { 42788, 42788 }, + { 42790, 42790 }, + { 42792, 42792 }, + { 42794, 42794 }, + { 42796, 42796 }, + { 42798, 42798 }, + { 42802, 42802 }, + { 42804, 42804 }, + { 42806, 42806 }, + { 42808, 42808 }, + { 42810, 42810 }, + { 42812, 42812 }, + { 42814, 42814 }, + { 42816, 42816 }, + { 42818, 42818 }, + { 42820, 42820 }, + { 42822, 42822 }, + { 42824, 42824 }, + { 42826, 42826 }, + { 42828, 42828 }, + { 42830, 42830 }, + { 42832, 42832 }, + { 42834, 42834 }, + { 42836, 42836 }, + { 42838, 42838 }, + { 42840, 42840 }, + { 42842, 42842 }, + { 42844, 42844 }, + { 42846, 42846 }, + { 42848, 42848 }, + { 42850, 42850 }, + { 42852, 42852 }, + { 42854, 42854 }, + { 42856, 42856 }, + { 42858, 42858 }, + { 42860, 42860 }, + { 42862, 42862 }, + { 42873, 42873 }, + { 42875, 42875 }, + { 42877, 42878 }, + { 42880, 42880 }, + { 42882, 42882 }, + { 
42884, 42884 }, + { 42886, 42886 }, + { 42891, 42891 }, + { 42893, 42893 }, + { 42896, 42896 }, + { 42898, 42898 }, + { 42912, 42912 }, + { 42914, 42914 }, + { 42916, 42916 }, + { 42918, 42918 }, + { 42920, 42920 }, + { 42922, 42922 }, + { 65313, 65338 }, +}; +static const URange32 Lu_range32[] = { + { 66560, 66599 }, + { 119808, 119833 }, + { 119860, 119885 }, + { 119912, 119937 }, + { 119964, 119964 }, + { 119966, 119967 }, + { 119970, 119970 }, + { 119973, 119974 }, + { 119977, 119980 }, + { 119982, 119989 }, + { 120016, 120041 }, + { 120068, 120069 }, + { 120071, 120074 }, + { 120077, 120084 }, + { 120086, 120092 }, + { 120120, 120121 }, + { 120123, 120126 }, + { 120128, 120132 }, + { 120134, 120134 }, + { 120138, 120144 }, + { 120172, 120197 }, + { 120224, 120249 }, + { 120276, 120301 }, + { 120328, 120353 }, + { 120380, 120405 }, + { 120432, 120457 }, + { 120488, 120512 }, + { 120546, 120570 }, + { 120604, 120628 }, + { 120662, 120686 }, + { 120720, 120744 }, + { 120778, 120778 }, +}; +static const URange16 Pf_range16[] = { + { 187, 187 }, + { 8217, 8217 }, + { 8221, 8221 }, + { 8250, 8250 }, + { 11779, 11779 }, + { 11781, 11781 }, + { 11786, 11786 }, + { 11789, 11789 }, + { 11805, 11805 }, + { 11809, 11809 }, +}; +static const URange16 Pd_range16[] = { + { 45, 45 }, + { 1418, 1418 }, + { 1470, 1470 }, + { 5120, 5120 }, + { 6150, 6150 }, + { 8208, 8213 }, + { 11799, 11799 }, + { 11802, 11802 }, + { 11834, 11835 }, + { 12316, 12316 }, + { 12336, 12336 }, + { 12448, 12448 }, + { 65073, 65074 }, + { 65112, 65112 }, + { 65123, 65123 }, + { 65293, 65293 }, +}; +static const URange16 Pe_range16[] = { + { 41, 41 }, + { 93, 93 }, + { 125, 125 }, + { 3899, 3899 }, + { 3901, 3901 }, + { 5788, 5788 }, + { 8262, 8262 }, + { 8318, 8318 }, + { 8334, 8334 }, + { 8969, 8969 }, + { 8971, 8971 }, + { 9002, 9002 }, + { 10089, 10089 }, + { 10091, 10091 }, + { 10093, 10093 }, + { 10095, 10095 }, + { 10097, 10097 }, + { 10099, 10099 }, + { 10101, 10101 }, + { 10182, 10182 }, + { 
10215, 10215 }, + { 10217, 10217 }, + { 10219, 10219 }, + { 10221, 10221 }, + { 10223, 10223 }, + { 10628, 10628 }, + { 10630, 10630 }, + { 10632, 10632 }, + { 10634, 10634 }, + { 10636, 10636 }, + { 10638, 10638 }, + { 10640, 10640 }, + { 10642, 10642 }, + { 10644, 10644 }, + { 10646, 10646 }, + { 10648, 10648 }, + { 10713, 10713 }, + { 10715, 10715 }, + { 10749, 10749 }, + { 11811, 11811 }, + { 11813, 11813 }, + { 11815, 11815 }, + { 11817, 11817 }, + { 12297, 12297 }, + { 12299, 12299 }, + { 12301, 12301 }, + { 12303, 12303 }, + { 12305, 12305 }, + { 12309, 12309 }, + { 12311, 12311 }, + { 12313, 12313 }, + { 12315, 12315 }, + { 12318, 12319 }, + { 64831, 64831 }, + { 65048, 65048 }, + { 65078, 65078 }, + { 65080, 65080 }, + { 65082, 65082 }, + { 65084, 65084 }, + { 65086, 65086 }, + { 65088, 65088 }, + { 65090, 65090 }, + { 65092, 65092 }, + { 65096, 65096 }, + { 65114, 65114 }, + { 65116, 65116 }, + { 65118, 65118 }, + { 65289, 65289 }, + { 65341, 65341 }, + { 65373, 65373 }, + { 65376, 65376 }, + { 65379, 65379 }, +}; +static const URange16 Pi_range16[] = { + { 171, 171 }, + { 8216, 8216 }, + { 8219, 8220 }, + { 8223, 8223 }, + { 8249, 8249 }, + { 11778, 11778 }, + { 11780, 11780 }, + { 11785, 11785 }, + { 11788, 11788 }, + { 11804, 11804 }, + { 11808, 11808 }, +}; +static const URange16 Po_range16[] = { + { 33, 35 }, + { 37, 39 }, + { 42, 42 }, + { 44, 44 }, + { 46, 47 }, + { 58, 59 }, + { 63, 64 }, + { 92, 92 }, + { 161, 161 }, + { 167, 167 }, + { 182, 183 }, + { 191, 191 }, + { 894, 894 }, + { 903, 903 }, + { 1370, 1375 }, + { 1417, 1417 }, + { 1472, 1472 }, + { 1475, 1475 }, + { 1478, 1478 }, + { 1523, 1524 }, + { 1545, 1546 }, + { 1548, 1549 }, + { 1563, 1563 }, + { 1566, 1567 }, + { 1642, 1645 }, + { 1748, 1748 }, + { 1792, 1805 }, + { 2039, 2041 }, + { 2096, 2110 }, + { 2142, 2142 }, + { 2404, 2405 }, + { 2416, 2416 }, + { 2800, 2800 }, + { 3572, 3572 }, + { 3663, 3663 }, + { 3674, 3675 }, + { 3844, 3858 }, + { 3860, 3860 }, + { 3973, 3973 }, + { 4048, 
4052 }, + { 4057, 4058 }, + { 4170, 4175 }, + { 4347, 4347 }, + { 4960, 4968 }, + { 5741, 5742 }, + { 5867, 5869 }, + { 5941, 5942 }, + { 6100, 6102 }, + { 6104, 6106 }, + { 6144, 6149 }, + { 6151, 6154 }, + { 6468, 6469 }, + { 6686, 6687 }, + { 6816, 6822 }, + { 6824, 6829 }, + { 7002, 7008 }, + { 7164, 7167 }, + { 7227, 7231 }, + { 7294, 7295 }, + { 7360, 7367 }, + { 7379, 7379 }, + { 8214, 8215 }, + { 8224, 8231 }, + { 8240, 8248 }, + { 8251, 8254 }, + { 8257, 8259 }, + { 8263, 8273 }, + { 8275, 8275 }, + { 8277, 8286 }, + { 11513, 11516 }, + { 11518, 11519 }, + { 11632, 11632 }, + { 11776, 11777 }, + { 11782, 11784 }, + { 11787, 11787 }, + { 11790, 11798 }, + { 11800, 11801 }, + { 11803, 11803 }, + { 11806, 11807 }, + { 11818, 11822 }, + { 11824, 11833 }, + { 12289, 12291 }, + { 12349, 12349 }, + { 12539, 12539 }, + { 42238, 42239 }, + { 42509, 42511 }, + { 42611, 42611 }, + { 42622, 42622 }, + { 42738, 42743 }, + { 43124, 43127 }, + { 43214, 43215 }, + { 43256, 43258 }, + { 43310, 43311 }, + { 43359, 43359 }, + { 43457, 43469 }, + { 43486, 43487 }, + { 43612, 43615 }, + { 43742, 43743 }, + { 43760, 43761 }, + { 44011, 44011 }, + { 65040, 65046 }, + { 65049, 65049 }, + { 65072, 65072 }, + { 65093, 65094 }, + { 65097, 65100 }, + { 65104, 65106 }, + { 65108, 65111 }, + { 65119, 65121 }, + { 65128, 65128 }, + { 65130, 65131 }, + { 65281, 65283 }, + { 65285, 65287 }, + { 65290, 65290 }, + { 65292, 65292 }, + { 65294, 65295 }, + { 65306, 65307 }, + { 65311, 65312 }, + { 65340, 65340 }, + { 65377, 65377 }, + { 65380, 65381 }, +}; +static const URange32 Po_range32[] = { + { 65792, 65794 }, + { 66463, 66463 }, + { 66512, 66512 }, + { 67671, 67671 }, + { 67871, 67871 }, + { 67903, 67903 }, + { 68176, 68184 }, + { 68223, 68223 }, + { 68409, 68415 }, + { 69703, 69709 }, + { 69819, 69820 }, + { 69822, 69825 }, + { 69952, 69955 }, + { 70085, 70088 }, + { 74864, 74867 }, +}; +static const URange16 Me_range16[] = { + { 1160, 1161 }, + { 8413, 8416 }, + { 8418, 8420 }, + { 
42608, 42610 }, +}; +static const URange16 C_range16[] = { + { 0, 31 }, + { 127, 159 }, + { 173, 173 }, + { 1536, 1540 }, + { 1564, 1564 }, + { 1757, 1757 }, + { 1807, 1807 }, + { 6158, 6158 }, + { 8203, 8207 }, + { 8234, 8238 }, + { 8288, 8292 }, + { 8294, 8303 }, + { 55296, 63743 }, + { 65279, 65279 }, + { 65529, 65531 }, +}; +static const URange32 C_range32[] = { + { 69821, 69821 }, + { 119155, 119162 }, + { 917505, 917505 }, + { 917536, 917631 }, + { 983040, 1048573 }, + { 1048576, 1114109 }, +}; +static const URange16 Mc_range16[] = { + { 2307, 2307 }, + { 2363, 2363 }, + { 2366, 2368 }, + { 2377, 2380 }, + { 2382, 2383 }, + { 2434, 2435 }, + { 2494, 2496 }, + { 2503, 2504 }, + { 2507, 2508 }, + { 2519, 2519 }, + { 2563, 2563 }, + { 2622, 2624 }, + { 2691, 2691 }, + { 2750, 2752 }, + { 2761, 2761 }, + { 2763, 2764 }, + { 2818, 2819 }, + { 2878, 2878 }, + { 2880, 2880 }, + { 2887, 2888 }, + { 2891, 2892 }, + { 2903, 2903 }, + { 3006, 3007 }, + { 3009, 3010 }, + { 3014, 3016 }, + { 3018, 3020 }, + { 3031, 3031 }, + { 3073, 3075 }, + { 3137, 3140 }, + { 3202, 3203 }, + { 3262, 3262 }, + { 3264, 3268 }, + { 3271, 3272 }, + { 3274, 3275 }, + { 3285, 3286 }, + { 3330, 3331 }, + { 3390, 3392 }, + { 3398, 3400 }, + { 3402, 3404 }, + { 3415, 3415 }, + { 3458, 3459 }, + { 3535, 3537 }, + { 3544, 3551 }, + { 3570, 3571 }, + { 3902, 3903 }, + { 3967, 3967 }, + { 4139, 4140 }, + { 4145, 4145 }, + { 4152, 4152 }, + { 4155, 4156 }, + { 4182, 4183 }, + { 4194, 4196 }, + { 4199, 4205 }, + { 4227, 4228 }, + { 4231, 4236 }, + { 4239, 4239 }, + { 4250, 4252 }, + { 6070, 6070 }, + { 6078, 6085 }, + { 6087, 6088 }, + { 6435, 6438 }, + { 6441, 6443 }, + { 6448, 6449 }, + { 6451, 6456 }, + { 6576, 6592 }, + { 6600, 6601 }, + { 6681, 6682 }, + { 6741, 6741 }, + { 6743, 6743 }, + { 6753, 6753 }, + { 6755, 6756 }, + { 6765, 6770 }, + { 6916, 6916 }, + { 6965, 6965 }, + { 6971, 6971 }, + { 6973, 6977 }, + { 6979, 6980 }, + { 7042, 7042 }, + { 7073, 7073 }, + { 7078, 7079 }, + { 7082, 
7082 }, + { 7084, 7085 }, + { 7143, 7143 }, + { 7146, 7148 }, + { 7150, 7150 }, + { 7154, 7155 }, + { 7204, 7211 }, + { 7220, 7221 }, + { 7393, 7393 }, + { 7410, 7411 }, + { 12334, 12335 }, + { 43043, 43044 }, + { 43047, 43047 }, + { 43136, 43137 }, + { 43188, 43203 }, + { 43346, 43347 }, + { 43395, 43395 }, + { 43444, 43445 }, + { 43450, 43451 }, + { 43453, 43456 }, + { 43567, 43568 }, + { 43571, 43572 }, + { 43597, 43597 }, + { 43643, 43643 }, + { 43755, 43755 }, + { 43758, 43759 }, + { 43765, 43765 }, + { 44003, 44004 }, + { 44006, 44007 }, + { 44009, 44010 }, + { 44012, 44012 }, +}; +static const URange32 Mc_range32[] = { + { 69632, 69632 }, + { 69634, 69634 }, + { 69762, 69762 }, + { 69808, 69810 }, + { 69815, 69816 }, + { 69932, 69932 }, + { 70018, 70018 }, + { 70067, 70069 }, + { 70079, 70080 }, + { 71340, 71340 }, + { 71342, 71343 }, + { 71350, 71350 }, + { 94033, 94078 }, + { 119141, 119142 }, + { 119149, 119154 }, +}; +static const URange16 Mn_range16[] = { + { 768, 879 }, + { 1155, 1159 }, + { 1425, 1469 }, + { 1471, 1471 }, + { 1473, 1474 }, + { 1476, 1477 }, + { 1479, 1479 }, + { 1552, 1562 }, + { 1611, 1631 }, + { 1648, 1648 }, + { 1750, 1756 }, + { 1759, 1764 }, + { 1767, 1768 }, + { 1770, 1773 }, + { 1809, 1809 }, + { 1840, 1866 }, + { 1958, 1968 }, + { 2027, 2035 }, + { 2070, 2073 }, + { 2075, 2083 }, + { 2085, 2087 }, + { 2089, 2093 }, + { 2137, 2139 }, + { 2276, 2302 }, + { 2304, 2306 }, + { 2362, 2362 }, + { 2364, 2364 }, + { 2369, 2376 }, + { 2381, 2381 }, + { 2385, 2391 }, + { 2402, 2403 }, + { 2433, 2433 }, + { 2492, 2492 }, + { 2497, 2500 }, + { 2509, 2509 }, + { 2530, 2531 }, + { 2561, 2562 }, + { 2620, 2620 }, + { 2625, 2626 }, + { 2631, 2632 }, + { 2635, 2637 }, + { 2641, 2641 }, + { 2672, 2673 }, + { 2677, 2677 }, + { 2689, 2690 }, + { 2748, 2748 }, + { 2753, 2757 }, + { 2759, 2760 }, + { 2765, 2765 }, + { 2786, 2787 }, + { 2817, 2817 }, + { 2876, 2876 }, + { 2879, 2879 }, + { 2881, 2884 }, + { 2893, 2893 }, + { 2902, 2902 }, + { 2914, 
2915 }, + { 2946, 2946 }, + { 3008, 3008 }, + { 3021, 3021 }, + { 3134, 3136 }, + { 3142, 3144 }, + { 3146, 3149 }, + { 3157, 3158 }, + { 3170, 3171 }, + { 3260, 3260 }, + { 3263, 3263 }, + { 3270, 3270 }, + { 3276, 3277 }, + { 3298, 3299 }, + { 3393, 3396 }, + { 3405, 3405 }, + { 3426, 3427 }, + { 3530, 3530 }, + { 3538, 3540 }, + { 3542, 3542 }, + { 3633, 3633 }, + { 3636, 3642 }, + { 3655, 3662 }, + { 3761, 3761 }, + { 3764, 3769 }, + { 3771, 3772 }, + { 3784, 3789 }, + { 3864, 3865 }, + { 3893, 3893 }, + { 3895, 3895 }, + { 3897, 3897 }, + { 3953, 3966 }, + { 3968, 3972 }, + { 3974, 3975 }, + { 3981, 3991 }, + { 3993, 4028 }, + { 4038, 4038 }, + { 4141, 4144 }, + { 4146, 4151 }, + { 4153, 4154 }, + { 4157, 4158 }, + { 4184, 4185 }, + { 4190, 4192 }, + { 4209, 4212 }, + { 4226, 4226 }, + { 4229, 4230 }, + { 4237, 4237 }, + { 4253, 4253 }, + { 4957, 4959 }, + { 5906, 5908 }, + { 5938, 5940 }, + { 5970, 5971 }, + { 6002, 6003 }, + { 6068, 6069 }, + { 6071, 6077 }, + { 6086, 6086 }, + { 6089, 6099 }, + { 6109, 6109 }, + { 6155, 6157 }, + { 6313, 6313 }, + { 6432, 6434 }, + { 6439, 6440 }, + { 6450, 6450 }, + { 6457, 6459 }, + { 6679, 6680 }, + { 6683, 6683 }, + { 6742, 6742 }, + { 6744, 6750 }, + { 6752, 6752 }, + { 6754, 6754 }, + { 6757, 6764 }, + { 6771, 6780 }, + { 6783, 6783 }, + { 6912, 6915 }, + { 6964, 6964 }, + { 6966, 6970 }, + { 6972, 6972 }, + { 6978, 6978 }, + { 7019, 7027 }, + { 7040, 7041 }, + { 7074, 7077 }, + { 7080, 7081 }, + { 7083, 7083 }, + { 7142, 7142 }, + { 7144, 7145 }, + { 7149, 7149 }, + { 7151, 7153 }, + { 7212, 7219 }, + { 7222, 7223 }, + { 7376, 7378 }, + { 7380, 7392 }, + { 7394, 7400 }, + { 7405, 7405 }, + { 7412, 7412 }, + { 7616, 7654 }, + { 7676, 7679 }, + { 8400, 8412 }, + { 8417, 8417 }, + { 8421, 8432 }, + { 11503, 11505 }, + { 11647, 11647 }, + { 11744, 11775 }, + { 12330, 12333 }, + { 12441, 12442 }, + { 42607, 42607 }, + { 42612, 42621 }, + { 42655, 42655 }, + { 42736, 42737 }, + { 43010, 43010 }, + { 43014, 43014 }, + { 
43019, 43019 }, + { 43045, 43046 }, + { 43204, 43204 }, + { 43232, 43249 }, + { 43302, 43309 }, + { 43335, 43345 }, + { 43392, 43394 }, + { 43443, 43443 }, + { 43446, 43449 }, + { 43452, 43452 }, + { 43561, 43566 }, + { 43569, 43570 }, + { 43573, 43574 }, + { 43587, 43587 }, + { 43596, 43596 }, + { 43696, 43696 }, + { 43698, 43700 }, + { 43703, 43704 }, + { 43710, 43711 }, + { 43713, 43713 }, + { 43756, 43757 }, + { 43766, 43766 }, + { 44005, 44005 }, + { 44008, 44008 }, + { 44013, 44013 }, + { 64286, 64286 }, + { 65024, 65039 }, + { 65056, 65062 }, +}; +static const URange32 Mn_range32[] = { + { 66045, 66045 }, + { 68097, 68099 }, + { 68101, 68102 }, + { 68108, 68111 }, + { 68152, 68154 }, + { 68159, 68159 }, + { 69633, 69633 }, + { 69688, 69702 }, + { 69760, 69761 }, + { 69811, 69814 }, + { 69817, 69818 }, + { 69888, 69890 }, + { 69927, 69931 }, + { 69933, 69940 }, + { 70016, 70017 }, + { 70070, 70078 }, + { 71339, 71339 }, + { 71341, 71341 }, + { 71344, 71349 }, + { 71351, 71351 }, + { 94095, 94098 }, + { 119143, 119145 }, + { 119163, 119170 }, + { 119173, 119179 }, + { 119210, 119213 }, + { 119362, 119364 }, + { 917760, 917999 }, +}; +static const URange16 M_range16[] = { + { 768, 879 }, + { 1155, 1161 }, + { 1425, 1469 }, + { 1471, 1471 }, + { 1473, 1474 }, + { 1476, 1477 }, + { 1479, 1479 }, + { 1552, 1562 }, + { 1611, 1631 }, + { 1648, 1648 }, + { 1750, 1756 }, + { 1759, 1764 }, + { 1767, 1768 }, + { 1770, 1773 }, + { 1809, 1809 }, + { 1840, 1866 }, + { 1958, 1968 }, + { 2027, 2035 }, + { 2070, 2073 }, + { 2075, 2083 }, + { 2085, 2087 }, + { 2089, 2093 }, + { 2137, 2139 }, + { 2276, 2302 }, + { 2304, 2307 }, + { 2362, 2364 }, + { 2366, 2383 }, + { 2385, 2391 }, + { 2402, 2403 }, + { 2433, 2435 }, + { 2492, 2492 }, + { 2494, 2500 }, + { 2503, 2504 }, + { 2507, 2509 }, + { 2519, 2519 }, + { 2530, 2531 }, + { 2561, 2563 }, + { 2620, 2620 }, + { 2622, 2626 }, + { 2631, 2632 }, + { 2635, 2637 }, + { 2641, 2641 }, + { 2672, 2673 }, + { 2677, 2677 }, + { 2689, 2691 
}, + { 2748, 2748 }, + { 2750, 2757 }, + { 2759, 2761 }, + { 2763, 2765 }, + { 2786, 2787 }, + { 2817, 2819 }, + { 2876, 2876 }, + { 2878, 2884 }, + { 2887, 2888 }, + { 2891, 2893 }, + { 2902, 2903 }, + { 2914, 2915 }, + { 2946, 2946 }, + { 3006, 3010 }, + { 3014, 3016 }, + { 3018, 3021 }, + { 3031, 3031 }, + { 3073, 3075 }, + { 3134, 3140 }, + { 3142, 3144 }, + { 3146, 3149 }, + { 3157, 3158 }, + { 3170, 3171 }, + { 3202, 3203 }, + { 3260, 3260 }, + { 3262, 3268 }, + { 3270, 3272 }, + { 3274, 3277 }, + { 3285, 3286 }, + { 3298, 3299 }, + { 3330, 3331 }, + { 3390, 3396 }, + { 3398, 3400 }, + { 3402, 3405 }, + { 3415, 3415 }, + { 3426, 3427 }, + { 3458, 3459 }, + { 3530, 3530 }, + { 3535, 3540 }, + { 3542, 3542 }, + { 3544, 3551 }, + { 3570, 3571 }, + { 3633, 3633 }, + { 3636, 3642 }, + { 3655, 3662 }, + { 3761, 3761 }, + { 3764, 3769 }, + { 3771, 3772 }, + { 3784, 3789 }, + { 3864, 3865 }, + { 3893, 3893 }, + { 3895, 3895 }, + { 3897, 3897 }, + { 3902, 3903 }, + { 3953, 3972 }, + { 3974, 3975 }, + { 3981, 3991 }, + { 3993, 4028 }, + { 4038, 4038 }, + { 4139, 4158 }, + { 4182, 4185 }, + { 4190, 4192 }, + { 4194, 4196 }, + { 4199, 4205 }, + { 4209, 4212 }, + { 4226, 4237 }, + { 4239, 4239 }, + { 4250, 4253 }, + { 4957, 4959 }, + { 5906, 5908 }, + { 5938, 5940 }, + { 5970, 5971 }, + { 6002, 6003 }, + { 6068, 6099 }, + { 6109, 6109 }, + { 6155, 6157 }, + { 6313, 6313 }, + { 6432, 6443 }, + { 6448, 6459 }, + { 6576, 6592 }, + { 6600, 6601 }, + { 6679, 6683 }, + { 6741, 6750 }, + { 6752, 6780 }, + { 6783, 6783 }, + { 6912, 6916 }, + { 6964, 6980 }, + { 7019, 7027 }, + { 7040, 7042 }, + { 7073, 7085 }, + { 7142, 7155 }, + { 7204, 7223 }, + { 7376, 7378 }, + { 7380, 7400 }, + { 7405, 7405 }, + { 7410, 7412 }, + { 7616, 7654 }, + { 7676, 7679 }, + { 8400, 8432 }, + { 11503, 11505 }, + { 11647, 11647 }, + { 11744, 11775 }, + { 12330, 12335 }, + { 12441, 12442 }, + { 42607, 42610 }, + { 42612, 42621 }, + { 42655, 42655 }, + { 42736, 42737 }, + { 43010, 43010 }, + { 43014, 
43014 }, + { 43019, 43019 }, + { 43043, 43047 }, + { 43136, 43137 }, + { 43188, 43204 }, + { 43232, 43249 }, + { 43302, 43309 }, + { 43335, 43347 }, + { 43392, 43395 }, + { 43443, 43456 }, + { 43561, 43574 }, + { 43587, 43587 }, + { 43596, 43597 }, + { 43643, 43643 }, + { 43696, 43696 }, + { 43698, 43700 }, + { 43703, 43704 }, + { 43710, 43711 }, + { 43713, 43713 }, + { 43755, 43759 }, + { 43765, 43766 }, + { 44003, 44010 }, + { 44012, 44013 }, + { 64286, 64286 }, + { 65024, 65039 }, + { 65056, 65062 }, +}; +static const URange32 M_range32[] = { + { 66045, 66045 }, + { 68097, 68099 }, + { 68101, 68102 }, + { 68108, 68111 }, + { 68152, 68154 }, + { 68159, 68159 }, + { 69632, 69634 }, + { 69688, 69702 }, + { 69760, 69762 }, + { 69808, 69818 }, + { 69888, 69890 }, + { 69927, 69940 }, + { 70016, 70018 }, + { 70067, 70080 }, + { 71339, 71351 }, + { 94033, 94078 }, + { 94095, 94098 }, + { 119141, 119145 }, + { 119149, 119154 }, + { 119163, 119170 }, + { 119173, 119179 }, + { 119210, 119213 }, + { 119362, 119364 }, + { 917760, 917999 }, +}; +static const URange16 L_range16[] = { + { 65, 90 }, + { 97, 122 }, + { 170, 170 }, + { 181, 181 }, + { 186, 186 }, + { 192, 214 }, + { 216, 246 }, + { 248, 705 }, + { 710, 721 }, + { 736, 740 }, + { 748, 748 }, + { 750, 750 }, + { 880, 884 }, + { 886, 887 }, + { 890, 893 }, + { 902, 902 }, + { 904, 906 }, + { 908, 908 }, + { 910, 929 }, + { 931, 1013 }, + { 1015, 1153 }, + { 1162, 1319 }, + { 1329, 1366 }, + { 1369, 1369 }, + { 1377, 1415 }, + { 1488, 1514 }, + { 1520, 1522 }, + { 1568, 1610 }, + { 1646, 1647 }, + { 1649, 1747 }, + { 1749, 1749 }, + { 1765, 1766 }, + { 1774, 1775 }, + { 1786, 1788 }, + { 1791, 1791 }, + { 1808, 1808 }, + { 1810, 1839 }, + { 1869, 1957 }, + { 1969, 1969 }, + { 1994, 2026 }, + { 2036, 2037 }, + { 2042, 2042 }, + { 2048, 2069 }, + { 2074, 2074 }, + { 2084, 2084 }, + { 2088, 2088 }, + { 2112, 2136 }, + { 2208, 2208 }, + { 2210, 2220 }, + { 2308, 2361 }, + { 2365, 2365 }, + { 2384, 2384 }, + { 2392, 2401 
}, + { 2417, 2423 }, + { 2425, 2431 }, + { 2437, 2444 }, + { 2447, 2448 }, + { 2451, 2472 }, + { 2474, 2480 }, + { 2482, 2482 }, + { 2486, 2489 }, + { 2493, 2493 }, + { 2510, 2510 }, + { 2524, 2525 }, + { 2527, 2529 }, + { 2544, 2545 }, + { 2565, 2570 }, + { 2575, 2576 }, + { 2579, 2600 }, + { 2602, 2608 }, + { 2610, 2611 }, + { 2613, 2614 }, + { 2616, 2617 }, + { 2649, 2652 }, + { 2654, 2654 }, + { 2674, 2676 }, + { 2693, 2701 }, + { 2703, 2705 }, + { 2707, 2728 }, + { 2730, 2736 }, + { 2738, 2739 }, + { 2741, 2745 }, + { 2749, 2749 }, + { 2768, 2768 }, + { 2784, 2785 }, + { 2821, 2828 }, + { 2831, 2832 }, + { 2835, 2856 }, + { 2858, 2864 }, + { 2866, 2867 }, + { 2869, 2873 }, + { 2877, 2877 }, + { 2908, 2909 }, + { 2911, 2913 }, + { 2929, 2929 }, + { 2947, 2947 }, + { 2949, 2954 }, + { 2958, 2960 }, + { 2962, 2965 }, + { 2969, 2970 }, + { 2972, 2972 }, + { 2974, 2975 }, + { 2979, 2980 }, + { 2984, 2986 }, + { 2990, 3001 }, + { 3024, 3024 }, + { 3077, 3084 }, + { 3086, 3088 }, + { 3090, 3112 }, + { 3114, 3123 }, + { 3125, 3129 }, + { 3133, 3133 }, + { 3160, 3161 }, + { 3168, 3169 }, + { 3205, 3212 }, + { 3214, 3216 }, + { 3218, 3240 }, + { 3242, 3251 }, + { 3253, 3257 }, + { 3261, 3261 }, + { 3294, 3294 }, + { 3296, 3297 }, + { 3313, 3314 }, + { 3333, 3340 }, + { 3342, 3344 }, + { 3346, 3386 }, + { 3389, 3389 }, + { 3406, 3406 }, + { 3424, 3425 }, + { 3450, 3455 }, + { 3461, 3478 }, + { 3482, 3505 }, + { 3507, 3515 }, + { 3517, 3517 }, + { 3520, 3526 }, + { 3585, 3632 }, + { 3634, 3635 }, + { 3648, 3654 }, + { 3713, 3714 }, + { 3716, 3716 }, + { 3719, 3720 }, + { 3722, 3722 }, + { 3725, 3725 }, + { 3732, 3735 }, + { 3737, 3743 }, + { 3745, 3747 }, + { 3749, 3749 }, + { 3751, 3751 }, + { 3754, 3755 }, + { 3757, 3760 }, + { 3762, 3763 }, + { 3773, 3773 }, + { 3776, 3780 }, + { 3782, 3782 }, + { 3804, 3807 }, + { 3840, 3840 }, + { 3904, 3911 }, + { 3913, 3948 }, + { 3976, 3980 }, + { 4096, 4138 }, + { 4159, 4159 }, + { 4176, 4181 }, + { 4186, 4189 }, + { 4193, 4193 
}, + { 4197, 4198 }, + { 4206, 4208 }, + { 4213, 4225 }, + { 4238, 4238 }, + { 4256, 4293 }, + { 4295, 4295 }, + { 4301, 4301 }, + { 4304, 4346 }, + { 4348, 4680 }, + { 4682, 4685 }, + { 4688, 4694 }, + { 4696, 4696 }, + { 4698, 4701 }, + { 4704, 4744 }, + { 4746, 4749 }, + { 4752, 4784 }, + { 4786, 4789 }, + { 4792, 4798 }, + { 4800, 4800 }, + { 4802, 4805 }, + { 4808, 4822 }, + { 4824, 4880 }, + { 4882, 4885 }, + { 4888, 4954 }, + { 4992, 5007 }, + { 5024, 5108 }, + { 5121, 5740 }, + { 5743, 5759 }, + { 5761, 5786 }, + { 5792, 5866 }, + { 5888, 5900 }, + { 5902, 5905 }, + { 5920, 5937 }, + { 5952, 5969 }, + { 5984, 5996 }, + { 5998, 6000 }, + { 6016, 6067 }, + { 6103, 6103 }, + { 6108, 6108 }, + { 6176, 6263 }, + { 6272, 6312 }, + { 6314, 6314 }, + { 6320, 6389 }, + { 6400, 6428 }, + { 6480, 6509 }, + { 6512, 6516 }, + { 6528, 6571 }, + { 6593, 6599 }, + { 6656, 6678 }, + { 6688, 6740 }, + { 6823, 6823 }, + { 6917, 6963 }, + { 6981, 6987 }, + { 7043, 7072 }, + { 7086, 7087 }, + { 7098, 7141 }, + { 7168, 7203 }, + { 7245, 7247 }, + { 7258, 7293 }, + { 7401, 7404 }, + { 7406, 7409 }, + { 7413, 7414 }, + { 7424, 7615 }, + { 7680, 7957 }, + { 7960, 7965 }, + { 7968, 8005 }, + { 8008, 8013 }, + { 8016, 8023 }, + { 8025, 8025 }, + { 8027, 8027 }, + { 8029, 8029 }, + { 8031, 8061 }, + { 8064, 8116 }, + { 8118, 8124 }, + { 8126, 8126 }, + { 8130, 8132 }, + { 8134, 8140 }, + { 8144, 8147 }, + { 8150, 8155 }, + { 8160, 8172 }, + { 8178, 8180 }, + { 8182, 8188 }, + { 8305, 8305 }, + { 8319, 8319 }, + { 8336, 8348 }, + { 8450, 8450 }, + { 8455, 8455 }, + { 8458, 8467 }, + { 8469, 8469 }, + { 8473, 8477 }, + { 8484, 8484 }, + { 8486, 8486 }, + { 8488, 8488 }, + { 8490, 8493 }, + { 8495, 8505 }, + { 8508, 8511 }, + { 8517, 8521 }, + { 8526, 8526 }, + { 8579, 8580 }, + { 11264, 11310 }, + { 11312, 11358 }, + { 11360, 11492 }, + { 11499, 11502 }, + { 11506, 11507 }, + { 11520, 11557 }, + { 11559, 11559 }, + { 11565, 11565 }, + { 11568, 11623 }, + { 11631, 11631 }, + { 11648, 
11670 }, + { 11680, 11686 }, + { 11688, 11694 }, + { 11696, 11702 }, + { 11704, 11710 }, + { 11712, 11718 }, + { 11720, 11726 }, + { 11728, 11734 }, + { 11736, 11742 }, + { 11823, 11823 }, + { 12293, 12294 }, + { 12337, 12341 }, + { 12347, 12348 }, + { 12353, 12438 }, + { 12445, 12447 }, + { 12449, 12538 }, + { 12540, 12543 }, + { 12549, 12589 }, + { 12593, 12686 }, + { 12704, 12730 }, + { 12784, 12799 }, + { 13312, 19893 }, + { 19968, 40908 }, + { 40960, 42124 }, + { 42192, 42237 }, + { 42240, 42508 }, + { 42512, 42527 }, + { 42538, 42539 }, + { 42560, 42606 }, + { 42623, 42647 }, + { 42656, 42725 }, + { 42775, 42783 }, + { 42786, 42888 }, + { 42891, 42894 }, + { 42896, 42899 }, + { 42912, 42922 }, + { 43000, 43009 }, + { 43011, 43013 }, + { 43015, 43018 }, + { 43020, 43042 }, + { 43072, 43123 }, + { 43138, 43187 }, + { 43250, 43255 }, + { 43259, 43259 }, + { 43274, 43301 }, + { 43312, 43334 }, + { 43360, 43388 }, + { 43396, 43442 }, + { 43471, 43471 }, + { 43520, 43560 }, + { 43584, 43586 }, + { 43588, 43595 }, + { 43616, 43638 }, + { 43642, 43642 }, + { 43648, 43695 }, + { 43697, 43697 }, + { 43701, 43702 }, + { 43705, 43709 }, + { 43712, 43712 }, + { 43714, 43714 }, + { 43739, 43741 }, + { 43744, 43754 }, + { 43762, 43764 }, + { 43777, 43782 }, + { 43785, 43790 }, + { 43793, 43798 }, + { 43808, 43814 }, + { 43816, 43822 }, + { 43968, 44002 }, + { 44032, 55203 }, + { 55216, 55238 }, + { 55243, 55291 }, + { 63744, 64109 }, + { 64112, 64217 }, + { 64256, 64262 }, + { 64275, 64279 }, + { 64285, 64285 }, + { 64287, 64296 }, + { 64298, 64310 }, + { 64312, 64316 }, + { 64318, 64318 }, + { 64320, 64321 }, + { 64323, 64324 }, + { 64326, 64433 }, + { 64467, 64829 }, + { 64848, 64911 }, + { 64914, 64967 }, + { 65008, 65019 }, + { 65136, 65140 }, + { 65142, 65276 }, + { 65313, 65338 }, + { 65345, 65370 }, + { 65382, 65470 }, + { 65474, 65479 }, + { 65482, 65487 }, + { 65490, 65495 }, + { 65498, 65500 }, +}; +static const URange32 L_range32[] = { + { 65536, 65547 }, + { 
65549, 65574 }, + { 65576, 65594 }, + { 65596, 65597 }, + { 65599, 65613 }, + { 65616, 65629 }, + { 65664, 65786 }, + { 66176, 66204 }, + { 66208, 66256 }, + { 66304, 66334 }, + { 66352, 66368 }, + { 66370, 66377 }, + { 66432, 66461 }, + { 66464, 66499 }, + { 66504, 66511 }, + { 66560, 66717 }, + { 67584, 67589 }, + { 67592, 67592 }, + { 67594, 67637 }, + { 67639, 67640 }, + { 67644, 67644 }, + { 67647, 67669 }, + { 67840, 67861 }, + { 67872, 67897 }, + { 67968, 68023 }, + { 68030, 68031 }, + { 68096, 68096 }, + { 68112, 68115 }, + { 68117, 68119 }, + { 68121, 68147 }, + { 68192, 68220 }, + { 68352, 68405 }, + { 68416, 68437 }, + { 68448, 68466 }, + { 68608, 68680 }, + { 69635, 69687 }, + { 69763, 69807 }, + { 69840, 69864 }, + { 69891, 69926 }, + { 70019, 70066 }, + { 70081, 70084 }, + { 71296, 71338 }, + { 73728, 74606 }, + { 77824, 78894 }, + { 92160, 92728 }, + { 93952, 94020 }, + { 94032, 94032 }, + { 94099, 94111 }, + { 110592, 110593 }, + { 119808, 119892 }, + { 119894, 119964 }, + { 119966, 119967 }, + { 119970, 119970 }, + { 119973, 119974 }, + { 119977, 119980 }, + { 119982, 119993 }, + { 119995, 119995 }, + { 119997, 120003 }, + { 120005, 120069 }, + { 120071, 120074 }, + { 120077, 120084 }, + { 120086, 120092 }, + { 120094, 120121 }, + { 120123, 120126 }, + { 120128, 120132 }, + { 120134, 120134 }, + { 120138, 120144 }, + { 120146, 120485 }, + { 120488, 120512 }, + { 120514, 120538 }, + { 120540, 120570 }, + { 120572, 120596 }, + { 120598, 120628 }, + { 120630, 120654 }, + { 120656, 120686 }, + { 120688, 120712 }, + { 120714, 120744 }, + { 120746, 120770 }, + { 120772, 120779 }, + { 126464, 126467 }, + { 126469, 126495 }, + { 126497, 126498 }, + { 126500, 126500 }, + { 126503, 126503 }, + { 126505, 126514 }, + { 126516, 126519 }, + { 126521, 126521 }, + { 126523, 126523 }, + { 126530, 126530 }, + { 126535, 126535 }, + { 126537, 126537 }, + { 126539, 126539 }, + { 126541, 126543 }, + { 126545, 126546 }, + { 126548, 126548 }, + { 126551, 126551 }, + { 
126553, 126553 }, + { 126555, 126555 }, + { 126557, 126557 }, + { 126559, 126559 }, + { 126561, 126562 }, + { 126564, 126564 }, + { 126567, 126570 }, + { 126572, 126578 }, + { 126580, 126583 }, + { 126585, 126588 }, + { 126590, 126590 }, + { 126592, 126601 }, + { 126603, 126619 }, + { 126625, 126627 }, + { 126629, 126633 }, + { 126635, 126651 }, + { 131072, 173782 }, + { 173824, 177972 }, + { 177984, 178205 }, + { 194560, 195101 }, +}; +static const URange16 N_range16[] = { + { 48, 57 }, + { 178, 179 }, + { 185, 185 }, + { 188, 190 }, + { 1632, 1641 }, + { 1776, 1785 }, + { 1984, 1993 }, + { 2406, 2415 }, + { 2534, 2543 }, + { 2548, 2553 }, + { 2662, 2671 }, + { 2790, 2799 }, + { 2918, 2927 }, + { 2930, 2935 }, + { 3046, 3058 }, + { 3174, 3183 }, + { 3192, 3198 }, + { 3302, 3311 }, + { 3430, 3445 }, + { 3664, 3673 }, + { 3792, 3801 }, + { 3872, 3891 }, + { 4160, 4169 }, + { 4240, 4249 }, + { 4969, 4988 }, + { 5870, 5872 }, + { 6112, 6121 }, + { 6128, 6137 }, + { 6160, 6169 }, + { 6470, 6479 }, + { 6608, 6618 }, + { 6784, 6793 }, + { 6800, 6809 }, + { 6992, 7001 }, + { 7088, 7097 }, + { 7232, 7241 }, + { 7248, 7257 }, + { 8304, 8304 }, + { 8308, 8313 }, + { 8320, 8329 }, + { 8528, 8578 }, + { 8581, 8585 }, + { 9312, 9371 }, + { 9450, 9471 }, + { 10102, 10131 }, + { 11517, 11517 }, + { 12295, 12295 }, + { 12321, 12329 }, + { 12344, 12346 }, + { 12690, 12693 }, + { 12832, 12841 }, + { 12872, 12879 }, + { 12881, 12895 }, + { 12928, 12937 }, + { 12977, 12991 }, + { 42528, 42537 }, + { 42726, 42735 }, + { 43056, 43061 }, + { 43216, 43225 }, + { 43264, 43273 }, + { 43472, 43481 }, + { 43600, 43609 }, + { 44016, 44025 }, + { 65296, 65305 }, +}; +static const URange32 N_range32[] = { + { 65799, 65843 }, + { 65856, 65912 }, + { 65930, 65930 }, + { 66336, 66339 }, + { 66369, 66369 }, + { 66378, 66378 }, + { 66513, 66517 }, + { 66720, 66729 }, + { 67672, 67679 }, + { 67862, 67867 }, + { 68160, 68167 }, + { 68221, 68222 }, + { 68440, 68447 }, + { 68472, 68479 }, + { 69216, 
69246 }, + { 69714, 69743 }, + { 69872, 69881 }, + { 69942, 69951 }, + { 70096, 70105 }, + { 71360, 71369 }, + { 74752, 74850 }, + { 119648, 119665 }, + { 120782, 120831 }, + { 127232, 127242 }, +}; +static const URange16 Sk_range16[] = { + { 94, 94 }, + { 96, 96 }, + { 168, 168 }, + { 175, 175 }, + { 180, 180 }, + { 184, 184 }, + { 706, 709 }, + { 722, 735 }, + { 741, 747 }, + { 749, 749 }, + { 751, 767 }, + { 885, 885 }, + { 900, 901 }, + { 8125, 8125 }, + { 8127, 8129 }, + { 8141, 8143 }, + { 8157, 8159 }, + { 8173, 8175 }, + { 8189, 8190 }, + { 12443, 12444 }, + { 42752, 42774 }, + { 42784, 42785 }, + { 42889, 42890 }, + { 64434, 64449 }, + { 65342, 65342 }, + { 65344, 65344 }, + { 65507, 65507 }, +}; +static const URange16 P_range16[] = { + { 33, 35 }, + { 37, 42 }, + { 44, 47 }, + { 58, 59 }, + { 63, 64 }, + { 91, 93 }, + { 95, 95 }, + { 123, 123 }, + { 125, 125 }, + { 161, 161 }, + { 167, 167 }, + { 171, 171 }, + { 182, 183 }, + { 187, 187 }, + { 191, 191 }, + { 894, 894 }, + { 903, 903 }, + { 1370, 1375 }, + { 1417, 1418 }, + { 1470, 1470 }, + { 1472, 1472 }, + { 1475, 1475 }, + { 1478, 1478 }, + { 1523, 1524 }, + { 1545, 1546 }, + { 1548, 1549 }, + { 1563, 1563 }, + { 1566, 1567 }, + { 1642, 1645 }, + { 1748, 1748 }, + { 1792, 1805 }, + { 2039, 2041 }, + { 2096, 2110 }, + { 2142, 2142 }, + { 2404, 2405 }, + { 2416, 2416 }, + { 2800, 2800 }, + { 3572, 3572 }, + { 3663, 3663 }, + { 3674, 3675 }, + { 3844, 3858 }, + { 3860, 3860 }, + { 3898, 3901 }, + { 3973, 3973 }, + { 4048, 4052 }, + { 4057, 4058 }, + { 4170, 4175 }, + { 4347, 4347 }, + { 4960, 4968 }, + { 5120, 5120 }, + { 5741, 5742 }, + { 5787, 5788 }, + { 5867, 5869 }, + { 5941, 5942 }, + { 6100, 6102 }, + { 6104, 6106 }, + { 6144, 6154 }, + { 6468, 6469 }, + { 6686, 6687 }, + { 6816, 6822 }, + { 6824, 6829 }, + { 7002, 7008 }, + { 7164, 7167 }, + { 7227, 7231 }, + { 7294, 7295 }, + { 7360, 7367 }, + { 7379, 7379 }, + { 8208, 8231 }, + { 8240, 8259 }, + { 8261, 8273 }, + { 8275, 8286 }, + { 8317, 8318 
}, + { 8333, 8334 }, + { 8968, 8971 }, + { 9001, 9002 }, + { 10088, 10101 }, + { 10181, 10182 }, + { 10214, 10223 }, + { 10627, 10648 }, + { 10712, 10715 }, + { 10748, 10749 }, + { 11513, 11516 }, + { 11518, 11519 }, + { 11632, 11632 }, + { 11776, 11822 }, + { 11824, 11835 }, + { 12289, 12291 }, + { 12296, 12305 }, + { 12308, 12319 }, + { 12336, 12336 }, + { 12349, 12349 }, + { 12448, 12448 }, + { 12539, 12539 }, + { 42238, 42239 }, + { 42509, 42511 }, + { 42611, 42611 }, + { 42622, 42622 }, + { 42738, 42743 }, + { 43124, 43127 }, + { 43214, 43215 }, + { 43256, 43258 }, + { 43310, 43311 }, + { 43359, 43359 }, + { 43457, 43469 }, + { 43486, 43487 }, + { 43612, 43615 }, + { 43742, 43743 }, + { 43760, 43761 }, + { 44011, 44011 }, + { 64830, 64831 }, + { 65040, 65049 }, + { 65072, 65106 }, + { 65108, 65121 }, + { 65123, 65123 }, + { 65128, 65128 }, + { 65130, 65131 }, + { 65281, 65283 }, + { 65285, 65290 }, + { 65292, 65295 }, + { 65306, 65307 }, + { 65311, 65312 }, + { 65339, 65341 }, + { 65343, 65343 }, + { 65371, 65371 }, + { 65373, 65373 }, + { 65375, 65381 }, +}; +static const URange32 P_range32[] = { + { 65792, 65794 }, + { 66463, 66463 }, + { 66512, 66512 }, + { 67671, 67671 }, + { 67871, 67871 }, + { 67903, 67903 }, + { 68176, 68184 }, + { 68223, 68223 }, + { 68409, 68415 }, + { 69703, 69709 }, + { 69819, 69820 }, + { 69822, 69825 }, + { 69952, 69955 }, + { 70085, 70088 }, + { 74864, 74867 }, +}; +static const URange16 S_range16[] = { + { 36, 36 }, + { 43, 43 }, + { 60, 62 }, + { 94, 94 }, + { 96, 96 }, + { 124, 124 }, + { 126, 126 }, + { 162, 166 }, + { 168, 169 }, + { 172, 172 }, + { 174, 177 }, + { 180, 180 }, + { 184, 184 }, + { 215, 215 }, + { 247, 247 }, + { 706, 709 }, + { 722, 735 }, + { 741, 747 }, + { 749, 749 }, + { 751, 767 }, + { 885, 885 }, + { 900, 901 }, + { 1014, 1014 }, + { 1154, 1154 }, + { 1423, 1423 }, + { 1542, 1544 }, + { 1547, 1547 }, + { 1550, 1551 }, + { 1758, 1758 }, + { 1769, 1769 }, + { 1789, 1790 }, + { 2038, 2038 }, + { 2546, 2547 
}, + { 2554, 2555 }, + { 2801, 2801 }, + { 2928, 2928 }, + { 3059, 3066 }, + { 3199, 3199 }, + { 3449, 3449 }, + { 3647, 3647 }, + { 3841, 3843 }, + { 3859, 3859 }, + { 3861, 3863 }, + { 3866, 3871 }, + { 3892, 3892 }, + { 3894, 3894 }, + { 3896, 3896 }, + { 4030, 4037 }, + { 4039, 4044 }, + { 4046, 4047 }, + { 4053, 4056 }, + { 4254, 4255 }, + { 5008, 5017 }, + { 6107, 6107 }, + { 6464, 6464 }, + { 6622, 6655 }, + { 7009, 7018 }, + { 7028, 7036 }, + { 8125, 8125 }, + { 8127, 8129 }, + { 8141, 8143 }, + { 8157, 8159 }, + { 8173, 8175 }, + { 8189, 8190 }, + { 8260, 8260 }, + { 8274, 8274 }, + { 8314, 8316 }, + { 8330, 8332 }, + { 8352, 8378 }, + { 8448, 8449 }, + { 8451, 8454 }, + { 8456, 8457 }, + { 8468, 8468 }, + { 8470, 8472 }, + { 8478, 8483 }, + { 8485, 8485 }, + { 8487, 8487 }, + { 8489, 8489 }, + { 8494, 8494 }, + { 8506, 8507 }, + { 8512, 8516 }, + { 8522, 8525 }, + { 8527, 8527 }, + { 8592, 8967 }, + { 8972, 9000 }, + { 9003, 9203 }, + { 9216, 9254 }, + { 9280, 9290 }, + { 9372, 9449 }, + { 9472, 9983 }, + { 9985, 10087 }, + { 10132, 10180 }, + { 10183, 10213 }, + { 10224, 10626 }, + { 10649, 10711 }, + { 10716, 10747 }, + { 10750, 11084 }, + { 11088, 11097 }, + { 11493, 11498 }, + { 11904, 11929 }, + { 11931, 12019 }, + { 12032, 12245 }, + { 12272, 12283 }, + { 12292, 12292 }, + { 12306, 12307 }, + { 12320, 12320 }, + { 12342, 12343 }, + { 12350, 12351 }, + { 12443, 12444 }, + { 12688, 12689 }, + { 12694, 12703 }, + { 12736, 12771 }, + { 12800, 12830 }, + { 12842, 12871 }, + { 12880, 12880 }, + { 12896, 12927 }, + { 12938, 12976 }, + { 12992, 13054 }, + { 13056, 13311 }, + { 19904, 19967 }, + { 42128, 42182 }, + { 42752, 42774 }, + { 42784, 42785 }, + { 42889, 42890 }, + { 43048, 43051 }, + { 43062, 43065 }, + { 43639, 43641 }, + { 64297, 64297 }, + { 64434, 64449 }, + { 65020, 65021 }, + { 65122, 65122 }, + { 65124, 65126 }, + { 65129, 65129 }, + { 65284, 65284 }, + { 65291, 65291 }, + { 65308, 65310 }, + { 65342, 65342 }, + { 65344, 65344 }, + { 65372, 
65372 }, + { 65374, 65374 }, + { 65504, 65510 }, + { 65512, 65518 }, + { 65532, 65533 }, +}; +static const URange32 S_range32[] = { + { 65847, 65855 }, + { 65913, 65929 }, + { 65936, 65947 }, + { 66000, 66044 }, + { 118784, 119029 }, + { 119040, 119078 }, + { 119081, 119140 }, + { 119146, 119148 }, + { 119171, 119172 }, + { 119180, 119209 }, + { 119214, 119261 }, + { 119296, 119361 }, + { 119365, 119365 }, + { 119552, 119638 }, + { 120513, 120513 }, + { 120539, 120539 }, + { 120571, 120571 }, + { 120597, 120597 }, + { 120629, 120629 }, + { 120655, 120655 }, + { 120687, 120687 }, + { 120713, 120713 }, + { 120745, 120745 }, + { 120771, 120771 }, + { 126704, 126705 }, + { 126976, 127019 }, + { 127024, 127123 }, + { 127136, 127150 }, + { 127153, 127166 }, + { 127169, 127183 }, + { 127185, 127199 }, + { 127248, 127278 }, + { 127280, 127339 }, + { 127344, 127386 }, + { 127462, 127490 }, + { 127504, 127546 }, + { 127552, 127560 }, + { 127568, 127569 }, + { 127744, 127776 }, + { 127792, 127797 }, + { 127799, 127868 }, + { 127872, 127891 }, + { 127904, 127940 }, + { 127942, 127946 }, + { 127968, 127984 }, + { 128000, 128062 }, + { 128064, 128064 }, + { 128066, 128247 }, + { 128249, 128252 }, + { 128256, 128317 }, + { 128320, 128323 }, + { 128336, 128359 }, + { 128507, 128576 }, + { 128581, 128591 }, + { 128640, 128709 }, + { 128768, 128883 }, +}; +static const URange16 So_range16[] = { + { 166, 166 }, + { 169, 169 }, + { 174, 174 }, + { 176, 176 }, + { 1154, 1154 }, + { 1550, 1551 }, + { 1758, 1758 }, + { 1769, 1769 }, + { 1789, 1790 }, + { 2038, 2038 }, + { 2554, 2554 }, + { 2928, 2928 }, + { 3059, 3064 }, + { 3066, 3066 }, + { 3199, 3199 }, + { 3449, 3449 }, + { 3841, 3843 }, + { 3859, 3859 }, + { 3861, 3863 }, + { 3866, 3871 }, + { 3892, 3892 }, + { 3894, 3894 }, + { 3896, 3896 }, + { 4030, 4037 }, + { 4039, 4044 }, + { 4046, 4047 }, + { 4053, 4056 }, + { 4254, 4255 }, + { 5008, 5017 }, + { 6464, 6464 }, + { 6622, 6655 }, + { 7009, 7018 }, + { 7028, 7036 }, + { 8448, 
8449 }, + { 8451, 8454 }, + { 8456, 8457 }, + { 8468, 8468 }, + { 8470, 8471 }, + { 8478, 8483 }, + { 8485, 8485 }, + { 8487, 8487 }, + { 8489, 8489 }, + { 8494, 8494 }, + { 8506, 8507 }, + { 8522, 8522 }, + { 8524, 8525 }, + { 8527, 8527 }, + { 8597, 8601 }, + { 8604, 8607 }, + { 8609, 8610 }, + { 8612, 8613 }, + { 8615, 8621 }, + { 8623, 8653 }, + { 8656, 8657 }, + { 8659, 8659 }, + { 8661, 8691 }, + { 8960, 8967 }, + { 8972, 8991 }, + { 8994, 9000 }, + { 9003, 9083 }, + { 9085, 9114 }, + { 9140, 9179 }, + { 9186, 9203 }, + { 9216, 9254 }, + { 9280, 9290 }, + { 9372, 9449 }, + { 9472, 9654 }, + { 9656, 9664 }, + { 9666, 9719 }, + { 9728, 9838 }, + { 9840, 9983 }, + { 9985, 10087 }, + { 10132, 10175 }, + { 10240, 10495 }, + { 11008, 11055 }, + { 11077, 11078 }, + { 11088, 11097 }, + { 11493, 11498 }, + { 11904, 11929 }, + { 11931, 12019 }, + { 12032, 12245 }, + { 12272, 12283 }, + { 12292, 12292 }, + { 12306, 12307 }, + { 12320, 12320 }, + { 12342, 12343 }, + { 12350, 12351 }, + { 12688, 12689 }, + { 12694, 12703 }, + { 12736, 12771 }, + { 12800, 12830 }, + { 12842, 12871 }, + { 12880, 12880 }, + { 12896, 12927 }, + { 12938, 12976 }, + { 12992, 13054 }, + { 13056, 13311 }, + { 19904, 19967 }, + { 42128, 42182 }, + { 43048, 43051 }, + { 43062, 43063 }, + { 43065, 43065 }, + { 43639, 43641 }, + { 65021, 65021 }, + { 65508, 65508 }, + { 65512, 65512 }, + { 65517, 65518 }, + { 65532, 65533 }, +}; +static const URange32 So_range32[] = { + { 65847, 65855 }, + { 65913, 65929 }, + { 65936, 65947 }, + { 66000, 66044 }, + { 118784, 119029 }, + { 119040, 119078 }, + { 119081, 119140 }, + { 119146, 119148 }, + { 119171, 119172 }, + { 119180, 119209 }, + { 119214, 119261 }, + { 119296, 119361 }, + { 119365, 119365 }, + { 119552, 119638 }, + { 126976, 127019 }, + { 127024, 127123 }, + { 127136, 127150 }, + { 127153, 127166 }, + { 127169, 127183 }, + { 127185, 127199 }, + { 127248, 127278 }, + { 127280, 127339 }, + { 127344, 127386 }, + { 127462, 127490 }, + { 127504, 127546 }, 
+ { 127552, 127560 }, + { 127568, 127569 }, + { 127744, 127776 }, + { 127792, 127797 }, + { 127799, 127868 }, + { 127872, 127891 }, + { 127904, 127940 }, + { 127942, 127946 }, + { 127968, 127984 }, + { 128000, 128062 }, + { 128064, 128064 }, + { 128066, 128247 }, + { 128249, 128252 }, + { 128256, 128317 }, + { 128320, 128323 }, + { 128336, 128359 }, + { 128507, 128576 }, + { 128581, 128591 }, + { 128640, 128709 }, + { 128768, 128883 }, +}; +static const URange16 Sm_range16[] = { + { 43, 43 }, + { 60, 62 }, + { 124, 124 }, + { 126, 126 }, + { 172, 172 }, + { 177, 177 }, + { 215, 215 }, + { 247, 247 }, + { 1014, 1014 }, + { 1542, 1544 }, + { 8260, 8260 }, + { 8274, 8274 }, + { 8314, 8316 }, + { 8330, 8332 }, + { 8472, 8472 }, + { 8512, 8516 }, + { 8523, 8523 }, + { 8592, 8596 }, + { 8602, 8603 }, + { 8608, 8608 }, + { 8611, 8611 }, + { 8614, 8614 }, + { 8622, 8622 }, + { 8654, 8655 }, + { 8658, 8658 }, + { 8660, 8660 }, + { 8692, 8959 }, + { 8992, 8993 }, + { 9084, 9084 }, + { 9115, 9139 }, + { 9180, 9185 }, + { 9655, 9655 }, + { 9665, 9665 }, + { 9720, 9727 }, + { 9839, 9839 }, + { 10176, 10180 }, + { 10183, 10213 }, + { 10224, 10239 }, + { 10496, 10626 }, + { 10649, 10711 }, + { 10716, 10747 }, + { 10750, 11007 }, + { 11056, 11076 }, + { 11079, 11084 }, + { 64297, 64297 }, + { 65122, 65122 }, + { 65124, 65126 }, + { 65291, 65291 }, + { 65308, 65310 }, + { 65372, 65372 }, + { 65374, 65374 }, + { 65506, 65506 }, + { 65513, 65516 }, +}; +static const URange32 Sm_range32[] = { + { 120513, 120513 }, + { 120539, 120539 }, + { 120571, 120571 }, + { 120597, 120597 }, + { 120629, 120629 }, + { 120655, 120655 }, + { 120687, 120687 }, + { 120713, 120713 }, + { 120745, 120745 }, + { 120771, 120771 }, + { 126704, 126705 }, +}; +static const URange16 Sc_range16[] = { + { 36, 36 }, + { 162, 165 }, + { 1423, 1423 }, + { 1547, 1547 }, + { 2546, 2547 }, + { 2555, 2555 }, + { 2801, 2801 }, + { 3065, 3065 }, + { 3647, 3647 }, + { 6107, 6107 }, + { 8352, 8378 }, + { 43064, 43064 }, + { 
65020, 65020 }, + { 65129, 65129 }, + { 65284, 65284 }, + { 65504, 65505 }, + { 65509, 65510 }, +}; +static const URange16 Z_range16[] = { + { 32, 32 }, + { 160, 160 }, + { 5760, 5760 }, + { 8192, 8202 }, + { 8232, 8233 }, + { 8239, 8239 }, + { 8287, 8287 }, + { 12288, 12288 }, +}; +static const URange16 Zl_range16[] = { + { 8232, 8232 }, +}; +static const URange16 Co_range16[] = { + { 57344, 63743 }, +}; +static const URange32 Co_range32[] = { + { 983040, 1048573 }, + { 1048576, 1114109 }, +}; +static const URange16 Cc_range16[] = { + { 0, 31 }, + { 127, 159 }, +}; +static const URange16 Cf_range16[] = { + { 173, 173 }, + { 1536, 1540 }, + { 1564, 1564 }, + { 1757, 1757 }, + { 1807, 1807 }, + { 6158, 6158 }, + { 8203, 8207 }, + { 8234, 8238 }, + { 8288, 8292 }, + { 8294, 8303 }, + { 65279, 65279 }, + { 65529, 65531 }, +}; +static const URange32 Cf_range32[] = { + { 69821, 69821 }, + { 119155, 119162 }, + { 917505, 917505 }, + { 917536, 917631 }, +}; +static const URange16 Cs_range16[] = { + { 55296, 57343 }, +}; +static const URange16 Zp_range16[] = { + { 8233, 8233 }, +}; +static const URange16 Zs_range16[] = { + { 32, 32 }, + { 160, 160 }, + { 5760, 5760 }, + { 8192, 8202 }, + { 8239, 8239 }, + { 8287, 8287 }, + { 12288, 12288 }, +}; +static const URange16 Thaana_range16[] = { + { 1920, 1969 }, +}; +static const URange16 Telugu_range16[] = { + { 3073, 3075 }, + { 3077, 3084 }, + { 3086, 3088 }, + { 3090, 3112 }, + { 3114, 3123 }, + { 3125, 3129 }, + { 3133, 3140 }, + { 3142, 3144 }, + { 3146, 3149 }, + { 3157, 3158 }, + { 3160, 3161 }, + { 3168, 3171 }, + { 3174, 3183 }, + { 3192, 3199 }, +}; +static const URange16 Cyrillic_range16[] = { + { 1024, 1156 }, + { 1159, 1319 }, + { 7467, 7467 }, + { 7544, 7544 }, + { 11744, 11775 }, + { 42560, 42647 }, + { 42655, 42655 }, +}; +static const URange16 Hangul_range16[] = { + { 4352, 4607 }, + { 12334, 12335 }, + { 12593, 12686 }, + { 12800, 12830 }, + { 12896, 12926 }, + { 43360, 43388 }, + { 44032, 55203 }, + { 55216, 
55238 }, + { 55243, 55291 }, + { 65440, 65470 }, + { 65474, 65479 }, + { 65482, 65487 }, + { 65490, 65495 }, + { 65498, 65500 }, +}; +static const URange32 Old_South_Arabian_range32[] = { + { 68192, 68223 }, +}; +static const URange16 Ethiopic_range16[] = { + { 4608, 4680 }, + { 4682, 4685 }, + { 4688, 4694 }, + { 4696, 4696 }, + { 4698, 4701 }, + { 4704, 4744 }, + { 4746, 4749 }, + { 4752, 4784 }, + { 4786, 4789 }, + { 4792, 4798 }, + { 4800, 4800 }, + { 4802, 4805 }, + { 4808, 4822 }, + { 4824, 4880 }, + { 4882, 4885 }, + { 4888, 4954 }, + { 4957, 4988 }, + { 4992, 5017 }, + { 11648, 11670 }, + { 11680, 11686 }, + { 11688, 11694 }, + { 11696, 11702 }, + { 11704, 11710 }, + { 11712, 11718 }, + { 11720, 11726 }, + { 11728, 11734 }, + { 11736, 11742 }, + { 43777, 43782 }, + { 43785, 43790 }, + { 43793, 43798 }, + { 43808, 43814 }, + { 43816, 43822 }, +}; +static const URange16 Inherited_range16[] = { + { 768, 879 }, + { 1157, 1158 }, + { 1611, 1621 }, + { 1648, 1648 }, + { 2385, 2386 }, + { 7376, 7378 }, + { 7380, 7392 }, + { 7394, 7400 }, + { 7405, 7405 }, + { 7412, 7412 }, + { 7616, 7654 }, + { 7676, 7679 }, + { 8204, 8205 }, + { 8400, 8432 }, + { 12330, 12333 }, + { 12441, 12442 }, + { 65024, 65039 }, + { 65056, 65062 }, +}; +static const URange32 Inherited_range32[] = { + { 66045, 66045 }, + { 119143, 119145 }, + { 119163, 119170 }, + { 119173, 119179 }, + { 119210, 119213 }, + { 917760, 917999 }, +}; +static const URange32 Meroitic_Cursive_range32[] = { + { 68000, 68023 }, + { 68030, 68031 }, +}; +static const URange16 Han_range16[] = { + { 11904, 11929 }, + { 11931, 12019 }, + { 12032, 12245 }, + { 12293, 12293 }, + { 12295, 12295 }, + { 12321, 12329 }, + { 12344, 12347 }, + { 13312, 19893 }, + { 19968, 40908 }, + { 63744, 64109 }, + { 64112, 64217 }, +}; +static const URange32 Han_range32[] = { + { 131072, 173782 }, + { 173824, 177972 }, + { 177984, 178205 }, + { 194560, 195101 }, +}; +static const URange16 Armenian_range16[] = { + { 1329, 1366 }, + { 1369, 
1375 }, + { 1377, 1415 }, + { 1418, 1418 }, + { 1423, 1423 }, + { 64275, 64279 }, +}; +static const URange16 Tamil_range16[] = { + { 2946, 2947 }, + { 2949, 2954 }, + { 2958, 2960 }, + { 2962, 2965 }, + { 2969, 2970 }, + { 2972, 2972 }, + { 2974, 2975 }, + { 2979, 2980 }, + { 2984, 2986 }, + { 2990, 3001 }, + { 3006, 3010 }, + { 3014, 3016 }, + { 3018, 3021 }, + { 3024, 3024 }, + { 3031, 3031 }, + { 3046, 3066 }, +}; +static const URange16 Bopomofo_range16[] = { + { 746, 747 }, + { 12549, 12589 }, + { 12704, 12730 }, +}; +static const URange16 Sundanese_range16[] = { + { 7040, 7103 }, + { 7360, 7367 }, +}; +static const URange16 Tagalog_range16[] = { + { 5888, 5900 }, + { 5902, 5908 }, +}; +static const URange16 Malayalam_range16[] = { + { 3330, 3331 }, + { 3333, 3340 }, + { 3342, 3344 }, + { 3346, 3386 }, + { 3389, 3396 }, + { 3398, 3400 }, + { 3402, 3406 }, + { 3415, 3415 }, + { 3424, 3427 }, + { 3430, 3445 }, + { 3449, 3455 }, +}; +static const URange32 Carian_range32[] = { + { 66208, 66256 }, +}; +static const URange16 Hiragana_range16[] = { + { 12353, 12438 }, + { 12445, 12447 }, +}; +static const URange32 Hiragana_range32[] = { + { 110593, 110593 }, + { 127488, 127488 }, +}; +static const URange16 Tagbanwa_range16[] = { + { 5984, 5996 }, + { 5998, 6000 }, + { 6002, 6003 }, +}; +static const URange16 Meetei_Mayek_range16[] = { + { 43744, 43766 }, + { 43968, 44013 }, + { 44016, 44025 }, +}; +static const URange16 Tai_Le_range16[] = { + { 6480, 6509 }, + { 6512, 6516 }, +}; +static const URange16 Kayah_Li_range16[] = { + { 43264, 43311 }, +}; +static const URange16 Buginese_range16[] = { + { 6656, 6683 }, + { 6686, 6687 }, +}; +static const URange32 Kharoshthi_range32[] = { + { 68096, 68099 }, + { 68101, 68102 }, + { 68108, 68115 }, + { 68117, 68119 }, + { 68121, 68147 }, + { 68152, 68154 }, + { 68159, 68167 }, + { 68176, 68184 }, +}; +static const URange16 Tai_Tham_range16[] = { + { 6688, 6750 }, + { 6752, 6780 }, + { 6783, 6793 }, + { 6800, 6809 }, + { 6816, 
6829 }, +}; +static const URange32 Old_Italic_range32[] = { + { 66304, 66334 }, + { 66336, 66339 }, +}; +static const URange32 Old_Persian_range32[] = { + { 66464, 66499 }, + { 66504, 66517 }, +}; +static const URange16 Latin_range16[] = { + { 65, 90 }, + { 97, 122 }, + { 170, 170 }, + { 186, 186 }, + { 192, 214 }, + { 216, 246 }, + { 248, 696 }, + { 736, 740 }, + { 7424, 7461 }, + { 7468, 7516 }, + { 7522, 7525 }, + { 7531, 7543 }, + { 7545, 7614 }, + { 7680, 7935 }, + { 8305, 8305 }, + { 8319, 8319 }, + { 8336, 8348 }, + { 8490, 8491 }, + { 8498, 8498 }, + { 8526, 8526 }, + { 8544, 8584 }, + { 11360, 11391 }, + { 42786, 42887 }, + { 42891, 42894 }, + { 42896, 42899 }, + { 42912, 42922 }, + { 43000, 43007 }, + { 64256, 64262 }, + { 65313, 65338 }, + { 65345, 65370 }, +}; +static const URange16 Saurashtra_range16[] = { + { 43136, 43204 }, + { 43214, 43225 }, +}; +static const URange32 Shavian_range32[] = { + { 66640, 66687 }, +}; +static const URange16 Georgian_range16[] = { + { 4256, 4293 }, + { 4295, 4295 }, + { 4301, 4301 }, + { 4304, 4346 }, + { 4348, 4351 }, + { 11520, 11557 }, + { 11559, 11559 }, + { 11565, 11565 }, +}; +static const URange16 Batak_range16[] = { + { 7104, 7155 }, + { 7164, 7167 }, +}; +static const URange16 Devanagari_range16[] = { + { 2304, 2384 }, + { 2387, 2403 }, + { 2406, 2423 }, + { 2425, 2431 }, + { 43232, 43259 }, +}; +static const URange16 Thai_range16[] = { + { 3585, 3642 }, + { 3648, 3675 }, +}; +static const URange16 Tibetan_range16[] = { + { 3840, 3911 }, + { 3913, 3948 }, + { 3953, 3991 }, + { 3993, 4028 }, + { 4030, 4044 }, + { 4046, 4052 }, + { 4057, 4058 }, +}; +static const URange16 Tifinagh_range16[] = { + { 11568, 11623 }, + { 11631, 11632 }, + { 11647, 11647 }, +}; +static const URange32 Ugaritic_range32[] = { + { 66432, 66461 }, + { 66463, 66463 }, +}; +static const URange16 Braille_range16[] = { + { 10240, 10495 }, +}; +static const URange16 Greek_range16[] = { + { 880, 883 }, + { 885, 887 }, + { 890, 893 }, + { 900, 
900 }, + { 902, 902 }, + { 904, 906 }, + { 908, 908 }, + { 910, 929 }, + { 931, 993 }, + { 1008, 1023 }, + { 7462, 7466 }, + { 7517, 7521 }, + { 7526, 7530 }, + { 7615, 7615 }, + { 7936, 7957 }, + { 7960, 7965 }, + { 7968, 8005 }, + { 8008, 8013 }, + { 8016, 8023 }, + { 8025, 8025 }, + { 8027, 8027 }, + { 8029, 8029 }, + { 8031, 8061 }, + { 8064, 8116 }, + { 8118, 8132 }, + { 8134, 8147 }, + { 8150, 8155 }, + { 8157, 8175 }, + { 8178, 8180 }, + { 8182, 8190 }, + { 8486, 8486 }, +}; +static const URange32 Greek_range32[] = { + { 65856, 65930 }, + { 119296, 119365 }, +}; +static const URange32 Lycian_range32[] = { + { 66176, 66204 }, +}; +static const URange16 Tai_Viet_range16[] = { + { 43648, 43714 }, + { 43739, 43743 }, +}; +static const URange16 Vai_range16[] = { + { 42240, 42539 }, +}; +static const URange16 Ogham_range16[] = { + { 5760, 5788 }, +}; +static const URange32 Inscriptional_Parthian_range32[] = { + { 68416, 68437 }, + { 68440, 68447 }, +}; +static const URange16 Cham_range16[] = { + { 43520, 43574 }, + { 43584, 43597 }, + { 43600, 43609 }, + { 43612, 43615 }, +}; +static const URange16 Syriac_range16[] = { + { 1792, 1805 }, + { 1807, 1866 }, + { 1869, 1871 }, +}; +static const URange16 Runic_range16[] = { + { 5792, 5866 }, + { 5870, 5872 }, +}; +static const URange32 Gothic_range32[] = { + { 66352, 66378 }, +}; +static const URange16 Katakana_range16[] = { + { 12449, 12538 }, + { 12541, 12543 }, + { 12784, 12799 }, + { 13008, 13054 }, + { 13056, 13143 }, + { 65382, 65391 }, + { 65393, 65437 }, +}; +static const URange32 Katakana_range32[] = { + { 110592, 110592 }, +}; +static const URange32 Osmanya_range32[] = { + { 66688, 66717 }, + { 66720, 66729 }, +}; +static const URange16 New_Tai_Lue_range16[] = { + { 6528, 6571 }, + { 6576, 6601 }, + { 6608, 6618 }, + { 6622, 6623 }, +}; +static const URange16 Ol_Chiki_range16[] = { + { 7248, 7295 }, +}; +static const URange16 Limbu_range16[] = { + { 6400, 6428 }, + { 6432, 6443 }, + { 6448, 6459 }, + { 6464, 
6464 }, + { 6468, 6479 }, +}; +static const URange16 Cherokee_range16[] = { + { 5024, 5108 }, +}; +static const URange32 Miao_range32[] = { + { 93952, 94020 }, + { 94032, 94078 }, + { 94095, 94111 }, +}; +static const URange16 Oriya_range16[] = { + { 2817, 2819 }, + { 2821, 2828 }, + { 2831, 2832 }, + { 2835, 2856 }, + { 2858, 2864 }, + { 2866, 2867 }, + { 2869, 2873 }, + { 2876, 2884 }, + { 2887, 2888 }, + { 2891, 2893 }, + { 2902, 2903 }, + { 2908, 2909 }, + { 2911, 2915 }, + { 2918, 2935 }, +}; +static const URange32 Sharada_range32[] = { + { 70016, 70088 }, + { 70096, 70105 }, +}; +static const URange16 Gujarati_range16[] = { + { 2689, 2691 }, + { 2693, 2701 }, + { 2703, 2705 }, + { 2707, 2728 }, + { 2730, 2736 }, + { 2738, 2739 }, + { 2741, 2745 }, + { 2748, 2757 }, + { 2759, 2761 }, + { 2763, 2765 }, + { 2768, 2768 }, + { 2784, 2787 }, + { 2790, 2801 }, +}; +static const URange32 Inscriptional_Pahlavi_range32[] = { + { 68448, 68466 }, + { 68472, 68479 }, +}; +static const URange16 Khmer_range16[] = { + { 6016, 6109 }, + { 6112, 6121 }, + { 6128, 6137 }, + { 6624, 6655 }, +}; +static const URange32 Cuneiform_range32[] = { + { 73728, 74606 }, + { 74752, 74850 }, + { 74864, 74867 }, +}; +static const URange16 Mandaic_range16[] = { + { 2112, 2139 }, + { 2142, 2142 }, +}; +static const URange16 Syloti_Nagri_range16[] = { + { 43008, 43051 }, +}; +static const URange16 Nko_range16[] = { + { 1984, 2042 }, +}; +static const URange16 Canadian_Aboriginal_range16[] = { + { 5120, 5759 }, + { 6320, 6389 }, +}; +static const URange32 Meroitic_Hieroglyphs_range32[] = { + { 67968, 67999 }, +}; +static const URange32 Phoenician_range32[] = { + { 67840, 67867 }, + { 67871, 67871 }, +}; +static const URange16 Bengali_range16[] = { + { 2433, 2435 }, + { 2437, 2444 }, + { 2447, 2448 }, + { 2451, 2472 }, + { 2474, 2480 }, + { 2482, 2482 }, + { 2486, 2489 }, + { 2492, 2500 }, + { 2503, 2504 }, + { 2507, 2510 }, + { 2519, 2519 }, + { 2524, 2525 }, + { 2527, 2531 }, + { 2534, 2555 }, 
+}; +static const URange32 Kaithi_range32[] = { + { 69760, 69825 }, +}; +static const URange16 Glagolitic_range16[] = { + { 11264, 11310 }, + { 11312, 11358 }, +}; +static const URange32 Imperial_Aramaic_range32[] = { + { 67648, 67669 }, + { 67671, 67679 }, +}; +static const URange32 Sora_Sompeng_range32[] = { + { 69840, 69864 }, + { 69872, 69881 }, +}; +static const URange16 Gurmukhi_range16[] = { + { 2561, 2563 }, + { 2565, 2570 }, + { 2575, 2576 }, + { 2579, 2600 }, + { 2602, 2608 }, + { 2610, 2611 }, + { 2613, 2614 }, + { 2616, 2617 }, + { 2620, 2620 }, + { 2622, 2626 }, + { 2631, 2632 }, + { 2635, 2637 }, + { 2641, 2641 }, + { 2649, 2652 }, + { 2654, 2654 }, + { 2662, 2677 }, +}; +static const URange16 Javanese_range16[] = { + { 43392, 43469 }, + { 43472, 43481 }, + { 43486, 43487 }, +}; +static const URange16 Phags_Pa_range16[] = { + { 43072, 43127 }, +}; +static const URange32 Cypriot_range32[] = { + { 67584, 67589 }, + { 67592, 67592 }, + { 67594, 67637 }, + { 67639, 67640 }, + { 67644, 67644 }, + { 67647, 67647 }, +}; +static const URange16 Kannada_range16[] = { + { 3202, 3203 }, + { 3205, 3212 }, + { 3214, 3216 }, + { 3218, 3240 }, + { 3242, 3251 }, + { 3253, 3257 }, + { 3260, 3268 }, + { 3270, 3272 }, + { 3274, 3277 }, + { 3285, 3286 }, + { 3294, 3294 }, + { 3296, 3299 }, + { 3302, 3311 }, + { 3313, 3314 }, +}; +static const URange16 Mongolian_range16[] = { + { 6144, 6145 }, + { 6148, 6148 }, + { 6150, 6158 }, + { 6160, 6169 }, + { 6176, 6263 }, + { 6272, 6314 }, +}; +static const URange16 Sinhala_range16[] = { + { 3458, 3459 }, + { 3461, 3478 }, + { 3482, 3505 }, + { 3507, 3515 }, + { 3517, 3517 }, + { 3520, 3526 }, + { 3530, 3530 }, + { 3535, 3540 }, + { 3542, 3542 }, + { 3544, 3551 }, + { 3570, 3572 }, +}; +static const URange32 Brahmi_range32[] = { + { 69632, 69709 }, + { 69714, 69743 }, +}; +static const URange32 Deseret_range32[] = { + { 66560, 66639 }, +}; +static const URange16 Rejang_range16[] = { + { 43312, 43347 }, + { 43359, 43359 }, +}; 
+static const URange16 Yi_range16[] = { + { 40960, 42124 }, + { 42128, 42182 }, +}; +static const URange16 Balinese_range16[] = { + { 6912, 6987 }, + { 6992, 7036 }, +}; +static const URange16 Lao_range16[] = { + { 3713, 3714 }, + { 3716, 3716 }, + { 3719, 3720 }, + { 3722, 3722 }, + { 3725, 3725 }, + { 3732, 3735 }, + { 3737, 3743 }, + { 3745, 3747 }, + { 3749, 3749 }, + { 3751, 3751 }, + { 3754, 3755 }, + { 3757, 3769 }, + { 3771, 3773 }, + { 3776, 3780 }, + { 3782, 3782 }, + { 3784, 3789 }, + { 3792, 3801 }, + { 3804, 3807 }, +}; +static const URange16 Hanunoo_range16[] = { + { 5920, 5940 }, +}; +static const URange32 Linear_B_range32[] = { + { 65536, 65547 }, + { 65549, 65574 }, + { 65576, 65594 }, + { 65596, 65597 }, + { 65599, 65613 }, + { 65616, 65629 }, + { 65664, 65786 }, +}; +static const URange32 Old_Turkic_range32[] = { + { 68608, 68680 }, +}; +static const URange16 Lepcha_range16[] = { + { 7168, 7223 }, + { 7227, 7241 }, + { 7245, 7247 }, +}; +static const URange32 Lydian_range32[] = { + { 67872, 67897 }, + { 67903, 67903 }, +}; +static const URange32 Egyptian_Hieroglyphs_range32[] = { + { 77824, 78894 }, +}; +static const URange16 Samaritan_range16[] = { + { 2048, 2093 }, + { 2096, 2110 }, +}; +static const URange16 Lisu_range16[] = { + { 42192, 42239 }, +}; +static const URange16 Buhid_range16[] = { + { 5952, 5971 }, +}; +static const URange16 Common_range16[] = { + { 0, 64 }, + { 91, 96 }, + { 123, 169 }, + { 171, 185 }, + { 187, 191 }, + { 215, 215 }, + { 247, 247 }, + { 697, 735 }, + { 741, 745 }, + { 748, 767 }, + { 884, 884 }, + { 894, 894 }, + { 901, 901 }, + { 903, 903 }, + { 1417, 1417 }, + { 1548, 1548 }, + { 1563, 1563 }, + { 1567, 1567 }, + { 1600, 1600 }, + { 1632, 1641 }, + { 1757, 1757 }, + { 2404, 2405 }, + { 3647, 3647 }, + { 4053, 4056 }, + { 4347, 4347 }, + { 5867, 5869 }, + { 5941, 5942 }, + { 6146, 6147 }, + { 6149, 6149 }, + { 7379, 7379 }, + { 7393, 7393 }, + { 7401, 7404 }, + { 7406, 7411 }, + { 7413, 7414 }, + { 8192, 8203 }, 
+ { 8206, 8292 }, + { 8294, 8304 }, + { 8308, 8318 }, + { 8320, 8334 }, + { 8352, 8378 }, + { 8448, 8485 }, + { 8487, 8489 }, + { 8492, 8497 }, + { 8499, 8525 }, + { 8527, 8543 }, + { 8585, 8585 }, + { 8592, 9203 }, + { 9216, 9254 }, + { 9280, 9290 }, + { 9312, 9983 }, + { 9985, 10239 }, + { 10496, 11084 }, + { 11088, 11097 }, + { 11776, 11835 }, + { 12272, 12283 }, + { 12288, 12292 }, + { 12294, 12294 }, + { 12296, 12320 }, + { 12336, 12343 }, + { 12348, 12351 }, + { 12443, 12444 }, + { 12448, 12448 }, + { 12539, 12540 }, + { 12688, 12703 }, + { 12736, 12771 }, + { 12832, 12895 }, + { 12927, 13007 }, + { 13144, 13311 }, + { 19904, 19967 }, + { 42752, 42785 }, + { 42888, 42890 }, + { 43056, 43065 }, + { 43471, 43471 }, + { 64830, 64831 }, + { 65021, 65021 }, + { 65040, 65049 }, + { 65072, 65106 }, + { 65108, 65126 }, + { 65128, 65131 }, + { 65279, 65279 }, + { 65281, 65312 }, + { 65339, 65344 }, + { 65371, 65381 }, + { 65392, 65392 }, + { 65438, 65439 }, + { 65504, 65510 }, + { 65512, 65518 }, + { 65529, 65533 }, +}; +static const URange32 Common_range32[] = { + { 65792, 65794 }, + { 65799, 65843 }, + { 65847, 65855 }, + { 65936, 65947 }, + { 66000, 66044 }, + { 118784, 119029 }, + { 119040, 119078 }, + { 119081, 119142 }, + { 119146, 119162 }, + { 119171, 119172 }, + { 119180, 119209 }, + { 119214, 119261 }, + { 119552, 119638 }, + { 119648, 119665 }, + { 119808, 119892 }, + { 119894, 119964 }, + { 119966, 119967 }, + { 119970, 119970 }, + { 119973, 119974 }, + { 119977, 119980 }, + { 119982, 119993 }, + { 119995, 119995 }, + { 119997, 120003 }, + { 120005, 120069 }, + { 120071, 120074 }, + { 120077, 120084 }, + { 120086, 120092 }, + { 120094, 120121 }, + { 120123, 120126 }, + { 120128, 120132 }, + { 120134, 120134 }, + { 120138, 120144 }, + { 120146, 120485 }, + { 120488, 120779 }, + { 120782, 120831 }, + { 126976, 127019 }, + { 127024, 127123 }, + { 127136, 127150 }, + { 127153, 127166 }, + { 127169, 127183 }, + { 127185, 127199 }, + { 127232, 127242 }, + { 
127248, 127278 }, + { 127280, 127339 }, + { 127344, 127386 }, + { 127462, 127487 }, + { 127489, 127490 }, + { 127504, 127546 }, + { 127552, 127560 }, + { 127568, 127569 }, + { 127744, 127776 }, + { 127792, 127797 }, + { 127799, 127868 }, + { 127872, 127891 }, + { 127904, 127940 }, + { 127942, 127946 }, + { 127968, 127984 }, + { 128000, 128062 }, + { 128064, 128064 }, + { 128066, 128247 }, + { 128249, 128252 }, + { 128256, 128317 }, + { 128320, 128323 }, + { 128336, 128359 }, + { 128507, 128576 }, + { 128581, 128591 }, + { 128640, 128709 }, + { 128768, 128883 }, + { 917505, 917505 }, + { 917536, 917631 }, +}; +static const URange16 Coptic_range16[] = { + { 994, 1007 }, + { 11392, 11507 }, + { 11513, 11519 }, +}; +static const URange32 Chakma_range32[] = { + { 69888, 69940 }, + { 69942, 69955 }, +}; +static const URange16 Arabic_range16[] = { + { 1536, 1540 }, + { 1542, 1547 }, + { 1549, 1562 }, + { 1564, 1564 }, + { 1566, 1566 }, + { 1568, 1599 }, + { 1601, 1610 }, + { 1622, 1631 }, + { 1642, 1647 }, + { 1649, 1756 }, + { 1758, 1791 }, + { 1872, 1919 }, + { 2208, 2208 }, + { 2210, 2220 }, + { 2276, 2302 }, + { 64336, 64449 }, + { 64467, 64829 }, + { 64848, 64911 }, + { 64914, 64967 }, + { 65008, 65020 }, + { 65136, 65140 }, + { 65142, 65276 }, +}; +static const URange32 Arabic_range32[] = { + { 69216, 69246 }, + { 126464, 126467 }, + { 126469, 126495 }, + { 126497, 126498 }, + { 126500, 126500 }, + { 126503, 126503 }, + { 126505, 126514 }, + { 126516, 126519 }, + { 126521, 126521 }, + { 126523, 126523 }, + { 126530, 126530 }, + { 126535, 126535 }, + { 126537, 126537 }, + { 126539, 126539 }, + { 126541, 126543 }, + { 126545, 126546 }, + { 126548, 126548 }, + { 126551, 126551 }, + { 126553, 126553 }, + { 126555, 126555 }, + { 126557, 126557 }, + { 126559, 126559 }, + { 126561, 126562 }, + { 126564, 126564 }, + { 126567, 126570 }, + { 126572, 126578 }, + { 126580, 126583 }, + { 126585, 126588 }, + { 126590, 126590 }, + { 126592, 126601 }, + { 126603, 126619 }, + { 
126625, 126627 }, + { 126629, 126633 }, + { 126635, 126651 }, + { 126704, 126705 }, +}; +static const URange16 Bamum_range16[] = { + { 42656, 42743 }, +}; +static const URange32 Bamum_range32[] = { + { 92160, 92728 }, +}; +static const URange16 Myanmar_range16[] = { + { 4096, 4255 }, + { 43616, 43643 }, +}; +static const URange32 Avestan_range32[] = { + { 68352, 68405 }, + { 68409, 68415 }, +}; +static const URange16 Hebrew_range16[] = { + { 1425, 1479 }, + { 1488, 1514 }, + { 1520, 1524 }, + { 64285, 64310 }, + { 64312, 64316 }, + { 64318, 64318 }, + { 64320, 64321 }, + { 64323, 64324 }, + { 64326, 64335 }, +}; +static const URange32 Takri_range32[] = { + { 71296, 71351 }, + { 71360, 71369 }, +}; +// 3867 16-bit ranges, 723 32-bit ranges +const UGroup unicode_groups[] = { + { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 35 }, + { "Armenian", +1, Armenian_range16, 6, 0, 0 }, + { "Avestan", +1, 0, 0, Avestan_range32, 2 }, + { "Balinese", +1, Balinese_range16, 2, 0, 0 }, + { "Bamum", +1, Bamum_range16, 1, Bamum_range32, 1 }, + { "Batak", +1, Batak_range16, 2, 0, 0 }, + { "Bengali", +1, Bengali_range16, 14, 0, 0 }, + { "Bopomofo", +1, Bopomofo_range16, 3, 0, 0 }, + { "Brahmi", +1, 0, 0, Brahmi_range32, 2 }, + { "Braille", +1, Braille_range16, 1, 0, 0 }, + { "Buginese", +1, Buginese_range16, 2, 0, 0 }, + { "Buhid", +1, Buhid_range16, 1, 0, 0 }, + { "C", +1, C_range16, 15, C_range32, 6 }, + { "Canadian_Aboriginal", +1, Canadian_Aboriginal_range16, 2, 0, 0 }, + { "Carian", +1, 0, 0, Carian_range32, 1 }, + { "Cc", +1, Cc_range16, 2, 0, 0 }, + { "Cf", +1, Cf_range16, 12, Cf_range32, 4 }, + { "Chakma", +1, 0, 0, Chakma_range32, 2 }, + { "Cham", +1, Cham_range16, 4, 0, 0 }, + { "Cherokee", +1, Cherokee_range16, 1, 0, 0 }, + { "Co", +1, Co_range16, 1, Co_range32, 2 }, + { "Common", +1, Common_range16, 88, Common_range32, 70 }, + { "Coptic", +1, Coptic_range16, 3, 0, 0 }, + { "Cs", +1, Cs_range16, 1, 0, 0 }, + { "Cuneiform", +1, 0, 0, Cuneiform_range32, 3 }, + { 
"Cypriot", +1, 0, 0, Cypriot_range32, 6 }, + { "Cyrillic", +1, Cyrillic_range16, 7, 0, 0 }, + { "Deseret", +1, 0, 0, Deseret_range32, 1 }, + { "Devanagari", +1, Devanagari_range16, 5, 0, 0 }, + { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 1 }, + { "Ethiopic", +1, Ethiopic_range16, 32, 0, 0 }, + { "Georgian", +1, Georgian_range16, 8, 0, 0 }, + { "Glagolitic", +1, Glagolitic_range16, 2, 0, 0 }, + { "Gothic", +1, 0, 0, Gothic_range32, 1 }, + { "Greek", +1, Greek_range16, 31, Greek_range32, 2 }, + { "Gujarati", +1, Gujarati_range16, 13, 0, 0 }, + { "Gurmukhi", +1, Gurmukhi_range16, 16, 0, 0 }, + { "Han", +1, Han_range16, 11, Han_range32, 4 }, + { "Hangul", +1, Hangul_range16, 14, 0, 0 }, + { "Hanunoo", +1, Hanunoo_range16, 1, 0, 0 }, + { "Hebrew", +1, Hebrew_range16, 9, 0, 0 }, + { "Hiragana", +1, Hiragana_range16, 2, Hiragana_range32, 2 }, + { "Imperial_Aramaic", +1, 0, 0, Imperial_Aramaic_range32, 2 }, + { "Inherited", +1, Inherited_range16, 18, Inherited_range32, 6 }, + { "Inscriptional_Pahlavi", +1, 0, 0, Inscriptional_Pahlavi_range32, 2 }, + { "Inscriptional_Parthian", +1, 0, 0, Inscriptional_Parthian_range32, 2 }, + { "Javanese", +1, Javanese_range16, 3, 0, 0 }, + { "Kaithi", +1, 0, 0, Kaithi_range32, 1 }, + { "Kannada", +1, Kannada_range16, 14, 0, 0 }, + { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 1 }, + { "Kayah_Li", +1, Kayah_Li_range16, 1, 0, 0 }, + { "Kharoshthi", +1, 0, 0, Kharoshthi_range32, 8 }, + { "Khmer", +1, Khmer_range16, 4, 0, 0 }, + { "L", +1, L_range16, 370, L_range32, 116 }, + { "Lao", +1, Lao_range16, 18, 0, 0 }, + { "Latin", +1, Latin_range16, 30, 0, 0 }, + { "Lepcha", +1, Lepcha_range16, 3, 0, 0 }, + { "Limbu", +1, Limbu_range16, 5, 0, 0 }, + { "Linear_B", +1, 0, 0, Linear_B_range32, 7 }, + { "Lisu", +1, Lisu_range16, 1, 0, 0 }, + { "Ll", +1, Ll_range16, 582, Ll_range32, 29 }, + { "Lm", +1, Lm_range16, 51, Lm_range32, 1 }, + { "Lo", +1, Lo_range16, 286, Lo_range32, 85 }, + { "Lt", +1, Lt_range16, 10, 0, 0 }, + 
{ "Lu", +1, Lu_range16, 576, Lu_range32, 32 }, + { "Lycian", +1, 0, 0, Lycian_range32, 1 }, + { "Lydian", +1, 0, 0, Lydian_range32, 2 }, + { "M", +1, M_range16, 180, M_range32, 24 }, + { "Malayalam", +1, Malayalam_range16, 11, 0, 0 }, + { "Mandaic", +1, Mandaic_range16, 2, 0, 0 }, + { "Mc", +1, Mc_range16, 111, Mc_range32, 15 }, + { "Me", +1, Me_range16, 4, 0, 0 }, + { "Meetei_Mayek", +1, Meetei_Mayek_range16, 3, 0, 0 }, + { "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 2 }, + { "Meroitic_Hieroglyphs", +1, 0, 0, Meroitic_Hieroglyphs_range32, 1 }, + { "Miao", +1, 0, 0, Miao_range32, 3 }, + { "Mn", +1, Mn_range16, 194, Mn_range32, 27 }, + { "Mongolian", +1, Mongolian_range16, 6, 0, 0 }, + { "Myanmar", +1, Myanmar_range16, 2, 0, 0 }, + { "N", +1, N_range16, 64, N_range32, 24 }, + { "Nd", +1, Nd_range16, 35, Nd_range32, 7 }, + { "New_Tai_Lue", +1, New_Tai_Lue_range16, 4, 0, 0 }, + { "Nko", +1, Nko_range16, 1, 0, 0 }, + { "Nl", +1, Nl_range16, 7, Nl_range32, 5 }, + { "No", +1, No_range16, 28, No_range32, 14 }, + { "Ogham", +1, Ogham_range16, 1, 0, 0 }, + { "Ol_Chiki", +1, Ol_Chiki_range16, 1, 0, 0 }, + { "Old_Italic", +1, 0, 0, Old_Italic_range32, 2 }, + { "Old_Persian", +1, 0, 0, Old_Persian_range32, 2 }, + { "Old_South_Arabian", +1, 0, 0, Old_South_Arabian_range32, 1 }, + { "Old_Turkic", +1, 0, 0, Old_Turkic_range32, 1 }, + { "Oriya", +1, Oriya_range16, 14, 0, 0 }, + { "Osmanya", +1, 0, 0, Osmanya_range32, 2 }, + { "P", +1, P_range16, 126, P_range32, 15 }, + { "Pc", +1, Pc_range16, 6, 0, 0 }, + { "Pd", +1, Pd_range16, 16, 0, 0 }, + { "Pe", +1, Pe_range16, 72, 0, 0 }, + { "Pf", +1, Pf_range16, 10, 0, 0 }, + { "Phags_Pa", +1, Phags_Pa_range16, 1, 0, 0 }, + { "Phoenician", +1, 0, 0, Phoenician_range32, 2 }, + { "Pi", +1, Pi_range16, 11, 0, 0 }, + { "Po", +1, Po_range16, 120, Po_range32, 15 }, + { "Ps", +1, Ps_range16, 74, 0, 0 }, + { "Rejang", +1, Rejang_range16, 2, 0, 0 }, + { "Runic", +1, Runic_range16, 2, 0, 0 }, + { "S", +1, S_range16, 143, S_range32, 56 }, 
+ { "Samaritan", +1, Samaritan_range16, 2, 0, 0 }, + { "Saurashtra", +1, Saurashtra_range16, 2, 0, 0 }, + { "Sc", +1, Sc_range16, 17, 0, 0 }, + { "Sharada", +1, 0, 0, Sharada_range32, 2 }, + { "Shavian", +1, 0, 0, Shavian_range32, 1 }, + { "Sinhala", +1, Sinhala_range16, 11, 0, 0 }, + { "Sk", +1, Sk_range16, 27, 0, 0 }, + { "Sm", +1, Sm_range16, 53, Sm_range32, 11 }, + { "So", +1, So_range16, 108, So_range32, 45 }, + { "Sora_Sompeng", +1, 0, 0, Sora_Sompeng_range32, 2 }, + { "Sundanese", +1, Sundanese_range16, 2, 0, 0 }, + { "Syloti_Nagri", +1, Syloti_Nagri_range16, 1, 0, 0 }, + { "Syriac", +1, Syriac_range16, 3, 0, 0 }, + { "Tagalog", +1, Tagalog_range16, 2, 0, 0 }, + { "Tagbanwa", +1, Tagbanwa_range16, 3, 0, 0 }, + { "Tai_Le", +1, Tai_Le_range16, 2, 0, 0 }, + { "Tai_Tham", +1, Tai_Tham_range16, 5, 0, 0 }, + { "Tai_Viet", +1, Tai_Viet_range16, 2, 0, 0 }, + { "Takri", +1, 0, 0, Takri_range32, 2 }, + { "Tamil", +1, Tamil_range16, 16, 0, 0 }, + { "Telugu", +1, Telugu_range16, 14, 0, 0 }, + { "Thaana", +1, Thaana_range16, 1, 0, 0 }, + { "Thai", +1, Thai_range16, 2, 0, 0 }, + { "Tibetan", +1, Tibetan_range16, 7, 0, 0 }, + { "Tifinagh", +1, Tifinagh_range16, 3, 0, 0 }, + { "Ugaritic", +1, 0, 0, Ugaritic_range32, 2 }, + { "Vai", +1, Vai_range16, 1, 0, 0 }, + { "Yi", +1, Yi_range16, 2, 0, 0 }, + { "Z", +1, Z_range16, 8, 0, 0 }, + { "Zl", +1, Zl_range16, 1, 0, 0 }, + { "Zp", +1, Zp_range16, 1, 0, 0 }, + { "Zs", +1, Zs_range16, 7, 0, 0 }, +}; +const int num_unicode_groups = 138; + + +} // namespace re2 + + diff --git a/outside/re2/re2/unicode_groups.h b/outside/re2/re2/unicode_groups.h new file mode 100644 index 000000000..7f5633115 --- /dev/null +++ b/outside/re2/re2/unicode_groups.h @@ -0,0 +1,64 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Unicode character groups. 
+ +// The codes get split into ranges of 16-bit codes +// and ranges of 32-bit codes. It would be simpler +// to use only 32-bit ranges, but these tables are large +// enough to warrant extra care. +// +// Using just 32-bit ranges gives 27 kB of data. +// Adding 16-bit ranges gives 18 kB of data. +// Adding an extra table of 16-bit singletons would reduce +// to 16.5 kB of data but make the data harder to use; +// we don't bother. + +#ifndef RE2_UNICODE_GROUPS_H__ +#define RE2_UNICODE_GROUPS_H__ + +#include "util/util.h" + +namespace re2 { + +struct URange16 +{ + uint16 lo; + uint16 hi; +}; + +struct URange32 +{ + uint32 lo; + uint32 hi; +}; + +struct UGroup +{ + const char *name; + int sign; // +1 for [abc], -1 for [^abc] + const URange16 *r16; + int nr16; + const URange32 *r32; + int nr32; +}; + +// Named by property or script name (e.g., "Nd", "N", "Han"). +// Negated groups are not included. +extern const UGroup unicode_groups[]; +extern const int num_unicode_groups; + +// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]"). +// Negated groups are included. +extern const UGroup posix_groups[]; +extern const int num_posix_groups; + +// Named by Perl name (e.g., "\\d", "\\D"). +// Negated groups are included. +extern const UGroup perl_groups[]; +extern const int num_perl_groups; + +} // namespace re2 + +#endif // RE2_UNICODE_GROUPS_H__ diff --git a/outside/re2/re2/variadic_function.h b/outside/re2/re2/variadic_function.h new file mode 100644 index 000000000..7c7d6d561 --- /dev/null +++ b/outside/re2/re2/variadic_function.h @@ -0,0 +1,344 @@ +// Copyright 2010 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef RE2_VARIADIC_FUNCTION_H_ +#define RE2_VARIADIC_FUNCTION_H_ + +namespace re2 { + +template +class VariadicFunction2 { + public: + Result operator()(Param0 p0, Param1 p1) const { + return Func(p0, p1, 0, 0); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0) const { + const Arg* const args[] = { &a0 }; + return Func(p0, p1, args, 1); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1) const { + const Arg* const args[] = { &a0, &a1 }; + return Func(p0, p1, args, 2); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2) const { + const Arg* const args[] = { &a0, &a1, &a2 }; + return Func(p0, p1, args, 3); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3 }; + return Func(p0, p1, args, 4); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4 }; + return Func(p0, p1, args, 5); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5 }; + return Func(p0, p1, args, 6); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6 }; + return Func(p0, p1, args, 7); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7 }; + return Func(p0, p1, args, 8); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const 
Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8 }; + return Func(p0, p1, args, 9); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9 }; + return Func(p0, p1, args, 10); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10 }; + return Func(p0, p1, args, 11); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11 }; + return Func(p0, p1, args, 12); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12 }; + return Func(p0, p1, args, 13); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, 
&a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13 }; + return Func(p0, p1, args, 14); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14 }; + return Func(p0, p1, args, 15); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15 }; + return Func(p0, p1, args, 16); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16 }; + return Func(p0, p1, args, 17); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17 }; + return Func(p0, p1, args, 18); + } + + Result 
operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17, + const Arg& a18) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18 }; + return Func(p0, p1, args, 19); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17, + const Arg& a18, const Arg& a19) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19 }; + return Func(p0, p1, args, 20); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17, + const Arg& a18, const Arg& a19, const Arg& a20) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, + &a20 }; + return Func(p0, p1, args, 21); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17, + const Arg& a18, 
const Arg& a19, const Arg& a20, const Arg& a21) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20, + &a21 }; + return Func(p0, p1, args, 22); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17, + const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21, + const Arg& a22) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20, + &a21, &a22 }; + return Func(p0, p1, args, 23); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17, + const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21, + const Arg& a22, const Arg& a23) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20, + &a21, &a22, &a23 }; + return Func(p0, p1, args, 24); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17, + const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21, + const Arg& a22, const Arg& a23, const Arg& a24) const { + const Arg* const args[] 
= { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20, + &a21, &a22, &a23, &a24 }; + return Func(p0, p1, args, 25); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17, + const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21, + const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20, + &a21, &a22, &a23, &a24, &a25 }; + return Func(p0, p1, args, 26); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17, + const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21, + const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25, + const Arg& a26) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20, + &a21, &a22, &a23, &a24, &a25, &a26 }; + return Func(p0, p1, args, 27); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17, + const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21, + const Arg& 
a22, const Arg& a23, const Arg& a24, const Arg& a25, + const Arg& a26, const Arg& a27) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20, + &a21, &a22, &a23, &a24, &a25, &a26, &a27 }; + return Func(p0, p1, args, 28); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17, + const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21, + const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25, + const Arg& a26, const Arg& a27, const Arg& a28) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20, + &a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28 }; + return Func(p0, p1, args, 29); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17, + const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21, + const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25, + const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20, + &a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29 }; + return Func(p0, p1, args, 30); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + 
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17, + const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21, + const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25, + const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29, + const Arg& a30) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20, + &a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30 }; + return Func(p0, p1, args, 31); + } + + Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1, + const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, + const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9, + const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13, + const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17, + const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21, + const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25, + const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29, + const Arg& a30, const Arg& a31) const { + const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8, + &a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20, + &a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30, &a31 }; + return Func(p0, p1, args, 32); + } +}; + +} // namespace re2 + +#endif // RE2_VARIADIC_FUNCTION_H_ diff --git a/outside/re2/re2/walker-inl.h b/outside/re2/re2/walker-inl.h new file mode 100644 index 000000000..4d2045f72 --- /dev/null +++ b/outside/re2/re2/walker-inl.h @@ -0,0 +1,244 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Helper class for traversing Regexps without recursion. 
+// Clients should declare their own subclasses that override +// the PreVisit and PostVisit methods, which are called before +// and after visiting the subexpressions. + +// Not quite the Visitor pattern, because (among other things) +// the Visitor pattern is recursive. + +#ifndef RE2_WALKER_INL_H__ +#define RE2_WALKER_INL_H__ + +#include "re2/regexp.h" + +namespace re2 { + +template struct WalkState; + +template class Regexp::Walker { + public: + Walker(); + virtual ~Walker(); + + // Virtual method called before visiting re's children. + // PreVisit passes ownership of its return value to its caller. + // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg + // and passed to the child PreVisits and PostVisits as parent_arg. + // At the top-most Regexp, parent_arg is arg passed to walk. + // If PreVisit sets *stop to true, the walk does not recurse + // into the children. Instead it behaves as though the return + // value from PreVisit is the return value from PostVisit. + // The default PreVisit returns parent_arg. + virtual T PreVisit(Regexp* re, T parent_arg, bool* stop); + + // Virtual method called after visiting re's children. + // The pre_arg is the T that PreVisit returned. + // The child_args is a vector of the T that the child PostVisits returned. + // PostVisit takes ownership of pre_arg. + // PostVisit takes ownership of the Ts + // in *child_args, but not the vector itself. + // PostVisit passes ownership of its return value + // to its caller. + // The default PostVisit simply returns pre_arg. + virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg, + T* child_args, int nchild_args); + + // Virtual method called to copy a T, + // when Walk notices that more than one child is the same re. + virtual T Copy(T arg); + + // Virtual method called to do a "quick visit" of the re, + // but not its children. Only called once the visit budget + // has been used up and we're trying to abort the walk + // as quickly as possible. 
Should return a value that + // makes sense for the parent PostVisits still to be run. + // This function is (hopefully) only called by + // WalkExponential, but must be implemented by all clients, + // just in case. + virtual T ShortVisit(Regexp* re, T parent_arg) = 0; + + // Walks over a regular expression. + // Top_arg is passed as parent_arg to PreVisit and PostVisit of re. + // Returns the T returned by PostVisit on re. + T Walk(Regexp* re, T top_arg); + + // Like Walk, but doesn't use Copy. This can lead to + // exponential runtimes on cross-linked Regexps like the + // ones generated by Simplify. To help limit this, + // at most max_visits nodes will be visited and then + // the walk will be cut off early. + // If the walk *is* cut off early, ShortVisit(re) + // will be called on regexps that cannot be fully + // visited rather than calling PreVisit/PostVisit. + T WalkExponential(Regexp* re, T top_arg, int max_visits); + + // Clears the stack. Should never be necessary, since + // Walk always enters and exits with an empty stack. + // Logs DFATAL if stack is not already clear. + void Reset(); + + // Returns whether walk was cut off. + bool stopped_early() { return stopped_early_; } + + private: + // Walk state for the entire traversal. + stack >* stack_; + bool stopped_early_; + int max_visits_; + + T WalkInternal(Regexp* re, T top_arg, bool use_copy); + + DISALLOW_EVIL_CONSTRUCTORS(Walker); +}; + +template T Regexp::Walker::PreVisit(Regexp* re, + T parent_arg, + bool* stop) { + return parent_arg; +} + +template T Regexp::Walker::PostVisit(Regexp* re, + T parent_arg, + T pre_arg, + T* child_args, + int nchild_args) { + return pre_arg; +} + +template T Regexp::Walker::Copy(T arg) { + return arg; +} + +// State about a single level in the traversal. 
+template struct WalkState { + WalkState(Regexp* re, T parent) + : re(re), + n(-1), + parent_arg(parent), + child_args(NULL) { } + + Regexp* re; // The regexp + int n; // The index of the next child to process; -1 means need to PreVisit + T parent_arg; // Accumulated arguments. + T pre_arg; + T child_arg; // One-element buffer for child_args. + T* child_args; +}; + +template Regexp::Walker::Walker() { + stack_ = new stack >; + stopped_early_ = false; +} + +template Regexp::Walker::~Walker() { + Reset(); + delete stack_; +} + +// Clears the stack. Should never be necessary, since +// Walk always enters and exits with an empty stack. +// Logs DFATAL if stack is not already clear. +template void Regexp::Walker::Reset() { + if (stack_ && stack_->size() > 0) { + LOG(DFATAL) << "Stack not empty."; + while (stack_->size() > 0) { + delete stack_->top().child_args; + stack_->pop(); + } + } +} + +template T Regexp::Walker::WalkInternal(Regexp* re, T top_arg, + bool use_copy) { + Reset(); + + if (re == NULL) { + LOG(DFATAL) << "Walk NULL"; + return top_arg; + } + + stack_->push(WalkState(re, top_arg)); + + WalkState* s; + for (;;) { + T t; + s = &stack_->top(); + Regexp* re = s->re; + switch (s->n) { + case -1: { + if (--max_visits_ < 0) { + stopped_early_ = true; + t = ShortVisit(re, s->parent_arg); + break; + } + bool stop = false; + s->pre_arg = PreVisit(re, s->parent_arg, &stop); + if (stop) { + t = s->pre_arg; + break; + } + s->n = 0; + s->child_args = NULL; + if (re->nsub_ == 1) + s->child_args = &s->child_arg; + else if (re->nsub_ > 1) + s->child_args = new T[re->nsub_]; + // Fall through. 
+ } + default: { + if (re->nsub_ > 0) { + Regexp** sub = re->sub(); + if (s->n < re->nsub_) { + if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) { + s->child_args[s->n] = Copy(s->child_args[s->n - 1]); + s->n++; + } else { + stack_->push(WalkState(sub[s->n], s->pre_arg)); + } + continue; + } + } + + t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n); + if (re->nsub_ > 1) + delete[] s->child_args; + break; + } + } + + // We've finished stack_->top(). + // Update next guy down. + stack_->pop(); + if (stack_->size() == 0) + return t; + s = &stack_->top(); + if (s->child_args != NULL) + s->child_args[s->n] = t; + else + s->child_arg = t; + s->n++; + } +} + +template T Regexp::Walker::Walk(Regexp* re, T top_arg) { + // Without the exponential walking behavior, + // this budget should be more than enough for any + // regexp, and yet not enough to get us in trouble + // as far as CPU time. + max_visits_ = 1000000; + return WalkInternal(re, top_arg, true); +} + +template T Regexp::Walker::WalkExponential(Regexp* re, T top_arg, + int max_visits) { + max_visits_ = max_visits; + return WalkInternal(re, top_arg, false); +} + +} // namespace re2 + +#endif // RE2_WALKER_INL_H__ diff --git a/outside/re2/runtests b/outside/re2/runtests new file mode 100755 index 000000000..aadcb92f9 --- /dev/null +++ b/outside/re2/runtests @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +success=true +for i +do + printf "%-40s" $i + if sh -c "$i >$i.log 2>&1" 2>/dev/null + then + echo PASS + else + echo FAIL';' output in $i.log + success=false + fi +done + +if $success; then + echo 'ALL TESTS PASSED.' + exit 0 +fi +echo 'TESTS FAILED.' +exit 1 diff --git a/outside/re2/testinstall.cc b/outside/re2/testinstall.cc new file mode 100644 index 000000000..97990c2c0 --- /dev/null +++ b/outside/re2/testinstall.cc @@ -0,0 +1,26 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include +#include +#include + +using namespace re2; + +int main(void) { + FilteredRE2 f; + int id; + f.Add("a.*b.*c", RE2::DefaultOptions, &id); + vector v; + f.Compile(&v); + vector ids; + f.FirstMatch("abbccc", ids); + + if(RE2::FullMatch("axbyc", "a.*b.*c")) { + printf("PASS\n"); + return 0; + } + printf("FAIL\n"); + return 2; +} diff --git a/outside/re2/ucs2.diff b/outside/re2/ucs2.diff new file mode 100644 index 000000000..57aec04a1 --- /dev/null +++ b/outside/re2/ucs2.diff @@ -0,0 +1,567 @@ +This is a dump from Google's source control system of the change +that removed UCS-2 support from RE2. As the explanation below +says, UCS-2 mode is fundamentally at odds with things like ^ and $, +so it never really worked very well. But if you are interested in using +it without those operators, it did work for that. It assumed that the +UCS-2 data was in the native host byte order. + +If you are interested in adding UCS-2 mode back, this patch might +be a good starting point. + + +Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15 + + Retire UCS-2 mode. + + I added it as an experiment for V8, but it + requires 2-byte lookahead to do completely, + and RE2 has 1-byte lookahead (enough for UTF-8) + as a fairly deep fundamental assumption, + so it did not support ^ or $. + +==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ==== +re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319 + cap_[0] = p; + if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. + return true; +- if (prog_->flags() & Regexp::UCS2) +- p++; + } + return false; + } +==== re2/compile.cc#17 - re2/compile.cc#18 ==== +re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100 + // Input encodings. 
+ enum Encoding { + kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) +- kEncodingUCS2, // UCS-2 (0-FFFF), native byte order + kEncodingLatin1, // Latin1 (0-FF) + }; + +re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172 + void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); + void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); + void Add_80_10ffff(); +- void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase); +- void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1, +- uint8 lo2, uint8 hi2, bool fold2); + + // New suffix that matches the byte range lo-hi, then goes to next. + Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next); +re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477 + + // Converts rune range lo-hi into a fragment that recognizes + // the bytes that would make up those runes in the current +- // encoding (Latin 1, UTF-8, or UCS-2). ++ // encoding (Latin 1 or UTF-8). + // This lets the machine work byte-by-byte even when + // using multibyte encodings. + +re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489 + case kEncodingLatin1: + AddRuneRangeLatin1(lo, hi, foldcase); + break; +- case kEncodingUCS2: +- AddRuneRangeUCS2(lo, hi, foldcase); +- break; + } + } + +re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501 + AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL)); + } + +- // Test whether 16-bit values are big or little endian. 
+- static bool BigEndian() { +- union { +- char byte[2]; +- int16 endian; +- } u; +- +- u.byte[0] = 1; +- u.byte[1] = 2; +- return u.endian == 0x0102; +- } +- +- void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1, +- uint8 lo2, uint8 hi2, bool fold2) { +- Inst* ip; +- if (reversed_) { +- ip = RuneByteSuffix(lo1, hi1, fold1, NULL); +- ip = RuneByteSuffix(lo2, hi2, fold2, ip); +- } else { +- ip = RuneByteSuffix(lo2, hi2, fold2, NULL); +- ip = RuneByteSuffix(lo1, hi1, fold1, ip); +- } +- AddSuffix(ip); +- } +- +- void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) { +- if (lo > hi || lo > 0xFFFF) +- return; +- if (hi > 0xFFFF) +- hi = 0xFFFF; +- +- // We'll assemble a pattern assuming big endian. +- // If the machine isn't, tell Cat to reverse its arguments. +- bool oldreversed = reversed_; +- if (!BigEndian()) { +- reversed_ = !oldreversed; +- } +- +- // Split into bytes. +- int lo1 = lo >> 8; +- int lo2 = lo & 0xFF; +- int hi1 = hi >> 8; +- int hi2 = hi & 0xFF; +- +- if (lo1 == hi1) { +- // Easy case: high bits are same in both. +- // Only do ASCII case folding on the second byte if the top byte is 00. +- AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase); +- } else { +- // Harder case: different second byte ranges depending on first byte. +- +- // Initial fragment. +- if (lo2 > 0) { +- AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase); +- lo1++; +- } +- +- // Trailing fragment. +- if (hi2 < 0xFF) { +- AddUCS2Pair(hi1, hi1, false, 0, hi2, false); +- hi1--; +- } +- +- // Inner ranges. +- if (lo1 <= hi1) { +- AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false); +- } +- } +- +- // Restore reverse setting. +- reversed_ = oldreversed; +- } +- + // Table describing how to make a UTF-8 matching machine + // for the rune range 80-10FFFF (Runeself-Runemax). 
+ // This range happens frequently enough (for example /./ and /[^a-z]/) +re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634 + + Frag Compiler::Literal(Rune r, bool foldcase) { + switch (encoding_) { +- default: // UCS-2 or something new +- BeginRange(); +- AddRuneRange(r, r, foldcase); +- return EndRange(); ++ default: ++ return kNullFrag; + + case kEncodingLatin1: + return ByteRange(r, r, foldcase); +re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850 + + if (re->parse_flags() & Regexp::Latin1) + c.encoding_ = kEncodingLatin1; +- else if (re->parse_flags() & Regexp::UCS2) +- c.encoding_ = kEncodingUCS2; + c.reversed_ = reversed; + if (max_mem <= 0) { + c.max_inst_ = 100000; // more than enough +re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905 + c.prog_->set_start_unanchored(c.prog_->start()); + } else { + Frag dot; +- if (c.encoding_ == kEncodingUCS2) { +- dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false)); +- } else { +- dot = c.ByteRange(0x00, 0xFF, false); +- } ++ dot = c.ByteRange(0x00, 0xFF, false); + Frag dotloop = c.Star(dot, true); + Frag unanchored = c.Cat(dotloop, all); + c.prog_->set_start_unanchored(unanchored.begin); +==== re2/nfa.cc#8 - re2/nfa.cc#9 ==== +re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431 + const char* bp = context.begin(); + int c = -1; + int wasword = 0; +- bool ucs2 = prog_->flags() & Regexp::UCS2; + + if (text.begin() > context.begin()) { + c = text.begin()[-1] & 0xFF; +re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497 + // If there's a required first byte for an unanchored search + // and we're not in the middle of any possible matches, + // use memchr to search for the byte quickly. 
+- if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 && ++ if (!anchored && first_byte_ >= 0 && runq->size() == 0 && + p < text.end() && (p[0] & 0xFF) != first_byte_) { + p = reinterpret_cast(memchr(p, first_byte_, + text.end() - p)); +re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514 + flag = Prog::EmptyFlags(context, p); + } + +- // In UCS-2 mode, if we need to start a new thread, +- // make sure to do it on an even boundary. +- if(ucs2 && runq->size() == 0 && +- (p - context.begin()) % 2 && p < text.end()) { +- p++; +- flag = Prog::EmptyFlags(context, p); +- } +- + // Steal match storage (cleared but unused as of yet) + // temporarily to hold match boundaries for new thread. +- // In UCS-2 mode, only start the thread on a 2-byte boundary. +- if(!ucs2 || (p - context.begin()) % 2 == 0) { +- match_[0] = p; +- AddToThreadq(runq, start_, flag, p, match_); +- match_[0] = NULL; +- } ++ match_[0] = p; ++ AddToThreadq(runq, start_, flag, p, match_); ++ match_[0] = NULL; + } + + // If all the threads have died, stop early. +==== re2/parse.cc#22 - re2/parse.cc#23 ==== +re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165 + status_(status), stacktop_(NULL), ncap_(0) { + if (flags_ & Latin1) + rune_max_ = 0xFF; +- else if (flags & UCS2) +- rune_max_ = 0xFFFF; + else + rune_max_ = Runemax; + } +re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374 + bool Regexp::ParseState::PushCarat() { + if (flags_ & OneLine) { + return PushSimpleOp(kRegexpBeginText); +- } else { +- if (flags_ & UCS2) { +- status_->set_code(kRegexpUnsupported); +- status_->set_error_arg("multiline ^ in UCS-2 mode"); +- return false; +- } +- return PushSimpleOp(kRegexpBeginLine); + } ++ return PushSimpleOp(kRegexpBeginLine); + } + + // Pushes a \b or \B onto the stack. 
+ bool Regexp::ParseState::PushWordBoundary(bool word) { +- if (flags_ & UCS2) { +- status_->set_code(kRegexpUnsupported); +- status_->set_error_arg("\\b or \\B in UCS-2 mode"); +- return false; +- } + if (word) + return PushSimpleOp(kRegexpWordBoundary); + return PushSimpleOp(kRegexpNoWordBoundary); +re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389 + bool ret = PushSimpleOp(kRegexpEndText); + flags_ = oflags; + return ret; +- } +- if (flags_ & UCS2) { +- status_->set_code(kRegexpUnsupported); +- status_->set_error_arg("multiline $ in UCS-2 mode"); +- return false; + } + return PushSimpleOp(kRegexpEndLine); + } +==== re2/re2.cc#34 - re2/re2.cc#35 ==== +re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84 + return RE2::ErrorBadUTF8; + case re2::kRegexpBadNamedCapture: + return RE2::ErrorBadNamedCapture; +- case re2::kRegexpUnsupported: +- return RE2::ErrorUnsupported; + } + return RE2::ErrorInternal; + } +re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125 + break; + case RE2::Options::EncodingLatin1: + flags |= Regexp::Latin1; +- break; +- case RE2::Options::EncodingUCS2: +- flags |= Regexp::UCS2; + break; + } + +==== re2/re2.h#36 - re2/re2.h#37 ==== +re2/re2.h#36:246,252 - re2/re2.h#37:246,251 + ErrorBadUTF8, // invalid UTF-8 in regexp + ErrorBadNamedCapture, // bad named capture group + ErrorPatternTooLarge, // pattern too large (compile failed) +- ErrorUnsupported, // unsupported feature (in UCS-2 mode) + }; + + // Predefined common options. +re2/re2.h#36:570,576 - re2/re2.h#37:569,574 + + enum Encoding { + EncodingUTF8 = 1, +- EncodingUCS2, // 16-bit Unicode 0-FFFF only + EncodingLatin1 + }; + +==== re2/regexp.cc#15 - re2/regexp.cc#16 ==== +re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329 + // the regexp that remains after the prefix. The prefix might + // be ASCII case-insensitive. + bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { +- // Don't even bother for UCS-2; it's time to throw that code away. 
+- if (parse_flags_ & UCS2) +- return false; +- + // No need for a walker: the regexp must be of the form + // 1. some number of ^ anchors + // 2. a literal char or string +==== re2/regexp.h#20 - re2/regexp.h#21 ==== +re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192 + kRegexpBadPerlOp, // bad perl operator + kRegexpBadUTF8, // invalid UTF-8 in regexp + kRegexpBadNamedCapture, // bad named capture +- kRegexpUnsupported, // unsupported operator + }; + + // Error status for certain operations. +re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314 + // \Q and \E to disable/enable metacharacters + // (?Pexpr) for named captures + // \C to match any single byte +- UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8. +- UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group ++ UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group + // and \P{Han} for its negation. +- NeverNL = 1<<12, // Never match NL, even if the regexp mentions ++ NeverNL = 1<<11, // Never match NL, even if the regexp mentions + // it explicitly. + + // As close to Perl as we can get. +==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ==== +re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139 + cap_[0] = p; + if (Visit(prog_->start(), p)) // Match must be leftmost; done. 
+ return true; +- if (prog_->flags() & Regexp::UCS2) +- p++; + } + return false; + } +==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ==== +re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152 + static ParseMode parse_modes[] = { + { single_line, "single-line" }, + { single_line|Regexp::Latin1, "single-line, latin1" }, +- { single_line|Regexp::UCS2, "single-line, ucs2" }, + { multi_line, "multiline" }, + { multi_line|Regexp::NonGreedy, "multiline, nongreedy" }, + { multi_line|Regexp::Latin1, "multiline, latin1" }, +- { multi_line|Regexp::UCS2, "multiline, ucs2" }, + }; + + static string FormatMode(Regexp::ParseFlags flags) { +re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185 + RegexpStatus status; + regexp_ = Regexp::Parse(regexp_str, flags, &status); + if (regexp_ == NULL) { +- if (status.code() != kRegexpUnsupported) { +- LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) +- << " mode: " << FormatMode(flags); +- error_ = true; +- } ++ LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) ++ << " mode: " << FormatMode(flags); ++ error_ = true; + return; + } + prog_ = regexp_->CompileToProg(0); +re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231 + RE2::Options options; + if (flags & Regexp::Latin1) + options.set_encoding(RE2::Options::EncodingLatin1); +- else if (flags & Regexp::UCS2) +- options.set_encoding(RE2::Options::EncodingUCS2); + if (kind_ == Prog::kLongestMatch) + options.set_longest_match(true); + re2_ = new RE2(re, options); +re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280 + delete re2_; + } + +- // Converts UTF-8 string in text into UCS-2 string in new_text. 
+- static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) { +- const char* p = text.begin(); +- const char* ep = text.end(); +- uint16* q = new uint16[ep - p]; +- uint16* q0 = q; +- +- int n; +- Rune r; +- for (; p < ep; p += n) { +- if (!fullrune(p, ep - p)) { +- delete[] q0; +- return false; +- } +- n = chartorune(&r, p); +- if (r > 0xFFFF) { +- delete[] q0; +- return false; +- } +- *q++ = r; +- } +- *new_text = StringPiece(reinterpret_cast(q0), 2*(q - q0)); +- return true; +- } +- +- // Rewrites *sp from being a pointer into text8 (UTF-8) +- // to being a pointer into text16 (equivalent text but in UCS-2). +- static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16, +- StringPiece *sp) { +- if (sp->begin() == NULL && text8.begin() != NULL) +- return; +- +- int nrune = 0; +- int n; +- Rune r; +- const char* p = text8.begin(); +- const char* ep = text8.end(); +- const char* spbegin = NULL; +- const char* spend = NULL; +- for (;;) { +- if (p == sp->begin()) +- spbegin = text16.begin() + sizeof(uint16)*nrune; +- if (p == sp->end()) +- spend = text16.begin() + sizeof(uint16)*nrune; +- if (p >= ep) +- break; +- n = chartorune(&r, p); +- p += n; +- nrune++; +- } +- if (spbegin == NULL || spend == NULL) { +- LOG(FATAL) << "Error in AdjustUTF8ToUCS2 " +- << CEscape(text8) << " " +- << (int)(sp->begin() - text8.begin()) << " " +- << (int)(sp->end() - text8.begin()); +- } +- *sp = StringPiece(spbegin, spend - spbegin); +- } +- +- // Rewrites *sp from begin a pointer into text16 (UCS-2) +- // to being a pointer into text8 (equivalent text but in UTF-8). 
+- static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8, +- StringPiece* sp) { +- if (sp->begin() == NULL) +- return; +- +- int nrune = 0; +- int n; +- Rune r; +- const char* p = text8.begin(); +- const char* ep = text8.end(); +- const char* spbegin = NULL; +- const char* spend = NULL; +- for (;;) { +- if (nrune == (sp->begin() - text16.begin())/2) +- spbegin = p; +- if (nrune == (sp->end() - text16.begin())/2) +- spend = p; +- if (p >= ep) +- break; +- n = chartorune(&r, p); +- p += n; +- nrune++; +- } +- if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) { +- LOG(FATAL) << "Error in AdjustUCS2ToUTF8 " +- << CEscape(text16) << " " +- << (int)(sp->begin() - text16.begin()) << " " +- << (int)(sp->end() - text16.begin()); +- } +- *sp = StringPiece(spbegin, spend - spbegin); +- } +- + // Runs a single search using the named engine type. + // This interface hides all the irregularities of the various + // engine interfaces from the rest of this file. +re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300 + + StringPiece text = orig_text; + StringPiece context = orig_context; +- bool ucs2 = false; + +- if ((flags() & Regexp::UCS2) && type != kEnginePCRE) { +- if (!ConvertUTF8ToUCS2(orig_context, &context)) { +- result->skipped = true; +- return; +- } +- +- // Rewrite context to refer to new text. +- AdjustUTF8ToUCS2(orig_context, context, &text); +- ucs2 = true; +- } +- + switch (type) { + default: + LOG(FATAL) << "Bad RunSearch type: " << (int)type; +re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451 + } + } + +- // If we did UCS-2 matching, rewrite the matches to refer +- // to the original UTF-8 text. 
+- if (ucs2) { +- if (result->matched) { +- if (result->have_submatch0) { +- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]); +- } else if (result->have_submatch) { +- for (int i = 0; i < nsubmatch; i++) { +- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]); +- } +- } +- } +- delete[] context.begin(); +- } +- + if (!result->matched) + memset(result->submatch, 0, sizeof result->submatch); + } +re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475 + return true; + } + +- // Check whether text uses only Unicode points <= 0xFFFF +- // (in the BMP). +- static bool IsBMP(const StringPiece& text) { +- const char* p = text.begin(); +- const char* ep = text.end(); +- while (p < ep) { +- if (!fullrune(p, ep - p)) +- return false; +- Rune r; +- p += chartorune(&r, p); +- if (r > 0xFFFF) +- return false; +- } +- return true; +- } +- + // Runs a single test. + bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor) { +re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483 + Result correct; + RunSearch(kEngineBacktrack, text, context, anchor, &correct); + if (correct.skipped) { +- if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode ++ if (regexp_ == NULL) + return true; + LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_) + << " " << FormatMode(flags_); diff --git a/outside/re2/util/arena.cc b/outside/re2/util/arena.cc new file mode 100644 index 000000000..25753c5df --- /dev/null +++ b/outside/re2/util/arena.cc @@ -0,0 +1,168 @@ +// Copyright 2000 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+#include "util/util.h"
+
+namespace re2 {
+
+// ----------------------------------------------------------------------
+// UnsafeArena::UnsafeArena()
+// UnsafeArena::~UnsafeArena()
+//    Destroying the arena automatically calls Reset()
+// ----------------------------------------------------------------------
+
+
+UnsafeArena::UnsafeArena(const size_t block_size)
+  : block_size_(block_size),
+    freestart_(NULL),                   // set for real in Reset()
+    last_alloc_(NULL),
+    remaining_(0),
+    blocks_alloced_(1),
+    overflow_blocks_(NULL) {
+  assert(block_size > kDefaultAlignment);
+
+  first_blocks_[0].mem = reinterpret_cast<char*>(malloc(block_size_));
+  first_blocks_[0].size = block_size_;
+
+  Reset();
+}
+
+UnsafeArena::~UnsafeArena() {
+  FreeBlocks();
+  assert(overflow_blocks_ == NULL);    // FreeBlocks() should do that
+  // The first X blocks stay allocated always by default.  Delete them now.
+  for (int i = 0; i < blocks_alloced_; i++)
+    free(first_blocks_[i].mem);
+}
+
+// ----------------------------------------------------------------------
+// UnsafeArena::Reset()
+//    Clears all the memory an arena is using.
+// ----------------------------------------------------------------------
+
+void UnsafeArena::Reset() {
+  FreeBlocks();
+  freestart_ = first_blocks_[0].mem;
+  remaining_ = first_blocks_[0].size;
+  last_alloc_ = NULL;
+
+  // We do not know for sure whether or not the first block is aligned,
+  // so we fix that right now.
+  const int overage = reinterpret_cast<uintptr_t>(freestart_) &
+                      (kDefaultAlignment-1);
+  if (overage > 0) {
+    const int waste = kDefaultAlignment - overage;
+    freestart_ += waste;
+    remaining_ -= waste;
+  }
+  freestart_when_empty_ = freestart_;
+  assert(!(reinterpret_cast<uintptr_t>(freestart_)&(kDefaultAlignment-1)));
+}
+
+// -------------------------------------------------------------
+// UnsafeArena::AllocNewBlock()
+//    Adds and returns an AllocatedBlock.
+//    The returned AllocatedBlock* is valid until the next call
+//    to AllocNewBlock or Reset.  (i.e.
anything that might
+//    affect overflow_blocks_).
+// -------------------------------------------------------------
+
+UnsafeArena::AllocatedBlock* UnsafeArena::AllocNewBlock(const size_t block_size) {
+  AllocatedBlock *block;
+  // Find the next block.
+  if ( blocks_alloced_ < arraysize(first_blocks_) ) {
+    // Use one of the pre-allocated blocks
+    block = &first_blocks_[blocks_alloced_++];
+  } else {                   // oops, out of space, move to the vector
+    if (overflow_blocks_ == NULL) overflow_blocks_ = new vector<AllocatedBlock>;
+    // Adds another block to the vector.
+    overflow_blocks_->resize(overflow_blocks_->size()+1);
+    // block points to the last block of the vector.
+    block = &overflow_blocks_->back();
+  }
+
+  block->mem = reinterpret_cast<char*>(malloc(block_size));
+  block->size = block_size;
+
+  return block;
+}
+
+// ----------------------------------------------------------------------
+// UnsafeArena::GetMemoryFallback()
+//    We take memory out of our pool, aligned on the byte boundary
+//    requested.  If we don't have space in our current pool, we
+//    allocate a new block (wasting the remaining space in the
+//    current block) and give you that.  If your memory needs are
+//    too big for a single block, we make a special your-memory-only
+//    allocation -- this is equivalent to not using the arena at all.
+// ----------------------------------------------------------------------
+
+void* UnsafeArena::GetMemoryFallback(const size_t size, const int align) {
+  if (size == 0)
+    return NULL;             // stl/stl_alloc.h says this is okay
+
+  assert(align > 0 && 0 == (align & (align - 1)));  // must be power of 2
+
+  // If the object is more than a quarter of the block size, allocate
+  // it separately to avoid wasting too much space in leftover bytes
+  if (block_size_ == 0 || size > block_size_/4) {
+    // then it gets its own block in the arena
+    assert(align <= kDefaultAlignment);   // because that's what new gives us
+    // This block stays separate from the rest of the world; in particular
+    // we don't update last_alloc_ so you can't reclaim space on this block.
+    return AllocNewBlock(size)->mem;
+  }
+
+  const int overage =
+    (reinterpret_cast<uintptr_t>(freestart_) & (align-1));
+  if (overage) {
+    const int waste = align - overage;
+    freestart_ += waste;
+    if (waste < remaining_) {
+      remaining_ -= waste;
+    } else {
+      remaining_ = 0;
+    }
+  }
+  if (size > remaining_) {
+    AllocatedBlock *block = AllocNewBlock(block_size_);
+    freestart_ = block->mem;
+    remaining_ = block->size;
+  }
+  remaining_ -= size;
+  last_alloc_ = freestart_;
+  freestart_ += size;
+  assert((reinterpret_cast<uintptr_t>(last_alloc_) & (align-1)) == 0);
+  return reinterpret_cast<void*>(last_alloc_);
+}
+
+// ----------------------------------------------------------------------
+// UnsafeArena::FreeBlocks()
+//    Unlike GetMemory(), which does actual work, ReturnMemory() is a
+//    no-op: we don't "free" memory until Reset() is called.  We do
+//    update some stats, though.  Note we do no checking that the
+//    pointer you pass in was actually allocated by us, or that it
+//    was allocated for the size you say, so be careful here!
+//       FreeBlocks() does the work for Reset(), actually freeing all
+//    memory allocated in one fell swoop.
+// ----------------------------------------------------------------------
+
+void UnsafeArena::FreeBlocks() {
+  for ( int i = 1; i < blocks_alloced_; ++i ) {  // keep first block alloced
+    free(first_blocks_[i].mem);
+    first_blocks_[i].mem = NULL;
+    first_blocks_[i].size = 0;
+  }
+  blocks_alloced_ = 1;
+  if (overflow_blocks_ != NULL) {
+    vector<AllocatedBlock>::iterator it;
+    for (it = overflow_blocks_->begin(); it != overflow_blocks_->end(); ++it) {
+      free(it->mem);
+    }
+    delete overflow_blocks_;             // These should be used very rarely
+    overflow_blocks_ = NULL;
+  }
+}
+
+}  // namespace re2
diff --git a/outside/re2/util/arena.h b/outside/re2/util/arena.h
new file mode 100644
index 000000000..7eb385b00
--- /dev/null
+++ b/outside/re2/util/arena.h
@@ -0,0 +1,103 @@
+// Copyright 2000 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Sometimes it is necessary to allocate a large number of small
+// objects.  Doing this the usual way (malloc, new) is slow,
+// especially for multithreaded programs.  An UnsafeArena provides a
+// mark/release method of memory management: it asks for a large chunk
+// from the operating system and doles it out bit by bit as required.
+// Then you free all the memory at once by calling UnsafeArena::Reset().
+// The "Unsafe" refers to the fact that UnsafeArena is not safe to
+// call from multiple threads.
+//
+// The global operator new that can be used as follows:
+//
+//   #include "lib/arena-inl.h"
+//
+//   UnsafeArena arena(1000);
+//   Foo* foo = new (AllocateInArena, &arena) Foo;
+//
+
+#ifndef RE2_UTIL_ARENA_H_
+#define RE2_UTIL_ARENA_H_
+
+namespace re2 {
+
+// This class is thread-compatible.
+class UnsafeArena {
+ public:
+  UnsafeArena(const size_t block_size);
+  virtual ~UnsafeArena();
+
+  void Reset();
+
+  // This should be the worst-case alignment for any type.  This is
+  // good for IA-32, SPARC version 7 (the last one I know), and
+  // supposedly Alpha.  i386 would be more time-efficient with a
+  // default alignment of 8, but ::operator new() uses alignment of 4,
+  // and an assertion will fail below after the call to MakeNewBlock()
+  // if you try to use a larger alignment.
+#ifdef __i386__
+  static const int kDefaultAlignment = 4;
+#else
+  static const int kDefaultAlignment = 8;
+#endif
+
+ private:
+  void* GetMemoryFallback(const size_t size, const int align);
+
+ public:
+  void* GetMemory(const size_t size, const int align) {
+    if ( size > 0 && size < remaining_ && align == 1 ) {       // common case
+      last_alloc_ = freestart_;
+      freestart_ += size;
+      remaining_ -= size;
+      return reinterpret_cast<void*>(last_alloc_);
+    }
+    return GetMemoryFallback(size, align);
+  }
+
+ private:
+  struct AllocatedBlock {
+    char *mem;
+    size_t size;
+  };
+
+  // The returned AllocatedBlock* is valid until the next call to AllocNewBlock
+  // or Reset (i.e. anything that might affect overflow_blocks_).
+  AllocatedBlock *AllocNewBlock(const size_t block_size);
+
+  const AllocatedBlock *IndexToBlock(int index) const;
+
+  const size_t block_size_;
+  char* freestart_;         // beginning of the free space in most recent block
+  char* freestart_when_empty_;  // beginning of the free space when we're empty
+  char* last_alloc_;         // used to make sure ReturnBytes() is safe
+  size_t remaining_;
+  // STL vector isn't as efficient as it could be, so we use an array at first
+  int blocks_alloced_;       // how many of the first_blocks_ have been alloced
+  AllocatedBlock first_blocks_[16];   // the length of this array is arbitrary
+  // if the first_blocks_ aren't enough, expand into overflow_blocks_.
+  vector<AllocatedBlock>* overflow_blocks_;
+
+  void FreeBlocks();         // Frees all except first block
+
+  DISALLOW_EVIL_CONSTRUCTORS(UnsafeArena);
+};
+
+// Operators for allocation on the arena
+// Syntax: new (AllocateInArena, arena) MyClass;
+// STL containers, etc.
+enum AllocateInArenaType { AllocateInArena };
+
+}  // namespace re2
+
+inline void* operator new(size_t size,
+                          re2::AllocateInArenaType /* unused */,
+                          re2::UnsafeArena *arena) {
+  return reinterpret_cast<char*>(arena->GetMemory(size, 1));
+}
+
+#endif  // RE2_UTIL_ARENA_H_
+
diff --git a/outside/re2/util/atomicops.h b/outside/re2/util/atomicops.h
new file mode 100644
index 000000000..dd951f261
--- /dev/null
+++ b/outside/re2/util/atomicops.h
@@ -0,0 +1,137 @@
+// Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_UTIL_ATOMICOPS_H__
+#define RE2_UTIL_ATOMICOPS_H__
+
+// The memory ordering constraints resemble the ones in C11.
+// RELAXED - no memory ordering, just an atomic operation.
+// CONSUME - data-dependent ordering.
+// ACQUIRE - prevents memory accesses from hoisting above the operation.
+// RELEASE - prevents memory accesses from sinking below the operation.
+ +#if (__clang_major__ * 100 + __clang_minor__ >= 303) || \ + (__GNUC__ * 1000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__ >= 40801) + +#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_RELAXED); } while (0) +#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_CONSUME); } while (0) +#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_ACQUIRE); } while (0) +#define ATOMIC_STORE_RELAXED(p, v) __atomic_store_n((p), (v), __ATOMIC_RELAXED) +#define ATOMIC_STORE_RELEASE(p, v) __atomic_store_n((p), (v), __ATOMIC_RELEASE) + +#else // old compiler + +#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = *(p); } while (0) +#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = *(p); MaybeReadMemoryBarrier(); } while (0) +#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = *(p); ReadMemoryBarrier(); } while (0) +#define ATOMIC_STORE_RELAXED(p, v) do { *(p) = (v); } while (0) +#define ATOMIC_STORE_RELEASE(p, v) do { WriteMemoryBarrier(); *(p) = (v); } while (0) + +// WriteMemoryBarrier(), ReadMemoryBarrier() and MaybeReadMemoryBarrier() +// are an implementation detail and must not be used in the rest of the code. + +#if defined(__i386__) + +static inline void WriteMemoryBarrier() { + int x; + __asm__ __volatile__("xchgl (%0),%0" // The lock prefix is implicit for xchg. + :: "r" (&x)); +} + +#elif defined(__x86_64__) + +// 64-bit implementations of memory barrier can be simpler, because +// "sfence" is guaranteed to exist. 
+static inline void WriteMemoryBarrier() { + __asm__ __volatile__("sfence" : : : "memory"); +} + +#elif defined(__ppc__) + +static inline void WriteMemoryBarrier() { + __asm__ __volatile__("eieio" : : : "memory"); +} + +#elif defined(__alpha__) + +static inline void WriteMemoryBarrier() { + __asm__ __volatile__("wmb" : : : "memory"); +} + +#elif defined(__aarch64__) + +static inline void WriteMemoryBarrier() { + __asm__ __volatile__("dmb st" : : : "memory"); +} + +#else + +#include "util/mutex.h" + +static inline void WriteMemoryBarrier() { + // Slight overkill, but good enough: + // any mutex implementation must have + // a read barrier after the lock operation and + // a write barrier before the unlock operation. + // + // It may be worthwhile to write architecture-specific + // barriers for the common platforms, as above, but + // this is a correct fallback. + re2::Mutex mu; + re2::MutexLock l(&mu); +} + +/* +#error Need WriteMemoryBarrier for architecture. + +// Windows +inline void WriteMemoryBarrier() { + LONG x; + ::InterlockedExchange(&x, 0); +} +*/ + +#endif + +// Alpha has very weak memory ordering. If relying on WriteBarriers, one must +// use read barriers for the readers too. +#if defined(__alpha__) + +static inline void MaybeReadMemoryBarrier() { + __asm__ __volatile__("mb" : : : "memory"); +} + +#else + +static inline void MaybeReadMemoryBarrier() {} + +#endif // __alpha__ + +// Read barrier for various targets. 
+ +#if defined(__aarch64__) + +static inline void ReadMemoryBarrier() { + __asm__ __volatile__("dmb ld" : : : "memory"); +} + +#elif defined(__alpha__) + +static inline void ReadMemoryBarrier() { + __asm__ __volatile__("mb" : : : "memory"); +} + +#else + +static inline void ReadMemoryBarrier() {} + +#endif + +#endif // old compiler + +#ifndef NO_THREAD_SAFETY_ANALYSIS +#define NO_THREAD_SAFETY_ANALYSIS +#endif + +#endif // RE2_UTIL_ATOMICOPS_H__ diff --git a/outside/re2/util/benchmark.cc b/outside/re2/util/benchmark.cc new file mode 100644 index 000000000..c3aad7ed8 --- /dev/null +++ b/outside/re2/util/benchmark.cc @@ -0,0 +1,153 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "util/util.h" +#include "util/flags.h" +#include "util/benchmark.h" +#include "re2/re2.h" + +DEFINE_string(test_tmpdir, "/var/tmp", "temp directory"); + +using testing::Benchmark; +using namespace re2; + +static Benchmark* benchmarks[10000]; +static int nbenchmarks; + +void Benchmark::Register() { + benchmarks[nbenchmarks] = this; + if(lo < 1) + lo = 1; + if(hi < lo) + hi = lo; + nbenchmarks++; +} + +static int64 nsec() { + struct timeval tv; + if(gettimeofday(&tv, 0) < 0) + return -1; + return (int64)tv.tv_sec*1000*1000*1000 + tv.tv_usec*1000; +} + +static int64 bytes; +static int64 ns; +static int64 t0; +static int64 items; + +void SetBenchmarkBytesProcessed(long long x) { + bytes = x; +} + +void StopBenchmarkTiming() { + if(t0 != 0) + ns += nsec() - t0; + t0 = 0; +} + +void StartBenchmarkTiming() { + if(t0 == 0) + t0 = nsec(); +} + +void SetBenchmarkItemsProcessed(int n) { + items = n; +} + +void BenchmarkMemoryUsage() { + // TODO(rsc): Implement. 
+} + +int NumCPUs() { + return 1; +} + +static void runN(Benchmark *b, int n, int siz) { + bytes = 0; + items = 0; + ns = 0; + t0 = nsec(); + if(b->fn) + b->fn(n); + else if(b->fnr) + b->fnr(n, siz); + else { + fprintf(stderr, "%s: missing function\n", b->name); + exit(2); + } + if(t0 != 0) + ns += nsec() - t0; +} + +static int round(int n) { + int base = 1; + + while(base*10 < n) + base *= 10; + if(n < 2*base) + return 2*base; + if(n < 5*base) + return 5*base; + return 10*base; +} + +void RunBench(Benchmark* b, int nthread, int siz) { + int n, last; + + // TODO(rsc): Threaded benchmarks. + if(nthread != 1) + return; + + // run once in case it's expensive + n = 1; + runN(b, n, siz); + while(ns < (int)1e9 && n < (int)1e9) { + last = n; + if(ns/n == 0) + n = 1e9; + else + n = 1e9 / (ns/n); + + n = max(last+1, min(n+n/2, 100*last)); + n = round(n); + runN(b, n, siz); + } + + char mb[100]; + char suf[100]; + mb[0] = '\0'; + suf[0] = '\0'; + if(ns > 0 && bytes > 0) + snprintf(mb, sizeof mb, "\t%7.2f MB/s", ((double)bytes/1e6)/((double)ns/1e9)); + if(b->fnr || b->lo != b->hi) { + if(siz >= (1<<20)) + snprintf(suf, sizeof suf, "/%dM", siz/(1<<20)); + else if(siz >= (1<<10)) + snprintf(suf, sizeof suf, "/%dK", siz/(1<<10)); + else + snprintf(suf, sizeof suf, "/%d", siz); + } + printf("%s%s\t%8lld\t%10lld ns/op%s\n", b->name, suf, (long long)n, (long long)ns/n, mb); + fflush(stdout); +} + +static int match(const char* name, int argc, const char** argv) { + if(argc == 1) + return 1; + for(int i = 1; i < argc; i++) + if(RE2::PartialMatch(name, argv[i])) + return 1; + return 0; +} + +int main(int argc, const char** argv) { + for(int i = 0; i < nbenchmarks; i++) { + Benchmark* b = benchmarks[i]; + if(match(b->name, argc, argv)) + for(int j = b->threadlo; j <= b->threadhi; j++) + for(int k = max(b->lo, 1); k <= max(b->hi, 1); k<<=1) + RunBench(b, j, k); + } +} + diff --git a/outside/re2/util/benchmark.h b/outside/re2/util/benchmark.h new file mode 100644 index 
000000000..31bbd5348 --- /dev/null +++ b/outside/re2/util/benchmark.h @@ -0,0 +1,41 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef RE2_UTIL_BENCHMARK_H__ +#define RE2_UTIL_BENCHMARK_H__ + +namespace testing { +struct Benchmark { + const char* name; + void (*fn)(int); + void (*fnr)(int, int); + int lo; + int hi; + int threadlo; + int threadhi; + + void Register(); + Benchmark(const char* name, void (*f)(int)) { Clear(name); fn = f; Register(); } + Benchmark(const char* name, void (*f)(int, int), int l, int h) { Clear(name); fnr = f; lo = l; hi = h; Register(); } + void Clear(const char* n) { name = n; fn = 0; fnr = 0; lo = 0; hi = 0; threadlo = 0; threadhi = 0; } + Benchmark* ThreadRange(int lo, int hi) { threadlo = lo; threadhi = hi; return this; } +}; +} // namespace testing + +void SetBenchmarkBytesProcessed(long long); +void StopBenchmarkTiming(); +void StartBenchmarkTiming(); +void BenchmarkMemoryUsage(); +void SetBenchmarkItemsProcessed(int); + +int NumCPUs(); + +#define BENCHMARK(f) \ + ::testing::Benchmark* _benchmark_##f = (new ::testing::Benchmark(#f, f)) + +#define BENCHMARK_RANGE(f, lo, hi) \ + ::testing::Benchmark* _benchmark_##f = \ + (new ::testing::Benchmark(#f, f, lo, hi)) + +#endif // RE2_UTIL_BENCHMARK_H__ diff --git a/outside/re2/util/flags.h b/outside/re2/util/flags.h new file mode 100644 index 000000000..77a06a222 --- /dev/null +++ b/outside/re2/util/flags.h @@ -0,0 +1,27 @@ +// Copyright 2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Simplified version of Google's command line flags. +// Does not support parsing the command line. 
+// If you want to do that, see +// http://code.google.com/p/google-gflags + +#ifndef RE2_UTIL_FLAGS_H__ +#define RE2_UTIL_FLAGS_H__ + +#define DEFINE_flag(type, name, deflt, desc) \ + namespace re2 { type FLAGS_##name = deflt; } + +#define DECLARE_flag(type, name) \ + namespace re2 { extern type FLAGS_##name; } + +#define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc) +#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32, name, deflt, desc) +#define DEFINE_string(name, deflt, desc) DEFINE_flag(string, name, deflt, desc) + +#define DECLARE_bool(name) DECLARE_flag(bool, name) +#define DECLARE_int32(name) DECLARE_flag(int32, name) +#define DECLARE_string(name) DECLARE_flag(string, name) + +#endif // RE2_UTIL_FLAGS_H__ diff --git a/outside/re2/util/hash.cc b/outside/re2/util/hash.cc new file mode 100644 index 000000000..dfef7b7c3 --- /dev/null +++ b/outside/re2/util/hash.cc @@ -0,0 +1,231 @@ +// Modified by Russ Cox to add "namespace re2". +// Also threw away all but hashword and hashword2. +// http://burtleburtle.net/bob/c/lookup3.c + +/* +------------------------------------------------------------------------------- +lookup3.c, by Bob Jenkins, May 2006, Public Domain. + +These are functions for producing 32-bit hashes for hash table lookup. +hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() +are externally useful functions. Routines to test the hash are included +if SELF_TEST is defined. You can use this free for any purpose. It's in +the public domain. It has no warranty. + +You probably want to use hashlittle(). hashlittle() and hashbig() +hash byte arrays. hashlittle() is is faster than hashbig() on +little-endian machines. Intel and AMD are little-endian machines. +On second thought, you probably want hashlittle2(), which is identical to +hashlittle() except it returns two 32-bit hashes for the price of one. +You could implement hashbig2() if you wanted but I haven't bothered here. 
+ +If you want to find a hash of, say, exactly 7 integers, do + a = i1; b = i2; c = i3; + mix(a,b,c); + a += i4; b += i5; c += i6; + mix(a,b,c); + a += i7; + final(a,b,c); +then use c as the hash value. If you have a variable length array of +4-byte integers to hash, use hashword(). If you have a byte array (like +a character string), use hashlittle(). If you have several byte arrays, or +a mix of things, see the comments above hashlittle(). + +Why is this so big? I read 12 bytes at a time into 3 4-byte integers, +then mix those integers. This is fast (you can do a lot more thorough +mixing with 12*3 instructions on 3 integers than you can with 3 instructions +on 1 byte), but shoehorning those bytes into integers efficiently is messy. +------------------------------------------------------------------------------- +*/ + +#include "util/util.h" + +#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k)))) + +/* +------------------------------------------------------------------------------- +mix -- mix 3 32-bit values reversibly. + +This is reversible, so any information in (a,b,c) before mix() is +still in (a,b,c) after mix(). + +If four pairs of (a,b,c) inputs are run through mix(), or through +mix() in reverse, there are at least 32 bits of the output that +are sometimes the same for one pair and different for another pair. +This was tested for: +* pairs that differed by one bit, by two bits, in any combination + of top bits of (a,b,c), or in any combination of bottom bits of + (a,b,c). +* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + is commonly produced by subtraction) look like a single 1-bit + difference. +* the base values were pseudorandom, all zero but one bit set, or + all zero plus a counter that starts at zero. 
+ +Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that +satisfy this are + 4 6 8 16 19 4 + 9 15 3 18 27 15 + 14 9 3 7 17 3 +Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing +for "differ" defined as + with a one-bit base and a two-bit delta. I +used http://burtleburtle.net/bob/hash/avalanche.html to choose +the operations, constants, and arrangements of the variables. + +This does not achieve avalanche. There are input bits of (a,b,c) +that fail to affect some output bits of (a,b,c), especially of a. The +most thoroughly mixed value is c, but it doesn't really even achieve +avalanche in c. + +This allows some parallelism. Read-after-writes are good at doubling +the number of bits affected, so the goal of mixing pulls in the opposite +direction as the goal of parallelism. I did what I could. Rotates +seem to cost as much as shifts on every machine I could lay my hands +on, and rotates are much kinder to the top and bottom bits, so I used +rotates. +------------------------------------------------------------------------------- +*/ +#define mix(a,b,c) \ +{ \ + a -= c; a ^= rot(c, 4); c += b; \ + b -= a; b ^= rot(a, 6); a += c; \ + c -= b; c ^= rot(b, 8); b += a; \ + a -= c; a ^= rot(c,16); c += b; \ + b -= a; b ^= rot(a,19); a += c; \ + c -= b; c ^= rot(b, 4); b += a; \ +} + +/* +------------------------------------------------------------------------------- +final -- final mixing of 3 32-bit values (a,b,c) into c + +Pairs of (a,b,c) values differing in only a few bits will usually +produce values of c that look totally different. This was tested for +* pairs that differed by one bit, by two bits, in any combination + of top bits of (a,b,c), or in any combination of bottom bits of + (a,b,c). +* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + is commonly produced by subtraction) look like a single 1-bit + difference. 
+* the base values were pseudorandom, all zero but one bit set, or + all zero plus a counter that starts at zero. + +These constants passed: + 14 11 25 16 4 14 24 + 12 14 25 16 4 14 24 +and these came close: + 4 8 15 26 3 22 24 + 10 8 15 26 3 22 24 + 11 8 15 26 3 22 24 +------------------------------------------------------------------------------- +*/ +#define final(a,b,c) \ +{ \ + c ^= b; c -= rot(b,14); \ + a ^= c; a -= rot(c,11); \ + b ^= a; b -= rot(a,25); \ + c ^= b; c -= rot(b,16); \ + a ^= c; a -= rot(c,4); \ + b ^= a; b -= rot(a,14); \ + c ^= b; c -= rot(b,24); \ +} + +namespace re2 { + +/* +-------------------------------------------------------------------- + This works on all machines. To be useful, it requires + -- that the key be an array of uint32_t's, and + -- that the length be the number of uint32_t's in the key + + The function hashword() is identical to hashlittle() on little-endian + machines, and identical to hashbig() on big-endian machines, + except that the length has to be measured in uint32_ts rather than in + bytes. hashlittle() is more complicated than hashword() only because + hashlittle() has to dance around fitting the key bytes into registers. 
+-------------------------------------------------------------------- +*/ +uint32 hashword( +const uint32 *k, /* the key, an array of uint32_t values */ +size_t length, /* the length of the key, in uint32_ts */ +uint32 initval) /* the previous hash, or an arbitrary value */ +{ + uint32_t a,b,c; + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval; + + /*------------------------------------------------- handle most of the key */ + while (length > 3) + { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a,b,c); + length -= 3; + k += 3; + } + + /*------------------------------------------- handle the last 3 uint32_t's */ + switch(length) /* all the case statements fall through */ + { + case 3 : c+=k[2]; + case 2 : b+=k[1]; + case 1 : a+=k[0]; + final(a,b,c); + case 0: /* case 0: nothing left to add */ + break; + } + /*------------------------------------------------------ report the result */ + return c; +} + + +/* +-------------------------------------------------------------------- +hashword2() -- same as hashword(), but take two seeds and return two +32-bit values. pc and pb must both be nonnull, and *pc and *pb must +both be initialized with seeds. If you pass in (*pb)==0, the output +(*pc) will be the same as the return value from hashword(). 
+--------------------------------------------------------------------
+*/
+void hashword2 (
+const uint32 *k,                   /* the key, an array of uint32_t values */
+size_t        length,               /* the length of the key, in uint32_ts */
+uint32       *pc,                      /* IN: seed OUT: primary hash value */
+uint32       *pb)               /* IN: more seed OUT: secondary hash value */
+{
+  uint32_t a,b,c;
+
+  /* Set up the internal state */
+  a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
+  c += *pb;
+
+  /*------------------------------------------------- handle most of the key */
+  while (length > 3)
+  {
+    a += k[0];
+    b += k[1];
+    c += k[2];
+    mix(a,b,c);
+    length -= 3;
+    k += 3;
+  }
+
+  /*------------------------------------------- handle the last 3 uint32_t's */
+  switch(length)                     /* all the case statements fall through */
+  {
+  case 3 : c+=k[2];
+  case 2 : b+=k[1];
+  case 1 : a+=k[0];
+    final(a,b,c);
+  case 0:     /* case 0: nothing left to add */
+    break;
+  }
+  /*------------------------------------------------------ report the result */
+  *pc=c; *pb=b;
+}
+
+}  // namespace re2
diff --git a/outside/re2/util/logging.h b/outside/re2/util/logging.h
new file mode 100644
index 000000000..4443f7cdf
--- /dev/null
+++ b/outside/re2/util/logging.h
@@ -0,0 +1,86 @@
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Simplified version of Google's logging.
+
+#ifndef RE2_UTIL_LOGGING_H__
+#define RE2_UTIL_LOGGING_H__
+
+#include <unistd.h>  /* for write */
+#include <sstream>
+
+// Debug-only checking.
+#define DCHECK(condition) assert(condition) +#define DCHECK_EQ(val1, val2) assert((val1) == (val2)) +#define DCHECK_NE(val1, val2) assert((val1) != (val2)) +#define DCHECK_LE(val1, val2) assert((val1) <= (val2)) +#define DCHECK_LT(val1, val2) assert((val1) < (val2)) +#define DCHECK_GE(val1, val2) assert((val1) >= (val2)) +#define DCHECK_GT(val1, val2) assert((val1) > (val2)) + +// Always-on checking +#define CHECK(x) if(x){}else LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x +#define CHECK_LT(x, y) CHECK((x) < (y)) +#define CHECK_GT(x, y) CHECK((x) > (y)) +#define CHECK_LE(x, y) CHECK((x) <= (y)) +#define CHECK_GE(x, y) CHECK((x) >= (y)) +#define CHECK_EQ(x, y) CHECK((x) == (y)) +#define CHECK_NE(x, y) CHECK((x) != (y)) + +#define LOG_INFO LogMessage(__FILE__, __LINE__) +#define LOG_ERROR LOG_INFO +#define LOG_WARNING LOG_INFO +#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__) +#define LOG_QFATAL LOG_FATAL + +#define VLOG(x) if((x)>0){}else LOG_INFO.stream() + +#ifdef NDEBUG +#define DEBUG_MODE 0 +#define LOG_DFATAL LOG_ERROR +#else +#define DEBUG_MODE 1 +#define LOG_DFATAL LOG_FATAL +#endif + +#define LOG(severity) LOG_ ## severity.stream() + +class LogMessage { + public: + LogMessage(const char* file, int line) : flushed_(false) { + stream() << file << ":" << line << ": "; + } + void Flush() { + stream() << "\n"; + string s = str_.str(); + int n = (int)s.size(); // shut up msvc + if(write(2, s.data(), n) < 0) {} // shut up gcc + flushed_ = true; + } + ~LogMessage() { + if (!flushed_) { + Flush(); + } + } + ostream& stream() { return str_; } + + private: + bool flushed_; + std::ostringstream str_; + DISALLOW_EVIL_CONSTRUCTORS(LogMessage); +}; + +class LogMessageFatal : public LogMessage { + public: + LogMessageFatal(const char* file, int line) + : LogMessage(file, line) { } + ~LogMessageFatal() { + Flush(); + abort(); + } + private: + DISALLOW_EVIL_CONSTRUCTORS(LogMessageFatal); +}; + +#endif // RE2_UTIL_LOGGING_H__ diff --git 
a/outside/re2/util/mutex.h b/outside/re2/util/mutex.h
new file mode 100644
index 000000000..4a8de4c18
--- /dev/null
+++ b/outside/re2/util/mutex.h
@@ -0,0 +1,211 @@
+// Copyright 2007 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+ * A simple mutex wrapper, supporting locks and read-write locks.
+ * You should assume the locks are *not* re-entrant.
+ */
+
+#ifndef RE2_UTIL_MUTEX_H_
+#define RE2_UTIL_MUTEX_H_
+
+#include <stdlib.h>
+
+namespace re2 {
+
+#define HAVE_PTHREAD 1
+#define HAVE_RWLOCK 1
+
+#if defined(NO_THREADS)
+  typedef int MutexType;      // to keep a lock-count
+#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
+  // Needed for pthread_rwlock_*.  If it causes problems, you could take it
+  // out, but then you'd have to unset HAVE_RWLOCK (at least on linux -- it
+  // *does* cause problems for FreeBSD, or MacOSX, but isn't needed
+  // for locking there.)
+# ifdef __linux__
+#   undef _XOPEN_SOURCE
+#   define _XOPEN_SOURCE 500  // may be needed to get the rwlock calls
+# endif
+# include <pthread.h>
+  typedef pthread_rwlock_t MutexType;
+#elif defined(HAVE_PTHREAD)
+# include <pthread.h>
+  typedef pthread_mutex_t MutexType;
+#elif defined(WIN32)
+# define WIN32_LEAN_AND_MEAN  // We only need minimal includes
+# ifdef GMUTEX_TRYLOCK
+  // We need Windows NT or later for TryEnterCriticalSection().  If you
+  // don't need that functionality, you can remove these _WIN32_WINNT
+  // lines, and change TryLock() to assert(0) or something.
+#   ifndef _WIN32_WINNT
+#     define _WIN32_WINNT 0x0400
+#   endif
+# endif
+# include <windows.h>
+  typedef CRITICAL_SECTION MutexType;
+#else
+# error Need to implement mutex.h for your architecture, or #define NO_THREADS
+#endif
+
+class Mutex {
+ public:
+  // Create a Mutex that is not held by anybody.
+  inline Mutex();
+
+  // Destructor
+  inline ~Mutex();
+
+  inline void Lock();    // Block if needed until free then acquire exclusively
+  inline void Unlock();  // Release a lock acquired via Lock()
+  inline bool TryLock(); // If free, Lock() and return true, else return false
+  // Note that on systems that don't support read-write locks, these may
+  // be implemented as synonyms to Lock() and Unlock().  So you can use
+  // these for efficiency, but don't use them anyplace where being able
+  // to do shared reads is necessary to avoid deadlock.
+  inline void ReaderLock();   // Block until free or shared then acquire a share
+  inline void ReaderUnlock(); // Release a read share of this Mutex
+  inline void WriterLock() { Lock(); }     // Acquire an exclusive lock
+  inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock()
+  inline void AssertHeld() { }
+
+ private:
+  MutexType mutex_;
+
+  // Catch the error of writing Mutex when intending MutexLock.
+  Mutex(Mutex *ignored);
+  // Disallow "evil" constructors
+  Mutex(const Mutex&);
+  void operator=(const Mutex&);
+};
+
+// Now the implementation of Mutex for various systems
+#if defined(NO_THREADS)
+
+// When we don't have threads, we can be either reading or writing,
+// but not both.  We can have lots of readers at once (in no-threads
+// mode, that's most likely to happen in recursive function calls),
+// but only one writer.  We represent this by having mutex_ be -1 when
+// writing and a number > 0 when reading (and 0 when no lock is held).
+//
+// In debug mode, we assert these invariants, while in non-debug mode
+// we do nothing, for efficiency.  That's why everything is in an
+// assert.
+#include + +Mutex::Mutex() : mutex_(0) { } +Mutex::~Mutex() { assert(mutex_ == 0); } +void Mutex::Lock() { assert(--mutex_ == -1); } +void Mutex::Unlock() { assert(mutex_++ == -1); } +bool Mutex::TryLock() { if (mutex_) return false; Lock(); return true; } +void Mutex::ReaderLock() { assert(++mutex_ > 0); } +void Mutex::ReaderUnlock() { assert(mutex_-- > 0); } + +#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK) + +#define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0) + +Mutex::Mutex() { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); } +Mutex::~Mutex() { SAFE_PTHREAD(pthread_rwlock_destroy(&mutex_)); } +void Mutex::Lock() { SAFE_PTHREAD(pthread_rwlock_wrlock(&mutex_)); } +void Mutex::Unlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); } +bool Mutex::TryLock() { return pthread_rwlock_trywrlock(&mutex_) == 0; } +void Mutex::ReaderLock() { SAFE_PTHREAD(pthread_rwlock_rdlock(&mutex_)); } +void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); } + +#undef SAFE_PTHREAD + +#elif defined(HAVE_PTHREAD) + +#define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0) + +Mutex::Mutex() { SAFE_PTHREAD(pthread_mutex_init(&mutex_, NULL)); } +Mutex::~Mutex() { SAFE_PTHREAD(pthread_mutex_destroy(&mutex_)); } +void Mutex::Lock() { SAFE_PTHREAD(pthread_mutex_lock(&mutex_)); } +void Mutex::Unlock() { SAFE_PTHREAD(pthread_mutex_unlock(&mutex_)); } +bool Mutex::TryLock() { return pthread_mutex_trylock(&mutex_) == 0; } +void Mutex::ReaderLock() { Lock(); } // we don't have read-write locks +void Mutex::ReaderUnlock() { Unlock(); } +#undef SAFE_PTHREAD + +#elif defined(WIN32) + +Mutex::Mutex() { InitializeCriticalSection(&mutex_); } +Mutex::~Mutex() { DeleteCriticalSection(&mutex_); } +void Mutex::Lock() { EnterCriticalSection(&mutex_); } +void Mutex::Unlock() { LeaveCriticalSection(&mutex_); } +bool Mutex::TryLock() { return TryEnterCriticalSection(&mutex_) != 0; } +void Mutex::ReaderLock() { Lock(); } // we don't have 
read-write locks
+void Mutex::ReaderUnlock() { Unlock(); }
+
+#endif
+
+
+// --------------------------------------------------------------------------
+// Scoped lock holders
+
+// Holds mu exclusively (Lock/Unlock) for the lifetime of the object.
+class MutexLock {
+  Mutex* const held_;
+  // Copying and assignment are declared private to forbid them.
+  MutexLock(const MutexLock&);
+  void operator=(const MutexLock&);
+ public:
+  explicit MutexLock(Mutex* mu) : held_(mu) { held_->Lock(); }
+  ~MutexLock() { held_->Unlock(); }
+};
+
+// Holds a read share of mu (ReaderLock/ReaderUnlock) for the lifetime of
+// the object.
+class ReaderMutexLock {
+  Mutex* const held_;
+  // Copying and assignment are declared private to forbid them.
+  ReaderMutexLock(const ReaderMutexLock&);
+  void operator=(const ReaderMutexLock&);
+ public:
+  explicit ReaderMutexLock(Mutex* mu) : held_(mu) { held_->ReaderLock(); }
+  ~ReaderMutexLock() { held_->ReaderUnlock(); }
+};
+
+// Holds mu through WriterLock/WriterUnlock for the lifetime of the object.
+class WriterMutexLock {
+  Mutex* const held_;
+  // Copying and assignment are declared private to forbid them.
+  WriterMutexLock(const WriterMutexLock&);
+  void operator=(const WriterMutexLock&);
+ public:
+  explicit WriterMutexLock(Mutex* mu) : held_(mu) { held_->WriterLock(); }
+  ~WriterMutexLock() { held_->WriterUnlock(); }
+};
+
+// Turn the "forgot the variable name" mistake, e.g. MutexLock (&mu);, into a
+// compile-time failure.
+#define MutexLock(x) COMPILE_ASSERT(0, mutex_lock_decl_missing_var_name)
+#define ReaderMutexLock(x) COMPILE_ASSERT(0, rmutex_lock_decl_missing_var_name)
+#define WriterMutexLock(x) COMPILE_ASSERT(0, wmutex_lock_decl_missing_var_name)
+
+// Provide safe way to declare and use global, linker-initialized mutex. Sigh.
+#ifdef HAVE_PTHREAD + +#define GLOBAL_MUTEX(name) \ + static pthread_mutex_t (name) = PTHREAD_MUTEX_INITIALIZER +#define GLOBAL_MUTEX_LOCK(name) \ + pthread_mutex_lock(&(name)) +#define GLOBAL_MUTEX_UNLOCK(name) \ + pthread_mutex_unlock(&(name)) + +#else + +#define GLOBAL_MUTEX(name) \ + static Mutex name +#define GLOBAL_MUTEX_LOCK(name) \ + name.Lock() +#define GLOBAL_MUTEX_UNLOCK(name) \ + name.Unlock() + +#endif + +} // namespace re2 + +#endif /* #define RE2_UTIL_MUTEX_H_ */ diff --git a/outside/re2/util/pcre.cc b/outside/re2/util/pcre.cc new file mode 100644 index 000000000..5e67e1f38 --- /dev/null +++ b/outside/re2/util/pcre.cc @@ -0,0 +1,961 @@ +// Copyright 2003-2009 Google Inc. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This is a variant of PCRE's pcrecpp.cc, originally written at Google. +// The main changes are the addition of the HitLimit method and +// compilation as PCRE in namespace re2. + +#include +#include "util/util.h" +#include "util/flags.h" +#include "util/pcre.h" + +#define PCREPORT(level) LOG(level) + +// Default PCRE limits. +// Defaults chosen to allow a plausible amount of CPU and +// not exceed main thread stacks. Note that other threads +// often have smaller stacks, and therefore tightening +// regexp_stack_limit may frequently be necessary. +DEFINE_int32(regexp_stack_limit, 256<<10, "default PCRE stack limit (bytes)"); +DEFINE_int32(regexp_match_limit, 1000000, + "default PCRE match limit (function calls)"); + +namespace re2 { + +// Maximum number of args we can set +static const int kMaxArgs = 16; +static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace + +// Approximate size of a recursive invocation of PCRE's +// internal "match()" frame. This varies depending on the +// compiler and architecture, of course, so the constant is +// just a conservative estimate. 
To find the exact number, +// run regexp_unittest with --regexp_stack_limit=0 under +// a debugger and look at the frames when it crashes. +// The exact frame size was 656 in production on 2008/02/03. +static const int kPCREFrameSize = 700; + +// Special name for missing C++ arguments. +PCRE::Arg PCRE::no_more_args((void*)NULL); + +const PCRE::PartialMatchFunctor PCRE::PartialMatch = { }; +const PCRE::FullMatchFunctor PCRE::FullMatch = { } ; +const PCRE::ConsumeFunctor PCRE::Consume = { }; +const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { }; + +// If a regular expression has no error, its error_ field points here +static const string empty_string; + +void PCRE::Init(const char* pattern, Option options, int match_limit, + int stack_limit, bool report_errors) { + pattern_ = pattern; + options_ = options; + match_limit_ = match_limit; + stack_limit_ = stack_limit; + hit_limit_ = false; + error_ = &empty_string; + report_errors_ = report_errors; + re_full_ = NULL; + re_partial_ = NULL; + + if (options & ~(EnabledCompileOptions | EnabledExecOptions)) { + error_ = new string("illegal regexp option"); + PCREPORT(ERROR) + << "Error compiling '" << pattern << "': illegal regexp option"; + } else { + re_partial_ = Compile(UNANCHORED); + if (re_partial_ != NULL) { + re_full_ = Compile(ANCHOR_BOTH); + } + } +} + +PCRE::PCRE(const char* pattern) { + Init(pattern, None, 0, 0, true); +} +PCRE::PCRE(const char* pattern, Option option) { + Init(pattern, option, 0, 0, true); +} +PCRE::PCRE(const string& pattern) { + Init(pattern.c_str(), None, 0, 0, true); +} +PCRE::PCRE(const string& pattern, Option option) { + Init(pattern.c_str(), option, 0, 0, true); +} +PCRE::PCRE(const string& pattern, const PCRE_Options& re_option) { + Init(pattern.c_str(), re_option.option(), re_option.match_limit(), + re_option.stack_limit(), re_option.report_errors()); +} + +PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) { + Init(pattern, re_option.option(), 
re_option.match_limit(), + re_option.stack_limit(), re_option.report_errors()); +} + +PCRE::~PCRE() { + if (re_full_ != NULL) pcre_free(re_full_); + if (re_partial_ != NULL) pcre_free(re_partial_); + if (error_ != &empty_string) delete error_; +} + +pcre* PCRE::Compile(Anchor anchor) { + // Special treatment for anchoring. This is needed because at + // runtime pcre only provides an option for anchoring at the + // beginning of a string. + // + // There are three types of anchoring we want: + // UNANCHORED Compile the original pattern, and use + // a pcre unanchored match. + // ANCHOR_START Compile the original pattern, and use + // a pcre anchored match. + // ANCHOR_BOTH Tack a "\z" to the end of the original pattern + // and use a pcre anchored match. + + const char* error; + int eoffset; + pcre* re; + if (anchor != ANCHOR_BOTH) { + re = pcre_compile(pattern_.c_str(), + (options_ & EnabledCompileOptions), + &error, &eoffset, NULL); + } else { + // Tack a '\z' at the end of PCRE. Parenthesize it first so that + // the '\z' applies to all top-level alternatives in the regexp. 
+ string wrapped = "(?:"; // A non-counting grouping operator + wrapped += pattern_; + wrapped += ")\\z"; + re = pcre_compile(wrapped.c_str(), + (options_ & EnabledCompileOptions), + &error, &eoffset, NULL); + } + if (re == NULL) { + if (error_ == &empty_string) error_ = new string(error); + PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error; + } + return re; +} + +/***** Convenience interfaces *****/ + +bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text, + const PCRE& re, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + int consumed; + int vec[kVecSize]; + return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); +} + +bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text, + const PCRE& re, + const Arg& a0, + const Arg& a1, + 
const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + int consumed; + int vec[kVecSize]; + return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); +} + +bool PCRE::ConsumeFunctor::operator ()(StringPiece* input, + const PCRE& pattern, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto 
done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + int consumed; + int vec[kVecSize]; + if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed, + args, n, vec, kVecSize)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input, + const PCRE& pattern, + const Arg& a0, + const Arg& a1, + const Arg& a2, + const Arg& a3, + const Arg& a4, + const Arg& a5, + const Arg& a6, + const Arg& a7, + const Arg& a8, + const Arg& a9, + const Arg& a10, + const Arg& a11, + const Arg& a12, + const Arg& a13, + const Arg& a14, + const Arg& a15) const { + const Arg* args[kMaxArgs]; + int n = 0; + if (&a0 == &no_more_args) goto done; args[n++] = &a0; + if (&a1 == &no_more_args) goto done; args[n++] = &a1; + if (&a2 == &no_more_args) goto done; args[n++] = &a2; + if (&a3 == &no_more_args) goto done; args[n++] = &a3; + if (&a4 == &no_more_args) goto done; args[n++] = &a4; + if (&a5 == &no_more_args) goto done; args[n++] = &a5; + if (&a6 == &no_more_args) goto done; args[n++] = &a6; + if (&a7 == &no_more_args) goto done; args[n++] = &a7; + if (&a8 == &no_more_args) goto done; args[n++] = &a8; + if (&a9 == &no_more_args) goto done; args[n++] = &a9; + if (&a10 == &no_more_args) goto done; args[n++] = &a10; + if (&a11 == &no_more_args) goto done; args[n++] = &a11; + if (&a12 == &no_more_args) goto 
done; args[n++] = &a12; + if (&a13 == &no_more_args) goto done; args[n++] = &a13; + if (&a14 == &no_more_args) goto done; args[n++] = &a14; + if (&a15 == &no_more_args) goto done; args[n++] = &a15; +done: + + int consumed; + int vec[kVecSize]; + if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed, + args, n, vec, kVecSize)) { + input->remove_prefix(consumed); + return true; + } else { + return false; + } +} + +bool PCRE::Replace(string *str, + const PCRE& pattern, + const StringPiece& rewrite) { + int vec[kVecSize]; + int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); + if (matches == 0) + return false; + + string s; + if (!pattern.Rewrite(&s, rewrite, *str, vec, matches)) + return false; + + assert(vec[0] >= 0); + assert(vec[1] >= 0); + str->replace(vec[0], vec[1] - vec[0], s); + return true; +} + +int PCRE::GlobalReplace(string *str, + const PCRE& pattern, + const StringPiece& rewrite) { + int count = 0; + int vec[kVecSize]; + string out; + int start = 0; + bool last_match_was_empty_string = false; + + for (; start <= str->length();) { + // If the previous match was for the empty string, we shouldn't + // just match again: we'll match in the same way and get an + // infinite loop. Instead, we do the match in a special way: + // anchored -- to force another try at the same position -- + // and with a flag saying that this time, ignore empty matches. + // If this special match returns, that means there's a non-empty + // match at this position as well, and we can continue. If not, + // we do what perl does, and just advance by one. 
+ // Notice that perl prints '@@@' for this; + // perl -le '$_ = "aa"; s/b*|aa/@/g; print' + int matches; + if (last_match_was_empty_string) { + matches = pattern.TryMatch(*str, start, ANCHOR_START, false, + vec, kVecSize); + if (matches <= 0) { + if (start < str->length()) + out.push_back((*str)[start]); + start++; + last_match_was_empty_string = false; + continue; + } + } else { + matches = pattern.TryMatch(*str, start, UNANCHORED, true, vec, kVecSize); + if (matches <= 0) + break; + } + int matchstart = vec[0], matchend = vec[1]; + assert(matchstart >= start); + assert(matchend >= matchstart); + + out.append(*str, start, matchstart - start); + pattern.Rewrite(&out, rewrite, *str, vec, matches); + start = matchend; + count++; + last_match_was_empty_string = (matchstart == matchend); + } + + if (count == 0) + return 0; + + if (start < str->length()) + out.append(*str, start, str->length() - start); + swap(out, *str); + return count; +} + +bool PCRE::Extract(const StringPiece &text, + const PCRE& pattern, + const StringPiece &rewrite, + string *out) { + int vec[kVecSize]; + int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); + if (matches == 0) + return false; + out->clear(); + return pattern.Rewrite(out, rewrite, text, vec, matches); +} + +string PCRE::QuoteMeta(const StringPiece& unquoted) { + string result; + result.reserve(unquoted.size() << 1); + + // Escape any ascii character not in [A-Za-z_0-9]. + // + // Note that it's legal to escape a character even if it has no + // special meaning in a regular expression -- so this function does + // that. (This also makes it identical to the perl function of the + // same name except for the null-character special case; + // see `perldoc -f quotemeta`.) 
+ for (int ii = 0; ii < unquoted.length(); ++ii) { + // Note that using 'isalnum' here raises the benchmark time from + // 32ns to 58ns: + if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && + (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && + (unquoted[ii] < '0' || unquoted[ii] > '9') && + unquoted[ii] != '_' && + // If this is the part of a UTF8 or Latin1 character, we need + // to copy this byte without escaping. Experimentally this is + // what works correctly with the regexp library. + !(unquoted[ii] & 128)) { + if (unquoted[ii] == '\0') { // Special handling for null chars. + // Can't use "\\0" since the next character might be a digit. + result += "\\x00"; + continue; + } + result += '\\'; + } + result += unquoted[ii]; + } + + return result; +} + +/***** Actual matching and rewriting code *****/ + +bool PCRE::HitLimit() { + return hit_limit_; +} + +void PCRE::ClearHitLimit() { + hit_limit_ = 0; +} + +int PCRE::TryMatch(const StringPiece& text, + int startpos, + Anchor anchor, + bool empty_ok, + int *vec, + int vecsize) const { + pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; + if (re == NULL) { + PCREPORT(ERROR) << "Matching against invalid re: " << *error_; + return 0; + } + + int match_limit = match_limit_; + if (match_limit <= 0) { + match_limit = FLAGS_regexp_match_limit; + } + + int stack_limit = stack_limit_; + if (stack_limit <= 0) { + stack_limit = FLAGS_regexp_stack_limit; + } + + pcre_extra extra = { 0 }; + if (match_limit > 0) { + extra.flags |= PCRE_EXTRA_MATCH_LIMIT; + extra.match_limit = match_limit; + } + if (stack_limit > 0) { + extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; + extra.match_limit_recursion = stack_limit / kPCREFrameSize; + } + + int options = 0; + if (anchor != UNANCHORED) + options |= PCRE_ANCHORED; + if (!empty_ok) + options |= PCRE_NOTEMPTY; + + int rc = pcre_exec(re, // The regular expression object + &extra, + (text.data() == NULL) ? 
"" : text.data(), + text.size(), + startpos, + options, + vec, + vecsize); + + // Handle errors + if (rc == 0) { + // pcre_exec() returns 0 as a special case when the number of + // capturing subpatterns exceeds the size of the vector. + // When this happens, there is a match and the output vector + // is filled, but we miss out on the positions of the extra subpatterns. + rc = vecsize / 2; + } else if (rc < 0) { + switch (rc) { + case PCRE_ERROR_NOMATCH: + return 0; + case PCRE_ERROR_MATCHLIMIT: + // Writing to hit_limit is not safe if multiple threads + // are using the PCRE, but the flag is only intended + // for use by unit tests anyway, so we let it go. + hit_limit_ = true; + PCREPORT(WARNING) << "Exceeded match limit of " << match_limit + << " when matching '" << pattern_ << "'" + << " against text that is " << text.size() << " bytes."; + return 0; + case PCRE_ERROR_RECURSIONLIMIT: + // See comment about hit_limit above. + hit_limit_ = true; + PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit + << " when matching '" << pattern_ << "'" + << " against text that is " << text.size() << " bytes."; + return 0; + default: + // There are other return codes from pcre.h : + // PCRE_ERROR_NULL (-2) + // PCRE_ERROR_BADOPTION (-3) + // PCRE_ERROR_BADMAGIC (-4) + // PCRE_ERROR_UNKNOWN_NODE (-5) + // PCRE_ERROR_NOMEMORY (-6) + // PCRE_ERROR_NOSUBSTRING (-7) + // ... 
+ PCREPORT(ERROR) << "Unexpected return code: " << rc + << " when matching '" << pattern_ << "'" + << ", re=" << re + << ", text=" << text + << ", vec=" << vec + << ", vecsize=" << vecsize; + return 0; + } + } + + return rc; +} + +bool PCRE::DoMatchImpl(const StringPiece& text, + Anchor anchor, + int* consumed, + const Arg* const* args, + int n, + int* vec, + int vecsize) const { + assert((1 + n) * 3 <= vecsize); // results + PCRE workspace + int matches = TryMatch(text, 0, anchor, true, vec, vecsize); + assert(matches >= 0); // TryMatch never returns negatives + if (matches == 0) + return false; + + *consumed = vec[1]; + + if (n == 0 || args == NULL) { + // We are not interested in results + return true; + } + if (NumberOfCapturingGroups() < n) { + // PCRE has fewer capturing groups than number of arg pointers passed in + return false; + } + + // If we got here, we must have matched the whole pattern. + // We do not need (can not do) any more checks on the value of 'matches' here + // -- see the comment for TryMatch. + for (int i = 0; i < n; i++) { + const int start = vec[2*(i+1)]; + const int limit = vec[2*(i+1)+1]; + if (!args[i]->Parse(text.data() + start, limit-start)) { + // TODO: Should we indicate what the error was? 
+ return false; + } + } + + return true; +} + +bool PCRE::DoMatch(const StringPiece& text, + Anchor anchor, + int* consumed, + const Arg* const args[], + int n) const { + assert(n >= 0); + size_t const vecsize = (1 + n) * 3; // results + PCRE workspace + // (as for kVecSize) + int *vec = new int[vecsize]; + bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); + delete[] vec; + return b; +} + +bool PCRE::Rewrite(string *out, const StringPiece &rewrite, + const StringPiece &text, int *vec, int veclen) const { + int number_of_capturing_groups = NumberOfCapturingGroups(); + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c == '\\') { + c = *++s; + if (isdigit(c)) { + int n = (c - '0'); + if (n >= veclen) { + if (n <= number_of_capturing_groups) { + // unmatched optional capturing group. treat + // its value as empty string; i.e., nothing to append. + } else { + PCREPORT(ERROR) << "requested group " << n + << " in regexp " << rewrite.data(); + return false; + } + } + int start = vec[2 * n]; + if (start >= 0) + out->append(text.data() + start, vec[2 * n + 1] - start); + } else if (c == '\\') { + out->push_back('\\'); + } else { + PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data(); + return false; + } + } else { + out->push_back(c); + } + } + return true; +} + +bool PCRE::CheckRewriteString(const StringPiece& rewrite, string* error) const { + int max_token = -1; + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + int c = *s; + if (c != '\\') { + continue; + } + if (++s == end) { + *error = "Rewrite schema error: '\\' not allowed at end."; + return false; + } + c = *s; + if (c == '\\') { + continue; + } + if (!isdigit(c)) { + *error = "Rewrite schema error: " + "'\\' must be followed by a digit or '\\'."; + return false; + } + int n = (c - '0'); + if (max_token < n) { + max_token = n; + } + } + + if (max_token > NumberOfCapturingGroups()) { + 
SStringPrintf(error, "Rewrite schema requests %d matches, " + "but the regexp only has %d parenthesized subexpressions.", + max_token, NumberOfCapturingGroups()); + return false; + } + return true; +} + + +// Return the number of capturing subpatterns, or -1 if the +// regexp wasn't valid on construction. +int PCRE::NumberOfCapturingGroups() const { + if (re_partial_ == NULL) return -1; + + int result; + CHECK(pcre_fullinfo(re_partial_, // The regular expression object + NULL, // We did not study the pattern + PCRE_INFO_CAPTURECOUNT, + &result) == 0); + return result; +} + + +/***** Parsers for various types *****/ + +bool PCRE::Arg::parse_null(const char* str, int n, void* dest) { + // We fail if somebody asked us to store into a non-NULL void* pointer + return (dest == NULL); +} + +bool PCRE::Arg::parse_string(const char* str, int n, void* dest) { + if (dest == NULL) return true; + reinterpret_cast(dest)->assign(str, n); + return true; +} + +bool PCRE::Arg::parse_stringpiece(const char* str, int n, void* dest) { + if (dest == NULL) return true; + reinterpret_cast(dest)->set(str, n); + return true; +} + +bool PCRE::Arg::parse_char(const char* str, int n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = str[0]; + return true; +} + +bool PCRE::Arg::parse_uchar(const char* str, int n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = str[0]; + return true; +} + +// Largest number spec that we are willing to parse +static const int kMaxNumberLength = 32; + +// PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1 +// PCREQUIPCRES "n > 0" +// Copies "str" into "buf" and null-terminates if necessary. +// Returns one of: +// a. "str" if no termination is needed +// b. "buf" if the string was copied and null-terminated +// c. 
"" if the input was invalid and has no hope of being parsed +static const char* TerminateNumber(char* buf, const char* str, int n) { + if ((n > 0) && isspace(*str)) { + // We are less forgiving than the strtoxxx() routines and do not + // allow leading spaces. + return ""; + } + + // See if the character right after the input text may potentially + // look like a digit. + if (isdigit(str[n]) || + ((str[n] >= 'a') && (str[n] <= 'f')) || + ((str[n] >= 'A') && (str[n] <= 'F'))) { + if (n > kMaxNumberLength) return ""; // Input too big to be a valid number + memcpy(buf, str, n); + buf[n] = '\0'; + return buf; + } else { + // We can parse right out of the supplied string, so return it. + return str; + } +} + +bool PCRE::Arg::parse_long_radix(const char* str, + int n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + char* end; + errno = 0; + long r = strtol(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_ulong_radix(const char* str, + int n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + if (str[0] == '-') { + // strtoul() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. 
+ return false; + } + + char* end; + errno = 0; + unsigned long r = strtoul(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_short_radix(const char* str, + int n, + void* dest, + int radix) { + long r; + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((short)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_ushort_radix(const char* str, + int n, + void* dest, + int radix) { + unsigned long r; + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((ushort)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_int_radix(const char* str, + int n, + void* dest, + int radix) { + long r; + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((int)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_uint_radix(const char* str, + int n, + void* dest, + int radix) { + unsigned long r; + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((uint)r != r) return false; // Out of range + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_longlong_radix(const char* str, + int n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + char* end; + errno = 0; + int64 r = strtoll(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_ulonglong_radix(const 
char* str, + int n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, str, n); + if (str[0] == '-') { + // strtoull() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. + return false; + } + char* end; + errno = 0; + uint64 r = strtoull(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_double(const char* str, int n, void* dest) { + if (n == 0) return false; + static const int kMaxLength = 200; + char buf[kMaxLength]; + if (n >= kMaxLength) return false; + memcpy(buf, str, n); + buf[n] = '\0'; + errno = 0; + char* end; + double r = strtod(buf, &end); + if (end != buf + n) { +#ifdef COMPILER_MSVC + // Microsoft's strtod() doesn't handle inf and nan, so we have to + // handle it explicitly. Speed is not important here because this + // code is only called in unit tests. 
+ bool pos = true; + const char* i = buf; + if ('-' == *i) { + pos = false; + ++i; + } else if ('+' == *i) { + ++i; + } + if (0 == stricmp(i, "inf") || 0 == stricmp(i, "infinity")) { + r = numeric_limits::infinity(); + if (!pos) + r = -r; + } else if (0 == stricmp(i, "nan")) { + r = numeric_limits::quiet_NaN(); + } else { + return false; + } +#else + return false; // Leftover junk +#endif + } + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool PCRE::Arg::parse_float(const char* str, int n, void* dest) { + double r; + if (!parse_double(str, n, &r)) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = static_cast(r); + return true; +} + + +#define DEFINE_INTEGER_PARSERS(name) \ + bool PCRE::Arg::parse_##name(const char* str, int n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 10); \ + } \ + bool PCRE::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 16); \ + } \ + bool PCRE::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 8); \ + } \ + bool PCRE::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 0); \ + } + +DEFINE_INTEGER_PARSERS(short); +DEFINE_INTEGER_PARSERS(ushort); +DEFINE_INTEGER_PARSERS(int); +DEFINE_INTEGER_PARSERS(uint); +DEFINE_INTEGER_PARSERS(long); +DEFINE_INTEGER_PARSERS(ulong); +DEFINE_INTEGER_PARSERS(longlong); +DEFINE_INTEGER_PARSERS(ulonglong); + +#undef DEFINE_INTEGER_PARSERS + +} // namespace re2 diff --git a/outside/re2/util/pcre.h b/outside/re2/util/pcre.h new file mode 100644 index 000000000..4dda95dfa --- /dev/null +++ b/outside/re2/util/pcre.h @@ -0,0 +1,679 @@ +// Copyright 2003-2010 Google Inc. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// This is a variant of PCRE's pcrecpp.h, originally written at Google. +// The main changes are the addition of the HitLimit method and +// compilation as PCRE in namespace re2. + +// C++ interface to the pcre regular-expression library. PCRE supports +// Perl-style regular expressions (with extensions like \d, \w, \s, +// ...). +// +// ----------------------------------------------------------------------- +// REGEXP SYNTAX: +// +// This module uses the pcre library and hence supports its syntax +// for regular expressions: +// +// http://www.google.com/search?q=pcre +// +// The syntax is pretty similar to Perl's. For those not familiar +// with Perl's regular expressions, here are some examples of the most +// commonly used extensions: +// +// "hello (\\w+) world" -- \w matches a "word" character +// "version (\\d+)" -- \d matches a digit +// "hello\\s+world" -- \s matches any whitespace character +// "\\b(\\w+)\\b" -- \b matches empty string at a word boundary +// "(?i)hello" -- (?i) turns on case-insensitive matching +// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible +// +// ----------------------------------------------------------------------- +// MATCHING INTERFACE: +// +// The "FullMatch" operation checks that supplied text matches a +// supplied pattern exactly. +// +// Example: successful match +// CHECK(PCRE::FullMatch("hello", "h.*o")); +// +// Example: unsuccessful match (requires full match): +// CHECK(!PCRE::FullMatch("hello", "e")); +// +// ----------------------------------------------------------------------- +// UTF-8 AND THE MATCHING INTERFACE: +// +// By default, pattern and text are plain text, one byte per character. +// The UTF8 flag, passed to the constructor, causes both pattern +// and string to be treated as UTF-8 text, still a byte stream but +// potentially multiple bytes per character. 
In practice, the text +// is likelier to be UTF-8 than the pattern, but the match returned +// may depend on the UTF8 flag, so always use it when matching +// UTF8 text. E.g., "." will match one byte normally but with UTF8 +// set may match up to three bytes of a multi-byte character. +// +// Example: +// PCRE re(utf8_pattern, PCRE::UTF8); +// CHECK(PCRE::FullMatch(utf8_string, re)); +// +// ----------------------------------------------------------------------- +// MATCHING WITH SUB-STRING EXTRACTION: +// +// You can supply extra pointer arguments to extract matched subpieces. +// +// Example: extracts "ruby" into "s" and 1234 into "i" +// int i; +// string s; +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); +// +// Example: fails because string cannot be stored in integer +// CHECK(!PCRE::FullMatch("ruby", "(.*)", &i)); +// +// Example: fails because there aren't enough sub-patterns: +// CHECK(!PCRE::FullMatch("ruby:1234", "\\w+:\\d+", &s)); +// +// Example: does not try to extract any extra sub-patterns +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s)); +// +// Example: does not try to extract into NULL +// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i)); +// +// Example: integer overflow causes failure +// CHECK(!PCRE::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i)); +// +// ----------------------------------------------------------------------- +// PARTIAL MATCHES +// +// You can use the "PartialMatch" operation when you want the pattern +// to match any substring of the text. 
+// +// Example: simple search for a string: +// CHECK(PCRE::PartialMatch("hello", "ell")); +// +// Example: find first number in a string +// int number; +// CHECK(PCRE::PartialMatch("x*100 + 20", "(\\d+)", &number)); +// CHECK_EQ(number, 100); +// +// ----------------------------------------------------------------------- +// PRE-COMPILED REGULAR EXPRESSIONS +// +// PCRE makes it easy to use any string as a regular expression, without +// requiring a separate compilation step. +// +// If speed is of the essence, you can create a pre-compiled "PCRE" +// object from the pattern and use it multiple times. If you do so, +// you can typically parse text faster than with sscanf. +// +// Example: precompile pattern for faster matching: +// PCRE pattern("h.*o"); +// while (ReadLine(&str)) { +// if (PCRE::FullMatch(str, pattern)) ...; +// } +// +// ----------------------------------------------------------------------- +// SCANNING TEXT INCREMENTALLY +// +// The "Consume" operation may be useful if you want to repeatedly +// match regular expressions at the front of a string and skip over +// them as they match. This requires use of the "StringPiece" type, +// which represents a sub-range of a real string. +// +// Example: read lines of the form "var = value" from a string. +// string contents = ...; // Fill string somehow +// StringPiece input(contents); // Wrap a StringPiece around it +// +// string var; +// int value; +// while (PCRE::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) { +// ...; +// } +// +// Each successful call to "Consume" will set "var/value", and also +// advance "input" so it points past the matched text. Note that if the +// regular expression matches an empty string, input will advance +// by 0 bytes. If the regular expression being used might match +// an empty string, the loop body must check for this case and either +// advance the string or break out of the loop. 
+// +// The "FindAndConsume" operation is similar to "Consume" but does not +// anchor your match at the beginning of the string. For example, you +// could extract all words from a string by repeatedly calling +// PCRE::FindAndConsume(&input, "(\\w+)", &word) +// +// ----------------------------------------------------------------------- +// PARSING HEX/OCTAL/C-RADIX NUMBERS +// +// By default, if you pass a pointer to a numeric value, the +// corresponding text is interpreted as a base-10 number. You can +// instead wrap the pointer with a call to one of the operators Hex(), +// Octal(), or CRadix() to interpret the text in another base. The +// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16) +// prefixes, but defaults to base-10. +// +// Example: +// int a, b, c, d; +// CHECK(PCRE::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)", +// Octal(&a), Hex(&b), CRadix(&c), CRadix(&d)); +// will leave 64 in a, b, c, and d. + +#include "util/util.h" +#include "re2/stringpiece.h" + +#ifdef USEPCRE +#include +namespace re2 { +const bool UsingPCRE = true; +} // namespace re2 +#else +namespace re2 { +const bool UsingPCRE = false; +struct pcre; +struct pcre_extra { int flags, match_limit, match_limit_recursion; }; +#define pcre_free(x) {} +#define PCRE_EXTRA_MATCH_LIMIT 0 +#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0 +#define PCRE_ANCHORED 0 +#define PCRE_NOTEMPTY 0 +#define PCRE_ERROR_NOMATCH 1 +#define PCRE_ERROR_MATCHLIMIT 2 +#define PCRE_ERROR_RECURSIONLIMIT 3 +#define PCRE_INFO_CAPTURECOUNT 0 +#define pcre_compile(a,b,c,d,e) ({ (void)(a); (void)(b); *(c)=""; *(d)=0; (void)(e); ((pcre*)0); }) +#define pcre_exec(a, b, c, d, e, f, g, h) ({ (void)(a); (void)(b); (void)(c); (void)(d); (void)(e); (void)(f); (void)(g); (void)(h); 0; }) +#define pcre_fullinfo(a, b, c, d) ({ (void)(a); (void)(b); (void)(c); *(d) = 0; 0; }) +} // namespace re2 +#endif + +namespace re2 { + +class PCRE_Options; + +// Interface for regular expression matching. 
Also corresponds to a +// pre-compiled regular expression. An "PCRE" object is safe for +// concurrent use by multiple threads. +class PCRE { + public: + // We convert user-passed pointers into special Arg objects + class Arg; + + // Marks end of arg list. + // ONLY USE IN OPTIONAL ARG DEFAULTS. + // DO NOT PASS EXPLICITLY. + static Arg no_more_args; + + // Options are same value as those in pcre. We provide them here + // to avoid users needing to include pcre.h and also to isolate + // users from pcre should we change the underlying library. + // Only those needed by Google programs are exposed here to + // avoid collision with options employed internally by regexp.cc + // Note that some options have equivalents that can be specified in + // the regexp itself. For example, prefixing your regexp with + // "(?s)" has the same effect as the PCRE_DOTALL option. + enum Option { + None = 0x0000, + UTF8 = 0x0800, // == PCRE_UTF8 + EnabledCompileOptions = UTF8, + EnabledExecOptions = 0x0000, // TODO: use to replace anchor flag + }; + + // We provide implicit conversions from strings so that users can + // pass in a string or a "const char*" wherever an "PCRE" is expected. + PCRE(const char* pattern); + PCRE(const char* pattern, Option option); + PCRE(const string& pattern); + PCRE(const string& pattern, Option option); + PCRE(const char *pattern, const PCRE_Options& re_option); + PCRE(const string& pattern, const PCRE_Options& re_option); + + ~PCRE(); + + // The string specification for this PCRE. E.g. + // PCRE re("ab*c?d+"); + // re.pattern(); // "ab*c?d+" + const string& pattern() const { return pattern_; } + + // If PCRE could not be created properly, returns an error string. + // Else returns the empty string. + const string& error() const { return *error_; } + + // Whether the PCRE has hit a match limit during execution. + // Not thread safe. Intended only for testing. 
+ // If hitting match limits is a problem, + // you should be using RE2 (re2/re2.h) + // instead of checking this flag. + bool HitLimit(); + void ClearHitLimit(); + + /***** The useful part: the matching interface *****/ + + // Matches "text" against "pattern". If pointer arguments are + // supplied, copies matched sub-patterns into them. + // + // You can pass in a "const char*" or a "string" for "text". + // You can pass in a "const char*" or a "string" or a "PCRE" for "pattern". + // + // The provided pointer arguments can be pointers to any scalar numeric + // type, or one of: + // string (matched piece is copied to string) + // StringPiece (StringPiece is mutated to point to matched piece) + // T (where "bool T::ParseFrom(const char*, int)" exists) + // (void*)NULL (the corresponding matched sub-pattern is not copied) + // + // Returns true iff all of the following conditions are satisfied: + // a. "text" matches "pattern" exactly + // b. The number of matched sub-patterns is >= number of supplied pointers + // c. The "i"th argument has a suitable type for holding the + // string captured as the "i"th sub-pattern. If you pass in + // NULL for the "i"th argument, or pass fewer arguments than + // number of sub-patterns, "i"th captured sub-pattern is + // ignored. + // + // CAVEAT: An optional sub-pattern that does not exist in the + // matched string is assigned the empty string. 
Therefore, the + // following will return false (because the empty string is not a + // valid number): + // int number; + // PCRE::FullMatch("abc", "[a-z]+(\\d+)?", &number); + struct FullMatchFunctor { + bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const FullMatchFunctor FullMatch; + + // Exactly like FullMatch(), except that "pattern" is allowed to match + // a substring of "text". + struct PartialMatchFunctor { + bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const PartialMatchFunctor PartialMatch; + + // Like FullMatch() and PartialMatch(), except that pattern has to + // match a prefix of "text", and "input" is advanced past the matched + // text. Note: "input" is modified iff this routine returns true. 
+ struct ConsumeFunctor { + bool operator ()(StringPiece* input, const PCRE& pattern, // 3..16 args + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const ConsumeFunctor Consume; + + // Like Consume(..), but does not anchor the match at the beginning of the + // string. That is, "pattern" need not start its match at the beginning of + // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next + // word in "s" and stores it in "word". + struct FindAndConsumeFunctor { + bool operator ()(StringPiece* input, const PCRE& pattern, + const Arg& ptr1 = no_more_args, + const Arg& ptr2 = no_more_args, + const Arg& ptr3 = no_more_args, + const Arg& ptr4 = no_more_args, + const Arg& ptr5 = no_more_args, + const Arg& ptr6 = no_more_args, + const Arg& ptr7 = no_more_args, + const Arg& ptr8 = no_more_args, + const Arg& ptr9 = no_more_args, + const Arg& ptr10 = no_more_args, + const Arg& ptr11 = no_more_args, + const Arg& ptr12 = no_more_args, + const Arg& ptr13 = no_more_args, + const Arg& ptr14 = no_more_args, + const Arg& ptr15 = no_more_args, + const Arg& ptr16 = no_more_args) const; + }; + + static const FindAndConsumeFunctor FindAndConsume; + + // Replace the first match of "pattern" in "str" with "rewrite". + // Within "rewrite", backslash-escaped digits (\1 to \9) can be + // used to insert text matching corresponding parenthesized group + // from the pattern. \0 in "rewrite" refers to the entire matching + // text. 
E.g., + // + // string s = "yabba dabba doo"; + // CHECK(PCRE::Replace(&s, "b+", "d")); + // + // will leave "s" containing "yada dabba doo" + // + // Returns true if the pattern matches and a replacement occurs, + // false otherwise. + static bool Replace(string *str, + const PCRE& pattern, + const StringPiece& rewrite); + + // Like Replace(), except replaces all occurrences of the pattern in + // the string with the rewrite. Replacements are not subject to + // re-matching. E.g., + // + // string s = "yabba dabba doo"; + // CHECK(PCRE::GlobalReplace(&s, "b+", "d")); + // + // will leave "s" containing "yada dada doo" + // + // Returns the number of replacements made. + static int GlobalReplace(string *str, + const PCRE& pattern, + const StringPiece& rewrite); + + // Like Replace, except that if the pattern matches, "rewrite" + // is copied into "out" with substitutions. The non-matching + // portions of "text" are ignored. + // + // Returns true iff a match occurred and the extraction happened + // successfully; if no match occurs, the string is left unaffected. + static bool Extract(const StringPiece &text, + const PCRE& pattern, + const StringPiece &rewrite, + string *out); + + // Check that the given @p rewrite string is suitable for use with + // this PCRE. It checks that: + // * The PCRE has enough parenthesized subexpressions to satisfy all + // of the \N tokens in @p rewrite, and + // * The @p rewrite string doesn't have any syntax errors + // ('\' followed by anything besides [0-9] and '\'). + // Making this test will guarantee that "replace" and "extract" + // operations won't LOG(ERROR) or fail because of a bad rewrite + // string. + // @param rewrite The proposed rewrite string. + // @param error An error message is recorded here, iff we return false. + // Otherwise, it is unchanged. + // @return true, iff @p rewrite is suitable for use with the PCRE. 
+ bool CheckRewriteString(const StringPiece& rewrite, string* error) const; + + // Returns a copy of 'unquoted' with all potentially meaningful + // regexp characters backslash-escaped. The returned string, used + // as a regular expression, will exactly match the original string. + // For example, + // 1.5-2.0? + // becomes: + // 1\.5\-2\.0\? + static string QuoteMeta(const StringPiece& unquoted); + + /***** Generic matching interface (not so nice to use) *****/ + + // Type of match (TODO: Should be restructured as an Option) + enum Anchor { + UNANCHORED, // No anchoring + ANCHOR_START, // Anchor at start only + ANCHOR_BOTH, // Anchor at start and end + }; + + // General matching routine. Stores the length of the match in + // "*consumed" if successful. + bool DoMatch(const StringPiece& text, + Anchor anchor, + int* consumed, + const Arg* const* args, int n) const; + + // Return the number of capturing subpatterns, or -1 if the + // regexp wasn't valid on construction. + int NumberOfCapturingGroups() const; + + private: + void Init(const char* pattern, Option option, int match_limit, + int stack_limit, bool report_errors); + + // Match against "text", filling in "vec" (up to "vecsize" * 2/3) with + // pairs of integers for the beginning and end positions of matched + // text. The first pair corresponds to the entire matched text; + // subsequent pairs correspond, in order, to parentheses-captured + // matches. Returns the number of pairs (one more than the number of + // the last subpattern with a match) if matching was successful + // and zero if the match failed. + // I.e. for PCRE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching + // against "foo", "bar", and "baz" respectively. + // When matching PCRE("(foo)|hello") against "hello", it will return 1. + // But the values for all subpattern are filled in into "vec". 
+ int TryMatch(const StringPiece& text, + int startpos, + Anchor anchor, + bool empty_ok, + int *vec, + int vecsize) const; + + // Append the "rewrite" string, with backslash subsitutions from "text" + // and "vec", to string "out". + bool Rewrite(string *out, + const StringPiece &rewrite, + const StringPiece &text, + int *vec, + int veclen) const; + + // internal implementation for DoMatch + bool DoMatchImpl(const StringPiece& text, + Anchor anchor, + int* consumed, + const Arg* const args[], + int n, + int* vec, + int vecsize) const; + + // Compile the regexp for the specified anchoring mode + pcre* Compile(Anchor anchor); + + string pattern_; + Option options_; + pcre* re_full_; // For full matches + pcre* re_partial_; // For partial matches + const string* error_; // Error indicator (or empty string) + bool report_errors_; // Silences error logging if false + int match_limit_; // Limit on execution resources + int stack_limit_; // Limit on stack resources (bytes) + mutable int32_t hit_limit_; // Hit limit during execution (bool)? + DISALLOW_EVIL_CONSTRUCTORS(PCRE); +}; + +// PCRE_Options allow you to set the PCRE::Options, plus any pcre +// "extra" options. The only extras are match_limit, which limits +// the CPU time of a match, and stack_limit, which limits the +// stack usage. Setting a limit to <= 0 lets PCRE pick a sensible default +// that should not cause too many problems in production code. +// If PCRE hits a limit during a match, it may return a false negative, +// but (hopefully) it won't crash. +// +// NOTE: If you are handling regular expressions specified by +// (external or internal) users, rather than hard-coded ones, +// you should be using PCRE2, which uses an alternate implementation +// that avoids these issues. See http://go/re2quick. 
+class PCRE_Options { + public: + // constructor + PCRE_Options() : option_(PCRE::None), match_limit_(0), stack_limit_(0), report_errors_(true) {} + // accessors + PCRE::Option option() const { return option_; } + void set_option(PCRE::Option option) { + option_ = option; + } + int match_limit() const { return match_limit_; } + void set_match_limit(int match_limit) { + match_limit_ = match_limit; + } + int stack_limit() const { return stack_limit_; } + void set_stack_limit(int stack_limit) { + stack_limit_ = stack_limit; + } + + // If the regular expression is malformed, an error message will be printed + // iff report_errors() is true. Default: true. + bool report_errors() const { return report_errors_; } + void set_report_errors(bool report_errors) { + report_errors_ = report_errors; + } + private: + PCRE::Option option_; + int match_limit_; + int stack_limit_; + bool report_errors_; +}; + + +/***** Implementation details *****/ + +// Hex/Octal/Binary? + +// Special class for parsing into objects that define a ParseFrom() method +template +class _PCRE_MatchObject { + public: + static inline bool Parse(const char* str, int n, void* dest) { + if (dest == NULL) return true; + T* object = reinterpret_cast(dest); + return object->ParseFrom(str, n); + } +}; + +class PCRE::Arg { + public: + // Empty constructor so we can declare arrays of PCRE::Arg + Arg(); + + // Constructor specially designed for NULL arguments + Arg(void*); + + typedef bool (*Parser)(const char* str, int n, void* dest); + +// Type-specific parsers +#define MAKE_PARSER(type,name) \ + Arg(type* p) : arg_(p), parser_(name) { } \ + Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \ + + + MAKE_PARSER(char, parse_char); + MAKE_PARSER(unsigned char, parse_uchar); + MAKE_PARSER(short, parse_short); + MAKE_PARSER(unsigned short, parse_ushort); + MAKE_PARSER(int, parse_int); + MAKE_PARSER(unsigned int, parse_uint); + MAKE_PARSER(long, parse_long); + MAKE_PARSER(unsigned long, parse_ulong); + 
MAKE_PARSER(long long, parse_longlong); + MAKE_PARSER(unsigned long long, parse_ulonglong); + MAKE_PARSER(float, parse_float); + MAKE_PARSER(double, parse_double); + MAKE_PARSER(string, parse_string); + MAKE_PARSER(StringPiece, parse_stringpiece); + +#undef MAKE_PARSER + + // Generic constructor + template Arg(T*, Parser parser); + // Generic constructor template + template Arg(T* p) + : arg_(p), parser_(_PCRE_MatchObject::Parse) { + } + + // Parse the data + bool Parse(const char* str, int n) const; + + private: + void* arg_; + Parser parser_; + + static bool parse_null (const char* str, int n, void* dest); + static bool parse_char (const char* str, int n, void* dest); + static bool parse_uchar (const char* str, int n, void* dest); + static bool parse_float (const char* str, int n, void* dest); + static bool parse_double (const char* str, int n, void* dest); + static bool parse_string (const char* str, int n, void* dest); + static bool parse_stringpiece (const char* str, int n, void* dest); + +#define DECLARE_INTEGER_PARSER(name) \ + private: \ + static bool parse_ ## name(const char* str, int n, void* dest); \ + static bool parse_ ## name ## _radix( \ + const char* str, int n, void* dest, int radix); \ + public: \ + static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \ + static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \ + static bool parse_ ## name ## _cradix(const char* str, int n, void* dest) + + DECLARE_INTEGER_PARSER(short); + DECLARE_INTEGER_PARSER(ushort); + DECLARE_INTEGER_PARSER(int); + DECLARE_INTEGER_PARSER(uint); + DECLARE_INTEGER_PARSER(long); + DECLARE_INTEGER_PARSER(ulong); + DECLARE_INTEGER_PARSER(longlong); + DECLARE_INTEGER_PARSER(ulonglong); + +#undef DECLARE_INTEGER_PARSER +}; + +inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { } +inline PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } + +inline bool PCRE::Arg::Parse(const char* str, int n) const { + return (*parser_)(str, n, 
arg_); +} + +// This part of the parser, appropriate only for ints, deals with bases +#define MAKE_INTEGER_PARSER(type, name) \ + inline PCRE::Arg Hex(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _hex); } \ + inline PCRE::Arg Octal(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _octal); } \ + inline PCRE::Arg CRadix(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _cradix); } + +MAKE_INTEGER_PARSER(short, short); +MAKE_INTEGER_PARSER(unsigned short, ushort); +MAKE_INTEGER_PARSER(int, int); +MAKE_INTEGER_PARSER(unsigned int, uint); +MAKE_INTEGER_PARSER(long, long); +MAKE_INTEGER_PARSER(unsigned long, ulong); +MAKE_INTEGER_PARSER(long long, longlong); +MAKE_INTEGER_PARSER(unsigned long long, ulonglong); + +#undef MAKE_INTEGER_PARSER + +} // namespace re2 diff --git a/outside/re2/util/random.cc b/outside/re2/util/random.cc new file mode 100644 index 000000000..49d619587 --- /dev/null +++ b/outside/re2/util/random.cc @@ -0,0 +1,34 @@ +// Copyright 2005-2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Modified from Google perftools's tcmalloc_unittest.cc. 
+ +#include "util/random.h" + +namespace re2 { + +int32 ACMRandom::Next() { + const int32 M = 2147483647L; // 2^31-1 + const int32 A = 16807; // 7^5 + // In effect, we are computing seed_ = (seed_ * A) % M, where M = 2^31-1 + uint32 lo = A * (int32)(seed_ & 0xFFFF); // A times the low 16 bits of seed_ + uint32 hi = A * (int32)((uint32)seed_ >> 16); // A times the high 16 bits of seed_ + lo += (hi & 0x7FFF) << 16; // low 15 bits of hi land in bits 16..30 of lo + if (lo > M) { + lo &= M; // 2^31 == 1 (mod M): clear bit 31 ... + ++lo; // ... and add it back as 1 + } + lo += hi >> 15; // remaining bits of hi have weight 2^31, i.e. 1 (mod M) + if (lo > M) { + lo &= M; // same reduction as above + ++lo; + } + return (seed_ = (int32) lo); // new state is also the returned value +} + +int32 ACMRandom::Uniform(int32 n) { + return Next() % n; // n == 0 is a modulo by zero; callers must pass n != 0 +} + +} // namespace re2 diff --git a/outside/re2/util/random.h b/outside/re2/util/random.h new file mode 100644 index 000000000..6c6e701dd --- /dev/null +++ b/outside/re2/util/random.h @@ -0,0 +1,29 @@ +// Copyright 2005-2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Modified from Google perftools's tcmalloc_unittest.cc. + +#ifndef RE2_UTIL_RANDOM_H__ +#define RE2_UTIL_RANDOM_H__ + +#include "util/util.h" + +namespace re2 { + +// ACM minimal standard random number generator. (re-entrant.) +class ACMRandom { + public: + ACMRandom(int32 seed) : seed_(seed) {} + int32 Next(); // advances seed_ and returns the new value + int32 Uniform(int32); // Next() % n; the argument must be nonzero + + void Reset(int32 seed) { seed_ = seed; } + + private: + int32 seed_; // generator state; the only data member +}; + +} // namespace re2 + +#endif // RE2_UTIL_RANDOM_H__ diff --git a/outside/re2/util/rune.cc b/outside/re2/util/rune.cc new file mode 100644 index 000000000..26442b0ad --- /dev/null +++ b/outside/re2/util/rune.cc @@ -0,0 +1,258 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. 
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + */ +#include +#include +#include "util/utf.h" + +namespace re2 { + +enum +{ + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + Bit5 = 2, + + T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ + Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ + T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ + T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ + T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ + + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, + /* 0001 1111 1111 1111 1111 1111 */ + + Maskx = (1< T1 + */ + c = *(unsigned char*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + c1 = *(unsigned char*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(unsigned char*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(unsigned char*)(str+3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + *rune = l; + return 4; + } + + /* + * Support for 5-byte or longer UTF-8 would go here, but + * since we don't have that, 
we'll just fall through to bad. + */ + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +} + +int +runetochar(char *str, const Rune *rune) +{ + /* Runes are signed, so convert to unsigned for range check. */ + unsigned long c; + + /* + * one character sequence + * 00000-0007F => 00-7F + */ + c = *rune; + if(c <= Rune1) { + str[0] = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + if(c <= Rune2) { + str[0] = T2 | (c >> 1*Bitx); + str[1] = Tx | (c & Maskx); + return 2; + } + + /* + * If the Rune is out of range, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. + */ + if (c > Runemax) + c = Runeerror; + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + if (c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; +} + +int +runelen(Rune rune) +{ + char str[10]; + + return runetochar(str, &rune); +} + +int +fullrune(const char *str, int n) +{ + if (n > 0) { + int c = *(unsigned char*)str; + if (c < Tx) + return 1; + if (n > 1) { + if (c < T3) + return 1; + if (n > 2) { + if (c < T4 || n > 3) + return 1; + } + } + } + return 0; +} + + +int +utflen(const char *s) +{ + int c; + long n; + Rune rune; + + n = 0; + for(;;) { + c = *(unsigned char*)s; + if(c < Runeself) { + if(c == 0) + return n; + s++; + } else + s += chartorune(&rune, s); + n++; + } + return 0; +} + +char* +utfrune(const char *s, Rune c) +{ + long c1; + Rune r; + int n; + + if(c < Runesync) /* not part of utf sequence */ + return strchr((char*)s, c); + + for(;;) { + c1 = 
*(unsigned char*)s; + if(c1 < Runeself) { /* one byte rune */ + if(c1 == 0) + return 0; + if(c1 == c) + return (char*)s; + s++; + continue; + } + n = chartorune(&r, s); + if(r == c) + return (char*)s; + s += n; + } + return 0; +} + +} // namespace re2 diff --git a/outside/re2/util/sparse_array.h b/outside/re2/util/sparse_array.h new file mode 100644 index 000000000..3e33f8999 --- /dev/null +++ b/outside/re2/util/sparse_array.h @@ -0,0 +1,453 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// DESCRIPTION +// +// SparseArray(m) is a map from integers in [0, m) to T values. +// It requires (sizeof(T)+sizeof(int))*m memory, but it provides +// fast iteration through the elements in the array and fast clearing +// of the array. The array has a concept of certain elements being +// uninitialized (having no value). +// +// Insertion and deletion are constant time operations. +// +// Allocating the array is a constant time operation +// when memory allocation is a constant time operation. +// +// Clearing the array is a constant time operation (unusual!). +// +// Iterating through the array is an O(n) operation, where n +// is the number of items in the array (not O(m)). +// +// The array iterator visits entries in the order they were first +// inserted into the array. It is safe to add items to the array while +// using an iterator: the iterator will visit indices added to the array +// during the iteration, but will not re-visit indices whose values +// change after visiting. Thus SparseArray can be a convenient +// implementation of a work queue. +// +// The SparseArray implementation is NOT thread-safe. It is up to the +// caller to make sure only one thread is accessing the array. (Typically +// these arrays are temporary values and used in situations where speed is +// important.) 
+// +// The SparseArray interface does not present all the usual STL bells and +// whistles. +// +// Implemented with reference to Briggs & Torczon, An Efficient +// Representation for Sparse Sets, ACM Letters on Programming Languages +// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69. +// +// Briggs & Torczon popularized this technique, but it had been known +// long before their paper. They point out that Aho, Hopcroft, and +// Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's +// 1986 Programming Pearls both hint at the technique in exercises to the +// reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1 +// exercise 8). +// +// Briggs & Torczon describe a sparse set implementation. I have +// trivially generalized it to create a sparse array (actually the original +// target of the AHU and Bentley exercises). + +// IMPLEMENTATION +// +// SparseArray uses a vector dense_ and an array sparse_to_dense_, both of +// size max_size_. At any point, the number of elements in the sparse array is +// size_. +// +// The vector dense_ contains the size_ elements in the sparse array (with +// their indices), +// in the order that the elements were first inserted. This array is dense: +// the size_ pairs are dense_[0] through dense_[size_-1]. +// +// The array sparse_to_dense_ maps from indices in [0,m) to indices in +// [0,size_). +// For indices present in the array, dense_[sparse_to_dense_[i]].index_ == i. +// For indices not present in the array, sparse_to_dense_ can contain +// any value at all, perhaps outside the range [0, size_) but perhaps not. +// +// The lax requirement on sparse_to_dense_ values makes clearing +// the array very easy: set size_ to 0. Lookups are slightly more +// complicated. An index i has a value in the array if and only if: +// sparse_to_dense_[i] is in [0, size_) AND +// dense_[sparse_to_dense_[i]].index_ == i. 
+// If both these properties hold, only then it is safe to refer to +// dense_[sparse_to_dense_[i]].value_ +// as the value associated with index i. +// +// To insert a new entry, set sparse_to_dense_[i] to size_, +// initialize dense_[size_], and then increment size_. +// +// Deletion of specific values from the array is implemented by +// swapping dense_[size_-1] and the dense_ being deleted and then +// updating the appropriate sparse_to_dense_ entries. +// +// To make the sparse array as efficient as possible for non-primitive types, +// elements may or may not be destroyed when they are deleted from the sparse +// array through a call to erase(), erase_existing() or resize(). They +// immediately become inaccessible, but they are only guaranteed to be +// destroyed when the SparseArray destructor is called. + +#ifndef RE2_UTIL_SPARSE_ARRAY_H__ +#define RE2_UTIL_SPARSE_ARRAY_H__ + +#include "util/util.h" + +namespace re2 { + +template +class SparseArray { + public: + SparseArray(); + SparseArray(int max_size); + ~SparseArray(); + + // IndexValue pairs: exposed in SparseArray::iterator. + class IndexValue; + + typedef IndexValue value_type; + typedef typename vector::iterator iterator; + typedef typename vector::const_iterator const_iterator; + + inline const IndexValue& iv(int i) const; + + // Return the number of entries in the array. + int size() const { + return size_; + } + + // Iterate over the array. + iterator begin() { + return dense_.begin(); + } + iterator end() { + return dense_.begin() + size_; + } + + const_iterator begin() const { + return dense_.begin(); + } + const_iterator end() const { + return dense_.begin() + size_; + } + + // Change the maximum size of the array. + // Invalidates all iterators. + void resize(int max_size); + + // Return the maximum size of the array. + // Indices can be in the range [0, max_size). + int max_size() const { + return max_size_; + } + + // Clear the array. 
+ void clear() { + size_ = 0; + } + + // Check whether index i is in the array. + inline bool has_index(int i) const; + + // Comparison function for sorting. + // Can sort the sparse array so that future iterations + // will visit indices in increasing order using + // sort(arr.begin(), arr.end(), arr.less); + static bool less(const IndexValue& a, const IndexValue& b); + + public: + // Set the value at index i to v. + inline iterator set(int i, Value v); + + pair insert(const value_type& new_value); + + // Returns the value at index i + // or defaultv if index i is not initialized in the array. + inline Value get(int i, Value defaultv) const; + + iterator find(int i); + + const_iterator find(int i) const; + + // Change the value at index i to v. + // Fast but unsafe: only use if has_index(i) is true. + inline iterator set_existing(int i, Value v); + + // Set the value at the new index i to v. + // Fast but unsafe: only use if has_index(i) is false. + inline iterator set_new(int i, Value v); + + // Get the value at index i from the array.. + // Fast but unsafe: only use if has_index(i) is true. + inline Value get_existing(int i) const; + + // Erasing items from the array during iteration is in general + // NOT safe. There is one special case, which is that the current + // index-value pair can be erased as long as the iterator is then + // checked for being at the end before being incremented. + // For example: + // + // for (i = m.begin(); i != m.end(); ++i) { + // if (ShouldErase(i->index(), i->value())) { + // m.erase(i->index()); + // --i; + // } + // } + // + // Except in the specific case just described, elements must + // not be erased from the array (including clearing the array) + // while iterators are walking over the array. Otherwise, + // the iterators could walk past the end of the array. + + // Erases the element at index i from the array. + inline void erase(int i); + + // Erases the element at index i from the array. 
+ // Fast but unsafe: only use if has_index(i) is true. + inline void erase_existing(int i); + + private: + // Add the index i to the array. + // Only use if has_index(i) is known to be false. + // Since it doesn't set the value associated with i, + // this function is private, only intended as a helper + // for other methods. + inline void create_index(int i); + + // In debug mode, verify that some invariant properties of the class + // are being maintained. This is called at the end of the constructor + // and at the beginning and end of all public non-const member functions. + inline void DebugCheckInvariants() const; + + int size_; + int max_size_; + int* sparse_to_dense_; + vector dense_; + bool valgrind_; + + DISALLOW_EVIL_CONSTRUCTORS(SparseArray); +}; + +template +SparseArray::SparseArray() + : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(), valgrind_(RunningOnValgrind()) {} + +// IndexValue pairs: exposed in SparseArray::iterator. +template +class SparseArray::IndexValue { + friend class SparseArray; + public: + typedef int first_type; + typedef Value second_type; + + IndexValue() {} + IndexValue(int index, const Value& value) : second(value), index_(index) {} + + int index() const { return index_; } + Value value() const { return second; } + + // Provide the data in the 'second' member so that the utilities + // in map-util work. + Value second; + + private: + int index_; +}; + +template +const typename SparseArray::IndexValue& +SparseArray::iv(int i) const { + DCHECK_GE(i, 0); + DCHECK_LT(i, size_); + return dense_[i]; +} + +// Change the maximum size of the array. +// Invalidates all iterators. +template +void SparseArray::resize(int new_max_size) { + DebugCheckInvariants(); + if (new_max_size > max_size_) { + int* a = new int[new_max_size]; + if (sparse_to_dense_) { + memmove(a, sparse_to_dense_, max_size_*sizeof a[0]); + // Don't need to zero the memory but appease Valgrind. 
+ if (valgrind_) { + for (int i = max_size_; i < new_max_size; i++) + a[i] = 0xababababU; + } + delete[] sparse_to_dense_; + } + sparse_to_dense_ = a; + + dense_.resize(new_max_size); + } + max_size_ = new_max_size; + if (size_ > max_size_) + size_ = max_size_; + DebugCheckInvariants(); +} + +// Check whether index i is in the array. +template +bool SparseArray::has_index(int i) const { + DCHECK_GE(i, 0); + DCHECK_LT(i, max_size_); + if (static_cast(i) >= max_size_) { + return false; + } + // Unsigned comparison avoids checking sparse_to_dense_[i] < 0. + return (uint)sparse_to_dense_[i] < (uint)size_ && + dense_[sparse_to_dense_[i]].index_ == i; +} + +// Set the value at index i to v. +template +typename SparseArray::iterator SparseArray::set(int i, Value v) { + DebugCheckInvariants(); + if (static_cast(i) >= max_size_) { + // Semantically, end() would be better here, but we already know + // the user did something stupid, so begin() insulates them from + // dereferencing an invalid pointer. 
+ return begin(); + } + if (!has_index(i)) + create_index(i); + return set_existing(i, v); +} + +template +pair::iterator, bool> SparseArray::insert( + const value_type& new_value) { + DebugCheckInvariants(); + pair::iterator, bool> p; + if (has_index(new_value.index_)) { + p = make_pair(dense_.begin() + sparse_to_dense_[new_value.index_], false); + } else { + p = make_pair(set_new(new_value.index_, new_value.second), true); + } + DebugCheckInvariants(); + return p; +} + +template +Value SparseArray::get(int i, Value defaultv) const { + if (!has_index(i)) + return defaultv; + return get_existing(i); +} + +template +typename SparseArray::iterator SparseArray::find(int i) { + if (has_index(i)) + return dense_.begin() + sparse_to_dense_[i]; + return end(); +} + +template +typename SparseArray::const_iterator +SparseArray::find(int i) const { + if (has_index(i)) { + return dense_.begin() + sparse_to_dense_[i]; + } + return end(); +} + +template +typename SparseArray::iterator +SparseArray::set_existing(int i, Value v) { + DebugCheckInvariants(); + DCHECK(has_index(i)); + dense_[sparse_to_dense_[i]].second = v; + DebugCheckInvariants(); + return dense_.begin() + sparse_to_dense_[i]; +} + +template +typename SparseArray::iterator +SparseArray::set_new(int i, Value v) { + DebugCheckInvariants(); + if (static_cast(i) >= max_size_) { + // Semantically, end() would be better here, but we already know + // the user did something stupid, so begin() insulates them from + // dereferencing an invalid pointer. 
+ return begin(); + } + DCHECK(!has_index(i)); + create_index(i); + return set_existing(i, v); +} + +template +Value SparseArray::get_existing(int i) const { + DCHECK(has_index(i)); + return dense_[sparse_to_dense_[i]].second; +} + +template +void SparseArray::erase(int i) { + DebugCheckInvariants(); + if (has_index(i)) + erase_existing(i); + DebugCheckInvariants(); +} + +template +void SparseArray::erase_existing(int i) { + DebugCheckInvariants(); + DCHECK(has_index(i)); + int di = sparse_to_dense_[i]; + if (di < size_ - 1) { + dense_[di] = dense_[size_ - 1]; + sparse_to_dense_[dense_[di].index_] = di; + } + size_--; + DebugCheckInvariants(); +} + +template +void SparseArray::create_index(int i) { + DCHECK(!has_index(i)); + DCHECK_LT(size_, max_size_); + sparse_to_dense_[i] = size_; + dense_[size_].index_ = i; + size_++; +} + +template SparseArray::SparseArray(int max_size) { + max_size_ = max_size; + sparse_to_dense_ = new int[max_size]; + valgrind_ = RunningOnValgrind(); + dense_.resize(max_size); + // Don't need to zero the new memory, but appease Valgrind. + if (valgrind_) { + for (int i = 0; i < max_size; i++) { + sparse_to_dense_[i] = 0xababababU; + dense_[i].index_ = 0xababababU; + } + } + size_ = 0; + DebugCheckInvariants(); +} + +template SparseArray::~SparseArray() { + DebugCheckInvariants(); + delete[] sparse_to_dense_; +} + +template void SparseArray::DebugCheckInvariants() const { + DCHECK_LE(0, size_); + DCHECK_LE(size_, max_size_); + DCHECK(size_ == 0 || sparse_to_dense_ != NULL); +} + +// Comparison function for sorting. +template bool SparseArray::less(const IndexValue& a, + const IndexValue& b) { + return a.index_ < b.index_; +} + +} // namespace re2 + +#endif // RE2_UTIL_SPARSE_ARRAY_H__ diff --git a/outside/re2/util/sparse_array_test.cc b/outside/re2/util/sparse_array_test.cc new file mode 100644 index 000000000..bc7a19f81 --- /dev/null +++ b/outside/re2/util/sparse_array_test.cc @@ -0,0 +1,150 @@ +// Copyright 2006 The RE2 Authors. 
All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Simple tests that SparseArray behaves. + +#include "util/util.h" +#include "utest/utest.h" + +namespace re2 { + +static const string kNotFound = "NOT FOUND"; + +TEST(SparseArray, BasicOperations) { + static const int n = 50; + SparseArray set(n); + + int order[n]; + int value[n]; + for (int i = 0; i < n; i++) + order[i] = i; + for (int i = 0; i < n; i++) + value[i] = rand()%1000 + 1; + for (int i = 1; i < n; i++) { + int j = rand()%i; + int t = order[i]; + order[i] = order[j]; + order[j] = t; + } + + for (int i = 0;; i++) { + for (int j = 0; j < i; j++) { + ASSERT_TRUE(set.has_index(order[j])); + ASSERT_EQ(value[order[j]], set.get(order[j], -1)); + } + if (i >= n) + break; + for (int j = i; j < n; j++) + ASSERT_FALSE(set.has_index(order[j])); + set.set(order[i], value[order[i]]); + } + + int nn = 0; + for (SparseArray::iterator i = set.begin(); i != set.end(); ++i) { + ASSERT_EQ(order[nn++], i->index()); + ASSERT_EQ(value[i->index()], i->value()); + } + ASSERT_EQ(nn, n); + + set.clear(); + for (int i = 0; i < n; i++) + ASSERT_FALSE(set.has_index(i)); + + ASSERT_EQ(0, set.size()); + ASSERT_EQ(0, distance(set.begin(), set.end())); +} + +class SparseArrayStringTest : public testing::Test { + protected: + SparseArrayStringTest() + : str_map_(10) { + InsertOrUpdate(&str_map_, 1, "a"); + InsertOrUpdate(&str_map_, 5, "b"); + InsertOrUpdate(&str_map_, 2, "c"); + InsertOrUpdate(&str_map_, 7, "d"); + } + + SparseArray str_map_; + typedef SparseArray::iterator iterator; +}; + +TEST_F(SparseArrayStringTest, FindGetsPresentElement) { + iterator it = str_map_.find(2); + ASSERT_TRUE(str_map_.end() != it); + EXPECT_EQ("c", it->second); +} + +TEST_F(SparseArrayStringTest, FindDoesNotFindAbsentElement) { + iterator it = str_map_.find(3); + ASSERT_TRUE(str_map_.end() == it); +} + +TEST_F(SparseArrayStringTest, ContainsKey) { + 
EXPECT_TRUE(ContainsKey(str_map_, 1)); + EXPECT_TRUE(ContainsKey(str_map_, 2)); + EXPECT_FALSE(ContainsKey(str_map_, 3)); +} + +TEST_F(SparseArrayStringTest, InsertIfNotPresent) { + EXPECT_FALSE(ContainsKey(str_map_, 3)); + EXPECT_TRUE(InsertIfNotPresent(&str_map_, 3, "r")); + EXPECT_EQ("r", FindWithDefault(str_map_, 3, kNotFound)); + EXPECT_FALSE(InsertIfNotPresent(&str_map_, 3, "other value")); + EXPECT_EQ("r", FindWithDefault(str_map_, 3, kNotFound)); +} + +TEST(SparseArrayTest, Erase) { + SparseArray str_map(5); + str_map.set(1, "a"); + str_map.set(2, "b"); + EXPECT_EQ("a", FindWithDefault(str_map, 1, kNotFound)); + EXPECT_EQ("b", FindWithDefault(str_map, 2, kNotFound)); + str_map.erase(1); + EXPECT_EQ("NOT FOUND", FindWithDefault(str_map, 1, kNotFound)); + EXPECT_EQ("b", FindWithDefault(str_map, 2, kNotFound)); +} + +typedef SparseArrayStringTest SparseArrayStringSurvivesInvalidIndexTest; +// TODO(jyasskin): Cover invalid arguments to every method. + +TEST_F(SparseArrayStringSurvivesInvalidIndexTest, SetNegative) { + EXPECT_DEBUG_DEATH(str_map_.set(-123456789, "hi"), + "\\(jyasskin\\) Illegal index -123456789 passed to" + " SparseArray\\(10\\).set\\(\\)."); + EXPECT_EQ(4, str_map_.size()); +} + +TEST_F(SparseArrayStringSurvivesInvalidIndexTest, SetTooBig) { + EXPECT_DEBUG_DEATH(str_map_.set(12345678, "hi"), + "\\(jyasskin\\) Illegal index 12345678 passed to" + " SparseArray\\(10\\).set\\(\\)."); + EXPECT_EQ(4, str_map_.size()); +} + +TEST_F(SparseArrayStringSurvivesInvalidIndexTest, SetNew_Negative) { + EXPECT_DEBUG_DEATH(str_map_.set_new(-123456789, "hi"), + "\\(jyasskin\\) Illegal index -123456789 passed to" + " SparseArray\\(10\\).set_new\\(\\)."); + EXPECT_EQ(4, str_map_.size()); +} + +TEST_F(SparseArrayStringSurvivesInvalidIndexTest, SetNew_Existing) { + EXPECT_DEBUG_DEATH({ + str_map_.set_new(2, "hi"); + EXPECT_EQ("hi", FindWithDefault(str_map_, 2, kNotFound)); + + // The old value for 2 is still present, but can never be removed. 
+ // This risks crashing later, if the map fills up. + EXPECT_EQ(5, str_map_.size()); + }, "Check failed: !has_index\\(i\\)"); +} + +TEST_F(SparseArrayStringSurvivesInvalidIndexTest, SetNew_TooBig) { + EXPECT_DEBUG_DEATH(str_map_.set_new(12345678, "hi"), + "\\(jyasskin\\) Illegal index 12345678 passed to" + " SparseArray\\(10\\).set_new\\(\\)."); + EXPECT_EQ(4, str_map_.size()); +} + +} // namespace re2 diff --git a/outside/re2/util/sparse_set.h b/outside/re2/util/sparse_set.h new file mode 100644 index 000000000..165dd090e --- /dev/null +++ b/outside/re2/util/sparse_set.h @@ -0,0 +1,179 @@ +// Copyright 2006 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// DESCRIPTION +// +// SparseSet(m) is a set of integers in [0, m). +// It requires sizeof(int)*m memory, but it provides +// fast iteration through the elements in the set and fast clearing +// of the set. +// +// Insertion and deletion are constant time operations. +// +// Allocating the set is a constant time operation +// when memory allocation is a constant time operation. +// +// Clearing the set is a constant time operation (unusual!). +// +// Iterating through the set is an O(n) operation, where n +// is the number of items in the set (not O(m)). +// +// The set iterator visits entries in the order they were first +// inserted into the array. It is safe to add items to the set while +// using an iterator: the iterator will visit indices added to the set +// during the iteration, but will not re-visit indices whose values +// change after visiting. Thus SparseSet can be a convenient +// implementation of a work queue. +// +// The SparseSet implementation is NOT thread-safe. It is up to the +// caller to make sure only one thread is accessing the set. (Typically +// these sets are temporary values and used in situations where speed is +// important.) 
+// +// The SparseSet interface does not present all the usual STL bells and +// whistles. +// +// Implemented with reference to Briggs & Torczon, An Efficient +// Representation for Sparse Sets, ACM Letters on Programming Languages +// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69. +// +// For a generalization to sparse array, see sparse_array.h. + +// IMPLEMENTATION +// +// See sparse_array.h for implementation details + +#ifndef RE2_UTIL_SPARSE_SET_H__ +#define RE2_UTIL_SPARSE_SET_H__ + +#include "util/util.h" + +namespace re2 { + +class SparseSet { + public: + SparseSet() + : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL), valgrind_(RunningOnValgrind()) {} + + SparseSet(int max_size) { + max_size_ = max_size; + sparse_to_dense_ = new int[max_size]; + dense_ = new int[max_size]; + valgrind_ = RunningOnValgrind(); + // Don't need to zero the memory, but do so anyway + // to appease Valgrind. + if (valgrind_) { + for (int i = 0; i < max_size; i++) { + dense_[i] = 0xababababU; + sparse_to_dense_[i] = 0xababababU; + } + } + size_ = 0; + } + + ~SparseSet() { + delete[] sparse_to_dense_; + delete[] dense_; + } + + typedef int* iterator; + typedef const int* const_iterator; + + int size() const { return size_; } + iterator begin() { return dense_; } + iterator end() { return dense_ + size_; } + const_iterator begin() const { return dense_; } + const_iterator end() const { return dense_ + size_; } + + // Change the maximum size of the array. + // Invalidates all iterators. 
+ void resize(int new_max_size) { + if (size_ > new_max_size) + size_ = new_max_size; + if (new_max_size > max_size_) { + int* a = new int[new_max_size]; + if (sparse_to_dense_) { + memmove(a, sparse_to_dense_, max_size_*sizeof a[0]); + if (valgrind_) { + for (int i = max_size_; i < new_max_size; i++) + a[i] = 0xababababU; + } + delete[] sparse_to_dense_; + } + sparse_to_dense_ = a; + + a = new int[new_max_size]; + if (dense_) { + memmove(a, dense_, size_*sizeof a[0]); + if (valgrind_) { + for (int i = size_; i < new_max_size; i++) + a[i] = 0xababababU; + } + delete[] dense_; + } + dense_ = a; + } + max_size_ = new_max_size; + } + + // Return the maximum size of the array. + // Indices can be in the range [0, max_size). + int max_size() const { return max_size_; } + + // Clear the array. + void clear() { size_ = 0; } + + // Check whether i is in the array. + bool contains(int i) const { + DCHECK_GE(i, 0); + DCHECK_LT(i, max_size_); + if (static_cast(i) >= max_size_) { + return false; + } + // Unsigned comparison avoids checking sparse_to_dense_[i] < 0. + return (uint)sparse_to_dense_[i] < (uint)size_ && + dense_[sparse_to_dense_[i]] == i; + } + + // Adds i to the set. + void insert(int i) { + if (!contains(i)) + insert_new(i); + } + + // Set the value at the new index i to v. + // Fast but unsafe: only use if contains(i) is false. + void insert_new(int i) { + if (static_cast(i) >= max_size_) { + // Semantically, end() would be better here, but we already know + // the user did something stupid, so begin() insulates them from + // dereferencing an invalid pointer. + return; + } + DCHECK(!contains(i)); + DCHECK_LT(size_, max_size_); + sparse_to_dense_[i] = size_; + dense_[size_] = i; + size_++; + } + + // Comparison function for sorting. 
+ // Can sort the sparse array so that future iterations + // will visit indices in increasing order using + // sort(arr.begin(), arr.end(), arr.less); + static bool less(int a, int b) { return a < b; } + + private: + int size_; + int max_size_; + int* sparse_to_dense_; + int* dense_; + bool valgrind_; + + DISALLOW_EVIL_CONSTRUCTORS(SparseSet); +}; + +} // namespace re2 + +#endif // RE2_UTIL_SPARSE_SET_H__ diff --git a/outside/re2/util/stringpiece.cc b/outside/re2/util/stringpiece.cc new file mode 100644 index 000000000..37895b01e --- /dev/null +++ b/outside/re2/util/stringpiece.cc @@ -0,0 +1,87 @@ +// Copyright 2004 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "re2/stringpiece.h" +#include "util/util.h" + +using re2::StringPiece; + +std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { + o.write(piece.data(), piece.size()); + return o; +} + +bool StringPiece::_equal(const StringPiece& x, const StringPiece& y) { + int len = x.size(); + if (len != y.size()) { + return false; + } + const char* p = x.data(); + const char* p2 = y.data(); + // Test last byte in case strings share large common prefix + if ((len > 0) && (p[len-1] != p2[len-1])) return false; + const char* p_limit = p + len; + for (; p < p_limit; p++, p2++) { + if (*p != *p2) + return false; + } + return true; +} + +void StringPiece::CopyToString(string* target) const { + target->assign(ptr_, length_); +} + +int StringPiece::copy(char* buf, size_type n, size_type pos) const { + int ret = min(length_ - pos, n); + memcpy(buf, ptr_ + pos, ret); + return ret; +} + +int StringPiece::find(const StringPiece& s, size_type pos) const { + if (length_ < 0 || pos > static_cast(length_)) + return npos; + + const char* result = std::search(ptr_ + pos, ptr_ + length_, + s.ptr_, s.ptr_ + s.length_); + const size_type xpos = result - ptr_; + return xpos + s.length_ <= length_ ? 
xpos : npos; +} + +int StringPiece::find(char c, size_type pos) const { + if (length_ <= 0 || pos >= static_cast(length_)) { + return npos; + } + const char* result = std::find(ptr_ + pos, ptr_ + length_, c); + return result != ptr_ + length_ ? result - ptr_ : npos; +} + +int StringPiece::rfind(const StringPiece& s, size_type pos) const { + if (length_ < s.length_) return npos; + const size_t ulen = length_; + if (s.length_ == 0) return min(ulen, pos); + + const char* last = ptr_ + min(ulen - s.length_, pos) + s.length_; + const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_); + return result != last ? result - ptr_ : npos; +} + +int StringPiece::rfind(char c, size_type pos) const { + if (length_ <= 0) return npos; + for (int i = min(pos, static_cast(length_ - 1)); + i >= 0; --i) { + if (ptr_[i] == c) { + return i; + } + } + return npos; +} + +StringPiece StringPiece::substr(size_type pos, size_type n) const { + if (pos > length_) pos = length_; + if (n > length_ - pos) n = length_ - pos; + return StringPiece(ptr_ + pos, n); +} + +const StringPiece::size_type StringPiece::npos = size_type(-1); diff --git a/outside/re2/util/stringprintf.cc b/outside/re2/util/stringprintf.cc new file mode 100644 index 000000000..c908181e5 --- /dev/null +++ b/outside/re2/util/stringprintf.cc @@ -0,0 +1,78 @@ +// Copyright 2002 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "util/util.h" + +namespace re2 { + +static void StringAppendV(string* dst, const char* format, va_list ap) { + // First try with a small fixed size buffer + char space[1024]; + + // It's possible for methods that use a va_list to invalidate + // the data in it upon use. The fix is to make a copy + // of the structure before using it and use that copy instead. 
+ va_list backup_ap; + va_copy(backup_ap, ap); + int result = vsnprintf(space, sizeof(space), format, backup_ap); + va_end(backup_ap); + + if ((result >= 0) && (result < sizeof(space))) { + // It fit + dst->append(space, result); + return; + } + + // Repeatedly increase buffer size until it fits + int length = sizeof(space); + while (true) { + if (result < 0) { + // Older behavior: just try doubling the buffer size + length *= 2; + } else { + // We need exactly "result+1" characters + length = result+1; + } + char* buf = new char[length]; + + // Restore the va_list before we use it again + va_copy(backup_ap, ap); + result = vsnprintf(buf, length, format, backup_ap); + va_end(backup_ap); + + if ((result >= 0) && (result < length)) { + // It fit + dst->append(buf, result); + delete[] buf; + return; + } + delete[] buf; + } +} + +string StringPrintf(const char* format, ...) { + va_list ap; + va_start(ap, format); + string result; + StringAppendV(&result, format, ap); + va_end(ap); + return result; +} + +void SStringPrintf(string* dst, const char* format, ...) { + va_list ap; + va_start(ap, format); + dst->clear(); + StringAppendV(dst, format, ap); + va_end(ap); +} + +void StringAppendF(string* dst, const char* format, ...) { + va_list ap; + va_start(ap, format); + StringAppendV(dst, format, ap); + va_end(ap); +} + +} // namespace re2 diff --git a/outside/re2/util/strutil.cc b/outside/re2/util/strutil.cc new file mode 100644 index 000000000..6ab79b3c6 --- /dev/null +++ b/outside/re2/util/strutil.cc @@ -0,0 +1,97 @@ +// Copyright 1999-2005 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "util/util.h" +#include "re2/stringpiece.h" + +namespace re2 { + +// ---------------------------------------------------------------------- +// CEscapeString() +// Copies 'src' to 'dest', escaping dangerous characters using +// C-style escape sequences. 
'src' and 'dest' should not overlap.
+//    Returns the number of bytes written to 'dest' (not including the \0)
+//    or -1 if there was insufficient space.  A byte outside printable
+//    ASCII needs 5 free bytes in 'dest': its 4-character octal escape
+//    plus the \0 that sprintf() stores right after it.
+// ----------------------------------------------------------------------
+int CEscapeString(const char* src, int src_len, char* dest,
+                  int dest_len) {
+  const char* src_end = src + src_len;
+  int used = 0;
+
+  for (; src < src_end; src++) {
+    if (dest_len - used < 2)   // Need space for two letter escape
+      return -1;
+
+    unsigned char c = *src;
+    switch (c) {
+      case '\n': dest[used++] = '\\'; dest[used++] = 'n';  break;
+      case '\r': dest[used++] = '\\'; dest[used++] = 'r';  break;
+      case '\t': dest[used++] = '\\'; dest[used++] = 't';  break;
+      case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
+      case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
+      case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
+      default:
+        // Anything outside printable ASCII is written as a fixed-width,
+        // three-digit octal escape (\ooo).  Because all three digits are
+        // always present, a digit that follows in 'src' cannot be read as
+        // part of the escape by a C compiler.
+        if (c < ' ' || c > '~') {
+          // sprintf() writes the 4 escape characters AND a terminating \0,
+          // so 5 bytes must be free; checking for only 4 let the \0 land
+          // one byte past the end of 'dest'.
+          if (dest_len - used < 5)
+            return -1;
+          sprintf(dest + used, "\\%03o", c);
+          used += 4;   // the \0 is overwritten by whatever comes next
+        } else {
+          dest[used++] = c;
+        }
+        break;
+    }
+  }
+
+  if (dest_len - used < 1)   // make sure that there is room for \0
+    return -1;
+
+  dest[used] = '\0';   // doesn't count towards return value though
+  return used;
+}
+
+
+// ----------------------------------------------------------------------
+// CEscape()
+//    Copies 'src' to result, escaping dangerous characters using
+//    C-style escape sequences.  'src' and 'dest' should not overlap.
+// ----------------------------------------------------------------------
+string CEscape(const StringPiece& src) {
+  // Size the scratch buffer for the worst case: every input byte turns
+  // into a 4-character escape, plus one byte for the trailing \0.
+  const int capacity = src.size() * 4 + 1;
+  char* buffer = new char[capacity];
+  const int written = CEscapeString(src.data(), src.size(),
+                                    buffer, capacity);
+  string escaped(buffer, written);
+  delete[] buffer;
+  return escaped;
+}
+
+// Returns a copy of 'prefix' with its last byte incremented.  Trailing
+// bytes equal to 255 cannot be incremented, so they are dropped and the
+// byte before them is incremented instead.  If 'prefix' is empty or
+// made up only of 255 bytes, the empty string is returned.
+string PrefixSuccessor(const StringPiece& prefix) {
+  string limit(prefix.data(), prefix.size());
+  // Walk backwards from the final byte, trimming 255s as we go.
+  for (int last = limit.length() - 1; last >= 0; last--) {
+    if ((limit[last]&255) != 255) {
+      limit[last]++;
+      return limit;
+    }
+    limit.erase(last);
+  }
+  // Nothing left that could be incremented.
+  return "";
+}
+
+}  // namespace re2
diff --git a/outside/re2/util/test.cc b/outside/re2/util/test.cc
new file mode 100644
index 000000000..0644829d8
--- /dev/null
+++ b/outside/re2/util/test.cc
@@ -0,0 +1,39 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include // NOTE(review): header name is missing in this copy (printf is used below) - confirm against upstream
+#include // NOTE(review): header name is missing in this copy (getrusage is used below) - confirm against upstream
+#include "util/test.h"
+
+DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
+
+struct Test {  // one registered test: entry point plus display name
+  void (*fn)(void);
+  const char *name;
+};
+
+static Test tests[10000];  // fixed-capacity registry filled by RegisterTest()
+static int ntests;  // number of entries of tests[] in use
+
+void RegisterTest(void (*fn)(void), const char *name) {  // NOTE(review): no check that ntests < 10000
+  tests[ntests].fn = fn;
+  tests[ntests++].name = name;  // name pointer is stored, not copied
+}
+
+namespace re2 {
+int64 VirtualProcessSize() {  // despite the name, derived from ru_maxrss (maximum resident set size)
+  struct rusage ru;
+  getrusage(RUSAGE_SELF, &ru);  // NOTE(review): return value is not checked
+  return (int64)ru.ru_maxrss*1024;  // assumes ru_maxrss is in kilobytes - TODO confirm per OS
+}
+}  // namespace re2
+
+int main(int argc, char **argv) {  // runs every registered test in registration order; argc/argv are unused
+  for (int i = 0; i < ntests; i++) {
+    printf("%s\n", tests[i].name);  // name is printed before the test runs
+    tests[i].fn();
+  }
+  printf("PASS\n");  // reached only if every test function returned
+  return 0;
+}
diff --git a/outside/re2/util/test.h b/outside/re2/util/test.h
new file mode 100644
index 000000000..0f9386555
--- /dev/null
+++ b/outside/re2/util/test.h
@@ -0,0 +1,57 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_UTIL_TEST_H__
+#define RE2_UTIL_TEST_H__
+
+#include "util/util.h"
+#include "util/flags.h"
+
+#define TEST(x, y) \
+  void x##y(void); \
+  TestRegisterer r##x##y(x##y, # x "." # y); \
+  void x##y(void)
+
+void RegisterTest(void (*)(void), const char*);  // adds one test to the global registry
+
+class TestRegisterer {  // TEST() defines a global of this type, so its constructor registers the test
+ public:
+  TestRegisterer(void (*fn)(void), const char *s) {
+    RegisterTest(fn, s);
+  }
+};
+
+// TODO(rsc): Do a better job.
+#define EXPECT_EQ CHECK_EQ
+#define EXPECT_TRUE CHECK
+#define EXPECT_LT CHECK_LT
+#define EXPECT_GT CHECK_GT
+#define EXPECT_LE CHECK_LE
+#define EXPECT_GE CHECK_GE
+#define EXPECT_FALSE(x) CHECK(!(x))
+
+#define ARRAYSIZE arraysize
+
+#define EXPECT_TRUE_M(x, y) CHECK(x) << (y)
+#define EXPECT_FALSE_M(x, y) CHECK(!(x)) << (y)
+#define ASSERT_TRUE_M(x, y) CHECK(x) << (y)
+#define ASSERT_EQUALS(x, y) CHECK_EQ(x, y)
+
+const bool UsingMallocCounter = false;  // the MallocCounter below is a no-op stub
+namespace testing {
+class MallocCounter {  // stub: ignores its argument and always reports zero heap growth
+ public:
+  MallocCounter(int x) { }
+  static const int THIS_THREAD_ONLY = 0;
+  long long HeapGrowth() { return 0; }
+  long long PeakHeapGrowth() { return 0; }
+  void Reset() { }
+};
+}  // namespace testing
+
+namespace re2 {
+int64 VirtualProcessSize();
+}  // namespace re2
+
+#endif  // RE2_UTIL_TEST_H__
diff --git a/outside/re2/util/thread.cc b/outside/re2/util/thread.cc
new file mode 100644
index 000000000..734999153
--- /dev/null
+++ b/outside/re2/util/thread.cc
@@ -0,0 +1,44 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include // NOTE(review): header name is missing in this copy (pthread_* is used below) - confirm against upstream
+
+#include "util/util.h"
+#include "util/thread.h"
+
+Thread::Thread() {  // a new Thread is not running and not joinable
+  pid_ = 0;
+  running_ = 0;
+  joinable_ = 0;
+}
+
+Thread::~Thread() {
+}
+
+void *startThread(void *v) {  // pthread entry point; v is the Thread* handed over by Start()
+  Thread* t = (Thread*)v;
+  t->Run();
+  return 0;
+}
+
+void Thread::Start() {  // spawns the thread; must not already be running
+  CHECK(!running_);
+  pthread_create(&pid_, 0, startThread, this);  // NOTE(review): result ignored; running_ is set even on failure
+  running_ = true;
+  if (!joinable_)
+    pthread_detach(pid_);  // non-joinable threads are detached right away
+}
+
+void Thread::Join() {  // waits for a running, joinable thread to finish
+  CHECK(running_);
+  CHECK(joinable_);
+  void *val;  // receives the thread's return value; not used afterwards
+  pthread_join(pid_, &val);
+  running_ = 0;
+}
+
+void Thread::SetJoinable(bool j) {  // only allowed before Start()
+  CHECK(!running_);
+  joinable_ = j;
+}
diff --git a/outside/re2/util/thread.h b/outside/re2/util/thread.h
new file mode 100644
index 000000000..b9610e045
--- /dev/null
+++ b/outside/re2/util/thread.h
@@ -0,0 +1,26 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_UTIL_THREAD_H__
+#define RE2_UTIL_THREAD_H__
+
+#include // NOTE(review): header name is missing in this copy (pthread_t is used below) - confirm against upstream
+
+class Thread {  // abstract thread wrapper: subclasses supply Run()
+ public:
+  Thread();
+  virtual ~Thread();
+  void Start();  // creates the thread, which calls Run()
+  void Join();  // valid only for a running thread marked joinable
+  void SetJoinable(bool);  // must be called before Start()
+  virtual void Run() = 0;  // thread body
+
+ private:
+  pthread_t pid_;  // handle filled in by pthread_create
+  bool running_;  // set by Start(), cleared by Join()
+  bool joinable_;  // when false, Start() detaches the thread
+};
+
+#endif  // RE2_UTIL_THREAD_H__
+
diff --git a/outside/re2/util/utf.h b/outside/re2/util/utf.h
new file mode 100644
index 000000000..06ff8f03e
--- /dev/null
+++ b/outside/re2/util/utf.h
@@ -0,0 +1,43 @@
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ *              Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ *
+ * This file and rune.cc have been converted to compile as C++ code
+ * in name space re2.
+ */
+#ifndef RE2_UTIL_UTF_H__
+#define RE2_UTIL_UTF_H__
+
+#include // NOTE(review): header name is missing in this copy - confirm against upstream
+
+namespace re2 {
+
+typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
+
+enum
+{
+  UTFmax = 4, /* maximum bytes per rune */
+  Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
+  Runeself = 0x80, /* rune and UTF sequences are the same (<) */
+  Runeerror = 0xFFFD, /* decoding error in UTF */
+  Runemax = 0x10FFFF, /* maximum rune value */
+};
+
+int runetochar(char* s, const Rune* r);  // declarations only; presumably defined in rune.cc (named in the note above)
+int chartorune(Rune* r, const char* s);
+int fullrune(const char* s, int n);
+int utflen(const char* s);
+char* utfrune(const char*, Rune);
+
+} // namespace re2
+
+#endif // RE2_UTIL_UTF_H__
diff --git a/outside/re2/util/util.h b/outside/re2/util/util.h
new file mode 100644
index 000000000..fc6e01ba7
--- /dev/null
+++ b/outside/re2/util/util.h
@@ -0,0 +1,122 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_UTIL_UTIL_H__
+#define RE2_UTIL_UTIL_H__
+
+// C  (NOTE(review): the header names of the #include lines below are missing in this copy)
+#include
+#include
+#include
+#include // For size_t
+#include
+#include
+#include
+#include
+#include // For isdigit, isalpha.
+
+// C++  (NOTE(review): header names missing here too; the using-declarations that follow show which std names are needed)
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// Use std names.
+using std::set;
+using std::pair;
+using std::vector;
+using std::string;
+using std::min;
+using std::max;
+using std::ostream;
+using std::map;
+using std::stack;
+using std::sort;
+using std::swap;
+using std::make_pair;
+
+#if defined(__GNUC__) && !defined(USE_CXX0X) && !defined(_LIBCPP_ABI_VERSION) && !defined(OS_ANDROID)
+
+#include // NOTE(review): header name missing in this copy; must provide std::tr1::unordered_set
+using std::tr1::unordered_set;
+
+#else
+
+#include // NOTE(review): header name missing in this copy; must provide unordered_set for both branches below
+#if defined(WIN32) || defined(OS_ANDROID)
+using std::tr1::unordered_set;
+#else
+using std::unordered_set;
+#endif
+
+#endif
+
+namespace re2 {
+
+typedef int8_t int8;  // short aliases for the fixed-width integer types
+typedef uint8_t uint8;
+typedef int16_t int16;
+typedef uint16_t uint16;
+typedef int32_t int32;
+typedef uint32_t uint32;
+typedef int64_t int64;
+typedef uint64_t uint64;
+
+typedef unsigned long ulong;
+typedef unsigned int uint;
+typedef unsigned short ushort;
+
+// COMPILE_ASSERT causes a compile error about msg if expr is not true.
+template struct CompileAssert {};  // NOTE(review): parameter list looks lost in this copy; used below as CompileAssert<(bool(expr))>
+#define COMPILE_ASSERT(expr, msg) \
+  typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
+
+// DISALLOW_EVIL_CONSTRUCTORS disallows the copy and operator= functions.
+// It goes in the private: declarations in a class.
+#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \
+  TypeName(const TypeName&); \
+  void operator=(const TypeName&)
+
+#define arraysize(array) (sizeof(array)/sizeof((array)[0]))
+
+class StringPiece;  // forward declaration; only used by reference below
+
+string CEscape(const StringPiece& src);
+int CEscapeString(const char* src, int src_len, char* dest, int dest_len);
+
+extern string StringPrintf(const char* format, ...);
+extern void SStringPrintf(string* dst, const char* format, ...);
+extern void StringAppendF(string* dst, const char* format, ...);
+extern string PrefixSuccessor(const StringPiece& prefix);
+
+uint32 hashword(const uint32*, size_t, uint32);  // declared only; defined elsewhere
+void hashword2(const uint32*, size_t, uint32*, uint32*);
+
+static inline uint32 Hash32StringWithSeed(const char* s, int len, uint32 seed) {  // 32-bit hash of s via hashword()
+  return hashword((uint32*)s, len/4, seed);  // NOTE(review): passes len/4 words, so trailing len%4 bytes are not hashed; s is read as uint32
+}
+
+static inline uint64 Hash64StringWithSeed(const char* s, int len, uint32 seed) {  // 64-bit hash of s via hashword2()
+  uint32 x, y;
+  x = seed;  // seed goes in through x; y starts at 0
+  y = 0;
+  hashword2((uint32*)s, len/4, &x, &y);  // NOTE(review): same len/4 truncation and uint32 cast as above
+  return ((uint64)x << 32) | y;  // x becomes the high 32 bits, y the low 32 bits
+}
+
+int RunningOnValgrind();
+
+}  // namespace re2
+
+#include "util/arena.h"
+#include "util/logging.h"
+#include "util/mutex.h"
+#include "util/utf.h"
+
+#endif // RE2_UTIL_UTIL_H__
diff --git a/outside/re2/util/valgrind.cc b/outside/re2/util/valgrind.cc
new file mode 100644
index 000000000..7115c8efd
--- /dev/null
+++ b/outside/re2/util/valgrind.cc
@@ -0,0 +1,24 @@
+// Copyright 2009 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "util/util.h"
+#include "util/valgrind.h"
+
+namespace re2 {
+
+#ifndef __has_feature
+#define __has_feature(x) 0  // compilers without __has_feature: every feature test is false
+#endif
+
+int RunningOnValgrind() {  // nonzero when built with MemorySanitizer or when RUNNING_ON_VALGRIND says so
+#if __has_feature(memory_sanitizer)
+  return true;  // a MemorySanitizer build is reported the same way as Valgrind
+#elif defined(RUNNING_ON_VALGRIND)
+  return RUNNING_ON_VALGRIND;  // presumably supplied by util/valgrind.h - confirm there
+#else
+  return 0;
+#endif
+}
+
+}  // namespace re2
diff --git a/outside/re2/util/valgrind.h b/outside/re2/util/valgrind.h
new file mode 100644
index 000000000..ca10b1a0d
--- /dev/null
+++ b/outside/re2/util/valgrind.h
@@ -0,0 +1,4517 @@
+/* -*- c -*-
+   ----------------------------------------------------------------
+
+   Notice that the following BSD-style license applies to this one
+   file (valgrind.h) only. The rest of Valgrind is licensed under the
+   terms of the GNU General Public License, version 2, unless
+   otherwise indicated. See the COPYING file in the source
+   distribution for details.
+
+   ----------------------------------------------------------------
+
+   This file is part of Valgrind, a dynamic binary instrumentation
+   framework.
+
+   Copyright (C) 2000-2009 Julian Seward. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. The origin of this software must not be misrepresented; you must
+      not claim that you wrote the original software. If you use this
+      software in a product, an acknowledgment in the product
+      documentation would be appreciated but is not required.
+
+   3. Altered source versions must be plainly marked as such, and must
+      not be misrepresented as being the original software.
+
+   4. The name of the author may not be used to endorse or promote
+      products derived from this software without specific prior written
+      permission.
+ + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---------------------------------------------------------------- + + Notice that the above BSD-style license applies to this one file + (valgrind.h) only. The entire rest of Valgrind is licensed under + the terms of the GNU General Public License, version 2. See the + COPYING file in the source distribution for details. + + ---------------------------------------------------------------- +*/ + + +/* This file is for inclusion into client (your!) code. + + You can use these macros to manipulate and query Valgrind's + execution inside your own programs. + + The resulting executables will still run without Valgrind, just a + little bit more slowly than they otherwise would, but otherwise + unchanged. When not running on valgrind, each client request + consumes very few (eg. 7) instructions, so the resulting performance + loss is negligible unless you plan to execute client requests + millions of times per second. Nevertheless, if that is still a + problem, you can compile with the NVALGRIND symbol defined (gcc + -DNVALGRIND) so that client requests are not even compiled in. */ + +#ifndef __VALGRIND_H +#define __VALGRIND_H + +#include + +/* Nb: this file might be included in a file compiled with -ansi. 
So + we can't use C++ style "//" comments nor the "asm" keyword (instead + use "__asm__"). */ + +/* Derive some tags indicating what the target platform is. Note + that in this file we're using the compiler's CPP symbols for + identifying architectures, which are different to the ones we use + within the rest of Valgrind. Note, __powerpc__ is active for both + 32 and 64-bit PPC, whereas __powerpc64__ is only active for the + latter (on Linux, that is). + + Misc note: how to find out what's predefined in gcc by default: + gcc -Wp,-dM somefile.c +*/ +#undef PLAT_ppc64_aix5 +#undef PLAT_ppc32_aix5 +#undef PLAT_x86_darwin +#undef PLAT_amd64_darwin +#undef PLAT_x86_linux +#undef PLAT_amd64_linux +#undef PLAT_ppc32_linux +#undef PLAT_ppc64_linux +#undef PLAT_arm_linux + +#if defined(_AIX) && defined(__64BIT__) +# define PLAT_ppc64_aix5 1 +#elif defined(_AIX) && !defined(__64BIT__) +# define PLAT_ppc32_aix5 1 +#elif defined(__APPLE__) && defined(__i386__) +# define PLAT_x86_darwin 1 +#elif defined(__APPLE__) && defined(__x86_64__) +# define PLAT_amd64_darwin 1 +#elif defined(__linux__) && defined(__i386__) +# define PLAT_x86_linux 1 +#elif defined(__linux__) && defined(__x86_64__) +# define PLAT_amd64_linux 1 +#elif defined(__linux__) && defined(__powerpc__) && !defined(__powerpc64__) +# define PLAT_ppc32_linux 1 +#elif defined(__linux__) && defined(__powerpc__) && defined(__powerpc64__) +# define PLAT_ppc64_linux 1 +#elif defined(__linux__) && defined(__arm__) +# define PLAT_arm_linux 1 +#else +/* If we're not compiling for our target platform, don't generate + any inline asms. */ +# if !defined(NVALGRIND) +# define NVALGRIND 1 +# endif +#endif + + +/* ------------------------------------------------------------------ */ +/* ARCHITECTURE SPECIFICS for SPECIAL INSTRUCTIONS. There is nothing */ +/* in here of use to end-users -- skip to the next section. 
*/ +/* ------------------------------------------------------------------ */ + +#if defined(NVALGRIND) + +/* Define NVALGRIND to completely remove the Valgrind magic sequence + from the compiled code (analogous to NDEBUG's effects on + assert()) */ +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + { \ + (_zzq_rlval) = (_zzq_default); \ + } + +#else /* ! NVALGRIND */ + +/* The following defines the magic code sequences which the JITter + spots and handles magically. Don't look too closely at them as + they will rot your brain. + + The assembly code sequences for all architectures is in this one + file. This is because this file must be stand-alone, and we don't + want to have multiple files. + + For VALGRIND_DO_CLIENT_REQUEST, we must ensure that the default + value gets put in the return slot, so that everything works when + this is executed not under Valgrind. Args are passed in a memory + block, and so there's no intrinsic limit to the number that could + be passed, but it's currently five. + + The macro args are: + _zzq_rlval result lvalue + _zzq_default default value (result returned when running on real CPU) + _zzq_request request code + _zzq_arg1..5 request params + + The other two macros are used to support function wrapping, and are + a lot simpler. VALGRIND_GET_NR_CONTEXT returns the value of the + guest's NRADDR pseudo-register and whatever other information is + needed to safely run the call original from the wrapper: on + ppc64-linux, the R2 value at the divert point is also needed. This + information is abstracted into a user-visible type, OrigFn. + + VALGRIND_CALL_NOREDIR_* behaves the same as the following on the + guest, but guarantees that the branch instruction will not be + redirected: x86: call *%eax, amd64: call *%rax, ppc32/ppc64: + branch-and-link-to-r11. 
VALGRIND_CALL_NOREDIR is just text, not a + complete inline asm, since it needs to be combined with more magic + inline asm stuff to be useful. +*/ + +/* ------------------------- x86-{linux,darwin} ---------------- */ + +#if defined(PLAT_x86_linux) || defined(PLAT_x86_darwin) + +typedef + struct { + unsigned int nraddr; /* where's the code? */ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "roll $3, %%edi ; roll $13, %%edi\n\t" \ + "roll $29, %%edi ; roll $19, %%edi\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + { volatile unsigned int _zzq_args[6]; \ + volatile unsigned int _zzq_result; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %EDX = client_request ( %EAX ) */ \ + "xchgl %%ebx,%%ebx" \ + : "=d" (_zzq_result) \ + : "a" (&_zzq_args[0]), "0" (_zzq_default) \ + : "cc", "memory" \ + ); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %EAX = guest_NRADDR */ \ + "xchgl %%ecx,%%ecx" \ + : "=a" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_CALL_NOREDIR_EAX \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* call-noredir *%EAX */ \ + "xchgl %%edx,%%edx\n\t" +#endif /* PLAT_x86_linux || PLAT_x86_darwin */ + +/* ------------------------ amd64-{linux,darwin} --------------- */ + +#if defined(PLAT_amd64_linux) || defined(PLAT_amd64_darwin) + +typedef + struct { + unsigned long long int nraddr; /* where's the code? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rolq $3, %%rdi ; rolq $13, %%rdi\n\t" \ + "rolq $61, %%rdi ; rolq $51, %%rdi\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + { volatile unsigned long long int _zzq_args[6]; \ + volatile unsigned long long int _zzq_result; \ + _zzq_args[0] = (unsigned long long int)(_zzq_request); \ + _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %RDX = client_request ( %RAX ) */ \ + "xchgq %%rbx,%%rbx" \ + : "=d" (_zzq_result) \ + : "a" (&_zzq_args[0]), "0" (_zzq_default) \ + : "cc", "memory" \ + ); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned long long int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %RAX = guest_NRADDR */ \ + "xchgq %%rcx,%%rcx" \ + : "=a" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_CALL_NOREDIR_RAX \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* call-noredir *%RAX */ \ + "xchgq %%rdx,%%rdx\n\t" +#endif /* PLAT_amd64_linux || PLAT_amd64_darwin */ + +/* ------------------------ ppc32-linux ------------------------ */ + +#if defined(PLAT_ppc32_linux) + +typedef + struct { + unsigned int nraddr; /* where's the code? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rlwinm 0,0,3,0,0 ; rlwinm 0,0,13,0,0\n\t" \ + "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { unsigned int _zzq_args[6]; \ + unsigned int _zzq_result; \ + unsigned int* _zzq_ptr; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile("mr 3,%1\n\t" /*default*/ \ + "mr 4,%2\n\t" /*ptr*/ \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1\n\t" \ + "mr %0,3" /*result*/ \ + : "=b" (_zzq_result) \ + : "b" (_zzq_default), "b" (_zzq_ptr) \ + : "cc", "memory", "r3", "r4"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "cc", "memory", "r3" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" +#endif /* PLAT_ppc32_linux */ + +/* ------------------------ ppc64-linux ------------------------ */ + +#if defined(PLAT_ppc64_linux) + +typedef + struct { + unsigned long long int nraddr; /* where's the code? */ + unsigned long long int r2; /* what tocptr do we need? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rotldi 0,0,3 ; rotldi 0,0,13\n\t" \ + "rotldi 0,0,61 ; rotldi 0,0,51\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { unsigned long long int _zzq_args[6]; \ + register unsigned long long int _zzq_result __asm__("r3"); \ + register unsigned long long int* _zzq_ptr __asm__("r4"); \ + _zzq_args[0] = (unsigned long long int)(_zzq_request); \ + _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1" \ + : "=r" (_zzq_result) \ + : "0" (_zzq_default), "r" (_zzq_ptr) \ + : "cc", "memory"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + register unsigned long long int __addr __asm__("r3"); \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2" \ + : "=r" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR_GPR2 */ \ + "or 4,4,4" \ + : "=r" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->r2 = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" + +#endif /* PLAT_ppc64_linux */ + +/* ------------------------- arm-linux ------------------------- */ + +#if defined(PLAT_arm_linux) + +typedef + struct { + unsigned int nraddr; /* where's the code? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "mov r12, r12, ror #3 ; mov r12, r12, ror #13 \n\t" \ + "mov r12, r12, ror #29 ; mov r12, r12, ror #19 \n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { volatile unsigned int _zzq_args[6]; \ + volatile unsigned int _zzq_result; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + __asm__ volatile("mov r3, %1\n\t" /*default*/ \ + "mov r4, %2\n\t" /*ptr*/ \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* R3 = client_request ( R4 ) */ \ + "orr r10, r10, r10\n\t" \ + "mov %0, r3" /*result*/ \ + : "=r" (_zzq_result) \ + : "r" (_zzq_default), "r" (&_zzq_args[0]) \ + : "cc","memory", "r3", "r4"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* R3 = guest_NRADDR */ \ + "orr r11, r11, r11\n\t" \ + "mov %0, r3" \ + : "=r" (__addr) \ + : \ + : "cc", "memory", "r3" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R4 */ \ + "orr r12, r12, r12\n\t" + +#endif /* PLAT_arm_linux */ + +/* ------------------------ ppc32-aix5 ------------------------- */ + +#if defined(PLAT_ppc32_aix5) + +typedef + struct { + unsigned int nraddr; /* where's the code? */ + unsigned int r2; /* what tocptr do we need? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rlwinm 0,0,3,0,0 ; rlwinm 0,0,13,0,0\n\t" \ + "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { unsigned int _zzq_args[7]; \ + register unsigned int _zzq_result; \ + register unsigned int* _zzq_ptr; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + _zzq_args[6] = (unsigned int)(_zzq_default); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile("mr 4,%1\n\t" \ + "lwz 3, 24(4)\n\t" \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1\n\t" \ + "mr %0,3" \ + : "=b" (_zzq_result) \ + : "b" (_zzq_ptr) \ + : "r3", "r4", "cc", "memory"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + register unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "r3", "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR_GPR2 */ \ + "or 4,4,4\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "r3", "cc", "memory" \ + ); \ + _zzq_orig->r2 = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" + +#endif /* PLAT_ppc32_aix5 */ + +/* ------------------------ ppc64-aix5 ------------------------- */ + +#if defined(PLAT_ppc64_aix5) + +typedef + struct { + unsigned long long int nraddr; /* where's the code? 
*/ + unsigned long long int r2; /* what tocptr do we need? */ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rotldi 0,0,3 ; rotldi 0,0,13\n\t" \ + "rotldi 0,0,61 ; rotldi 0,0,51\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { unsigned long long int _zzq_args[7]; \ + register unsigned long long int _zzq_result; \ + register unsigned long long int* _zzq_ptr; \ + _zzq_args[0] = (unsigned int long long)(_zzq_request); \ + _zzq_args[1] = (unsigned int long long)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int long long)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int long long)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int long long)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int long long)(_zzq_arg5); \ + _zzq_args[6] = (unsigned int long long)(_zzq_default); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile("mr 4,%1\n\t" \ + "ld 3, 48(4)\n\t" \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1\n\t" \ + "mr %0,3" \ + : "=b" (_zzq_result) \ + : "b" (_zzq_ptr) \ + : "r3", "r4", "cc", "memory"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + register unsigned long long int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "r3", "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR_GPR2 */ \ + "or 4,4,4\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "r3", "cc", "memory" \ + ); \ + _zzq_orig->r2 = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" + +#endif /* PLAT_ppc64_aix5 */ + +/* Insert assembly code for other platforms here... 
*/ + +#endif /* NVALGRIND */ + + +/* ------------------------------------------------------------------ */ +/* PLATFORM SPECIFICS for FUNCTION WRAPPING. This is all very */ +/* ugly. It's the least-worst tradeoff I can think of. */ +/* ------------------------------------------------------------------ */ + +/* This section defines magic (a.k.a appalling-hack) macros for doing + guaranteed-no-redirection macros, so as to get from function + wrappers to the functions they are wrapping. The whole point is to + construct standard call sequences, but to do the call itself with a + special no-redirect call pseudo-instruction that the JIT + understands and handles specially. This section is long and + repetitious, and I can't see a way to make it shorter. + + The naming scheme is as follows: + + CALL_FN_{W,v}_{v,W,WW,WWW,WWWW,5W,6W,7W,etc} + + 'W' stands for "word" and 'v' for "void". Hence there are + different macros for calling arity 0, 1, 2, 3, 4, etc, functions, + and for each, the possibility of returning a word-typed result, or + no result. +*/ + +/* Use these to write the name of your wrapper. NOTE: duplicates + VG_WRAP_FUNCTION_Z{U,Z} in pub_tool_redir.h. */ + +/* Use an extra level of macroisation so as to ensure the soname/fnname + args are fully macro-expanded before pasting them together. */ +#define VG_CONCAT4(_aa,_bb,_cc,_dd) _aa##_bb##_cc##_dd + +#define I_WRAP_SONAME_FNNAME_ZU(soname,fnname) \ + VG_CONCAT4(_vgwZU_,soname,_,fnname) + +#define I_WRAP_SONAME_FNNAME_ZZ(soname,fnname) \ + VG_CONCAT4(_vgwZZ_,soname,_,fnname) + +/* Use this macro from within a wrapper function to collect the + context (address and possibly other info) of the original function. + Once you have that you can then use it in one of the CALL_FN_ + macros. The type of the argument _lval is OrigFn. */ +#define VALGRIND_GET_ORIG_FN(_lval) VALGRIND_GET_NR_CONTEXT(_lval) + +/* Derivatives of the main macros below, for calling functions + returning void. 
*/ + +#define CALL_FN_v_v(fnptr) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_v(_junk,fnptr); } while (0) + +#define CALL_FN_v_W(fnptr, arg1) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_W(_junk,fnptr,arg1); } while (0) + +#define CALL_FN_v_WW(fnptr, arg1,arg2) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_WW(_junk,fnptr,arg1,arg2); } while (0) + +#define CALL_FN_v_WWW(fnptr, arg1,arg2,arg3) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_WWW(_junk,fnptr,arg1,arg2,arg3); } while (0) + +#define CALL_FN_v_WWWW(fnptr, arg1,arg2,arg3,arg4) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_WWWW(_junk,fnptr,arg1,arg2,arg3,arg4); } while (0) + +#define CALL_FN_v_5W(fnptr, arg1,arg2,arg3,arg4,arg5) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_5W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5); } while (0) + +#define CALL_FN_v_6W(fnptr, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_6W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5,arg6); } while (0) + +#define CALL_FN_v_7W(fnptr, arg1,arg2,arg3,arg4,arg5,arg6,arg7) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_7W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5,arg6,arg7); } while (0) + +/* ------------------------- x86-{linux,darwin} ---------------- */ + +#if defined(PLAT_x86_linux) || defined(PLAT_x86_darwin) + +/* These regs are trashed by the hidden call. No need to mention eax + as gcc can already see that, plus causes gcc to bomb. */ +#define __CALLER_SAVED_REGS /*"eax"*/ "ecx", "edx" + +/* These CALL_FN_ macros assume that on x86-linux, sizeof(unsigned + long) == 4. 
*/ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $4, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $8, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); 
\ + __asm__ volatile( \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $12, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $16, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $20, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + 
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + __asm__ volatile( \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $24, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + __asm__ volatile( \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $28, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { 
\ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $32, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $36, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ 
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + "pushl 40(%%eax)\n\t" \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $40, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + 
_argvec[11] = (unsigned long)(arg11); \ + __asm__ volatile( \ + "pushl 44(%%eax)\n\t" \ + "pushl 40(%%eax)\n\t" \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $44, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + _argvec[12] = (unsigned long)(arg12); \ + __asm__ volatile( \ + "pushl 48(%%eax)\n\t" \ + "pushl 44(%%eax)\n\t" \ + "pushl 40(%%eax)\n\t" \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $48, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* 
PLAT_x86_linux || PLAT_x86_darwin */ + +/* ------------------------ amd64-{linux,darwin} --------------- */ + +#if defined(PLAT_amd64_linux) || defined(PLAT_amd64_darwin) + +/* ARGREGS: rdi rsi rdx rcx r8 r9 (the rest on stack in R-to-L order) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS /*"rax",*/ "rcx", "rdx", "rsi", \ + "rdi", "r8", "r9", "r10", "r11" + +/* These CALL_FN_ macros assume that on amd64-linux, sizeof(unsigned + long) == 8. */ + +/* NB 9 Sept 07. There is a nasty kludge here in all these CALL_FN_ + macros. In order not to trash the stack redzone, we need to drop + %rsp by 128 before the hidden call, and restore afterwards. The + nastiness is that it is only by luck that the stack still appears + to be unwindable during the hidden call - since then the behaviour + of any routine using this macro does not match what the CFI data + says. Sigh. + + Why is this important? Imagine that a wrapper has a stack + allocated local, and passes to the hidden call, a pointer to it. + Because gcc does not know about the hidden call, it may allocate + that local in the redzone. Unfortunately the hidden call may then + trash it before it comes to use it. So we must step clear of the + redzone, for the duration of the hidden call, to make it safe. + + Probably the same problem afflicts the other redzone-style ABIs too + (ppc64-linux, ppc32-aix5, ppc64-aix5); but for those, the stack is + self-describing (none of this CFI nonsense) so at least messing + with the stack pointer doesn't give a danger of non-unwindable + stack. 
*/ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned 
long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 
8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" /* step clear of the redzone */ \ + "movq 48(%%rax), %%r9\n\t" /* arg6->%r9 */ \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" /* arg1->%rdi */ \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" /* restore %rsp only after the hidden call */ \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + 
"movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $8, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $16, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ 
+ _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $24, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $32, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = 
(__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 88(%%rax)\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $40, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned 
long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + _argvec[12] = (unsigned long)(arg12); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 96(%%rax)\n\t" \ + "pushq 88(%%rax)\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $48, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_amd64_linux || PLAT_amd64_darwin */ + +/* ------------------------ ppc32-linux ------------------------ */ + +#if defined(PLAT_ppc32_linux) + +/* This is useful for finding out about the on-stack stuff: + + extern int f9 ( int,int,int,int,int,int,int,int,int ); + extern int f10 ( int,int,int,int,int,int,int,int,int,int ); + extern int f11 ( int,int,int,int,int,int,int,int,int,int,int ); + extern int f12 ( int,int,int,int,int,int,int,int,int,int,int,int ); + + int g9 ( void ) { + return f9(11,22,33,44,55,66,77,88,99); + } + int g10 ( void ) { + return f10(11,22,33,44,55,66,77,88,99,110); + } + int g11 ( void ) { + return f11(11,22,33,44,55,66,77,88,99,110,121); + } + int g12 ( void ) { + return f12(11,22,33,44,55,66,77,88,99,110,121,132); + } +*/ + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. 
*/ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* These CALL_FN_ macros assume that on ppc32-linux, + sizeof(unsigned long) == 4. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = 
(__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + __asm__ volatile( 
\ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 
6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned 
long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-16\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,16\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-16\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,16\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", 
__CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + _argvec[11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-32\n\t" \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,16(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,32\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned 
long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + _argvec[11] = (unsigned long)arg11; \ + _argvec[12] = (unsigned long)arg12; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-32\n\t" \ + /* arg12 */ \ + "lwz 3,48(11)\n\t" \ + "stw 3,20(1)\n\t" \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,16(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,32\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc32_linux */ + +/* ------------------------ ppc64-linux ------------------------ */ + +#if defined(PLAT_ppc64_linux) + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* These CALL_FN_ macros assume that on ppc64-linux, sizeof(unsigned + long) == 8. 
*/ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+0]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+1]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+2]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + __asm__ 
volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+3]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+4]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = 
(unsigned long)arg4; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+5]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+6]; \ + volatile unsigned long _res; \ + /* 
_argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+7]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 
*/ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+8]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long 
_argvec[3+9]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-128\n\t" /* expand stack frame */ \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,128" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+10]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned 
long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-128\n\t" /* expand stack frame */ \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,128" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+11]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = 
(unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-144\n\t" /* expand stack frame */ \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,144" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+12]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + _argvec[2+12] = (unsigned long)arg12; \ + __asm__ volatile( \ 
+ "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-144\n\t" /* expand stack frame */ \ + /* arg12 */ \ + "ld 3,96(11)\n\t" \ + "std 3,136(1)\n\t" \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,144" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc64_linux */ + +/* ------------------------- arm-linux ------------------------- */ + +#if defined(PLAT_arm_linux) + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS "r0", "r1", "r2", "r3","r4","r14" + +/* These CALL_FN_ macros assume that on arm-linux, sizeof(unsigned + long) == 4. 
*/ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + "mov %0, r0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + "mov %0, r0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + "mov %0, r0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + 
_argvec[3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + "mov %0, r0\n" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +/* From five arguments on, args 1-4 are loaded into r0-r3 and the remaining ones are pushed on the stack. The AAPCS requires sp to be 8-byte aligned at a call, so the macros that push an odd number of words (5W, 7W, 9W) first reserve one padding word and release it again after the call. This assumes sp is 8-byte aligned on entry to the asm statement. */ +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + "sub sp, sp, #4 \n\t" /* pad: 1 pushed word + 1 pad = 8 bytes */ \ + "ldr r0, [%1, #20] \n\t" \ + "push {r0} \n\t" /* arg5 -> [sp] */ \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + "add sp, sp, #8 \n\t" /* drop arg5 and the pad word */ \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ +
: /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + __asm__ volatile( \ + "ldr r0, [%1, #20] \n\t" \ + "ldr r1, [%1, #24] \n\t" \ + "push {r0, r1} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + "add sp, sp, #8 \n\t" \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + __asm__ volatile( \ + "sub sp, sp, #4 \n\t" /* pad: 3 pushed words + 1 pad = 16 bytes, keeps sp 8-byte aligned (assumes aligned on entry) */ \ + "ldr r0, [%1, #20] \n\t" \ + "ldr r1, [%1, #24] \n\t" \ + "ldr r2, [%1, #28] \n\t" \ + "push {r0, r1, r2} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + "add sp, sp, #16 \n\t" /* drop args 5-7 and the pad word */ \ + "mov %0, r0" \ + :
/*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + "ldr r0, [%1, #20] \n\t" \ + "ldr r1, [%1, #24] \n\t" \ + "ldr r2, [%1, #28] \n\t" \ + "ldr r3, [%1, #32] \n\t" \ + "push {r0, r1, r2, r3} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + "add sp, sp, #16 \n\t" \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + "sub sp, sp, #4 \n\t" /* pad: 5 pushed words + 1 pad = 24 bytes, keeps sp 8-byte aligned (assumes aligned on entry) */ \ + "ldr r0, [%1, #20] \n\t" \ + "ldr r1, [%1, #24] \n\t"
\ + "ldr r2, [%1, #28] \n\t" \ + "ldr r3, [%1, #32] \n\t" \ + "ldr r4, [%1, #36] \n\t" \ + "push {r0, r1, r2, r3, r4} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + "add sp, sp, #24 \n\t" /* drop args 5-9 and the pad word */ \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + "ldr r0, [%1, #40] \n\t" \ + "push {r0} \n\t" \ + "ldr r0, [%1, #20] \n\t" \ + "ldr r1, [%1, #24] \n\t" \ + "ldr r2, [%1, #28] \n\t" \ + "ldr r3, [%1, #32] \n\t" \ + "ldr r4, [%1, #36] \n\t" \ + "push {r0, r1, r2, r3, r4} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + "add sp, sp, #24 \n\t" \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11) \ + do { \ + volatile OrigFn _orig =
(orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + __asm__ volatile( \ + "ldr r0, [%1, #40] \n\t" \ + "ldr r1, [%1, #44] \n\t" \ + "push {r0, r1} \n\t" \ + "ldr r0, [%1, #20] \n\t" \ + "ldr r1, [%1, #24] \n\t" \ + "ldr r2, [%1, #28] \n\t" \ + "ldr r3, [%1, #32] \n\t" \ + "ldr r4, [%1, #36] \n\t" \ + "push {r0, r1, r2, r3, r4} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + "add sp, sp, #28 \n\t" \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory",__CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + 
_argvec[12] = (unsigned long)(arg12); \ + __asm__ volatile( \ + "ldr r0, [%1, #40] \n\t" \ + "ldr r1, [%1, #44] \n\t" \ + "ldr r2, [%1, #48] \n\t" \ + "push {r0, r1, r2} \n\t" \ + "ldr r0, [%1, #20] \n\t" \ + "ldr r1, [%1, #24] \n\t" \ + "ldr r2, [%1, #28] \n\t" \ + "ldr r3, [%1, #32] \n\t" \ + "ldr r4, [%1, #36] \n\t" \ + "push {r0, r1, r2, r3, r4} \n\t" \ + "ldr r0, [%1, #4] \n\t" \ + "ldr r1, [%1, #8] \n\t" \ + "ldr r2, [%1, #12] \n\t" \ + "ldr r3, [%1, #16] \n\t" \ + "ldr r4, [%1] \n\t" /* target->r4 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ + "add sp, sp, #32 \n\t" \ + "mov %0, r0" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "0" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_arm_linux */ + +/* ------------------------ ppc32-aix5 ------------------------- */ + +#if defined(PLAT_ppc32_aix5) + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* Expand the stack frame, copying enough info that unwinding + still works. Trashes r3. */ + +#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr) \ + "addi 1,1,-" #_n_fr "\n\t" \ + "lwz 3," #_n_fr "(1)\n\t" \ + "stw 3,0(1)\n\t" + +#define VG_CONTRACT_FRAME_BY(_n_fr) \ + "addi 1,1," #_n_fr "\n\t" + +/* These CALL_FN_ macros assume that on ppc32-aix5, sizeof(unsigned + long) == 4. 
*/ + +#define CALL_FN_W_v(lval, orig) \ + do { /* zero-arg variant: _orig.nraddr is loaded into r11 as the target; the word left in r3 is stored to lval */ \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+0]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" /* r11 = &_argvec[2] */ \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" /* reload &_argvec[2]; r11 held the target */ \ + "mr %0,3\n\t" /* r3 -> _res */ \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+1]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); /* parenthesised so the cast covers the whole argument expression */ \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+2]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned
long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); \ + _argvec[2+2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+3]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); \ + _argvec[2+2] = (unsigned long)(arg2); \ + _argvec[2+3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+4]; \ + volatile
unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); \ + _argvec[2+2] = (unsigned long)(arg2); \ + _argvec[2+3] = (unsigned long)(arg3); \ + _argvec[2+4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+5]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); \ + _argvec[2+2] = (unsigned long)(arg2); \ + _argvec[2+3] = (unsigned long)(arg3); \ + _argvec[2+4] = (unsigned long)(arg4); \ + _argvec[2+5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr
11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+6]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); \ + _argvec[2+2] = (unsigned long)(arg2); \ + _argvec[2+3] = (unsigned long)(arg3); \ + _argvec[2+4] = (unsigned long)(arg4); \ + _argvec[2+5] = (unsigned long)(arg5); \ + _argvec[2+6] = (unsigned long)(arg6); \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+7]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); \ +
_argvec[2+2] = (unsigned long)(arg2); \ + _argvec[2+3] = (unsigned long)(arg3); \ + _argvec[2+4] = (unsigned long)(arg4); \ + _argvec[2+5] = (unsigned long)(arg5); \ + _argvec[2+6] = (unsigned long)(arg6); \ + _argvec[2+7] = (unsigned long)(arg7); \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+8]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); \ + _argvec[2+2] = (unsigned long)(arg2); \ + _argvec[2+3] = (unsigned long)(arg3); \ + _argvec[2+4] = (unsigned long)(arg4); \ + _argvec[2+5] = (unsigned long)(arg5); \ + _argvec[2+6] = (unsigned long)(arg6); \ + _argvec[2+7] = (unsigned long)(arg7); \ + _argvec[2+8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz
6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+9]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); \ + _argvec[2+2] = (unsigned long)(arg2); \ + _argvec[2+3] = (unsigned long)(arg3); \ + _argvec[2+4] = (unsigned long)(arg4); \ + _argvec[2+5] = (unsigned long)(arg5); \ + _argvec[2+6] = (unsigned long)(arg6); \ + _argvec[2+7] = (unsigned long)(arg7); \ + _argvec[2+8] = (unsigned long)(arg8); \ + _argvec[2+9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(64) /* extra frame space; arg9 is stored at 56(r1) below */ \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz
2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(64) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+10]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); \ + _argvec[2+2] = (unsigned long)(arg2); \ + _argvec[2+3] = (unsigned long)(arg3); \ + _argvec[2+4] = (unsigned long)(arg4); \ + _argvec[2+5] = (unsigned long)(arg5); \ + _argvec[2+6] = (unsigned long)(arg6); \ + _argvec[2+7] = (unsigned long)(arg7); \ + _argvec[2+8] = (unsigned long)(arg8); \ + _argvec[2+9] = (unsigned long)(arg9); \ + _argvec[2+10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(64) \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,60(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(64) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory",
__CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+11]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); \ + _argvec[2+2] = (unsigned long)(arg2); \ + _argvec[2+3] = (unsigned long)(arg3); \ + _argvec[2+4] = (unsigned long)(arg4); \ + _argvec[2+5] = (unsigned long)(arg5); \ + _argvec[2+6] = (unsigned long)(arg6); \ + _argvec[2+7] = (unsigned long)(arg7); \ + _argvec[2+8] = (unsigned long)(arg8); \ + _argvec[2+9] = (unsigned long)(arg9); \ + _argvec[2+10] = (unsigned long)(arg10); \ + _argvec[2+11] = (unsigned long)(arg11); \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(72) \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,64(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,60(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(72) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0)
+ +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+12]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); /* parenthesised so the cast covers the whole argument expression */ \ + _argvec[2+2] = (unsigned long)(arg2); \ + _argvec[2+3] = (unsigned long)(arg3); \ + _argvec[2+4] = (unsigned long)(arg4); \ + _argvec[2+5] = (unsigned long)(arg5); \ + _argvec[2+6] = (unsigned long)(arg6); \ + _argvec[2+7] = (unsigned long)(arg7); \ + _argvec[2+8] = (unsigned long)(arg8); \ + _argvec[2+9] = (unsigned long)(arg9); \ + _argvec[2+10] = (unsigned long)(arg10); \ + _argvec[2+11] = (unsigned long)(arg11); \ + _argvec[2+12] = (unsigned long)(arg12); \ + __asm__ volatile( \ + "mr 11,%1\n\t" /* r11 = &_argvec[2] */ \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(72) /* extra frame space; args 9-12 are stored at 56..68(r1) below */ \ + /* arg12 */ \ + "lwz 3,48(11)\n\t" \ + "stw 3,68(1)\n\t" \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,64(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,60(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" /* r3 -> _res */ \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(72) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval =
(__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc32_aix5 */ + +/* ------------------------ ppc64-aix5 ------------------------- */ + +#if defined(PLAT_ppc64_aix5) + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* Expand the stack frame, copying enough info that unwinding + still works. Trashes r3. */ + +#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr) \ + "addi 1,1,-" #_n_fr "\n\t" \ + "ld 3," #_n_fr "(1)\n\t" \ + "std 3,0(1)\n\t" + +#define VG_CONTRACT_FRAME_BY(_n_fr) \ + "addi 1,1," #_n_fr "\n\t" + +/* These CALL_FN_ macros assume that on ppc64-aix5, sizeof(unsigned + long) == 8. */ + +#define CALL_FN_W_v(lval, orig) \ + do { /* zero-arg variant: _orig.nraddr is loaded into r11 as the target; the word left in r3 is stored to lval */ \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+0]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" /* r11 = &_argvec[2] */ \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" /* reload &_argvec[2]; r11 held the target */ \ + "mr %0,3\n\t" /* r3 -> _res */ \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+1]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned
long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); /* parenthesised so the cast covers the whole argument expression */ \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+2]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); \ + _argvec[2+2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+3]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); \ + _argvec[2+2] =
(unsigned long)(arg2); \ + _argvec[2+3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+4]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); \ + _argvec[2+2] = (unsigned long)(arg2); \ + _argvec[2+3] = (unsigned long)(arg3); \ + _argvec[2+4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile
unsigned long _argvec[3+5]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)(arg1); \ + _argvec[2+2] = (unsigned long)(arg2); \ + _argvec[2+3] = (unsigned long)(arg3); \ + _argvec[2+4] = (unsigned long)(arg4); \ + _argvec[2+5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+6]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t"
/* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+7]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); 
\ + volatile unsigned long _argvec[3+8]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+9]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned 
long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(128) \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(128) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+10]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's 
tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(128) \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(128) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+11]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(144) \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" \ + /* arg10 */ \ + "ld 
3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(144) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+12]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + _argvec[2+12] = (unsigned long)arg12; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(144) \ + /* arg12 */ \ + "ld 3,96(11)\n\t" \ + "std 3,136(1)\n\t" \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" 
\ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(144) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc64_aix5 */ + + +/* ------------------------------------------------------------------ */ +/* ARCHITECTURE INDEPENDENT MACROS for CLIENT REQUESTS. */ +/* */ +/* ------------------------------------------------------------------ */ + +/* Some request codes. There are many more of these, but most are not + exposed to end-user view. These are the public ones, all of the + form 0x1000 + small_number. + + Core ones are in the range 0x00000000--0x0000ffff. The non-public + ones start at 0x2000. +*/ + +/* These macros are used by tools -- they must be public, but don't + embed them into other programs. */ +#define VG_USERREQ_TOOL_BASE(a,b) \ + ((unsigned int)(((a)&0xff) << 24 | ((b)&0xff) << 16)) +#define VG_IS_TOOL_USERREQ(a, b, v) \ + (VG_USERREQ_TOOL_BASE(a,b) == ((v) & 0xffff0000)) + +/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !! + This enum comprises an ABI exported by Valgrind to programs + which use client requests. DO NOT CHANGE THE ORDER OF THESE + ENTRIES, NOR DELETE ANY -- add new ones at the end. 
*/ +typedef + enum { VG_USERREQ__RUNNING_ON_VALGRIND = 0x1001, + VG_USERREQ__DISCARD_TRANSLATIONS = 0x1002, + + /* These allow any function to be called from the simulated + CPU but run on the real CPU. Nb: the first arg passed to + the function is always the ThreadId of the running + thread! So CLIENT_CALL0 actually requires a 1 arg + function, etc. */ + VG_USERREQ__CLIENT_CALL0 = 0x1101, + VG_USERREQ__CLIENT_CALL1 = 0x1102, + VG_USERREQ__CLIENT_CALL2 = 0x1103, + VG_USERREQ__CLIENT_CALL3 = 0x1104, + + /* Can be useful in regression testing suites -- eg. can + send Valgrind's output to /dev/null and still count + errors. */ + VG_USERREQ__COUNT_ERRORS = 0x1201, + + /* These are useful and can be interpreted by any tool that + tracks malloc() et al, by using vg_replace_malloc.c. */ + VG_USERREQ__MALLOCLIKE_BLOCK = 0x1301, + VG_USERREQ__FREELIKE_BLOCK = 0x1302, + /* Memory pool support. */ + VG_USERREQ__CREATE_MEMPOOL = 0x1303, + VG_USERREQ__DESTROY_MEMPOOL = 0x1304, + VG_USERREQ__MEMPOOL_ALLOC = 0x1305, + VG_USERREQ__MEMPOOL_FREE = 0x1306, + VG_USERREQ__MEMPOOL_TRIM = 0x1307, + VG_USERREQ__MOVE_MEMPOOL = 0x1308, + VG_USERREQ__MEMPOOL_CHANGE = 0x1309, + VG_USERREQ__MEMPOOL_EXISTS = 0x130a, + + /* Allow printfs to valgrind log. */ + /* The first two pass the va_list argument by value, which + assumes it is the same size as or smaller than a UWord, + which generally isn't the case. Hence are deprecated. + The second two pass the vargs by reference and so are + immune to this problem. */ + /* both :: char* fmt, va_list vargs (DEPRECATED) */ + VG_USERREQ__PRINTF = 0x1401, + VG_USERREQ__PRINTF_BACKTRACE = 0x1402, + /* both :: char* fmt, va_list* vargs */ + VG_USERREQ__PRINTF_VALIST_BY_REF = 0x1403, + VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF = 0x1404, + + /* Stack support. 
*/ + VG_USERREQ__STACK_REGISTER = 0x1501, + VG_USERREQ__STACK_DEREGISTER = 0x1502, + VG_USERREQ__STACK_CHANGE = 0x1503, + + /* Wine support */ + VG_USERREQ__LOAD_PDB_DEBUGINFO = 0x1601 + } Vg_ClientRequest; + +#if !defined(__GNUC__) +# define __extension__ /* */ +#endif + +/* Returns the number of Valgrinds this code is running under. That + is, 0 if running natively, 1 if running under Valgrind, 2 if + running under Valgrind which is running under another Valgrind, + etc. */ +#define RUNNING_ON_VALGRIND __extension__ \ + ({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* if not */, \ + VG_USERREQ__RUNNING_ON_VALGRIND, \ + 0, 0, 0, 0, 0); \ + _qzz_res; \ + }) + + +/* Discard translation of code in the range [_qzz_addr .. _qzz_addr + + _qzz_len - 1]. Useful if you are debugging a JITter or some such, + since it provides a way to make sure valgrind will retranslate the + invalidated area. Returns no value. */ +#define VALGRIND_DISCARD_TRANSLATIONS(_qzz_addr,_qzz_len) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__DISCARD_TRANSLATIONS, \ + _qzz_addr, _qzz_len, 0, 0, 0); \ + } + + +/* These requests are for getting Valgrind itself to print something. + Possibly with a backtrace. This is a really ugly hack. The return value + is the number of characters printed, excluding the "**** " part at the + start and the backtrace (if present). */ + +#if defined(NVALGRIND) + +# define VALGRIND_PRINTF(...) +# define VALGRIND_PRINTF_BACKTRACE(...) + +#else /* NVALGRIND */ + +/* Modern GCC will optimize the static routine out if unused, + and unused attribute will shut down warnings about it. */ +static int VALGRIND_PRINTF(const char *format, ...) + __attribute__((format(__printf__, 1, 2), __unused__)); +static int +VALGRIND_PRINTF(const char *format, ...) 
+{ + unsigned long _qzz_res; + va_list vargs; + va_start(vargs, format); + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, + VG_USERREQ__PRINTF_VALIST_BY_REF, + (unsigned long)format, + (unsigned long)&vargs, + 0, 0, 0); + va_end(vargs); + return (int)_qzz_res; +} + +static int VALGRIND_PRINTF_BACKTRACE(const char *format, ...) + __attribute__((format(__printf__, 1, 2), __unused__)); +static int +VALGRIND_PRINTF_BACKTRACE(const char *format, ...) +{ + unsigned long _qzz_res; + va_list vargs; + va_start(vargs, format); + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, + VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF, + (unsigned long)format, + (unsigned long)&vargs, + 0, 0, 0); + va_end(vargs); + return (int)_qzz_res; +} + +#endif /* NVALGRIND */ + + +/* These requests allow control to move from the simulated CPU to the + real CPU, calling an arbitrary function. + + Note that the current ThreadId is inserted as the first argument. + So this call: + + VALGRIND_NON_SIMD_CALL2(f, arg1, arg2) + + requires f to have this signature: + + Word f(Word tid, Word arg1, Word arg2) + + where "Word" is a word-sized type. + + Note that these client requests are not entirely reliable. For example, + if you call a function with them that subsequently calls printf(), + there's a high chance Valgrind will crash. Generally, your prospects of + these working are made higher if the called function does not refer to + any global variables, and does not refer to any libc or other functions + (printf et al). Any kind of entanglement with libc or dynamic linking is + likely to have a bad outcome, for tricky reasons which we've grappled + with a lot in the past.
+*/ +#define VALGRIND_NON_SIMD_CALL0(_qyy_fn) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL0, \ + _qyy_fn, \ + 0, 0, 0, 0); \ + _qyy_res; \ + }) + +#define VALGRIND_NON_SIMD_CALL1(_qyy_fn, _qyy_arg1) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL1, \ + _qyy_fn, \ + _qyy_arg1, 0, 0, 0); \ + _qyy_res; \ + }) + +#define VALGRIND_NON_SIMD_CALL2(_qyy_fn, _qyy_arg1, _qyy_arg2) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL2, \ + _qyy_fn, \ + _qyy_arg1, _qyy_arg2, 0, 0); \ + _qyy_res; \ + }) + +#define VALGRIND_NON_SIMD_CALL3(_qyy_fn, _qyy_arg1, _qyy_arg2, _qyy_arg3) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL3, \ + _qyy_fn, \ + _qyy_arg1, _qyy_arg2, \ + _qyy_arg3, 0); \ + _qyy_res; \ + }) + + +/* Counts the number of errors that have been recorded by a tool. Nb: + the tool must record the errors with VG_(maybe_record_error)() or + VG_(unique_error)() for them to be counted. */ +#define VALGRIND_COUNT_ERRORS \ + __extension__ \ + ({unsigned int _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__COUNT_ERRORS, \ + 0, 0, 0, 0, 0); \ + _qyy_res; \ + }) + +/* Several Valgrind tools (Memcheck, Massif, Helgrind, DRD) rely on knowing + when heap blocks are allocated in order to give accurate results. This + happens automatically for the standard allocator functions such as + malloc(), calloc(), realloc(), memalign(), new, new[], free(), delete, + delete[], etc. + + But if your program uses a custom allocator, this doesn't automatically + happen, and Valgrind will not do as well. 
For example, if you allocate + superblocks with mmap() and then allocate chunks of the superblocks, all + Valgrind's observations will be at the mmap() level and it won't know that + the chunks should be considered separate entities. In Memcheck's case, + that means you probably won't get heap block overrun detection (because + there won't be redzones marked as unaddressable) and you definitely won't + get any leak detection. + + The following client requests allow a custom allocator to be annotated so + that it can be handled accurately by Valgrind. + + VALGRIND_MALLOCLIKE_BLOCK marks a region of memory as having been allocated + by a malloc()-like function. For Memcheck (an illustrative case), this + does two things: + + - It records that the block has been allocated. This means any addresses + within the block mentioned in error messages will be + identified as belonging to the block. It also means that if the block + isn't freed it will be detected by the leak checker. + + - It marks the block as being addressable and undefined (if 'is_zeroed' is + not set), or addressable and defined (if 'is_zeroed' is set). This + controls how accesses to the block by the program are handled. + + 'addr' is the start of the usable block (i.e. after any + redzone), 'sizeB' is its size. 'rzB' is the redzone size if the allocator + can apply redzones -- these are blocks of padding at the start and end of + each block. Adding redzones is recommended as it makes it much more likely + Valgrind will spot block overruns. 'is_zeroed' indicates if the memory is + zeroed (or filled with another predictable value), as is the case for + calloc(). + + VALGRIND_MALLOCLIKE_BLOCK should be put immediately after the point where a + heap block -- that will be used by the client program -- is allocated.
+ It's best to put it at the outermost level of the allocator if possible; + for example, if you have a function my_alloc() which calls + internal_alloc(), and the client request is put inside internal_alloc(), + stack traces relating to the heap block will contain entries for both + my_alloc() and internal_alloc(), which is probably not what you want. + + For Memcheck users: if you use VALGRIND_MALLOCLIKE_BLOCK to carve out + custom blocks from within a heap block, B, that has been allocated with + malloc/calloc/new/etc, then block B will be *ignored* during leak-checking + -- the custom blocks will take precedence. + + VALGRIND_FREELIKE_BLOCK is the partner to VALGRIND_MALLOCLIKE_BLOCK. For + Memcheck, it does two things: + + - It records that the block has been deallocated. This assumes that the + block was annotated as having been allocated via + VALGRIND_MALLOCLIKE_BLOCK. Otherwise, an error will be issued. + + - It marks the block as being unaddressable. + + VALGRIND_FREELIKE_BLOCK should be put immediately after the point where a + heap block is deallocated. + + In many cases, these two client requests will not be enough to get your + allocator working well with Memcheck. More specifically, if your allocator + writes to freed blocks in any way then a VALGRIND_MAKE_MEM_UNDEFINED call + will be necessary to mark the memory as addressable just before the zeroing + occurs, otherwise you'll get a lot of invalid write errors. For example, + you'll need to do this if your allocator recycles freed blocks, but it + zeroes them before handing them back out (via VALGRIND_MALLOCLIKE_BLOCK). + Alternatively, if your allocator reuses freed blocks for allocator-internal + data structures, VALGRIND_MAKE_MEM_UNDEFINED calls will also be necessary. + + Really, what's happening is a blurring of the lines between the client + program and the allocator... 
after VALGRIND_FREELIKE_BLOCK is called, the + memory should be considered unaddressable to the client program, but the + allocator knows more than the rest of the client program and so may be able + to safely access it. Extra client requests are necessary for Valgrind to + understand the distinction between the allocator and the rest of the + program. + + Note: there is currently no VALGRIND_REALLOCLIKE_BLOCK client request; it + has to be emulated with MALLOCLIKE/FREELIKE and memory copying. + + Ignored if addr == 0. +*/ +#define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MALLOCLIKE_BLOCK, \ + addr, sizeB, rzB, is_zeroed, 0); \ + } + +/* See the comment for VALGRIND_MALLOCLIKE_BLOCK for details. + Ignored if addr == 0. +*/ +#define VALGRIND_FREELIKE_BLOCK(addr, rzB) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__FREELIKE_BLOCK, \ + addr, rzB, 0, 0, 0); \ + } + +/* Create a memory pool. */ +#define VALGRIND_CREATE_MEMPOOL(pool, rzB, is_zeroed) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__CREATE_MEMPOOL, \ + pool, rzB, is_zeroed, 0, 0); \ + } + +/* Destroy a memory pool. */ +#define VALGRIND_DESTROY_MEMPOOL(pool) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__DESTROY_MEMPOOL, \ + pool, 0, 0, 0, 0); \ + } + +/* Associate a piece of memory with a memory pool. */ +#define VALGRIND_MEMPOOL_ALLOC(pool, addr, size) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_ALLOC, \ + pool, addr, size, 0, 0); \ + } + +/* Disassociate a piece of memory from a memory pool. */ +#define VALGRIND_MEMPOOL_FREE(pool, addr) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_FREE, \ + pool, addr, 0, 0, 0); \ + } + +/* Disassociate any pieces outside a particular range. 
*/ +#define VALGRIND_MEMPOOL_TRIM(pool, addr, size) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_TRIM, \ + pool, addr, size, 0, 0); \ + } + +/* Inform Valgrind that the pool anchored at poolA has moved to poolB. */ +#define VALGRIND_MOVE_MEMPOOL(poolA, poolB) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MOVE_MEMPOOL, \ + poolA, poolB, 0, 0, 0); \ + } + +/* Resize and/or move a piece associated with a memory pool. */ +#define VALGRIND_MEMPOOL_CHANGE(pool, addrA, addrB, size) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_CHANGE, \ + pool, addrA, addrB, size, 0); \ + } + +/* Return 1 if a mempool exists, else 0. */ +#define VALGRIND_MEMPOOL_EXISTS(pool) \ + __extension__ \ + ({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_EXISTS, \ + pool, 0, 0, 0, 0); \ + _qzz_res; \ + }) + +/* Mark a piece of memory as being a stack. Returns a stack id. */ +#define VALGRIND_STACK_REGISTER(start, end) \ + __extension__ \ + ({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__STACK_REGISTER, \ + start, end, 0, 0, 0); \ + _qzz_res; \ + }) + +/* Unmark the piece of memory associated with a stack id as being a + stack. */ +#define VALGRIND_STACK_DEREGISTER(id) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__STACK_DEREGISTER, \ + id, 0, 0, 0, 0); \ + } + +/* Change the start and end address of the stack id. */ +#define VALGRIND_STACK_CHANGE(id, start, end) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__STACK_CHANGE, \ + id, start, end, 0, 0); \ + } + +/* Load PDB debug info for Wine PE image_map.
*/ +#define VALGRIND_LOAD_PDB_DEBUGINFO(fd, ptr, total_size, delta) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__LOAD_PDB_DEBUGINFO, \ + fd, ptr, total_size, delta, 0); \ + } + + +#undef PLAT_x86_linux +#undef PLAT_amd64_linux +#undef PLAT_ppc32_linux +#undef PLAT_ppc64_linux +#undef PLAT_arm_linux +#undef PLAT_ppc32_aix5 +#undef PLAT_ppc64_aix5 + +#endif /* __VALGRIND_H */ From 2b115aae62c1d0bd444a88f360fa385b5b4d3321 Mon Sep 17 00:00:00 2001 From: Steve Dee Date: Thu, 10 Apr 2014 11:50:13 -0700 Subject: [PATCH 13/13] Build against our own re2 --- Makefile | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 4c88878b9..62c121756 100644 --- a/Makefile +++ b/Makefile @@ -52,7 +52,7 @@ ifeq ($(OS),bsd) OSLIBS=-lpthread -lncurses -lkvm endif -LIBS=-lssl -lcrypto -lgmp -lncurses -lsigsegv $(OSLIBS) -lre2 +LIBS=-lssl -lcrypto -lgmp -lncurses -lsigsegv $(OSLIBS) INCLUDE=include GENERATED=generated @@ -63,6 +63,7 @@ CFLAGS= -O2 -g \ -I/opt/local/include \ -I$(INCLUDE) \ -Ioutside/libuv/include \ + -Ioutside/re2 \ -Ioutside/cre2/src/src \ -I $(GENERATED) \ $(DEFINES) \ @@ -559,19 +560,24 @@ VERE_OFILES=\ LIBUV=outside/libuv/libuv.a +LIBRE2=outside/re2/obj/libre2.a + all: $(BIN)/vere -$(LIBUV): +$(LIBUV): $(MAKE) -C outside/libuv libuv.a -$(CRE2_OFILES): outside/cre2/src/src/cre2.cpp outside/cre2/src/src/cre2.h - $(CXX) $(CXXFLAGS) -c $< -o $@ +$(LIBRE2): + $(MAKE) -C outside/re2 obj/libre2.a + +$(CRE2_OFILES): outside/cre2/src/src/cre2.cpp outside/cre2/src/src/cre2.h $(LIBRE2) + $(CXX) $(CXXFLAGS) -c $< -o $@ $(V_OFILES) f/loom.o f/trac.o: include/v/vere.h -$(BIN)/vere: $(LIBCRE) $(VERE_OFILES) $(LIBUV) $(CAPN) +$(BIN)/vere: $(LIBCRE) $(VERE_OFILES) $(LIBUV) $(LIBRE2) mkdir -p $(BIN) - $(CLD) $(CLDOSFLAGS) -o $(BIN)/vere $(VERE_OFILES) $(LIBUV) $(LIBCRE) $(CAPN) $(LIBS)
tags: ctags -R -f .tags --exclude=root @@ -582,5 +588,5 @@ etags: clean: $(RM) $(VERE_OFILES) $(BIN)/vere $(BIN)/eyre $(MAKE) -C outside/libuv clean - cd outside/cre2/src && sh clean.sh + $(MAKE) -C outside/re2 clean