Merge remote-tracking branch 'urbit/master' into newbreach

Update to the latest vere
This commit is contained in:
~hatteb-mitlyd 2014-04-10 12:04:51 -07:00
commit aec40128c9
159 changed files with 57564 additions and 9 deletions

View File

@ -34,7 +34,9 @@ LIB=$(PWD)/lib
RM=rm -f
CC=gcc
CLD=gcc -O2 -g -L/usr/local/lib -L/opt/local/lib
CXX=g++
CXXFLAGS=$(CFLAGS)
CLD=g++ -O2 -g -L/usr/local/lib -L/opt/local/lib
YACC=bison -v -b$(GENERATED)/y
LEX=lex
@ -43,7 +45,7 @@ ifeq ($(OS),osx)
OSLIBS=-framework CoreServices -framework CoreFoundation
endif
ifeq ($(OS),linux)
OSLIBS=-lpthread -lrt -lcurses
OSLIBS=-lpthread -lrt -lcurses
DEFINES=-D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE
endif
ifeq ($(OS),bsd)
@ -56,11 +58,13 @@ INCLUDE=include
GENERATED=generated
MDEFINES=-DU2_OS_$(OS) -DU2_OS_ENDIAN_$(ENDIAN) -D U2_LIB=\"$(LIB)\"
CFLAGS=-O2 -g \
CFLAGS= -O2 -g \
-I/usr/local/include \
-I/opt/local/include \
-I$(INCLUDE) \
-Ioutside/libuv/include \
-Ioutside/re2 \
-Ioutside/cre2/src/src \
-I $(GENERATED) \
$(DEFINES) \
$(MDEFINES)
@ -451,6 +455,8 @@ J164_5_OFILES=\
gen164/5/mat.o \
gen164/5/mink.o \
gen164/5/parse.o \
gen164/5/repg.o \
gen164/5/rexp.o \
gen164/5/rub.o \
gen164/5/shax.o \
gen164/5/lore.o \
@ -524,6 +530,9 @@ BASE_OFILES=\
$(F_OFILES) \
$(J164_OFILES)
CRE2_OFILES=\
outside/cre2/src/src/cre2.o
OUT_OFILES=\
outside/jhttp/http_parser.o
@ -544,22 +553,31 @@ V_OFILES=\
v/walk.o
VERE_OFILES=\
$(V_OFILES) \
$(BASE_OFILES) \
$(OUT_OFILES)
$(CRE2_OFILES) \
$(OUT_OFILES) \
$(V_OFILES)
LIBUV=outside/libuv/libuv.a
LIBRE2=outside/re2/obj/libre2.a
all: $(BIN)/vere
$(LIBUV):
$(LIBUV):
$(MAKE) -C outside/libuv libuv.a
$(LIBRE2):
$(MAKE) -C outside/re2 obj/libre2.a
$(CRE2_OFILES): outside/cre2/src/src/cre2.cpp outside/cre2/src/src/cre2.h $(LIBRE2)
$(CXX) $(CXXFLAGS) -c $< $(LIBRE2) -o $@
$(V_OFILES) f/loom.o f/trac.o: include/v/vere.h
$(BIN)/vere: $(VERE_OFILES) $(LIBUV) $(CAPN)
$(BIN)/vere: $(LIBCRE) $(VERE_OFILES) $(LIBUV) $(LIBRE2)
mkdir -p $(BIN)
$(CLD) $(CLDOSFLAGS) -o $(BIN)/vere $(VERE_OFILES) $(LIBUV) $(CAPN) $(LIBS)
$(CLD) $(CLDOSFLAGS) -o $(BIN)/vere $(VERE_OFILES) $(LIBUV) $(LIBCRE) $(LIBRE2) $(LIBS)
tags:
ctags -R -f .tags --exclude=root
@ -570,4 +588,5 @@ etags:
clean:
$(RM) $(VERE_OFILES) $(BIN)/vere $(BIN)/eyre
$(MAKE) -C outside/libuv clean
$(MAKE) -C outside/re2 clean

153
gen164/5/repg.c Normal file
View File

@ -0,0 +1,153 @@
/* j/5/repg.c
**
** This file is in the public domain.
*/
#include "all.h"
#include "../pit.h"
#include "cre2.h"
#include <string.h>
/* repg: regular-expression global replace.
**
**   lub: pattern, as a tape
**   rad: subject text, as a tape
**   rep: replacement text, as a tape
**
** The pattern is first scanned for constructs that are refused up front:
** the escapes \p and \P, and any "(?" group that is not "(?:".  Text
** between \Q and \E is skipped by the scan.  The pattern is then compiled
** with RE2 (through cre2) and the subject is walked left to right; at each
** offset an anchored match is attempted and, when it succeeds, the matched
** span is replaced by `rep`.
**
** Produces u2_nul when the pattern is refused or fails to compile, and
** [u2_nul result-tape] otherwise.  Bails with c3__exit when cre2 cannot
** allocate its options or regexp object.
*/
u2_noun                                                           //  produce
j2_mbc(Pt5, repg)(u2_wire wir_r,
                  u2_noun lub,
                  u2_noun rad,
                  u2_noun rep)                                    //  retain
{
  c3_y*            lub_y = u2_cr_tape(lub);
  c3_y*            rad_y = u2_cr_tape(rad);
  c3_y*            rep_y = u2_cr_tape(rep);
  cre2_options_t*  opt   = NULL;
  cre2_regexp_t*   rex   = NULL;
  cre2_string_t    matches[1];
  char*            rec   = (char*)lub_y;
  char*            end;
  u2_noun          ret   = u2_nul;    //  output tape, built in reverse
  int              text_len;
  int              ic;
  int              hit   = 0;         //  1 once the pattern compiled and ran
  int              bail  = 0;         //  1 on a cre2 allocation failure

  /* Pre-scan the pattern for refused constructs.
  */
  while ( *rec != 0 ) {
    if ( *rec == '\\' ) {
      rec++;
      if ( *rec == 0 ) {
        //  Trailing backslash: stop here instead of stepping past the
        //  terminator; the compile step below reports the bad pattern.
        break;
      }
      switch ( *rec ) {
        case 'P':
        case 'p':
          goto done;
        case 'Q':
          //  Skip the quoted run; land on the backslash of "\E", or on
          //  the last character when the quote is never closed.
          end = strstr(rec, "\\E");
          if ( end == NULL ) rec += strlen(rec) - 1;
          else rec = end;
          break;
        default:
          break;
      }
      rec++;
    }
    else if ( *rec == '(' ) {
      rec++;
      if ( *rec == '?' ) {
        rec++;
        if ( *rec != ':' ) {
          goto done;
        }
        rec++;
      }
    }
    else {
      rec++;
    }
  }

  opt = cre2_opt_new();
  if ( !opt ) {
    bail = 1;
    goto done;
  }
  cre2_opt_set_log_errors(opt, 0);
  cre2_opt_set_encoding(opt, CRE2_Latin1);
  cre2_opt_set_perl_classes(opt, 1);
  cre2_opt_set_one_line(opt, 1);
  cre2_opt_set_longest_match(opt, 1);

  rex = cre2_new((const char *)lub_y, strlen((char *)lub_y), opt);
  if ( !rex ) {
    bail = 1;
    goto done;
  }
  if ( cre2_error_code(rex) ) {
    //  Compiling the regular expression failed
    goto done;
  }

  text_len = strlen((char *)rad_y);
  ic = 0;

  while ( ic <= text_len ) {
    int match = cre2_match(rex, (const char*)rad_y, text_len, ic, text_len,
                           CRE2_ANCHOR_START, matches, 1);

    if ( !match ) {
      //  No match at this offset: copy the byte through (the terminator
      //  at ic == text_len is not copied).
      if ( rad_y[ic] ) {
        ret = u2_cn_cell((u2_atom)rad_y[ic], ret);
      }
      ic++;
    }
    else {
      int mlen = matches[0].length;

      if ( mlen == 0 ) {
        //  Empty match: emit the replacement and the rest of the subject,
        //  then stop.
        //  NOTE(review): the output accumulated so far in `ret` is not
        //  carried into this result -- confirm that is intended.
        ret = u2_ckb_weld(u2_ckb_flop(u2_ci_tape((char *) rad_y+ic)),
                          u2_ckb_flop(u2_ci_tape((char *)rep_y)));
        ic = text_len + 1;
      }
      else {
        ret = u2_ckb_weld(u2_ckb_flop(u2_ci_tape((char *)rep_y)), ret);
        ic += mlen;
      }
    }
  }
  hit = 1;

done:
  /* Single exit: every path releases all three C strings, so the
  ** replacement buffer is no longer leaked on the early exits.
  */
  if ( opt ) cre2_opt_delete(opt);
  if ( rex ) cre2_delete(rex);
  free(lub_y);
  free(rad_y);
  free(rep_y);

  if ( bail ) {
    //  opt / rex Allocation Error
    u2_bl_bail(wir_r, c3__exit);
    return u2_nul;
  }
  if ( hit ) {
    return u2_cn_cell(u2_nul, u2_ckb_flop(ret));
  }
  return u2_nul;
}
/* Jet entry point for repg: pulls the three sample fragments out of the
** core and hands them to j2_mbc(Pt5, repg).  Bails with c3__fail as soon
** as one fragment is missing.
*/
u2_weak                                                           //  produce
j2_mb(Pt5, repg)(u2_wire wir_r,
                 u2_noun cor)                                     //  retain
{
  u2_noun lub = u2_frag(u2_cv_sam_2, cor);
  if ( u2_none == lub ) {
    return u2_bl_bail(wir_r, c3__fail);
  }

  u2_noun rad = u2_frag(u2_cv_sam_6, cor);
  if ( u2_none == rad ) {
    return u2_bl_bail(wir_r, c3__fail);
  }

  u2_noun rep = u2_frag(u2_cv_sam_7, cor);
  if ( u2_none == rep ) {
    return u2_bl_bail(wir_r, c3__fail);
  }

  return j2_mbc(Pt5, repg)(wir_r, lub, rad, rep);
}
/* structures
**
** Jet table for repg: one entry naming the j2_mb(Pt5, repg) driver,
** followed by an empty entry (presumably the list terminator -- confirm
** against the code that walks u2_ho_jet arrays).
*/
u2_ho_jet
j2_mbj(Pt5, repg)[] = {
  {
    ".2",
    c3__lite,
    j2_mb(Pt5, repg),
    Tier5,
    u2_none,
    u2_none
  },
  { }
};

151
gen164/5/rexp.c Normal file
View File

@ -0,0 +1,151 @@
/* j/5/rexp.c
**
** This file is in the public domain.
*/
#include "all.h"
#include "../pit.h"
#include "cre2.h"
#include <string.h>
/* rexp: regular-expression match with capture groups.
**
**   lub: pattern, as a tape
**   rad: subject text, as a tape
**
** The pattern is refused (u2_nul) when its tape length differs from the
** length of its C-string form, when it contains a byte above 127, when it
** uses the escapes \p or \P, or when it opens a "(?" group other than
** "(?:".  Text between \Q and \E is skipped by the scan.  Otherwise it is
** compiled with RE2 (through cre2) and matched, unanchored, against the
** subject.
**
** Produces:
**   u2_nul                   pattern refused or failed to compile
**   [u2_nul u2_nul]          pattern is valid but nothing matched
**   [u2_nul [u2_nul map]]    map from group index (0 = whole match) to
**                            the matched text as a tape
**
** Bails with c3__exit when a cre2 object or a scratch buffer cannot be
** allocated.
*/
u2_noun                                                           //  produce
j2_mbc(Pt5, rexp)(u2_wire wir_r,
                  u2_noun lub,
                  u2_noun rad)                                    //  retain
{
  c3_y*            lub_y = u2_cr_tape(lub);
  c3_y*            rad_y = u2_cr_tape(rad);
  cre2_options_t*  opt   = NULL;
  cre2_regexp_t*   rex   = NULL;
  char*            rec;
  char*            end;
  u2_noun          map   = u2_nul;
  int              lub_l;
  int              out   = 0;     //  0: u2_nul, 1: no match, 2: map
  int              bail  = 0;     //  1 on an allocation failure

  u2k(lub);
  lub_l = u2_ckb_lent(lub);

  //  Presumably catches a pattern with an embedded zero byte, which the
  //  C-string form cannot represent.
  if ( (size_t)lub_l != strlen((char *)lub_y) ) {
    goto done;
  }

  /* Pre-scan the pattern for refused constructs.
  */
  rec = (char*)lub_y;
  while ( *rec != 0 ) {
    //  Compare as unsigned: on platforms where plain char is signed,
    //  `*rec > 127` can never be true and the check would be dead.
    if ( (unsigned char)*rec > 127 ) {
      goto done;
    }
    else if ( *rec == '\\' ) {
      rec++;
      if ( *rec == 0 ) {
        //  Trailing backslash; the compile step below reports it.
        break;
      }
      if ( (unsigned char)*rec > 127 ) {
        goto done;
      }
      switch ( *rec ) {
        case 'P':
        case 'p':
          goto done;
        case 'Q':
          //  Skip the quoted run; land on the backslash of "\E", or on
          //  the last character when the quote is never closed.
          end = strstr(rec, "\\E");
          if ( end == NULL ) rec += strlen(rec) - 1;
          else rec = end;
          break;
        default:
          break;
      }
      //  Step over the escaped character so it is not re-scanned as if it
      //  were unescaped (e.g. "\(?" or "\\P"); this matches repg.
      rec++;
    }
    else if ( *rec == '(' ) {
      rec++;
      if ( *rec == '?' ) {
        rec++;
        if ( *rec != ':' ) {
          goto done;
        }
        rec++;
      }
    }
    else {
      rec++;
    }
  }

  opt = cre2_opt_new();
  if ( !opt ) {
    bail = 1;
    goto done;
  }
  cre2_opt_set_log_errors(opt, 0);
  cre2_opt_set_encoding(opt, CRE2_UTF8);
  cre2_opt_set_perl_classes(opt, 1);
  cre2_opt_set_one_line(opt, 1);
  cre2_opt_set_longest_match(opt, 1);

  rex = cre2_new((const char *)lub_y, strlen((char *)lub_y), opt);
  if ( !rex ) {
    bail = 1;
    goto done;
  }
  if ( cre2_error_code(rex) ) {
    //  Compiling the regular expression failed
    goto done;
  }

  {
    int text_len = strlen((char *)rad_y);
    int captures = cre2_num_capturing_groups(rex);

    //  Defensive: a negative count would give the array below an invalid
    //  size.
    if ( captures < 0 ) {
      goto done;
    }

    {
      cre2_string_t matches[captures+1];
      int           i;
      int           match = cre2_match(rex, (const char*)rad_y, text_len,
                                       0, text_len, CRE2_UNANCHORED,
                                       matches, captures+1);

      if ( !match ) {
        //  No matches
        out = 1;
        goto done;
      }

      for ( i = 0; i < captures+1; i++ ) {
        int   len = matches[i].length;
        char* buf = malloc(len + 1);

        if ( buf == NULL ) {
          //  NOTE(review): `map` may hold entries already; whether the
          //  bail path reclaims them is not visible here.
          bail = 1;
          goto done;
        }
        //  A group that did not participate may carry a NULL data
        //  pointer with length 0; do not hand that to memcpy.
        if ( len > 0 ) {
          memcpy(buf, matches[i].data, len);
        }
        buf[len] = 0;

        map = u2_ckd_by_put(map, i, u2_ci_tape(buf));
        free(buf);
      }
      out = 2;
    }
  }

done:
  if ( opt ) cre2_opt_delete(opt);
  if ( rex ) cre2_delete(rex);
  free(lub_y);
  free(rad_y);

  if ( bail ) {
    u2_bl_bail(wir_r, c3__exit);
    return u2_nul;
  }
  switch ( out ) {
    case 1:  return u2_cn_cell(u2_nul, u2_nul);
    case 2:  return u2_cn_cell(u2_nul, u2_cn_cell(u2_nul, map));
    default: return u2_nul;
  }
}
/* Jet entry point for rexp: pulls the two sample fragments out of the
** core and hands them to j2_mbc(Pt5, rexp).  Bails with c3__fail as soon
** as one fragment is missing.
*/
u2_weak                                                           //  produce
j2_mb(Pt5, rexp)(u2_wire wir_r,
                 u2_noun cor)                                     //  retain
{
  u2_noun lub = u2_frag(u2_cv_sam_2, cor);
  if ( u2_none == lub ) {
    return u2_bl_bail(wir_r, c3__fail);
  }

  u2_noun rad = u2_frag(u2_cv_sam_3, cor);
  if ( u2_none == rad ) {
    return u2_bl_bail(wir_r, c3__fail);
  }

  return j2_mbc(Pt5, rexp)(wir_r, lub, rad);
}
/* structures
**
** Jet table for rexp: one entry naming the j2_mb(Pt5, rexp) driver,
** followed by an empty entry (presumably the list terminator -- confirm
** against the code that walks u2_ho_jet arrays).
*/
u2_ho_jet
j2_mbj(Pt5, rexp)[] = {
  {
    ".2",
    c3__lite,
    j2_mb(Pt5, rexp),
    Tier5,
    u2_none,
    u2_none
  },
  { }
};

View File

@ -87,6 +87,8 @@
extern u2_ho_jet j2_mbj(Pt5, pfix)[];
extern u2_ho_jet j2_mbj(Pt5, plug)[];
extern u2_ho_jet j2_mbj(Pt5, pose)[];
extern u2_ho_jet j2_mbj(Pt5, repg)[];
extern u2_ho_jet j2_mbj(Pt5, rexp)[];
extern u2_ho_jet j2_mbj(Pt5, rub)[];
extern u2_ho_jet j2_mbj(Pt5, sfix)[];
extern u2_ho_jet j2_mbj(Pt5, shax)[];
@ -229,6 +231,8 @@
{ j2_sb(Pt5, pfix), j2_mbj(Pt5, pfix), 0, 0, u2_none },
{ j2_sb(Pt5, plug), j2_mbj(Pt5, plug), 0, 0, u2_none },
{ j2_sb(Pt5, pose), j2_mbj(Pt5, pose), 0, 0, u2_none },
{ j2_sb(Pt5, repg), j2_mbj(Pt5, repg), 0, 0, u2_none },
{ j2_sb(Pt5, rexp), j2_mbj(Pt5, rexp), 0, 0, u2_none },
{ j2_sb(Pt5, rub), j2_mbj(Pt5, rub), 0, 0, u2_none },
{ j2_sb(Pt5, sfix), j2_mbj(Pt5, sfix), 0, 0, u2_none },
{ j2_sb(Pt5, shax), j2_mbj(Pt5, shax), 0, 0, u2_none },

View File

@ -0,0 +1,35 @@
Copyright (c) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
Copyright (c) 2011 Keegan McAllister
All rights reserved.
Redistribution and use in source and binary forms, with or
without modification, are permitted provided that the
following conditions are met:
1. Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
2. Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
3. Neither the name of the author nor the names of his
contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,27 @@
// Copyright (c) 2009 The RE2 Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,175 @@
C wrapper for re2
=================
Topics
------
1. Introduction
2. License
3. Install
4. Usage
A. Credits
B. Bugs
C. Resources
1. Introduction
---------------
The CRE2 distribution is a C language wrapper for the RE2
library, which is implemented in C++. RE2 is a fast, safe,
thread-friendly alternative to backtracking regular
expression engines like those used in PCRE, Perl, and
Python.
This distribution makes use of the GNU Autotools.
2. License
----------
Copyright (c) 2012, 2013 Marco Maggi <marco.maggi-ipsu@poste.it>
Copyright (c) 2011 Keegan McAllister
All rights reserved.
Redistribution and use in source and binary forms, with or
without modification, are permitted provided that the
following conditions are met:
1. Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
2. Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
3. Neither the name of the author nor the names of his
contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3. Install
----------
To install RE2, follow the instructions in the README file
of the RE2 distribution. To install CRE2 from a proper
release tarball, do this:
$ cd cre2-0.1.0
$ mkdir "=build"
$ cd "=build"
to inspect the available configuration options:
$ ../configure --help
then do it:
$ ../configure [options]
$ make
$ make check
$ make install
From a repository checkout or snapshot (the ones from the
Github site): we may need to manually run "libtoolize" the
first time, then we must first run the script "autogen.sh"
from the top source directory, to generate the needed files:
$ sh autogen.sh
for this we need to have installed the GNU Autotools:
Automake, Autoconf, Libtool. After this:
$ ../configure --enable-maintainer-mode [options]
$ make
$ make check
$ make install
The Makefile supports the DESTDIR environment variable to
install files in a temporary location, example: to see what
will happen:
$ make -n install DESTDIR=/tmp/marco/cre2
to really do it:
$ make install DESTDIR=/tmp/marco/cre2
4. Usage
--------
Read the documentation.
A. Credits
----------
RE2 is a Google project. CRE2 is based on code by Keegan
McAllister. This distribution was assembled by Marco Maggi.
B. Bugs
-------
Bug reports are appreciated. Register issues at the CRE2
issue tracker:
<http://github.com/marcomaggi/cre2/issues>
C. Resources
------------
The GNU Project software can be found here:
<http://www.gnu.org/>
RE2 is available at:
<http://code.google.com/p/re2/>
development of this package happens at:
<http://github.com/marcomaggi/cre2/>
and as backup at:
<http://sourceforge.net/projects/cre2/>
proper release tarballs for this package are in the download
area at:
<http://sourceforge.net/projects/cre2/files/>
the documentation is available online:
<http://marcomaggi.github.com/docs/cre2.html>
### end of file
# Local Variables:
# mode: text
# coding: utf-8-unix
# fill-column: 60
# paragraph-start: "*"
# End:

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,18 @@
This is the file .../info/dir, which contains the
topmost node of the Info hierarchy, called (dir)Top.
The first time you invoke Info you start off looking at this node.

File: dir, Node: Top This is the top of the INFO tree
This (the Directory node) gives a menu of major topics.
Typing "q" exits, "?" lists all Info commands, "d" returns here,
"h" gives a primer for first-timers,
"mEmacs<Return>" visits the Emacs manual, etc.
In Emacs, you can click mouse button 2 on a menu item or cross reference
to select it.
* Menu:
Development
* cre2: (cre2). C wrapper for RE2.

44
outside/cre2/src/.gitignore vendored Normal file
View File

@ -0,0 +1,44 @@
*~
=*
,,*
*.a
*.bz2
*.fasl
*.gz
*.html
*.info
*.o
*.out
*.so
*.so.*
*.tgz
*.tmp
.DS_Store
.arch
.deps/
.emacs.*
.gdb_history
.vimview
Makefile
Makefile.in
aclocal.m4
ar-lib
autom4te*
compile
config.guess
config.h.in
config.sub
config.cache
configure
depcomp
test-driver
doc/mdate-sh
doc/stamp-vti
doc/texinfo.tex
doc/version.texi
install-sh
missing
mkinstalldirs
ltmain.sh
m4/
autotools/

35
outside/cre2/src/COPYING Normal file
View File

@ -0,0 +1,35 @@
Copyright (c) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
Copyright (c) 2011 Keegan McAllister
All rights reserved.
Redistribution and use in source and binary forms, with or
without modification, are permitted provided that the
following conditions are met:
1. Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
2. Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
3. Neither the name of the author nor the names of his
contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

365
outside/cre2/src/INSTALL Normal file
View File

@ -0,0 +1,365 @@
Installation Instructions
*************************
Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005,
2006, 2007, 2008, 2009 Free Software Foundation, Inc.
Copying and distribution of this file, with or without modification,
are permitted in any medium without royalty provided the copyright
notice and this notice are preserved. This file is offered as-is,
without warranty of any kind.
Basic Installation
==================
Briefly, the shell commands `./configure; make; make install' should
configure, build, and install this package. The following
more-detailed instructions are generic; see the `README' file for
instructions specific to this package. Some packages provide this
`INSTALL' file but do not implement all of the features documented
below. The lack of an optional feature in a given package is not
necessarily a bug. More recommendations for GNU packages can be found
in *note Makefile Conventions: (standards)Makefile Conventions.
The `configure' shell script attempts to guess correct values for
various system-dependent variables used during compilation. It uses
those values to create a `Makefile' in each directory of the package.
It may also create one or more `.h' files containing system-dependent
definitions. Finally, it creates a shell script `config.status' that
you can run in the future to recreate the current configuration, and a
file `config.log' containing compiler output (useful mainly for
debugging `configure').
It can also use an optional file (typically called `config.cache'
and enabled with `--cache-file=config.cache' or simply `-C') that saves
the results of its tests to speed up reconfiguring. Caching is
disabled by default to prevent problems with accidental use of stale
cache files.
If you need to do unusual things to compile the package, please try
to figure out how `configure' could check whether to do them, and mail
diffs or instructions to the address given in the `README' so they can
be considered for the next release. If you are using the cache, and at
some point `config.cache' contains results you don't want to keep, you
may remove or edit it.
The file `configure.ac' (or `configure.in') is used to create
`configure' by a program called `autoconf'. You need `configure.ac' if
you want to change it or regenerate `configure' using a newer version
of `autoconf'.
The simplest way to compile this package is:
1. `cd' to the directory containing the package's source code and type
`./configure' to configure the package for your system.
Running `configure' might take a while. While running, it prints
some messages telling which features it is checking for.
2. Type `make' to compile the package.
3. Optionally, type `make check' to run any self-tests that come with
the package, generally using the just-built uninstalled binaries.
4. Type `make install' to install the programs and any data files and
documentation. When installing into a prefix owned by root, it is
recommended that the package be configured and built as a regular
user, and only the `make install' phase executed with root
privileges.
5. Optionally, type `make installcheck' to repeat any self-tests, but
this time using the binaries in their final installed location.
This target does not install anything. Running this target as a
regular user, particularly if the prior `make install' required
root privileges, verifies that the installation completed
correctly.
6. You can remove the program binaries and object files from the
source code directory by typing `make clean'. To also remove the
files that `configure' created (so you can compile the package for
a different kind of computer), type `make distclean'. There is
also a `make maintainer-clean' target, but that is intended mainly
for the package's developers. If you use it, you may have to get
all sorts of other programs in order to regenerate files that came
with the distribution.
7. Often, you can also type `make uninstall' to remove the installed
files again. In practice, not all packages have tested that
uninstallation works correctly, even though it is required by the
GNU Coding Standards.
8. Some packages, particularly those that use Automake, provide `make
distcheck', which can be used by developers to test that all other
targets like `make install' and `make uninstall' work correctly.
This target is generally not run by end users.
Compilers and Options
=====================
Some systems require unusual options for compilation or linking that
the `configure' script does not know about. Run `./configure --help'
for details on some of the pertinent environment variables.
You can give `configure' initial values for configuration parameters
by setting variables in the command line or in the environment. Here
is an example:
./configure CC=c99 CFLAGS=-g LIBS=-lposix
*Note Defining Variables::, for more details.
Compiling For Multiple Architectures
====================================
You can compile the package for more than one kind of computer at the
same time, by placing the object files for each architecture in their
own directory. To do this, you can use GNU `make'. `cd' to the
directory where you want the object files and executables to go and run
the `configure' script. `configure' automatically checks for the
source code in the directory that `configure' is in and in `..'. This
is known as a "VPATH" build.
With a non-GNU `make', it is safer to compile the package for one
architecture at a time in the source code directory. After you have
installed the package for one architecture, use `make distclean' before
reconfiguring for another architecture.
On MacOS X 10.5 and later systems, you can create libraries and
executables that work on multiple system types--known as "fat" or
"universal" binaries--by specifying multiple `-arch' options to the
compiler but only a single `-arch' option to the preprocessor. Like
this:
./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
CPP="gcc -E" CXXCPP="g++ -E"
This is not guaranteed to produce working output in all cases, you
may have to build one architecture at a time and combine the results
using the `lipo' tool if you have problems.
Installation Names
==================
By default, `make install' installs the package's commands under
`/usr/local/bin', include files under `/usr/local/include', etc. You
can specify an installation prefix other than `/usr/local' by giving
`configure' the option `--prefix=PREFIX', where PREFIX must be an
absolute file name.
You can specify separate installation prefixes for
architecture-specific files and architecture-independent files. If you
pass the option `--exec-prefix=PREFIX' to `configure', the package uses
PREFIX as the prefix for installing programs and libraries.
Documentation and other data files still use the regular prefix.
In addition, if you use an unusual directory layout you can give
options like `--bindir=DIR' to specify different values for particular
kinds of files. Run `configure --help' for a list of the directories
you can set and what kinds of files go in them. In general, the
default for these options is expressed in terms of `${prefix}', so that
specifying just `--prefix' will affect all of the other directory
specifications that were not explicitly provided.
The most portable way to affect installation locations is to pass the
correct locations to `configure'; however, many packages provide one or
both of the following shortcuts of passing variable assignments to the
`make install' command line to change installation locations without
having to reconfigure or recompile.
The first method involves providing an override variable for each
affected directory. For example, `make install
prefix=/alternate/directory' will choose an alternate location for all
directory configuration variables that were expressed in terms of
`${prefix}'. Any directories that were specified during `configure',
but not in terms of `${prefix}', must each be overridden at install
time for the entire installation to be relocated. The approach of
makefile variable overrides for each directory variable is required by
the GNU Coding Standards, and ideally causes no recompilation.
However, some platforms have known limitations with the semantics of
shared libraries that end up requiring recompilation when using this
method, particularly noticeable in packages that use GNU Libtool.
The second method involves providing the `DESTDIR' variable. For
example, `make install DESTDIR=/alternate/directory' will prepend
`/alternate/directory' before all installation names. The approach of
`DESTDIR' overrides is not required by the GNU Coding Standards, and
does not work on platforms that have drive letters. On the other hand,
it does better at avoiding recompilation issues, and works well even
when some directory options were not specified in terms of `${prefix}'
at `configure' time.
Optional Features
=================
If the package supports it, you can cause programs to be installed
with an extra prefix or suffix on their names by giving `configure' the
option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
Some packages pay attention to `--enable-FEATURE' options to
`configure', where FEATURE indicates an optional part of the package.
They may also pay attention to `--with-PACKAGE' options, where PACKAGE
is something like `gnu-as' or `x' (for the X Window System). The
`README' should mention any `--enable-' and `--with-' options that the
package recognizes.
For packages that use the X Window System, `configure' can usually
find the X include and library files automatically, but if it doesn't,
you can use the `configure' options `--x-includes=DIR' and
`--x-libraries=DIR' to specify their locations.
Some packages offer the ability to configure how verbose the
execution of `make' will be. For these packages, running `./configure
--enable-silent-rules' sets the default to minimal output, which can be
overridden with `make V=1'; while running `./configure
--disable-silent-rules' sets the default to verbose, which can be
overridden with `make V=0'.
Particular systems
==================
On HP-UX, the default C compiler is not ANSI C compatible. If GNU
CC is not installed, it is recommended to use the following options in
order to use an ANSI C compiler:
./configure CC="cc -Ae -D_XOPEN_SOURCE=500"
and if that doesn't work, install pre-built binaries of GCC for HP-UX.
On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot
parse its `<wchar.h>' header file. The option `-nodtk' can be used as
a workaround. If GNU CC is not installed, it is therefore recommended
to try
./configure CC="cc"
and if that doesn't work, try
./configure CC="cc -nodtk"
On Solaris, don't put `/usr/ucb' early in your `PATH'. This
directory contains several dysfunctional programs; working variants of
these programs are available in `/usr/bin'. So, if you need `/usr/ucb'
in your `PATH', put it _after_ `/usr/bin'.
On Haiku, software installed for all users goes in `/boot/common',
not `/usr/local'. It is recommended to use the following options:
./configure --prefix=/boot/common
Specifying the System Type
==========================
There may be some features `configure' cannot figure out
automatically, but needs to determine by the type of machine the package
will run on. Usually, assuming the package is built to be run on the
_same_ architectures, `configure' can figure that out, but if it prints
a message saying it cannot guess the machine type, give it the
`--build=TYPE' option. TYPE can either be a short name for the system
type, such as `sun4', or a canonical name which has the form:
CPU-COMPANY-SYSTEM
where SYSTEM can have one of these forms:
OS
KERNEL-OS
See the file `config.sub' for the possible values of each field. If
`config.sub' isn't included in this package, then this package doesn't
need to know the machine type.
If you are _building_ compiler tools for cross-compiling, you should
use the option `--target=TYPE' to select the type of system they will
produce code for.
If you want to _use_ a cross compiler, that generates code for a
platform different from the build platform, you should specify the
"host" platform (i.e., that on which the generated programs will
eventually be run) with `--host=TYPE'.
Sharing Defaults
================
If you want to set default values for `configure' scripts to share,
you can create a site shell script called `config.site' that gives
default values for variables like `CC', `cache_file', and `prefix'.
`configure' looks for `PREFIX/share/config.site' if it exists, then
`PREFIX/etc/config.site' if it exists. Or, you can set the
`CONFIG_SITE' environment variable to the location of the site script.
A warning: not all `configure' scripts look for a site script.
Defining Variables
==================
Variables not defined in a site shell script can be set in the
environment passed to `configure'. However, some packages may run
configure again during the build, and the customized values of these
variables may be lost. In order to avoid this problem, you should set
them in the `configure' command line, using `VAR=value'. For example:
./configure CC=/usr/local2/bin/gcc
causes the specified `gcc' to be used as the C compiler (unless it is
overridden in the site shell script).
Unfortunately, this technique does not work for `CONFIG_SHELL' due to
an Autoconf bug. Until the bug is fixed you can use this workaround:
CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
`configure' Invocation
======================
`configure' recognizes the following options to control how it
operates.
`--help'
`-h'
Print a summary of all of the options to `configure', and exit.
`--help=short'
`--help=recursive'
Print a summary of the options unique to this package's
`configure', and exit. The `short' variant lists options used
only in the top level, while the `recursive' variant lists options
also present in any nested packages.
`--version'
`-V'
Print the version of Autoconf used to generate the `configure'
script, and exit.
`--cache-file=FILE'
Enable the cache: use and save the results of the tests in FILE,
traditionally `config.cache'. FILE defaults to `/dev/null' to
disable caching.
`--config-cache'
`-C'
Alias for `--cache-file=config.cache'.
`--quiet'
`--silent'
`-q'
Do not print messages saying which checks are being made. To
suppress all normal output, redirect it to `/dev/null' (any error
messages will still be shown).
`--srcdir=DIR'
Look for the package's source code in directory DIR. Usually
`configure' can determine that directory automatically.
`--prefix=DIR'
Use DIR as the installation prefix. *note Installation Names::
for more details, including other options available for fine-tuning
the installation locations.
`--no-create'
`-n'
Run the configure checks, but stop before creating any output
files.
`configure' also accepts some other, not widely useful, options. Run
`configure --help' for more details.

View File

@ -0,0 +1,27 @@
// Copyright (c) 2009 The RE2 Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,48 @@
## Process this file with automake to produce Makefile.in

## Macro search directory handed to aclocal; the same directory is
## named by AC_CONFIG_MACRO_DIR in configure.ac.
ACLOCAL_AMFLAGS = -I autotools

## Files shipped in the distribution tarball in addition to the sources.
EXTRA_DIST = \
	INSTALL \
	configure.sh \
	prepare.sh

## Plain-text documents that are both distributed and installed.
dist_doc_DATA = \
	README \
	COPYING \
	LICENSE.re2

## --------------------------------------------------------------------
## The library.

## Libtool interface version numbers, substituted by configure.
cre2_CURRENT = @cre2_VERSION_INTERFACE_CURRENT@
cre2_REVISION = @cre2_VERSION_INTERFACE_REVISION@
cre2_AGE = @cre2_VERSION_INTERFACE_AGE@

## Public header and the libtool library built from the C++ wrapper.
include_HEADERS = src/cre2.h
lib_LTLIBRARIES = libcre2.la
libcre2_la_LDFLAGS = \
	-version-info $(cre2_CURRENT):$(cre2_REVISION):$(cre2_AGE)
libcre2_la_SOURCES = src/cre2.cpp

## --------------------------------------------------------------------
## Texinfo documentation.

AM_MAKEINFOFLAGS = --no-split
info_TEXINFOS = doc/cre2.texi
doc_cre2_TEXINFOS = doc/fdl-1.3.texi
## --------------------------------------------------------------------
## Test programs: built by "make check", linked against the freshly
## built libcre2.la and run through the TESTS harness.
check_PROGRAMS = \
	tests/test-version \
	tests/test-options \
	tests/test-rex-alloc \
	tests/test-matching \
	tests/test-easy-matching \
	tests/test-full-match \
	tests/test-partial-match \
	tests/test-consume-match \
	tests/test-find-and-consume-match \
	tests/test-replace \
	tests/test-misc

AM_CPPFLAGS = -I$(top_srcdir)/src
LDADD = libcre2.la
TESTS = $(check_PROGRAMS)

## Run every test program again for "make installcheck".  Stop at the
## first failure: without the explicit exit, the status of the loop is
## the status of the last program only, so earlier failures would go
## unnoticed.
installcheck-local: $(check_PROGRAMS)
	for f in $(check_PROGRAMS); do $$f || exit 1; done
### end of file

175
outside/cre2/src/README Normal file
View File

@ -0,0 +1,175 @@
C wrapper for re2
=================
Topics
------
1. Introduction
2. License
3. Install
4. Usage
A. Credits
B. Bugs
C. Resources
1. Introduction
---------------
The CRE2 distribution is a C language wrapper for the RE2
library, which is implemented in C++. RE2 is a fast, safe,
thread-friendly alternative to backtracking regular
expression engines like those used in PCRE, Perl, and
Python.
This distribution makes use of the GNU Autotools.
2. License
----------
Copyright (c) 2012, 2013 Marco Maggi <marco.maggi-ipsu@poste.it>
Copyright (c) 2011 Keegan McAllister
All rights reserved.
Redistribution and use in source and binary forms, with or
without modification, are permitted provided that the
following conditions are met:
1. Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
2. Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
3. Neither the name of the author nor the names of his
contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3. Install
----------
To install RE2, follow the instructions in the README file
of the RE2 distribution. To install CRE2 from a proper
release tarball, do this:
$ cd cre2-0.1.0
$ mkdir "=build"
$ cd "=build"
to inspect the available configuration options:
$ ../configure --help
then do it:
$ ../configure [options]
$ make
$ make check
$ make install
From a repository checkout or snapshot (the ones from the
GitHub site): we may need to run "libtoolize" manually the
first time; then we must run the script "autogen.sh" from
the top source directory to generate the needed files:
$ sh autogen.sh
for this we need to have installed the GNU Autotools:
Automake, Autoconf, Libtool. After this:
$ ../configure --enable-maintainer-mode [options]
$ make
$ make check
$ make install
The Makefile supports the DESTDIR environment variable to
install files in a temporary location; for example, to see
what will happen:
$ make -n install DESTDIR=/tmp/marco/cre2
to really do it:
$ make install DESTDIR=/tmp/marco/cre2
4. Usage
--------
Read the documentation.
A. Credits
----------
RE2 is a Google project. CRE2 is based on code by Keegan
McAllister. This distribution was assembled by Marco Maggi.
B. Bugs
-------
Bug reports are appreciated. Register issues at the CRE2
issue tracker:
<http://github.com/marcomaggi/cre2/issues>
C. Resources
------------
The GNU Project software can be found here:
<http://www.gnu.org/>
RE2 is available at:
<http://code.google.com/p/re2/>
development of this package happens at:
<http://github.com/marcomaggi/cre2/>
and as backup at:
<http://sourceforge.net/projects/cre2/>
proper release tarballs for this package are in the download
area at:
<http://sourceforge.net/projects/cre2/files/>
the documentation is available online:
<http://marcomaggi.github.com/docs/cre2.html>
### end of file
# Local Variables:
# mode: text
# coding: utf-8-unix
# fill-column: 60
# paragraph-start: "*"
# End:

View File

@ -0,0 +1,11 @@
# autogen.sh --
#
# Regenerate the Autotools build infrastructure.  Must be run from the
# top source directory; any arguments are forwarded to autoreconf.

# Trace every command and stop at the first one that fails.
set -xe

# Make sure the directory holding the autotools helper files exists.
if ! test -d autotools
then
    mkdir autotools
fi

# Run libtoolize only when autotools/libtool.m4 is not there yet.
if ! test -f autotools/libtool.m4
then
    libtoolize
fi

autoreconf --warnings=all --install --verbose "$@"

### end of file

17
outside/cre2/src/build.sh Normal file
View File

@ -0,0 +1,17 @@
# build.sh --
#
# Build CRE2 out of tree in "=build" and stage the results for the
# parent directory: everything in "=build/.libs" is copied to ../lib
# and the public header to ../include.  Run this from the top of the
# CRE2 source tree.
#
# The build is skipped when "=build" already exists; remove that
# directory to force a rebuild (for example after a failed build).

if [ ! -d "=build" ]; then
    mkdir "=build" || exit 1

    # Try the plain name first, then the "g"-prefixed name under which
    # some systems install GNU libtoolize.  A failure here is not fatal
    # by itself: autogen.sh stops on its own if the tools are unusable.
    (libtoolize || glibtoolize)

    sh autogen.sh || exit 1

    # Configure and build inside a subshell so the working directory of
    # this script never changes.  Stop at the first failing step, so a
    # failed "cd" can never make "../configure" and "make" run from the
    # wrong directory.
    (
        cd "=build" &&
        ../configure --enable-maintainer-mode LDFLAGS=-pthread &&
        make
    ) || exit 1
fi

# Create the staging directories if they do not exist yet.
mkdir -p ../lib ../include || exit 1

cp \=build/.libs/* ../lib
cp src/cre2.h ../include

View File

@ -0,0 +1,9 @@
# Remove everything the build script produced: the out-of-tree build
# directory and the staged ../lib and ../include directories.
# Directories that do not exist are silently skipped.
for dir in "=build" ../lib ../include
do
    if [ -d "$dir" ]; then
        rm -r "$dir"
    fi
done

View File

@ -0,0 +1,58 @@
dnl @configure_input@
dnl
dnl Autoconf input for CRE2.  It sets up Automake and Libtool, checks
dnl the C and C++ toolchain, requires the RE2 library and header, and
dnl exports the library interface version numbers.
dnl
dnl Package identity; Autoconf 2.68 or newer is required.
AC_PREREQ([2.68])
AC_INIT([CRE2],[0.1b5],[marco.maggi-ipsu@poste.it],
[cre2],[http://github.com/marcomaggi/cre2/])
AC_CONFIG_SRCDIR([src/])
dnl Extra m4 macros and auxiliary build scripts both live in
dnl the "autotools" directory.
AC_CONFIG_MACRO_DIR([autotools])
AC_CONFIG_AUX_DIR([autotools])
AC_CANONICAL_BUILD
AC_CANONICAL_HOST
AC_CANONICAL_TARGET
dnl Automake options: version 1.14 or newer, "foreign" (non-GNU)
dnl package layout, objects built in the subdirectory of their source,
dnl xz tarballs instead of gzip ones, all Automake warnings enabled.
AM_INIT_AUTOMAKE([1.14 foreign subdir-objects dist-xz no-dist-gzip -Wall])
AM_MAINTAINER_MODE
AM_PROG_AR
AC_PROG_INSTALL
AC_PROG_LN_S
AC_PROG_MAKE_SET
AC_PROG_MKDIR_P
dnl Libtool 2.4 or newer is required.
LT_PREREQ([2.4])
LT_INIT
AC_PROG_CC
AC_PROG_CC_C_O
dnl From here on the compiler checks are run with the C++ compiler.
AC_LANG([C++])
AC_PROG_CXX
AC_PROG_CXX_C_O
dnl RE2 is mandatory: configuration stops with an error when linking
dnl against -lre2 fails or the header re2/re2.h cannot be used.
AC_CHECK_LIB([re2],[main],,[AC_MSG_FAILURE([test for RE2 library failed])])
AC_CHECK_HEADERS([re2/re2.h],,[AC_MSG_ERROR([test for RE2 header failed])])
dnl Library interface version numbers.  They are written to config.h
dnl as preprocessor symbols and substituted into the Makefile, where
dnl they form the libtool -version-info argument.
cre2_VERSION_INTERFACE_CURRENT=0
cre2_VERSION_INTERFACE_REVISION=0
cre2_VERSION_INTERFACE_AGE=0
AC_DEFINE_UNQUOTED([cre2_VERSION_INTERFACE_CURRENT],
[$cre2_VERSION_INTERFACE_CURRENT],
[current interface number])
AC_DEFINE_UNQUOTED([cre2_VERSION_INTERFACE_REVISION],
[$cre2_VERSION_INTERFACE_REVISION],
[current interface implementation number])
AC_DEFINE_UNQUOTED([cre2_VERSION_INTERFACE_AGE],
[$cre2_VERSION_INTERFACE_AGE],
[current interface age number])
dnl The version string is "CURRENT.REVISION"; the age is not part of it.
AC_DEFINE_UNQUOTED([cre2_VERSION_INTERFACE_STRING],
["$cre2_VERSION_INTERFACE_CURRENT.$cre2_VERSION_INTERFACE_REVISION"],
[library interface version])
AC_SUBST([cre2_VERSION_INTERFACE_CURRENT])
AC_SUBST([cre2_VERSION_INTERFACE_REVISION])
AC_SUBST([cre2_VERSION_INTERFACE_AGE])
dnl Generated files.
AC_CONFIG_HEADERS([config.h])
AC_CONFIG_FILES([Makefile])
AC_OUTPUT
dnl end of file

View File

@ -0,0 +1,24 @@
# configure.sh --
#
# Convenience wrapper around "configure".  It invokes "../configure",
# so it has to be run from a directory one level below the top source
# directory.  Extra arguments are appended to the configure command
# line.

# Trace every command and stop at the first one that fails.
set -xe

prefix=/usr/local

# Install libraries under lib64 when the system has a /lib64 directory,
# under lib otherwise.
if test -d /lib64
then
    libdir="${prefix}/lib64"
else
    libdir="${prefix}/lib"
fi

# NOTE(review): CFLAGS only affects C compilation, while the library
# source (src/cre2.cpp) is C++ -- confirm whether CXXFLAGS was intended.
../configure                            \
    --config-cache                      \
    --cache-file=../config.cache        \
    --enable-maintainer-mode            \
    --disable-static --enable-shared    \
    --prefix="${prefix}"                \
    --libdir="${libdir}"                \
    CFLAGS='-O3'                        \
    LDFLAGS="-L${libdir}"               \
    "$@"

### end of file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,509 @@
@node Documentation License
@appendix GNU Free Documentation License
@cindex FDL, GNU Free Documentation License
@center Version 1.3, 3 November 2008
@c This file is intended to be included within another document,
@c hence no sectioning command or @node.
@display
Copyright @copyright{} 2000, 2001, 2002, 2007, 2008 Free Software Foundation, Inc.
@uref{http://fsf.org/}
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
@end display
@enumerate 0
@item
PREAMBLE
The purpose of this License is to make a manual, textbook, or other
functional and useful document @dfn{free} in the sense of freedom: to
assure everyone the effective freedom to copy and redistribute it,
with or without modifying it, either commercially or noncommercially.
Secondarily, this License preserves for the author and publisher a way
to get credit for their work, while not being considered responsible
for modifications made by others.
This License is a kind of ``copyleft'', which means that derivative
works of the document must themselves be free in the same sense. It
complements the GNU General Public License, which is a copyleft
license designed for free software.
We have designed this License in order to use it for manuals for free
software, because free software needs free documentation: a free
program should come with manuals providing the same freedoms that the
software does. But this License is not limited to software manuals;
it can be used for any textual work, regardless of subject matter or
whether it is published as a printed book. We recommend this License
principally for works whose purpose is instruction or reference.
@item
APPLICABILITY AND DEFINITIONS
This License applies to any manual or other work, in any medium, that
contains a notice placed by the copyright holder saying it can be
distributed under the terms of this License. Such a notice grants a
world-wide, royalty-free license, unlimited in duration, to use that
work under the conditions stated herein. The ``Document'', below,
refers to any such manual or work. Any member of the public is a
licensee, and is addressed as ``you''. You accept the license if you
copy, modify or distribute the work in a way requiring permission
under copyright law.
A ``Modified Version'' of the Document means any work containing the
Document or a portion of it, either copied verbatim, or with
modifications and/or translated into another language.
A ``Secondary Section'' is a named appendix or a front-matter section
of the Document that deals exclusively with the relationship of the
publishers or authors of the Document to the Document's overall
subject (or to related matters) and contains nothing that could fall
directly within that overall subject. (Thus, if the Document is in
part a textbook of mathematics, a Secondary Section may not explain
any mathematics.) The relationship could be a matter of historical
connection with the subject or with related matters, or of legal,
commercial, philosophical, ethical or political position regarding
them.
The ``Invariant Sections'' are certain Secondary Sections whose titles
are designated, as being those of Invariant Sections, in the notice
that says that the Document is released under this License. If a
section does not fit the above definition of Secondary then it is not
allowed to be designated as Invariant. The Document may contain zero
Invariant Sections. If the Document does not identify any Invariant
Sections then there are none.
The ``Cover Texts'' are certain short passages of text that are listed,
as Front-Cover Texts or Back-Cover Texts, in the notice that says that
the Document is released under this License. A Front-Cover Text may
be at most 5 words, and a Back-Cover Text may be at most 25 words.
A ``Transparent'' copy of the Document means a machine-readable copy,
represented in a format whose specification is available to the
general public, that is suitable for revising the document
straightforwardly with generic text editors or (for images composed of
pixels) generic paint programs or (for drawings) some widely available
drawing editor, and that is suitable for input to text formatters or
for automatic translation to a variety of formats suitable for input
to text formatters. A copy made in an otherwise Transparent file
format whose markup, or absence of markup, has been arranged to thwart
or discourage subsequent modification by readers is not Transparent.
An image format is not Transparent if used for any substantial amount
of text. A copy that is not ``Transparent'' is called ``Opaque''.
Examples of suitable formats for Transparent copies include plain
@sc{ascii} without markup, Texinfo input format, La@TeX{} input
format, @acronym{SGML} or @acronym{XML} using a publicly available
@acronym{DTD}, and standard-conforming simple @acronym{HTML},
PostScript or @acronym{PDF} designed for human modification. Examples
of transparent image formats include @acronym{PNG}, @acronym{XCF} and
@acronym{JPG}. Opaque formats include proprietary formats that can be
read and edited only by proprietary word processors, @acronym{SGML} or
@acronym{XML} for which the @acronym{DTD} and/or processing tools are
not generally available, and the machine-generated @acronym{HTML},
PostScript or @acronym{PDF} produced by some word processors for
output purposes only.
The ``Title Page'' means, for a printed book, the title page itself,
plus such following pages as are needed to hold, legibly, the material
this License requires to appear in the title page. For works in
formats which do not have any title page as such, ``Title Page'' means
the text near the most prominent appearance of the work's title,
preceding the beginning of the body of the text.
The ``publisher'' means any person or entity that distributes copies
of the Document to the public.
A section ``Entitled XYZ'' means a named subunit of the Document whose
title either is precisely XYZ or contains XYZ in parentheses following
text that translates XYZ in another language. (Here XYZ stands for a
specific section name mentioned below, such as ``Acknowledgements'',
``Dedications'', ``Endorsements'', or ``History''.) To ``Preserve the Title''
of such a section when you modify the Document means that it remains a
section ``Entitled XYZ'' according to this definition.
The Document may include Warranty Disclaimers next to the notice which
states that this License applies to the Document. These Warranty
Disclaimers are considered to be included by reference in this
License, but only as regards disclaiming warranties: any other
implication that these Warranty Disclaimers may have is void and has
no effect on the meaning of this License.
@item
VERBATIM COPYING
You may copy and distribute the Document in any medium, either
commercially or noncommercially, provided that this License, the
copyright notices, and the license notice saying this License applies
to the Document are reproduced in all copies, and that you add no other
conditions whatsoever to those of this License. You may not use
technical measures to obstruct or control the reading or further
copying of the copies you make or distribute. However, you may accept
compensation in exchange for copies. If you distribute a large enough
number of copies you must also follow the conditions in section 3.
You may also lend copies, under the same conditions stated above, and
you may publicly display copies.
@item
COPYING IN QUANTITY
If you publish printed copies (or copies in media that commonly have
printed covers) of the Document, numbering more than 100, and the
Document's license notice requires Cover Texts, you must enclose the
copies in covers that carry, clearly and legibly, all these Cover
Texts: Front-Cover Texts on the front cover, and Back-Cover Texts on
the back cover. Both covers must also clearly and legibly identify
you as the publisher of these copies. The front cover must present
the full title with all words of the title equally prominent and
visible. You may add other material on the covers in addition.
Copying with changes limited to the covers, as long as they preserve
the title of the Document and satisfy these conditions, can be treated
as verbatim copying in other respects.
If the required texts for either cover are too voluminous to fit
legibly, you should put the first ones listed (as many as fit
reasonably) on the actual cover, and continue the rest onto adjacent
pages.
If you publish or distribute Opaque copies of the Document numbering
more than 100, you must either include a machine-readable Transparent
copy along with each Opaque copy, or state in or with each Opaque copy
a computer-network location from which the general network-using
public has access to download using public-standard network protocols
a complete Transparent copy of the Document, free of added material.
If you use the latter option, you must take reasonably prudent steps,
when you begin distribution of Opaque copies in quantity, to ensure
that this Transparent copy will remain thus accessible at the stated
location until at least one year after the last time you distribute an
Opaque copy (directly or through your agents or retailers) of that
edition to the public.
It is requested, but not required, that you contact the authors of the
Document well before redistributing any large number of copies, to give
them a chance to provide you with an updated version of the Document.
@item
MODIFICATIONS
You may copy and distribute a Modified Version of the Document under
the conditions of sections 2 and 3 above, provided that you release
the Modified Version under precisely this License, with the Modified
Version filling the role of the Document, thus licensing distribution
and modification of the Modified Version to whoever possesses a copy
of it. In addition, you must do these things in the Modified Version:
@enumerate A
@item
Use in the Title Page (and on the covers, if any) a title distinct
from that of the Document, and from those of previous versions
(which should, if there were any, be listed in the History section
of the Document). You may use the same title as a previous version
if the original publisher of that version gives permission.
@item
List on the Title Page, as authors, one or more persons or entities
responsible for authorship of the modifications in the Modified
Version, together with at least five of the principal authors of the
Document (all of its principal authors, if it has fewer than five),
unless they release you from this requirement.
@item
State on the Title page the name of the publisher of the
Modified Version, as the publisher.
@item
Preserve all the copyright notices of the Document.
@item
Add an appropriate copyright notice for your modifications
adjacent to the other copyright notices.
@item
Include, immediately after the copyright notices, a license notice
giving the public permission to use the Modified Version under the
terms of this License, in the form shown in the Addendum below.
@item
Preserve in that license notice the full lists of Invariant Sections
and required Cover Texts given in the Document's license notice.
@item
Include an unaltered copy of this License.
@item
Preserve the section Entitled ``History'', Preserve its Title, and add
to it an item stating at least the title, year, new authors, and
publisher of the Modified Version as given on the Title Page. If
there is no section Entitled ``History'' in the Document, create one
stating the title, year, authors, and publisher of the Document as
given on its Title Page, then add an item describing the Modified
Version as stated in the previous sentence.
@item
Preserve the network location, if any, given in the Document for
public access to a Transparent copy of the Document, and likewise
the network locations given in the Document for previous versions
it was based on. These may be placed in the ``History'' section.
You may omit a network location for a work that was published at
least four years before the Document itself, or if the original
publisher of the version it refers to gives permission.
@item
For any section Entitled ``Acknowledgements'' or ``Dedications'', Preserve
the Title of the section, and preserve in the section all the
substance and tone of each of the contributor acknowledgements and/or
dedications given therein.
@item
Preserve all the Invariant Sections of the Document,
unaltered in their text and in their titles. Section numbers
or the equivalent are not considered part of the section titles.
@item
Delete any section Entitled ``Endorsements''. Such a section
may not be included in the Modified Version.
@item
Do not retitle any existing section to be Entitled ``Endorsements'' or
to conflict in title with any Invariant Section.
@item
Preserve any Warranty Disclaimers.
@end enumerate
If the Modified Version includes new front-matter sections or
appendices that qualify as Secondary Sections and contain no material
copied from the Document, you may at your option designate some or all
of these sections as invariant. To do this, add their titles to the
list of Invariant Sections in the Modified Version's license notice.
These titles must be distinct from any other section titles.
You may add a section Entitled ``Endorsements'', provided it contains
nothing but endorsements of your Modified Version by various
parties---for example, statements of peer review or that the text has
been approved by an organization as the authoritative definition of a
standard.
You may add a passage of up to five words as a Front-Cover Text, and a
passage of up to 25 words as a Back-Cover Text, to the end of the list
of Cover Texts in the Modified Version. Only one passage of
Front-Cover Text and one of Back-Cover Text may be added by (or
through arrangements made by) any one entity. If the Document already
includes a cover text for the same cover, previously added by you or
by arrangement made by the same entity you are acting on behalf of,
you may not add another; but you may replace the old one, on explicit
permission from the previous publisher that added the old one.
The author(s) and publisher(s) of the Document do not by this License
give permission to use their names for publicity for or to assert or
imply endorsement of any Modified Version.
@item
COMBINING DOCUMENTS
You may combine the Document with other documents released under this
License, under the terms defined in section 4 above for modified
versions, provided that you include in the combination all of the
Invariant Sections of all of the original documents, unmodified, and
list them all as Invariant Sections of your combined work in its
license notice, and that you preserve all their Warranty Disclaimers.
The combined work need only contain one copy of this License, and
multiple identical Invariant Sections may be replaced with a single
copy. If there are multiple Invariant Sections with the same name but
different contents, make the title of each such section unique by
adding at the end of it, in parentheses, the name of the original
author or publisher of that section if known, or else a unique number.
Make the same adjustment to the section titles in the list of
Invariant Sections in the license notice of the combined work.
In the combination, you must combine any sections Entitled ``History''
in the various original documents, forming one section Entitled
``History''; likewise combine any sections Entitled ``Acknowledgements'',
and any sections Entitled ``Dedications''. You must delete all
sections Entitled ``Endorsements.''
@item
COLLECTIONS OF DOCUMENTS
You may make a collection consisting of the Document and other documents
released under this License, and replace the individual copies of this
License in the various documents with a single copy that is included in
the collection, provided that you follow the rules of this License for
verbatim copying of each of the documents in all other respects.
You may extract a single document from such a collection, and distribute
it individually under this License, provided you insert a copy of this
License into the extracted document, and follow this License in all
other respects regarding verbatim copying of that document.
@item
AGGREGATION WITH INDEPENDENT WORKS
A compilation of the Document or its derivatives with other separate
and independent documents or works, in or on a volume of a storage or
distribution medium, is called an ``aggregate'' if the copyright
resulting from the compilation is not used to limit the legal rights
of the compilation's users beyond what the individual works permit.
When the Document is included in an aggregate, this License does not
apply to the other works in the aggregate which are not themselves
derivative works of the Document.
If the Cover Text requirement of section 3 is applicable to these
copies of the Document, then if the Document is less than one half of
the entire aggregate, the Document's Cover Texts may be placed on
covers that bracket the Document within the aggregate, or the
electronic equivalent of covers if the Document is in electronic form.
Otherwise they must appear on printed covers that bracket the whole
aggregate.
@item
TRANSLATION
Translation is considered a kind of modification, so you may
distribute translations of the Document under the terms of section 4.
Replacing Invariant Sections with translations requires special
permission from their copyright holders, but you may include
translations of some or all Invariant Sections in addition to the
original versions of these Invariant Sections. You may include a
translation of this License, and all the license notices in the
Document, and any Warranty Disclaimers, provided that you also include
the original English version of this License and the original versions
of those notices and disclaimers. In case of a disagreement between
the translation and the original version of this License or a notice
or disclaimer, the original version will prevail.
If a section in the Document is Entitled ``Acknowledgements'',
``Dedications'', or ``History'', the requirement (section 4) to Preserve
its Title (section 1) will typically require changing the actual
title.
@item
TERMINATION
You may not copy, modify, sublicense, or distribute the Document
except as expressly provided under this License. Any attempt
otherwise to copy, modify, sublicense, or distribute it is void, and
will automatically terminate your rights under this License.
However, if you cease all violation of this License, then your license
from a particular copyright holder is reinstated (a) provisionally,
unless and until the copyright holder explicitly and finally
terminates your license, and (b) permanently, if the copyright holder
fails to notify you of the violation by some reasonable means prior to
60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, receipt of a copy of some or all of the same material does
not give you any rights to use it.
@item
FUTURE REVISIONS OF THIS LICENSE
The Free Software Foundation may publish new, revised versions
of the GNU Free Documentation License from time to time. Such new
versions will be similar in spirit to the present version, but may
differ in detail to address new problems or concerns. See
@uref{http://www.gnu.org/copyleft/}.
Each version of the License is given a distinguishing version number.
If the Document specifies that a particular numbered version of this
License ``or any later version'' applies to it, you have the option of
following the terms and conditions either of that specified version or
of any later version that has been published (not as a draft) by the
Free Software Foundation. If the Document does not specify a version
number of this License, you may choose any version ever published (not
as a draft) by the Free Software Foundation. If the Document
specifies that a proxy can decide which future versions of this
License can be used, that proxy's public statement of acceptance of a
version permanently authorizes you to choose that version for the
Document.
@item
RELICENSING
``Massive Multiauthor Collaboration Site'' (or ``MMC Site'') means any
World Wide Web server that publishes copyrightable works and also
provides prominent facilities for anybody to edit those works. A
public wiki that anybody can edit is an example of such a server. A
``Massive Multiauthor Collaboration'' (or ``MMC'') contained in the
site means any set of copyrightable works thus published on the MMC
site.
``CC-BY-SA'' means the Creative Commons Attribution-Share Alike 3.0
license published by Creative Commons Corporation, a not-for-profit
corporation with a principal place of business in San Francisco,
California, as well as future copyleft versions of that license
published by that same organization.
``Incorporate'' means to publish or republish a Document, in whole or
in part, as part of another Document.
An MMC is ``eligible for relicensing'' if it is licensed under this
License, and if all works that were first published under this License
somewhere other than this MMC, and subsequently incorporated in whole
or in part into the MMC, (1) had no cover texts or invariant sections,
and (2) were thus incorporated prior to November 1, 2008.
The operator of an MMC Site may republish an MMC contained in the site
under CC-BY-SA on the same site at any time before August 1, 2009,
provided the MMC is eligible for relicensing.
@end enumerate
@page
@heading ADDENDUM: How to use this License for your documents
To use this License in a document you have written, include a copy of
the License in the document and put the following copyright and
license notices just after the title page:
@smallexample
@group
Copyright (C) @var{year} @var{your name}.
Permission is granted to copy, distribute and/or modify this document
under the terms of the GNU Free Documentation License, Version 1.3
or any later version published by the Free Software Foundation;
with no Invariant Sections, no Front-Cover Texts, and no Back-Cover
Texts. A copy of the license is included in the section entitled ``GNU
Free Documentation License''.
@end group
@end smallexample
If you have Invariant Sections, Front-Cover Texts and Back-Cover Texts,
replace the ``with@dots{}Texts.'' line with this:
@smallexample
@group
with the Invariant Sections being @var{list their titles}, with
the Front-Cover Texts being @var{list}, and with the Back-Cover Texts
being @var{list}.
@end group
@end smallexample
If you have Invariant Sections without Cover Texts, or some other
combination of the three, merge those two alternatives to suit the
situation.
If your document contains nontrivial examples of program code, we
recommend releasing these examples in parallel under your choice of
free software license, such as the GNU General Public License,
to permit their use in free software.
@c Local Variables:
@c ispell-local-pdict: "ispell-dict"
@c End:

View File

@ -0,0 +1,10 @@
# prepare.sh --
#
# Run this to rebuild the infrastructure and configure.
#
# Both commands below use paths relative to the current working
# directory: "autogen.sh" and "configure.sh" are looked up in its
# parent directory.

# -x: print every command before running it; -e: stop at the first
# command that fails.
set -xe
# Run autogen.sh from inside the parent directory; the subshell keeps
# the "cd" from changing the working directory of this script.
(cd .. && sh autogen.sh)
# Run the parent directory's configure.sh with the current directory
# still being the working directory.
sh ../configure.sh
### end of file

View File

@ -0,0 +1,647 @@
/*
Source file for CRE2, a C language wrapper for RE2: a regular
expressions library by Google.
Copyright (c) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
Copyright (c) 2011 Keegan McAllister
All rights reserved.
For the license notice see the COPYING file.
*/
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <re2/re2.h>
#include "cre2.h"
#include <cstdlib>
#include <cstdio>
/** --------------------------------------------------------------------
** Version functions.
** ----------------------------------------------------------------- */
const char *
cre2_version_string (void)
/* Return the interface version as a statically allocated string: the
   value configured in "config.h" when HAVE_CONFIG_H is defined, the
   placeholder "0.0" otherwise. */
{
#ifdef HAVE_CONFIG_H
  const char * const version = cre2_VERSION_INTERFACE_STRING;
#else
  const char * const version = "0.0";
#endif
  return version;
}
int
cre2_version_interface_current (void)
/* Return the "current" component of the interface version number: the
   value configured in "config.h" when HAVE_CONFIG_H is defined, zero
   otherwise. */
{
  int current = 0;
#ifdef HAVE_CONFIG_H
  current = cre2_VERSION_INTERFACE_CURRENT;
#endif
  return current;
}
int
cre2_version_interface_revision (void)
/* Return the "revision" component of the interface version number: the
   value configured in "config.h" when HAVE_CONFIG_H is defined, zero
   otherwise. */
{
  int revision = 0;
#ifdef HAVE_CONFIG_H
  revision = cre2_VERSION_INTERFACE_REVISION;
#endif
  return revision;
}
int
cre2_version_interface_age (void)
/* Return the "age" component of the interface version number: the
   value configured in "config.h" when HAVE_CONFIG_H is defined, zero
   otherwise. */
{
  int age = 0;
#ifdef HAVE_CONFIG_H
  age = cre2_VERSION_INTERFACE_AGE;
#endif
  return age;
}
/** --------------------------------------------------------------------
** Options objects.
** ----------------------------------------------------------------- */
/* Cast the pointer argument "opt" to a pointer of type
"RE2::Options*".  The cast is unchecked: "opt" is expected to be a
pointer previously returned by "cre2_opt_new()". */
#define TO_OPT(opt) (reinterpret_cast<RE2::Options *>(opt))
cre2_options_t *
cre2_opt_new(void)
/* Build a default-constructed "RE2::Options" object on the heap and
   return it as an opaque pointer; the result is NULL when the
   non-throwing allocation fails. */
{
  // FIXME: is this use of "nothrow" good to avoid raising exceptions
  // when memory allocation fails and to return NULL instead?
  RE2::Options * const options = new (std::nothrow) RE2::Options();
  return reinterpret_cast<void*>(options);
}
void
cre2_opt_delete (cre2_options_t *opt)
/* Destroy the options object referenced by "opt" and release its
   memory. */
{
  RE2::Options * const options = TO_OPT(opt);
  delete options;
}
/* Set or unset option flags in an options object. */
#define OPT_BOOL(name) \
void cre2_opt_set_##name (cre2_options_t *opt, int flag) \
{ \
TO_OPT(opt)->set_##name(bool(flag)); \
} \
int cre2_opt_##name (cre2_options_t *opt) \
{ \
return TO_OPT(opt)->name(); \
}
OPT_BOOL(posix_syntax)
OPT_BOOL(longest_match)
OPT_BOOL(log_errors)
OPT_BOOL(literal)
OPT_BOOL(never_nl)
OPT_BOOL(case_sensitive)
OPT_BOOL(perl_classes)
OPT_BOOL(word_boundary)
OPT_BOOL(one_line)
#undef OPT_BOOL
void
cre2_opt_set_encoding (cre2_options_t *opt, cre2_encoding_t enc)
/* Store in the options object "opt" the RE2 encoding corresponding to
   "enc".  Any value other than CRE2_UTF8 and CRE2_Latin1 is reported on
   stderr and terminates the process. */
{
  RE2::Options * const options = TO_OPT(opt);

  if (CRE2_UTF8 == enc) {
    options->set_encoding(RE2::Options::EncodingUTF8);
  } else if (CRE2_Latin1 == enc) {
    options->set_encoding(RE2::Options::EncodingLatin1);
  } else {
    fprintf(stderr, "CRE2: internal error: unknown encoding %d\n", enc);
    exit(EXIT_FAILURE);
  }
}
cre2_encoding_t
cre2_opt_encoding (cre2_options_t *opt)
/* Return the encoding selected in the options object "opt", translated
   to the C enumeration; CRE2_UNKNOWN is returned for any RE2 encoding
   this wrapper does not know. */
{
  const RE2::Options::Encoding current = TO_OPT(opt)->encoding();

  if (RE2::Options::EncodingUTF8 == current)
    return CRE2_UTF8;
  if (RE2::Options::EncodingLatin1 == current)
    return CRE2_Latin1;
  return CRE2_UNKNOWN;
}
void
cre2_opt_set_max_mem (cre2_options_t *opt, int m)
/* Store "m" as the maximum amount of memory in the options object
   "opt". */
{
  RE2::Options * const options = TO_OPT(opt);
  options->set_max_mem(m);
}
int
cre2_opt_max_mem (cre2_options_t *opt)
/* Return the maximum amount of memory currently configured in the
   options object "opt". */
{
  const int limit = TO_OPT(opt)->max_mem();
  return limit;
}
/** --------------------------------------------------------------------
** Precompiled regular expressions objects.
** ----------------------------------------------------------------- */
/* Cast the opaque pointer argument "re" to a pointer of type "RE2*" or
   "const RE2*".  The casts are unchecked: "re" is expected to be a
   pointer previously returned by "cre2_new()". */
#define TO_RE2(re) (reinterpret_cast<RE2 *>(re))
#define TO_CONST_RE2(re) (reinterpret_cast<const RE2 *>(re))
cre2_regexp_t *
cre2_new (const char *pattern, int pattern_len, const cre2_options_t *opt)
/* Build a new "RE2" object from the first "pattern_len" bytes of
   "pattern" and return it as an opaque pointer.  When "opt" is not NULL
   the referenced options are used, otherwise the object is built with
   the default options.  Return NULL when the non-throwing allocation
   fails. */
{
  const re2::StringPiece source(pattern, pattern_len);
  RE2 * compiled;

  // FIXME: is this use of "nothrow" enough to avoid raising
  // exceptions when memory allocation fails and to return NULL
  // instead?
  if (!opt) {
    compiled = new (std::nothrow) RE2(source);
  } else {
    const RE2::Options * const options = reinterpret_cast<const RE2::Options *>(opt);
    compiled = new (std::nothrow) RE2(source, *options);
  }
  return reinterpret_cast<void*>(compiled);
}
void
cre2_delete (cre2_regexp_t *re)
/* Destroy the regular expression object referenced by "re" and release
   its memory. */
{
  RE2 * const rex = TO_RE2(re);
  delete rex;
}
const char *
cre2_pattern (const cre2_regexp_t *re)
{
return TO_CONST_RE2(re)->pattern().c_str();
}
int
cre2_error_code (const cre2_regexp_t *re)
{
return int(TO_CONST_RE2(re)->error_code());
}
const char *
cre2_error_string (const cre2_regexp_t *re)
{
return TO_CONST_RE2(re)->error().c_str();
}
void
cre2_error_arg (const cre2_regexp_t *re, cre2_string_t *arg)
/* Make "arg" reference the error argument string of "re": the bytes are
   not copied, "arg" points into the string returned by the RE2
   object. */
{
  const std::string & offending = TO_CONST_RE2(re)->error_arg();

  arg->length = offending.length();
  arg->data = offending.data();
}
int
cre2_num_capturing_groups (const cre2_regexp_t *re)
/* Return the number of capturing groups reported by the RE2 object. */
{
  const int groups = TO_CONST_RE2(re)->NumberOfCapturingGroups();
  return groups;
}
int
cre2_program_size (const cre2_regexp_t *re)
/* Return the program size reported by the RE2 object. */
{
  const int size = TO_CONST_RE2(re)->ProgramSize();
  return size;
}
/** --------------------------------------------------------------------
** Matching with precompiled regular expressions objects.
** ----------------------------------------------------------------- */
int
cre2_match (const cre2_regexp_t *re , const char *text,
            int textlen, int startpos, int endpos, cre2_anchor_t anchor,
            cre2_string_t *match, int nmatch)
/* Match the first "textlen" bytes of "text" against the precompiled
   regular expression "re", looking only at the bytes between "startpos"
   and "endpos", with the anchoring selected by "anchor" (values other
   than CRE2_ANCHOR_START and CRE2_ANCHOR_BOTH select an unanchored
   match).

   On a successful match the first "nmatch" entries of "match" are made
   to reference the matching portions of "text"; the bytes are not
   copied.  When there is no match "match" is left untouched.

   Return 1 for a successful match, 0 for no match.  0 is also returned
   when the temporary array needed for the operation cannot be
   allocated. */
{
  re2::StringPiece text_re2(text, textlen);
  re2::StringPiece *match_re2 = NULL;
  RE2::Anchor anchor_re2 = RE2::UNANCHORED;
  bool retval; // 0 for no match
               // 1 for successful matching

  /* Allocate the temporary array only when there is something to store;
     a failed allocation must not reach RE2, which would write the
     submatches through a NULL pointer. */
  if (nmatch > 0) {
    match_re2 = (re2::StringPiece *)malloc(sizeof(re2::StringPiece) * nmatch);
    if (!match_re2)
      return 0;
  }
  switch (anchor) {
  case CRE2_ANCHOR_START:
    anchor_re2 = RE2::ANCHOR_START;
    break;
  case CRE2_ANCHOR_BOTH:
    anchor_re2 = RE2::ANCHOR_BOTH;
    break;
  case CRE2_UNANCHORED:
    break;
  }
  retval = TO_CONST_RE2(re)->Match(text_re2, startpos, endpos, anchor_re2, match_re2, nmatch);
  if (retval) {
    for (int i=0; i<nmatch; i++) {
      match[i].data = match_re2[i].data();
      match[i].length = match_re2[i].length();
    }
  }
  free(match_re2);
  return (retval)? 1 : 0;
}
int
cre2_easy_match (const char * pattern, int pattern_len,
const char *text, int text_len,
cre2_string_t *match, int nmatch)
{
cre2_regexp_t * rex;
cre2_options_t * opt;
int retval; // 0 for no match, 1 for successful
// matching, 2 for wrong regexp
opt = cre2_opt_new();
if (!opt) return 2;
cre2_opt_set_log_errors(opt, 0);
rex = cre2_new(pattern, pattern_len, opt);
if (!rex) {
cre2_opt_delete(opt);
return 2;
}
{
if (!cre2_error_code(rex)) {
retval = cre2_match(rex, text, text_len, 0, text_len, CRE2_UNANCHORED, match, nmatch);
} else {
retval = 2;
}
}
cre2_delete(rex);
cre2_opt_delete(opt);
return retval;
}
void
cre2_strings_to_ranges (const char * text, cre2_range_t * ranges, cre2_string_t * strings, int nmatch)
{
for (int i=0; i<nmatch; ++i) {
ranges[i].start = strings[i].data - text;
ranges[i].past = ranges[i].start + strings[i].length;
}
}
/** --------------------------------------------------------------------
** Other matching functions: stringz pattern.
** ----------------------------------------------------------------- */
/* Define the function NAME matching the buffer "text" against the
   zero-terminated string "pattern" through the static member function
   "RE2::FUN".  On a successful match the first "nmatch" entries of
   "match" are made to reference the matching portions of text; nothing
   is copied.

   The generated function returns 1 for a successful match, 0 otherwise.
   0 is also returned, without calling RE2, when the temporary arrays
   cannot be allocated: the set-up loop would otherwise write through
   NULL pointers. */
#define DEFINE_MATCH_ZSTRING_FUN(NAME,FUN) \
  int \
  NAME (const char * pattern, const cre2_string_t * text, \
        cre2_string_t * match, int nmatch) \
  { \
    re2::StringPiece input(text->data, text->length); \
    re2::StringPiece *strv = NULL; \
    RE2::Arg *argv = NULL; \
    RE2::Arg * *args = NULL; \
    bool retval = false; \
    if (nmatch > 0) { \
      strv = (re2::StringPiece *) (malloc(sizeof(re2::StringPiece) * nmatch)); \
      argv = (RE2::Arg *) (malloc(sizeof(RE2::Arg) * nmatch)); \
      args = (RE2::Arg **) (malloc(sizeof(RE2::Arg *) * nmatch)); \
    } \
    if (nmatch <= 0 || (strv && argv && args)) { \
      for (int i=0; i<nmatch; ++i) { \
        argv[i] = &strv[i]; \
        args[i] = &argv[i]; \
      } \
      retval = RE2::FUN(input, pattern, args, nmatch); \
      if (retval) { \
        for (int i=0; i<nmatch; ++i) { \
          match[i].data = strv[i].data(); \
          match[i].length = strv[i].length(); \
        } \
      } \
    } \
    free(strv); \
    free(argv); \
    free(args); \
    return int(retval); \
  }

DEFINE_MATCH_ZSTRING_FUN(cre2_full_match,FullMatchN)
DEFINE_MATCH_ZSTRING_FUN(cre2_partial_match,PartialMatchN)
/* This is different from the above in that the "input" argument is
mutated to reference the text after the mathing portion. */
#define DEFINE_MATCH_ZSTRING_FUN2(NAME,FUN) \
int \
NAME (const char * pattern, cre2_string_t * text, \
cre2_string_t * match, int nmatch) \
{ \
re2::StringPiece input(text->data, text->length); \
re2::StringPiece *strv; \
RE2::Arg *argv; \
RE2::Arg * *args; \
bool retval; \
strv = (re2::StringPiece *) (malloc(sizeof(re2::StringPiece) *nmatch)); \
argv = (RE2::Arg *) (malloc(sizeof(RE2::Arg) *nmatch)); \
args = (RE2::Arg **) (malloc(sizeof(RE2::Arg *) *nmatch)); \
for (int i=0; i<nmatch; ++i) { \
argv[i] = &strv[i]; \
args[i] = &argv[i]; \
} \
retval = RE2::FUN(&input, pattern, args, nmatch); \
if (retval) { \
text->data = input.data(); \
text->length = input.length(); \
for (int i=0; i<nmatch; ++i) { \
match[i].data = strv[i].data(); \
match[i].length = strv[i].length(); \
} \
} \
free(strv); \
free(argv); \
free(args); \
return int(retval); \
}
DEFINE_MATCH_ZSTRING_FUN2(cre2_consume,ConsumeN)
DEFINE_MATCH_ZSTRING_FUN2(cre2_find_and_consume,FindAndConsumeN)
/** --------------------------------------------------------------------
** Other matching functions: rex pattern.
** ----------------------------------------------------------------- */
/* Define the function NAME matching the buffer "text" against the
   precompiled regular expression "rex" through the static member
   function "RE2::FUN".  On a successful match the first "nmatch"
   entries of "match" are made to reference the matching portions of
   text; nothing is copied.

   The generated function returns 1 for a successful match, 0 otherwise.
   0 is also returned, without calling RE2, when the temporary arrays
   cannot be allocated: the set-up loop would otherwise write through
   NULL pointers. */
#define DEFINE_MATCH_REX_FUN(NAME,FUN) \
  int \
  NAME (cre2_regexp_t * rex, const cre2_string_t * text, \
        cre2_string_t * match, int nmatch) \
  { \
    re2::StringPiece input(text->data, text->length); \
    re2::StringPiece *strv = NULL; \
    RE2::Arg *argv = NULL; \
    RE2::Arg * *args = NULL; \
    bool retval = false; \
    if (nmatch > 0) { \
      strv = (re2::StringPiece *) (malloc(sizeof(re2::StringPiece) * nmatch)); \
      argv = (RE2::Arg *) (malloc(sizeof(RE2::Arg) * nmatch)); \
      args = (RE2::Arg **) (malloc(sizeof(RE2::Arg *) * nmatch)); \
    } \
    if (nmatch <= 0 || (strv && argv && args)) { \
      for (int i=0; i<nmatch; ++i) { \
        argv[i] = &strv[i]; \
        args[i] = &argv[i]; \
      } \
      retval = RE2::FUN(input, *TO_RE2(rex), args, nmatch); \
      if (retval) { \
        for (int i=0; i<nmatch; ++i) { \
          match[i].data = strv[i].data(); \
          match[i].length = strv[i].length(); \
        } \
      } \
    } \
    free(strv); \
    free(argv); \
    free(args); \
    return int(retval); \
  }

DEFINE_MATCH_REX_FUN(cre2_full_match_re,FullMatchN)
DEFINE_MATCH_REX_FUN(cre2_partial_match_re,PartialMatchN)
/* Define the function NAME matching the buffer "text" against the
   precompiled regular expression "rex" through the static member
   function "RE2::FUN".  This is different from the non-consuming
   matching functions in that, on a successful match, the "text"
   argument is mutated to reference the text after the matching portion.
   On a successful match the first "nmatch" entries of "match" are made
   to reference the matching portions of text; nothing is copied.

   The generated function returns 1 for a successful match, 0 otherwise.
   0 is also returned, without calling RE2, when the temporary arrays
   cannot be allocated: the set-up loop would otherwise write through
   NULL pointers. */
#define DEFINE_MATCH_REX_FUN2(NAME,FUN) \
  int \
  NAME (cre2_regexp_t * rex, cre2_string_t * text, \
        cre2_string_t * match, int nmatch) \
  { \
    re2::StringPiece input(text->data, text->length); \
    re2::StringPiece *strv = NULL; \
    RE2::Arg *argv = NULL; \
    RE2::Arg * *args = NULL; \
    bool retval = false; \
    if (nmatch > 0) { \
      strv = (re2::StringPiece *) (malloc(sizeof(re2::StringPiece) * nmatch)); \
      argv = (RE2::Arg *) (malloc(sizeof(RE2::Arg) * nmatch)); \
      args = (RE2::Arg **) (malloc(sizeof(RE2::Arg *) * nmatch)); \
    } \
    if (nmatch <= 0 || (strv && argv && args)) { \
      for (int i=0; i<nmatch; ++i) { \
        argv[i] = &strv[i]; \
        args[i] = &argv[i]; \
      } \
      retval = RE2::FUN(&input, *TO_RE2(rex), args, nmatch); \
      if (retval) { \
        text->data = input.data(); \
        text->length = input.length(); \
        for (int i=0; i<nmatch; ++i) { \
          match[i].data = strv[i].data(); \
          match[i].length = strv[i].length(); \
        } \
      } \
    } \
    free(strv); \
    free(argv); \
    free(args); \
    return int(retval); \
  }

DEFINE_MATCH_REX_FUN2(cre2_consume_re,ConsumeN)
DEFINE_MATCH_REX_FUN2(cre2_find_and_consume_re,FindAndConsumeN)
/** --------------------------------------------------------------------
** Problematic functions.
** ----------------------------------------------------------------- */
/* The following functions rely on C++ memory allocation. It is not
clear how they can be written to allow a correct API towards C. */
int
cre2_replace (const char * pattern, cre2_string_t * text_and_target, cre2_string_t * rewrite)
/* Replace the first match of "pattern" in the text referenced by
   "text_and_target" with the text in "rewrite", then make
   "text_and_target" reference a zero-terminated malloc'ed buffer holding
   the result.  The buffer is allocated even when there is no match, in
   which case it holds a copy of the unchanged text.

   Return 1 if a replacement took place, 0 if there was no match, -1 if
   memory allocation fails or a C++ exception is raised; in the last
   case "text_and_target" is left untouched. */
{
  try {
    std::string S(text_and_target->data, text_and_target->length);
    re2::StringPiece R(rewrite->data, rewrite->length);
    const bool retval = RE2::Replace(&S, pattern, R);
    const int length = S.length();
    char * const buffer = (char *)malloc(1+length); /* non-const alias of the new data */

    if (!buffer)
      return -1;
    S.copy(buffer, length);
    buffer[length] = '\0';
    /* Publish data and length together, only after the allocation
       succeeded, so that the argument is never left inconsistent. */
    text_and_target->data = buffer;
    text_and_target->length = length;
    return int(retval);
  } catch(...) {
    /* C++ exceptions must not cross the C interface. */
    return -1;
  }
}
int
cre2_replace_re (cre2_regexp_t * rex, cre2_string_t * text_and_target, cre2_string_t * rewrite)
/* Replace the first match of the precompiled regular expression "rex"
   in the text referenced by "text_and_target" with the text in
   "rewrite", then make "text_and_target" reference a zero-terminated
   malloc'ed buffer holding the result.  The buffer is allocated even
   when there is no match, in which case it holds a copy of the unchanged
   text.

   Return 1 if a replacement took place, 0 if there was no match, -1 if
   memory allocation fails or a C++ exception is raised; in the last
   case "text_and_target" is left untouched. */
{
  try {
    std::string S(text_and_target->data, text_and_target->length);
    re2::StringPiece R(rewrite->data, rewrite->length);
    const bool retval = RE2::Replace(&S, *TO_RE2(rex), R);
    const int length = S.length();
    char * const buffer = (char *)malloc(1+length); /* non-const alias of the new data */

    if (!buffer)
      return -1;
    S.copy(buffer, length);
    buffer[length] = '\0';
    /* Publish data and length together, only after the allocation
       succeeded, so that the argument is never left inconsistent. */
    text_and_target->data = buffer;
    text_and_target->length = length;
    return int(retval);
  } catch(...) {
    /* C++ exceptions (for example from the "std::string" allocation)
       must not cross the C interface. */
    return -1;
  }
}
/* ------------------------------------------------------------------ */
int
cre2_global_replace (const char * pattern, cre2_string_t * text_and_target, cre2_string_t * rewrite)
/* Replace every match of "pattern" in the text referenced by
   "text_and_target" with the text in "rewrite", then make
   "text_and_target" reference a zero-terminated malloc'ed buffer holding
   the result.  The buffer is allocated even when there is no match, in
   which case it holds a copy of the unchanged text.

   Return the value returned by "RE2::GlobalReplace()" (0 if there was no
   match), or -1 if memory allocation fails or a C++ exception is raised;
   in the last case "text_and_target" is left untouched. */
{
  try {
    std::string S(text_and_target->data, text_and_target->length);
    re2::StringPiece R(rewrite->data, rewrite->length);
    const int retval = RE2::GlobalReplace(&S, pattern, R);
    const int length = S.length();
    char * const buffer = (char *)malloc(1+length); /* non-const alias of the new data */

    if (!buffer)
      return -1;
    S.copy(buffer, length);
    buffer[length] = '\0';
    /* Publish data and length together, only after the allocation
       succeeded, so that the argument is never left inconsistent. */
    text_and_target->data = buffer;
    text_and_target->length = length;
    return retval;
  } catch(...) {
    /* C++ exceptions (for example from the "std::string" allocation)
       must not cross the C interface. */
    return -1;
  }
}
int
cre2_global_replace_re (cre2_regexp_t * rex, cre2_string_t * text_and_target, cre2_string_t * rewrite)
/* Replace every match of the precompiled regular expression "rex" in
   the text referenced by "text_and_target" with the text in "rewrite",
   then make "text_and_target" reference a zero-terminated malloc'ed
   buffer holding the result.  The buffer is allocated even when there is
   no match, in which case it holds a copy of the unchanged text.

   Return the value returned by "RE2::GlobalReplace()" (0 if there was no
   match), or -1 if memory allocation fails or a C++ exception is raised;
   in the last case "text_and_target" is left untouched. */
{
  try {
    std::string S(text_and_target->data, text_and_target->length);
    re2::StringPiece R(rewrite->data, rewrite->length);
    const int retval = RE2::GlobalReplace(&S, *TO_RE2(rex), R);
    const int length = S.length();
    char * const buffer = (char *)malloc(1+length); /* non-const alias of the new data */

    if (!buffer)
      return -1;
    S.copy(buffer, length);
    buffer[length] = '\0';
    /* Publish data and length together, only after the allocation
       succeeded, so that the argument is never left inconsistent. */
    text_and_target->data = buffer;
    text_and_target->length = length;
    return retval;
  } catch(...) {
    /* C++ exceptions (for example from the "std::string" allocation)
       must not cross the C interface. */
    return -1;
  }
}
/* ------------------------------------------------------------------ */
int
cre2_extract (const char * pattern, cre2_string_t * text,
              cre2_string_t * rewrite, cre2_string_t * target)
/* Match "text" against "pattern" through "RE2::Extract()" using
   "rewrite" as rewrite string, then make "target" reference a
   zero-terminated malloc'ed buffer holding the output string produced
   by RE2.  The buffer is allocated even when there is no match.

   Return 1 for a successful match, 0 for no match, -1 if memory
   allocation fails or a C++ exception is raised; in the last case
   "target" is left untouched. */
{
  try {
    re2::StringPiece T(text->data, text->length);
    re2::StringPiece R(rewrite->data, rewrite->length);
    std::string O;
    const bool retval = RE2::Extract(T, pattern, R, &O);
    const int length = O.length();
    char * const buffer = (char *)malloc(1+length); /* non-const alias of the new data */

    if (!buffer)
      return -1;
    O.copy(buffer, length);
    buffer[length] = '\0';
    /* Publish data and length together, only after the allocation
       succeeded, so that the argument is never left inconsistent. */
    target->data = buffer;
    target->length = length;
    return int(retval);
  } catch(...) {
    /* C++ exceptions (for example from the "std::string" allocation)
       must not cross the C interface. */
    return -1;
  }
}
int
cre2_extract_re (cre2_regexp_t * rex, cre2_string_t * text,
                 cre2_string_t * rewrite, cre2_string_t * target)
/* Match "text" against the precompiled regular expression "rex" through
   "RE2::Extract()" using "rewrite" as rewrite string, then make "target"
   reference a zero-terminated malloc'ed buffer holding the output string
   produced by RE2.  The buffer is allocated even when there is no match.

   Return 1 for a successful match, 0 for no match, -1 if memory
   allocation fails or a C++ exception is raised; in the last case
   "target" is left untouched. */
{
  try {
    re2::StringPiece T(text->data, text->length);
    re2::StringPiece R(rewrite->data, rewrite->length);
    std::string O;
    const bool retval = RE2::Extract(T, *TO_RE2(rex), R, &O);
    const int length = O.length();
    char * const buffer = (char *)malloc(1+length); /* non-const alias of the new data */

    if (!buffer)
      return -1;
    O.copy(buffer, length);
    buffer[length] = '\0';
    /* Publish data and length together, only after the allocation
       succeeded, so that the argument is never left inconsistent. */
    target->data = buffer;
    target->length = length;
    return int(retval);
  } catch(...) {
    /* C++ exceptions (for example from the "std::string" allocation)
       must not cross the C interface. */
    return -1;
  }
}
/* ------------------------------------------------------------------ */
int
cre2_quote_meta (cre2_string_t * quoted, cre2_string_t * original)
/* Make "quoted" reference a zero-terminated malloc'ed buffer holding the
   string returned by "RE2::QuoteMeta()" for the text in "original".

   Return 0 if successful, -1 if memory allocation fails or a C++
   exception is raised; in the last case "quoted" is left untouched. */
{
  try {
    re2::StringPiece O(original->data, original->length);
    const std::string Q = RE2::QuoteMeta(O);
    const int length = Q.length();
    char * const buffer = (char *)malloc(1+length); /* non-const alias of the new data */

    if (!buffer)
      return -1;
    Q.copy(buffer, length);
    buffer[length] = '\0';
    /* Publish data and length together, only after the allocation
       succeeded, so that the argument is never left inconsistent. */
    quoted->data = buffer;
    quoted->length = length;
    return 0;
  } catch(...) {
    /* C++ exceptions (for example from the "std::string" allocation)
       must not cross the C interface. */
    return -1;
  }
}
int
cre2_possible_match_range (cre2_regexp_t * rex,
                           cre2_string_t * min_, cre2_string_t * max_, int maxlen)
/* Ask "RE2::PossibleMatchRange()" for the minimum and maximum strings
   of "rex", limited by "maxlen", and make "min_" and "max_" reference
   zero-terminated malloc'ed buffers holding copies of them.

   Return 1 if successful, 0 if RE2 reports failure, -1 if memory
   allocation fails or a C++ exception is raised.  "min_" and "max_" are
   mutated only when 1 is returned. */
{
  try {
    /* The locals are deliberately not named MIN and MAX: those names
       are defined as macros by some system headers. */
    std::string lower, upper;

    if (!TO_RE2(rex)->PossibleMatchRange(&lower, &upper, maxlen))
      return 0;

    const int lower_length = lower.length();
    const int upper_length = upper.length();
    char * const lower_buffer = (char *)malloc(1+lower_length);
    if (!lower_buffer)
      return -1;
    char * const upper_buffer = (char *)malloc(1+upper_length);
    if (!upper_buffer) {
      /* Do not leak the first buffer when the second allocation fails. */
      free(lower_buffer);
      return -1;
    }

    lower.copy(lower_buffer, lower_length);
    lower_buffer[lower_length] = '\0';
    upper.copy(upper_buffer, upper_length);
    upper_buffer[upper_length] = '\0';

    min_->data = lower_buffer;
    min_->length = lower_length;
    max_->data = upper_buffer;
    max_->length = upper_length;
    return 1;
  } catch(...) {
    /* C++ exceptions (for example from the "std::string" allocation)
       must not cross the C interface. */
    return -1;
  }
}
int
cre2_check_rewrite_string (cre2_regexp_t * rex, cre2_string_t * rewrite, cre2_string_t * errmsg)
/* Validate the rewrite string "rewrite" against "rex" through
   "RE2::CheckRewriteString()".

   Return 1 if the string is correct: "errmsg" is then reset to a NULL
   data pointer and zero length.  Return 0 if the string is incorrect:
   "errmsg" is then made to reference a zero-terminated malloc'ed buffer
   holding the error message produced by RE2.  Return -1 if memory
   allocation fails or a C++ exception is raised: "errmsg" is then left
   untouched. */
{
  try {
    re2::StringPiece R(rewrite->data, rewrite->length);
    std::string E;

    if (TO_RE2(rex)->CheckRewriteString(R, &E)) {
      errmsg->data = NULL;
      errmsg->length = 0;
      return 1;
    }

    const int length = E.length();
    char * const buffer = (char *)malloc(1+length); /* non-const alias of the new data */
    if (!buffer)
      return -1;
    E.copy(buffer, length);
    buffer[length] = '\0';
    /* Publish data and length together, only after the allocation
       succeeded, so that the argument is never left inconsistent. */
    errmsg->data = buffer;
    errmsg->length = length;
    return 0;
  } catch(...) {
    /* C++ exceptions (for example from the "std::string" allocation)
       must not cross the C interface. */
    return -1;
  }
}
/* end of file */

299
outside/cre2/src/src/cre2.h Normal file
View File

@ -0,0 +1,299 @@
/*
Header file for CRE2, a C language wrapper for RE2: a regular
expressions library by Google.
Copyright (c) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
Copyright (c) 2011 Keegan McAllister
All rights reserved.
For the license notice see the COPYING file.
*/
/** --------------------------------------------------------------------
** Headers.
** ----------------------------------------------------------------- */
#ifdef __cplusplus
extern "C" {
#endif
#ifndef cre2_decl
# define cre2_decl extern
#endif
/** --------------------------------------------------------------------
** Version functions.
** ----------------------------------------------------------------- */
/* Return the interface version string.  When the library is built
   without HAVE_CONFIG_H the implementation returns the placeholder
   "0.0". */
cre2_decl const char * cre2_version_string (void);
/* Return the "current", "revision" and "age" components of the
   interface version number.  When the library is built without
   HAVE_CONFIG_H the implementation returns 0 for all of them. */
cre2_decl int cre2_version_interface_current (void);
cre2_decl int cre2_version_interface_revision (void);
cre2_decl int cre2_version_interface_age (void);
/** --------------------------------------------------------------------
** Regular expressions configuration options.
** ----------------------------------------------------------------- */
/* Opaque type of options objects: C code only handles pointers to it,
   the implementation casts them to pointers to "RE2::Options". */
typedef void cre2_options_t;
/* Text encodings selectable in an options object. */
typedef enum cre2_encoding_t {
CRE2_UNKNOWN = 0, /* should never happen */
CRE2_UTF8 = 1,
CRE2_Latin1 = 2
} cre2_encoding_t;
/* Construction and destruction.  "cre2_opt_new()" returns a pointer to
   a new options object with default values, or NULL when memory
   allocation fails. */
cre2_decl cre2_options_t *cre2_opt_new (void);
cre2_decl void cre2_opt_delete (cre2_options_t *opt);
/* Setters for the boolean options: "flag" is converted to a boolean, so
   zero disables the option and any other value enables it. */
cre2_decl void cre2_opt_set_posix_syntax (cre2_options_t *opt, int flag);
cre2_decl void cre2_opt_set_longest_match (cre2_options_t *opt, int flag);
cre2_decl void cre2_opt_set_log_errors (cre2_options_t *opt, int flag);
cre2_decl void cre2_opt_set_literal (cre2_options_t *opt, int flag);
cre2_decl void cre2_opt_set_never_nl (cre2_options_t *opt, int flag);
cre2_decl void cre2_opt_set_case_sensitive (cre2_options_t *opt, int flag);
cre2_decl void cre2_opt_set_perl_classes (cre2_options_t *opt, int flag);
cre2_decl void cre2_opt_set_word_boundary (cre2_options_t *opt, int flag);
cre2_decl void cre2_opt_set_one_line (cre2_options_t *opt, int flag);
/* Setters for the non-boolean options.  "cre2_opt_set_encoding()"
   accepts only CRE2_UTF8 and CRE2_Latin1: with any other value the
   implementation prints a message on stderr and terminates the
   process. */
cre2_decl void cre2_opt_set_max_mem (cre2_options_t *opt, int m);
cre2_decl void cre2_opt_set_encoding (cre2_options_t *opt, cre2_encoding_t enc);
/* Getters returning the current state of the options. */
cre2_decl int cre2_opt_posix_syntax (cre2_options_t *opt);
cre2_decl int cre2_opt_longest_match (cre2_options_t *opt);
cre2_decl int cre2_opt_log_errors (cre2_options_t *opt);
cre2_decl int cre2_opt_literal (cre2_options_t *opt);
cre2_decl int cre2_opt_never_nl (cre2_options_t *opt);
cre2_decl int cre2_opt_case_sensitive (cre2_options_t *opt);
cre2_decl int cre2_opt_perl_classes (cre2_options_t *opt);
cre2_decl int cre2_opt_word_boundary (cre2_options_t *opt);
cre2_decl int cre2_opt_one_line (cre2_options_t *opt);
cre2_decl int cre2_opt_max_mem (cre2_options_t *opt);
cre2_decl cre2_encoding_t cre2_opt_encoding (cre2_options_t *opt);
/** --------------------------------------------------------------------
** Precompiled regular expressions.
** ----------------------------------------------------------------- */
/* Reference to a portion of text delimited by its length.  The matching
   functions fill it with pointers into the matched text, so the bytes
   are not necessarily followed by a terminating zero. */
typedef struct cre2_string_t {
const char * data; /* pointer to the first byte */
int length; /* number of bytes */
} cre2_string_t;
/* Opaque type of precompiled regular expression objects: C code only
   handles pointers to it, the implementation casts them to pointers to
   "RE2". */
typedef void cre2_regexp_t;
/* This definition must be kept in sync with the definition of "enum
   ErrorCode" in the file "re2.h" of the original RE2 distribution.

   CRE2_ERROR_REPEAT_OP is the correctly spelled name of the "bad
   repetition operator" code; the historical misspelling
   CRE2_ERROR_REPEA_TOP is kept as an alias with the same value so that
   existing code keeps compiling. */
typedef enum cre2_error_code_t {
  CRE2_NO_ERROR = 0,
  CRE2_ERROR_INTERNAL,            /* unexpected error */
  /* parse errors */
  CRE2_ERROR_BAD_ESCAPE,          /* bad escape sequence */
  CRE2_ERROR_BAD_CHAR_CLASS,      /* bad character class */
  CRE2_ERROR_BAD_CHAR_RANGE,      /* bad character class range */
  CRE2_ERROR_MISSING_BRACKET,     /* missing closing ] */
  CRE2_ERROR_MISSING_PAREN,       /* missing closing ) */
  CRE2_ERROR_TRAILING_BACKSLASH,  /* trailing \ at end of regexp */
  CRE2_ERROR_REPEAT_ARGUMENT,     /* repeat argument missing, e.g. "*" */
  CRE2_ERROR_REPEAT_SIZE,         /* bad repetition argument */
  CRE2_ERROR_REPEAT_OP,           /* bad repetition operator */
  CRE2_ERROR_REPEA_TOP = CRE2_ERROR_REPEAT_OP, /* deprecated misspelled alias */
  CRE2_ERROR_BAD_PERL_OP,         /* bad perl operator */
  CRE2_ERROR_BAD_UTF8,            /* invalid UTF-8 in regexp */
  CRE2_ERROR_BAD_NAMED_CAPTURE,   /* bad named capture group */
  CRE2_ERROR_PATTERN_TOO_LARGE    /* pattern too large (compile failed) */
} cre2_error_code_t;
/* construction and destruction

   "cre2_new()" builds a regular expression object from the first
   "pattern_len" bytes of "pattern"; "opt" can be NULL to use the default
   options.  It returns NULL when memory allocation fails.  A non-NULL
   result can still represent a wrong pattern: check it with
   "cre2_error_code()", which returns a non-zero value in that case. */
cre2_decl cre2_regexp_t * cre2_new (const char *pattern, int pattern_len,
const cre2_options_t *opt);
cre2_decl void cre2_delete (cre2_regexp_t *re);
/* regular expression inspection

   "cre2_error_code()" returns the RE2 error code converted to "int";
   see "cre2_error_code_t" for the meaning of the values. */
cre2_decl const char * cre2_pattern (const cre2_regexp_t *re);
cre2_decl int cre2_error_code (const cre2_regexp_t *re);
cre2_decl int cre2_num_capturing_groups (const cre2_regexp_t *re);
cre2_decl int cre2_program_size (const cre2_regexp_t *re);
/* invalidated by further re use

   The returned string and the portion of text stored in "arg" reference
   data held by the regular expression object: they are not copies. */
cre2_decl const char *cre2_error_string(const cre2_regexp_t *re);
cre2_decl void cre2_error_arg(const cre2_regexp_t *re, cre2_string_t * arg);
/** --------------------------------------------------------------------
** Main matching functions.
** ----------------------------------------------------------------- */
/* Anchoring mode for "cre2_match()".  The implementation maps
   CRE2_ANCHOR_START and CRE2_ANCHOR_BOTH to the RE2 anchors with the
   same name; every other value selects an unanchored match. */
typedef enum cre2_anchor_t {
CRE2_UNANCHORED = 1,
CRE2_ANCHOR_START = 2,
CRE2_ANCHOR_BOTH = 3
} cre2_anchor_t;
/* Portion of text expressed as a pair of offsets relative to the start
   of the text; filled by "cre2_strings_to_ranges()". */
typedef struct cre2_range_t {
long start; /* inclusive start index for bytevector */
long past; /* exclusive end index for bytevector */
} cre2_range_t;
/* Match the first "textlen" bytes of "text" against "re", between the
   positions "startpos" and "endpos", with the selected anchoring.  On a
   successful match the first "nmatch" entries of "match" are made to
   reference the matching portions of "text" (nothing is copied); when
   there is no match "match" is left untouched.
   Return 1 for a successful match, 0 for no match. */
cre2_decl int cre2_match (const cre2_regexp_t * re,
const char * text, int textlen,
int startpos, int endpos, cre2_anchor_t anchor,
cre2_string_t * match, int nmatch);
/* Like "cre2_match()", but the regular expression is compiled from
   "pattern" for this single call and the match is unanchored over the
   whole text.
   Return 0 for no match, 1 for a successful match, 2 if the pattern is
   wrong or the temporary objects cannot be allocated. */
cre2_decl int cre2_easy_match (const char * pattern, int pattern_len,
const char * text, int text_len,
cre2_string_t * match, int nmatch);
/* Convert the first "nmatch" entries of "strings", which must reference
   portions of "text", into the offsets relative to "text" stored in the
   corresponding entries of "ranges". */
cre2_decl void cre2_strings_to_ranges (const char * text, cre2_range_t * ranges,
cre2_string_t * strings, int nmatch);
/** --------------------------------------------------------------------
** Other matching functions.
** ----------------------------------------------------------------- */
/* Function types of the matching functions below.  The "stringz"
   variants take the pattern as a zero-terminated string, the "rex"
   variants take a precompiled regular expression.  The variants whose
   "text" argument is not "const" mutate it, on a successful match, to
   reference the text after the matching portion.
   All of them store references to the matching portions of text in the
   first "nmatch" entries of "match" and return 1 for a successful match,
   0 otherwise. */
typedef int cre2_match_stringz_fun_t (const char * pattern, const cre2_string_t * text,
cre2_string_t * match, int nmatch);
typedef int cre2_match_stringz2_fun_t (const char * pattern, cre2_string_t * text,
cre2_string_t * match, int nmatch);
typedef int cre2_match_rex_fun_t (cre2_regexp_t * rex, const cre2_string_t * text,
cre2_string_t * match, int nmatch);
typedef int cre2_match_rex2_fun_t (cre2_regexp_t * rex, cre2_string_t * text,
cre2_string_t * match, int nmatch);
/* Matching with a zero-terminated pattern string. */
cre2_decl cre2_match_stringz_fun_t cre2_full_match;
cre2_decl cre2_match_stringz_fun_t cre2_partial_match;
cre2_decl cre2_match_stringz2_fun_t cre2_consume;
cre2_decl cre2_match_stringz2_fun_t cre2_find_and_consume;
/* Matching with a precompiled regular expression. */
cre2_decl cre2_match_rex_fun_t cre2_full_match_re;
cre2_decl cre2_match_rex_fun_t cre2_partial_match_re;
cre2_decl cre2_match_rex2_fun_t cre2_consume_re;
cre2_decl cre2_match_rex2_fun_t cre2_find_and_consume_re;
/** --------------------------------------------------------------------
** Problematic functions.
** ----------------------------------------------------------------- */
/* Match the text in the buffer "text_and_target" against the rex in
"pattern" or "rex". Mutate "text_and_target" so that it references a
malloc'ed buffer holding the original text in which the first, and
only the first, match is substituted with the text in "rewrite".
Numeric backslash sequences (\1 to \9) in "rewrite" are substituted
with the portions of text matching the corresponding parenthetical
subexpressions.
The new buffer is zero-terminated and it is allocated even when there
is no match, in which case it holds a copy of the unchanged text. The
buffer originally referenced by "text_and_target" is not released by
these functions.
Return 0 if no match, 1 if successful match, -1 if error allocating
memory. */
cre2_decl int cre2_replace (const char * pattern,
cre2_string_t * text_and_target,
cre2_string_t * rewrite);
cre2_decl int cre2_replace_re (cre2_regexp_t * rex,
cre2_string_t * text_and_target,
cre2_string_t * rewrite);
/* Match the text in the buffer "text_and_target" against the rex in
"pattern" or "rex". Mutate "text_and_target" so that it references a
malloc'ed buffer holding the original text in which all the
matching substrings are substituted with the text in "rewrite".
Numeric backslash sequences (\1 to \9) in "rewrite" are substituted
with the portions of text matching the corresponding parenthetical
subexpressions.
The new buffer is zero-terminated and it is allocated even when there
is no match, in which case it holds a copy of the unchanged text. The
buffer originally referenced by "text_and_target" is not released by
these functions.
Return 0 if no match, positive integer representing the number of
substitutions performed if successful match, -1 if error allocating
memory. */
cre2_decl int cre2_global_replace (const char * pattern,
cre2_string_t * text_and_target,
cre2_string_t * rewrite);
cre2_decl int cre2_global_replace_re (cre2_regexp_t * rex,
cre2_string_t * text_and_target,
cre2_string_t * rewrite);
/* Match the text in the buffer "text" against the rex in "pattern" or
"rex". Mutate "target" so that it references a malloc'ed buffer
holding a copy of the text in "rewrite"; numeric backslash sequences
(\1 to \9) in "rewrite" are substituted with the portions of text
matching the corresponding parenthetical subexpressions.
Non-matching text in "text" is ignored.
The new buffer is zero-terminated and it is allocated even when there
is no match; it then holds whatever output string RE2 produced.
Return 0 if no match, 1 if successful match, -1 if error allocating
memory. */
cre2_decl int cre2_extract (const char * pattern,
cre2_string_t * text,
cre2_string_t * rewrite,
cre2_string_t * target);
cre2_decl int cre2_extract_re (cre2_regexp_t * rex,
cre2_string_t * text,
cre2_string_t * rewrite,
cre2_string_t * target);
/* ------------------------------------------------------------------ */
/* Allocate a zero-terminated malloc'ed buffer and fill it with the text
from "original" having all the regexp meta characters quoted with
single backslashes. "quoted" is mutated to reference the new buffer;
its length does not count the terminating zero. Return 0 if
successful, return -1 if an error allocating memory occurs. */
cre2_decl int cre2_quote_meta (cre2_string_t * quoted, cre2_string_t * original);
/* Compute a "minimum" string and a "maximum" string matching the given
regular expression. The min and max can in some cases be arbitrarily
precise, so the caller gets to specify "maxlen", being the maximum
desired length of string returned.
Assuming the call returns successfully, any string S that is an
anchored match for this regexp satisfies:
min <= S && S <= max.
Note that this function will only consider the first copy of an
infinitely repeated element (i.e., any regexp element followed by a
'*' or '+' operator). Regexps with "{N}" constructions are not
affected, as those do not compile down to infinite repetitions.
"min_" and "max_" are mutated to reference zero-terminated malloc'ed
buffers holding the min and max strings; they are mutated only when
the return value is 1.
Return 0 if failure, return 1 if successful, return -1 if an error
allocating memory occurs. */
cre2_decl int cre2_possible_match_range (cre2_regexp_t * rex,
cre2_string_t * min_, cre2_string_t * max_,
int maxlen);
/* Check that the given rewrite string is suitable for use with this
regular expression. It checks that:
* The regular expression has enough parenthesized subexpressions to
satisfy all of the \N tokens in rewrite
* The rewrite string doesn't have any syntax errors. E.g., '\'
followed by anything other than a digit or '\'.
A true return value guarantees that the replace and extract functions
won't fail because of a bad rewrite string.
In case of error: "errmsg" is mutated to reference a zero-terminated
malloc'ed string describing the problem.
When the string is correct: "errmsg" is reset to a NULL data pointer
and zero length.
Return 1 if the string is correct, return 0 if the string is
incorrect, return -1 if an error occurred allocating memory. */
cre2_decl int cre2_check_rewrite_string (cre2_regexp_t * rex,
cre2_string_t * rewrite, cre2_string_t * errmsg);
/** --------------------------------------------------------------------
** Done.
** ----------------------------------------------------------------- */
#ifdef __cplusplus
} // extern "C"
#endif
/* end of file */

View File

@ -0,0 +1,335 @@
/*
Part of: CRE2
Contents: test for consume match function
Date: Tue Jan 3, 2012
Abstract
Test file for consume match function.
Copyright (C) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
See the COPYING file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cre2.h>
/* Diagnostic output switch.  As written the "#else" branch is active, so
   PRINTF and FWRITE expand to nothing and the test is silent.  Change the
   condition to "#if 1" to map them onto printf() and fwrite(). */
#if 0
# define PRINTF printf
# define FWRITE fwrite
#else
# define PRINTF(MSG, ...) /* empty string */
# define FWRITE(BUF, ...) /* empty string */
#endif
int
main (int argc, const char *const argv[])
{
{ /* success, no parentheses, full consumed buffer */
const char * pattern = "ci.*ut";
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
result = cre2_consume(pattern, &input, NULL, 0);
if (! result)
goto error;
if (0 != strncmp("", input.data, input.length))
goto error;
}
{ /* success, no parentheses, partially consumed buffer */
const char * pattern = "ci.*ut";
const char * text = "ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
result = cre2_consume(pattern, &input, NULL, 0);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, one match entry */
const char * pattern = "(ciao) salut";
const char * text = "ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_consume(pattern, &input, match, nmatch);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
{ /* success, two parenthetical subexpressions, two match entries */
const char * pattern = "(ciao) (salut)";
const char * text = "ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
result = cre2_consume(pattern, &input, match, nmatch);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp("salut", match[1].data, match[1].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
PRINTF("match 1: ");
FWRITE(match[1].data, match[1].length, 1, stdout);
PRINTF("\n");
}
{ /* failure, no parentheses */
const char * pattern = "ci.*ut";
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
result = cre2_consume(pattern, &input, NULL, 0);
if (result)
goto error;
}
{ /* failure, one parenthetical subexpression */
const char * pattern = "(ciao) salut";
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_consume(pattern, &input, match, nmatch);
if (result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, no match entries */
const char * pattern = "(ciao) salut";
const char * text = "ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
result = cre2_consume(pattern, &input, NULL, 0);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
}
{ /* failure, one parenthetical subexpression, two match entries */
const char * pattern = "(ciao) salut";
const char * text = "ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
memset(match, '\0', nmatch * sizeof(cre2_string_t));
result = cre2_consume(pattern, &input, match, nmatch);
if (0 != result)
goto error;
}
{ /* success, two parenthetical subexpressions, one match entry */
const char * pattern = "(ciao) (salut)";
const char * text = "ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_consume(pattern, &input, match, nmatch);
if (! result)
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
{ /* wrong regexp specification */
const char * pattern = "cia(o salut";
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_consume(pattern, &input, match, nmatch);
if (0 != result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
/* ------------------------------------------------------------------ */
{ /* success, no parentheses, full buffer consumed */
const char * pattern = "ci.*ut";
cre2_regexp_t * rex;
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_consume_re(rex, &input, NULL, 0);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, no parentheses, partial buffer consumed */
const char * pattern = "ci.*ut";
cre2_regexp_t * rex;
const char * text = "ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_consume_re(rex, &input, NULL, 0);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, one match entry */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_consume_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
{ /* success, two parenthetical subexpressions, two match entries */
const char * pattern = "(ciao) (salut)";
cre2_regexp_t * rex;
const char * text = "ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_consume_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp("salut", match[1].data, match[1].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
PRINTF("match 1: ");
FWRITE(match[1].data, match[1].length, 1, stdout);
PRINTF("\n");
}
{ /* failure, no parentheses */
const char * pattern = "ci.*ut";
cre2_regexp_t * rex;
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_consume_re(rex, &input, NULL, 0);
cre2_delete(rex);
if (result)
goto error;
}
{ /* failure, one parenthetical subexpression */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_consume_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, no match entries */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_consume_re(rex, &input, NULL, 0);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
}
{ /* failure, one parenthetical subexpression, two match entries */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
memset(match, '\0', nmatch * sizeof(cre2_string_t));
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_consume_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (0 != result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, two parenthetical subexpressions, one match entry */
const char * pattern = "(ciao) (salut)";
cre2_regexp_t * rex;
const char * text = "ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_consume_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
exit(EXIT_SUCCESS);
error:
exit(EXIT_FAILURE);
}
/* end of file */

View File

@ -0,0 +1,103 @@
/*
Part of: CRE2
Contents: test for easy matching
Date: Mon Jan 2, 2012
Abstract
Test file for regular expressions matching.
Copyright (C) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
See the COPYING file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cre2.h>
/* Diagnostic output switch.  As written the "#else" branch is active, so
   PRINTF and FWRITE expand to nothing and the test is silent.  Change the
   condition to "#if 1" to map them onto printf() and fwrite(). */
#if 0
# define PRINTF printf
# define FWRITE fwrite
#else
# define PRINTF(MSG, ...) /* empty string */
# define FWRITE(BUF, ...) /* empty string */
#endif
/* Test driver for cre2_easy_match().  Three scenarios are run: a plain
   single match, a malformed pattern, and a pattern with two groups.  Exit
   with EXIT_SUCCESS when all of them behave as expected, with EXIT_FAILURE
   otherwise. */
int
main (int argc, const char *const argv[])
{
  const char *	pattern;
  const char *	text;
  /* ------------------------------------------------------------------ */
  /* single match */
  pattern = "ciao";
  text = "ciao";
  {
    cre2_string_t	match;
    int			nmatch = 1;
    int			retval;
    retval = cre2_easy_match(pattern, strlen(pattern),
			     text, strlen(text),
			     &match, nmatch);
    /* "match" has no initialiser, so it must not be read unless the call
       reports a successful match. */
    if (1 != retval)
      goto error;
    PRINTF("match: ");
    FWRITE(match.data, match.length, 1, stdout);
    PRINTF("\n");
    if (0 != strncmp("ciao", match.data, match.length))
      goto error;
  }
  /* ------------------------------------------------------------------ */
  /* wrong pattern */
  pattern = "ci(ao";
  text = "ciao";
  {
    cre2_string_t	match;
    int			nmatch = 1;
    int			retval;
    retval = cre2_easy_match(pattern, strlen(pattern),
			     text, strlen(text),
			     &match, nmatch);
    /* A malformed regular expression is reported with the value 2. */
    if (2 != retval)
      goto error;
  }
  /* ------------------------------------------------------------------ */
  /* two groups */
  pattern = "(ciao) (hello)";
  text = "ciao hello";
  {
    int			nmatch = 3;
    cre2_string_t	match[nmatch];
    int			retval;
    retval = cre2_easy_match(pattern, strlen(pattern),
			     text, strlen(text),
			     match, nmatch);
    /* As above: the entries are only meaningful after a successful match. */
    if (1 != retval)
      goto error;
    PRINTF("full match: ");
    FWRITE(match[0].data, match[0].length, 1, stdout);
    PRINTF("\n");
    PRINTF("first group: ");
    FWRITE(match[1].data, match[1].length, 1, stdout);
    PRINTF("\n");
    PRINTF("second group: ");
    FWRITE(match[2].data, match[2].length, 1, stdout);
    PRINTF("\n");
    if (0 != strncmp("ciao hello", match[0].data, match[0].length))
      goto error;
    if (0 != strncmp("ciao", match[1].data, match[1].length))
      goto error;
    if (0 != strncmp("hello", match[2].data, match[2].length))
      goto error;
  }
  /* ------------------------------------------------------------------ */
  exit(EXIT_SUCCESS);
 error:
  exit(EXIT_FAILURE);
}
/* end of file */

View File

@ -0,0 +1,335 @@
/*
Part of: CRE2
Contents: test for find and consume match function
Date: Tue Jan 3, 2012
Abstract
Test file for find and consume match function.
Copyright (C) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
See the COPYING file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cre2.h>
/* Diagnostic output switch.  As written the "#else" branch is active, so
   PRINTF and FWRITE expand to nothing and the test is silent.  Change the
   condition to "#if 1" to map them onto printf() and fwrite(). */
#if 0
# define PRINTF printf
# define FWRITE fwrite
#else
# define PRINTF(MSG, ...) /* empty string */
# define FWRITE(BUF, ...) /* empty string */
#endif
int
main (int argc, const char *const argv[])
{
{ /* success, no parentheses, full consumed buffer */
const char * pattern = "ci.*ut";
const char * text = "prefix ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
result = cre2_find_and_consume(pattern, &input, NULL, 0);
if (! result)
goto error;
if (0 != strncmp("", input.data, input.length))
goto error;
}
{ /* success, no parentheses, partially consumed buffer */
const char * pattern = "ci.*ut";
const char * text = "prefix ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
result = cre2_find_and_consume(pattern, &input, NULL, 0);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, one match entry */
const char * pattern = "(ciao) salut";
const char * text = "prefix ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_find_and_consume(pattern, &input, match, nmatch);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
{ /* success, two parenthetical subexpressions, two match entries */
const char * pattern = "(ciao) (salut)";
const char * text = "prefix ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
result = cre2_find_and_consume(pattern, &input, match, nmatch);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp("salut", match[1].data, match[1].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
PRINTF("match 1: ");
FWRITE(match[1].data, match[1].length, 1, stdout);
PRINTF("\n");
}
{ /* failure, no parentheses */
const char * pattern = "ci.*ut";
const char * text = "prefix ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
result = cre2_find_and_consume(pattern, &input, NULL, 0);
if (result)
goto error;
}
{ /* failure, one parenthetical subexpression */
const char * pattern = "(ciao) salut";
const char * text = "prefix ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_find_and_consume(pattern, &input, match, nmatch);
if (result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, no match entries */
const char * pattern = "(ciao) salut";
const char * text = "prefix ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
result = cre2_find_and_consume(pattern, &input, NULL, 0);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
}
{ /* failure, one parenthetical subexpression, two match entries */
const char * pattern = "(ciao) salut";
const char * text = "prefix ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
memset(match, '\0', nmatch * sizeof(cre2_string_t));
result = cre2_find_and_consume(pattern, &input, match, nmatch);
if (0 != result)
goto error;
}
{ /* success, two parenthetical subexpressions, one match entry */
const char * pattern = "(ciao) (salut)";
const char * text = "prefix ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_find_and_consume(pattern, &input, match, nmatch);
if (! result)
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
{ /* wrong regexp specification */
const char * pattern = "cia(o salut";
const char * text = "prefix ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_find_and_consume(pattern, &input, match, nmatch);
if (0 != result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
/* ------------------------------------------------------------------ */
{ /* success, no parentheses, full buffer consumed */
const char * pattern = "ci.*ut";
cre2_regexp_t * rex;
const char * text = "prefix ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_find_and_consume_re(rex, &input, NULL, 0);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, no parentheses, partial buffer consumed */
const char * pattern = "ci.*ut";
cre2_regexp_t * rex;
const char * text = "prefix ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_find_and_consume_re(rex, &input, NULL, 0);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, one match entry */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "prefix ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_find_and_consume_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
{ /* success, two parenthetical subexpressions, two match entries */
const char * pattern = "(ciao) (salut)";
cre2_regexp_t * rex;
const char * text = "prefix ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_find_and_consume_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp("salut", match[1].data, match[1].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
PRINTF("match 1: ");
FWRITE(match[1].data, match[1].length, 1, stdout);
PRINTF("\n");
}
{ /* failure, no parentheses */
const char * pattern = "ci.*ut";
cre2_regexp_t * rex;
const char * text = "prefix ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_find_and_consume_re(rex, &input, NULL, 0);
cre2_delete(rex);
if (result)
goto error;
}
{ /* failure, one parenthetical subexpression */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "prefix ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_find_and_consume_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, no match entries */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "prefix ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_find_and_consume_re(rex, &input, NULL, 0);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
}
{ /* failure, one parenthetical subexpression, two match entries */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "prefix ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
memset(match, '\0', nmatch * sizeof(cre2_string_t));
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_find_and_consume_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (0 != result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, two parenthetical subexpressions, one match entry */
const char * pattern = "(ciao) (salut)";
cre2_regexp_t * rex;
const char * text = "prefix ciao salut hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_find_and_consume_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp(" hello", input.data, input.length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
exit(EXIT_SUCCESS);
error:
exit(EXIT_FAILURE);
}
/* end of file */

View File

@ -0,0 +1,308 @@
/*
Part of: CRE2
Contents: test for full match function
Date: Tue Jan 3, 2012
Abstract
Test file for full match function.
Copyright (C) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
See the COPYING file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cre2.h>
/* Diagnostic output switch.  As written the "#else" branch is active, so
   PRINTF and FWRITE expand to nothing and the test is silent.  Change the
   condition to "#if 1" to map them onto printf() and fwrite(). */
#if 0
# define PRINTF printf
# define FWRITE fwrite
#else
# define PRINTF(MSG, ...) /* empty string */
# define FWRITE(BUF, ...) /* empty string */
#endif
int
main (int argc, const char *const argv[])
{
{ /* success, no parentheses */
const char * pattern = "ci.*ut";
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
result = cre2_full_match(pattern, &input, NULL, 0);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, one match entry */
const char * pattern = "(ciao) salut";
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_full_match(pattern, &input, match, nmatch);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
{ /* success, two parenthetical subexpressions, two match entries */
const char * pattern = "(ciao) (salut)";
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
result = cre2_full_match(pattern, &input, match, nmatch);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp("salut", match[1].data, match[1].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
PRINTF("match 1: ");
FWRITE(match[1].data, match[1].length, 1, stdout);
PRINTF("\n");
}
{ /* failure, no parentheses */
const char * pattern = "ci.*ut";
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
result = cre2_full_match(pattern, &input, NULL, 0);
if (result)
goto error;
}
{ /* failure, one parenthetical subexpression */
const char * pattern = "(ciao) salut";
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_full_match(pattern, &input, match, nmatch);
if (result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, no match entries */
const char * pattern = "(ciao) salut";
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
result = cre2_full_match(pattern, &input, NULL, 0);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* failure, one parenthetical subexpression, two match entries */
const char * pattern = "(ciao) salut";
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
memset(match, '\0', nmatch * sizeof(cre2_string_t));
result = cre2_full_match(pattern, &input, match, nmatch);
if (0 != result)
goto error;
}
{ /* success, two parenthetical subexpressions, one match entry */
const char * pattern = "(ciao) (salut)";
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_full_match(pattern, &input, match, nmatch);
if (! result)
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
{ /* wrong regexp specification */
const char * pattern = "cia(o salut";
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_full_match(pattern, &input, match, nmatch);
if (0 != result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
/* ------------------------------------------------------------------ */
{ /* success, no parentheses */
const char * pattern = "ci.*ut";
cre2_regexp_t * rex;
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_full_match_re(rex, &input, NULL, 0);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, one match entry */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_full_match_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
{ /* success, two parenthetical subexpressions, two match entries */
const char * pattern = "(ciao) (salut)";
cre2_regexp_t * rex;
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_full_match_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp("salut", match[1].data, match[1].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
PRINTF("match 1: ");
FWRITE(match[1].data, match[1].length, 1, stdout);
PRINTF("\n");
}
{ /* failure, no parentheses */
const char * pattern = "ci.*ut";
cre2_regexp_t * rex;
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_full_match_re(rex, &input, NULL, 0);
cre2_delete(rex);
if (result)
goto error;
}
{ /* failure, one parenthetical subexpression */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_full_match_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, no match entries */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_full_match_re(rex, &input, NULL, 0);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* failure, one parenthetical subexpression, two match entries */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
memset(match, '\0', nmatch * sizeof(cre2_string_t));
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_full_match_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (0 != result)
goto error;
}
{ /* success, two parenthetical subexpressions, one match entry */
const char * pattern = "(ciao) (salut)";
cre2_regexp_t * rex;
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_full_match_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
exit(EXIT_SUCCESS);
error:
exit(EXIT_FAILURE);
}
/* end of file */

View File

@ -0,0 +1,122 @@
/*
Part of: CRE2
Contents: test for matching
Date: Mon Jan 2, 2012
Abstract
Test file for regular expressions matching.
Copyright (C) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
See the COPYING file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cre2.h>
/* Diagnostic output switch.  As written the "#else" branch is active, so
   PRINTF and FWRITE expand to nothing and the test is silent.  Change the
   condition to "#if 1" to map them onto printf() and fwrite(). */
#if 0
# define PRINTF printf
# define FWRITE fwrite
#else
# define PRINTF(MSG, ...) /* empty string */
# define FWRITE(BUF, ...) /* empty string */
#endif
/* Test driver for cre2_match().  Three regular expressions are compiled in
   turn (POSIX syntax, default options, literal mode) and matched,
   unanchored, against a fixed subject.  The process exits with
   EXIT_FAILURE as soon as a compilation reports an error code or a match
   does not give the expected return value, with EXIT_SUCCESS otherwise. */
int
main (int argc, const char *const argv[])
{
  cre2_options_t *	options;
  cre2_regexp_t *	regexp;
  const char *		source;

  /* ------------------------------------------------------------------ */
  /* single match, POSIX syntax enabled */
  source  = "ciao";
  options = cre2_opt_new();
  cre2_opt_set_posix_syntax(options, 1);
  regexp  = cre2_new(source, strlen(source), options);
  {
    const char *	subject = "ciao";
    cre2_string_t	found;
    int			count = 1;
    int			subject_len;
    int			rv;
    if (0 != cre2_error_code(regexp))
      exit(EXIT_FAILURE);
    subject_len = strlen(subject);
    rv = cre2_match(regexp, subject, subject_len, 0, subject_len,
		    CRE2_UNANCHORED, &found, count);
    if (rv != 1)
      exit(EXIT_FAILURE);
    PRINTF("match: retval=%d, ", rv);
    FWRITE(found.data, found.length, 1, stdout);
    PRINTF("\n");
  }
  cre2_delete(regexp);
  cre2_opt_delete(options);

  /* ------------------------------------------------------------------ */
  /* two groups: entry 0 is the full match, entries 1 and 2 the groups */
  source  = "(ciao) (hello)";
  options = cre2_opt_new();
  regexp  = cre2_new(source, strlen(source), options);
  {
    enum { count = 3 };
    const char *	subject = "ciao hello";
    cre2_string_t	pieces[count];
    cre2_range_t	spans[count];
    int			subject_len;
    int			rv;
    if (0 != cre2_error_code(regexp))
      exit(EXIT_FAILURE);
    subject_len = strlen(subject);
    rv = cre2_match(regexp, subject, subject_len, 0, subject_len,
		    CRE2_UNANCHORED, pieces, count);
    if (rv != 1)
      exit(EXIT_FAILURE);
    /* Turn the matched pieces into ranges relative to the subject. */
    cre2_strings_to_ranges(subject, spans, pieces, count);
    PRINTF("full match: ");
    FWRITE(subject+spans[0].start, spans[0].past-spans[0].start, 1, stdout);
    PRINTF("\n");
    PRINTF("first group: ");
    FWRITE(subject+spans[1].start, spans[1].past-spans[1].start, 1, stdout);
    PRINTF("\n");
    PRINTF("second group: ");
    FWRITE(subject+spans[2].start, spans[2].past-spans[2].start, 1, stdout);
    PRINTF("\n");
  }
  cre2_delete(regexp);
  cre2_opt_delete(options);

  /* ------------------------------------------------------------------ */
  /* literal option: the subject is the pattern source itself */
  source  = "(ciao) (hello)";
  options = cre2_opt_new();
  cre2_opt_set_literal(options, 1);
  regexp  = cre2_new(source, strlen(source), options);
  {
    const char *	subject = "(ciao) (hello)";
    int			count = 0;
    int			subject_len;
    int			rv;
    if (0 != cre2_error_code(regexp))
      exit(EXIT_FAILURE);
    subject_len = strlen(subject);
    /* No match entries are requested here, only the outcome matters. */
    rv = cre2_match(regexp, subject, subject_len, 0, subject_len,
		    CRE2_UNANCHORED, NULL, count);
    if (! rv)
      exit(EXIT_FAILURE);
  }
  cre2_delete(regexp);
  cre2_opt_delete(options);

  /* ------------------------------------------------------------------ */
  exit(EXIT_SUCCESS);
}
/* end of file */

View File

@ -0,0 +1,119 @@
/*
Part of: CRE2
Contents: test for miscellaneous functions
Date: Wed Jan 4, 2012
Abstract
Test file for miscellaneous functions.
Copyright (C) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
See the COPYING file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cre2.h>
#if 0
# define PRINTF printf
# define FWRITE fwrite
#else
# define PRINTF(MSG, ...) /* empty string */
# define FWRITE(BUF, ...) /* empty string */
#endif
/* Return non-zero when the CRE2 string ACTUAL holds exactly the bytes of
 * the NUL-terminated string EXPECTED: same length and same contents.
 *
 * A strncmp() bounded only by ACTUAL->length would also accept any prefix
 * of EXPECTED, including an empty result, so the length is checked first.
 */
static int
text_equals (const char * expected, const cre2_string_t * actual)
{
  size_t	expected_len = strlen(expected);

  if (actual->length < 0 || (size_t)actual->length != expected_len)
    return 0;
  return (0 == memcmp(expected, actual->data, expected_len));
}

/* Test driver for the miscellaneous functions: cre2_quote_meta(),
 * cre2_possible_match_range() and cre2_check_rewrite_string().
 *
 * Exits with EXIT_SUCCESS when every check passes; the first failed check
 * jumps to the "error" label and exits with EXIT_FAILURE.  The command
 * line arguments are not used.
 */
int
main (int argc, const char *const argv[])
{
  { /* quote meta characters */
    const char *	pattern  = "1.5-2.0?";
    cre2_string_t	original = {
      .data   = pattern,
      .length = strlen(pattern)
    };
    cre2_string_t	quoted;
    int			result;
    result = cre2_quote_meta(&quoted, &original);
    if (0 != result)
      goto error;
    /* exact comparison: a truncated quoted string must not pass */
    if (! text_equals("1\\.5\\-2\\.0\\?", &quoted))
      goto error;
    free((void *)quoted.data);
  }

  /* ------------------------------------------------------------------ */

  { /* minimum and maximum matching strings */
    const char *	pattern = "(?i)ABCdef";
    cre2_regexp_t *	rex;
    cre2_string_t	min, max;
    int			result;
    rex = cre2_new(pattern, strlen(pattern), NULL);
    {
      result = cre2_possible_match_range(rex, &min, &max, 1024);
      if (1 != result)
	goto error;
      if (! text_equals("ABCDEF", &min))
	goto error;
      if (! text_equals("abcdef", &max))
	goto error;
    }
    cre2_delete(rex);
    free((void *)min.data);
    free((void *)max.data);
  }

  /* ------------------------------------------------------------------ */

  { /* successfully check rewrite string */
    const char *	pattern = "a(b)c";
    const char *	subst   = "def";
    cre2_string_t	rewrite = {
      .data   = subst,
      .length = strlen(subst)
    };
    cre2_regexp_t *	rex;
    cre2_string_t	errmsg;
    int			result;
    rex = cre2_new(pattern, strlen(pattern), NULL);
    {
      result = cre2_check_rewrite_string(rex, &rewrite, &errmsg);
      if (1 != result)
	goto error;
    }
    cre2_delete(rex);
  }
  { /* failed check rewrite string */
    const char *	pattern = "a(b)c";
    const char *	subst   = "\\1 \\2"; /* refers to a second group the pattern lacks */
    cre2_string_t	rewrite = {
      .data   = subst,
      .length = strlen(subst)
    };
    cre2_regexp_t *	rex;
    cre2_string_t	errmsg;
    int			result;
    rex = cre2_new(pattern, strlen(pattern), NULL);
    {
      result = cre2_check_rewrite_string(rex, &rewrite, &errmsg);
      if (0 != result)
	goto error;
      PRINTF("error message: ");
      FWRITE(errmsg.data, errmsg.length, 1, stdout);
      PRINTF("\n");
    }
    cre2_delete(rex);
    /* the error message is released by the caller after a failed check */
    free((void *)errmsg.data);
  }

  /* ------------------------------------------------------------------ */

  exit(EXIT_SUCCESS);
 error:
  exit(EXIT_FAILURE);
}
/* end of file */

View File

@ -0,0 +1,43 @@
/*
Part of: CRE2
Contents: test for options
Date: Mon Jan 2, 2012
Abstract
Test file for options objects.
Copyright (C) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
See the COPYING file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <cre2.h>
/* Options-object test driver.
 *
 * Allocates one options object, invokes every setter exercised by this
 * test once (the encoding setter twice, ending on Latin-1), releases the
 * object and exits with EXIT_SUCCESS.  No return value is inspected and
 * the command line arguments are not used.
 */
int
main (int argc, const char *const argv[])
{
  cre2_options_t * const	options = cre2_opt_new();

  /* boolean switches, all turned on */
  cre2_opt_set_posix_syntax(options, 1);
  cre2_opt_set_longest_match(options, 1);
  cre2_opt_set_log_errors(options, 1);
  cre2_opt_set_literal(options, 1);
  cre2_opt_set_never_nl(options, 1);
  cre2_opt_set_case_sensitive(options, 1);
  cre2_opt_set_perl_classes(options, 1);
  cre2_opt_set_word_boundary(options, 1);
  cre2_opt_set_one_line(options, 1);

  /* non-boolean settings */
  cre2_opt_set_encoding(options, CRE2_UTF8);
  cre2_opt_set_encoding(options, CRE2_Latin1);
  cre2_opt_set_max_mem(options, 4096);

  cre2_opt_delete(options);
  exit(EXIT_SUCCESS);
}
/* end of file */

View File

@ -0,0 +1,308 @@
/*
Part of: CRE2
Contents: test for partial match function
Date: Tue Jan 3, 2012
Abstract
Test file for partial match function.
Copyright (C) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
See the COPYING file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cre2.h>
#if 0
# define PRINTF printf
# define FWRITE fwrite
#else
# define PRINTF(MSG, ...) /* empty string */
# define FWRITE(BUF, ...) /* empty string */
#endif
/* Test driver for the partial-match API.
   The first group of cases calls cre2_partial_match() with the pattern given
   as a C string.  The second group, after the separator comment, runs the
   same scenarios (except the malformed-pattern one) through
   cre2_partial_match_re() with a regexp object built by cre2_new().
   Any failed check jumps to the "error" label and exits with EXIT_FAILURE;
   otherwise the program exits with EXIT_SUCCESS.  argc and argv are unused. */
int
main (int argc, const char *const argv[])
{
{ /* success, no parentheses */
const char * pattern = "ci.*ut";
const char * text = "pre ciao salut post";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
result = cre2_partial_match(pattern, &input, NULL, 0);
if (! result)
goto error;
/* after the call, input must still compare equal to text over input.length bytes */
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, one match entry */
const char * pattern = "(ciao) salut";
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_partial_match(pattern, &input, match, nmatch);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
/* NOTE(review): the comparison is bounded by match[0].length, so a shorter
   (even empty) match would also pass -- confirm whether an exact-length
   check is wanted here and in the similar checks below */
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
{ /* success, two parenthetical subexpressions, two match entries */
const char * pattern = "(ciao) (salut)";
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
result = cre2_partial_match(pattern, &input, match, nmatch);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp("salut", match[1].data, match[1].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
PRINTF("match 1: ");
FWRITE(match[1].data, match[1].length, 1, stdout);
PRINTF("\n");
}
{ /* failure, no parentheses */
const char * pattern = "ci.*ut";
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
result = cre2_partial_match(pattern, &input, NULL, 0);
/* the text does not contain a match, so a zero result is expected */
if (result)
goto error;
}
{ /* failure, one parenthetical subexpression */
const char * pattern = "(ciao) salut";
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_partial_match(pattern, &input, match, nmatch);
if (result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, no match entries */
const char * pattern = "(ciao) salut";
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
result = cre2_partial_match(pattern, &input, NULL, 0);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* failure, one parenthetical subexpression, two match entries */
const char * pattern = "(ciao) salut";
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
/* zero the match array before the call */
memset(match, '\0', nmatch * sizeof(cre2_string_t));
result = cre2_partial_match(pattern, &input, match, nmatch);
/* more match entries are requested than the pattern has groups;
   a zero result is expected */
if (0 != result)
goto error;
}
{ /* success, two parenthetical subexpressions, one match entry */
const char * pattern = "(ciao) (salut)";
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_partial_match(pattern, &input, match, nmatch);
if (! result)
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
{ /* wrong regexp specification */
/* the pattern has an unbalanced "(", so a zero result is expected */
const char * pattern = "cia(o salut";
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
result = cre2_partial_match(pattern, &input, match, nmatch);
if (0 != result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
/* ------------------------------------------------------------------ */
/* Same scenarios through cre2_partial_match_re().  In every case rex is
   deleted right after the call and before the result is examined, so the
   "goto error" paths never skip the cre2_delete(). */
{ /* success, no parentheses */
const char * pattern = "ci.*ut";
cre2_regexp_t * rex;
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_partial_match_re(rex, &input, NULL, 0);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, one match entry */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_partial_match_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
/* match entries are read after rex was deleted; presumably they reference
   the input text rather than storage owned by rex -- TODO confirm */
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
{ /* success, two parenthetical subexpressions, two match entries */
const char * pattern = "(ciao) (salut)";
cre2_regexp_t * rex;
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_partial_match_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp("salut", match[1].data, match[1].length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
PRINTF("match 1: ");
FWRITE(match[1].data, match[1].length, 1, stdout);
PRINTF("\n");
}
{ /* failure, no parentheses */
const char * pattern = "ci.*ut";
cre2_regexp_t * rex;
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_partial_match_re(rex, &input, NULL, 0);
cre2_delete(rex);
if (result)
goto error;
}
{ /* failure, one parenthetical subexpression */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "ciao hello";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_partial_match_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* success, one parenthetical subexpression, no match entries */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_partial_match_re(rex, &input, NULL, 0);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
}
{ /* failure, one parenthetical subexpression, two match entries */
const char * pattern = "(ciao) salut";
cre2_regexp_t * rex;
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 2;
cre2_string_t match[nmatch];
int result;
/* zero the match array before the call */
memset(match, '\0', nmatch * sizeof(cre2_string_t));
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_partial_match_re(rex, &input, match, nmatch);
cre2_delete(rex);
/* more match entries are requested than the pattern has groups;
   a zero result is expected */
if (0 != result)
goto error;
}
{ /* success, two parenthetical subexpressions, one match entry */
const char * pattern = "(ciao) (salut)";
cre2_regexp_t * rex;
const char * text = "ciao salut";
cre2_string_t input = { .data = text, .length = strlen(text) };
int nmatch = 1;
cre2_string_t match[nmatch];
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
result = cre2_partial_match_re(rex, &input, match, nmatch);
cre2_delete(rex);
if (! result)
goto error;
if (0 != strncmp("ciao", match[0].data, match[0].length))
goto error;
if (0 != strncmp(text, input.data, input.length))
goto error;
PRINTF("match 0: ");
FWRITE(match[0].data, match[0].length, 1, stdout);
PRINTF("\n");
}
exit(EXIT_SUCCESS);
error:
exit(EXIT_FAILURE);
}
/* end of file */

View File

@ -0,0 +1,257 @@
/*
Part of: CRE2
Contents: test for replace
Date: Wed Jan 4, 2012
Abstract
Test file for replacing.
Copyright (C) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
See the COPYING file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cre2.h>
#if 0
# define PRINTF printf
# define FWRITE fwrite
#else
# define PRINTF(MSG, ...) /* empty string */
# define FWRITE(BUF, ...) /* empty string */
#endif
/* Test driver for the replace API.
   Exercises, in order, cre2_replace_re(), cre2_global_replace_re() and
   cre2_extract_re(), each with a regexp object built by cre2_new().
   Every case checks the function's return value, compares the resulting
   text with the expected string, and verifies that the byte following the
   reported length is a NUL terminator.
   Any failed check jumps to the "error" label and exits with EXIT_FAILURE
   (skipping the cre2_delete()/free() of that case); otherwise the program
   exits with EXIT_SUCCESS.  argc and argv are unused. */
int
main (int argc, const char *const argv[])
{
{ /* replace all the buffer using the full match */
cre2_regexp_t * rex;
const char * pattern = "ciao hello salut";
const char * text = "ciao hello salut";
const char * replace = "pre \\0 post";
cre2_string_t target = {
.data = text,
.length = strlen(text)
};
cre2_string_t rewrite = {
.data = replace,
.length = strlen(replace)
};
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
{
result = cre2_replace_re(rex, &target, &rewrite);
if (1 != result)
goto error;
/* NOTE(review): the comparison is bounded by target.length, so a shorter
   result would also pass this check -- the same holds for the similar
   checks below */
if (0 != strncmp("pre ciao hello salut post", target.data, target.length))
goto error;
/* the byte just past the reported length must be a NUL terminator */
if ('\0' != target.data[target.length])
goto error;
PRINTF("rewritten to: ");
FWRITE(target.data, target.length, 1, stdout);
PRINTF("\n");
}
cre2_delete(rex);
/* NOTE(review): target.data initially pointed at the literal "text";
   freeing it here presumes cre2_replace_re() stored a newly allocated
   buffer in target -- confirm against the CRE2 documentation */
free((void *)target.data);
}
{ /* replace substring with fixed string */
cre2_regexp_t * rex;
const char * pattern = "hello";
const char * text = "ciao hello salut";
const char * replace = "ohayo";
cre2_string_t target = {
.data = text,
.length = strlen(text)
};
cre2_string_t rewrite = {
.data = replace,
.length = strlen(replace)
};
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
{
result = cre2_replace_re(rex, &target, &rewrite);
if (1 != result)
goto error;
if (0 != strncmp("ciao ohayo salut", target.data, target.length))
goto error;
if ('\0' != target.data[target.length])
goto error;
PRINTF("rewritten to: ");
FWRITE(target.data, target.length, 1, stdout);
PRINTF("\n");
}
cre2_delete(rex);
free((void *)target.data);
}
/* ------------------------------------------------------------------ */
{ /* global replace all the buffer using the full match */
cre2_regexp_t * rex;
const char * pattern = "ciao hello salut";
const char * text = "ciao hello salut";
const char * replace = "pre \\0 post";
cre2_string_t target = {
.data = text,
.length = strlen(text)
};
cre2_string_t rewrite = {
.data = replace,
.length = strlen(replace)
};
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
{
result = cre2_global_replace_re(rex, &target, &rewrite);
if (1 != result)
goto error;
if (0 != strncmp("pre ciao hello salut post", target.data, target.length))
goto error;
if ('\0' != target.data[target.length])
goto error;
PRINTF("rewritten to: ");
FWRITE(target.data, target.length, 1, stdout);
PRINTF("\n");
}
cre2_delete(rex);
free((void *)target.data);
}
{ /* global replace substring with fixed string */
cre2_regexp_t * rex;
const char * pattern = "hello";
const char * text = "ciao hello salut";
const char * replace = "ohayo";
cre2_string_t target = {
.data = text,
.length = strlen(text)
};
cre2_string_t rewrite = {
.data = replace,
.length = strlen(replace)
};
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
{
result = cre2_global_replace_re(rex, &target, &rewrite);
if (1 != result)
goto error;
if (0 != strncmp("ciao ohayo salut", target.data, target.length))
goto error;
if ('\0' != target.data[target.length])
goto error;
PRINTF("rewritten to: ");
FWRITE(target.data, target.length, 1, stdout);
PRINTF("\n");
}
cre2_delete(rex);
free((void *)target.data);
}
{ /* global replace multiple substrings with parametrised string */
cre2_regexp_t * rex;
/* one capturing group around the digits; "\\1" in the rewrite string
   refers to it */
const char * pattern = "[a-z]+\\(([0-9]+)\\)";
const char * text = "ciao(1) hello(2) salut(3)";
const char * replace = "ohayo(\\1)";
cre2_string_t target = {
.data = text,
.length = strlen(text)
};
cre2_string_t rewrite = {
.data = replace,
.length = strlen(replace)
};
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
{
result = cre2_global_replace_re(rex, &target, &rewrite);
if (3 != result) /* 3 substitutions */
goto error;
if (0 != strncmp("ohayo(1) ohayo(2) ohayo(3)", target.data, target.length))
goto error;
if ('\0' != target.data[target.length])
goto error;
PRINTF("result %d, rewritten to: ", result);
FWRITE(target.data, target.length, 1, stdout);
PRINTF("\n");
}
cre2_delete(rex);
free((void *)target.data);
}
/* ------------------------------------------------------------------ */
{ /* extract all the buffer using the full match */
cre2_regexp_t * rex;
const char * pattern = "ciao hello salut";
const char * text = "ciao hello salut";
const char * replace = "pre \\0 post";
cre2_string_t input = {
.data = text,
.length = strlen(text)
};
cre2_string_t rewrite = {
.data = replace,
.length = strlen(replace)
};
/* target is left uninitialised here; it is only read after
   cre2_extract_re() has returned 1 */
cre2_string_t target;
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
{
result = cre2_extract_re(rex, &input, &rewrite, &target);
if (1 != result)
goto error;
if (0 != strncmp("pre ciao hello salut post", target.data, target.length))
goto error;
if ('\0' != target.data[target.length])
goto error;
PRINTF("rewritten to: ");
FWRITE(target.data, target.length, 1, stdout);
PRINTF("\n");
}
cre2_delete(rex);
free((void *)target.data);
}
{ /* extract substring with fixed string */
cre2_regexp_t * rex;
const char * pattern = "hello([0-9]+)";
const char * text = "ciao hello123 salut";
const char * replace = "ohayo\\1";
cre2_string_t input = {
.data = text,
.length = strlen(text)
};
cre2_string_t rewrite = {
.data = replace,
.length = strlen(replace)
};
cre2_string_t target;
int result;
rex = cre2_new(pattern, strlen(pattern), NULL);
{
result = cre2_extract_re(rex, &input, &rewrite, &target);
if (1 != result)
goto error;
/* only the rewritten match is expected in target, not the text
   surrounding it in the input */
if (0 != strncmp("ohayo123", target.data, target.length))
goto error;
if ('\0' != target.data[target.length])
goto error;
PRINTF("rewritten to: ");
FWRITE(target.data, target.length, 1, stdout);
PRINTF("\n");
}
cre2_delete(rex);
free((void *)target.data);
}
/* ------------------------------------------------------------------ */
exit(EXIT_SUCCESS);
error:
exit(EXIT_FAILURE);
}
/* end of file */

View File

@ -0,0 +1,113 @@
/*
Part of: CRE2
Contents: test for rex allocation
Date: Mon Jan 2, 2012
Abstract
Test file for regular expressions allocation.
Copyright (C) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
See the COPYING file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cre2.h>
#if 0
# define PRINTF printf
# define FWRITE fwrite
#else
# define PRINTF(MSG, ...) /* empty string */
# define FWRITE(BUF, ...) /* empty string */
#endif
/* Test driver for regular expression allocation.
 *
 * Builds four regexp objects with cre2_new() and inspects them through the
 * accessor functions:
 *   1. a valid pattern with POSIX syntax and no capturing groups;
 *   2. a valid pattern built without an options object;
 *   3. a valid pattern with one capturing group;
 *   4. a pattern with an unbalanced parenthesis, with error logging
 *      disabled, which must report CRE2_ERROR_MISSING_PAREN.
 *
 * Exits with EXIT_SUCCESS when every check passes; the first failed check
 * jumps to the "error" label and exits with EXIT_FAILURE.  The command
 * line arguments are not used.
 */
int
main (int argc, const char *const argv[])
{
  cre2_regexp_t *	rex;
  cre2_options_t *	opt;

  opt = cre2_opt_new();
  cre2_opt_set_posix_syntax(opt, 1);
  rex = cre2_new("ciao", 4, opt);
  {
    cre2_string_t	S;
    PRINTF("pattern: %s\n", cre2_pattern(rex));
    PRINTF("error code: %d\n", cre2_error_code(rex));
    PRINTF("error string: \"%s\"\n", cre2_error_string(rex));
    PRINTF("number of capturing groups: %d\n", cre2_num_capturing_groups(rex));
    PRINTF("program size: %d\n", cre2_program_size(rex));
    cre2_error_arg(rex, &S);
    PRINTF("error arg: len=%d, data=\"%s\"\n", S.length, S.data);
    /* a successfully built regexp reports: error code zero, no capturing
       groups for this pattern, empty error string, empty error argument */
    if (cre2_error_code(rex))
      goto error;
    if (cre2_num_capturing_groups(rex))
      goto error;
    if (0 != strlen(cre2_error_string(rex)))
      goto error;
    if (0 != S.length)
      goto error;
  }
  cre2_delete(rex);
  cre2_opt_delete(opt);

  /* ------------------------------------------------------------------ */
  /* no options object */

  rex = cre2_new("ciao", 4, NULL);
  {
    if (cre2_error_code(rex))
      goto error;
  }
  cre2_delete(rex);

  /* ------------------------------------------------------------------ */
  /* one capturing group */

  opt = cre2_opt_new();
  cre2_opt_set_posix_syntax(opt, 1);
  rex = cre2_new("ci(ao)", 6, opt);
  {
    PRINTF("error code: %d\n", cre2_error_code(rex));
    PRINTF("number of capturing groups: %d\n", cre2_num_capturing_groups(rex));
    PRINTF("program size: %d\n", cre2_program_size(rex));
    if (cre2_error_code(rex))
      goto error;
    if (1 != cre2_num_capturing_groups(rex))
      goto error;
  }
  cre2_delete(rex);
  cre2_opt_delete(opt);

  /* ------------------------------------------------------------------ */
  /* malformed pattern: the error must be reported through the accessors */

  opt = cre2_opt_new();
  cre2_opt_set_log_errors(opt, 0);
  rex = cre2_new("ci(ao", 5, opt);
  {
    int			code = cre2_error_code(rex);
    const char *	msg  = cre2_error_string(rex);
    cre2_string_t	S;
    cre2_error_arg(rex, &S);
    if (CRE2_ERROR_MISSING_PAREN != code)
      goto error;
    if (! msg)
      goto error;
    PRINTF("pattern: %s\n", cre2_pattern(rex));
    PRINTF("error: code=%d, msg=\"%s\"\n", code, msg);
    PRINTF("error arg: len=%d, data=\"%s\"\n", S.length, S.data);
  }
  cre2_delete(rex);
  cre2_opt_delete(opt);

  exit(EXIT_SUCCESS);
 error:
  exit(EXIT_FAILURE);
}
/* end of file */

View File

@ -0,0 +1,30 @@
/*
Part of: CRE2
Contents: test for version functions
Date: Mon Jan 2, 2012
Abstract
Test file for version functions.
Copyright (C) 2012 Marco Maggi <marco.maggi-ipsu@poste.it>
See the COPYING file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <cre2.h>
/* Version-functions test driver.
 *
 * Prints the version string followed by the three libtool interface
 * numbers (current:revision:age) on standard output, then exits with
 * EXIT_SUCCESS.  The command line arguments are not used.
 */
int
main (int argc, const char *const argv[])
{
  const char *	version = cre2_version_string();
  int		current;
  int		revision;
  int		age;

  printf("version number string: %s\n", version);

  current  = cre2_version_interface_current();
  revision = cre2_version_interface_revision();
  age      = cre2_version_interface_age();
  printf("libtool version number: %d:%d:%d\n", current, revision, age);

  exit(EXIT_SUCCESS);
}
/* end of file */

7
outside/re2/.hgignore Normal file
View File

@ -0,0 +1,7 @@
syntax:glob
*.pyc
*.orig
core
syntax:regexp
^obj/

13
outside/re2/AUTHORS Normal file
View File

@ -0,0 +1,13 @@
# This is the official list of RE2 authors for copyright purposes.
# This file is distinct from the CONTRIBUTORS files.
# See the latter for an explanation.
# Names should be added to this file as
# Name or Organization <email address>
# The email address is not required for organizations.
# Please keep the list sorted.
Google Inc.
Samsung Electronics
Stefano Rivera <stefano.rivera@gmail.com>

40
outside/re2/CONTRIBUTORS Normal file
View File

@ -0,0 +1,40 @@
# This is the official list of people who can contribute
# (and typically have contributed) code to the RE2 repository.
# The AUTHORS file lists the copyright holders; this file
# lists people. For example, Google employees are listed here
# but not in AUTHORS, because Google holds the copyright.
#
# The submission process automatically checks to make sure
# that people submitting code are listed in this file (by email address).
#
# Names should be added to this file only after verifying that
# the individual or the individual's organization has agreed to
# the appropriate Contributor License Agreement, found here:
#
# http://code.google.com/legal/individual-cla-v1.0.html
# http://code.google.com/legal/corporate-cla-v1.0.html
#
# The agreement for individuals can be filled out on the web.
#
# When adding J Random Contributor's name to this file,
# either J's name or J's organization's name should be
# added to the AUTHORS file, depending on whether the
# individual or corporate CLA was used.
# Names should be added to this file like so:
# Name <email address>
# Please keep the list sorted.
Dominic Battré <battre@chromium.org>
Doug Kwan <dougkwan@google.com>
Dmitriy Vyukov <dvyukov@google.com>
John Millikin <jmillikin@gmail.com>
Mike Nazarewicz <mpn@google.com>
Pawel Hajdan <phajdan.jr@gmail.com>
Rob Pike <r@google.com>
Russ Cox <rsc@swtch.com>
Sanjay Ghemawat <sanjay@google.com>
Stefano Rivera <stefano.rivera@gmail.com>
Srinivasan Venkatachary <vsri@google.com>
Viatcheslav Ostapenko <sl.ostapenko@samsung.com>

27
outside/re2/LICENSE Normal file
View File

@ -0,0 +1,27 @@
// Copyright (c) 2009 The RE2 Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

291
outside/re2/Makefile Normal file
View File

@ -0,0 +1,291 @@
# Copyright 2009 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
all: obj/libre2.a obj/so/libre2.so
# to build against PCRE for testing or benchmarking,
# uncomment the next two lines
# CCPCRE=-I/usr/local/include -DUSEPCRE
# LDPCRE=-L/usr/local/lib -lpcre
CXX?=g++
CXXFLAGS?=-Wall -O3 -g -pthread # can override
RE2_CXXFLAGS?=-Wno-sign-compare -c -I. $(CCPCRE) # required
LDFLAGS?=
AR?=ar
ARFLAGS?=rsc
NM?=nm
NMFLAGS?=-p
# Variables mandated by GNU, the arbiter of all good taste on the internet.
# http://www.gnu.org/prep/standards/standards.html
prefix=/usr/local
exec_prefix=$(prefix)
bindir=$(exec_prefix)/bin
includedir=$(prefix)/include
libdir=$(exec_prefix)/lib
INSTALL=install
INSTALL_PROGRAM=$(INSTALL)
INSTALL_DATA=$(INSTALL) -m 644
# ABI version
# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
SONAME=0
# To rebuild the Tables generated by Perl and Python scripts (requires Internet
# access for Unicode data), uncomment the following line:
# REBUILD_TABLES=1
ifeq ($(shell uname),Darwin)
MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib $(LDFLAGS) -exported_symbols_list libre2.symbols.darwin
else
MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.so.$(SONAME),--version-script=libre2.symbols $(LDFLAGS)
endif
INSTALL_HFILES=\
re2/filtered_re2.h\
re2/re2.h\
re2/set.h\
re2/stringpiece.h\
re2/variadic_function.h\
HFILES=\
util/arena.h\
util/atomicops.h\
util/benchmark.h\
util/flags.h\
util/logging.h\
util/mutex.h\
util/pcre.h\
util/random.h\
util/sparse_array.h\
util/sparse_set.h\
util/test.h\
util/utf.h\
util/util.h\
util/valgrind.h\
re2/filtered_re2.h\
re2/prefilter.h\
re2/prefilter_tree.h\
re2/prog.h\
re2/re2.h\
re2/regexp.h\
re2/set.h\
re2/stringpiece.h\
re2/testing/exhaustive_tester.h\
re2/testing/regexp_generator.h\
re2/testing/string_generator.h\
re2/testing/tester.h\
re2/unicode_casefold.h\
re2/unicode_groups.h\
re2/variadic_function.h\
re2/walker-inl.h\
OFILES=\
obj/util/arena.o\
obj/util/hash.o\
obj/util/rune.o\
obj/util/stringpiece.o\
obj/util/stringprintf.o\
obj/util/strutil.o\
obj/util/valgrind.o\
obj/re2/bitstate.o\
obj/re2/compile.o\
obj/re2/dfa.o\
obj/re2/filtered_re2.o\
obj/re2/mimics_pcre.o\
obj/re2/nfa.o\
obj/re2/onepass.o\
obj/re2/parse.o\
obj/re2/perl_groups.o\
obj/re2/prefilter.o\
obj/re2/prefilter_tree.o\
obj/re2/prog.o\
obj/re2/re2.o\
obj/re2/regexp.o\
obj/re2/set.o\
obj/re2/simplify.o\
obj/re2/tostring.o\
obj/re2/unicode_casefold.o\
obj/re2/unicode_groups.o\
TESTOFILES=\
obj/util/pcre.o\
obj/util/random.o\
obj/util/thread.o\
obj/re2/testing/backtrack.o\
obj/re2/testing/dump.o\
obj/re2/testing/exhaustive_tester.o\
obj/re2/testing/null_walker.o\
obj/re2/testing/regexp_generator.o\
obj/re2/testing/string_generator.o\
obj/re2/testing/tester.o\
TESTS=\
obj/test/charclass_test\
obj/test/compile_test\
obj/test/filtered_re2_test\
obj/test/mimics_pcre_test\
obj/test/parse_test\
obj/test/possible_match_test\
obj/test/re2_test\
obj/test/re2_arg_test\
obj/test/regexp_test\
obj/test/required_prefix_test\
obj/test/search_test\
obj/test/set_test\
obj/test/simplify_test\
obj/test/string_generator_test\
BIGTESTS=\
obj/test/dfa_test\
obj/test/exhaustive1_test\
obj/test/exhaustive2_test\
obj/test/exhaustive3_test\
obj/test/exhaustive_test\
obj/test/random_test\
SOFILES=$(patsubst obj/%,obj/so/%,$(OFILES))
STESTOFILES=$(patsubst obj/%,obj/so/%,$(TESTOFILES))
STESTS=$(patsubst obj/%,obj/so/%,$(TESTS))
SBIGTESTS=$(patsubst obj/%,obj/so/%,$(BIGTESTS))
DOFILES=$(patsubst obj/%,obj/dbg/%,$(OFILES))
DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES))
DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS))
DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS))
obj/%.o: %.cc $(HFILES)
@mkdir -p $$(dirname $@)
$(CXX) -o $@ $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc
obj/dbg/%.o: %.cc $(HFILES)
@mkdir -p $$(dirname $@)
$(CXX) -o $@ -fPIC $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) $*.cc
obj/so/%.o: %.cc $(HFILES)
@mkdir -p $$(dirname $@)
$(CXX) -o $@ -fPIC $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc
obj/libre2.a: $(OFILES)
@mkdir -p obj
$(AR) $(ARFLAGS) obj/libre2.a $(OFILES)
obj/dbg/libre2.a: $(DOFILES)
@mkdir -p obj/dbg
$(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES)
obj/so/libre2.so: $(SOFILES)
@mkdir -p obj/so
$(MAKE_SHARED_LIBRARY) -o $@.$(SONAME) $(SOFILES)
ln -sf libre2.so.$(SONAME) $@
# Link one release test binary from its own object, the shared test support
# objects, util/test.o and the static library.
# NOTE(review): objects are listed before obj/libre2.a on the link line,
# presumably so a single-pass linker can resolve them from the archive — confirm.
obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
@mkdir -p obj/test
$(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(LDFLAGS) $(LDPCRE)
# Same as above, using the debug objects and the debug static library.
obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o
@mkdir -p obj/dbg/test
$(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(LDFLAGS) $(LDPCRE)
# Shared-library test binary: links against -lre2 found in obj/so, and also
# passes the release static archive obj/libre2.a after it on the link line.
obj/so/test/%: obj/so/libre2.so obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o
@mkdir -p obj/so/test
$(CXX) -o $@ obj/so/re2/testing/$*.o $(STESTOFILES) obj/so/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE)
# The benchmark binary links util/benchmark.o where the tests link util/test.o.
obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o
@mkdir -p obj/test
$(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o obj/libre2.a $(LDFLAGS) $(LDPCRE)
# Rules that regenerate the table sources from their generator scripts.
# They are only active when REBUILD_TABLES is set to a non-empty value
# (for example `make REBUILD_TABLES=1`).
ifdef REBUILD_TABLES
# perl_groups.cc is the stdout of the Perl generator script.
re2/perl_groups.cc: re2/make_perl_groups.pl
perl $< > $@
# unicode_<name>.cc is the stdout of the matching make_unicode_<name>.py script.
re2/unicode_%.cc: re2/make_unicode_%.py
python $< > $@
endif
# Neither target creates a file of its own name; declaring them phony keeps
# them working even if a file called "clean" or "distclean" appears.
.PHONY: distclean clean

# Everything `clean` removes, plus the table sources that the
# REBUILD_TABLES rules can regenerate.
distclean: clean
	rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc

# Remove all build output under obj/ and any *.pyc files under re2/.
clean:
	rm -rf obj
	rm -f re2/*.pyc
# Build the shared test support objects without linking or running anything.
testofiles: $(TESTOFILES)

# Build the debug, static and shared test binaries, then run all three suites.
test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test

# Each suite prints a banner surrounded by blank lines and hands its binaries
# to ./runtests.
debug-test: $(DTESTS)
	@printf '\nRunning debug binary tests.\n\n'
	@./runtests $(DTESTS)

static-test: $(TESTS)
	@printf '\nRunning static binary tests.\n\n'
	@./runtests $(TESTS)

# The shared suite needs obj/so on the loader path to find libre2.so.
shared-test: $(STESTS)
	@printf '\nRunning dynamic binary tests.\n\n'
	@LD_LIBRARY_PATH=obj/so:$(LD_LIBRARY_PATH) ./runtests $(STESTS)

# The bigtest variants run the regular tests followed by the BIGTESTS
# binaries of the same flavour, without a banner.
debug-bigtest: $(DTESTS) $(DBIGTESTS)
	@./runtests $(DTESTS) $(DBIGTESTS)

static-bigtest: $(TESTS) $(BIGTESTS)
	@./runtests $(TESTS) $(BIGTESTS)

shared-bigtest: $(STESTS) $(SBIGTESTS)
	@LD_LIBRARY_PATH=obj/so:$(LD_LIBRARY_PATH) ./runtests $(STESTS) $(SBIGTESTS)

# Build (but do not run) the benchmark binary.
benchmark: obj/test/regexp_benchmark
# Install the headers, the static library and the shared library under
# $(DESTDIR). The shared library is copied as libre2.so.$(SONAME).0.0; the
# libre2.so.$(SONAME) and libre2.so names are relative symlinks to that file.
install: obj/libre2.a obj/so/libre2.so
mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)
$(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
$(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a
$(INSTALL) obj/so/libre2.so $(DESTDIR)$(libdir)/libre2.so.$(SONAME).0.0
ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so.$(SONAME)
ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so
# Check an installed copy: compile testinstall.cc inside obj/ against the
# headers and library under $(DESTDIR), then run the resulting binary with
# the installed libdir on LD_LIBRARY_PATH.
testinstall:
@mkdir -p obj
cp testinstall.cc obj
(cd obj && $(CXX) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -pthread -o testinstall)
LD_LIBRARY_PATH=$(DESTDIR)$(libdir) obj/testinstall
# Run the PCRE and RE2 benchmarks and append the results to
# benchlog.<short hostname>. Each run starts with a ==BENCHMARK== header
# line, then '# '-prefixed details of the system, compiler, hg revision and
# binary, then a blank line.
benchlog: obj/test/regexp_benchmark
	(echo '==BENCHMARK==' $$(hostname) $$(date); \
	  (uname -a; $(CXX) --version; hg identify; file $<) | sed 's/^/# /'; \
	  echo; \
	  ./$< 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//')
# Keep gmake from deleting intermediate files it creates.
# This makes repeated builds faster and preserves debug info on OS X.
# The shared-library rule's target is obj/so/libre2.so, so that is the name
# listed here (not obj/so/libre2.a, which no rule produces).
.PRECIOUS: obj/%.o obj/dbg/%.o obj/so/%.o obj/libre2.a \
	obj/dbg/libre2.a obj/so/libre2.so \
	obj/test/% obj/so/test/% obj/dbg/test/%
# Rebuild the exhaustive and search tests with -DLOGGING=1 and save every
# output line other than "PASS" to re2-exhaustive.txt and re2-search.txt.
# Sub-makes are invoked through $(MAKE) so that the same make program and
# its flags are used, and the exhaustive targets are spelled out because
# recipes run under /bin/sh, which need not support {a,b} brace expansion.
log:
	$(MAKE) clean
	$(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \
		obj/test/exhaustive_test obj/test/exhaustive1_test \
		obj/test/exhaustive2_test obj/test/exhaustive3_test
	echo '#' RE2 exhaustive tests built by make log >re2-exhaustive.txt
	echo '#' $$(date) >>re2-exhaustive.txt
	obj/test/exhaustive_test |grep -v '^PASS$$' >>re2-exhaustive.txt
	obj/test/exhaustive1_test |grep -v '^PASS$$' >>re2-exhaustive.txt
	obj/test/exhaustive2_test |grep -v '^PASS$$' >>re2-exhaustive.txt
	obj/test/exhaustive3_test |grep -v '^PASS$$' >>re2-exhaustive.txt
	$(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test
	echo '#' RE2 basic search tests built by make $@ >re2-search.txt
	echo '#' $$(date) >>re2-search.txt
	obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt
# Scratch target: build ./x from x.cc against the release static library.
# Uses $(CXX) like every other rule here instead of a hard-coded g++, so a
# compiler chosen on the command line or in the environment is honoured.
x: x.cc obj/libre2.a
	$(CXX) -I. -o x x.cc obj/libre2.a

19
outside/re2/README Normal file
View File

@ -0,0 +1,19 @@
This is the source code repository for RE2, a regular expression library.
For documentation about how to install and use RE2,
visit http://code.google.com/p/re2/.
The short version is:
make
make test
make install
make testinstall
Unless otherwise noted, the RE2 source files are distributed
under the BSD-style license found in the LICENSE file.
RE2's native language is C++.
An Inferno wrapper is at http://code.google.com/p/inferno-re2/.
A Python wrapper is at http://github.com/facebook/pyre2/.
A Ruby wrapper is at http://github.com/axic/rre2/.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,582 @@
hw.ncpu: 2
hw.byteorder: 1234
hw.memsize: 4294967296
hw.activecpu: 2
hw.physicalcpu: 2
hw.physicalcpu_max: 2
hw.logicalcpu: 2
hw.logicalcpu_max: 2
hw.cputype: 7
hw.cpusubtype: 4
hw.cpu64bit_capable: 1
hw.cpufamily: 1114597871
hw.cacheconfig: 2 1 2 0 0 0 0 0 0 0
hw.cachesize: 3221225472 32768 2097152 0 0 0 0 0 0 0
hw.pagesize: 4096
hw.busfrequency: 664000000
hw.busfrequency_min: 664000000
hw.busfrequency_max: 664000000
hw.cpufrequency: 1830000000
hw.cpufrequency_min: 1830000000
hw.cpufrequency_max: 1830000000
hw.cachelinesize: 64
hw.l1icachesize: 32768
hw.l1dcachesize: 32768
hw.l2cachesize: 2097152
hw.tbfrequency: 1000000000
hw.packages: 1
hw.optional.floatingpoint: 1
hw.optional.mmx: 1
hw.optional.sse: 1
hw.optional.sse2: 1
hw.optional.sse3: 1
hw.optional.supplementalsse3: 1
hw.optional.sse4_1: 0
hw.optional.sse4_2: 0
hw.optional.x86_64: 1
hw.machine = i386
hw.model = Macmini2,1
hw.ncpu = 2
hw.byteorder = 1234
hw.physmem = 2147483648
hw.usermem = 1849147392
hw.pagesize = 4096
hw.epoch = 0
hw.vectorunit = 1
hw.busfrequency = 664000000
hw.cpufrequency = 1830000000
hw.cachelinesize = 64
hw.l1icachesize = 32768
hw.l1dcachesize = 32768
hw.l2settings = 1
hw.l2cachesize = 2097152
hw.tbfrequency = 1000000000
hw.memsize = 4294967296
hw.availcpu = 2
machdep.cpu.max_basic: 10
machdep.cpu.max_ext: 2147483656
machdep.cpu.vendor: GenuineIntel
machdep.cpu.brand_string: Intel(R) Core(TM)2 CPU T5600 @ 1.83GHz
machdep.cpu.family: 6
machdep.cpu.model: 15
machdep.cpu.extmodel: 0
machdep.cpu.extfamily: 0
machdep.cpu.stepping: 2
machdep.cpu.feature_bits: 3219913727 58301
machdep.cpu.extfeature_bits: 537921536 1
machdep.cpu.signature: 1778
machdep.cpu.brand: 0
machdep.cpu.features: FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM SSE3 MON DSCPL VMX EST TM2 SSSE3 CX16 TPR PDCM
machdep.cpu.extfeatures: SYSCALL XD EM64T
machdep.cpu.logical_per_package: 2
machdep.cpu.cores_per_package: 2
machdep.cpu.microcode_version: 87
machdep.cpu.mwait.linesize_min: 64
machdep.cpu.mwait.linesize_max: 64
machdep.cpu.mwait.extensions: 3
machdep.cpu.mwait.sub_Cstates: 139808
machdep.cpu.thermal.sensor: 1
machdep.cpu.thermal.dynamic_acceleration: 0
machdep.cpu.thermal.thresholds: 2
machdep.cpu.thermal.ACNT_MCNT: 1
machdep.cpu.arch_perf.version: 2
machdep.cpu.arch_perf.number: 2
machdep.cpu.arch_perf.width: 40
machdep.cpu.arch_perf.events_number: 7
machdep.cpu.arch_perf.events: 0
machdep.cpu.arch_perf.fixed_number: 0
machdep.cpu.arch_perf.fixed_width: 0
machdep.cpu.cache.linesize: 64
machdep.cpu.cache.L2_associativity: 6
machdep.cpu.cache.size: 2048
machdep.cpu.tlb.inst.small: 128
machdep.cpu.tlb.inst.large: 8
machdep.cpu.tlb.data.small: 16
machdep.cpu.tlb.data.small_level1: 256
machdep.cpu.tlb.data.large: 16
machdep.cpu.tlb.data.large_level1: 32
machdep.cpu.address_bits.physical: 36
machdep.cpu.address_bits.virtual: 48
machdep.cpu.core_count: 2
machdep.cpu.thread_count: 2
==BENCHMARK== mini.local Fri Feb 26 16:57:10 PST 2010
# Darwin mini.local 10.2.0 Darwin Kernel Version 10.2.0: Tue Nov 3 10:37:10 PST 2009; root:xnu-1486.2.11~1/RELEASE_I386 i386
# i686-apple-darwin10-g++-4.2.1 (GCC) 4.2.1 (Apple Inc. build 5646) (dot 1)
# Copyright (C) 2007 Free Software Foundation, Inc.
# This is free software; see the source for copying conditions. There is NO
# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# a94585d91e66+ tip
# obj/test/regexp_benchmark: Mach-O 64-bit executable x86_64
Search_Easy0_CachedPCRE/8 10000000 176 ns/op 45.40 MB/s
Search_Easy0_CachedPCRE/16 10000000 209 ns/op 76.41 MB/s
Search_Easy0_CachedPCRE/32 10000000 269 ns/op 118.53 MB/s
Search_Easy0_CachedPCRE/64 5000000 398 ns/op 160.77 MB/s
Search_Easy0_CachedPCRE/128 5000000 536 ns/op 238.69 MB/s
Search_Easy0_CachedPCRE/256 2000000 897 ns/op 285.22 MB/s
Search_Easy0_CachedPCRE/512 1000000 2161 ns/op 236.92 MB/s
Search_Easy0_CachedPCRE/1K 500000 4769 ns/op 214.70 MB/s
Search_Easy0_CachedPCRE/2K 200000 8031 ns/op 255.00 MB/s
Search_Easy0_CachedPCRE/4K 100000 16208 ns/op 252.71 MB/s
Search_Easy0_CachedPCRE/8K 50000 32219 ns/op 254.26 MB/s
Search_Easy0_CachedPCRE/16K 50000 63347 ns/op 258.64 MB/s
Search_Easy0_CachedPCRE/32K 10000 125875 ns/op 260.32 MB/s
Search_Easy0_CachedPCRE/64K 10000 247829 ns/op 264.44 MB/s
Search_Easy0_CachedPCRE/128K 5000 498699 ns/op 262.83 MB/s
Search_Easy0_CachedPCRE/256K 2000 978021 ns/op 268.04 MB/s
Search_Easy0_CachedPCRE/512K 1000 1975059 ns/op 265.45 MB/s
Search_Easy0_CachedPCRE/1M 500 3994258 ns/op 262.52 MB/s
Search_Easy0_CachedPCRE/2M 200 7959640 ns/op 263.47 MB/s
Search_Easy0_CachedPCRE/4M 100 15950300 ns/op 262.96 MB/s
Search_Easy0_CachedPCRE/8M 50 32435540 ns/op 258.62 MB/s
Search_Easy0_CachedPCRE/16M 50 64686180 ns/op 259.36 MB/s
Search_Easy0_CachedRE2/8 5000000 535 ns/op 14.95 MB/s
Search_Easy0_CachedRE2/16 5000000 557 ns/op 28.70 MB/s
Search_Easy0_CachedRE2/32 5000000 595 ns/op 53.75 MB/s
Search_Easy0_CachedRE2/64 5000000 643 ns/op 99.50 MB/s
Search_Easy0_CachedRE2/128 2000000 759 ns/op 168.64 MB/s
Search_Easy0_CachedRE2/256 2000000 972 ns/op 263.30 MB/s
Search_Easy0_CachedRE2/512 1000000 1458 ns/op 351.13 MB/s
Search_Easy0_CachedRE2/1K 1000000 2544 ns/op 402.51 MB/s
Search_Easy0_CachedRE2/2K 500000 4551 ns/op 449.99 MB/s
Search_Easy0_CachedRE2/4K 200000 8677 ns/op 472.01 MB/s
Search_Easy0_CachedRE2/8K 100000 17188 ns/op 476.59 MB/s
Search_Easy0_CachedRE2/16K 50000 33869 ns/op 483.73 MB/s
Search_Easy0_CachedRE2/32K 50000 67787 ns/op 483.39 MB/s
Search_Easy0_CachedRE2/64K 10000 133362 ns/op 491.41 MB/s
Search_Easy0_CachedRE2/128K 10000 266469 ns/op 491.88 MB/s
Search_Easy0_CachedRE2/256K 5000 536980 ns/op 488.18 MB/s
Search_Easy0_CachedRE2/512K 2000 1050843 ns/op 498.92 MB/s
Search_Easy0_CachedRE2/1M 1000 2120649 ns/op 494.46 MB/s
Search_Easy0_CachedRE2/2M 500 4273918 ns/op 490.69 MB/s
Search_Easy0_CachedRE2/4M 200 8591285 ns/op 488.20 MB/s
Search_Easy0_CachedRE2/8M 100 17197390 ns/op 487.78 MB/s
Search_Easy0_CachedRE2/16M 50 34338780 ns/op 488.58 MB/s
Search_Easy1_CachedPCRE/8 10000000 174 ns/op 45.74 MB/s
Search_Easy1_CachedPCRE/16 10000000 206 ns/op 77.31 MB/s
Search_Easy1_CachedPCRE/32 10000000 270 ns/op 118.43 MB/s
Search_Easy1_CachedPCRE/64 5000000 402 ns/op 159.07 MB/s
Search_Easy1_CachedPCRE/128 5000000 540 ns/op 236.84 MB/s
Search_Easy1_CachedPCRE/256 2000000 909 ns/op 281.34 MB/s
Search_Easy1_CachedPCRE/512 1000000 1852 ns/op 276.34 MB/s
Search_Easy1_CachedPCRE/1K 500000 4318 ns/op 237.12 MB/s
Search_Easy1_CachedPCRE/2K 200000 8346 ns/op 245.37 MB/s
Search_Easy1_CachedPCRE/4K 100000 16214 ns/op 252.62 MB/s
Search_Easy1_CachedPCRE/8K 50000 32438 ns/op 252.54 MB/s
Search_Easy1_CachedPCRE/16K 50000 62914 ns/op 260.42 MB/s
Search_Easy1_CachedPCRE/32K 10000 124792 ns/op 262.58 MB/s
Search_Easy1_CachedPCRE/64K 10000 250941 ns/op 261.16 MB/s
Search_Easy1_CachedPCRE/128K 5000 498405 ns/op 262.98 MB/s
Search_Easy1_CachedPCRE/256K 2000 997305 ns/op 262.85 MB/s
Search_Easy1_CachedPCRE/512K 1000 2023179 ns/op 259.14 MB/s
Search_Easy1_CachedPCRE/1M 500 4005202 ns/op 261.80 MB/s
Search_Easy1_CachedPCRE/2M 200 8116410 ns/op 258.38 MB/s
Search_Easy1_CachedPCRE/4M 100 16145970 ns/op 259.77 MB/s
Search_Easy1_CachedPCRE/8M 50 32471260 ns/op 258.34 MB/s
Search_Easy1_CachedPCRE/16M 50 64734020 ns/op 259.17 MB/s
Search_Easy1_CachedRE2/8 5000000 543 ns/op 14.72 MB/s
Search_Easy1_CachedRE2/16 5000000 570 ns/op 28.07 MB/s
Search_Easy1_CachedRE2/32 5000000 605 ns/op 52.81 MB/s
Search_Easy1_CachedRE2/64 5000000 643 ns/op 99.39 MB/s
Search_Easy1_CachedRE2/128 2000000 764 ns/op 167.45 MB/s
Search_Easy1_CachedRE2/256 2000000 970 ns/op 263.85 MB/s
Search_Easy1_CachedRE2/512 1000000 1455 ns/op 351.75 MB/s
Search_Easy1_CachedRE2/1K 1000000 2506 ns/op 408.48 MB/s
Search_Easy1_CachedRE2/2K 500000 4571 ns/op 447.97 MB/s
Search_Easy1_CachedRE2/4K 200000 8812 ns/op 464.81 MB/s
Search_Easy1_CachedRE2/8K 100000 17079 ns/op 479.65 MB/s
Search_Easy1_CachedRE2/16K 50000 33802 ns/op 484.70 MB/s
Search_Easy1_CachedRE2/32K 50000 67171 ns/op 487.83 MB/s
Search_Easy1_CachedRE2/64K 10000 131505 ns/op 498.35 MB/s
Search_Easy1_CachedRE2/128K 10000 263228 ns/op 497.94 MB/s
Search_Easy1_CachedRE2/256K 5000 528135 ns/op 496.36 MB/s
Search_Easy1_CachedRE2/512K 2000 1052768 ns/op 498.01 MB/s
Search_Easy1_CachedRE2/1M 1000 2112714 ns/op 496.32 MB/s
Search_Easy1_CachedRE2/2M 500 4289478 ns/op 488.91 MB/s
Search_Easy1_CachedRE2/4M 200 8519430 ns/op 492.32 MB/s
Search_Easy1_CachedRE2/8M 100 17002860 ns/op 493.36 MB/s
Search_Easy1_CachedRE2/16M 50 34341100 ns/op 488.55 MB/s
Search_Medium_CachedPCRE/8 10000000 175 ns/op 45.48 MB/s
Search_Medium_CachedPCRE/16 10000000 206 ns/op 77.31 MB/s
Search_Medium_CachedPCRE/32 10000000 273 ns/op 117.10 MB/s
Search_Medium_CachedPCRE/64 5000000 427 ns/op 149.60 MB/s
Search_Medium_CachedPCRE/128 200000 9382 ns/op 13.64 MB/s
Search_Medium_CachedPCRE/256 100000 15339 ns/op 16.69 MB/s
Search_Medium_CachedPCRE/512 50000 35837 ns/op 14.29 MB/s
Search_Medium_CachedPCRE/1K 50000 71109 ns/op 14.40 MB/s
Search_Medium_CachedPCRE/2K 10000 111371 ns/op 18.39 MB/s
Search_Medium_CachedPCRE/4K 10000 264964 ns/op 15.46 MB/s
Search_Medium_CachedPCRE/8K 5000 554964 ns/op 14.76 MB/s
Search_Medium_CachedPCRE/16K 2000 1122116 ns/op 14.60 MB/s
Search_Medium_CachedPCRE/32K 1000 2305129 ns/op 14.22 MB/s
Search_Medium_CachedPCRE/64K 500 4401888 ns/op 14.89 MB/s
Search_Medium_CachedPCRE/128K 200 8591800 ns/op 15.26 MB/s
Search_Medium_CachedPCRE/256K 100 17534580 ns/op 14.95 MB/s
Search_Medium_CachedRE2/8 5000000 531 ns/op 15.06 MB/s
Search_Medium_CachedRE2/16 5000000 579 ns/op 27.60 MB/s
Search_Medium_CachedRE2/32 5000000 666 ns/op 47.99 MB/s
Search_Medium_CachedRE2/64 2000000 817 ns/op 78.31 MB/s
Search_Medium_CachedRE2/128 1000000 1174 ns/op 108.94 MB/s
Search_Medium_CachedRE2/256 1000000 1824 ns/op 140.30 MB/s
Search_Medium_CachedRE2/512 500000 3097 ns/op 165.29 MB/s
Search_Medium_CachedRE2/1K 500000 6101 ns/op 167.84 MB/s
Search_Medium_CachedRE2/2K 200000 12024 ns/op 170.32 MB/s
Search_Medium_CachedRE2/4K 100000 21483 ns/op 190.66 MB/s
Search_Medium_CachedRE2/8K 50000 41321 ns/op 198.25 MB/s
Search_Medium_CachedRE2/16K 20000 82227 ns/op 199.25 MB/s
Search_Medium_CachedRE2/32K 10000 166314 ns/op 197.02 MB/s
Search_Medium_CachedRE2/64K 5000 334190 ns/op 196.10 MB/s
Search_Medium_CachedRE2/128K 5000 672222 ns/op 194.98 MB/s
Search_Medium_CachedRE2/256K 2000 1335691 ns/op 196.26 MB/s
Search_Medium_CachedRE2/512K 1000 2650973 ns/op 197.77 MB/s
Search_Medium_CachedRE2/1M 500 5401168 ns/op 194.14 MB/s
Search_Medium_CachedRE2/2M 100 10724160 ns/op 195.55 MB/s
Search_Medium_CachedRE2/4M 100 21647840 ns/op 193.75 MB/s
Search_Medium_CachedRE2/8M 50 43369000 ns/op 193.42 MB/s
Search_Medium_CachedRE2/16M 20 85095750 ns/op 197.16 MB/s
Search_Hard_CachedPCRE/8 10000000 178 ns/op 44.77 MB/s
Search_Hard_CachedPCRE/16 10000000 211 ns/op 75.54 MB/s
Search_Hard_CachedPCRE/32 10000000 274 ns/op 116.75 MB/s
Search_Hard_CachedPCRE/64 5000000 401 ns/op 159.58 MB/s
Search_Hard_CachedPCRE/128 5000 331833 ns/op 0.39 MB/s
Search_Hard_CachedPCRE/256 2000 1299658 ns/op 0.20 MB/s
Search_Hard_CachedPCRE/512 500 5361070 ns/op 0.10 MB/s
Search_Hard_CachedPCRE/1K 100 20744900 ns/op 0.05 MB/s
Search_Hard_CachedPCRE/2K 20 78382950 ns/op 0.03 MB/s
Search_Hard_CachedPCRE/4K 5 335826800 ns/op 0.01 MB/s
Search_Hard_CachedRE2/8 5000000 550 ns/op 14.53 MB/s
Search_Hard_CachedRE2/16 5000000 600 ns/op 26.66 MB/s
Search_Hard_CachedRE2/32 5000000 683 ns/op 46.80 MB/s
Search_Hard_CachedRE2/64 2000000 834 ns/op 76.69 MB/s
Search_Hard_CachedRE2/128 1000000 1168 ns/op 109.57 MB/s
Search_Hard_CachedRE2/256 1000000 1833 ns/op 139.65 MB/s
Search_Hard_CachedRE2/512 500000 3069 ns/op 166.81 MB/s
Search_Hard_CachedRE2/1K 500000 5780 ns/op 177.14 MB/s
Search_Hard_CachedRE2/2K 200000 11060 ns/op 185.17 MB/s
Search_Hard_CachedRE2/4K 100000 21511 ns/op 190.41 MB/s
Search_Hard_CachedRE2/8K 50000 41962 ns/op 195.22 MB/s
Search_Hard_CachedRE2/16K 20000 82460 ns/op 198.69 MB/s
Search_Hard_CachedRE2/32K 10000 164209 ns/op 199.55 MB/s
Search_Hard_CachedRE2/64K 5000 326354 ns/op 200.81 MB/s
Search_Hard_CachedRE2/128K 5000 659142 ns/op 198.85 MB/s
Search_Hard_CachedRE2/256K 2000 1333642 ns/op 196.56 MB/s
Search_Hard_CachedRE2/512K 1000 2687422 ns/op 195.09 MB/s
Search_Hard_CachedRE2/1M 500 5351592 ns/op 195.94 MB/s
Search_Hard_CachedRE2/2M 100 10581690 ns/op 198.19 MB/s
Search_Hard_CachedRE2/4M 100 21324320 ns/op 196.69 MB/s
Search_Hard_CachedRE2/8M 50 41892520 ns/op 200.24 MB/s
Search_Hard_CachedRE2/16M 20 85475700 ns/op 196.28 MB/s
Search_Parens_CachedPCRE/8 10000000 298 ns/op 26.80 MB/s
Search_Parens_CachedRE2/8 5000000 562 ns/op 14.21 MB/s
Search_Parens_CachedRE2/16 5000000 598 ns/op 26.71 MB/s
Search_Parens_CachedRE2/32 5000000 676 ns/op 47.27 MB/s
Search_Parens_CachedRE2/64 2000000 828 ns/op 77.21 MB/s
Search_Parens_CachedRE2/128 1000000 1155 ns/op 110.73 MB/s
Search_Parens_CachedRE2/256 1000000 1788 ns/op 143.13 MB/s
Search_Parens_CachedRE2/512 500000 3064 ns/op 167.09 MB/s
Search_Parens_CachedRE2/1K 500000 5698 ns/op 179.69 MB/s
Search_Parens_CachedRE2/2K 200000 10961 ns/op 186.84 MB/s
Search_Parens_CachedRE2/4K 100000 21527 ns/op 190.27 MB/s
Search_Parens_CachedRE2/8K 50000 41923 ns/op 195.40 MB/s
Search_Parens_CachedRE2/16K 20000 85505 ns/op 191.61 MB/s
Search_Parens_CachedRE2/32K 10000 164437 ns/op 199.27 MB/s
Search_Parens_CachedRE2/64K 5000 332654 ns/op 197.01 MB/s
Search_Parens_CachedRE2/128K 5000 677745 ns/op 193.39 MB/s
Search_Parens_CachedRE2/256K 2000 1331012 ns/op 196.95 MB/s
Search_Parens_CachedRE2/512K 1000 2692594 ns/op 194.71 MB/s
Search_Parens_CachedRE2/1M 500 5355880 ns/op 195.78 MB/s
Search_Parens_CachedRE2/2M 100 10822340 ns/op 193.78 MB/s
Search_Parens_CachedRE2/4M 100 21464430 ns/op 195.41 MB/s
Search_Parens_CachedRE2/8M 50 42875940 ns/op 195.65 MB/s
Search_Parens_CachedRE2/16M 20 84654300 ns/op 198.19 MB/s
Search_BigFixed_CachedPCRE/8 5000000 360 ns/op 22.21 MB/s
Search_BigFixed_CachedPCRE/16 5000000 442 ns/op 36.15 MB/s
Search_BigFixed_CachedPCRE/32 5000000 606 ns/op 52.73 MB/s
Search_BigFixed_CachedPCRE/64 2000000 935 ns/op 68.39 MB/s
Search_BigFixed_CachedPCRE/128 1000000 1525 ns/op 83.91 MB/s
Search_BigFixed_CachedPCRE/256 1000000 2718 ns/op 94.18 MB/s
Search_BigFixed_CachedPCRE/512 500000 5020 ns/op 101.98 MB/s
Search_BigFixed_CachedPCRE/1K 200000 9761 ns/op 104.90 MB/s
Search_BigFixed_CachedPCRE/2K 100000 19275 ns/op 106.25 MB/s
Search_BigFixed_CachedPCRE/4K 50000 38488 ns/op 106.42 MB/s
Search_BigFixed_CachedPCRE/8K 20000 76229 ns/op 107.46 MB/s
Search_BigFixed_CachedPCRE/16K 10000 155350 ns/op 105.46 MB/s
Search_BigFixed_CachedPCRE/32K 5000 309242 ns/op 105.96 MB/s
Search_BigFixed_CachedRE2/8 10000000 194 ns/op 41.03 MB/s
Search_BigFixed_CachedRE2/16 5000000 589 ns/op 27.15 MB/s
Search_BigFixed_CachedRE2/32 5000000 655 ns/op 48.83 MB/s
Search_BigFixed_CachedRE2/64 5000000 715 ns/op 89.46 MB/s
Search_BigFixed_CachedRE2/128 2000000 882 ns/op 145.09 MB/s
Search_BigFixed_CachedRE2/256 1000000 1293 ns/op 197.97 MB/s
Search_BigFixed_CachedRE2/512 1000000 1924 ns/op 266.06 MB/s
Search_BigFixed_CachedRE2/1K 500000 3294 ns/op 310.79 MB/s
Search_BigFixed_CachedRE2/2K 500000 6057 ns/op 338.10 MB/s
Search_BigFixed_CachedRE2/4K 200000 11475 ns/op 356.93 MB/s
Search_BigFixed_CachedRE2/8K 100000 22395 ns/op 365.79 MB/s
Search_BigFixed_CachedRE2/16K 50000 44333 ns/op 369.56 MB/s
Search_BigFixed_CachedRE2/32K 20000 88061 ns/op 372.11 MB/s
Search_BigFixed_CachedRE2/64K 10000 173649 ns/op 377.40 MB/s
Search_BigFixed_CachedRE2/128K 5000 347251 ns/op 377.46 MB/s
Search_BigFixed_CachedRE2/256K 2000 702561 ns/op 373.13 MB/s
Search_BigFixed_CachedRE2/512K 1000 1408041 ns/op 372.35 MB/s
Search_BigFixed_CachedRE2/1M 500 3003070 ns/op 349.17 MB/s
Search_Success_PCRE/8 500000 3891 ns/op 2.06 MB/s
Search_Success_PCRE/16 500000 3865 ns/op 4.14 MB/s
Search_Success_PCRE/32 500000 3861 ns/op 8.29 MB/s
Search_Success_PCRE/64 500000 3921 ns/op 16.32 MB/s
Search_Success_PCRE/128 500000 4677 ns/op 27.37 MB/s
Search_Success_PCRE/256 500000 5362 ns/op 47.73 MB/s
Search_Success_PCRE/512 500000 7125 ns/op 71.85 MB/s
Search_Success_PCRE/1K 200000 10643 ns/op 96.21 MB/s
Search_Success_PCRE/2K 100000 17620 ns/op 116.23 MB/s
Search_Success_PCRE/4K 50000 31657 ns/op 129.39 MB/s
Search_Success_PCRE/8K 50000 59290 ns/op 138.17 MB/s
Search_Success_PCRE/16K 10000 115346 ns/op 142.04 MB/s
Search_Success_PCRE/32K 10000 225258 ns/op 145.47 MB/s
Search_Success_PCRE/64K 5000 452994 ns/op 144.67 MB/s
Search_Success_PCRE/128K 2000 904745 ns/op 144.87 MB/s
Search_Success_PCRE/256K 1000 1786683 ns/op 146.72 MB/s
Search_Success_PCRE/512K 500 3600316 ns/op 145.62 MB/s
Search_Success_PCRE/1M 200 7413055 ns/op 141.45 MB/s
Search_Success_PCRE/2M 100 15261930 ns/op 137.41 MB/s
Search_Success_PCRE/4M 50 32827960 ns/op 127.77 MB/s
Search_Success_PCRE/8M 20 73886450 ns/op 113.53 MB/s
Search_Success_PCRE/16M 5 247881200 ns/op 67.68 MB/s
Search_Success_RE2/8 100000 18948 ns/op 0.42 MB/s
Search_Success_RE2/16 50000 40076 ns/op 0.40 MB/s
Search_Success_RE2/32 50000 40543 ns/op 0.79 MB/s
Search_Success_RE2/64 50000 40520 ns/op 1.58 MB/s
Search_Success_RE2/128 50000 41222 ns/op 3.11 MB/s
Search_Success_RE2/256 50000 41361 ns/op 6.19 MB/s
Search_Success_RE2/512 50000 42418 ns/op 12.07 MB/s
Search_Success_RE2/1K 50000 45239 ns/op 22.64 MB/s
Search_Success_RE2/2K 50000 50568 ns/op 40.50 MB/s
Search_Success_RE2/4K 50000 60722 ns/op 67.45 MB/s
Search_Success_RE2/8K 20000 82046 ns/op 99.85 MB/s
Search_Success_RE2/16K 10000 125412 ns/op 130.64 MB/s
Search_Success_RE2/32K 10000 211805 ns/op 154.71 MB/s
Search_Success_RE2/64K 5000 373132 ns/op 175.64 MB/s
Search_Success_RE2/128K 2000 710166 ns/op 184.57 MB/s
Search_Success_RE2/256K 2000 1392231 ns/op 188.29 MB/s
Search_Success_RE2/512K 1000 2763051 ns/op 189.75 MB/s
Search_Success_RE2/1M 500 5547628 ns/op 189.01 MB/s
Search_Success_RE2/2M 100 11709090 ns/op 179.10 MB/s
Search_Success_RE2/4M 50 25220160 ns/op 166.31 MB/s
Search_Success_RE2/8M 20 59411600 ns/op 141.19 MB/s
Search_Success_RE2/16M 5 219468600 ns/op 76.44 MB/s
Search_Success_CachedPCRE/8 5000000 328 ns/op 24.35 MB/s
Search_Success_CachedPCRE/16 5000000 389 ns/op 41.06 MB/s
Search_Success_CachedPCRE/32 5000000 507 ns/op 63.11 MB/s
Search_Success_CachedPCRE/64 2000000 754 ns/op 84.80 MB/s
Search_Success_CachedPCRE/128 1000000 1164 ns/op 109.89 MB/s
Search_Success_CachedPCRE/256 1000000 2051 ns/op 124.81 MB/s
Search_Success_CachedPCRE/512 500000 3831 ns/op 133.64 MB/s
Search_Success_CachedPCRE/1K 500000 7280 ns/op 140.66 MB/s
Search_Success_CachedPCRE/2K 200000 14254 ns/op 143.67 MB/s
Search_Success_CachedPCRE/4K 100000 28223 ns/op 145.13 MB/s
Search_Success_CachedPCRE/8K 50000 55445 ns/op 147.75 MB/s
Search_Success_CachedPCRE/16K 10000 112739 ns/op 145.33 MB/s
Search_Success_CachedPCRE/32K 10000 219943 ns/op 148.98 MB/s
Search_Success_CachedPCRE/64K 5000 440884 ns/op 148.65 MB/s
Search_Success_CachedPCRE/128K 2000 898950 ns/op 145.81 MB/s
Search_Success_CachedPCRE/256K 1000 1775905 ns/op 147.61 MB/s
Search_Success_CachedPCRE/512K 500 3579178 ns/op 146.48 MB/s
Search_Success_CachedPCRE/1M 200 7278075 ns/op 144.07 MB/s
Search_Success_CachedPCRE/2M 100 14954670 ns/op 140.23 MB/s
Search_Success_CachedPCRE/4M 50 31865060 ns/op 131.63 MB/s
Search_Success_CachedPCRE/8M 20 73977900 ns/op 113.39 MB/s
Search_Success_CachedPCRE/16M 5 250587400 ns/op 66.95 MB/s
Search_Success_CachedRE2/8 10000000 206 ns/op 38.80 MB/s
Search_Success_CachedRE2/16 5000000 598 ns/op 26.75 MB/s
Search_Success_CachedRE2/32 5000000 675 ns/op 47.39 MB/s
Search_Success_CachedRE2/64 2000000 847 ns/op 75.52 MB/s
Search_Success_CachedRE2/128 1000000 1211 ns/op 105.65 MB/s
Search_Success_CachedRE2/256 1000000 1886 ns/op 135.73 MB/s
Search_Success_CachedRE2/512 500000 3123 ns/op 163.90 MB/s
Search_Success_CachedRE2/1K 500000 5754 ns/op 177.94 MB/s
Search_Success_CachedRE2/2K 200000 10929 ns/op 187.38 MB/s
Search_Success_CachedRE2/4K 100000 20887 ns/op 196.10 MB/s
Search_Success_CachedRE2/8K 50000 41295 ns/op 198.37 MB/s
Search_Success_CachedRE2/16K 20000 82338 ns/op 198.98 MB/s
Search_Success_CachedRE2/32K 10000 168893 ns/op 194.02 MB/s
Search_Success_CachedRE2/64K 5000 337449 ns/op 194.21 MB/s
Search_Success_CachedRE2/128K 5000 670247 ns/op 195.56 MB/s
Search_Success_CachedRE2/256K 2000 1342666 ns/op 195.24 MB/s
Search_Success_CachedRE2/512K 1000 2711677 ns/op 193.34 MB/s
Search_Success_CachedRE2/1M 500 5403052 ns/op 194.07 MB/s
Search_Success_CachedRE2/2M 100 11697250 ns/op 179.29 MB/s
Search_Success_CachedRE2/4M 50 24796680 ns/op 169.15 MB/s
Search_Success_CachedRE2/8M 20 59587450 ns/op 140.78 MB/s
Search_Success_CachedRE2/16M 5 225415400 ns/op 74.43 MB/s
Search_Success1_PCRE/8 500000 4063 ns/op 1.97 MB/s
Search_Success1_PCRE/16 500000 4104 ns/op 3.90 MB/s
Search_Success1_PCRE/32 500000 4162 ns/op 7.69 MB/s
Search_Success1_PCRE/64 500000 4284 ns/op 14.94 MB/s
Search_Success1_PCRE/128 500000 4857 ns/op 26.35 MB/s
Search_Success1_PCRE/256 500000 5507 ns/op 46.48 MB/s
Search_Success1_PCRE/512 500000 7203 ns/op 71.08 MB/s
Search_Success1_PCRE/1K 200000 10470 ns/op 97.80 MB/s
Search_Success1_PCRE/2K 100000 17455 ns/op 117.33 MB/s
Search_Success1_PCRE/4K 50000 31564 ns/op 129.77 MB/s
Search_Success1_PCRE/8K 50000 59112 ns/op 138.58 MB/s
Search_Success1_PCRE/16K 10000 115903 ns/op 141.36 MB/s
Search_Success1_PCRE/32K 10000 223311 ns/op 146.74 MB/s
Search_Success1_PCRE/64K 5000 447509 ns/op 146.45 MB/s
Search_Success1_PCRE/128K 2000 874543 ns/op 149.87 MB/s
Search_Success1_PCRE/256K 1000 1836342 ns/op 142.75 MB/s
Search_Success1_PCRE/512K 500 3636250 ns/op 144.18 MB/s
Search_Success1_PCRE/1M 200 7256345 ns/op 144.50 MB/s
Search_Success1_PCRE/2M 100 15093450 ns/op 138.94 MB/s
Search_Success1_PCRE/4M 50 32167920 ns/op 130.39 MB/s
Search_Success1_PCRE/8M 20 74735800 ns/op 112.24 MB/s
Search_Success1_PCRE/16M 5 252818600 ns/op 66.36 MB/s
Search_Success1_RE2/8 50000 51778 ns/op 0.15 MB/s
Search_Success1_RE2/16 50000 50754 ns/op 0.32 MB/s
Search_Success1_RE2/32 50000 51127 ns/op 0.63 MB/s
Search_Success1_RE2/64 50000 51305 ns/op 1.25 MB/s
Search_Success1_RE2/128 50000 51580 ns/op 2.48 MB/s
Search_Success1_RE2/256 50000 52019 ns/op 4.92 MB/s
Search_Success1_RE2/512 50000 53145 ns/op 9.63 MB/s
Search_Success1_RE2/1K 50000 55871 ns/op 18.33 MB/s
Search_Success1_RE2/2K 50000 61477 ns/op 33.31 MB/s
Search_Success1_RE2/4K 50000 71875 ns/op 56.99 MB/s
Search_Success1_RE2/8K 20000 94822 ns/op 86.39 MB/s
Search_Success1_RE2/16K 10000 137021 ns/op 119.57 MB/s
Search_Success1_RE2/32K 10000 220596 ns/op 148.54 MB/s
Search_Success1_RE2/64K 5000 377808 ns/op 173.46 MB/s
Search_Success1_RE2/128K 5000 707546 ns/op 185.25 MB/s
Search_Success1_RE2/256K 2000 1367308 ns/op 191.72 MB/s
Search_Success1_RE2/512K 1000 2729291 ns/op 192.10 MB/s
Search_Success1_RE2/1M 500 5439634 ns/op 192.77 MB/s
Search_Success1_RE2/2M 100 11626860 ns/op 180.37 MB/s
Search_Success1_RE2/4M 50 24603160 ns/op 170.48 MB/s
Search_Success1_RE2/8M 20 59001300 ns/op 142.18 MB/s
Search_Success1_RE2/16M 5 219520200 ns/op 76.43 MB/s
Search_Success1_Cached_PCRE/8 5000000 373 ns/op 21.41 MB/s
Search_Success1_Cached_PCRE/16 5000000 437 ns/op 36.61 MB/s
Search_Success1_Cached_PCRE/32 5000000 543 ns/op 58.84 MB/s
Search_Success1_Cached_PCRE/64 2000000 784 ns/op 81.60 MB/s
Search_Success1_Cached_PCRE/128 1000000 1193 ns/op 107.29 MB/s
Search_Success1_Cached_PCRE/256 1000000 2044 ns/op 125.23 MB/s
Search_Success1_Cached_PCRE/512 500000 3734 ns/op 137.10 MB/s
Search_Success1_Cached_PCRE/1K 500000 7121 ns/op 143.78 MB/s
Search_Success1_Cached_PCRE/2K 200000 13767 ns/op 148.76 MB/s
Search_Success1_Cached_PCRE/4K 100000 27176 ns/op 150.72 MB/s
Search_Success1_Cached_PCRE/8K 50000 54155 ns/op 151.27 MB/s
Search_Success1_Cached_PCRE/16K 10000 109309 ns/op 149.89 MB/s
Search_Success1_Cached_PCRE/32K 10000 215890 ns/op 151.78 MB/s
Search_Success1_Cached_PCRE/64K 5000 432550 ns/op 151.51 MB/s
Search_Success1_Cached_PCRE/128K 2000 870568 ns/op 150.56 MB/s
Search_Success1_Cached_PCRE/256K 1000 1756215 ns/op 149.27 MB/s
Search_Success1_Cached_PCRE/512K 500 3671994 ns/op 142.78 MB/s
Search_Success1_Cached_PCRE/1M 200 7134810 ns/op 146.97 MB/s
Search_Success1_Cached_PCRE/2M 100 14672580 ns/op 142.93 MB/s
Search_Success1_Cached_PCRE/4M 50 31146040 ns/op 134.67 MB/s
Search_Success1_Cached_PCRE/8M 20 72224500 ns/op 116.15 MB/s
Search_Success1_Cached_PCRE/16M 5 243683800 ns/op 68.85 MB/s
Search_Success1_Cached_RE2/8 5000000 544 ns/op 14.69 MB/s
Search_Success1_Cached_RE2/16 5000000 583 ns/op 27.43 MB/s
Search_Success1_Cached_RE2/32 5000000 661 ns/op 48.37 MB/s
Search_Success1_Cached_RE2/64 2000000 818 ns/op 78.23 MB/s
Search_Success1_Cached_RE2/128 1000000 1148 ns/op 111.40 MB/s
Search_Success1_Cached_RE2/256 1000000 1778 ns/op 143.95 MB/s
Search_Success1_Cached_RE2/512 500000 3036 ns/op 168.64 MB/s
Search_Success1_Cached_RE2/1K 500000 5549 ns/op 184.53 MB/s
Search_Success1_Cached_RE2/2K 200000 10580 ns/op 193.56 MB/s
Search_Success1_Cached_RE2/4K 100000 20645 ns/op 198.39 MB/s
Search_Success1_Cached_RE2/8K 50000 40775 ns/op 200.90 MB/s
Search_Success1_Cached_RE2/16K 20000 81030 ns/op 202.20 MB/s
Search_Success1_Cached_RE2/32K 10000 162338 ns/op 201.85 MB/s
Search_Success1_Cached_RE2/64K 5000 324387 ns/op 202.03 MB/s
Search_Success1_Cached_RE2/128K 5000 648468 ns/op 202.13 MB/s
Search_Success1_Cached_RE2/256K 2000 1299439 ns/op 201.74 MB/s
Search_Success1_Cached_RE2/512K 1000 2608958 ns/op 200.96 MB/s
Search_Success1_Cached_RE2/1M 500 5263964 ns/op 199.20 MB/s
Search_Success1_Cached_RE2/2M 200 10793175 ns/op 194.30 MB/s
Search_Success1_Cached_RE2/4M 50 24138120 ns/op 173.76 MB/s
Search_Success1_Cached_RE2/8M 20 58223300 ns/op 144.08 MB/s
Search_Success1_Cached_RE2/16M 5 215741400 ns/op 77.77 MB/s
Search_Digits_PCRE 500000 7534 ns/op
Search_Digits_RE2 50000 44162 ns/op
Parse_Digits_PCRE 200000 7664 ns/op
Parse_Digits_RE2 100000 22595 ns/op
Parse_CachedDigits_PCRE 5000000 721 ns/op
Parse_CachedDigits_RE2 5000000 413 ns/op
Parse_DigitDs_PCRE 500000 7095 ns/op
Parse_DigitDs_RE2 100000 22259 ns/op
Parse_CachedDigitDs_PCRE 5000000 704 ns/op
Parse_CachedDigitDs_RE2 5000000 415 ns/op
Parse_Split_PCRE 500000 5540 ns/op
Parse_Split_RE2 100000 23817 ns/op
Parse_CachedSplit_PCRE 5000000 490 ns/op
Parse_CachedSplit_RE2 10000000 251 ns/op
Parse_SplitHard_PCRE 500000 5410 ns/op
Parse_SplitHard_RE2 100000 28518 ns/op
Parse_CachedSplitHard_PCRE 5000000 488 ns/op
Parse_CachedSplitHard_RE2 1000000 2489 ns/op
Parse_CachedSplitBig1_PCRE 500 7171752 ns/op
Parse_CachedSplitBig1_RE2 2000 990722 ns/op
Parse_CachedSplitBig2_PCRE 5000 658331 ns/op
Parse_CachedSplitBig2_RE2 20 81205250 ns/op
BM_PCRE_Compile 500000 6443 ns/op
BM_RE2_Compile 100000 24103 ns/op
SearchPhone_CachedPCRE/8 1000000 2010 ns/op 3.98 MB/s
SearchPhone_CachedPCRE/16 500000 3286 ns/op 4.87 MB/s
SearchPhone_CachedPCRE/32 500000 5953 ns/op 5.37 MB/s
SearchPhone_CachedPCRE/64 200000 11181 ns/op 5.72 MB/s
SearchPhone_CachedPCRE/128 100000 21634 ns/op 5.92 MB/s
SearchPhone_CachedPCRE/256 50000 42315 ns/op 6.05 MB/s
SearchPhone_CachedPCRE/512 20000 83969 ns/op 6.10 MB/s
SearchPhone_CachedPCRE/1K 10000 166005 ns/op 6.17 MB/s
SearchPhone_CachedPCRE/2K 5000 327433 ns/op 6.25 MB/s
SearchPhone_CachedPCRE/4K 5000 654794 ns/op 6.26 MB/s
SearchPhone_CachedPCRE/8K 2000 1302747 ns/op 6.29 MB/s
SearchPhone_CachedPCRE/16K 1000 2601137 ns/op 6.30 MB/s
SearchPhone_CachedPCRE/32K 500 5170166 ns/op 6.34 MB/s
SearchPhone_CachedPCRE/64K 100 10378910 ns/op 6.31 MB/s
SearchPhone_CachedPCRE/128K 100 20783360 ns/op 6.31 MB/s
SearchPhone_CachedPCRE/256K 50 41632940 ns/op 6.30 MB/s
SearchPhone_CachedPCRE/512K 20 83663300 ns/op 6.27 MB/s
SearchPhone_CachedPCRE/1M 10 167093400 ns/op 6.28 MB/s
SearchPhone_CachedPCRE/2M 5 335078800 ns/op 6.26 MB/s
SearchPhone_CachedPCRE/4M 5 673405400 ns/op 6.23 MB/s
SearchPhone_CachedPCRE/8M 1 1335761000 ns/op 6.28 MB/s
SearchPhone_CachedPCRE/16M 1 2682908000 ns/op 6.25 MB/s
SearchPhone_CachedRE2/8 1000000 1470 ns/op 5.44 MB/s
SearchPhone_CachedRE2/16 1000000 1496 ns/op 10.69 MB/s
SearchPhone_CachedRE2/32 1000000 1570 ns/op 20.38 MB/s
SearchPhone_CachedRE2/64 1000000 1770 ns/op 36.15 MB/s
SearchPhone_CachedRE2/128 1000000 2082 ns/op 61.46 MB/s
SearchPhone_CachedRE2/256 1000000 2701 ns/op 94.78 MB/s
SearchPhone_CachedRE2/512 500000 3963 ns/op 129.19 MB/s
SearchPhone_CachedRE2/1K 500000 6487 ns/op 157.85 MB/s
SearchPhone_CachedRE2/2K 200000 11527 ns/op 177.67 MB/s
SearchPhone_CachedRE2/4K 100000 21579 ns/op 189.81 MB/s
SearchPhone_CachedRE2/8K 50000 41804 ns/op 195.96 MB/s
SearchPhone_CachedRE2/16K 20000 82228 ns/op 199.25 MB/s
SearchPhone_CachedRE2/32K 10000 163444 ns/op 200.48 MB/s
SearchPhone_CachedRE2/64K 5000 325307 ns/op 201.46 MB/s
SearchPhone_CachedRE2/128K 5000 648559 ns/op 202.10 MB/s
SearchPhone_CachedRE2/256K 2000 1295574 ns/op 202.34 MB/s
SearchPhone_CachedRE2/512K 1000 2591267 ns/op 202.33 MB/s
SearchPhone_CachedRE2/1M 500 5178738 ns/op 202.48 MB/s
SearchPhone_CachedRE2/2M 100 10389680 ns/op 201.85 MB/s
SearchPhone_CachedRE2/4M 100 20851510 ns/op 201.15 MB/s
SearchPhone_CachedRE2/8M 50 41763800 ns/op 200.86 MB/s
SearchPhone_CachedRE2/16M 20 83492800 ns/op 200.94 MB/s
EmptyPartialMatchPCRE 10000000 195 ns/op
EmptyPartialMatchRE2 5000000 497 ns/op
SimplePartialMatchPCRE 10000000 276 ns/op
SimplePartialMatchRE2 5000000 548 ns/op
HTTPPartialMatchPCRE 2000000 826 ns/op
HTTPPartialMatchRE2 2000000 894 ns/op
SmallHTTPPartialMatchPCRE 2000000 825 ns/op
SmallHTTPPartialMatchRE2 2000000 895 ns/op
DotMatchPCRE 2000000 810 ns/op
DotMatchRE2 2000000 976 ns/op
ASCIIMatchPCRE 5000000 604 ns/op
ASCIIMatchRE2 2000000 976 ns/op

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

155
outside/re2/benchlog/mktable Executable file
View File

@ -0,0 +1,155 @@
#!/usr/bin/perl
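# XXX
# Generates the benchmark tables and graphs: reads the benchlog.<system>
# files from the current directory, then copies its input to stdout,
# regenerating whatever lies between each "<!-- benchlog ... -->" directive
# and the following "<!-- -->" marker.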
# table($name): print an HTML table comparing PCRE and RE2 timings.
#
# $name is a sprintf template with a single %s placeholder that is filled
# with the engine name ("PCRE" or "RE2"); the result is the benchmark key
# looked up in %data for every system in @sys.  One row is printed per
# system, with the ns/op figures converted to microseconds.
#
# No prototype: the sub takes an argument.  An empty "()" prototype would
# reject a direct call such as table("Foo%s") at compile time and is only
# harmless when the sub is invoked through a code reference.
sub table {
    my ($name) = @_;

    print <<'EOF';
<table border=0>
<tr><th>System</th><th>PCRE</th><th>RE2</th></tr>
EOF
    foreach my $sys (@sys) {
        my $ns_pcre = $data{$sys}->{sprintf($name, "PCRE")}->{'ns/op'};
        my $ns_re2 = $data{$sys}->{sprintf($name, "RE2")}->{'ns/op'};
        # ns/op divided by 1000 gives microseconds per operation.
        printf "<tr><td>%s</td><td>%.1f µs</td><td>%.1f µs</td></tr>\n", $sysname{$sys}, $ns_pcre/1000., $ns_re2/1000.;
    }
    print <<'EOF';
<tr height=5><td colspan=3></td></tr>
</table>
EOF
}
# Text-size suffixes of the size-parameterised benchmark names (the part
# after the "/"), in increasing order; graph() plots one point per entry.
@sizes = qw(
    8 16 32 64 128 256 512
    1K 2K 4K 8K 16K 32K 64K 128K 256K 512K
    1M 2M 4M 8M 16M
);

# Colour written into the jgraph "color" attribute of each engine's curve.
%color = (
    PCRE => "0.7 0 0",
    RE2  => "0 0 1",
);

# Count of graphs produced so far; graph() increments it to name its
# regexp3g<N> output files.
$ngraph = 0;
# graph($name): plot PCRE vs. RE2 throughput against text size.
#
# $name is the benchmark name prefix.  For each engine and each entry of
# @sizes, the key "<name><engine>/<size>" is looked up in the "wreck"
# system's data and its MB/s value becomes one point on that engine's curve.
# Writes regexp3g<N>.jgr, runs jgraph and gs on it to produce
# regexp3g<N>.eps and regexp3g<N>.png, and prints an <img> tag for the PNG.
#
# No prototype: the sub takes an argument (see the calls through %func).
sub graph {
    my ($name) = @_;
    my $sys = "wreck";
    my $base = sprintf("regexp3g%d", ++$ngraph);

    open(my $jgr, ">", "$base.jgr") || die "open >$base.jgr: $!";
    printf $jgr "bbox -20 -12 392 95\n";
    printf $jgr "newgraph clip x_translate 0.25 y_translate 0.25\n";

    # Emit one curve per engine, remembering the largest value (to scale the
    # y axis) and each curve's last point (to place its label).
    my $ymax = 0;
    my (%lastx, %lasty);
    foreach my $who ("PCRE", "RE2") {
        printf $jgr "newcurve pts\n";
        for (my $i = 0; $i < @sizes; $i++) {
            my $key = sprintf("%s%s/%s", $name, $who, $sizes[$i]);
            my $val = $data{$sys}->{$key}->{'MB/s'};
            next if !defined($val);
            $ymax = $val if $val > $ymax;
            $lastx{$who} = $i;
            $lasty{$who} = $val;
            printf $jgr "$i %f (* %s *)\n", $val, $key;
        }
        my $color = $color{$who};
        printf $jgr "marktype none color $color linethickness 2 linetype solid label : $who\n";
    }

    # The x axis is the index into @sizes, labelled at every third size.
    my $n = @sizes;
    printf $jgr "xaxis min -1 max $n size 5 label : text size (bytes)\n";
    printf $jgr " no_auto_hash_marks hash_labels fontsize 9\n";
    for (my $i = 0; $i < @sizes; $i += 3) {
        printf $jgr " hash_at $i hash_label at $i : $sizes[$i]\n";
    }

    # Choose the y-axis maximum: the largest power of ten not above $ymax,
    # then the smallest multiple (2..10) of it that exceeds $ymax.
    my $y = 1;
    while (10*$y <= $ymax) {
        $y = 10*$y;
    }
    for (my $i = 2; $i <= 10; $i++) {
        if ($i*$y > $ymax) {
            $y = $i*$y;
            last;
        }
    }

    # Label each curve just right of its last point; labels that would sit
    # in the bottom 5% of the graph are raised and bottom-justified instead.
    foreach my $who ("PCRE", "RE2") {
        my $x1 = $lastx{$who};
        my $y1 = $lasty{$who};
        $x1 *= 1.01;
        my $v = "vjc";
        if ($y1 < 0.05 * $y) {
            $v = "vjb";
            $y1 = 0.05 * $y;
        }
        printf $jgr "newstring x $x1 y $y1 hjl $v : $who\n";
    }

    printf $jgr "yaxis min 0 max $y size 1 label : speed (MB/s)\n";
    printf $jgr " hash_labels fontsize 9\n";
    # printf $jgr "legend defaults font Times-Roman fontsize 10 x 0 y $y hjl vjt\n";

    # Finish the .jgr file before jgraph reads it, and surface write errors.
    close($jgr) || die "close $base.jgr: $!";

    # Rendering is best-effort: the exit status of the tools is not checked.
    system("jgraph $base.jgr >$base.eps"); # die "system: $!";
    system("gs -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dEPSCrop -sDEVICE=png16m -r100 -sOutputFile=$base.png -dBATCH -dQUIT -dQUIET -dNOPAUSE $base.eps");
    print "<img src=$base.png>\n";
}
# skip(): discard input lines up to the next line beginning "<!-- -->".
# The marker line itself is echoed to stdout; everything before it is
# dropped.  Stops silently at end of input if no marker is found.
sub skip() {
    while (<>) {
        # Anything that is not the end marker is dropped.
        next unless /^<!-- -->/;
        print;
        last;
    }
}
# Identifiers of the benchmarked machines, in table row order; a log file
# named benchlog.<id> is read for each one.
@sys = qw(r70 c2 wreck mini);

# Display name for each machine, keyed by its identifier.
%sysname = (
    r70   => "AMD Opteron 8214 HE, 2.2 GHz",
    c2    => "Intel Core2 Duo E7200, 2.53 GHz",
    wreck => "Intel Xeon 5150, 2.66 GHz (Mac Pro)",
    mini  => "Intel Core2 T5600, 1.83 GHz (Mac Mini)",
);

# Generators that a "<!-- benchlog NAME ... -->" directive may name.
%func = (
    "table" => \&table,
    "graph" => \&graph,
);
# Load every system's benchmark log into %data, so that
#   $data{$sys}{$benchmark} = { "name", "iter", "ns/op" [, "MB/s"] }.
# Lines that do not look like a benchmark result are ignored.
foreach my $sys (@sys) {
    my $log = "benchlog.$sys";
    # Three-argument open on a lexical handle: the file name is never
    # interpreted as a mode, and no global bareword handle is left behind.
    open(my $fh, "<", $log) || die "open $log: $!";
    my %sysdat;
    while (my $line = <$fh>) {
        # e.g. "SearchPhone_CachedRE2/8K 50000 41804 ns/op 195.96 MB/s"
        next unless $line =~ /^([A-Za-z0-9_\/]+)\s+(\d+)\s+(\d+) ns\/op/;
        my %row = (
            "name"  => $1,
            "iter"  => $2,
            "ns/op" => $3,
        );
        # Not every result line carries a throughput figure.
        if ($line =~ /([\d.]+) MB\/s/) {
            $row{"MB/s"} = $1;
        }
        $sysdat{$row{"name"}} = \%row;
    }
    close($fh);
    $data{$sys} = \%sysdat;
}
# Copy input to stdout.  After a directive line of the form
#   <!-- benchlog NAME -->   or   <!-- benchlog NAME ARG -->
# run the generator registered under NAME in %func (passing ARG when
# present), then drop the old generated content up to the next "<!-- -->"
# marker via skip().
while (<>) {
    print;
    next unless /^<!-- benchlog (\w+)(?: ([%\w]+))? -->/;
    my $which = $1;
    my @args = defined($2) ? ($2) : ();
    # Fail with a readable message instead of Perl's "Can't use an
    # undefined value as a subroutine reference" on a mistyped name.
    my $gen = $func{$which}
        or die "unknown benchlog directive '$which' at input line $.\n";
    $gen->(@args);
    skip();
}

View File

@ -0,0 +1 @@
xkcd.png is a cropped version of http://xkcd.com/208/

41
outside/re2/doc/mksyntaxgo Executable file
View File

@ -0,0 +1,41 @@
#!/bin/sh
# Regenerate the Go regexp/syntax package documentation (doc.go) from the
# syntax.txt in the current directory: copy it into the Go tree, then
# rewrite the copy in place with the sam(1) script below.

set -e

# Refuse to run without GOROOT; otherwise the output path would silently
# become /src/pkg/regexp/syntax/doc.go.
: "${GOROOT:?GOROOT must be set}"

# Quoted everywhere so a GOROOT containing spaces still works.
out="$GOROOT/src/pkg/regexp/syntax/doc.go"
cp syntax.txt "$out"
sam -d "$out" <<'!'
,x g/NOT SUPPORTED/d
/^Unicode character class/,$d
,s/[«»]//g
,x g/^Possessive repetitions:/d
,x g/\\C/d
,x g/Flag syntax/d
,s/.=(true|false)/flag &/g
,s/^Flags:/ Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:\n/
,s/\n\n\n+/\n\n/g
,x/(^.* .*\n)+/ | awk -F' ' '{printf(" %-14s %s\n", $1, $2)}'
1,2c
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution.
/*
Package syntax parses regular expressions into parse trees and compiles
parse trees into programs. Most clients of regular expressions will use the
facilities of package regexp (such as Compile and Match) instead of this package.
Syntax
The regular expression syntax understood by this package when parsing with the Perl flag is as follows.
Parts of the syntax can be disabled by passing alternate flags to Parse.
.
$a
*/
package syntax
.
w
q
!

42
outside/re2/doc/mksyntaxhtml Executable file
View File

@ -0,0 +1,42 @@
#!/bin/sh
# Generate syntax.html from the syntax.txt in the current directory: copy
# the text file, then rewrite the copy in place with the sam(1) script below.

# Stop if the copy fails, so sam never re-processes a stale syntax.html
# left over from an earlier run (mksyntaxgo already does the same).
set -e

cp syntax.txt syntax.html
sam -d syntax.html <<'!'
,s/\&/\&amp;/g
,s/</\&lt;/g
,s/>/\&gt;/g
,s!== (([^()]|\([^()]*\))*)!≡ <code>\1</code>!g
,s!«!<code>!g
,s!»!</code>!g
,s! vim$! <font size=-2>VIM</font>!g
,s! pcre$! <font size=-2>PCRE</font>!g
,s! perl$! <font size=-2>PERL</font>!g
,x g/NOT SUPPORTED/ s!^[^ ]+!<font color=#808080>&</font>!
,s!NOT SUPPORTED!!g
,s!(^[^ ]+) (.*)\n!<tr><td><code>\1</code></td><td>\2</td></tr>\n!g
,s!.*:$!<b>&</b>!g
,s!^$!<tr><td></td></tr>!g
,x v/<tr>/ s!.*!<tr><td colspan=2>&</td></tr>!
1,2c
<html>
<!-- AUTOMATICALLY GENERATED by mksyntaxhtml -->
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>RE2 regular expression syntax reference</title>
</head>
<body>
<h1>RE2 regular expression syntax reference</h1>
<table border=0 cellpadding=2 cellspacing=2>
<tr><td colspan=2>This page lists the regular expression syntax accepted by RE2.</td></tr>
<tr><td colspan=2>It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
<tr><td colspan=2>Grayed out expressions are not supported by RE2.</td></tr>
.
$a
</table>
</body>
</html>
.
w
q
!

36
outside/re2/doc/mksyntaxwiki Executable file
View File

@ -0,0 +1,36 @@
#!/bin/sh
# Generate syntax.wiki from the syntax.txt in the current directory: copy
# the text file, then rewrite the copy in place with the sam(1) script below.

# Stop if the copy fails, so sam never re-processes a stale syntax.wiki
# left over from an earlier run (mksyntaxgo already does the same).
set -e

cp syntax.txt syntax.wiki
sam -d syntax.wiki <<'!'
,s!`!`````!g
,s!== (([^()]|\([^()]*\))*)!≡ `\1`!g
,s!«!`!g
,s!»!`!g
,s! vim$! <font size="1">VIM</font>!g
,s! pcre$! <font size="1">PCRE</font>!g
,s! perl$! <font size="1">PERL</font>!g
,s!(^[^ ]+) (.*)\n!`\1` \2\n!g
,x g/NOT SUPPORTED/ s!^[^ ]+!<font color="#808080">&</font>!
,s!NOT SUPPORTED!<font size="1">(&)</font>!g
,s!(^[^ ]+) (.*)\n!<tr><td>\1</td><td>\2</td></tr>\n!g
,s!.*:$!<b>&</b>!g
,s!^$!<tr><td></td></tr>!g
,x v/<tr>/ s!.*!<tr><td colspan="2">&</td></tr>!
1,2c
#summary I define UNIX as “30 definitions of regular expressions living under one roof.” —Don Knuth
<wiki:comment>
GENERATED BY mksyntaxwiki. DO NOT EDIT
</wiki:comment>
<table border="0" cellpadding="2" cellspacing="2">
<tr><td colspan="2">This page lists the regular expression syntax accepted by RE2.</td></tr>
<tr><td colspan="2">It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
<tr><td colspan="2">Grayed out expressions are not supported by RE2.</td></tr>
.
$a
</table>
.
w
q
!

388
outside/re2/doc/syntax.html Normal file
View File

@ -0,0 +1,388 @@
<html>
<!-- AUTOMATICALLY GENERATED by mksyntaxhtml -->
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>RE2 regular expression syntax reference</title>
</head>
<body>
<h1>RE2 regular expression syntax reference</h1>
<table border=0 cellpadding=2 cellspacing=2>
<tr><td colspan=2>This page lists the regular expression syntax accepted by RE2.</td></tr>
<tr><td colspan=2>It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
<tr><td colspan=2>Grayed out expressions are not supported by RE2.</td></tr>
<tr><td colspan=2>See <a href="http://go/re2">http://go/re2</a> and <a href="http://go/re2quick">http://go/re2quick</a>.</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Single characters:</b></td></tr>
<tr><td><code>.</code></td><td>any character, including newline (s=true)</td></tr>
<tr><td><code>[xyz]</code></td><td>character class</td></tr>
<tr><td><code>[^xyz]</code></td><td>negated character class</td></tr>
<tr><td><code>\d</code></td><td>Perl character class</td></tr>
<tr><td><code>\D</code></td><td>negated Perl character class</td></tr>
<tr><td><code>[:alpha:]</code></td><td>ASCII character class</td></tr>
<tr><td><code>[:^alpha:]</code></td><td>negated ASCII character class</td></tr>
<tr><td><code>\pN</code></td><td>Unicode character class (one-letter name)</td></tr>
<tr><td><code>\p{Greek}</code></td><td>Unicode character class</td></tr>
<tr><td><code>\PN</code></td><td>negated Unicode character class (one-letter name)</td></tr>
<tr><td><code>\P{Greek}</code></td><td>negated Unicode character class</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Composites:</b></td></tr>
<tr><td><code>xy</code></td><td><code>x</code> followed by <code>y</code></td></tr>
<tr><td><code>x|y</code></td><td><code>x</code> or <code>y</code> (prefer <code>x</code>)</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Repetitions:</b></td></tr>
<tr><td><code>x*</code></td><td>zero or more <code>x</code>, prefer more</td></tr>
<tr><td><code>x+</code></td><td>one or more <code>x</code>, prefer more</td></tr>
<tr><td><code>x?</code></td><td>zero or one <code>x</code>, prefer one</td></tr>
<tr><td><code>x{n,m}</code></td><td><code>n</code> or <code>n</code>+1 or ... or <code>m</code> <code>x</code>, prefer more</td></tr>
<tr><td><code>x{n,}</code></td><td><code>n</code> or more <code>x</code>, prefer more</td></tr>
<tr><td><code>x{n}</code></td><td>exactly <code>n</code> <code>x</code></td></tr>
<tr><td><code>x*?</code></td><td>zero or more <code>x</code>, prefer fewer</td></tr>
<tr><td><code>x+?</code></td><td>one or more <code>x</code>, prefer fewer</td></tr>
<tr><td><code>x??</code></td><td>zero or one <code>x</code>, prefer zero</td></tr>
<tr><td><code>x{n,m}?</code></td><td><code>n</code> or <code>n</code>+1 or ... or <code>m</code> <code>x</code>, prefer fewer</td></tr>
<tr><td><code>x{n,}?</code></td><td><code>n</code> or more <code>x</code>, prefer fewer</td></tr>
<tr><td><code>x{n}?</code></td><td>exactly <code>n</code> <code>x</code></td></tr>
<tr><td><code><font color=#808080>x{}</font></code></td><td>(≡ <code>x*</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>x{-}</font></code></td><td>(≡ <code>x*?</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>x{-n}</font></code></td><td>(≡ <code>x{n}?</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>x=</font></code></td><td>(≡ <code>x?</code>) <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Possessive repetitions:</b></td></tr>
<tr><td><code><font color=#808080>x*+</font></code></td><td>zero or more <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x++</font></code></td><td>one or more <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x?+</font></code></td><td>zero or one <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x{n,m}+</font></code></td><td><code>n</code> or ... or <code>m</code> <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x{n,}+</font></code></td><td><code>n</code> or more <code>x</code>, possessive </td></tr>
<tr><td><code><font color=#808080>x{n}+</font></code></td><td>exactly <code>n</code> <code>x</code>, possessive </td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Grouping:</b></td></tr>
<tr><td><code>(re)</code></td><td>numbered capturing group</td></tr>
<tr><td><code>(?P&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group</td></tr>
<tr><td><code><font color=#808080>(?&lt;name&gt;re)</font></code></td><td>named &amp; numbered capturing group </td></tr>
<tr><td><code><font color=#808080>(?'name're)</font></code></td><td>named &amp; numbered capturing group </td></tr>
<tr><td><code>(?:re)</code></td><td>non-capturing group</td></tr>
<tr><td><code>(?flags)</code></td><td>set flags until outer paren closes; non-capturing</td></tr>
<tr><td><code>(?flags:re)</code></td><td>set flags during re; non-capturing</td></tr>
<tr><td><code><font color=#808080>(?#text)</font></code></td><td>comment </td></tr>
<tr><td><code><font color=#808080>(?|x|y|z)</font></code></td><td>branch numbering reset </td></tr>
<tr><td><code><font color=#808080>(?&gt;re)</font></code></td><td>possessive match of <code>re</code> </td></tr>
<tr><td><code><font color=#808080>re@&gt;</font></code></td><td>possessive match of <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>%(re)</font></code></td><td>non-capturing group <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Flags:</b></td></tr>
<tr><td><code>i</code></td><td>case-insensitive (default false)</td></tr>
<tr><td><code>m</code></td><td>multi-line mode (default false)</td></tr>
<tr><td><code>s</code></td><td>let <code>.</code> match <code>\n</code> (default false)</td></tr>
<tr><td><code>U</code></td><td>ungreedy: swap meaning of <code>x*</code> and <code>x*?</code>, <code>x+</code> and <code>x+?</code>, etc (default false)</td></tr>
<tr><td colspan=2>Flag syntax is <code>xyz</code> (set) or <code>-xyz</code> (clear) or <code>xy-z</code> (set <code>xy</code>, clear <code>z</code>).</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Empty strings:</b></td></tr>
<tr><td><code>^</code></td><td>at beginning of text or line (<code>m</code>=true)</td></tr>
<tr><td><code>$</code></td><td>at end of text or line (<code>m</code>=true)</td></tr>
<tr><td><code>\A</code></td><td>at beginning of text</td></tr>
<tr><td><code>\b</code></td><td>at word boundary (<code>\w</code> to left and <code>\W</code> to right or vice versa)</td></tr>
<tr><td><code>\B</code></td><td>not a word boundary</td></tr>
<tr><td><code><font color=#808080>\G</font></code></td><td>at beginning of subtext being searched <font size=-2>PCRE</font></td></tr>
<tr><td><code><font color=#808080>\G</font></code></td><td>at end of last match <font size=-2>PERL</font></td></tr>
<tr><td><code><font color=#808080>\Z</font></code></td><td>at end of text, or before newline at end of text </td></tr>
<tr><td><code>\z</code></td><td>at end of text</td></tr>
<tr><td><code><font color=#808080>(?=re)</font></code></td><td>before text matching <code>re</code> </td></tr>
<tr><td><code><font color=#808080>(?!re)</font></code></td><td>before text not matching <code>re</code> </td></tr>
<tr><td><code><font color=#808080>(?&lt;=re)</font></code></td><td>after text matching <code>re</code> </td></tr>
<tr><td><code><font color=#808080>(?&lt;!re)</font></code></td><td>after text not matching <code>re</code> </td></tr>
<tr><td><code><font color=#808080>re&amp;</font></code></td><td>before text matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>re@=</font></code></td><td>before text matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>re@!</font></code></td><td>before text not matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>re@&lt;=</font></code></td><td>after text matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>re@&lt;!</font></code></td><td>after text not matching <code>re</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\zs</font></code></td><td>sets start of match (= \K) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\ze</font></code></td><td>sets end of match <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%^</font></code></td><td>beginning of file <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%$</font></code></td><td>end of file <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%V</font></code></td><td>on screen <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%#</font></code></td><td>cursor position <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%'m</font></code></td><td>mark <code>m</code> position <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%23l</font></code></td><td>in line 23 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%23c</font></code></td><td>in column 23 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%23v</font></code></td><td>in virtual column 23 <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Escape sequences:</b></td></tr>
<tr><td><code>\a</code></td><td>bell (≡ <code>\007</code>)</td></tr>
<tr><td><code>\f</code></td><td>form feed (≡ <code>\014</code>)</td></tr>
<tr><td><code>\t</code></td><td>horizontal tab (≡ <code>\011</code>)</td></tr>
<tr><td><code>\n</code></td><td>newline (≡ <code>\012</code>)</td></tr>
<tr><td><code>\r</code></td><td>carriage return (≡ <code>\015</code>)</td></tr>
<tr><td><code>\v</code></td><td>vertical tab character (≡ <code>\013</code>)</td></tr>
<tr><td><code>\*</code></td><td>literal <code>*</code>, for any punctuation character <code>*</code></td></tr>
<tr><td><code>\123</code></td><td>octal character code (up to three digits)</td></tr>
<tr><td><code>\x7F</code></td><td>hex character code (exactly two digits)</td></tr>
<tr><td><code>\x{10FFFF}</code></td><td>hex character code</td></tr>
<tr><td><code>\C</code></td><td>match a single byte even in UTF-8 mode</td></tr>
<tr><td><code>\Q...\E</code></td><td>literal text <code>...</code> even if <code>...</code> has punctuation</td></tr>
<tr><td></td></tr>
<tr><td><code><font color=#808080>\1</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\b</font></code></td><td>backspace (use <code>\010</code>)</td></tr>
<tr><td><code><font color=#808080>\cK</font></code></td><td>control char ^K (use <code>\001</code> etc)</td></tr>
<tr><td><code><font color=#808080>\e</font></code></td><td>escape (use <code>\033</code>)</td></tr>
<tr><td><code><font color=#808080>\g1</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\g{1}</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\g{+1}</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\g{-1}</font></code></td><td>backreference </td></tr>
<tr><td><code><font color=#808080>\g{name}</font></code></td><td>named backreference </td></tr>
<tr><td><code><font color=#808080>\g&lt;name&gt;</font></code></td><td>subroutine call </td></tr>
<tr><td><code><font color=#808080>\g'name'</font></code></td><td>subroutine call </td></tr>
<tr><td><code><font color=#808080>\k&lt;name&gt;</font></code></td><td>named backreference </td></tr>
<tr><td><code><font color=#808080>\k'name'</font></code></td><td>named backreference </td></tr>
<tr><td><code><font color=#808080>\lX</font></code></td><td>lowercase <code>X</code> </td></tr>
<tr><td><code><font color=#808080>\ux</font></code></td><td>uppercase <code>x</code> </td></tr>
<tr><td><code><font color=#808080>\L...\E</font></code></td><td>lowercase text <code>...</code> </td></tr>
<tr><td><code><font color=#808080>\K</font></code></td><td>reset beginning of <code>$0</code> </td></tr>
<tr><td><code><font color=#808080>\N{name}</font></code></td><td>named Unicode character </td></tr>
<tr><td><code><font color=#808080>\R</font></code></td><td>line break </td></tr>
<tr><td><code><font color=#808080>\U...\E</font></code></td><td>upper case text <code>...</code> </td></tr>
<tr><td><code><font color=#808080>\X</font></code></td><td>extended Unicode sequence </td></tr>
<tr><td></td></tr>
<tr><td><code><font color=#808080>\%d123</font></code></td><td>decimal character 123 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%xFF</font></code></td><td>hex character FF <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%o123</font></code></td><td>octal character 123 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%u1234</font></code></td><td>Unicode character 0x1234 <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\%U12345678</font></code></td><td>Unicode character 0x12345678 <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Character class elements:</b></td></tr>
<tr><td><code>x</code></td><td>single character</td></tr>
<tr><td><code>A-Z</code></td><td>character range (inclusive)</td></tr>
<tr><td><code>\d</code></td><td>Perl character class</td></tr>
<tr><td><code>[:foo:]</code></td><td>ASCII character class <code>foo</code></td></tr>
<tr><td><code>\p{Foo}</code></td><td>Unicode character class <code>Foo</code></td></tr>
<tr><td><code>\pF</code></td><td>Unicode character class <code>F</code> (one-letter name)</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Named character classes as character class elements:</b></td></tr>
<tr><td><code>[\d]</code></td><td>digits (≡ <code>\d</code>)</td></tr>
<tr><td><code>[^\d]</code></td><td>not digits (≡ <code>\D</code>)</td></tr>
<tr><td><code>[\D]</code></td><td>not digits (≡ <code>\D</code>)</td></tr>
<tr><td><code>[^\D]</code></td><td>not not digits (≡ <code>\d</code>)</td></tr>
<tr><td><code>[[:name:]]</code></td><td>named ASCII class inside character class (≡ <code>[:name:]</code>)</td></tr>
<tr><td><code>[^[:name:]]</code></td><td>named ASCII class inside negated character class (≡ <code>[:^name:]</code>)</td></tr>
<tr><td><code>[\p{Name}]</code></td><td>named Unicode property inside character class (≡ <code>\p{Name}</code>)</td></tr>
<tr><td><code>[^\p{Name}]</code></td><td>named Unicode property inside negated character class (≡ <code>\P{Name}</code>)</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Perl character classes:</b></td></tr>
<tr><td><code>\d</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr>
<tr><td><code>\D</code></td><td>not digits (≡ <code>[^0-9]</code>)</td></tr>
<tr><td><code>\s</code></td><td>whitespace (≡ <code>[\t\n\f\r ]</code>)</td></tr>
<tr><td><code>\S</code></td><td>not whitespace (≡ <code>[^\t\n\f\r ]</code>)</td></tr>
<tr><td><code>\w</code></td><td>word characters (≡ <code>[0-9A-Za-z_]</code>)</td></tr>
<tr><td><code>\W</code></td><td>not word characters (≡ <code>[^0-9A-Za-z_]</code>)</td></tr>
<tr><td></td></tr>
<tr><td><code><font color=#808080>\h</font></code></td><td>horizontal space </td></tr>
<tr><td><code><font color=#808080>\H</font></code></td><td>not horizontal space </td></tr>
<tr><td><code><font color=#808080>\v</font></code></td><td>vertical space </td></tr>
<tr><td><code><font color=#808080>\V</font></code></td><td>not vertical space </td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>ASCII character classes:</b></td></tr>
<tr><td><code>[:alnum:]</code></td><td>alphanumeric (≡ <code>[0-9A-Za-z]</code>)</td></tr>
<tr><td><code>[:alpha:]</code></td><td>alphabetic (≡ <code>[A-Za-z]</code>)</td></tr>
<tr><td><code>[:ascii:]</code></td><td>ASCII (≡ <code>[\x00-\x7F]</code>)</td></tr>
<tr><td><code>[:blank:]</code></td><td>blank (≡ <code>[\t ]</code>)</td></tr>
<tr><td><code>[:cntrl:]</code></td><td>control (≡ <code>[\x00-\x1F\x7F]</code>)</td></tr>
<tr><td><code>[:digit:]</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr>
<tr><td><code>[:graph:]</code></td><td>graphical (≡ <code>[!-~] == [A-Za-z0-9!"#$%&amp;'()*+,\-./:;&lt;=&gt;?@[\\\]^_`{|}~]</code>)</td></tr>
<tr><td><code>[:lower:]</code></td><td>lower case (≡ <code>[a-z]</code>)</td></tr>
<tr><td><code>[:print:]</code></td><td>printable (≡ <code>[ -~] == [ [:graph:]]</code>)</td></tr>
<tr><td><code>[:punct:]</code></td><td>punctuation (≡ <code>[!-/:-@[-`{-~]</code>)</td></tr>
<tr><td><code>[:space:]</code></td><td>whitespace (≡ <code>[\t\n\v\f\r ]</code>)</td></tr>
<tr><td><code>[:upper:]</code></td><td>upper case (≡ <code>[A-Z]</code>)</td></tr>
<tr><td><code>[:word:]</code></td><td>word characters (≡ <code>[0-9A-Za-z_]</code>)</td></tr>
<tr><td><code>[:xdigit:]</code></td><td>hex digit (≡ <code>[0-9A-Fa-f]</code>)</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Unicode character class names--general category:</b></td></tr>
<tr><td><code>C</code></td><td>other</td></tr>
<tr><td><code>Cc</code></td><td>control</td></tr>
<tr><td><code>Cf</code></td><td>format</td></tr>
<tr><td><code><font color=#808080>Cn</font></code></td><td>unassigned code points </td></tr>
<tr><td><code>Co</code></td><td>private use</td></tr>
<tr><td><code>Cs</code></td><td>surrogate</td></tr>
<tr><td><code>L</code></td><td>letter</td></tr>
<tr><td><code><font color=#808080>LC</font></code></td><td>cased letter </td></tr>
<tr><td><code><font color=#808080>L&amp;</font></code></td><td>cased letter </td></tr>
<tr><td><code>Ll</code></td><td>lowercase letter</td></tr>
<tr><td><code>Lm</code></td><td>modifier letter</td></tr>
<tr><td><code>Lo</code></td><td>other letter</td></tr>
<tr><td><code>Lt</code></td><td>titlecase letter</td></tr>
<tr><td><code>Lu</code></td><td>uppercase letter</td></tr>
<tr><td><code>M</code></td><td>mark</td></tr>
<tr><td><code>Mc</code></td><td>spacing mark</td></tr>
<tr><td><code>Me</code></td><td>enclosing mark</td></tr>
<tr><td><code>Mn</code></td><td>non-spacing mark</td></tr>
<tr><td><code>N</code></td><td>number</td></tr>
<tr><td><code>Nd</code></td><td>decimal number</td></tr>
<tr><td><code>Nl</code></td><td>letter number</td></tr>
<tr><td><code>No</code></td><td>other number</td></tr>
<tr><td><code>P</code></td><td>punctuation</td></tr>
<tr><td><code>Pc</code></td><td>connector punctuation</td></tr>
<tr><td><code>Pd</code></td><td>dash punctuation</td></tr>
<tr><td><code>Pe</code></td><td>close punctuation</td></tr>
<tr><td><code>Pf</code></td><td>final punctuation</td></tr>
<tr><td><code>Pi</code></td><td>initial punctuation</td></tr>
<tr><td><code>Po</code></td><td>other punctuation</td></tr>
<tr><td><code>Ps</code></td><td>open punctuation</td></tr>
<tr><td><code>S</code></td><td>symbol</td></tr>
<tr><td><code>Sc</code></td><td>currency symbol</td></tr>
<tr><td><code>Sk</code></td><td>modifier symbol</td></tr>
<tr><td><code>Sm</code></td><td>math symbol</td></tr>
<tr><td><code>So</code></td><td>other symbol</td></tr>
<tr><td><code>Z</code></td><td>separator</td></tr>
<tr><td><code>Zl</code></td><td>line separator</td></tr>
<tr><td><code>Zp</code></td><td>paragraph separator</td></tr>
<tr><td><code>Zs</code></td><td>space separator</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Unicode character class names--scripts:</b></td></tr>
<tr><td><code>Arabic</code></td><td>Arabic</td></tr>
<tr><td><code>Armenian</code></td><td>Armenian</td></tr>
<tr><td><code>Balinese</code></td><td>Balinese</td></tr>
<tr><td><code>Bengali</code></td><td>Bengali</td></tr>
<tr><td><code>Bopomofo</code></td><td>Bopomofo</td></tr>
<tr><td><code>Braille</code></td><td>Braille</td></tr>
<tr><td><code>Buginese</code></td><td>Buginese</td></tr>
<tr><td><code>Buhid</code></td><td>Buhid</td></tr>
<tr><td><code>Canadian_Aboriginal</code></td><td>Canadian Aboriginal</td></tr>
<tr><td><code>Carian</code></td><td>Carian</td></tr>
<tr><td><code>Cham</code></td><td>Cham</td></tr>
<tr><td><code>Cherokee</code></td><td>Cherokee</td></tr>
<tr><td><code>Common</code></td><td>characters not specific to one script</td></tr>
<tr><td><code>Coptic</code></td><td>Coptic</td></tr>
<tr><td><code>Cuneiform</code></td><td>Cuneiform</td></tr>
<tr><td><code>Cypriot</code></td><td>Cypriot</td></tr>
<tr><td><code>Cyrillic</code></td><td>Cyrillic</td></tr>
<tr><td><code>Deseret</code></td><td>Deseret</td></tr>
<tr><td><code>Devanagari</code></td><td>Devanagari</td></tr>
<tr><td><code>Ethiopic</code></td><td>Ethiopic</td></tr>
<tr><td><code>Georgian</code></td><td>Georgian</td></tr>
<tr><td><code>Glagolitic</code></td><td>Glagolitic</td></tr>
<tr><td><code>Gothic</code></td><td>Gothic</td></tr>
<tr><td><code>Greek</code></td><td>Greek</td></tr>
<tr><td><code>Gujarati</code></td><td>Gujarati</td></tr>
<tr><td><code>Gurmukhi</code></td><td>Gurmukhi</td></tr>
<tr><td><code>Han</code></td><td>Han</td></tr>
<tr><td><code>Hangul</code></td><td>Hangul</td></tr>
<tr><td><code>Hanunoo</code></td><td>Hanunoo</td></tr>
<tr><td><code>Hebrew</code></td><td>Hebrew</td></tr>
<tr><td><code>Hiragana</code></td><td>Hiragana</td></tr>
<tr><td><code>Inherited</code></td><td>inherit script from previous character</td></tr>
<tr><td><code>Kannada</code></td><td>Kannada</td></tr>
<tr><td><code>Katakana</code></td><td>Katakana</td></tr>
<tr><td><code>Kayah_Li</code></td><td>Kayah Li</td></tr>
<tr><td><code>Kharoshthi</code></td><td>Kharoshthi</td></tr>
<tr><td><code>Khmer</code></td><td>Khmer</td></tr>
<tr><td><code>Lao</code></td><td>Lao</td></tr>
<tr><td><code>Latin</code></td><td>Latin</td></tr>
<tr><td><code>Lepcha</code></td><td>Lepcha</td></tr>
<tr><td><code>Limbu</code></td><td>Limbu</td></tr>
<tr><td><code>Linear_B</code></td><td>Linear B</td></tr>
<tr><td><code>Lycian</code></td><td>Lycian</td></tr>
<tr><td><code>Lydian</code></td><td>Lydian</td></tr>
<tr><td><code>Malayalam</code></td><td>Malayalam</td></tr>
<tr><td><code>Mongolian</code></td><td>Mongolian</td></tr>
<tr><td><code>Myanmar</code></td><td>Myanmar</td></tr>
<tr><td><code>New_Tai_Lue</code></td><td>New Tai Lue (aka Simplified Tai Lue)</td></tr>
<tr><td><code>Nko</code></td><td>Nko</td></tr>
<tr><td><code>Ogham</code></td><td>Ogham</td></tr>
<tr><td><code>Ol_Chiki</code></td><td>Ol Chiki</td></tr>
<tr><td><code>Old_Italic</code></td><td>Old Italic</td></tr>
<tr><td><code>Old_Persian</code></td><td>Old Persian</td></tr>
<tr><td><code>Oriya</code></td><td>Oriya</td></tr>
<tr><td><code>Osmanya</code></td><td>Osmanya</td></tr>
<tr><td><code>Phags_Pa</code></td><td>'Phags Pa</td></tr>
<tr><td><code>Phoenician</code></td><td>Phoenician</td></tr>
<tr><td><code>Rejang</code></td><td>Rejang</td></tr>
<tr><td><code>Runic</code></td><td>Runic</td></tr>
<tr><td><code>Saurashtra</code></td><td>Saurashtra</td></tr>
<tr><td><code>Shavian</code></td><td>Shavian</td></tr>
<tr><td><code>Sinhala</code></td><td>Sinhala</td></tr>
<tr><td><code>Sundanese</code></td><td>Sundanese</td></tr>
<tr><td><code>Syloti_Nagri</code></td><td>Syloti Nagri</td></tr>
<tr><td><code>Syriac</code></td><td>Syriac</td></tr>
<tr><td><code>Tagalog</code></td><td>Tagalog</td></tr>
<tr><td><code>Tagbanwa</code></td><td>Tagbanwa</td></tr>
<tr><td><code>Tai_Le</code></td><td>Tai Le</td></tr>
<tr><td><code>Tamil</code></td><td>Tamil</td></tr>
<tr><td><code>Telugu</code></td><td>Telugu</td></tr>
<tr><td><code>Thaana</code></td><td>Thaana</td></tr>
<tr><td><code>Thai</code></td><td>Thai</td></tr>
<tr><td><code>Tibetan</code></td><td>Tibetan</td></tr>
<tr><td><code>Tifinagh</code></td><td>Tifinagh</td></tr>
<tr><td><code>Ugaritic</code></td><td>Ugaritic</td></tr>
<tr><td><code>Vai</code></td><td>Vai</td></tr>
<tr><td><code>Yi</code></td><td>Yi</td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Vim character classes:</b></td></tr>
<tr><td><code><font color=#808080>\i</font></code></td><td>identifier character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\I</font></code></td><td><code>\i</code> except digits <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\k</font></code></td><td>keyword character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\K</font></code></td><td><code>\k</code> except digits <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\f</font></code></td><td>file name character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\F</font></code></td><td><code>\f</code> except digits <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\p</font></code></td><td>printable character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\P</font></code></td><td><code>\p</code> except digits <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\s</font></code></td><td>whitespace character (≡ <code>[ \t]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\S</font></code></td><td>non-white space character (≡ <code>[^ \t]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code>\d</code></td><td>digits (≡ <code>[0-9]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code>\D</code></td><td>not <code>\d</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\x</font></code></td><td>hex digits (≡ <code>[0-9A-Fa-f]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\X</font></code></td><td>not <code>\x</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\o</font></code></td><td>octal digits (≡ <code>[0-7]</code>) <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\O</font></code></td><td>not <code>\o</code> <font size=-2>VIM</font></td></tr>
<tr><td><code>\w</code></td><td>word character <font size=-2>VIM</font></td></tr>
<tr><td><code>\W</code></td><td>not <code>\w</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\h</font></code></td><td>head of word character <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\H</font></code></td><td>not <code>\h</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\a</font></code></td><td>alphabetic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\A</font></code></td><td>not <code>\a</code> <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\l</font></code></td><td>lowercase <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\L</font></code></td><td>not lowercase <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\u</font></code></td><td>uppercase <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\U</font></code></td><td>not uppercase <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\_x</font></code></td><td><code>\x</code> plus newline, for any <code>x</code> <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Vim flags:</b></td></tr>
<tr><td><code><font color=#808080>\c</font></code></td><td>ignore case <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\C</font></code></td><td>match case <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\m</font></code></td><td>magic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\M</font></code></td><td>nomagic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\v</font></code></td><td>verymagic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\V</font></code></td><td>verynomagic <font size=-2>VIM</font></td></tr>
<tr><td><code><font color=#808080>\Z</font></code></td><td>ignore differences in Unicode combining characters <font size=-2>VIM</font></td></tr>
<tr><td></td></tr>
<tr><td colspan=2><b>Magic:</b></td></tr>
<tr><td><code><font color=#808080>(?{code})</font></code></td><td>arbitrary Perl code <font size=-2>PERL</font></td></tr>
<tr><td><code><font color=#808080>(??{code})</font></code></td><td>postponed arbitrary Perl code <font size=-2>PERL</font></td></tr>
<tr><td><code><font color=#808080>(?n)</font></code></td><td>recursive call to regexp capturing group <code>n</code> </td></tr>
<tr><td><code><font color=#808080>(?+n)</font></code></td><td>recursive call to relative group <code>+n</code> </td></tr>
<tr><td><code><font color=#808080>(?-n)</font></code></td><td>recursive call to relative group <code>-n</code> </td></tr>
<tr><td><code><font color=#808080>(?C)</font></code></td><td>PCRE callout <font size=-2>PCRE</font></td></tr>
<tr><td><code><font color=#808080>(?R)</font></code></td><td>recursive call to entire regexp (≡ <code>(?0)</code>) </td></tr>
<tr><td><code><font color=#808080>(?&amp;name)</font></code></td><td>recursive call to named group </td></tr>
<tr><td><code><font color=#808080>(?P=name)</font></code></td><td>named backreference </td></tr>
<tr><td><code><font color=#808080>(?P&gt;name)</font></code></td><td>recursive call to named group </td></tr>
<tr><td><code><font color=#808080>(?(cond)true|false)</font></code></td><td>conditional branch </td></tr>
<tr><td><code><font color=#808080>(?(cond)true)</font></code></td><td>conditional branch </td></tr>
<tr><td><code><font color=#808080>(*ACCEPT)</font></code></td><td>make regexps more like Prolog </td></tr>
<tr><td><code><font color=#808080>(*COMMIT)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*F)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*FAIL)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*MARK)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*PRUNE)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*SKIP)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*THEN)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*ANY)</font></code></td><td>set newline convention </td></tr>
<tr><td><code><font color=#808080>(*ANYCRLF)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*CR)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*CRLF)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*LF)</font></code></td><td></td></tr>
<tr><td><code><font color=#808080>(*BSR_ANYCRLF)</font></code></td><td>set \R convention <font size=-2>PCRE</font></td></tr>
<tr><td><code><font color=#808080>(*BSR_UNICODE)</font></code></td><td> <font size=-2>PCRE</font></td></tr>
<tr><td></td></tr>
</table>
</body>
</html>

395
outside/re2/doc/syntax.txt Normal file
View File

@ -0,0 +1,395 @@
RE2 regular expression syntax reference
---------------------------------------
Single characters:
. any character, possibly including newline (s=true)
[xyz] character class
[^xyz] negated character class
\d Perl character class
\D negated Perl character class
[:alpha:] ASCII character class
[:^alpha:] negated ASCII character class
\pN Unicode character class (one-letter name)
\p{Greek} Unicode character class
\PN negated Unicode character class (one-letter name)
\P{Greek} negated Unicode character class
Composites:
xy «x» followed by «y»
x|y «x» or «y» (prefer «x»)
Repetitions:
x* zero or more «x», prefer more
x+ one or more «x», prefer more
x? zero or one «x», prefer one
x{n,m} «n» or «n»+1 or ... or «m» «x», prefer more
x{n,} «n» or more «x», prefer more
x{n} exactly «n» «x»
x*? zero or more «x», prefer fewer
x+? one or more «x», prefer fewer
x?? zero or one «x», prefer zero
x{n,m}? «n» or «n»+1 or ... or «m» «x», prefer fewer
x{n,}? «n» or more «x», prefer fewer
x{n}? exactly «n» «x»
x{} (== x*) NOT SUPPORTED vim
x{-} (== x*?) NOT SUPPORTED vim
x{-n} (== x{n}?) NOT SUPPORTED vim
x= (== x?) NOT SUPPORTED vim
Possessive repetitions:
x*+ zero or more «x», possessive NOT SUPPORTED
x++ one or more «x», possessive NOT SUPPORTED
x?+ zero or one «x», possessive NOT SUPPORTED
x{n,m}+ «n» or ... or «m» «x», possessive NOT SUPPORTED
x{n,}+ «n» or more «x», possessive NOT SUPPORTED
x{n}+ exactly «n» «x», possessive NOT SUPPORTED
Grouping:
(re) numbered capturing group
(?P<name>re) named & numbered capturing group
(?<name>re) named & numbered capturing group NOT SUPPORTED
(?'name're) named & numbered capturing group NOT SUPPORTED
(?:re) non-capturing group
(?flags) set flags within current group; non-capturing
(?flags:re) set flags during re; non-capturing
(?#text) comment NOT SUPPORTED
(?|x|y|z) branch numbering reset NOT SUPPORTED
(?>re) possessive match of «re» NOT SUPPORTED
re@> possessive match of «re» NOT SUPPORTED vim
%(re) non-capturing group NOT SUPPORTED vim
Flags:
i case-insensitive (default false)
m multi-line mode: «^» and «$» match begin/end line in addition to begin/end text (default false)
s let «.» match «\n» (default false)
U ungreedy: swap meaning of «x*» and «x*?», «x+» and «x+?», etc (default false)
Flag syntax is «xyz» (set) or «-xyz» (clear) or «xy-z» (set «xy», clear «z»).
Empty strings:
^ at beginning of text or line («m»=true)
$ at end of text (like «\z» not «\Z») or line («m»=true)
\A at beginning of text
\b at word boundary («\w» on one side and «\W», «\A», or «\z» on the other)
\B not a word boundary
\G at beginning of subtext being searched NOT SUPPORTED pcre
\G at end of last match NOT SUPPORTED perl
\Z at end of text, or before newline at end of text NOT SUPPORTED
\z at end of text
(?=re) before text matching «re» NOT SUPPORTED
(?!re) before text not matching «re» NOT SUPPORTED
(?<=re) after text matching «re» NOT SUPPORTED
(?<!re) after text not matching «re» NOT SUPPORTED
re& before text matching «re» NOT SUPPORTED vim
re@= before text matching «re» NOT SUPPORTED vim
re@! before text not matching «re» NOT SUPPORTED vim
re@<= after text matching «re» NOT SUPPORTED vim
re@<! after text not matching «re» NOT SUPPORTED vim
\zs sets start of match (= \K) NOT SUPPORTED vim
\ze sets end of match NOT SUPPORTED vim
\%^ beginning of file NOT SUPPORTED vim
\%$ end of file NOT SUPPORTED vim
\%V on screen NOT SUPPORTED vim
\%# cursor position NOT SUPPORTED vim
\%'m mark «m» position NOT SUPPORTED vim
\%23l in line 23 NOT SUPPORTED vim
\%23c in column 23 NOT SUPPORTED vim
\%23v in virtual column 23 NOT SUPPORTED vim
Escape sequences:
\a bell (== \007)
\f form feed (== \014)
\t horizontal tab (== \011)
\n newline (== \012)
\r carriage return (== \015)
\v vertical tab character (== \013)
\* literal «*», for any punctuation character «*»
\123 octal character code (up to three digits)
\x7F hex character code (exactly two digits)
\x{10FFFF} hex character code
\C match a single byte even in UTF-8 mode
\Q...\E literal text «...» even if «...» has punctuation
\1 backreference NOT SUPPORTED
\b backspace NOT SUPPORTED (use «\010»)
\cK control char ^K NOT SUPPORTED (use «\001» etc)
\e escape NOT SUPPORTED (use «\033»)
\g1 backreference NOT SUPPORTED
\g{1} backreference NOT SUPPORTED
\g{+1} backreference NOT SUPPORTED
\g{-1} backreference NOT SUPPORTED
\g{name} named backreference NOT SUPPORTED
\g<name> subroutine call NOT SUPPORTED
\g'name' subroutine call NOT SUPPORTED
\k<name> named backreference NOT SUPPORTED
\k'name' named backreference NOT SUPPORTED
\lX lowercase «X» NOT SUPPORTED
\ux uppercase «x» NOT SUPPORTED
\L...\E lowercase text «...» NOT SUPPORTED
\K reset beginning of «$0» NOT SUPPORTED
\N{name} named Unicode character NOT SUPPORTED
\R line break NOT SUPPORTED
\U...\E upper case text «...» NOT SUPPORTED
\X extended Unicode sequence NOT SUPPORTED
\%d123 decimal character 123 NOT SUPPORTED vim
\%xFF hex character FF NOT SUPPORTED vim
\%o123 octal character 123 NOT SUPPORTED vim
\%u1234 Unicode character 0x1234 NOT SUPPORTED vim
\%U12345678 Unicode character 0x12345678 NOT SUPPORTED vim
Character class elements:
x single character
A-Z character range (inclusive)
\d Perl character class
[:foo:] ASCII character class «foo»
\p{Foo} Unicode character class «Foo»
\pF Unicode character class «F» (one-letter name)
Named character classes as character class elements:
[\d] digits (== \d)
[^\d] not digits (== \D)
[\D] not digits (== \D)
[^\D] not not digits (== \d)
[[:name:]] named ASCII class inside character class (== [:name:])
[^[:name:]] named ASCII class inside negated character class (== [:^name:])
[\p{Name}] named Unicode property inside character class (== \p{Name})
[^\p{Name}] named Unicode property inside negated character class (== \P{Name})
Perl character classes:
\d digits (== [0-9])
\D not digits (== [^0-9])
\s whitespace (== [\t\n\f\r ])
\S not whitespace (== [^\t\n\f\r ])
\w word characters (== [0-9A-Za-z_])
\W not word characters (== [^0-9A-Za-z_])
\h horizontal space NOT SUPPORTED
\H not horizontal space NOT SUPPORTED
\v vertical space NOT SUPPORTED
\V not vertical space NOT SUPPORTED
ASCII character classes:
[:alnum:] alphanumeric (== [0-9A-Za-z])
[:alpha:] alphabetic (== [A-Za-z])
[:ascii:] ASCII (== [\x00-\x7F])
[:blank:] blank (== [\t ])
[:cntrl:] control (== [\x00-\x1F\x7F])
[:digit:] digits (== [0-9])
[:graph:] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
[:lower:] lower case (== [a-z])
[:print:] printable (== [ -~] == [ [:graph:]])
[:punct:] punctuation (== [!-/:-@[-`{-~])
[:space:] whitespace (== [\t\n\v\f\r ])
[:upper:] upper case (== [A-Z])
[:word:] word characters (== [0-9A-Za-z_])
[:xdigit:] hex digit (== [0-9A-Fa-f])
Unicode character class names--general category:
C other
Cc control
Cf format
Cn unassigned code points NOT SUPPORTED
Co private use
Cs surrogate
L letter
LC cased letter NOT SUPPORTED
L& cased letter NOT SUPPORTED
Ll lowercase letter
Lm modifier letter
Lo other letter
Lt titlecase letter
Lu uppercase letter
M mark
Mc spacing mark
Me enclosing mark
Mn non-spacing mark
N number
Nd decimal number
Nl letter number
No other number
P punctuation
Pc connector punctuation
Pd dash punctuation
Pe close punctuation
Pf final punctuation
Pi initial punctuation
Po other punctuation
Ps open punctuation
S symbol
Sc currency symbol
Sk modifier symbol
Sm math symbol
So other symbol
Z separator
Zl line separator
Zp paragraph separator
Zs space separator
Unicode character class names--scripts:
Arabic Arabic
Armenian Armenian
Balinese Balinese
Bamum Bamum
Batak Batak
Bengali Bengali
Bopomofo Bopomofo
Brahmi Brahmi
Braille Braille
Buginese Buginese
Buhid Buhid
Canadian_Aboriginal Canadian Aboriginal
Carian Carian
Chakma Chakma
Cham Cham
Cherokee Cherokee
Common characters not specific to one script
Coptic Coptic
Cuneiform Cuneiform
Cypriot Cypriot
Cyrillic Cyrillic
Deseret Deseret
Devanagari Devanagari
Egyptian_Hieroglyphs Egyptian Hieroglyphs
Ethiopic Ethiopic
Georgian Georgian
Glagolitic Glagolitic
Gothic Gothic
Greek Greek
Gujarati Gujarati
Gurmukhi Gurmukhi
Han Han
Hangul Hangul
Hanunoo Hanunoo
Hebrew Hebrew
Hiragana Hiragana
Imperial_Aramaic Imperial Aramaic
Inherited inherit script from previous character
Inscriptional_Pahlavi Inscriptional Pahlavi
Inscriptional_Parthian Inscriptional Parthian
Javanese Javanese
Kaithi Kaithi
Kannada Kannada
Katakana Katakana
Kayah_Li Kayah Li
Kharoshthi Kharoshthi
Khmer Khmer
Lao Lao
Latin Latin
Lepcha Lepcha
Limbu Limbu
Linear_B Linear B
Lycian Lycian
Lydian Lydian
Malayalam Malayalam
Mandaic Mandaic
Meetei_Mayek Meetei Mayek
Meroitic_Cursive Meroitic Cursive
Meroitic_Hieroglyphs Meroitic Hieroglyphs
Miao Miao
Mongolian Mongolian
Myanmar Myanmar
New_Tai_Lue New Tai Lue (aka Simplified Tai Lue)
Nko Nko
Ogham Ogham
Ol_Chiki Ol Chiki
Old_Italic Old Italic
Old_Persian Old Persian
Old_South_Arabian Old South Arabian
Old_Turkic Old Turkic
Oriya Oriya
Osmanya Osmanya
Phags_Pa 'Phags Pa
Phoenician Phoenician
Rejang Rejang
Runic Runic
Saurashtra Saurashtra
Sharada Sharada
Shavian Shavian
Sinhala Sinhala
Sora_Sompeng Sora Sompeng
Sundanese Sundanese
Syloti_Nagri Syloti Nagri
Syriac Syriac
Tagalog Tagalog
Tagbanwa Tagbanwa
Tai_Le Tai Le
Tai_Tham Tai Tham
Tai_Viet Tai Viet
Takri Takri
Tamil Tamil
Telugu Telugu
Thaana Thaana
Thai Thai
Tibetan Tibetan
Tifinagh Tifinagh
Ugaritic Ugaritic
Vai Vai
Yi Yi
Vim character classes:
\i identifier character NOT SUPPORTED vim
\I «\i» except digits NOT SUPPORTED vim
\k keyword character NOT SUPPORTED vim
\K «\k» except digits NOT SUPPORTED vim
\f file name character NOT SUPPORTED vim
\F «\f» except digits NOT SUPPORTED vim
\p printable character NOT SUPPORTED vim
\P «\p» except digits NOT SUPPORTED vim
\s whitespace character (== [ \t]) NOT SUPPORTED vim
\S non-white space character (== [^ \t]) NOT SUPPORTED vim
\d digits (== [0-9]) vim
\D not «\d» vim
\x hex digits (== [0-9A-Fa-f]) NOT SUPPORTED vim
\X not «\x» NOT SUPPORTED vim
\o octal digits (== [0-7]) NOT SUPPORTED vim
\O not «\o» NOT SUPPORTED vim
\w word character vim
\W not «\w» vim
\h head of word character NOT SUPPORTED vim
\H not «\h» NOT SUPPORTED vim
\a alphabetic NOT SUPPORTED vim
\A not «\a» NOT SUPPORTED vim
\l lowercase NOT SUPPORTED vim
\L not lowercase NOT SUPPORTED vim
\u uppercase NOT SUPPORTED vim
\U not uppercase NOT SUPPORTED vim
\_x «\x» plus newline, for any «x» NOT SUPPORTED vim
Vim flags:
\c ignore case NOT SUPPORTED vim
\C match case NOT SUPPORTED vim
\m magic NOT SUPPORTED vim
\M nomagic NOT SUPPORTED vim
\v verymagic NOT SUPPORTED vim
\V verynomagic NOT SUPPORTED vim
\Z ignore differences in Unicode combining characters NOT SUPPORTED vim
Magic:
(?{code}) arbitrary Perl code NOT SUPPORTED perl
(??{code}) postponed arbitrary Perl code NOT SUPPORTED perl
(?n) recursive call to regexp capturing group «n» NOT SUPPORTED
(?+n) recursive call to relative group «+n» NOT SUPPORTED
(?-n) recursive call to relative group «-n» NOT SUPPORTED
(?C) PCRE callout NOT SUPPORTED pcre
(?R) recursive call to entire regexp (== (?0)) NOT SUPPORTED
(?&name) recursive call to named group NOT SUPPORTED
(?P=name) named backreference NOT SUPPORTED
(?P>name) recursive call to named group NOT SUPPORTED
(?(cond)true|false) conditional branch NOT SUPPORTED
(?(cond)true) conditional branch NOT SUPPORTED
(*ACCEPT) make regexps more like Prolog NOT SUPPORTED
(*COMMIT) NOT SUPPORTED
(*F) NOT SUPPORTED
(*FAIL) NOT SUPPORTED
(*MARK) NOT SUPPORTED
(*PRUNE) NOT SUPPORTED
(*SKIP) NOT SUPPORTED
(*THEN) NOT SUPPORTED
(*ANY) set newline convention NOT SUPPORTED
(*ANYCRLF) NOT SUPPORTED
(*CR) NOT SUPPORTED
(*CRLF) NOT SUPPORTED
(*LF) NOT SUPPORTED
(*BSR_ANYCRLF) set \R convention NOT SUPPORTED pcre
(*BSR_UNICODE) NOT SUPPORTED pcre

BIN
outside/re2/doc/xkcd.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

View File

@ -0,0 +1 @@
defaultcc: re2-dev@googlegroups.com

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,16 @@
{
global:
# re2::RE2*
_ZN3re23RE2*;
_ZNK3re23RE2*;
# re2::StringPiece*
_ZN3re211StringPiece*;
_ZNK3re211StringPiece*;
# operator<<(std::ostream&, re2::StringPiece const&)
_ZlsRSoRKN3re211StringPieceE;
# re2::FilteredRE2*
_ZN3re211FilteredRE2*;
_ZNK3re211FilteredRE210AllMatches*;
local:
*;
};

View File

@ -0,0 +1,13 @@
# Linker doesn't like these unmangled:
# re2::RE2*
__ZN3re23RE2*
__ZNK3re23RE2*
# re2::StringPiece*
__ZN3re211StringPiece*
__ZNK3re211StringPiece*
# operator<<(std::ostream&, re2::StringPiece const&)
__ZlsRNSt3__113basic_ostreamIcNS_11char_traitsIcEEEERKN3re211StringPieceE
# re2::FilteredRE2*
__ZN3re211FilteredRE2*
__ZNK3re211FilteredRE210AllMatches*

1
outside/re2/re2/Makefile Normal file
View File

@ -0,0 +1 @@

378
outside/re2/re2/bitstate.cc Normal file
View File

@ -0,0 +1,378 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
// Prog::SearchBitState is a regular expression search with submatch
// tracking for small regular expressions and texts. Like
// testing/backtrack.cc, it allocates a bit vector with (length of
// text) * (length of prog) bits, to make sure it never explores the
// same (character position, instruction) state multiple times. This
// limits the search to run in time linear in the length of the text.
//
// Unlike testing/backtrack.cc, SearchBitState is not recursive
// on the text.
//
// SearchBitState is a fast replacement for the NFA code on small
// regexps and texts when SearchOnePass cannot be used.
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
// One entry on BitState's explicit backtracking stack.
struct Job {
  int id;         // instruction id to run (or resume)
  int arg;        // 0 for a first visit; nonzero when continuing an
                  // earlier visit of the same instruction
  const char* p;  // text position; for a capture continuation this is
                  // the saved capture-register value to restore
};
// Backtracking searcher that records every (instruction, text position)
// pair it has explored in a bitmap, so that no pair is explored twice.
// A BitState is single-use: Search allocates the scratch arrays and the
// destructor frees them.
class BitState {
public:
explicit BitState(Prog* prog);
~BitState();
// The usual Search prototype.
// Can only call Search once per BitState.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
private:
// Returns true the first time (id, p) is asked about and marks the pair
// as visited; returns false on every later query for the same pair.
inline bool ShouldVisit(int id, const char* p);
// Pushes (id, p, arg) onto job_, growing the stack when it is full.
// Nothing is pushed for a kInstFail instruction, or when arg == 0 and
// (id, p) has already been visited.
void Push(int id, const char* p, int arg);
// Doubles the capacity of job_; returns false if there is still no room.
bool GrowStack();
// Runs the search from instruction id with the text pointer at p.
bool TrySearch(int id, const char* p);
// Search parameters
Prog* prog_; // program being run
StringPiece text_; // text being searched
StringPiece context_; // greater context of text being searched
bool anchored_; // whether search is anchored at text.begin()
bool longest_; // whether search wants leftmost-longest match
bool endmatch_; // whether match must end at text.end()
StringPiece *submatch_; // submatches to fill in
int nsubmatch_; // # of submatches to fill in
// Search state
const char** cap_; // capture registers
int ncap_; // # of entries in cap_
static const int VisitedBits = 32; // bits per word of visited_
uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
int nvisited_; // # of words in bitmap
Job *job_; // stack of text positions to explore
int njob_; // # of jobs currently on the stack
int maxjob_; // allocated capacity of job_
};
// Binds the searcher to prog.  No scratch memory is allocated here:
// every flag starts out false, every pointer NULL and every count zero.
BitState::BitState(Prog* prog)
    : prog_(prog),
      anchored_(false),
      longest_(false),
      endmatch_(false),
      submatch_(NULL),
      nsubmatch_(0),
      cap_(NULL),
      ncap_(0),
      visited_(NULL),
      nvisited_(0),
      job_(NULL),
      njob_(0),
      maxjob_(0) {
}
// Frees the three scratch arrays: the visited bitmap, the job stack
// and the capture registers.
BitState::~BitState() {
  delete[] visited_;
  delete[] job_;
  delete[] cap_;
}
// Should the search visit the pair ip, p?
// If so, remember that it was visited so that the next time,
// we don't repeat the visit.
bool BitState::ShouldVisit(int id, const char* p) {
uint n = id * (text_.size() + 1) + (p - text_.begin());
if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
return false;
visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
return true;
}
// Grow the stack.
bool BitState::GrowStack() {
// VLOG(0) << "Reallocate.";
maxjob_ *= 2;
Job* newjob = new Job[maxjob_];
memmove(newjob, job_, njob_*sizeof job_[0]);
delete[] job_;
job_ = newjob;
if (njob_ >= maxjob_) {
LOG(DFATAL) << "Job stack overflow.";
return false;
}
return true;
}
// Push the triple (id, p, arg) onto the stack, growing it if necessary.
void BitState::Push(int id, const char* p, int arg) {
if (njob_ >= maxjob_) {
if (!GrowStack())
return;
}
int op = prog_->inst(id)->opcode();
if (op == kInstFail)
return;
// Only check ShouldVisit when arg == 0.
// When arg > 0, we are continuing a previous visit.
if (arg == 0 && !ShouldVisit(id, p))
return;
Job* j = &job_[njob_++];
j->id = id;
j->p = p;
j->arg = arg;
}
// Try a search from instruction id0 in state p0.
// Return whether it succeeded.
//
// Pending work lives on the explicit job_ stack rather than the call
// stack.  When nsubmatch_ > 0, the best match seen so far is copied
// from the capture registers into submatch_: the first match found,
// or, with longest_ set, any later match that ends further right.
bool BitState::TrySearch(int id0, const char* p0) {
  bool matched = false;
  const char* end = text_.end();
  njob_ = 0;
  Push(id0, p0, 0);
  while (njob_ > 0) {
    // Pop job off stack.
    --njob_;
    int id = job_[njob_].id;
    const char* p = job_[njob_].p;
    int arg = job_[njob_].arg;

    // Optimization: rather than push and pop,
    // code that is going to Push and continue
    // the loop simply updates id, p, and arg
    // and jumps to CheckAndLoop.  We have to
    // do the ShouldVisit check that Push
    // would have, but we avoid the stack
    // manipulation.
    // The if (0) keeps a freshly popped job from running the check:
    // only a goto can reach the label.
    if (0) {
    CheckAndLoop:
      if (!ShouldVisit(id, p))
        continue;
    }

    // Visit id, p.
    // VLOG(0) << "Job: " << ip->id() << " "
    //         << (p - text_.begin()) << " " << arg;
    Prog::Inst* ip = prog_->inst(id);
    switch (ip->opcode()) {
      case kInstFail:
      default:
        LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
        return false;

      case kInstAlt:
        // Cannot just
        //   Push(ip->out1(), p, 0);
        //   Push(ip->out(), p, 0);
        // If, during the processing of ip->out(), we encounter
        // ip->out1() via another path, we want to process it then.
        // Pushing it here will inhibit that.  Instead, re-push
        // ip with arg==1 as a reminder to push ip->out1() later.
        switch (arg) {
          case 0:
            Push(id, p, 1);  // come back when we're done
            id = ip->out();
            goto CheckAndLoop;

          case 1:
            // Finished ip->out(); try ip->out1().
            arg = 0;
            id = ip->out1();
            goto CheckAndLoop;
        }
        // Name the opcode that actually failed (this branch used to
        // report kInstCapture).
        LOG(DFATAL) << "Bad arg in kInstAlt: " << arg;
        continue;

      case kInstAltMatch:
        // One opcode is byte range; the other leads to match.
        if (ip->greedy(prog_)) {
          // out1 is the match
          Push(ip->out1(), p, 0);
          id = ip->out1();
          p = end;
          goto CheckAndLoop;
        }

        // out is the match - non-greedy
        Push(ip->out(), end, 0);
        id = ip->out();
        goto CheckAndLoop;

      case kInstByteRange: {
        // c stays -1 at end of text, so Matches() is asked about a
        // value that is not a byte.
        int c = -1;
        if (p < end)
          c = *p & 0xFF;
        if (ip->Matches(c)) {
          id = ip->out();
          p++;
          goto CheckAndLoop;
        }
        continue;
      }

      case kInstCapture:
        switch (arg) {
          case 0:
            if (0 <= ip->cap() && ip->cap() < ncap_) {
              // Capture p to register, but save old value.
              Push(id, cap_[ip->cap()], 1);  // come back when we're done
              cap_[ip->cap()] = p;
            }
            // Continue on.
            id = ip->out();
            goto CheckAndLoop;

          case 1:
            // Finished ip->out(); restore the old value.
            // (Here p is the saved register value pushed by case 0.)
            cap_[ip->cap()] = p;
            continue;
        }
        LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
        continue;

      case kInstEmptyWidth:
        // Skip this path if the instruction needs an empty-width flag
        // that does not hold at p.
        if (ip->empty() & ~Prog::EmptyFlags(context_, p))
          continue;
        id = ip->out();
        goto CheckAndLoop;

      case kInstNop:
        id = ip->out();
        goto CheckAndLoop;

      case kInstMatch: {
        if (endmatch_ && p != text_.end())
          continue;

        // VLOG(0) << "Found match.";
        // We found a match. If the caller doesn't care
        // where the match is, no point going further.
        if (nsubmatch_ == 0)
          return true;

        // Record best match so far.
        // Only need to check end point, because this entire
        // call is only considering one start position.
        matched = true;
        cap_[1] = p;
        if (submatch_[0].data() == NULL ||
            (longest_ && p > submatch_[0].end())) {
          for (int i = 0; i < nsubmatch_; i++)
            submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
        }

        // If going for first match, we're done.
        if (!longest_)
          return true;

        // If we used the entire text, no longer match is possible.
        if (p == text_.end())
          return true;

        // Otherwise, continue on in hope of a longer match.
        continue;
      }
    }
  }
  return matched;
}
// Searches text (interpreted within context) for a match of prog_.
//
// A context with a NULL begin() means "same as text".  anchored and
// longest are the caller's requests; the program's own start/end
// anchoring is folded in on top of them.  The first nsubmatch entries
// of submatch are reset and then receive the match, if any.
// Returns true if a match was found.
bool BitState::Search(const StringPiece& text, const StringPiece& context,
                      bool anchored, bool longest,
                      StringPiece* submatch, int nsubmatch) {
  // Search parameters.
  text_ = text;
  context_ = context;
  if (context_.begin() == NULL)
    context_ = text;

  // An anchored program cannot match text that does not touch the
  // corresponding edge of the context.
  if ((prog_->anchor_start() && context_.begin() != text.begin()) ||
      (prog_->anchor_end() && context_.end() != text.end()))
    return false;

  anchored_ = anchored || prog_->anchor_start();
  longest_ = longest || prog_->anchor_end();
  endmatch_ = prog_->anchor_end();
  submatch_ = submatch;
  nsubmatch_ = nsubmatch;
  for (int k = 0; k < nsubmatch_; k++)
    submatch_[k] = NULL;

  // Allocate scratch space: one visited bit per (instruction, position)
  // pair, rounded up to whole words.
  nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
  visited_ = new uint32[nvisited_];
  memset(visited_, 0, nvisited_*sizeof visited_[0]);
  // VLOG(0) << "nvisited_ = " << nvisited_;

  // Two registers per submatch, and never fewer than two in total.
  const int wanted = 2*nsubmatch;
  ncap_ = (wanted < 2) ? 2 : wanted;
  cap_ = new const char*[ncap_];
  memset(cap_, 0, ncap_*sizeof cap_[0]);

  maxjob_ = 256;
  job_ = new Job[maxjob_];

  // Anchored search must start at text.begin().
  if (anchored_) {
    cap_[0] = text.begin();
    return TrySearch(prog_->start(), text.begin());
  }

  // Unanchored search, starting from each possible text position.
  // The empty string at the very end of the text has to be tried too,
  // hence p <= text.end() rather than p < text.end().
  // This is not quadratic: visited_ is not cleared between calls to
  // TrySearch, so no work is repeated and the total stays linear.
  const char* p = text.begin();
  while (p <= text.end()) {
    cap_[0] = p;
    if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
      return true;
    p++;
  }
  return false;
}
// Bit-state search.
bool Prog::SearchBitState(const StringPiece& text,
const StringPiece& context,
Anchor anchor,
MatchKind kind,
StringPiece* match,
int nmatch) {
// If full match, we ask for an anchored longest match
// and then check that match[0] == text.
// So make sure match[0] exists.
StringPiece sp0;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch < 1) {
match = &sp0;
nmatch = 1;
}
}
// Run the search.
BitState b(this);
bool anchored = anchor == kAnchored;
bool longest = kind != kFirstMatch;
if (!b.Search(text, context, anchored, longest, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
} // namespace re2

1140
outside/re2/re2/compile.cc Normal file

File diff suppressed because it is too large Load Diff

2115
outside/re2/re2/dfa.cc Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,102 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <string>
#include "util/util.h"
#include "re2/filtered_re2.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
namespace re2 {
// Starts out uncompiled, with a freshly allocated prefilter tree that
// the destructor deletes.
FilteredRE2::FilteredRE2()
    : compiled_(false),
      prefilter_tree_(new PrefilterTree()) {
}
// Deletes every RE2 stored by Add, then the prefilter tree.
FilteredRE2::~FilteredRE2() {
  for (vector<RE2*>::iterator it = re2_vec_.begin();
       it != re2_vec_.end(); ++it)
    delete *it;
  delete prefilter_tree_;
}
// Compiles pattern with options and, on success, appends the resulting
// RE2 to re2_vec_ and stores its index in *id.
//
// On failure the RE2 is deleted, *id is left untouched, and (when
// options.log_errors() is set) the failure is logged.
// Returns the RE2's error_code() in either case.
RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
                                const RE2::Options& options, int* id) {
  RE2* re = new RE2(pattern, options);
  RE2::ErrorCode code = re->error_code();

  if (!re->ok()) {
    if (options.log_errors()) {
      // Log the pattern text; streaming `re` itself would only print
      // the address of the RE2 object.
      LOG(ERROR) << "Couldn't compile regular expression, skipping: "
                 << re->pattern() << " due to error " << re->error();
    }
    delete re;
  } else {
    *id = re2_vec_.size();
    re2_vec_.push_back(re);
  }

  return code;
}
// Builds the prefilter tree from every regexp added so far and stores
// the strings the caller must look for in *atoms (cleared first).
// Does nothing but log when already compiled or when no regexp has
// been added.
void FilteredRE2::Compile(vector<string>* atoms) {
  const bool nothing_to_do = compiled_ || re2_vec_.size() == 0;
  if (nothing_to_do) {
    LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
    return;
  }

  for (vector<RE2*>::iterator it = re2_vec_.begin();
       it != re2_vec_.end(); ++it)
    prefilter_tree_->Add(Prefilter::FromRE2(*it));

  atoms->clear();
  prefilter_tree_->Compile(atoms);
  compiled_ = true;
}
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
for (int i = 0; i < re2_vec_.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[i]))
return i;
return -1;
}
int FilteredRE2::FirstMatch(const StringPiece& text,
const vector<int>& atoms) const {
if (!compiled_) {
LOG(DFATAL) << "FirstMatch called before Compile";
return -1;
}
vector<int> regexps;
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
for (int i = 0; i < regexps.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
return regexps[i];
return -1;
}
bool FilteredRE2::AllMatches(
const StringPiece& text,
const vector<int>& atoms,
vector<int>* matching_regexps) const {
matching_regexps->clear();
vector<int> regexps;
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
for (int i = 0; i < regexps.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
matching_regexps->push_back(regexps[i]);
return !matching_regexps->empty();
}
// Forwards straight to PrefilterTree::RegexpsGivenStrings: given the
// atoms that matched, the tree fills *passed_regexps.
void FilteredRE2::RegexpsGivenStrings(const vector<int>& matched_atoms,
vector<int>* passed_regexps) {
prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
}
// Forwards to PrefilterTree::PrintPrefilter for the regexp with index
// regexpid.
void FilteredRE2::PrintPrefilter(int regexpid) {
prefilter_tree_->PrintPrefilter(regexpid);
}
} // namespace re2

View File

@ -0,0 +1,101 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
// It provides a prefilter mechanism that helps in cutting down the
// number of regexps that need to be actually searched.
//
// By design, it does not include a string matching engine. This is to
// allow the user of the class to use their favorite string match
// engine. The overall flow is: Add all the regexps using Add, then
// Compile the FilteredRE2. The compile returns strings that need to
// be matched. Note that all returned strings are lowercase. For
// applying regexps to a search text, the caller does the string
// matching using the strings returned. When doing the string match,
// note that the caller has to do that on lower cased version of the
// search text. Then call FirstMatch or AllMatches with a vector of
// indices of strings that were found in the text to get the actual
// regexp matches.
#ifndef RE2_FILTERED_RE2_H_
#define RE2_FILTERED_RE2_H_
#include <vector>
#include "re2/re2.h"
namespace re2 {
using std::vector;
class PrefilterTree;
// Owns a set of RE2 objects plus a PrefilterTree built from them.
// Typical use: Add every regexp, call Compile once, then call
// FirstMatch or AllMatches for each text.
class FilteredRE2 {
public:
FilteredRE2();
~FilteredRE2();
// Uses RE2 constructor to create a RE2 object (re). Returns
// re->error_code(). If error_code is other than NoError, then re is
// deleted and not added to re2_vec_.
// On success *id is set to the index of the new regexp; on failure
// *id is not written.
RE2::ErrorCode Add(const StringPiece& pattern,
const RE2::Options& options,
int *id);
// Prepares the regexps added by Add for filtering. Returns a set
// of strings that the caller should check for in candidate texts.
// The returned strings are lowercased. When doing string matching,
// the search text should be lowercased first to find matching
// strings from the set of strings returned by Compile. Call after
// all Add calls are done.
void Compile(vector<string>* strings_to_match);
// Returns the index of the first matching regexp.
// Returns -1 on no match. Can be called prior to Compile.
// Does not do any filtering: simply tries to Match the
// regexps in a loop.
int SlowFirstMatch(const StringPiece& text) const;
// Returns the index of the first matching regexp.
// Returns -1 on no match. Compile has to be called before
// calling this.
int FirstMatch(const StringPiece& text,
const vector<int>& atoms) const;
// Stores the indices of all matching regexps in matching_regexps,
// after first clearing it. Returns true if at least one regexp
// matched.
bool AllMatches(const StringPiece& text,
const vector<int>& atoms,
vector<int>* matching_regexps) const;
// The number of regexps added.
int NumRegexps() const { return re2_vec_.size(); }
private:
// Get the individual RE2 objects. Useful for testing.
// regexpid is used as an index into re2_vec_ without a bounds check.
RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
// Print prefilter.
void PrintPrefilter(int regexpid);
// Useful for testing and debugging.
void RegexpsGivenStrings(const vector<int>& matched_atoms,
vector<int>* passed_regexps);
// All the regexps in the FilteredRE2.
vector<RE2*> re2_vec_;
// Has the FilteredRE2 been compiled using Compile()
bool compiled_;
// An AND-OR tree of string atoms used for filtering regexps.
PrefilterTree* prefilter_tree_;
// Copying is disabled by declaring the copy constructor and the
// assignment operator private (hand-expanded form of the macro below).
//DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2);
FilteredRE2(const FilteredRE2&);
void operator=(const FilteredRE2&);
};
} // namespace re2
#endif // RE2_FILTERED_RE2_H_

View File

@ -0,0 +1,110 @@
#!/usr/bin/perl
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Generate table entries giving character ranges
# for POSIX/Perl character classes. Rather than
# figure out what the definition is, it is easier to ask
# Perl about each letter from 0-128 and write down
# its answer.
# POSIX bracket-class names. ComputeClass wraps each one in an extra
# pair of brackets before matching, e.g. "[[:alnum:]]".
@posixclasses = (
"[:alnum:]",
"[:alpha:]",
"[:ascii:]",
"[:blank:]",
"[:cntrl:]",
"[:digit:]",
"[:graph:]",
"[:lower:]",
"[:print:]",
"[:punct:]",
"[:space:]",
"[:upper:]",
"[:word:]",
"[:xdigit:]",
);
# Perl backslash classes. PrintClass derives the negated entry by
# upper-casing the letter (\d -> \D).
@perlclasses = (
"\\d",
"\\s",
"\\w",
);
# ComputeClass($class) -> list of [lo, hi] array refs.
# Asks Perl which of the code points 0..128 match "[$class]" and
# collapses consecutive hits into inclusive ranges.
sub ComputeClass($) {
  my ($class) = @_;
  my $pattern = "[$class]";
  my @ranges;
  my $run_start = -1;
  # 256 is a sentinel that is never tested against the pattern; it only
  # closes a run still open after 128 (such a run ends at 255).
  foreach my $cp (0 .. 128, 256) {
    if ($cp <= 128 && chr($cp) =~ $pattern) {
      $run_start = $cp if $run_start < 0;
      next;
    }
    push @ranges, [$run_start, $cp - 1] if $run_start >= 0;
    $run_start = -1;
  }
  return @ranges;
}
# PrintClass($cname, $name, @ranges)
# Prints a "static const URange16 code<cname>[]" table holding @ranges
# (each element an [lo, hi] array ref) and returns two UGroup
# initializer strings: one for the class itself (+1) and one for its
# negation (-1), both pointing at the same table.
sub PrintClass($$@) {
  my ($cname, $name, @ranges) = @_;
  print "static const URange16 code${cname}[] = { /* $name */\n";
  foreach my $range (@ranges) {
    printf "\t{ 0x%x, 0x%x },\n", $range->[0], $range->[1];
  }
  print "};\n";
  my $n = @ranges;
  # Double the backslashes so the name survives as a C string literal.
  my $escname = $name;
  $escname =~ s/\\/\\\\/g;
  # Lexical (was an undeclared package global in the original).
  my $negname = $escname;
  if ($negname =~ /:/) {
    # POSIX form: [:alpha:] -> [:^alpha:] (first colon only).
    $negname =~ s/:/:^/;
  } else {
    # Perl form: \d -> \D.
    $negname =~ y/a-z/A-Z/;
  }
  return "{ \"$escname\", +1, code$cname, $n }", "{ \"$negname\", -1, code$cname, $n }";
}
# Counter used to give every emitted range table a unique name
# (code1, code2, ...); shared across PrintClasses calls.
my $gen = 0;

# PrintClasses($cname, @classes)
# Emits one range table per class, then the "<cname>_groups" array of
# UGroup entries (positive and negated) and its element count.
sub PrintClasses($@) {
  my ($cname, @classes) = @_;
  my @entries;
  for my $class (@classes) {
    my @ranges = ComputeClass($class);
    $gen++;
    push @entries, PrintClass($gen, $class, @ranges);
  }
  print "const UGroup ${cname}_groups[] = {\n";
  print "\t$_,\n" for @entries;
  print "};\n";
  my $count = scalar @entries;
  print "const int num_${cname}_groups = $count;\n";
}
# Script body: write the generated-file preamble, the Perl tables, the
# POSIX tables, and finally the closing namespace brace to stdout.
print <<EOF;
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
EOF
PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);
print <<EOF;
} // namespace re2
EOF

View File

@ -0,0 +1,146 @@
#!/usr/bin/python
# coding=utf-8
#
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# See unicode_casefold.h for description of case folding tables.
"""Generate C++ table for Unicode case folding."""
import unicode, sys
# Text printed by main() before the generated tables.
_header = """
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
// make_unicode_casefold.py >unicode_casefold.cc
#include "re2/unicode_casefold.h"
namespace re2 {
"""
# Text printed by main() after the generated tables.
_trailer = """
} // namespace re2
"""
def _Delta(a, b):
"""Compute the delta for b - a. Even/odd and odd/even
are handled specially, as described above."""
if a+1 == b:
if a%2 == 0:
return 'EvenOdd'
else:
return 'OddEven'
if a == b+1:
if a%2 == 0:
return 'OddEven'
else:
return 'EvenOdd'
return b - a
def _AddDelta(a, delta):
"""Return a + delta, handling EvenOdd and OddEven specially."""
if type(delta) == int:
return a+delta
if delta == 'EvenOdd':
if a%2 == 0:
return a+1
else:
return a-1
if delta == 'OddEven':
if a%2 == 1:
return a+1
else:
return a-1
print >>sys.stderr, "Bad Delta: ", delta
raise "Bad Delta"
def _MakeRanges(pairs):
  """Collapse (source, target) pairs into [lo, hi, delta] ranges.

  For example [(65,97), (66,98), ..., (90,122)] becomes [[65, 90, 32]].
  A range whose sources are two apart and share a symbolic delta gets
  that delta suffixed with 'Skip'.
  """
  ranges = []
  prev = -100  # source of the previous pair; -100 means "none yet"

  def extend_adjacent(prev, a, b, cur):
    # Grow cur by one when a directly follows prev and maps via cur's delta.
    if a == prev + 1 and b == _AddDelta(a, cur[2]):
      cur[1] = a
      return True
    return False

  def extend_skipping(prev, a, b, cur):
    # Grow cur when a is two past prev and maps via cur's symbolic delta;
    # the range's delta is marked (or stays marked) with 'Skip'.
    if a != prev + 2:
      return False
    delta = cur[2]
    if type(delta) is not str:
      return False
    if delta.endswith('Skip'):
      base = delta[:-4]
      marked = delta
    else:
      base = delta
      marked = delta + 'Skip'
    if b != _AddDelta(a, base):
      return False
    cur[1] = a
    cur[2] = marked
    return True

  for a, b in pairs:
    merged = False
    if ranges:
      merged = (extend_adjacent(prev, a, b, ranges[-1]) or
                extend_skipping(prev, a, b, ranges[-1]))
    if not merged:
      ranges.append([a, a, _Delta(a, b)])
    prev = a
  return ranges
# The maximum size of a case-folding group.
# Case folding is implemented in parse.cc by a recursive process
# with a recursion depth equal to the size of the largest
# case-folding group, so it is important that this bound be small.
# The current tables have no group bigger than 4.
# If there are ever groups bigger than 10 or so, it will be
# time to rework the code in parse.cc.
# main() raises unicode.Error when a group exceeds this bound.
MaxCasefoldGroup = 4
def main():
lowergroups, casegroups = unicode.CaseGroups()
foldpairs = []
seen = {}
for c in casegroups:
if len(c) > MaxCasefoldGroup:
raise unicode.Error("casefold group too long: %s" % (c,))
for i in range(len(c)):
if c[i-1] in seen:
raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
seen[c[i-1]] = True
foldpairs.append([c[i-1], c[i]])
lowerpairs = []
for lower, group in lowergroups.iteritems():
for g in group:
if g != lower:
lowerpairs.append([g, lower])
def printpairs(name, foldpairs):
foldpairs.sort()
foldranges = _MakeRanges(foldpairs)
print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))
print "const CaseFold unicode_%s[] = {" % (name,)
for lo, hi, delta in foldranges:
print "\t{ %d, %d, %s }," % (lo, hi, delta)
print "};"
print "const int num_unicode_%s = %d;" % (name, len(foldranges),)
print ""
print _header
printpairs("casefold", foldpairs)
printpairs("tolower", lowerpairs)
print _trailer
if __name__ == '__main__':
main()

View File

@ -0,0 +1,111 @@
#!/usr/bin/python
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
"""Generate C++ tables for Unicode Script and Category groups."""
import sys
import unicode
# Text printed by main() before the generated tables.
_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
"""
# Text printed by main() after the generated tables.
_trailer = """
} // namespace re2
"""
# Running totals of 16-bit and 32-bit ranges; incremented by PrintGroup
# and reported in a comment by main().
n16 = 0
n32 = 0
def MakeRanges(codes):
  """Collapse a list of codes into inclusive [lo, hi] runs.

  Consecutive values are merged, so [1,2,3,7,8,9] becomes
  [[1,3], [7,9]]. The input is consumed in the order given.
  """
  ranges = []
  prev = -100  # value seen on the previous iteration; -100 = none yet
  for code in codes:
    if code != prev + 1:
      ranges.append([code, code])
    else:
      ranges[-1][1] = code
    prev = code
  return ranges
def PrintRanges(type, name, ranges):
"""Print the ranges as an array of type named name."""
print "static const %s %s[] = {" % (type, name,)
for lo, hi in ranges:
print "\t{ %d, %d }," % (lo, hi)
print "};"
# def PrintCodes(type, name, codes):
# """Print the codes as an array of type named name."""
# print "static %s %s[] = {" % (type, name,)
# for c in codes:
# print "\t%d," % (c,)
# print "};"
def PrintGroup(name, codes):
  """Print the range tables for one group and return its UGroup literal.

  Codes below 65536 go into a URange16 table, the rest into a URange32
  table; a table is only printed when it is non-empty. The module
  counters n16 and n32 are increased by the number of ranges produced.
  See unicode_groups.h for a description of the data structure.
  """
  global n16
  global n32

  # Split codes into 16-bit ranges and 32-bit ranges.
  # (A disabled variant also pulled singleton ranges out into a separate
  # code16 array; see the commented-out PrintCodes helper.)
  range16 = MakeRanges([c for c in codes if c < 65536])
  range32 = MakeRanges([c for c in codes if c >= 65536])
  n16 += len(range16)
  n32 += len(range32)

  if len(range16) > 0:
    PrintRanges("URange16", name+"_range16", range16)
    part16 = ", %s_range16, %d" % (name, len(range16))
  else:
    part16 = ", 0, 0"

  if len(range32) > 0:
    PrintRanges("URange32", name+"_range32", range32)
    part32 = ", %s_range32, %d" % (name, len(range32))
  else:
    part32 = ", 0, 0"

  return "{ \"%s\", +1" % (name,) + part16 + part32 + " }"
def main():
print _header
ugroups = []
for name, codes in unicode.Categories().iteritems():
ugroups.append(PrintGroup(name, codes))
for name, codes in unicode.Scripts().iteritems():
ugroups.append(PrintGroup(name, codes))
print "// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32)
print "const UGroup unicode_groups[] = {";
ugroups.sort()
for ug in ugroups:
print "\t%s," % (ug,)
print "};"
print "const int num_unicode_groups = %d;" % (len(ugroups),)
print _trailer
if __name__ == '__main__':
main()

View File

@ -0,0 +1,185 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Determine whether this library should match PCRE exactly
// for a particular Regexp. (If so, the testing framework can
// check that it does.)
//
// This library matches PCRE except in these cases:
// * the regexp contains a repetition of an empty string,
// like (a*)* or (a*)+. In this case, PCRE will treat
// the repetition sequence as ending with an empty string,
// while this library does not.
// * Perl and PCRE differ on whether \v matches \n.
// For historical reasons, this library implements the Perl behavior.
// * Perl and PCRE allow $ in one-line mode to match either the very
// end of the text or just before a \n at the end of the text.
// This library requires it to match only the end of the text.
// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
// match the end of the text if the last character is a \n.
// This library does allow it.
//
// Regexp::MimicsPCRE checks for any of these conditions.
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Returns whether re might match an empty string.
static bool CanBeEmptyString(Regexp *re);
// Walker class to compute whether library handles a regexp
// exactly as PCRE would.  See comment at top for conditions.
// The walk value is a bool: true while the (sub)expression is still
// believed to behave like PCRE.
class PCREWalker : public Regexp::Walker<bool> {
 public:
  PCREWalker() {}

  // Combines the children's verdicts with the checks for re itself.
  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
                 int nchild_args);

  bool ShortVisit(Regexp* re, bool a) {
    // Should never be called: we use Walk not WalkExponential.
    // (The message used to name EmptyStringWalker, a copy-paste slip.)
    LOG(DFATAL) << "PCREWalker::ShortVisit called";
    return a;
  }
};
// Runs once re's children have been visited; child_args[i] says whether
// the i-th subexpression mimics PCRE.  Returns false as soon as any
// child failed or re itself hits one of the known divergences listed
// at the top of this file.
bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                           bool* child_args, int nchild_args) {
  // A failing child makes the whole expression fail.
  for (int k = 0; k < nchild_args; k++) {
    if (!child_args[k])
      return false;
  }

  // Children are fine; now judge this node.
  bool mimics = true;
  switch (re->op()) {
    // Repetition of something that may be empty.
    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
      mimics = !CanBeEmptyString(re->sub()[0]);
      break;

    // Only unbounded repeats ({n,}) are a problem.
    case kRegexpRepeat:
      mimics = !(re->max() == -1 && CanBeEmptyString(re->sub()[0]));
      break;

    // \v
    case kRegexpLiteral:
      mimics = (re->rune() != '\v');
      break;

    // $ in single-line mode.
    case kRegexpEndText:
    case kRegexpEmptyMatch:
      mimics = !(re->parse_flags() & Regexp::WasDollar);
      break;

    // ^ in multi-line mode.  Unconditional: in single-line mode ^
    // becomes kRegexpBeginText instead.
    case kRegexpBeginLine:
      mimics = false;
      break;

    default:
      break;
  }
  return mimics;
}
// Reports whether matching with this regexp is expected to give
// exactly the results PCRE would, by walking it with a PCREWalker.
bool Regexp::MimicsPCRE() {
  return PCREWalker().Walk(this, true);
}
// Walker that decides whether a Regexp can match the empty string.
// Overestimating is acceptable: \b\B can never match (the two are
// mutually exclusive) yet this walker reports that it can.  Such false
// positives only shrink the set of regexps that get sanity-checked
// against PCRE; they do not break anything.
class EmptyStringWalker : public Regexp::Walker<bool> {
 public:
  EmptyStringWalker() { }

  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                 bool* child_args, int nchild_args);

  bool ShortVisit(Regexp* re, bool parent_arg) {
    // Only WalkExponential would call this, and we use Walk.
    LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
    return parent_arg;
  }

 private:
  DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
};
// Runs once re's children have been visited.  child_args[i] is the
// i-th child's own PostVisit result, i.e. whether that child can match
// the empty string.  Returns the same property for re.
bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                                  bool* child_args, int nchild_args) {
  switch (re->op()) {
    // Operators that never match the empty string.
    case kRegexpNoMatch:
    case kRegexpLiteral:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpCharClass:
    case kRegexpLiteralString:
      return false;

    // Operators that are empty whenever they match, plus * and ?,
    // which can always choose zero repetitions.
    case kRegexpEmptyMatch:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpNoWordBoundary:
    case kRegexpWordBoundary:
    case kRegexpBeginText:
    case kRegexpEndText:
    case kRegexpStar:
    case kRegexpQuest:
    case kRegexpHaveMatch:
      return true;

    // Concatenation: every piece must be able to be empty.
    case kRegexpConcat: {
      bool all_empty = true;
      for (int k = 0; all_empty && k < nchild_args; k++)
        all_empty = child_args[k];
      return all_empty;
    }

    // Alternation: one empty-capable branch is enough.
    case kRegexpAlternate: {
      bool any_empty = false;
      for (int k = 0; !any_empty && k < nchild_args; k++)
        any_empty = child_args[k];
      return any_empty;
    }

    // + and capture groups inherit the answer from their only child.
    case kRegexpPlus:
    case kRegexpCapture:
      return child_args[0];

    // x{n,m}: empty if x can be, or if zero repetitions are allowed.
    case kRegexpRepeat:
      if (child_args[0])
        return true;
      return re->min() == 0;
  }
  return false;
}
// Reports whether re might match the empty string (may overestimate;
// see EmptyStringWalker).
static bool CanBeEmptyString(Regexp* re) {
  return EmptyStringWalker().Walk(re, true);
}
} // namespace re2

709
outside/re2/re2/nfa.cc Normal file
View File

@ -0,0 +1,709 @@
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc.
//
// Prog::SearchNFA, an NFA search.
// This is an actual NFA like the theorists talk about,
// not the pseudo-NFA found in backtracking regexp implementations.
//
// IMPLEMENTATION
//
// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
// which is a variant of the one described in Thompson's 1968 CACM paper.
// See http://swtch.com/~rsc/regexp/ for various history. The main feature
// over the DFA implementation is that it tracks submatch boundaries.
//
// When the choice of submatch boundaries is ambiguous, this particular
// implementation makes the same choices that traditional backtracking
// implementations (in particular, Perl and PCRE) do.
// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
// time in the length of the input.
//
// Like Thompson's original machine and like the DFA implementation, this
// implementation notices a match only once it is one byte past it.
#include "re2/prog.h"
#include "re2/regexp.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
namespace re2 {
// One NFA simulation over a compiled Prog. Holds the two thread
// queues, the explicit stack used by AddToThreadq, the best match found
// so far and a free list of Thread objects.
class NFA {
public:
NFA(Prog* prog);
~NFA();
// Searches for a matching string.
// * If anchored is true, only considers matches starting at offset.
// Otherwise finds leftmost match at or after offset.
// * If longest is true, returns the longest match starting
// at the chosen start point. Otherwise returns the so-called
// left-biased match, the one traditional backtracking engines
// (like Perl and PCRE) find.
// Records submatch boundaries in submatch[1..nsubmatch-1].
// Submatch[0] is the entire match. When there is a choice in
// which text matches each subexpression, the submatch boundaries
// are chosen to match what a backtracking implementation would choose.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
// Compile-time switch for the fprintf tracing in this file.
static const int Debug = 0;
private:
struct Thread {
union {
int id;
Thread* next; // when on free list
};
// Capture array; allocated in AllocThread with ncapture_ entries.
const char** capture;
};
// State for explicit stack in AddToThreadq.
struct AddState {
int id; // Inst to process
int j;
const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip
AddState()
: id(0), j(-1), cap_j(NULL) {}
explicit AddState(int id)
: id(id), j(-1), cap_j(NULL) {}
AddState(int id, const char* cap_j, int j)
: id(id), j(j), cap_j(cap_j) {}
};
// Threadq is a list of threads. The list is sorted by the order
// in which Perl would explore that particular state -- the earlier
// choices appear earlier in the list.
typedef SparseArray<Thread*> Threadq;
// Thread objects are recycled through free_threads_.
inline Thread* AllocThread();
inline void FreeThread(Thread*);
// Add id (or its children, following unlabeled arrows)
// to the workqueue q with associated capture info.
void AddToThreadq(Threadq* q, int id, int flag,
const char* p, const char** capture);
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
// p is the position of byte c in the input string (Search passes
// p-1 for the byte it just consumed); it is used when processing
// capturing parens and as the end of a match.
// flag is the bitwise or of Bol, Eol, etc., specifying whether
// ^, $ and \b match the current input point (after c).
// Returns 0, or the id of an instruction to short-circuit to.
inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
// Returns text version of capture information, for debugging.
string FormatCapture(const char** capture);
// Copies ncapture_ entries from src to dst.
inline void CopyCapture(const char** dst, const char** src);
// Computes whether all matches must begin with the same first
// byte, and if so, returns that byte. If not, returns -1.
int ComputeFirstByte();
Prog* prog_; // underlying program
int start_; // start instruction in program
int ncapture_; // number of submatches to track
bool longest_; // whether searching for longest match
bool endmatch_; // whether match must end at text.end()
const char* btext_; // beginning of text being matched (for FormatCapture)
const char* etext_; // end of text being matched (for endmatch_)
Threadq q0_, q1_; // pre-allocated for Search.
const char** match_; // best match so far
bool matched_; // any match so far?
AddState* astack_; // pre-allocated for AddToThreadq
int nastack_; // number of entries in astack_
int first_byte_; // required first byte for match, or -1 if none
Thread* free_threads_; // free list
DISALLOW_EVIL_CONSTRUCTORS(NFA);
};
// Binds the simulation to prog and pre-sizes all per-search storage:
// both thread queues get prog->size() slots and the AddToThreadq stack
// gets twice that many entries.
NFA::NFA(Prog* prog)
    : prog_(prog),
      start_(prog->start()),
      ncapture_(0),
      longest_(false),
      endmatch_(false),
      btext_(NULL),
      etext_(NULL),
      match_(NULL),
      matched_(false),
      free_threads_(NULL) {
  q0_.resize(prog_->size());
  q1_.resize(prog_->size());
  nastack_ = 2*prog_->size();
  astack_ = new AddState[nastack_];
  // Needs prog_ and start_, both set in the initializer list above.
  first_byte_ = ComputeFirstByte();
}
// Releases the match buffer, the AddToThreadq stack and every Thread
// (with its capture array) sitting on the free list.
NFA::~NFA() {
  delete[] match_;
  delete[] astack_;
  while (free_threads_ != NULL) {
    Thread* dead = free_threads_;
    free_threads_ = dead->next;
    delete[] dead->capture;
    delete dead;
  }
}
// Pushes t onto the free list for reuse; a NULL t is ignored.
// The capture array stays attached to the thread.
void NFA::FreeThread(Thread *t) {
  if (t != NULL) {
    t->next = free_threads_;
    free_threads_ = t;
  }
}
// Hands out a Thread: the head of the free list when one is available,
// otherwise a freshly allocated one with an ncapture_-entry capture
// array.  The caller is expected to fill in id and the captures.
NFA::Thread* NFA::AllocThread() {
  if (free_threads_ != NULL) {
    Thread* recycled = free_threads_;
    free_threads_ = recycled->next;
    return recycled;
  }
  Thread* fresh = new Thread;
  fresh->capture = new const char*[ncapture_];
  return fresh;
}
// Copies the capture pointers from src to dst, one (begin, end) pair
// at a time, for indices below ncapture_.
void NFA::CopyCapture(const char** dst, const char** src) {
  int k = 0;
  while (k < ncapture_) {
    dst[k] = src[k];
    dst[k+1] = src[k+1];
    k += 2;
  }
}
// Follows all empty arrows from id0 and enqueues all the states reached.
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
// The pointer p is the current input position, and capture is the
// current set of match boundaries. capture is modified while exploring
// kInstCapture instructions and restored afterwards via the dummy
// stack entries pushed below.
void NFA::AddToThreadq(Threadq* q, int id0, int flag,
const char* p, const char** capture) {
// Instruction 0 is treated as "nothing to add".
if (id0 == 0)
return;
// Astack_ is pre-allocated to avoid resize operations.
// It has room for 2*prog_->size() entries, which is enough:
// Each inst in prog can be processed at most once,
// pushing at most two entries on stk.
int nstk = 0;
AddState* stk = astack_;
stk[nstk++] = AddState(id0);
while (nstk > 0) {
DCHECK_LE(nstk, nastack_);
// a refers into stk; all of its fields are read below before any
// push can overwrite that slot.
const AddState& a = stk[--nstk];
// Entries with j >= 0 restore (or set) capture[j] when popped.
if (a.j >= 0)
capture[a.j] = a.cap_j;
int id = a.id;
// id 0 marks a restore-only dummy entry.
if (id == 0)
continue;
// Already queued: nothing more to do for this instruction.
if (q->has_index(id)) {
if (Debug)
fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str());
continue;
}
// Create entry in q no matter what. We might fill it in below,
// or we might not. Even if not, it is necessary to have it,
// so that we don't revisit id during the traversal.
q->set_new(id, NULL);
Thread** tp = &q->find(id)->second;
int j;
Thread* t;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
break;
case kInstFail:
break;
case kInstAltMatch:
// Save state; will pick up at next byte.
t = AllocThread();
t->id = id;
CopyCapture(t->capture, capture);
*tp = t;
// fall through
case kInstAlt:
// Explore alternatives.
// out1 is pushed first so that out is popped, and explored, first.
stk[nstk++] = AddState(ip->out1());
stk[nstk++] = AddState(ip->out());
break;
case kInstNop:
// Continue on.
stk[nstk++] = AddState(ip->out());
break;
case kInstCapture:
// Captures at or beyond ncapture_ are not tracked.
if ((j=ip->cap()) < ncapture_) {
// Push a dummy whose only job is to restore capture[j]
// once we finish exploring this possibility.
stk[nstk++] = AddState(0, capture[j], j);
// Record capture.
capture[j] = p;
}
stk[nstk++] = AddState(ip->out());
break;
case kInstMatch:
case kInstByteRange:
// Save state; will pick up at next byte.
t = AllocThread();
t->id = id;
CopyCapture(t->capture, capture);
*tp = t;
if (Debug)
fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
break;
case kInstEmptyWidth:
// Continue on if we have all the right flag bits.
if (ip->empty() & ~flag)
break;
stk[nstk++] = AddState(ip->out());
break;
}
}
}
// Run runq on byte c, appending new states to nextq.
// Updates match_ and matched_ as new, better matches are found.
// p is position of the byte c in the input string,
// used when processing capturing parens.
// flag is the bitwise or of Bol, Eol, etc., specifying whether
// ^, $ and \b match the current input point (after c).
// Frees all the threads on runq and leaves runq empty.
// If there is a shortcut to the end (a kInstAltMatch at the head of
// runq), returns the id of the instruction to continue from;
// otherwise returns 0.
int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
  nextq->clear();

  for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
    Thread* t = i->second;
    if (t == NULL)
      continue;

    if (longest_) {
      // Can skip any threads started after our current best match.
      if (matched_ && match_[0] < t->capture[0]) {
        FreeThread(t);
        continue;
      }
    }

    int id = t->id;
    Prog::Inst* ip = prog_->inst(id);

    switch (ip->opcode()) {
      default:
        // Should only see the values handled below.
        LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
        break;

      case kInstByteRange:
        // Consuming c moves the thread to the position after it.
        if (ip->Matches(c))
          AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
        break;

      case kInstAltMatch:
        // Only the highest-priority thread may take the shortcut.
        if (i != runq->begin())
          break;
        // The match is ours if we want it.
        if (ip->greedy(prog_) || longest_) {
          CopyCapture(match_, t->capture);
          FreeThread(t);
          for (++i; i != runq->end(); ++i)
            FreeThread(i->second);
          runq->clear();
          matched_ = true;
          if (ip->greedy(prog_))
            return ip->out1();
          return ip->out();
        }
        break;

      case kInstMatch: {
        if (endmatch_ && p != etext_)
          break;

        // Temporarily record the match end in the thread's own
        // captures so CopyCapture picks it up.
        const char* old = t->capture[1];  // previous end pointer
        t->capture[1] = p;
        if (longest_) {
          // Leftmost-longest mode: save this match only if
          // it is either farther to the left or at the same
          // point but longer than an existing match.
          if (!matched_ || t->capture[0] < match_[0] ||
              (t->capture[0] == match_[0] && t->capture[1] > match_[1]))
            CopyCapture(match_, t->capture);
        } else {
          // Leftmost-biased mode: this match is by definition
          // better than what we've already found (see next line).
          CopyCapture(match_, t->capture);

          // Cut off the threads that can only find matches
          // worse than the one we just found: don't run the
          // rest of the current Threadq.
          // Restore the end pointer we overwrote (the original code
          // wrote it into capture[0] by mistake).
          t->capture[1] = old;
          FreeThread(t);
          for (++i; i != runq->end(); ++i)
            FreeThread(i->second);
          runq->clear();
          matched_ = true;
          return 0;
        }
        // Restore the end pointer we overwrote (see above).
        t->capture[1] = old;
        matched_ = true;
        break;
      }
    }
    FreeThread(t);
  }
  runq->clear();
  return 0;
}
// Renders the capture pairs as text for debugging output.  Each pair
// becomes "(b,e)" with offsets relative to btext_; an unset begin
// prints as "(?,?)" and an unset end as "(b,?)".
string NFA::FormatCapture(const char** capture) {
  string out;
  for (int k = 0; k < ncapture_; k += 2) {
    const char* lo = capture[k];
    if (lo == NULL) {
      StringAppendF(&out, "(?,?)");
      continue;
    }
    const char* hi = capture[k+1];
    if (hi == NULL) {
      StringAppendF(&out, "(%d,?)", (int)(lo - btext_));
      continue;
    }
    StringAppendF(&out, "(%d,%d)",
                  (int)(lo - btext_),
                  (int)(hi - btext_));
  }
  return out;
}
// Reports whether needle's memory range lies entirely inside
// haystack's memory range (pointer comparison, not content search).
static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
  if (needle.begin() < haystack.begin())
    return false;
  return needle.end() <= haystack.end();
}
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch) {
if (start_ == 0)
return false;
StringPiece context = const_context;
if (context.begin() == NULL)
context = text;
if (!StringPieceContains(context, text)) {
LOG(FATAL) << "Bad args: context does not contain text "
<< reinterpret_cast<const void*>(context.begin())
<< "+" << context.size() << " "
<< reinterpret_cast<const void*>(text.begin())
<< "+" << text.size();
return false;
}
if (prog_->anchor_start() && context.begin() != text.begin())
return false;
if (prog_->anchor_end() && context.end() != text.end())
return false;
anchored |= prog_->anchor_start();
if (prog_->anchor_end()) {
longest = true;
endmatch_ = true;
etext_ = text.end();
}
if (nsubmatch < 0) {
LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
return false;
}
// Save search parameters.
ncapture_ = 2*nsubmatch;
longest_ = longest;
if (nsubmatch == 0) {
// We need to maintain match[0], both to distinguish the
// longest match (if longest is true) and also to tell
// whether we've seen any matches at all.
ncapture_ = 2;
}
match_ = new const char*[ncapture_];
matched_ = false;
memset(match_, 0, ncapture_*sizeof match_[0]);
// For debugging prints.
btext_ = context.begin();
if (Debug) {
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
text.as_string().c_str(), context.as_string().c_str(), anchored,
longest);
}
// Set up search.
Threadq* runq = &q0_;
Threadq* nextq = &q1_;
runq->clear();
nextq->clear();
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
const char* bp = context.begin();
int c = -1;
int wasword = 0;
if (text.begin() > context.begin()) {
c = text.begin()[-1] & 0xFF;
wasword = Prog::IsWordChar(c);
}
// Loop over the text, stepping the machine.
for (const char* p = text.begin();; p++) {
// Check for empty-width specials.
int flag = 0;
// ^ and \A
if (p == context.begin())
flag |= kEmptyBeginText | kEmptyBeginLine;
else if (p <= context.end() && p[-1] == '\n')
flag |= kEmptyBeginLine;
// $ and \z
if (p == context.end())
flag |= kEmptyEndText | kEmptyEndLine;
else if (p < context.end() && p[0] == '\n')
flag |= kEmptyEndLine;
// \b and \B
int isword = 0;
if (p < context.end())
isword = Prog::IsWordChar(p[0] & 0xFF);
if (isword != wasword)
flag |= kEmptyWordBoundary;
else
flag |= kEmptyNonWordBoundary;
if (Debug) {
fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->second;
if (t == NULL)
continue;
fprintf(stderr, " %d%s", t->id,
FormatCapture((const char**)t->capture).c_str());
}
fprintf(stderr, "\n");
}
// Process previous character (waited until now to avoid
// repeating the flag computation above).
// This is a no-op the first time around the loop, because
// runq is empty.
int id = Step(runq, nextq, c, flag, p-1);
DCHECK_EQ(runq->size(), 0);
swap(nextq, runq);
nextq->clear();
if (id != 0) {
// We're done: full match ahead.
p = text.end();
for (;;) {
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
break;
case kInstCapture:
match_[ip->cap()] = p;
id = ip->out();
continue;
case kInstNop:
id = ip->out();
continue;
case kInstMatch:
match_[1] = p;
matched_ = true;
break;
case kInstEmptyWidth:
if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
break;
}
id = ip->out();
continue;
}
break;
}
break;
}
if (p > text.end())
break;
// Start a new thread if there have not been any matches.
// (No point in starting a new thread if there have been
// matches, since it would be to the right of the match
// we already found.)
if (!matched_ && (!anchored || p == text.begin())) {
// If there's a required first byte for an unanchored search
// and we're not in the middle of any possible matches,
// use memchr to search for the byte quickly.
if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
p < text.end() && (p[0] & 0xFF) != first_byte_) {
p = reinterpret_cast<const char*>(memchr(p, first_byte_,
text.end() - p));
if (p == NULL) {
p = text.end();
isword = 0;
} else {
isword = Prog::IsWordChar(p[0] & 0xFF);
}
flag = Prog::EmptyFlags(context, p);
}
// Steal match storage (cleared but unused as of yet)
// temporarily to hold match boundaries for new thread.
match_[0] = p;
AddToThreadq(runq, start_, flag, p, match_);
match_[0] = NULL;
}
// If all the threads have died, stop early.
if (runq->size() == 0) {
if (Debug)
fprintf(stderr, "dead\n");
break;
}
if (p == text.end())
c = 0;
else
c = *p & 0xFF;
wasword = isword;
// Will run step(runq, nextq, c, ...) on next iteration. See above.
}
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
FreeThread(i->second);
if (matched_) {
for (int i = 0; i < nsubmatch; i++)
submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
if (Debug)
fprintf(stderr, "match (%d,%d)\n",
static_cast<int>(match_[0] - btext_),
static_cast<int>(match_[1] - btext_));
return true;
}
VLOG(1) << "No matches found";
return false;
}
// Computes whether all successful matches have a common first byte,
// and if so, returns that byte. If not, returns -1.
int NFA::ComputeFirstByte() {
if (start_ == 0)
return -1;
int b = -1; // first byte, not yet computed
typedef SparseSet Workq;
Workq q(prog_->size());
q.insert(start_);
for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
int id = *it;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
break;
case kInstMatch:
// The empty string matches: no first byte.
return -1;
case kInstByteRange:
// Must match only a single byte
if (ip->lo() != ip->hi())
return -1;
if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
return -1;
// If we haven't seen any bytes yet, record it;
// otherwise must match the one we saw before.
if (b == -1)
b = ip->lo();
else if (b != ip->lo())
return -1;
break;
case kInstNop:
case kInstCapture:
case kInstEmptyWidth:
// Continue on.
// Ignore ip->empty() flags for kInstEmptyWidth
// in order to be as conservative as possible
// (assume all possible empty-width flags are true).
if (ip->out())
q.insert(ip->out());
break;
case kInstAlt:
case kInstAltMatch:
// Explore alternatives.
if (ip->out())
q.insert(ip->out());
if (ip->out1())
q.insert(ip->out1());
break;
case kInstFail:
break;
}
}
return b;
}
bool
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch) {
if (NFA::Debug)
Dump();
NFA nfa(this);
StringPiece sp;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch == 0) {
match = &sp;
nmatch = 1;
}
}
if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
} // namespace re2

614
outside/re2/re2/onepass.cc Normal file
View File

@ -0,0 +1,614 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc.
//
// Prog::SearchOnePass is an efficient implementation of
// regular expression search with submatch tracking for
// what I call "one-pass regular expressions". (An alternate
// name might be "backtracking-free regular expressions".)
//
// One-pass regular expressions have the property that
// at each input byte during an anchored match, there may be
// multiple alternatives but only one can proceed for any
// given input byte.
//
// For example, the regexp /x*yx*/ is one-pass: you read
// x's until a y, then you read the y, then you keep reading x's.
// At no point do you have to guess what to do or back up
// and try a different guess.
//
// On the other hand, /x*x/ is not one-pass: when you're
// looking at an input "x", it's not clear whether you should
// use it to extend the x* or as the final x.
//
// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
//
// A simple intuition for identifying one-pass regular expressions
// is that it's always immediately obvious when a repetition ends.
// It must also be immediately obvious which branch of an | to take:
//
// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
//
// The NFA-based search in nfa.cc does some bookkeeping to
// avoid the need for backtracking and its associated exponential blowup.
// But if we have a one-pass regular expression, there is no
// possibility of backtracking, so there is no need for the
// extra bookkeeping. Hence, this code.
//
// On a one-pass regular expression, the NFA code in nfa.cc
// runs at about 1/20 of the backtracking-based PCRE speed.
// In contrast, the code in this file runs at about the same
// speed as PCRE.
//
// One-pass regular expressions get used a lot when RE is
// used for parsing simple strings, so it pays off to
// notice them and handle them efficiently.
//
// See also Anne Brüggemann-Klein and Derick Wood,
// "One-unambiguous regular languages", Information and Computation 142(2).
#include <string.h>
#include <map>
#include "util/util.h"
#include "util/arena.h"
#include "util/sparse_set.h"
#include "re2/prog.h"
#include "re2/stringpiece.h"
namespace re2 {
// Compile-time switch for this file's diagnostics: when nonzero, the
// `if (Debug)` blocks below log analysis failures and dump the
// generated one-pass automaton.
static const int Debug = 0;
// The key insight behind this implementation is that the
// non-determinism in an NFA for a one-pass regular expression
// is contained. To explain what that means, first a
// refresher about what regular expression programs look like
// and how the usual NFA execution runs.
//
// In a regular expression program, only the kInstByteRange
// instruction processes an input byte c and moves on to the
// next byte in the string (it does so if c is in the given range).
// The kInstByteRange instructions correspond to literal characters
// and character classes in the regular expression.
//
// The kInstAlt instructions are used as wiring to connect the
// kInstByteRange instructions together in interesting ways when
// implementing | + and *.
// The kInstAlt instruction forks execution, like a goto that
// jumps to ip->out() and ip->out1() in parallel. Each of the
// resulting computation paths is called a thread.
//
// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
// are interesting in their own right but like kInstAlt they don't
// advance the input pointer. Only kInstByteRange does.
//
// The automaton execution in nfa.cc runs all the possible
// threads of execution in lock-step over the input. To process
// a particular byte, each thread gets run until it either dies
// or finds a kInstByteRange instruction matching the byte.
// If the latter happens, the thread stops just past the
// kInstByteRange instruction (at ip->out()) and waits for
// the other threads to finish processing the input byte.
// Then, once all the threads have processed that input byte,
// the whole process repeats. The kInstAlt state instruction
// might create new threads during input processing, but no
// matter what, all the threads stop after a kInstByteRange
// and wait for the other threads to "catch up".
// Running in lock step like this ensures that the NFA reads
// the input string only once.
//
// Each thread maintains its own set of capture registers
// (the string positions at which it executed the kInstCapture
// instructions corresponding to capturing parentheses in the
// regular expression). Repeated copying of the capture registers
// is the main performance bottleneck in the NFA implementation.
//
// A regular expression program is "one-pass" if, no matter what
// the input string, there is only one thread that makes it
// past a kInstByteRange instruction at each input byte. This means
// that there is in some sense only one active thread throughout
// the execution. Other threads might be created during the
// processing of an input byte, but they are ephemeral: only one
// thread is left to start processing the next input byte.
// This is what I meant above when I said the non-determinism
// was "contained".
//
// To execute a one-pass regular expression program, we can build
// a DFA (no non-determinism) that has at most as many states as
// the NFA (compare this to the possibly exponential number of states
// in the general case). Each state records, for each possible
// input byte, the next state along with the conditions required
// before entering that state -- empty-width flags that must be true
// and capture operations that must be performed. It also records
// the set of conditions required to finish a match at that
// point in the input rather than process the next byte.
// A state in the one-pass NFA (aka DFA) - just an array of actions.
struct OneState;
// A state in the one-pass NFA - just an array of actions indexed
// by the bytemap_[] of the next input byte. (The bytemap
// maps next input bytes into equivalence classes, to reduce
// the memory footprint.)
struct OneState {
uint32 matchcond; // conditions to match right now.
// Declared with a single element, but used as a longer trailing array:
// Prog::IsOnePass sizes each state as
// sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32), i.e. one action
// per byte class, and states are addressed via IndexToNode().
uint32 action[1];
};
// The uint32 conditions in the action are a combination of
// condition and capture bits and the next state. The bottom 16 bits
// are the condition and capture bits, and the top 16 are the index of
// the next state.
//
// Bits 0-5 are the empty-width flags from prog.h.
// Bit 6 is kMatchWins, which means the match takes
// priority over moving to next in a first-match search.
// The remaining bits mark capture registers that should
// be set to the current input position. The capture bits
// start at index 2, since the search loop can take care of
// cap[0], cap[1] (the overall match position).
// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
// No input position can satisfy both kEmptyWordBoundary
// and kEmptyNonWordBoundary, so we can use that as a sentinel
// instead of needing an extra bit.
static const int kIndexShift = 16; // number of bits below index
static const int kEmptyShift = 6; // number of empty flags in prog.h
// Lowest bit usable for a capture register: one above kMatchWins (= 7).
static const int kRealCapShift = kEmptyShift + 1;
// Capture bits that fit between kRealCapShift and kIndexShift,
// rounded down to an even number so registers come in pairs (= 8).
static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
// Parameters used to skip over cap[0], cap[1].
// kCapShift (= 5) is chosen so that (1 << kCapShift << i) lands on the
// bit for register i when i >= 2; kMaxCap (= 10) counts cap[0], cap[1] too.
static const int kCapShift = kRealCapShift - 2;
static const int kMaxCap = kRealMaxCap + 2;
// Bit 6: the match takes priority over advancing to the next state.
static const uint32 kMatchWins = 1 << kEmptyShift;
// Mask covering all capture-register bits of an action or matchcond.
static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
// Sentinel: both word-boundary flags set at once can never be satisfied.
static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
// Compile-time consistency checks between the constants above and the
// declarations in prog.h.  This function exists only to host the
// assertions; it is never called.
void OnePass_Checks() {
  // kMaxCap counts individual pointers, kMaxOnePassCapture counts pairs.
  COMPILE_ASSERT(Prog::kMaxOnePassCapture*2 == kMaxCap,
                 kMaxCap_disagrees_with_kMaxOnePassCapture);
  // The empty-width flags must occupy exactly the low kEmptyShift bits.
  COMPILE_ASSERT(kEmptyAllFlags == (1<<kEmptyShift)-1,
                 kEmptyShift_disagrees_with_kEmptyAllFlags);
}
// Returns true when every empty-width flag requested in cond holds at
// position p within context.  Bits of cond outside kEmptyAllFlags are
// ignored.
static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
  const uint32 holds = Prog::EmptyFlags(context, p);
  const uint32 unmet = cond & kEmptyAllFlags & ~holds;
  return unmet == 0;
}
// Records position p in cap[i] for every capture register i in
// [2, ncap) whose bit is set in cond.  Registers 0 and 1 are never
// touched here.
static void ApplyCaptures(uint32 cond, const char* p,
                          const char** cap, int ncap) {
  int reg = 2;
  while (reg < ncap) {
    const bool wanted = (cond & (1 << kCapShift << reg)) != 0;
    if (wanted)
      cap[reg] = p;
    reg++;
  }
}
// Returns the address of state number nodeindex inside the byte array
// nodes, where each state occupies statesize bytes.  The volatile
// qualifier of the input is dropped from the result.
static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
                                    int nodeindex) {
  volatile uint8* where = nodes + statesize*nodeindex;
  uint8* plain = const_cast<uint8*>(where);
  return reinterpret_cast<OneState*>(plain);
}
// Runs the one-pass automaton (onepass_start_ / onepass_nodes_) over text.
//
// text          - the bytes to match.
// const_context - surrounding text used for empty-width checks; when its
//                 begin() is NULL, text itself is used as the context.
// anchor, kind  - the search must be anchored or a full match; anything
//                 else logs DFATAL and returns false.
// match, nmatch - on success match[0..nmatch) receive the submatches.
//
// Returns true if a match was found.
//
// NOTE(review): cap[] and matchcap[] hold kMaxCap entries, but ncap
// (2*nmatch) is not checked against kMaxCap in this function; presumably
// callers limit nmatch to Prog::kMaxOnePassCapture -- confirm.
bool Prog::SearchOnePass(const StringPiece& text,
const StringPiece& const_context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch) {
if (anchor != kAnchored && kind != kFullMatch) {
LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
return false;
}
// Make sure we have at least cap[1],
// because we use it to tell if we matched.
int ncap = 2*nmatch;
if (ncap < 2)
ncap = 2;
// cap[] tracks the capture registers of the running thread;
// matchcap[] is a snapshot taken each time a match is recorded.
const char* cap[kMaxCap];
for (int i = 0; i < ncap; i++)
cap[i] = NULL;
const char* matchcap[kMaxCap];
for (int i = 0; i < ncap; i++)
matchcap[i] = NULL;
StringPiece context = const_context;
if (context.begin() == NULL)
context = text;
// A program anchored at start/end cannot match unless text touches
// the corresponding edge of the context.
if (anchor_start() && context.begin() != text.begin())
return false;
if (anchor_end() && context.end() != text.end())
return false;
if (anchor_end())
kind = kFullMatch;
// State and act are marked volatile to
// keep the compiler from re-ordering the
// memory accesses walking over the NFA.
// This is worth about 5%.
volatile OneState* state = onepass_start_;
volatile uint8* nodes = onepass_nodes_;
volatile uint32 statesize = onepass_statesize_;
uint8* bytemap = bytemap_;
const char* bp = text.begin();
const char* ep = text.end();
const char* p;
bool matched = false;
// The overall match always starts at the beginning of text.
matchcap[0] = bp;
cap[0] = bp;
uint32 nextmatchcond = state->matchcond;
for (p = bp; p < ep; p++) {
// c is the byte class of the current input byte.
int c = bytemap[*p & 0xFF];
// matchcond belongs to the state we are leaving;
// cond is the action for this byte out of that state.
uint32 matchcond = nextmatchcond;
uint32 cond = state->action[c];
// Determine whether we can reach act->next.
// If so, advance state and nextmatchcond.
if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
uint32 nextindex = cond >> kIndexShift;
state = IndexToNode(nodes, statesize, nextindex);
nextmatchcond = state->matchcond;
} else {
// No transition on this byte: the search ends after the
// match check below.
state = NULL;
nextmatchcond = kImpossible;
}
// This code section is carefully tuned.
// The goto sequence is about 10% faster than the
// obvious rewrite as a large if statement in the
// ASCIIMatchRE2 and DotMatchRE2 benchmarks.
// Saving the match capture registers is expensive.
// Is this intermediate match worth thinking about?
// Not if we want a full match.
if (kind == kFullMatch)
goto skipmatch;
// Not if it's impossible.
if (matchcond == kImpossible)
goto skipmatch;
// Not if the possible match is beaten by the certain
// match at the next byte. When this test is useless
// (e.g., HTTPPartialMatchRE2) it slows the loop by
// about 10%, but when it avoids work (e.g., DotMatchRE2),
// it cuts the loop execution by about 45%.
if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
goto skipmatch;
// Finally, the match conditions must be satisfied.
if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
// Snapshot the registers, then apply the captures attached to
// the match itself; the match ends at p.
for (int i = 2; i < 2*nmatch; i++)
matchcap[i] = cap[i];
if (nmatch > 1 && (matchcond & kCapMask))
ApplyCaptures(matchcond, p, matchcap, ncap);
matchcap[1] = p;
matched = true;
// If we're in longest match mode, we have to keep
// going and see if we find a longer match.
// In first match mode, we can stop if the match
// takes priority over the next state for this input byte.
// That bit is per-input byte and thus in cond, not matchcond.
if (kind == kFirstMatch && (cond & kMatchWins))
goto done;
}
skipmatch:
if (state == NULL)
goto done;
// Captures attached to the transition just taken are recorded at p.
if ((cond & kCapMask) && nmatch > 1)
ApplyCaptures(cond, p, cap, ncap);
}
// Look for match at end of input.
// (Reached only when the loop consumed all of text, so state is non-NULL
// and p == ep.)
{
uint32 matchcond = state->matchcond;
if (matchcond != kImpossible &&
((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
if (nmatch > 1 && (matchcond & kCapMask))
ApplyCaptures(matchcond, p, cap, ncap);
for (int i = 2; i < ncap; i++)
matchcap[i] = cap[i];
matchcap[1] = p;
matched = true;
}
}
done:
if (!matched)
return false;
// Convert register pairs (begin, end) into StringPieces.
for (int i = 0; i < nmatch; i++)
match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]);
return true;
}
// Analysis to determine whether a given regexp program is one-pass.

// Set of instruction ids used as a work queue by the analysis.
typedef SparseSet Instq;

// Puts instruction id on queue q unless it is already there.
// Returns false only when id was already queued.  Id 0 is never
// inserted, but is reported as added (returns true).
static bool AddQ(Instq *q, int id) {
  if (id != 0) {
    if (q->contains(id))
      return false;
    q->insert(id);
  }
  return true;
}
// Entry on the manual stack used by Prog::IsOnePass while flooding the
// instruction graph.
struct InstCond {
int id; // instruction id still to be visited
uint32 cond; // condition/capture bits accumulated on the path to id
};
// Returns whether this is a one-pass program; that is,
// returns whether it is safe to use SearchOnePass on this program.
// These conditions must be true for any instruction ip:
//
// (1) for any other Inst nip, there is at most one input-free
// path from ip to nip.
// (2) there is at most one kInstByteRange instruction reachable from
// ip that matches any particular byte c.
// (3) there is at most one input-free path from ip to a kInstMatch
// instruction.
//
// This is actually just a conservative approximation: it might
// return false when the answer is true, when kInstEmptyWidth
// instructions are involved.
// Constructs and saves corresponding one-pass NFA on success.
// The analysis runs at most once per Prog; later calls return the
// result recorded by the first call.
bool Prog::IsOnePass() {
if (did_onepass_)
return onepass_start_ != NULL;
did_onepass_ = true;
if (start() == 0) // no match
return false;
// Steal memory for the one-pass NFA from the overall DFA budget.
// Willing to use at most 1/4 of the DFA budget (heuristic).
// Limit max node count to 65000 as a conservative estimate to
// avoid overflowing 16-bit node index in encoding.
int maxnodes = 2 + byte_inst_count_;
// Bytes per state: OneState already holds one action, so add room
// for the remaining bytemap_range_-1 actions.
int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
return false;
// Flood the graph starting at the start state, and check
// that in each reachable state, each possible byte leads
// to a unique next state.
int size = this->size();
InstCond *stack = new InstCond[size];
int* nodebyid = new int[size]; // indexed by ip
// All-0xFF bytes make every entry compare equal to -1 ("no node yet").
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
uint8* nodes = new uint8[maxnodes*statesize];
uint8* nodep = nodes;
// tovisit: instructions that need a node built for them.
// workq: instructions seen while flooding from the current node.
Instq tovisit(size), workq(size);
AddQ(&tovisit, start());
// Node 0 is the start state.
nodebyid[start()] = 0;
nodep += statesize;
int nalloc = 1;
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
OneState* node = IndexToNode(nodes, statesize, nodeindex);
// Flood graph using manual stack, filling in actions as found.
// Default is none.
for (int b = 0; b < bytemap_range_; b++)
node->action[b] = kImpossible;
node->matchcond = kImpossible;
workq.clear();
// Set once a kInstMatch has been reached from this node.
bool matched = false;
int nstack = 0;
stack[nstack].id = id;
stack[nstack++].cond = 0;
while (nstack > 0) {
int id = stack[--nstack].id;
Prog::Inst* ip = inst(id);
uint32 cond = stack[nstack].cond;
switch (ip->opcode()) {
case kInstAltMatch:
// TODO(rsc): Ignoring kInstAltMatch optimization.
// Should implement it in this engine, but it's subtle.
// Fall through.
case kInstAlt:
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
goto fail;
// Push out1 first so that out is popped (explored) first.
stack[nstack].id = ip->out1();
stack[nstack++].cond = cond;
stack[nstack].id = ip->out();
stack[nstack++].cond = cond;
break;
case kInstByteRange: {
// Find, or allocate, the node for the instruction after this byte.
int nextindex = nodebyid[ip->out()];
if (nextindex == -1) {
if (nalloc >= maxnodes) {
if (Debug)
LOG(ERROR)
<< StringPrintf("Not OnePass: hit node limit %d > %d",
nalloc, maxnodes);
goto fail;
}
nextindex = nalloc;
nodep += statesize;
nodebyid[ip->out()] = nextindex;
nalloc++;
AddQ(&tovisit, ip->out());
}
// A match found earlier in this flood takes priority over
// this byte transition.
if (matched)
cond |= kMatchWins;
for (int c = ip->lo(); c <= ip->hi(); c++) {
int b = bytemap_[c];
c = unbytemap_[b]; // last c in byte class
uint32 act = node->action[b];
uint32 newact = (nextindex << kIndexShift) | cond;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
// A different action already claims this byte class: (2) is violated.
if (Debug) {
LOG(ERROR)
<< StringPrintf("Not OnePass: conflict on byte "
"%#x at state %d",
c, *it);
}
goto fail;
}
}
if (ip->foldcase()) {
// Repeat for the uppercase counterparts of any lowercase
// letters in the range.
Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a';
Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
for (int c = lo; c <= hi; c++) {
int b = bytemap_[c];
c = unbytemap_[b]; // last c in class
uint32 act = node->action[b];
uint32 newact = (nextindex << kIndexShift) | cond;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
if (Debug) {
LOG(ERROR)
<< StringPrintf("Not OnePass: conflict on byte "
"%#x at state %d",
c, *it);
}
goto fail;
}
}
}
break;
}
case kInstCapture:
// Capture registers at or beyond kMaxCap have no bit and are not recorded.
if (ip->cap() < kMaxCap)
cond |= (1 << kCapShift) << ip->cap();
goto QueueEmpty;
case kInstEmptyWidth:
cond |= ip->empty();
goto QueueEmpty;
case kInstNop:
QueueEmpty:
// kInstCapture and kInstNop always proceed to ip->out().
// kInstEmptyWidth only sometimes proceeds to ip->out(),
// but as a conservative approximation we assume it always does.
// We could be a little more precise by looking at what c
// is, but that seems like overkill.
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out())) {
if (Debug) {
LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
" %d -> %d\n",
*it, ip->out());
}
goto fail;
}
stack[nstack].id = ip->out();
stack[nstack++].cond = cond;
break;
case kInstMatch:
if (matched) {
// (3) is violated
if (Debug) {
LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
" from %d\n", *it);
}
goto fail;
}
matched = true;
node->matchcond = cond;
break;
case kInstFail:
break;
}
}
}
if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR).
string dump = "prog dump:\n" + Dump() + "node dump\n";
// Reverse mapping: node index -> instruction id.
map<int, int> idmap;
for (int i = 0; i < size; i++)
if (nodebyid[i] != -1)
idmap[nodebyid[i]] = i;
StringAppendF(&dump, "byte ranges:\n");
int i = 0;
for (int b = 0; b < bytemap_range_; b++) {
int lo = i;
while (bytemap_[i] == b)
i++;
StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
}
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
if (nodeindex == -1)
continue;
OneState* node = IndexToNode(nodes, statesize, nodeindex);
string s;
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
nodeindex, id, node->matchcond);
for (int i = 0; i < bytemap_range_; i++) {
if ((node->action[i] & kImpossible) == kImpossible)
continue;
StringAppendF(&dump, " %d cond %#x -> %d id=%d\n",
i, node->action[i] & 0xFFFF,
node->action[i] >> kIndexShift,
idmap[node->action[i] >> kIndexShift]);
}
}
LOG(ERROR) << dump;
}
// Overallocated earlier; cut down to actual size.
nodep = new uint8[nalloc*statesize];
memmove(nodep, nodes, nalloc*statesize);
delete[] nodes;
nodes = nodep;
// Publish the automaton and charge its memory to the DFA budget.
onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
onepass_nodes_ = nodes;
onepass_statesize_ = statesize;
dfa_mem_ -= nalloc*statesize;
delete[] stack;
delete[] nodebyid;
return true;
fail:
// Not one-pass (or over budget): release everything allocated above.
// onepass_start_ is not assigned on this path.
delete[] stack;
delete[] nodebyid;
delete[] nodes;
return false;
}
} // namespace re2

2216
outside/re2/re2/parse.cc Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,119 @@
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
// Range tables for the Perl character classes.  Each URange16 entry is
// an inclusive { lo, hi } pair of code points.
static const URange16 code1[] = { /* \d */
{ 0x30, 0x39 },
};
static const URange16 code2[] = { /* \s */
{ 0x9, 0xa },
{ 0xc, 0xd },
{ 0x20, 0x20 },
};
static const URange16 code3[] = { /* \w */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
};
// Perl class lookup table.  Each entry lists: the class name, a sign
// (+1 / -1; presumably -1 marks the negated form, e.g. \D -- confirm
// against unicode_groups.h), the range table, and the number of ranges
// in that table.
const UGroup perl_groups[] = {
{ "\\d", +1, code1, 1 },
{ "\\D", -1, code1, 1 },
{ "\\s", +1, code2, 3 },
{ "\\S", -1, code2, 3 },
{ "\\w", +1, code3, 4 },
{ "\\W", -1, code3, 4 },
};
// Number of entries in perl_groups.
const int num_perl_groups = 6;
// Range tables for the POSIX bracket classes.  Each URange16 entry is
// an inclusive { lo, hi } pair of code points.
static const URange16 code4[] = { /* [:alnum:] */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x61, 0x7a },
};
static const URange16 code5[] = { /* [:alpha:] */
{ 0x41, 0x5a },
{ 0x61, 0x7a },
};
static const URange16 code6[] = { /* [:ascii:] */
{ 0x0, 0x7f },
};
static const URange16 code7[] = { /* [:blank:] */
{ 0x9, 0x9 },
{ 0x20, 0x20 },
};
static const URange16 code8[] = { /* [:cntrl:] */
{ 0x0, 0x1f },
{ 0x7f, 0x7f },
};
static const URange16 code9[] = { /* [:digit:] */
{ 0x30, 0x39 },
};
static const URange16 code10[] = { /* [:graph:] */
{ 0x21, 0x7e },
};
static const URange16 code11[] = { /* [:lower:] */
{ 0x61, 0x7a },
};
static const URange16 code12[] = { /* [:print:] */
{ 0x20, 0x7e },
};
static const URange16 code13[] = { /* [:punct:] */
{ 0x21, 0x2f },
{ 0x3a, 0x40 },
{ 0x5b, 0x60 },
{ 0x7b, 0x7e },
};
static const URange16 code14[] = { /* [:space:] */
{ 0x9, 0xd },
{ 0x20, 0x20 },
};
static const URange16 code15[] = { /* [:upper:] */
{ 0x41, 0x5a },
};
static const URange16 code16[] = { /* [:word:] */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
};
static const URange16 code17[] = { /* [:xdigit:] */
{ 0x30, 0x39 },
{ 0x41, 0x46 },
{ 0x61, 0x66 },
};
// POSIX class lookup table.  Each entry lists: the class name, a sign
// (+1 / -1; presumably -1 marks the negated "[:^name:]" form -- confirm
// against unicode_groups.h), the range table, and the number of ranges
// in that table.
const UGroup posix_groups[] = {
{ "[:alnum:]", +1, code4, 3 },
{ "[:^alnum:]", -1, code4, 3 },
{ "[:alpha:]", +1, code5, 2 },
{ "[:^alpha:]", -1, code5, 2 },
{ "[:ascii:]", +1, code6, 1 },
{ "[:^ascii:]", -1, code6, 1 },
{ "[:blank:]", +1, code7, 2 },
{ "[:^blank:]", -1, code7, 2 },
{ "[:cntrl:]", +1, code8, 2 },
{ "[:^cntrl:]", -1, code8, 2 },
{ "[:digit:]", +1, code9, 1 },
{ "[:^digit:]", -1, code9, 1 },
{ "[:graph:]", +1, code10, 1 },
{ "[:^graph:]", -1, code10, 1 },
{ "[:lower:]", +1, code11, 1 },
{ "[:^lower:]", -1, code11, 1 },
{ "[:print:]", +1, code12, 1 },
{ "[:^print:]", -1, code12, 1 },
{ "[:punct:]", +1, code13, 4 },
{ "[:^punct:]", -1, code13, 4 },
{ "[:space:]", +1, code14, 2 },
{ "[:^space:]", -1, code14, 2 },
{ "[:upper:]", +1, code15, 1 },
{ "[:^upper:]", -1, code15, 1 },
{ "[:word:]", +1, code16, 4 },
{ "[:^word:]", -1, code16, 4 },
{ "[:xdigit:]", +1, code17, 3 },
{ "[:^xdigit:]", -1, code17, 3 },
};
// Number of entries in posix_groups.
const int num_posix_groups = 28;
} // namespace re2

View File

@ -0,0 +1,715 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
#include "re2/prefilter.h"
#include "re2/re2.h"
#include "re2/unicode_casefold.h"
#include "re2/walker-inl.h"
namespace re2 {
// Compile-time switch for the extra VLOG(0) tracing guarded by
// `if (Trace)` in this file.
static const int Trace = false;
// Shorthand iterator types for the string sets handled below.
typedef set<string>::iterator SSIter;
typedef set<string>::const_iterator ConstSSIter;
// Next id handed to a newly constructed Prefilter (see the constructor).
static int alloc_id = 100000; // Used for debugging.
// Builds a Prefilter node of kind op.  Only AND and OR nodes receive a
// child vector; for every other kind subs_ stays NULL.  Each node is
// stamped with the next debugging allocation id.
Prefilter::Prefilter(Op op) {
  op_ = op;
  if (op_ == AND || op_ == OR) {
    subs_ = new vector<Prefilter*>;
  } else {
    subs_ = NULL;
  }
  alloc_id_ = alloc_id++;
  VLOG(10) << "alloc_id: " << alloc_id_;
}
// Tears down a Prefilter: deletes every child still held in subs_,
// in order, and then the child vector itself.
Prefilter::~Prefilter() {
  VLOG(10) << "Deleted: " << alloc_id_;
  if (subs_ == NULL)
    return;
  for (vector<Prefilter*>::iterator child = subs_->begin();
       child != subs_->end(); ++child) {
    delete *child;
  }
  delete subs_;
  subs_ = NULL;
}
// Collapses degenerate AND/OR nodes.
//   - an AND with no children becomes ALL; an OR with none becomes NONE;
//   - a node with exactly one child is deleted and replaced by that
//     child (itself simplified).
// Any other node is returned unchanged.  Because the node may delete
// itself, callers must use the returned pointer from then on.
Prefilter* Prefilter::Simplify() {
  const bool composite = (op_ == AND || op_ == OR);
  if (!composite)
    return this;

  switch (subs_->size()) {
    case 0:
      // AND of nothing is true; OR of nothing is false.
      op_ = (op_ == AND) ? ALL : NONE;
      return this;

    case 1: {
      // Detach the only child before destroying the wrapper so the
      // destructor does not delete it too.
      Prefilter* only = (*subs_)[0];
      subs_->clear();
      delete this;
      return only->Simplify();
    }

    default:
      return this;
  }
}
// Joins a and b under op (AND or OR) and returns the resulting node.
// Both arguments are consumed: each one ends up inside the result or is
// deleted.  The combination is flattened where possible rather than
// always allocating a new wrapper node.
Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
  // Start from simplified operands.
  a = a->Simplify();
  b = b->Simplify();

  // Canonical order: a carries the smaller opcode.
  if (a->op() > b->op()) {
    Prefilter* swapped = a;
    a = b;
    b = swapped;
  }

  // Trivial cases.  ALL and NONE are the smallest opcodes, so after the
  // ordering above only a needs to be inspected:
  //   ALL  AND b = b        NONE OR  b = b
  //   ALL  OR  b = ALL      NONE AND b = NONE
  const bool a_is_all = (a->op() == ALL);
  const bool a_is_none = (a->op() == NONE);
  if (a_is_all || a_is_none) {
    const bool a_is_identity = a_is_all ? (op == AND) : (op == OR);
    Prefilter* keep = a_is_identity ? b : a;
    Prefilter* drop = a_is_identity ? a : b;
    delete drop;
    return keep;
  }

  // Both sides already are an `op` node: move b's children into a.
  if (a->op() == op && b->op() == op) {
    for (int k = 0; k < b->subs()->size(); k++)
      a->subs()->push_back((*b->subs())[k]);
    // Empty b first so that its destructor leaves the moved children alone.
    b->subs()->clear();
    delete b;
    return a;
  }

  // Exactly one side is an `op` node: make it a, then append the other.
  if (b->op() == op) {
    Prefilter* swapped = a;
    a = b;
    b = swapped;
  }
  if (a->op() == op) {
    a->subs()->push_back(b);
    return a;
  }

  // Neither side can absorb the other: wrap both in a fresh node.
  Prefilter* parent = new Prefilter(op);
  parent->subs()->push_back(a);
  parent->subs()->push_back(b);
  return parent;
}
// Returns the conjunction of a and b; both arguments are handed to AndOr.
Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
  Prefilter* both = AndOr(AND, a, b);
  return both;
}
// Returns the disjunction of a and b; both arguments are handed to AndOr.
Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
  Prefilter* either = AndOr(OR, a, b);
  return either;
}
// Removes redundant strings from *ss in place.  A string is dropped
// when an earlier element of the set occurs inside it: if "ab" is
// already a required string, also requiring "abc" adds nothing, because
// anything that passes a search for "abc" passes a search for "ab".
static void SimplifyStringSet(set<string> *ss) {
  for (SSIter needle = ss->begin(); needle != ss->end(); ++needle) {
    SSIter hay = needle;
    ++hay;
    while (hay != ss->end()) {
      if (hay->find(*needle) == string::npos) {
        ++hay;
      } else {
        // Post-increment moves past the element before it is erased.
        ss->erase(hay++);
      }
    }
  }
}
// Builds a Prefilter that is the OR of one atom per string in *ss.
// The set is simplified in place first.  Returns NULL when the set is
// empty.
Prefilter* Prefilter::OrStrings(set<string>* ss) {
  SimplifyStringSet(ss);
  if (ss->empty())
    return NULL;

  // NONE is the starting operand: OR-ing onto it yields the other side.
  Prefilter* any_of = new Prefilter(NONE);
  for (SSIter s = ss->begin(); s != ss->end(); ++s)
    any_of = Or(any_of, FromString(*s));
  return any_of;
}
// Returns the lowercase form of r.  Runes below Runeself are handled
// with the 'A'..'Z' shift; all others go through the unicode_tolower
// case-folding table and are returned unchanged when no fold applies.
static Rune ToLowerRune(Rune r) {
  if (r >= Runeself) {
    const CaseFold *fold =
        LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
    if (fold != NULL && r >= fold->lo)
      return ApplyFold(fold, r);
    return r;
  }
  if ('A' <= r && r <= 'Z')
    r += 'a' - 'A';
  return r;
}
// Lowercases r for Latin-1 text: only 'A'..'Z' are shifted, every
// other value is returned as is.
static Rune ToLowerRuneLatin1(Rune r) {
  const bool is_upper = ('A' <= r && r <= 'Z');
  if (!is_upper)
    return r;
  r += 'a' - 'A';
  return r;
}
// Creates an ATOM node whose atom is str.  The caller owns the result.
Prefilter* Prefilter::FromString(const string& str) {
  Prefilter* atom = new Prefilter(Prefilter::ATOM);
  atom->atom_ = str;
  return atom;
}
// Information about a regexp used during computation of Prefilter.
// Can be thought of as information about the set of strings matching
// the given regular expression.
//
// An Info is in one of two modes: exact (is_exact_ is true and exact_
// lists the matching strings) or inexact (match_ holds a Prefilter
// that every match must satisfy).
class Prefilter::Info {
public:
Info();
~Info();
// More constructors. They delete their Info* arguments.
// (Concat and And may instead return one of their arguments unchanged
// when the other is NULL.)
static Info* Alt(Info* a, Info* b);
static Info* Concat(Info* a, Info* b);
static Info* And(Info* a, Info* b);
static Info* Star(Info* a);
static Info* Plus(Info* a);
static Info* Quest(Info* a);
static Info* EmptyString();
static Info* NoMatch();
static Info* AnyChar();
static Info* CClass(CharClass* cc, bool latin1);
static Info* Literal(Rune r);
static Info* LiteralLatin1(Rune r);
static Info* AnyMatch();
// Format Info as a string.
string ToString();
// Caller takes ownership of the Prefilter.
// An exact Info is first converted to an OR of its strings; afterwards
// this Info no longer holds a match.
Prefilter* TakeMatch();
// Accessors for the exact string set and the exact-mode flag.
set<string>& exact() { return exact_; }
bool is_exact() const { return is_exact_; }
class Walker;
private:
set<string> exact_;
// When is_exact_ is true, the strings that match
// are placed in exact_. When it is no longer an exact
// set of strings that match this RE, then is_exact_
// is false and the match_ contains the required match
// criteria.
bool is_exact_;
// Accumulated Prefilter query that any
// match for this regexp is guaranteed to match.
// Owned by this Info (deleted in the destructor); may be NULL.
Prefilter* match_;
};
// Creates an inexact Info with an empty string set and no match yet.
Prefilter::Info::Info() {
  is_exact_ = false;
  match_ = NULL;
}
// Releases the Prefilter still owned by this Info, if any.
Prefilter::Info::~Info() {
  // match_ is NULL after TakeMatch(); deleting NULL does nothing.
  delete match_;
}
// Hands the match Prefilter over to the caller, who then owns it.
// An exact Info is first turned into an OR over its strings and marked
// inexact.  Afterwards this Info holds no match (match_ is NULL).
Prefilter* Prefilter::Info::TakeMatch() {
  if (is_exact_) {
    // Leave exact mode: the string set becomes the match query.
    match_ = Prefilter::OrStrings(&exact_);
    is_exact_ = false;
  }
  Prefilter* taken = match_;
  match_ = NULL;
  return taken;
}
// Renders this Info for debugging: the comma-separated exact strings
// when exact, otherwise the debug string of match_, otherwise "".
string Prefilter::Info::ToString() {
  // Some callers iterate over child Infos without checking for NULL, so
  // a NULL receiver is tolerated here and formats as the empty string.
  // NOTE(review): testing `this` against NULL relies on undefined
  // behavior; callers should do the check instead.
  if (this == NULL) {
    return "";
  }

  if (!is_exact_) {
    if (match_)
      return match_->DebugString();
    return "";
  }

  string joined;
  for (set<string>::iterator s = exact_.begin(); s != exact_.end(); ++s) {
    if (s != exact_.begin())
      joined += ",";
    joined += *s;
  }
  return joined;
}
// Inserts every string of src into *dst; strings already present in
// *dst are left as they are.
static void CopyIn(const set<string>& src, set<string>* dst) {
  dst->insert(src.begin(), src.end());
}
// Add the cross-product of a and b to dst.
// (For each string i in a and j in b, add i+j.)
static void CrossProduct(const set<string>& a,
const set<string>& b,
set<string>* dst) {
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
dst->insert(*i + *j);
}
// Builds the exact Info for the concatenation ab: its string set is the
// cross product of a's and b's exact sets.  Both inputs must be exact
// and are deleted.  When a is NULL, b is returned untouched.
Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
  if (a == NULL)
    return b;
  DCHECK(a->is_exact_);
  DCHECK(b && b->is_exact_);

  Info *product = new Info();
  product->is_exact_ = true;
  CrossProduct(a->exact_, b->exact_, &product->exact_);

  delete a;
  delete b;
  return product;
}
// Builds an inexact Info for ab whose match is the AND of the matches
// of a and b.  Intended for the cases where a or b is not exact, or
// where the exact cross product would likely be too large.
// If either argument is NULL the other is returned as is; otherwise
// both are deleted.
Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
  if (a == NULL)
    return b;
  if (b == NULL)
    return a;

  Info *both = new Info();
  both->is_exact_ = false;
  both->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());

  delete a;
  delete b;
  return both;
}
// Builds the Info for the alternation a|b and deletes both inputs.
// Two exact inputs give an exact result holding the union of their
// strings; otherwise the result is inexact and matches the OR of the
// two inputs' matches.
Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
  Info *either = new Info();
  const bool both_exact = a->is_exact_ && b->is_exact_;
  if (!both_exact) {
    // At least one side is inexact.  TakeMatch() converts an exact side
    // into a match first, so the two can be OR-ed together.
    either->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
    either->is_exact_ = false;
  } else {
    CopyIn(a->exact_, &either->exact_);
    CopyIn(b->exact_, &either->exact_);
    either->is_exact_ = true;
  }
  delete a;
  delete b;
  return either;
}
// Builds the Info for a?.  The result is inexact with an ALL match,
// independent of the contents of a, which is deleted.
Prefilter::Info* Prefilter::Info::Quest(Info *a) {
  Info *optional = new Info();
  optional->match_ = new Prefilter(ALL);
  optional->is_exact_ = false;
  delete a;
  return optional;
}
// Builds the Info for a*.  Treated exactly like a?: the work is
// delegated to Quest, which also deletes a.
Prefilter::Info* Prefilter::Info::Star(Info *a) {
  Info *repeated = Quest(a);
  return repeated;
}
// Builds the Info for a+.  The result takes over a's match and is
// always inexact, even if a was an exact set.  a is deleted.
Prefilter::Info* Prefilter::Info::Plus(Info *a) {
  Info *repeated = new Info();
  repeated->match_ = a->TakeMatch();
  repeated->is_exact_ = false;
  delete a;
  return repeated;
}
// Encodes rune r with runetochar and returns the resulting bytes.
static string RuneToString(Rune r) {
  char encoded[UTFmax];
  const int len = runetochar(encoded, &r);
  string out(encoded, len);
  return out;
}
// Returns a one-byte string holding the low 8 bits of rune r.
static string RuneToStringLatin1(Rune r) {
  return string(1, static_cast<char>(r & 0xff));
}
// Builds the exact Info for a single literal rune. The rune is
// lowercased with ToLowerRune before being stored.
Prefilter::Info* Prefilter::Info::Literal(Rune r) {
  const string lowered = RuneToString(ToLowerRune(r));

  Info* lit = new Info();
  lit->is_exact_ = true;
  lit->exact_.insert(lowered);
  return lit;
}
// Builds the exact Info for a single literal rune of a Latin-1
// encoded pattern. The rune is lowercased with ToLowerRuneLatin1 and
// stored as a single byte.
Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
  const string lowered = RuneToStringLatin1(ToLowerRuneLatin1(r));

  Info* lit = new Info();
  lit->is_exact_ = true;
  lit->exact_.insert(lowered);
  return lit;
}
// Builds the Info for "." (any character): a match of ALL, i.e. no
// requirement on the text.
Prefilter::Info* Prefilter::Info::AnyChar() {
  Info* any = new Info();
  any->match_ = new Prefilter(ALL);
  return any;
}
// Builds the Info for a regexp that cannot match anything: a match
// of NONE.
Prefilter::Info* Prefilter::Info::NoMatch() {
  Info* none = new Info();
  none->match_ = new Prefilter(NONE);
  return none;
}
// Builds the Info that accepts every possible match.
// Because it asserts nothing about the text being searched, this
// Info is a valid (if useless) answer for any regular expression.
Prefilter::Info* Prefilter::Info::AnyMatch() {
  Info* everything = new Info();
  everything->match_ = new Prefilter(ALL);
  return everything;
}
// Builds the exact Info whose only member is the empty string.
Prefilter::Info* Prefilter::Info::EmptyString() {
  Info* empty = new Info();
  empty->exact_.insert("");
  empty->is_exact_ = true;
  return empty;
}
// Builds the Info for a character class.
// Classes with more than 10 members are overestimated as AnyChar();
// smaller ones become an exact set with one lowercased string per
// rune, encoded as Latin-1 or via RuneToString depending on latin1.
typedef CharClass::iterator CCIter;
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
                                         bool latin1) {
  if (Trace) {
    VLOG(0) << "CharClassInfo:";
    for (CCIter range = cc->begin(); range != cc->end(); ++range)
      VLOG(0) << " " << range->lo << "-" << range->hi;
  }

  // Overestimating is always safe, so give up on big classes.
  if (cc->size() > 10)
    return AnyChar();

  Prefilter::Info* members = new Prefilter::Info();
  for (CCIter range = cc->begin(); range != cc->end(); ++range) {
    for (Rune r = range->lo; r <= range->hi; r++) {
      const string lowered = latin1
          ? RuneToStringLatin1(ToLowerRuneLatin1(r))
          : RuneToString(ToLowerRune(r));
      members->exact_.insert(lowered);
    }
  }
  members->is_exact_ = true;

  if (Trace) {
    VLOG(0) << " = " << members->ToString();
  }
  return members;
}
// Regexp walker that produces a Prefilter::Info* for each node of a
// regexp. PostVisit combines the Infos of a node's children into the
// Info for the node itself. ShortVisit returns the conservative
// AnyMatch() Info; presumably the base Regexp::Walker calls it when a
// walk is cut short -- confirm against the walker implementation.
class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
public:
// latin1 selects the Latin-1 literal/char-class helpers in PostVisit.
Walker(bool latin1) : latin1_(latin1) {}
virtual Info* PostVisit(
Regexp* re, Info* parent_arg,
Info* pre_arg,
Info** child_args, int nchild_args);
virtual Info* ShortVisit(
Regexp* re,
Info* parent_arg);
// Returns the flag given to the constructor.
bool latin1() { return latin1_; }
private:
bool latin1_;
DISALLOW_EVIL_CONSTRUCTORS(Walker);
};
// Walks re and returns the Info describing it; the caller owns the
// result. Returns NULL if the walker reports that it stopped early
// (the walk is given a budget of 100000).
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
  if (Trace) {
    LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
  }

  const bool latin1 = re->parse_flags() & Regexp::Latin1;
  Prefilter::Info::Walker walker(latin1);
  Prefilter::Info* result = walker.WalkExponential(re, NULL, 100000);

  if (!walker.stopped_early())
    return result;

  // A truncated walk cannot be trusted; discard whatever it produced.
  delete result;
  return NULL;
}
// Returns the Info that claims nothing about the text (AnyMatch),
// regardless of re and parent_arg.
Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
    Regexp* re, Prefilter::Info* parent_arg) {
  Prefilter::Info* fallback = AnyMatch();
  return fallback;
}
// Constructs the Prefilter::Info for the given regular expression.
// Assumes re is simplified.
// child_args[0..nchild_args) hold the Infos already computed for the
// children of re. The combining helpers used below (Concat, And, Alt,
// Star, Quest, Plus) delete the Infos passed to them, so child Infos
// are consumed here. parent_arg and pre_arg are not used.
Prefilter::Info* Prefilter::Info::Walker::PostVisit(
Regexp* re, Prefilter::Info* parent_arg,
Prefilter::Info* pre_arg, Prefilter::Info** child_args,
int nchild_args) {
Prefilter::Info *info;
switch (re->op()) {
// kRegexpRepeat shares the error path with unknown ops: it is not
// expected here because re is assumed to be simplified.
default:
case kRegexpRepeat:
LOG(DFATAL) << "Bad regexp op " << re->op();
info = EmptyString();
break;
case kRegexpNoMatch:
info = NoMatch();
break;
// These ops match the empty string:
case kRegexpEmptyMatch: // anywhere
case kRegexpBeginLine: // at beginning of line
case kRegexpEndLine: // at end of line
case kRegexpBeginText: // at beginning of text
case kRegexpEndText: // at end of text
case kRegexpWordBoundary: // at word boundary
case kRegexpNoWordBoundary: // not at word boundary
info = EmptyString();
break;
case kRegexpLiteral:
if (latin1()) {
info = LiteralLatin1(re->rune());
}
else {
info = Literal(re->rune());
}
break;
case kRegexpLiteralString:
// A literal string with no runes is reported as NoMatch.
if (re->nrunes() == 0) {
info = NoMatch();
break;
}
// Otherwise concatenate the exact Infos of the individual runes.
if (latin1()) {
info = LiteralLatin1(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
info = Concat(info, LiteralLatin1(re->runes()[i]));
}
} else {
info = Literal(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
info = Concat(info, Literal(re->runes()[i]));
}
}
break;
case kRegexpConcat: {
// Accumulate in info.
// Exact is concat of recent contiguous exact nodes.
info = NULL;
Info* exact = NULL;
for (int i = 0; i < nchild_args; i++) {
Info* ci = child_args[i]; // child info
// A child ends the current exact run if it is inexact, or if the
// cross product with the run would exceed 16 strings.
if (!ci->is_exact() ||
(exact && ci->exact().size() * exact->exact().size() > 16)) {
// Exact run is over.
info = And(info, exact);
exact = NULL;
// Add this child's info.
info = And(info, ci);
} else {
// Append to exact run.
exact = Concat(exact, ci);
}
}
// Fold in the trailing exact run, if any.
info = And(info, exact);
}
break;
case kRegexpAlternate:
info = child_args[0];
for (int i = 1; i < nchild_args; i++)
info = Alt(info, child_args[i]);
VLOG(10) << "Alt: " << info->ToString();
break;
case kRegexpStar:
info = Star(child_args[0]);
break;
case kRegexpQuest:
info = Quest(child_args[0]);
break;
case kRegexpPlus:
info = Plus(child_args[0]);
break;
case kRegexpAnyChar:
// Claim nothing, except that it's not empty.
info = AnyChar();
break;
case kRegexpCharClass:
info = CClass(re->cc(), latin1());
break;
case kRegexpCapture:
// These don't affect the set of matching strings.
info = child_args[0];
break;
}
if (Trace) {
VLOG(0) << "BuildInfo " << re->ToString()
<< ": " << info->ToString();
}
return info;
}
// Builds the Prefilter for re. The caller owns the returned
// Prefilter. Returns NULL if re is NULL, if re cannot be simplified,
// or if no Info could be built for it.
Prefilter* Prefilter::FromRegexp(Regexp* re) {
  if (re == NULL)
    return NULL;

  Regexp* simple = re->Simplify();
  // Defensive: never dereference (BuildInfo) or Decref a NULL result.
  if (simple == NULL)
    return NULL;

  Prefilter::Info *info = BuildInfo(simple);
  simple->Decref();
  if (info == NULL)
    return NULL;

  // Keep only the match; the Info wrapper is no longer needed.
  Prefilter* m = info->TakeMatch();
  delete info;
  return m;
}
// Returns a readable rendering of this prefilter:
//   NONE -> "*no-matches*", ALL -> "", ATOM -> the atom itself,
//   AND  -> children separated by spaces,
//   OR   -> children separated by '|' inside parentheses.
// A NULL receiver is rendered as "<nil>".
string Prefilter::DebugString() const {
  if (this == NULL)
    return "<nil>";

  switch (op_) {
    case NONE:
      return "*no-matches*";

    case ALL:
      return "";

    case ATOM:
      return atom_;

    case AND: {
      string joined = "";
      for (int i = 0; i < subs_->size(); i++) {
        if (i != 0)
          joined += " ";
        joined += (*subs_)[i]->DebugString();
      }
      return joined;
    }

    case OR: {
      string joined = "(";
      for (int i = 0; i < subs_->size(); i++) {
        if (i != 0)
          joined += "|";
        joined += (*subs_)[i]->DebugString();
      }
      joined += ")";
      return joined;
    }

    default:
      LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
      return StringPrintf("op%d", op_);
  }
}
// Builds the Prefilter for a compiled RE2. Returns NULL when re2 is
// NULL or has no Regexp; otherwise defers to FromRegexp.
Prefilter* Prefilter::FromRE2(const RE2* re2) {
  Regexp* regexp = (re2 == NULL) ? NULL : re2->Regexp();
  if (regexp != NULL)
    return FromRegexp(regexp);
  return NULL;
}
} // namespace re2

105
outside/re2/re2/prefilter.h Normal file
View File

@ -0,0 +1,105 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Prefilter is the class used to extract string guards from regexps.
// Rather than using Prefilter class directly, use FilteredRE2.
// See filtered_re2.h
#ifndef RE2_PREFILTER_H_
#define RE2_PREFILTER_H_
#include "util/util.h"
namespace re2 {
class RE2;
class Regexp;
// A node in an AND/OR tree of string atoms that must occur in any
// text a regexp can match.
class Prefilter {
// Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
public:
enum Op {
ALL = 0, // Everything matches
NONE, // Nothing matches
ATOM, // The string atom() must match
AND, // All in subs() must match
OR, // One of subs() must match
};
explicit Prefilter(Op op);
~Prefilter();
// Kind of this node.
Op op() { return op_; }
// The string of an ATOM node.
const string& atom() const { return atom_; }
// Id shared by all nodes considered identical; assigned by
// PrefilterTree::AssignUniqueIds.
void set_unique_id(int id) { unique_id_ = id; }
int unique_id() const { return unique_id_; }
// The children of the Prefilter node.
// CHECK-fails unless this is an AND or OR node.
vector<Prefilter*>* subs() {
CHECK(op_ == AND || op_ == OR);
return subs_;
}
// Set the children vector. Prefilter takes ownership of subs and
// subs_ will be deleted when Prefilter is deleted.
void set_subs(vector<Prefilter*>* subs) { subs_ = subs; }
// Given a RE2, return a Prefilter. The caller takes ownership of
// the Prefilter and should deallocate it. Returns NULL if Prefilter
// cannot be formed.
static Prefilter* FromRE2(const RE2* re2);
// Returns a readable debug string of the prefilter.
string DebugString() const;
private:
class Info;
// Combines two prefilters together to create an AND. The passed
// Prefilters will be part of the returned Prefilter or deleted.
static Prefilter* And(Prefilter* a, Prefilter* b);
// Combines two prefilters together to create an OR. The passed
// Prefilters will be part of the returned Prefilter or deleted.
static Prefilter* Or(Prefilter* a, Prefilter* b);
// Generalized And/Or
static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
// Builds the Prefilter for a Regexp; returns NULL if a is NULL or no
// Info can be built for it.
static Prefilter* FromRegexp(Regexp* a);
static Prefilter* FromString(const string& str);
static Prefilter* OrStrings(set<string>* ss);
// Walks re and returns its Info, or NULL if the walk stopped early.
static Info* BuildInfo(Regexp* re);
Prefilter* Simplify();
// Kind of Prefilter.
Op op_;
// Sub-matches for AND or OR Prefilter.
vector<Prefilter*>* subs_;
// Actual string to match in leaf node.
string atom_;
// If different prefilters have the same string atom, or if they are
// structurally the same (e.g., OR of same atom strings) they are
// considered the same unique nodes. This is the id for each unique
// node. PrefilterTree::AssignUniqueIds gives the first node seen a
// fresh id and gives every duplicate of it that same id.
int unique_id_;
// Used for debugging, helps in tracking memory leaks.
int alloc_id_;
DISALLOW_EVIL_CONSTRUCTORS(Prefilter);
};
} // namespace re2
#endif // RE2_PREFILTER_H_

View File

@ -0,0 +1,397 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
#include "util/flags.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
#include "re2/re2.h"
// Minimum length of an atom worth keeping (default 3). KeepPart
// compares every ATOM prefilter's length against this flag and drops
// atoms that are shorter.
DEFINE_int32(filtered_re2_min_atom_len,
3,
"Strings less than this length are not stored as atoms");
namespace re2 {
// Creates an empty tree that has not been compiled yet.
PrefilterTree::PrefilterTree() : compiled_(false) {}
// Frees every prefilter handed to Add() and the parent map of every
// entry.
PrefilterTree::~PrefilterTree() {
  for (vector<Prefilter*>::iterator pf = prefilter_vec_.begin();
       pf != prefilter_vec_.end(); ++pf) {
    delete *pf;
  }

  for (vector<Entry>::iterator entry = entries_.begin();
       entry != entries_.end(); ++entry) {
    delete entry->parents;
  }
}
// Functions used for adding and Compiling prefilters to the
// PrefilterTree.
// Decides whether prefilter is worth keeping as a filter.
//   ALL  : never kept.
//   ATOM : kept when at least FLAGS_filtered_re2_min_atom_len long.
//   AND  : children that are not kept are deleted and removed in
//          place; kept if at least one child remains.
//   OR   : kept only if every child is kept.
// Any other op is logged as unexpected and not kept. level is the
// recursion depth.
static bool KeepPart(Prefilter* prefilter, int level) {
  if (prefilter == NULL)
    return false;

  switch (prefilter->op()) {
    case Prefilter::ALL:
      return false;

    case Prefilter::ATOM:
      return prefilter->atom().size() >=
             FLAGS_filtered_re2_min_atom_len;

    case Prefilter::AND: {
      vector<Prefilter*>* subs = prefilter->subs();
      int kept = 0;
      for (int i = 0; i < subs->size(); i++) {
        Prefilter* child = (*subs)[i];
        if (KeepPart(child, level + 1)) {
          // Compact survivors towards the front of the vector.
          (*subs)[kept] = child;
          kept++;
        } else {
          delete child;
        }
      }
      subs->resize(kept);
      return kept > 0;
    }

    case Prefilter::OR: {
      vector<Prefilter*>* subs = prefilter->subs();
      for (int i = 0; i < subs->size(); i++) {
        if (!KeepPart((*subs)[i], level + 1))
          return false;
      }
      return true;
    }

    default:
      LOG(DFATAL) << "Unexpected op in KeepPart: "
                  << prefilter->op();
      return false;
  }
}
// Registers the prefilter of the next regexp; the tree takes
// ownership of f. A prefilter that KeepPart rejects is deleted and
// recorded as NULL. Calling Add after Compile is an error and is
// ignored.
void PrefilterTree::Add(Prefilter *f) {
  if (compiled_) {
    LOG(DFATAL) << "Add after Compile.";
    return;
  }

  Prefilter* kept = f;
  if (kept != NULL && !KeepPart(kept, 0)) {
    delete kept;
    kept = NULL;
  }
  prefilter_vec_.push_back(kept);
}
void PrefilterTree::Compile(vector<string>* atom_vec) {
if (compiled_) {
LOG(DFATAL) << "Compile after Compile.";
return;
}
// We do this check to support some legacy uses of
// PrefilterTree that call Compile before adding any regexps,
// and expect Compile not to have effect.
if (prefilter_vec_.empty())
return;
compiled_ = true;
AssignUniqueIds(atom_vec);
// Identify nodes that are too common among prefilters and are
// triggering too many parents. Then get rid of them if possible.
// Note that getting rid of a prefilter node simply means they are
// no longer necessary for their parent to trigger; that is, we do
// not miss out on any regexps triggering by getting rid of a
// prefilter node.
for (int i = 0; i < entries_.size(); i++) {
StdIntMap* parents = entries_[i].parents;
if (parents->size() > 8) {
// This one triggers too many things. If all the parents are AND
// nodes and have other things guarding them, then get rid of
// this trigger. TODO(vsri): Adjust the threshold appropriately,
// make it a function of total number of nodes?
bool have_other_guard = true;
for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
have_other_guard = have_other_guard &&
(entries_[it->first].propagate_up_at_count > 1);
if (have_other_guard) {
for (StdIntMap::iterator it = parents->begin();
it != parents->end(); ++it)
entries_[it->first].propagate_up_at_count -= 1;
parents->clear(); // Forget the parents
}
}
}
PrintDebugInfo();
}
// Looks up the node registered in node_map_ under node's NodeString.
// Returns NULL when no node with that string has been registered.
Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
  map<string, Prefilter*>::iterator found =
      node_map_.find(NodeString(node));
  if (found != node_map_.end())
    return found->second;
  return NULL;
}
// Returns the decimal text of n.
static string Itoa(int n) {
  char digits[100];
  snprintf(digits, sizeof(digits), "%d", n);
  string text(digits);
  return text;
}
// Builds the key that identifies node: "<op>:" followed by the atom
// for ATOM nodes, or by the comma-separated unique ids of the
// children otherwise. The children must already have their ids.
string PrefilterTree::NodeString(Prefilter* node) const {
  // The op prefix keeps AND, OR and atom nodes from colliding.
  string key = Itoa(node->op()) + ":";

  if (node->op() == Prefilter::ATOM) {
    key += node->atom();
    return key;
  }

  vector<Prefilter*>* children = node->subs();
  for (int i = 0; i < children->size(); i++) {
    if (i != 0)
      key += ',';
    key += Itoa((*children)[i]->unique_id());
  }
  return key;
}
// Assigns a unique id to every distinct prefilter node, builds one
// Entry per unique node, and fills atom_vec with the distinct atoms.
// Runs in four passes over v, the list of all nodes:
//   1. collect nodes top-down (top-level prefilters first);
//   2. bottom-up, give each node a fresh id or the id of the node
//      already registered under the same NodeString;
//   3. bottom-up, allocate and fill the Entry of each canonical node;
//   4. record, for each top-level prefilter, the regexp index in the
//      Entry of its canonical node.
void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
atom_vec->clear();
// Build vector of all filter nodes, sorted topologically
// from top to bottom in v.
vector<Prefilter*> v;
// Add the top level nodes of each regexp prefilter.
for (int i = 0; i < prefilter_vec_.size(); i++) {
Prefilter* f = prefilter_vec_[i];
// NULL means the regexp has no usable prefilter: it always passes.
if (f == NULL)
unfiltered_.push_back(i);
// We push NULL also on to v, so that we maintain the
// mapping of index==regexpid for level=0 prefilter nodes.
v.push_back(f);
}
// Now add all the descendant nodes.
// v grows while it is scanned, so children are appended after their
// parents and are themselves expanded later in the same loop.
for (int i = 0; i < v.size(); i++) {
Prefilter* f = v[i];
if (f == NULL)
continue;
if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
const vector<Prefilter*>& subs = *f->subs();
for (int j = 0; j < subs.size(); j++)
v.push_back(subs[j]);
}
}
// Identify unique nodes.
// Walk bottom-up so children have ids before NodeString(parent)
// is computed.
int unique_id = 0;
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter *node = v[i];
if (node == NULL)
continue;
node->set_unique_id(-1);
Prefilter* canonical = CanonicalNode(node);
if (canonical == NULL) {
// Any further nodes that have the same node string
// will find this node as the canonical node.
node_map_[NodeString(node)] = node;
if (node->op() == Prefilter::ATOM) {
atom_vec->push_back(node->atom());
atom_index_to_id_.push_back(unique_id);
}
node->set_unique_id(unique_id++);
} else {
// Duplicate: share the id of the canonical node.
node->set_unique_id(canonical->unique_id());
}
}
// One entry per unique node, indexed by unique id.
entries_.resize(node_map_.size());
// Create parent IntMap for the entries.
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
// Only canonical nodes own an Entry.
if (CanonicalNode(prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
entry->parents = new StdIntMap();
}
// Fill the entries.
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
if (CanonicalNode(prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
switch (prefilter->op()) {
default:
case Prefilter::ALL:
LOG(DFATAL) << "Unexpected op: " << prefilter->op();
return;
case Prefilter::ATOM:
entry->propagate_up_at_count = 1;
break;
case Prefilter::OR:
case Prefilter::AND: {
set<int> uniq_child;
for (int j = 0; j < prefilter->subs()->size() ; j++) {
Prefilter* child = (*prefilter->subs())[j];
Prefilter* canonical = CanonicalNode(child);
if (canonical == NULL) {
LOG(DFATAL) << "Null canonical node";
return;
}
int child_id = canonical->unique_id();
uniq_child.insert(child_id);
// To the child, we want to add to parent indices.
Entry* child_entry = &entries_[child_id];
if (child_entry->parents->find(prefilter->unique_id()) == child_entry->parents->end())
(*child_entry->parents)[prefilter->unique_id()] = 1;
}
// An AND fires once all of its distinct children fired; an OR
// fires as soon as one child did.
entry->propagate_up_at_count =
prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
break;
}
}
}
// For top level nodes, populate regexp id.
for (int i = 0; i < prefilter_vec_.size(); i++) {
if (prefilter_vec_[i] == NULL)
continue;
int id = CanonicalNode(prefilter_vec_[i])->unique_id();
DCHECK_LE(0, id);
Entry* entry = &entries_[id];
entry->regexps.push_back(i);
}
}
// Functions for triggering during search.
void PrefilterTree::RegexpsGivenStrings(
const vector<int>& matched_atoms,
vector<int>* regexps) const {
regexps->clear();
if (!compiled_) {
LOG(WARNING) << "Compile() not called";
for (int i = 0; i < prefilter_vec_.size(); ++i)
regexps->push_back(i);
} else {
if (!prefilter_vec_.empty()) {
IntMap regexps_map(prefilter_vec_.size());
vector<int> matched_atom_ids;
for (int j = 0; j < matched_atoms.size(); j++) {
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
}
PropagateMatch(matched_atom_ids, &regexps_map);
for (IntMap::iterator it = regexps_map.begin();
it != regexps_map.end();
++it)
regexps->push_back(it->index());
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
}
}
sort(regexps->begin(), regexps->end());
}
// Propagates the matched atoms up the tree and records in regexps
// every regexp index attached to a node that ends up triggered.
// atom_ids are entry ids (not atom indices). work holds the triggered
// entries and count holds, per parent that needs more than one child,
// how many triggers it has received so far.
void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
IntMap* regexps) const {
IntMap count(entries_.size());
IntMap work(entries_.size());
for (int i = 0; i < atom_ids.size(); i++)
work.set(atom_ids[i], 1);
// NOTE(review): work.set() is called inside this loop, so the loop
// relies on newly added entries being visited by this same
// iteration -- confirm against SparseArray's iterator semantics.
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
const Entry& entry = entries_[it->index()];
VLOG(10) << "Processing: " << it->index();
// Record regexps triggered.
for (int i = 0; i < entry.regexps.size(); i++) {
VLOG(10) << "Regexp triggered: " << entry.regexps[i];
regexps->set(entry.regexps[i], 1);
}
int c;
// Pass trigger up to parents.
// (This inner iterator shadows the outer 'it' for the rest of the
// loop body.)
for (StdIntMap::iterator it = entry.parents->begin();
it != entry.parents->end();
++it) {
int j = it->first;
const Entry& parent = entries_[j];
VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
// Delay until all the children have succeeded.
if (parent.propagate_up_at_count > 1) {
if (count.has_index(j)) {
c = count.get_existing(j) + 1;
count.set_existing(j, c);
} else {
c = 1;
count.set_new(j, c);
}
// Not enough children have triggered yet.
if (c < parent.propagate_up_at_count)
continue;
}
VLOG(10) << "Triggering: " << j;
// Trigger the parent.
work.set(j, 1);
}
}
}
// Debugging help.
void PrefilterTree::PrintPrefilter(int regexpid) {
LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
}
void PrefilterTree::PrintDebugInfo() {
VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
VLOG(10) << "#Unique Nodes: " << entries_.size();
for (int i = 0; i < entries_.size(); ++i) {
StdIntMap* parents = entries_[i].parents;
const vector<int>& regexps = entries_[i].regexps;
VLOG(10) << "EntryId: " << i
<< " N: " << parents->size() << " R: " << regexps.size();
for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
VLOG(10) << it->first;
}
VLOG(10) << "Map:";
for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
iter != node_map_.end(); ++iter)
VLOG(10) << "NodeId: " << (*iter).second->unique_id()
<< " Str: " << (*iter).first;
}
// Recursively renders node for debugging: an ATOM is its string,
// any other node is "AND(...)" or "OR(...)" listing each child as
// "<unique id>:<child rendering>", separated by commas.
string PrefilterTree::DebugNodeString(Prefilter* node) const {
  if (node->op() == Prefilter::ATOM) {
    DCHECK(!node->atom().empty());
    string leaf = "";
    leaf += node->atom();
    return leaf;
  }

  // Spelling out the operation tells AND nodes apart from OR nodes.
  string rendered = "";
  rendered += node->op() == Prefilter::AND ? "AND" : "OR";
  rendered += "(";
  for (int i = 0; i < node->subs()->size() ; i++) {
    Prefilter* child = (*node->subs())[i];
    if (i != 0)
      rendered += ',';
    rendered += Itoa(child->unique_id());
    rendered += ":";
    rendered += DebugNodeString(child);
  }
  rendered += ")";
  return rendered;
}
} // namespace re2

View File

@ -0,0 +1,131 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The PrefilterTree class is used to form an AND-OR tree of strings
// that would trigger each regexp. The 'prefilter' of each regexp is
// added to PrefilterTree, and then PrefilterTree is used to find all
// the unique strings across the prefilters. During search, by using
// matches from a string matching engine, PrefilterTree deduces the
// set of regexps that are to be triggered. The 'string matching
// engine' itself is outside of this class, and the caller can use any
// favorite engine. PrefilterTree provides a set of strings (called
// atoms) that the user of this class should use to do the string
// matching.
//
#ifndef RE2_PREFILTER_TREE_H_
#define RE2_PREFILTER_TREE_H_
#include "util/util.h"
#include "util/sparse_array.h"
namespace re2 {
// Sparse int -> int map; used for the work, count and regexp sets
// while propagating matches.
typedef SparseArray<int> IntMap;
// Ordered int -> int map; used for the parent set of each Entry.
typedef map<int,int> StdIntMap;
class Prefilter;
// AND-OR tree over the prefilters of a set of regexps. Usage, per
// the method comments below: Add() one prefilter per regexp,
// Compile() once to obtain the atoms, then RegexpsGivenStrings() per
// searched text.
class PrefilterTree {
public:
PrefilterTree();
~PrefilterTree();
// Adds the prefilter for the next regexp. Note that we assume that
// Add is called sequentially for all regexps. All Add calls
// must precede Compile.
void Add(Prefilter* prefilter);
// The Compile returns a vector of string in atom_vec.
// Call this after all the prefilters are added through Add.
// No calls to Add after Compile are allowed.
// The caller should use the returned set of strings to do string matching.
// Each time a string matches, the corresponding index then has to be
// passed to RegexpsGivenStrings below.
void Compile(vector<string>* atom_vec);
// Given the indices of the atoms that matched, returns the indexes
// of regexps that should be searched. The matched_atoms should
// contain all the ids of string atoms that were found to match the
// content. The caller can use any string match engine to perform
// this function. This function is thread safe.
void RegexpsGivenStrings(const vector<int>& matched_atoms,
vector<int>* regexps) const;
// Print debug prefilter. Also prints unique ids associated with
// nodes of the prefilter of the regexp.
void PrintPrefilter(int regexpid);
// Each unique node has a corresponding Entry that helps in
// passing the matching trigger information along the tree.
struct Entry {
public:
// How many children should match before this node triggers the
// parent. For an atom and an OR node, this is 1 and for an AND
// node, it is the number of unique children.
int propagate_up_at_count;
// When this node is ready to trigger the parent, what are the indices
// of the parent nodes to trigger. The reason there may be more than
// one is because of sharing. For example (abc | def) and (xyz | def)
// are two different nodes, but they share the atom 'def'. So when
// 'def' matches, it triggers two parents, corresponding to the two
// different OR nodes.
// Allocated in AssignUniqueIds and freed by ~PrefilterTree.
StdIntMap* parents;
// When this node is ready to trigger the parent, what are the
// regexps that are triggered.
vector<int> regexps;
};
private:
// This function assigns unique ids to various parts of the
// prefilter, by looking at if these nodes are already in the
// PrefilterTree.
void AssignUniqueIds(vector<string>* atom_vec);
// Given the matching atoms, find the regexps to be triggered.
void PropagateMatch(const vector<int>& atom_ids,
IntMap* regexps) const;
// Returns the prefilter node that has the same NodeString as this
// node. For the canonical node, returns node.
// Returns NULL if no node with that NodeString is registered yet.
Prefilter* CanonicalNode(Prefilter* node);
// A string that uniquely identifies the node. Assumes that the
// children of node have already been assigned unique ids.
string NodeString(Prefilter* node) const;
// Recursively constructs a readable prefilter string.
string DebugNodeString(Prefilter* node) const;
// Used for debugging.
void PrintDebugInfo();
// These are all the nodes formed by Compile. Essentially, there is
// one node for each unique atom and each unique AND/OR node.
vector<Entry> entries_;
// Map node string to canonical Prefilter node.
map<string, Prefilter*> node_map_;
// indices of regexps that always pass through the filter (since we
// found no required literals in these regexps).
vector<int> unfiltered_;
// vector of Prefilter for all regexps.
vector<Prefilter*> prefilter_vec_;
// Atom index in returned strings to entry id mapping.
vector<int> atom_index_to_id_;
// Has the prefilter tree been compiled.
bool compiled_;
DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
};
} // namespace
#endif // RE2_PREFILTER_TREE_H_

343
outside/re2/re2/prog.cc Normal file
View File

@ -0,0 +1,343 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Compiled regular expression representation.
// Tested by compile_test.cc
#include "util/util.h"
#include "util/sparse_set.h"
#include "re2/prog.h"
#include "re2/stringpiece.h"
namespace re2 {
// Constructors per Inst opcode
// Initializes this instruction as an Alt with successors out and
// out1. The instruction must still be blank (out_opcode_ == 0).
void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
  DCHECK_EQ(out_opcode_, 0);

  set_out_opcode(out, kInstAlt);
  out1_ = out1;
}
// Initializes this instruction as a ByteRange over [lo, hi] with
// successor out. Only the low 8 bits of lo and hi are kept; foldcase
// is stored as given. The instruction must still be blank.
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
  DCHECK_EQ(out_opcode_, 0);

  set_out_opcode(out, kInstByteRange);
  lo_ = lo & 0xFF;
  hi_ = hi & 0xFF;
  foldcase_ = foldcase;
}
// Initializes this instruction as a Capture of register cap with
// successor out. The instruction must still be blank.
void Prog::Inst::InitCapture(int cap, uint32 out) {
  DCHECK_EQ(out_opcode_, 0);

  set_out_opcode(out, kInstCapture);
  cap_ = cap;
}
// Initializes this instruction as an EmptyWidth check for the flags
// in empty, with successor out. The instruction must still be blank.
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
  DCHECK_EQ(out_opcode_, 0);

  set_out_opcode(out, kInstEmptyWidth);
  empty_ = empty;
}
// Initializes this instruction as a Match carrying match id `id`.
// The instruction must still be blank.
void Prog::Inst::InitMatch(int32 id) {
  DCHECK_EQ(out_opcode_, 0);

  set_opcode(kInstMatch);
  match_id_ = id;
}
// Initializes this instruction as a Nop. The instruction must still
// be blank. Only the opcode is set here; the out argument is accepted
// but not stored by this function.
void Prog::Inst::InitNop(uint32 out) {
  DCHECK_EQ(out_opcode_, 0);

  set_opcode(kInstNop);
}
// Initializes this instruction as a Fail. The instruction must still
// be blank.
void Prog::Inst::InitFail() {
  DCHECK_EQ(out_opcode_, 0);

  set_opcode(kInstFail);
}
string Prog::Inst::Dump() {
switch (opcode()) {
default:
return StringPrintf("opcode %d", static_cast<int>(opcode()));
case kInstAlt:
return StringPrintf("alt -> %d | %d", out(), out1_);
case kInstAltMatch:
return StringPrintf("altmatch -> %d | %d", out(), out1_);
case kInstByteRange:
return StringPrintf("byte%s [%02x-%02x] -> %d",
foldcase_ ? "/i" : "",
lo_, hi_, out());
case kInstCapture:
return StringPrintf("capture %d -> %d", cap_, out());
case kInstEmptyWidth:
return StringPrintf("emptywidth %#x -> %d",
static_cast<int>(empty_), out());
case kInstMatch:
return StringPrintf("match! %d", match_id());
case kInstNop:
return StringPrintf("nop -> %d", out());
case kInstFail:
return StringPrintf("fail");
}
}
// Creates an empty program: all flags false, all counters and sizes
// zero, and every pointer (instruction array, DFAs, DFA deleter,
// unbytemap, one-pass tables) NULL.
Prog::Prog()
: anchor_start_(false),
anchor_end_(false),
reversed_(false),
did_onepass_(false),
start_(0),
start_unanchored_(0),
size_(0),
byte_inst_count_(0),
bytemap_range_(0),
flags_(0),
onepass_statesize_(0),
inst_(NULL),
dfa_first_(NULL),
dfa_longest_(NULL),
dfa_mem_(0),
delete_dfa_(NULL),
unbytemap_(NULL),
onepass_nodes_(NULL),
onepass_start_(NULL) {
}
// Releases the DFAs through the delete_dfa_ callback (when one was
// installed) and frees the one-pass nodes, the instruction array and
// the unbytemap table.
Prog::~Prog() {
  if (delete_dfa_ != NULL) {
    if (dfa_first_ != NULL)
      delete_dfa_(dfa_first_);
    if (dfa_longest_ != NULL)
      delete_dfa_(dfa_longest_);
  }

  delete[] onepass_nodes_;
  delete[] inst_;
  delete[] unbytemap_;
}
// Work queue of instruction ids.
typedef SparseSet Workq;

// Inserts instruction id into q; id 0 is never queued.
static inline void AddToQueue(Workq* q, int id) {
  if (id == 0)
    return;
  q->insert(id);
}
// Renders every instruction reachable from the ids already in q, one
// "<id>. <dump>" line per instruction.
static string ProgToString(Prog* prog, Workq* q) {
  string listing;

  // q gains new ids while it is being walked, so end() has to be
  // re-read on every iteration.
  Workq::iterator cur = q->begin();
  while (cur != q->end()) {
    const int id = *cur;
    Prog::Inst* inst = prog->inst(id);
    StringAppendF(&listing, "%d. %s\n", id, inst->Dump().c_str());

    AddToQueue(q, inst->out());
    const InstOp op = inst->opcode();
    if (op == kInstAlt || op == kInstAltMatch)
      AddToQueue(q, inst->out1());

    ++cur;
  }
  return listing;
}
string Prog::Dump() {
string map;
if (false) { // Debugging
int lo = 0;
StringAppendF(&map, "byte map:\n");
for (int i = 0; i < bytemap_range_; i++) {
StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
lo = unbytemap_[i] + 1;
}
StringAppendF(&map, "\n");
}
Workq q(size_);
AddToQueue(&q, start_);
return map + ProgToString(this, &q);
}
string Prog::DumpUnanchored() {
Workq q(size_);
AddToQueue(&q, start_unanchored_);
return ProgToString(this, &q);
}
// Defined after Optimize; declared here because Optimize uses it.
static bool IsMatch(Prog*, Prog::Inst*);
// Peep-hole optimizer.
// Makes two passes over the instructions reachable from start_:
// the first redirects edges around Nop instructions, the second
// rewrites qualifying Alt instructions to AltMatch.
void Prog::Optimize() {
Workq q(size_);
// Eliminate nops. Most are taken out during compilation
// but a few are hard to avoid.
q.clear();
AddToQueue(&q, start_);
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
int id = *i;
Inst* ip = inst(id);
// Follow the out edge through any chain of Nops; 0 ends the chain.
int j = ip->out();
Inst* jp;
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
j = jp->out();
}
ip->set_out(j);
AddToQueue(&q, ip->out());
// An Alt has a second edge, out1, that gets the same treatment.
if (ip->opcode() == kInstAlt) {
j = ip->out1();
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
j = jp->out();
}
ip->out1_ = j;
AddToQueue(&q, ip->out1());
}
}
// Insert kInstAltMatch instructions
// Look for
// ip: Alt -> j | k
// j: ByteRange [00-FF] -> ip
// k: Match
// or the reverse (the above is the greedy one).
// Rewrite Alt to AltMatch.
q.clear();
AddToQueue(&q, start_);
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
int id = *i;
Inst* ip = inst(id);
AddToQueue(&q, ip->out());
if (ip->opcode() == kInstAlt)
AddToQueue(&q, ip->out1());
if (ip->opcode() == kInstAlt) {
Inst* j = inst(ip->out());
Inst* k = inst(ip->out1());
// Greedy form: out loops back over any byte, out1 leads to a match.
if (j->opcode() == kInstByteRange && j->out() == id &&
j->lo() == 0x00 && j->hi() == 0xFF &&
IsMatch(this, k)) {
ip->set_opcode(kInstAltMatch);
continue;
}
// Reversed form: out leads to a match, out1 loops back over any byte.
if (IsMatch(this, j) &&
k->opcode() == kInstByteRange && k->out() == id &&
k->lo() == 0x00 && k->hi() == 0xFF) {
ip->set_opcode(kInstAltMatch);
}
}
}
}
// Reports whether ip is a guaranteed match at end of text, possibly
// after passing through some Capture or Nop instructions first.
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
  while (true) {
    switch (ip->opcode()) {
      case kInstMatch:
        return true;

      case kInstCapture:
      case kInstNop:
        // Transparent instructions: keep following the out edge.
        ip = prog->inst(ip->out());
        break;

      case kInstAlt:
      case kInstAltMatch:
      case kInstByteRange:
      case kInstFail:
      case kInstEmptyWidth:
        return false;

      default:
        LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
        return false;
    }
  }
}
// Returns the set of empty-width conditions (kEmpty* flags) that
// hold at position p of text.
uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
  const bool at_begin = (p == text.begin());
  const bool at_end = (p == text.end());
  int flags = 0;

  // ^ and \A
  if (at_begin)
    flags |= kEmptyBeginText | kEmptyBeginLine;
  else if (p[-1] == '\n')
    flags |= kEmptyBeginLine;

  // $ and \z
  if (at_end)
    flags |= kEmptyEndText | kEmptyEndLine;
  else if (p < text.end() && p[0] == '\n')
    flags |= kEmptyEndLine;

  // \b and \B
  bool boundary;
  if (at_begin && at_end) {
    // Empty text: no word boundary here.
    boundary = false;
  } else if (at_begin) {
    boundary = IsWordChar(p[0]);
  } else if (at_end) {
    boundary = IsWordChar(p[-1]);
  } else {
    boundary = IsWordChar(p[-1]) != IsWordChar(p[0]);
  }

  if (boundary)
    flags |= kEmptyWordBoundary;
  else
    flags |= kEmptyNonWordBoundary;

  return flags;
}
// Records the byte range [lo, hi] in byterange_ by marking lo - 1
// (when lo > 0) and hi. Both bounds are expected to lie in [0, 255]
// with lo <= hi; values outside that range are not marked.
void Prog::MarkByteRange(int lo, int hi) {
  DCHECK_GE(lo, 0);
  DCHECK_GE(hi, 0);
  DCHECK_LE(lo, 255);
  DCHECK_LE(hi, 255);
  DCHECK_LE(lo, hi);

  const bool lo_markable = (0 < lo) && (lo <= 255);
  if (lo_markable)
    byterange_.Set(lo - 1);

  const bool hi_markable = (0 <= hi) && (hi <= 255);
  if (hi_markable)
    byterange_.Set(hi);
}
void Prog::ComputeByteMap() {
// Fill in bytemap with byte classes for prog_.
// Ranges of bytes that are treated as indistinguishable
// by the regexp program are mapped to a single byte class.
// The vector prog_->byterange() marks the end of each
// such range.
const Bitmap<256>& v = byterange();
COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
uint8 n = 0;
uint32 bits = 0;
for (int i = 0; i < 256; i++) {
if ((i&31) == 0)
bits = v.Word(i >> 5);
bytemap_[i] = n;
n += bits & 1;
bits >>= 1;
}
bytemap_range_ = bytemap_[255] + 1;
unbytemap_ = new uint8[bytemap_range_];
for (int i = 0; i < 256; i++)
unbytemap_[bytemap_[i]] = i;
if (0) { // For debugging: use trivial byte map.
for (int i = 0; i < 256; i++) {
bytemap_[i] = i;
unbytemap_[i] = i;
}
bytemap_range_ = 256;
LOG(INFO) << "Using trivial bytemap.";
}
}
} // namespace re2

376
outside/re2/re2/prog.h Normal file
View File

@ -0,0 +1,376 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Compiled representation of regular expressions.
// See regexp.h for the Regexp class, which represents a regular
// expression symbolically.
#ifndef RE2_PROG_H__
#define RE2_PROG_H__
#include "util/util.h"
#include "re2/re2.h"
namespace re2 {
// Simple fixed-size bitmap.
template<int Bits>
class Bitmap {
public:
Bitmap() { Reset(); }
int Size() { return Bits; }
void Reset() {
for (int i = 0; i < Words; i++)
w_[i] = 0;
}
bool Get(int k) const {
return w_[k >> WordLog] & (1<<(k & 31));
}
void Set(int k) {
w_[k >> WordLog] |= 1<<(k & 31);
}
void Clear(int k) {
w_[k >> WordLog] &= ~(1<<(k & 31));
}
uint32 Word(int i) const {
return w_[i];
}
private:
static const int WordLog = 5;
static const int Words = (Bits+31)/32;
uint32 w_[Words];
DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
};
// Opcodes for Inst.
// Prog::Inst keeps the opcode in the low 3 bits of out_opcode_ (see
// Inst::opcode()), so every value here must fit in 3 bits; the eight
// opcodes below use values 0 through 7.
enum InstOp {
kInstAlt = 0, // choose between out_ and out1_
kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
kInstByteRange, // next (possibly case-folded) byte must be in [lo_, hi_]
kInstCapture, // capturing parenthesis number cap_
kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
kInstMatch, // found a match!
kInstNop, // no-op; occasionally unavoidable
kInstFail, // never match; occasionally unavoidable
};
// Bit flags for empty-width specials.
// Each flag occupies its own bit so that several can be OR'ed together,
// as is done for Inst::empty_ and for the result of Prog::EmptyFlags().
enum EmptyOp {
kEmptyBeginLine = 1<<0, // ^ - beginning of line
kEmptyEndLine = 1<<1, // $ - end of line
kEmptyBeginText = 1<<2, // \A - beginning of text
kEmptyEndText = 1<<3, // \z - end of text
kEmptyWordBoundary = 1<<4, // \b - word boundary
kEmptyNonWordBoundary = 1<<5, // \B - not \b
kEmptyAllFlags = (1<<6)-1, // mask with all six flags above set
};
class Regexp;
class DFA;
struct OneState;
// Compiled form of regexp program.
class Prog {
public:
Prog();
~Prog();
// Single instruction in regexp program.
// The opcode and the index of the next instruction ("out") are packed
// together into out_opcode_; the remaining per-opcode argument lives in
// an anonymous union, so only the field matching opcode() is meaningful.
class Inst {
public:
// Default state: out_opcode_ == 0, i.e. opcode kInstAlt with out == 0.
Inst() : out_opcode_(0), out1_(0) { }
// Constructors per opcode
void InitAlt(uint32 out, uint32 out1);
void InitByteRange(int lo, int hi, int foldcase, uint32 out);
void InitCapture(int cap, uint32 out);
void InitEmptyWidth(EmptyOp empty, uint32 out);
void InitMatch(int id);
void InitNop(uint32 out);
void InitFail();
// Getters
// Index of this instruction within p's instruction array.
int id(Prog* p) { return this - p->inst_; }
// Opcode, taken from the low 3 bits of out_opcode_.
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
// Index of the next instruction, taken from the bits above the opcode.
int out() { return out_opcode_>>3; }
// The getters below DCHECK that the opcode matches the union member
// they read.
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
// For a kInstAltMatch: returns true when the instruction reached via
// out() is a kInstByteRange (rather than the one reached via out1()).
bool greedy(Prog *p) {
DCHECK_EQ(opcode(), kInstAltMatch);
return p->inst(out())->opcode() == kInstByteRange;
}
// Does this inst (a kInstByteRange) match c?
// When foldcase_ is set, an upper-case ASCII c is converted to lower
// case before being compared against [lo_, hi_].
inline bool Matches(int c) {
DCHECK_EQ(opcode(), kInstByteRange);
if (foldcase_ && 'A' <= c && c <= 'Z')
c += 'a' - 'A';
return lo_ <= c && c <= hi_;
}
// Returns string representation for debugging.
string Dump();
// Maximum instruction id.
// (Must fit in out_opcode_, and PatchList steals another bit.)
static const int kMaxInst = (1<<28) - 1;
private:
// Replaces the opcode bits, keeping the current out.
void set_opcode(InstOp opcode) {
out_opcode_ = (out()<<3) | opcode;
}
// Replaces the out bits, keeping the current opcode.
void set_out(int out) {
out_opcode_ = (out<<3) | opcode();
}
// Sets both out and opcode at once.
void set_out_opcode(int out, InstOp opcode) {
out_opcode_ = (out<<3) | opcode;
}
uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
union { // additional instruction arguments:
uint32 out1_; // opcode == kInstAlt or kInstAltMatch
// alternate next instruction
int32 cap_; // opcode == kInstCapture
// Index of capture register (holds text
// position recorded by capturing parentheses).
// For \n (the submatch for the nth parentheses),
// the left parenthesis captures into register 2*n
// and the right one captures into register 2*n+1.
int32 match_id_; // opcode == kInstMatch
// Match ID to identify this match (for re2::Set).
struct { // opcode == kInstByteRange
uint8 lo_; // byte range is lo_-hi_ inclusive
uint8 hi_; //
uint8 foldcase_; // convert A-Z to a-z before checking range.
};
EmptyOp empty_; // opcode == kInstEmptyWidth
// empty_ is bitwise OR of kEmpty* flags above.
};
friend class Compiler;
friend struct PatchList;
friend class Prog;
DISALLOW_EVIL_CONSTRUCTORS(Inst);
};
// Whether to anchor the search.
enum Anchor {
kUnanchored, // match anywhere
kAnchored, // match only starting at beginning of text
};
// Kind of match to look for (for anchor != kFullMatch)
//
// kLongestMatch mode finds the overall longest
// match but still makes its submatch choices the way
// Perl would, not in the way prescribed by POSIX.
// The POSIX rules are much more expensive to implement,
// and no one has needed them.
//
// kFullMatch is not strictly necessary -- we could use
// kLongestMatch and then check the length of the match -- but
// the matching code can run faster if it knows to consider only
// full matches.
enum MatchKind {
kFirstMatch, // like Perl, PCRE
kLongestMatch, // like egrep or POSIX
kFullMatch, // match only entire text; implies anchor==kAnchored
kManyMatch // for SearchDFA, records set of matches
};
// Trivial accessors and mutators for the private fields declared below.
// Returns a pointer to instruction number id; the index is not checked.
Inst *inst(int id) { return &inst_[id]; }
// Entry point for the program, and the entry point for unanchored search.
int start() { return start_; }
int start_unanchored() { return start_unanchored_; }
void set_start(int start) { start_ = start; }
void set_start_unanchored(int start) { start_unanchored_ = start; }
// Number of instructions in the program.
int64 size() { return size_; }
// Whether the program runs backward over its input.
bool reversed() { return reversed_; }
void set_reversed(bool reversed) { reversed_ = reversed; }
// Number of kInstByteRange instructions.
int64 byte_inst_count() { return byte_inst_count_; }
// Bitmap in which bit x is set if byte x ends a commonly-treated range.
const Bitmap<256>& byterange() { return byterange_; }
// Maximum memory for DFAs.
void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
int64 dfa_mem() { return dfa_mem_; }
// Regexp parse flags.
int flags() { return flags_; }
void set_flags(int flags) { flags_ = flags; }
// Whether the regexp has an explicit start / end anchor.
bool anchor_start() { return anchor_start_; }
void set_anchor_start(bool b) { anchor_start_ = b; }
bool anchor_end() { return anchor_end_; }
void set_anchor_end(bool b) { anchor_end_ = b; }
// Number of byte classes: every bytemap()[x] is < bytemap_range().
int bytemap_range() { return bytemap_range_; }
// Map from input bytes to byte classes (256 entries).
const uint8* bytemap() { return bytemap_; }
// Returns string representation of program for debugging.
string Dump();
string DumpUnanchored();
// Record that at some point in the prog, the bytes in the range
// lo-hi (inclusive) are treated as different from bytes outside the range.
// Tracking this lets the DFA collapse commonly-treated byte ranges
// when recording state pointers, greatly reducing its memory footprint.
void MarkByteRange(int lo, int hi);
// Returns the set of kEmpty flags that are in effect at
// position p within context.
static uint32 EmptyFlags(const StringPiece& context, const char* p);
// Reports whether byte c counts as a word character for \b and \B.
// Only ASCII letters, ASCII digits and '_' qualify.
// This is wrong for Unicode text, but:
//  - a correct answer is hard to compute when matching a byte at a time
//    (the DFA has only one-byte lookahead);
//  - even with enough lookahead, the resulting Progs would be huge.
// PCRE uses the same crude approximation.
static bool IsWordChar(uint8 c) {
  if ('a' <= c && c <= 'z')
    return true;
  if ('A' <= c && c <= 'Z')
    return true;
  if ('0' <= c && c <= '9')
    return true;
  return c == '_';
}
// Execution engines. They all search for the regexp (run the prog)
// in text, which is in the larger context (used for ^ $ \b etc).
// Anchor and kind control the kind of search.
// Returns true if match found, false if not.
// If match found, fills match[0..nmatch-1] with submatch info.
// match[0] is overall match, match[1] is first set of parens, etc.
// If a particular submatch is not matched during the regexp match,
// it is set to NULL.
//
// Matching text == StringPiece(NULL, 0) is treated as any other empty
// string, but note that on return, it will not be possible to distinguish
// submatches that matched that empty string from submatches that didn't
// match anything. Either way, match[i] == NULL.
// Search using NFA: can find submatches but kind of slow.
bool SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Search using DFA: much faster than NFA but only finds
// end of match and can use a lot more memory.
// Returns whether a match was found.
// If the DFA runs out of memory, sets *failed to true and returns false.
// If matches != NULL and kind == kManyMatch and there is a match,
// SearchDFA fills matches with the match IDs of the final matching state.
bool SearchDFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match0, bool* failed,
vector<int>* matches);
// Build the entire DFA for the given match kind. FOR TESTING ONLY.
// Usually the DFA is built out incrementally, as needed, which
// avoids lots of unnecessary work. This function is useful only
// for testing purposes. Returns number of states.
int BuildEntireDFA(MatchKind kind);
// Compute byte map.
void ComputeByteMap();
// Run peep-hole optimizer on program.
void Optimize();
// One-pass NFA: only correct if IsOnePass() is true,
// but much faster than NFA (competitive with PCRE)
// for those expressions.
bool IsOnePass();
bool SearchOnePass(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Bit-state backtracking. Fast on small cases but uses memory
// proportional to the product of the program size and the text size.
bool SearchBitState(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
static const int kMaxOnePassCapture = 5; // $0 through $4
// Backtracking search: the gold standard against which the other
// implementations are checked. FOR TESTING ONLY.
// It allocates a ton of memory to avoid running forever.
// It is also recursive, so can't use in production (will overflow stacks).
// The name "Unsafe" here is supposed to be a flag that
// you should not be using this function.
bool UnsafeSearchBacktrack(const StringPiece& text,
const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
// maximum desired length of string returned.
//
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
// string s that is an anchored match for this regexp satisfies
// min <= s && s <= max.
//
// Note that PossibleMatchRange() will only consider the first copy of an
// infinitely repeated element (i.e., any regexp element followed by a '*' or
// '+' operator). Regexps with "{N}" constructions are not affected, as those
// do not compile down to infinite repetitions.
//
// Returns true on success, false on error.
bool PossibleMatchRange(string* min, string* max, int maxlen);
// Compiles a collection of regexps to Prog. Each regexp will have
// its own Match instruction recording the index in the vector.
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
Regexp* re);
private:
friend class Compiler;
DFA* GetDFA(MatchKind kind);
bool anchor_start_; // regexp has explicit start anchor
bool anchor_end_; // regexp has explicit end anchor
bool reversed_; // whether program runs backward over input
bool did_onepass_; // has IsOnePass been called?
int start_; // entry point for program
int start_unanchored_; // unanchored entry point for program
int size_; // number of instructions
int byte_inst_count_; // number of kInstByteRange instructions
int bytemap_range_; // bytemap_[x] < bytemap_range_
int flags_; // regexp parse flags
int onepass_statesize_; // byte size of each OneState* node
Inst* inst_; // pointer to instruction array
Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
DFA* volatile dfa_first_; // DFA cached for kFirstMatch
DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
int64 dfa_mem_; // Maximum memory for DFAs.
void (*delete_dfa_)(DFA* dfa);
Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
// commonly-treated byte range.
uint8 bytemap_[256]; // map from input bytes to byte classes
uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
uint8* onepass_nodes_; // data for OnePass nodes
OneState* onepass_start_; // start node for OnePass program
DISALLOW_EVIL_CONSTRUCTORS(Prog);
};
} // namespace re2
#endif // RE2_PROG_H__

1218
outside/re2/re2/re2.cc Normal file

File diff suppressed because it is too large Load Diff

877
outside/re2/re2/re2.h Normal file
View File

@ -0,0 +1,877 @@
// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_RE2_H
#define RE2_RE2_H
// C++ interface to the re2 regular-expression library.
// RE2 supports Perl-style regular expressions (with extensions like
// \d, \w, \s, ...).
//
// -----------------------------------------------------------------------
// REGEXP SYNTAX:
//
// This module uses the re2 library and hence supports
// its syntax for regular expressions, which is similar to Perl's with
// some of the more complicated things thrown away. In particular,
// backreferences and generalized assertions are not available, nor is \Z.
//
// See http://code.google.com/p/re2/wiki/Syntax for the syntax
// supported by RE2, and a comparison with PCRE and PERL regexps.
//
// For those not familiar with Perl's regular expressions,
// here are some examples of the most commonly used extensions:
//
// "hello (\\w+) world" -- \w matches a "word" character
// "version (\\d+)" -- \d matches a digit
// "hello\\s+world" -- \s matches any whitespace character
// "\\b(\\w+)\\b" -- \b matches the empty string at a word boundary
// "(?i)hello" -- (?i) turns on case-insensitive matching
// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
//
// -----------------------------------------------------------------------
// MATCHING INTERFACE:
//
// The "FullMatch" operation checks that supplied text matches a
// supplied pattern exactly.
//
// Example: successful match
// CHECK(RE2::FullMatch("hello", "h.*o"));
//
// Example: unsuccessful match (requires full match):
// CHECK(!RE2::FullMatch("hello", "e"));
//
// -----------------------------------------------------------------------
// UTF-8 AND THE MATCHING INTERFACE:
//
// By default, the pattern and input text are interpreted as UTF-8.
// The RE2::Latin1 option causes them to be interpreted as Latin-1.
//
// Example:
// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
//
// -----------------------------------------------------------------------
// MATCHING WITH SUB-STRING EXTRACTION:
//
// You can supply extra pointer arguments to extract matched subpieces.
//
// Example: extracts "ruby" into "s" and 1234 into "i"
// int i;
// string s;
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
//
// Example: fails because string cannot be stored in integer
// CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
//
// Example: fails because there aren't enough sub-patterns:
// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
//
// Example: does not try to extract any extra sub-patterns
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
//
// Example: does not try to extract into NULL
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
//
// Example: integer overflow causes failure
// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
//
// NOTE(rsc): Asking for substrings slows successful matches quite a bit.
// This may get a little faster in the future, but right now is slower
// than PCRE. On the other hand, failed matches run *very* fast (faster
// than PCRE), as do matches without substring extraction.
//
// -----------------------------------------------------------------------
// PARTIAL MATCHES
//
// You can use the "PartialMatch" operation when you want the pattern
// to match any substring of the text.
//
// Example: simple search for a string:
// CHECK(RE2::PartialMatch("hello", "ell"));
//
// Example: find first number in a string
// int number;
// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
// CHECK_EQ(number, 100);
//
// -----------------------------------------------------------------------
// PRE-COMPILED REGULAR EXPRESSIONS
//
// RE2 makes it easy to use any string as a regular expression, without
// requiring a separate compilation step.
//
// If speed is of the essence, you can create a pre-compiled "RE2"
// object from the pattern and use it multiple times. If you do so,
// you can typically parse text faster than with sscanf.
//
// Example: precompile pattern for faster matching:
// RE2 pattern("h.*o");
// while (ReadLine(&str)) {
// if (RE2::FullMatch(str, pattern)) ...;
// }
//
// -----------------------------------------------------------------------
// SCANNING TEXT INCREMENTALLY
//
// The "Consume" operation may be useful if you want to repeatedly
// match regular expressions at the front of a string and skip over
// them as they match. This requires use of the "StringPiece" type,
// which represents a sub-range of a real string.
//
// Example: read lines of the form "var = value" from a string.
// string contents = ...; // Fill string somehow
// StringPiece input(contents); // Wrap a StringPiece around it
//
// string var;
// int value;
// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
// ...;
// }
//
// Each successful call to "Consume" will set "var/value", and also
// advance "input" so it points past the matched text. Note that if the
// regular expression matches an empty string, input will advance
// by 0 bytes. If the regular expression being used might match
// an empty string, the loop body must check for this case and either
// advance the string or break out of the loop.
//
// The "FindAndConsume" operation is similar to "Consume" but does not
// anchor your match at the beginning of the string. For example, you
// could extract all words from a string by repeatedly calling
// RE2::FindAndConsume(&input, "(\\w+)", &word)
//
// -----------------------------------------------------------------------
// USING VARIABLE NUMBER OF ARGUMENTS
//
// The above operations require you to know the number of arguments
// when you write the code. This is not always possible or easy (for
// example, the regular expression may be calculated at run time).
// You can use the "N" version of the operations when the number of
// match arguments are determined at run time.
//
// Example:
// const RE2::Arg* args[10];
// int n;
// // ... populate args with pointers to RE2::Arg values ...
// // ... set n to the number of RE2::Arg objects ...
// bool match = RE2::FullMatchN(input, pattern, args, n);
//
// The last statement is equivalent to
//
// bool match = RE2::FullMatch(input, pattern,
// *args[0], *args[1], ..., *args[n - 1]);
//
// -----------------------------------------------------------------------
// PARSING HEX/OCTAL/C-RADIX NUMBERS
//
// By default, if you pass a pointer to a numeric value, the
// corresponding text is interpreted as a base-10 number. You can
// instead wrap the pointer with a call to one of the operators Hex(),
// Octal(), or CRadix() to interpret the text in another base. The
// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
// prefixes, but defaults to base-10.
//
// Example:
// int a, b, c, d;
// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)));
// will leave 64 in a, b, c, and d.
#include <stdint.h>
#include <map>
#include <string>
#include "re2/stringpiece.h"
#include "re2/variadic_function.h"
#ifndef RE2_HAVE_LONGLONG
#define RE2_HAVE_LONGLONG 1
#endif
namespace re2 {
using std::string;
using std::map;
class Mutex;
class Prog;
class Regexp;
// The following enum should be used only as a constructor argument to indicate
// that the variable has static storage class, and that the constructor should
// do nothing to its state. It indicates to the reader that it is legal to
// declare a static instance of the class, provided the constructor is given
// the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a
// static variable that has a constructor or a destructor because invocation
// order is undefined. However, IF the type can be initialized by filling with
// zeroes (which the loader does for static variables), AND the type's
// destructor does nothing to the storage, then a constructor for static
// initialization can be declared as
// explicit MyClass(LinkerInitialized x) {}
// and invoked as
// static MyClass my_variable_name(LINKER_INITIALIZED);
enum LinkerInitialized { LINKER_INITIALIZED };
// Interface for regular expression matching. Also corresponds to a
// pre-compiled regular expression. An "RE2" object is safe for
// concurrent use by multiple threads.
class RE2 {
public:
// We convert user-passed pointers into special Arg objects
class Arg;
class Options;
// Defined in set.h.
class Set;
enum ErrorCode {
NoError = 0,
// Unexpected error
ErrorInternal,
// Parse errors
ErrorBadEscape, // bad escape sequence
ErrorBadCharClass, // bad character class
ErrorBadCharRange, // bad character class range
ErrorMissingBracket, // missing closing ]
ErrorMissingParen, // missing closing )
ErrorTrailingBackslash, // trailing \ at end of regexp
ErrorRepeatArgument, // repeat argument missing, e.g. "*"
ErrorRepeatSize, // bad repetition argument
ErrorRepeatOp, // bad repetition operator
ErrorBadPerlOp, // bad perl operator
ErrorBadUTF8, // invalid UTF-8 in regexp
ErrorBadNamedCapture, // bad named capture group
ErrorPatternTooLarge // pattern too large (compile failed)
};
// Predefined common options.
// If you need more complicated things, instantiate
// an Options object, possibly passing one of these to
// the Options constructor, change the settings, and pass that
// Options object to the RE2 constructor.
enum CannedOptions {
DefaultOptions = 0,
Latin1, // treat input as Latin-1 (default UTF-8)
POSIX, // POSIX syntax, leftmost-longest match
Quiet // do not log about regexp parse errors
};
// Need to have the const char* and const string& forms for implicit
// conversions when passing string literals to FullMatch and PartialMatch.
// Otherwise the StringPiece form would be sufficient.
#ifndef SWIG
RE2(const char* pattern);
RE2(const string& pattern);
#endif
RE2(const StringPiece& pattern);
RE2(const StringPiece& pattern, const Options& option);
~RE2();
// Returns whether RE2 was created properly.
bool ok() const { return error_code() == NoError; }
// The string specification for this RE2. E.g.
// RE2 re("ab*c?d+");
// re.pattern(); // "ab*c?d+"
const string& pattern() const { return pattern_; }
// If RE2 could not be created properly, returns an error string.
// Else returns the empty string.
const string& error() const { return *error_; }
// If RE2 could not be created properly, returns an error code.
// Else returns RE2::NoError (== 0).
ErrorCode error_code() const { return error_code_; }
// If RE2 could not be created properly, returns the offending
// portion of the regexp.
const string& error_arg() const { return error_arg_; }
// Returns the program size, a very approximate measure of a regexp's "cost".
// Larger numbers are more expensive than smaller numbers.
int ProgramSize() const;
// Returns the underlying Regexp; not for general use.
// Returns entire_regexp_ so that callers don't need
// to know about prefix_ and prefix_foldcase_.
re2::Regexp* Regexp() const { return entire_regexp_; }
/***** The useful part: the matching interface *****/
// Matches "text" against "pattern". If pointer arguments are
// supplied, copies matched sub-patterns into them.
//
// You can pass in a "const char*" or a "string" for "text".
// You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
//
// The provided pointer arguments can be pointers to any scalar numeric
// type, or one of:
// string (matched piece is copied to string)
// StringPiece (StringPiece is mutated to point to matched piece)
// T (where "bool T::ParseFrom(const char*, int)" exists)
// (void*)NULL (the corresponding matched sub-pattern is not copied)
//
// Returns true iff all of the following conditions are satisfied:
// a. "text" matches "pattern" exactly
// b. The number of matched sub-patterns is >= number of supplied pointers
// c. The "i"th argument has a suitable type for holding the
// string captured as the "i"th sub-pattern. If you pass in
// NULL for the "i"th argument, or pass fewer arguments than
// number of sub-patterns, "i"th captured sub-pattern is
// ignored.
//
// CAVEAT: An optional sub-pattern that does not exist in the
// matched string is assigned the empty string. Therefore, the
// following will return false (because the empty string is not a
// valid number):
// int number;
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
static bool FullMatchN(const StringPiece& text, const RE2& re,
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
// Exactly like FullMatch(), except that "pattern" is allowed to match
// a substring of "text".
static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
// Like FullMatch() and PartialMatch(), except that pattern has to
// match a prefix of "text", and "input" is advanced past the matched
// text. Note: "input" is modified iff this routine returns true.
static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
// Like Consume(..), but does not anchor the match at the beginning of the
// string. That is, "pattern" need not start its match at the beginning of
// "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
// word in "s" and stores it in "word".
static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
// Replace the first match of "pattern" in "str" with "rewrite".
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
// used to insert text matching corresponding parenthesized group
// from the pattern. \0 in "rewrite" refers to the entire matching
// text. E.g.,
//
// string s = "yabba dabba doo";
// CHECK(RE2::Replace(&s, "b+", "d"));
//
// will leave "s" containing "yada dabba doo"
//
// Returns true if the pattern matches and a replacement occurs,
// false otherwise.
static bool Replace(string *str,
const RE2& pattern,
const StringPiece& rewrite);
// Like Replace(), except replaces successive non-overlapping occurrences
// of the pattern in the string with the rewrite. E.g.
//
// string s = "yabba dabba doo";
// CHECK(RE2::GlobalReplace(&s, "b+", "d"));
//
// will leave "s" containing "yada dada doo"
// Replacements are not subject to re-matching.
//
// Because GlobalReplace only replaces non-overlapping matches,
// replacing "ana" within "banana" makes only one replacement, not two.
//
// Returns the number of replacements made.
static int GlobalReplace(string *str,
const RE2& pattern,
const StringPiece& rewrite);
// Like Replace, except that if the pattern matches, "rewrite"
// is copied into "out" with substitutions. The non-matching
// portions of "text" are ignored.
//
// Returns true iff a match occurred and the extraction happened
// successfully; if no match occurs, the string is left unaffected.
static bool Extract(const StringPiece &text,
const RE2& pattern,
const StringPiece &rewrite,
string *out);
// Escapes all potentially meaningful regexp characters in
// 'unquoted'. The returned string, used as a regular expression,
// will exactly match the original string. For example,
// 1.5-2.0?
// may become:
// 1\.5\-2\.0\?
static string QuoteMeta(const StringPiece& unquoted);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
// maximum desired length of string returned.
//
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
// string s that is an anchored match for this regexp satisfies
// min <= s && s <= max.
//
// Note that PossibleMatchRange() will only consider the first copy of an
// infinitely repeated element (i.e., any regexp element followed by a '*' or
// '+' operator). Regexps with "{N}" constructions are not affected, as those
// do not compile down to infinite repetitions.
//
// Returns true on success, false on error.
bool PossibleMatchRange(string* min, string* max, int maxlen) const;
// Generic matching interface
// Type of match.
enum Anchor {
UNANCHORED, // No anchoring
ANCHOR_START, // Anchor at start only
ANCHOR_BOTH // Anchor at start and end
};
// Return the number of capturing subpatterns, or -1 if the
// regexp wasn't valid on construction. The overall match ($0)
// does not count: if the regexp is "(a)(b)", returns 2.
int NumberOfCapturingGroups() const;
// Return a map from names to capturing indices.
// The map records the index of the leftmost group
// with the given name.
// Only valid until the re is deleted.
const map<string, int>& NamedCapturingGroups() const;
// Return a map from capturing indices to names.
// The map has no entries for unnamed groups.
// Only valid until the re is deleted.
const map<int, string>& CapturingGroupNames() const;
// General matching routine.
// Match against text starting at offset startpos
// and stopping the search at offset endpos.
// Returns true if match found, false if not.
// On a successful match, fills in match[] (up to nmatch entries)
// with information about submatches.
// I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
// setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
// match[3] = NULL, ..., up to match[nmatch-1] = NULL.
//
// Don't ask for more match information than you will use:
// runs much faster with nmatch == 1 than nmatch > 1, and
// runs even faster if nmatch == 0.
// Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),
// but will be handled correctly.
//
// Passing text == StringPiece(NULL, 0) will be handled like any other
// empty string, but note that on return, it will not be possible to tell
// whether submatch i matched the empty string or did not match:
// either way, match[i] == NULL.
bool Match(const StringPiece& text,
int startpos,
int endpos,
Anchor anchor,
StringPiece *match,
int nmatch) const;
// Check that the given rewrite string is suitable for use with this
// regular expression. It checks that:
// * The regular expression has enough parenthesized subexpressions
// to satisfy all of the \N tokens in rewrite
// * The rewrite string doesn't have any syntax errors. E.g.,
// '\' followed by anything other than a digit or '\'.
// A true return value guarantees that Replace() and Extract() won't
// fail because of a bad rewrite string.
bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
// Returns the maximum submatch needed for the rewrite to be done by
// Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.
static int MaxSubmatch(const StringPiece& rewrite);
// Append the "rewrite" string, with backslash substitutions from "vec",
// to string "out".
// Returns true on success. This method can fail because of a malformed
// rewrite string. CheckRewriteString guarantees that the rewrite will
// be successful.
bool Rewrite(string *out,
const StringPiece &rewrite,
const StringPiece* vec,
int veclen) const;
// Constructor options
// Plain bundle of settings with one getter/setter pair per field.
// Copying is only available through Copy(): the copy constructor and
// assignment operator are declared private at the bottom of the class.
class Options {
public:
// The options are (defaults in parentheses):
//
// utf8 (true) text and pattern are UTF-8; otherwise Latin-1
// posix_syntax (false) restrict regexps to POSIX egrep syntax
// longest_match (false) search for longest match, not first match
// log_errors (true) log syntax and execution errors to ERROR
// max_mem (see below) approx. max memory footprint of RE2
// literal (false) interpret string as literal, not regexp
// never_nl (false) never match \n, even if it is in regexp
// dot_nl (false) dot matches everything including new line
// never_capture (false) parse all parens as non-capturing
// case_sensitive (true) match is case-sensitive (regexp can override
// with (?i) unless in posix_syntax mode)
//
// The following options are only consulted when posix_syntax == true.
// (When posix_syntax == false these features are always enabled and
// cannot be turned off.)
// perl_classes (false) allow Perl's \d \s \w \D \S \W
// word_boundary (false) allow Perl's \b \B (word boundary and not)
// one_line (false) ^ and $ only match beginning and end of text
//
// The max_mem option controls how much memory can be used
// to hold the compiled form of the regexp (the Prog) and
// its cached DFA graphs. Code Search placed limits on the number
// of Prog instructions and DFA states: 10,000 for both.
// In RE2, those limits would translate to about 240 KB per Prog
// and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
// better job of keeping them small than Code Search did).
// Each RE2 has two Progs (one forward, one reverse), and each Prog
// can have two DFAs (one first match, one longest match).
// That makes 4 DFAs:
//
// forward, first-match - used for UNANCHORED or ANCHOR_LEFT searches
// if opt.longest_match() == false
// forward, longest-match - used for all ANCHOR_BOTH searches,
// and the other two kinds if
// opt.longest_match() == true
// reverse, first-match - never used
// reverse, longest-match - used as second phase for unanchored searches
//
// The RE2 memory budget is statically divided between the two
// Progs and then the DFAs: two thirds to the forward Prog
// and one third to the reverse Prog. The forward Prog gives half
// of what it has left over to each of its DFAs. The reverse Prog
// gives it all to its longest-match DFA.
//
// Once a DFA fills its budget, it flushes its cache and starts over.
// If this happens too often, RE2 falls back on the NFA implementation.
// For now, make the default budget something close to Code Search.
// 8<<20 bytes, i.e. 8 MiB.
static const int kDefaultMaxMem = 8<<20;
// Text/pattern encoding selector; utf8()/set_utf8() below are a boolean
// view of the same field.
enum Encoding {
EncodingUTF8 = 1,
EncodingLatin1
};
// Default constructor: applies the defaults listed in the table above.
Options() :
encoding_(EncodingUTF8),
posix_syntax_(false),
longest_match_(false),
log_errors_(true),
max_mem_(kDefaultMaxMem),
literal_(false),
never_nl_(false),
dot_nl_(false),
never_capture_(false),
case_sensitive_(true),
perl_classes_(false),
word_boundary_(false),
one_line_(false) {
}
// Converting constructor from a CannedOptions value; only declared here,
// the definition is not in this header section.
/*implicit*/ Options(CannedOptions);
Encoding encoding() const { return encoding_; }
void set_encoding(Encoding encoding) { encoding_ = encoding; }
// Legacy interface to encoding.
// TODO(rsc): Remove once clients have been converted.
bool utf8() const { return encoding_ == EncodingUTF8; }
// true selects EncodingUTF8, false selects EncodingLatin1.
void set_utf8(bool b) {
if (b) {
encoding_ = EncodingUTF8;
} else {
encoding_ = EncodingLatin1;
}
}
// Trivial accessors: each getter returns the field, each setter stores
// its argument without validation.
bool posix_syntax() const { return posix_syntax_; }
void set_posix_syntax(bool b) { posix_syntax_ = b; }
bool longest_match() const { return longest_match_; }
void set_longest_match(bool b) { longest_match_ = b; }
bool log_errors() const { return log_errors_; }
void set_log_errors(bool b) { log_errors_ = b; }
int64_t max_mem() const { return max_mem_; }
void set_max_mem(int64_t m) { max_mem_ = m; }
bool literal() const { return literal_; }
void set_literal(bool b) { literal_ = b; }
bool never_nl() const { return never_nl_; }
void set_never_nl(bool b) { never_nl_ = b; }
bool dot_nl() const { return dot_nl_; }
void set_dot_nl(bool b) { dot_nl_ = b; }
bool never_capture() const { return never_capture_; }
void set_never_capture(bool b) { never_capture_ = b; }
bool case_sensitive() const { return case_sensitive_; }
void set_case_sensitive(bool b) { case_sensitive_ = b; }
bool perl_classes() const { return perl_classes_; }
void set_perl_classes(bool b) { perl_classes_ = b; }
bool word_boundary() const { return word_boundary_; }
void set_word_boundary(bool b) { word_boundary_ = b; }
bool one_line() const { return one_line_; }
void set_one_line(bool b) { one_line_ = b; }
// Field-by-field copy of src into this object. This is the supported way
// to duplicate an Options, since copy construction/assignment are private.
// Keep in sync with the member list below when adding a field.
void Copy(const Options& src) {
encoding_ = src.encoding_;
posix_syntax_ = src.posix_syntax_;
longest_match_ = src.longest_match_;
log_errors_ = src.log_errors_;
max_mem_ = src.max_mem_;
literal_ = src.literal_;
never_nl_ = src.never_nl_;
dot_nl_ = src.dot_nl_;
never_capture_ = src.never_capture_;
case_sensitive_ = src.case_sensitive_;
perl_classes_ = src.perl_classes_;
word_boundary_ = src.word_boundary_;
one_line_ = src.one_line_;
}
// Declared only; defined elsewhere. Presumably translates these options
// into the parser's flag bits -- confirm against the definition.
int ParseFlags() const;
private:
Encoding encoding_;
bool posix_syntax_;
bool longest_match_;
bool log_errors_;
int64_t max_mem_;
bool literal_;
bool never_nl_;
bool dot_nl_;
bool never_capture_;
bool case_sensitive_;
bool perl_classes_;
bool word_boundary_;
bool one_line_;
//DISALLOW_EVIL_CONSTRUCTORS(Options);
// Declared private (hand-expanded form of the macro above) so that
// implicit copies do not compile outside the class; use Copy() instead.
Options(const Options&);
void operator=(const Options&);
};
// Read-only view of the Options object stored in this RE2 (options_).
const Options& options() const {
  return options_;
}
// Argument converters; see below.
static inline Arg CRadix(short* x);
static inline Arg CRadix(unsigned short* x);
static inline Arg CRadix(int* x);
static inline Arg CRadix(unsigned int* x);
static inline Arg CRadix(long* x);
static inline Arg CRadix(unsigned long* x);
#ifdef RE2_HAVE_LONGLONG
static inline Arg CRadix(long long* x);
static inline Arg CRadix(unsigned long long* x);
#endif
static inline Arg Hex(short* x);
static inline Arg Hex(unsigned short* x);
static inline Arg Hex(int* x);
static inline Arg Hex(unsigned int* x);
static inline Arg Hex(long* x);
static inline Arg Hex(unsigned long* x);
#ifdef RE2_HAVE_LONGLONG
static inline Arg Hex(long long* x);
static inline Arg Hex(unsigned long long* x);
#endif
static inline Arg Octal(short* x);
static inline Arg Octal(unsigned short* x);
static inline Arg Octal(int* x);
static inline Arg Octal(unsigned int* x);
static inline Arg Octal(long* x);
static inline Arg Octal(unsigned long* x);
#ifdef RE2_HAVE_LONGLONG
static inline Arg Octal(long long* x);
static inline Arg Octal(unsigned long long* x);
#endif
private:
void Init(const StringPiece& pattern, const Options& options);
bool DoMatch(const StringPiece& text,
Anchor anchor,
int* consumed,
const Arg* const args[],
int n) const;
re2::Prog* ReverseProg() const;
mutable Mutex* mutex_;
string pattern_; // string regular expression
Options options_; // option flags
string prefix_; // required prefix (before regexp_)
bool prefix_foldcase_; // prefix is ASCII case-insensitive
re2::Regexp* entire_regexp_; // parsed regular expression
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
re2::Prog* prog_; // compiled program for regexp
mutable re2::Prog* rprog_; // reverse program for regexp
bool is_one_pass_; // can use prog_->SearchOnePass?
mutable const string* error_; // Error indicator
// (or points to empty string)
mutable ErrorCode error_code_; // Error code
mutable string error_arg_; // Fragment of regexp showing error
mutable int num_captures_; // Number of capturing groups
// Map from capture names to indices
mutable const map<string, int>* named_groups_;
// Map from capture indices to names
mutable const map<int, string>* group_names_;
//DISALLOW_EVIL_CONSTRUCTORS(RE2);
RE2(const RE2&);
void operator=(const RE2&);
};
/***** Implementation details *****/
// Hex/Octal/Binary?
// Adapter that lets any type T exposing
//   bool ParseFrom(const char* str, int n)
// be used as a match destination: Parse() has the generic parser signature
// and forwards to T::ParseFrom on the object that dest points to.
template <class T>
class _RE2_MatchObject {
 public:
  // A NULL dest means there is nothing to fill in, which counts as success.
  // Otherwise dest is reinterpreted as a T* and the outcome of its
  // ParseFrom(str, n) call is returned.
  static inline bool Parse(const char* str, int n, void* dest) {
    if (dest != NULL) {
      T* target = reinterpret_cast<T*>(dest);
      return target->ParseFrom(str, n);
    }
    return true;
  }
};
// Destination for one parsed value: pairs an untyped destination pointer
// (arg_) with the parser function (parser_) that Parse() invokes on it.
class RE2::Arg {
 public:
  // Empty constructor so we can declare arrays of RE2::Arg
  Arg();

  // Constructor specially designed for NULL arguments
  Arg(void*);

  // Signature shared by all parsers: str/n describe the text to convert and
  // dest is the destination pointer held by the Arg. Returns a bool
  // (presumably "conversion succeeded" -- the parsers are defined elsewhere).
  typedef bool (*Parser)(const char* str, int n, void* dest);

// Type-specific parsers.
// For each destination type, MAKE_PARSER declares two constructors: one that
// binds the default parser for that type and one that takes an explicit
// parser.
// NOTE: the last line of the macro body must not end in a backslash.
// A trailing continuation would splice the first MAKE_PARSER(...) use into
// the macro definition itself, leaving the char* constructors undeclared and
// planting an unexpanded MAKE_PARSER(...) in every later expansion.
#define MAKE_PARSER(type,name) \
  Arg(type* p) : arg_(p), parser_(name) { } \
  Arg(type* p, Parser parser) : arg_(p), parser_(parser) { }

  MAKE_PARSER(char, parse_char);
  MAKE_PARSER(signed char, parse_char);
  MAKE_PARSER(unsigned char, parse_uchar);
  MAKE_PARSER(short, parse_short);
  MAKE_PARSER(unsigned short, parse_ushort);
  MAKE_PARSER(int, parse_int);
  MAKE_PARSER(unsigned int, parse_uint);
  MAKE_PARSER(long, parse_long);
  MAKE_PARSER(unsigned long, parse_ulong);
#ifdef RE2_HAVE_LONGLONG
  MAKE_PARSER(long long, parse_longlong);
  MAKE_PARSER(unsigned long long, parse_ulonglong);
#endif
  MAKE_PARSER(float, parse_float);
  MAKE_PARSER(double, parse_double);
  MAKE_PARSER(string, parse_string);
  MAKE_PARSER(StringPiece, parse_stringpiece);

#undef MAKE_PARSER

  // Generic constructor
  template <class T> Arg(T*, Parser parser);
  // Generic constructor template
  // Any other pointee type is parsed through its own ParseFrom() method via
  // _RE2_MatchObject<T>::Parse.
  template <class T> Arg(T* p)
    : arg_(p), parser_(_RE2_MatchObject<T>::Parse) {
  }

  // Parse the data
  bool Parse(const char* str, int n) const;

 private:
  void* arg_;      // destination handed to parser_ as "dest"
  Parser parser_;  // conversion routine selected by the constructor

  static bool parse_null (const char* str, int n, void* dest);
  static bool parse_char (const char* str, int n, void* dest);
  static bool parse_uchar (const char* str, int n, void* dest);
  static bool parse_float (const char* str, int n, void* dest);
  static bool parse_double (const char* str, int n, void* dest);
  static bool parse_string (const char* str, int n, void* dest);
  static bool parse_stringpiece (const char* str, int n, void* dest);

// Declares the parser family for one integer type. The plain and _radix
// variants are private; the _hex/_octal/_cradix variants are public because
// RE2::Hex/Octal/CRadix name them from outside this class.
// Note that each expansion leaves the access level at "public".
#define DECLARE_INTEGER_PARSER(name) \
 private: \
  static bool parse_ ## name(const char* str, int n, void* dest); \
  static bool parse_ ## name ## _radix( \
    const char* str, int n, void* dest, int radix); \
 public: \
  static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
  static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
  static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)

  DECLARE_INTEGER_PARSER(short);
  DECLARE_INTEGER_PARSER(ushort);
  DECLARE_INTEGER_PARSER(int);
  DECLARE_INTEGER_PARSER(uint);
  DECLARE_INTEGER_PARSER(long);
  DECLARE_INTEGER_PARSER(ulong);
#ifdef RE2_HAVE_LONGLONG
  DECLARE_INTEGER_PARSER(longlong);
  DECLARE_INTEGER_PARSER(ulonglong);
#endif

#undef DECLARE_INTEGER_PARSER
};
// Default Arg: no destination pointer, parse_null as the parser.
inline RE2::Arg::Arg()
    : arg_(NULL),
      parser_(parse_null) {
}

// Arg built from an untyped pointer: keeps the pointer but still routes
// through parse_null.
inline RE2::Arg::Arg(void* p)
    : arg_(p),
      parser_(parse_null) {
}

// Runs the stored parser on the n bytes at str, handing it the stored
// destination pointer, and returns the parser's verdict.
inline bool RE2::Arg::Parse(const char* str, int n) const {
  return parser_(str, n, arg_);
}
// Integer-only factories that select the numeric base.
// For a destination type T with parser suffix S, the macro defines
// RE2::Hex(T*), RE2::Octal(T*) and RE2::CRadix(T*); each returns an Arg
// bound to RE2::Arg::parse_S_hex / parse_S_octal / parse_S_cradix.
#define MAKE_INTEGER_PARSER(T, S)                                  \
  inline RE2::Arg RE2::Hex(T* dest) {                              \
    return RE2::Arg(dest, RE2::Arg::parse_ ## S ## _hex);          \
  }                                                                \
  inline RE2::Arg RE2::Octal(T* dest) {                            \
    return RE2::Arg(dest, RE2::Arg::parse_ ## S ## _octal);        \
  }                                                                \
  inline RE2::Arg RE2::CRadix(T* dest) {                           \
    return RE2::Arg(dest, RE2::Arg::parse_ ## S ## _cradix);       \
  }

MAKE_INTEGER_PARSER(short, short)
MAKE_INTEGER_PARSER(unsigned short, ushort)
MAKE_INTEGER_PARSER(int, int)
MAKE_INTEGER_PARSER(unsigned int, uint)
MAKE_INTEGER_PARSER(long, long)
MAKE_INTEGER_PARSER(unsigned long, ulong)
#ifdef RE2_HAVE_LONGLONG
MAKE_INTEGER_PARSER(long long, longlong)
MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
#endif

#undef MAKE_INTEGER_PARSER
} // namespace re2
using re2::RE2;
#endif /* RE2_RE2_H */

931
outside/re2/re2/regexp.cc Normal file
View File

@ -0,0 +1,931 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression representation.
// Tested by parse_test.cc
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/stringpiece.h"
#include "re2/walker-inl.h"
namespace re2 {
// Constructor. Creates a node for operator op holding one reference
// (ref_ == 1), no sub-expressions (nsub_ == 0) and no destroy-stack link.
// No sub-expression storage is allocated here; callers use AllocSub() for that.
// parse_flags is narrowed to uint16 for storage.
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
: op_(op),
simple_(false),
parse_flags_(static_cast<uint16>(parse_flags)),
ref_(1),
nsub_(0),
down_(NULL) {
subone_ = NULL;
// Zero the per-operator payload so its fields start as 0/NULL.
// NOTE(review): the_union_ presumably overlays fields such as name_, runes_,
// cc_ and ccb_ (the destructor frees them by op) -- confirm in regexp.h.
memset(the_union_, 0, sizeof the_union_);
}
// Destructor. Assumes already cleaned up children.
// Private: use Decref() instead of delete to destroy Regexps.
// Can't call Decref on the sub-Regexps here because
// that could cause arbitrarily deep recursion, so
// required Decref() to have handled them for us.
//
// Only the payload owned by this node's operator is released here:
// the capture name, the literal rune array, or the character class
// (compiled form and/or builder).
Regexp::~Regexp() {
  // Sub-expressions must already have been detached (see Destroy()).
  if (nsub_ > 0)
    LOG(DFATAL) << "Regexp not destroyed.";

  switch (op_) {
    default:
      break;
    case kRegexpCapture:
      delete name_;
      break;
    case kRegexpLiteralString:
      delete[] runes_;
      break;
    case kRegexpCharClass:
      // A char-class node may hold only the builder (ccb_) and no compiled
      // class yet. Calling a member function through a null cc_ is undefined
      // behaviour, and compilers may drop a "this == NULL" test inside the
      // callee, so guard at the call site.
      if (cc_ != NULL)
        cc_->Delete();
      delete ccb_;
      break;
  }
}
// Fast-path destruction: a node with no sub-expressions can be deleted
// directly, with no traversal. Returns true if the node was deleted,
// false if it has children and needs the full Destroy() treatment.
bool Regexp::QuickDestroy() {
  if (nsub_ != 0)
    return false;
  delete this;
  return true;
}
// Overflow table for reference counts. Once a node's in-object counter ref_
// reaches kMaxRef, its real count lives here, keyed by node address.
// Allocated lazily by Incref() and never freed in this file.
static map<Regexp*, int> *ref_map;
// Serializes all access to ref_map (see Ref/Incref/Decref).
GLOBAL_MUTEX(ref_mutex);
// Returns the current reference count. Counts below kMaxRef are read from
// ref_; otherwise the count is looked up in the overflow map under
// ref_mutex (0 if the map has not been created).
int Regexp::Ref() {
  if (ref_ < kMaxRef)
    return ref_;

  int count = 0;
  GLOBAL_MUTEX_LOCK(ref_mutex);
  if (ref_map != NULL)
    count = (*ref_map)[this];
  GLOBAL_MUTEX_UNLOCK(ref_mutex);
  return count;
}
// Adds one reference and returns this, so calls can be chained.
// While the count fits below kMaxRef it is kept in ref_. The increment that
// would reach kMaxRef pins ref_ at kMaxRef and moves the count into the
// overflow map, where later increments are applied.
Regexp* Regexp::Incref() {
  // Common case: plenty of room in the in-object counter.
  if (ref_ < kMaxRef-1) {
    ref_++;
    return this;
  }

  GLOBAL_MUTEX_LOCK(ref_mutex);
  if (ref_map == NULL)
    ref_map = new map<Regexp*, int>;
  if (ref_ != kMaxRef) {
    // Overflowing right now: seed the map entry and pin ref_.
    (*ref_map)[this] = kMaxRef;
    ref_ = kMaxRef;
  } else {
    // Already overflowed: the map holds the live count.
    (*ref_map)[this]++;
  }
  GLOBAL_MUTEX_UNLOCK(ref_mutex);
  return this;
}
// Drops one reference; destroys the object when the count reaches 0.
// If the count lives in the overflow map, it is decremented there and moved
// back into ref_ once it falls below kMaxRef.
void Regexp::Decref() {
  if (ref_ != kMaxRef) {
    // Count is held in the object itself.
    ref_--;
    if (ref_ == 0)
      Destroy();
    return;
  }

  // Count is held in the overflow map.
  GLOBAL_MUTEX_LOCK(ref_mutex);
  int remaining = (*ref_map)[this] - 1;
  if (remaining >= kMaxRef) {
    (*ref_map)[this] = remaining;
  } else {
    ref_ = remaining;
    ref_map->erase(this);
  }
  GLOBAL_MUTEX_UNLOCK(ref_mutex);
}
// Deletes this object; its ref count has reached 0.
// Sub-expressions whose count drops to 0 as a result are deleted too.
// The pending nodes form an intrusive linked list threaded through down_,
// so no extra memory is allocated and the process stack stays shallow.
void Regexp::Destroy() {
// Leaf nodes need no traversal.
if (QuickDestroy())
return;
// Handle recursive Destroy with explicit stack
// to avoid arbitrarily deep recursion on process stack [sigh].
down_ = NULL;
Regexp* stack = this;
while (stack != NULL) {
// Pop the next node awaiting destruction.
Regexp* re = stack;
stack = re->down_;
if (re->ref_ != 0)
LOG(DFATAL) << "Bad reference count " << re->ref_;
if (re->nsub_ > 0) {
Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
Regexp* sub = subs[i];
if (sub == NULL)
continue;
// Drop re's reference to the child. A child whose count overflowed
// into ref_map must go through Decref(); otherwise decrement in place
// so that destruction can be deferred to this loop.
if (sub->ref_ == kMaxRef)
sub->Decref();
else
--sub->ref_;
// Unreferenced child: delete it now if it is a leaf, else push it.
if (sub->ref_ == 0 && !sub->QuickDestroy()) {
sub->down_ = stack;
stack = sub;
}
}
// NOTE(review): only arrays of more than one child are freed here;
// presumably a single child is stored inline (subone_) -- confirm.
if (re->nsub_ > 1)
delete[] subs;
// The destructor complains if nsub_ is still non-zero.
re->nsub_ = 0;
}
delete re;
}
}
// Appends rune r to a kRegexpLiteralString node.
// Storage starts at 8 runes and doubles each time the current length is a
// power of two (>= 8), so capacity is implied by nrunes_ and is not stored.
void Regexp::AddRuneToString(Rune r) {
  DCHECK(op_ == kRegexpLiteralString);
  if (nrunes_ == 0) {
    // First rune: initial buffer of 8.
    runes_ = new Rune[8];
  } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
    // Buffer is exactly full: move the contents to one twice the size.
    Rune* grown = new Rune[nrunes_ * 2];
    for (int k = 0; k < nrunes_; k++)
      grown[k] = runes_[k];
    delete[] runes_;
    runes_ = grown;
  }
  runes_[nrunes_] = r;
  nrunes_++;
}
// Builds a kRegexpHaveMatch node carrying match_id.
Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
  Regexp* node = new Regexp(kRegexpHaveMatch, flags);
  node->match_id_ = match_id;
  return node;
}
// Wraps sub in a kRegexpPlus node. If sub is already a plus with the same
// flags, it is returned unchanged and no new node is created.
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
  const bool redundant =
      sub->op() == kRegexpPlus && sub->parse_flags() == flags;
  if (redundant)
    return sub;

  Regexp* node = new Regexp(kRegexpPlus, flags);
  node->AllocSub(1);
  node->sub()[0] = sub;
  return node;
}
// Wraps sub in a kRegexpStar node. If sub is already a star with the same
// flags, it is returned unchanged and no new node is created.
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
  const bool redundant =
      sub->op() == kRegexpStar && sub->parse_flags() == flags;
  if (redundant)
    return sub;

  Regexp* node = new Regexp(kRegexpStar, flags);
  node->AllocSub(1);
  node->sub()[0] = sub;
  return node;
}
// Wraps sub in a kRegexpQuest node. If sub is already a quest with the same
// flags, it is returned unchanged and no new node is created.
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
  const bool redundant =
      sub->op() == kRegexpQuest && sub->parse_flags() == flags;
  if (redundant)
    return sub;

  Regexp* node = new Regexp(kRegexpQuest, flags);
  node->AllocSub(1);
  node->sub()[0] = sub;
  return node;
}
// Shared builder for concatenation and alternation nodes.
//   op          kRegexpConcat or kRegexpAlternate
//   sub, nsub   the operands; the caller's array is never modified
//   can_factor  for alternations, run FactorAlternation() first
// A single operand is returned as-is. More than kMaxNsub operands are
// arranged as a two-level tree of nodes with the same op.
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
                                  ParseFlags flags, bool can_factor) {
  if (nsub == 1)
    return sub[0];

  // Factoring edits the operand array, so work on a private copy.
  Regexp** scratch = NULL;
  if (can_factor && op == kRegexpAlternate) {
    scratch = new Regexp*[nsub];
    memmove(scratch, sub, nsub * sizeof sub[0]);
    sub = scratch;
    nsub = FactorAlternation(sub, nsub, flags);
    if (nsub == 1) {
      Regexp* only = sub[0];
      delete[] scratch;
      return only;
    }
  }

  Regexp* node = new Regexp(op, flags);
  if (nsub <= kMaxNsub) {
    // Everything fits in one node.
    node->AllocSub(nsub);
    Regexp** kids = node->sub();
    for (int k = 0; k < nsub; k++)
      kids[k] = sub[k];
  } else {
    // Too many subexpressions to fit in a single Regexp.
    // Make a two-level tree. Two levels gets us to 65535^2.
    const int ngroups = (nsub + kMaxNsub - 1) / kMaxNsub;
    const int last = ngroups - 1;
    node->AllocSub(ngroups);
    Regexp** kids = node->sub();
    // All groups but the last are full; the last takes the remainder.
    for (int g = 0; g < last; g++)
      kids[g] = ConcatOrAlternate(op, sub + g*kMaxNsub, kMaxNsub, flags,
                                  false);
    kids[last] = ConcatOrAlternate(op, sub + last*kMaxNsub,
                                   nsub - last*kMaxNsub, flags, false);
  }

  delete[] scratch;
  return node;
}
// Concatenation of sub[0..nsub-1]; never factored.
Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
}
// Alternation of sub[0..nsub-1], with the operands passed through
// FactorAlternation() first (can_factor == true).
Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
}
// Alternation of sub[0..nsub-1] exactly as given, with no factoring.
Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
}
// Builds a capturing group with index cap around sub.
Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
  Regexp* group = new Regexp(kRegexpCapture, flags);
  group->AllocSub(1);
  group->sub()[0] = sub;
  group->cap_ = cap;
  return group;
}
// Builds a kRegexpRepeat node around sub, storing min and max as given.
Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
  Regexp* rep = new Regexp(kRegexpRepeat, flags);
  rep->AllocSub(1);
  rep->sub()[0] = sub;
  rep->min_ = min;
  rep->max_ = max;
  return rep;
}
// Builds a single-rune literal node.
Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
  Regexp* lit = new Regexp(kRegexpLiteral, flags);
  lit->rune_ = rune;
  return lit;
}
// Builds the node for a literal run of nrunes runes:
//   nrunes <= 0  -> empty-match node
//   nrunes == 1  -> single-rune literal (NewLiteral)
//   otherwise    -> kRegexpLiteralString holding a copy of the runes
Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
  if (nrunes <= 0)
    return new Regexp(kRegexpEmptyMatch, flags);
  if (nrunes == 1)
    return NewLiteral(runes[0], flags);

  Regexp* lit = new Regexp(kRegexpLiteralString, flags);
  for (Rune* p = runes; p != runes + nrunes; ++p)
    lit->AddRuneToString(*p);
  return lit;
}
// Builds a character-class node that stores cc directly (the pointer is
// not copied; the node's destructor later calls Delete() on it).
Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
  Regexp* node = new Regexp(kRegexpCharClass, flags);
  node->cc_ = cc;
  return node;
}
// Exchanges the complete contents of *this and *that, byte for byte.
// A raw byte swap is acceptable because Regexp is a plain struct with no
// vtable.
void Regexp::Swap(Regexp* that) {
  char saved[sizeof(Regexp)];
  memmove(saved, this, sizeof saved);
  memmove(this, that, sizeof saved);
  memmove(that, saved, sizeof saved);
}
// Tests equality of all top-level structure but not subregexps.
// Compares the operator and the per-operator payload of the two nodes
// (rune(s), repeat bounds, capture index/name, char-class ranges, and the
// parse flags that matter for that operator). Children are compared by the
// caller (Regexp::Equal). Both pointers must be non-NULL.
static bool TopEqual(Regexp* a, Regexp* b) {
  if (a->op() != b->op())
    return false;

  switch (a->op()) {
    // Operators with no payload: same op means equal.
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpBeginText:
      return true;

    case kRegexpEndText:
      // The parse flags remember whether it's \z or (?-m:$),
      // which matters when testing against PCRE.
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;

    case kRegexpLiteral:
      return a->rune() == b->rune() &&
             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;

    case kRegexpLiteralString:
      return a->nrunes() == b->nrunes() &&
             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
             memcmp(a->runes(), b->runes(),
                    a->nrunes() * sizeof a->runes()[0]) == 0;

    case kRegexpAlternate:
    case kRegexpConcat:
      return a->nsub() == b->nsub();

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;

    case kRegexpRepeat:
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
             a->min() == b->min() &&
             a->max() == b->max();

    case kRegexpCapture:
      if (a->cap() != b->cap())
        return false;
      // name() is a pointer (NULL for unnamed groups). Comparing pointers
      // would declare two identically named groups from different parses
      // unequal, so compare the strings they point to.
      if (a->name() == NULL || b->name() == NULL)
        return a->name() == b->name();  // equal only if both are unnamed
      return *a->name() == *b->name();

    case kRegexpHaveMatch:
      return a->match_id() == b->match_id();

    case kRegexpCharClass: {
      // Same rune count, same number of ranges, and identical range data.
      CharClass* acc = a->cc();
      CharClass* bcc = b->cc();
      return acc->size() == bcc->size() &&
             acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
             memcmp(acc->begin(), bcc->begin(),
                    (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
    }
  }

  LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
  return false;
}
// Reports whether a and b are structurally identical regexps.
// Two NULL pointers compare equal; NULL never equals non-NULL.
// Each pair of nodes is compared with TopEqual(); children are visited
// iteratively using a vector as an explicit stack, not by recursion.
bool Regexp::Equal(Regexp* a, Regexp* b) {
if (a == NULL || b == NULL)
return a == b;
if (!TopEqual(a, b))
return false;
// Fast path:
// return without allocating vector if there are no subregexps.
switch (a->op()) {
case kRegexpAlternate:
case kRegexpConcat:
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
case kRegexpCapture:
break;
default:
return true;
}
// Committed to doing real work.
// The stack (vector) has pairs of regexps waiting to
// be compared. The regexps are only equal if
// all the pairs end up being equal.
vector<Regexp*> stk;
for (;;) {
// Invariant: TopEqual(a, b) == true.
Regexp* a2;
Regexp* b2;
switch (a->op()) {
default:
break;
case kRegexpAlternate:
case kRegexpConcat:
// TopEqual already established a->nsub() == b->nsub(), so indexing
// b's children with a's count is in range.
for (int i = 0; i < a->nsub(); i++) {
a2 = a->sub()[i];
b2 = b->sub()[i];
if (!TopEqual(a2, b2))
return false;
stk.push_back(a2);
stk.push_back(b2);
}
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
case kRegexpCapture:
// Single-child operators: descend directly without touching the stack.
a2 = a->sub()[0];
b2 = b->sub()[0];
if (!TopEqual(a2, b2))
return false;
// Really:
// stk.push_back(a2);
// stk.push_back(b2);
// break;
// but faster to assign directly and loop.
a = a2;
b = b2;
continue;
}
// Pop the next pending pair (pushed as a then b); done when none remain.
int n = stk.size();
if (n == 0)
break;
a = stk[n-2];
b = stk[n-1];
stk.resize(n-2);
}
return true;
}
// Keep in sync with enum RegexpStatusCode in regexp.h
// Human-readable text for each status code; RegexpStatus::CodeText()
// indexes this table directly by the numeric code, so entry order must
// follow the enum's values.
static const char *kErrorStrings[] = {
"no error",
"unexpected error",
"invalid escape sequence",
"invalid character class",
"invalid character class range",
"missing ]",
"missing )",
"trailing \\",
"no argument for repetition operator",
"invalid repetition size",
"bad repetition operator",
"invalid perl operator",
"invalid UTF-8",
"invalid named capture group",
};
// Maps a status code to its message in kErrorStrings. Codes outside the
// table are reported with the kRegexpInternalError message.
string RegexpStatus::CodeText(enum RegexpStatusCode code) {
  const bool in_table = code >= 0 && code < arraysize(kErrorStrings);
  if (!in_table)
    code = kRegexpInternalError;
  return kErrorStrings[code];
}
// Full error message: the code's text, followed by ": " and the offending
// argument when one was recorded.
string RegexpStatus::Text() const {
  string msg = CodeText(code_);
  if (!error_arg_.empty()) {
    msg.append(": ");
    msg.append(error_arg_.data(), error_arg_.size());
  }
  return msg;
}
// Makes this status a copy of `status` (code and error argument).
void RegexpStatus::Copy(const RegexpStatus& status) {
  error_arg_ = status.error_arg_;
  code_ = status.code_;
}
typedef int Ignored; // Walker<void> doesn't exist
// Walker that tallies the kRegexpCapture nodes it visits.
class NumCapturesWalker : public Regexp::Walker<Ignored> {
 public:
  NumCapturesWalker() : ncapture_(0) {}

  // Number of capture nodes seen so far.
  int ncapture() { return ncapture_; }

  // Called on the way down for every node; counts it if it is a capture.
  // The walk value is passed through untouched.
  virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    ncapture_ += (re->op() == kRegexpCapture) ? 1 : 0;
    return ignored;
  }

  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    // Should never be called: we use Walk not WalkExponential.
    LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
    return ignored;
  }

 private:
  int ncapture_;
  DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
};
int Regexp::NumCaptures() {
NumCapturesWalker w;
w.Walk(this, 0);
return w.ncapture();
}
// Walker that collects name -> index for every named capture group.
// The map is created lazily, so a regexp with no named groups yields NULL.
class NamedCapturesWalker : public Regexp::Walker<Ignored> {
 public:
  NamedCapturesWalker() : map_(NULL) {}
  ~NamedCapturesWalker() { delete map_; }

  // Hands the map (possibly NULL) to the caller, who then owns it.
  map<string, int>* TakeMap() {
    map<string, int>* taken = map_;
    map_ = NULL;
    return taken;
  }

  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    if (re->op() != kRegexpCapture || re->name() == NULL)
      return ignored;

    // Allocate map once we find a name.
    if (map_ == NULL)
      map_ = new map<string, int>;

    // Record first occurrence of each name.
    // (The rule is that if you have the same name
    // multiple times, only the leftmost one counts.)
    if (map_->count(*re->name()) == 0)
      (*map_)[*re->name()] = re->cap();
    return ignored;
  }

  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    // Should never be called: we use Walk not WalkExponential.
    LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
    return ignored;
  }

 private:
  map<string, int>* map_;
  DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
};
// Returns a newly allocated name -> index map of the named capture groups,
// or NULL if there are none. The caller owns the result.
map<string, int>* Regexp::NamedCaptures() {
  NamedCapturesWalker collector;
  collector.Walk(this, 0);
  return collector.TakeMap();
}
// Walker that collects index -> name for every named capture group.
// The map is created lazily, so a regexp with no named groups yields NULL.
class CaptureNamesWalker : public Regexp::Walker<Ignored> {
 public:
  CaptureNamesWalker() : map_(NULL) {}
  ~CaptureNamesWalker() { delete map_; }

  // Hands the map (possibly NULL) to the caller, who then owns it.
  map<int, string>* TakeMap() {
    map<int, string>* taken = map_;
    map_ = NULL;
    return taken;
  }

  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    if (re->op() != kRegexpCapture || re->name() == NULL)
      return ignored;

    // Allocate map once we find a name.
    if (map_ == NULL)
      map_ = new map<int, string>;
    (*map_)[re->cap()] = *re->name();
    return ignored;
  }

  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    // Should never be called: we use Walk not WalkExponential.
    LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
    return ignored;
  }

 private:
  map<int, string>* map_;
  DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
};
// Returns a newly allocated index -> name map of the named capture groups,
// or NULL if there are none. The caller owns the result.
map<int, string>* Regexp::CaptureNames() {
  CaptureNamesWalker collector;
  collector.Walk(this, 0);
  return collector.TakeMap();
}
// Determines whether regexp matches must be anchored
// with a fixed string prefix. If so, returns the prefix and
// the regexp that remains after the prefix. The prefix might
// be ASCII case-insensitive.
//
// Outputs (always reset first, so they are well-defined on failure):
//   *prefix    the literal text, encoded as Latin-1 or UTF-8 according to
//              the literal node's parse flags
//   *foldcase  whether the literal node carries the FoldCase flag
//   *suffix    a Regexp for everything after the literal (an empty-match
//              node if nothing follows); NULL on failure
// Returns true only for a concatenation that starts with at least one
// begin-text anchor followed by a literal or literal string.
bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
// No need for a walker: the regexp must be of the form
// 1. some number of ^ anchors
// 2. a literal char or string
// 3. the rest
prefix->clear();
*foldcase = false;
*suffix = NULL;
if (op_ != kRegexpConcat)
return false;
// Some number of anchors, then a literal or concatenation.
int i = 0;
Regexp** sub = this->sub();
while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
i++;
// Need at least one anchor, and something after the anchors.
if (i == 0 || i >= nsub_)
return false;
Regexp* re = sub[i];
switch (re->op_) {
default:
return false;
case kRegexpLiteralString:
// Convert to string in proper encoding.
if (re->parse_flags() & Latin1) {
// Latin-1: one byte per rune.
prefix->resize(re->nrunes_);
for (int j = 0; j < re->nrunes_; j++)
(*prefix)[j] = re->runes_[j];
} else {
// Convert to UTF-8 in place.
// Assume worst-case space and then trim.
prefix->resize(re->nrunes_ * UTFmax);
char *p = &(*prefix)[0];
for (int j = 0; j < re->nrunes_; j++) {
Rune r = re->runes_[j];
// Runes below Runeself are stored as a single byte.
if (r < Runeself)
*p++ = r;
else
p += runetochar(p, &r);
}
prefix->resize(p - &(*prefix)[0]);
}
break;
case kRegexpLiteral:
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
prefix->append(1, re->rune_);
} else {
char buf[UTFmax];
prefix->append(buf, runetochar(buf, &re->rune_));
}
break;
}
*foldcase = (sub[i]->parse_flags() & FoldCase);
i++;
// The rest.
if (i < nsub_) {
// The new concatenation holds its own reference to each remaining sub.
for (int j = i; j < nsub_; j++)
sub[j]->Incref();
re = Concat(sub + i, nsub_ - i, parse_flags());
} else {
re = new Regexp(kRegexpEmptyMatch, parse_flags());
}
*suffix = re;
return true;
}
// Character class builder is a balanced binary tree (STL set)
// containing non-overlapping, non-abutting RuneRanges.
// The less-than operator used in the tree treats two
// ranges as equal if they overlap at all, so that
// lookups for a particular Rune are possible.
//
// A new builder is empty: no runes, and both ASCII-letter bitmaps clear.
CharClassBuilder::CharClassBuilder() {
  lower_ = 0;
  upper_ = 0;
  nrunes_ = 0;
}
// Add lo-hi to the class; return whether class got bigger.
// An inverted range (hi < lo) adds nothing and returns false.
// Besides the range set, this maintains nrunes_ (total runes in the class)
// and the upper_/lower_ bitmaps, where bit k records that 'A'+k / 'a'+k
// is in the class.
bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
if (hi < lo)
return false;
if (lo <= 'z' && hi >= 'A') {
// Overlaps some alpha, maybe not all.
// Update bitmaps telling which ASCII letters are in the set.
Rune lo1 = max<Rune>(lo, 'A');
Rune hi1 = min<Rune>(hi, 'Z');
if (lo1 <= hi1)
upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
lo1 = max<Rune>(lo, 'a');
hi1 = min<Rune>(hi, 'z');
if (lo1 <= hi1)
lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
}
{ // Check whether lo, hi is already in the class.
iterator it = ranges_.find(RuneRange(lo, lo));
if (it != end() && it->lo <= lo && hi <= it->hi)
return false;
}
// Look for a range abutting lo on the left.
// If it exists, take it out and increase our range.
if (lo > 0) {
iterator it = ranges_.find(RuneRange(lo-1, lo-1));
if (it != end()) {
lo = it->lo;
if (it->hi > hi)
hi = it->hi;
// The absorbed range is counted again when [lo, hi] is inserted below.
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
}
// Look for a range abutting hi on the right.
// If it exists, take it out and increase our range.
if (hi < Runemax) {
iterator it = ranges_.find(RuneRange(hi+1, hi+1));
if (it != end()) {
hi = it->hi;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
}
// Look for ranges between lo and hi. Take them out.
// This is only safe because the set has no overlapping ranges.
// We've already removed any ranges abutting lo and hi, so
// any that overlap [lo, hi] must be contained within it.
for (;;) {
iterator it = ranges_.find(RuneRange(lo, hi));
if (it == end())
break;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
// Finally, add [lo, hi].
nrunes_ += hi - lo + 1;
ranges_.insert(RuneRange(lo, hi));
return true;
}
// Adds every range of cc to this builder (set union).
void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
  iterator cur = cc->begin();
  while (cur != cc->end()) {
    AddRange(cur->lo, cur->hi);
    ++cur;
  }
}
// Reports whether rune r is in the class. The lookup probes the range set
// with the one-rune range [r, r]; the set's ordering treats overlapping
// ranges as equal, so this finds the range containing r.
bool CharClassBuilder::Contains(Rune r) {
  iterator hit = ranges_.find(RuneRange(r, r));
  return hit != end();
}
// Does the character class behave the same on A-Z as on a-z?
// True when the upper-case and lower-case letter bitmaps agree on every
// bit covered by AlphaMask.
bool CharClassBuilder::FoldsASCII() {
  return (upper_ & AlphaMask) == (lower_ & AlphaMask);
}
// Returns a newly allocated builder with the same ranges, letter bitmaps
// and rune count as this one. The caller owns the result.
CharClassBuilder* CharClassBuilder::Copy() {
  CharClassBuilder* dup = new CharClassBuilder;
  dup->nrunes_ = nrunes_;
  dup->upper_ = upper_;
  dup->lower_ = lower_;
  for (iterator src = begin(); src != end(); ++src)
    dup->ranges_.insert(RuneRange(src->lo, src->hi));
  return dup;
}
// Removes every rune greater than r from the class.
// A no-op when r >= Runemax. Keeps the ASCII-letter bitmaps and nrunes_
// consistent with the trimmed range set.
void CharClassBuilder::RemoveAbove(Rune r) {
if (r >= Runemax)
return;
// Clear the lower-case bits for letters above r.
if (r < 'z') {
if (r < 'a')
lower_ = 0;
else
lower_ &= AlphaMask >> ('z' - r);
}
// Same for the upper-case bits.
if (r < 'Z') {
if (r < 'A')
upper_ = 0;
else
upper_ &= AlphaMask >> ('Z' - r);
}
// Repeatedly pull out any range that overlaps [r+1, Runemax].
for (;;) {
iterator it = ranges_.find(RuneRange(r + 1, Runemax));
if (it == end())
break;
// Copy before erasing: the iterator is invalid afterwards.
RuneRange rr = *it;
ranges_.erase(it);
nrunes_ -= rr.hi - rr.lo + 1;
// A range straddling r keeps its part up to r; that part no longer
// overlaps [r+1, Runemax], so it is not found again.
if (rr.lo <= r) {
rr.hi = r;
ranges_.insert(rr);
nrunes_ += rr.hi - rr.lo + 1;
}
}
}
void CharClassBuilder::Negate() {
// Build up negation and then copy in.
// Could edit ranges in place, but C++ won't let me.
vector<RuneRange> v;
v.reserve(ranges_.size() + 1);
// In negation, first range begins at 0, unless
// the current class begins at 0.
iterator it = begin();
if (it == end()) {
v.push_back(RuneRange(0, Runemax));
} else {
int nextlo = 0;
if (it->lo == 0) {
nextlo = it->hi + 1;
++it;
}
for (; it != end(); ++it) {
v.push_back(RuneRange(nextlo, it->lo - 1));
nextlo = it->hi + 1;
}
if (nextlo <= Runemax)
v.push_back(RuneRange(nextlo, Runemax));
}
ranges_.clear();
for (int i = 0; i < v.size(); i++)
ranges_.insert(v[i]);
upper_ = AlphaMask & ~upper_;
lower_ = AlphaMask & ~lower_;
nrunes_ = Runemax+1 - nrunes_;
}
// Character class is a sorted list of ranges.
// The ranges are allocated in the same block as the header,
// necessitating a special allocator and Delete method.
//
// Allocates one byte block big enough for the CharClass header followed by
// maxranges RuneRange slots, points ranges_ just past the header, and
// returns an empty class. Release it with Delete(), never with delete.
CharClass* CharClass::New(int maxranges) {
  uint8* block = new uint8[sizeof(CharClass) + maxranges*sizeof(RuneRange)];
  CharClass* cc = reinterpret_cast<CharClass*>(block);
  cc->ranges_ = reinterpret_cast<RuneRange*>(block + sizeof(CharClass));
  cc->nranges_ = 0;
  cc->nrunes_ = 0;
  cc->folds_ascii_ = false;
  return cc;
}
void CharClass::Delete() {
if (this == NULL)
return;
uint8 *data = reinterpret_cast<uint8*>(this);
delete[] data;
}
// Returns a newly allocated CharClass holding the complement of this one
// over [0, Runemax]. The complement has at most nranges_+1 ranges.
CharClass* CharClass::Negate() {
  CharClass* inv = CharClass::New(nranges_+1);
  inv->folds_ascii_ = folds_ascii_;
  inv->nrunes_ = Runemax + 1 - nrunes_;

  int count = 0;
  int nextlo = 0;
  for (CharClass::iterator it = begin(); it != end(); ++it) {
    // Emit the gap before this range, unless the range starts exactly
    // where the previous one left off (or at 0).
    if (it->lo != nextlo)
      inv->ranges_[count++] = RuneRange(nextlo, it->lo - 1);
    nextlo = it->hi + 1;
  }
  if (nextlo <= Runemax)
    inv->ranges_[count++] = RuneRange(nextlo, Runemax);
  inv->nranges_ = count;
  return inv;
}
// Reports whether rune r falls inside one of the ranges, using a binary
// search over the range array.
bool CharClass::Contains(Rune r) {
  RuneRange* base = ranges_;
  int remaining = nranges_;
  while (remaining > 0) {
    const int half = remaining / 2;
    const RuneRange& mid = base[half];
    if (mid.hi < r) {
      // r lies above the middle range: continue in the upper part.
      base += half + 1;
      remaining -= half + 1;
      continue;
    }
    if (r < mid.lo) {
      // r lies below the middle range: continue in the lower part.
      remaining = half;
      continue;
    }
    // mid.lo <= r && r <= mid.hi
    return true;
  }
  return false;
}
// Builds a CharClass snapshot of the builder's current ranges.
// The result is a separate allocation made by CharClass::New.
CharClass* CharClassBuilder::GetCharClass() {
  CharClass* snapshot = CharClass::New(ranges_.size());
  int count = 0;
  iterator src = begin();
  while (src != end()) {
    snapshot->ranges_[count++] = *src;
    ++src;
  }
  snapshot->nranges_ = count;
  DCHECK_LE(count, ranges_.size());
  snapshot->folds_ascii_ = FoldsASCII();
  snapshot->nrunes_ = nrunes_;
  return snapshot;
}
} // namespace re2

633
outside/re2/re2/regexp.h Normal file
View File

@ -0,0 +1,633 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// --- SPONSORED LINK --------------------------------------------------
// If you want to use this library for regular expression matching,
// you should use re2/re2.h, which provides a class RE2 that
// mimics the PCRE interface provided by PCRE's C++ wrappers.
// This header describes the low-level interface used to implement RE2
// and may change in backwards-incompatible ways from time to time.
// In contrast, RE2's interface will not.
// ---------------------------------------------------------------------
// Regular expression library: parsing, execution, and manipulation
// of regular expressions.
//
// Any operation that traverses the Regexp structures should be written
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
// regular expressions such as x++++++++++++++++++++... might cause recursive
// traversals to overflow the stack.
//
// It is the caller's responsibility to provide appropriate mutual exclusion
// around manipulation of the regexps. RE2 does this.
//
// PARSING
//
// Regexp::Parse parses regular expressions encoded in UTF-8.
// The default syntax is POSIX extended regular expressions,
// with the following changes:
//
// 1. Backreferences (optional in POSIX EREs) are not supported.
// (Supporting them precludes the use of DFA-based
// matching engines.)
//
// 2. Collating elements and collation classes are not supported.
// (No one has needed or wanted them.)
//
// The exact syntax accepted can be modified by passing flags to
// Regexp::Parse. In particular, many of the basic Perl additions
// are available. The flags are documented below (search for LikePerl).
//
// If parsed with the flag Regexp::Latin1, both the regular expression
// and the input to the matching routines are assumed to be encoded in
// Latin-1, not UTF-8.
//
// EXECUTION
//
// Once Regexp has parsed a regular expression, it provides methods
// to search text using that regular expression. These methods are
// implemented via calling out to other regular expression libraries.
// (Let's call them the sublibraries.)
//
// To call a sublibrary, Regexp does not simply prepare a
// string version of the regular expression and hand it to the
// sublibrary. Instead, Regexp prepares, from its own parsed form, the
// corresponding internal representation used by the sublibrary.
// This has the drawback of needing to know the internal representation
// used by the sublibrary, but it has two important benefits:
//
// 1. The syntax and meaning of regular expressions is guaranteed
// to be that used by Regexp's parser, not the syntax expected
// by the sublibrary. Regexp might accept a restricted or
// expanded syntax for regular expressions as compared with
// the sublibrary. As long as Regexp can translate from its
// internal form into the sublibrary's, clients need not know
// exactly which sublibrary they are using.
//
// 2. The sublibrary parsers are bypassed. For whatever reason,
// sublibrary regular expression parsers often have security
// problems. For example, plan9grep's regular expression parser
// has a buffer overflow in its handling of large character
// classes, and PCRE's parser has had buffer overflow problems
// in the past. Security-team requires sandboxing of sublibrary
// regular expression parsers. Avoiding the sublibrary parsers
// avoids the sandbox.
//
// The execution methods we use now are provided by the compiled form,
// Prog, described in prog.h
//
// MANIPULATION
//
// Unlike other regular expression libraries, Regexp makes its parsed
// form accessible to clients, so that client code can analyze the
// parsed regular expressions.
#ifndef RE2_REGEXP_H__
#define RE2_REGEXP_H__
#include "util/util.h"
#include "re2/stringpiece.h"
namespace re2 {
// Operator stored in each Regexp node (see Regexp::op()).
// Numbering starts at 1; kMaxRegexpOp names the largest value.
// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
enum RegexpOp {
// Matches no strings.
kRegexpNoMatch = 1,
// Matches empty string.
kRegexpEmptyMatch,
// Matches rune_.
kRegexpLiteral,
// Matches runes_.
kRegexpLiteralString,
// Matches concatenation of sub_[0..nsub-1].
kRegexpConcat,
// Matches union of sub_[0..nsub-1].
kRegexpAlternate,
// Matches sub_[0] zero or more times.
kRegexpStar,
// Matches sub_[0] one or more times.
kRegexpPlus,
// Matches sub_[0] zero or one times.
kRegexpQuest,
// Matches sub_[0] at least min_ times, at most max_ times.
// max_ == -1 means no upper limit.
kRegexpRepeat,
// Parenthesized (capturing) subexpression. Index is cap_.
// Optionally, capturing name is name_.
kRegexpCapture,
// Matches any character.
kRegexpAnyChar,
// Matches any byte [sic].
kRegexpAnyByte,
// Matches empty string at beginning of line.
kRegexpBeginLine,
// Matches empty string at end of line.
kRegexpEndLine,
// Matches word boundary "\b".
kRegexpWordBoundary,
// Matches not-a-word boundary "\B".
kRegexpNoWordBoundary,
// Matches empty string at beginning of text.
kRegexpBeginText,
// Matches empty string at end of text.
kRegexpEndText,
// Matches character class given by cc_.
kRegexpCharClass,
// Forces match of entire expression right now,
// with match ID match_id_ (used by RE2::Set).
kRegexpHaveMatch,
// Alias for the last operator above, not an operator of its own.
kMaxRegexpOp = kRegexpHaveMatch,
};
// Result codes reported through RegexpStatus.
// kRegexpSuccess is 0; every other value denotes a failure.
// Keep in sync with string list in regexp.cc
enum RegexpStatusCode {
// No error
kRegexpSuccess = 0,
// Unexpected error
kRegexpInternalError,
// Parse errors
kRegexpBadEscape, // bad escape sequence
kRegexpBadCharClass, // bad character class
kRegexpBadCharRange, // bad character class range
kRegexpMissingBracket, // missing closing ]
kRegexpMissingParen, // missing closing )
kRegexpTrailingBackslash, // at end of regexp
kRegexpRepeatArgument, // repeat argument missing, e.g. "*"
kRegexpRepeatSize, // bad repetition argument
kRegexpRepeatOp, // bad repetition operator
kRegexpBadPerlOp, // bad perl operator
kRegexpBadUTF8, // invalid UTF-8 in regexp
kRegexpBadNamedCapture, // bad named capture
};
// Error status for certain operations.
// Holds a status code plus the piece of the regexp that caused it.
// A freshly constructed status reports success.
class RegexpStatus {
public:
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
// Frees the temporary string, if one was handed over via set_tmp().
~RegexpStatus() { delete tmp_; }
void set_code(enum RegexpStatusCode code) { code_ = code; }
// Stores the StringPiece itself.
// NOTE(review): presumably the bytes it refers to are not copied and
// must outlive this status - confirm against StringPiece.
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
// Takes ownership of tmp; any previously held string is deleted first.
void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
enum RegexpStatusCode code() const { return code_; }
const StringPiece& error_arg() const { return error_arg_; }
// True when the code is kRegexpSuccess.
bool ok() const { return code() == kRegexpSuccess; }
// Copies state from status.
void Copy(const RegexpStatus& status);
// Returns text equivalent of code, e.g.:
// "Bad character class"
static string CodeText(enum RegexpStatusCode code);
// Returns text describing error, e.g.:
// "Bad character class: [z-a]"
string Text() const;
private:
enum RegexpStatusCode code_; // Kind of error
StringPiece error_arg_; // Piece of regexp containing syntax error.
string* tmp_; // Temporary storage, possibly where error_arg_ is.
DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus);
};
// Walker to implement Simplify.
class SimplifyWalker;
// Compiled form; see prog.h
class Prog;
// A range of runes with both end points included: lo <= r <= hi.
struct RuneRange {
// Default range is the single rune 0.
RuneRange() : lo(0), hi(0) { }
RuneRange(int l, int h) : lo(l), hi(h) { }
Rune lo; // first rune in the range
Rune hi; // last rune in the range (inclusive)
};
// Less-than on RuneRanges treats a == b if they overlap at all.
// This lets us look in a set to find the range covering a particular Rune.
// a orders before b only when a ends strictly before b begins, so any two
// ranges that share a rune compare as equivalent.
struct RuneRangeLess {
bool operator()(const RuneRange& a, const RuneRange& b) const {
return a.hi < b.lo;
}
};
class CharClassBuilder;
// Character class stored as an array of RuneRanges.
// Objects cannot be constructed or destroyed directly (the constructor
// and destructor are private and unimplemented): they are allocated by the
// private New() and must be released with Delete().
class CharClass {
public:
// Frees this object; use instead of the delete operator.
void Delete();
typedef RuneRange* iterator;
// Iteration is over ranges, not over individual runes.
iterator begin() { return ranges_; }
iterator end() { return ranges_ + nranges_; }
// Number of runes in the class (not the number of ranges).
int size() { return nrunes_; }
bool empty() { return nrunes_ == 0; }
// True when every rune in [0, Runemax] is a member.
bool full() { return nrunes_ == Runemax+1; }
bool FoldsASCII() { return folds_ascii_; }
// Reports whether r is a member of the class.
bool Contains(Rune r);
// Returns a new CharClass holding the complement of this one.
CharClass* Negate();
private:
CharClass(); // not implemented
~CharClass(); // not implemented
// Allocates an empty class with room for maxranges ranges.
static CharClass* New(int maxranges);
friend class CharClassBuilder;
bool folds_ascii_;
int nrunes_; // total number of runes covered by ranges_
RuneRange *ranges_; // array of nranges_ ranges
int nranges_;
DISALLOW_EVIL_CONSTRUCTORS(CharClass);
};
// A node of a parsed regular expression.
// Each node has an operator (op()), zero or more sub-expressions (sub()),
// the parse flags it was created with, and operator-specific arguments
// kept in a union. Nodes are reference counted: share them with Incref()
// and release them with Decref(); the destructor is private.
class Regexp {
public:
// Flags for parsing. Can be ORed together.
enum ParseFlags {
NoParseFlags = 0,
FoldCase = 1<<0, // Fold case during matching (case-insensitive).
Literal = 1<<1, // Treat s as literal string instead of a regexp.
ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
// and [[:space:]] to match newline.
DotNL = 1<<3, // Allow . to match newline.
MatchNL = ClassNL | DotNL,
OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
// end of text, not around embedded newlines.
// (Perl's default)
Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
PerlClasses = 1<<7, // Allow Perl character classes like \d.
PerlB = 1<<8, // Allow Perl's \b and \B.
PerlX = 1<<9, // Perl extensions:
// non-capturing parens - (?: )
// non-greedy operators - *? +? ?? {}?
// flag edits - (?i) (?-i) (?i: )
// i - FoldCase
// m - !OneLine
// s - DotNL
// U - NonGreedy
// line ends: \A \z
// \Q and \E to disable/enable metacharacters
// (?P<name>expr) for named captures
// \C to match any single byte
UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
// and \P{Han} for its negation.
NeverNL = 1<<11, // Never match NL, even if the regexp mentions
// it explicitly.
NeverCapture = 1<<12, // Parse all parens as non-capturing.
// As close to Perl as we can get.
LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
UnicodeGroups,
// Internal use only.
WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text
};
// Get. No set, Regexps are logically immutable once created.
RegexpOp op() { return static_cast<RegexpOp>(op_); }
int nsub() { return nsub_; }
bool simple() { return simple_; }
enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
int Ref(); // For testing.
// Returns the array of sub-expressions. With at most one sub-expression
// the pointer is stored inline in subone_, so its address serves as a
// one-element array; otherwise the separately allocated submany_ is used.
Regexp** sub() {
if(nsub_ <= 1)
return &subone_;
else
return submany_;
}
// Operator-specific accessors. Each one DCHECKs that the node has the
// operator the requested field belongs to.
int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
// Increments reference count, returns object as convenience.
Regexp* Incref();
// Decrements reference count and deletes this object if count reaches 0.
void Decref();
// Parses string s to produce regular expression, returned.
// Caller must release return value with re->Decref().
// On failure, sets *status (if status != NULL) and returns NULL.
static Regexp* Parse(const StringPiece& s, ParseFlags flags,
RegexpStatus* status);
// Returns a _new_ simplified version of the current regexp.
// Does not edit the current regexp.
// Caller must release return value with re->Decref().
// Simplified means that counted repetition has been rewritten
// into simpler terms and all Perl/POSIX features have been
// removed. The result will capture exactly the same
// subexpressions the original did, unless formatted with ToString.
Regexp* Simplify();
friend class SimplifyWalker;
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *status (if status != NULL) on parse error.
static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
string* dst,
RegexpStatus* status);
// Returns the number of capturing groups in the regexp.
int NumCaptures();
friend class NumCapturesWalker;
// Returns a map from names to capturing group indices,
// or NULL if the regexp contains no named capture groups.
// The caller is responsible for deleting the map.
map<string, int>* NamedCaptures();
// Returns a map from capturing group indices to capturing group
// names or NULL if the regexp contains no named capture groups. The
// caller is responsible for deleting the map.
map<int, string>* CaptureNames();
// Returns a string representation of the current regexp,
// using as few parentheses as possible.
string ToString();
// Convenience functions. They consume the passed reference,
// so in many cases you should use, e.g., Plus(re->Incref(), flags).
// They do not consume allocated arrays like subs or runes.
static Regexp* Plus(Regexp* sub, ParseFlags flags);
static Regexp* Star(Regexp* sub, ParseFlags flags);
static Regexp* Quest(Regexp* sub, ParseFlags flags);
static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
static Regexp* NewLiteral(Rune rune, ParseFlags flags);
static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
static Regexp* HaveMatch(int match_id, ParseFlags flags);
// Like Alternate but does not factor out common prefixes.
static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
// Debugging function. Returns string format for regexp
// that makes structure clear. Does NOT use regexp syntax.
string Dump();
// Helper traversal class, defined fully in walker-inl.h.
template<typename T> class Walker;
// Compile to Prog. See prog.h
// Reverse prog expects to be run over text backward.
// Construction and execution of prog will
// stay within approximately max_mem bytes of memory.
// If max_mem <= 0, a reasonable default is used.
Prog* CompileToProg(int64 max_mem);
Prog* CompileToReverseProg(int64 max_mem);
// Whether to expect this library to find exactly the same answer as PCRE
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
// obscure cases behave differently. Technically this is more a property
// of the Prog than the Regexp, but the computation is much easier to do
// on the Regexp. See mimics_pcre.cc for the exact conditions.
bool MimicsPCRE();
// Benchmarking function.
void NullWalk();
// Whether every match of this regexp must be anchored and
// begin with a non-empty fixed string (perhaps after ASCII
// case-folding). If so, returns the prefix and the sub-regexp that
// follows it.
bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix);
private:
// Constructor allocates vectors as appropriate for operator.
explicit Regexp(RegexpOp op, ParseFlags parse_flags);
// Use Decref() instead of delete to release Regexps.
// This is private to catch deletes at compile time.
~Regexp();
void Destroy();
bool QuickDestroy();
// Helpers for Parse. Listed here so they can edit Regexps.
class ParseState;
friend class ParseState;
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
RegexpStatus* status);
// Helper for testing [sic].
friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
// Computes whether Regexp is already simple.
bool ComputeSimple();
// Constructor that generates a concatenation or alternation,
// enforcing the limit on the number of subexpressions for
// a particular Regexp.
static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
ParseFlags flags, bool can_factor);
// Returns the leading string that re starts with.
// The returned Rune* points into a piece of re,
// so it must not be used after the caller calls re->Decref().
static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
// Removes the first n leading runes from the beginning of re.
// Edits re in place.
static void RemoveLeadingString(Regexp* re, int n);
// Returns the leading regexp in re's top-level concatenation.
// The returned Regexp* points at re or a sub-expression of re,
// so it must not be used after the caller calls re->Decref().
static Regexp* LeadingRegexp(Regexp* re);
// Removes LeadingRegexp(re) from re and returns the remainder.
// Might edit re in place.
static Regexp* RemoveLeadingRegexp(Regexp* re);
// Simplifies an alternation of literal strings by factoring out
// common prefixes.
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
static int FactorAlternationRecursive(Regexp** sub, int nsub,
ParseFlags flags, int maxdepth);
// Is a == b? Only efficient on regexps that have not been through
// Simplify yet - the expansion of a kRegexpRepeat will make this
// take a long time. Do not call on such regexps, hence private.
static bool Equal(Regexp* a, Regexp* b);
// Allocate space for n sub-regexps.
// n must be non-negative and fit in a uint16 (the type of nsub_);
// anything else is logged as FATAL. A heap array is only allocated for
// n > 1, because a single sub-expression is stored inline in subone_.
void AllocSub(int n) {
if (n < 0 || static_cast<uint16>(n) != n)
LOG(FATAL) << "Cannot AllocSub " << n;
if (n > 1)
submany_ = new Regexp*[n];
nsub_ = n;
}
// Add Rune to LiteralString
void AddRuneToString(Rune r);
// Swaps this with that, in place.
void Swap(Regexp *that);
// Operator. See description of operators above.
// uint8 instead of RegexpOp to control space usage.
uint8 op_;
// Is this regexp structure already simple
// (has it been returned by Simplify)?
// uint8 instead of bool to control space usage.
uint8 simple_;
// Flags saved from parsing and used during execution.
// (Only FoldCase is used.)
// uint16 instead of ParseFlags to control space usage.
uint16 parse_flags_;
// Reference count. Exists so that SimplifyRegexp can build
// regexp structures that are dags rather than trees to avoid
// exponential blowup in space requirements.
// uint16 to control space usage.
// The standard regexp routines will never generate a
// ref greater than the maximum repeat count (100),
// but even so, Incref and Decref consult an overflow map
// when ref_ reaches kMaxRef.
uint16 ref_;
static const uint16 kMaxRef = 0xffff;
// Subexpressions.
// uint16 to control space usage.
// Concat and Alternate handle larger numbers of subexpressions
// by building concatenation or alternation trees.
// Other routines should call Concat or Alternate instead of
// filling in sub() by hand.
uint16 nsub_;
static const uint16 kMaxNsub = 0xffff;
// Storage behind sub(): which member is active depends on nsub_.
union {
Regexp** submany_; // if nsub_ > 1
Regexp* subone_; // if nsub_ == 1
};
// Extra space for parse and teardown stacks.
Regexp* down_;
// Arguments to operator. See description of operators above.
// Only the member matching op_ is meaningful.
union {
struct { // Repeat
int max_;
int min_;
};
struct { // Capture
int cap_;
string* name_;
};
struct { // LiteralString
int nrunes_;
Rune* runes_;
};
struct { // CharClass
// These two could be in separate union members,
// but it wouldn't save any space (there are other two-word structs)
// and keeping them separate avoids confusion during parsing.
CharClass* cc_;
CharClassBuilder* ccb_;
};
Rune rune_; // Literal
int match_id_; // HaveMatch
void *the_union_[2]; // as big as any other element, for memset
};
DISALLOW_EVIL_CONSTRUCTORS(Regexp);
};
// Character class set: contains non-overlapping, non-abutting RuneRanges.
typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
// Mutable character class used while a class is being assembled.
// Ranges are kept in a RuneRangeSet; GetCharClass() produces the compact
// CharClass form.
class CharClassBuilder {
public:
CharClassBuilder();
typedef RuneRangeSet::iterator iterator;
// Iteration is over ranges, not over individual runes.
iterator begin() { return ranges_.begin(); }
iterator end() { return ranges_.end(); }
// Number of runes in the class (not the number of ranges).
int size() { return nrunes_; }
bool empty() { return nrunes_ == 0; }
// True when every rune in [0, Runemax] is a member.
bool full() { return nrunes_ == Runemax+1; }
bool Contains(Rune r);
bool FoldsASCII();
bool AddRange(Rune lo, Rune hi); // returns whether class changed
CharClassBuilder* Copy();
void AddCharClass(CharClassBuilder* cc);
// Replaces the class with its complement, in place.
void Negate();
void RemoveAbove(Rune r);
// Returns a newly allocated CharClass copy of the current ranges.
CharClass* GetCharClass();
void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
private:
static const uint32 AlphaMask = (1<<26) - 1; // low 26 bits, one per letter
uint32 upper_; // bitmap of A-Z
uint32 lower_; // bitmap of a-z
int nrunes_; // total number of runes covered by ranges_
RuneRangeSet ranges_;
DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
};
// Bitwise operators on ParseFlags are defined explicitly so that g++
// treats their results as ParseFlags rather than plain int.

// Union of two flag sets.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
  const int bits = static_cast<int>(a) | static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
// Flags set in exactly one of a and b.
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
  const int bits = static_cast<int>(a) ^ static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
// Flags set in both a and b.
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
  const int bits = static_cast<int>(a) & static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
// Bitwise complement of a flag set.
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
{
  const int inverted = ~static_cast<int>(a);
  return static_cast<Regexp::ParseFlags>(inverted);
}
} // namespace re2
#endif // RE2_REGEXP_H__

113
outside/re2/re2/set.cc Normal file
View File

@ -0,0 +1,113 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/set.h"
#include "util/util.h"
#include "re2/stringpiece.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
using namespace re2;
// Creates an empty, not yet compiled set that will interpret patterns
// with a copy of `options` and match with the given anchoring.
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor)
    : anchor_(anchor),
      prog_(NULL),
      compiled_(false) {
  options_.Copy(options);
}
// Drops the references to any patterns still held and frees the program.
RE2::Set::~Set() {
  for (vector<re2::Regexp*>::iterator it = re_.begin();
       it != re_.end(); ++it)
    (*it)->Decref();
  delete prog_;
}
int RE2::Set::Add(const StringPiece& pattern, string* error) {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Add after Compile";
return -1;
}
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
RegexpStatus status;
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
if (re == NULL) {
if (error != NULL)
*error = status.Text();
if (options_.log_errors())
LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
return -1;
}
// Concatenate with match index and push on vector.
int n = re_.size();
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
if (re->op() == kRegexpConcat) {
int nsub = re->nsub();
re2::Regexp** sub = new re2::Regexp*[nsub + 1];
for (int i = 0; i < nsub; i++)
sub[i] = re->sub()[i]->Incref();
sub[nsub] = m;
re->Decref();
re = re2::Regexp::Concat(sub, nsub + 1, pf);
delete[] sub;
} else {
re2::Regexp* sub[2];
sub[0] = re;
sub[1] = m;
re = re2::Regexp::Concat(sub, 2, pf);
}
re_.push_back(re);
return n;
}
bool RE2::Set::Compile() {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Compile multiple times";
return false;
}
compiled_ = true;
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]),
re_.size(), pf);
re_.clear();
re2::Regexp* sre = re->Simplify();
re->Decref();
re = sre;
if (re == NULL) {
if (options_.log_errors())
LOG(ERROR) << "Error simplifying during Compile.";
return false;
}
prog_ = Prog::CompileSet(options_, anchor_, re);
return prog_ != NULL;
}
// Reports whether `text` matches any pattern in the set and, if so,
// fills *v with the indices of the matching patterns.
// Returns false if the set has not been compiled or if compilation
// failed.
bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
  if (!compiled_) {
    LOG(DFATAL) << "RE2::Set::Match without Compile";
    return false;
  }
  // Compile() marks the set as compiled before it knows whether a program
  // could be built, so a failed Compile() leaves prog_ NULL.
  if (prog_ == NULL) {
    LOG(DFATAL) << "RE2::Set::Match after failed Compile";
    return false;
  }
  v->clear();
  // Initialized so the check below never reads an indeterminate value.
  bool failed = false;
  bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
                              Prog::kManyMatch, NULL, &failed, v);
  if (failed)
    LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";

  if (ret == false)
    return false;
  if (v->size() == 0) {
    LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
    return false;
  }
  return true;
}

55
outside/re2/re2/set.h Normal file
View File

@ -0,0 +1,55 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_SET_H
#define RE2_SET_H
#include <utility>
#include <vector>
#include "re2/re2.h"
namespace re2 {
using std::vector;
// An RE2::Set represents a collection of regexps that can
// be searched for simultaneously.
// Usage: Add() each pattern, call Compile() once, then call Match().
class RE2::Set {
public:
Set(const RE2::Options& options, RE2::Anchor anchor);
~Set();
// Add adds regexp pattern to the set, interpreted using the RE2 options.
// (The RE2 constructor's default options parameter is RE2::UTF8.)
// Add returns the regexp index that will be used to identify
// it in the result of Match, or -1 if the regexp cannot be parsed.
// Indices are assigned in sequential order starting from 0.
// Error returns do not increment the index.
// If an error occurs and error != NULL, *error will hold an error message.
int Add(const StringPiece& pattern, string* error);
// Compile prepares the Set for matching.
// Add must not be called again after Compile.
// Compile must be called before Match.
// Compile may return false if it runs out of memory.
bool Compile();
// Match returns true if text matches any of the regexps in the set.
// If so, it fills v with the indices of the matching regexps.
bool Match(const StringPiece& text, vector<int>* v) const;
private:
RE2::Options options_; // copy of the options passed to the constructor
RE2::Anchor anchor_;
vector<re2::Regexp*> re_; // parsed patterns, indexed by Add()'s return value
re2::Prog* prog_; // program built by Compile(); NULL until then
bool compiled_; // set once Compile() has been called
// Copying is disallowed: declared but not defined.
//DISALLOW_EVIL_CONSTRUCTORS(Set);
Set(const Set&);
void operator=(const Set&);
};
} // namespace re2
#endif // RE2_SET_H

393
outside/re2/re2/simplify.cc Normal file
View File

@ -0,0 +1,393 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Rewrite POSIX and other features in re
// to use simple extended regular expression features.
// Also sort and simplify character classes.
#include "util/util.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *error (if error != NULL) on error.
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
string* dst,
RegexpStatus* status) {
Regexp* re = Parse(src, flags, status);
if (re == NULL)
return false;
Regexp* sre = re->Simplify();
re->Decref();
if (sre == NULL) {
// Should not happen, since Simplify never fails.
LOG(ERROR) << "Simplify failed on " << src;
if (status) {
status->set_code(kRegexpInternalError);
status->set_error_arg(src);
}
return false;
}
*dst = sre->ToString();
sre->Decref();
return true;
}
// Assuming the simple_ flags on the children are accurate,
// is this Regexp* simple?
bool Regexp::ComputeSimple() {
Regexp** subs;
switch (op_) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
return true;
case kRegexpConcat:
case kRegexpAlternate:
// These are simple as long as the subpieces are simple.
subs = sub();
for (int i = 0; i < nsub_; i++)
if (!subs[i]->simple_)
return false;
return true;
case kRegexpCharClass:
// Simple as long as the char class is not empty, not full.
if (ccb_ != NULL)
return !ccb_->empty() && !ccb_->full();
return !cc_->empty() && !cc_->full();
case kRegexpCapture:
subs = sub();
return subs[0]->simple_;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
subs = sub();
if (!subs[0]->simple_)
return false;
switch (subs[0]->op_) {
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpEmptyMatch:
case kRegexpNoMatch:
return false;
default:
break;
}
return true;
case kRegexpRepeat:
return false;
}
LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
return false;
}
// Walker subclass used by Simplify.
// The simplify walk is purely post-recursive: given the simplified children,
// PostVisit creates the simplified result.
// The child_args are simplified Regexp*s.
class SimplifyWalker : public Regexp::Walker<Regexp*> {
public:
SimplifyWalker() {}
// Stops the descent at nodes already marked simple, returning a new
// reference to them; returns NULL for all other nodes.
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
// Builds the simplified form of re from its simplified children.
virtual Regexp* PostVisit(Regexp* re,
Regexp* parent_arg,
Regexp* pre_arg,
Regexp** child_args, int nchild_args);
// Returns a new reference to re.
virtual Regexp* Copy(Regexp* re);
// Not expected to be called; logs DFATAL and returns a new reference to re.
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
private:
// These functions are declared inside SimplifyWalker so that
// they can edit the private fields of the Regexps they construct.
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
// Caller must Decref return value when done with it.
static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
// Simplifies the expression re{min,max} in terms of *, +, and ?.
// Returns a new regexp. Does not edit re. Does not consume reference to re.
// Caller must Decref return value when done with it.
static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
Regexp::ParseFlags parse_flags);
// Simplifies a character class by expanding any named classes
// into rune ranges. Does not edit re. Does not consume ref to re.
// Caller must Decref return value when done with it.
static Regexp* SimplifyCharClass(Regexp* re);
DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
};
// Returns a simplified version of this regexp. The result uses
// traditional Unix egrep features only, plus the Perl (?:) non-capturing
// parentheses, with no other POSIX or Perl additions, and captures
// exactly the same subexpressions (with the same indices) as the
// original.
// This object is not edited. A regexp already marked simple is returned
// as-is with an extra reference.
// Caller must Decref() the return value when done with it.
Regexp* Regexp::Simplify() {
  if (!simple_) {
    SimplifyWalker walker;
    return walker.Walk(this, NULL);
  }
  return Incref();
}
#define Simplify DontCallSimplify // Avoid accidental recursion
// Copying a node during the walk just takes another reference to it.
Regexp* SimplifyWalker::Copy(Regexp* re) {
  Regexp* shared = re->Incref();
  return shared;
}
// Not expected to run: Simplify uses Walk, not WalkExponential.
// Logs DFATAL and returns a new reference to re.
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
  Regexp* shared = re->Incref();
  return shared;
}
// Cuts the walk short at nodes already marked simple: sets *stop and
// returns a new reference to re. For any other node returns NULL and
// leaves *stop untouched.
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
  if (!re->simple_)
    return NULL;
  *stop = true;
  return re->Incref();
}
// Builds the simplified form of re from its already simplified children
// in child_args. Each child_args entry carries a reference: it is either
// handed on to the node being returned or released with Decref() here.
// When no child changed, re itself is marked simple and returned with an
// extra reference.
Regexp* SimplifyWalker::PostVisit(Regexp* re,
Regexp* parent_arg,
Regexp* pre_arg,
Regexp** child_args,
int nchild_args) {
switch (re->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
// All these are always simple.
re->simple_ = true;
return re->Incref();
case kRegexpConcat:
case kRegexpAlternate: {
// These are simple as long as the subpieces are simple.
// Two passes to avoid allocation in the common case.
bool changed = false;
Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
Regexp* sub = subs[i];
Regexp* newsub = child_args[i];
if (newsub != sub) {
changed = true;
break;
}
}
if (!changed) {
// Nothing changed: release the children's references and reuse re.
for (int i = 0; i < re->nsub_; i++) {
Regexp* newsub = child_args[i];
newsub->Decref();
}
re->simple_ = true;
return re->Incref();
}
// At least one child changed: build a new node that takes over the
// references held in child_args.
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub_);
Regexp** nre_subs = nre->sub();
for (int i = 0; i <re->nsub_; i++)
nre_subs[i] = child_args[i];
nre->simple_ = true;
return nre;
}
case kRegexpCapture: {
Regexp* newsub = child_args[0];
if (newsub == re->sub()[0]) {
// Child unchanged: release its reference and reuse re.
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
// Child changed: the new capture node takes over newsub's reference.
// Only cap_ is copied from re here.
Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->cap_ = re->cap_;
nre->simple_ = true;
return nre;
}
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest: {
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
// These are simple as long as the subpiece is simple.
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
// These are also idempotent if flags are constant.
// (Applying the same operator twice adds nothing, so the child is
// returned in place of a new wrapper.)
if (re->op() == newsub->op() &&
re->parse_flags() == newsub->parse_flags())
return newsub;
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->simple_ = true;
return nre;
}
case kRegexpRepeat: {
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
// SimplifyRepeat does not consume the reference to newsub (see its
// declaration), so it is released here afterwards.
Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
re->parse_flags());
newsub->Decref();
nre->simple_ = true;
return nre;
}
case kRegexpCharClass: {
Regexp* nre = SimplifyCharClass(re);
nre->simple_ = true;
return nre;
}
}
// Reached only for an operator that has no case above.
LOG(ERROR) << "Simplify case not handled: " << re->op();
return re->Incref();
}
// Builds the two-element concatenation "re1 re2" with the given flags.
// The references held on re1 and re2 are taken over by the new node
// (no Incref is performed); the caller owns the returned node.
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
                                Regexp::ParseFlags parse_flags) {
  Regexp* pair = new Regexp(kRegexpConcat, parse_flags);
  pair->AllocSub(2);
  Regexp** slots = pair->sub();
  slots[0] = re1;
  slots[1] = re2;
  return pair;
}
// Rewrites re{min,max} using only concatenation, *, + and ?.
// max == -1 means "no upper bound".
// re is neither modified nor consumed: every copy placed in the result
// gets its own Incref().  The caller must Decref the returned node.
// Note that the result does not round-trip through ToString() with the
// same capture numbering: (x){2} becomes (x)(x), yet in the Regexp*
// representation both (x) are marked as $1.
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
                                       Regexp::ParseFlags f) {
  // Open-ended repeat: x{n,}.
  if (max == -1) {
    switch (min) {
      case 0:  // x{0,} is x*
        return Regexp::Star(re->Incref(), f);
      case 1:  // x{1,} is x+
        return Regexp::Plus(re->Incref(), f);
      default:
        break;
    }
    // x{n,} is n-1 plain copies followed by x+, e.g. x{4,} is xxxx+.
    Regexp* seq = new Regexp(kRegexpConcat, f);
    seq->AllocSub(min);
    VLOG(1) << "Simplify " << min;
    Regexp** parts = seq->sub();
    const int last = min - 1;
    for (int k = 0; k < last; k++)
      parts[k] = re->Incref();
    parts[last] = Regexp::Plus(re->Incref(), f);
    return seq;
  }

  // Exact counts with trivial expansions.
  if (min == max) {
    if (min == 0)  // (x){0} matches only the empty string.
      return new Regexp(kRegexpEmptyMatch, f);
    if (min == 1)  // x{1} is just x.
      return re->Incref();
  }

  // Bounded repeat: min mandatory copies followed by max-min optional
  // ones.  The optional copies are nested so the machine does less work:
  // x{2,5} = xx(x(x(x)?)?)?
  Regexp* result = NULL;
  if (min > 0) {
    // Mandatory prefix: xx
    result = new Regexp(kRegexpConcat, f);
    result->AllocSub(min);
    Regexp** parts = result->sub();
    for (int k = 0; k < min; k++)
      parts[k] = re->Incref();
  }
  if (max > min) {
    // Optional suffix, built innermost-first: (x(x(x)?)?)?
    Regexp* tail = Regexp::Quest(re->Incref(), f);
    for (int k = min+1; k < max; k++)
      tail = Regexp::Quest(Concat2(re->Incref(), tail, f), f);
    result = (result == NULL) ? tail : Concat2(result, tail, f);
  }
  if (result != NULL)
    return result;

  // Neither a prefix nor a suffix was built: a degenerate request such as
  // min > max or min < max < 0.  The parser is expected to reject these.
  LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
  return new Regexp(kRegexpNoMatch, f);
}
// Replaces degenerate character classes with cheaper nodes: an empty class
// becomes "no match" and a full class becomes "any char", both keeping the
// original parse flags.  Any other class is returned as a new reference to
// re.  The caller must Decref the result.
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
  CharClass* klass = re->cc();
  if (klass->empty()) {
    Regexp* nothing = new Regexp(kRegexpNoMatch, re->parse_flags());
    return nothing;
  }
  if (klass->full()) {
    Regexp* anything = new Regexp(kRegexpAnyChar, re->parse_flags());
    return anything;
  }
  return re->Incref();
}
} // namespace re2

View File

@ -0,0 +1,182 @@
// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// A string-like object that points to a sized piece of memory.
//
// Functions or methods may use const StringPiece& parameters to accept either
// a "const char*" or a "string" value that will be implicitly converted to
// a StringPiece. The implicit conversion means that it is often appropriate
// to include this .h file in other files rather than forward-declaring
// StringPiece as would be appropriate for most other Google classes.
//
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
// conversions from "const char*" to "string" and back again.
//
//
// Arghh! I wish C++ literals were "string".
#ifndef STRINGS_STRINGPIECE_H__
#define STRINGS_STRINGPIECE_H__
#include <string.h>
#include <cstddef>
#include <iosfwd>
#include <string>
namespace re2 {
// A (pointer, int length) pair describing a run of bytes.  The class only
// stores the pointer; none of the inline members allocate or free memory.
class StringPiece {
 public:
  // standard STL container boilerplate
  typedef char value_type;
  typedef const char* pointer;
  typedef const char& reference;
  typedef const char& const_reference;
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;
  static const size_type npos;
  typedef const char* const_iterator;
  typedef const char* iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef std::reverse_iterator<iterator> reverse_iterator;

  // The single-argument constructors are deliberately not explicit so
  // that a "const char*" or a "string" can be passed wherever a
  // "StringPiece" is expected.
  StringPiece() : ptr_(NULL), length_(0) { }
  StringPiece(const char* str) : ptr_(str), length_(0) {
    // A NULL pointer yields an empty piece.
    if (str != NULL)
      length_ = static_cast<int>(strlen(str));
  }
  StringPiece(const std::string& str)
      : ptr_(str.data()), length_(static_cast<int>(str.size())) { }
  StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }

  // data() may point at a buffer with embedded NULs, and that buffer may
  // or may not be NUL terminated, so handing data() to a routine that
  // expects a C string is usually a mistake.
  const char* data() const { return ptr_; }
  int size() const { return length_; }
  int length() const { return length_; }
  bool empty() const { return length_ == 0; }

  void clear() {
    ptr_ = NULL;
    length_ = 0;
  }
  void set(const char* data, int len) {
    ptr_ = data;
    length_ = len;
  }
  void set(const char* str) {
    ptr_ = str;
    length_ = (str == NULL) ? 0 : static_cast<int>(strlen(str));
  }
  void set(const void* data, int len) {
    ptr_ = reinterpret_cast<const char*>(data);
    length_ = len;
  }

  // Unchecked element access.
  char operator[](int i) const { return ptr_[i]; }

  // Drop n bytes from the front / back.  No bounds checking is done.
  void remove_prefix(int n) {
    ptr_ += n;
    length_ -= n;
  }
  void remove_suffix(int n) {
    length_ -= n;
  }

  // Three-way comparison: the memcmp() result over the common prefix if
  // that is non-zero, otherwise -1 / 0 / +1 according to the lengths.
  int compare(const StringPiece& x) const {
    const int common = std::min(length_, x.length_);
    const int r = memcmp(ptr_, x.ptr_, common);
    if (r != 0)
      return r;
    if (length_ < x.length_)
      return -1;
    if (length_ > x.length_)
      return +1;
    return 0;
  }

  // Copies the bytes into a new std::string.  ToString() exists because
  // many other string-like interfaces use that name; as_string() is kept
  // for existing code.
  std::string as_string() const {
    return ToString();
  }
  std::string ToString() const {
    return std::string(data(), size());
  }
  void CopyToString(std::string* target) const;
  void AppendToString(std::string* target) const;

  // Does "this" start with "x"
  bool starts_with(const StringPiece& x) const {
    if (length_ < x.length_)
      return false;
    return memcmp(ptr_, x.ptr_, x.length_) == 0;
  }
  // Does "this" end with "x"
  bool ends_with(const StringPiece& x) const {
    if (length_ < x.length_)
      return false;
    const char* tail = ptr_ + (length_-x.length_);
    return memcmp(tail, x.ptr_, x.length_) == 0;
  }

  iterator begin() const { return ptr_; }
  iterator end() const { return ptr_ + length_; }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(end());
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(begin());
  }

  // STLS says return size_type, but Google says return int
  int max_size() const { return length_; }
  int capacity() const { return length_; }

  // Defined out of line.
  int copy(char* buf, size_type n, size_type pos = 0) const;
  int find(const StringPiece& s, size_type pos = 0) const;
  int find(char c, size_type pos = 0) const;
  int rfind(const StringPiece& s, size_type pos = npos) const;
  int rfind(char c, size_type pos = npos) const;
  StringPiece substr(size_type pos, size_type n = npos) const;
  static bool _equal(const StringPiece&, const StringPiece&);

 private:
  const char* ptr_;  // first byte of the piece (may be NULL)
  int length_;       // number of bytes in the piece
};
// Equality is delegated to StringPiece::_equal.
inline bool operator==(const StringPiece& x, const StringPiece& y) {
  return StringPiece::_equal(x, y);
}
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
  return !StringPiece::_equal(x, y);
}
// Ordering: compare the common prefix bytewise; on a tie the shorter
// piece sorts first.
inline bool operator<(const StringPiece& x, const StringPiece& y) {
  const int common = std::min(x.size(), y.size());
  const int r = memcmp(x.data(), y.data(), common);
  if (r != 0)
    return r < 0;
  return x.size() < y.size();
}
// The remaining orderings are all expressed through operator<.
inline bool operator>(const StringPiece& x, const StringPiece& y) {
  return operator<(y, x);
}
inline bool operator<=(const StringPiece& x, const StringPiece& y) {
  return !operator<(y, x);
}
inline bool operator>=(const StringPiece& x, const StringPiece& y) {
  return !operator<(x, y);
}
} // namespace re2
// allow StringPiece to be logged
extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
#endif // STRINGS_STRINGPIECE_H__

View File

@ -0,0 +1,254 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
//
// Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
// except that it remembers where it has been, trading a lot of
// memory for a lot of time. It exists only for testing purposes.
//
// Let me repeat that.
//
// THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
// - It uses a ton of memory.
// - It uses a ton of stack.
// - It uses CHECK and LOG(FATAL).
// - It implements unanchored search by repeated anchored search.
//
// On the other hand, it is very simple and a good reference
// implementation for the more complicated regexp packages.
//
// In BUILD, this file is linked into the ":testing" library,
// not the main library, in order to make it harder to pick up
// accidentally.
#include "util/util.h"
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
// Backtracker holds the state for a backtracking search.
//
// Excluding the search parameters, the main search state
// is just the "capture registers", which record, for the
// current execution, the string position at which each
// parenthesis was passed. cap_[0] and cap_[1] are the
// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
//
// To avoid infinite loops during backtracking on expressions
// like (a*)*, the visited_[] bitmap marks the (state, string-position)
// pairs that have already been explored and are thus not worth
// re-exploring if we get there via another path. Modern backtracking
// libraries engineer their program representation differently, to make
// such infinite loops possible to avoid without keeping a giant visited_
// bitmap, but visited_ works fine for a reference implementation
// and it has the nice benefit of making the search run in linear time.
// One Backtracker is bound to a single Prog and can run Search() on it.
// Search() resets all search parameters and reallocates the visited_
// bitmap on every call; the destructor frees the last bitmap.
class Backtracker {
public:
explicit Backtracker(Prog* prog);
~Backtracker();
// Runs one search of text (within context).  Returns true on a match and
// fills in submatch[0..nsubmatch) when nsubmatch >= 1.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
private:
// Explores from instruction id at string position p looking for a match.
// Returns true if found (so that caller can stop trying other possibilities).
bool Visit(int id, const char* p);
// Search parameters
Prog* prog_; // program being run
StringPiece text_; // text being searched
StringPiece context_; // greater context of text being searched
bool anchored_; // whether search is anchored at text.begin()
bool longest_; // whether search wants leftmost-longest match
bool endmatch_; // whether search must end at text.end()
StringPiece *submatch_; // submatches to fill in
int nsubmatch_; // # of submatches to fill in
// Search state
const char* cap_[64]; // capture registers
uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
int nvisited_; // # of words in bitmap
};
// Binds the backtracker to prog.  All search parameters start cleared and
// no visited bitmap is allocated; Search() sets both up on each call.
// text_, context_ and cap_ are not listed here: the StringPieces are
// default-constructed and cap_ is zeroed by Search().
Backtracker::Backtracker(Prog* prog)
    : prog_(prog),
      anchored_(false), longest_(false), endmatch_(false),
      submatch_(NULL), nsubmatch_(0),
      visited_(NULL), nvisited_(0) {
}

// Frees the visited bitmap from the most recent Search(), if there was one.
Backtracker::~Backtracker() {
  delete[] visited_;
}
// Runs one backtracking search of text, which lies inside context (a NULL
// context means "same as text").  anchored forces the match to start at
// text.begin(); longest asks for the leftmost-longest match.  On success
// submatch[0..nsubmatch) is filled in and true is returned.
bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
                         bool anchored, bool longest,
                         StringPiece* submatch, int nsubmatch) {
  text_ = text;
  context_ = context;
  if (context_.begin() == NULL)
    context_ = text;

  // A program anchored at the start (end) cannot match unless text
  // begins (ends) where the context does.
  if (prog_->anchor_start() && text.begin() > context_.begin())
    return false;
  if (prog_->anchor_end() && text.end() < context_.end())
    return false;

  anchored_ = anchored || prog_->anchor_start();
  longest_ = longest || prog_->anchor_end();
  endmatch_ = prog_->anchor_end();

  submatch_ = submatch;
  nsubmatch_ = nsubmatch;
  CHECK(2*nsubmatch_ < arraysize(cap_));
  memset(cap_, 0, sizeof cap_);

  // submatch_[0] doubles as the "best match so far" record, so point it
  // at a local scratch piece when the caller supplied none.
  StringPiece sp0;
  if (nsubmatch < 1) {
    submatch_ = &sp0;
    nsubmatch_ = 1;
  }
  submatch_[0] = StringPiece();

  // The bitmap has one bit per (instruction, text position) pair, so its
  // size depends on the text and it must be rebuilt for every search.
  delete[] visited_;
  nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
  visited_ = new uint32[nvisited_];
  memset(visited_, 0, nvisited_*sizeof visited_[0]);

  if (anchored_) {
    // Only one possible starting point.
    cap_[0] = text.begin();
    return Visit(prog_->start(), text.begin());
  }

  // Unanchored: try every start position in order, including text.end()
  // itself so that an empty match at the very end can be found.
  const char* p = text.begin();
  while (p <= text.end()) {
    cap_[0] = p;
    if (Visit(prog_->start(), p))  // Match must be leftmost; done.
      return true;
    p++;
  }
  return false;
}
// Explores from instruction ip at string position p looking for a match.
// Return true if found (so that caller can stop trying other possibilities).
bool Backtracker::Visit(int id, const char* p) {
// Check bitmap. If we've already explored from here,
// either it didn't match or it did but we're hoping for a better match.
// Either way, don't go down that road again.
CHECK(p <= text_.end());
int n = id*(text_.size()+1) + (p - text_.begin());
CHECK_LT(n/32, nvisited_);
if (visited_[n/32] & (1 << (n&31)))
return false;
visited_[n/32] |= 1 << (n&31);
// Pick out byte at current position. If at end of string,
// have to explore in hope of finishing a match. Use impossible byte -1.
int c = -1;
if (p < text_.end())
c = *p & 0xFF;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
return false; // not reached
case kInstAlt:
case kInstAltMatch:
// Try both possible next states: out is preferred to out1.
if (Visit(ip->out(), p)) {
if (longest_)
Visit(ip->out1(), p);
return true;
}
return Visit(ip->out1(), p);
case kInstByteRange:
if (ip->Matches(c))
return Visit(ip->out(), p+1);
return false;
case kInstCapture:
if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) {
// Capture p to register, but save old value.
const char* q = cap_[ip->cap()];
cap_[ip->cap()] = p;
bool ret = Visit(ip->out(), p);
// Restore old value as we backtrack.
cap_[ip->cap()] = q;
return ret;
}
return Visit(ip->out(), p);
case kInstEmptyWidth:
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
return false;
return Visit(ip->out(), p);
case kInstNop:
return Visit(ip->out(), p);
case kInstMatch:
// We found a match. If it's the best so far, record the
// parameters in the caller's submatch_ array.
if (endmatch_ && p != context_.end())
return false;
cap_[1] = p;
if (submatch_[0].data() == NULL || // First match so far ...
(longest_ && p > submatch_[0].end())) { // ... or better match
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
}
return true;
case kInstFail:
return false;
}
}
// Entry point for the backtracking reference search.  Returns true if the
// program matches text under the requested anchoring and match kind, and
// fills in match[0..nmatch).
bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
                                 const StringPiece& context,
                                 Anchor anchor,
                                 MatchKind kind,
                                 StringPiece* match,
                                 int nmatch) {
  // A full match is implemented as an anchored longest match followed by
  // a check that match[0] reaches text.end(), so match[0] must exist.
  StringPiece sp0;
  const bool want_full = (kind == kFullMatch);
  if (want_full) {
    anchor = kAnchored;
    if (nmatch < 1) {
      match = &sp0;
      nmatch = 1;
    }
  }

  // Run the search.
  Backtracker b(this);
  const bool found = b.Search(text, context,
                              anchor == kAnchored,   // anchored
                              kind != kFirstMatch,   // longest
                              match, nmatch);
  if (!found)
    return false;
  return !(want_full && match[0].end() != text.end());
}
} // namespace re2

View File

@ -0,0 +1,223 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test character class manipulations.
#include "util/test.h"
#include "re2/regexp.h"
namespace re2 {
// One character-class test case.
// add:    ranges passed to AddRange(), in order; the list ends at the first
//         entry whose lo is negative.
// remove: if >= 0, the value passed to RemoveAbove() after all adds;
//         negative means no removal.
// final:  the ranges the class is expected to contain afterwards, in
//         iteration order; also ends at the first entry with negative lo.
struct CCTest {
struct {
Rune lo;
Rune hi;
} add[10];
int remove;
struct {
Rune lo;
Rune hi;
} final[10];
};
// Test table.  Each entry is { ranges to add, remove, expected ranges };
// {-1} terminates a range list and a remove value of -1 means "do not
// call RemoveAbove".
// NOTE(review): the sixth and seventh entries below are identical.
static CCTest tests[] = {
{ { { 10, 20 }, {-1} }, -1,
{ { 10, 20 }, {-1} } },
{ { { 10, 20 }, { 20, 30 }, {-1} }, -1,
{ { 10, 30 }, {-1} } },
{ { { 10, 20 }, { 30, 40 }, { 20, 30 }, {-1} }, -1,
{ { 10, 40 }, {-1} } },
{ { { 0, 50 }, { 20, 30 }, {-1} }, -1,
{ { 0, 50 }, {-1} } },
{ { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }, -1,
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 5, 25 }, {-1} }, -1,
{ { 5, 25 }, {-1} } },
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 12, 21 }, {-1} }, -1,
{ { 10, 23 }, {-1} } },
// These check boundary cases during negation.
{ { { 0, Runemax }, {-1} }, -1,
{ { 0, Runemax }, {-1} } },
{ { { 0, 50 }, {-1} }, -1,
{ { 0, 50 }, {-1} } },
{ { { 50, Runemax }, {-1} }, -1,
{ { 50, Runemax }, {-1} } },
// Check RemoveAbove.
{ { { 50, Runemax }, {-1} }, 255,
{ { 50, 255 }, {-1} } },
{ { { 50, Runemax }, {-1} }, 65535,
{ { 50, 65535 }, {-1} } },
{ { { 50, Runemax }, {-1} }, Runemax,
{ { 50, Runemax }, {-1} } },
{ { { 50, 60 }, { 250, 260 }, { 350, 360 }, {-1} }, 255,
{ { 50, 60 }, { 250, 255 }, {-1} } },
{ { { 50, 60 }, {-1} }, 255,
{ { 50, 60 }, {-1} } },
{ { { 350, 360 }, {-1} }, 255,
{ {-1} } },
{ { {-1} }, 255,
{ {-1} } },
};
// Prints a failure report to stdout.  With a test case t, prints the
// description, the ranges that were added, the RemoveAbove value (if any)
// and the wanted ranges, followed by the ranges cc actually holds.  With
// t == NULL, prints only the description and the ranges of cc on one line.
template<class CharClass>
static void Broke(const char *desc, const CCTest* t, CharClass* cc) {
  if (t != NULL) {
    printf("\n");
    printf("CharClass added: [%s]", desc);
    int k = 0;
    while (t->add[k].lo >= 0) {
      printf(" %d-%d", t->add[k].lo, t->add[k].hi);
      k++;
    }
    printf("\n");
    if (t->remove >= 0)
      printf("Removed > %d\n", t->remove);
    printf("\twant:");
    k = 0;
    while (t->final[k].lo >= 0) {
      printf(" %d-%d", t->final[k].lo, t->final[k].hi);
      k++;
    }
    printf("\n");
    printf("\thave:");
  } else {
    printf("\t%s:", desc);
  }
  typename CharClass::iterator it = cc->begin();
  while (it != cc->end()) {
    printf(" %d-%d", it->lo, it->hi);
    ++it;
  }
  printf("\n");
}
// Reports whether x lies inside any of the expected final ranges of t.
bool ShouldContain(CCTest *t, int x) {
  int j = 0;
  while (t->final[j].lo >= 0) {
    const bool inside = (t->final[j].lo <= x) && (x <= t->final[j].hi);
    if (inside)
      return true;
    j++;
  }
  return false;
}
// Helpers to make templated CorrectCC work with both CharClass and CharClassBuilder.
// Each pair gives the two types a common Negate()/Delete() spelling.

// CharClass: returns the result of cc->Negate().
CharClass* Negate(CharClass *cc) {
return cc->Negate();
}
// CharClass: released through its own Delete() method.
void Delete(CharClass* cc) {
cc->Delete();
}
// CharClassBuilder: negation is applied to a copy, so cc itself is not
// modified; the copy is returned.
CharClassBuilder* Negate(CharClassBuilder* cc) {
CharClassBuilder* ncc = cc->Copy();
ncc->Negate();
return ncc;
}
// CharClassBuilder: released with plain delete.
void Delete(CharClassBuilder* cc) {
delete cc;
}
// Verifies that cc matches the expectations of test case t and prints a
// report (tagged with desc) on the first discrepancy.  Checks, in order:
// the exact range list, size(), Contains() for 0..99 and Runemax, and
// then the same membership probes plus size() on the negated class.
// Returns true if everything agreed.
template<class CharClass>
bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
  // Walk the expected ranges and the actual ranges in lockstep.
  typename CharClass::iterator it = cc->begin();
  int size = 0;
  for (int j = 0; t->final[j].lo >= 0; j++, ++it) {
    const bool same_range = it != cc->end() &&
                            it->lo == t->final[j].lo &&
                            it->hi == t->final[j].hi;
    if (!same_range) {
      Broke(desc, t, cc);
      return false;
    }
    size += it->hi - it->lo + 1;
  }
  // The class must not contain anything beyond the expected ranges.
  if (it != cc->end()) {
    Broke(desc, t, cc);
    return false;
  }
  if (cc->size() != size) {
    Broke(desc, t, cc);
    printf("wrong size: want %d have %d\n", size, cc->size());
    return false;
  }

  // Membership probes: 0..99, then one final probe at Runemax (the loop
  // index is overwritten on its last pass, which also ends the loop).
  for (int j = 0; j < 101; j++) {
    if (j == 100)
      j = Runemax;
    if (ShouldContain(t, j) == cc->Contains(j))
      continue;
    Broke(desc, t, cc);
    printf("want contains(%d)=%d, got %d\n",
           j, ShouldContain(t, j), cc->Contains(j));
    return false;
  }

  // The negated class must disagree with the expectation at every probe
  // and hold exactly the complementary number of runes.
  CharClass* ncc = Negate(cc);
  for (int j = 0; j < 101; j++) {
    if (j == 100)
      j = Runemax;
    if (ShouldContain(t, j) == ncc->Contains(j)) {
      Broke(desc, t, cc);
      Broke("ncc", NULL, ncc);
      printf("want ncc contains(%d)!=%d, got %d\n",
             j, ShouldContain(t, j), ncc->Contains(j));
      Delete(ncc);
      return false;
    }
    if (ncc->size() != Runemax+1 - cc->size()) {
      Broke(desc, t, cc);
      Broke("ncc", NULL, ncc);
      printf("ncc size should be %d is %d\n",
             Runemax+1 - cc->size(), ncc->size());
      Delete(ncc);
      return false;
    }
  }
  Delete(ncc);
  return true;
}
// Runs every table entry through a CharClassBuilder and validates the
// builder, the CharClass obtained from it, a Copy() of the builder, and a
// second CharClass obtained from the original builder after the copy.
TEST(TestCharClassBuilder, Adds) {
  int nfail = 0;
  for (int i = 0; i < arraysize(tests); i++) {
    CCTest* t = &tests[i];

    // Build the class described by the test case.
    CharClassBuilder ccb;
    int j = 0;
    while (t->add[j].lo >= 0) {
      ccb.AddRange(t->add[j].lo, t->add[j].hi);
      j++;
    }
    if (t->remove >= 0)
      ccb.RemoveAbove(t->remove);

    nfail += CorrectCC(&ccb, t, "before copy (CharClassBuilder)") ? 0 : 1;

    CharClass* cc = ccb.GetCharClass();
    nfail += CorrectCC(cc, t, "before copy (CharClass)") ? 0 : 1;
    cc->Delete();

    CharClassBuilder *ccb1 = ccb.Copy();
    nfail += CorrectCC(ccb1, t, "after copy (CharClassBuilder)") ? 0 : 1;

    // Taken from the original builder (ccb), not from the copy.
    cc = ccb.GetCharClass();
    nfail += CorrectCC(cc, t, "after copy (CharClass)") ? 0 : 1;
    cc->Delete();

    delete ccb1;
  }
  EXPECT_EQ(nfail, 0);
}
} // namespace re2

View File

@ -0,0 +1,171 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test prog.cc, compile.cc
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/regexp.h"
#include "re2/prog.h"
DEFINE_string(show, "", "regular expression to compile and dump");
namespace re2 {
// Simple input/output tests checking that
// the regexp compiles to the expected code.
// These are just to sanity check the basic implementation.
// The real confidence tests happen by testing the NFA/DFA
// that run the compiled code.
// One compile test case.
struct Test {
const char* regexp;  // pattern to parse and compile
const char* code;    // expected text of Prog::Dump() for that pattern
};
// Test table: each entry pairs a regexp with the exact program dump it is
// expected to compile to.  The dump is compared as a whole string, so the
// order and spelling of every line below matters.
static Test tests[] = {
{ "a",
"1. byte [61-61] -> 2\n"
"2. match! 0\n" },
{ "ab",
"1. byte [61-61] -> 2\n"
"2. byte [62-62] -> 3\n"
"3. match! 0\n" },
{ "a|c",
"3. alt -> 1 | 2\n"
"1. byte [61-61] -> 4\n"
"2. byte [63-63] -> 4\n"
"4. match! 0\n" },
{ "a|b",
"1. byte [61-62] -> 2\n"
"2. match! 0\n" },
{ "[ab]",
"1. byte [61-62] -> 2\n"
"2. match! 0\n" },
{ "a+",
"1. byte [61-61] -> 2\n"
"2. alt -> 1 | 3\n"
"3. match! 0\n" },
{ "a+?",
"1. byte [61-61] -> 2\n"
"2. alt -> 3 | 1\n"
"3. match! 0\n" },
{ "a*",
"2. alt -> 1 | 3\n"
"1. byte [61-61] -> 2\n"
"3. match! 0\n" },
{ "a*?",
"2. alt -> 3 | 1\n"
"3. match! 0\n"
"1. byte [61-61] -> 2\n" },
{ "a?",
"2. alt -> 1 | 3\n"
"1. byte [61-61] -> 3\n"
"3. match! 0\n" },
{ "a??",
"2. alt -> 3 | 1\n"
"3. match! 0\n"
"1. byte [61-61] -> 3\n" },
{ "a{4}",
"1. byte [61-61] -> 2\n"
"2. byte [61-61] -> 3\n"
"3. byte [61-61] -> 4\n"
"4. byte [61-61] -> 5\n"
"5. match! 0\n" },
{ "(a)",
"2. capture 2 -> 1\n"
"1. byte [61-61] -> 3\n"
"3. capture 3 -> 4\n"
"4. match! 0\n" },
{ "(?:a)",
"1. byte [61-61] -> 2\n"
"2. match! 0\n" },
{ "",
"2. match! 0\n" },
{ ".",
"3. alt -> 1 | 2\n"
"1. byte [00-09] -> 4\n"
"2. byte [0b-ff] -> 4\n"
"4. match! 0\n" },
{ "[^ab]",
"5. alt -> 3 | 4\n"
"3. alt -> 1 | 2\n"
"4. byte [63-ff] -> 6\n"
"1. byte [00-09] -> 6\n"
"2. byte [0b-60] -> 6\n"
"6. match! 0\n" },
{ "[Aa]",
"1. byte/i [61-61] -> 2\n"
"2. match! 0\n" },
};
// Parses and compiles every table entry (Perl syntax, Latin-1) and
// compares the program dump with the expected text.  Failures are logged
// and counted rather than aborting, so all entries are always tried.
TEST(TestRegexpCompileToProg, Simple) {
  int failed = 0;
  for (int i = 0; i < arraysize(tests); i++) {
    const re2::Test& t = tests[i];

    Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
    if (re == NULL) {
      LOG(ERROR) << "Cannot parse: " << t.regexp;
      failed++;
      continue;
    }

    Prog* prog = re->CompileToProg(0);
    if (prog == NULL) {
      LOG(ERROR) << "Cannot compile: " << t.regexp;
      re->Decref();
      failed++;
      continue;
    }

    // Compiling with an argument of 1 is expected to fail.
    CHECK(re->CompileToProg(1) == NULL);

    string s = prog->Dump();
    const bool as_expected = (s == t.code);
    if (!as_expected) {
      LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
      LOG(ERROR) << "Want:\n" << t.code;
      LOG(ERROR) << "Got:\n" << s;
      failed++;
    }

    delete prog;
    re->Decref();
  }
  EXPECT_EQ(failed, 0);
}
// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
// Once, erroneously split between 0x3f and 0x40 because it is
// a 6-bit boundary.
//
// The ranges are listed in increasing order and together cover every byte
// value 0x00-0xFF exactly once; the index of a range in this table is the
// byte class expected for each byte in it.  The third range starts at
// 0x0B (the first byte after '\n'), so that bytes 0x0B-0x0F are checked
// too instead of being silently skipped.
static struct UTF8ByteRange {
  int lo;  // first byte of the range (inclusive)
  int hi;  // last byte of the range (inclusive)
} utf8ranges[] = {
  { 0x00, 0x09 },
  { 0x0A, 0x0A },
  { 0x0B, 0x7F },
  { 0x80, 0x8F },
  { 0x90, 0x9F },
  { 0xA0, 0xBF },
  { 0xC0, 0xC1 },
  { 0xC2, 0xDF },
  { 0xE0, 0xE0 },
  { 0xE1, 0xEF },
  { 0xF0, 0xF0 },
  { 0xF1, 0xF3 },
  { 0xF4, 0xF4 },
  { 0xF5, 0xFF },
};
// Compiles "." and checks that the program's byte map has one class per
// table entry and maps every listed byte to the index of its range.
TEST(TestCompile, ByteRanges) {
  Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL);
  EXPECT_TRUE(re != NULL);
  Prog* prog = re->CompileToProg(0);
  EXPECT_TRUE(prog != NULL);
  EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges));

  for (int i = 0; i < arraysize(utf8ranges); i++) {
    const UTF8ByteRange& range = utf8ranges[i];
    for (int j = range.lo; j <= range.hi; j++) {
      EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
    }
  }

  delete prog;
  re->Decref();
}
} // namespace re2

View File

@ -0,0 +1,344 @@
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "util/thread.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/testing/regexp_generator.h"
#include "re2/testing/string_generator.h"
DECLARE_bool(re2_dfa_bail_when_slow);
DEFINE_int32(size, 8, "log2(number of DFA nodes)");
DEFINE_int32(repeat, 2, "Repetition count.");
DEFINE_int32(threads, 4, "number of threads");
namespace re2 {
// Check that multithreaded access to DFA class works.
// Helper thread: builds entire DFA for prog.
// Thread whose body builds the entire first-match DFA for one Prog and
// CHECK-fails if BuildEntireDFA() returns false.  The Prog pointer is
// stored as-is; nothing in this class deletes it.
class BuildThread : public Thread {
public:
BuildThread(Prog* prog) : prog_(prog) {}
// Thread body.
virtual void Run() {
CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch));
}
private:
Prog* prog_;  // program whose DFA is built
};
// Builds the whole DFA for a[ab]...[ab]b, first from a single helper
// thread and then from FLAGS_threads threads at once, FLAGS_repeat times.
TEST(Multithreaded, BuildEntireDFA) {
  // Create regexp with 2^FLAGS_size states in DFA.
  string s("a");
  for (int k = 0; k < FLAGS_size; k++)
    s.append("[ab]");
  s.append("b");

  // Check that single-threaded code works.
  {
    //LOG(INFO) << s;
    Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
    CHECK(re);
    Prog* prog = re->CompileToProg(0);
    CHECK(prog);

    BuildThread* lone = new BuildThread(prog);
    lone->SetJoinable(true);
    lone->Start();
    lone->Join();
    delete lone;

    delete prog;
    re->Decref();
  }

  // Build the DFA simultaneously in a bunch of threads.
  for (int round = 0; round < FLAGS_repeat; round++) {
    Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
    CHECK(re);
    Prog* prog = re->CompileToProg(0);
    CHECK(prog);

    // Create every thread before starting any, then start them all
    // before joining any, so that they really do run concurrently.
    vector<BuildThread*> threads;
    for (int j = 0; j < FLAGS_threads; j++) {
      BuildThread* worker = new BuildThread(prog);
      worker->SetJoinable(true);
      threads.push_back(worker);
    }
    for (int j = 0; j < FLAGS_threads; j++) {
      BuildThread* worker = threads[j];
      worker->Start();
    }
    for (int j = 0; j < FLAGS_threads; j++) {
      BuildThread* worker = threads[j];
      worker->Join();
      delete worker;
    }

    // One more compile, to make sure everything is okay.
    prog->BuildEntireDFA(Prog::kFirstMatch);

    delete prog;
    re->Decref();
  }
}
// Check that DFA size requirements are followed.
// BuildEntireDFA will, like SearchDFA, stop building out
// the DFA once the memory limits are reached.
// Compiles a regexp whose full DFA is far too large for the given budget,
// using budgets of 2^17 .. 2^23, and checks (when a malloc counter is
// available) that heap growth stays within 90% .. 100%+16kB of the budget.
TEST(SingleThreaded, BuildEntireDFA) {
  // Create regexp with 2^30 states in DFA.
  string s("a");
  for (int k = 0; k < 30; k++)
    s.append("[ab]");
  s.append("b");

  //LOG(INFO) << s;
  Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
  CHECK(re);

  const int max = 24;
  for (int i = 17; i < max; i++) {
    const int limit = 1<<i;
    int usage;
    //int progusage, dfamem;
    {
      // Measure heap growth from compilation through both DFA builds;
      // the reading is taken before prog is deleted.
      testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
      Prog* prog = re->CompileToProg(limit);
      CHECK(prog);
      //progusage = m.HeapGrowth();
      //dfamem = prog->dfa_mem();
      prog->BuildEntireDFA(Prog::kFirstMatch);
      prog->BuildEntireDFA(Prog::kLongestMatch);
      usage = m.HeapGrowth();
      delete prog;
    }
    if (UsingMallocCounter) {
      //LOG(INFO) << StringPrintf("Limit %d: prog used %d, DFA budget %d, total %d\n",
      //                          limit, progusage, dfamem, usage);
      CHECK_GT(usage, limit*9/10);
      CHECK_LT(usage, limit + (16<<10));  // 16kB of slop okay
    }
  }
  re->Decref();
}
// Generates and returns a string over binary alphabet {0,1} that contains
// all possible binary sequences of length n as (contiguous) substrings. The obvious
// brute force method would generate a string of length n * 2^n, but this
// generates a string of length n + 2^n - 1 called a De Bruijn cycle.
// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
// Such a string is useful for testing a DFA. If you have a DFA
// where distinct last n bytes implies distinct states, then running on a
// DeBruijn string causes the DFA to need to create a new state at every
// position in the input, never reusing any states until it gets to the
// end of the string. This is the worst possible case for DFA execution.
// Builds the string greedily: start from n-1 zeros, then 2^n times shift
// the current n-bit window left by one and append a 1 if the resulting
// window has not been seen yet, otherwise a 0.  CHECK-fails if a window
// would ever repeat.
static string DeBruijnString(int n) {
  CHECK_LT(n, 8*sizeof(int));
  CHECK_GT(n, 0);

  const int nwindows = 1<<n;
  vector<bool> did(nwindows, false);  // did[w]: window w already produced

  string s(n-1, '0');
  int bits = 0;                       // the last n characters, as a number
  const int mask = nwindows - 1;
  for (int i = 0; i < nwindows; i++) {
    bits = (bits << 1) & mask;
    const bool take_one = !did[bits|1];
    if (take_one)
      bits |= 1;
    s.append(take_one ? "1" : "0");
    CHECK(!did[bits]);
    did[bits] = true;
  }
  return s;
}
// Test that the DFA gets the right result even if it runs
// out of memory during a search. The regular expression
// 0[01]{n}$ matches a binary string of 0s and 1s only if
// the (n+1)th-to-last character is a 0. Matching this in
// a single forward pass (as done by the DFA) requires
// keeping one bit for each of the last n+1 characters
// (whether each was a 0), or 2^(n+1) possible states.
// If we run this regexp to search in a string that contains
// every possible n-character binary string as a substring,
// then it will have to run through at least 2^n states.
// States are big data structures -- certainly more than 1 byte --
// so if the DFA can search correctly while staying within a
// 2^n byte limit, it must be handling out-of-memory conditions
// gracefully.
// Searches a De Bruijn string with 0[01]{n}$ under a 2^n byte memory
// budget and checks that the DFA still answers correctly without
// reporting failure, and (when a malloc counter is available) that heap
// usage stayed below the budget.
TEST(SingleThreaded, SearchDFA) {
  // Choice of n is mostly arbitrary, except that:
  //   * making n too big makes the test run for too long.
  //   * making n too small makes the DFA refuse to run,
  //     because it has so little memory compared to the program size.
  // Empirically, n = 18 is a good compromise between the two.
  const int n = 18;

  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
                             Regexp::LikePerl, NULL);
  CHECK(re);

  // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
  // which is not a match for 0[01]{n}$.  Adding one more 0 is a match.
  string no_match = DeBruijnString(n);
  string match(no_match);
  match += "0";

  // The De Bruijn string is the worst case input for this regexp.
  // By default, the DFA will notice that it is flushing its cache
  // too frequently and will bail out early, so that RE2 can use the
  // NFA implementation instead.  (The DFA loses its speed advantage
  // if it can't get a good cache hit rate.)
  // Tell the DFA to trudge along instead.
  FLAGS_re2_dfa_bail_when_slow = false;

  int64 usage;
  int64 peak_usage;
  {
    testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
    Prog* prog = re->CompileToProg(1<<n);
    CHECK(prog);
    for (int round = 0; round < 10; round++) {
      bool failed = false;
      // The matching input must match ...
      bool matched = prog->SearchDFA(match, NULL,
                                     Prog::kUnanchored, Prog::kFirstMatch,
                                     NULL, &failed, NULL);
      CHECK(!failed);
      CHECK(matched);
      // ... and the non-matching input must not.
      matched = prog->SearchDFA(no_match, NULL,
                                Prog::kUnanchored, Prog::kFirstMatch,
                                NULL, &failed, NULL);
      CHECK(!failed);
      CHECK(!matched);
    }
    usage = m.HeapGrowth();
    peak_usage = m.PeakHeapGrowth();
    delete prog;
  }
  re->Decref();

  if (UsingMallocCounter) {
    //LOG(INFO) << "usage " << usage << " " << peak_usage;
    CHECK_LT(usage, 1<<n);
    CHECK_LT(peak_usage, 1<<n);
  }
}
// Helper thread: searches for match, which should match,
// and no_match, which should not.
class SearchThread : public Thread {
 public:
  // prog is borrowed, not owned.  match and no_match are stored as
  // StringPieces, so the underlying text must outlive the thread.
  SearchThread(Prog* prog, const StringPiece& match,
               const StringPiece& no_match)
      : prog_(prog), match_(match), no_match_(no_match) {}

  // Runs two rounds; each round searches match_ (must match) and then
  // no_match_ (must not match), and requires that the DFA did not fail.
  virtual void Run() {
    int rounds_left = 2;
    while (rounds_left-- > 0) {
      bool failed = false;

      bool matched = prog_->SearchDFA(match_, NULL,
                                      Prog::kUnanchored, Prog::kFirstMatch,
                                      NULL, &failed, NULL);
      CHECK(!failed);
      CHECK(matched);

      matched = prog_->SearchDFA(no_match_, NULL,
                                 Prog::kUnanchored, Prog::kFirstMatch,
                                 NULL, &failed, NULL);
      CHECK(!failed);
      CHECK(!matched);
    }
  }

 private:
  Prog* prog_;            // Program shared by all searching threads.
  StringPiece match_;     // Text expected to match.
  StringPiece no_match_;  // Text expected not to match.
};
TEST(Multithreaded, SearchDFA) {
  // Same setup as the single-threaded test above.
  const int n = 18;
  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
                             Regexp::LikePerl, NULL);
  CHECK(re);
  const string no_match = DeBruijnString(n);
  const string match = no_match + "0";
  FLAGS_re2_dfa_bail_when_slow = false;

  // First make sure a lone thread gets the right answers.
  {
    Prog* prog = re->CompileToProg(1<<n);
    CHECK(prog);
    SearchThread* t = new SearchThread(prog, match, no_match);
    t->SetJoinable(true);
    t->Start();
    t->Join();
    delete t;
    delete prog;
  }

  // Now run the search simultaneously in a bunch of threads, with a
  // freshly compiled program for every repetition.
  // Reuse same flags for Multithreaded.BuildDFA above.
  for (int round = 0; round < FLAGS_repeat; round++) {
    //LOG(INFO) << "Search " << round;
    Prog* prog = re->CompileToProg(1<<n);
    CHECK(prog);

    // Create every thread before starting any of them.
    vector<SearchThread*> threads;
    for (int j = 0; j < FLAGS_threads; j++) {
      threads.push_back(new SearchThread(prog, match, no_match));
      threads.back()->SetJoinable(true);
    }
    for (int j = 0; j < FLAGS_threads; j++)
      threads[j]->Start();

    // Wait for each thread and release it.
    for (int j = 0; j < FLAGS_threads; j++) {
      SearchThread* done = threads[j];
      done->Join();
      delete done;
    }
    delete prog;
  }
  re->Decref();
}
// One reverse-DFA test case: a pattern, an input text, and whether the
// search is expected to report a match.
struct ReverseTest {
  const char* regexp;  // Pattern, parsed with Regexp::LikePerl.
  const char* text;    // Text to search.
  bool match;          // Expected SearchDFA result.
};
// Cases checking that the reverse DFA treats anchored and unanchored
// patterns correctly.  Reverse matching is part of the DFA interface
// but is not used by RE2 itself.
ReverseTest reverse_tests[] = {
  // regexp       text   match
  { "\\A(a|b)", "abc", true  },
  { "(a|b)\\z", "cba", true  },
  { "\\A(a|b)", "cba", false },
  { "(a|b)\\z", "abc", false },
};
TEST(DFA, ReverseMatch) {
  // Run every table entry through a reverse program and count the
  // entries whose result differs from the expectation.
  int nfail = 0;
  for (int i = 0; i < arraysize(reverse_tests); i++) {
    const ReverseTest& t = reverse_tests[i];

    Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
    CHECK(re);
    Prog *prog = re->CompileToReverseProg(0);
    CHECK(prog);

    // Note: the DFA failure flag is collected but not inspected here.
    bool failed = false;
    const bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored,
                                         Prog::kFirstMatch, NULL, &failed,
                                         NULL);

    // The table strings are static, so it is safe to release the
    // program and regexp before reporting.
    delete prog;
    re->Decref();

    if (matched != t.match) {
      LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
      nfail++;
    }
  }
  EXPECT_EQ(nfail, 0);
}
} // namespace re2

View File

@ -0,0 +1,164 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Dump the regexp into a string showing structure.
// Tested by parse_unittest.cc
// This function traverses the regexp recursively,
// meaning that on inputs like Regexp::Simplify of
// a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100},
// it takes time and space exponential in the size of the
// original regular expression. It can also use stack space
// linear in the size of the regular expression for inputs
// like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*.
// IT IS NOT SAFE TO CALL FROM PRODUCTION CODE.
// As a result, Dump is provided only in the testing
// library (see BUILD).
#include <string>
#include <vector>
#include "util/test.h"
#include "re2/stringpiece.h"
#include "re2/regexp.h"
// Cause a link error if this file is used outside of testing.
DECLARE_string(test_tmpdir);
namespace re2 {
// Short names used by the dumper, indexed by the numeric value of
// re->op().  The order of the entries is significant.
static const char* kOpcodeNames[] = {
  "bad",    //  0
  "no",     //  1
  "emp",    //  2
  "lit",    //  3
  "str",    //  4
  "cat",    //  5
  "alt",    //  6
  "star",   //  7
  "plus",   //  8
  "que",    //  9
  "rep",    // 10
  "cap",    // 11
  "dot",    // 12
  "byte",   // 13
  "bol",    // 14
  "eol",    // 15
  "wb",     // 16: kRegexpWordBoundary
  "nwb",    // 17: kRegexpNoWordBoundary
  "bot",    // 18
  "eot",    // 19
  "cc",     // 20
  "match",  // 21
};
// Create string representation of regexp with explicit structure.
// Nothing pretty, just for testing.
static void DumpRegexpAppending(Regexp* re, string* s) {
if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) {
StringAppendF(s, "op%d", re->op());
} else {
switch (re->op()) {
default:
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
if (re->parse_flags() & Regexp::NonGreedy)
s->append("n");
break;
}
s->append(kOpcodeNames[re->op()]);
if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) {
Rune r = re->rune();
if ('a' <= r && r <= 'z')
s->append("fold");
}
if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) {
for (int i = 0; i < re->nrunes(); i++) {
Rune r = re->runes()[i];
if ('a' <= r && r <= 'z') {
s->append("fold");
break;
}
}
}
}
s->append("{");
switch (re->op()) {
default:
break;
case kRegexpEndText:
if (!(re->parse_flags() & Regexp::WasDollar)) {
s->append("\\z");
}
break;
case kRegexpLiteral: {
Rune r = re->rune();
char buf[UTFmax+1];
buf[runetochar(buf, &r)] = 0;
s->append(buf);
break;
}
case kRegexpLiteralString:
for (int i = 0; i < re->nrunes(); i++) {
Rune r = re->runes()[i];
char buf[UTFmax+1];
buf[runetochar(buf, &r)] = 0;
s->append(buf);
}
break;
case kRegexpConcat:
case kRegexpAlternate:
for (int i = 0; i < re->nsub(); i++)
DumpRegexpAppending(re->sub()[i], s);
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
DumpRegexpAppending(re->sub()[0], s);
break;
case kRegexpCapture:
if (re->name()) {
s->append(*re->name());
s->append(":");
}
DumpRegexpAppending(re->sub()[0], s);
break;
case kRegexpRepeat:
s->append(StringPrintf("%d,%d ", re->min(), re->max()));
DumpRegexpAppending(re->sub()[0], s);
break;
case kRegexpCharClass: {
string sep;
for (CharClass::iterator it = re->cc()->begin();
it != re->cc()->end(); ++it) {
RuneRange rr = *it;
s->append(sep);
if (rr.lo == rr.hi)
s->append(StringPrintf("%#x", rr.lo));
else
s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi));
sep = " ";
}
break;
}
}
s->append("}");
}
// Returns the structural dump of this regexp.  Outside of a unit test
// (detected by an empty --test_tmpdir) it logs an error and returns an
// empty string instead.
string Regexp::Dump() {
  string dump;
  if (!FLAGS_test_tmpdir.empty()) {
    DumpRegexpAppending(this, &dump);
  } else {
    LOG(ERROR) << "Cannot use except for testing.";
  }
  return dump;
}
} // namespace re2

View File

@ -0,0 +1,42 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
DECLARE_string(regexp_engines);
namespace re2 {
// Test simple repetition operators applied to plain atoms.
TEST(Repetition, Simple) {
  // Counted, greedy and non-greedy repetition forms.
  const vector<string> ops = Split(" ",
      "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
      "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
      "%s* %s+ %s? %s*? %s+? %s??");
  const vector<string> atoms = Explode("abc.");

  // Short strings over a two-letter alphabet...
  ExhaustiveTest(3, 2, atoms, ops,
                 6, Explode("ab"), "(?:%s)", "");
  // ...and much longer strings over a single letter.
  ExhaustiveTest(3, 2, atoms, ops,
                 40, Explode("a"), "(?:%s)", "");
}
// Test capturing parens -- (a) -- inside repetition operators
TEST(Repetition, Capturing) {
  vector<string> ops = Split(" ",
      "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
      "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
      "%s* %s+ %s? %s*? %s+? %s??");
  ExhaustiveTest(3, 2, Split(" ", "a (a) b"), ops,
                 7, Explode("ab"), "(?:%s)", "");

  // This would be a great test, but it runs forever when PCRE is enabled,
  // so it only runs when the engine list rules PCRE out.
  //
  // strstr takes (haystack, needle): the flag value is the text to be
  // searched and "PCRE" is what we look for.  The previous code had the
  // two swapped, so a list such as "NFA,PCRE" still ran the slow test.
  //
  // An empty flag is treated as "PCRE may be enabled", which is what the
  // swapped call did in effect (an empty needle always matches).
  // NOTE(review): presumably an empty list selects every engine -- confirm
  // against the tester's flag handling.
  const string& engines = FLAGS_regexp_engines;
  const bool pcre_possible =
      engines.empty() || strstr(engines.c_str(), "PCRE") != NULL;
  if (!pcre_possible)
    ExhaustiveTest(4, 3, Split(" ", "a (a)"), ops,
                   100, Explode("a"), "(?:%s)", "");
}
} // namespace re2

View File

@ -0,0 +1,70 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include "util/test.h"
#include "re2/re2.h"
#include "re2/testing/exhaustive_tester.h"
DECLARE_string(regexp_engines);
namespace re2 {
// Test empty string matches (aka "(?:)")
TEST(EmptyString, Exhaustive) {
  // Atoms: the explicit empty regexp and a single literal.
  const vector<string> atoms = Split(" ", "(?:) a");
  // Input strings are built from the letters a and b.
  const vector<string> letters = Split("", "ab");
  ExhaustiveTest(2, 2, atoms, RegexpGenerator::EgrepOps(),
                 5, letters, "", "");
}
// Test escaped versions of regexp syntax: every punctuation character
// is used as a backslash-escaped atom and matched against strings made
// of the unescaped characters.
TEST(Punctuation, Literals) {
  const vector<string> alphabet = Explode("()*+?{}[]\\^$.");

  vector<string> escaped;
  for (int i = 0; i < alphabet.size(); i++)
    escaped.push_back("\\" + alphabet[i]);

  ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(),
                 2, alphabet, "", "");
}
// Test ^ $ . \A \z in presence of line endings.
// The empty-width operators are wrapped in (?:) so that they can be
// repeated -- PCRE rejects ^* but allows (?:^)*
TEST(LineEnds, Exhaustive) {
  const vector<string> atoms =
      Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)");
  // Input strings include newlines alongside ordinary letters.
  const vector<string> letters = Explode("ab\n");
  ExhaustiveTest(2, 2, atoms, RegexpGenerator::EgrepOps(),
                 4, letters, "", "");
}
// Test what does and does not match \n.
// This would be a good test, except that PCRE seems to have a bug:
// in single-byte character set mode (the default),
// [^a] matches \n, but in UTF-8 mode it does not.
// So when we run the test, the tester complains that
// we don't agree with PCRE, but it's PCRE that is at fault.
// For what it's worth, Perl gets this right (matches
// regardless of whether UTF-8 input is selected):
//
// #!/usr/bin/perl
// use POSIX qw(locale_h);
// print "matches in latin1\n" if "\n" =~ /[^a]/;
// setlocale("en_US.utf8");
// print "matches in utf8\n" if "\n" =~ /[^a]/;
//
// The rule chosen for RE2 is that by default, like Perl,
// dot does not match \n but negated character classes [^a] do.
// (?s) will allow dot to match \n; there is no way in RE2
// to stop [^a] from matching \n, though the underlying library
// provides a mechanism, and RE2 could add new syntax if needed.
//
// TEST(Newlines, Exhaustive) {
// vector<string> empty_vector;
// ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
// RegexpGenerator::EgrepOps(),
// 4, Explode("a\n"), "");
// }
} // namespace re2

View File

@ -0,0 +1,94 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {
// Test simple character classes by themselves.
TEST(CharacterClasses, Exhaustive) {
  // Positive, negated and range classes, classes containing ] or -,
  // plus plain literals and dot.
  const vector<string> atoms = Split(" ",
      "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
  const vector<string> letters = Explode("ab");
  ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
                 5, letters, "", "");
}
// Test simple character classes inside a___b (for example, a[a]b).
TEST(CharacterClasses, ExhaustiveAB) {
  // Same atoms as CharacterClasses.Exhaustive; only the wrapper differs.
  const vector<string> atoms = Split(" ",
      "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
  const vector<string> letters = Explode("ab");
  ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
                 5, letters, "a%sb", "");
}
// Returns the UTF-8 encoding of Rune r as a string.
// The result is built from a NUL-terminated buffer, so it ends at the
// first NUL byte.
static string UTF8(Rune r) {
  char encoded[UTFmax+1];
  const int len = runetochar(encoded, &r);
  encoded[len] = 0;
  return string(encoded);
}
// Returns a vector of "interesting" UTF8 characters.
// Unicode is now too big to just return all of them, so this returns
// a set likely to be good test cases.  The vector is built on the
// first call and the same object is returned afterwards.
static const vector<string>& InterestingUTF8() {
  static bool init;
  static vector<string> v;

  if (!init) {
    init = true;

    // All the Latin1 equivalents are interesting.
    for (int r = 1; r < 256; r++)
      v.push_back(UTF8(r));

    // After that, the codes near bit boundaries are interesting,
    // because they span byte sequence lengths.
    for (int off = 0; off < 8; off++)
      v.push_back(UTF8(256 + off));
    for (int base = 512; base < Runemax; base <<= 1) {
      for (int off = -8; off < 8; off++)
        v.push_back(UTF8(base + off));
    }

    // The codes near Runemax, including Runemax itself, are interesting.
    for (int off = -8; off <= 0; off++)
      v.push_back(UTF8(Runemax + off));
  }
  return v;
}
// Test interesting UTF-8 characters against character classes.
TEST(InterestingUTF8, SingleOps) {
  // Single-character atoms: dot, anchors, escapes, Perl classes,
  // POSIX classes, and a few classes matching everything or nothing.
  const vector<string> atoms = Split(" ",
      ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
      "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
      "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
      "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
  const vector<string> no_ops;  // atoms are tested without operators
  ExhaustiveTest(1, 0, atoms, no_ops,
                 1, InterestingUTF8(), "", "");
}
// Test interesting UTF-8 characters against character classes,
// but wrap everything inside AB: each regexp becomes a<atom>b and
// each input string becomes a<char>b.
TEST(InterestingUTF8, AB) {
  const vector<string> atoms = Split(" ",
      ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
      "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
      "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
      "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
  const vector<string> no_ops;  // atoms are tested without operators

  // Surround every interesting character with a...b.
  const vector<string>& chars = InterestingUTF8();
  vector<string> alpha;
  for (int i = 0; i < chars.size(); i++)
    alpha.push_back("a" + chars[i] + "b");

  ExhaustiveTest(1, 0, atoms, no_ops,
                 1, alpha, "a%sb", "");
}
} // namespace re2

View File

@ -0,0 +1,38 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
namespace re2 {
DECLARE_string(regexp_engines);
// Test very simple expressions: lowercase letters and dot.
TEST(EgrepLiterals, Lowercase) {
  const string regexp_alphabet = "abc.";
  const string string_alphabet = "abc";
  EgrepTest(3, 2, regexp_alphabet, 3, string_alphabet, "");
}
// Test mixed-case expressions (no case folding).
TEST(EgrepLiterals, MixedCase) {
  const string regexp_alphabet = "AaBb.";
  const string string_alphabet = "AaBb";
  EgrepTest(3, 2, regexp_alphabet, 2, string_alphabet, "");
}
// Test mixed-case in case-insensitive mode.
TEST(EgrepLiterals, FoldCase) {
  const string regexp_alphabet = "abAB.";
  // The punctuation characters surround A-Z and a-z in the ASCII
  // table, which looks for bugs in the bytemap range code in the DFA.
  const string string_alphabet = "aBc@_~";
  EgrepTest(3, 2, regexp_alphabet, 2, string_alphabet, "(?i:%s)");
}
// Test very simple expressions against text containing a multi-byte
// UTF-8 sequence (the bytes E2 98 BA).
TEST(EgrepLiterals, UTF8) {
  const string regexp_alphabet = "ab.";
  const string string_alphabet = "a\xE2\x98\xBA";
  EgrepTest(3, 2, regexp_alphabet, 4, string_alphabet, "");
}
} // namespace re2

View File

@ -0,0 +1,188 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Exhaustive testing of regular expression matching.
// Each test picks an alphabet (e.g., "abc"), a maximum string length,
// a maximum regular expression length, and a maximum number of letters
// that can appear in the regular expression. Given these parameters,
// it tries every possible regular expression and string, verifying that
// the NFA, DFA, and a trivial backtracking implementation agree about
// the location of the match.
#include <stdlib.h>
#include <stdio.h>
#ifndef LOGGING
#define LOGGING 0
#endif
#include "util/test.h"
#include "re2/testing/exhaustive_tester.h"
#include "re2/testing/tester.h"
// When set, HandleRegexp prints each generated regexp to stdout
// (prefixed with a carriage return) before testing it.
DEFINE_bool(show_regexps, false, "show regexps during testing");
// Upper bound on failing inputs per regexp: HandleRegexp stops feeding
// strings to a regexp once this many of them have failed.
DEFINE_int32(max_bad_regexp_inputs, 1,
"Stop testing a regular expression after finding this many "
"strings that break it.");
// Compiled in debug mode, the usual tests run for over an hour.
// Have to cut it down to make the unit test machines happy.
// When set (and DEBUG_MODE is on), ExhaustiveTest lowers each of its
// size limits by one.
DEFINE_bool(quick_debug_mode, true, "Run fewer tests in debug mode.");
namespace re2 {
// Returns sp as a double-quoted string in which backslash and double
// quote are backslash-escaped and newline is written as \n.
// The result lives in a static buffer that is overwritten by the next
// call; input that does not fit is a fatal error.
static char* escape(const StringPiece& sp) {
  static char buf[512];
  char* const limit = buf + sizeof buf;
  char* out = buf;

  *out++ = '\"';
  for (int i = 0; i < sp.size(); i++) {
    // Leave room for an escaped character plus the closing quote and NUL.
    if (out + 5 >= limit)
      LOG(FATAL) << "ExhaustiveTester escape: too long";

    const char c = sp[i];
    switch (c) {
      case '\\':
      case '\"':
        *out++ = '\\';
        *out++ = c;
        break;
      case '\n':
        *out++ = '\\';
        *out++ = 'n';
        break;
      default:
        *out++ = c;
        break;
    }
  }
  *out++ = '\"';
  *out = '\0';
  return buf;
}
// Prints the outcome of matching re against input with the given
// anchoring: "-" when there is no match, otherwise one space-separated
// entry per group in m[0..n), each either "begin-end" (offsets into
// input) or "-" for a group whose begin() is NULL.
static void PrintResult(const RE2& re, const StringPiece& input,
                        RE2::Anchor anchor, StringPiece *m, int n) {
  const bool matched = re.Match(input, 0, input.size(), anchor, m, n);
  if (matched) {
    for (int g = 0; g < n; g++) {
      if (g > 0)
        printf(" ");
      if (m[g].begin() != NULL) {
        const int from = static_cast<int>(m[g].begin() - input.begin());
        const int to = static_cast<int>(m[g].end() - input.begin());
        printf("%d-%d", from, to);
      } else {
        printf("-");
      }
    }
  } else {
    printf("-");
  }
}
// Processes a single generated regexp.
// In the normal mode the regexp is handed to a Tester, which is run on
// every string produced by the string generator; failures are counted.
// When LOGGING is enabled the regexp and the RE2 match results for each
// string are printed instead, and no Tester is run.
void ExhaustiveTester::HandleRegexp(const string& const_regexp) {
  regexps_++;

  // Apply the top-level wrapper, if there is one.
  string regexp = const_regexp;
  if (!topwrapper_.empty())
    regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str());

  if (FLAGS_show_regexps) {
    printf("\r%s", regexp.c_str());
    fflush(stdout);
  }

  if (LOGGING) {
    // Write out test cases and answers for use in testing
    // other implementations, such as Go's regexp package.
    if (randomstrings_)
      LOG(ERROR) << "Cannot log with random strings.";

    // The list of input strings is emitted once, ahead of the first regexp.
    const bool first_regexp = (regexps_ == 1);
    if (first_regexp) {
      printf("strings\n");
      for (strgen_.Reset(); strgen_.HasNext(); )
        printf("%s\n", escape(strgen_.Next()));
      printf("regexps\n");
    }
    printf("%s\n", escape(regexp));

    RE2 re(regexp);
    RE2::Options longest;
    longest.set_longest_match(true);
    RE2 relongest(regexp, longest);

    const int ngroup = re.NumberOfCapturingGroups()+1;
    StringPiece* group = new StringPiece[ngroup];

    // One output line per input string, with four ';'-separated results:
    // {first match, longest match} x {anchored at both ends, unanchored}.
    for (strgen_.Reset(); strgen_.HasNext(); ) {
      const StringPiece input = strgen_.Next();
      PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup);
      printf(";");
      PrintResult(re, input, RE2::UNANCHORED, group, ngroup);
      printf(";");
      PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup);
      printf(";");
      PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup);
      printf("\n");
    }
    delete[] group;
    return;
  }

  Tester tester(regexp);
  if (tester.error())
    return;

  strgen_.Reset();
  strgen_.GenerateNULL();
  if (randomstrings_)
    strgen_.Random(stringseed_, stringcount_);

  // Stop early once this regexp has failed on enough inputs.
  for (int bad_inputs = 0; strgen_.HasNext(); ) {
    tests_++;
    if (tester.TestInput(strgen_.Next()))
      continue;
    failures_++;
    if (++bad_inputs >= FLAGS_max_bad_regexp_inputs)
      break;
  }
}
// Runs an exhaustive test on the given parameters and expects zero
// failures.  In debug builds with --quick_debug_mode, every size limit
// greater than one is reduced by one to keep the run time down.
void ExhaustiveTest(int maxatoms, int maxops,
                    const vector<string>& alphabet,
                    const vector<string>& ops,
                    int maxstrlen, const vector<string>& stralphabet,
                    const string& wrapper,
                    const string& topwrapper) {
  const bool shrink = DEBUG_MODE && FLAGS_quick_debug_mode;
  if (shrink) {
    maxatoms -= (maxatoms > 1) ? 1 : 0;
    maxops -= (maxops > 1) ? 1 : 0;
    maxstrlen -= (maxstrlen > 1) ? 1 : 0;
  }

  ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
                     maxstrlen, stralphabet, wrapper,
                     topwrapper);
  t.Generate();

  // The summary line is suppressed in LOGGING mode.
  if (!LOGGING) {
    const int nletters = static_cast<int>(stralphabet.size());
    printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
           t.regexps(), t.tests(), t.failures(), maxstrlen, nletters);
  }
  EXPECT_EQ(0, t.failures());
}
// Runs an exhaustive test using the given parameters and
// the basic egrep operators.
void EgrepTest(int maxatoms, int maxops, const string& alphabet,
int maxstrlen, const string& stralphabet,
const string& wrapper) {
const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" };
for (int i = 0; i < arraysize(tops); i++) {
ExhaustiveTest(maxatoms, maxops,
Split("", alphabet),
RegexpGenerator::EgrepOps(),
maxstrlen,
Split("", stralphabet),
wrapper,
tops[i]);
}
}
} // namespace re2

View File

@ -0,0 +1,85 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H__
#define RE2_TESTING_EXHAUSTIVE_TESTER_H__
#include <string>
#include <vector>
#include "util/util.h"
#include "re2/testing/regexp_generator.h"
#include "re2/testing/string_generator.h"
namespace re2 {
// Exhaustive regular expression test: generate all regexps within parameters,
// then generate all strings of a given length over a given alphabet,
// then check that NFA, DFA, and PCRE agree about whether each regexp matches
// each possible string, and if so, where the match is.
//
// Can also be used in a "random" mode that generates a given number
// of random regexp and strings, allowing testing of larger expressions
// and inputs.
class ExhaustiveTester : public RegexpGenerator {
 public:
  // maxatoms, maxops, alphabet and ops are passed to the regexp
  // generator; maxstrlen and stralphabet configure the string
  // generator.  All counters start at zero and random-string mode
  // starts disabled.
  ExhaustiveTester(int maxatoms,
                   int maxops,
                   const vector<string>& alphabet,
                   const vector<string>& ops,
                   int maxstrlen,
                   const vector<string>& stralphabet,
                   const string& wrapper,
                   const string& topwrapper)
      : RegexpGenerator(maxatoms, maxops, alphabet, ops),
        strgen_(maxstrlen, stralphabet),
        wrapper_(wrapper),
        topwrapper_(topwrapper),
        regexps_(0),
        tests_(0),
        failures_(0),
        randomstrings_(false),
        stringseed_(0),
        stringcount_(0) {
  }

  // Accessors for the running totals.
  int regexps() { return regexps_; }
  int tests() { return tests_; }
  int failures() { return failures_; }

  // Needed for RegexpGenerator interface.
  void HandleRegexp(const string& regexp);

  // Causes testing to generate random input strings:
  // count strings produced from the given seed.
  void RandomStrings(int32 seed, int32 count) {
    randomstrings_ = true;
    stringseed_ = seed;
    stringcount_ = count;
  }

 private:
  StringGenerator strgen_;
  string wrapper_;      // Regexp wrapper - either empty or has one %s.
  string topwrapper_;   // Regexp top-level wrapper.
  int regexps_;         // Number of HandleRegexp calls
  int tests_;           // Number of regexp tests.
  int failures_;        // Number of tests failed.
  bool randomstrings_;  // Whether to use random strings
  int32 stringseed_;    // If so, the seed.
  int stringcount_;     // If so, how many to generate.

  DISALLOW_EVIL_CONSTRUCTORS(ExhaustiveTester);
};
// Runs an exhaustive test on the given parameters.
// The arguments are forwarded to an ExhaustiveTester (see the
// constructor above for their roles).
void ExhaustiveTest(int maxatoms, int maxops,
const vector<string>& alphabet,
const vector<string>& ops,
int maxstrlen, const vector<string>& stralphabet,
const string& wrapper,
const string& topwrapper);
// Runs an exhaustive test using the given parameters and
// the basic egrep operators.
// Here the alphabets are given as plain strings rather than vectors.
void EgrepTest(int maxatoms, int maxops, const string& alphabet,
int maxstrlen, const string& stralphabet,
const string& wrapper);
} // namespace re2
#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H__

View File

@ -0,0 +1,275 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "re2/filtered_re2.h"
#include "re2/re2.h"
DECLARE_int32(filtered_re2_min_atom_len); // From prefilter_tree.cc
namespace re2 {
// Bundle of the objects a FilteredRE2 test works with.
struct FilterTestVars {
  vector<string> atoms;      // Filled in by FilteredRE2::Compile.
  vector<int> atom_indices;  // Atom indices passed to AllMatches.
  vector<int> matches;       // Output of AllMatches.
  RE2::Options opts;         // Options used when adding regexps.
  FilteredRE2 f;             // The object under test.
};
// With no regexps added, AllMatches must report no matches.
TEST(FilteredRE2Test, EmptyTest) {
  FilterTestVars v;

  v.f.AllMatches("foo", v.atom_indices, &v.matches);

  EXPECT_EQ(0, v.matches.size());
}
// With a minimum atom length of 4, the three-letter alternatives of
// (foo|bar) yield no atoms; the regexp must still be reported as a
// match for text containing "bar".
TEST(FilteredRE2Test, SmallOrTest) {
  FLAGS_filtered_re2_min_atom_len = 4;

  FilterTestVars v;
  int id;
  v.f.Add("(foo|bar)", v.opts, &id);

  v.f.Compile(&v.atoms);
  EXPECT_EQ(0, v.atoms.size());

  v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches);
  EXPECT_EQ(1, v.matches.size());
  EXPECT_EQ(id, v.matches[0]);
}
// Non-UTF-8 (Latin-1) pattern: the single expected atom has the ASCII
// letter lowercased while the high bytes are left unchanged.
TEST(FilteredRE2Test, SmallLatinTest) {
  FLAGS_filtered_re2_min_atom_len = 3;

  FilterTestVars v;
  v.opts.set_utf8(false);

  int id;
  v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id);

  v.f.Compile(&v.atoms);
  EXPECT_EQ(1, v.atoms.size());
  EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef");

  // Report atom 0 as present in the text.
  v.atom_indices.push_back(0);
  v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches);
  EXPECT_EQ(1, v.matches.size());
  EXPECT_EQ(id, v.matches[0]);
}
// One atom-extraction test case: the regexps to add and the atoms that
// compiling them is expected to produce.  Unused array slots are NULL,
// which is how the tests find the end of each list.
struct AtomTest {
  const char* testname;

  // If any test needs more than this many regexps or atoms, increase
  // the size of the corresponding array.
  const char* regexps[20];
  const char* atoms[20];
};
// Table of atom-extraction cases.  Each entry is
// { testname, { regexps... }, { expected atoms... } }.
// Some tests below refer to entries by index (0 and 2) and verify the
// test name, so the order of the entries matters.
AtomTest atom_tests[] = {
{
// This test checks to make sure empty patterns are allowed.
"CheckEmptyPattern",
{""},
{}
}, {
// This test checks that all atoms of length greater than min length
// are found, and no atoms that are of smaller length are found.
"AllAtomsGtMinLengthFound", {
"(abc123|def456|ghi789).*mnop[x-z]+",
"abc..yyy..zz",
"mnmnpp[a-z]+PPP"
}, {
"abc123",
"def456",
"ghi789",
"mnop",
"abc",
"yyy",
"mnmnpp",
"ppp"
}
}, {
// Test to make sure that any atoms that have another atom as a
// substring in an OR are removed; that is, only the shortest
// substring is kept.
"SubstrAtomRemovesSuperStrInOr", {
"(abc123|abc|ghi789|abc1234).*[x-z]+",
"abcd..yyy..yyyzzz",
"mnmnpp[a-z]+PPP"
}, {
"abc",
"ghi789",
"abcd",
"yyy",
"yyyzzz",
"mnmnpp",
"ppp"
}
}, {
// Test character class expansion.
"CharClassExpansion", {
"m[a-c][d-f]n.*[x-z]+",
"[x-y]bcde[ab]"
}, {
"madn", "maen", "mafn",
"mbdn", "mben", "mbfn",
"mcdn", "mcen", "mcfn",
"xbcdea", "xbcdeb",
"ybcdea", "ybcdeb"
}
}, {
// Test upper/lower of non-ASCII.
"UnicodeLower", {
"(?i)ΔδΠϖπΣςσ",
"ΛΜΝΟΠ",
"ψρστυ",
}, {
"δδπππσσσ",
"λμνοπ",
"ψρστυ",
},
},
};
// Adds the first n entries of regexps to v->f using v->opts, then
// compiles the filter, storing the resulting atoms in v->atoms.
// The ids assigned by Add are discarded.
void AddRegexpsAndCompile(const char* regexps[],
                          int n,
                          struct FilterTestVars* v) {
  int k = 0;
  while (k < n) {
    int unused_id;
    v->f.Add(regexps[k], v->opts, &unused_id);
    k++;
  }
  v->f.Compile(&v->atoms);
}
// Returns whether v->atoms holds exactly the n strings in atoms,
// ignoring order.  Sorts v->atoms in place as a side effect.  On a
// mismatch, logs the test name and both atom lists as warnings.
bool CheckExpectedAtoms(const char* atoms[],
                        int n,
                        const char* testname,
                        struct FilterTestVars* v) {
  vector<string> expected(atoms, atoms + n);

  // Compare sizes first, then the sorted contents element by element.
  bool pass = expected.size() == v->atoms.size();
  sort(v->atoms.begin(), v->atoms.end());
  sort(expected.begin(), expected.end());
  for (int i = 0; i < n && pass; i++) {
    if (!(expected[i] == v->atoms[i]))
      pass = false;
  }

  if (pass)
    return true;

  LOG(WARNING) << "Failed " << testname;
  LOG(WARNING) << "Expected #atoms = " << expected.size();
  for (int i = 0; i < expected.size(); i++)
    LOG(WARNING) << expected[i];
  LOG(WARNING) << "Found #atoms = " << v->atoms.size();
  for (int i = 0; i < v->atoms.size(); i++)
    LOG(WARNING) << v->atoms[i];
  return false;
}
// Runs every entry of atom_tests: adds its regexps to a fresh filter,
// compiles, and compares the atoms found with the expected ones.
TEST(FilteredRE2Test, AtomTests) {
  FLAGS_filtered_re2_min_atom_len = 3;

  int nfail = 0;
  for (int i = 0; i < arraysize(atom_tests); i++) {
    FilterTestVars v;
    AtomTest* t = &atom_tests[i];

    // The lists end at the first NULL slot, or at the array size.
    int nregexp = 0;
    while (nregexp < arraysize(t->regexps) && t->regexps[nregexp] != NULL)
      nregexp++;
    int natom = 0;
    while (natom < arraysize(t->atoms) && t->atoms[natom] != NULL)
      natom++;

    AddRegexpsAndCompile(t->regexps, nregexp, &v);
    const bool ok = CheckExpectedAtoms(t->atoms, natom, t->testname, &v);
    if (!ok)
      nfail++;
  }
  EXPECT_EQ(0, nfail);
}
// Fills *atom_indices with, for each string in matched_atoms (in
// order), the index of its first occurrence in atoms.  Any previous
// contents of *atom_indices are discarded.
// Every matched atom is expected to be present in atoms.
void FindAtomIndices(const vector<string> atoms,
                     const vector<string> matched_atoms,
                     vector<int>* atom_indices) {
  atom_indices->clear();
  for (int i = 0; i < matched_atoms.size(); i++) {
    int j = 0;
    for (; j < atoms.size(); j++) {
      if (matched_atoms[i] == atoms[j]) {
        atom_indices->push_back(j);
        break;
      }
    }
    // This check must come after the search loop: inside the loop the
    // loop condition already guarantees j < atoms.size(), so it could
    // never fail.  Here j == atoms.size() means the atom was not found.
    EXPECT_LT(j, atoms.size());
  }
}
// The empty pattern must be reported by FirstMatch (as regexp 0) even
// when no atoms are reported as present.
TEST(FilteredRE2Test, MatchEmptyPattern) {
  FLAGS_filtered_re2_min_atom_len = 3;

  FilterTestVars v;
  AtomTest* t = &atom_tests[0];
  // We are using the regexps used in one of the atom tests
  // for this test. Adding the EXPECT here to make sure
  // the index we use for the test is for the correct test.
  EXPECT_EQ("CheckEmptyPattern", string(t->testname));

  // The regexp list ends at the first NULL slot.
  int nregexp = 0;
  while (nregexp < arraysize(t->regexps) && t->regexps[nregexp] != NULL)
    nregexp++;
  AddRegexpsAndCompile(t->regexps, nregexp, &v);

  string text = "0123";
  vector<int> atom_ids;
  vector<int> matching_regexps;
  EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids));
}
// Checks AllMatches on the regexps of the
// "SubstrAtomRemovesSuperStrInOr" atom test, for three texts with
// different sets of atoms reported as present.
TEST(FilteredRE2Test, MatchTests) {
  FLAGS_filtered_re2_min_atom_len = 3;

  FilterTestVars v;
  AtomTest* t = &atom_tests[2];
  // We are using the regexps used in one of the atom tests
  // for this test.
  EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", string(t->testname));

  // The regexp list ends at the first NULL slot.
  int nregexp = 0;
  while (nregexp < arraysize(t->regexps) && t->regexps[nregexp] != NULL)
    nregexp++;
  AddRegexpsAndCompile(t->regexps, nregexp, &v);

  vector<int> atom_ids;
  vector<string> atoms;
  vector<int> matching_regexps;

  // Case 1: only the atom "abc" is present.
  string text = "abc121212xyz";
  atoms.push_back("abc");
  FindAtomIndices(v.atoms, atoms, &atom_ids);
  v.f.AllMatches(text, atom_ids, &matching_regexps);
  EXPECT_EQ(1, matching_regexps.size());

  // Case 2: atoms "abc", "yyy" and "yyyzzz" are present.
  text = "abc12312yyyzzz";
  atoms.clear();
  atoms.push_back("abc");
  atoms.push_back("yyy");
  atoms.push_back("yyyzzz");
  FindAtomIndices(v.atoms, atoms, &atom_ids);
  v.f.AllMatches(text, atom_ids, &matching_regexps);
  EXPECT_EQ(1, matching_regexps.size());

  // Case 3: "abcd" is present as well, and two regexps match.
  text = "abcd12yyy32yyyzzz";
  atoms.clear();
  atoms.push_back("abc");
  atoms.push_back("abcd");
  atoms.push_back("yyy");
  atoms.push_back("yyyzzz");
  FindAtomIndices(v.atoms, atoms, &atom_ids);
  LOG(INFO) << "S: " << atom_ids.size();
  for (int k = 0; k < atom_ids.size(); k++)
    LOG(INFO) << "i: " << k << " : " << atom_ids[k];
  v.f.AllMatches(text, atom_ids, &matching_regexps);
  EXPECT_EQ(2, matching_regexps.size());
}
} // namespace re2

View File

@ -0,0 +1,76 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
// One MimicsPCRE test case.
struct PCRETest {
  const char* regexp;  // Pattern to parse.
  bool should_match;   // Expected result of Regexp::MimicsPCRE().
};
// Patterns paired with the value MimicsPCRE() is expected to return.
static PCRETest tests[] = {
  // Most things should behave exactly.
  { "abc",        true  },
  { "(a|b)c",     true  },
  { "(a*|b)c",    true  },
  { "(a|b*)c",    true  },
  { "a(b|c)d",    true  },
  { "a(()|())c",  true  },
  { "ab*c",       true  },
  { "ab+c",       true  },
  { "a(b*|c*)d",  true  },
  { "\\W",        true  },
  { "\\W{1,2}",   true  },
  { "\\d",        true  },

  // Check that repeated empty strings do not.
  { "(a*)*",      false },
  { "x(a*)*y",    false },
  { "(a*)+",      false },
  { "(a+)*",      true  },
  { "(a+)+",      true  },
  { "(a+)+",      true  },

  // \v is the only character class that shouldn't.
  { "\\b",        true  },
  { "\\v",        false },
  { "\\d",        true  },

  // The handling of ^ in multi-line mode is different, as is
  // the handling of $ in single-line mode.  (Both involve
  // boundary cases if the string ends with \n.)
  { "\\A",        true  },
  { "\\z",        true  },
  { "(?m)^",      false },
  { "(?m)$",      true  },
  { "(?-m)^",     true  },
  { "(?-m)$",     false },  // In PCRE, == \Z
  { "(?m)\\A",    true  },
  { "(?m)\\z",    true  },
  { "(?-m)\\A",   true  },
  { "(?-m)\\z",   true  },
};
// Parses every table entry twice -- first with the Latin1 flag added,
// then with plain LikePerl -- and checks MimicsPCRE() each time.
TEST(MimicsPCRE, SimpleTests) {
  for (int i = 0; i < arraysize(tests); i++) {
    const PCRETest& t = tests[i];
    for (int j = 0; j < 2; j++) {
      // Pass 0 is the Latin-1 pass; pass 1 leaves the flags alone.
      Regexp::ParseFlags flags = Regexp::LikePerl;
      if (j == 0) {
        flags = flags | Regexp::Latin1;
      }

      Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
      CHECK(re) << " " << t.regexp;
      CHECK_EQ(t.should_match, re->MimicsPCRE())
          << " " << t.regexp << " "
          << (j==0 ? "latin1" : "utf");
      re->Decref();
    }
  }
}
} // namespace re2

View File

@ -0,0 +1,44 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/test.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Null walker.  For benchmarking the walker itself: it traverses the
// regexp but computes nothing.
class NullWalker : public Regexp::Walker<bool> {
 public:
  NullWalker() { }

  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                 bool* child_args, int nchild_args);

  // Should never be called: we use Walk not WalkExponential.
  // If it is called anyway, it logs and hands back its argument.
  bool ShortVisit(Regexp* re, bool a) {
    LOG(DFATAL) << "NullWalker::ShortVisit called";
    return a;
  }

 private:
  DISALLOW_EVIL_CONSTRUCTORS(NullWalker);
};
// Called after visiting re's children.  The null walker ignores all of
// its arguments, including the children's results in child_args, and
// always returns false.
bool NullWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
                           bool* child_args, int nchild_args) {
  const bool kResult = false;
  return kResult;
}
// Returns whether re can match an empty string.
void Regexp::NullWalk() {
NullWalker w;
w.Walk(this, false);
}
} // namespace re2

Some files were not shown because too many files have changed in this diff Show More