C++ tokenizer based on RE2. Not by me.

Some differences from the Moses tokenizer: fraction characters count as numbers, underscore (_) handling, URLs.
Currently 3x slower than the Perl version :'(.  Looking to make it faster by composing regex substitutions.
TODO: eliminate sprintf and fixed-size buffers.
This commit is contained in:
Kenneth Heafield 2015-01-21 12:23:44 -05:00 committed by Marcin Junczys-Dowmunt
parent ad6f3a8026
commit 1dce55f413
4 changed files with 1065 additions and 0 deletions
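The commit message mentions composing regex substitutions for speed. A hypothetical illustration (not part of this commit): two passes in tokenizer.cpp below, qx_x and braces_x, share the same " \1 " rewrite, so they could be folded into one compiled alternation and one RE2::GlobalReplace scan instead of two; the combined name is made up.

#include <re2/re2.h>
#include <string>

// Hypothetical fusion of qx_x ("([?!])") and braces_x ("([\\]\\[\\(\\){}<>])"):
// both isolate the matched character with surrounding spaces, so one pattern
// and a single scan of the text can replace two GlobalReplace calls.
static RE2 qx_braces_x("([?!\\]\\[\\(\\){}<>])");

static void isolate_marks_and_braces(std::string& text) {
  RE2::GlobalReplace(&text, qx_braces_x, " \\1 ");
}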

View File

@@ -0,0 +1,2 @@
external-lib re2 ;
exe tokenizer : tokenizer.cpp tokenizer_main.cpp re2 : <cflags>-std=c++11 ;

View File

@@ -0,0 +1,736 @@
#include "tokenizer.h"
#include <sstream>
#include <iterator>
#include <memory>
#include <vector>
#include <algorithm>
#include <cstdio>   // snprintf
#include <cstring>  // strcat
#include <cstdlib>  // std::strtoul
namespace {
// frequently used regexps are pre-compiled here:
RE2 genl_tags_x("<[/!\\p{L}]+[^>]*>");
RE2 mult_spc_x(" +"); // multiple spaces
RE2 tag_line_x("^<.+>$"); // lines beginning and ending with open/close angle-bracket pairs
RE2 white_line_x("^\\s*$"); // lines entirely composed of whitespace
RE2 ctrls_x("[\\000-\\037]*"); // match any control characters
RE2 head_spc_x("^ "); // match a leading space on a line
RE2 tail_spc_x(" $"); // match a trailing space on a line
RE2 genl_spc_x("\\s+"); // any sequence of one or more whitespace characters
RE2 specials_x("([^_\\p{L}\\p{N}\\s\\.\\'\\`\\,\\-])"); // any surely non-token character
RE2 hyphen_x("([\\p{L}\\p{N}])(-)([\\p{L}\\p{N}])"); // any hyphenated alphanumeric sequence
RE2 slash_x("([\\p{L}\\p{N}])(/)([\\p{L}\\p{N}])"); // any slash-conjoined alphanumeric sequence
RE2 final_x("([^.])([.])([\\]\\)}>\"']*) ?$"); // sentence-final punctuation sequence (excluding ? and !)
RE2 qx_x("([?!])"); // one qm/em mark
RE2 braces_x("([\\]\\[\\(\\){}<>])"); // any open or close of a pair
RE2 endq_x("([^'])' "); // post-token single-quote or doubled single-quote
RE2 postncomma_x("([^\\p{N}]),"); // comma after non-number
RE2 prencomma_x(",([^\\p{N}])"); // comma before non-number
RE2 nanaapos_x("([^\\p{L}])'([^\\p{L}])"); // non-letter'non-letter contraction form
RE2 nxpaapos_x("([^\\p{L}\\p{N}])'([\\p{L}])"); // non-alphanumeric'letter contraction form
RE2 napaapos_x("([^\\p{L}])'([\\p{L}])"); // non-letter'letter contraction form
RE2 panaapos_x("([\\p{L}])'([^\\p{L}])"); // letter'non-letter contraction form
RE2 papaapos_x("([\\p{L}])'([\\p{L}])"); // letter'letter contraction form
RE2 pnsapos_x("([\\p{N}])[']s"); // plural number
RE2 letter_x("\\p{L}"); // a letter
RE2 lower_x("^\\p{Ll}"); // a lower-case letter
RE2 sinteger_x("^\\p{N}"); // token starts with a digit
RE2 dotskey_x("MANYDOTS(\\d+)"); // token for a dot sequence parameterized by seq length
RE2 numprefixed_x("[-+/.@\\\\#\\%&\\p{Sc}\\p{N}]*[\\p{N}]+-[-'`\"\\p{L}]*\\p{L}"); // number-prefixed hyphenated word (built-in protected pattern)
RE2 quasinumeric_x("[-.;:@\\\\#\\%&\\p{Sc}\\p{So}\\p{N}]*[\\p{N}]+"); // quasi-numeric token: digits with attached punctuation/currency (built-in protected pattern)
RE2 numscript_x("([\\p{N}\\p{L}])([\\p{No}]+)(\\p{Ll})"); // superscript/subscript digits between an alphanumeric and a lower-case letter
RE2 nonbreak_x("-\\p{L}"); // where not to break a protected form
RE2 x1_v_d("([ ([{<])\""); // a valid non-letter preceding a double-quote
RE2 x1_v_gg("([ ([{<])``"); // a valid non-letter preceding a directional doubled open single-quote
RE2 x1_v_g("([ ([{<])`([^`])"); // a valid non-letter preceding a directional unitary single-quote
RE2 x1_v_q("([ ([{<])'"); // a valid non-letter preceding undirected embedded quotes
RE2 ndndcomma_x("([^\\p{N}]),([^\\p{N}])"); // non-digit,non-digit
RE2 pdndcomma_x("([\\p{N}]),([^\\p{N}])"); // digit,non-digit
RE2 ndpdcomma_x("([^\\p{N}]),([\\p{N}])"); // non-digit,digit
RE2 symbol_x("([;:@\\#\\$%&\\p{Sc}\\p{So}])"); // usable punctuation mark not a quote or a brace
RE2 contract_x("'([sSmMdD]) "); // english single letter contraction forms
RE2 right_x("[\\p{Sc}({¿¡]+"); // detokenizer: token attaches to the following word (currency, open punctuation)
RE2 left_x("[,.?!:;\\%})]+"); // detokenizer: token attaches to the preceding word (closing punctuation)
RE2 curr_en_x("^[\'][\\p{L}]"); // English contraction/possessive beginning with an apostrophe
RE2 pre_en_x("[\\p{L}\\p{N}]$"); // previous token ends with an alphanumeric
RE2 curr_fr_x("[\\p{L}\\p{N}][\']$"); // French/Italian elision: token ends with alphanumeric + apostrophe
RE2 post_fr_x("^[\\p{L}\\p{N}]"); // next token begins with an alphanumeric
RE2 quotes_x("^[\'\"]+$"); // token consists only of quote characters
RE2 endnum_x("[-\'\"]"); // hyphen or quote character
// anything rarely used will just be given as a string and compiled on demand by RE2
}; // end anonymous namespace
#ifdef TOKENIZER_NAMESPACE
namespace TOKENIZER_NAMESPACE {
#endif
// where to load nonbreaking_prefix.XX files
// and protected_pattern.XX files
std::string Tokenizer::cfg_dir(".");
// static method
void
Tokenizer::set_config_dir(const std::string& dir) {
if (dir.empty()) {
cfg_dir = ".";
} else {
cfg_dir.assign(dir);
}
}
Tokenizer::Tokenizer(const std::string& _lang_iso,
bool _skip_xml_p,
bool _skip_alltags_p,
bool _non_escape_p,
bool _aggressive_hyphen_p,
bool _penn_p,
bool _verbose_p)
: lang_iso(_lang_iso)
, english_p(_lang_iso.compare("en")==0)
, latin_p((!english_p) && (_lang_iso.compare("fr")==0 || _lang_iso.compare("it")==0))
, skip_xml_p(_skip_xml_p)
, skip_alltags_p(_skip_alltags_p)
, non_escape_p(_non_escape_p)
, aggressive_hyphen_p(_aggressive_hyphen_p)
, penn_p(_penn_p)
, verbose_p(_verbose_p)
{
}
//
// dtor deletes dynamically allocated per-language RE2 compiled expressions
//
Tokenizer::~Tokenizer()
{
for (auto& ptr : prot_pat_vec) {
if (ptr == &numprefixed_x || ptr == &quasinumeric_x)
continue;
delete ptr;
}
}
//
// stuffs numeric-only prefixes into nbpre_num_set,
// others into nbpre_gen_set
//
std::pair<int,int>
Tokenizer::load_prefixes(std::ifstream& ifs)
{
RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)");
std::string line;
int nnon = 0;
int nnum = 0;
while (std::getline(ifs,line)) {
if (!line.empty() && line.at(0) != '#') {
std::string prefix;
if (RE2::PartialMatch(line,numonly,&prefix)) {
nbpre_num_set.insert(prefix);
nnum++;
} else {
nbpre_gen_set.insert(line);
nnon++;
}
}
}
return std::make_pair(nnon,nnum);
}
//
// load nonbreaking prefix and protected pattern files
// (call set_config_dir first if a non-default directory is needed)
//
void
Tokenizer::init() {
std::string nbpre_path(cfg_dir);
nbpre_path.append("/nonbreaking_prefix.").append(lang_iso);
// default to generic version
if (::access(nbpre_path.c_str(),R_OK))
nbpre_path = nbpre_path.substr(0,nbpre_path.size()-lang_iso.size()-1);
if (::access(nbpre_path.c_str(),R_OK) == 0) {
std::ifstream cfg(nbpre_path.c_str());
try {
std::pair<int,int> counts = load_prefixes(cfg);
if (verbose_p) {
std::cerr << "loaded " << counts.first << " non-numeric, "
<< counts.second << " numeric prefixes from "
<< nbpre_path << std::endl;
}
} catch (...) {
std::ostringstream ess;
ess << "I/O error reading " << nbpre_path << " in " << __FILE__ << " at " << __LINE__;
throw std::runtime_error(ess.str());
}
} else if (verbose_p) {
std::cerr << "no prefix file found: " << nbpre_path << std::endl;
}
if (nbpre_gen_set.empty() && nbpre_num_set.empty()) {
std::ostringstream ess;
ess << "Error at " << __FILE__ << ":" << __LINE__ << " : "
<< "No known abbreviations for language " << lang_iso;
throw std::runtime_error(ess.str());
}
std::string protpat_path(cfg_dir);
protpat_path.append("/protected_pattern.").append(lang_iso);
// default to generic version
if (::access(protpat_path.c_str(),R_OK))
protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1);
prot_pat_vec.push_back(&numprefixed_x);
prot_pat_vec.push_back(&quasinumeric_x);
if (::access(protpat_path.c_str(),R_OK) == 0) {
std::ifstream cfg(protpat_path.c_str());
char linebuf[1028];
int npat = 0;
try {
linebuf[0]='(';
while (cfg.good()) {
cfg.getline(linebuf+1,1024);
if (linebuf[1] && linebuf[1] != '#') {
strcat(linebuf,")");
prot_pat_vec.push_back(new RE2(linebuf));
npat++;
}
}
} catch (...) {
std::ostringstream ess;
ess << "I/O error reading " << protpat_path << " in " << __FILE__ << " at " << __LINE__;
throw std::runtime_error(ess.str());
}
if (verbose_p) {
std::cerr << "loaded " << npat << " protected patterns from "
<< protpat_path << std::endl;
}
} else if (verbose_p) {
std::cerr << "no protected file found: " << protpat_path << std::endl;
}
}
//
// apply ctor-selected tokenization to a string, in-place, no newlines allowed,
// assumes protections are applied already, some invariants are in place
//
void
Tokenizer::protected_tokenize(std::string& text) {
std::vector<std::string> words;
size_t pos = 0;
if (text.at(pos) == ' ')
++pos;
size_t next = text.find(' ',pos);
while (next != std::string::npos) {
if (next - pos)
words.push_back(text.substr(pos,next-pos));
pos = next + 1;
while (pos < text.size() && text.at(pos) == ' ')
++pos;
next = text.find(' ',pos);
}
if (pos < text.size() && text.at(pos) != ' ')
words.push_back(text.substr(pos,text.size()-pos));
text.clear();
// regurgitate words with look-ahead handling for tokens with final .
for (size_t ii = 0; ii < words.size(); ++ii) {
size_t len = words[ii].size();
if (len > 1 && words[ii].at(len-1) == '.') {
std::string prefix(words[ii].substr(0,len-1));
bool gen_prefix_p = nbpre_gen_set.find(prefix) != nbpre_gen_set.end();
bool embeds_p = prefix.find('.') != std::string::npos;
bool letter_p = RE2::PartialMatch(prefix.c_str(),letter_x);
bool more_p = ii < words.size() - 1;
bool nlower_p = more_p && RE2::PartialMatch(words[ii+1].c_str(),lower_x);
bool num_prefix_p = (!gen_prefix_p) && nbpre_num_set.find(prefix) != nbpre_num_set.end();
bool nint_p = more_p && RE2::PartialMatch(words[ii+1].c_str(),sinteger_x);
bool isolate_p = true;
if (gen_prefix_p) {
isolate_p = false;
} else if (num_prefix_p && nint_p) {
isolate_p = false;
} else if (embeds_p && letter_p) {
isolate_p = false;
} else if (nlower_p) {
isolate_p = false;
}
if (isolate_p) {
words[ii].assign(prefix);
words[ii].append(" .");
}
}
text.append(words[ii]);
if (ii < words.size() - 1)
text.append(" ");
}
}
bool
Tokenizer::escape(std::string& text) {
static const char escaping[] = "&|<>'\"[]";
static const char *replacements[] = {
"&amp;",
"&#124;",
"&lt;",
"&gt;",
"&apos;",
"&quot;",
"&#91;",
"&#93;"
};
bool modified = false;
const char *next = escaping;
for (int ii = 0; *next; ++ii, ++next) {
size_t pos = 0;
for (pos = text.find(*next,pos); pos != std::string::npos;
pos = (++pos < text.size() ? text.find(*next,pos) : std::string::npos)) {
std::string replacement(replacements[ii]);
if (*next != '\'') {
if (pos > 0 && text.at(pos-1) == ' ' && pos < text.size()-1 && text.at(pos+1) != ' ')
replacement.append(" ");
}
text.replace(pos,1,replacement);
modified = true;
}
}
return modified;
}
std::string
Tokenizer::tokenize(const std::string& buf)
{
static const char *apos_refs = "\\1 ' \\2";
static const char *right_refs = "\\1 '\\2";
static const char *left_refs = "\\1' \\2";
static const char *comma_refs = "\\1 , \\2";
static const char *isolate_ref = " \\1 ";
static const char *special_refs = "\\1 @\\2@ \\3";
std::string outs;
std::string text(buf);
if (skip_alltags_p) {
RE2::GlobalReplace(&text,genl_tags_x," ");
}
RE2::GlobalReplace(&text,genl_spc_x," ");
RE2::GlobalReplace(&text,ctrls_x,"");
size_t pos;
int num = 0;
if (!penn_p) {
// this is the main moses-compatible tokenizer
// push all the prefixes matching protected patterns
std::vector<std::string> prot_stack;
std::string match;
for (auto& pat : prot_pat_vec) {
pos = 0;
while (RE2::PartialMatch(text.substr(pos),*pat,&match)) {
pos = text.find(match,pos);
if (pos == std::string::npos)
break;
size_t len = match.size();
if (pos == 0 || text[pos-1] == ' ' || text[pos-1] == '\'' || text[pos-1] == '`' || text[pos-1] == '"') {
char subst[32];
int nsubst = snprintf(subst,sizeof(subst)," THISISPROTECTED%.3d ",num++);
text.replace(pos,len,subst,nsubst);
prot_stack.push_back(match);
pos += nsubst;
} else {
pos += len;
}
}
}
// collapse spaces
RE2::GlobalReplace(&text,mult_spc_x," ");
// strip leading space
if (text.at(0) == ' ')
text = text.substr(1);
// strip trailing space
if (text.at(text.size()-1) == ' ')
text = text.substr(0,text.size()-1);
// isolate hyphens, if non-default option is set
if (aggressive_hyphen_p)
RE2::GlobalReplace(&text,hyphen_x,special_refs);
// find successive dots, protect them
pos = text.find("..");
while (pos != std::string::npos && pos < text.size()) {
char subst[12];
size_t lim = pos + 2;
while (lim < text.size() && text.at(lim) == '.') ++lim;
snprintf(subst,sizeof(subst),"MANYDOTS%.3d",lim-pos);
text.replace(pos,lim-pos,subst,11);
pos = text.find("..",pos+11);
}
// terminate token at superscript or subscript sequence when followed by lower-case
RE2::GlobalReplace(&text,numscript_x,"\\1\\2 \\3");
// isolate commas after non-digits
RE2::GlobalReplace(&text,postncomma_x,"\\1 , ");
// isolate commas before non-digits
RE2::GlobalReplace(&text,prencomma_x," , \\1");
// replace backtick with single-quote
pos = text.find("`");
while (pos != std::string::npos) {
text.replace(pos,1,"'",1);
pos = text.find("`");
}
// replace doubled single-quotes with double-quotes
pos = text.find("''");
while (pos != std::string::npos) {
text.replace(pos,2,"\"",1);
pos = text.find("''",pos+1);
}
// isolate special characters
RE2::GlobalReplace(&text,specials_x,isolate_ref);
if (english_p) {
// english contractions to the right
RE2::GlobalReplace(&text,nanaapos_x,apos_refs);
RE2::GlobalReplace(&text,nxpaapos_x,apos_refs);
RE2::GlobalReplace(&text,panaapos_x,apos_refs);
RE2::GlobalReplace(&text,papaapos_x,right_refs);
RE2::GlobalReplace(&text,pnsapos_x,"\\1 's");
} else if (latin_p) {
// italian,french contractions to the left
RE2::GlobalReplace(&text,nanaapos_x,apos_refs);
RE2::GlobalReplace(&text,napaapos_x,apos_refs);
RE2::GlobalReplace(&text,panaapos_x,apos_refs);
RE2::GlobalReplace(&text,papaapos_x,left_refs);
}
protected_tokenize(text);
// restore prefix-protected strings
num = 0;
for (auto& prot : prot_stack) {
char subst[32];
snprintf(subst,sizeof(subst),"THISISPROTECTED%.3d",num++);
size_t loc = text.find(subst);
while (loc != std::string::npos) {
text.replace(loc,18,prot);
loc = text.find(subst,loc+18);
}
}
// restore dot-sequences with correct length
std::string numstr;
pos = 0;
while (RE2::PartialMatch(text,dotskey_x,&numstr)) {
int count = std::strtoul(numstr.c_str(),0,0);
int loc = text.find("MANYDOTS",pos);
std::ostringstream fss;
fss << text.substr(0,loc);
if (loc > 0 && text.at(loc-1) != ' ')
fss << ' ';
for (int ii = 0; ii < count; ++ii)
fss << '.';
int sublen = 8 + numstr.size();
pos = loc + sublen;
if (pos < text.size() && text.at(pos) != ' ')
fss << ' ';
fss << text.substr(pos);
pos = loc;
text.assign(fss.str());
}
// escape moses mark-up
if (!non_escape_p)
escape(text);
// return value
outs.assign(text);
} else {
// tokenize_penn case
// directed quote patches
size_t len = text.size();
if (len > 2 && text.substr(0,2) == "``")
text.replace(0,2,"`` ",3);
else if (text.at(0) == '"')
text.replace(0,1,"`` ",3);
else if (text.at(0) == '`' || text.at(0) == '\'')
text.replace(0,1,"` ",2);
static char one_gg[] = "\\1 ``";
RE2::GlobalReplace(&text,x1_v_d,one_gg);
RE2::GlobalReplace(&text,x1_v_gg,one_gg);
RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2");
RE2::GlobalReplace(&text,x1_v_q,"\\1 ` ");
// protect ellipsis
for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
text.replace(pos,3,"MANYELIPSIS",11);
// numeric commas
RE2::GlobalReplace(&text,ndndcomma_x,comma_refs);
RE2::GlobalReplace(&text,pdndcomma_x,comma_refs);
RE2::GlobalReplace(&text,ndpdcomma_x,comma_refs);
// isolable symbols
RE2::GlobalReplace(&text,symbol_x,isolate_ref);
// isolable slash
RE2::GlobalReplace(&text,slash_x,special_refs);
// isolate final period
RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3");
// isolate q.m., e.m.
RE2::GlobalReplace(&text,qx_x,isolate_ref);
// isolate braces
RE2::GlobalReplace(&text,braces_x,isolate_ref);
// convert open/close punctuation
RE2::GlobalReplace(&text,"\\(","-LRB-");
RE2::GlobalReplace(&text,"\\[","-LSB-");
RE2::GlobalReplace(&text,"\\{","-LCB-");
RE2::GlobalReplace(&text,"\\)","-RRB-");
RE2::GlobalReplace(&text,"\\]","-RSB-");
RE2::GlobalReplace(&text,"\\}","-RCB-");
// isolate double-dash hyphen
RE2::GlobalReplace(&text,"--"," -- ");
// ensure leading and trailing space on line, to simplify exprs
// also make sure final . has one space on each side
len = text.size();
while (len > 1 && text.at(len-1) == ' ') --len;
if (len < text.size())
text.assign(text.substr(0,len));
if (len > 2 && text.at(len-1) == '.') {
if (text.at(len-2) != ' ') {
text.assign(text.substr(0,len-1));
text.append(" . ");
} else {
text.assign(text.substr(0,len-1));
text.append(". ");
}
} else {
text.append(" ");
}
std::string ntext(" ");
ntext.append(text);
// convert double quote to paired single-quotes
RE2::GlobalReplace(&ntext,"\""," '' ");
// deal with contractions in penn style
RE2::GlobalReplace(&ntext,endq_x,"\\1 ' ");
RE2::GlobalReplace(&ntext,contract_x," '\\1 ");
RE2::GlobalReplace(&ntext,"'ll "," 'll ");
RE2::GlobalReplace(&ntext,"'re "," 're ");
RE2::GlobalReplace(&ntext,"'ve "," 've ");
RE2::GlobalReplace(&ntext,"n't "," n't ");
RE2::GlobalReplace(&ntext,"'LL "," 'LL ");
RE2::GlobalReplace(&ntext,"'RE "," 'RE ");
RE2::GlobalReplace(&ntext,"'VE "," 'VE ");
RE2::GlobalReplace(&ntext,"N'T "," N'T ");
RE2::GlobalReplace(&ntext," ([Cc])annot "," \\1an not ");
RE2::GlobalReplace(&ntext," ([Dd])'ye "," \\1' ye ");
RE2::GlobalReplace(&ntext," ([Gg])imme "," \\1im me ");
RE2::GlobalReplace(&ntext," ([Gg])onna "," \\1on na ");
RE2::GlobalReplace(&ntext," ([Gg])otta "," \\1ot ta ");
RE2::GlobalReplace(&ntext," ([Ll])emme "," \\1em me ");
RE2::GlobalReplace(&ntext," ([Mm])ore'n "," \\1ore 'n ");
RE2::GlobalReplace(&ntext," '([Tt])is "," '\\1 is 'n ");
RE2::GlobalReplace(&ntext," '([Tt])was "," '\\1 was 'n ");
RE2::GlobalReplace(&ntext," '([Tt])were "," '\\1 were 'n ");
RE2::GlobalReplace(&ntext," ([Ww])anna "," \\1an na ");
protected_tokenize(ntext);
// restore ellipsis
RE2::GlobalReplace(&ntext,"MANYELIPSIS","...");
// collapse spaces
RE2::GlobalReplace(&ntext,mult_spc_x," ");
// escape moses meta-characters
if (!non_escape_p)
escape(ntext);
// strip out wrapping spaces from line in result string
outs.assign(ntext.substr(1,ntext.size()-2));
}
return outs;
}
std::size_t
Tokenizer::tokenize(std::istream& is, std::ostream& os)
{
size_t line_no = 0;
while (is.good() && os.good()) {
std::string istr;
std::getline(is,istr);
line_no ++;
if (istr.empty())
continue;
if ((skip_xml_p && RE2::FullMatch(istr,tag_line_x)) || RE2::FullMatch(istr,white_line_x)) {
os << istr << std::endl;
} else {
std::string bstr(" ");
bstr.append(istr).append(" ");
os << tokenize(bstr) << std::endl;
}
if (verbose_p && ((line_no % 1000) == 0)) {
std::cerr << line_no << ' ';
std::cerr.flush();
}
}
return line_no;
}
namespace {
std::string trim(const std::string& in)
{
std::size_t start = 0;
std::size_t limit = in.size();
while (start < limit && static_cast<unsigned char>(in.at(start)) < '!') ++start;
while (start < limit && static_cast<unsigned char>(in.at(limit-1)) < '!') --limit;
if (start == limit) return std::string("");
if (start > 0 || limit < in.size())
return in.substr(start,limit-start);
return std::string(in);
}
std::vector<std::string> split(const std::string& in)
{
std::vector<std::string> outv;
std::istringstream iss(in);
std::copy(std::istream_iterator<std::string>(iss),
std::istream_iterator<std::string>(),
std::back_inserter(outv));
return outv;
}
};
std::string
Tokenizer::detokenize(const std::string& buf)
{
std::vector<std::string> words = split(trim(buf));
std::size_t squotes = 0;
std::size_t dquotes = 0;
std::string prepends(" ");
std::ostringstream oss;
std::size_t nwords = words.size();
std::size_t iword = 0;
for (auto word: words) {
if (RE2::FullMatch(word,right_x)) {
oss << prepends << word;
prepends.clear();
} else if (RE2::FullMatch(word,left_x)) {
oss << word;
prepends = " ";
} else if (english_p && iword && RE2::FullMatch(word,curr_en_x) && RE2::FullMatch(words[iword-1],pre_en_x)) {
oss << word;
prepends = " ";
} else if (latin_p && iword < nwords - 2 && RE2::FullMatch(word,curr_fr_x) && RE2::FullMatch(words[iword+1],post_fr_x)) {
oss << prepends << word;
prepends.clear();
} else if (word.size() == 1) {
if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) ||
(word.at(0) == '"' && ((dquotes % 2) == 0))) {
if (english_p && iword && word.at(0) == '\'' && words[iword-1].at(words[iword-1].size()-1) == 's') {
oss << word;
prepends = " ";
} else {
oss << prepends << word;
prepends.clear();
if (word.at(0) == '\'')
squotes++;
else
dquotes++;
}
} else {
oss << word;
prepends = " ";
if (word.at(0) == '\'')
squotes++;
else if (word.at(0) == '"')
dquotes++;
}
} else {
oss << prepends << word;
prepends.clear();
}
iword++;
}
std::string text(oss.str());
RE2::GlobalReplace(&text," +"," ");
RE2::GlobalReplace(&text,"\n ","\n");
RE2::GlobalReplace(&text," \n","\n");
return trim(text);
}
std::size_t
Tokenizer::detokenize(std::istream& is, std::ostream& os)
{
size_t line_no = 0;
while (is.good() && os.good()) {
std::string istr;
std::getline(is,istr);
line_no ++;
if (istr.empty())
continue;
if ((skip_xml_p && RE2::FullMatch(istr,tag_line_x)) || RE2::FullMatch(istr,white_line_x)) {
os << istr << std::endl;
} else {
os << detokenize(istr) << std::endl;
}
}
return line_no;
}
#ifdef TOKENIZER_NAMESPACE
}; // namespace
#endif
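
On the TODO about sprintf and fixed-size buffers: a minimal sketch (not part of this commit; helper name invented) of how the THISISPROTECTED placeholder above could be built with std::string instead of snprintf into a char[32]:

#include <string>

// Hypothetical replacement for the snprintf-based placeholder construction:
// produces " THISISPROTECTEDnnn " with the same zero-padded 3-digit counter.
static std::string protected_placeholder(int num) {
  std::string digits = std::to_string(num);
  std::string subst(" THISISPROTECTED");
  if (digits.size() < 3)
    subst.append(3 - digits.size(), '0');  // zero-pad to mimic %.3d
  subst.append(digits).append(" ");
  return subst;
}

The restore loop could similarly take the placeholder length from the string rather than hard-coding 18.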

View File

@@ -0,0 +1,115 @@
#include <string>
#include <iostream>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <unordered_map>
#include <set>
#include <vector>
#include <iterator>
#include <stdexcept>
#include <re2/re2.h>
#include <unistd.h>
#ifdef TOKENIZER_NAMESPACE
namespace TOKENIZER_NAMESPACE {
#endif
//
// @about
// Tokenizer reimplements Koehn's tokenizer.perl using RE2
//
class Tokenizer {
private:
static std::string cfg_dir;
std::set<std::string> nbpre_num_set;
std::set<std::string> nbpre_gen_set;
std::vector<re2::RE2 *> prot_pat_vec;
protected:
// language
std::string lang_iso;
bool english_p; // is lang_iso "en"
bool latin_p; // is lang_iso "fr" or "it"
bool skip_xml_p;
bool skip_alltags_p;
bool non_escape_p;
bool aggressive_hyphen_p;
bool penn_p;
bool verbose_p;
std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
// escapes specials into entities from the set &|<>'"[] (after tokenization, when enabled)
bool escape(std::string& inplace);
// in-place single-line tokenizer; replaces the input string and relies on the wrapper to set up invariants
void protected_tokenize(std::string& inplace);
public:
// cfg_dir is assumed shared by all languages
static void set_config_dir(const std::string& _cfg_dir);
// no throw
Tokenizer(const std::string& _lang_iso = "en",
bool _skip_xml_p = true, // skips isolated (linewise) tags in any case
bool _skip_alltags_p = true, // skip all xml style tags
bool _non_escape_p = false, // default is to call escape method before return
bool _aggressive_hyphen_p = false, // hyphens become tokens when true
bool _penn_p = false, // Treebank-3 compatible tokenization when true
bool _verbose_p = false);
// frees dynamically compiled expressions
~Tokenizer();
// required before other methods, may throw
void init();
// streaming tokenizer reads from is, writes to os, preserving line breaks
std::size_t tokenize(std::istream& is, std::ostream& os);
// tokenize padded line buffer to return string
std::string tokenize(const std::string& buf);
void tokenize(const std::string& buf, std::string& outs) {
outs = tokenize(buf);
}
// tokenize to a vector
std::vector<std::string> tokens(const std::string& in) {
std::istringstream tokss(tokenize(in));
std::vector<std::string> outv;
std::copy(std::istream_iterator<std::string>(tokss),
std::istream_iterator<std::string>(),
std::back_inserter(outv));
return outv;
}
// streaming detokenizer reads from is, writes to os, preserving breaks
std::size_t detokenize(std::istream& is, std::ostream &os);
// detokenize padded line buffer to return string
std::string detokenize(const std::string& buf);
void detokenize(const std::string& buf, std::string& outs) {
outs = detokenize(buf);
}
// detokenize from a vector
std::string detokenize(const std::vector<std::string>& inv) {
std::ostringstream oss;
std::copy(inv.begin(), inv.end(), std::ostream_iterator<std::string>(oss," "));
return detokenize(oss.str());
}
}; // end class Tokenizer
#ifdef TOKENIZER_NAMESPACE
};
#endif
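
A minimal usage sketch of the API declared above (not part of this commit; the config path and sample line are made up, and init() throws if no nonbreaking_prefix file is found under the config dir):

#include "tokenizer.h"
#include <iostream>
#include <string>

// add "using namespace TOKENIZER_NAMESPACE;" here if that macro is defined
int main() {
  Tokenizer::set_config_dir("./share");   // hypothetical directory holding nonbreaking_prefix.en etc.
  Tokenizer tok("en");                    // defaults: skip tag lines, escape output
  tok.init();                             // loads prefixes and protected patterns; may throw
  std::string out = tok.tokenize(" Mr. Smith isn't here. ");  // padded line buffer, no newlines
  std::cout << out << std::endl;          // prints the tokenized (and, by default, escaped) line
  return 0;
}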

View File

@@ -0,0 +1,212 @@
#include "tokenizer.h"
#include <memory>
#include <vector>
#include <cctype>
#include <cstring>  // strlen
#ifdef TOKENIZER_NAMESPACE
using namespace TOKENIZER_NAMESPACE ;
#endif
void
usage(const char *path)
{
std::cerr << "Usage: " << path << "[-{v|x|p|a|e|]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
std::cerr << " -v -- verbose" << std::endl;
std::cerr << " -w -- word filter" << std::endl;
std::cerr << " -x -- skip xml tag lines" << std::endl;
std::cerr << " -y -- skip all xml tags" << std::endl;
std::cerr << " -e -- escape entities" << std::endl;
std::cerr << " -a -- aggressive hyphenization" << std::endl;
std::cerr << " -p -- treebank-3 style" << std::endl;
std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
std::cerr << " -o OUT -- output file path" << std::endl;
std::cerr << "Default is -c ., stdin, stdout." << std::endl;
std::cerr << "LL in en,fr,it affect contraction." << std::endl;
}
// reduce a token to a lower-cased "word" for the -w filter;
// returns an empty string when the token does not look like a word
std::string token_word(const std::string& in) {
int pos = -1;
int digits_prefixed = 0;
int nalpha = 0;
int len = in.size();
std::vector<char> cv;
int last_quirk = -1;
while (++pos < len) {
unsigned char ch = static_cast<unsigned char>(in.at(pos)); // unsigned keeps isdigit/isalpha defined for non-ASCII bytes
if (std::isdigit(ch)) {
if (digits_prefixed > 0) {
last_quirk = pos;
break;
}
digits_prefixed--;
cv.push_back(std::tolower(ch));
} else if (std::isalpha(ch)) {
if (digits_prefixed < 0)
digits_prefixed = -digits_prefixed;
cv.push_back(std::tolower(ch));
nalpha++;
} else {
if (digits_prefixed < 0)
digits_prefixed = -digits_prefixed;
last_quirk = pos;
if ((ch == '-' || ch == '\'') && pos != 0) {
cv.push_back(ch);
} else {
break;
}
}
}
if (last_quirk == pos || (digits_prefixed > 0 && nalpha == 0))
cv.clear(); // invalid word
return std::string(cv.begin(),cv.end());
}
// -w mode: tokenize each input line and emit only tokens that pass the word filter
int
copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
int nlines = 0;
std::string line;
while (ifs.good() && std::getline(ifs,line)) {
if (line.empty()) continue;
std::vector<std::string> tokens(tize.tokens(line));
int count = 0;
for (auto& token: tokens) {
std::string word(token_word(token));
if (word.empty()) continue;
ofs << word << ' ';
count++;
}
if (count) {
ofs << std::endl;
nlines++;
}
}
return nlines;
}
int main(int ac, char **av)
{
int rc = 0;
std::string lang_iso;
std::vector<std::string> args;
std::string out_path;
char *cfg_path = 0;
bool next_cfg_p = false;
bool next_output_p = false;
bool verbose_p = false;
bool detag_p = false;
bool alltag_p = false;
bool escape_p = true;
bool aggro_p = false;
bool penn_p = false;
bool words_p = false;
const char *prog = av[0];
while (++av,--ac) {
if (**av == '-') {
switch (av[0][1]) {
case 'h':
usage(prog);
exit(0);
case 'c':
next_cfg_p = true;
break;
case 'o':
next_output_p = true;
break;
case 'v':
verbose_p = true;
break;
case 'e':
escape_p = false;
break;
case 'w':
words_p = true;
break;
case 'x':
detag_p = true;
break;
case 'y':
alltag_p = true;
break;
case 'a':
aggro_p = true;
break;
case 'l':
// ignored
break;
case 'p':
penn_p = true;
break;
default:
std::cerr << "Unknown option: " << *av << std::endl;
::exit(1);
}
} else if (lang_iso.empty() && strlen(*av) == 2) {
lang_iso = *av;
} else if (**av == '-') {
++*av;
} else if (next_output_p) {
next_output_p = false;
out_path = *av;
} else if (next_cfg_p) {
next_cfg_p = false;
cfg_path = *av;
} else {
args.push_back(std::string(*av));
}
}
if (!cfg_path) {
cfg_path = getenv("TOKENIZER_SHARED_DIR");
}
if (cfg_path) {
Tokenizer::set_config_dir(std::string(cfg_path));
}
std::unique_ptr<std::ofstream> pofs;
if (!out_path.empty()) {
pofs.reset(new std::ofstream(out_path.c_str()));
}
std::ostream& ofs(pofs ? *pofs : std::cout);
Tokenizer tize(lang_iso,detag_p,alltag_p,!escape_p,aggro_p,penn_p,verbose_p);
tize.init();
size_t nlines = 0;
if (words_p) {
if (args.empty()) {
nlines += copy_words(tize,std::cin,ofs);
} else {
for (std::string& arg : args) {
try {
std::ifstream ifs(arg.c_str());
nlines += copy_words(tize,ifs,ofs);
} catch (...) {
std::cerr << "Exception extracting words from path " << arg << std::endl;
}
}
}
} else if (args.empty()) {
nlines = tize.tokenize(std::cin,ofs);
} else {
for (std::string& arg : args) {
try {
std::ifstream ifs(arg.c_str());
nlines += tize.tokenize(ifs,ofs);
} catch (...) {
std::cerr << "Exception tokenizing from path " << arg << std::endl;
}
}
}
if (verbose_p)
std::cerr << "%%% tokenized lines: " << nlines << std::endl;
return rc;
}
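
A hypothetical invocation of the resulting binary, following the usage text above (the config directory and file names are made up; -c must point at a directory containing nonbreaking_prefix.en and, optionally, protected_pattern.en):

./tokenizer -v -x -c /path/to/share en -o corpus.tok.en corpus.en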