draft splitter

This commit is contained in:
akimbal1 2015-03-19 01:02:18 -04:00
parent 99b8f65fb1
commit 1b9da3bb04
5 changed files with 352 additions and 10 deletions

View File

@ -9,6 +9,7 @@ Parameters::Parameters()
, verbose_p(false)
, detag_p(false)
, alltag_p(false)
, entities_p(false)
, escape_p(true)
, aggro_p(false)
, supersub_p(false)
@ -23,6 +24,8 @@ Parameters::Parameters()
, refined_p(false)
, unescape_p(false)
, drop_bad_p(false)
, split_p(false)
, notokenization_p(false)
{
}

View File

@ -16,6 +16,7 @@ struct Parameters
bool verbose_p;
bool detag_p;
bool alltag_p;
bool entities_p;
bool escape_p;
bool aggro_p;
bool supersub_p;
@ -30,6 +31,8 @@ struct Parameters
bool refined_p;
bool unescape_p;
bool drop_bad_p;
bool split_p;
bool notokenization_p;
Parameters();

View File

@ -62,7 +62,8 @@ RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes c
RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes
RE2 quotes_x("^[\'\"]+$"); //
RE2 endnum_x("[-\'\"]"); //
RE2 split_word("([\\p{L}\\p{N}\\.\\-]*)([\\'\\\"\\)\\]\\%\\p{Pf}]*)(\\.+)$"); //
RE2 split_word2("^([ ]*[\\'\\\"\\(\\[¿¡\\p{Pi}]*[ ]*[\\p{Lu}\\p{N}])");
// anything rarely used will just be given as a string and compiled on demand by RE2
const char *SPC_BYTE = " ";
@ -447,6 +448,7 @@ Tokenizer::Tokenizer(const Parameters& _)
, latin_p((!english_p) && (_.lang_iso.compare("fr")==0 || _.lang_iso.compare("it")==0))
, skip_xml_p(_.detag_p)
, skip_alltags_p(_.alltag_p)
, entities_p(_.entities_p)
, escape_p(_.escape_p)
, unescape_p(_.unescape_p)
, aggressive_hyphen_p(_.aggro_p)
@ -459,6 +461,7 @@ Tokenizer::Tokenizer(const Parameters& _)
, narrow_kana_p(_.narrow_kana_p)
, refined_p(_.refined_p)
, drop_bad_p(_.drop_bad_p)
, splits_p(_.split_p)
, verbose_p(_.verbose_p)
{
}
@ -591,6 +594,12 @@ Tokenizer::init() {
}
// Discard per-sequence sentence-splitting state: the recorded offsets of
// sentence starts embedded in the last line of input (starts_vec).
// Required between contiguous sequences of lines when sentence splitting
// is enabled (see splitting()).
void
Tokenizer::reset() {
starts_vec.clear();
}
//
// apply ctor-selected tokenization to a string, in-place, no newlines allowed,
// assumes protections are applied already, some invariants are in place,
@ -1024,11 +1033,35 @@ Tokenizer::tokenize(const std::string& buf)
ucs4 = eptr;
nxt4 = ++eptr;
next_uch = *nxt4;
next_type = nxt4 < lim4 ? g_unichar_type(*nxt4) : G_UNICODE_UNASSIGNED;
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
goto retry;
}
}
}
if (entities_p && !in_url_p) {
gunichar *cur4 = nxt4;
if (*cur4 == gunichar('#')) ++cur4;
while (g_unichar_isalnum(*cur4)) ++cur4;
if (cur4 > nxt4 && *cur4 == gunichar(';')) {
if (since_start) {
*uptr++ = gunichar(L' ');
since_start = 0;
}
++cur4;
memcpy(uptr,ucs4,cur4-ucs4);
uptr += cur4-ucs4;
ucs4 = cur4;
*uptr++ = gunichar(L' ');
pre_break_p = post_break_p = false;
curr_uch = *ucs4;
curr_type = ucs4 < lim4 ? g_unichar_type(curr_uch) : G_UNICODE_UNASSIGNED;
nxt4 = ++cur4;
next_uch = *nxt4;
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
goto retry;
}
}
post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
if (escape_p)
substitute_p = L"&amp;";
@ -1472,11 +1505,32 @@ std::size_t
Tokenizer::tokenize(std::istream& is, std::ostream& os)
{
size_t line_no = 0;
size_t sent_no = 0;
size_t nbreaks = 0;
while (is.good() && os.good()) {
std::string istr;
std::getline(is,istr);
line_no ++;
if (istr.empty()) {
if (splitting()) {
if (istr.empty()) {
if (is.eof())
break;
if (nbreaks)
os << std::endl;
nbreaks++;
reset();
continue;
}
if (skip_xml_p &&
(RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
os << std::endl;
nbreaks++;
reset();
continue;
} else {
// XXX
}
} else if (istr.empty()) {
if (is.eof())
break;
os << std::endl;
@ -1493,7 +1547,7 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
std::cerr.flush();
}
}
return line_no;
return splitting() ? sent_no : line_no;
}
@ -1622,6 +1676,224 @@ Tokenizer::detokenize(std::istream& is, std::ostream& os)
}
// Draft sentence splitter over a raw UTF-8 chunk.
//
// Pass 1 scans the text as UCS-4 codepoints for a sentence-terminal run
// (?, !, .) followed by whitespace and a plausible sentence start
// (opening quote/bracket/inverted punctuation, uppercase or titlecase
// letter), emitting each detected boundary as a line break into ostr0.
// Pass 2 re-examines the tokens around trailing periods and rejoins
// false splits after known non-breaking prefixes (nbpre_gen_set /
// nbpre_num_set) and acronym-like tokens (e.g. "U.S.").
//
// @param istr  UTF-8 text, possibly multiple sentences (and newlines)
// @return      the text with sentence boundaries rendered as newlines
std::string
Tokenizer::splitter(const std::string &istr) {
    std::ostringstream ostr0;
    gchar *ccur = (gchar *)istr.c_str();
    gchar *cend = ccur + istr.size();
    glong ncp = 0;
    // g_utf8_to_ucs4_fast returns newly allocated memory; freed below
    gunichar *ucs4 = g_utf8_to_ucs4_fast(ccur,cend-ccur,&ncp);
    size_t n_nongraph1 = 0; // non-graphic cps after the terminal, before term_more
    size_t n_nongraph2 = 0; // non-graphic cps after term_more
    bool beginning = true;  // at start of text or just after a newline
    // NOTE(review): with a sentinel of 1L, codepoint index 1 is
    // indistinguishable from "unset"; -1L looks intended — confirm.
    const glong invalid = 1L;
    glong term_start = invalid; // index of first terminal punctuation cp
    glong term_end = invalid;   // index one past the terminal run
    glong term_post = invalid;  // index where the next sentence starts
    glong term_more = invalid;  // opening punctuation before that start
    for (glong icp = 0; icp <= ncp; ++icp) {
        // a boundary was confirmed: flush the sentence tail
        // [term_start,term_end), a line break, then the pending start of
        // the next sentence [term_post,icp)
        if (term_post != invalid) {
            gchar * pre = g_ucs4_to_utf8(ucs4+term_start,term_end - term_start,0,0,0);
            ostr0 << pre;
            g_free(pre);
            ostr0 << std::endl;
            gchar *post = g_ucs4_to_utf8(ucs4+term_post,icp-term_post,0,0,0);
            ostr0 << post;
            g_free(post);
            term_start = term_end = term_post = term_more = invalid;
            n_nongraph1 = n_nongraph2 = 0;
        }
        if (icp == ncp)
            break;
        if (!g_unichar_isgraph(ucs4[icp])) {
            // whitespace/control after a terminal run closes it
            if (term_start != invalid && term_end == invalid) {
                term_end = icp;
            }
            if (ucs4[icp] == L'\n') {
                // hard line break resets all boundary-candidate state
                beginning = true;
                n_nongraph2 = n_nongraph1 = 0;
                term_start = term_end = term_post = term_more = invalid;
            } else if (!beginning) {
                // NOTE(review): truthiness test, not `!= invalid`; with
                // invalid==1 an unset term_more is nonzero — verify intent.
                if (term_more)
                    n_nongraph2++;
                else
                    n_nongraph1++;
            }
            continue;
        }
        beginning = false;
        GUnicodeType icp_type = g_unichar_type(ucs4[icp]);
        if (g_unichar_ispunct(ucs4[icp])) {
            switch (ucs4[icp]) {
            case L'?':
            case L'!':
            case L'.':
                // begin (or extend) a sentence-terminal run
                if (term_start == invalid) {
                    term_start = icp;
                    continue;
                }
                break;
            case L'\'':
            case L'\"':
            case L'(':
            case L'[':
            case L'¿':
            case L'¡':
                // ASCII/Spanish opening punctuation after terminal+space:
                // candidate start of the next sentence
                if (term_end != invalid && n_nongraph1 && !n_nongraph2) {
                    term_more = term_post = icp;
                    continue;
                }
                break;
            }
        }
        switch (icp_type) {
        case G_UNICODE_INITIAL_PUNCTUATION:
            if (term_end != invalid && n_nongraph1 && !n_nongraph2) {
                term_post = icp;
                term_more = invalid;
                continue;
            }
            break;
        case G_UNICODE_FINAL_PUNCTUATION:
            if (term_end != invalid && n_nongraph1 && !n_nongraph2) {
                if (!n_nongraph2) {
                    term_more = term_post = icp;
                }
                continue;
            }
            break;
        case G_UNICODE_UPPERCASE_LETTER:
        case G_UNICODE_TITLECASE_LETTER:
            // capital letter after terminal+space confirms a boundary
            if (term_end != invalid && n_nongraph1) {
                term_post = icp;
                continue;
            }
            break;
        default:
            break;
        }
        // anything else invalidates the current boundary candidate
        term_start = term_end = term_post = term_more = invalid;
        n_nongraph1 = n_nongraph2 = 0;
    }
    g_free(ucs4); // fix: buffer from g_utf8_to_ucs4_fast was leaked
    std::vector<std::string> tokens = split(ostr0.str());
    size_t ntok = tokens.size();
    // fix: guard empty token list — `ntok - 1` below would underflow and
    // tokens[ntok-1] would be out of range
    if (!ntok)
        return std::string();
    std::ostringstream ostr1;
    for (size_t itok = 0; itok < ntok - 1; ++itok) {
        std::string& word(tokens[itok]);
        if (RE2::FullMatch(word,split_word)) {
            size_t nchar = word.size();
            size_t ndot = 0;
            size_t ntrail = 0;
            gunichar gu = 0;
            gchar *base = (gchar *)word.c_str();
            gchar *prev = 0;
            // strip trailing periods
            while (nchar && word.at(nchar-1) == '.') {
                ++ndot;
                --nchar;
            }
            // strip trailing close-quote/bracket/percent/final punctuation
            while (nchar) {
                switch (word.at(nchar-1)) {
                case '\'':
                case '"':
                case ')':
                case ']':
                case '%':
                    ++ntrail;
                    --nchar;
                    continue;
                default:
                    prev = g_utf8_find_prev_char(base,base+nchar);
                    gu = prev ? g_utf8_get_char(prev) : 0;
                    if (gu && g_unichar_type(gu) == G_UNICODE_FINAL_PUNCTUATION) {
                        ++ntrail;
                        --nchar;
                        continue;
                    }
                }
                break;
            }
            bool non_break_p = false;
            if (nchar && !ntrail && nbpre_gen_set.find(word.substr(0,nchar)) != nbpre_gen_set.end()) {
                // bare known non-breaking prefix (abbreviation)
                non_break_p = true;
            } else {
                // acronym heuristic: uppercase letters and dots only,
                // with at least one leading dot-free capital
                nchar = word.size();
                ndot = 0;
                size_t nupper = 0;
                size_t nlead = 0;
                size_t ichar = 0;
                for (; ichar < nchar; ++ichar) {
                    char &byte(word.at(ichar));
                    // NOTE(review): `char` may be signed; UTF-8 bytes
                    // >= 0x80 become negative and satisfy `< 0x7f`,
                    // taking the ASCII branch — confirm intended.
                    if (byte < 0x7f) {
                        if ((byte <= 'Z' && byte >= 'A') || byte == '-') {
                            nupper++;
                            continue;
                        }
                        if (byte == '.') {
                            ndot++;
                            if (!nupper)
                                nlead++;
                            continue;
                        }
                    } else {
                        gu = g_utf8_get_char(base+ichar);
                        if (gu && g_unichar_type(gu) == G_UNICODE_UPPERCASE_LETTER) {
                            nupper++;
                            continue;
                        }
                    }
                    break;
                }
                non_break_p = ichar == nchar && nlead && nupper && ndot > nlead;
            }
            if (!non_break_p && RE2::FullMatch(tokens[itok+1],split_word2)) {
                // numeric non-breaking prefix followed by a digit
                if (nchar && nbpre_num_set.find(word.substr(0,nchar)) != nbpre_num_set.end()
                    && std::isdigit(tokens[itok+1].at(0))) {
                    non_break_p = true;
                }
            }
            ostr1 << word;
            if (!non_break_p) {
                ostr1 << std::endl;
            } else {
                ostr1 << ' ';
            }
        } else {
            ostr1 << word << ' ';
        }
    }
    ostr1 << tokens[ntok-1] << std::endl;
    return ostr1.str();
}
// Streaming sentence splitter: read lines from is, write the
// sentence-split form of each non-empty line to os.  When skip_xml_p is
// set, XML tag lines and whitespace-only lines pass through unchanged.
//
// @return number of lines read from is
std::size_t
Tokenizer::splitter(std::istream& is, std::ostream& os)
{
    size_t line_no = 0;
    while (is.good() && os.good()) {
        std::string istr;
        std::getline(is,istr);
        // fix: the failed getline at end-of-file yields an empty string;
        // counting it inflated the returned line count by one for
        // newline-terminated input
        if (istr.empty() && is.eof())
            break;
        line_no ++;
        if (istr.empty())
            continue;
        if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
            os << istr << std::endl;
        } else {
            // NOTE(review): splitter(istr) already ends with a newline,
            // so this appends a blank separator line after each input
            // line — confirm that is the intended output format
            os << splitter(istr) << std::endl;
        }
    }
    return line_no;
}
#ifdef TOKENIZER_NAMESPACE
}; // namespace
#endif

View File

@ -28,12 +28,22 @@ private:
static std::string cfg_dir;
// non-breaking prefixes (numeric) utf8
std::set<std::string> nbpre_num_set;
// non-breaking prefixes (other) utf8
std::set<std::string> nbpre_gen_set;
// non-breaking prefixes (numeric) ucs4
std::set<std::wstring> nbpre_num_ucs4;
// non-breaking prefixes (other) ucs4
std::set<std::wstring> nbpre_gen_ucs4;
// compiled protected patterns
std::vector<re2::RE2 *> prot_pat_vec;
// sentence starts embedded in last line of input
std::vector<std::size_t> starts_vec;
protected:
// language
@ -42,6 +52,7 @@ protected:
bool latin_p; // is lang_iso "fr" or "it"
bool skip_xml_p;
bool skip_alltags_p;
bool entities_p;
bool escape_p;
bool unescape_p;
bool aggressive_hyphen_p;
@ -54,6 +65,7 @@ protected:
bool narrow_kana_p;
bool refined_p;
bool drop_bad_p;
bool splits_p;
bool verbose_p;
std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
@ -80,6 +92,11 @@ public:
// required before other methods, may throw
void init();
// required after processing a contiguous sequence of lines when sentence splitting is on
void reset();
bool splitting() const { return splits_p; }
// streaming tokenizer reads from is, writes to os, preserving line breaks
std::size_t tokenize(std::istream& is, std::ostream& os);
@ -117,6 +134,11 @@ public:
return detokenize(oss.str());
}
std::string splitter(const std::string &istr);
// split sentences from lines of input
std::size_t splitter(std::istream& is, std::ostream& os);
}; // end class Tokenizer
#ifdef TOKENIZER_NAMESPACE

View File

@ -20,6 +20,7 @@ usage(const char *path)
std::cerr << " -d -- downcase" << std::endl;
std::cerr << " -D -- detokenize" << std::endl;
std::cerr << " -e -- do not escape entities during tokenization" << std::endl;
std::cerr << " -E -- preserve entities during tokenization" << std::endl;
std::cerr << " -k -- narrow kana" << std::endl;
std::cerr << " -n -- narrow latin" << std::endl;
std::cerr << " -N -- normalize" << std::endl;
@ -27,12 +28,15 @@ usage(const char *path)
std::cerr << " -p -- penn treebank style" << std::endl;
std::cerr << " -r -- refined contraction and quantity conjoining" << std::endl;
std::cerr << " -s -- super- and sub-script conjoining" << std::endl;
std::cerr << " -S -- buffer and sentence-split lines" << std::endl;
std::cerr << " -t -- do not tokenize (for use as splitter)." << std::endl;
std::cerr << " -u -- disable url handling" << std::endl;
std::cerr << " -U -- unescape entities before tokenization, after detokenization" << std::endl;
std::cerr << " -v -- verbose" << std::endl;
std::cerr << " -w -- word filter" << std::endl;
std::cerr << " -x -- skip xml tag lines" << std::endl;
std::cerr << " -y -- skip all xml tags" << std::endl;
std::cerr << " -X -- split only" << std::endl;
std::cerr << "Default is -c ., stdin, stdout." << std::endl;
std::cerr << "LL in en,fr,it affect contraction. LL selects nonbreaking prefix file" << std::endl;
std::cerr << "nonbreaking_prefix.LL is sought in getenv('TOKENIZER_SHARED_DIR')." << std::endl;
@ -83,15 +87,35 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
int nlines = 0;
std::string line;
while (ifs.good() && std::getline(ifs,line)) {
if (line.empty()) continue;
if (line.empty())
continue;
std::vector<std::string> tokens(tize.tokens(line));
int count = 0;
bool was_break = false;
for (auto& token: tokens) {
if (token.empty()) {
if (count || was_break) {
ofs << std::endl;
count = 0;
nlines++;
was_break = true;
continue;
}
}
was_break = false;
std::string word(token_word(token));
if (word.empty()) continue;
ofs << word << ' ';
count++;
if (word.empty()) {
continue;
}
if (count++) {
ofs << ' ';
}
ofs << word;
}
if (count) {
ofs << std::endl;
nlines++;
@ -110,7 +134,9 @@ int main(int ac, char **av)
bool next_cfg_p = false;
bool next_output_p = false;
bool detokenize_p = std::strstr(av[0],"detokenize") != 0;
if (!detokenize_p)
params.split_p = std::strstr(av[0],"splitter") != 0;
while (++av,--ac) {
if (**av == '-') {
switch (av[0][1]) {
@ -127,11 +153,14 @@ int main(int ac, char **av)
params.downcase_p = true;
break;
case 'D':
detokenize_p = true;
detokenize_p = !detokenize_p;
break;
case 'e':
params.escape_p = false;
break;
case 'E':
params.entities_p = true;
break;
case 'h':
usage(prog);
exit(0);
@ -156,6 +185,12 @@ int main(int ac, char **av)
case 's':
params.supersub_p = true;
break;
case 'S':
params.split_p = !params.split_p;
break;
case 'T':
params.notokenization_p = true;
break;
case 'U':
params.unescape_p = true;
break;
@ -171,6 +206,9 @@ int main(int ac, char **av)
case 'x':
params.detag_p = true;
break;
case 'X':
params.notokenization_p = true;
break;
case 'y':
params.alltag_p = true;
break;
@ -262,6 +300,8 @@ int main(int ac, char **av)
} else if (params.args.empty()) {
if (detokenize_p) {
nlines = tize.detokenize(std::cin,ofs);
} else if (params.notokenization_p) {
nlines = tize.splitter(std::cin,ofs);
} else {
nlines = tize.tokenize(std::cin,ofs);
}
@ -271,6 +311,8 @@ int main(int ac, char **av)
std::ifstream ifs(arg.c_str());
if (detokenize_p) {
nlines = tize.detokenize(ifs,ofs);
} else if (params.notokenization_p) {
nlines = tize.splitter(ifs,ofs);
} else {
nlines = tize.tokenize(ifs,ofs);
}