draft splitter

This commit is contained in:
akimbal1 2015-03-19 01:02:18 -04:00
parent 99b8f65fb1
commit 1b9da3bb04
5 changed files with 352 additions and 10 deletions

View File

@ -9,6 +9,7 @@ Parameters::Parameters()
, verbose_p(false)
, detag_p(false)
, alltag_p(false)
, entities_p(false)
, escape_p(true)
, aggro_p(false)
, supersub_p(false)
@ -23,6 +24,8 @@ Parameters::Parameters()
, refined_p(false)
, unescape_p(false)
, drop_bad_p(false)
, split_p(false)
, notokenization_p(false)
{
}

View File

@ -16,6 +16,7 @@ struct Parameters
bool verbose_p;
bool detag_p;
bool alltag_p;
bool entities_p;
bool escape_p;
bool aggro_p;
bool supersub_p;
@ -30,6 +31,8 @@ struct Parameters
bool refined_p;
bool unescape_p;
bool drop_bad_p;
bool split_p;
bool notokenization_p;
Parameters();

View File

@ -62,7 +62,8 @@ RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes c
RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes
RE2 quotes_x("^[\'\"]+$"); //
RE2 endnum_x("[-\'\"]"); //
RE2 split_word("([\\p{L}\\p{N}\\.\\-]*)([\\'\\\"\\)\\]\\%\\p{Pf}]*)(\\.+)$"); //
RE2 split_word2("^([ ]*[\\'\\\"\\(\\[¿¡\\p{Pi}]*[ ]*[\\p{Lu}\\p{N}])");
// anything rarely used will just be given as a string and compiled on demand by RE2
const char *SPC_BYTE = " ";
@ -447,6 +448,7 @@ Tokenizer::Tokenizer(const Parameters& _)
, latin_p((!english_p) && (_.lang_iso.compare("fr")==0 || _.lang_iso.compare("it")==0))
, skip_xml_p(_.detag_p)
, skip_alltags_p(_.alltag_p)
, entities_p(_.entities_p)
, escape_p(_.escape_p)
, unescape_p(_.unescape_p)
, aggressive_hyphen_p(_.aggro_p)
@ -459,6 +461,7 @@ Tokenizer::Tokenizer(const Parameters& _)
, narrow_kana_p(_.narrow_kana_p)
, refined_p(_.refined_p)
, drop_bad_p(_.drop_bad_p)
, splits_p(_.split_p)
, verbose_p(_.verbose_p)
{
}
@ -591,6 +594,12 @@ Tokenizer::init() {
}
// Discard per-sequence sentence-splitting state: the recorded offsets of
// sentence starts embedded in the last line of input (starts_vec).
// Required between contiguous sequences of lines when sentence splitting
// is enabled (see splitting()).
void
Tokenizer::reset() {
starts_vec.clear();
}
//
// apply ctor-selected tokenization to a string, in-place, no newlines allowed,
// assumes protections are applied already, some invariants are in place,
@ -1024,11 +1033,35 @@ Tokenizer::tokenize(const std::string& buf)
ucs4 = eptr;
nxt4 = ++eptr;
next_uch = *nxt4;
next_type = nxt4 < lim4 ? g_unichar_type(*nxt4) : G_UNICODE_UNASSIGNED;
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
goto retry;
}
}
}
if (entities_p && !in_url_p) {
gunichar *cur4 = nxt4;
if (*cur4 == gunichar('#')) ++cur4;
while (g_unichar_isalnum(*cur4)) ++cur4;
if (cur4 > nxt4 && *cur4 == gunichar(';')) {
if (since_start) {
*uptr++ = gunichar(L' ');
since_start = 0;
}
++cur4;
memcpy(uptr,ucs4,cur4-ucs4);
uptr += cur4-ucs4;
ucs4 = cur4;
*uptr++ = gunichar(L' ');
pre_break_p = post_break_p = false;
curr_uch = *ucs4;
curr_type = ucs4 < lim4 ? g_unichar_type(curr_uch) : G_UNICODE_UNASSIGNED;
nxt4 = ++cur4;
next_uch = *nxt4;
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
goto retry;
}
}
post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
if (escape_p)
substitute_p = L"&amp;";
@ -1472,11 +1505,32 @@ std::size_t
Tokenizer::tokenize(std::istream& is, std::ostream& os)
{
size_t line_no = 0;
size_t sent_no = 0;
size_t nbreaks = 0;
while (is.good() && os.good()) {
std::string istr;
std::getline(is,istr);
line_no ++;
if (istr.empty()) {
if (splitting()) {
if (istr.empty()) {
if (is.eof())
break;
if (nbreaks)
os << std::endl;
nbreaks++;
reset();
continue;
}
if (skip_xml_p &&
(RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
os << std::endl;
nbreaks++;
reset();
continue;
} else {
// XXX
}
} else if (istr.empty()) {
if (is.eof())
break;
os << std::endl;
@ -1493,7 +1547,7 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
std::cerr.flush();
}
}
return line_no;
return splitting() ? sent_no : line_no;
}
@ -1622,6 +1676,224 @@ Tokenizer::detokenize(std::istream& is, std::ostream& os)
}
// Draft sentence splitter over a raw UTF-8 chunk.
//
// Pass 1 scans the text as UCS-4 codepoints for a sentence-terminal run
// (?, !, .) followed by whitespace and a plausible sentence start
// (opening quote/bracket/inverted punctuation, uppercase or titlecase
// letter), emitting each detected boundary as a line break into ostr0.
// Pass 2 re-examines the tokens around trailing periods and rejoins
// false splits after known non-breaking prefixes (nbpre_gen_set /
// nbpre_num_set) and acronym-like tokens (e.g. "U.S.").
//
// @param istr  UTF-8 text, possibly multiple sentences (and newlines)
// @return      the text with sentence boundaries rendered as newlines
std::string
Tokenizer::splitter(const std::string &istr) {
    std::ostringstream ostr0;
    gchar *ccur = (gchar *)istr.c_str();
    gchar *cend = ccur + istr.size();
    glong ncp = 0;
    // g_utf8_to_ucs4_fast returns newly allocated memory; freed below
    gunichar *ucs4 = g_utf8_to_ucs4_fast(ccur,cend-ccur,&ncp);
    size_t n_nongraph1 = 0; // non-graphic cps after the terminal, before term_more
    size_t n_nongraph2 = 0; // non-graphic cps after term_more
    bool beginning = true;  // at start of text or just after a newline
    // NOTE(review): with a sentinel of 1L, codepoint index 1 is
    // indistinguishable from "unset"; -1L looks intended — confirm.
    const glong invalid = 1L;
    glong term_start = invalid; // index of first terminal punctuation cp
    glong term_end = invalid;   // index one past the terminal run
    glong term_post = invalid;  // index where the next sentence starts
    glong term_more = invalid;  // opening punctuation before that start
    for (glong icp = 0; icp <= ncp; ++icp) {
        // a boundary was confirmed: flush the sentence tail
        // [term_start,term_end), a line break, then the pending start of
        // the next sentence [term_post,icp)
        if (term_post != invalid) {
            gchar * pre = g_ucs4_to_utf8(ucs4+term_start,term_end - term_start,0,0,0);
            ostr0 << pre;
            g_free(pre);
            ostr0 << std::endl;
            gchar *post = g_ucs4_to_utf8(ucs4+term_post,icp-term_post,0,0,0);
            ostr0 << post;
            g_free(post);
            term_start = term_end = term_post = term_more = invalid;
            n_nongraph1 = n_nongraph2 = 0;
        }
        if (icp == ncp)
            break;
        if (!g_unichar_isgraph(ucs4[icp])) {
            // whitespace/control after a terminal run closes it
            if (term_start != invalid && term_end == invalid) {
                term_end = icp;
            }
            if (ucs4[icp] == L'\n') {
                // hard line break resets all boundary-candidate state
                beginning = true;
                n_nongraph2 = n_nongraph1 = 0;
                term_start = term_end = term_post = term_more = invalid;
            } else if (!beginning) {
                // NOTE(review): truthiness test, not `!= invalid`; with
                // invalid==1 an unset term_more is nonzero — verify intent.
                if (term_more)
                    n_nongraph2++;
                else
                    n_nongraph1++;
            }
            continue;
        }
        beginning = false;
        GUnicodeType icp_type = g_unichar_type(ucs4[icp]);
        if (g_unichar_ispunct(ucs4[icp])) {
            switch (ucs4[icp]) {
            case L'?':
            case L'!':
            case L'.':
                // begin (or extend) a sentence-terminal run
                if (term_start == invalid) {
                    term_start = icp;
                    continue;
                }
                break;
            case L'\'':
            case L'\"':
            case L'(':
            case L'[':
            case L'¿':
            case L'¡':
                // ASCII/Spanish opening punctuation after terminal+space:
                // candidate start of the next sentence
                if (term_end != invalid && n_nongraph1 && !n_nongraph2) {
                    term_more = term_post = icp;
                    continue;
                }
                break;
            }
        }
        switch (icp_type) {
        case G_UNICODE_INITIAL_PUNCTUATION:
            if (term_end != invalid && n_nongraph1 && !n_nongraph2) {
                term_post = icp;
                term_more = invalid;
                continue;
            }
            break;
        case G_UNICODE_FINAL_PUNCTUATION:
            if (term_end != invalid && n_nongraph1 && !n_nongraph2) {
                if (!n_nongraph2) {
                    term_more = term_post = icp;
                }
                continue;
            }
            break;
        case G_UNICODE_UPPERCASE_LETTER:
        case G_UNICODE_TITLECASE_LETTER:
            // capital letter after terminal+space confirms a boundary
            if (term_end != invalid && n_nongraph1) {
                term_post = icp;
                continue;
            }
            break;
        default:
            break;
        }
        // anything else invalidates the current boundary candidate
        term_start = term_end = term_post = term_more = invalid;
        n_nongraph1 = n_nongraph2 = 0;
    }
    g_free(ucs4); // fix: buffer from g_utf8_to_ucs4_fast was leaked
    std::vector<std::string> tokens = split(ostr0.str());
    size_t ntok = tokens.size();
    // fix: guard empty token list — `ntok - 1` below would underflow and
    // tokens[ntok-1] would be out of range
    if (!ntok)
        return std::string();
    std::ostringstream ostr1;
    for (size_t itok = 0; itok < ntok - 1; ++itok) {
        std::string& word(tokens[itok]);
        if (RE2::FullMatch(word,split_word)) {
            size_t nchar = word.size();
            size_t ndot = 0;
            size_t ntrail = 0;
            gunichar gu = 0;
            gchar *base = (gchar *)word.c_str();
            gchar *prev = 0;
            // strip trailing periods
            while (nchar && word.at(nchar-1) == '.') {
                ++ndot;
                --nchar;
            }
            // strip trailing close-quote/bracket/percent/final punctuation
            while (nchar) {
                switch (word.at(nchar-1)) {
                case '\'':
                case '"':
                case ')':
                case ']':
                case '%':
                    ++ntrail;
                    --nchar;
                    continue;
                default:
                    prev = g_utf8_find_prev_char(base,base+nchar);
                    gu = prev ? g_utf8_get_char(prev) : 0;
                    if (gu && g_unichar_type(gu) == G_UNICODE_FINAL_PUNCTUATION) {
                        ++ntrail;
                        --nchar;
                        continue;
                    }
                }
                break;
            }
            bool non_break_p = false;
            if (nchar && !ntrail && nbpre_gen_set.find(word.substr(0,nchar)) != nbpre_gen_set.end()) {
                // bare known non-breaking prefix (abbreviation)
                non_break_p = true;
            } else {
                // acronym heuristic: uppercase letters and dots only,
                // with at least one leading dot-free capital
                nchar = word.size();
                ndot = 0;
                size_t nupper = 0;
                size_t nlead = 0;
                size_t ichar = 0;
                for (; ichar < nchar; ++ichar) {
                    char &byte(word.at(ichar));
                    // NOTE(review): `char` may be signed; UTF-8 bytes
                    // >= 0x80 become negative and satisfy `< 0x7f`,
                    // taking the ASCII branch — confirm intended.
                    if (byte < 0x7f) {
                        if ((byte <= 'Z' && byte >= 'A') || byte == '-') {
                            nupper++;
                            continue;
                        }
                        if (byte == '.') {
                            ndot++;
                            if (!nupper)
                                nlead++;
                            continue;
                        }
                    } else {
                        gu = g_utf8_get_char(base+ichar);
                        if (gu && g_unichar_type(gu) == G_UNICODE_UPPERCASE_LETTER) {
                            nupper++;
                            continue;
                        }
                    }
                    break;
                }
                non_break_p = ichar == nchar && nlead && nupper && ndot > nlead;
            }
            if (!non_break_p && RE2::FullMatch(tokens[itok+1],split_word2)) {
                // numeric non-breaking prefix followed by a digit
                if (nchar && nbpre_num_set.find(word.substr(0,nchar)) != nbpre_num_set.end()
                    && std::isdigit(tokens[itok+1].at(0))) {
                    non_break_p = true;
                }
            }
            ostr1 << word;
            if (!non_break_p) {
                ostr1 << std::endl;
            } else {
                ostr1 << ' ';
            }
        } else {
            ostr1 << word << ' ';
        }
    }
    ostr1 << tokens[ntok-1] << std::endl;
    return ostr1.str();
}
// Streaming sentence splitter: read lines from is, write the
// sentence-split form of each non-empty line to os.  When skip_xml_p is
// set, XML tag lines and whitespace-only lines pass through unchanged.
//
// @return number of lines read from is
std::size_t
Tokenizer::splitter(std::istream& is, std::ostream& os)
{
    size_t line_no = 0;
    while (is.good() && os.good()) {
        std::string istr;
        std::getline(is,istr);
        // fix: the failed getline at end-of-file yields an empty string;
        // counting it inflated the returned line count by one for
        // newline-terminated input
        if (istr.empty() && is.eof())
            break;
        line_no ++;
        if (istr.empty())
            continue;
        if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
            os << istr << std::endl;
        } else {
            // NOTE(review): splitter(istr) already ends with a newline,
            // so this appends a blank separator line after each input
            // line — confirm that is the intended output format
            os << splitter(istr) << std::endl;
        }
    }
    return line_no;
}
#ifdef TOKENIZER_NAMESPACE
}; // namespace
#endif

View File

@ -28,12 +28,22 @@ private:
static std::string cfg_dir;
// non-breaking prefixes (numeric) utf8
std::set<std::string> nbpre_num_set;
// non-breaking prefixes (other) utf8
std::set<std::string> nbpre_gen_set;
// non-breaking prefixes (numeric) ucs4
std::set<std::wstring> nbpre_num_ucs4;
// non-breaking prefixes (other) ucs4
std::set<std::wstring> nbpre_gen_ucs4;
// compiled protected patterns
std::vector<re2::RE2 *> prot_pat_vec;
// sentence starts embedded in last line of input
std::vector<std::size_t> starts_vec;
protected:
// language
@ -42,6 +52,7 @@ protected:
bool latin_p; // is lang_iso "fr" or "it"
bool skip_xml_p;
bool skip_alltags_p;
bool entities_p;
bool escape_p;
bool unescape_p;
bool aggressive_hyphen_p;
@ -54,6 +65,7 @@ protected:
bool narrow_kana_p;
bool refined_p;
bool drop_bad_p;
bool splits_p;
bool verbose_p;
std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
@ -80,6 +92,11 @@ public:
// required before other methods, may throw
void init();
// required after processing a contiguous sequence of lines when sentence splitting is on
void reset();
bool splitting() const { return splits_p; }
// streaming tokenizer reads from is, writes to os, preserving line breaks
std::size_t tokenize(std::istream& is, std::ostream& os);
@ -117,6 +134,11 @@ public:
return detokenize(oss.str());
}
std::string splitter(const std::string &istr);
// split sentences from lines of input
std::size_t splitter(std::istream& is, std::ostream& os);
}; // end class Tokenizer
#ifdef TOKENIZER_NAMESPACE

View File

@ -20,6 +20,7 @@ usage(const char *path)
std::cerr << " -d -- downcase" << std::endl;
std::cerr << " -D -- detokenize" << std::endl;
std::cerr << " -e -- do not escape entities during tokenization" << std::endl;
std::cerr << " -E -- preserve entities during tokenization" << std::endl;
std::cerr << " -k -- narrow kana" << std::endl;
std::cerr << " -n -- narrow latin" << std::endl;
std::cerr << " -N -- normalize" << std::endl;
@ -27,12 +28,15 @@ usage(const char *path)
std::cerr << " -p -- penn treebank style" << std::endl;
std::cerr << " -r -- refined contraction and quantity conjoining" << std::endl;
std::cerr << " -s -- super- and sub-script conjoining" << std::endl;
std::cerr << " -S -- buffer and sentence-split lines" << std::endl;
std::cerr << " -t -- do not tokenize (for use as splitter)." << std::endl;
std::cerr << " -u -- disable url handling" << std::endl;
std::cerr << " -U -- unescape entities before tokenization, after detokenization" << std::endl;
std::cerr << " -v -- verbose" << std::endl;
std::cerr << " -w -- word filter" << std::endl;
std::cerr << " -x -- skip xml tag lines" << std::endl;
std::cerr << " -y -- skip all xml tags" << std::endl;
std::cerr << " -X -- split only" << std::endl;
std::cerr << "Default is -c ., stdin, stdout." << std::endl;
std::cerr << "LL in en,fr,it affect contraction. LL selects nonbreaking prefix file" << std::endl;
std::cerr << "nonbreaking_prefix.LL is sought in getenv('TOKENIZER_SHARED_DIR')." << std::endl;
@ -83,15 +87,35 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
int nlines = 0;
std::string line;
while (ifs.good() && std::getline(ifs,line)) {
if (line.empty()) continue;
if (line.empty())
continue;
std::vector<std::string> tokens(tize.tokens(line));
int count = 0;
bool was_break = false;
for (auto& token: tokens) {
if (token.empty()) {
if (count || was_break) {
ofs << std::endl;
count = 0;
nlines++;
was_break = true;
continue;
}
}
was_break = false;
std::string word(token_word(token));
if (word.empty()) continue;
ofs << word << ' ';
count++;
if (word.empty()) {
continue;
}
if (count++) {
ofs << ' ';
}
ofs << word;
}
if (count) {
ofs << std::endl;
nlines++;
@ -110,7 +134,9 @@ int main(int ac, char **av)
bool next_cfg_p = false;
bool next_output_p = false;
bool detokenize_p = std::strstr(av[0],"detokenize") != 0;
if (!detokenize_p)
params.split_p = std::strstr(av[0],"splitter") != 0;
while (++av,--ac) {
if (**av == '-') {
switch (av[0][1]) {
@ -127,11 +153,14 @@ int main(int ac, char **av)
params.downcase_p = true;
break;
case 'D':
detokenize_p = true;
detokenize_p = !detokenize_p;
break;
case 'e':
params.escape_p = false;
break;
case 'E':
params.entities_p = true;
break;
case 'h':
usage(prog);
exit(0);
@ -156,6 +185,12 @@ int main(int ac, char **av)
case 's':
params.supersub_p = true;
break;
case 'S':
params.split_p = !params.split_p;
break;
case 'T':
params.notokenization_p = true;
break;
case 'U':
params.unescape_p = true;
break;
@ -171,6 +206,9 @@ int main(int ac, char **av)
case 'x':
params.detag_p = true;
break;
case 'X':
params.notokenization_p = true;
break;
case 'y':
params.alltag_p = true;
break;
@ -262,6 +300,8 @@ int main(int ac, char **av)
} else if (params.args.empty()) {
if (detokenize_p) {
nlines = tize.detokenize(std::cin,ofs);
} else if (params.notokenization_p) {
nlines = tize.splitter(std::cin,ofs);
} else {
nlines = tize.tokenize(std::cin,ofs);
}
@ -271,6 +311,8 @@ int main(int ac, char **av)
std::ifstream ifs(arg.c_str());
if (detokenize_p) {
nlines = tize.detokenize(ifs,ofs);
} else if (params.notokenization_p) {
nlines = tize.splitter(ifs,ofs);
} else {
nlines = tize.tokenize(ifs,ofs);
}