Make -a work more like the Perl tokenizer

This commit is contained in:
akimbal1 2015-04-01 18:26:19 -04:00
parent 2e39e829bf
commit d4ef9ce106
4 changed files with 103 additions and 64 deletions

View File

@ -12,7 +12,7 @@ Parameters::Parameters()
, detag_p(false)
, alltag_p(false)
, entities_p(false)
, escape_p(true)
, escape_p(false)
, aggro_p(false)
, supersub_p(false)
, url_p(true)

View File

@ -74,6 +74,30 @@ class_follows_p(gunichar *s, gunichar *e, GUnicodeType gclass) {
return false;
}
// Moses-style escape sequences, indexed to match the character tests in
// Tokenizer::escape() (0:| 1:[ 2:] 3:& 4:< 5:> 6:' 7:").  These are the
// strings *emitted* when escaping, so every entry must be the entity form:
// assigning a raw character (e.g. "&" for index 3) would make the
// substitution in escape() a no-op.
const char *ESCAPE_MOSES[] = {
    "&#124;", // | 0 (7c)
    "&#91;",  // [ 1 (5b)
    "&#93;",  // ] 2 (5d)
    "&amp;",  // & 3 (26)
    "&lt;",   // < 4 (3c)
    "&gt;",   // > 5 (3e)
    "&apos;", // ' 6 (27)
    "&quot;", // " 7 (22)
};
// Raw characters protected by Moses escaping.  Tokenizer::unescape() probes
// this set with the *decoded* character of an entity (one char), so the set
// must contain the characters themselves, not the entity strings above —
// otherwise entities such as &lt; would be unescaped even when escape_p is on.
const std::set<std::string>
ESCAPE_SET = {
    "|", "[", "]", "&", "<", ">", "'", "\"",
};
const std::map<std::wstring,gunichar>
ENTITY_MAP = {
{ std::wstring(L"&quot;"), L'"' },
@ -375,41 +399,6 @@ get_entity(char *ptr, size_t len) {
}
bool
unescape(std::string& word) {
  // Replace each recognized entity ("&name;" / "&#nnn;") in word with the
  // single character it denotes; unknown entities are copied through
  // verbatim.  Returns true iff word was modified.
  std::ostringstream oss;
  std::size_t was = 0;  // start of the not-yet-copied tail of word
  std::size_t pos = 0;  // position of the candidate '&'
  std::size_t endp = 0; // position of the matching ';'
  bool hit = false;     // substituted at least one entity?
  while ((pos = word.find('&', was)) != std::string::npos
         && (endp = word.find(';', pos)) != std::string::npos) {
    std::size_t len = endp - pos + 1; // byte length of the "&...;" candidate
    glong ulen(0);
    gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)word.c_str()+pos, len, &ulen);
    gunichar gbuf[2] = { 0 };
    if ((gbuf[0] = get_entity(gtmp, ulen)) != gunichar(0)) {
      // gbuf holds exactly one decoded character plus a NUL terminator;
      // convert only that one char.  Passing ulen (the entity's length in
      // code points) here would read past the 2-slot gbuf buffer.
      gchar *gstr = g_ucs4_to_utf8(gbuf, 1, 0, 0, 0);
      if (was < pos)
        oss << word.substr(was, pos-was); // copy text preceding the entity
      oss << gstr;
      g_free(gstr);
      hit = true;
    } else {
      // not a known entity: pass the whole "&...;" span through unchanged
      oss << word.substr(was, 1+endp-was);
    }
    g_free(gtmp);
    // resume scanning just past the ';' (the loop guard ensures endp is valid)
    was = endp + 1;
  }
  if (was < word.size())
    oss << word.substr(was);
  if (hit)
    word = oss.str();
  return hit;
}
inline std::string
trim(const std::string& in)
{
@ -682,22 +671,51 @@ Tokenizer::protected_tokenize(std::string& text) {
}
bool
Tokenizer::unescape(std::string& word) {
  // Replace each recognized entity in word with the character it denotes.
  // When escape_p is set, entities whose decoded character is a Moses
  // meta-character (ESCAPE_SET) are deliberately left escaped, so escape()
  // and unescape() can be enabled together.  Returns true iff word changed.
  std::ostringstream oss;
  std::size_t was = 0;  // start of the not-yet-copied tail of word
  std::size_t pos = 0;  // position of the candidate '&'
  std::size_t endp = 0; // position of the matching ';'
  bool hit = false;     // substituted at least one entity?
  while ((pos = word.find('&', was)) != std::string::npos
         && (endp = word.find(';', pos)) != std::string::npos) {
    std::size_t len = endp - pos + 1; // byte length of the "&...;" candidate
    glong ulen(0);
    gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)word.c_str()+pos, len, &ulen);
    gunichar gbuf[2] = { 0 };
    if ((gbuf[0] = get_entity(gtmp, ulen)) != gunichar(0)) {
      // gbuf holds exactly one decoded character plus a NUL terminator;
      // convert only that one char.  Passing ulen (the entity's length in
      // code points) here would read past the 2-slot gbuf buffer.
      gchar *gstr = g_ucs4_to_utf8(gbuf, 1, 0, 0, 0);
      if (escape_p && ESCAPE_SET.find(std::string(gstr)) != ESCAPE_SET.end()) {
        // do not unescape moses escapes when escape flag is turned on
        oss << word.substr(was, 1+endp-was);
      } else {
        if (was < pos)
          oss << word.substr(was, pos-was); // copy text preceding the entity
        oss << gstr;
        hit = true;
      }
      g_free(gstr);
    } else {
      // not a known entity: pass the whole "&...;" span through unchanged
      oss << word.substr(was, 1+endp-was);
    }
    g_free(gtmp);
    // resume scanning just past the ';' (the loop guard ensures endp is valid)
    was = endp + 1;
  }
  if (was < word.size())
    oss << word.substr(was);
  if (hit)
    word = oss.str();
  return hit;
}
bool
Tokenizer::escape(std::string& text) {
bool mod_p = false;
std::string outs;
static const char *replacements[] = {
"&#124;", // | 0
"&#91;", // [ 1
"&#93;", // ] 2
"&amp;", // & 3
"&lt;", // < 4
"&gt;", // > 5
"&apos;", // ' 6
"&quot;", // " 7
};
const char *pp = text.c_str(); // from pp to pt is uncopied
const char *ep = pp + text.size();
const char *pt = pp;
@ -720,25 +738,29 @@ Tokenizer::escape(std::string& text) {
const char *sequence_p = 0;
if (*pt < '?') {
if (*pt == '&') {
sequence_p = replacements[3];
// check for a pre-existing escape
const char *sc = strchr(pt,';');
if (!sc || sc-pt < 2 || sc-pt > 9) {
sequence_p = ESCAPE_MOSES[3];
}
} else if (*pt == '\'') {
sequence_p = replacements[6];
sequence_p = ESCAPE_MOSES[6];
} else if (*pt == '"') {
sequence_p = replacements[7];
sequence_p = ESCAPE_MOSES[7];
}
} else if (*pt > ']') {
if (*pt =='|') { // 7c
sequence_p = replacements[0];
sequence_p = ESCAPE_MOSES[0];
}
} else if (*pt > 'Z') {
if (*pt == '<') { // 3c
sequence_p = replacements[4];
sequence_p = ESCAPE_MOSES[4];
} else if (*pt == '>') { // 3e
sequence_p = replacements[5];
sequence_p = ESCAPE_MOSES[5];
} else if (*pt == '[') { // 5b
sequence_p = replacements[1];
sequence_p = ESCAPE_MOSES[1];
} else if (*pt == ']') { // 5d
sequence_p = replacements[2];
sequence_p = ESCAPE_MOSES[2];
}
}
@ -1056,7 +1078,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
in_url_p = in_num_p = false;
break;
case G_UNICODE_DASH_PUNCTUATION:
if (aggressive_hyphen_p && !in_url_p) {
if (aggressive_hyphen_p && !in_url_p && curr_uch != next_uch && prev_uch != curr_uch && (!(prev_uch == L' ' || !prev_uch) && !(next_uch == L' ' || !next_uch))) {
substitute_p = L"@-@";
post_break_p = pre_break_p = true;
} else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) ||
@ -1090,6 +1112,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
case G_UNICODE_DECIMAL_NUMBER:
case G_UNICODE_LETTER_NUMBER:
case G_UNICODE_OTHER_NUMBER:
case G_UNICODE_OTHER_PUNCTUATION:
break;
default:
post_break_p = true;
@ -1118,6 +1141,9 @@ Tokenizer::quik_tokenize(const std::string& buf)
case G_UNICODE_LETTER_NUMBER:
case G_UNICODE_OTHER_NUMBER:
break;
case G_UNICODE_OTHER_PUNCTUATION:
if (prev_type != next_type)
break;
default:
post_break_p = pre_break_p = prev_uch != curr_uch;
}
@ -1517,6 +1543,10 @@ Tokenizer::quik_tokenize(const std::string& buf)
}
}
// escape moses meta-characters
if (escape_p)
escape(text);
return text;
}

View File

@ -85,14 +85,13 @@ protected:
bool para_marks_p;
bool split_breaks_p;
// return counts of general and numeric prefixes loaded
std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
// escapes specials into entities from the set &|"'[] (after tokenization, when enabled)
bool escape(std::string& inplace);
// in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
void protected_tokenize(std::string& inplace);
// used for boost::thread
struct VectorTokenizerCallable {
Tokenizer *tokenizer;
std::vector<std::string>& in;
@ -120,8 +119,6 @@ protected:
public:
void set_config_dir(const std::string& _cfg_dir);
Tokenizer(); // UNIMPL
// no throw
@ -133,18 +130,29 @@ public:
// required before other methods, may throw
void init(const char *cfg_dir_path = 0);
void set_config_dir(const std::string& _cfg_dir);
// required after processing a contiguous sequence of lines when sentence splitting is on
void reset();
// simultaneous sentence splitting not yet implemented
bool splitting() const { return splits_p; }
// escapes chars the set &|"'<> after tokenization (moses special characters)
bool escape(std::string& inplace);
// used in detokenizer, converts entities into characters
// if escape_p is set, does not unescape moses special tokens, thus
// escape_p and unescape_p can be used together usefully
bool unescape(std::string& inplace);
// streaming select-tokenizer reads from is, writes to os, preserving line breaks (unless splitting)
std::size_t tokenize(std::istream& is, std::ostream& os);
// quik-tokenize padded line buffer to return string
std::string quik_tokenize(const std::string& buf);
// penn-tokenize padded line buffer to return string
// penn-tokenize padded line buffer to return string // untested
std::string penn_tokenize(const std::string& buf);
// select-tokenize padded line buffer to return string
@ -184,9 +192,10 @@ public:
return detokenize(oss.str());
}
// split a string on sentence boundaries (approximately)
std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0);
// split sentences from lines of input
// split sentences from input stream and write one per line on output stream
std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os);
}; // end class Tokenizer

View File

@ -162,7 +162,7 @@ int main(int ac, char **av)
detokenize_p = !detokenize_p;
break;
case 'e':
params.escape_p = false;
params.escape_p = !params.escape_p;
break;
case 'E':
params.entities_p = true;