mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-19 07:07:24 +03:00
make -a work more like the perl tokenizer
This commit is contained in:
parent
2e39e829bf
commit
d4ef9ce106
@ -12,7 +12,7 @@ Parameters::Parameters()
|
||||
, detag_p(false)
|
||||
, alltag_p(false)
|
||||
, entities_p(false)
|
||||
, escape_p(true)
|
||||
, escape_p(false)
|
||||
, aggro_p(false)
|
||||
, supersub_p(false)
|
||||
, url_p(true)
|
||||
|
@ -74,6 +74,30 @@ class_follows_p(gunichar *s, gunichar *e, GUnicodeType gclass) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Moses escape sequences.  Each entry is the entity that replaces the
// character named in its trailing comment; the number after the character is
// the entry's index and the parenthesized value is the character's hex code.
const char *ESCAPE_MOSES[] = {
    "&#124;", // | 0 (7c)
    "&#91;",  // [ 1 (5b)
    "&#93;",  // ] 2 (5d)
    "&amp;",  // & 3 (26)
    "&lt;",   // < 4 (3c)
    "&gt;",   // > 5 (3e)
    "&apos;", // ' 6 (27)
    "&quot;", // " 7 (22)
};
|
||||
|
||||
const std::set<std::string>
|
||||
ESCAPE_SET = {
|
||||
std::string(ESCAPE_MOSES[0]),
|
||||
std::string(ESCAPE_MOSES[1]),
|
||||
std::string(ESCAPE_MOSES[2]),
|
||||
std::string(ESCAPE_MOSES[3]),
|
||||
std::string(ESCAPE_MOSES[4]),
|
||||
std::string(ESCAPE_MOSES[5]),
|
||||
std::string(ESCAPE_MOSES[6]),
|
||||
std::string(ESCAPE_MOSES[7]),
|
||||
};
|
||||
|
||||
const std::map<std::wstring,gunichar>
|
||||
ENTITY_MAP = {
|
||||
{ std::wstring(L"""), L'"' },
|
||||
@ -375,41 +399,6 @@ get_entity(char *ptr, size_t len) {
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
unescape(std::string& word) {
|
||||
std::ostringstream oss;
|
||||
std::size_t was = 0; // last processed
|
||||
std::size_t pos = 0; // last unprocessed
|
||||
std::size_t len = 0; // processed length
|
||||
bool hit = false;
|
||||
for (std::size_t endp=0;
|
||||
(pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos;
|
||||
was = endp == std::string::npos ? pos : 1+endp) {
|
||||
len = endp - pos + 1;
|
||||
glong ulen(0);
|
||||
gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)word.c_str()+pos, len, &ulen);
|
||||
gunichar gbuf[2] = { 0 };
|
||||
if ((gbuf[0] = get_entity(gtmp,ulen)) != gunichar(0)) {
|
||||
gchar *gstr = g_ucs4_to_utf8(gbuf,ulen,0,0,0);
|
||||
if (was < pos)
|
||||
oss << word.substr(was,pos-was);
|
||||
oss << gstr;
|
||||
g_free(gstr);
|
||||
was += ulen;
|
||||
hit = true;
|
||||
} else {
|
||||
oss << word.substr(was,1+endp-was);
|
||||
}
|
||||
g_free(gtmp);
|
||||
}
|
||||
if (was < word.size())
|
||||
oss << word.substr(was);
|
||||
if (hit)
|
||||
word = oss.str();
|
||||
return hit;
|
||||
}
|
||||
|
||||
|
||||
inline std::string
|
||||
trim(const std::string& in)
|
||||
{
|
||||
@ -682,22 +671,51 @@ Tokenizer::protected_tokenize(std::string& text) {
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
Tokenizer::unescape(std::string& word) {
|
||||
std::ostringstream oss;
|
||||
std::size_t was = 0; // last processed
|
||||
std::size_t pos = 0; // last unprocessed
|
||||
std::size_t len = 0; // processed length
|
||||
bool hit = false;
|
||||
for (std::size_t endp=0;
|
||||
(pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos;
|
||||
was = endp == std::string::npos ? pos : 1+endp) {
|
||||
len = endp - pos + 1;
|
||||
glong ulen(0);
|
||||
gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)word.c_str()+pos, len, &ulen);
|
||||
gunichar gbuf[2] = { 0 };
|
||||
if ((gbuf[0] = get_entity(gtmp,ulen)) != gunichar(0)) {
|
||||
gchar *gstr = g_ucs4_to_utf8(gbuf,ulen,0,0,0);
|
||||
if (escape_p && ESCAPE_SET.find(std::string(gstr)) != ESCAPE_SET.end()) {
|
||||
// do not unescape moses escapes when escape flag is turned on
|
||||
oss << word.substr(was,1+endp-was);
|
||||
} else {
|
||||
if (was < pos)
|
||||
oss << word.substr(was,pos-was);
|
||||
oss << gstr;
|
||||
was += ulen;
|
||||
hit = true;
|
||||
}
|
||||
g_free(gstr);
|
||||
} else {
|
||||
oss << word.substr(was,1+endp-was);
|
||||
}
|
||||
g_free(gtmp);
|
||||
}
|
||||
if (was < word.size())
|
||||
oss << word.substr(was);
|
||||
if (hit)
|
||||
word = oss.str();
|
||||
return hit;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
Tokenizer::escape(std::string& text) {
|
||||
bool mod_p = false;
|
||||
std::string outs;
|
||||
|
||||
static const char *replacements[] = {
|
||||
"|", // | 0
|
||||
"[", // [ 1
|
||||
"]", // ] 2
|
||||
"&", // & 3
|
||||
"<", // < 4
|
||||
">", // > 5
|
||||
"'", // ' 6
|
||||
""", // " 7
|
||||
};
|
||||
|
||||
const char *pp = text.c_str(); // from pp to pt is uncopied
|
||||
const char *ep = pp + text.size();
|
||||
const char *pt = pp;
|
||||
@ -720,25 +738,29 @@ Tokenizer::escape(std::string& text) {
|
||||
const char *sequence_p = 0;
|
||||
if (*pt < '?') {
|
||||
if (*pt == '&') {
|
||||
sequence_p = replacements[3];
|
||||
// check for a pre-existing escape
|
||||
const char *sc = strchr(pt,';');
|
||||
if (!sc || sc-pt < 2 || sc-pt > 9) {
|
||||
sequence_p = ESCAPE_MOSES[3];
|
||||
}
|
||||
} else if (*pt == '\'') {
|
||||
sequence_p = replacements[6];
|
||||
sequence_p = ESCAPE_MOSES[6];
|
||||
} else if (*pt == '"') {
|
||||
sequence_p = replacements[7];
|
||||
sequence_p = ESCAPE_MOSES[7];
|
||||
}
|
||||
} else if (*pt > ']') {
|
||||
if (*pt =='|') { // 7c
|
||||
sequence_p = replacements[0];
|
||||
sequence_p = ESCAPE_MOSES[0];
|
||||
}
|
||||
} else if (*pt > 'Z') {
|
||||
if (*pt == '<') { // 3e
|
||||
sequence_p = replacements[4];
|
||||
sequence_p = ESCAPE_MOSES[4];
|
||||
} else if (*pt == '>') { // 3c
|
||||
sequence_p = replacements[5];
|
||||
sequence_p = ESCAPE_MOSES[5];
|
||||
} else if (*pt == '[') { // 5b
|
||||
sequence_p = replacements[1];
|
||||
sequence_p = ESCAPE_MOSES[1];
|
||||
} else if (*pt == ']') { // 5d
|
||||
sequence_p = replacements[2];
|
||||
sequence_p = ESCAPE_MOSES[2];
|
||||
}
|
||||
}
|
||||
|
||||
@ -1056,7 +1078,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
in_url_p = in_num_p = false;
|
||||
break;
|
||||
case G_UNICODE_DASH_PUNCTUATION:
|
||||
if (aggressive_hyphen_p && !in_url_p) {
|
||||
if (aggressive_hyphen_p && !in_url_p && curr_uch != next_uch && prev_uch != curr_uch && (!(prev_uch == L' ' || !prev_uch) && !(next_uch == L' ' || !next_uch))) {
|
||||
substitute_p = L"@-@";
|
||||
post_break_p = pre_break_p = true;
|
||||
} else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) ||
|
||||
@ -1090,6 +1112,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
case G_UNICODE_DECIMAL_NUMBER:
|
||||
case G_UNICODE_LETTER_NUMBER:
|
||||
case G_UNICODE_OTHER_NUMBER:
|
||||
case G_UNICODE_OTHER_PUNCTUATION:
|
||||
break;
|
||||
default:
|
||||
post_break_p = true;
|
||||
@ -1118,6 +1141,9 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
case G_UNICODE_LETTER_NUMBER:
|
||||
case G_UNICODE_OTHER_NUMBER:
|
||||
break;
|
||||
case G_UNICODE_OTHER_PUNCTUATION:
|
||||
if (prev_type != next_type)
|
||||
break;
|
||||
default:
|
||||
post_break_p = pre_break_p = prev_uch != curr_uch;
|
||||
}
|
||||
@ -1517,6 +1543,10 @@ Tokenizer::quik_tokenize(const std::string& buf)
|
||||
}
|
||||
}
|
||||
|
||||
// escape moses meta-characters
|
||||
if (escape_p)
|
||||
escape(text);
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
|
@ -85,14 +85,13 @@ protected:
|
||||
bool para_marks_p;
|
||||
bool split_breaks_p;
|
||||
|
||||
// return counts of general and numeric prefixes loaded
|
||||
std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
|
||||
|
||||
// escapes specials into entities from the set &|"'[] (after tokenization, when enabled)
|
||||
bool escape(std::string& inplace);
|
||||
|
||||
// in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
|
||||
void protected_tokenize(std::string& inplace);
|
||||
|
||||
// used for boost::thread
|
||||
struct VectorTokenizerCallable {
|
||||
Tokenizer *tokenizer;
|
||||
std::vector<std::string>& in;
|
||||
@ -120,8 +119,6 @@ protected:
|
||||
|
||||
public:
|
||||
|
||||
void set_config_dir(const std::string& _cfg_dir);
|
||||
|
||||
Tokenizer(); // UNIMPL
|
||||
|
||||
// no throw
|
||||
@ -133,18 +130,29 @@ public:
|
||||
// required before other methods, may throw
|
||||
void init(const char *cfg_dir_path = 0);
|
||||
|
||||
void set_config_dir(const std::string& _cfg_dir);
|
||||
|
||||
// required after processing a contiguous sequence of lines when sentence splitting is on
|
||||
void reset();
|
||||
|
||||
// simultaneous sentence splitting not yet implemented
|
||||
bool splitting() const { return splits_p; }
|
||||
|
||||
// escapes chars in the set &|"'<>[] after tokenization (moses special characters)
|
||||
bool escape(std::string& inplace);
|
||||
|
||||
// used in detokenizer, converts entities into characters
|
||||
// if escape_p is set, does not unescape moses special tokens, thus
|
||||
// escape_p and unescape_p can be used together usefully
|
||||
bool unescape(std::string& inplace);
|
||||
|
||||
// streaming select-tokenizer reads from is, writes to os, preserving line breaks (unless splitting)
|
||||
std::size_t tokenize(std::istream& is, std::ostream& os);
|
||||
|
||||
// quik-tokenize padded line buffer to return string
|
||||
std::string quik_tokenize(const std::string& buf);
|
||||
|
||||
// penn-tokenize padded line buffer to return string
|
||||
// penn-tokenize padded line buffer to return string // untested
|
||||
std::string penn_tokenize(const std::string& buf);
|
||||
|
||||
// select-tokenize padded line buffer to return string
|
||||
@ -184,9 +192,10 @@ public:
|
||||
return detokenize(oss.str());
|
||||
}
|
||||
|
||||
// split a string on sentence boundaries (approximately)
|
||||
std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0);
|
||||
|
||||
// split sentences from lines of input
|
||||
// split sentences from input stream and write one per line on output stream
|
||||
std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os);
|
||||
|
||||
}; // end class Tokenizer
|
||||
|
@ -162,7 +162,7 @@ int main(int ac, char **av)
|
||||
detokenize_p = !detokenize_p;
|
||||
break;
|
||||
case 'e':
|
||||
params.escape_p = false;
|
||||
params.escape_p = !params.escape_p;
|
||||
break;
|
||||
case 'E':
|
||||
params.entities_p = true;
|
||||
|
Loading…
Reference in New Issue
Block a user