mirror of https://github.com/moses-smt/mosesdecoder.git
Some differences from the Moses tokenizer: fraction characters count as numbers, _ handling, and URLs. Currently 3x slower than the Perl version :'(. Looking to make it faster by composing regex substitutions. TODO: eliminate sprintf and fixed-size buffers.
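A minimal standalone sketch of the "composing regex substitutions" idea (not part of this file; the patterns, replacements, and the use of std::regex are illustrative assumptions, not the tokenizer's actual rules or regex engine): instead of one regex-replace pass per rule, the rules are joined into a single alternation and each match is dispatched on whichever group fired.

// Standalone sketch: compose several substitution rules into one regex pass.
#include <regex>
#include <string>

std::string composed_subst(const std::string& in) {
    // fraction | underscore | URL, matched in a single pass over the input;
    // the patterns here are hypothetical stand-ins, not the real rules.
    static const std::regex re("([0-9]+/[0-9]+)|(_)|(https?://[^ ]+)");
    std::string out;
    std::size_t last = 0;
    for (std::sregex_iterator it(in.begin(), in.end(), re), end; it != end; ++it) {
        std::size_t pos = static_cast<std::size_t>(it->position());
        out.append(in, last, pos - last);                  // copy the unmatched gap
        if ((*it)[1].matched)      out += " NUM ";         // fractions count as numbers
        else if ((*it)[2].matched) out += " _ ";           // isolate underscores
        else                       out += (*it)[3].str();  // keep URLs intact
        last = pos + static_cast<std::size_t>(it->length());
    }
    out.append(in, last, std::string::npos);               // copy the tail
    return out;
}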
#include "tokenizer.h"
|
|
#include <memory>
|
|
#include <vector>
|
|
#include <cctype>
|
|
|
|
#ifdef TOKENIZER_NAMESPACE
|
|
using namespace TOKENIZER_NAMESPACE ;
|
|
#endif
|
|
|
|
|
|
void
usage(const char *path)
{
    std::cerr << "Usage: " << path << " [-{v|w|x|y|e|a|p}]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
    std::cerr << " -v -- verbose" << std::endl;
    std::cerr << " -w -- word filter" << std::endl;
    std::cerr << " -x -- skip xml tag lines" << std::endl;
    std::cerr << " -y -- skip all xml tags" << std::endl;
    std::cerr << " -e -- escape entities" << std::endl;
    std::cerr << " -a -- aggressive hyphenization" << std::endl;
    std::cerr << " -p -- treebank-3 style" << std::endl;
    std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
    std::cerr << " -o OUT -- output file path" << std::endl;
    std::cerr << "Default is -c ., stdin, stdout." << std::endl;
    std::cerr << "LL is a two-letter language code (en, fr, or it); it affects contraction handling." << std::endl;
}
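
// Example invocation (file and directory names here are hypothetical):
//
//   tokenizer -v -x en -c /opt/tokenizer/patterns -o out.txt corpus.txt
//
// tokenizes corpus.txt with English contraction handling, skipping XML tag
// lines, reading pattern files from /opt/tokenizer/patterns, and writing
// the result to out.txt.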

// Reduce a token to a lowercased "word" (letters, digits, and internal
// hyphens or apostrophes), or to the empty string if it is not word-like.
std::string token_word(const std::string& in) {
    int pos = -1;
    int digits_prefixed = 0; // <0: leading digits seen; >0: digits before letters
    int nalpha = 0;
    int len = in.size();
    std::vector<char> cv;
    int last_quirk = -1;
    while (++pos < len) {
        char ch = in.at(pos);
        if (std::isdigit(static_cast<unsigned char>(ch))) {
            if (digits_prefixed > 0) {
                // digits reappearing after letters: not a word
                last_quirk = pos;
                break;
            }
            digits_prefixed--;
            cv.push_back(std::tolower(static_cast<unsigned char>(ch)));
        } else if (std::isalpha(static_cast<unsigned char>(ch))) {
            if (digits_prefixed < 0)
                digits_prefixed = -digits_prefixed;
            cv.push_back(std::tolower(static_cast<unsigned char>(ch)));
            nalpha++;
        } else {
            if (digits_prefixed < 0)
                digits_prefixed = -digits_prefixed;
            last_quirk = pos;
            // only internal hyphens and apostrophes are allowed
            if ((ch == '-' || ch == '\'') && pos != 0) {
                cv.push_back(ch);
            } else {
                break;
            }
        }
    }
    if (last_quirk == pos || (digits_prefixed > 0 && nalpha == 0))
        cv.clear(); // invalid word
    return std::string(cv.begin(),cv.end());
}
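
// A few concrete cases for token_word (assuming ASCII input):
//
//   token_word("Don't") == "don't"   // internal apostrophe kept, lowercased
//   token_word("1990s") == "1990s"   // digits followed by letters are fine
//   token_word("3.14")  == ""        // '.' is neither '-' nor '\'': rejected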

// Tokenize each line of ifs, keep only word-like tokens, and write them
// space-separated to ofs; returns the number of non-empty output lines.
int
copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
    int nlines = 0;
    std::string line;
    while (std::getline(ifs,line)) {
        if (line.empty()) continue;
        std::vector<std::string> tokens(tize.tokens(line));
        int count = 0;
        for (auto& token: tokens) {
            std::string word(token_word(token));
            if (word.empty()) continue;
            ofs << word << ' ';
            count++;
        }
        if (count) {
            ofs << std::endl;
            nlines++;
        }
    }
    return nlines;
}

int main(int ac, char **av)
{
    int rc = 0;
    std::string lang_iso;
    std::vector<std::string> args;
    std::string out_path;
    char *cfg_path = 0;
    bool next_cfg_p = false;
    bool next_output_p = false;
    bool verbose_p = false;
    bool detag_p = false;
    bool alltag_p = false;
    bool escape_p = true;
    bool aggro_p = false;
    bool penn_p = false;
    bool words_p = false;

    const char *prog = av[0];
    while (++av,--ac) {
        if (**av == '-') {
            switch (av[0][1]) {
            case 'h':
                usage(prog);
                exit(0);
            case 'c':
                next_cfg_p = true;
                break;
            case 'o':
                next_output_p = true;
                break;
            case 'v':
                verbose_p = true;
                break;
            case 'e':
                escape_p = false;
                break;
            case 'w':
                words_p = true;
                break;
            case 'x':
                detag_p = true;
                break;
            case 'y':
                alltag_p = true;
                break;
            case 'a':
                aggro_p = true;
                break;
            case 'l':
                // ignored
                break;
            case 'p':
                penn_p = true;
                break;
            default:
                std::cerr << "Unknown option: " << *av << std::endl;
                ::exit(1);
            }
        } else if (lang_iso.empty() && strlen(*av) == 2) {
            lang_iso = *av;
        } else if (next_output_p) {
            next_output_p = false;
            out_path = *av;
        } else if (next_cfg_p) {
            next_cfg_p = false;
            cfg_path = *av;
        } else {
            args.push_back(std::string(*av));
        }
    }

    if (!cfg_path) {
        cfg_path = getenv("TOKENIZER_SHARED_DIR");
    }
    if (cfg_path) {
        Tokenizer::set_config_dir(std::string(cfg_path));
    }

    std::unique_ptr<std::ofstream> pofs; // only allocated when -o was given
    if (!out_path.empty()) {
        pofs.reset(new std::ofstream(out_path.c_str()));
    }
    std::ostream& ofs(pofs ? *pofs : std::cout);

    Tokenizer tize(lang_iso,detag_p,alltag_p,!escape_p,aggro_p,penn_p,verbose_p);
    tize.init();
    size_t nlines = 0;

    if (words_p) {
        if (args.empty()) {
            nlines += copy_words(tize,std::cin,ofs);
        } else {
            for (std::string& arg : args) {
                try {
                    std::ifstream ifs(arg.c_str());
                    nlines += copy_words(tize,ifs,ofs);
                } catch (...) {
                    std::cerr << "Exception extracting words from path " << arg << std::endl;
                }
            }
        }
    } else if (args.empty()) {
        nlines = tize.tokenize(std::cin,ofs);
    } else {
        for (std::string& arg : args) {
            try {
                std::ifstream ifs(arg.c_str());
                nlines += tize.tokenize(ifs,ofs); // accumulate across input files
            } catch (...) {
                std::cerr << "Exception tokenizing from path " << arg << std::endl;
            }
        }
    }

    if (verbose_p)
        std::cerr << "%%% tokenized lines: " << nlines << std::endl;

    return rc;
}
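
// Hypothetical build and smoke test (file names and flags are assumptions
// about how this file sits next to tokenizer.h/tokenizer.cpp):
//
//   g++ -std=c++11 -O2 -o tokenizer tokenizer_main.cpp tokenizer.cpp
//   echo "Don't panic." | ./tokenizer en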