Merge branch 'master' of github.com:moses-smt/mosesdecoder

This commit is contained in:
Barry Haddow 2015-04-03 15:47:22 +01:00
commit 44c5ae344f
152 changed files with 2057 additions and 1515 deletions

View File

@ -5,11 +5,14 @@ namespace TOKENIZER_NAMESPACE {
#endif
Parameters::Parameters()
: cfg_path(0)
: nthreads(0)
, chunksize(2000)
, cfg_path(0)
, verbose_p(false)
, detag_p(false)
, alltag_p(false)
, escape_p(true)
, entities_p(false)
, escape_p(false)
, aggro_p(false)
, supersub_p(false)
, url_p(true)
@ -23,6 +26,10 @@ Parameters::Parameters()
, refined_p(false)
, unescape_p(false)
, drop_bad_p(false)
, split_p(false)
, notokenization_p(false)
, para_marks_p(false)
, split_breaks_p(false)
{
}

View File

@ -12,10 +12,13 @@ struct Parameters
std::string lang_iso;
std::vector<std::string> args;
std::string out_path;
int nthreads;
int chunksize;
const char *cfg_path;
bool verbose_p;
bool detag_p;
bool alltag_p;
bool entities_p;
bool escape_p;
bool aggro_p;
bool supersub_p;
@ -30,6 +33,10 @@ struct Parameters
bool refined_p;
bool unescape_p;
bool drop_bad_p;
bool split_p;
bool notokenization_p;
bool para_marks_p;
bool split_breaks_p;
Parameters();

File diff suppressed because it is too large Load Diff

View File

@ -26,12 +26,37 @@ class Tokenizer {
private:
static std::string cfg_dir;
typedef enum {
empty = 0,
blank,
upper, // upper case
letta, // extended word class (includes number, hyphen)
numba,
hyphn,
stops, // blank to stops are "extended word class" variants
quote, // init & fini = {',"}
pinit, // init (includes INVERT_*)
pfini, // fini
pfpct, // fini + pct
marks,
limit
} charclass_t;
std::size_t nthreads;
std::size_t chunksize;
std::string cfg_dir;
// non-breaking prefixes (numeric) utf8
std::set<std::string> nbpre_num_set;
// non-breaking prefixes (other) utf8
std::set<std::string> nbpre_gen_set;
// non-breaking prefixes (numeric) ucs4
std::set<std::wstring> nbpre_num_ucs4;
// non-breaking prefixes (other) ucs4
std::set<std::wstring> nbpre_gen_ucs4;
// compiled protected patterns
std::vector<re2::RE2 *> prot_pat_vec;
protected:
@ -42,6 +67,7 @@ protected:
bool latin_p; // is lang_iso "fr" or "it"
bool skip_xml_p;
bool skip_alltags_p;
bool entities_p;
bool escape_p;
bool unescape_p;
bool aggressive_hyphen_p;
@ -54,20 +80,44 @@ protected:
bool narrow_kana_p;
bool refined_p;
bool drop_bad_p;
bool splits_p;
bool verbose_p;
bool para_marks_p;
bool split_breaks_p;
// return counts of general and numeric prefixes loaded
std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
// escapes specials into entities from the set &|"'[] (after tokenization, when enabled)
bool escape(std::string& inplace);
// in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
void protected_tokenize(std::string& inplace);
public:
// used for boost::thread
struct VectorTokenizerCallable {
Tokenizer *tokenizer;
std::vector<std::string>& in;
std::vector<std::string>& out;
// cfg_dir is assumed shared by all languages
static void set_config_dir(const std::string& _cfg_dir);
VectorTokenizerCallable(Tokenizer *_tokenizer,
std::vector<std::string>& _in,
std::vector<std::string>& _out)
: tokenizer(_tokenizer)
, in(_in)
, out(_out) {
};
void operator()() {
out.resize(in.size());
for (std::size_t ii = 0; ii < in.size(); ++ii)
if (in[ii].empty())
out[ii] = in[ii];
else if (tokenizer->penn_p)
out[ii] = tokenizer->penn_tokenize(in[ii]);
else
out[ii] = tokenizer->quik_tokenize(in[ii]);
};
};
public:
Tokenizer(); // UNIMPL
@ -78,21 +128,46 @@ public:
~Tokenizer();
// required before other methods, may throw
void init();
void init(const char *cfg_dir_path = 0);
// streaming tokenizer reads from is, writes to os, preserving line breaks
void set_config_dir(const std::string& _cfg_dir);
// required after processing a contiguous sequence of lines when sentence splitting is on
void reset();
// simultaneous sentence splitting not yet implemented
bool splitting() const { return splits_p; }
// escapes chars the set &|"'<> after tokenization (moses special characters)
bool escape(std::string& inplace);
// used in detokenizer, converts entities into characters
// if escape_p is set, does not unescape moses special tokens, thus
// escape_p and unescape_p can be used together usefully
bool unescape(std::string& inplace);
// streaming select-tokenizer reads from is, writes to os, preserving line breaks (unless splitting)
std::size_t tokenize(std::istream& is, std::ostream& os);
// tokenize padded line buffer to return string
std::string tokenize(const std::string& buf);
// quik-tokenize padded line buffer to return string
std::string quik_tokenize(const std::string& buf);
// penn-tokenize padded line buffer to return string // untested
std::string penn_tokenize(const std::string& buf);
// select-tokenize padded line buffer to return string
std::string tokenize(const std::string& buf) {
return penn_p ? penn_tokenize(buf) : quik_tokenize(buf);
}
// tokenize with output argument
void tokenize(const std::string& buf, std::string& outs) {
outs = tokenize(buf);
}
// tokenize to a vector
std::vector<std::string> tokens(const std::string& in) {
std::istringstream tokss(tokenize(in));
std::istringstream tokss(penn_p ? penn_tokenize(in) : tokenize(in));
std::vector<std::string> outv;
std::copy(std::istream_iterator<std::string>(tokss),
std::istream_iterator<std::string>(),
@ -117,6 +192,12 @@ public:
return detokenize(oss.str());
}
// split a string on sentence boundaries (approximately)
std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0);
// split sentences from input stream and write one per line on output stream
std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os);
}; // end class Tokenizer
#ifdef TOKENIZER_NAMESPACE

View File

@ -16,10 +16,12 @@ usage(const char *path)
std::cerr << "Usage: " << path << "[-{v|x|p|a|e|s|u|n|N]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
std::cerr << " -a -- aggressive hyphenization" << std::endl;
std::cerr << " -b -- drop bad bytes" << std::endl;
std::cerr << " -B -- splitter will split on linebreak" << std::endl;
std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
std::cerr << " -d -- downcase" << std::endl;
std::cerr << " -D -- detokenize" << std::endl;
std::cerr << " -e -- do not escape entities during tokenization" << std::endl;
std::cerr << " -E -- preserve entities during tokenization" << std::endl;
std::cerr << " -k -- narrow kana" << std::endl;
std::cerr << " -n -- narrow latin" << std::endl;
std::cerr << " -N -- normalize" << std::endl;
@ -27,12 +29,16 @@ usage(const char *path)
std::cerr << " -p -- penn treebank style" << std::endl;
std::cerr << " -r -- refined contraction and quantity conjoining" << std::endl;
std::cerr << " -s -- super- and sub-script conjoining" << std::endl;
std::cerr << " -S -- buffer and sentence-split lines" << std::endl;
std::cerr << " -T -- do not tokenize, just split, no <P> marks" << std::endl;
std::cerr << " -t N[,C] -- use N threads (1), chunksize C lines" << std::endl;
std::cerr << " -u -- disable url handling" << std::endl;
std::cerr << " -U -- unescape entities before tokenization, after detokenization" << std::endl;
std::cerr << " -v -- verbose" << std::endl;
std::cerr << " -w -- word filter" << std::endl;
std::cerr << " -x -- skip xml tag lines" << std::endl;
std::cerr << " -y -- skip all xml tags" << std::endl;
std::cerr << " -X -- split only, with <P> marks" << std::endl;
std::cerr << "Default is -c ., stdin, stdout." << std::endl;
std::cerr << "LL in en,fr,it affect contraction. LL selects nonbreaking prefix file" << std::endl;
std::cerr << "nonbreaking_prefix.LL is sought in getenv('TOKENIZER_SHARED_DIR')." << std::endl;
@ -83,15 +89,35 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
int nlines = 0;
std::string line;
while (ifs.good() && std::getline(ifs,line)) {
if (line.empty()) continue;
if (line.empty())
continue;
std::vector<std::string> tokens(tize.tokens(line));
int count = 0;
bool was_break = false;
for (auto& token: tokens) {
std::string word(token_word(token));
if (word.empty()) continue;
ofs << word << ' ';
count++;
if (token.empty()) {
if (count || was_break) {
ofs << std::endl;
count = 0;
nlines++;
was_break = true;
continue;
}
}
was_break = false;
std::string word(token_word(token));
if (word.empty()) {
continue;
}
if (count++) {
ofs << ' ';
}
ofs << word;
}
if (count) {
ofs << std::endl;
nlines++;
@ -109,7 +135,10 @@ int main(int ac, char **av)
const char *prog = av[0];
bool next_cfg_p = false;
bool next_output_p = false;
bool next_threads_p = false;
bool detokenize_p = std::strstr(av[0],"detokenize") != 0;
if (!detokenize_p)
params.split_p = std::strstr(av[0],"splitter") != 0;
while (++av,--ac) {
if (**av == '-') {
@ -120,6 +149,9 @@ int main(int ac, char **av)
case 'b':
params.drop_bad_p = true;
break;
case 'B':
params.split_breaks_p = true;
break;
case 'c':
next_cfg_p = true;
break;
@ -127,10 +159,13 @@ int main(int ac, char **av)
params.downcase_p = true;
break;
case 'D':
detokenize_p = true;
detokenize_p = !detokenize_p;
break;
case 'e':
params.escape_p = false;
params.escape_p = !params.escape_p;
break;
case 'E':
params.entities_p = true;
break;
case 'h':
usage(prog);
@ -156,6 +191,16 @@ int main(int ac, char **av)
case 's':
params.supersub_p = true;
break;
case 'S':
params.split_p = !params.split_p;
break;
case 'T':
params.notokenization_p = true;
params.para_marks_p = false;
break;
case 't':
next_threads_p = true;
break;
case 'U':
params.unescape_p = true;
break;
@ -171,6 +216,10 @@ int main(int ac, char **av)
case 'x':
params.detag_p = true;
break;
case 'X':
params.notokenization_p = true;
params.para_marks_p = true;
break;
case 'y':
params.alltag_p = true;
break;
@ -181,7 +230,7 @@ int main(int ac, char **av)
std::cerr << "Unknown option: " << *av << std::endl;
::exit(1);
}
} else if (params.lang_iso.empty() && strlen(*av) == 2) {
} else if (params.lang_iso.empty() && strlen(*av) == 2 && !isdigit(**av)) {
params.lang_iso = *av;
} else if (next_output_p) {
next_output_p = false;
@ -189,6 +238,14 @@ int main(int ac, char **av)
} else if (next_cfg_p) {
next_cfg_p = false;
params.cfg_path = *av;
} else if (next_threads_p) {
next_threads_p = false;
char *comma = strchr(*av,',');
if (comma) {
*comma++ = 0;
params.chunksize = std::strtoul(comma,0,0);
}
params.nthreads = std::strtoul(*av,0,0);
} else {
params.args.push_back(std::string(*av));
}
@ -230,7 +287,6 @@ int main(int ac, char **av)
if (params.verbose_p) {
std::cerr << "config path: " << params.cfg_path << std::endl;
}
Tokenizer::set_config_dir(std::string(params.cfg_path));
}
std::unique_ptr<std::ofstream> pofs = 0;
@ -244,16 +300,16 @@ int main(int ac, char **av)
Tokenizer tize(params);
tize.init();
size_t nlines = 0;
std::pair<std::size_t,std::size_t> plines = { 0, 0 };
if (params.words_p) {
if (params.args.empty()) {
nlines += copy_words(tize,std::cin,ofs);
plines.first += copy_words(tize,std::cin,ofs);
} else {
for (std::string& arg : params.args) {
try {
std::ifstream ifs(arg.c_str());
nlines += copy_words(tize,ifs,ofs);
plines.first += copy_words(tize,ifs,ofs);
} catch (...) {
std::cerr << "Exception extracting words from path " << arg << std::endl;
}
@ -261,18 +317,22 @@ int main(int ac, char **av)
}
} else if (params.args.empty()) {
if (detokenize_p) {
nlines = tize.detokenize(std::cin,ofs);
plines.first = tize.detokenize(std::cin,ofs);
} else if (params.notokenization_p) {
plines = tize.splitter(std::cin,ofs);
} else {
nlines = tize.tokenize(std::cin,ofs);
plines.first = tize.tokenize(std::cin,ofs);
}
} else {
for (std::string& arg : params.args) {
try {
std::ifstream ifs(arg.c_str());
if (detokenize_p) {
nlines = tize.detokenize(ifs,ofs);
plines.first = tize.detokenize(ifs,ofs);
} else if (params.notokenization_p) {
plines = tize.splitter(ifs,ofs);
} else {
nlines = tize.tokenize(ifs,ofs);
plines.first = tize.tokenize(ifs,ofs);
}
} catch (...) {
std::cerr << "Exception tokenizing from path " << arg << std::endl;
@ -280,9 +340,12 @@ int main(int ac, char **av)
}
}
if (params.verbose_p)
std::cerr << "%%% " << nlines << " lines." << std::endl;
if (params.verbose_p) {
std::cerr << "%%% " << plines.first << " lines." << std::endl;
if (plines.second) {
std::cerr << "%%% " << plines.second << " sentences." << std::endl;
}
}
return rc;
}

View File

@ -9,6 +9,7 @@
#include <algorithm>
#include <fstream>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/filesystem.hpp>
#include "EnOpenNLPChunker.h"
#include "moses/Util.h"
@ -28,10 +29,11 @@ EnOpenNLPChunker::~EnOpenNLPChunker() {
void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector<string> &filterList)
{
const boost::filesystem::path
inPath = boost::filesystem::unique_path(),
outPath = boost::filesystem::unique_path();
// read all input to a temp file
char *ptr = tmpnam(NULL);
string inStr(ptr);
ofstream inFile(ptr);
ofstream inFile(inPath.c_str());
string line;
while (getline(in, line)) {
@ -40,21 +42,18 @@ void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector
}
inFile.close();
ptr = tmpnam(NULL);
string outStr(ptr);
// execute chunker
string cmd = "cat " + inStr + " | "
string cmd = "cat " + inPath.native() + " | "
+ m_openNLPPath + "/bin/opennlp POSTagger "
+ m_openNLPPath + "/models/en-pos-maxent.bin | "
+ m_openNLPPath + "/bin/opennlp ChunkerME "
+ m_openNLPPath + "/models/en-chunker.bin > "
+ outStr;
+ outPath.native();
//g << "Executing:" << cmd << endl;
int ret = system(cmd.c_str());
// read result of chunker and output as Moses xml trees
ifstream outFile(outStr.c_str());
ifstream outFile(outPath.c_str());
size_t lineNum = 0;
while (getline(outFile, line)) {
@ -66,8 +65,8 @@ void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector
outFile.close();
// clean up temporary files
remove(inStr.c_str());
remove(outStr.c_str());
remove(inPath.c_str());
remove(outPath.c_str());
}
void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out, const vector<string> &filterList)

View File

@ -3,7 +3,7 @@
#include <string>
#include <boost/thread/tss.hpp>
#include "Classifier.h"
#include "vw/Classifier.h"
#include "moses/TypeDef.h"
#include "moses/Util.h"
#include "moses/FF/StatelessFeatureFunction.h"

View File

@ -70,10 +70,9 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
inputPath.SetTargetPhrases(*this, tpColl, NULL);
} else {
// TRANSLITERATE
char *ptr = tmpnam(NULL);
string inFile(ptr);
ptr = tmpnam(NULL);
string outDir(ptr);
const boost::filesystem::path
inFile = boost::filesystem::unique_path(),
outDir = boost::filesystem::unique_path();
ofstream inStream(inFile.c_str());
inStream << sourcePhrase.ToString() << endl;
@ -85,14 +84,14 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
" --external-bin-dir " + m_externalDir +
" --input-extension " + m_inputLang +
" --output-extension " + m_outputLang +
" --oov-file " + inFile +
" --out-dir " + outDir;
" --oov-file " + inFile.native() +
" --out-dir " + outDir.native();
int ret = system(cmd.c_str());
UTIL_THROW_IF2(ret != 0, "Transliteration script error");
TargetPhraseCollection *tpColl = new TargetPhraseCollection();
vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir);
vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir.native());
vector<TargetPhrase*>::const_iterator iter;
for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) {
TargetPhrase *tp = *iter;

View File

@ -1,206 +0,0 @@
#include "line_splitter.hh"
bool test_vectorinsert()
{
StringPiece line1 = StringPiece("! ! ! ! ||| ! ! ! ! ||| 0.0804289 0.141656 0.0804289 0.443409 2.718 ||| 0-0 1-1 2-2 3-3 ||| 1 1 1");
StringPiece line2 = StringPiece("! ! ! ) , has ||| ! ! ! ) - , a ||| 0.0804289 0.0257627 0.0804289 0.00146736 2.718 ||| 0-0 1-1 2-2 3-3 4-4 4-5 5-6 ||| 1 1 1");
line_text output = splitLine(line1);
line_text output2 = splitLine(line2);
//Init container vector and iterator.
std::vector<char> container;
container.reserve(10000); //Reserve vector
std::vector<char>::iterator it = container.begin();
std::pair<std::vector<char>::iterator, int> binary_append_ret; //Return values from vector_append
//Put a value into the vector
binary_append_ret = vector_append(&output, &container, it, false);
it = binary_append_ret.first;
binary_append_ret = vector_append(&output2, &container, it, false);
it = binary_append_ret.first;
std::string test(container.begin(), container.end());
std::string should_be = "! ! ! ! 0.0804289 0.141656 0.0804289 0.443409 2.718 0-0 1-1 2-2 3-3 1 1 1! ! ! ) - , a 0.0804289 0.0257627 0.0804289 0.00146736 2.718 0-0 1-1 2-2 3-3 4-4 4-5 5-6 1 1 1";
if (test == should_be) {
return true;
} else {
return false;
}
}
bool probabilitiesTest()
{
  // splitProbabilities must return exactly the five scores found on
  // each probability column, in order.
  StringPiece line1 = StringPiece("0.536553 0.75961 0.634108 0.532927 2.718");
  StringPiece line2 = StringPiece("1.42081e-05 3.91895e-09 0.0738539 0.749514 2.718");

  std::vector<double> first = splitProbabilities(line1);
  std::vector<double> second = splitProbabilities(line2);

  bool firstOk = false;
  if (first.size() == 5 && first[0] == 0.536553 && first[1] == 0.75961 && first[2] == 0.634108 && first[3] == 0.532927 && first[4] == 2.718) {
    firstOk = true;
  } else {
    // Diagnostic dump on mismatch.
    std::cout << "Processed: " << first[0] << " " << first[1] << " " << first[2] << " " << first[3] << " " << first[4] << std::endl;
    std::cout << "Size is: " << first.size() << " Expected 5." << std::endl;
    std::cout << "Expected: " << "0.536553 0.75961 0.634108 0.532927 2.718" << std::endl;
  }

  bool secondOk = false;
  if (second.size() == 5 && second[0] == 1.42081e-05 && second[1] == 3.91895e-09 && second[2] == 0.0738539 && second[3] == 0.749514 && second[4] == 2.718) {
    secondOk = true;
  } else {
    std::cout << "Processed: " << second[0] << " " << second[1] << " " << second[2] << " " << second[3] << " " << second[4] << std::endl;
    std::cout << "Size is: " << second.size() << " Expected 5." << std::endl;
    std::cout << "Expected: " << "1.42081e-05 3.91895e-09 0.0738539 0.749514 2.718" << std::endl;
  }

  return firstOk && secondOk;
}
bool wordAll1test()
{
  // splitWordAll1 flattens "src-tgt" alignment pairs into a flat
  // vector of ints: one (src, tgt) pair becomes two consecutive entries.
  StringPiece line1 = StringPiece("2-0 3-1 4-2 5-2");
  StringPiece line2 = StringPiece("0-0 1-1 2-2 3-3 4-3 6-4 5-5");

  std::vector<int> first = splitWordAll1(line1);
  std::vector<int> second = splitWordAll1(line2);

  const int expected1[8] = {2, 0, 3, 1, 4, 2, 5, 2};
  const int expected2[14] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 3, 6, 4, 5, 5};

  bool firstOk = first.size() == 8;
  for (int i = 0; firstOk && i < 8; ++i)
    firstOk = first[i] == expected1[i];
  if (!firstOk) {
    std::cout << "Processed: " << first[0] << "-" << first[1] << " " << first[2] << "-" << first[3] << " "
              << first[4] << "-" << first[5] << " " << first[6] << "-" << first[7] << std::endl;
    std::cout << "Size is: " << first.size() << " Expected: 8." << std::endl;
    std::cout << "Expected: " << "2-0 3-1 4-2 5-2" << std::endl;
  }

  bool secondOk = second.size() == 14;
  for (int i = 0; secondOk && i < 14; ++i)
    secondOk = second[i] == expected2[i];
  if (!secondOk) {
    std::cout << "Processed: " << second[0] << "-" << second[1] << " " << second[2] << "-" << second[3] << " "
              << second[4] << "-" << second[5] << " " << second[6] << "-" << second[7] << " " << second[8] << "-" << second[9]
              << " " << second[10] << "-" << second[11] << " " << second[12] << "-" << second[13] << std::endl;
    std::cout << "Size is: " << second.size() << " Expected: 14" << std::endl;
    std::cout << "Expected: " << "0-0 1-1 2-2 3-3 4-3 6-4 5-5" << std::endl;
  }

  return firstOk && secondOk;
}
bool wordAll2test()
{
  // splitWordAll2 parses the three trailing counts of a phrase-table row.
  StringPiece line1 = StringPiece("4 9 1");
  StringPiece line2 = StringPiece("3255 9 1");

  std::vector<int> first = splitWordAll2(line1);
  std::vector<int> second = splitWordAll2(line2);

  const bool firstOk = first.size() == 3 && first[0] == 4 && first[1] == 9 && first[2] == 1;
  if (!firstOk) {
    // Diagnostic dump on mismatch.
    std::cout << "Processed: " << first[0] << " " << first[1] << " " << first[2] << std::endl;
    std::cout << "Size: " << first.size() << " Expected: 3" << std::endl;
    std::cout << "Expected: " << "4 9 1" << std::endl;
  }

  const bool secondOk = second.size() == 3 && second[0] == 3255 && second[1] == 9 && second[2] == 1;
  if (!secondOk) {
    std::cout << "Processed: " << second[0] << " " << second[1] << " " << second[2] << std::endl;
    std::cout << "Size: " << second.size() << " Expected: 3" << std::endl;
    std::cout << "Expected: " << "3255 9 1" << std::endl;
  }

  return firstOk && secondOk;
}
bool test_tokenization()
{
StringPiece line1 = StringPiece("! ! ! ! ||| ! ! ! ! ||| 0.0804289 0.141656 0.0804289 0.443409 2.718 ||| 0-0 1-1 2-2 3-3 ||| 1 1 1");
StringPiece line2 = StringPiece("! ! ! ) , has ||| ! ! ! ) - , a ||| 0.0804289 0.0257627 0.0804289 0.00146736 2.718 ||| 0-0 1-1 2-2 3-3 4-4 4-5 5-6 ||| 1 1 1");
StringPiece line3 = StringPiece("! ! ! ) , ||| ! ! ! ) - , ||| 0.0804289 0.075225 0.0804289 0.00310345 2.718 ||| 0-0 1-1 2-2 3-3 4-4 4-5 ||| 1 1 1");
StringPiece line4 = StringPiece("! ! ! ) ||| ! ! ! ) . ||| 0.0804289 0.177547 0.0268096 0.000872597 2.718 ||| 0-0 1-1 2-2 3-3 ||| 1 3 1");
line_text output1 = splitLine(line1);
line_text output2 = splitLine(line2);
line_text output3 = splitLine(line3);
line_text output4 = splitLine(line4);
bool test1 = output1.prob == StringPiece("0.0804289 0.141656 0.0804289 0.443409 2.718");
bool test2 = output2.word_all1 == StringPiece("0-0 1-1 2-2 3-3 4-4 4-5 5-6");
bool test3 = output2.target_phrase == StringPiece("! ! ! ) - , a");
bool test4 = output3.source_phrase == StringPiece("! ! ! ) ,");
bool test5 = output4.word_all2 == StringPiece("1 3 1");
//std::cout << test1 << " " << test2 << " " << test3 << " " << test4 << std::endl;
return (test1 && test2 && test3 && test4 && test5);
}
bool test_linesplitter()
{
StringPiece line1 = StringPiece("! &#93; 0.0738539 0.901133 0.0738539 0.65207 2.718 0-0 1-1 1 1 1");
target_text ans1;
ans1 = splitSingleTargetLine(line1);
/* For testing purposes
std::cout << ans1.target_phrase[0] << " " <<ans1.target_phrase[1] << " Size: " << ans1.target_phrase.size() << std::endl;
std::cout << ans1.word_all1[3] << " " << ans1.word_all2[2] << " " << ans1.prob[3] << std::endl; */
return (ans1.target_phrase.size() == 2 && ans1.prob.size() == 5 && ans1.word_all1.size() == 4 && ans1.word_all2.size() == 3);
}
bool test_linessplitter()
{
StringPiece line1 = StringPiece("! &#93; 0.0738539 0.901133 0.0738539 0.65207 2.718 0-0 1-1 1 1 1\n\n! ) . proto došlo 0.0738539 7.14446e-06");
StringPiece line2 = StringPiece("! &quot; ) 0.536553 0.75961 0.634108 0.532927 2.718 0-0 1-1 2-2 13 11 8\n! ) . 0.0369269 0.00049839 0.00671399 0.00372884 2.718 0-0 1-1 2-1 2-2 2 11 1\n&quot; ! ) 0.0738539 0.75961 0.00671399 0.532927 2.718 1-0 0-1 2-2 1 11 1\nse ! &quot; ) 0.0738539 0.75961 0.00671399 0.0225211 2.718 0-1 1-2 2-3 1 11 1\n\n! &quot; , a to 0.0738539 0.0894238 0.0738539 0.048");
std::vector<target_text> ans1;
std::vector<target_text> ans2;
ans1 = splitTargetLine(line1);
ans2 = splitTargetLine(line2);
bool sizes = ans1.size() == 1 && ans2.size() == 4;
bool prob = ans1[0].prob[3] == 0.65207 && ans2[1].prob[1] == 0.00049839;
bool word_alls = ans2[0].word_all2[1] == 11 && ans2[3].word_all1[5] == 3;
/* FOr testing
std::cout << ans1.size() << std::endl;
std::cout << ans2.size() << std::endl;
std::cout << ans1[0].prob[3] << std::endl;
std::cout << ans2[1].prob[1] << std::endl;
std::cout << ans2[0].word_all2[1] << std::endl;
std::cout << ans2[3].word_all1[5] << std::endl; */
return sizes && prob && word_alls;
}
int main()
{
  // Run the full line-splitter test suite and report the aggregate result.
  const bool all_passed = probabilitiesTest()
                          && wordAll1test()
                          && wordAll2test()
                          && test_tokenization()
                          && test_linesplitter()
                          && test_linessplitter()
                          && test_vectorinsert();
  if (all_passed) {
    std::cout << "All tests pass!" << std::endl;
  } else {
    std::cout << "Failure in some tests!" << std::endl;
  }
  // Exit 0 on success, 1 on failure. The original unconditionally
  // returned 1, so CI/shell callers saw a failure even when every
  // test passed. (Also fixed the "Failiure" typo in the message.)
  return all_passed ? 0 : 1;
}

View File

@ -1,46 +0,0 @@
#include <map> //Map for vocab ids
#include "hash.hh"
#include "vocabid.hh"
int main(int argc, char* argv[])
{
  // Round-trip test for the vocab-id map: build a map from a demo
  // sentence, serialize it to disk, read it back, and check that
  // lookups through the deserialized copy match the original.

  //Create a map and serialize it
  std::map<uint64_t, std::string> vocabids;
  StringPiece demotext = StringPiece("Demo text with 3 elements");
  add_to_map(&vocabids, demotext);

  //Serialize map
  serialize_map(&vocabids, "/tmp/testmap.bin");

  //Read the map and test if the values are the same
  std::map<uint64_t, std::string> newmap;
  read_map(&newmap, "/tmp/testmap.bin");

  // Ids for each word of the demo text; num6 = 0 probes a key that was
  // never inserted, so both maps must also agree on the miss case.
  uint64_t num1 = getHash(StringPiece("Demo"));
  uint64_t num2 = getVocabID("text");
  uint64_t num3 = getHash(StringPiece("with"));
  uint64_t num4 = getVocabID("3");
  uint64_t num5 = getHash(StringPiece("elements"));
  uint64_t num6 = 0;

  //Tests
  bool test1 = getStringFromID(&newmap, num1) == getStringFromID(&vocabids, num1);
  bool test2 = getStringFromID(&newmap, num2) == getStringFromID(&vocabids, num2);
  bool test3 = getStringFromID(&newmap, num3) == getStringFromID(&vocabids, num3);
  bool test4 = getStringFromID(&newmap, num4) == getStringFromID(&vocabids, num4);
  bool test5 = getStringFromID(&newmap, num5) == getStringFromID(&vocabids, num5);
  bool test6 = getStringFromID(&newmap, num6) == getStringFromID(&vocabids, num6);

  const bool all_passed = test1 && test2 && test3 && test4 && test5 && test6;
  if (all_passed) {
    std::cout << "Map was successfully written and read!" << std::endl;
  } else {
    std::cout << "Error! " << test1 << " " << test2 << " " << test3 << " " << test4 << " " << test5 << " " << test6 << std::endl;
  }
  // Exit 0 on success, 1 on failure. The original unconditionally
  // returned 1, reporting failure to callers even when the round-trip
  // succeeded.
  return all_passed ? 0 : 1;
}

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
use Getopt::Long "GetOptions";

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
use Getopt::Std;
getopts('q');

View File

@ -1,4 +1,4 @@
#! /usr/bin/perl
#!/usr/bin/env perl
use strict;
my $file = shift(@ARGV);

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
#input hindi word urdu word, delete all those entries that have number on any side
use utf8;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
use utf8;
require Encode;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use utf8;
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
use utf8;
###############################################

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
# $Id$
# Reads a source and hypothesis file and counts equal tokens. Some of these

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
# Display OOV rate of a test set against a training corpus or a phrase table.
# Ondrej Bojar

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# $Id$
#sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
#
# Author : Loic BARRAULT
# Script to convert MOSES searchgraph to DOT format

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# $Id$
#show-phrases-used: display all source and target phrases for each sentence in a corpus, and give average phrase length used

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# $Id$
#by Philipp Koehn, de-augmented by Evan Herbst

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
# Collects and prints all n-grams that appear in the given corpus both
# tokenized as well as untokenized.
# Ondrej Bojar

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
# runs Moses many times changing the values of one weight, all others fixed
# nbest lists are always produced to allow for comparison of real and
# 'projected' BLEU (BLEU estimated from n-best lists collected at a neighouring

View File

@ -185,7 +185,7 @@ lowercase
default-name: lm/lowercased
pass-unless: output-lowercaser
ignore-if: output-truecaser
only-factor-0: yes
#only-factor-0: yes
template: $output-lowercaser < IN > OUT
parallelizable: yes
truecase

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# Experiment Management System
# Documentation at http://www.statmt.org/moses/?n=FactoredTraining.EMS
@ -18,7 +18,18 @@ sub trim($)
my $host = `hostname`; chop($host);
print STDERR "STARTING UP AS PROCESS $$ ON $host AT ".`date`;
my ($CONFIG_FILE,$EXECUTE,$NO_GRAPH,$CONTINUE,$FINAL_STEP,$FINAL_OUT,$VERBOSE,$IGNORE_TIME,$DELETE_CRASHED,$DELETE_VERSION);
my ($CONFIG_FILE,
$EXECUTE,
$NO_GRAPH,
$CONTINUE,
$FINAL_STEP,
$FINAL_OUT,
$VERBOSE,
$IGNORE_TIME,
$DELETE_CRASHED,
$DELETE_VERSION
);
my $SLEEP = 2;
my $META = "$RealBin/experiment.meta";

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
use Getopt::Long "GetOptions";

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
use IPC::Open3;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
use Getopt::Long "GetOptions";

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
use FindBin qw($RealBin);

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# $Id: report-experiment-scores.perl 407 2008-11-10 14:43:31Z philipp $

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
use strict;
use File::Temp qw/ tempfile tempdir /;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# Based on Preprocessor written by Philipp Koehn

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
use Getopt::Long "GetOptions";

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# experiment.perl support script
# get filtered rule and reordering tables and place them into a configuration file

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# experiment.perl support script
# get filtered rule and reordering tables and place them into a configuration file

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
use Getopt::Long "GetOptions";

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
use Date::Parse;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
binmode( STDIN, ":utf8" );
binmode( STDOUT, ":utf8" );

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
use Getopt::Long "GetOptions";

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# $Id$
#extract-factors.pl: extract only the desired factors from a factored corpus

View File

@ -1,4 +1,4 @@
#! /usr/bin/perl -w
#!/usr/bin/env perl
# example
# ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
# Converts AT&T FSA format to 'python lattice format'.
# Note that the input FSA needs to be epsilon-free and topologically sorted.
# This script checks for topological sortedness.

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
use utf8;

View File

@ -1,4 +1,4 @@
#! /usr/bin/perl
#!/usr/bin/env perl
# example
# ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# $Id$
#lopar2pos: extract POSs from LOPAR output

View File

@ -1,4 +1,4 @@
#! /usr/bin/perl
#!/usr/bin/env perl
# $Id$
#######################

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
use utf8;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
use warnings;
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# $Id$
use strict;

View File

@ -1,4 +1,5 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
package ph_numbers;
# Script to recognize and replace numbers in Moses training corpora

View File

@ -1,4 +1,4 @@
#! /usr/bin/perl
#!/usr/bin/env perl
# $Id$
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#! /usr/bin/perl -w
#!/usr/bin/env perl
# example
# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# Compatible with sri LM-creating script, eg.
# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# Compatible with sri LM-creating script, eg.
# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
use strict;
use File::Basename;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
# $Id$
# convert a phrase-table with alignment in Moses' dead-end format

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
use strict;
use Getopt::Long "GetOptions";

View File

@ -1,4 +1,5 @@
#!/usr/bin/perl
#!/usr/bin/env perl
# Uses Google AJAX API to collect many translations, i.e. create a parallel
# corpus of Google translations.
# Expects one sentence per line, not tokenized!

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
#retain lines in clean.lines-retained.1
use strict;

View File

@ -1,4 +1,5 @@
#!/usr/bin/perl
#!/usr/bin/env perl
# Script implemented by Pranava Swaroop Madhyastha (a student at Charles
# University, UFAL)

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
use Getopt::Long "GetOptions";

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# $Id$
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# $Id$
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
my ($results, $truth) = @ARGV;

View File

@ -1,4 +1,4 @@
#! /usr/bin/perl -w
#!/usr/bin/env perl
use strict;
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }

View File

@ -1,4 +1,4 @@
#! /usr/bin/perl
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#! /usr/bin/perl
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
use strict;
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
# $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $
# Sample De-Tokenizer

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;

Some files were not shown because too many files have changed in this diff Show More