mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 22:14:57 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
44c5ae344f
@ -5,11 +5,14 @@ namespace TOKENIZER_NAMESPACE {
|
||||
#endif
|
||||
|
||||
Parameters::Parameters()
|
||||
: cfg_path(0)
|
||||
: nthreads(0)
|
||||
, chunksize(2000)
|
||||
, cfg_path(0)
|
||||
, verbose_p(false)
|
||||
, detag_p(false)
|
||||
, alltag_p(false)
|
||||
, escape_p(true)
|
||||
, entities_p(false)
|
||||
, escape_p(false)
|
||||
, aggro_p(false)
|
||||
, supersub_p(false)
|
||||
, url_p(true)
|
||||
@ -23,6 +26,10 @@ Parameters::Parameters()
|
||||
, refined_p(false)
|
||||
, unescape_p(false)
|
||||
, drop_bad_p(false)
|
||||
, split_p(false)
|
||||
, notokenization_p(false)
|
||||
, para_marks_p(false)
|
||||
, split_breaks_p(false)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -12,10 +12,13 @@ struct Parameters
|
||||
std::string lang_iso;
|
||||
std::vector<std::string> args;
|
||||
std::string out_path;
|
||||
int nthreads;
|
||||
int chunksize;
|
||||
const char *cfg_path;
|
||||
bool verbose_p;
|
||||
bool detag_p;
|
||||
bool alltag_p;
|
||||
bool entities_p;
|
||||
bool escape_p;
|
||||
bool aggro_p;
|
||||
bool supersub_p;
|
||||
@ -30,6 +33,10 @@ struct Parameters
|
||||
bool refined_p;
|
||||
bool unescape_p;
|
||||
bool drop_bad_p;
|
||||
bool split_p;
|
||||
bool notokenization_p;
|
||||
bool para_marks_p;
|
||||
bool split_breaks_p;
|
||||
|
||||
Parameters();
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -26,12 +26,37 @@ class Tokenizer {
|
||||
|
||||
private:
|
||||
|
||||
static std::string cfg_dir;
|
||||
typedef enum {
|
||||
empty = 0,
|
||||
blank,
|
||||
upper, // upper case
|
||||
letta, // extended word class (includes number, hyphen)
|
||||
numba,
|
||||
hyphn,
|
||||
stops, // blank to stops are "extended word class" variants
|
||||
quote, // init & fini = {',"}
|
||||
pinit, // init (includes INVERT_*)
|
||||
pfini, // fini
|
||||
pfpct, // fini + pct
|
||||
marks,
|
||||
limit
|
||||
} charclass_t;
|
||||
|
||||
std::size_t nthreads;
|
||||
std::size_t chunksize;
|
||||
std::string cfg_dir;
|
||||
|
||||
// non-breaking prefixes (numeric) utf8
|
||||
std::set<std::string> nbpre_num_set;
|
||||
// non-breaking prefixes (other) utf8
|
||||
std::set<std::string> nbpre_gen_set;
|
||||
|
||||
// non-breaking prefixes (numeric) ucs4
|
||||
std::set<std::wstring> nbpre_num_ucs4;
|
||||
// non-breaking prefixes (other) ucs4
|
||||
std::set<std::wstring> nbpre_gen_ucs4;
|
||||
|
||||
// compiled protected patterns
|
||||
std::vector<re2::RE2 *> prot_pat_vec;
|
||||
|
||||
protected:
|
||||
@ -42,6 +67,7 @@ protected:
|
||||
bool latin_p; // is lang_iso "fr" or "it"
|
||||
bool skip_xml_p;
|
||||
bool skip_alltags_p;
|
||||
bool entities_p;
|
||||
bool escape_p;
|
||||
bool unescape_p;
|
||||
bool aggressive_hyphen_p;
|
||||
@ -54,20 +80,44 @@ protected:
|
||||
bool narrow_kana_p;
|
||||
bool refined_p;
|
||||
bool drop_bad_p;
|
||||
bool splits_p;
|
||||
bool verbose_p;
|
||||
bool para_marks_p;
|
||||
bool split_breaks_p;
|
||||
|
||||
// return counts of general and numeric prefixes loaded
|
||||
std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
|
||||
|
||||
// escapes specials into entities from the set &|"'[] (after tokenization, when enabled)
|
||||
bool escape(std::string& inplace);
|
||||
|
||||
// in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
|
||||
void protected_tokenize(std::string& inplace);
|
||||
|
||||
public:
|
||||
// used for boost::thread
|
||||
struct VectorTokenizerCallable {
|
||||
Tokenizer *tokenizer;
|
||||
std::vector<std::string>& in;
|
||||
std::vector<std::string>& out;
|
||||
|
||||
VectorTokenizerCallable(Tokenizer *_tokenizer,
|
||||
std::vector<std::string>& _in,
|
||||
std::vector<std::string>& _out)
|
||||
: tokenizer(_tokenizer)
|
||||
, in(_in)
|
||||
, out(_out) {
|
||||
};
|
||||
|
||||
// cfg_dir is assumed shared by all languages
|
||||
static void set_config_dir(const std::string& _cfg_dir);
|
||||
void operator()() {
|
||||
out.resize(in.size());
|
||||
for (std::size_t ii = 0; ii < in.size(); ++ii)
|
||||
if (in[ii].empty())
|
||||
out[ii] = in[ii];
|
||||
else if (tokenizer->penn_p)
|
||||
out[ii] = tokenizer->penn_tokenize(in[ii]);
|
||||
else
|
||||
out[ii] = tokenizer->quik_tokenize(in[ii]);
|
||||
};
|
||||
};
|
||||
|
||||
public:
|
||||
|
||||
Tokenizer(); // UNIMPL
|
||||
|
||||
@ -78,21 +128,46 @@ public:
|
||||
~Tokenizer();
|
||||
|
||||
// required before other methods, may throw
|
||||
void init();
|
||||
void init(const char *cfg_dir_path = 0);
|
||||
|
||||
// streaming tokenizer reads from is, writes to os, preserving line breaks
|
||||
void set_config_dir(const std::string& _cfg_dir);
|
||||
|
||||
// required after processing a contiguous sequence of lines when sentence splitting is on
|
||||
void reset();
|
||||
|
||||
// simultaneous sentence splitting not yet implemented
|
||||
bool splitting() const { return splits_p; }
|
||||
|
||||
// escapes chars the set &|"'<> after tokenization (moses special characters)
|
||||
bool escape(std::string& inplace);
|
||||
|
||||
// used in detokenizer, converts entities into characters
|
||||
// if escape_p is set, does not unescape moses special tokens, thus
|
||||
// escape_p and unescape_p can be used together usefully
|
||||
bool unescape(std::string& inplace);
|
||||
|
||||
// streaming select-tokenizer reads from is, writes to os, preserving line breaks (unless splitting)
|
||||
std::size_t tokenize(std::istream& is, std::ostream& os);
|
||||
|
||||
// tokenize padded line buffer to return string
|
||||
std::string tokenize(const std::string& buf);
|
||||
// quik-tokenize padded line buffer to return string
|
||||
std::string quik_tokenize(const std::string& buf);
|
||||
|
||||
// penn-tokenize padded line buffer to return string // untested
|
||||
std::string penn_tokenize(const std::string& buf);
|
||||
|
||||
// select-tokenize padded line buffer to return string
|
||||
std::string tokenize(const std::string& buf) {
|
||||
return penn_p ? penn_tokenize(buf) : quik_tokenize(buf);
|
||||
}
|
||||
|
||||
// tokenize with output argument
|
||||
void tokenize(const std::string& buf, std::string& outs) {
|
||||
outs = tokenize(buf);
|
||||
}
|
||||
|
||||
// tokenize to a vector
|
||||
std::vector<std::string> tokens(const std::string& in) {
|
||||
std::istringstream tokss(tokenize(in));
|
||||
std::istringstream tokss(penn_p ? penn_tokenize(in) : tokenize(in));
|
||||
std::vector<std::string> outv;
|
||||
std::copy(std::istream_iterator<std::string>(tokss),
|
||||
std::istream_iterator<std::string>(),
|
||||
@ -117,6 +192,12 @@ public:
|
||||
return detokenize(oss.str());
|
||||
}
|
||||
|
||||
// split a string on sentence boundaries (approximately)
|
||||
std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0);
|
||||
|
||||
// split sentences from input stream and write one per line on output stream
|
||||
std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os);
|
||||
|
||||
}; // end class Tokenizer
|
||||
|
||||
#ifdef TOKENIZER_NAMESPACE
|
||||
|
@ -16,10 +16,12 @@ usage(const char *path)
|
||||
std::cerr << "Usage: " << path << "[-{v|x|p|a|e|s|u|n|N]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
|
||||
std::cerr << " -a -- aggressive hyphenization" << std::endl;
|
||||
std::cerr << " -b -- drop bad bytes" << std::endl;
|
||||
std::cerr << " -B -- splitter will split on linebreak" << std::endl;
|
||||
std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
|
||||
std::cerr << " -d -- downcase" << std::endl;
|
||||
std::cerr << " -D -- detokenize" << std::endl;
|
||||
std::cerr << " -e -- do not escape entities during tokenization" << std::endl;
|
||||
std::cerr << " -E -- preserve entities during tokenization" << std::endl;
|
||||
std::cerr << " -k -- narrow kana" << std::endl;
|
||||
std::cerr << " -n -- narrow latin" << std::endl;
|
||||
std::cerr << " -N -- normalize" << std::endl;
|
||||
@ -27,12 +29,16 @@ usage(const char *path)
|
||||
std::cerr << " -p -- penn treebank style" << std::endl;
|
||||
std::cerr << " -r -- refined contraction and quantity conjoining" << std::endl;
|
||||
std::cerr << " -s -- super- and sub-script conjoining" << std::endl;
|
||||
std::cerr << " -S -- buffer and sentence-split lines" << std::endl;
|
||||
std::cerr << " -T -- do not tokenize, just split, no <P> marks" << std::endl;
|
||||
std::cerr << " -t N[,C] -- use N threads (1), chunksize C lines" << std::endl;
|
||||
std::cerr << " -u -- disable url handling" << std::endl;
|
||||
std::cerr << " -U -- unescape entities before tokenization, after detokenization" << std::endl;
|
||||
std::cerr << " -v -- verbose" << std::endl;
|
||||
std::cerr << " -w -- word filter" << std::endl;
|
||||
std::cerr << " -x -- skip xml tag lines" << std::endl;
|
||||
std::cerr << " -y -- skip all xml tags" << std::endl;
|
||||
std::cerr << " -X -- split only, with <P> marks" << std::endl;
|
||||
std::cerr << "Default is -c ., stdin, stdout." << std::endl;
|
||||
std::cerr << "LL in en,fr,it affect contraction. LL selects nonbreaking prefix file" << std::endl;
|
||||
std::cerr << "nonbreaking_prefix.LL is sought in getenv('TOKENIZER_SHARED_DIR')." << std::endl;
|
||||
@ -83,15 +89,35 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
|
||||
int nlines = 0;
|
||||
std::string line;
|
||||
while (ifs.good() && std::getline(ifs,line)) {
|
||||
if (line.empty()) continue;
|
||||
if (line.empty())
|
||||
continue;
|
||||
std::vector<std::string> tokens(tize.tokens(line));
|
||||
int count = 0;
|
||||
bool was_break = false;
|
||||
|
||||
for (auto& token: tokens) {
|
||||
if (token.empty()) {
|
||||
if (count || was_break) {
|
||||
ofs << std::endl;
|
||||
count = 0;
|
||||
nlines++;
|
||||
was_break = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
was_break = false;
|
||||
|
||||
std::string word(token_word(token));
|
||||
if (word.empty()) continue;
|
||||
ofs << word << ' ';
|
||||
count++;
|
||||
if (word.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (count++) {
|
||||
ofs << ' ';
|
||||
}
|
||||
ofs << word;
|
||||
}
|
||||
|
||||
if (count) {
|
||||
ofs << std::endl;
|
||||
nlines++;
|
||||
@ -104,13 +130,16 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
|
||||
int main(int ac, char **av)
|
||||
{
|
||||
int rc = 0;
|
||||
Parameters params;
|
||||
Parameters params;
|
||||
|
||||
const char *prog = av[0];
|
||||
bool next_cfg_p = false;
|
||||
bool next_output_p = false;
|
||||
bool next_threads_p = false;
|
||||
bool detokenize_p = std::strstr(av[0],"detokenize") != 0;
|
||||
|
||||
if (!detokenize_p)
|
||||
params.split_p = std::strstr(av[0],"splitter") != 0;
|
||||
|
||||
while (++av,--ac) {
|
||||
if (**av == '-') {
|
||||
switch (av[0][1]) {
|
||||
@ -120,6 +149,9 @@ int main(int ac, char **av)
|
||||
case 'b':
|
||||
params.drop_bad_p = true;
|
||||
break;
|
||||
case 'B':
|
||||
params.split_breaks_p = true;
|
||||
break;
|
||||
case 'c':
|
||||
next_cfg_p = true;
|
||||
break;
|
||||
@ -127,10 +159,13 @@ int main(int ac, char **av)
|
||||
params.downcase_p = true;
|
||||
break;
|
||||
case 'D':
|
||||
detokenize_p = true;
|
||||
detokenize_p = !detokenize_p;
|
||||
break;
|
||||
case 'e':
|
||||
params.escape_p = false;
|
||||
params.escape_p = !params.escape_p;
|
||||
break;
|
||||
case 'E':
|
||||
params.entities_p = true;
|
||||
break;
|
||||
case 'h':
|
||||
usage(prog);
|
||||
@ -156,6 +191,16 @@ int main(int ac, char **av)
|
||||
case 's':
|
||||
params.supersub_p = true;
|
||||
break;
|
||||
case 'S':
|
||||
params.split_p = !params.split_p;
|
||||
break;
|
||||
case 'T':
|
||||
params.notokenization_p = true;
|
||||
params.para_marks_p = false;
|
||||
break;
|
||||
case 't':
|
||||
next_threads_p = true;
|
||||
break;
|
||||
case 'U':
|
||||
params.unescape_p = true;
|
||||
break;
|
||||
@ -171,6 +216,10 @@ int main(int ac, char **av)
|
||||
case 'x':
|
||||
params.detag_p = true;
|
||||
break;
|
||||
case 'X':
|
||||
params.notokenization_p = true;
|
||||
params.para_marks_p = true;
|
||||
break;
|
||||
case 'y':
|
||||
params.alltag_p = true;
|
||||
break;
|
||||
@ -181,7 +230,7 @@ int main(int ac, char **av)
|
||||
std::cerr << "Unknown option: " << *av << std::endl;
|
||||
::exit(1);
|
||||
}
|
||||
} else if (params.lang_iso.empty() && strlen(*av) == 2) {
|
||||
} else if (params.lang_iso.empty() && strlen(*av) == 2 && !isdigit(**av)) {
|
||||
params.lang_iso = *av;
|
||||
} else if (next_output_p) {
|
||||
next_output_p = false;
|
||||
@ -189,6 +238,14 @@ int main(int ac, char **av)
|
||||
} else if (next_cfg_p) {
|
||||
next_cfg_p = false;
|
||||
params.cfg_path = *av;
|
||||
} else if (next_threads_p) {
|
||||
next_threads_p = false;
|
||||
char *comma = strchr(*av,',');
|
||||
if (comma) {
|
||||
*comma++ = 0;
|
||||
params.chunksize = std::strtoul(comma,0,0);
|
||||
}
|
||||
params.nthreads = std::strtoul(*av,0,0);
|
||||
} else {
|
||||
params.args.push_back(std::string(*av));
|
||||
}
|
||||
@ -230,7 +287,6 @@ int main(int ac, char **av)
|
||||
if (params.verbose_p) {
|
||||
std::cerr << "config path: " << params.cfg_path << std::endl;
|
||||
}
|
||||
Tokenizer::set_config_dir(std::string(params.cfg_path));
|
||||
}
|
||||
|
||||
std::unique_ptr<std::ofstream> pofs = 0;
|
||||
@ -244,16 +300,16 @@ int main(int ac, char **av)
|
||||
|
||||
Tokenizer tize(params);
|
||||
tize.init();
|
||||
size_t nlines = 0;
|
||||
std::pair<std::size_t,std::size_t> plines = { 0, 0 };
|
||||
|
||||
if (params.words_p) {
|
||||
if (params.args.empty()) {
|
||||
nlines += copy_words(tize,std::cin,ofs);
|
||||
plines.first += copy_words(tize,std::cin,ofs);
|
||||
} else {
|
||||
for (std::string& arg : params.args) {
|
||||
try {
|
||||
std::ifstream ifs(arg.c_str());
|
||||
nlines += copy_words(tize,ifs,ofs);
|
||||
plines.first += copy_words(tize,ifs,ofs);
|
||||
} catch (...) {
|
||||
std::cerr << "Exception extracting words from path " << arg << std::endl;
|
||||
}
|
||||
@ -261,18 +317,22 @@ int main(int ac, char **av)
|
||||
}
|
||||
} else if (params.args.empty()) {
|
||||
if (detokenize_p) {
|
||||
nlines = tize.detokenize(std::cin,ofs);
|
||||
plines.first = tize.detokenize(std::cin,ofs);
|
||||
} else if (params.notokenization_p) {
|
||||
plines = tize.splitter(std::cin,ofs);
|
||||
} else {
|
||||
nlines = tize.tokenize(std::cin,ofs);
|
||||
plines.first = tize.tokenize(std::cin,ofs);
|
||||
}
|
||||
} else {
|
||||
for (std::string& arg : params.args) {
|
||||
try {
|
||||
std::ifstream ifs(arg.c_str());
|
||||
if (detokenize_p) {
|
||||
nlines = tize.detokenize(ifs,ofs);
|
||||
plines.first = tize.detokenize(ifs,ofs);
|
||||
} else if (params.notokenization_p) {
|
||||
plines = tize.splitter(ifs,ofs);
|
||||
} else {
|
||||
nlines = tize.tokenize(ifs,ofs);
|
||||
plines.first = tize.tokenize(ifs,ofs);
|
||||
}
|
||||
} catch (...) {
|
||||
std::cerr << "Exception tokenizing from path " << arg << std::endl;
|
||||
@ -280,9 +340,12 @@ int main(int ac, char **av)
|
||||
}
|
||||
}
|
||||
|
||||
if (params.verbose_p)
|
||||
std::cerr << "%%% " << nlines << " lines." << std::endl;
|
||||
|
||||
if (params.verbose_p) {
|
||||
std::cerr << "%%% " << plines.first << " lines." << std::endl;
|
||||
if (plines.second) {
|
||||
std::cerr << "%%% " << plines.second << " sentences." << std::endl;
|
||||
}
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include <boost/filesystem.hpp>
|
||||
#include "EnOpenNLPChunker.h"
|
||||
#include "moses/Util.h"
|
||||
|
||||
@ -28,10 +29,11 @@ EnOpenNLPChunker::~EnOpenNLPChunker() {
|
||||
|
||||
void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector<string> &filterList)
|
||||
{
|
||||
const boost::filesystem::path
|
||||
inPath = boost::filesystem::unique_path(),
|
||||
outPath = boost::filesystem::unique_path();
|
||||
// read all input to a temp file
|
||||
char *ptr = tmpnam(NULL);
|
||||
string inStr(ptr);
|
||||
ofstream inFile(ptr);
|
||||
ofstream inFile(inPath.c_str());
|
||||
|
||||
string line;
|
||||
while (getline(in, line)) {
|
||||
@ -40,21 +42,18 @@ void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector
|
||||
}
|
||||
inFile.close();
|
||||
|
||||
ptr = tmpnam(NULL);
|
||||
string outStr(ptr);
|
||||
|
||||
// execute chunker
|
||||
string cmd = "cat " + inStr + " | "
|
||||
string cmd = "cat " + inPath.native() + " | "
|
||||
+ m_openNLPPath + "/bin/opennlp POSTagger "
|
||||
+ m_openNLPPath + "/models/en-pos-maxent.bin | "
|
||||
+ m_openNLPPath + "/bin/opennlp ChunkerME "
|
||||
+ m_openNLPPath + "/models/en-chunker.bin > "
|
||||
+ outStr;
|
||||
+ outPath.native();
|
||||
//g << "Executing:" << cmd << endl;
|
||||
int ret = system(cmd.c_str());
|
||||
|
||||
// read result of chunker and output as Moses xml trees
|
||||
ifstream outFile(outStr.c_str());
|
||||
ifstream outFile(outPath.c_str());
|
||||
|
||||
size_t lineNum = 0;
|
||||
while (getline(outFile, line)) {
|
||||
@ -66,8 +65,8 @@ void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector
|
||||
outFile.close();
|
||||
|
||||
// clean up temporary files
|
||||
remove(inStr.c_str());
|
||||
remove(outStr.c_str());
|
||||
remove(inPath.c_str());
|
||||
remove(outPath.c_str());
|
||||
}
|
||||
|
||||
void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out, const vector<string> &filterList)
|
||||
|
@ -3,7 +3,7 @@
|
||||
#include <string>
|
||||
#include <boost/thread/tss.hpp>
|
||||
|
||||
#include "Classifier.h"
|
||||
#include "vw/Classifier.h"
|
||||
#include "moses/TypeDef.h"
|
||||
#include "moses/Util.h"
|
||||
#include "moses/FF/StatelessFeatureFunction.h"
|
||||
|
@ -70,10 +70,9 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
|
||||
inputPath.SetTargetPhrases(*this, tpColl, NULL);
|
||||
} else {
|
||||
// TRANSLITERATE
|
||||
char *ptr = tmpnam(NULL);
|
||||
string inFile(ptr);
|
||||
ptr = tmpnam(NULL);
|
||||
string outDir(ptr);
|
||||
const boost::filesystem::path
|
||||
inFile = boost::filesystem::unique_path(),
|
||||
outDir = boost::filesystem::unique_path();
|
||||
|
||||
ofstream inStream(inFile.c_str());
|
||||
inStream << sourcePhrase.ToString() << endl;
|
||||
@ -85,14 +84,14 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input
|
||||
" --external-bin-dir " + m_externalDir +
|
||||
" --input-extension " + m_inputLang +
|
||||
" --output-extension " + m_outputLang +
|
||||
" --oov-file " + inFile +
|
||||
" --out-dir " + outDir;
|
||||
" --oov-file " + inFile.native() +
|
||||
" --out-dir " + outDir.native();
|
||||
|
||||
int ret = system(cmd.c_str());
|
||||
UTIL_THROW_IF2(ret != 0, "Transliteration script error");
|
||||
|
||||
TargetPhraseCollection *tpColl = new TargetPhraseCollection();
|
||||
vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir);
|
||||
vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir.native());
|
||||
vector<TargetPhrase*>::const_iterator iter;
|
||||
for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) {
|
||||
TargetPhrase *tp = *iter;
|
||||
|
@ -1,206 +0,0 @@
|
||||
#include "line_splitter.hh"
|
||||
|
||||
bool test_vectorinsert()
|
||||
{
|
||||
StringPiece line1 = StringPiece("! ! ! ! ||| ! ! ! ! ||| 0.0804289 0.141656 0.0804289 0.443409 2.718 ||| 0-0 1-1 2-2 3-3 ||| 1 1 1");
|
||||
StringPiece line2 = StringPiece("! ! ! ) , has ||| ! ! ! ) - , a ||| 0.0804289 0.0257627 0.0804289 0.00146736 2.718 ||| 0-0 1-1 2-2 3-3 4-4 4-5 5-6 ||| 1 1 1");
|
||||
line_text output = splitLine(line1);
|
||||
line_text output2 = splitLine(line2);
|
||||
|
||||
//Init container vector and iterator.
|
||||
std::vector<char> container;
|
||||
container.reserve(10000); //Reserve vector
|
||||
std::vector<char>::iterator it = container.begin();
|
||||
std::pair<std::vector<char>::iterator, int> binary_append_ret; //Return values from vector_append
|
||||
|
||||
//Put a value into the vector
|
||||
binary_append_ret = vector_append(&output, &container, it, false);
|
||||
it = binary_append_ret.first;
|
||||
binary_append_ret = vector_append(&output2, &container, it, false);
|
||||
it = binary_append_ret.first;
|
||||
|
||||
std::string test(container.begin(), container.end());
|
||||
std::string should_be = "! ! ! ! 0.0804289 0.141656 0.0804289 0.443409 2.718 0-0 1-1 2-2 3-3 1 1 1! ! ! ) - , a 0.0804289 0.0257627 0.0804289 0.00146736 2.718 0-0 1-1 2-2 3-3 4-4 4-5 5-6 1 1 1";
|
||||
if (test == should_be) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool probabilitiesTest()
|
||||
{
|
||||
StringPiece line1 = StringPiece("0.536553 0.75961 0.634108 0.532927 2.718");
|
||||
StringPiece line2 = StringPiece("1.42081e-05 3.91895e-09 0.0738539 0.749514 2.718");
|
||||
|
||||
std::vector<double> pesho;
|
||||
bool peshobool = false;
|
||||
bool kirobool = false;
|
||||
std::vector<double> kiro;
|
||||
|
||||
pesho = splitProbabilities(line1);
|
||||
kiro = splitProbabilities(line2);
|
||||
|
||||
if (pesho[0] == 0.536553 && pesho[1] == 0.75961 && pesho[2] == 0.634108 && pesho[3] == 0.532927 && pesho[4] == 2.718 && pesho.size() == 5) {
|
||||
peshobool = true;
|
||||
} else {
|
||||
std::cout << "Processed: " << pesho[0] << " " << pesho[1] << " " << pesho[2] << " " << pesho[3] << " " << pesho[4] << std::endl;
|
||||
std::cout << "Size is: " << pesho.size() << " Expected 5." << std::endl;
|
||||
std::cout << "Expected: " << "0.536553 0.75961 0.634108 0.532927 2.718" << std::endl;
|
||||
}
|
||||
|
||||
if (kiro[0] == 1.42081e-05 && kiro[1] == 3.91895e-09 && kiro[2] == 0.0738539 && kiro[3] == 0.749514 && kiro[4] == 2.718 && kiro.size() == 5) {
|
||||
kirobool = true;
|
||||
} else {
|
||||
std::cout << "Processed: " << kiro[0] << " " << kiro[1] << " " << kiro[2] << " " << kiro[3] << " " << kiro[4] << std::endl;
|
||||
std::cout << "Size is: " << kiro.size() << " Expected 5." << std::endl;
|
||||
std::cout << "Expected: " << "1.42081e-05 3.91895e-09 0.0738539 0.749514 2.718" << std::endl;
|
||||
}
|
||||
|
||||
return (peshobool && kirobool);
|
||||
}
|
||||
|
||||
bool wordAll1test()
|
||||
{
|
||||
StringPiece line1 = StringPiece("2-0 3-1 4-2 5-2");
|
||||
StringPiece line2 = StringPiece("0-0 1-1 2-2 3-3 4-3 6-4 5-5");
|
||||
|
||||
std::vector<int> pesho;
|
||||
bool peshobool = false;
|
||||
bool kirobool = false;
|
||||
std::vector<int> kiro;
|
||||
|
||||
pesho = splitWordAll1(line1);
|
||||
kiro = splitWordAll1(line2);
|
||||
|
||||
if (pesho[0] == 2 && pesho[1] == 0 && pesho[2] == 3 && pesho[3] == 1 && pesho[4] == 4
|
||||
&& pesho[5] == 2 && pesho[6] == 5 && pesho[7] == 2 && pesho.size() == 8) {
|
||||
peshobool = true;
|
||||
} else {
|
||||
std::cout << "Processed: " << pesho[0] << "-" << pesho[1] << " " << pesho[2] << "-" << pesho[3] << " "
|
||||
<< pesho[4] << "-" << pesho[5] << " " << pesho[6] << "-" << pesho[7] << std::endl;
|
||||
std::cout << "Size is: " << pesho.size() << " Expected: 8." << std::endl;
|
||||
std::cout << "Expected: " << "2-0 3-1 4-2 5-2" << std::endl;
|
||||
}
|
||||
|
||||
if (kiro[0] == 0 && kiro[1] == 0 && kiro[2] == 1 && kiro[3] == 1 && kiro[4] == 2 && kiro[5] == 2
|
||||
&& kiro[6] == 3 && kiro[7] == 3 && kiro[8] == 4 && kiro[9] == 3 && kiro[10] == 6 && kiro[11] == 4
|
||||
&& kiro[12] == 5 && kiro[13] == 5 && kiro.size() == 14) {
|
||||
kirobool = true;
|
||||
} else {
|
||||
std::cout << "Processed: " << kiro[0] << "-" << kiro[1] << " " << kiro[2] << "-" << kiro[3] << " "
|
||||
<< kiro[4] << "-" << kiro[5] << " " << kiro[6] << "-" << kiro[7] << " " << kiro[8] << "-" << kiro[9]
|
||||
<< " " << kiro[10] << "-" << kiro[11] << " " << kiro[12] << "-" << kiro[13] << std::endl;
|
||||
std::cout << "Size is: " << kiro.size() << " Expected: 14" << std::endl;
|
||||
std::cout << "Expected: " << "0-0 1-1 2-2 3-3 4-3 6-4 5-5" << std::endl;
|
||||
}
|
||||
|
||||
return (peshobool && kirobool);
|
||||
}
|
||||
|
||||
bool wordAll2test()
|
||||
{
|
||||
StringPiece line1 = StringPiece("4 9 1");
|
||||
StringPiece line2 = StringPiece("3255 9 1");
|
||||
|
||||
std::vector<int> pesho;
|
||||
bool peshobool = false;
|
||||
bool kirobool = false;
|
||||
std::vector<int> kiro;
|
||||
|
||||
pesho = splitWordAll2(line1);
|
||||
kiro = splitWordAll2(line2);
|
||||
|
||||
if (pesho[0] == 4 && pesho[1] == 9 && pesho[2] == 1 && pesho.size() == 3) {
|
||||
peshobool = true;
|
||||
} else {
|
||||
std::cout << "Processed: " << pesho[0] << " " << pesho[1] << " " << pesho[2] << std::endl;
|
||||
std::cout << "Size: " << pesho.size() << " Expected: 3" << std::endl;
|
||||
std::cout << "Expected: " << "4 9 1" << std::endl;
|
||||
}
|
||||
|
||||
if (kiro[0] == 3255 && kiro[1] == 9 && kiro[2] == 1 && kiro.size() == 3) {
|
||||
kirobool = true;
|
||||
} else {
|
||||
std::cout << "Processed: " << kiro[0] << " " << kiro[1] << " " << kiro[2] << std::endl;
|
||||
std::cout << "Size: " << kiro.size() << " Expected: 3" << std::endl;
|
||||
std::cout << "Expected: " << "3255 9 1" << std::endl;
|
||||
}
|
||||
|
||||
return (peshobool && kirobool);
|
||||
|
||||
}
|
||||
|
||||
bool test_tokenization()
|
||||
{
|
||||
StringPiece line1 = StringPiece("! ! ! ! ||| ! ! ! ! ||| 0.0804289 0.141656 0.0804289 0.443409 2.718 ||| 0-0 1-1 2-2 3-3 ||| 1 1 1");
|
||||
StringPiece line2 = StringPiece("! ! ! ) , has ||| ! ! ! ) - , a ||| 0.0804289 0.0257627 0.0804289 0.00146736 2.718 ||| 0-0 1-1 2-2 3-3 4-4 4-5 5-6 ||| 1 1 1");
|
||||
StringPiece line3 = StringPiece("! ! ! ) , ||| ! ! ! ) - , ||| 0.0804289 0.075225 0.0804289 0.00310345 2.718 ||| 0-0 1-1 2-2 3-3 4-4 4-5 ||| 1 1 1");
|
||||
StringPiece line4 = StringPiece("! ! ! ) ||| ! ! ! ) . ||| 0.0804289 0.177547 0.0268096 0.000872597 2.718 ||| 0-0 1-1 2-2 3-3 ||| 1 3 1");
|
||||
|
||||
line_text output1 = splitLine(line1);
|
||||
line_text output2 = splitLine(line2);
|
||||
line_text output3 = splitLine(line3);
|
||||
line_text output4 = splitLine(line4);
|
||||
|
||||
bool test1 = output1.prob == StringPiece("0.0804289 0.141656 0.0804289 0.443409 2.718");
|
||||
bool test2 = output2.word_all1 == StringPiece("0-0 1-1 2-2 3-3 4-4 4-5 5-6");
|
||||
bool test3 = output2.target_phrase == StringPiece("! ! ! ) - , a");
|
||||
bool test4 = output3.source_phrase == StringPiece("! ! ! ) ,");
|
||||
bool test5 = output4.word_all2 == StringPiece("1 3 1");
|
||||
|
||||
//std::cout << test1 << " " << test2 << " " << test3 << " " << test4 << std::endl;
|
||||
|
||||
return (test1 && test2 && test3 && test4 && test5);
|
||||
|
||||
}
|
||||
|
||||
bool test_linesplitter()
|
||||
{
|
||||
StringPiece line1 = StringPiece("! ] 0.0738539 0.901133 0.0738539 0.65207 2.718 0-0 1-1 1 1 1");
|
||||
target_text ans1;
|
||||
ans1 = splitSingleTargetLine(line1);
|
||||
|
||||
/* For testing purposes
|
||||
std::cout << ans1.target_phrase[0] << " " <<ans1.target_phrase[1] << " Size: " << ans1.target_phrase.size() << std::endl;
|
||||
std::cout << ans1.word_all1[3] << " " << ans1.word_all2[2] << " " << ans1.prob[3] << std::endl; */
|
||||
|
||||
return (ans1.target_phrase.size() == 2 && ans1.prob.size() == 5 && ans1.word_all1.size() == 4 && ans1.word_all2.size() == 3);
|
||||
}
|
||||
|
||||
bool test_linessplitter()
|
||||
{
|
||||
StringPiece line1 = StringPiece("! ] 0.0738539 0.901133 0.0738539 0.65207 2.718 0-0 1-1 1 1 1\n\n! ) . proto došlo 0.0738539 7.14446e-06");
|
||||
StringPiece line2 = StringPiece("! " ) 0.536553 0.75961 0.634108 0.532927 2.718 0-0 1-1 2-2 13 11 8\n! ) . 0.0369269 0.00049839 0.00671399 0.00372884 2.718 0-0 1-1 2-1 2-2 2 11 1\n" ! ) 0.0738539 0.75961 0.00671399 0.532927 2.718 1-0 0-1 2-2 1 11 1\nse ! " ) 0.0738539 0.75961 0.00671399 0.0225211 2.718 0-1 1-2 2-3 1 11 1\n\n! " , a to 0.0738539 0.0894238 0.0738539 0.048");
|
||||
|
||||
std::vector<target_text> ans1;
|
||||
std::vector<target_text> ans2;
|
||||
|
||||
ans1 = splitTargetLine(line1);
|
||||
ans2 = splitTargetLine(line2);
|
||||
|
||||
bool sizes = ans1.size() == 1 && ans2.size() == 4;
|
||||
bool prob = ans1[0].prob[3] == 0.65207 && ans2[1].prob[1] == 0.00049839;
|
||||
bool word_alls = ans2[0].word_all2[1] == 11 && ans2[3].word_all1[5] == 3;
|
||||
|
||||
/* FOr testing
|
||||
std::cout << ans1.size() << std::endl;
|
||||
std::cout << ans2.size() << std::endl;
|
||||
std::cout << ans1[0].prob[3] << std::endl;
|
||||
std::cout << ans2[1].prob[1] << std::endl;
|
||||
std::cout << ans2[0].word_all2[1] << std::endl;
|
||||
std::cout << ans2[3].word_all1[5] << std::endl; */
|
||||
|
||||
return sizes && prob && word_alls;
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
if (probabilitiesTest() && wordAll1test() && wordAll2test() && test_tokenization() && test_linesplitter() && test_linessplitter() && test_vectorinsert()) {
|
||||
std::cout << "All tests pass!" << std::endl;
|
||||
} else {
|
||||
std::cout << "Failiure in some tests!" << std::endl;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
@ -1,46 +0,0 @@
|
||||
#include <map> //Map for vocab ids
|
||||
|
||||
#include "hash.hh"
|
||||
#include "vocabid.hh"
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
|
||||
//Create a map and serialize it
|
||||
std::map<uint64_t, std::string> vocabids;
|
||||
StringPiece demotext = StringPiece("Demo text with 3 elements");
|
||||
add_to_map(&vocabids, demotext);
|
||||
//Serialize map
|
||||
serialize_map(&vocabids, "/tmp/testmap.bin");
|
||||
|
||||
//Read the map and test if the values are the same
|
||||
std::map<uint64_t, std::string> newmap;
|
||||
read_map(&newmap, "/tmp/testmap.bin");
|
||||
|
||||
//Used hashes
|
||||
uint64_t num1 = getHash(StringPiece("Demo"));
|
||||
uint64_t num2 = getVocabID("text");
|
||||
uint64_t num3 = getHash(StringPiece("with"));
|
||||
uint64_t num4 = getVocabID("3");
|
||||
uint64_t num5 = getHash(StringPiece("elements"));
|
||||
uint64_t num6 = 0;
|
||||
|
||||
//Tests
|
||||
bool test1 = getStringFromID(&newmap, num1) == getStringFromID(&vocabids, num1);
|
||||
bool test2 = getStringFromID(&newmap, num2) == getStringFromID(&vocabids, num2);
|
||||
bool test3 = getStringFromID(&newmap, num3) == getStringFromID(&vocabids, num3);
|
||||
bool test4 = getStringFromID(&newmap, num4) == getStringFromID(&vocabids, num4);
|
||||
bool test5 = getStringFromID(&newmap, num5) == getStringFromID(&vocabids, num5);
|
||||
bool test6 = getStringFromID(&newmap, num6) == getStringFromID(&vocabids, num6);
|
||||
|
||||
|
||||
if (test1 && test2 && test3 && test4 && test5 && test6) {
|
||||
std::cout << "Map was successfully written and read!" << std::endl;
|
||||
} else {
|
||||
std::cout << "Error! " << test1 << " " << test2 << " " << test3 << " " << test4 << " " << test5 << " " << test6 << std::endl;
|
||||
}
|
||||
|
||||
|
||||
return 1;
|
||||
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use Getopt::Long "GetOptions";
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use Getopt::Std;
|
||||
getopts('q');
|
||||
|
@ -1,4 +1,4 @@
|
||||
#! /usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
use strict;
|
||||
|
||||
my $file = shift(@ARGV);
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
#input hindi word urdu word, delete all those entries that have number on any side
|
||||
use utf8;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use utf8;
|
||||
require Encode;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use utf8;
|
||||
use strict;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
use utf8;
|
||||
|
||||
###############################################
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id$
|
||||
# Reads a source and hypothesis file and counts equal tokens. Some of these
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
# Display OOV rate of a test set against a training corpus or a phrase table.
|
||||
# Ondrej Bojar
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id$
|
||||
#sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
#
|
||||
# Author : Loic BARRAULT
|
||||
# Script to convert MOSES searchgraph to DOT format
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id$
|
||||
#show-phrases-used: display all source and target phrases for each sentence in a corpus, and give average phrase length used
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id$
|
||||
#by Philipp Koehn, de-augmented by Evan Herbst
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
# Collects and prints all n-grams that appear in the given corpus both
|
||||
# tokenized as well as untokenized.
|
||||
# Ondrej Bojar
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
# runs Moses many times changing the values of one weight, all others fixed
|
||||
# nbest lists are always produced to allow for comparison of real and
|
||||
# 'projected' BLEU (BLEU estimated from n-best lists collected at a neighouring
|
||||
|
@ -185,7 +185,7 @@ lowercase
|
||||
default-name: lm/lowercased
|
||||
pass-unless: output-lowercaser
|
||||
ignore-if: output-truecaser
|
||||
only-factor-0: yes
|
||||
#only-factor-0: yes
|
||||
template: $output-lowercaser < IN > OUT
|
||||
parallelizable: yes
|
||||
truecase
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# Experiment Management System
|
||||
# Documentation at http://www.statmt.org/moses/?n=FactoredTraining.EMS
|
||||
@ -18,7 +18,18 @@ sub trim($)
|
||||
my $host = `hostname`; chop($host);
|
||||
print STDERR "STARTING UP AS PROCESS $$ ON $host AT ".`date`;
|
||||
|
||||
my ($CONFIG_FILE,$EXECUTE,$NO_GRAPH,$CONTINUE,$FINAL_STEP,$FINAL_OUT,$VERBOSE,$IGNORE_TIME,$DELETE_CRASHED,$DELETE_VERSION);
|
||||
my ($CONFIG_FILE,
|
||||
$EXECUTE,
|
||||
$NO_GRAPH,
|
||||
$CONTINUE,
|
||||
$FINAL_STEP,
|
||||
$FINAL_OUT,
|
||||
$VERBOSE,
|
||||
$IGNORE_TIME,
|
||||
$DELETE_CRASHED,
|
||||
$DELETE_VERSION
|
||||
);
|
||||
|
||||
my $SLEEP = 2;
|
||||
my $META = "$RealBin/experiment.meta";
|
||||
|
||||
@ -3442,7 +3453,7 @@ sub create_step {
|
||||
$subdir = "lm" if $subdir eq "interpolated-lm";
|
||||
open(STEP,">$file") or die "Cannot open: $!";
|
||||
print STEP "#!/bin/bash\n\n";
|
||||
print STEP "PATH=\"".$ENV{"PATH"}."\"\n";
|
||||
print STEP "PATH=\"".$ENV{"PATH"}."\"\n";
|
||||
print STEP "cd $dir\n";
|
||||
print STEP "echo 'starting at '`date`' on '`hostname`\n";
|
||||
print STEP "mkdir -p $dir/$subdir\n\n";
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use Getopt::Long "GetOptions";
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id: consolidate-training-data.perl 928 2009-09-02 02:58:01Z philipp $
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use IPC::Open3;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use Getopt::Long "GetOptions";
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use FindBin qw($RealBin);
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id: report-experiment-scores.perl 407 2008-11-10 14:43:31Z philipp $
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use File::Temp qw/ tempfile tempdir /;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# Based on Preprocessor written by Philipp Koehn
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use Getopt::Long "GetOptions";
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# experiment.perl support script
|
||||
# get filtered rule and reordering tables and place them into a configuration file
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# experiment.perl support script
|
||||
# get filtered rule and reordering tables and place them into a configuration file
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use Getopt::Long "GetOptions";
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use Date::Parse;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
binmode( STDIN, ":utf8" );
|
||||
binmode( STDOUT, ":utf8" );
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use Getopt::Long "GetOptions";
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id$
|
||||
#extract-factors.pl: extract only the desired factors from a factored corpus
|
||||
|
@ -1,4 +1,4 @@
|
||||
#! /usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# example
|
||||
# ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env perl
|
||||
#!/usr/bin/env perl
|
||||
# A very simple script that converts fsa format (openfst lattices) to the same
|
||||
# thing represented one sentence per line. It uses '|||' to delimit columns and
|
||||
# ' ' to delimit nodes (i.e. original lines).
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
# Converts AT&T FSA format to 'python lattice format'.
|
||||
# Note that the input FSA needs to be epsilon-free and topologically sorted.
|
||||
# This script checks for topological sortedness.
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env perl
|
||||
#!/usr/bin/env perl
|
||||
# A very simple script that converts fsal back to fsa format (openfst lattices)
|
||||
# Ondrej Bojar, bojar@ufal.mff.cuni.cz
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use utf8;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#! /usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# example
|
||||
# ~/giza-parallel.perl 10 split ~/workspace/sourceforge/trunk/scripts/training/train-model.perl ar en train align
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id$
|
||||
#lopar2pos: extract POSs from LOPAR output
|
||||
|
@ -1,4 +1,4 @@
|
||||
#! /usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id$
|
||||
#######################
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use utf8;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id$
|
||||
use strict;
|
||||
|
@ -1,4 +1,5 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
package ph_numbers;
|
||||
|
||||
# Script to recognize and replace numbers in Moses training corpora
|
||||
|
@ -1,4 +1,4 @@
|
||||
#! /usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id$
|
||||
use strict;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#! /usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# example
|
||||
# ./score-parallel.perl 8 "gsort --batch-size=253" ./score ./extract.2.sorted.gz ./lex.2.f2e ./phrase-table.2.half.f2e --GoodTuring ./phrase-table.2.coc 0
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# Compatible with sri LM-creating script, eg.
|
||||
# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# Compatible with sri LM-creating script, eg.
|
||||
# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use File::Basename;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id$
|
||||
# convert a phrase-table with alignment in Moses' dead-end format
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use Getopt::Long "GetOptions";
|
||||
|
@ -1,4 +1,5 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# Uses Google AJAX API to collect many translations, i.e. create a parallel
|
||||
# corpus of Google translations.
|
||||
# Expects one sentence per line, not tokenized!
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
#retain lines in clean.lines-retained.1
|
||||
use strict;
|
||||
|
@ -1,4 +1,5 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# Script implemented by Pranava Swaroop Madhyastha (a student at Charles
|
||||
# University, UFAL)
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use Getopt::Long "GetOptions";
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id$
|
||||
use strict;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id$
|
||||
use strict;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
|
||||
use strict;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
my ($results, $truth) = @ARGV;
|
||||
|
@ -1,4 +1,4 @@
|
||||
#! /usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
|
||||
|
@ -1,4 +1,4 @@
|
||||
#! /usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#! /usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl -w
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $
|
||||
# Sample De-Tokenizer
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user