Mirror of https://github.com/moses-smt/mosesdecoder.git (synced 2024-12-30 15:34:01 +03:00)
commit 216af91d1f
Merge branch 'master' of github.com:moses-smt/mosesdecoder
@@ -73,7 +73,7 @@ you're ready to install packages in non-standard paths:
 
 #For Boost:
 ./bootstrap.sh
-./b2 --prefix=$PREFIX --libdir=$PREFIX/lib64 link=static,shared threading=multi install
+./b2 --prefix=$PREFIX --libdir=$PREFIX/lib64 --layout=tagged link=static,shared threading=multi install
 
 --------------------------------------------------------------------------
bjam (13 changed lines)
@@ -1,13 +1,14 @@
 #!/bin/bash
 set -e
-if
-    which bjam >/dev/null 2>/dev/null && #Have a bjam in path
-    ! grep UFIHGUFIHBDJKNCFZXAEVA "$(which bjam)" >/dev/null && #bjam in path isn't this script
-    bjam --help >/dev/null 2>/dev/null && #bjam in path isn't broken (i.e. has boost-build)
-    bjam --version |grep "Boost.Build 201" >/dev/null 2>/dev/null #It's recent enough.
+if
+    bjam="$(which bjam 2>/dev/null)" && #exists
+    [ ${#bjam} != 0 ] && #paranoia about which printing nothing then returning true
+    ! grep UFIHGUFIHBDJKNCFZXAEVA "${bjam}" </dev/null >/dev/null && #bjam in path isn't this script
+    "${bjam}" --help >/dev/null 2>/dev/null && #bjam in path isn't broken (i.e. has boost-build)
+    "${bjam}" --version |grep "Boost.Build 201" >/dev/null 2>/dev/null #It's recent enough.
 then
   #Delegate to system bjam
-  exec bjam "$@"
+  exec "${bjam}" "$@"
 fi
 
 top="$(dirname "$0")"
@@ -1,90 +1,9 @@
-#include "lm/enumerate_vocab.hh"
-#include "lm/model.hh"
+#include "lm/ngram_query.hh"
-
-#include <cstdlib>
-#include <fstream>
-#include <iostream>
-#include <string>
-
-#include <ctype.h>
-#if !defined(_WIN32) && !defined(_WIN64)
-#include <sys/resource.h>
-#include <sys/time.h>
-#endif
-
-#if !defined(_WIN32) && !defined(_WIN64)
-float FloatSec(const struct timeval &tv) {
-  return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_usec) / 1000000000.0);
-}
-#endif
-
-void PrintUsage(const char *message) {
-#if !defined(_WIN32) && !defined(_WIN64)
-  struct rusage usage;
-  if (getrusage(RUSAGE_SELF, &usage)) {
-    perror("getrusage");
-    return;
-  }
-  std::cerr << message;
-  std::cerr << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n';
-
-  // Linux doesn't set memory usage :-(.
-  std::ifstream status("/proc/self/status", std::ios::in);
-  std::string line;
-  while (getline(status, line)) {
-    if (!strncmp(line.c_str(), "VmRSS:\t", 7)) {
-      std::cerr << "rss " << (line.c_str() + 7) << '\n';
-      break;
-    }
-  }
-#endif
-}
-
-template <class Model> void Query(const Model &model, bool sentence_context) {
-  PrintUsage("Loading statistics:\n");
-  typename Model::State state, out;
-  lm::FullScoreReturn ret;
-  std::string word;
-
-  while (std::cin) {
-    state = sentence_context ? model.BeginSentenceState() : model.NullContextState();
-    float total = 0.0;
-    bool got = false;
-    unsigned int oov = 0;
-    while (std::cin >> word) {
-      got = true;
-      lm::WordIndex vocab = model.GetVocabulary().Index(word);
-      if (vocab == 0) ++oov;
-      ret = model.FullScore(state, vocab, out);
-      total += ret.prob;
-      std::cout << word << '=' << vocab << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';
-      state = out;
-      char c;
-      while (true) {
-        c = std::cin.get();
-        if (!std::cin) break;
-        if (c == '\n') break;
-        if (!isspace(c)) {
-          std::cin.unget();
-          break;
-        }
-      }
-      if (c == '\n') break;
-    }
-    if (!got && !std::cin) break;
-    if (sentence_context) {
-      ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out);
-      total += ret.prob;
-      std::cout << "</s>=" << model.GetVocabulary().EndSentence() << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';
-    }
-    std::cout << "Total: " << total << " OOV: " << oov << '\n';
-  }
-  PrintUsage("After queries:\n");
-}
 
 template <class Model> void Query(const char *name) {
   lm::ngram::Config config;
-  Model model(name, config);
+  Model model(name, config, std::cin, std::cout);
   Query(model);
 }
 
@@ -100,19 +19,19 @@ int main(int argc, char *argv[]) {
   if (lm::ngram::RecognizeBinary(argv[1], model_type)) {
     switch(model_type) {
       case lm::ngram::HASH_PROBING:
-        Query<lm::ngram::ProbingModel>(argv[1], sentence_context);
+        Query<lm::ngram::ProbingModel>(argv[1], sentence_context, std::cin, std::cout);
         break;
       case lm::ngram::TRIE_SORTED:
-        Query<lm::ngram::TrieModel>(argv[1], sentence_context);
+        Query<lm::ngram::TrieModel>(argv[1], sentence_context, std::cin, std::cout);
         break;
       case lm::ngram::QUANT_TRIE_SORTED:
-        Query<lm::ngram::QuantTrieModel>(argv[1], sentence_context);
+        Query<lm::ngram::QuantTrieModel>(argv[1], sentence_context, std::cin, std::cout);
        break;
       case lm::ngram::ARRAY_TRIE_SORTED:
-        Query<lm::ngram::ArrayTrieModel>(argv[1], sentence_context);
+        Query<lm::ngram::ArrayTrieModel>(argv[1], sentence_context, std::cin, std::cout);
        break;
       case lm::ngram::QUANT_ARRAY_TRIE_SORTED:
-        Query<lm::ngram::QuantArrayTrieModel>(argv[1], sentence_context);
+        Query<lm::ngram::QuantArrayTrieModel>(argv[1], sentence_context, std::cin, std::cout);
        break;
       case lm::ngram::HASH_SORTED:
       default:
@@ -120,7 +39,7 @@ int main(int argc, char *argv[]) {
         abort();
     }
   } else {
-    Query<lm::ngram::ProbingModel>(argv[1], sentence_context);
+    Query<lm::ngram::ProbingModel>(argv[1], sentence_context, std::cin, std::cout);
   }
 
   PrintUsage("Total time including destruction:\n");
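A note for readers tracing the API change above: Query() now threads explicit istream/ostream references through every call, so a caller can choose its streams per invocation. Below is a minimal sketch of an equivalent caller, assuming only the kenlm types already named in this commit; the file name is hypothetical and this is not part of the commit itself.

// Sketch only: mirrors main()'s dispatch using the stream-taking Query()
// from the new lm/ngram_query.hh.
#include "lm/model.hh"
#include "lm/ngram_query.hh"

#include <iostream>

void QueryFile(const char *file) {  // e.g. "model.bin" (hypothetical)
  lm::ngram::Config config;
  lm::ngram::ModelType model_type;
  if (lm::ngram::RecognizeBinary(file, model_type) && model_type == lm::ngram::TRIE_SORTED) {
    // A binary file records its type; instantiate the matching template.
    lm::ngram::TrieModel model(file, config);
    Query(model, true, std::cin, std::cout);
  } else {
    // ARPA text (and probing binaries) load through the default probing model.
    lm::ngram::ProbingModel model(file, config);
    Query(model, true, std::cin, std::cout);
  }
}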
lm/ngram_query.hh (new file, 91 lines)
@@ -0,0 +1,91 @@
+#ifndef LM_NGRAM_QUERY__
+#define LM_NGRAM_QUERY__
+
+#include "lm/enumerate_vocab.hh"
+#include "lm/model.hh"
+
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include <ctype.h>
+#if !defined(_WIN32) && !defined(_WIN64)
+#include <sys/resource.h>
+#include <sys/time.h>
+#endif
+
+#if !defined(_WIN32) && !defined(_WIN64)
+float FloatSec(const struct timeval &tv) {
+  return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_usec) / 1000000000.0);
+}
+#endif
+
+void PrintUsage(const char *message) {
+#if !defined(_WIN32) && !defined(_WIN64)
+  struct rusage usage;
+  if (getrusage(RUSAGE_SELF, &usage)) {
+    perror("getrusage");
+    return;
+  }
+  std::cerr << message;
+  std::cerr << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n';
+
+  // Linux doesn't set memory usage :-(.
+  std::ifstream status("/proc/self/status", std::ios::in);
+  std::string line;
+  while (getline(status, line)) {
+    if (!strncmp(line.c_str(), "VmRSS:\t", 7)) {
+      std::cerr << "rss " << (line.c_str() + 7) << '\n';
+      break;
+    }
+  }
+#endif
+}
+
+template <class Model> void Query(const Model &model, bool sentence_context, std::istream &inStream, std::ostream &outStream) {
+  PrintUsage("Loading statistics:\n");
+  typename Model::State state, out;
+  lm::FullScoreReturn ret;
+  std::string word;
+
+  while (inStream) {
+    state = sentence_context ? model.BeginSentenceState() : model.NullContextState();
+    float total = 0.0;
+    bool got = false;
+    unsigned int oov = 0;
+    while (inStream >> word) {
+      got = true;
+      lm::WordIndex vocab = model.GetVocabulary().Index(word);
+      if (vocab == 0) ++oov;
+      ret = model.FullScore(state, vocab, out);
+      total += ret.prob;
+      outStream << word << '=' << vocab << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';
+      state = out;
+      char c;
+      while (true) {
+        c = inStream.get();
+        if (!inStream) break;
+        if (c == '\n') break;
+        if (!isspace(c)) {
+          inStream.unget();
+          break;
+        }
+      }
+      if (c == '\n') break;
+    }
+    if (!got && !inStream) break;
+    if (sentence_context) {
+      ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out);
+      total += ret.prob;
+      outStream << "</s>=" << model.GetVocabulary().EndSentence() << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';
+    }
+    outStream << "Total: " << total << " OOV: " << oov << '\n';
+  }
+  PrintUsage("After queries:\n");
+}
+
+
+#endif // LM_NGRAM_QUERY__
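Because the templated Query() above takes its streams as parameters rather than hard-wiring std::cin and std::cout, it can be driven from in-memory buffers, which is handy for testing. A minimal sketch under the assumption that kenlm is on the include path; "test.arpa" is a hypothetical model file:

// Sketch only: exercises the header's Query() with string streams.
#include "lm/ngram_query.hh"

#include <iostream>
#include <sstream>

int main() {
  lm::ngram::Config config;
  lm::ngram::ProbingModel model("test.arpa", config);  // hypothetical LM file

  std::istringstream in("this is a test\n");
  std::ostringstream out;
  Query(model, true, in, out);  // writes word=id length prob ... Total/OOV lines

  std::cout << out.str();
  return 0;
}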
@@ -47,6 +47,99 @@ Data::~Data() {
   }
 }
 
+//ADDED BY TS
+void Data::remove_duplicates() {
+
+  size_t nSentences = featdata->size();
+  assert(scoredata->size() == nSentences);
+
+  for (size_t s=0; s < nSentences; s++) {
+
+    FeatureArray& feat_array = featdata->get(s);
+    ScoreArray& score_array = scoredata->get(s);
+
+    assert(feat_array.size() == score_array.size());
+
+    //serves as a hash-map:
+    std::map<double, std::vector<size_t> > lookup;
+
+    size_t end_pos = feat_array.size() - 1;
+
+    size_t nRemoved = 0;
+    for (size_t k=0; k <= end_pos; k++) {
+
+      const FeatureStats& cur_feats = feat_array.get(k);
+
+      double sum = 0.0;
+      for (size_t l=0; l < cur_feats.size(); l++)
+        sum += cur_feats.get(l);
+
+      if (lookup.find(sum) != lookup.end()) {
+
+        //std::cerr << "hit" << std::endl;
+
+        std::vector<size_t>& cur_list = lookup[sum];
+
+        size_t l=0;
+        for (l=0; l < cur_list.size(); l++) {
+
+          size_t j=cur_list[l];
+
+          if (cur_feats == feat_array.get(j)
+              && score_array.get(k) == score_array.get(j)) {
+
+            if (k < end_pos) {
+
+              feat_array.swap(k,end_pos);
+              score_array.swap(k,end_pos);
+
+              k--;
+            }
+
+            end_pos--;
+            nRemoved++;
+            break;
+          }
+        }
+
+        if (l == lookup[sum].size())
+          cur_list.push_back(k);
+      }
+      else
+        lookup[sum].push_back(k);
+
+      // for (size_t j=0; j < k; j++) {
+
+      //   if (feat_array.get(k) == feat_array.get(j)
+      //       && score_array.get(k) == score_array.get(j)) {
+
+      //     if (k < end_pos) {
+
+      //       feat_array.swap(k,end_pos);
+      //       score_array.swap(k,end_pos);
+
+      //       k--;
+      //     }
+
+      //     end_pos--;
+      //     nRemoved++;
+      //     break;
+      //   }
+      // }
+    }
+
+    std::cerr << "removed " << nRemoved << "/" << feat_array.size() << std::endl;
+
+    if (nRemoved > 0) {
+
+      feat_array.resize(end_pos+1);
+      score_array.resize(end_pos+1);
+    }
+  }
+}
+//END_ADDED
+
 
 void Data::loadnbest(const std::string &file)
 {
   TRACE_ERR("loading nbest from " << file << std::endl);
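The duplicate-removal strategy above is worth spelling out: the sum of a candidate's feature values acts as a cheap hash key, full feature and score vectors are compared only within a bucket, duplicates are swapped to the tail, and one resize() trims them off. A self-contained sketch of the same idea on plain vectors (hypothetical data, standard library only, not the mert classes):

#include <algorithm>
#include <iostream>
#include <map>
#include <numeric>
#include <vector>

// Bucket rows by their value sum, compare full rows only within a bucket,
// swap duplicates to the tail, then trim with a single resize -- mirroring
// Data::remove_duplicates().
void remove_duplicates(std::vector<std::vector<double> > &rows) {
  std::map<double, std::vector<size_t> > lookup;  // sum -> kept row indices
  size_t end_pos = rows.size();                   // one past the last kept row
  for (size_t k = 0; k < end_pos; ++k) {
    double sum = std::accumulate(rows[k].begin(), rows[k].end(), 0.0);
    std::vector<size_t> &bucket = lookup[sum];
    bool duplicate = false;
    for (size_t l = 0; l < bucket.size(); ++l) {
      if (rows[k] == rows[bucket[l]]) { duplicate = true; break; }
    }
    if (duplicate) {
      std::swap(rows[k], rows[--end_pos]);  // move duplicate behind end_pos
      --k;                                  // re-examine the swapped-in row
    } else {
      bucket.push_back(k);
    }
  }
  rows.resize(end_pos);
}

int main() {
  std::vector<std::vector<double> > rows;
  double a[] = {1, 2}, b[] = {3, 0}, c[] = {1, 2};
  rows.push_back(std::vector<double>(a, a + 2));
  rows.push_back(std::vector<double>(b, b + 2));
  rows.push_back(std::vector<double>(c, c + 2));  // duplicate of the first row
  remove_duplicates(rows);
  std::cout << "kept " << rows.size() << " rows\n";  // prints: kept 2 rows
}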
@@ -73,7 +73,7 @@ public:
   void mergeSparseFeatures();
 
   void loadnbest(const std::string &file);
 
 
   void load(const std::string &featfile,const std::string &scorefile) {
     featdata->load(featfile);
     scoredata->load(scorefile);
@@ -81,6 +81,10 @@ public:
     _sparse_flag = true;
   }
 
+  //ADDED BY TS
+  void remove_duplicates();
+  //END_ADDED
+
   void save(const std::string &featfile,const std::string &scorefile, bool bin=false) {
 
     if (bin) cerr << "Binary write mode is selected" << endl;
@@ -63,6 +63,16 @@ public:
     array_.push_back(e);
   }
 
+  //ADDED BY TS
+  void swap(size_t i, size_t j) {
+    std::swap(array_[i],array_[j]);
+  }
+
+  void resize(size_t new_size) {
+    array_.resize(std::min(new_size,array_.size()));
+  }
+  //END_ADDED
+
   void merge(FeatureArray& e);
 
   inline size_t size() const {
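One detail of the resize() added above: clamping with std::min means it can only shrink the array; a request larger than the current size is a no-op rather than growth with default-constructed entries, which is exactly what the duplicate-removal pass needs. A tiny illustration with a plain vector (hypothetical values):

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> v(5, 7);                  // five entries
  v.resize(std::min<size_t>(10, v.size()));  // clamped: still 5
  std::cout << v.size() << '\n';             // prints 5
  v.resize(std::min<size_t>(3, v.size()));   // shrinks to 3
  std::cout << v.size() << '\n';             // prints 3
}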
@@ -41,7 +41,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 class FileFormatException : public util::Exception
 {
   public:
-    explicit FileFormatException(const std::string filename, const std::string& line) {
+    explicit FileFormatException(const std::string& filename, const std::string& line) {
       *this << "Error in line \"" << line << "\" of " << filename;
     }
 };
@@ -68,7 +68,7 @@ class FeatureDataIterator :
 {
   public:
     FeatureDataIterator();
-    FeatureDataIterator(const std::string& filename);
+    explicit FeatureDataIterator(const std::string& filename);
 
     static FeatureDataIterator end() {
       return FeatureDataIterator();
@@ -89,5 +89,3 @@ class FeatureDataIterator :
 };
 
 #endif
-
-
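The two hunks above add explicit to single-argument constructors, which stops a std::string from silently converting into an iterator at call sites. A minimal illustration of the difference, using hypothetical stand-in types rather than the real classes:

#include <string>

struct ImplicitIter {
  ImplicitIter(const std::string &filename) {}  // converting constructor
};

struct ExplicitIter {
  explicit ExplicitIter(const std::string &filename) {}
};

void TakeImplicit(ImplicitIter it) {}
void TakeExplicit(ExplicitIter it) {}

int main() {
  std::string f = "features.dat";  // hypothetical file name
  TakeImplicit(f);                 // compiles: the string converts silently
  // TakeExplicit(f);              // error: explicit blocks the conversion
  TakeExplicit(ExplicitIter(f));   // the conversion must be spelled out
}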
@@ -218,3 +218,19 @@ ostream& operator<<(ostream& o, const FeatureStats& e)
 
   return o;
 }
+
+//ADEED_BY_TS
+bool operator==(const FeatureStats& f1, const FeatureStats& f2) {
+  size_t size = f1.size();
+
+  if (size != f2.size())
+    return false;
+
+  for (size_t k=0; k < size; k++) {
+    if (f1.get(k) != f2.get(k))
+      return false;
+  }
+
+  return true;
+}
+//END_ADDED
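Note that the equality added above is element-wise and exact, with no epsilon: it is meant to detect identical n-best entries, whose statistics are bit-for-bit equal, not to compare nearly-equal floats. A short sketch of the same semantics on plain arrays (hypothetical values):

#include <cstddef>
#include <iostream>

// Exact element-wise equality, as in the FeatureStats/ScoreStats operators.
bool EqualStats(const double *a, std::size_t na, const double *b, std::size_t nb) {
  if (na != nb) return false;
  for (std::size_t k = 0; k < na; k++)
    if (a[k] != b[k]) return false;  // exact comparison, no tolerance
  return true;
}

int main() {
  double x[] = {0.1, 0.2};
  double y[] = {0.1, 0.2};
  double z[] = {0.1, 0.2 + 1e-16};              // differs in the last bits
  std::cout << EqualStats(x, 2, y, 2) << '\n';  // prints 1
  std::cout << EqualStats(x, 2, z, 2) << '\n';  // prints 0
}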
@@ -134,4 +134,8 @@ public:
   friend ostream& operator<<(ostream& o, const FeatureStats& e);
 };
 
+//ADEED_BY_TS
+bool operator==(const FeatureStats& f1, const FeatureStats& f2);
+//END_ADDED
+
 #endif // FEATURE_STATS_H
@@ -62,6 +62,16 @@ public:
     array_.push_back(e);
   }
 
+  //ADDED BY TS
+  void swap(size_t i, size_t j) {
+    std::swap(array_[i],array_[j]);
+  }
+
+  void resize(size_t new_size) {
+    array_.resize(std::min(new_size,array_.size()));
+  }
+  //END_ADDED
+
   void merge(ScoreArray& e);
 
   inline std::string name() const {
@@ -43,7 +43,7 @@ class ScoreDataIterator :
 {
   public:
     ScoreDataIterator();
-    ScoreDataIterator(const std::string& filename);
+    explicit ScoreDataIterator(const std::string& filename);
 
     static ScoreDataIterator end() {
      return ScoreDataIterator();
@@ -62,6 +62,4 @@ class ScoreDataIterator :
     std::vector<ScoreDataItem> m_next;
 };
 
-
 #endif
-
|
@ -132,3 +132,19 @@ ostream& operator<<(ostream& o, const ScoreStats& e)
|
||||
o << e.get(i) << " ";
|
||||
return o;
|
||||
}
|
||||
|
||||
//ADDED_BY_TS
|
||||
bool operator==(const ScoreStats& s1, const ScoreStats& s2) {
|
||||
size_t size = s1.size();
|
||||
|
||||
if (size != s2.size())
|
||||
return false;
|
||||
|
||||
for (size_t k=0; k < size; k++) {
|
||||
if (s1.get(k) != s2.get(k))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
//END_ADDED
|
||||
|
@@ -100,4 +100,8 @@ public:
   friend ostream& operator<<(ostream& o, const ScoreStats& e);
 };
 
+//ADDED_BY_TS
+bool operator==(const ScoreStats& s1, const ScoreStats& s2);
+//END_ADDED
+
 #endif // SCORE_STATS_H
@@ -84,7 +84,7 @@ void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry
   } else if ( result.scoreAv() > tmp_result.scoreAv() ) {
     result = tmp_result;
   }
-
+  delete evaluation;
 }
 ostringstream stats;
 // multiplication by 100 in order to keep the average precision
@@ -182,6 +182,10 @@ int main(int argc, char** argv)
 
   PrintUserTime("Nbest entries loaded and scored");
 
+  //ADDED_BY_TS
+  data.remove_duplicates();
+  //END_ADDED
+
   if (binmode)
     cerr << "Binary write mode is selected" << endl;
   else
@@ -1,5 +1,5 @@
 /**
- * \description The is the main for the new version of the mert algorithm developed during the 2nd MT marathon
+ * \description This is the main for the new version of the mert algorithm developed during the 2nd MT marathon
 */
 
 #include <limits>
@@ -260,6 +260,7 @@ int main (int argc, char **argv)
   if(j<pdim) {
     cerr<<initfile<<":Too few minimum weights." << endl;
     cerr<<"error could not initialize start point with " << initfile << endl;
+    std::cerr << "j: " << j << ", pdim: " << pdim << std::endl;
     exit(3);
   }
   max.resize(pdim);
@@ -297,6 +298,10 @@ int main (int argc, char **argv)
     D.load(FeatureDataFiles.at(i), ScoreDataFiles.at(i));
   }
 
+  //ADDED_BY_TS
+  D.remove_duplicates();
+  //END_ADDED
+
   PrintUserTime("Data loaded");
 
   // starting point score over latest n-best, accumulative n-best
@@ -101,11 +101,11 @@ template <class Model> class LanguageModelKen : public LanguageModel {
     lm::WordIndex *end = indices + m_ngram->Order() - 1;
     int position = hypo.GetCurrTargetWordsRange().GetEndPos();
     for (; ; ++index, --position) {
-      if (index == end) return index;
       if (position == -1) {
         *index = m_ngram->GetVocabulary().BeginSentence();
         return index + 1;
       }
+      if (index == end) return index;
       *index = TranslateID(hypo.GetWord(position));
     }
   }
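The reordering above matters in exactly one corner case: when the context buffer fills in the same step that the start of the sentence is reached. Checking position == -1 first lets the begin-of-sentence marker occupy the final slot (end points at the last slot, so writing there is in bounds) instead of being dropped. A toy reproduction of the fixed loop with plain ints (hypothetical word IDs; BOS is 1), not the real Moses code:

#include <iostream>

// Backward context fill mirroring the corrected loop in LanguageModelKen.
int *Fill(const int *words, int position, int *index, int *end, int bos) {
  for (; ; ++index, --position) {
    if (position == -1) {            // sentence start: emit BOS, even at the edge
      *index = bos;
      return index + 1;
    }
    if (index == end) return index;  // buffer full before the sentence start
    *index = words[position];
  }
}

int main() {
  const int words[] = {42, 43};  // hypothetical IDs; words[0] is sentence-initial
  int buf[3];                    // order - 1 word slots plus one for BOS
  int *out = Fill(words, 1, buf, buf + 2, 1);
  for (int *p = buf; p != out; ++p) std::cout << *p << ' ';  // prints: 43 42 1
  std::cout << '\n';
}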
@@ -2,20 +2,20 @@ STDOUT_1=1020 14190 86273 FEATSTAT.run1
 STDOUT_2=1020 9060 26328 SCORESTAT.run1
 STDOUT_3=1020 14190 86273 FEATSTAT.run1.2
 STDOUT_4=1020 9060 26328 SCORESTAT.run1.2
-STDOUT_5=2020 28190 172503 FEATSTAT.run2
-STDOUT_6=2020 18060 52341 SCORESTAT.run2
-STDOUT_7=2020 28190 172503 FEATSTAT.run2.2
-STDOUT_8=2020 18060 52341 SCORESTAT.run2.2
-STDOUT_9=3020 42190 264672 FEATSTAT.run3
-STDOUT_10=3020 27060 77299 SCORESTAT.run3
-STDOUT_11=3020 42190 264672 FEATSTAT.run3.2
-STDOUT_12=3020 27060 77299 SCORESTAT.run3.2
-STDOUT_13=4020 56190 360150 FEATSTAT.run4
-STDOUT_14=4020 36060 103698 SCORESTAT.run4
-STDOUT_15=4020 56190 360150 FEATSTAT.run4.2
-STDOUT_16=4020 36060 103698 SCORESTAT.run4.2
-STDOUT_17=5020 70190 462892 FEATSTAT.run5
-STDOUT_18=5020 45060 129840 SCORESTAT.run5
-STDOUT_19=5020 70190 462892 FEATSTAT.run5.2
-STDOUT_20=5020 45060 129840 SCORESTAT.run5.2
-TOTAL_WALLTIME ~ 5
+STDOUT_5=2019 28176 172418 FEATSTAT.run2
+STDOUT_6=2019 18051 52315 SCORESTAT.run2
+STDOUT_7=2019 28176 172418 FEATSTAT.run2.2
+STDOUT_8=2019 18051 52315 SCORESTAT.run2.2
+STDOUT_9=3019 42176 264587 FEATSTAT.run3
+STDOUT_10=3019 27051 77273 SCORESTAT.run3
+STDOUT_11=3019 42176 264587 FEATSTAT.run3.2
+STDOUT_12=3019 27051 77273 SCORESTAT.run3.2
+STDOUT_13=3963 55392 355328 FEATSTAT.run4
+STDOUT_14=3963 35547 102216 SCORESTAT.run4
+STDOUT_15=3963 55392 355328 FEATSTAT.run4.2
+STDOUT_16=3963 35547 102216 SCORESTAT.run4.2
+STDOUT_17=4932 68958 455449 FEATSTAT.run5
+STDOUT_18=4932 44268 127552 SCORESTAT.run5
+STDOUT_19=4932 68958 455449 FEATSTAT.run5.2
+STDOUT_20=4932 44268 127552 SCORESTAT.run5.2
+TOTAL_WALLTIME ~ 2
(File diff suppressed because it is too large)
@@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
-wmt10-data = $working-dir/data
+wmt12-data = $working-dir/data
 
 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses
+decoder = $moses-bin-dir/moses
 
 # conversion of phrase table into binary on-disk format
-ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
+ttable-binarizer = $moses-bin-dir/processPhraseTable
 
 # conversion of rule table into binary on-disk format
-#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
 
 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@@ -95,7 +104,7 @@ max-sentence-length = 80
 
 ### raw corpus files (untokenized, but sentence aligned)
 #
-raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
+raw-stem = $wmt12-data/training/europarl-v7.$pair-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -112,10 +121,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
 #lowercased-stem =
 
 [CORPUS:nc]
-raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
+raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension
 
 [CORPUS:un] IGNORE
-raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
+raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
 
 #################################################################
 # LANGUAGE MODEL TRAINING
@@ -123,10 +132,15 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
 [LM]
 
 ### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
 #
+# srilm
 lm-training = $srilm-dir/ngram-count
 settings = "-interpolate -kndiscount -unk"
 
+# irstlm
+#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
+#settings = ""
+
 # order of the language model
 order = 5
 
 ### tool to be used for training randomized language model from scratch
@@ -138,27 +152,21 @@ order = 5
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
-
-#
-# if binarized, set type (default srilm; if binarized: irstlm)
-#
-# set to 8 when using kenlm
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 ### each language model to be used has its own section here
@@ -170,7 +178,7 @@ order = 5
 
 ### raw corpus (untokenized)
 #
-raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
+raw-corpus = $wmt12-data/training/europarl-v7.$output-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -182,13 +190,13 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
 #lm =
 
 [LM:nc]
-raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
 
 [LM:un] IGNORE
-raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension
 
 [LM:news] IGNORE
-raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
+raw-corpus = $wmt12-data/training/news.$output-extension.shuffled
 
 
 #################################################################
@@ -208,32 +216,36 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
 ### tuning set
 # you may use the same set that is used for mert tuning (reference set)
 #
-tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
+tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
 #raw-tuning =
 #tokenized-tuning =
 #factored-tuning =
 #lowercased-tuning =
 #split-tuning =
 
+### group language models for hierarchical interpolation
+# (flat interpolation is limited to 10 language models)
+#group = "first,second fourth,fifth"
+
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 #################################################################
 # TRANSLATION MODEL TRAINING
@@ -261,12 +273,18 @@ script = $moses-script-dir/training/train-model.perl
 #generation-factors = "word -> pos"
 #decoding-steps = "t0, g0"
 
+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
+
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduces
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5
 
 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@@ -355,18 +373,18 @@ score-settings = "--GoodTuring"
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"
 
 ### specify the corpus used for tuning
 # it should contain 1000s of sentences
 #
-input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
+input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
 #raw-input =
 #tokenized-input =
 #factorized-input =
 #input =
 #
-reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
+reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
 #raw-reference =
 #tokenized-reference =
 #factorized-reference =
@@ -394,14 +412,14 @@ decoder-settings = ""
 # and also point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config =
+#config-with-reused-weights =
 
 #########################################################
 ## RECASER: restore case, this part only trains the model
 
 [RECASING]
 
-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses
 
 ### training data
 # raw input needs to be still tokenized,
@@ -448,6 +466,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
 
 ### additional decoder settings
 # switches for the Moses decoder
+# common choices:
+# "-threads N" for multi-threading
+# "-mbr" for MBR decoding
+# "-drop-unknown" for dropping unknown source words
+# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
@@ -470,8 +493,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
 
 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =
@@ -502,11 +525,11 @@ report-segmentation = yes
 # further precision breakdown by factor
 #precision-by-coverage-factor = pos
 
-[EVALUATION:newstest2009]
+[EVALUATION:newstest2011]
 
 ### input data
 #
-input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
+input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
 # raw-input =
 # tokenized-input =
 # factorized-input =
@@ -514,7 +537,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
 
 ### reference data
 #
-reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
+reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
 # raw-reference =
 # tokenized-reference =
 # reference =
@@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
-wmt10-data = $working-dir/data
+wmt12-data = $working-dir/data
 
 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses
+decoder = $moses-bin-dir/moses
 
 # conversion of phrase table into binary on-disk format
-ttable-binarizer = $moses-src-dir/misc/processPhraseTable
+ttable-binarizer = $moses-bin-dir/processPhraseTable
 
 # conversion of rule table into binary on-disk format
-#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
 
 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@@ -95,7 +104,7 @@ max-sentence-length = 80
 
 ### raw corpus files (untokenized, but sentence aligned)
 #
-raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
+raw-stem = $wmt12-data/training/europarl-v7.$pair-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -112,10 +121,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
 #lowercased-stem =
 
 [CORPUS:nc]
-raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
+raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension
 
 [CORPUS:un] IGNORE
-raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
+raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
 
 #################################################################
 # LANGUAGE MODEL TRAINING
@@ -123,36 +132,41 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
 [LM]
 
 ### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
 #
+# srilm
 lm-training = $srilm-dir/ngram-count
 settings = "-interpolate -kndiscount -unk"
 
+# irstlm
+#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
+#settings = ""
+
 # order of the language model
 order = 5
 
 ### tool to be used for training randomized language model from scratch
 # (more commonly, a SRILM is trained)
 #
-#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
+#lm-binarizer = $moses-bin-dir/build_binary
 #type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 ### each language model to be used has its own section here
@@ -164,7 +178,7 @@ order = 5
 
 ### raw corpus (untokenized)
 #
-raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
+raw-corpus = $wmt12-data/training/europarl-v7.$output-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -176,19 +190,19 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
 #lm =
 
 [LM:nc]
-raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
 
 [LM:un] IGNORE
-raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension
 
 [LM:news] IGNORE
-raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
+raw-corpus = $wmt12-data/training/news.$output-extension.shuffled
 
 [LM:nc=pos]
 factors = "pos"
 order = 7
 settings = "-interpolate -unk"
-raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
@@ -207,32 +221,36 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
 ### tuning set
 # you may use the same set that is used for mert tuning (reference set)
 #
-tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
+tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
 #raw-tuning =
 #tokenized-tuning =
 #factored-tuning =
 #lowercased-tuning =
 #split-tuning =
 
+### group language models for hierarchical interpolation
+# (flat interpolation is limited to 10 language models)
+#group = "first,second fourth,fifth"
+
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
+#lm-binarizer = $moses-bin-dir/build_binary
 #type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 #################################################################
 # FACTOR DEFINITION
@@ -275,12 +293,18 @@ reordering-factors = "word -> word"
 #generation-factors =
 decoding-steps = "t0"
 
+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
+
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduces
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5
 
 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@@ -354,7 +378,7 @@ score-settings = "--GoodTuring"
 # point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config =
+#config-with-reused-weights =
 
 #####################################################
 ### TUNING: finding good weights for model components
@@ -369,18 +393,18 @@ score-settings = "--GoodTuring"
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"
 
 ### specify the corpus used for tuning
 # it should contain 1000s of sentences
 #
-input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
+input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
 #raw-input =
 #tokenized-input =
 #factorized-input =
 #input =
 #
-reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
+reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
 #raw-reference =
 #tokenized-reference =
 #factorized-reference =
@@ -415,7 +439,7 @@ decoder-settings = ""
 
 [RECASING]
 
-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses
 
 ### training data
 # raw input needs to be still tokenized,
@@ -462,6 +486,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
 
 ### additional decoder settings
 # switches for the Moses decoder
+# common choices:
+# "-threads N" for multi-threading
+# "-mbr" for MBR decoding
+# "-drop-unknown" for dropping unknown source words
+# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
@@ -484,8 +513,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
 
 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =
@@ -516,11 +545,11 @@ report-segmentation = yes
 # further precision breakdown by factor
 #precision-by-coverage-factor = pos
 
-[EVALUATION:newstest2009]
+[EVALUATION:newstest2011]
 
 ### input data
 #
-input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
+input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
 # raw-input =
 # tokenized-input =
 # factorized-input =
@@ -528,7 +557,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
 
 ### reference data
 #
-reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
+reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
 # raw-reference =
 # tokenized-reference =
 # reference =
@@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
-wmt10-data = $working-dir/data
+wmt12-data = $working-dir/data
 
 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses_chart
+decoder = $moses-bin-dir/moses_chart
 
 # conversion of phrase table into binary on-disk format
-#ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
+#ttable-binarizer = $moses-bin-dir/processPhraseTable
 
 # conversion of rule table into binary on-disk format
-ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
 
 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@@ -95,7 +104,7 @@ max-sentence-length = 80
 
 ### raw corpus files (untokenized, but sentence aligned)
 #
-raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
+raw-stem = $wmt12-data/training/europarl-v7.$pair-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -112,10 +121,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
 #lowercased-stem =
 
 [CORPUS:nc]
-raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
+raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension
 
 [CORPUS:un] IGNORE
-raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
+raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
 
 #################################################################
 # LANGUAGE MODEL TRAINING
@@ -123,36 +132,41 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
 [LM]
 
 ### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
 #
+# srilm
 lm-training = $srilm-dir/ngram-count
 settings = "-interpolate -kndiscount -unk"
 
+# irstlm
+#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
+#settings = ""
+
 # order of the language model
 order = 5
 
 ### tool to be used for training randomized language model from scratch
 # (more commonly, a SRILM is trained)
 #
-#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 ### each language model to be used has its own section here
@@ -164,7 +178,7 @@ order = 5
 
 ### raw corpus (untokenized)
 #
-raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
+raw-corpus = $wmt12-data/training/europarl-v7.$output-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -176,13 +190,13 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
 #lm =
 
 [LM:nc]
-raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
 
 [LM:un] IGNORE
-raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension
 
 [LM:news] IGNORE
-raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
+raw-corpus = $wmt12-data/training/news.$output-extension.shuffled
 
 
 #################################################################
@@ -202,32 +216,36 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
 ### tuning set
 # you may use the same set that is used for mert tuning (reference set)
 #
-tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
+tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
 #raw-tuning =
 #tokenized-tuning =
 #factored-tuning =
 #lowercased-tuning =
 #split-tuning =
 
+### group language models for hierarchical interpolation
+# (flat interpolation is limited to 10 language models)
+#group = "first,second fourth,fifth"
+
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 #################################################################
 # TRANSLATION MODEL TRAINING
@@ -255,12 +273,18 @@ script = $moses-script-dir/training/train-model.perl
 #generation-factors = "word -> pos"
 #decoding-steps = "t0, g0"
 
+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
+
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduces
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5
 
 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@@ -334,7 +358,7 @@ score-settings = "--GoodTuring"
 # point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config =
+#config-with-reused-weights =
 
 #####################################################
 ### TUNING: finding good weights for model components
@@ -349,18 +373,18 @@ score-settings = "--GoodTuring"
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"
 
 ### specify the corpus used for tuning
 # it should contain 1000s of sentences
 #
-input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
+input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
 #raw-input =
 #tokenized-input =
 #factorized-input =
 #input =
 #
-reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
+reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
 #raw-reference =
 #tokenized-reference =
 #factorized-reference =
@@ -395,7 +419,7 @@ decoder-settings = ""
 
 [RECASING]
 
-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses
 
 ### training data
 # raw input needs to be still tokenized,
@@ -442,6 +466,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
 
 ### additional decoder settings
 # switches for the Moses decoder
+# common choices:
+# "-threads N" for multi-threading
+# "-mbr" for MBR decoding
+# "-drop-unknown" for dropping unknown source words
+# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 #decoder-settings = ""
@@ -464,8 +493,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
 
 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =
@@ -496,11 +525,11 @@ report-segmentation = yes
 # further precision breakdown by factor
 #precision-by-coverage-factor = pos
 
-[EVALUATION:newstest2009]
+[EVALUATION:newstest2011]
 
 ### input data
 #
-input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
+input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
 # raw-input =
 # tokenized-input =
 # factorized-input =
@@ -508,7 +537,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
 
 ### reference data
 #
-reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
+reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
 # raw-reference =
 # tokenized-reference =
 # reference =
@@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
-wmt10-data = $working-dir/data
+wmt12-data = $working-dir/data
 
 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses_chart
+decoder = $moses-bin-dir/moses_chart
 
 # conversion of phrase table into binary on-disk format
-#ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
+#ttable-binarizer = $moses-bin-dir/processPhraseTable
 
 # conversion of rule table into binary on-disk format
-ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
 
 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@@ -99,7 +108,7 @@ max-sentence-length = 80
 
 ### raw corpus files (untokenized, but sentence aligned)
 #
-raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
+raw-stem = $wmt12-data/training/europarl-v7.$pair-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -116,10 +125,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
 #lowercased-stem =
 
 [CORPUS:nc]
-raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
+raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension
 
 [CORPUS:un] IGNORE
-raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
+raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
 
 #################################################################
 # LANGUAGE MODEL TRAINING
@@ -127,36 +136,41 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
 [LM]
 
 ### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
 #
+# srilm
 lm-training = $srilm-dir/ngram-count
 settings = "-interpolate -kndiscount -unk"
 
+# irstlm
+#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
+#settings = ""
+
 # order of the language model
 order = 5
 
 ### tool to be used for training randomized language model from scratch
 # (more commonly, a SRILM is trained)
 #
-#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 ### each language model to be used has its own section here
@@ -168,7 +182,7 @@ order = 5
 
 ### raw corpus (untokenized)
 #
-raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
+raw-corpus = $wmt12-data/training/europarl-v7.$output-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -180,13 +194,13 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
 #lm =
 
 [LM:nc]
-raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
 
 [LM:un] IGNORE
-raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension
 
 [LM:news] IGNORE
-raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
+raw-corpus = $wmt12-data/training/news.$output-extension.shuffled
 
 
 #################################################################
@@ -206,32 +220,36 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
 ### tuning set
 # you may use the same set that is used for mert tuning (reference set)
 #
-tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
+tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
 #raw-tuning =
 #tokenized-tuning =
 #factored-tuning =
 #lowercased-tuning =
 #split-tuning =
 
+### group language models for hierarchical interpolation
+# (flat interpolation is limited to 10 language models)
+#group = "first,second fourth,fifth"
+
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 #################################################################
 # TRANSLATION MODEL TRAINING
@@ -259,12 +277,18 @@ script = $moses-script-dir/training/train-model.perl
 #generation-factors = "word -> pos"
 #decoding-steps = "t0, g0"
 
+### parallelization of data preparation step
|
||||
# the two directions of the data preparation can be run in parallel
|
||||
# comment out if not needed
|
||||
#
|
||||
parallel = yes
|
||||
|
||||
### pre-computation for giza++
|
||||
# giza++ has a more efficient data structure that needs to be
|
||||
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
run-giza-in-parts = 5
#run-giza-in-parts = 5

### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
@ -338,7 +362,7 @@ score-settings = "--GoodTuring"
# point to a configuration file that contains
# pointers to all relevant model files
#
#config =
#config-with-reused-weights =

#####################################################
### TUNING: finding good weights for model components
@ -353,18 +377,18 @@ score-settings = "--GoodTuring"
### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-src-dir/mert"
tuning-settings = "-mertdir $moses-bin-dir"

### specify the corpus used for tuning
# it should contain 1000s of sentences
#
input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-reference =
#tokenized-reference =
#factorized-reference =
@ -399,7 +423,7 @@ decoder-settings = ""

[RECASING]

#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
#decoder = $moses-bin-dir/moses

### training data
# raw input needs to be still tokenized,
@ -446,6 +470,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl

### additional decoder settings
# switches for the Moses decoder
# common choices:
# "-threads N" for multi-threading
# "-mbr" for MBR decoding
# "-drop-unknown" for dropping unknown source words
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
#
#decoder-settings = ""

@ -468,8 +497,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension

### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =

@ -500,11 +529,11 @@ report-segmentation = yes
# further precision breakdown by factor
#precision-by-coverage-factor = pos

[EVALUATION:newstest2009]
[EVALUATION:newstest2011]

### input data
#
input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
# raw-input =
# tokenized-input =
# factorized-input =
@ -512,7 +541,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm

### reference data
#
reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference =
# reference =
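For orientation, the kenlm lines above (lm-binarizer plus type = 8) boil down to a single binarization call of this shape; the file names here are illustrative, and type 8 is the KenLM entry written into the generated moses.ini:

    $moses-bin-dir/build_binary lm/interpolated-lm lm/binlm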
@ -18,25 +18,34 @@ pair-extension = fr-en
# moses
moses-src-dir = /home/pkoehn/moses
#
# moses binaries
moses-bin-dir = $moses-src-dir/dist/bin
#
# moses scripts
moses-script-dir = /home/pkoehn/moses/scripts
moses-script-dir = $moses-src-dir/scripts
#
# srilm
srilm-dir = $moses-src-dir/srilm/bin/i686
#
# irstlm
irstlm-dir = $moses-src-dir/irstlm/bin
#
# randlm
randlm-dir = $moses-src-dir/randlm/bin
#
# data
toy-data = $moses-script-dir/ems/example/data

### basic tools
#
# moses decoder
decoder = $moses-src-dir/dist/bin/moses
decoder = $moses-bin-dir/moses

# conversion of phrase table into binary on-disk format
ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
ttable-binarizer = $moses-bin-dir/processPhraseTable

# conversion of rule table into binary on-disk format
#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"

# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@ -117,36 +126,41 @@ raw-stem = $toy-data/nc-5k
[LM]

### tool to be used for language model training
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
#
# srilm
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"

# irstlm
#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
#settings = ""

# order of the language model
order = 5

### tool to be used for training randomized language model from scratch
# (more commonly, a SRILM is trained)
#
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
#lm-binarizer = $irstlm-dir/compile-lm

# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#type = 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8

### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
#lm-quantizer = $irstlm-dir/quantize-lm

### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

### each language model to be used has its own section here

@ -193,25 +207,29 @@ raw-corpus = $toy-data/nc-5k.$output-extension
#lowercased-tuning =
#split-tuning =

### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
#lm-binarizer = $irstlm-dir/compile-lm

# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#type = 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8

### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
#lm-quantizer = $irstlm-dir/quantize-lm

### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

#################################################################
# TRANSLATION MODEL TRAINING
@ -239,12 +257,18 @@ script = $moses-script-dir/training/train-model.perl
#generation-factors = "word -> pos"
#decoding-steps = "t0, g0"

### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes

### pre-computation for giza++
# giza++ has a more efficient data structure that needs to be
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
run-giza-in-parts = 5
#run-giza-in-parts = 5

### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
@ -318,7 +342,7 @@ score-settings = "--GoodTuring"
# point to a configuration file that contains
# pointers to all relevant model files
#
#config =
#config-with-reused-weights =

#####################################################
### TUNING: finding good weights for model components
@ -333,7 +357,7 @@ weight-config = $toy-data/weight.ini
### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-src-dir/mert"
tuning-settings = "-mertdir $moses-bin-dir"

### specify the corpus used for tuning
# it should contain 1000s of sentences
@ -379,7 +403,7 @@ decoder-settings = ""

[RECASING]

#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
#decoder = $moses-bin-dir/moses

### training data
# raw input needs to be still tokenized,
@ -422,6 +446,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl

### additional decoder settings
# switches for the Moses decoder
# common choices:
# "-threads N" for multi-threading
# "-mbr" for MBR decoding
# "-drop-unknown" for dropping unknown source words
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"

@ -444,8 +473,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension

### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =
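The decoder-settings line above corresponds to a manual cube-pruning run of roughly this form (paths illustrative):

    $moses-bin-dir/moses -f model/moses.ini -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 < input.tokenized > output.tokenized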
@ -107,7 +107,7 @@ consolidate
default-name: truecaser/corpus
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
train
in: tokenized-stem
in: tokenized-stem
out: truecase-model
rerun-on-change: trainer
default-name: truecaser/truecase-model
@ -207,6 +207,7 @@ binarize
rerun-on-change: lm
default-name: lm/binlm
template: $lm-binarizer IN OUT
error: set kMaxOrder to at least this value

[INTERPOLATED-LM] single
tuning-from-sgm
@ -253,27 +254,26 @@ split-tuning
template: $output-splitter -model IN1.$output-extension < IN > OUT
interpolate
in: script split-tuning LM:lm
rerun-on-change: srilm-dir
rerun-on-change: srilm-dir group
out: lm
default-name: lm/interpolated-lm
randomize
in: lm
out: rlm
pass-unless: lm-randomizer
default-name: lm/rlm
default-name: lm/interpolated-rlm
quantize
in: rlm
out: qlm
pass-unless: lm-quantizer
default-name: lm/interpolated-qlm
template: $lm-quantizer IN OUT
binarize
in: qlm
out: binlm
pass-unless: lm-binarizer
rerun-on-change: lm
default-name: lm/interpolated-binlm
template: $lm-binarizer IN OUT
error: set kMaxOrder to at least this value

[TRAINING] single
consolidate
@ -370,17 +370,9 @@ build-generation-custom
ignore-unless: AND generation-factors generation-corpus
default-name: model/generation-table
create-config
in: reordering-table phrase-translation-table generation-table LM:binlm
out: config
ignore-if: use-hiero INTERPOLATED-LM:script
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings
default-name: model/moses.ini
error: Unknown option
create-config-interpolated-lm
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm LM:binlm
out: config
ignore-if: use-hiero
ignore-unless: INTERPOLATED-LM:script
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings
default-name: model/moses.ini
error: Unknown option
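As a sketch of how EMS expands these templates: for the interpolated-LM binarize step, template: $lm-binarizer IN OUT becomes a call like the following, with IN and OUT filled from the step's in/out file names (all file names illustrative):

    $moses-bin-dir/build_binary lm/interpolated-qlm lm/interpolated-binlm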
@ -934,7 +934,12 @@ sub define_step {
&define_training_create_config($i);
}
elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:interpolate') {
&define_training_interpolated_lm_interpolate($i);
&define_interpolated_lm_interpolate($i);
}
elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:binarize' ||
$DO_STEP[$i] eq 'INTERPOLATED-LM:quantize' ||
$DO_STEP[$i] eq 'INTERPOLATED-LM:randomize') {
&define_interpolated_lm_process($i);
}
elsif ($DO_STEP[$i] eq 'TUNING:factorize-input') {
&define_tuningevaluation_factorize($i);
@ -991,7 +996,10 @@ sub execute_steps {
while(1) {

# find steps to be done
for(my $i=0;$i<=$#DO_STEP;$i++) {
my $repeat_if_passed = 1;
while($repeat_if_passed) {
$repeat_if_passed = 0;
for(my $i=0;$i<=$#DO_STEP;$i++) {
next if (defined($DONE{$i}));
next if (defined($DO{$i}));
next if (defined($CRASHED{$i}));
@ -1000,10 +1008,19 @@ sub execute_steps {
foreach my $prev_step (@{$DEPENDENCY[$i]}) {
$doable = 0 if !defined($DONE{$prev_step});
}
$DO{$i} = 1 if $doable;
next unless $doable;
$DO{$i} = 1;

# immediately label pass steps as done
next unless defined($PASS{$i});
$DONE{$i} = 1;
delete($DO{$i});
$repeat_if_passed = 1;
}
}

print "number of steps doable or running: ".(scalar keys %DO)."\n";
foreach my $step (keys %DO) { print "\t".($DO{$step}==2?"running: ":"doable: ").$DO_STEP[$step]."\n"; }
return unless scalar keys %DO;

# execute new step
@ -1033,7 +1050,7 @@ sub execute_steps {
elsif ($CLUSTER || $active < $MAX_ACTIVE) {
$active++;
$DO{$i}++;
print "sh ($active)\n";
print "sh ($active active)\n";
sleep(5);
if (!fork) {
`sh $step >$step.STDOUT 2> $step.STDERR`;
@ -1275,7 +1292,8 @@ sub check_if_crashed {
foreach my $pattern (@{$ERROR{&defined_step_id($i)}},
'error','killed','core dumped','can\'t read',
'no such file or directory','unknown option',
'died at','exit code','permission denied') {
'died at','exit code','permission denied',
"Can't locate") {
if (/$pattern/i) {
my $not_error = 0;
if (defined($NOT_ERROR{&defined_step_id($i)})) {
@ -1769,11 +1787,11 @@ sub define_training_create_config {

# find out which language model files have been built
my @LM_SETS = &get_sets("LM");
my %INTERPOLATED_AWAY;
my %OUTPUT_FACTORS;
%OUTPUT_FACTORS = &get_factor_id("output") if &backoff_and_get("TRAINING:output-factors");

my $interpolated = &get("INTERPOLATED-LM:script"); # flag
if ($interpolated) {
if (&get("INTERPOLATED-LM:script")) {
my $type = 0;
# binarizing the lm?
$type = 1 if (&get("INTERPOLATED-LM:binlm") ||
@ -1783,23 +1801,32 @@ sub define_training_create_config {
&backoff_and_get("INTERPOLATED-LM:lm-randomizer"));

# manually set type
$type = &get("INTERPOLATED-LM:type") if (&get("INTERPOLATED-LM:type"));
$type = &get("INTERPOLATED-LM:type") if &get("INTERPOLATED-LM:type");

# order and factor inherited from individual LMs
my $set = shift @LM_SETS;
my $order = &check_backoff_and_get("LM:$set:order");
my $factor = 0;
if (&backoff_and_get("TRAINING:output-factors") &&
&backoff_and_get("LM:$set:factors")) {
$factor = $OUTPUT_FACTORS{&backoff_and_get("LM:$set:factors")};
}
$cmd .= "-lm $factor:$order:$LM[0]:$type ";
# go through each interpolated language model
my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
my $FACTOR = &backoff_and_get_array("TRAINING:output-factors");
foreach my $factor (keys %{$ILM_SETS}) {
foreach my $order (keys %{$$ILM_SETS{$factor}}) {
next unless scalar(@{$$ILM_SETS{$factor}{$order}}) > 1;
my $suffix = "";
$suffix = ".$$FACTOR[$factor]" if $icount > 1 && defined($FACTOR);
$suffix .= ".order$order" if $icount > 1;
$cmd .= "-lm $factor:$order:$LM[0]$suffix:$type ";
foreach my $id_set (@{$$ILM_SETS{$factor}{$order}}) {
my ($id,$set) = split(/ /,$id_set,2);
$INTERPOLATED_AWAY{$set} = 1;
}
}
else {
}
shift @LM;
}

die("ERROR: number of defined LM sets (".(scalar @LM_SETS).":".join(",",@LM_SETS).") and LM files (".(scalar @LM).":".join(",",@LM).") does not match")
unless scalar @LM == scalar @LM_SETS;
foreach my $lm (@LM) {
my $set = shift @LM_SETS;
next if defined($INTERPOLATED_AWAY{$set});
my $order = &check_backoff_and_get("LM:$set:order");
my $lm_file = "$lm";
my $type = 0; # default: SRILM
@ -1824,54 +1851,143 @@ sub define_training_create_config {
}

$cmd .= "-lm $factor:$order:$lm_file:$type ";
}
}

&create_step($step_id,$cmd);
}

sub define_training_interpolated_lm_interpolate {
sub define_interpolated_lm_interpolate {
my ($step_id) = @_;

my ($interpolated_lm,
$interpolation_script, $tuning, @LM)
= &get_output_and_input($step_id);
$interpolation_script, $tuning, @LM) = &get_output_and_input($step_id);
my $srilm_dir = &check_backoff_and_get("INTERPOLATED-LM:srilm-dir");
my $group = &get("INTERPOLATED-LM:group");

my $lm_list = "";
foreach (@LM) {
$lm_list .= $_.",";
}
chop($lm_list);
my $cmd = "";

# sanity checks on order and factors
my @LM_SETS = &get_sets("LM");
my %OUTPUT_FACTORS;
%OUTPUT_FACTORS = &get_factor_id("output")
if &backoff_and_get("TRAINING:output-factors");
my ($factor,$order);
foreach my $set (@LM_SETS) {
my $set_order = &check_backoff_and_get("LM:$set:order");
if (defined($order) && $order != $set_order) {
die("ERROR: language models have mismatching order - no interpolation possible!");
}
$order = $set_order;

if (&backoff_and_get("TRAINING:output-factors") &&
&backoff_and_get("LM:$set:factors")) {
my $set_factor = $OUTPUT_FACTORS{&backoff_and_get("LM:$set:factors")};
if (defined($factor) && $factor != $set_factor) {
die("ERROR: language models have mismatching factors - no interpolation possible!");
}
$factor = $set_factor;
}
# go through language models by factor and order
my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
foreach my $factor (keys %{$ILM_SETS}) {
foreach my $order (keys %{$$ILM_SETS{$factor}}) {
next unless scalar(@{$$ILM_SETS{$factor}{$order}}) > 1;

# get list of language model files
my $lm_list = "";
foreach my $id_set (@{$$ILM_SETS{$factor}{$order}}) {
my ($id,$set) = split(/ /,$id_set,2);
$lm_list .= $LM[$id].",";
}
chop($lm_list);

# if grouping, identify position in list
my $numbered_string = "";
if (defined($group)) {
my %POSITION;
foreach my $id_set (@{$$ILM_SETS{$factor}{$order}}) {
my ($id,$set) = split(/ /,$id_set,2);
$POSITION{$set} = scalar keys %POSITION;
}
my $group_string = $group;
$group_string =~ s/\s+/ /g;
$group_string =~ s/ *, */,/g;
$group_string =~ s/^ //;
$group_string =~ s/ $//;
$group_string .= " ";
while($group_string =~ /^([^ ,]+)([ ,]+)(.*)$/) {
die("ERROR: unknown set $1 in INTERPOLATED-LM:group definition")
if ! defined($POSITION{$1});
$numbered_string .= $POSITION{$1}.$2;
$group_string = $3;
}
chop($numbered_string);
}

my $FACTOR = &backoff_and_get_array("TRAINING:output-factors");
my $name = $interpolated_lm;
if ($icount > 1) {
$name .= ".$$FACTOR[$factor]" if defined($FACTOR);
$name .= ".order$order";
}
$cmd .= "$interpolation_script --tuning $tuning --name $name --srilm $srilm_dir --lm $lm_list";
$cmd .= " --group \"$numbered_string\"" if defined($group);
$cmd .= "\n";
}
}

my $cmd = "$interpolation_script --tuning $tuning --name $interpolated_lm --srilm $srilm_dir --lm $lm_list";

die("ERROR: Nothing to interpolate, remove interpolation step!") if $cmd eq "";
&create_step($step_id,$cmd);
}

sub define_interpolated_lm_process {
my ($step_id) = @_;

my ($processed_lm, $interpolatd_lm) = &get_output_and_input($step_id);
my ($module,$set,$stepname) = &deconstruct_name($DO_STEP[$step_id]);
my $tool = &check_backoff_and_get("INTERPOLATED-LM:lm-${stepname}r");
my $FACTOR = &backoff_and_get_array("TRAINING:output-factors");

# go through language models by factor and order
my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
my $cmd = "";
foreach my $factor (keys %{$ILM_SETS}) {
foreach my $order (keys %{$$ILM_SETS{$factor}}) {
next unless scalar(@{$$ILM_SETS{$factor}{$order}}) > 1;
my $suffix = "";
$suffix = ".$$FACTOR[$factor]" if $icount > 1 && defined($FACTOR);
$suffix .= ".order$order" if $icount > 1;
$cmd .= "$tool $interpolatd_lm$suffix $processed_lm$suffix\n";
}
}

&create_step($step_id,$cmd);
}

sub get_interpolated_lm_processed_names {
my ($processed_lm) = @_;
my @ILM_NAME;
my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
my $FACTOR = &backoff_and_get_array("TRAINING:output-factors");
foreach my $factor (keys %{$ILM_SETS}) {
foreach my $order (keys %{$$ILM_SETS{$factor}}) {
if (scalar(@{$$ILM_SETS{$factor}{$order}}) > 1) {
my $suffix = "";
$suffix = ".$$FACTOR[$factor]" if $icount > 1 && defined($FACTOR);
$suffix .= ".order$order" if $icount > 1;
push @ILM_NAME,"$processed_lm$suffix";
}
else {
push @ILM_NAME,"$processed_lm.".($FACTOR?"":".$$FACTOR[$factor]").".order$order";
}
}
}
return @ILM_NAME;
}

sub get_interpolated_lm_sets {
my %ILM_SETS;

my @LM_SETS = &get_sets("LM");
my %OUTPUT_FACTORS;
%OUTPUT_FACTORS = &get_factor_id("output") if &backoff_and_get("TRAINING:output-factors");

my $count=0;
my $icount=0;
foreach my $set (@LM_SETS) {
my $order = &check_backoff_and_get("LM:$set:order");

my $factor = 0;
if (&backoff_and_get("TRAINING:output-factors") &&
&backoff_and_get("LM:$set:factors")) {
$factor = $OUTPUT_FACTORS{&backoff_and_get("LM:$set:factors")};
}

push @{$ILM_SETS{$factor}{$order}}, ($count++)." ".$set;
$icount++ if scalar(@{$ILM_SETS{$factor}{$order}}) == 2;
}
return ($icount,\%ILM_SETS);
}

sub get_training_setting {
my ($step) = @_;
my $dir = &check_and_get("GENERAL:working-dir");
@ -1888,6 +2004,7 @@ sub get_training_setting {
my $source_syntax = &get("GENERAL:input-parser");
my $target_syntax = &get("GENERAL:output-parser");
my $score_settings = &get("TRAINING:score-settings");
my $parallel = &get("TRAINING:parallel");

my $xml = $source_syntax || $target_syntax;

@ -1909,6 +2026,7 @@ sub get_training_setting {
$cmd .= "-source-syntax " if $source_syntax;
$cmd .= "-glue-grammar " if $hierarchical;
$cmd .= "-score-options '".$score_settings."' " if $score_settings;
$cmd .= "-parallel " if $parallel;

# factored training
if (&backoff_and_get("TRAINING:input-factors")) {
@ -2261,12 +2379,13 @@ sub define_reporting_report {
### subs for step definition

sub get_output_and_input {
my ($step_id) = @_;
my ($step_id) = @_;

my $step = $DO_STEP[$step_id];
my $output = &get_default_file(&deconstruct_name($step));
my $step = $DO_STEP[$step_id];
my $output = &get_default_file(&deconstruct_name($step));

my @INPUT;
my @INPUT;
if (defined($USES_INPUT{$step_id})) {
for(my $i=0; $i<scalar @{$USES_INPUT{$step_id}}; $i++) {
# get name of input file needed
my $in_file = $USES_INPUT{$step_id}[$i];
@ -2298,7 +2417,8 @@ sub get_output_and_input {
push @INPUT,&get_specified_or_default_file(&deconstruct_name($in_file),
&deconstruct_name($prev_step));
}
return ($output,@INPUT);
}
return ($output,@INPUT);
}

sub define_template {
@ -2397,6 +2517,9 @@ sub define_template {
}
# input is defined as IN or IN0, IN1, IN2
else {
if ($cmd =~ /([^ANS])IN/ && scalar(@INPUT) == 0) {
die("ERROR: Step $step requires input from prior steps, but none defined.");
}
$cmd =~ s/([^ANS])IN(\d+)/$1$INPUT[$2]/g; # a bit trickier to
$cmd =~ s/([^ANS])IN/$1$INPUT[0]/g; # avoid matching TRAINING, RECASING
$cmd =~ s/^IN(\d+)/$INPUT[$2]/g;
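The per-bucket command string assembled by define_interpolated_lm_interpolate above comes out as one interpolate-lm.perl call per factor/order combination, roughly like this (all file names illustrative):

    interpolate-lm.perl --tuning tuning/input.lowercased --name lm/interpolated-lm --srilm /path/to/srilm/bin --lm lm/europarl.lm,lm/nc.lm,lm/news.lm --group "0,1 2"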
@ -12,13 +12,14 @@ binmode(STDERR, ":utf8");

my $SRILM = "/home/pkoehn/moses/srilm/bin/i686-m64";
my $TEMPDIR = "/tmp";
my ($TUNING,$LM,$NAME);
my ($TUNING,$LM,$NAME,$GROUP);

die("interpolate-lm.perl --tuning set --name out-lm --lm lm1,lm2,lm3 [--srilm srtilm-dir --tempdir tempdir]")
die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--srilm srilm-dir --tempdir tempdir --group \"0,1 2,3\"]")
unless &GetOptions('tuning=s' => \$TUNING,
'name=s' => \$NAME,
'srilm=s' => \$SRILM,
'tempdir=s' => \$TEMPDIR,
'group=s' => \$GROUP,
'lm=s' => \$LM);

# check and set default to unset parameters
@ -52,49 +53,109 @@ foreach my $lm (@LM) {
}
print STDERR "language models have order $order.\n";

my $tmp = tempdir(DIR=>$TEMPDIR);
# too many language models? group them first
if (!defined($GROUP) && scalar(@LM) > 10) {
print STDERR "more than 10, automatically grouping language models.\n";
my $num_groups = int(scalar(@LM)/10 + 0.99);
my $size_groups = int(scalar(@LM)/$num_groups + 0.99);

# compute perplexity
my $i = 0;
foreach my $lm (@LM) {
print STDERR "compute perplexity for $lm\n";
safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
$i++;
$GROUP = "";
for(my $i=0;$i<$num_groups;$i++) {
$GROUP .= " " unless $i==0;
for(my $j=0;$j<$size_groups;$j++) {
my $lm_i = $i*$size_groups+$j;
next if $lm_i >= scalar(@LM);
$GROUP .= "," unless $j==0;
$GROUP .= $lm_i;
}
}
print STDERR "groups: $GROUP\n";
}

# compute lambdas
print STDERR "computing lambdas...\n";
my $cmd = "$SRILM/compute-best-mix";
for(my $i=0;$i<scalar(@LM);$i++) {
$cmd .= " $tmp/iplm.$$.$i";
# normal interpolation
if (!defined($GROUP)) {
&interpolate($NAME,@LM);
exit;
}
my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
die "Failed to mix models: $mixerr" if $mixexitcode != 0;
my $mix = $mixout;
`rm $tmp/iplm.$$.*`;
$mix =~ /best lambda \(([\d\. ]+)\)/ || die("ERROR: computing lambdas failed: $mix");
my @LAMBDA = split(/ /,$1);

# create new language models
print STDERR "creating new language model...\n";
$i = 0;
$cmd = "$SRILM/ngram -unk -order $order -write-lm $NAME";
foreach my $lm (@LM) {
$cmd .= " -lm " if $i==0;
$cmd .= " -mix-lm " if $i==1;
$cmd .= " -mix-lm$i " if $i>1;
$cmd .= $lm;
$cmd .= " -lambda " if $i==0;
$cmd .= " -mix-lambda$i " if $i>1;
$cmd .= $LAMBDA[$i] if $i!=1;
$i++;
# group language models into sub-interpolated models
my %ALREADY;
my $g = 0;
my @SUB_NAME;
foreach my $subgroup (split(/ /,$GROUP)) {
my @SUB_LM;
foreach my $lm_i (split(/,/,$subgroup)) {
die("ERROR: LM id $lm_i in group definition out of range") if $lm_i >= scalar(@LM);
push @SUB_LM,$LM[$lm_i];
$ALREADY{$lm_i} = 1;
}
#if (scalar @SUB_NAME == 0 && scalar keys %ALREADY == scalar @LM) {
# print STDERR "WARNING: grouped all language models into one, perform normal interpolation\n";
# &interpolate($NAME,@LM);
# exit;
#}
my $name = $NAME.".group-".chr(97+($g++));
push @SUB_NAME,$name;
print STDERR "\n=== BUILDING SUB LM $name from\n\t".join("\n\t",@SUB_LM)."\n===\n\n";
&interpolate($name, @SUB_LM);
}
safesystem($cmd) or die "Failed.";
for(my $lm_i=0; $lm_i < scalar(@LM); $lm_i++) {
next if defined($ALREADY{$lm_i});
push @SUB_NAME, $LM[$lm_i];
}
print STDERR "\n=== BUILDING FINAL LM ===\n\n";
&interpolate($NAME, @SUB_NAME);

rmtree($tmp); # remove the temp dir
print STDERR "done.\n";
# main interpolation function
sub interpolate {
my ($name,@LM) = @_;

die("cannot interpolate more than 10 language models at once.")
if scalar(@LM) > 10;

my $tmp = tempdir(DIR=>$TEMPDIR);

# compute perplexity
my $i = 0;
foreach my $lm (@LM) {
print STDERR "compute perplexity for $lm\n";
safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
$i++;
}

# compute lambdas
print STDERR "computing lambdas...\n";
my $cmd = "$SRILM/compute-best-mix";
for(my $i=0;$i<scalar(@LM);$i++) {
$cmd .= " $tmp/iplm.$$.$i";
}
my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
die "Failed to mix models: $mixerr" if $mixexitcode != 0;
my $mix = $mixout;
`rm $tmp/iplm.$$.*`;
$mix =~ /best lambda \(([\d\. ]+)\)/ || die("ERROR: computing lambdas failed: $mix");
my @LAMBDA = split(/ /,$1);

# create new language model
print STDERR "creating new language model...\n";
$i = 0;
$cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
foreach my $lm (@LM) {
$cmd .= " -lm " if $i==0;
$cmd .= " -mix-lm " if $i==1;
$cmd .= " -mix-lm$i " if $i>1;
$cmd .= $lm;
$cmd .= " -lambda " if $i==0;
$cmd .= " -mix-lambda$i " if $i>1;
$cmd .= $LAMBDA[$i] if $i!=1;
$i++;
}
safesystem($cmd) or die "Failed.";

rmtree($tmp); # remove the temp dir
print STDERR "done.\n";
}

sub safesystem {
print STDERR "Executing: @_\n";
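Two usage notes on the grouping logic above. First, the automatic grouping is pure arithmetic: with 23 language models, $num_groups = int(23/10 + 0.99) = 3 and $size_groups = int(23/3 + 0.99) = 8, so the script reports:

    groups: 0,1,2,3,4,5,6,7 8,9,10,11,12,13,14,15 16,17,18,19,20,21,22

Second, an explicit grouping can be supplied on the command line, following the updated usage string (file names illustrative):

    interpolate-lm.perl --tuning tuning-set --name out-lm --lm lm0,lm1,lm2,lm3 --srilm /path/to/srilm/bin --group "0,1 2,3"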
@ -16,7 +16,7 @@ while(<WEIGHT>) {
if (/^\[weight\-(\S+)\]/) {
$current_weight = $1;
}
elsif ($current_weight && /^([\-\d\.]+)([Ee][+-]?[\d]+)?$/) {
elsif ($current_weight && /^(([\-\d\.]+)([Ee][+-]?[\d]+)?)$/) {
push @{$WEIGHT{$current_weight}},$1;
}
elsif (/^\[/) {
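The widened capture group matters for weights in scientific notation: with the old pattern, $1 held only the mantissa and the exponent was dropped. A quick check with a hypothetical weight value:

    echo "0.5E-2" | perl -ne 'print "$1\n" if /^(([\-\d\.]+)([Ee][+-]?[\d]+)?)$/'
    # prints 0.5E-2; the old pattern would have captured just 0.5 into $1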
@ -553,6 +553,7 @@ sub bleu_score {
my $score = 0;
my $iscore = 0;
my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]);
print "length ratio: ".($tst_ngrams->[1]/$shortest_ref_length)." ($tst_ngrams->[1]/$shortest_ref_length), penalty (log): $len_score\n";

for (my $j=1; $j<=$max_Ngram; $j++) {
if ($matching_ngrams->[$j] == 0) {
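A worked example of the new diagnostic line (numbers illustrative): for a 90-token test output against a 100-token shortest reference, $len_score = min(0, 1 - 100/90), about -0.1111, so the added print emits roughly:

    length ratio: 0.9 (90/100), penalty (log): -0.111111111111111

which corresponds to a brevity penalty of exp(-0.1111), about 0.89.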
1168	scripts/generic/mteval-v13a.pl (new executable file; diff suppressed because it is too large)
@ -3,9 +3,15 @@
# $Id$
use strict;

my $lowercase = 0;
if ($ARGV[0] eq "-lc") {
$lowercase = 1;
shift;
}

my $stem = $ARGV[0];
if (!defined $stem) {
print STDERR "usage: multi-bleu.pl reference < hypothesis\n";
print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
print STDERR "Reads the references from reference or reference0, reference1, ...\n";
exit(1);
}
@ -35,12 +41,14 @@ my(@CORRECT,@TOTAL,$length_translation,$length_reference);
my $s=0;
while(<STDIN>) {
chop;
$_ = lc if $lowercase;
my @WORD = split;
my %REF_NGRAM = ();
my $length_translation_this_sentence = scalar(@WORD);
my ($closest_diff,$closest_length) = (9999,9999);
foreach my $reference (@{$REF[$s]}) {
# print "$s $_ <=> $reference\n";
$reference = lc($reference) if $lowercase;
my @WORD = split(/ /,$reference);
my $length = scalar(@WORD);
my $diff = abs($length_translation_this_sentence-$length);
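Usage with the new switch, following the updated usage string (file names illustrative):

    multi-bleu.perl -lc reference < hypothesis

With -lc, both the hypothesis and every reference are lowercased before n-gram matching, giving a case-insensitive BLEU score.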
78	scripts/generic/trainlm-irst.perl (new executable file)
@ -0,0 +1,78 @@
#!/usr/bin/perl -w

# Compatible with sri LM-creating script, eg.
# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
# To use it in the EMS, add this to the [LM] section
# lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irst-dir"
# settings = ""
# Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section).
# It should point to the root of the LM toolkit, eg
# irst-dir = /Users/hieu/workspace/irstlm/trunk
# And make sure that $cores is defined, eg $cores = 8

use strict;
use FindBin qw($Bin);
use Getopt::Long;

my $order;
my $corpusPath;
my $lmPath;
my $cores = 2;
my $irstPath;
my $tempPath = "tmp";

GetOptions("order=s" => \$order,
"text=s" => \$corpusPath,
"lm=s" => \$lmPath,
"cores=s" => \$cores,
"irst-dir=s" => \$irstPath,
"temp-dir=s" => \$tempPath
) or exit 1;

die("ERROR: please set order") unless defined($order);
die("ERROR: please set text") unless defined($corpusPath);
die("ERROR: please set lm") unless defined($lmPath);
die("ERROR: please set irst-dir") unless defined($irstPath);

my $ext = ($corpusPath =~ m/([^.]+)$/)[0];
print "extension is $ext\n";

$tempPath .= "/irstlm-build-tmp.$$";
`mkdir -p $tempPath`;

my $cmd;
if ($ext eq "gz")
{
$cmd = "zcat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
}
else
{
$cmd = "cat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
}
print STDERR "EXECUTING $cmd\n";
`$cmd`;

$cmd = "IRSTLM=$irstPath/.. $irstPath/build-lm.sh -t $tempPath/stat4 -i \"gunzip -c $tempPath/monolingual.setagged.gz\" -n $order -p -o $tempPath/iarpa.gz -k $cores";
print STDERR "EXECUTING $cmd\n";
`$cmd`;

$ext = ($lmPath =~ m/([^.]+)$/)[0];
print "extension is $ext\n";

if ($ext eq "gz")
{
$cmd = "$irstPath/compile-lm $tempPath/iarpa.gz --text yes /dev/stdout | gzip -c > $lmPath";
}
else
{
$cmd = "$irstPath/compile-lm $tempPath/iarpa.gz --text yes $lmPath";
}

print STDERR "EXECUTING $cmd\n";
`$cmd`;

$cmd = "rm -rf $tempPath";
print STDERR "EXECUTING $cmd\n";
`$cmd`;

print STDERR "FINISH.\n";
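A typical standalone invocation of the new script, mirroring the options it declares (paths illustrative):

    trainlm-irst.perl -order 5 -text corpus.txt -lm lm.gz -cores 8 -irst-dir /path/to/irstlm -temp-dir /tmp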
@ -10,6 +10,7 @@

# Excerpts from revision history

# Dec 2011 update the script for the mert-moses.pl compatibility
# Sept 2011 multi-threaded mert (Barry Haddow)
# 3 Aug 2011 Added random directions, historic best, pairwise ranked (PK)
# Jul 2011 simplifications (Ondrej Bojar)
@ -47,9 +48,13 @@
# 13 Oct 2004 Use alternative decoders (DWC)
# Original version by Philipp Koehn

use strict;
use FindBin qw($Bin);
use File::Basename;
use File::Path;
use File::Spec;
use Cwd;

my $SCRIPTS_ROOTDIR = $Bin;
$SCRIPTS_ROOTDIR =~ s/\/training$//;
$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});
@ -82,12 +87,16 @@ my $minimum_required_change_in_weights = 0.00001;

my $verbose = 0;
my $usage = 0; # request for --help
my $___WORKING_DIR = "mert-work";

# If no working directory is specified,
# the default is set to `pwd`/mert-work
my $___WORKING_DIR = File::Spec->catfile(Cwd::getcwd(), "mert-work");
my $___DEV_F = undef; # required, input text to decode
my $___DEV_E = undef; # required, basename of files with references
my $___DECODER = undef; # required, pathname to the decoder executable
my $___CONFIG = undef; # required, pathname to startup ini file
my $___N_BEST_LIST_SIZE = 100;
my $___LATTICE_SAMPLES = 0;
my $queue_flags = "-hard"; # extra parameters for parallelizer
# the -l ws0ssmt was relevant only to JHU 2006 workshop
my $___JOBS = undef; # if parallel, number of jobs to use (undef or 0 -> serial)
@ -133,7 +142,6 @@ my $filtercmd = undef; # path to filter-model-given-input.pl
my $filterfile = undef;
my $qsubwrapper = undef;
my $moses_parallel_cmd = undef;
my $scorer_config = "BLEU:1";
my $old_sge = 0; # assume sge<6.0
my $___CONFIG_ORIG = undef; # pathname to startup ini file before filtering
my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on
@ -144,10 +152,10 @@ my $prev_aggregate_nbl_size = -1; # number of previous step to consider when loa
# -1 means all previous, i.e. from iteration 1
# 0 means no previous data, i.e. from actual iteration
# 1 means 1 previous data , i.e. from the actual iteration and from the previous one
# and so on
# and so on
my $maximum_iterations = 25;
my $scorer_config = undef ;

use strict;
use Getopt::Long;
GetOptions(
"working-dir=s" => \$___WORKING_DIR,
@ -157,6 +165,7 @@ GetOptions(
"decoder=s" => \$___DECODER,
"config=s" => \$___CONFIG,
"nbest=i" => \$___N_BEST_LIST_SIZE,
"lattice-samples=i" => \$___LATTICE_SAMPLES,
"queue-flags=s" => \$queue_flags,
"jobs=i" => \$___JOBS,
"decoder-flags=s" => \$___DECODER_FLAGS,
@ -191,8 +200,8 @@ GetOptions(
"pairwise-ranked" => \$___PAIRWISE_RANKED_OPTIMIZER,
"pro-starting-point" => \$___PRO_STARTING_POINT,
"historic-interpolation=f" => \$___HISTORIC_INTERPOLATION,
"threads=i" => \$__THREADS,
"sc-config=s" => \$scorer_config
"sc-config=s" => \$scorer_config,
"threads=i" => \$__THREADS
) or exit(1);

# the 4 required parameters can be supplied on the command line directly
@ -210,6 +219,7 @@ if ($usage || !defined $___DEV_F || !defined $___DEV_E || !defined $___DECODER |
Options:
--working-dir=mert-dir ... where all the files are created
--nbest=100 ... how big nbestlist to generate
--lattice-samples ... how many lattice samples (Chatterjee & Cancedda, emnlp 2010)
--jobs=N ... set this to anything to run moses in parallel
--mosesparallelcmd=STR ... use a different script instead of moses-parallel
--queue-flags=STRING ... anything you wish to pass to qsub, eg.
@ -276,7 +286,7 @@ Options:
--threads=NUMBER ... Use multi-threaded mert (must be compiled in).
--historic-interpolation ... Interpolate optimized weights with prior iterations' weight
(parameter sets factor [0;1] given to current weights)
--sc-config=STRING ... extra option to specify multiscoring.
--sc-config=\"METRIC1:WEIGHT1,METRIC2:WEIGHT2\" ... extra option to specify tuning with multiple metrics.
";
exit 1;
}
@ -284,7 +294,6 @@ Options:

# Check validity of input parameters and set defaults if needed

print STDERR "Using WORKING_DIR: $___WORKING_DIR\n";
print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n";

# path of script for filtering phrase tables and running the decoder
@ -308,9 +317,11 @@ if (!defined $mertdir) {

my $mert_extract_cmd = "$mertdir/extractor";
my $mert_mert_cmd = "$mertdir/mert";
my $mert_pro_cmd = "$mertdir/pro";

die "Not executable: $mert_extract_cmd" if ! -x $mert_extract_cmd;
die "Not executable: $mert_mert_cmd" if ! -x $mert_mert_cmd;
die "Not executable: $mert_pro_cmd" if ! -x $mert_pro_cmd;

my $pro_optimizer = "$mertdir/megam_i686.opt"; # or set to your installation
if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) {
@ -610,6 +621,8 @@ my $oldallsorted = undef;
my $allsorted = undef;

my $nbest_file=undef;
my $lsamp_file=undef; #Lattice samples
my $orig_nbest_file=undef; # replaced if lattice sampling

while(1) {
$run++;
@ -629,8 +642,20 @@ while(1) {
# skip running the decoder if the user wanted
if (!$skip_decoder) {
print "($run) run decoder to produce n-best lists\n";
$nbest_file = run_decoder($featlist, $run, $need_to_normalize);
($nbest_file,$lsamp_file) = run_decoder($featlist, $run, $need_to_normalize);
$need_to_normalize = 0;
if ($___LATTICE_SAMPLES) {
my $combined_file = "$nbest_file.comb";
safesystem("sort -k1,1n $nbest_file $lsamp_file > $combined_file") or
die("failed to merge nbest and lattice samples");
safesystem("gzip -f $nbest_file; gzip -f $lsamp_file") or
die "Failed to gzip nbests and lattice samples";
$orig_nbest_file = "$nbest_file.gz";
$orig_nbest_file = "$nbest_file.gz";
$lsamp_file = "$lsamp_file.gz";
$lsamp_file = "$lsamp_file.gz";
$nbest_file = "$combined_file";
}
safesystem("gzip -f $nbest_file") or die "Failed to gzip run*out";
$nbest_file = $nbest_file.".gz";
}
@ -648,9 +673,12 @@ while(1) {
my $base_score_file = "scores.dat";
my $feature_file = "run$run.${base_feature_file}";
my $score_file = "run$run.${base_score_file}";

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
my $cmd = "";

if (defined($scorer_config))
{
#process the multiple metric way
print STDERR "-- process the multiple metric way --\n";
my $scorer_name;
my $scorer_weight;
$scorer_config=~s/ //g;
@ -659,108 +687,153 @@ while(1) {
my $scorer_config_spec;
foreach $scorer_config_spec(@lists_scorer_config)
{
# print STDERR $scorer_config_spec."\n";
my @lists_scorer_config_spec=split(":",$scorer_config_spec);
$scorer_name=$lists_scorer_config_spec[0];
$scorer_weight=$lists_scorer_config_spec[1];
# print STDERR $scorer_name."\n";
# print STDERR $scorer_weight."\n";
$cmd = "$mert_extract_cmd $mert_extract_args --scfile $score_file.$scorer_name --ffile $feature_file.$scorer_name --sctype $scorer_name -r ".join(",", @references)." -n $nbest_file";
# print STDERR "LANCEMENT $scorer_name ********************************************\n";
|
||||
&submit_or_exec($cmd,"extract.out.$scorer_name","extract.err.$scorer_name");
|
||||
# print STDERR "FIN $scorer_name ************************************************** \n";
|
||||
# print STDERR "executing $cmd\n";
|
||||
|
||||
# print STDERR "\n";
|
||||
# safesystem("date");
|
||||
# print STDERR "\n";
|
||||
|
||||
# if (defined $___JOBS) {
|
||||
# safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=extract.out.$scorer_name -stderr=extract.err.$scorer_name" )
|
||||
# or die "$scorer_name Failed to submit extraction to queue (via $qsubwrapper)";
|
||||
# } else {
|
||||
# safesystem("$cmd > extract.out.$scorer_name 2> extract.err.$scorer_name") or die "$scorer_name Failed to do extraction of statistics.";
|
||||
# }
|
||||
|
||||
# print FILE "$scorer_name $scorer_weight $score_file.$scorer_name $feature_file.$scorer_name\n";
|
||||
}
|
||||
# print STDERR "CREATION INI\n";
|
||||
my @scorer_content;
my $fileIncrement=0;
open(FILE,">merge.init") || die ("File creation ERROR : merge.init");
my $minFileName="";
my $minFileSize;
my %scoreFileContent;
my %featureFileContent;
my $firstContent;
foreach $scorer_config_spec(@lists_scorer_config)
{
my @lists_scorer_config_spec=split(":",$scorer_config_spec);
$scorer_name=$lists_scorer_config_spec[0];
$scorer_weight=$lists_scorer_config_spec[1];
print FILE "$scorer_name $scorer_weight $score_file.$scorer_name $feature_file.$scorer_name\n";
my @tmp_content=`/bin/cat $score_file.$scorer_name`;
$scorer_content[$fileIncrement] = [ @tmp_content ];
my @tmp_scoreContent=`/bin/cat $score_file.$scorer_name`;
my @tmp_featContent=`/bin/cat $feature_file.$scorer_name`;
my $localIncrementFileContent=0;
my $fileContentInfo=0;
my $localIncrementInfo=0;
for ($localIncrementFileContent=0; $localIncrementFileContent<scalar(@tmp_scoreContent); $localIncrementFileContent++)
{
if (rindex($tmp_scoreContent[$localIncrementFileContent],"BEGIN")>-1)
{
my @split_local=split(" ",$tmp_scoreContent[$localIncrementFileContent]);
$fileContentInfo=$split_local[1];

$localIncrementInfo=0;
}
chomp($tmp_scoreContent[$localIncrementFileContent]);
chomp($tmp_featContent[$localIncrementFileContent]);
$scoreFileContent{$fileIncrement}{$fileContentInfo}{$localIncrementInfo}=$tmp_scoreContent[$localIncrementFileContent];
$featureFileContent{$fileIncrement}{$fileContentInfo}{$localIncrementInfo}=$tmp_featContent[$localIncrementFileContent];
$localIncrementInfo++;
}
if ($fileIncrement==0)
{
`/bin/cp $feature_file.$scorer_name $feature_file`;
$minFileSize=$localIncrementFileContent;
$minFileName=$scorer_name;
}
else
{
if ($minFileSize>$localIncrementFileContent)
{
$minFileSize=$localIncrementFileContent;
$minFileName=$scorer_name;
}
}
$fileIncrement++;
}
close(FILE);
# print STDERR "\n";
# safesystem("date");
# print STDERR "\n";
# print STDERR "ON VA RASSEMBLER dans $score_file\n";
|
||||
open(SCOREFILE,">$score_file") || die ("File creation ERROR : $score_file");
open(FEATUREFILE,">$feature_file") || die ("File creation ERROR : $feature_file");
my $newFileIncrement=0;
my $contentIncrement=0;
my $contentSize=scalar(@{$scorer_content[0]});
# print STDERR "TAILLE : ".$contentSize."|".$fileIncrement."\n";
|
||||
while ($contentIncrement< $contentSize)
|
||||
my @nbestSize;
|
||||
my $contentSize;
|
||||
my $lineScore="";
|
||||
my $lineFeature="";
|
||||
my $minSize;
|
||||
my $localContentIncrement=0;
|
||||
my @localContentSizeSize;
|
||||
my $scoreFileName;
|
||||
my $notFinished=1;
|
||||
my $scoreName=$minFileName;
|
||||
my $minInfoSize=-1;
|
||||
$fileIncrement=0;
|
||||
while (defined($scoreFileContent{$fileIncrement}{$contentIncrement}))
|
||||
{
|
||||
my $line="";
|
||||
$newFileIncrement=0;
|
||||
while($newFileIncrement< $fileIncrement)
|
||||
if ($localContentIncrement==0)
|
||||
{
|
||||
if (rindex($scorer_content[$newFileIncrement][$contentIncrement],"BEGIN")<0)
|
||||
{
|
||||
$line=$line." ".$scorer_content[$newFileIncrement][$contentIncrement];
|
||||
chomp($line);
|
||||
}
|
||||
else
|
||||
{
|
||||
my @split_line_input=split(" ",$scorer_content[$newFileIncrement][$contentIncrement]);
|
||||
my @split_line=split(" ",$line);
|
||||
foreach $fileIncrement(sort keys %scoreFileContent)
|
||||
{
|
||||
# process the score file
|
||||
my @tmp_split=split(" ",$scoreFileContent{$fileIncrement}{$contentIncrement}{$localContentIncrement});
|
||||
if ($minInfoSize==-1)
|
||||
{
|
||||
$minInfoSize=$tmp_split[2];
|
||||
}
|
||||
elsif ($minInfoSize>$tmp_split[2])
|
||||
{
|
||||
$minInfoSize=$tmp_split[2];
|
||||
}
|
||||
my @split_line=split(" ",$lineScore);
|
||||
if (scalar(@split_line)>0)
|
||||
{
|
||||
$split_line_input[3]=$split_line[3]+$split_line_input[3];
|
||||
$tmp_split[3]=$split_line[3]+$tmp_split[3];
|
||||
}
|
||||
$line=$split_line_input[0]." ".$split_line_input[1]." ".$split_line_input[2]." ".$split_line_input[3]." MERGE";
|
||||
$lineScore=$tmp_split[0]." ".$contentIncrement." ".$minInfoSize." ".$tmp_split[3]." MERGE";
|
||||
# process the feature file
|
||||
@tmp_split=split(" ",$featureFileContent{$fileIncrement}{$contentIncrement}{$localContentIncrement});
|
||||
$lineFeature=$tmp_split[0]." ".$contentIncrement." ".$minInfoSize." ".$tmp_split[3]." MERGE";
|
||||
|
||||
}
|
||||
$newFileIncrement++;
|
||||
$localContentIncrement++;
|
||||
}
|
||||
$line=~s/^[ ]+//g;
|
||||
$line=~s/[ ]+$//g;
|
||||
$line=~s/[ ]+/ /g;
# print STDERR $line."\n";
print SCOREFILE $line."\n";
$contentIncrement++;
else
{
LOOP_CONTENT: foreach $scoreName(sort keys %scoreFileContent)
{
if ((rindex($scoreFileContent{$fileIncrement}{$contentIncrement}{$localContentIncrement},"END")>-1) || ($minInfoSize < $localContentIncrement))
{
$lineScore="SCORES_TXT_END_0";
$lineFeature="FEATURES_TXT_END_0";
$localContentIncrement=0;
$contentIncrement++;
$minInfoSize=-1;
last LOOP_CONTENT;
}
else
{
$lineScore=$lineScore." ".$scoreFileContent{$fileIncrement}{$contentIncrement}{$localContentIncrement};
$lineFeature=$featureFileContent{$fileIncrement}{$contentIncrement}{$localContentIncrement};
}
}
if ($localContentIncrement!=0)
{
$localContentIncrement++;
}
}
$lineScore=~s/^[ ]+//g;
$lineScore=~s/[ ]+$//g;
$lineScore=~s/[ ]+/ /g;
$lineFeature=~s/^[ ]+//g;
$lineFeature=~s/[ ]+$//g;
$lineFeature=~s/[ ]+/ /g;
print SCOREFILE $lineScore."\n";
print FEATUREFILE $lineFeature."\n";
$lineScore="";
$lineFeature="";
}
close(SCOREFILE);
close(FEATUREFILE);
}
else
{
# continue with the classical way
$cmd = "$mert_extract_cmd $mert_extract_args --scfile $score_file --ffile $feature_file -r ".join(",", @references)." -n $nbest_file";
$cmd = create_extractor_script($cmd, $___WORKING_DIR);
&submit_or_exec($cmd,"extract.out","extract.err");
}
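
In this fallback branch the extract command is first wrapped by create_extractor_script (defined near the end of this diff) before submission, so what actually runs is a one-shot shell script along these lines (paths and file names illustrative):

    #!/bin/bash
    cd /path/to/mert-work
    extractor --scfile scores.dat --ffile features.dat -r ref.0,ref.1 -n run1.best100.out
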
close(SCOREFILE);
# `/bin/cp `

# $cmd="$mertdir/mergeWeights -c merge.init -s $score_file -f $feature_file";
# print STDERR "executing : $cmd\n";

# if (defined $___JOBS) {
# safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=mergeWeight.out.MERGE -stderr=mergeWeight.err.MERGE" )
# or die "MERGE Failed to submit extraction to queue (via $qsubwrapper)";
# } else {
# safesystem("$cmd > mergeWeight.out.MERGE 2> mergeWeight.err.MERGE") or die "MERGE Failed to do extraction of statistics.";
# }

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# my $cmd = "$mert_extract_cmd $mert_extract_args --scfile $score_file --ffile $feature_file -r ".join(",", @references)." -n $nbest_file";
# &submit_or_exec($cmd,"extract.out","extract.err");

# Create the initial weights file for mert: init.opt

my @MIN = @{$featlist->{"mins"}};
@ -785,10 +858,12 @@ while(1) {
$cmd = "$mert_mert_cmd -d $DIM $mert_mert_args";

my $mert_settings = " -n $___RANDOM_RESTARTS";
my $seed_settings = "";
if ($___PREDICTABLE_SEEDS) {
my $seed = $run * 1000;
$mert_settings .= " -r $seed";
$seed_settings .= " -r $seed";
}
$mert_settings .= $seed_settings;
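
With --predictable-seeds the optimizer seed is a deterministic function of the iteration ($run * 1000), so rerunning an iteration reproduces the same random restarts, e.g.:

    run 1:  mert ... -r 1000
    run 2:  mert ... -r 2000
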
if ($___RANDOM_DIRECTIONS) {
if ($___NUM_RANDOM_DIRECTIONS == 0) {
$mert_settings .= " -m 50";
@ -802,19 +877,25 @@ while(1) {
$mert_settings .= " --threads $__THREADS";
}

my $file_settings = "";
my $ffiles = "";
my $scfiles = "";
if (defined $prev_feature_file) {
$file_settings .= " --ffile $prev_feature_file,$feature_file";
$ffiles = "$prev_feature_file,$feature_file";
}
else{
$file_settings .= " --ffile $feature_file";
$ffiles = "$feature_file";
}
if (defined $prev_score_file) {
$file_settings .= " --scfile $prev_score_file,$score_file";
$scfiles = "$prev_score_file,$score_file";
}
else{
$file_settings .= " --scfile $score_file";
$scfiles = "$score_file";
}

my $file_settings = " --ffile $ffiles --scfile $scfiles";
my $pro_file_settings = "--ffile " . join( " --ffile ", split(/,/, $ffiles)) .
                        " --scfile " . join( " --scfile ", split(/,/, $scfiles));

if ($___START_WITH_HISTORIC_BESTS && defined $prev_init_file) {
$file_settings .= " --ifile $prev_init_file,run$run.$weights_in_file";
}
@ -826,13 +907,13 @@ while(1) {

# pro optimization
if ($___PAIRWISE_RANKED_OPTIMIZER) {
$cmd .= " --pro run$run.pro.data ; echo 'not used' > $weights_out_file; $pro_optimizer -fvals -maxi 30 -nobias binary run$run.pro.data";
$cmd = "$mert_pro_cmd $seed_settings $pro_file_settings -o run$run.pro.data ; echo 'not used' > $weights_out_file; $pro_optimizer -fvals -maxi 30 -nobias binary run$run.pro.data";
&submit_or_exec($cmd,$mert_outfile,$mert_logfile);
}
# first pro, then mert
elsif ($___PRO_STARTING_POINT) {
# run pro...
my $pro_cmd = $cmd." --pro run$run.pro.data ; $pro_optimizer -fvals -maxi 30 -nobias binary run$run.pro.data";
my $pro_cmd = "$mert_pro_cmd $seed_settings $pro_file_settings -o run$run.pro.data ; $pro_optimizer -fvals -maxi 30 -nobias binary run$run.pro.data";
&submit_or_exec($pro_cmd,"run$run.pro.out","run$run.pro.err");
# ... get results ...
my %dummy;
@ -858,9 +939,8 @@ while(1) {
chomp $extractFiles;
safesystem ("\\cp -f $extractFiles run$run.$extractFiles") or die;
}

# safesystem ("\\cp -f extract.err run$run.extract.err") or die;
# safesystem ("\\cp -f extract.out run$run.extract.out") or die;
safesystem ("\\cp -f $mert_outfile run$run.$mert_outfile") or die;
safesystem ("\\cp -f $mert_logfile run$run.$mert_logfile") or die;
safesystem ("touch $mert_logfile run$run.$mert_logfile") or die;
@ -985,7 +1065,7 @@ if (defined $allsorted){ safesystem ("\\rm -f $allsorted") or die; };
safesystem("\\cp -f $weights_in_file run$run.$weights_in_file") or die;
safesystem("\\cp -f $mert_logfile run$run.$mert_logfile") or die;

create_config($___CONFIG_ORIG, "./moses.ini", $featlist, $run, $devbleu);
create_config($___CONFIG_ORIG, "./moses.ini", $featlist, $run, $devbleu, $sparse_weights_file);

# just to be sure that we have the really last finished step marked
open F, "> finished_step.txt" or die "Can't mark finished step";
@ -1040,6 +1120,11 @@ sub run_decoder {
my ($featlist, $run, $need_to_normalize) = @_;
my $filename_template = "run%d.best$___N_BEST_LIST_SIZE.out";
my $filename = sprintf($filename_template, $run);
my $lsamp_filename = undef;
if ($___LATTICE_SAMPLES) {
my $lsamp_filename_template = "run%d.lsamp$___LATTICE_SAMPLES.out";
$lsamp_filename = sprintf($lsamp_filename_template, $run);
}
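
Assuming, say, an n-best list size of 100 and 50 lattice samples (illustrative settings), iteration 3 therefore writes:

    run3.best100.out    # n-best list
    run3.lsamp50.out    # lattice samples
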

# user-supplied parameters
print "params = $___DECODER_FLAGS\n";
@ -1060,23 +1145,28 @@ sub run_decoder {
$model_weights{$name} .= sprintf " %.6f", $vals[$i];
}
my $decoder_config = join(" ", values %model_weights);
$decoder_config .= " -weight-file run$run.sparse-weights" if -e "run$run.sparse-weights";
print STDERR "DECODER_CFG = $decoder_config\n";
print "decoder_config = $decoder_config\n";


# run the decoder
my $nBest_cmd = "-n-best-size $___N_BEST_LIST_SIZE";
my $decoder_cmd;
my $lsamp_cmd = "";
if ($___LATTICE_SAMPLES) {
$lsamp_cmd = " -lattice-samples $lsamp_filename $___LATTICE_SAMPLES ";
}

if (defined $___JOBS && $___JOBS > 0) {
$decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
$decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
} else {
$decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config -n-best-list $filename $___N_BEST_LIST_SIZE -input-file $___DEV_F > run$run.out";
$decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd -n-best-list $filename $___N_BEST_LIST_SIZE -input-file $___DEV_F > run$run.out";
}

safesystem($decoder_cmd) or die "The decoder died. CONFIG WAS $decoder_config \n";

sanity_check_order_of_lambdas($featlist, $filename);
return $filename;
return ($filename, $lsamp_filename);
}
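
Since run_decoder now returns a pair, callers must use list context; a minimal sketch of the adjusted call site (surrounding code not shown in this hunk):

    my ($nbest_file, $lsamp_file) = run_decoder($featlist, $run, $need_to_normalize);
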


@ -1374,3 +1464,20 @@ sub submit_or_exec {
safesystem("$cmd > $stdout 2> $stderr") or die "ERROR: Failed to run '$cmd'.";
}
}

sub create_extractor_script
{
my ($cmd, $outdir) = @_;
my $script_path = File::Spec->catfile($outdir, "extractor.sh");

open my $out, '>', $script_path
    or die "Couldn't open $script_path for writing: $!\n";
print $out "#!/bin/bash\n";
print $out "cd $outdir\n";
print $out "$cmd\n";
close($out);

`chmod +x $script_path`;

return $script_path;
}
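
The merging branch earlier in this diff uses the wrapper by substituting it for the raw command before submission:

    $cmd = create_extractor_script($cmd, $___WORKING_DIR);
    &submit_or_exec($cmd,"extract.out","extract.err");
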

@ -47,9 +47,13 @@
# 13 Oct 2004 Use alternative decoders (DWC)
# Original version by Philipp Koehn

use strict;
use FindBin qw($Bin);
use File::Basename;
use File::Path;
use File::Spec;
use Cwd;

my $SCRIPTS_ROOTDIR = $Bin;
$SCRIPTS_ROOTDIR =~ s/\/training$//;
$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});
@ -82,7 +86,10 @@ my $minimum_required_change_in_weights = 0.00001;

my $verbose = 0;
my $usage = 0; # request for --help
my $___WORKING_DIR = "mert-work";

# We assume that if you don't specify a working directory,
# the default is set to `pwd`/mert-work
my $___WORKING_DIR = File::Spec->catfile(Cwd::getcwd(), "mert-work");
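
The effect of the new default is an absolute path: if the script is launched from /home/user/tune (illustrative), then

    Cwd::getcwd()    returns  /home/user/tune
    $___WORKING_DIR  becomes  /home/user/tune/mert-work

instead of the bare relative "mert-work" used before.
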
my $___DEV_F = undef; # required, input text to decode
my $___DEV_E = undef; # required, basename of files with references
my $___DECODER = undef; # required, pathname to the decoder executable
@ -144,10 +151,9 @@ my $prev_aggregate_nbl_size = -1; # number of previous step to consider when loa
# -1 means all previous, i.e. from iteration 1
# 0 means no previous data, i.e. from actual iteration
# 1 means 1 previous data, i.e. from the actual iteration and from the previous one
# and so on
my $maximum_iterations = 25;

use strict;
use Getopt::Long;
GetOptions(
"working-dir=s" => \$___WORKING_DIR,
@ -1298,19 +1304,16 @@ sub submit_or_exec {
sub create_extractor_script()
{
my ($cmd, $outdir) = @_;
my $script_path = File::Spec->catfile($outdir, "extractor.sh");

my $script_path = $outdir."/extractor.sh";

open(OUT,"> $script_path")
or die "Can't write $script_path";
print OUT "#!/bin/bash\n";
print OUT "cd $outdir\n";
print OUT $cmd."\n";
close(OUT);
open my $out, '>', $script_path
    or die "Couldn't open $script_path for writing: $!\n";
print $out "#!/bin/bash\n";
print $out "cd $outdir\n";
print $out "$cmd\n";
close($out);

`chmod +x $script_path`;

return $script_path;
}