From 8dee1725fb4f362a5d65f25ecdecafec7b08b1ae Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Tue, 3 Jun 2014 21:36:04 +0100 Subject: [PATCH 01/27] Removed Phrase penalty as a built-in feature function. --- moses/TranslationModel/UG/mmsapt.cpp | 20 ++++++++++---------- moses/TranslationModel/UG/mmsapt.h | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 128dcfe80..789907321 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -196,8 +196,8 @@ namespace Moses // currently always active by default; may (should) change later num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex"); - if (this->m_numScoreComponents%2) // a bit of a hack, for backwards compatibility - num_feats = apply_pp.init(num_feats); + // if (this->m_numScoreComponents%2) // a bit of a hack, for backwards compatibility + // num_feats = apply_pp.init(num_feats); if (num_feats < this->m_numScoreComponents) { @@ -283,8 +283,8 @@ namespace Moses { PhrasePair pp; pp.init(pid1, stats, this->m_numScoreComponents); - if (this->m_numScoreComponents%2) - apply_pp(bt,pp); + // if (this->m_numScoreComponents%2) + // apply_pp(bt,pp); pstats::trg_map_t::const_iterator t; for (t = stats.trg.begin(); t != stats.trg.end(); ++t) { @@ -318,8 +318,8 @@ namespace Moses pp.init(pid1b, *statsb, this->m_numScoreComponents); else return false; // throw "no stats for pooling available!"; - if (this->m_numScoreComponents%2) - apply_pp(bta,pp); + // if (this->m_numScoreComponents%2) + // apply_pp(bta,pp); pstats::trg_map_t::const_iterator b; pstats::trg_map_t::iterator a; if (statsb) @@ -415,8 +415,8 @@ namespace Moses if (statsb) { pool.init(pid1b,*statsb,0); - if (this->m_numScoreComponents%2) - apply_pp(btb,ppdyn); + // if (this->m_numScoreComponents%2) + // apply_pp(btb,ppdyn); for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b) { 
ppdyn.update(b->first,b->second); @@ -456,8 +456,8 @@ namespace Moses if (statsa) { pool.init(pid1a,*statsa,0); - if (this->m_numScoreComponents%2) - apply_pp(bta,ppfix); + // if (this->m_numScoreComponents%2) + // apply_pp(bta,ppfix); for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a) { if (!a->second.valid()) continue; // done above diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 5353a1c46..e0e2d8950 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -71,7 +71,7 @@ namespace Moses PScorePfwd calc_pfwd_fix, calc_pfwd_dyn; PScorePbwd calc_pbwd_fix, calc_pbwd_dyn; PScoreLex calc_lex; // this one I'd like to see as an external ff eventually - PScorePP apply_pp; // apply phrase penalty + // PScorePP apply_pp; // apply phrase penalty PScoreLogCounts add_logcounts_fix; PScoreLogCounts add_logcounts_dyn; void init(string const& line); From ce853731aec10c99738291a24b695bb2c6abffd9 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 5 Jun 2014 01:38:55 +0100 Subject: [PATCH 02/27] Added mmsapt lookup utility. 
--- Jamroot | 1 + moses/TranslationModel/UG/lookup_mmsapt.cc | 76 ++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 moses/TranslationModel/UG/lookup_mmsapt.cc diff --git a/Jamroot b/Jamroot index 1f7ca48cd..f6ce6b8f3 100644 --- a/Jamroot +++ b/Jamroot @@ -145,6 +145,7 @@ build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses if [ option.get "with-mm" : : "yes" ] { alias mm : + moses/TranslationModel/UG//lookup_mmsapt moses/TranslationModel/UG/mm//mtt-build moses/TranslationModel/UG/mm//mtt-dump moses/TranslationModel/UG/mm//symal2mam diff --git a/moses/TranslationModel/UG/lookup_mmsapt.cc b/moses/TranslationModel/UG/lookup_mmsapt.cc new file mode 100644 index 000000000..39ac23cc7 --- /dev/null +++ b/moses/TranslationModel/UG/lookup_mmsapt.cc @@ -0,0 +1,76 @@ +#include "mmsapt.h" +#include +#include +#include +#include +#include + +using namespace Moses; +using namespace bitext; +using namespace std; +using namespace boost; + +vector fo(1,FactorType(0)); + +class SimplePhrase : public Moses::Phrase +{ + vector const m_fo; // factor order +public: + SimplePhrase(): m_fo(1,FactorType(0)) {} + + void init(string const& s) + { + istringstream buf(s); string w; + while (buf >> w) + { + Word wrd; + this->AddWord().CreateFromString(Input,m_fo,StringPiece(w),false,false); + } + } +}; + +class TargetPhraseIndexSorter +{ + TargetPhraseCollection const& my_tpc; + CompareTargetPhrase cmp; +public: + TargetPhraseIndexSorter(TargetPhraseCollection const& tpc) : my_tpc(tpc) {} + bool operator()(size_t a, size_t b) const + { + return cmp(*my_tpc[a], *my_tpc[b]); + } +}; + +int main(int argc, char* argv[]) +{ + Parameter params; + if (!params.LoadParam(argc,argv) || !StaticData::LoadDataStatic(¶ms, argv[0])) + exit(1); + + Mmsapt* PT; + BOOST_FOREACH(PhraseDictionary* pd, PhraseDictionary::GetColl()) + if ((PT = dynamic_cast(pd))) break; + + string line; + while (getline(cin,line)) + { + SimplePhrase p; p.init(line); + cout << p << 
endl; + TargetPhraseCollection const* trg = PT->GetTargetPhraseCollectionLEGACY(p); + if (!trg) continue; + vector order(trg->GetSize()); + for (size_t i = 0; i < order.size(); ++i) order[i] = i; + sort(order.begin(),order.end(),TargetPhraseIndexSorter(*trg)); + size_t k = 0; + BOOST_FOREACH(size_t i, order) + { + Phrase const& phr = static_cast(*(*trg)[i]); + cout << setw(3) << ++k << " " << phr << endl; + } + PT->Release(trg); + } + exit(0); +} + + + From 5ae57f09d7e97d7608ae848b3aebd5502c37292e Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 5 Jun 2014 01:39:44 +0100 Subject: [PATCH 03/27] Commented out unused variable. --- moses/ChartManager.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/ChartManager.cpp b/moses/ChartManager.cpp index 139256171..e137da915 100644 --- a/moses/ChartManager.cpp +++ b/moses/ChartManager.cpp @@ -125,7 +125,7 @@ void ChartManager::ProcessSentence() */ void ChartManager::AddXmlChartOptions() { - const StaticData &staticData = StaticData::Instance(); + // const StaticData &staticData = StaticData::Instance(); const std::vector xmlChartOptionsList = m_source.GetXmlChartTranslationOptions(); IFVERBOSE(2) { From 0e98a08446b1280e56c0ca130540a03ad46fe9d7 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 5 Jun 2014 01:40:27 +0100 Subject: [PATCH 04/27] Commented out unused variable. 
--- moses/ConfusionNet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/ConfusionNet.cpp b/moses/ConfusionNet.cpp index 5861ee5f1..d9270bd1b 100644 --- a/moses/ConfusionNet.cpp +++ b/moses/ConfusionNet.cpp @@ -142,7 +142,7 @@ namespace Moses { Clear(); - const StaticData &staticData = StaticData::Instance(); + // const StaticData &staticData = StaticData::Instance(); const InputFeature &inputFeature = InputFeature::Instance(); size_t numInputScores = inputFeature.GetNumInputScores(); size_t numRealWordCount = inputFeature.GetNumRealWordsInInput(); From 3145bf3cc4de18eb56e8c6bd30cd152abd00d5c3 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 5 Jun 2014 01:40:51 +0100 Subject: [PATCH 05/27] Commented out unused variable. --- moses/InputPath.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/InputPath.cpp b/moses/InputPath.cpp index f00f1a7a4..523b03d53 100644 --- a/moses/InputPath.cpp +++ b/moses/InputPath.cpp @@ -85,7 +85,7 @@ size_t InputPath::GetTotalRuleSize() const size_t ret = 0; std::map >::const_iterator iter; for (iter = m_targetPhrases.begin(); iter != m_targetPhrases.end(); ++iter) { - const PhraseDictionary *pt = iter->first; + // const PhraseDictionary *pt = iter->first; const TargetPhraseCollection *tpColl = iter->second.first; if (tpColl) { From 004b8c907856ba01d54c8c7f3f1b198e5d3ad4ba Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 5 Jun 2014 01:41:50 +0100 Subject: [PATCH 06/27] Changed Phrase.m_words from private to protected. 
--- moses/Phrase.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/moses/Phrase.h b/moses/Phrase.h index 4a5c4828a..f6eb661de 100644 --- a/moses/Phrase.h +++ b/moses/Phrase.h @@ -47,8 +47,8 @@ class WordsRange; class Phrase { friend std::ostream& operator<<(std::ostream&, const Phrase&); -private: - + // private: +protected: std::vector m_words; public: From c7a0520a18c19388d923221b00f673394b960710 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 5 Jun 2014 01:43:26 +0100 Subject: [PATCH 07/27] Made moses shut up by changing unconditional 'cerr's to VERBOSE(1,...) --- moses/StaticData.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 0340778ed..f5cb1b77d 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -494,7 +494,8 @@ bool StaticData::LoadData(Parameter *parameter) } m_xmlBrackets.first= brackets[0]; m_xmlBrackets.second=brackets[1]; - cerr << "XML tags opening and closing brackets for XML input are: " << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl; + VERBOSE(1,"XML tags opening and closing brackets for XML input are: " + << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl); } if (m_parameter->GetParam("placeholder-factor").size() > 0) { @@ -511,7 +512,7 @@ bool StaticData::LoadData(Parameter *parameter) const vector &features = m_parameter->GetParam("feature"); for (size_t i = 0; i < features.size(); ++i) { const string &line = Trim(features[i]); - cerr << "line=" << line << endl; + VERBOSE(1,"line=" << line << endl); if (line.empty()) continue; @@ -640,7 +641,8 @@ void StaticData::LoadNonTerminals() "Incorrect unknown LHS format: " << line); UnknownLHSEntry entry(tokens[0], Scan(tokens[1])); m_unknownLHS.push_back(entry); - const Factor *targetFactor = factorCollection.AddFactor(Output, 0, tokens[0], true); + // const Factor *targetFactor = + factorCollection.AddFactor(Output, 0, tokens[0], 
true); } } @@ -734,7 +736,7 @@ bool StaticData::LoadDecodeGraphs() DecodeGraph *decodeGraph; if (IsChart()) { size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN; - cerr << "max-chart-span: " << maxChartSpans[decodeGraphInd] << endl; + VERBOSE(1,"max-chart-span: " << maxChartSpans[decodeGraphInd] << endl); decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan); } else { decodeGraph = new DecodeGraph(m_decodeGraphs.size()); @@ -866,7 +868,7 @@ void StaticData::SetExecPath(const std::string &path) if (pos != string::npos) { m_binPath = path.substr(0, pos); } - cerr << m_binPath << endl; + VERBOSE(1,m_binPath << endl); } const string &StaticData::GetBinDirectory() const @@ -920,7 +922,8 @@ void StaticData::LoadFeatureFunctions() FeatureFunction *ff = *iter; bool doLoad = true; - if (PhraseDictionary *ffCast = dynamic_cast(ff)) { + // if (PhraseDictionary *ffCast = dynamic_cast(ff)) { + if (dynamic_cast(ff)) { doLoad = false; } @@ -964,7 +967,7 @@ bool StaticData::CheckWeights() const set::iterator iter; for (iter = weightNames.begin(); iter != weightNames.end(); ) { string fname = (*iter).substr(0, (*iter).find("_")); - cerr << fname << "\n"; + VERBOSE(1,fname << "\n"); if (featureNames.find(fname) != featureNames.end()) { weightNames.erase(iter++); } @@ -1039,7 +1042,7 @@ bool StaticData::LoadAlternateWeightSettings() vector tokens = Tokenize(weightSpecification[i]); vector args = Tokenize(tokens[0], "="); currentId = args[1]; - cerr << "alternate weight setting " << currentId << endl; + VERBOSE(1,"alternate weight setting " << currentId << endl); UTIL_THROW_IF2(m_weightSetting.find(currentId) != m_weightSetting.end(), "Duplicate alternate weight id: " << currentId); m_weightSetting[ currentId ] = new ScoreComponentCollection; From 22ec93b85ce5e64a04a24942e51b2ca84d36625b Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 5 Jun 2014 01:44:11 +0100 Subject: [PATCH 08/27] Added 
operator [] to TargetPhraseCollection. --- moses/TargetPhraseCollection.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/moses/TargetPhraseCollection.h b/moses/TargetPhraseCollection.h index 47eee0458..0c6a7a74c 100644 --- a/moses/TargetPhraseCollection.h +++ b/moses/TargetPhraseCollection.h @@ -44,6 +44,12 @@ public: typedef CollType::iterator iterator; typedef CollType::const_iterator const_iterator; + TargetPhrase const* + operator[](size_t const i) const + { + return m_collection.at(i); + } + iterator begin() { return m_collection.begin(); } From a40fcbae02827d31bad754c90b9596b977914d2e Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 5 Jun 2014 01:45:34 +0100 Subject: [PATCH 09/27] Added utility lookup_mmsapt --- moses/TranslationModel/UG/Jamfile | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/moses/TranslationModel/UG/Jamfile b/moses/TranslationModel/UG/Jamfile index 1ee663044..547928423 100644 --- a/moses/TranslationModel/UG/Jamfile +++ b/moses/TranslationModel/UG/Jamfile @@ -9,6 +9,17 @@ $(TOP)/moses/TranslationModel/UG//mmsapt $(TOP)/util//kenutil ; +exe lookup_mmsapt : +lookup_mmsapt.cc +$(TOP)/moses//moses +$(TOP)/moses/TranslationModel/UG/generic//generic +$(TOP)//boost_iostreams +$(TOP)//boost_program_options +$(TOP)/moses/TranslationModel/UG/mm//mm +$(TOP)/moses/TranslationModel/UG//mmsapt +$(TOP)/util//kenutil +; + install $(PREFIX)/bin : try-align ; fakelib mmsapt : [ glob *.cpp mmsapt*.cc ] ; From 2f109621bff2eacac5168155a171f84f5de4f9a9 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 5 Jun 2014 01:47:29 +0100 Subject: [PATCH 10/27] Added configurable options and SetTableLimit to Mmsapt. 
--- moses/TranslationModel/UG/mmsapt.cpp | 33 ++++++++++++++++++---------- moses/TranslationModel/UG/mmsapt.h | 3 +++ 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 789907321..b2c4c10f2 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -122,16 +122,16 @@ namespace Moses if (m != param.end()) withPbwd = m->second != "0"; - m_default_sample_size = m != param.end() ? atoi(m->second.c_str()) : 1000; - m = param.find("workers"); m_workers = m != param.end() ? atoi(m->second.c_str()) : 8; m_workers = min(m_workers,24UL); + m = param.find("limit"); + if (m != param.end()) m_tableLimit = atoi(m->second.c_str()); + m = param.find("cache-size"); - m_history.reserve(m != param.end() - ? max(1000,atoi(m->second.c_str())) - : 10000); + m_history.reserve(m != param.end()?max(1000,atoi(m->second.c_str())):10000); + // in plain language: cache size is at least 1000, and 10,000 by default this->m_numScoreComponents = atoi(param["num-features"].c_str()); @@ -368,6 +368,13 @@ namespace Moses } else pp.update(a->first,a->second); +#if 0 + // jstats const& j = a->second; + cerr << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: " + << bta.T2->pid2str(bta.V2.get(),pp.p2) << endl; + cerr << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " " + << pp.joint << " " << pp.raw2 << endl; +#endif UTIL_THROW_IF2(pp.raw2 == 0, "OOPS" @@ -376,12 +383,6 @@ namespace Moses << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " " << pp.joint << " " << pp.raw2); -#if 0 - jstats const& j = a->second; - cerr << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: " - << bta.T2->pid2str(bta.V2.get(),pp.p2) << endl; - cerr << j.rcnt() << " " << j.cnt2() << " " << j.wcnt() << endl; -#endif calc_lex(bta,pp); if (withPfwd) calc_pfwd_fix(bta,pp); if (withPbwd) calc_pbwd_fix(bta,pp); @@ -662,7 +663,7 @@ namespace Moses || combine_pstats(src, mfix.getPid(),sfix.get(),btfix, 
mdyn.getPid(),sdyn.get(),*dyn,ret)) { - ret->NthElement(m_tableLimit); + if (m_tableLimit) ret->Prune(true,m_tableLimit); #if 0 sort(ret->begin(), ret->end(), CompareTargetPhrase()); cout << "SOURCE PHRASE: " << src << endl; @@ -683,6 +684,14 @@ namespace Moses return encache(ret); } + size_t + Mmsapt:: + SetTableLimit(size_t limit) + { + std::swap(m_tableLimit,limit); + return limit; + } + void Mmsapt:: CleanUpAfterSentenceProcessing(const InputType& source) diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index e0e2d8950..b5a5b15e2 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -168,6 +168,9 @@ namespace Moses void Load(); + // returns the prior table limit + size_t SetTableLimit(size_t limit); + #ifndef NO_MOSES TargetPhraseCollection const* GetTargetPhraseCollectionLEGACY(const Phrase& src) const; From b92d599727f53f7c97f6cf4fee92516a7e40ca6a Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 5 Jun 2014 01:48:11 +0100 Subject: [PATCH 11/27] Bug fix in mmlex-lookup. --- moses/TranslationModel/UG/mm/mmlex-lookup.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mm/mmlex-lookup.cc b/moses/TranslationModel/UG/mm/mmlex-lookup.cc index 14d839edf..fbdceeaa0 100644 --- a/moses/TranslationModel/UG/mm/mmlex-lookup.cc +++ b/moses/TranslationModel/UG/mm/mmlex-lookup.cc @@ -131,7 +131,7 @@ interpret_args(int ac, char* av[]) o.add_options() ("help,h", "print this message") ("source,s",po::value(&swrd),"source word") - ("target,t",po::value(&swrd),"target word") + ("target,t",po::value(&twrd),"target word") ; h.add_options() From 5116f0072b162ed06dc7132d0f727fcaaecab306 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 5 Jun 2014 01:50:55 +0100 Subject: [PATCH 12/27] Minor edits to ug_bitext.h. Added min_diverse to ug_bitext::job to ensure minimum number of translation alternatives before sampling stops. 
--- moses/TranslationModel/UG/mm/ug_bitext.h | 50 +++++++++++++++++------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 84c3713ac..5dfbec285 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -318,10 +318,10 @@ namespace Moses { assert(pp.sample1); assert(pp.joint); assert(pp.raw2); - (*dest)[i] = log(pp.raw1); - (*dest)[++i] = log(pp.sample1); - (*dest)[++i] = log(pp.joint); - (*dest)[++i] = log(pp.raw2); + (*dest)[i] = -log(pp.raw1); + (*dest)[++i] = -log(pp.sample1); + (*dest)[++i] = +log(pp.joint); + (*dest)[++i] = -log(pp.raw2); } }; @@ -590,8 +590,9 @@ namespace Moses { static ThreadSafeCounter active; boost::mutex lock; friend class agenda; - boost::taus88 rnd; // every job has its own pseudo random generator - double rnddenom; // denominator for scaling random sampling + boost::taus88 rnd; // every job has its own pseudo random generator + double rnddenom; // denominator for scaling random sampling + size_t min_diverse; // minimum number of distinct translations public: size_t workers; // how many workers are working on this job? 
sptr const> root; // root of the underlying suffix array @@ -644,34 +645,47 @@ namespace Moses { step(uint64_t & sid, uint64_t & offset) { boost::lock_guard jguard(lock); - if ((max_samples == 0) && (next < stop)) + bool ret = (max_samples == 0) && (next < stop); + if (ret) { next = root->readSid(next,stop,sid); next = root->readOffset(next,stop,offset); boost::lock_guard sguard(stats->lock); if (stats->raw_cnt == ctr) ++stats->raw_cnt; stats->sample_cnt++; - return true; } else { - while (next < stop && stats->good < max_samples) + while (next < stop && (stats->good < max_samples || + stats->trg.size() < min_diverse)) { next = root->readSid(next,stop,sid); next = root->readOffset(next,stop,offset); - { - boost::lock_guard sguard(stats->lock); + { // brackets required for lock scoping; see sguard immediately below + boost::lock_guard sguard(stats->lock); if (stats->raw_cnt == ctr) ++stats->raw_cnt; - size_t rnum = (stats->raw_cnt - ctr++)*(rnd()/(rnd.max()+1.)); + size_t scalefac = (stats->raw_cnt - ctr++); + size_t rnum = scalefac*(rnd()/(rnd.max()+1.)); +#if 0 + cerr << rnum << "/" << scalefac << " vs. " + << max_samples - stats->good << " (" + << max_samples << " - " << stats->good << ")" + << endl; +#endif if (rnum < max_samples - stats->good) { stats->sample_cnt++; - return true; + ret = true; + break; } } } - return false; } + + // boost::lock_guard sguard(stats->lock); + // abuse of lock for clean output to cerr + // cerr << stats->sample_cnt++; + return ret; } template @@ -713,6 +727,13 @@ namespace Moses { worker:: operator()() { + // things to do: + // - have each worker maintain their own pstats object and merge results at the end; + // - ensure the minimum size of samples considered by a non-locked counter that is only + // ever incremented -- who cares if we look at more samples than required, as long + // as we look at at least the minimum required + // This way, we can reduce the number of lock / unlock operations we need to do during + // sampling. 
size_t s1=0, s2=0, e1=0, e2=0; uint64_t sid=0, offset=0; // of the source phrase while(sptr j = ag.get_job()) @@ -812,6 +833,7 @@ namespace Moses { sptr > const& r, size_t maxsmpl, bool isfwd) : rnd(0) , rnddenom(rnd.max() + 1.) + , min_diverse(10) , workers(0) , root(r) , next(m.lower_bound(-1)) From a5f46e65cb0f01ec0280a5d045cb61f342b9d51b Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 6 Jun 2014 17:25:09 +0100 Subject: [PATCH 13/27] eclipse --- contrib/other-builds/extractor/.cproject | 4 +++- contrib/other-builds/extractor/.project | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/contrib/other-builds/extractor/.cproject b/contrib/other-builds/extractor/.cproject index 5f0b24ef0..06a8a8a24 100644 --- a/contrib/other-builds/extractor/.cproject +++ b/contrib/other-builds/extractor/.cproject @@ -42,9 +42,11 @@ diff --git a/contrib/other-builds/extractor/.project b/contrib/other-builds/extractor/.project index e4fe08579..56d560019 100644 --- a/contrib/other-builds/extractor/.project +++ b/contrib/other-builds/extractor/.project @@ -4,6 +4,7 @@ mert_lib + util From 91a7c19b7c5035eb986c6bca5dc628db8052b71c Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 8 Jun 2014 12:41:54 +0100 Subject: [PATCH 14/27] eclipse project for consolidate --- contrib/other-builds/consolidate/.cproject | 132 +++++++++++++++++++++ contrib/other-builds/consolidate/.project | 64 ++++++++++ 2 files changed, 196 insertions(+) create mode 100644 contrib/other-builds/consolidate/.cproject create mode 100644 contrib/other-builds/consolidate/.project diff --git a/contrib/other-builds/consolidate/.cproject b/contrib/other-builds/consolidate/.cproject new file mode 100644 index 000000000..3c70ed365 --- /dev/null +++ b/contrib/other-builds/consolidate/.cproject @@ -0,0 +1,132 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + diff --git a/contrib/other-builds/consolidate/.project b/contrib/other-builds/consolidate/.project new file mode 100644 index 000000000..4095862b4 --- /dev/null +++ b/contrib/other-builds/consolidate/.project @@ -0,0 +1,64 @@ + + + consolidate + + + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.core.ccnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + + + InputFileStream.cpp + 1 + PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.cpp + + + InputFileStream.h + 1 + PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.h + + + OutputFileStream.cpp + 1 + PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.cpp + + + OutputFileStream.h + 1 + PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h + + + consolidate-main.cpp + 1 + PARENT-3-PROJECT_LOC/phrase-extract/consolidate-main.cpp + + + tables-core.cpp + 1 + PARENT-3-PROJECT_LOC/phrase-extract/tables-core.cpp + + + tables-core.h + 1 + PARENT-3-PROJECT_LOC/phrase-extract/tables-core.h + + + From f58c7fc831ada7701eb070014def87a5988f509a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 8 Jun 2014 13:17:23 +0100 Subject: [PATCH 15/27] use standard c++ getline instead of old Moses SAFE_GETLINE --- phrase-extract/consolidate-main.cpp | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index de0d7f646..c57cc7747 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -30,8 +30,6 @@ #include "InputFileStream.h" #include "OutputFileStream.h" -#define LINE_MAX_LENGTH 10000 - using namespace std; bool hierarchicalFlag = false; @@ -46,12 +44,11 @@ inline float maybeLogProb( float a ) return 
logProbFlag ? log(a) : a; } -char line[LINE_MAX_LENGTH]; void processFiles( char*, char*, char*, char* ); void loadCountOfCounts( char* ); void breakdownCoreAndSparse( string combined, string &core, string &sparse ); bool getLine( istream &fileP, vector< string > &item ); -vector< string > splitLine(); +vector< string > splitLine(const char *line); vector< int > countBin; bool sparseCountBinFeatureFlag = false; @@ -140,14 +137,13 @@ void loadCountOfCounts( char* fileNameCountOfCounts ) istream &fileP = fileCountOfCounts; countOfCounts.push_back(0.0); - while(1) { - if (fileP.eof()) break; - SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__); - if (fileP.eof()) break; + + string line; + while (getline(fileP, line)) { if (totalCount < 0) - totalCount = atof(line); // total number of distinct phrase pairs + totalCount = atof(line.c_str()); // total number of distinct phrase pairs else - countOfCounts.push_back( atof(line) ); + countOfCounts.push_back( atof(line.c_str()) ); } fileCountOfCounts.Close(); @@ -370,16 +366,16 @@ bool getLine( istream &fileP, vector< string > &item ) if (fileP.eof()) return false; - SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__); - if (fileP.eof()) + string line; + if (!getline(fileP, line)) return false; - item = splitLine(); + item = splitLine(line.c_str()); return true; } -vector< string > splitLine() +vector< string > splitLine(const char *line) { vector< string > item; int start=0; From d979b24314944348cea2d7f8d8e00691c64abebb Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 8 Jun 2014 14:06:33 +0100 Subject: [PATCH 16/27] use standard c++ getline instead of old Moses SAFE_GETLINE --- phrase-extract/score-main.cpp | 37 ++++++++++++----------------------- 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index 46538010f..dfb5103f4 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -40,8 +40,6 @@ using 
namespace std; using namespace MosesTraining; -#define LINE_MAX_LENGTH 100000 - namespace MosesTraining { LexicalTable lexTable; @@ -232,7 +230,7 @@ int main(int argc, char* argv[]) } // loop through all extracted phrase translations - char line[LINE_MAX_LENGTH], lastLine[LINE_MAX_LENGTH]; + string line, lastLine; lastLine[0] = '\0'; ExtractionPhrasePair *phrasePair = NULL; std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource; @@ -245,8 +243,8 @@ int main(int argc, char* argv[]) float tmpCount=0.0f, tmpPcfgSum=0.0f; int i=0; - SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ ); - if ( !extractFileP.eof() ) { + // TODO why read only the 1st line? + if ( getline(extractFileP, line)) { ++i; tmpPhraseSource = new PHRASE(); tmpPhraseTarget = new PHRASE(); @@ -265,23 +263,21 @@ int main(int argc, char* argv[]) if ( hierarchicalFlag ) { phrasePairsWithSameSourceAndTarget.push_back( phrasePair ); } - strcpy( lastLine, line ); - SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ ); + lastLine = line; } - while ( !extractFileP.eof() ) { + while ( getline(extractFileP, line) ) { if ( ++i % 100000 == 0 ) { std::cerr << "." << std::flush; } // identical to last line? 
just add count - if (strcmp(line,lastLine) == 0) { + if (line == lastLine) { phrasePair->IncrementPrevious(tmpCount,tmpPcfgSum); - SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); continue; } else { - strcpy( lastLine, line ); + lastLine = line; } tmpPhraseSource = new PHRASE(); @@ -359,8 +355,6 @@ int main(int argc, char* argv[]) } } - SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); - } processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb ); @@ -750,11 +744,9 @@ void loadFunctionWords( const string &fileName ) } istream *inFileP = &inFile; - char line[LINE_MAX_LENGTH]; - while(true) { - SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); - if (inFileP->eof()) break; - std::vector token = tokenize( line ); + string line; + while(getline(*inFileP, line)) { + std::vector token = tokenize( line.c_str() ); if (token.size() > 0) functionWordList.insert( token[0] ); } @@ -799,16 +791,13 @@ void LexicalTable::load( const string &fileName ) } istream *inFileP = &inFile; - char line[LINE_MAX_LENGTH]; - + string line; int i=0; - while(true) { + while(getline(*inFileP, line)) { i++; if (i%100000 == 0) std::cerr << "." 
<< flush; - SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); - if (inFileP->eof()) break; - std::vector token = tokenize( line ); + std::vector token = tokenize( line.c_str() ); if (token.size() != 3) { std::cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:" << std::endl From 23ba0de2247e84db69759445a41c4c4f04840460 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 8 Jun 2014 15:41:27 +0100 Subject: [PATCH 17/27] use standard c++ getline instead of old Moses SAFE_GETLINE --- phrase-extract/SentenceAlignment.cpp | 6 +++++- phrase-extract/SentenceAlignment.h | 7 +++++-- phrase-extract/extract-main.cpp | 28 +++++++++++++--------------- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp index c3d71d525..120c9154d 100644 --- a/phrase-extract/SentenceAlignment.cpp +++ b/phrase-extract/SentenceAlignment.cpp @@ -54,7 +54,11 @@ bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bo return true; } -bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], char weightString[], int sentenceID, bool boundaryRules) +bool SentenceAlignment::create(const char targetString[], + const char sourceString[], + const char alignmentString[], + const char weightString[], + int sentenceID, bool boundaryRules) { using namespace std; this->sentenceID = sentenceID; diff --git a/phrase-extract/SentenceAlignment.h b/phrase-extract/SentenceAlignment.h index 1df61cf02..576d3279e 100644 --- a/phrase-extract/SentenceAlignment.h +++ b/phrase-extract/SentenceAlignment.h @@ -43,8 +43,11 @@ public: virtual bool processSourceSentence(const char *, int, bool boundaryRules); - bool create(char targetString[], char sourceString[], - char alignmentString[], char weightString[], int sentenceID, bool boundaryRules); + bool create(const char targetString[], + const char sourceString[], + const char 
alignmentString[], + const char weightString[], + int sentenceID, bool boundaryRules); void invertAlignment(); diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index 5d58028d6..698599a10 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -32,10 +32,6 @@ using namespace MosesTraining; namespace MosesTraining { - -const long int LINE_MAX_LENGTH = 500000 ; - - // HPhraseVertex represents a point in the alignment matrix typedef pair HPhraseVertex; @@ -277,20 +273,18 @@ int main(int argc, char* argv[]) int i = sentenceOffset; - while(true) { + string englishString, foreignString, alignmentString, weightString; + + while(getline(*eFileP, englishString)) { i++; if (i%10000 == 0) cerr << "." << flush; - char englishString[LINE_MAX_LENGTH]; - char foreignString[LINE_MAX_LENGTH]; - char alignmentString[LINE_MAX_LENGTH]; - char weightString[LINE_MAX_LENGTH]; - SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__); - if (eFileP->eof()) break; - SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__); - SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__); + + getline(*fFileP, foreignString); + getline(*aFileP, alignmentString); if (iwFileP) { - SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__); + getline(*iwFileP, weightString); } + SentenceAlignment sentence; // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; //az: output src, tgt, and alingment line @@ -300,7 +294,11 @@ int main(int argc, char* argv[]) cout << "LOG: ALT: " << alignmentString << endl; cout << "LOG: PHRASES_BEGIN:" << endl; } - if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) { + if (sentence.create( englishString.c_str(), + foreignString.c_str(), + alignmentString.c_str(), + weightString.c_str(), + i, false)) { if (options.placeholders.size()) { sentence.invertAlignment(); } 
From cb94a3181bd00c74bf0b2b81fea4aee2195dc121 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 8 Jun 2014 16:23:14 +0100 Subject: [PATCH 18/27] use standard c++ getline instead of old Moses SAFE_GETLINE --- phrase-extract/DomainFeature.cpp | 11 +++----- phrase-extract/consolidate-direct-main.cpp | 24 +++++++----------- phrase-extract/consolidate-reverse-main.cpp | 22 ++++++++-------- phrase-extract/extract-ordering-main.cpp | 28 +++++++++------------ phrase-extract/extract-rules-main.cpp | 22 +++++++--------- phrase-extract/relax-parse-main.cpp | 8 ++---- phrase-extract/statistics-main.cpp | 28 ++++++++------------- 7 files changed, 57 insertions(+), 86 deletions(-) diff --git a/phrase-extract/DomainFeature.cpp b/phrase-extract/DomainFeature.cpp index 2f99a8709..337364b1d 100644 --- a/phrase-extract/DomainFeature.cpp +++ b/phrase-extract/DomainFeature.cpp @@ -4,8 +4,6 @@ #include "InputFileStream.h" #include "SafeGetline.h" -#define TABLE_LINE_MAX_LENGTH 1000 - using namespace std; namespace MosesTraining @@ -16,12 +14,11 @@ void Domain::load( const std::string &domainFileName ) { Moses::InputFileStream fileS( domainFileName ); istream *fileP = &fileS; - while(true) { - char line[TABLE_LINE_MAX_LENGTH]; - SAFE_GETLINE((*fileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__); - if (fileP->eof()) break; + + string line; + while(getline(*fileP, line)) { // read - vector< string > domainSpecLine = tokenize( line ); + vector< string > domainSpecLine = tokenize( line.c_str() ); int lineNumber; if (domainSpecLine.size() != 2 || ! 
sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) { diff --git a/phrase-extract/consolidate-direct-main.cpp b/phrase-extract/consolidate-direct-main.cpp index 3b38f741c..40e0e35d4 100644 --- a/phrase-extract/consolidate-direct-main.cpp +++ b/phrase-extract/consolidate-direct-main.cpp @@ -26,16 +26,9 @@ #include "InputFileStream.h" #include "OutputFileStream.h" -#include "SafeGetline.h" - -#define LINE_MAX_LENGTH 10000 - using namespace std; -char line[LINE_MAX_LENGTH]; - - -vector< string > splitLine() +vector< string > splitLine(const char *line) { vector< string > item; int start=0; @@ -61,14 +54,15 @@ bool getLine( istream &fileP, vector< string > &item ) { if (fileP.eof()) return false; - - SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__); - if (fileP.eof()) + + string line; + if (getline(fileP, line)) { + item = splitLine(line.c_str()); return false; - - item = splitLine(); - - return true; + } + else { + return false; + } } diff --git a/phrase-extract/consolidate-reverse-main.cpp b/phrase-extract/consolidate-reverse-main.cpp index 6843bf3aa..891773418 100644 --- a/phrase-extract/consolidate-reverse-main.cpp +++ b/phrase-extract/consolidate-reverse-main.cpp @@ -30,20 +30,17 @@ #include "SafeGetline.h" #include "InputFileStream.h" -#define LINE_MAX_LENGTH 10000 - using namespace std; bool hierarchicalFlag = false; bool onlyDirectFlag = false; bool phraseCountFlag = true; bool logProbFlag = false; -char line[LINE_MAX_LENGTH]; void processFiles( char*, char*, char* ); bool getLine( istream &fileP, vector< string > &item ); string reverseAlignment(const string &alignments); -vector< string > splitLine(); +vector< string > splitLine(const char *lin); inline void Tokenize(std::vector &output , const std::string& str @@ -190,17 +187,18 @@ bool getLine( istream &fileP, vector< string > &item ) { if (fileP.eof()) return false; - - SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__); - if (fileP.eof()) + + string line; + if (getline(fileP, 
line)) { + item = splitLine(line.c_str()); return false; - - item = splitLine(); - - return true; + } + else { + return false; + } } -vector< string > splitLine() +vector< string > splitLine(const char *line) { vector< string > item; bool betweenWords = true; diff --git a/phrase-extract/extract-ordering-main.cpp b/phrase-extract/extract-ordering-main.cpp index 104457b01..78132d4fd 100644 --- a/phrase-extract/extract-ordering-main.cpp +++ b/phrase-extract/extract-ordering-main.cpp @@ -32,10 +32,6 @@ using namespace MosesTraining; namespace MosesTraining { - -const long int LINE_MAX_LENGTH = 500000 ; - - // HPhraseVertex represents a point in the alignment matrix typedef pair HPhraseVertex; @@ -246,20 +242,20 @@ int main(int argc, char* argv[]) int i = sentenceOffset; - while(true) { + string englishString, foreignString, alignmentString, weightString; + + while(getline(*eFileP, englishString)) { i++; - if (i%10000 == 0) cerr << "." << flush; - char englishString[LINE_MAX_LENGTH]; - char foreignString[LINE_MAX_LENGTH]; - char alignmentString[LINE_MAX_LENGTH]; - char weightString[LINE_MAX_LENGTH]; - SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__); - if (eFileP->eof()) break; - SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__); - SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__); + + getline(*eFileP, englishString); + getline(*fFileP, foreignString); + getline(*aFileP, alignmentString); if (iwFileP) { - SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__); + getline(*iwFileP, weightString); } + + if (i%10000 == 0) cerr << "." 
<< flush; + SentenceAlignment sentence; // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; //az: output src, tgt, and alingment line @@ -269,7 +265,7 @@ int main(int argc, char* argv[]) cout << "LOG: ALT: " << alignmentString << endl; cout << "LOG: PHRASES_BEGIN:" << endl; } - if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) { + if (sentence.create( englishString.c_str(), foreignString.c_str(), alignmentString.c_str(), weightString.c_str(), i, false)) { ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFileOrientation); task->Run(); delete task; diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index f5f44316e..30963f32b 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -47,8 +47,6 @@ #include "InputFileStream.h" #include "OutputFileStream.h" -#define LINE_MAX_LENGTH 500000 - using namespace std; using namespace MosesTraining; @@ -326,17 +324,15 @@ int main(int argc, char* argv[]) // loop through all sentence pairs size_t i=sentenceOffset; - while(true) { - i++; - if (i%1000 == 0) cerr << i << " " << flush; + string targetString, sourceString, alignmentString; - char targetString[LINE_MAX_LENGTH]; - char sourceString[LINE_MAX_LENGTH]; - char alignmentString[LINE_MAX_LENGTH]; - SAFE_GETLINE((*tFileP), targetString, LINE_MAX_LENGTH, '\n', __FILE__); - if (tFileP->eof()) break; - SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__); - SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__); + while(getline(*tFileP, targetString)) { + i++; + + getline(*sFileP, sourceString); + getline(*aFileP, alignmentString); + + if (i%1000 == 0) cerr << i << " " << flush; SentenceAlignmentWithSyntax sentence (targetLabelCollection, sourceLabelCollection, @@ -349,7 +345,7 @@ int main(int argc, char* argv[]) cout << "LOG: PHRASES_BEGIN:" << endl; } - 
if (sentence.create(targetString, sourceString, alignmentString,"", i, options.boundaryRules)) { + if (sentence.create(targetString.c_str(), sourceString.c_str(), alignmentString.c_str(),"", i, options.boundaryRules)) { if (options.unknownWordLabelFlag) { collectWordLabelCounts(sentence); } diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index a58d4d97f..c04cae85b 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -33,17 +33,13 @@ int main(int argc, char* argv[]) // loop through all sentences int i=0; - char inBuffer[LINE_MAX_LENGTH]; - while(true) { + string inBuffer; + while(getline(cin, inBuffer)) { i++; if (i%1000 == 0) cerr << "." << flush; if (i%10000 == 0) cerr << ":" << flush; if (i%100000 == 0) cerr << "!" << flush; - // get line from stdin - SAFE_GETLINE( cin, inBuffer, LINE_MAX_LENGTH, '\n', __FILE__); - if (cin.eof()) break; - // process into syntax tree representation string inBufferString = string( inBuffer ); set< string > labelCollection; // set of labels, not used diff --git a/phrase-extract/statistics-main.cpp b/phrase-extract/statistics-main.cpp index 67373ec93..f1563dc05 100644 --- a/phrase-extract/statistics-main.cpp +++ b/phrase-extract/statistics-main.cpp @@ -19,8 +19,6 @@ using namespace std; using namespace MosesTraining; -#define LINE_MAX_LENGTH 10000 - namespace MosesTraining { @@ -31,7 +29,7 @@ public: vector< vector > alignedToE; vector< vector > alignedToF; - bool create( char*, int ); + bool create( const char*, int ); void clear(); bool equals( const PhraseAlignment& ); }; @@ -106,16 +104,14 @@ int main(int argc, char* argv[]) vector< PhraseAlignment > phrasePairsWithSameF; int i=0; int fileCount = 0; - while(true) { + + string line; + while(getline(extractFileP, line)) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." 
<< flush; - char line[LINE_MAX_LENGTH]; - SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); - // if (fileCount>0) - if (extractFileP.eof()) - break; + PhraseAlignment phrasePair; - bool isPhrasePair = phrasePair.create( line, i ); + bool isPhrasePair = phrasePair.create( line.c_str(), i ); if (lastForeign >= 0 && lastForeign != phrasePair.foreign) { processPhrasePairs( phrasePairsWithSameF ); for(size_t j=0; j &phrasePair ) } } -bool PhraseAlignment::create( char line[], int lineID ) +bool PhraseAlignment::create(const char line[], int lineID ) { vector< string > token = tokenize( line ); int item = 1; @@ -321,16 +317,14 @@ void LexicalTable::load( const string &filePath ) } istream *inFileP = &inFile; - char line[LINE_MAX_LENGTH]; + string line; int i=0; - while(true) { + while(getline(*inFileP, line)) { i++; if (i%100000 == 0) cerr << "." << flush; - SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); - if (inFileP->eof()) break; - vector token = tokenize( line ); + vector token = tokenize( line.c_str() ); if (token.size() != 3) { cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" << token.size() << " " << token[0] << " " << line << endl; From d68257c34d05ab278f2b043bb208403cb4e98872 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 8 Jun 2014 16:37:12 +0100 Subject: [PATCH 19/27] use standard c++ getline instead of old Moses SAFE_GETLINE --- .../PhraseDictionaryMultiModelCounts.cpp | 14 ++++------- .../fuzzy-match/FuzzyMatchWrapper.cpp | 24 +++++++------------ .../fuzzy-match/SuffixArray.cpp | 15 +++++------- .../TranslationModel/fuzzy-match/Vocabulary.h | 14 ----------- 4 files changed, 18 insertions(+), 49 deletions(-) diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp index 04bb321d0..99d3ad256 100644 --- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp +++ 
b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp @@ -17,12 +17,8 @@ License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ #include "util/exception.hh" - #include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h" -#define LINE_MAX_LENGTH 100000 -#include "phrase-extract/SafeGetline.h" // for SAFE_GETLINE() - using namespace std; template @@ -461,16 +457,14 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic } istream *inFileP = &inFile; - char line[LINE_MAX_LENGTH]; - int i=0; - while(true) { + string line; + + while(getline(*inFileP, line)) { i++; if (i%100000 == 0) cerr << "." << flush; - SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); - if (inFileP->eof()) break; - vector token = tokenize( line ); + vector token = tokenize( line.c_str() ); if (token.size() != 4) { cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp index fc68e1f0d..8766743b3 100644 --- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp +++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp @@ -413,11 +413,9 @@ void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector istream *fileStreamP = &fileStream; - char line[LINE_MAX_LENGTH]; - while(true) { - SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n'); - if (fileStreamP->eof()) break; - corpus.push_back( GetVocabulary().Tokenize( line ) ); + string line; + while(getline(*fileStreamP, line)) { + corpus.push_back( GetVocabulary().Tokenize( line.c_str() ) ); } } @@ -436,12 +434,9 @@ void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector< WORD_ID delimiter = 
GetVocabulary().StoreIfNew("|||"); int lineNum = 0; - char line[LINE_MAX_LENGTH]; - while(true) { - SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n'); - if (fileStreamP->eof()) break; - - vector toks = GetVocabulary().Tokenize( line ); + string line; + while(getline(*fileStreamP, line)) { + vector toks = GetVocabulary().Tokenize( line.c_str() ); corpus.push_back(vector< SentenceAlignment >()); vector< SentenceAlignment > &vec = corpus.back(); @@ -493,11 +488,8 @@ void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vect string delimiter = "|||"; int lineNum = 0; - char line[LINE_MAX_LENGTH]; - while(true) { - SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n'); - if (fileStreamP->eof()) break; - + string line; + while(getline(*fileStreamP, line)) { vector< SentenceAlignment > &vec = corpus[lineNum]; size_t targetInd = 0; SentenceAlignment *sentence = &vec[targetInd]; diff --git a/moses/TranslationModel/fuzzy-match/SuffixArray.cpp b/moses/TranslationModel/fuzzy-match/SuffixArray.cpp index 536bff741..2930147ab 100644 --- a/moses/TranslationModel/fuzzy-match/SuffixArray.cpp +++ b/moses/TranslationModel/fuzzy-match/SuffixArray.cpp @@ -14,17 +14,16 @@ SuffixArray::SuffixArray( string fileName ) m_endOfSentence = m_vcb.StoreIfNew( "" ); ifstream extractFile; - char line[LINE_MAX_LENGTH]; // count the number of words first; extractFile.open(fileName.c_str()); istream *fileP = &extractFile; m_size = 0; size_t sentenceCount = 0; - while(!fileP->eof()) { - SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n'); - if (fileP->eof()) break; - vector< WORD_ID > words = m_vcb.Tokenize( line ); + string line; + while(getline(*fileP, line)) { + + vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() ); m_size += words.size() + 1; sentenceCount++; } @@ -43,10 +42,8 @@ SuffixArray::SuffixArray( string fileName ) int sentenceId = 0; extractFile.open(fileName.c_str()); fileP = &extractFile; - while(!fileP->eof()) { - SAFE_GETLINE((*fileP), line, 
LINE_MAX_LENGTH, '\n'); - if (fileP->eof()) break; - vector< WORD_ID > words = m_vcb.Tokenize( line ); + while(getline(*fileP, line)) { + vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() ); // add to corpus vector corpus.push_back(words); diff --git a/moses/TranslationModel/fuzzy-match/Vocabulary.h b/moses/TranslationModel/fuzzy-match/Vocabulary.h index dfa11c1db..5a79e2f26 100644 --- a/moses/TranslationModel/fuzzy-match/Vocabulary.h +++ b/moses/TranslationModel/fuzzy-match/Vocabulary.h @@ -17,20 +17,6 @@ namespace tmmt { - -#define MAX_LENGTH 10000 - -#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \ - _IS.getline(_LINE, _SIZE, _DELIM); \ - if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \ - if (_IS.gcount() == _SIZE-1) { \ - cerr << "Line too long! Buffer overflow. Delete lines >=" \ - << _SIZE << " chars or raise MAX_LENGTH in phrase-extract/tables-core.cpp" \ - << endl; \ - exit(1); \ - } \ - } - typedef std::string WORD; typedef unsigned int WORD_ID; From 1b667e3e24620fb55fb7f62d3d643455521cdcb4 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 8 Jun 2014 17:07:12 +0100 Subject: [PATCH 20/27] delete any mention of SAFE_GETLINE so it doesn't reappear --- phrase-extract/DomainFeature.cpp | 1 - phrase-extract/ExtractionPhrasePair.cpp | 1 - phrase-extract/SafeGetline.h | 35 --------------------- phrase-extract/consolidate-main.cpp | 1 - phrase-extract/consolidate-reverse-main.cpp | 1 - phrase-extract/extract-main.cpp | 1 - phrase-extract/extract-ordering-main.cpp | 1 - phrase-extract/extract-rules-main.cpp | 1 - phrase-extract/score-main.cpp | 1 - phrase-extract/statistics-main.cpp | 1 - 10 files changed, 44 deletions(-) delete mode 100644 phrase-extract/SafeGetline.h diff --git a/phrase-extract/DomainFeature.cpp b/phrase-extract/DomainFeature.cpp index 337364b1d..99f0713a7 100644 --- a/phrase-extract/DomainFeature.cpp +++ b/phrase-extract/DomainFeature.cpp @@ -2,7 +2,6 @@ #include "ExtractionPhrasePair.h" #include "tables-core.h" #include 
"InputFileStream.h" -#include "SafeGetline.h" using namespace std; diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp index f70d106d1..2b26c2ad6 100644 --- a/phrase-extract/ExtractionPhrasePair.cpp +++ b/phrase-extract/ExtractionPhrasePair.cpp @@ -19,7 +19,6 @@ #include #include "ExtractionPhrasePair.h" -#include "SafeGetline.h" #include "tables-core.h" #include "score.h" #include "moses/Util.h" diff --git a/phrase-extract/SafeGetline.h b/phrase-extract/SafeGetline.h deleted file mode 100644 index 0e03b8468..000000000 --- a/phrase-extract/SafeGetline.h +++ /dev/null @@ -1,35 +0,0 @@ -/*********************************************************************** - Moses - factored phrase-based language decoder - Copyright (C) 2010 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - ***********************************************************************/ - -#pragma once -#ifndef SAFE_GETLINE_INCLUDED_ -#define SAFE_GETLINE_INCLUDED_ - -#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM, _FILE) { \ - _IS.getline(_LINE, _SIZE, _DELIM); \ - if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \ - if (_IS.gcount() == _SIZE-1) { \ - cerr << "Line too long! Buffer overflow. 
Delete lines >=" \ - << _SIZE << " chars or raise LINE_MAX_LENGTH in " << _FILE \ - << endl; \ - exit(1); \ - } \ - } - -#endif diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index c57cc7747..43d912b81 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -26,7 +26,6 @@ #include #include "tables-core.h" -#include "SafeGetline.h" #include "InputFileStream.h" #include "OutputFileStream.h" diff --git a/phrase-extract/consolidate-reverse-main.cpp b/phrase-extract/consolidate-reverse-main.cpp index 891773418..ce59315b9 100644 --- a/phrase-extract/consolidate-reverse-main.cpp +++ b/phrase-extract/consolidate-reverse-main.cpp @@ -27,7 +27,6 @@ #include #include "tables-core.h" -#include "SafeGetline.h" #include "InputFileStream.h" using namespace std; diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index 698599a10..fe3d99cd2 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -19,7 +19,6 @@ #include #include -#include "SafeGetline.h" #include "SentenceAlignment.h" #include "tables-core.h" #include "InputFileStream.h" diff --git a/phrase-extract/extract-ordering-main.cpp b/phrase-extract/extract-ordering-main.cpp index 78132d4fd..b418ba24d 100644 --- a/phrase-extract/extract-ordering-main.cpp +++ b/phrase-extract/extract-ordering-main.cpp @@ -19,7 +19,6 @@ #include #include -#include "SafeGetline.h" #include "SentenceAlignment.h" #include "tables-core.h" #include "InputFileStream.h" diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index 30963f32b..592946b0d 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -39,7 +39,6 @@ #include "Hole.h" #include "HoleCollection.h" #include "RuleExist.h" -#include "SafeGetline.h" #include "SentenceAlignmentWithSyntax.h" #include "SyntaxTree.h" #include "tables-core.h" diff --git 
a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index dfb5103f4..3ab6e2fd3 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -29,7 +29,6 @@ #include #include -#include "SafeGetline.h" #include "ScoreFeature.h" #include "tables-core.h" #include "ExtractionPhrasePair.h" diff --git a/phrase-extract/statistics-main.cpp b/phrase-extract/statistics-main.cpp index f1563dc05..9d814ed76 100644 --- a/phrase-extract/statistics-main.cpp +++ b/phrase-extract/statistics-main.cpp @@ -12,7 +12,6 @@ #include #include "AlignmentPhrase.h" -#include "SafeGetline.h" #include "tables-core.h" #include "InputFileStream.h" From 29d83d94b109b0b2e6fc134692d61d824656c1e6 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 8 Jun 2014 17:18:07 +0100 Subject: [PATCH 21/27] delete any mention of SAFE_GETLINE so it doesn't reappear --- phrase-extract/relax-parse-main.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index c04cae85b..e5feb94d0 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -20,8 +20,6 @@ ***********************************************************************/ #include "relax-parse.h" - -#include "SafeGetline.h" #include "tables-core.h" using namespace std; From 169c3fce383bc66ae580884bfa72d60712beffef Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Mon, 9 Jun 2014 15:24:41 +0100 Subject: [PATCH 22/27] convert CoNNL-X to Moses XML format --- scripts/training/wrappers/conll2mosesxml.py | 188 ++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100755 scripts/training/wrappers/conll2mosesxml.py diff --git a/scripts/training/wrappers/conll2mosesxml.py b/scripts/training/wrappers/conll2mosesxml.py new file mode 100755 index 000000000..d85695b16 --- /dev/null +++ b/scripts/training/wrappers/conll2mosesxml.py @@ -0,0 +1,188 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +# 
takes a file in the CoNLL dependency format (from the CoNLL-X shared task on dependency parsing; http://ilk.uvt.nl/conll/#dataformat ) +# and produces Moses XML format. Note that the structure is built based on fields 9 and 10 (projective HEAD and RELATION), +# which not all parsers produce. + +# usage: conll2mosesxml.py [--brackets] < input_file > output_file + +from __future__ import print_function, unicode_literals +import sys +import re +import codecs +from collections import namedtuple,defaultdict +from lxml import etree as ET + + +Word = namedtuple('Word', ['pos','word','lemma','tag','head','func', 'proj_head', 'proj_func']) + +def main(output_format='xml'): + sentence = [] + + for line in sys.stdin: + + # process sentence + if line == "\n": + sentence.insert(0,[]) + if is_projective(sentence): + write(sentence,output_format) + else: + sys.stderr.write(' '.join(w.word for w in sentence[1:]) + '\n') + sys.stdout.write('\n') + sentence = [] + continue + + try: + pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = line.split() + except ValueError: # word may be unicode whitespace + pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = re.split(' *\t*',line.strip()) + + word = escape_special_chars(word) + lemma = escape_special_chars(lemma) + + if proj_head == '_': + proj_head = head + proj_func = func + + sentence.append(Word(int(pos), word, lemma, tag2,int(head), func, int(proj_head), proj_func)) + + +# this script performs the same escaping as escape-special-chars.perl in Moses. 
+# most of it is done in function write(), but quotation marks need to be processed first +def escape_special_chars(line): + + line = line.replace('\'',''') # xml + line = line.replace('"','"') # xml + + return line + + +# make a check if structure is projective +def is_projective(sentence): + dominates = defaultdict(set) + for i,w in enumerate(sentence): + dominates[i].add(i) + if not i: + continue + head = int(w.proj_head) + while head != 0: + if i in dominates[head]: + break + dominates[head].add(i) + head = int(sentence[head].proj_head) + + for i in dominates: + dependents = dominates[i] + if max(dependents) - min(dependents) != len(dependents)-1: + sys.stderr.write("error: non-projective structure.\n") + return False + return True + + +def write(sentence, output_format='xml'): + + if output_format == 'xml': + tree = create_subtree(0,sentence) + out = ET.tostring(tree, encoding = 'UTF-8').decode('UTF-8') + + if output_format == 'brackets': + out = create_brackets(0,sentence) + + out = out.replace('|','|') # factor separator + out = out.replace('[','[') # syntax non-terminal + out = out.replace(']',']') # syntax non-terminal + + out = out.replace('&apos;',''') # lxml is buggy if input is escaped + out = out.replace('&quot;','"') # lxml is buggy if input is escaped + + print(out) + +# write node in Moses XML format +def create_subtree(position, sentence): + + element = ET.Element('tree') + + if position: + element.set('label', sentence[position].proj_func) + else: + element.set('label', 'sent') + + for i in range(1,position): + if sentence[i].proj_head == position: + element.append(create_subtree(i, sentence)) + + if position: + + if preterminals: + head = ET.Element('tree') + head.set('label', sentence[position].tag) + head.text = sentence[position].word + element.append(head) + + else: + if len(element): + element[-1].tail = sentence[position].word + else: + element.text = sentence[position].word + + for i in range(position, len(sentence)): + if i and 
sentence[i].proj_head == position: + element.append(create_subtree(i, sentence)) + + return element + + +# write node in bracket format (Penn treebank style) +def create_brackets(position, sentence): + + if position: + element = "( " + sentence[position].proj_func + ' ' + else: + element = "( sent " + + for i in range(1,position): + if sentence[i].proj_head == position: + element += create_brackets(i, sentence) + + if position: + word = sentence[position].word + if word == ')': + word = 'RBR' + elif word == '(': + word = 'LBR' + + tag = sentence[position].tag + if tag == '$(': + tag = '$BR' + + if preterminals: + element += '( ' + tag + ' ' + word + ' ) ' + else: + element += word + ' ) ' + + for i in range(position, len(sentence)): + if i and sentence[i].proj_head == position: + element += create_brackets(i, sentence) + + if preterminals or not position: + element += ') ' + + return element + +if __name__ == '__main__': + if sys.version_info < (3,0,0): + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + + if '--no_preterminals' in sys.argv: + preterminals = False + else: + preterminals = True + + if '--brackets' in sys.argv: + main('brackets') + else: + main('xml') From 8edb3444925a2af26297189adb46d1a9aabe855d Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 10 Jun 2014 10:16:17 +0100 Subject: [PATCH 23/27] =?UTF-8?q?minor=20const=C2=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- moses/PP/PhraseProperty.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/PP/PhraseProperty.h b/moses/PP/PhraseProperty.h index b977787b2..a4353e634 100644 --- a/moses/PP/PhraseProperty.h +++ b/moses/PP/PhraseProperty.h @@ -15,7 +15,7 @@ public: virtual void ProcessValue() {}; - const std::string &GetValueString() { return m_value; }; + const std::string &GetValueString() const { return m_value; }; 
protected: From d1554c4cdce78dd630b3242799f7a5d24f2268cc Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Tue, 10 Jun 2014 16:28:46 +0100 Subject: [PATCH 24/27] Add moses speedtesting framework and readmes. --- contrib/moses-speedtest/README.md | 122 ++++++++ .../moses-speedtest/check_for_regression.py | 63 ++++ contrib/moses-speedtest/cronjob | 7 + contrib/moses-speedtest/helpers/README.md | 5 + .../helpers/sys_drop_caches.py | 22 ++ contrib/moses-speedtest/html/README.md | 5 + contrib/moses-speedtest/html/index.html | 32 ++ contrib/moses-speedtest/html/style.css | 21 ++ contrib/moses-speedtest/html_gen.py | 192 ++++++++++++ contrib/moses-speedtest/runtests.py | 293 ++++++++++++++++++ contrib/moses-speedtest/sys_drop_caches.py | 22 ++ contrib/moses-speedtest/test_config | 3 + contrib/moses-speedtest/testsuite_common.py | 54 ++++ contrib/moses-speedtest/testsuite_config | 5 + 14 files changed, 846 insertions(+) create mode 100644 contrib/moses-speedtest/README.md create mode 100644 contrib/moses-speedtest/check_for_regression.py create mode 100644 contrib/moses-speedtest/cronjob create mode 100644 contrib/moses-speedtest/helpers/README.md create mode 100644 contrib/moses-speedtest/helpers/sys_drop_caches.py create mode 100644 contrib/moses-speedtest/html/README.md create mode 100644 contrib/moses-speedtest/html/index.html create mode 100644 contrib/moses-speedtest/html/style.css create mode 100644 contrib/moses-speedtest/html_gen.py create mode 100644 contrib/moses-speedtest/runtests.py create mode 100644 contrib/moses-speedtest/sys_drop_caches.py create mode 100644 contrib/moses-speedtest/test_config create mode 100644 contrib/moses-speedtest/testsuite_common.py create mode 100644 contrib/moses-speedtest/testsuite_config diff --git a/contrib/moses-speedtest/README.md b/contrib/moses-speedtest/README.md new file mode 100644 index 000000000..c95c6a400 --- /dev/null +++ b/contrib/moses-speedtest/README.md @@ -0,0 +1,122 @@ +# Moses speedtesting framework + +### 
Description + +This is an automatic test framework that is designed to test the day to day performance changes in Moses. + +### Set up + +#### Set up a Moses repo +Set up a Moses repo and build it with the desired configuration. +```bash +git clone https://github.com/moses-smt/mosesdecoder.git +cd mosesdecoder +./bjam -j10 --with-cmph=/usr/include/ +``` +You need to build Moses first, so that the testsuite knows what command you want it to use when rebuilding against newer revisions. + +#### Create a parent directory. +Create a parent directory where the **runtests.py** and related scripts and configuration file should reside. +This should also be the location of the TEST_DIR and TEST_LOG_DIR as explained in the next section. + +#### Set up a global configuration file. +You need a configuration file for the testsuite. A sample configuration file is provided in **testsuite\_config** +
+MOSES_REPO_PATH: /home/moses-speedtest/moses-standard/mosesdecoder
+DROP_CACHES_COMM: sys_drop_caches 3
+TEST_DIR: /home/moses-speedtest/phrase_tables/tests
+TEST_LOG_DIR: /home/moses-speedtest/phrase_tables/testlogs
+BASEBRANCH: RELEASE-2.1.1
+
+ +The _MOSES\_REPO\_PATH_ is the place where you have set up and built Moses. +The _DROP\_CACHES\_COMM_ is the command that will be used to drop caches. It should run without needing root access. +_TEST\_DIR_ is the directory where all the tests will reside. +_TEST\_LOG\_DIR_ is the directory where the performance logs will be gathered. It should be created before running the testsuite for the first time. +_BASEBRANCH_ is the branch against which all new tests will be compared. It should normally be set to the latest Moses stable release. + +### Creating tests + +In order to create a test one should go into the TEST_DIR and create a new folder. The name of that folder will be used as the name of the test. +Inside that folder one should place a configuration file named **config**. The naming is mandatory. +An example of such a configuration file is **test\_config** + +
+Command: moses -f ... -i fff #Looks for the command in the /bin directory of the repo specified in the testsuite_config
+LDPRE: ldpreloads #Comma separated list of libraries to LD_PRELOAD
+Variants: vanilla, cached, ldpre #Can't have cached without ldpre or vanilla
+
+ +The _Command:_ line specifies the executable (which is looked up in the /bin directory of the repo) and any arguments necessary. Before running the test, the script cds to the current test directory so you can use relative paths. +The _LDPRE:_ line specifies whether tests should be run with any LD\_PRELOAD flags. +The _Variants:_ line specifies which types of tests should be run. This particular line will run the following tests: +1. A vanilla test, meaning that just the command after _Command_ will be issued. +2. A vanilla cached test, meaning that after the vanilla test, the test will be run again without dropping caches in order to benchmark performance on a cached filesystem. +3. A test with LD_PRELOAD ldpreloads moses -f command, run once for each library in the comma separated LDPRE list. +4. A cached version of all LD_PRELOAD tests. + +### Running tests. +Running the tests is done through the **runtests.py** script. + +#### Running all tests. +To run all tests with the base branch and the latest revision (and generate new basebranch test data if it is missing), run: +```bash +python3 runtests.py -c testsuite_config +``` + +#### Running specific tests. +The script allows the user to manually run a particular test or to test against a specific branch or revision: +
+moses-speedtest@crom:~/phrase_tables$ python3 runtests.py --help
+usage: runtests.py [-h] -c CONFIGFILE [-s SINGLETESTDIR] [-r REVISION]
+                   [-b BRANCH]
+
+A python based speedtest suite for moses.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -c CONFIGFILE, --configfile CONFIGFILE
+                        Specify test config file
+  -s SINGLETESTDIR, --singletest SINGLETESTDIR
+                        Single test name directory. Specify directory name,
+                        not full path!
+  -r REVISION, --revision REVISION
+                        Specify a specific revison for the test.
+  -b BRANCH, --branch BRANCH
+                        Specify a branch for the test.
+
+ +### Generating HTML report. +To generate a summary of the test results use the **html\_gen.py** script. It places a file named *index.html* in the current script directory. +```bash +python3 html_gen.py testsuite_config +``` +You should use the generated file with the **style.css** file provided in the html directory. + +### Command line regression testing. +Alternatively you could check for regressions from the command line using the **check\_for\_regression.py** script: +```bash +python3 check_for_regression.py TESTLOGS_DIRECTORY +``` + +In addition, the results of all tests are logged inside the specified TESTLOGS directory, so you can manually check them for additional information such as date, time, revision, branch, etc. + +### Create a cron job: +Create a cron job to run the tests daily and generate an html report. An example *cronjob* is available. +```bash +#!/bin/sh +cd /home/moses-speedtest/phrase_tables + +python3 runtests.py -c testsuite_config #Run the tests. +python3 html_gen.py testsuite_config #Generate html + +cp index.html /fs/thor4/html/www/speed-test/ #Update the html +``` + +Place the script in _/etc/cron.daily_ for daily testing. + +###### Author +Nikolay Bogoychev, 2014 + +###### License +This software is licensed under the LGPL. \ No newline at end of file diff --git a/contrib/moses-speedtest/check_for_regression.py b/contrib/moses-speedtest/check_for_regression.py new file mode 100644 index 000000000..1e269c0c6 --- /dev/null +++ b/contrib/moses-speedtest/check_for_regression.py @@ -0,0 +1,63 @@ +"""Checks if any of the latests tests has performed considerably different than + the previous ones. 
Takes the log directory as an argument.""" +import os +import sys +from testsuite_common import Result, processLogLine, bcolors, getLastTwoLines + +LOGDIR = sys.argv[1] #Get the log directory as an argument +PERCENTAGE = 5 #Default value for how much a test shoudl change +if len(sys.argv) == 3: + PERCENTAGE = float(sys.argv[2]) #Default is 5%, but we can specify more + #line parameter + +def printResults(regressed, better, unchanged, firsttime): + """Pretty print the results in different colours""" + if regressed != []: + for item in regressed: + print(bcolors.RED + "REGRESSION! " + item.testname + " Was: "\ + + str(item.previous) + " Is: " + str(item.current) + " Change: "\ + + str(abs(item.percentage)) + "%. Revision: " + item.revision\ + + bcolors.ENDC) + print('\n') + if unchanged != []: + for item in unchanged: + print(bcolors.BLUE + "UNCHANGED: " + item.testname + " Revision: " +\ + item.revision + bcolors.ENDC) + print('\n') + if better != []: + for item in better: + print(bcolors.GREEN + "IMPROVEMENT! " + item.testname + " Was: "\ + + str(item.previous) + " Is: " + str(item.current) + " Change: "\ + + str(abs(item.percentage)) + "%. Revision: " + item.revision\ + + bcolors.ENDC) + if firsttime != []: + for item in firsttime: + print(bcolors.PURPLE + "First time test! " + item.testname +\ + " Took: " + str(item.real) + " seconds. Revision: " +\ + item.revision + bcolors.ENDC) + + +all_files = os.listdir(LOGDIR) +regressed = [] +better = [] +unchanged = [] +firsttime = [] + +#Go through all log files and find which tests have performed better. 
+for logfile in all_files: + (line1, line2) = getLastTwoLines(logfile, LOGDIR) + log1 = processLogLine(line1) + if line2 == '\n': # Empty line, only one test ever run + firsttime.append(log1) + continue + log2 = processLogLine(line2) + res = Result(log1.testname, log1.real, log2.real, log2.revision,\ + log2.branch, log1.revision, log1.branch) + if res.percentage < -PERCENTAGE: + regressed.append(res) + elif res.change > PERCENTAGE: + better.append(res) + else: + unchanged.append(res) + +printResults(regressed, better, unchanged, firsttime) diff --git a/contrib/moses-speedtest/cronjob b/contrib/moses-speedtest/cronjob new file mode 100644 index 000000000..4f7183a48 --- /dev/null +++ b/contrib/moses-speedtest/cronjob @@ -0,0 +1,7 @@ +#!/bin/sh +cd /home/moses-speedtest/phrase_tables + +python3 runtests.py -c testsuite_config #Run the tests. +python3 html_gen.py testsuite_config #Generate html + +cp index.html /fs/thor4/html/www/speed-test/ #Update the html \ No newline at end of file diff --git a/contrib/moses-speedtest/helpers/README.md b/contrib/moses-speedtest/helpers/README.md new file mode 100644 index 000000000..87efbc78f --- /dev/null +++ b/contrib/moses-speedtest/helpers/README.md @@ -0,0 +1,5 @@ +###Helpers + +This is a python script that basically gives you the equivalent of: +```echo 3 > /proc/sys/vm/drop_caches``` +You need to set it up so it is executed with root access without needing a password so that the tests can be automated. 
\ No newline at end of file diff --git a/contrib/moses-speedtest/helpers/sys_drop_caches.py b/contrib/moses-speedtest/helpers/sys_drop_caches.py new file mode 100644 index 000000000..d4796e090 --- /dev/null +++ b/contrib/moses-speedtest/helpers/sys_drop_caches.py @@ -0,0 +1,22 @@ +#!/usr/bin/spython +from sys import argv, stderr, exit +from os import linesep as ls +procfile = "/proc/sys/vm/drop_caches" +options = ["1","2","3"] +flush_type = None +try: + flush_type = argv[1][0:1] + if not flush_type in options: + raise IndexError, "not in options" + with open(procfile, "w") as f: + f.write("%s%s" % (flush_type,ls)) + exit(0) +except IndexError, e: + stderr.write("Argument %s required.%s" % (options, ls)) +except IOError, e: + stderr.write("Error writing to file.%s" % ls) +except StandardError, e: + stderr.write("Unknown Error.%s" % ls) + +exit(1) + diff --git a/contrib/moses-speedtest/html/README.md b/contrib/moses-speedtest/html/README.md new file mode 100644 index 000000000..342a8cedf --- /dev/null +++ b/contrib/moses-speedtest/html/README.md @@ -0,0 +1,5 @@ +###HTML files. + +_index.html_ is a sample generated file by this testsuite. + +_style.css_ should be placed in the html directory in which _index.html_ will be placed in order to visualize the test results in a browser. diff --git a/contrib/moses-speedtest/html/index.html b/contrib/moses-speedtest/html/index.html new file mode 100644 index 000000000..fc75b1028 --- /dev/null +++ b/contrib/moses-speedtest/html/index.html @@ -0,0 +1,32 @@ + + +Moses speed testing +Basebranch: RELEASE-2.1 Revision: c977ca2f434ed6f12a352806c088061c492b1676 + + + + + + + + + + + + + + + + + + + + + + + + + + + +
DateTimeTestnameRevisionBranchTimePrevtimePrevrevChange (%)Time (Basebranch)Change (%, Basebranch)Time (Days -2)Change (%, Days -2)Time (Days -3)Change (%, Days -3)Time (Days -4)Change (%, Days -4)Time (Days -5)Change (%, Days -5)Time (Days -6)Change (%, Days -6)Time (Days -7)Change (%, Days -7)Time (Days -14)Change (%, Days -14)Time (Years -1)Change (%, Years -1)
10.06.201410:27:57ondisk_minreord_vanilla169c3fce383bc66ae580884bfa72d60712beffefmaster21.3621.49169c3fce383bc66ae580884bfa72d60712beffef0.00625.890.1699N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:29:38minpt_reord_vanilla_cached169c3fce383bc66ae580884bfa72d60712beffefmaster9.739.52169c3fce383bc66ae580884bfa72d60712beffef-0.022112.20.2197N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:22:32ondisk_hierarchical_vanilla_cached169c3fce383bc66ae580884bfa72d60712beffefmaster25.7325.77169c3fce383bc66ae580884bfa72d60712beffef0.001633.630.2337N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:22:06ondisk_hierarchical_vanilla169c3fce383bc66ae580884bfa72d60712beffefmaster83.282.6169c3fce383bc66ae580884bfa72d60712beffef-0.0073127.590.3526N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:28:57binary_reord_vanilla169c3fce383bc66ae580884bfa72d60712beffefmaster24.5424.85169c3fce383bc66ae580884bfa72d60712beffef0.012529.090.1458N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:28:08ondisk_minreord_vanilla_cached169c3fce383bc66ae580884bfa72d60712beffefmaster10.7110.54169c3fce383bc66ae580884bfa72d60712beffef-0.016114.820.2888N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:30:00binary_minreord_vanilla169c3fce383bc66ae580884bfa72d60712beffefmaster20.8220.77169c3fce383bc66ae580884bfa72d60712beffef-0.002425.770.194N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:27:35score.hiero_vanilla_cached169c3fce383bc66ae580884bfa72d60712beffefmaster131.37130.63169c3fce383bc66ae580884bfa72d60712beffef-0.0057141.850.0791N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:29:10binary_reord_vanilla_cached169c3fce383bc66ae580884bfa72d60712beffefmaster13.4113.4169c3fce383bc66ae580884bfa72d60712beffef-0.000718.120.2605N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:29:28minpt_reord_vanilla169c3fce383bc66ae580884bfa72d60712beffefmaster17.4617.37169c3fce383bc66ae580884bfa72d60712beffef-0.005220.00.1315N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:28:22minpt_minreord_vanilla169c3fce383bc66ae580884bfa72d60712beffefmaster13.7513.56169c3fce383bc66ae580884bfa72d60712beffef-0.01417.190.2112N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:22:59ondisk_reord_vanilla169c3fce383bc66ae580884bfa72d60712beffefmaster25.2825.0169c3fce383bc66ae580884bfa72d60712beffef-0.011229.110.1412N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:28:31minpt_minreord_vanilla_cached169c3fce383bc66ae580884bfa72d60712beffefmaster8.638.6169c3fce383bc66ae580884bfa72d60712beffef-0.003511.780.2699N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:23:10ondisk_reord_vanilla_cached169c3fce383bc66ae580884bfa72d60712beffefmaster11.5711.59169c3fce383bc66ae580884bfa72d60712beffef0.001715.40.2474N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:25:24score.hiero_vanilla169c3fce383bc66ae580884bfa72d60712beffefmaster132.33130.02169c3fce383bc66ae580884bfa72d60712beffef-0.0178141.350.0802N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
10.06.201410:30:12binary_minreord_vanilla_cached169c3fce383bc66ae580884bfa72d60712beffefmaster12.4712.61169c3fce383bc66ae580884bfa72d60712beffef0.011117.890.2951N/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/AN/A
diff --git a/contrib/moses-speedtest/html/style.css b/contrib/moses-speedtest/html/style.css new file mode 100644 index 000000000..16221f91f --- /dev/null +++ b/contrib/moses-speedtest/html/style.css @@ -0,0 +1,21 @@ +table,th,td +{ +border:1px solid black; + border-collapse:collapse +} + +tr:nth-child(odd) { + background-color: Gainsboro; +} + +.better { + color: Green; +} + +.worse { + color: Red; +} + +.unchanged { + color: SkyBlue; +} \ No newline at end of file diff --git a/contrib/moses-speedtest/html_gen.py b/contrib/moses-speedtest/html_gen.py new file mode 100644 index 000000000..4564b9200 --- /dev/null +++ b/contrib/moses-speedtest/html_gen.py @@ -0,0 +1,192 @@ +"""Generates HTML page containing the testresults""" +from testsuite_common import Result, processLogLine, getLastTwoLines +from runtests import parse_testconfig +import os +import sys + +from datetime import datetime, timedelta + +HTML_HEADING = """ + +Moses speed testing +""" +HTML_ENDING = "\n" + +TABLE_HEADING = """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + """ + +def get_prev_days(date, numdays): + """Gets the date numdays previous days so that we could search for + that test in the config file""" + date_obj = datetime.strptime(date, '%d.%m.%Y').date() + past_date = date_obj - timedelta(days=numdays) + return past_date.strftime('%d.%m.%Y') + +def gather_necessary_lines(logfile, date): + """Gathers the necessary lines corresponding to past dates + and parses them if they exist""" + #Get a dictionary of dates + dates = {} + dates[get_prev_days(date, 2)] = ('-2', None) + dates[get_prev_days(date, 3)] = ('-3', None) + dates[get_prev_days(date, 4)] = ('-4', None) + dates[get_prev_days(date, 5)] = ('-5', None) + dates[get_prev_days(date, 6)] = ('-6', None) + dates[get_prev_days(date, 7)] = ('-7', None) + dates[get_prev_days(date, 14)] = ('-14', None) + dates[get_prev_days(date, 365)] = ('-365', None) + + openfile = open(logfile, 'r') + for line in openfile: + if line.split()[0] in 
dates.keys(): + day = dates[line.split()[0]][0] + dates[line.split()[0]] = (day, processLogLine(line)) + openfile.close() + return dates + +def append_date_to_table(resline): + """Appends past dates to the html""" + cur_html = '' + + if resline.percentage > 0.05: #If we have improvement of more than 5% + cur_html = cur_html + '' + elif resline.percentage < -0.05: #We have a regression of more than 5% + cur_html = cur_html + '' + else: + cur_html = cur_html + '' + return cur_html + +def compare_rev(filename, rev1, rev2, branch1=False, branch2=False): + """Compare the test results of two lines. We can specify either a + revision or a branch for comparison. The first rev should be the + base version and the second revision should be the later version""" + + #In the log file the index of the revision is 2 but the index of + #the branch is 12. Alternate those depending on whether we are looking + #for a specific revision or branch. + firstidx = 2 + secondidx = 2 + if branch1 == True: + firstidx = 12 + if branch2 == True: + secondidx = 12 + + rev1line = '' + rev2line = '' + resfile = open(filename, 'r') + for line in resfile: + if rev1 == line.split()[firstidx]: + rev1line = line + elif rev2 == line.split()[secondidx]: + rev2line = line + if rev1line != '' and rev2line != '': + break + resfile.close() + if rev1line == '': + raise ValueError('Revision ' + rev1 + " was not found!") + if rev2line == '': + raise ValueError('Revision ' + rev2 + " was not found!") + + logLine1 = processLogLine(rev1line) + logLine2 = processLogLine(rev2line) + res = Result(logLine1.testname, logLine1.real, logLine2.real,\ + logLine2.revision, logLine2.branch, logLine1.revision, logLine1.branch) + + return res + +def produce_html(path, global_config): + """Produces html file for the report.""" + html = '' #The table HTML + for filenam in os.listdir(global_config.testlogs): + #Generate html for the newest two lines + #Get the lines from the config file + (ll1, ll2) = getLastTwoLines(filenam, 
global_config.testlogs) + logLine1 = processLogLine(ll1) + logLine2 = processLogLine(ll2) + + #Generate html + res1 = Result(logLine1.testname, logLine1.real, logLine2.real,\ + logLine2.revision, logLine2.branch, logLine1.revision, logLine1.branch) + html = html + '' + + #Add fancy colours depending on the change + if res1.percentage > 0.05: #If we have improvement of more than 5% + html = html + '' + elif res1.percentage < -0.05: #We have a regression of more than 5% + html = html + '' + else: + html = html + '' + + #Get comparison against the base version + filenam = global_config.testlogs + '/' + filenam #Get proper directory + res2 = compare_rev(filenam, global_config.basebranch, res1.revision, branch1=True) + html = html + '' + + #Add fancy colours depending on the change + if res2.percentage > 0.05: #If we have improvement of more than 5% + html = html + '' + elif res2.percentage < -0.05: #We have a regression of more than 5% + html = html + '' + else: + html = html + '' + + #Add extra dates comparison dating from the beginning of time if they exist + past_dates = list(range(2, 8)) + past_dates.append(14) + past_dates.append(365) # Get the 1 year ago day + linesdict = gather_necessary_lines(filenam, logLine2.date) + + for days in past_dates: + act_date = get_prev_days(logLine2.date, days) + if linesdict[act_date][1] is not None: + logline_date = linesdict[act_date][0] + restemp = Result(logline_date.testname, logline_date.real, logLine2.real,\ + logLine2.revision, logLine2.branch, logline_date.revision, logline_date.branch) + html = html + append_date_to_table(restemp) + else: + html = html + '' + + + + html = html + '' #End row + + #Write out the file + basebranch_info = 'Basebranch: ' + res2.prevbranch + ' Revision: ' +\ + res2.prevrev + '' + writeoutstr = HTML_HEADING + basebranch_info + TABLE_HEADING + html + HTML_ENDING + writefile = open(path, 'w') + writefile.write(writeoutstr) + writefile.close() + +if __name__ == '__main__': + CONFIG = 
parse_testconfig(sys.argv[1]) + produce_html('index.html', CONFIG) diff --git a/contrib/moses-speedtest/runtests.py b/contrib/moses-speedtest/runtests.py new file mode 100644 index 000000000..0978c8ef2 --- /dev/null +++ b/contrib/moses-speedtest/runtests.py @@ -0,0 +1,293 @@ +"""Given a config file, runs tests""" +import os +import subprocess +import time +from argparse import ArgumentParser +from testsuite_common import processLogLine + +def parse_cmd(): + """Parse the command line arguments""" + description = "A python based speedtest suite for moses." + parser = ArgumentParser(description=description) + parser.add_argument("-c", "--configfile", action="store",\ + dest="configfile", required=True,\ + help="Specify test config file") + parser.add_argument("-s", "--singletest", action="store",\ + dest="singletestdir", default=None,\ + help="Single test name directory. Specify directory name,\ + not full path!") + parser.add_argument("-r", "--revision", action="store",\ + dest="revision", default=None,\ + help="Specify a specific revison for the test.") + parser.add_argument("-b", "--branch", action="store",\ + dest="branch", default=None,\ + help="Specify a branch for the test.") + + arguments = parser.parse_args() + return arguments + +def repoinit(testconfig): + """Determines revision and sets up the repo.""" + revision = '' + #Update the repo + os.chdir(testconfig.repo) + #Checkout specific branch, else maintain main branch + if testconfig.branch != 'master': + subprocess.call(['git', 'checkout', testconfig.branch]) + rev, _ = subprocess.Popen(['git', 'rev-parse', 'HEAD'],\ + stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() + revision = str(rev).replace("\\n'", '').replace("b'", '') + else: + subprocess.call(['git checkout master'], shell=True) + + #Check a specific revision. Else checkout master. 
+ if testconfig.revision: + subprocess.call(['git', 'checkout', testconfig.revision]) + revision = testconfig.revision + elif testconfig.branch == 'master': + subprocess.call(['git pull'], shell=True) + rev, _ = subprocess.Popen(['git rev-parse HEAD'], stdout=subprocess.PIPE,\ + stderr=subprocess.PIPE, shell=True).communicate() + revision = str(rev).replace("\\n'", '').replace("b'", '') + + return revision + +class Configuration: + """A simple class to hold all of the configuration constatns""" + def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev): + self.repo = repo + self.drop_caches = drop_caches + self.tests = tests + self.testlogs = testlogs + self.basebranch = basebranch + self.baserev = baserev + self.singletest = None + self.revision = None + self.branch = 'master' # Default branch + + def additional_args(self, singletest, revision, branch): + """Additional configuration from command line arguments""" + self.singletest = singletest + if revision is not None: + self.revision = revision + if branch is not None: + self.branch = branch + + def set_revision(self, revision): + """Sets the current revision that is being tested""" + self.revision = revision + + +class Test: + """A simple class to contain all information about tests""" + def __init__(self, name, command, ldopts, permutations): + self.name = name + self.command = command + self.ldopts = ldopts.replace(' ', '').split(',') #Not tested yet + self.permutations = permutations + +def parse_configfile(conffile, testdir, moses_repo): + """Parses the config file""" + command, ldopts = '', '' + permutations = [] + fileopen = open(conffile, 'r') + for line in fileopen: + line = line.split('#')[0] # Discard comments + if line == '' or line == '\n': + continue # Discard lines with comments only and empty lines + opt, args = line.split(' ', 1) # Get arguments + + if opt == 'Command:': + command = args.replace('\n', '') + command = moses_repo + '/bin/' + command + elif opt == 'LDPRE:': + 
ldopts = args.replace('\n', '') + elif opt == 'Variants:': + permutations = args.replace('\n', '').replace(' ', '').split(',') + else: + raise ValueError('Unrecognized option ' + opt) + #We use the testdir as the name. + testcase = Test(testdir, command, ldopts, permutations) + fileopen.close() + return testcase + +def parse_testconfig(conffile): + """Parses the config file for the whole testsuite.""" + repo_path, drop_caches, tests_dir, testlog_dir = '', '', '', '' + basebranch, baserev = '', '' + fileopen = open(conffile, 'r') + for line in fileopen: + line = line.split('#')[0] # Discard comments + if line == '' or line == '\n': + continue # Discard lines with comments only and empty lines + opt, args = line.split(' ', 1) # Get arguments + if opt == 'MOSES_REPO_PATH:': + repo_path = args.replace('\n', '') + elif opt == 'DROP_CACHES_COMM:': + drop_caches = args.replace('\n', '') + elif opt == 'TEST_DIR:': + tests_dir = args.replace('\n', '') + elif opt == 'TEST_LOG_DIR:': + testlog_dir = args.replace('\n', '') + elif opt == 'BASEBRANCH:': + basebranch = args.replace('\n', '') + elif opt == 'BASEREV:': + baserev = args.replace('\n', '') + else: + raise ValueError('Unrecognized option ' + opt) + config = Configuration(repo_path, drop_caches, tests_dir, testlog_dir,\ + basebranch, baserev) + fileopen.close() + return config + +def get_config(): + """Builds the config object with all necessary attributes""" + args = parse_cmd() + config = parse_testconfig(args.configfile) + config.additional_args(args.singletestdir, args.revision, args.branch) + revision = repoinit(config) + config.set_revision(revision) + return config + +def check_for_basever(testlogfile, basebranch): + """Checks if the base revision is present in the testlogs""" + filetoopen = open(testlogfile, 'r') + for line in filetoopen: + templine = processLogLine(line) + if templine.branch == basebranch: + return True + return False + +def split_time(filename): + """Splits the output of the time function into 
seperate parts. + We will write time to file, because many programs output to + stderr which makes it difficult to get only the exact results we need.""" + timefile = open(filename, 'r') + realtime = float(timefile.readline().replace('\n', '').split()[1]) + usertime = float(timefile.readline().replace('\n', '').split()[1]) + systime = float(timefile.readline().replace('\n', '').split()[1]) + timefile.close() + + return (realtime, usertime, systime) + + +def write_log(time_file, logname, config): + """Writes to a logfile""" + log_write = open(config.testlogs + '/' + logname, 'a') # Open logfile + date_run = time.strftime("%d.%m.%Y %H:%M:%S") # Get the time of the test + realtime, usertime, systime = split_time(time_file) # Get the times in a nice form + + # Append everything to a log file. + writestr = date_run + " " + config.revision + " Testname: " + logname +\ + " RealTime: " + str(realtime) + " UserTime: " + str(usertime) +\ + " SystemTime: " + str(systime) + " Branch: " + config.branch +'\n' + log_write.write(writestr) + log_write.close() + + +def execute_tests(testcase, cur_directory, config): + """Executes timed tests based on the config file""" + #Figure out the order of which tests must be executed. 
+ #Change to the current test directory + os.chdir(config.tests + '/' + cur_directory) + #Clear caches + subprocess.call(['sync'], shell=True) + subprocess.call([config.drop_caches], shell=True) + #Perform vanilla test and if a cached test exists - as well + print(testcase.name) + if 'vanilla' in testcase.permutations: + print(testcase.command) + subprocess.Popen(['time -p -o /tmp/time_moses_tests ' + testcase.command], stdout=None,\ + stderr=subprocess.PIPE, shell=True).communicate() + write_log('/tmp/time_moses_tests', testcase.name + '_vanilla', config) + if 'cached' in testcase.permutations: + subprocess.Popen(['time -p -o /tmp/time_moses_tests ' + testcase.command], stdout=None,\ + stderr=None, shell=True).communicate() + write_log('/tmp/time_moses_tests', testcase.name + '_vanilla_cached', config) + + #Now perform LD_PRELOAD tests + if 'ldpre' in testcase.permutations: + for opt in testcase.ldopts: + #Clear caches + subprocess.call(['sync'], shell=True) + subprocess.call([config.drop_caches], shell=True) + + #test + subprocess.Popen(['LD_PRELOAD ' + opt + ' time -p -o /tmp/time_moses_tests ' + testcase.command], stdout=None,\ + stderr=None, shell=True).communicate() + write_log('/tmp/time_moses_tests', testcase.name + '_ldpre_' + opt, config) + if 'cached' in testcase.permutations: + subprocess.Popen(['LD_PRELOAD ' + opt + ' time -p -o /tmp/time_moses_tests ' + testcase.command], stdout=None,\ + stderr=None, shell=True).communicate() + write_log('/tmp/time_moses_tests', testcase.name + '_ldpre_' +opt +'_cached', config) + +# Go through all the test directories and executes tests +if __name__ == '__main__': + CONFIG = get_config() + ALL_DIR = os.listdir(CONFIG.tests) + + #We should first check if any of the tests is run for the first time. 
+ #If some of them are run for the first time we should first get their + #time with the base version (usually the previous release) + FIRSTTIME = [] + TESTLOGS = [] + #Strip filenames of test underscores + for listline in os.listdir(CONFIG.testlogs): + listline = listline.replace('_vanilla', '') + listline = listline.replace('_cached', '') + listline = listline.replace('_ldpre', '') + TESTLOGS.append(listline) + for directory in ALL_DIR: + if directory not in TESTLOGS: + FIRSTTIME.append(directory) + + #Sometimes even though we have the log files, we will need to rerun them + #Against a base version, because we require a different baseversion (for + #example when a new version of Moses is released.) Therefore we should + #Check if the version of Moses that we have as a base version is in all + #of the log files. + + for logfile in os.listdir(CONFIG.testlogs): + logfile_name = CONFIG.testlogs + '/' + logfile + if not check_for_basever(logfile_name, CONFIG.basebranch): + logfile = logfile.replace('_vanilla', '') + logfile = logfile.replace('_cached', '') + logfile = logfile.replace('_ldpre', '') + FIRSTTIME.append(logfile) + FIRSTTIME = list(set(FIRSTTIME)) #Deduplicate + + if FIRSTTIME != []: + #Create a new configuration for base version tests: + BASECONFIG = Configuration(CONFIG.repo, CONFIG.drop_caches,\ + CONFIG.tests, CONFIG.testlogs, CONFIG.basebranch,\ + CONFIG.baserev) + BASECONFIG.additional_args(None, CONFIG.baserev, CONFIG.basebranch) + #Set up the repository and get its revision: + REVISION = repoinit(BASECONFIG) + BASECONFIG.set_revision(REVISION) + #Build + os.chdir(BASECONFIG.repo) + subprocess.call(['./previous.sh'], shell=True) + + #Perform tests + for directory in FIRSTTIME: + cur_testcase = parse_configfile(BASECONFIG.tests + '/' + directory +\ + '/config', directory, BASECONFIG.repo) + execute_tests(cur_testcase, directory, BASECONFIG) + + #Reset back the repository to the normal configuration + repoinit(CONFIG) + + #Builds moses + 
os.chdir(CONFIG.repo) + subprocess.call(['./previous.sh'], shell=True) + + if CONFIG.singletest: + TESTCASE = parse_configfile(CONFIG.tests + '/' +\ + CONFIG.singletest + '/config', CONFIG.singletest, CONFIG.repo) + execute_tests(TESTCASE, CONFIG.singletest, CONFIG) + else: + for directory in ALL_DIR: + cur_testcase = parse_configfile(CONFIG.tests + '/' + directory +\ + '/config', directory, CONFIG.repo) + execute_tests(cur_testcase, directory, CONFIG) diff --git a/contrib/moses-speedtest/sys_drop_caches.py b/contrib/moses-speedtest/sys_drop_caches.py new file mode 100644 index 000000000..d4796e090 --- /dev/null +++ b/contrib/moses-speedtest/sys_drop_caches.py @@ -0,0 +1,22 @@ +#!/usr/bin/spython +from sys import argv, stderr, exit +from os import linesep as ls +procfile = "/proc/sys/vm/drop_caches" +options = ["1","2","3"] +flush_type = None +try: + flush_type = argv[1][0:1] + if not flush_type in options: + raise IndexError, "not in options" + with open(procfile, "w") as f: + f.write("%s%s" % (flush_type,ls)) + exit(0) +except IndexError, e: + stderr.write("Argument %s required.%s" % (options, ls)) +except IOError, e: + stderr.write("Error writing to file.%s" % ls) +except StandardError, e: + stderr.write("Unknown Error.%s" % ls) + +exit(1) + diff --git a/contrib/moses-speedtest/test_config b/contrib/moses-speedtest/test_config new file mode 100644 index 000000000..4a480f496 --- /dev/null +++ b/contrib/moses-speedtest/test_config @@ -0,0 +1,3 @@ +Command: moses -f ... 
-i fff #Looks for the command in the /bin directory of the repo specified in the testsuite_config +LDPRE: ldpreloads #Comma separated LD_LIBRARY_PATH:/, +Variants: vanilla, cached, ldpre #Can't have cached without ldpre or vanilla diff --git a/contrib/moses-speedtest/testsuite_common.py b/contrib/moses-speedtest/testsuite_common.py new file mode 100644 index 000000000..be96f98b5 --- /dev/null +++ b/contrib/moses-speedtest/testsuite_common.py @@ -0,0 +1,54 @@ +"""Common functions of the testsuitce""" +import os +#Clour constants +class bcolors: + PURPLE = '\033[95m' + BLUE = '\033[94m' + GREEN = '\033[92m' + YELLOW = '\033[93m' + RED = '\033[91m' + ENDC = '\033[0m' + +class LogLine: + """A class to contain logfile line""" + def __init__(self, date, time, revision, testname, real, user, system, branch): + self.date = date + self.time = time + self.revision = revision + self.testname = testname + self.real = real + self.system = system + self.user = user + self.branch = branch + +class Result: + """A class to contain results of benchmarking""" + def __init__(self, testname, previous, current, revision, branch, prevrev, prevbranch): + self.testname = testname + self.previous = previous + self.current = current + self.change = previous - current + self.revision = revision + self.branch = branch + self.prevbranch = prevbranch + self.prevrev = prevrev + #Produce a percentage with fewer digits + self.percentage = float(format(1 - current/previous, '.4f')) + +def processLogLine(logline): + """Parses the log line into a nice datastructure""" + logline = logline.split() + log = LogLine(logline[0], logline[1], logline[2], logline[4],\ + float(logline[6]), float(logline[8]), float(logline[10]), logline[12]) + return log + +def getLastTwoLines(filename, logdir): + """Just a call to tail to get the diff between the last two runs""" + try: + line1, line2 = os.popen("tail -n2 " + logdir + '/' + filename) + except ValueError: #Check for new tests + tempfile = open(logdir + '/' + 
filename) + line1 = tempfile.readline() + tempfile.close() + return (line1, '\n') + return (line1, line2) diff --git a/contrib/moses-speedtest/testsuite_config b/contrib/moses-speedtest/testsuite_config new file mode 100644 index 000000000..b6ad6181c --- /dev/null +++ b/contrib/moses-speedtest/testsuite_config @@ -0,0 +1,5 @@ +MOSES_REPO_PATH: /home/moses-speedtest/moses-standard/mosesdecoder +DROP_CACHES_COMM: sys_drop_caches 3 +TEST_DIR: /home/moses-speedtest/phrase_tables/tests +TEST_LOG_DIR: /home/moses-speedtest/phrase_tables/testlogs +BASEBRANCH: RELEASE-2.1.1 \ No newline at end of file From 4b98495e789b47a221fdc5061459731d32e1f194 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 10 Jun 2014 17:27:20 +0100 Subject: [PATCH 25/27] don't load data when just showing weightd --- moses/StaticData.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 0340778ed..8109f245d 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -535,7 +535,9 @@ bool StaticData::LoadData(Parameter *parameter) NoCache(); OverrideFeatures(); - LoadFeatureFunctions(); + if (!m_parameter->isParamSpecified("show-weights")) { + LoadFeatureFunctions(); + } if (!LoadDecodeGraphs()) return false; From 793aef6715862aef94756b1bc601e5f863f34438 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Wed, 11 Jun 2014 10:24:44 +0100 Subject: [PATCH 26/27] Fix small oversight --- contrib/moses-speedtest/html_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/moses-speedtest/html_gen.py b/contrib/moses-speedtest/html_gen.py index 4564b9200..c8255e312 100644 --- a/contrib/moses-speedtest/html_gen.py +++ b/contrib/moses-speedtest/html_gen.py @@ -73,7 +73,7 @@ def gather_necessary_lines(logfile, date): def append_date_to_table(resline): """Appends past dates to the html""" - cur_html = '' + cur_html = '' if resline.percentage > 0.05: #If we have improvement of more than 5% cur_html = cur_html + '' 
@@ -168,7 +168,7 @@ def produce_html(path, global_config): for days in past_dates: act_date = get_prev_days(logLine2.date, days) if linesdict[act_date][1] is not None: - logline_date = linesdict[act_date][0] + logline_date = linesdict[act_date][1] restemp = Result(logline_date.testname, logline_date.real, logLine2.real,\ logLine2.revision, logLine2.branch, logline_date.revision, logline_date.branch) html = html + append_date_to_table(restemp) From 2f752fe83347f2766f5c1badc2a70662531e9b0d Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Wed, 11 Jun 2014 10:51:31 +0100 Subject: [PATCH 27/27] Truncate revision output --- contrib/moses-speedtest/html_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/moses-speedtest/html_gen.py b/contrib/moses-speedtest/html_gen.py index c8255e312..740b7bf91 100644 --- a/contrib/moses-speedtest/html_gen.py +++ b/contrib/moses-speedtest/html_gen.py @@ -135,8 +135,8 @@ def produce_html(path, global_config): res1 = Result(logLine1.testname, logLine1.real, logLine2.real,\ logLine2.revision, logLine2.branch, logLine1.revision, logLine1.branch) html = html + '' + res1.testname + '' #Add fancy colours depending on the change if res1.percentage > 0.05: #If we have improvement of more than 5%
DateTimeTestnameRevisionBranchTimePrevtimePrevrevChange (%)Time (Basebranch)Change (%, Basebranch)Time (Days -2)Change (%, Days -2)Time (Days -3)Change (%, Days -3)Time (Days -4)Change (%, Days -4)Time (Days -5)Change (%, Days -5)Time (Days -6)Change (%, Days -6)Time (Days -7)Change (%, Days -7)Time (Days -14)Change (%, Days -14)Time (Years -1)Change (%, Years -1)
' + resline.current + '' + str(resline.percentage) + '' + str(resline.percentage) + '' + str(resline.percentage) + '
' + logLine2.date + '' + logLine2.time + '' +\ + res1.testname + '' + res1.revision + '' + res1.branch + '' +\ + str(res1.current) + '' + str(res1.previous) + '' + res1.prevrev + '' + str(res1.percentage) + '' + str(res1.percentage) + '' + str(res1.percentage) + '' + str(res2.previous) + '' + str(res2.percentage) + '' + str(res2.percentage) + '' + str(res2.percentage) + 'N/AN/A
' + resline.current + '' + str(resline.current) + '' + str(resline.percentage) + '
' + logLine2.date + '' + logLine2.time + '' +\ - res1.testname + '' + res1.revision + '' + res1.branch + '' +\ - str(res1.current) + '' + str(res1.previous) + '' + res1.prevrev + '' + res1.revision[:10] + '' + res1.branch + '' +\ + str(res1.current) + '' + str(res1.previous) + '' + res1.prevrev[:10] + '