From 8dee1725fb4f362a5d65f25ecdecafec7b08b1ae Mon Sep 17 00:00:00 2001
From: Ulrich Germann <ugermann@inf.ed.ac.uk>
Date: Tue, 3 Jun 2014 21:36:04 +0100
Subject: [PATCH 01/27] Removed Phrase penalty as a built-in feature function.

---
 moses/TranslationModel/UG/mmsapt.cpp | 20 ++++++++++----------
 moses/TranslationModel/UG/mmsapt.h   |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp
index 128dcfe80..789907321 100644
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@@ -196,8 +196,8 @@ namespace Moses
     // currently always active by default; may (should) change later
     num_feats  = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
 
-    if (this->m_numScoreComponents%2) // a bit of a hack, for backwards compatibility
-      num_feats  = apply_pp.init(num_feats);
+    // if (this->m_numScoreComponents%2) // a bit of a hack, for backwards compatibility
+    // num_feats  = apply_pp.init(num_feats);
 
     if (num_feats < this->m_numScoreComponents)
       {
@@ -283,8 +283,8 @@ namespace Moses
   {
     PhrasePair pp;   
     pp.init(pid1, stats, this->m_numScoreComponents);
-    if (this->m_numScoreComponents%2)
-      apply_pp(bt,pp);
+    // if (this->m_numScoreComponents%2)
+    // apply_pp(bt,pp);
     pstats::trg_map_t::const_iterator t;
     for (t = stats.trg.begin(); t != stats.trg.end(); ++t)
       {
@@ -318,8 +318,8 @@ namespace Moses
       pp.init(pid1b, *statsb, this->m_numScoreComponents);
     else return false; // throw "no stats for pooling available!";
 
-    if (this->m_numScoreComponents%2)
-      apply_pp(bta,pp);
+    // if (this->m_numScoreComponents%2)
+    // apply_pp(bta,pp);
     pstats::trg_map_t::const_iterator b;
     pstats::trg_map_t::iterator a;
     if (statsb)
@@ -415,8 +415,8 @@ namespace Moses
     if (statsb)
       {
 	pool.init(pid1b,*statsb,0);
-	if (this->m_numScoreComponents%2)
-	  apply_pp(btb,ppdyn);
+	// if (this->m_numScoreComponents%2)
+	// apply_pp(btb,ppdyn);
 	for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
 	  {
 	    ppdyn.update(b->first,b->second);
@@ -456,8 +456,8 @@ namespace Moses
     if (statsa)
       {
 	pool.init(pid1a,*statsa,0);
-	if (this->m_numScoreComponents%2)
-	  apply_pp(bta,ppfix);
+	// if (this->m_numScoreComponents%2)
+	// apply_pp(bta,ppfix);
 	for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
 	  {
 	    if (!a->second.valid()) continue; // done above
diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h
index 5353a1c46..e0e2d8950 100644
--- a/moses/TranslationModel/UG/mmsapt.h
+++ b/moses/TranslationModel/UG/mmsapt.h
@@ -71,7 +71,7 @@ namespace Moses
     PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn;
     PScorePbwd<Token> calc_pbwd_fix, calc_pbwd_dyn;
     PScoreLex<Token>  calc_lex; // this one I'd like to see as an external ff eventually
-    PScorePP<Token>   apply_pp; // apply phrase penalty 
+    // PScorePP<Token>   apply_pp; // apply phrase penalty 
     PScoreLogCounts<Token>   add_logcounts_fix;
     PScoreLogCounts<Token>   add_logcounts_dyn;
     void init(string const& line);

From ce853731aec10c99738291a24b695bb2c6abffd9 Mon Sep 17 00:00:00 2001
From: Ulrich Germann <ugermann@inf.ed.ac.uk>
Date: Thu, 5 Jun 2014 01:38:55 +0100
Subject: [PATCH 02/27] Added mmsapt lookup utility.

---
 Jamroot                                    |  1 +
 moses/TranslationModel/UG/lookup_mmsapt.cc | 76 ++++++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 moses/TranslationModel/UG/lookup_mmsapt.cc

diff --git a/Jamroot b/Jamroot
index 1f7ca48cd..f6ce6b8f3 100644
--- a/Jamroot
+++ b/Jamroot
@@ -145,6 +145,7 @@ build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses
 if [ option.get "with-mm" : : "yes" ]
 {
  alias mm :  
+  moses/TranslationModel/UG//lookup_mmsapt 
   moses/TranslationModel/UG/mm//mtt-build 
   moses/TranslationModel/UG/mm//mtt-dump 
   moses/TranslationModel/UG/mm//symal2mam 
diff --git a/moses/TranslationModel/UG/lookup_mmsapt.cc b/moses/TranslationModel/UG/lookup_mmsapt.cc
new file mode 100644
index 000000000..39ac23cc7
--- /dev/null
+++ b/moses/TranslationModel/UG/lookup_mmsapt.cc
@@ -0,0 +1,76 @@
+#include "mmsapt.h"
+#include <boost/foreach.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <iostream>
+
+using namespace Moses;
+using namespace bitext;
+using namespace std;
+using namespace boost;
+
+vector<FactorType> fo(1,FactorType(0)); // one-element list holding FactorType(0); not referenced again in this file
+
+// Phrase that can be filled from a whitespace-tokenized string.
+class SimplePhrase : public Moses::Phrase
+{
+  vector<FactorType> const m_fo; // factor order passed to CreateFromString
+public:
+  SimplePhrase(): m_fo(1,FactorType(0)) {}
+  // Calls AddWord() once per whitespace-separated token of s.
+  void init(string const& s) 
+  {
+    istringstream buf(s); string w;
+    while (buf >> w) 
+      {
+	this->AddWord().CreateFromString(Input,m_fo,StringPiece(w),false,false);
+      }
+  }
+};
+// Compares two indices into a TargetPhraseCollection via CompareTargetPhrase.
+class TargetPhraseIndexSorter
+{
+  TargetPhraseCollection const& m_coll; // collection the indices refer to
+  CompareTargetPhrase m_less;
+public:
+  TargetPhraseIndexSorter(TargetPhraseCollection const& tpc) : m_coll(tpc) {}
+  bool operator()(size_t a, size_t b) const
+  {
+    return m_less(*m_coll[a], *m_coll[b]);
+  }
+};
+
+// Prints, for each source phrase read from stdin, the ranked entries of the first Mmsapt table.
+int main(int argc, char* argv[])
+{
+  Parameter params;
+  if (!params.LoadParam(argc,argv) || !StaticData::LoadDataStatic(&params, argv[0]))
+    exit(1);
+  Mmsapt* PT = NULL; // stays NULL when no phrase dictionary is an Mmsapt
+  BOOST_FOREACH(PhraseDictionary* pd, PhraseDictionary::GetColl())
+    if ((PT = dynamic_cast<Mmsapt*>(pd))) break;
+  if (!PT) { cerr << "No Mmsapt phrase table found." << endl; exit(1); }
+  string line;
+  while (getline(cin,line))
+    {
+      SimplePhrase p; p.init(line); 
+      cout << p << endl;
+      TargetPhraseCollection const* trg = PT->GetTargetPhraseCollectionLEGACY(p);
+      if (!trg) continue;
+      vector<size_t> order(trg->GetSize()); 
+      for (size_t i = 0; i < order.size(); ++i) order[i] = i;
+      sort(order.begin(),order.end(),TargetPhraseIndexSorter(*trg));
+      size_t k = 0;
+      BOOST_FOREACH(size_t i, order)
+	{
+	  Phrase const& phr = static_cast<Phrase const&>(*(*trg)[i]);
+	  cout << setw(3) << ++k << " " << phr << endl;
+	}
+      PT->Release(trg);
+    }
+  exit(0);
+}
+  
+  
+

From 5ae57f09d7e97d7608ae848b3aebd5502c37292e Mon Sep 17 00:00:00 2001
From: Ulrich Germann <ugermann@inf.ed.ac.uk>
Date: Thu, 5 Jun 2014 01:39:44 +0100
Subject: [PATCH 03/27] Commented out unused variable.

---
 moses/ChartManager.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/moses/ChartManager.cpp b/moses/ChartManager.cpp
index 139256171..e137da915 100644
--- a/moses/ChartManager.cpp
+++ b/moses/ChartManager.cpp
@@ -125,7 +125,7 @@ void ChartManager::ProcessSentence()
  */
 void ChartManager::AddXmlChartOptions()
 {
-  const StaticData &staticData = StaticData::Instance();
+  // const StaticData &staticData = StaticData::Instance();
 
   const std::vector <ChartTranslationOptions*> xmlChartOptionsList = m_source.GetXmlChartTranslationOptions();
   IFVERBOSE(2) {

From 0e98a08446b1280e56c0ca130540a03ad46fe9d7 Mon Sep 17 00:00:00 2001
From: Ulrich Germann <ugermann@inf.ed.ac.uk>
Date: Thu, 5 Jun 2014 01:40:27 +0100
Subject: [PATCH 04/27] Commented out unused variable.

---
 moses/ConfusionNet.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/moses/ConfusionNet.cpp b/moses/ConfusionNet.cpp
index 5861ee5f1..d9270bd1b 100644
--- a/moses/ConfusionNet.cpp
+++ b/moses/ConfusionNet.cpp
@@ -142,7 +142,7 @@ namespace Moses
   {
     Clear();
 
-    const StaticData   &staticData   = StaticData::Instance();
+    // const StaticData   &staticData   = StaticData::Instance();
     const InputFeature &inputFeature = InputFeature::Instance();
     size_t numInputScores   = inputFeature.GetNumInputScores();
     size_t numRealWordCount = inputFeature.GetNumRealWordsInInput();

From 3145bf3cc4de18eb56e8c6bd30cd152abd00d5c3 Mon Sep 17 00:00:00 2001
From: Ulrich Germann <ugermann@inf.ed.ac.uk>
Date: Thu, 5 Jun 2014 01:40:51 +0100
Subject: [PATCH 05/27] Commented out unused variable.

---
 moses/InputPath.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/moses/InputPath.cpp b/moses/InputPath.cpp
index f00f1a7a4..523b03d53 100644
--- a/moses/InputPath.cpp
+++ b/moses/InputPath.cpp
@@ -85,7 +85,7 @@ size_t InputPath::GetTotalRuleSize() const
   size_t ret = 0;
   std::map<const PhraseDictionary*, std::pair<const TargetPhraseCollection*, const void*> >::const_iterator iter;
   for (iter = m_targetPhrases.begin(); iter != m_targetPhrases.end(); ++iter) {
-	const PhraseDictionary *pt = iter->first;
+    // const PhraseDictionary *pt = iter->first;
 	const TargetPhraseCollection *tpColl = iter->second.first;
 
 	if (tpColl) {

From 004b8c907856ba01d54c8c7f3f1b198e5d3ad4ba Mon Sep 17 00:00:00 2001
From: Ulrich Germann <ugermann@inf.ed.ac.uk>
Date: Thu, 5 Jun 2014 01:41:50 +0100
Subject: [PATCH 06/27] Changed Phrase.m_words from private to protected.

---
 moses/Phrase.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/moses/Phrase.h b/moses/Phrase.h
index 4a5c4828a..f6eb661de 100644
--- a/moses/Phrase.h
+++ b/moses/Phrase.h
@@ -47,8 +47,8 @@ class WordsRange;
 class Phrase
 {
   friend std::ostream& operator<<(std::ostream&, const Phrase&);
-private:
-
+  // private:
+protected:
   std::vector<Word>			m_words;
 
 public:

From c7a0520a18c19388d923221b00f673394b960710 Mon Sep 17 00:00:00 2001
From: Ulrich Germann <ugermann@inf.ed.ac.uk>
Date: Thu, 5 Jun 2014 01:43:26 +0100
Subject: [PATCH 07/27] Made moses shut up by changing unconditional 'cerr's to
 VERBOSE(1,...)

---
 moses/StaticData.cpp | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index 0340778ed..f5cb1b77d 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -494,7 +494,8 @@ bool StaticData::LoadData(Parameter *parameter)
     }
     m_xmlBrackets.first= brackets[0];
     m_xmlBrackets.second=brackets[1];
-    cerr << "XML tags opening and closing brackets for XML input are: " << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl;
+    VERBOSE(1,"XML tags opening and closing brackets for XML input are: " 
+	    << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl);
   }
 
   if (m_parameter->GetParam("placeholder-factor").size() > 0) {
@@ -511,7 +512,7 @@ bool StaticData::LoadData(Parameter *parameter)
   const vector<string> &features = m_parameter->GetParam("feature");
   for (size_t i = 0; i < features.size(); ++i) {
     const string &line = Trim(features[i]);
-    cerr << "line=" << line << endl;
+    VERBOSE(1,"line=" << line << endl);
     if (line.empty())
       continue;
 
@@ -640,7 +641,8 @@ void StaticData::LoadNonTerminals()
     		  "Incorrect unknown LHS format: " << line);
       UnknownLHSEntry entry(tokens[0], Scan<float>(tokens[1]));
       m_unknownLHS.push_back(entry);
-      const Factor *targetFactor = factorCollection.AddFactor(Output, 0, tokens[0], true);
+      // const Factor *targetFactor = 
+      factorCollection.AddFactor(Output, 0, tokens[0], true);
     }
 
   }
@@ -734,7 +736,7 @@ bool StaticData::LoadDecodeGraphs()
       DecodeGraph *decodeGraph;
       if (IsChart()) {
         size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN;
-        cerr << "max-chart-span: " << maxChartSpans[decodeGraphInd] << endl;
+        VERBOSE(1,"max-chart-span: " << maxChartSpan << endl); // log the bounds-checked value; maxChartSpans[decodeGraphInd] may be past the end
         decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan);
       } else {
         decodeGraph = new DecodeGraph(m_decodeGraphs.size());
@@ -866,7 +868,7 @@ void StaticData::SetExecPath(const std::string &path)
   if (pos !=  string::npos) {
     m_binPath = path.substr(0, pos);
   }
-  cerr << m_binPath << endl;
+  VERBOSE(1,m_binPath << endl);
 }
 
 const string &StaticData::GetBinDirectory() const
@@ -920,7 +922,8 @@ void StaticData::LoadFeatureFunctions()
     FeatureFunction *ff = *iter;
     bool doLoad = true;
 
-    if (PhraseDictionary *ffCast = dynamic_cast<PhraseDictionary*>(ff)) {
+    // if (PhraseDictionary *ffCast = dynamic_cast<PhraseDictionary*>(ff)) {
+    if (dynamic_cast<PhraseDictionary*>(ff)) {
       doLoad = false;
     }
 
@@ -964,7 +967,7 @@ bool StaticData::CheckWeights() const
     set<string>::iterator iter;
     for (iter = weightNames.begin(); iter != weightNames.end(); ) {
       string fname = (*iter).substr(0, (*iter).find("_"));
-      cerr << fname << "\n";
+      VERBOSE(1,fname << "\n");
       if (featureNames.find(fname) != featureNames.end()) {
         weightNames.erase(iter++);
       }
@@ -1039,7 +1042,7 @@ bool StaticData::LoadAlternateWeightSettings()
       vector<string> tokens = Tokenize(weightSpecification[i]);
       vector<string> args = Tokenize(tokens[0], "=");
       currentId = args[1];
-      cerr << "alternate weight setting " << currentId << endl;
+      VERBOSE(1,"alternate weight setting " << currentId << endl);
       UTIL_THROW_IF2(m_weightSetting.find(currentId) != m_weightSetting.end(),
     		  "Duplicate alternate weight id: " << currentId);
       m_weightSetting[ currentId ] = new ScoreComponentCollection;

From 22ec93b85ce5e64a04a24942e51b2ca84d36625b Mon Sep 17 00:00:00 2001
From: Ulrich Germann <ugermann@inf.ed.ac.uk>
Date: Thu, 5 Jun 2014 01:44:11 +0100
Subject: [PATCH 08/27] Added operator [] to TargetPhraseCollection.

---
 moses/TargetPhraseCollection.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/moses/TargetPhraseCollection.h b/moses/TargetPhraseCollection.h
index 47eee0458..0c6a7a74c 100644
--- a/moses/TargetPhraseCollection.h
+++ b/moses/TargetPhraseCollection.h
@@ -44,6 +44,12 @@ public:
   typedef CollType::iterator iterator;
   typedef CollType::const_iterator const_iterator;
 
+  TargetPhrase const* // i-th entry of m_collection, fetched with at()
+  operator[](size_t const i) const
+  {
+    return m_collection.at(i);
+  }  
+
   iterator begin() {
     return m_collection.begin();
   }

From a40fcbae02827d31bad754c90b9596b977914d2e Mon Sep 17 00:00:00 2001
From: Ulrich Germann <ugermann@inf.ed.ac.uk>
Date: Thu, 5 Jun 2014 01:45:34 +0100
Subject: [PATCH 09/27] Added utility lookup_mmsapt

---
 moses/TranslationModel/UG/Jamfile | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/moses/TranslationModel/UG/Jamfile b/moses/TranslationModel/UG/Jamfile
index 1ee663044..547928423 100644
--- a/moses/TranslationModel/UG/Jamfile
+++ b/moses/TranslationModel/UG/Jamfile
@@ -9,6 +9,17 @@ $(TOP)/moses/TranslationModel/UG//mmsapt
 $(TOP)/util//kenutil 
 ; 
 
+exe lookup_mmsapt : 
+lookup_mmsapt.cc 
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic 
+$(TOP)//boost_iostreams 
+$(TOP)//boost_program_options 
+$(TOP)/moses/TranslationModel/UG/mm//mm 
+$(TOP)/moses/TranslationModel/UG//mmsapt 
+$(TOP)/util//kenutil 
+; 
+
 install $(PREFIX)/bin : try-align ; 
 
 fakelib mmsapt : [ glob *.cpp mmsapt*.cc ] ;

From 2f109621bff2eacac5168155a171f84f5de4f9a9 Mon Sep 17 00:00:00 2001
From: Ulrich Germann <ugermann@inf.ed.ac.uk>
Date: Thu, 5 Jun 2014 01:47:29 +0100
Subject: [PATCH 10/27] Added configurable options and SetTableLimit to Mmsapt.

---
 moses/TranslationModel/UG/mmsapt.cpp | 33 ++++++++++++++++++----------
 moses/TranslationModel/UG/mmsapt.h   |  3 +++
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp
index 789907321..b2c4c10f2 100644
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@@ -122,16 +122,16 @@ namespace Moses
     if (m != param.end())
       withPbwd = m->second != "0";
       
-    m_default_sample_size = m != param.end() ? atoi(m->second.c_str()) : 1000;
-
     m = param.find("workers");
     m_workers = m != param.end() ? atoi(m->second.c_str()) : 8;
     m_workers = min(m_workers,24UL);
 
+    m = param.find("limit");
+    if (m != param.end()) m_tableLimit = atoi(m->second.c_str());
+
     m = param.find("cache-size");
-    m_history.reserve(m != param.end() 
-		      ? max(1000,atoi(m->second.c_str()))
-		      : 10000);
+    m_history.reserve(m != param.end()?max(1000,atoi(m->second.c_str())):10000);
+    // in plain language: cache size is at least 1000, and 10,000 by default
     
     this->m_numScoreComponents = atoi(param["num-features"].c_str());
 
@@ -368,6 +368,13 @@ namespace Moses
 	  }
 	else 
 	  pp.update(a->first,a->second);
+#if 0
+	// jstats const& j = a->second;
+	cerr << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: " 
+	     << bta.T2->pid2str(bta.V2.get(),pp.p2) << endl;
+	cerr << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " " 
+	     << pp.joint << " " << pp.raw2 << endl;
+#endif
 
 	UTIL_THROW_IF2(pp.raw2 == 0, 
 		       "OOPS" 
@@ -376,12 +383,6 @@ namespace Moses
 		       << pp.raw1 << " " << pp.sample1 << " " 
 		       << pp.good1 << " " << pp.joint << " " 
 		       << pp.raw2);
-#if 0
-	jstats const& j = a->second;
-	cerr << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: " 
-	     << bta.T2->pid2str(bta.V2.get(),pp.p2) << endl;
-	cerr << j.rcnt() << " " << j.cnt2() << " " << j.wcnt() << endl;
-#endif
 	calc_lex(bta,pp);
 	if (withPfwd) calc_pfwd_fix(bta,pp);
 	if (withPbwd) calc_pbwd_fix(bta,pp);
@@ -662,7 +663,7 @@ namespace Moses
 	|| combine_pstats(src, mfix.getPid(),sfix.get(),btfix, 
 			  mdyn.getPid(),sdyn.get(),*dyn,ret))
       {
-	ret->NthElement(m_tableLimit);
+	if (m_tableLimit) ret->Prune(true,m_tableLimit);
 #if 0
 	sort(ret->begin(), ret->end(), CompareTargetPhrase());
 	cout << "SOURCE PHRASE: " << src << endl;
@@ -683,6 +684,14 @@ namespace Moses
     return encache(ret);
   }
 
+  // Installs a new table limit and hands back the one it replaced.
+  size_t Mmsapt::SetTableLimit(size_t limit)
+  {
+    size_t const previous = m_tableLimit;
+    m_tableLimit = limit;
+    return previous;
+  }
+
   void
   Mmsapt::
   CleanUpAfterSentenceProcessing(const InputType& source)
diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h
index e0e2d8950..b5a5b15e2 100644
--- a/moses/TranslationModel/UG/mmsapt.h
+++ b/moses/TranslationModel/UG/mmsapt.h
@@ -168,6 +168,9 @@ namespace Moses
     void
     Load();
     
+    // returns the prior table limit
+    size_t SetTableLimit(size_t limit);
+
 #ifndef NO_MOSES
     TargetPhraseCollection const* 
     GetTargetPhraseCollectionLEGACY(const Phrase& src) const;

From b92d599727f53f7c97f6cf4fee92516a7e40ca6a Mon Sep 17 00:00:00 2001
From: Ulrich Germann <ugermann@inf.ed.ac.uk>
Date: Thu, 5 Jun 2014 01:48:11 +0100
Subject: [PATCH 11/27] Bug fix in mmlex-lookup.

---
 moses/TranslationModel/UG/mm/mmlex-lookup.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/moses/TranslationModel/UG/mm/mmlex-lookup.cc b/moses/TranslationModel/UG/mm/mmlex-lookup.cc
index 14d839edf..fbdceeaa0 100644
--- a/moses/TranslationModel/UG/mm/mmlex-lookup.cc
+++ b/moses/TranslationModel/UG/mm/mmlex-lookup.cc
@@ -131,7 +131,7 @@ interpret_args(int ac, char* av[])
   o.add_options()
     ("help,h",    "print this message")
     ("source,s",po::value<string>(&swrd),"source word")
-    ("target,t",po::value<string>(&swrd),"target word")
+    ("target,t",po::value<string>(&twrd),"target word")
     ;
   
   h.add_options()

From 5116f0072b162ed06dc7132d0f727fcaaecab306 Mon Sep 17 00:00:00 2001
From: Ulrich Germann <ugermann@inf.ed.ac.uk>
Date: Thu, 5 Jun 2014 01:50:55 +0100
Subject: [PATCH 12/27] Minor edits to ug_bitext.h. Added min_diverse to
 ug_bitext::job to ensure minimum number of translation alternatives before
 sampling stops.

---
 moses/TranslationModel/UG/mm/ug_bitext.h | 50 +++++++++++++++++-------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h
index 84c3713ac..5dfbec285 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@@ -318,10 +318,10 @@ namespace Moses {
 	assert(pp.sample1);
 	assert(pp.joint);
 	assert(pp.raw2);
-	(*dest)[i] = log(pp.raw1);
-	(*dest)[++i] = log(pp.sample1);
-	(*dest)[++i] = log(pp.joint);
-	(*dest)[++i] = log(pp.raw2);
+	(*dest)[i]   = -log(pp.raw1);
+	(*dest)[++i] = -log(pp.sample1);
+	(*dest)[++i] = +log(pp.joint);
+	(*dest)[++i] = -log(pp.raw2);
       }
     };
 
@@ -590,8 +590,9 @@ namespace Moses {
 	static ThreadSafeCounter active;
 	boost::mutex lock; 
 	friend class agenda;
-	boost::taus88 rnd; // every job has its own pseudo random generator 
-	double rnddenom;   // denominator for scaling random sampling
+	boost::taus88 rnd;  // every job has its own pseudo random generator 
+	double rnddenom;    // denominator for scaling random sampling
+	size_t min_diverse; // minimum number of distinct translations
       public:
 	size_t         workers; // how many workers are working on this job?
 	sptr<TSA<Token> const> root; // root of the underlying suffix array
@@ -644,34 +645,47 @@ namespace Moses {
     step(uint64_t & sid, uint64_t & offset)
     {
       boost::lock_guard<boost::mutex> jguard(lock);
-      if ((max_samples == 0) && (next < stop))
+      bool ret = (max_samples == 0) && (next < stop);
+      if (ret)
 	{
 	  next = root->readSid(next,stop,sid);
 	  next = root->readOffset(next,stop,offset);
 	  boost::lock_guard<boost::mutex> sguard(stats->lock);
 	  if (stats->raw_cnt == ctr) ++stats->raw_cnt;
 	  stats->sample_cnt++;
-	  return true;
 	}
       else 
 	{
-	  while (next < stop && stats->good < max_samples)
+	  while (next < stop && (stats->good < max_samples || 
+				 stats->trg.size() < min_diverse))
 	    {
 	      next = root->readSid(next,stop,sid);
 	      next = root->readOffset(next,stop,offset);
-	      {
-		boost::lock_guard<boost::mutex> sguard(stats->lock);
+	      { // brackets required for lock scoping; see sguard immediately below
+		boost::lock_guard<boost::mutex> sguard(stats->lock); 
 		if (stats->raw_cnt == ctr) ++stats->raw_cnt;
-		size_t rnum = (stats->raw_cnt - ctr++)*(rnd()/(rnd.max()+1.));
+		size_t scalefac = (stats->raw_cnt - ctr++);
+		size_t rnum = scalefac*(rnd()/(rnd.max()+1.));
+#if 0
+		cerr << rnum << "/" << scalefac << " vs. " 
+		     << max_samples - stats->good << " ("
+		     << max_samples << " - " << stats->good << ")" 
+		     << endl;
+#endif
 		if (rnum < max_samples - stats->good)
 		  {
 		    stats->sample_cnt++;
-		    return true;
+		    ret = true;
+		    break;
 		  }
 	      }
 	    }
-	  return false;
 	}
+      
+      // boost::lock_guard<boost::mutex> sguard(stats->lock); 
+      // abuse of lock for clean output to cerr
+      // cerr << stats->sample_cnt++;
+      return ret;
     }
 
     template<typename Token>
@@ -713,6 +727,13 @@ namespace Moses {
     worker::
     operator()()
     {
+      // things to do:
+      // - have each worker maintain their own pstats object and merge results at the end;
+      // - ensure the minimum size of samples considered by a non-locked counter that is only 
+      //   ever incremented -- who cares if we look at more samples than required, as long
+      //   as we look at at least the minimum required
+      // This way, we can reduce the number of lock / unlock operations we need to do during 
+      // sampling. 
       size_t s1=0, s2=0, e1=0, e2=0;
       uint64_t sid=0, offset=0; // of the source phrase
       while(sptr<job> j = ag.get_job())
@@ -812,6 +833,7 @@ namespace Moses {
 	sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd)
       : rnd(0)
       , rnddenom(rnd.max() + 1.)
+      , min_diverse(10)
       , workers(0)
       , root(r)
       , next(m.lower_bound(-1))

From a5f46e65cb0f01ec0280a5d045cb61f342b9d51b Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Fri, 6 Jun 2014 17:25:09 +0100
Subject: [PATCH 13/27] eclipse

---
 contrib/other-builds/extractor/.cproject | 4 +++-
 contrib/other-builds/extractor/.project  | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/contrib/other-builds/extractor/.cproject b/contrib/other-builds/extractor/.cproject
index 5f0b24ef0..06a8a8a24 100644
--- a/contrib/other-builds/extractor/.cproject
+++ b/contrib/other-builds/extractor/.cproject
@@ -42,9 +42,11 @@
 								</option>
 								<option id="gnu.cpp.link.option.libs.585257079" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
 									<listOptionValue builtIn="false" value="mert_lib"/>
-									<listOptionValue builtIn="false" value="boost_system-mt"/>
 									<listOptionValue builtIn="false" value="util"/>
+									<listOptionValue builtIn="false" value="boost_system-mt"/>
+									<listOptionValue builtIn="false" value="boost_thread-mt"/>
 									<listOptionValue builtIn="false" value="z"/>
+									<listOptionValue builtIn="false" value="pthread"/>
 								</option>
 								<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.656319745" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
 									<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
diff --git a/contrib/other-builds/extractor/.project b/contrib/other-builds/extractor/.project
index e4fe08579..56d560019 100644
--- a/contrib/other-builds/extractor/.project
+++ b/contrib/other-builds/extractor/.project
@@ -4,6 +4,7 @@
 	<comment></comment>
 	<projects>
 		<project>mert_lib</project>
+		<project>util</project>
 	</projects>
 	<buildSpec>
 		<buildCommand>

From 91a7c19b7c5035eb986c6bca5dc628db8052b71c Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Sun, 8 Jun 2014 12:41:54 +0100
Subject: [PATCH 14/27] eclipse project for consolidate

---
 contrib/other-builds/consolidate/.cproject | 132 +++++++++++++++++++++
 contrib/other-builds/consolidate/.project  |  64 ++++++++++
 2 files changed, 196 insertions(+)
 create mode 100644 contrib/other-builds/consolidate/.cproject
 create mode 100644 contrib/other-builds/consolidate/.project

diff --git a/contrib/other-builds/consolidate/.cproject b/contrib/other-builds/consolidate/.cproject
new file mode 100644
index 000000000..3c70ed365
--- /dev/null
+++ b/contrib/other-builds/consolidate/.cproject
@@ -0,0 +1,132 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+	<storageModule moduleId="org.eclipse.cdt.core.settings">
+		<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.debug.1847651686">
+			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.debug.1847651686" moduleId="org.eclipse.cdt.core.settings" name="Debug">
+				<externalSettings/>
+				<extensions>
+					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+				</extensions>
+			</storageModule>
+			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+				<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.debug.1847651686" name="Debug" parent="cdt.managedbuild.config.gnu.cross.exe.debug">
+					<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1847651686." name="/" resourcePath="">
+						<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.1312813804" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
+							<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1457158442" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
+							<builder buildPath="${workspace_loc:/consolidate}/Debug" id="cdt.managedbuild.builder.gnu.cross.401817170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
+							<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.584773180" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
+								<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.548826159" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
+								<option id="gnu.c.compiler.option.debugging.level.69309976" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
+								<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1869389417" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+							</tool>
+							<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1684035985" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
+								<option id="gnu.cpp.compiler.option.optimization.level.1978964587" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
+								<option id="gnu.cpp.compiler.option.debugging.level.1174628687" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+								<option id="gnu.cpp.compiler.option.include.paths.1899244069" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+									<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
+								</option>
+								<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1369007077" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+							</tool>
+							<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.988122551" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
+							<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.580092188" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
+								<option id="gnu.cpp.link.option.libs.1224797947" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
+									<listOptionValue builtIn="false" value="z"/>
+									<listOptionValue builtIn="false" value="boost_iostreams-mt"/>
+								</option>
+								<option id="gnu.cpp.link.option.paths.845281969" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
+									<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
+								</option>
+								<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1562981657" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+									<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
+									<additionalInput kind="additionalinput" paths="$(LIBS)"/>
+								</inputType>
+							</tool>
+							<tool id="cdt.managedbuild.tool.gnu.cross.archiver.1813579853" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
+							<tool id="cdt.managedbuild.tool.gnu.cross.assembler.660034723" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
+								<inputType id="cdt.managedbuild.tool.gnu.assembler.input.2016181080" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+							</tool>
+						</toolChain>
+					</folderInfo>
+				</configuration>
+			</storageModule>
+			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+		</cconfiguration>
+		<cconfiguration id="cdt.managedbuild.config.gnu.cross.exe.release.1197533473">
+			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.cross.exe.release.1197533473" moduleId="org.eclipse.cdt.core.settings" name="Release">
+				<externalSettings/>
+				<extensions>
+					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+				</extensions>
+			</storageModule>
+			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+				<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.cross.exe.release.1197533473" name="Release" parent="cdt.managedbuild.config.gnu.cross.exe.release">
+					<folderInfo id="cdt.managedbuild.config.gnu.cross.exe.release.1197533473." name="/" resourcePath="">
+						<toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.release.1193312581" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.release">
+							<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1614674218" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
+							<builder buildPath="${workspace_loc:/consolidate}/Release" id="cdt.managedbuild.builder.gnu.cross.1921548268" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
+							<tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1402792534" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
+								<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.172258714" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
+								<option id="gnu.c.compiler.option.debugging.level.949623548" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
+								<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1960225725" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
+							</tool>
+							<tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1697856596" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
+								<option id="gnu.cpp.compiler.option.optimization.level.1575999400" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
+								<option id="gnu.cpp.compiler.option.debugging.level.732263649" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
+								<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1685852561" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
+							</tool>
+							<tool id="cdt.managedbuild.tool.gnu.cross.c.linker.1332869586" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
+							<tool id="cdt.managedbuild.tool.gnu.cross.cpp.linker.484647585" name="Cross G++ Linker" superClass="cdt.managedbuild.tool.gnu.cross.cpp.linker">
+								<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.2140954002" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+									<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
+									<additionalInput kind="additionalinput" paths="$(LIBS)"/>
+								</inputType>
+							</tool>
+							<tool id="cdt.managedbuild.tool.gnu.cross.archiver.620666274" name="Cross GCC Archiver" superClass="cdt.managedbuild.tool.gnu.cross.archiver"/>
+							<tool id="cdt.managedbuild.tool.gnu.cross.assembler.1478840357" name="Cross GCC Assembler" superClass="cdt.managedbuild.tool.gnu.cross.assembler">
+								<inputType id="cdt.managedbuild.tool.gnu.assembler.input.412043972" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+							</tool>
+						</toolChain>
+					</folderInfo>
+				</configuration>
+			</storageModule>
+			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+		</cconfiguration>
+	</storageModule>
+	<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+		<project id="consolidate.cdt.managedbuild.target.gnu.cross.exe.1166003694" name="Executable" projectType="cdt.managedbuild.target.gnu.cross.exe"/>
+	</storageModule>
+	<storageModule moduleId="scannerConfiguration">
+		<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1847651686;cdt.managedbuild.config.gnu.cross.exe.debug.1847651686.;cdt.managedbuild.tool.gnu.cross.c.compiler.584773180;cdt.managedbuild.tool.gnu.c.compiler.input.1869389417">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1197533473;cdt.managedbuild.config.gnu.cross.exe.release.1197533473.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1697856596;cdt.managedbuild.tool.gnu.cpp.compiler.input.1685852561">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.debug.1847651686;cdt.managedbuild.config.gnu.cross.exe.debug.1847651686.;cdt.managedbuild.tool.gnu.cross.cpp.compiler.1684035985;cdt.managedbuild.tool.gnu.cpp.compiler.input.1369007077">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.cross.exe.release.1197533473;cdt.managedbuild.config.gnu.cross.exe.release.1197533473.;cdt.managedbuild.tool.gnu.cross.c.compiler.1402792534;cdt.managedbuild.tool.gnu.c.compiler.input.1960225725">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
+		</scannerConfigBuildInfo>
+	</storageModule>
+	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
+	<storageModule moduleId="refreshScope" versionNumber="2">
+		<configuration configurationName="Release">
+			<resource resourceType="PROJECT" workspacePath="/consolidate"/>
+		</configuration>
+		<configuration configurationName="Debug">
+			<resource resourceType="PROJECT" workspacePath="/consolidate"/>
+		</configuration>
+	</storageModule>
+</cproject>
diff --git a/contrib/other-builds/consolidate/.project b/contrib/other-builds/consolidate/.project
new file mode 100644
index 000000000..4095862b4
--- /dev/null
+++ b/contrib/other-builds/consolidate/.project
@@ -0,0 +1,64 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>consolidate</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+			<triggers>clean,full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+			<triggers>full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.cdt.core.cnature</nature>
+		<nature>org.eclipse.cdt.core.ccnature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
+	</natures>
+	<linkedResources>
+		<link>
+			<name>InputFileStream.cpp</name>
+			<type>1</type>
+			<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.cpp</locationURI>
+		</link>
+		<link>
+			<name>InputFileStream.h</name>
+			<type>1</type>
+			<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.h</locationURI>
+		</link>
+		<link>
+			<name>OutputFileStream.cpp</name>
+			<type>1</type>
+			<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.cpp</locationURI>
+		</link>
+		<link>
+			<name>OutputFileStream.h</name>
+			<type>1</type>
+			<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
+		</link>
+		<link>
+			<name>consolidate-main.cpp</name>
+			<type>1</type>
+			<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/consolidate-main.cpp</locationURI>
+		</link>
+		<link>
+			<name>tables-core.cpp</name>
+			<type>1</type>
+			<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/tables-core.cpp</locationURI>
+		</link>
+		<link>
+			<name>tables-core.h</name>
+			<type>1</type>
+			<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/tables-core.h</locationURI>
+		</link>
+	</linkedResources>
+</projectDescription>

From f58c7fc831ada7701eb070014def87a5988f509a Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Sun, 8 Jun 2014 13:17:23 +0100
Subject: [PATCH 15/27] use standard c++ getline instead of old Moses
 SAFE_GETLINE

---
 phrase-extract/consolidate-main.cpp | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp
index de0d7f646..c57cc7747 100644
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@@ -30,8 +30,6 @@
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
 
-#define LINE_MAX_LENGTH 10000
-
 using namespace std;
 
 bool hierarchicalFlag = false;
@@ -46,12 +44,11 @@ inline float maybeLogProb( float a )
   return logProbFlag ? log(a) : a;
 }
 
-char line[LINE_MAX_LENGTH];
 void processFiles( char*, char*, char*, char* );
 void loadCountOfCounts( char* );
 void breakdownCoreAndSparse( string combined, string &core, string &sparse );
 bool getLine( istream &fileP, vector< string > &item );
-vector< string > splitLine();
+vector< string > splitLine(const char *line);
 vector< int > countBin;
 bool sparseCountBinFeatureFlag = false;
 
@@ -140,14 +137,13 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
   istream &fileP = fileCountOfCounts;
 
   countOfCounts.push_back(0.0);
-  while(1) {
-    if (fileP.eof()) break;
-    SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (fileP.eof()) break;
+
+  string line;
+  while (getline(fileP, line)) {
     if (totalCount < 0)
-      totalCount = atof(line); // total number of distinct phrase pairs
+      totalCount = atof(line.c_str()); // total number of distinct phrase pairs
     else
-      countOfCounts.push_back( atof(line) );
+      countOfCounts.push_back( atof(line.c_str()) );
   }
   fileCountOfCounts.Close();
 
@@ -370,16 +366,16 @@ bool getLine( istream &fileP, vector< string > &item )
   if (fileP.eof())
     return false;
 
-  SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-  if (fileP.eof())
+  string line;
+  if (!getline(fileP, line))
     return false;
 
-  item = splitLine();
+  item = splitLine(line.c_str());
 
   return true;
 }
 
-vector< string > splitLine()
+vector< string > splitLine(const char *line)
 {
   vector< string > item;
   int start=0;

From d979b24314944348cea2d7f8d8e00691c64abebb Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Sun, 8 Jun 2014 14:06:33 +0100
Subject: [PATCH 16/27] use standard c++ getline instead of old Moses
 SAFE_GETLINE

---
 phrase-extract/score-main.cpp | 37 ++++++++++++-----------------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp
index 46538010f..dfb5103f4 100644
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@@ -40,8 +40,6 @@
 using namespace std;
 using namespace MosesTraining;
 
-#define LINE_MAX_LENGTH 100000
-
 namespace MosesTraining
 {
 LexicalTable lexTable;
@@ -232,7 +230,7 @@ int main(int argc, char* argv[])
   }
 
   // loop through all extracted phrase translations
-  char line[LINE_MAX_LENGTH], lastLine[LINE_MAX_LENGTH];
+  string line, lastLine;
   lastLine[0] = '\0';
   ExtractionPhrasePair *phrasePair = NULL;
   std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource;
@@ -245,8 +243,8 @@ int main(int argc, char* argv[])
   float tmpCount=0.0f, tmpPcfgSum=0.0f;
 
   int i=0;
-  SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ );
-  if ( !extractFileP.eof() ) {
+  // TODO why read only the 1st line?
+  if ( getline(extractFileP, line)) {
     ++i;
     tmpPhraseSource = new PHRASE();
     tmpPhraseTarget = new PHRASE();
@@ -265,23 +263,21 @@ int main(int argc, char* argv[])
     if ( hierarchicalFlag ) {
       phrasePairsWithSameSourceAndTarget.push_back( phrasePair );
     }
-    strcpy( lastLine, line );
-    SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ );
+    lastLine = line;
   }
 
-  while ( !extractFileP.eof() ) {
+  while ( getline(extractFileP, line) ) {
 
     if ( ++i % 100000 == 0 ) {
       std::cerr << "." << std::flush;
     }
 
     // identical to last line? just add count
-    if (strcmp(line,lastLine) == 0) {
+    if (line == lastLine) {
       phrasePair->IncrementPrevious(tmpCount,tmpPcfgSum);
-      SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
       continue;
     } else {
-      strcpy( lastLine, line );
+      lastLine = line;
     }
 
     tmpPhraseSource = new PHRASE();
@@ -359,8 +355,6 @@ int main(int argc, char* argv[])
       }
     }
 
-    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-
   }
 
   processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb );
@@ -750,11 +744,9 @@ void loadFunctionWords( const string &fileName )
   }
   istream *inFileP = &inFile;
 
-  char line[LINE_MAX_LENGTH];
-  while(true) {
-    SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (inFileP->eof()) break;
-    std::vector<string> token = tokenize( line );
+  string line;
+  while(getline(*inFileP, line)) {
+    std::vector<string> token = tokenize( line.c_str() );
     if (token.size() > 0)
       functionWordList.insert( token[0] );
   }
@@ -799,16 +791,13 @@ void LexicalTable::load( const string &fileName )
   }
   istream *inFileP = &inFile;
 
-  char line[LINE_MAX_LENGTH];
-
+  string line;
   int i=0;
-  while(true) {
+  while(getline(*inFileP, line)) {
     i++;
     if (i%100000 == 0) std::cerr << "." << flush;
-    SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (inFileP->eof()) break;
 
-    std::vector<string> token = tokenize( line );
+    std::vector<string> token = tokenize( line.c_str() );
     if (token.size() != 3) {
         std::cerr << "line " << i << " in " << fileName
            << " has wrong number of tokens, skipping:" << std::endl

From 23ba0de2247e84db69759445a41c4c4f04840460 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Sun, 8 Jun 2014 15:41:27 +0100
Subject: [PATCH 17/27] use standard c++ getline instead of old Moses
 SAFE_GETLINE

---
 phrase-extract/SentenceAlignment.cpp |  6 +++++-
 phrase-extract/SentenceAlignment.h   |  7 +++++--
 phrase-extract/extract-main.cpp      | 28 +++++++++++++---------------
 3 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp
index c3d71d525..120c9154d 100644
--- a/phrase-extract/SentenceAlignment.cpp
+++ b/phrase-extract/SentenceAlignment.cpp
@@ -54,7 +54,11 @@ bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bo
   return true;
 }
 
-bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], char weightString[], int sentenceID, bool boundaryRules)
+bool SentenceAlignment::create(const char targetString[],
+							const char sourceString[],
+							const char alignmentString[],
+							const char weightString[],
+							int sentenceID, bool boundaryRules)
 {
   using namespace std;
   this->sentenceID = sentenceID;
diff --git a/phrase-extract/SentenceAlignment.h b/phrase-extract/SentenceAlignment.h
index 1df61cf02..576d3279e 100644
--- a/phrase-extract/SentenceAlignment.h
+++ b/phrase-extract/SentenceAlignment.h
@@ -43,8 +43,11 @@ public:
 
   virtual bool processSourceSentence(const char *, int, bool boundaryRules);
 
-  bool create(char targetString[], char sourceString[],
-              char alignmentString[], char weightString[], int sentenceID, bool boundaryRules);
+  bool create(const char targetString[],
+		  	  const char sourceString[],
+		  	  const char alignmentString[],
+		  	  const char weightString[],
+		  	  int sentenceID, bool boundaryRules);
 
   void invertAlignment();
 
diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp
index 5d58028d6..698599a10 100644
--- a/phrase-extract/extract-main.cpp
+++ b/phrase-extract/extract-main.cpp
@@ -32,10 +32,6 @@ using namespace MosesTraining;
 namespace MosesTraining
 {
 
-
-const long int LINE_MAX_LENGTH = 500000 ;
-
-
 // HPhraseVertex represents a point in the alignment matrix
 typedef pair <int, int> HPhraseVertex;
 
@@ -277,20 +273,18 @@ int main(int argc, char* argv[])
 
   int i = sentenceOffset;
 
-  while(true) {
+  string englishString, foreignString, alignmentString, weightString;
+
+  while(getline(*eFileP, englishString)) {
     i++;
     if (i%10000 == 0) cerr << "." << flush;
-    char englishString[LINE_MAX_LENGTH];
-    char foreignString[LINE_MAX_LENGTH];
-    char alignmentString[LINE_MAX_LENGTH];
-    char weightString[LINE_MAX_LENGTH];
-    SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (eFileP->eof()) break;
-    SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
-    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
+
+    getline(*fFileP, foreignString);
+    getline(*aFileP, alignmentString);
     if (iwFileP) {
-      SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__);
+      getline(*iwFileP, weightString);
     }
+
     SentenceAlignment sentence;
     // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
     //az: output src, tgt, and alingment line
@@ -300,7 +294,11 @@ int main(int argc, char* argv[])
       cout << "LOG: ALT: " << alignmentString << endl;
       cout << "LOG: PHRASES_BEGIN:" << endl;
     }
-    if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
+    if (sentence.create( englishString.c_str(),
+    					foreignString.c_str(),
+    					alignmentString.c_str(),
+    					weightString.c_str(),
+    					i, false)) {
       if (options.placeholders.size()) {
         sentence.invertAlignment();
       }

From cb94a3181bd00c74bf0b2b81fea4aee2195dc121 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Sun, 8 Jun 2014 16:23:14 +0100
Subject: [PATCH 18/27] use standard c++ getline instead of old Moses
 SAFE_GETLINE

---
 phrase-extract/DomainFeature.cpp            | 11 +++-----
 phrase-extract/consolidate-direct-main.cpp  | 24 +++++++-----------
 phrase-extract/consolidate-reverse-main.cpp | 22 ++++++++--------
 phrase-extract/extract-ordering-main.cpp    | 28 +++++++++------------
 phrase-extract/extract-rules-main.cpp       | 22 +++++++---------
 phrase-extract/relax-parse-main.cpp         |  8 ++----
 phrase-extract/statistics-main.cpp          | 28 ++++++++-------------
 7 files changed, 57 insertions(+), 86 deletions(-)

diff --git a/phrase-extract/DomainFeature.cpp b/phrase-extract/DomainFeature.cpp
index 2f99a8709..337364b1d 100644
--- a/phrase-extract/DomainFeature.cpp
+++ b/phrase-extract/DomainFeature.cpp
@@ -4,8 +4,6 @@
 #include "InputFileStream.h"
 #include "SafeGetline.h"
 
-#define TABLE_LINE_MAX_LENGTH 1000
-
 using namespace std;
 
 namespace MosesTraining
@@ -16,12 +14,11 @@ void Domain::load( const std::string &domainFileName )
 {
   Moses::InputFileStream fileS( domainFileName );
   istream *fileP = &fileS;
-  while(true) {
-    char line[TABLE_LINE_MAX_LENGTH];
-    SAFE_GETLINE((*fileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__);
-    if (fileP->eof()) break;
+
+	string line;
+  while(getline(*fileP, line)) {
     // read
-    vector< string > domainSpecLine = tokenize( line );
+    vector< string > domainSpecLine = tokenize( line.c_str() );
     int lineNumber;
     if (domainSpecLine.size() != 2 ||
         ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
diff --git a/phrase-extract/consolidate-direct-main.cpp b/phrase-extract/consolidate-direct-main.cpp
index 3b38f741c..40e0e35d4 100644
--- a/phrase-extract/consolidate-direct-main.cpp
+++ b/phrase-extract/consolidate-direct-main.cpp
@@ -26,16 +26,9 @@
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
 
-#include "SafeGetline.h"
-
-#define LINE_MAX_LENGTH 10000
-
 using namespace std;
 
-char line[LINE_MAX_LENGTH];
-
-
-vector< string > splitLine()
+vector< string > splitLine(const char *line)
 {
   vector< string > item;
   int start=0;
@@ -61,14 +54,15 @@ bool getLine( istream &fileP, vector< string > &item )
 {
   if (fileP.eof())
     return false;
-
-  SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-  if (fileP.eof())
+  // Read one line; only a successful read may fill item and return true.
+  string line;
+  if (!getline(fileP, line)) {
     return false;
-
-  item = splitLine();
-
-  return true;
+  }
+  else {
+    item = splitLine(line.c_str());
+    return true;
+  }
 }
 
 
diff --git a/phrase-extract/consolidate-reverse-main.cpp b/phrase-extract/consolidate-reverse-main.cpp
index 6843bf3aa..891773418 100644
--- a/phrase-extract/consolidate-reverse-main.cpp
+++ b/phrase-extract/consolidate-reverse-main.cpp
@@ -30,20 +30,17 @@
 #include "SafeGetline.h"
 #include "InputFileStream.h"
 
-#define LINE_MAX_LENGTH 10000
-
 using namespace std;
 
 bool hierarchicalFlag = false;
 bool onlyDirectFlag = false;
 bool phraseCountFlag = true;
 bool logProbFlag = false;
-char line[LINE_MAX_LENGTH];
 
 void processFiles( char*, char*, char* );
 bool getLine( istream &fileP, vector< string > &item );
 string reverseAlignment(const string &alignments);
-vector< string > splitLine();
+vector< string > splitLine(const char *lin);
 
 inline void Tokenize(std::vector<std::string> &output
                      , const std::string& str
@@ -190,17 +187,18 @@ bool getLine( istream &fileP, vector< string > &item )
 {
   if (fileP.eof())
     return false;
-
-  SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-  if (fileP.eof())
+  // Read one line; only a successful read may fill item and return true.
+  string line;
+  if (!getline(fileP, line)) {
     return false;
-
-  item = splitLine();
-
-  return true;
+  }
+  else {
+    item = splitLine(line.c_str());
+    return true;
+  }
 }
 
-vector< string > splitLine()
+vector< string > splitLine(const char *line)
 {
   vector< string > item;
   bool betweenWords = true;
diff --git a/phrase-extract/extract-ordering-main.cpp b/phrase-extract/extract-ordering-main.cpp
index 104457b01..78132d4fd 100644
--- a/phrase-extract/extract-ordering-main.cpp
+++ b/phrase-extract/extract-ordering-main.cpp
@@ -32,10 +32,6 @@ using namespace MosesTraining;
 namespace MosesTraining
 {
 
-
-const long int LINE_MAX_LENGTH = 500000 ;
-
-
 // HPhraseVertex represents a point in the alignment matrix
 typedef pair <int, int> HPhraseVertex;
 
@@ -246,20 +242,20 @@ int main(int argc, char* argv[])
 
   int i = sentenceOffset;
 
-  while(true) {
+  string englishString, foreignString, alignmentString, weightString;
+
+  while(getline(*eFileP, englishString)) {
     i++;
-    if (i%10000 == 0) cerr << "." << flush;
-    char englishString[LINE_MAX_LENGTH];
-    char foreignString[LINE_MAX_LENGTH];
-    char alignmentString[LINE_MAX_LENGTH];
-    char weightString[LINE_MAX_LENGTH];
-    SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (eFileP->eof()) break;
-    SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
-    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
+
+    // englishString was already read by the loop condition; do not read it again
+    getline(*fFileP, foreignString);
+    getline(*aFileP, alignmentString);
     if (iwFileP) {
-      SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__);
+      getline(*iwFileP, weightString);
     }
+
+    if (i%10000 == 0) cerr << "." << flush;
+
     SentenceAlignment sentence;
     // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
     //az: output src, tgt, and alingment line
@@ -269,7 +265,7 @@ int main(int argc, char* argv[])
       cout << "LOG: ALT: " << alignmentString << endl;
       cout << "LOG: PHRASES_BEGIN:" << endl;
     }
-    if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
+    if (sentence.create( englishString.c_str(), foreignString.c_str(), alignmentString.c_str(), weightString.c_str(), i, false)) {
       ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFileOrientation);
       task->Run();
       delete task;
diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp
index f5f44316e..30963f32b 100644
--- a/phrase-extract/extract-rules-main.cpp
+++ b/phrase-extract/extract-rules-main.cpp
@@ -47,8 +47,6 @@
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
 
-#define LINE_MAX_LENGTH 500000
-
 using namespace std;
 using namespace MosesTraining;
 
@@ -326,17 +324,15 @@ int main(int argc, char* argv[])
 
   // loop through all sentence pairs
   size_t i=sentenceOffset;
-  while(true) {
-    i++;
-    if (i%1000 == 0) cerr << i << " " << flush;
+  string targetString, sourceString, alignmentString;
 
-    char targetString[LINE_MAX_LENGTH];
-    char sourceString[LINE_MAX_LENGTH];
-    char alignmentString[LINE_MAX_LENGTH];
-    SAFE_GETLINE((*tFileP), targetString, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (tFileP->eof()) break;
-    SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__);
-    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
+  while(getline(*tFileP, targetString)) {
+    i++;
+
+    getline(*sFileP, sourceString);
+    getline(*aFileP, alignmentString);
+
+    if (i%1000 == 0) cerr << i << " " << flush;
 
     SentenceAlignmentWithSyntax sentence
     (targetLabelCollection, sourceLabelCollection,
@@ -349,7 +345,7 @@ int main(int argc, char* argv[])
       cout << "LOG: PHRASES_BEGIN:" << endl;
     }
 
-    if (sentence.create(targetString, sourceString, alignmentString,"", i, options.boundaryRules)) {
+    if (sentence.create(targetString.c_str(), sourceString.c_str(), alignmentString.c_str(),"", i, options.boundaryRules)) {
       if (options.unknownWordLabelFlag) {
         collectWordLabelCounts(sentence);
       }
diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp
index a58d4d97f..c04cae85b 100644
--- a/phrase-extract/relax-parse-main.cpp
+++ b/phrase-extract/relax-parse-main.cpp
@@ -33,17 +33,13 @@ int main(int argc, char* argv[])
 
   // loop through all sentences
   int i=0;
-  char inBuffer[LINE_MAX_LENGTH];
-  while(true) {
+  string inBuffer;
+  while(getline(cin, inBuffer)) {
     i++;
     if (i%1000 == 0) cerr << "." << flush;
     if (i%10000 == 0) cerr << ":" << flush;
     if (i%100000 == 0) cerr << "!" << flush;
 
-    // get line from stdin
-    SAFE_GETLINE( cin, inBuffer, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (cin.eof()) break;
-
     // process into syntax tree representation
     string inBufferString = string( inBuffer );
     set< string > labelCollection;         // set of labels, not used
diff --git a/phrase-extract/statistics-main.cpp b/phrase-extract/statistics-main.cpp
index 67373ec93..f1563dc05 100644
--- a/phrase-extract/statistics-main.cpp
+++ b/phrase-extract/statistics-main.cpp
@@ -19,8 +19,6 @@
 using namespace std;
 using namespace MosesTraining;
 
-#define LINE_MAX_LENGTH 10000
-
 namespace MosesTraining
 {
 
@@ -31,7 +29,7 @@ public:
   vector< vector<size_t> > alignedToE;
   vector< vector<size_t> > alignedToF;
 
-  bool create( char*, int );
+  bool create( const char*, int );
   void clear();
   bool equals( const PhraseAlignment& );
 };
@@ -106,16 +104,14 @@ int main(int argc, char* argv[])
   vector< PhraseAlignment > phrasePairsWithSameF;
   int i=0;
   int fileCount = 0;
-  while(true) {
-    if (extractFileP.eof()) break;
+
+  string line;
+  while(getline(extractFileP, line)) {
+    // no eof() test here: a successful getline() may set eofbit on a final line that lacks a trailing newline
     if (++i % 100000 == 0) cerr << "." << flush;
-    char line[LINE_MAX_LENGTH];
-    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-    //    if (fileCount>0)
-    if (extractFileP.eof())
-      break;
+
     PhraseAlignment phrasePair;
-    bool isPhrasePair = phrasePair.create( line, i );
+    bool isPhrasePair = phrasePair.create( line.c_str(), i );
     if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
       processPhrasePairs( phrasePairsWithSameF );
       for(size_t j=0; j<phrasePairsWithSameF.size(); j++)
@@ -124,7 +120,7 @@ int main(int argc, char* argv[])
       phraseTableE.clear();
       phraseTableF.clear();
       phrasePair.clear(); // process line again, since phrase tables flushed
-      phrasePair.create( line, i );
+      phrasePair.create( line.c_str(), i );
       phrasePairBase = 0;
     }
     lastForeign = phrasePair.foreign;
@@ -242,7 +238,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
   }
 }
 
-bool PhraseAlignment::create( char line[], int lineID )
+bool PhraseAlignment::create(const char line[], int lineID )
 {
   vector< string > token = tokenize( line );
   int item = 1;
@@ -321,16 +317,14 @@ void LexicalTable::load( const string &filePath )
   }
   istream *inFileP = &inFile;
 
-  char line[LINE_MAX_LENGTH];
+  string line;
 
   int i=0;
-  while(true) {
+  while(getline(*inFileP, line)) {
     i++;
     if (i%100000 == 0) cerr << "." << flush;
-    SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (inFileP->eof()) break;
 
-    vector<string> token = tokenize( line );
+    vector<string> token = tokenize( line.c_str() );
     if (token.size() != 3) {
       cerr << "line " << i << " in " << filePath << " has wrong number of tokens, skipping:\n" <<
            token.size() << " " << token[0] << " " << line << endl;

From d68257c34d05ab278f2b043bb208403cb4e98872 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Sun, 8 Jun 2014 16:37:12 +0100
Subject: [PATCH 19/27] use standard c++ getline instead of old Moses
 SAFE_GETLINE

---
 .../PhraseDictionaryMultiModelCounts.cpp      | 14 ++++-------
 .../fuzzy-match/FuzzyMatchWrapper.cpp         | 24 +++++++------------
 .../fuzzy-match/SuffixArray.cpp               | 15 +++++-------
 .../TranslationModel/fuzzy-match/Vocabulary.h | 14 -----------
 4 files changed, 18 insertions(+), 49 deletions(-)

diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
index 04bb321d0..99d3ad256 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
+++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
@@ -17,12 +17,8 @@ License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 ***********************************************************************/
 #include "util/exception.hh"
-
 #include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
 
-#define LINE_MAX_LENGTH 100000
-#include "phrase-extract/SafeGetline.h" // for SAFE_GETLINE()
-
 using namespace std;
 
 template<typename T>
@@ -461,16 +457,14 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic
   }
   istream *inFileP = &inFile;
 
-  char line[LINE_MAX_LENGTH];
-
   int i=0;
-  while(true) {
+  string line;
+
+  while(getline(*inFileP, line)) {
     i++;
     if (i%100000 == 0) cerr << "." << flush;
-    SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (inFileP->eof()) break;
 
-    vector<string> token = tokenize( line );
+    vector<string> token = tokenize( line.c_str() );
     if (token.size() != 4) {
       cerr << "line " << i << " in " << fileName
            << " has wrong number of tokens, skipping:\n"
diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
index fc68e1f0d..8766743b3 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
@@ -413,11 +413,9 @@ void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector
 
   istream *fileStreamP = &fileStream;
 
-  char line[LINE_MAX_LENGTH];
-  while(true) {
-    SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
-    if (fileStreamP->eof()) break;
-    corpus.push_back( GetVocabulary().Tokenize( line ) );
+  string line;
+  while(getline(*fileStreamP, line)) {
+    corpus.push_back( GetVocabulary().Tokenize( line.c_str() ) );
   }
 }
 
@@ -436,12 +434,9 @@ void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector<
   WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");
 
   int lineNum = 0;
-  char line[LINE_MAX_LENGTH];
-  while(true) {
-    SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
-    if (fileStreamP->eof()) break;
-
-    vector<WORD_ID> toks = GetVocabulary().Tokenize( line );
+  string line;
+  while(getline(*fileStreamP, line)) {
+    vector<WORD_ID> toks = GetVocabulary().Tokenize( line.c_str() );
 
     corpus.push_back(vector< SentenceAlignment >());
     vector< SentenceAlignment > &vec = corpus.back();
@@ -493,11 +488,8 @@ void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vect
   string delimiter = "|||";
 
   int lineNum = 0;
-  char line[LINE_MAX_LENGTH];
-  while(true) {
-    SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
-    if (fileStreamP->eof()) break;
-
+  string line;
+  while(getline(*fileStreamP, line)) {
     vector< SentenceAlignment > &vec = corpus[lineNum];
     size_t targetInd = 0;
     SentenceAlignment *sentence = &vec[targetInd];
diff --git a/moses/TranslationModel/fuzzy-match/SuffixArray.cpp b/moses/TranslationModel/fuzzy-match/SuffixArray.cpp
index 536bff741..2930147ab 100644
--- a/moses/TranslationModel/fuzzy-match/SuffixArray.cpp
+++ b/moses/TranslationModel/fuzzy-match/SuffixArray.cpp
@@ -14,17 +14,16 @@ SuffixArray::SuffixArray( string fileName )
   m_endOfSentence = m_vcb.StoreIfNew( "<s>" );
 
   ifstream extractFile;
-  char line[LINE_MAX_LENGTH];
 
   // count the number of words first;
   extractFile.open(fileName.c_str());
   istream *fileP = &extractFile;
   m_size = 0;
   size_t sentenceCount = 0;
-  while(!fileP->eof()) {
-    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
-    if (fileP->eof()) break;
-    vector< WORD_ID > words = m_vcb.Tokenize( line );
+  string line;
+  while(getline(*fileP, line)) {
+
+    vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
     m_size += words.size() + 1;
     sentenceCount++;
   }
@@ -43,10 +42,8 @@ SuffixArray::SuffixArray( string fileName )
   int sentenceId = 0;
   extractFile.open(fileName.c_str());
   fileP = &extractFile;
-  while(!fileP->eof()) {
-    SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
-    if (fileP->eof()) break;
-    vector< WORD_ID > words = m_vcb.Tokenize( line );
+  while(getline(*fileP, line)) {
+    vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
 
     // add to corpus vector
     corpus.push_back(words);
diff --git a/moses/TranslationModel/fuzzy-match/Vocabulary.h b/moses/TranslationModel/fuzzy-match/Vocabulary.h
index dfa11c1db..5a79e2f26 100644
--- a/moses/TranslationModel/fuzzy-match/Vocabulary.h
+++ b/moses/TranslationModel/fuzzy-match/Vocabulary.h
@@ -17,20 +17,6 @@
 
 namespace tmmt
 {
-
-#define MAX_LENGTH 10000
-
-#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
-                _IS.getline(_LINE, _SIZE, _DELIM); \
-                if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
-                if (_IS.gcount() == _SIZE-1) { \
-                  cerr << "Line too long! Buffer overflow. Delete lines >=" \
-                    << _SIZE << " chars or raise MAX_LENGTH in phrase-extract/tables-core.cpp" \
-                    << endl; \
-                    exit(1); \
-                } \
-              }
-
 typedef std::string WORD;
 typedef unsigned int WORD_ID;
 

From 1b667e3e24620fb55fb7f62d3d643455521cdcb4 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Sun, 8 Jun 2014 17:07:12 +0100
Subject: [PATCH 20/27] delete any mention of SAFE_GETLINE so it doesn't
 reappear

---
 phrase-extract/DomainFeature.cpp            |  1 -
 phrase-extract/ExtractionPhrasePair.cpp     |  1 -
 phrase-extract/SafeGetline.h                | 35 ---------------------
 phrase-extract/consolidate-main.cpp         |  1 -
 phrase-extract/consolidate-reverse-main.cpp |  1 -
 phrase-extract/extract-main.cpp             |  1 -
 phrase-extract/extract-ordering-main.cpp    |  1 -
 phrase-extract/extract-rules-main.cpp       |  1 -
 phrase-extract/score-main.cpp               |  1 -
 phrase-extract/statistics-main.cpp          |  1 -
 10 files changed, 44 deletions(-)
 delete mode 100644 phrase-extract/SafeGetline.h

diff --git a/phrase-extract/DomainFeature.cpp b/phrase-extract/DomainFeature.cpp
index 337364b1d..99f0713a7 100644
--- a/phrase-extract/DomainFeature.cpp
+++ b/phrase-extract/DomainFeature.cpp
@@ -2,7 +2,6 @@
 #include "ExtractionPhrasePair.h"
 #include "tables-core.h"
 #include "InputFileStream.h"
-#include "SafeGetline.h"
 
 using namespace std;
 
diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp
index f70d106d1..2b26c2ad6 100644
--- a/phrase-extract/ExtractionPhrasePair.cpp
+++ b/phrase-extract/ExtractionPhrasePair.cpp
@@ -19,7 +19,6 @@
 
 #include <sstream>
 #include "ExtractionPhrasePair.h"
-#include "SafeGetline.h"
 #include "tables-core.h"
 #include "score.h"
 #include "moses/Util.h"
diff --git a/phrase-extract/SafeGetline.h b/phrase-extract/SafeGetline.h
deleted file mode 100644
index 0e03b8468..000000000
--- a/phrase-extract/SafeGetline.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#pragma once
-#ifndef SAFE_GETLINE_INCLUDED_
-#define SAFE_GETLINE_INCLUDED_
-
-#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM, _FILE) {            \
-    _IS.getline(_LINE, _SIZE, _DELIM);                              \
-    if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear();         \
-    if (_IS.gcount() == _SIZE-1) {                                  \
-      cerr << "Line too long! Buffer overflow. Delete lines >="     \
-       << _SIZE << " chars or raise LINE_MAX_LENGTH in " << _FILE   \
-       << endl;                                                     \
-      exit(1);                                                      \
-    }                                                               \
-  }
-
-#endif
diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp
index c57cc7747..43d912b81 100644
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@@ -26,7 +26,6 @@
 #include <cstring>
 
 #include "tables-core.h"
-#include "SafeGetline.h"
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
 
diff --git a/phrase-extract/consolidate-reverse-main.cpp b/phrase-extract/consolidate-reverse-main.cpp
index 891773418..ce59315b9 100644
--- a/phrase-extract/consolidate-reverse-main.cpp
+++ b/phrase-extract/consolidate-reverse-main.cpp
@@ -27,7 +27,6 @@
 #include <cstring>
 
 #include "tables-core.h"
-#include "SafeGetline.h"
 #include "InputFileStream.h"
 
 using namespace std;
diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp
index 698599a10..fe3d99cd2 100644
--- a/phrase-extract/extract-main.cpp
+++ b/phrase-extract/extract-main.cpp
@@ -19,7 +19,6 @@
 #include <set>
 #include <vector>
 
-#include "SafeGetline.h"
 #include "SentenceAlignment.h"
 #include "tables-core.h"
 #include "InputFileStream.h"
diff --git a/phrase-extract/extract-ordering-main.cpp b/phrase-extract/extract-ordering-main.cpp
index 78132d4fd..b418ba24d 100644
--- a/phrase-extract/extract-ordering-main.cpp
+++ b/phrase-extract/extract-ordering-main.cpp
@@ -19,7 +19,6 @@
 #include <set>
 #include <vector>
 
-#include "SafeGetline.h"
 #include "SentenceAlignment.h"
 #include "tables-core.h"
 #include "InputFileStream.h"
diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp
index 30963f32b..592946b0d 100644
--- a/phrase-extract/extract-rules-main.cpp
+++ b/phrase-extract/extract-rules-main.cpp
@@ -39,7 +39,6 @@
 #include "Hole.h"
 #include "HoleCollection.h"
 #include "RuleExist.h"
-#include "SafeGetline.h"
 #include "SentenceAlignmentWithSyntax.h"
 #include "SyntaxTree.h"
 #include "tables-core.h"
diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp
index dfb5103f4..3ab6e2fd3 100644
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@@ -29,7 +29,6 @@
 #include <vector>
 #include <algorithm>
 
-#include "SafeGetline.h"
 #include "ScoreFeature.h"
 #include "tables-core.h"
 #include "ExtractionPhrasePair.h"
diff --git a/phrase-extract/statistics-main.cpp b/phrase-extract/statistics-main.cpp
index f1563dc05..9d814ed76 100644
--- a/phrase-extract/statistics-main.cpp
+++ b/phrase-extract/statistics-main.cpp
@@ -12,7 +12,6 @@
 #include <time.h>
 
 #include "AlignmentPhrase.h"
-#include "SafeGetline.h"
 #include "tables-core.h"
 #include "InputFileStream.h"
 

From 29d83d94b109b0b2e6fc134692d61d824656c1e6 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Sun, 8 Jun 2014 17:18:07 +0100
Subject: [PATCH 21/27] delete any mention of SAFE_GETLINE so it doesn't
 reappear

---
 phrase-extract/relax-parse-main.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp
index c04cae85b..e5feb94d0 100644
--- a/phrase-extract/relax-parse-main.cpp
+++ b/phrase-extract/relax-parse-main.cpp
@@ -20,8 +20,6 @@
  ***********************************************************************/
 
 #include "relax-parse.h"
-
-#include "SafeGetline.h"
 #include "tables-core.h"
 
 using namespace std;

From 169c3fce383bc66ae580884bfa72d60712beffef Mon Sep 17 00:00:00 2001
From: Rico Sennrich <rico.sennrich@gmx.ch>
Date: Mon, 9 Jun 2014 15:24:41 +0100
Subject: [PATCH 22/27] convert CoNNL-X to Moses XML format

---
 scripts/training/wrappers/conll2mosesxml.py | 188 ++++++++++++++++++++
 1 file changed, 188 insertions(+)
 create mode 100755 scripts/training/wrappers/conll2mosesxml.py

diff --git a/scripts/training/wrappers/conll2mosesxml.py b/scripts/training/wrappers/conll2mosesxml.py
new file mode 100755
index 000000000..d85695b16
--- /dev/null
+++ b/scripts/training/wrappers/conll2mosesxml.py
@@ -0,0 +1,188 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+
+# takes a file in the CoNLL dependency format (from the CoNLL-X shared task on dependency parsing; http://ilk.uvt.nl/conll/#dataformat )
+# and produces Moses XML format. Note that the structure is built based on fields 9 and 10 (projective HEAD and RELATION),
+# which not all parsers produce.
+
+# usage: conll2mosesxml.py [--brackets] < input_file > output_file
+
+from __future__ import print_function, unicode_literals
+import sys
+import re
+import codecs
+from collections import namedtuple,defaultdict
+from lxml import etree as ET
+
+
+Word = namedtuple('Word', ['pos','word','lemma','tag','head','func', 'proj_head', 'proj_func'])
+
+def main(output_format='xml'):
+    """Read CoNLL-X lines from stdin; at each blank line, emit the collected sentence."""
+    sentence = []
+    for line in sys.stdin:
+
+        # process sentence
+        if line == "\n":
+            sentence.insert(0,[])
+            if is_projective(sentence):
+                write(sentence,output_format)
+            else:
+                sys.stderr.write(' '.join(w.word for w in sentence[1:]) + '\n')
+                sys.stdout.write('\n')
+            sentence = []
+            continue
+
+        try:
+            pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = line.split()
+        except ValueError: # word may be unicode whitespace; split on non-empty space/tab runs only (empty matches break Python >= 3.7)
+            pos, word, lemma, tag, tag2, morph, head, func, proj_head, proj_func = re.split(' +\t*|\t+',line.strip())
+
+        word = escape_special_chars(word)
+        lemma = escape_special_chars(lemma)
+
+        if proj_head == '_':
+            proj_head = head
+            proj_func = func
+
+        sentence.append(Word(int(pos), word, lemma, tag2,int(head), func, int(proj_head), proj_func))
+
+
+# this script performs the same escaping as escape-special-chars.perl in Moses.
+# most of it is done in function write(), but quotation marks need to be processed first
+def escape_special_chars(line):
+
+    line = line.replace('\'','&apos;') # xml
+    line = line.replace('"','&quot;') # xml
+
+    return line
+
+
+# make a check if structure is projective
+def is_projective(sentence):
+    dominates = defaultdict(set)
+    for i,w in enumerate(sentence):
+        dominates[i].add(i)
+        if not i:
+            continue
+        head = int(w.proj_head)
+        while head != 0:
+            if i in dominates[head]:
+                break
+            dominates[head].add(i)
+            head = int(sentence[head].proj_head)
+
+    for i in dominates:
+        dependents = dominates[i]
+        if max(dependents) - min(dependents) != len(dependents)-1:
+            sys.stderr.write("error: non-projective structure.\n")
+            return False
+    return True
+
+
+def write(sentence, output_format='xml'):
+
+    if output_format == 'xml':
+        tree = create_subtree(0,sentence)
+        out = ET.tostring(tree, encoding = 'UTF-8').decode('UTF-8')
+
+    if output_format == 'brackets':
+        out = create_brackets(0,sentence)
+
+    out = out.replace('|','&#124;') # factor separator
+    out = out.replace('[','&#91;') # syntax non-terminal
+    out = out.replace(']','&#93;') # syntax non-terminal
+
+    out = out.replace('&amp;apos;','&apos;') # lxml is buggy if input is escaped
+    out = out.replace('&amp;quot;','&quot;') # lxml is buggy if input is escaped
+
+    print(out)
+
+# write node in Moses XML format
+def create_subtree(position, sentence):
+
+    element = ET.Element('tree')
+
+    if position:
+        element.set('label', sentence[position].proj_func)
+    else:
+        element.set('label', 'sent')
+
+    for i in range(1,position):
+        if sentence[i].proj_head == position:
+            element.append(create_subtree(i, sentence))
+
+    if position:
+
+        if preterminals:
+            head = ET.Element('tree')
+            head.set('label', sentence[position].tag)
+            head.text = sentence[position].word
+            element.append(head)
+
+        else:
+            if len(element):
+                element[-1].tail = sentence[position].word
+            else:
+                element.text = sentence[position].word
+
+    for i in range(position, len(sentence)):
+        if i and sentence[i].proj_head == position:
+            element.append(create_subtree(i, sentence))
+
+    return element
+
+
+# write node in bracket format (Penn treebank style)
+def create_brackets(position, sentence):
+
+    if position:
+        element = "( " + sentence[position].proj_func + ' '
+    else:
+        element = "( sent "
+
+    for i in range(1,position):
+        if sentence[i].proj_head == position:
+            element += create_brackets(i, sentence)
+
+    if position:
+        word = sentence[position].word
+        if word == ')':
+            word = 'RBR'
+        elif word == '(':
+            word = 'LBR'
+
+        tag = sentence[position].tag
+        if tag == '$(':
+            tag = '$BR'
+
+        if preterminals:
+            element += '( ' + tag + ' ' + word + ' ) '
+        else:
+            element += word + ' ) '
+
+    for i in range(position, len(sentence)):
+        if i and sentence[i].proj_head == position:
+            element += create_brackets(i, sentence)
+
+    if preterminals or not position:
+        element += ') '
+
+    return element
+
+if __name__ == '__main__':
+    if sys.version_info < (3,0,0):
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+
+    if '--no_preterminals' in sys.argv:
+        preterminals = False
+    else:
+        preterminals = True
+
+    if '--brackets' in sys.argv:
+        main('brackets')
+    else:
+        main('xml')

From 8edb3444925a2af26297189adb46d1a9aabe855d Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Tue, 10 Jun 2014 10:16:17 +0100
Subject: [PATCH 23/27] =?UTF-8?q?minor=20const=C2=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 moses/PP/PhraseProperty.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/moses/PP/PhraseProperty.h b/moses/PP/PhraseProperty.h
index b977787b2..a4353e634 100644
--- a/moses/PP/PhraseProperty.h
+++ b/moses/PP/PhraseProperty.h
@@ -15,7 +15,7 @@ public:
 
   virtual void ProcessValue() {};
 
-  const std::string &GetValueString() { return m_value; };
+  const std::string &GetValueString() const { return m_value; };
 
 protected:
 

From d1554c4cdce78dd630b3242799f7a5d24f2268cc Mon Sep 17 00:00:00 2001
From: XapaJIaMnu <nheart@gmail.com>
Date: Tue, 10 Jun 2014 16:28:46 +0100
Subject: [PATCH 24/27] Add moses speedtesting framework and readmes.

---
 contrib/moses-speedtest/README.md             | 122 ++++++++
 .../moses-speedtest/check_for_regression.py   |  63 ++++
 contrib/moses-speedtest/cronjob               |   7 +
 contrib/moses-speedtest/helpers/README.md     |   5 +
 .../helpers/sys_drop_caches.py                |  22 ++
 contrib/moses-speedtest/html/README.md        |   5 +
 contrib/moses-speedtest/html/index.html       |  32 ++
 contrib/moses-speedtest/html/style.css        |  21 ++
 contrib/moses-speedtest/html_gen.py           | 192 ++++++++++++
 contrib/moses-speedtest/runtests.py           | 293 ++++++++++++++++++
 contrib/moses-speedtest/sys_drop_caches.py    |  22 ++
 contrib/moses-speedtest/test_config           |   3 +
 contrib/moses-speedtest/testsuite_common.py   |  54 ++++
 contrib/moses-speedtest/testsuite_config      |   5 +
 14 files changed, 846 insertions(+)
 create mode 100644 contrib/moses-speedtest/README.md
 create mode 100644 contrib/moses-speedtest/check_for_regression.py
 create mode 100644 contrib/moses-speedtest/cronjob
 create mode 100644 contrib/moses-speedtest/helpers/README.md
 create mode 100644 contrib/moses-speedtest/helpers/sys_drop_caches.py
 create mode 100644 contrib/moses-speedtest/html/README.md
 create mode 100644 contrib/moses-speedtest/html/index.html
 create mode 100644 contrib/moses-speedtest/html/style.css
 create mode 100644 contrib/moses-speedtest/html_gen.py
 create mode 100644 contrib/moses-speedtest/runtests.py
 create mode 100644 contrib/moses-speedtest/sys_drop_caches.py
 create mode 100644 contrib/moses-speedtest/test_config
 create mode 100644 contrib/moses-speedtest/testsuite_common.py
 create mode 100644 contrib/moses-speedtest/testsuite_config

diff --git a/contrib/moses-speedtest/README.md b/contrib/moses-speedtest/README.md
new file mode 100644
index 000000000..c95c6a400
--- /dev/null
+++ b/contrib/moses-speedtest/README.md
@@ -0,0 +1,122 @@
+# Moses speedtesting framework 
+
+### Description
+
+This is an automatic test framework that is designed to test the day to day performance changes in Moses.
+
+### Set up
+
+#### Set up a Moses repo
+Set up a Moses repo and build it with the desired configuration.
+```bash
+git clone https://github.com/moses-smt/mosesdecoder.git
+cd mosesdecoder
+./bjam -j10 --with-cmph=/usr/include/
+```
+You need to build Moses first, so that the testsuite knows what command you want it to use when rebuilding against newer revisions.
+
+#### Create a parent directory.
+Create a parent directory where the **runtests.py** and related scripts and configuration file should reside.
+This should also be the location of the TEST_DIR and TEST_LOG_DIR as explained in the next section.
+
+#### Set up a global configuration file.
+You need a configuration file for the testsuite. A sample configuration file is provided in **testsuite\_config**
+<pre>
+MOSES_REPO_PATH: /home/moses-speedtest/moses-standard/mosesdecoder
+DROP_CACHES_COMM: sys_drop_caches 3
+TEST_DIR: /home/moses-speedtest/phrase_tables/tests
+TEST_LOG_DIR: /home/moses-speedtest/phrase_tables/testlogs
+BASEBRANCH: RELEASE-2.1.1
+</pre>
+
+The _MOSES\_REPO\_PATH_ is the place where you have set up and built moses.
+The _DROP\_CACHES\_COMM_ is the command that will be used to drop caches. It should run without needing root access.
+_TEST\_DIR_ is the directory where all the tests will reside.
+_TEST\_LOG\_DIR_ is the directory where the performance logs will be gathered. It should be created before running the testsuite for the first time.
+_BASEBRANCH_ is the branch against which all new tests will be compared. It should normally be set to be the latest Moses stable release.
+
+### Creating tests
+
+In order to create a test one should go into the TEST_DIR and create a new folder. That folder will be used for the name of the test.
+Inside that folder one should place a configuration file named **config**. The naming is mandatory.
+An example such configuration file is **test\_config**
+
+<pre>
+Command: moses -f ... -i fff #Looks for the command in the /bin directory of the repo specified in the testsuite_config
+LDPRE: ldpreloads #Comma separated LD_LIBRARY_PATH:/, 
+Variants: vanilla, cached, ldpre #Can't have cached without ldpre or vanilla
+</pre>
+
+The _Command:_ line specifies the executable (which is looked up in the /bin directory of the repo.) and any arguments necessary. Before running the test, the script cds to the current test directory so you can use relative paths.
+The _LDPRE:_ specifies if tests should be run with any LD\_PRELOAD flags.
+The _Variants:_ line specifies what type of tests should we run. This particular line will run the following tests:
+1. A Vanilla test meaning just the command after _Command_ will be issued.
+2. A vanilla cached test meaning that after the vanilla test, the test will be run again without dropping caches in order to benchmark performance on cached filesystem.
+3. A test with LD_PRELOAD ldpreloads moses -f command. For each available LDPRELOAD comma separated library to preload.
+4. A cached version of all LD_PRELOAD tests.
+
+### Running tests.
+Running the tests is done through the **runtests.py** script.
+
+#### Running all tests.
+To run all tests against the base branch and the latest revision (generating new base-branch test data if it is missing), run:
+```bash
+python3 runtests.py -c testsuite_config
+```
+
+#### Running specific tests.
+The script allows the user to manually run a particular test or to test against a specific branch or revision:
+<pre>
+moses-speedtest@crom:~/phrase_tables$ python3 runtests.py --help
+usage: runtests.py [-h] -c CONFIGFILE [-s SINGLETESTDIR] [-r REVISION]
+                   [-b BRANCH]
+
+A python based speedtest suite for moses.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -c CONFIGFILE, --configfile CONFIGFILE
+                        Specify test config file
+  -s SINGLETESTDIR, --singletest SINGLETESTDIR
+                        Single test name directory. Specify directory name,
+                        not full path!
+  -r REVISION, --revision REVISION
+                        Specify a specific revison for the test.
+  -b BRANCH, --branch BRANCH
+                        Specify a branch for the test.
+</pre>
+
+### Generating HTML report.
+To generate a summary of the test results use the **html\_gen.py** script. It places a file named *index.html* in the current script directory.
+```bash
+python3 html_gen.py testsuite_config
+```
+You should use the generated file with the **style.css** file provided in the html directory.
+
+### Command line regression testing.
+Alternatively you could check for regressions from the command line using the **check\_for\_regression.py** script:
+```bash
+python3 check_for_regression.py TESTLOGS_DIRECTORY
+```
+
+The results of all tests are also logged inside the specified TESTLOGS directory, so you can manually check them for additional information such as date, time, revision, branch, etc.
+
+### Create a cron job:
+Create a cron job to run the tests daily and generate an html report. An example *cronjob* is available.
+```bash
+#!/bin/sh
+cd /home/moses-speedtest/phrase_tables
+
+python3 runtests.py -c testsuite_config #Run the tests.
+python3 html_gen.py testsuite_config #Generate html
+
+cp index.html /fs/thor4/html/www/speed-test/ #Update the html
+```
+
+Place the script in _/etc/cron.daily_ for daily testing.
+
+###### Author
+Nikolay Bogoychev, 2014
+
+###### License
+This software is licensed under the LGPL.
\ No newline at end of file
diff --git a/contrib/moses-speedtest/check_for_regression.py b/contrib/moses-speedtest/check_for_regression.py
new file mode 100644
index 000000000..1e269c0c6
--- /dev/null
+++ b/contrib/moses-speedtest/check_for_regression.py
@@ -0,0 +1,63 @@
+"""Checks if any of the latests tests has performed considerably different than
+ the previous ones. Takes the log directory as an argument."""
+import os
+import sys
+from testsuite_common import Result, processLogLine, bcolors, getLastTwoLines
+
+LOGDIR = sys.argv[1] #Get the log directory as an argument
+PERCENTAGE = 5 #Default threshold for how much a test should change
+if len(sys.argv) == 3:
+    PERCENTAGE = float(sys.argv[2]) #Default is 5, but an optional second
+    #command line parameter overrides it
+
+def printResults(regressed, better, unchanged, firsttime):
+    """Pretty print the results in different colours.
+    regressed, better and unchanged hold Result objects, firsttime holds
+    the parsed log lines of tests that have only been run once.
+    Result.percentage is handled as a fraction (html_gen.py treats 0.05
+    as 5%), so it is scaled by 100 before it is printed next to '%'."""
+    def describe(item):
+        """Shared 'Was/Is/Change/Revision' tail of a changed test."""
+        return item.testname + " Was: " + str(item.previous) + " Is: " +\
+            str(item.current) + " Change: " +\
+            str(round(abs(item.percentage) * 100, 2)) + "%. Revision: " +\
+            item.revision + bcolors.ENDC
+    for item in regressed:
+        print(bcolors.RED + "REGRESSION! " + describe(item))
+    print('\n')
+    for item in unchanged:
+        print(bcolors.BLUE + "UNCHANGED: " + item.testname + " Revision: " +\
+            item.revision + bcolors.ENDC)
+    print('\n')
+    for item in better:
+        print(bcolors.GREEN + "IMPROVEMENT! " + describe(item))
+    for item in firsttime:
+        print(bcolors.PURPLE + "First time test! " + item.testname +\
+            " Took: " + str(item.real) +  " seconds. Revision: " +\
+            item.revision + bcolors.ENDC)
+
+
+all_files = os.listdir(LOGDIR)
+regressed = []
+better = []
+unchanged = []
+firsttime = []
+
+#Go through all log files and sort the tests by how their time has changed.
+for logfile in all_files:
+    (line1, line2) = getLastTwoLines(logfile, LOGDIR)
+    log1 = processLogLine(line1)
+    if line2 == '\n': # Empty line, only one test ever run
+        firsttime.append(log1)
+        continue
+    log2 = processLogLine(line2)
+    res = Result(log1.testname, log1.real, log2.real, log2.revision,\
+    log2.branch, log1.revision, log1.branch)
+    if res.percentage * 100 < -PERCENTAGE: #percentage is a fraction, scale it
+        regressed.append(res)
+    elif res.percentage * 100 > PERCENTAGE: #positive means the test got faster
+        better.append(res)
+    else:
+        unchanged.append(res)
+
+printResults(regressed, better, unchanged, firsttime)
diff --git a/contrib/moses-speedtest/cronjob b/contrib/moses-speedtest/cronjob
new file mode 100644
index 000000000..4f7183a48
--- /dev/null
+++ b/contrib/moses-speedtest/cronjob
@@ -0,0 +1,7 @@
+#!/bin/sh
+cd /home/moses-speedtest/phrase_tables || exit 1 #Never run from the wrong directory
+
+python3 runtests.py -c testsuite_config #Run the tests.
+python3 html_gen.py testsuite_config #Generate html
+
+cp index.html /fs/thor4/html/www/speed-test/ #Update the html
\ No newline at end of file
diff --git a/contrib/moses-speedtest/helpers/README.md b/contrib/moses-speedtest/helpers/README.md
new file mode 100644
index 000000000..87efbc78f
--- /dev/null
+++ b/contrib/moses-speedtest/helpers/README.md
@@ -0,0 +1,5 @@
+### Helpers
+
+This is a python script that basically gives you the equivalent of:
+```echo 3 > /proc/sys/vm/drop_caches```
+You need to set it up so it is executed with root access without needing a password so that the tests can be automated.
\ No newline at end of file
diff --git a/contrib/moses-speedtest/helpers/sys_drop_caches.py b/contrib/moses-speedtest/helpers/sys_drop_caches.py
new file mode 100644
index 000000000..d4796e090
--- /dev/null
+++ b/contrib/moses-speedtest/helpers/sys_drop_caches.py
@@ -0,0 +1,22 @@
+#!/usr/bin/spython
+from sys import argv, stderr, exit
+from os import linesep as ls
+procfile = "/proc/sys/vm/drop_caches" # File the chosen option is written to
+options = ["1","2","3"] # Accepted values of the first argument
+flush_type = None
+try:
+    flush_type = argv[1][0:1] # Only the first character of the argument is used
+    if not flush_type in options:
+        raise IndexError, "not in options" # Reported like a missing argument
+    with open(procfile, "w") as f:
+        f.write("%s%s" % (flush_type,ls))
+    exit(0) # Success
+except IndexError, e:
+    stderr.write("Argument %s required.%s" % (options, ls))
+except IOError, e:
+    stderr.write("Error writing to file.%s" % ls)
+except StandardError, e:
+    stderr.write("Unknown Error.%s" % ls)
+
+exit(1) # Failure: one of the handlers above has reported the error
+
diff --git a/contrib/moses-speedtest/html/README.md b/contrib/moses-speedtest/html/README.md
new file mode 100644
index 000000000..342a8cedf
--- /dev/null
+++ b/contrib/moses-speedtest/html/README.md
@@ -0,0 +1,5 @@
+### HTML files.
+
+_index.html_ is a sample file generated by this testsuite.
+
+_style.css_ should be placed in the html directory in which _index.html_ will be placed in order to visualize the test results in a browser.
diff --git a/contrib/moses-speedtest/html/index.html b/contrib/moses-speedtest/html/index.html
new file mode 100644
index 000000000..fc75b1028
--- /dev/null
+++ b/contrib/moses-speedtest/html/index.html
@@ -0,0 +1,32 @@
+<html>
+<head>
+<title>Moses speed testing</title>
+<link rel="stylesheet" type="text/css" href="style.css"></head><body><text><b>Basebranch:</b> RELEASE-2.1 <b>Revision:</b> c977ca2f434ed6f12a352806c088061c492b1676</text><table><tr class="heading">
+  <th>Date</th>
+  <th>Time</th> 
+  <th>Testname</th>
+  <th>Revision</th>
+  <th>Branch</th> 
+  <th>Time</th>
+  <th>Prevtime</th>
+  <th>Prevrev</th> 
+  <th>Change (%)</th>
+  <th>Time (Basebranch)</th> 
+  <th>Change (%, Basebranch)</th>
+  <th>Time (Days -2)</th> 
+  <th>Change (%, Days -2)</th>
+  <th>Time (Days -3)</th> 
+  <th>Change (%, Days -3)</th>
+  <th>Time (Days -4)</th> 
+  <th>Change (%, Days -4)</th>
+  <th>Time (Days -5)</th> 
+  <th>Change (%, Days -5)</th>
+  <th>Time (Days -6)</th> 
+  <th>Change (%, Days -6)</th>
+  <th>Time (Days -7)</th> 
+  <th>Change (%, Days -7)</th>
+  <th>Time (Days -14)</th> 
+  <th>Change (%, Days -14)</th>
+  <th>Time (Years -1)</th> 
+  <th>Change (%, Years -1)</th>
+ </tr><tr><td>10.06.2014</td><td>10:27:57</td><td>ondisk_minreord_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>21.36</td><td>21.49</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">0.006</td><td>25.89</td><td class="better">0.1699</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:29:38</td><td>minpt_reord_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>9.73</td><td>9.52</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0221</td><td>12.2</td><td class="better">0.2197</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:22:32</td><td>ondisk_hierarchical_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>25.73</td><td>25.77</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">0.0016</td><td>33.63</td><td class="better">0.2337</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:22:06</td><td>ondisk_hierarchical_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>83.2</td><td>82.6</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0073</td><td>127.59</td><td 
class="better">0.3526</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:28:57</td><td>binary_reord_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>24.54</td><td>24.85</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">0.0125</td><td>29.09</td><td class="better">0.1458</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:28:08</td><td>ondisk_minreord_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>10.71</td><td>10.54</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0161</td><td>14.82</td><td class="better">0.2888</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:30:00</td><td>binary_minreord_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>20.82</td><td>20.77</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0024</td><td>25.77</td><td class="better">0.194</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:27:35</td><td>score.hiero_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>131.37</td><td>130.63</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0057</td><td>141.85</td><td 
class="better">0.0791</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:29:10</td><td>binary_reord_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>13.41</td><td>13.4</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0007</td><td>18.12</td><td class="better">0.2605</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:29:28</td><td>minpt_reord_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>17.46</td><td>17.37</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0052</td><td>20.0</td><td class="better">0.1315</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:28:22</td><td>minpt_minreord_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>13.75</td><td>13.56</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.014</td><td>17.19</td><td class="better">0.2112</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:22:59</td><td>ondisk_reord_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>25.28</td><td>25.0</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0112</td><td>29.11</td><td 
class="better">0.1412</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:28:31</td><td>minpt_minreord_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>8.63</td><td>8.6</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0035</td><td>11.78</td><td class="better">0.2699</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:23:10</td><td>ondisk_reord_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>11.57</td><td>11.59</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">0.0017</td><td>15.4</td><td class="better">0.2474</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:25:24</td><td>score.hiero_vanilla</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>132.33</td><td>130.02</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">-0.0178</td><td>141.35</td><td class="better">0.0802</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr><tr><td>10.06.2014</td><td>10:30:12</td><td>binary_minreord_vanilla_cached</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td>master</td><td>12.47</td><td>12.61</td><td>169c3fce383bc66ae580884bfa72d60712beffef</td><td class="unchanged">0.0111</td><td>17.89</td><td 
class="better">0.2951</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td><td>N/A</td></tr></table></body></html>
diff --git a/contrib/moses-speedtest/html/style.css b/contrib/moses-speedtest/html/style.css
new file mode 100644
index 000000000..16221f91f
--- /dev/null
+++ b/contrib/moses-speedtest/html/style.css
@@ -0,0 +1,21 @@
+/* Report table: thin black borders that are shared by adjacent cells. */
+table, th, td {
+    border: 1px solid black;
+    border-collapse: collapse;
+}
+
+tr:nth-child(odd) {
+    background-color: Gainsboro;
+}
+
+.better {
+    color: Green;
+}
+
+.worse {
+    color: Red;
+}
+
+.unchanged {
+    color: SkyBlue;
+}
\ No newline at end of file
diff --git a/contrib/moses-speedtest/html_gen.py b/contrib/moses-speedtest/html_gen.py
new file mode 100644
index 000000000..4564b9200
--- /dev/null
+++ b/contrib/moses-speedtest/html_gen.py
@@ -0,0 +1,192 @@
+"""Generates HTML page containing the testresults"""
+from testsuite_common import Result, processLogLine, getLastTwoLines
+from runtests import parse_testconfig
+import os
+import sys
+
+from datetime import datetime, timedelta
+
+HTML_HEADING = """<html>
+<head>
+<title>Moses speed testing</title>
+<link rel="stylesheet" type="text/css" href="style.css"></head><body>""" #style.css is referenced relative to the page
+HTML_ENDING = "</table></body></html>\n" #Closes the table opened by TABLE_HEADING
+
+TABLE_HEADING = """<table><tr class="heading">
+  <th>Date</th>
+  <th>Time</th> 
+  <th>Testname</th>
+  <th>Revision</th>
+  <th>Branch</th> 
+  <th>Time</th>
+  <th>Prevtime</th>
+  <th>Prevrev</th> 
+  <th>Change (%)</th>
+  <th>Time (Basebranch)</th> 
+  <th>Change (%, Basebranch)</th>
+  <th>Time (Days -2)</th> 
+  <th>Change (%, Days -2)</th>
+  <th>Time (Days -3)</th> 
+  <th>Change (%, Days -3)</th>
+  <th>Time (Days -4)</th> 
+  <th>Change (%, Days -4)</th>
+  <th>Time (Days -5)</th> 
+  <th>Change (%, Days -5)</th>
+  <th>Time (Days -6)</th> 
+  <th>Change (%, Days -6)</th>
+  <th>Time (Days -7)</th> 
+  <th>Change (%, Days -7)</th>
+  <th>Time (Days -14)</th> 
+  <th>Change (%, Days -14)</th>
+  <th>Time (Years -1)</th> 
+  <th>Change (%, Years -1)</th>
+ </tr>""" #One <th> for every cell that produce_html writes into a row
+
+def get_prev_days(date, numdays):
+    """Returns the date lying numdays days before date. Both the argument
+    and the result are strings in the log file format DD.MM.YYYY."""
+    date_format = '%d.%m.%Y'
+    earlier = datetime.strptime(date, date_format) - timedelta(days=numdays)
+    return earlier.strftime(date_format)
+
+def gather_necessary_lines(logfile, date):
+    """Gathers the log lines recorded on selected days before date.
+
+    Returns a dictionary that maps each past date (DD.MM.YYYY) to a tuple
+    (offset, logline): offset is the day distance as a string such as
+    '-2' and logline is the parsed line of that day, or None when the
+    log has no entry for it. If a day has several entries the last one
+    in the file is kept. Blank lines in the log are skipped."""
+    #Get a dictionary of dates
+    dates = {}
+    for numdays in (2, 3, 4, 5, 6, 7, 14, 365):
+        dates[get_prev_days(date, numdays)] = ('-' + str(numdays), None)
+
+    #The with statement closes the log even when a line fails to parse
+    with open(logfile, 'r') as openfile:
+        for line in openfile:
+            fields = line.split()
+            if fields and fields[0] in dates:
+                day = dates[fields[0]][0]
+                dates[fields[0]] = (day, processLogLine(line))
+    return dates
+
+def append_date_to_table(resline):
+    """Returns the html cells for a past date: the time measured on that
+    date (resline.previous, like the basebranch column) and the change."""
+    if resline.percentage > 0.05: #If we have improvement of more than 5%
+        css_class = 'better'
+    elif resline.percentage < -0.05: #We have a regression of more than 5%
+        css_class = 'worse'
+    else:
+        css_class = 'unchanged'
+    return '<td>' + str(resline.previous) + '</td><td class="' + css_class +\
+        '">' + str(resline.percentage) + '</td>'
+
+def compare_rev(filename, rev1, rev2, branch1=False, branch2=False):
+    """Compare the test results of two lines. We can specify either a
+    revision or a branch for comparison. The first rev should be the
+    base version and the second revision should be the later version.
+
+    Returns a Result whose previous values come from the line matching
+    rev1 and whose current values come from the line matching rev2.
+    Raises ValueError if either of them is not found in the file."""
+
+    #In the log file the index of the revision is 2 but the index of
+    #the branch is 12. Alternate those depending on whether we are looking
+    #for a specific revision or branch.
+    firstidx = 12 if branch1 else 2
+    secondidx = 12 if branch2 else 2
+
+    rev1line = ''
+    rev2line = ''
+    #The with statement closes the file even if an error interrupts the scan
+    with open(filename, 'r') as resfile:
+        for line in resfile:
+            fields = line.split()
+            #Blank or truncated lines simply never match
+            if len(fields) > firstidx and rev1 == fields[firstidx]:
+                rev1line = line
+            elif len(fields) > secondidx and rev2 == fields[secondidx]:
+                rev2line = line
+            if rev1line != '' and rev2line != '':
+                break
+    if rev1line == '':
+        raise ValueError('Revision ' + rev1 + " was not found!")
+    if rev2line == '':
+        raise ValueError('Revision ' + rev2 + " was not found!")
+
+    logLine1 = processLogLine(rev1line)
+    logLine2 = processLogLine(rev2line)
+    return Result(logLine1.testname, logLine1.real, logLine2.real,\
+        logLine2.revision, logLine2.branch, logLine1.revision, logLine1.branch)
+
+def produce_html(path, global_config):
+    """Produces the html report and writes it to path: one table row for
+    every log file found in the directory global_config.testlogs."""
+    html = '' #The table HTML
+    basebranch_info = '' #Stays empty if there are no log files at all
+    for filenam in os.listdir(global_config.testlogs):
+        #Generate html for the newest two lines
+        #Get the lines from the log file
+        (ll1, ll2) = getLastTwoLines(filenam, global_config.testlogs)
+        logLine1 = processLogLine(ll1)
+        logLine2 = processLogLine(ll2)
+
+        #Generate html
+        res1 = Result(logLine1.testname, logLine1.real, logLine2.real,\
+            logLine2.revision, logLine2.branch, logLine1.revision, logLine1.branch)
+        html = html + '<tr><td>' + logLine2.date + '</td><td>' + logLine2.time + '</td><td>' +\
+        res1.testname + '</td><td>' + res1.revision + '</td><td>' + res1.branch + '</td><td>' +\
+        str(res1.current) + '</td><td>' + str(res1.previous) + '</td><td>' + res1.prevrev + '</td>'
+
+        #Add fancy colours depending on the change
+        if res1.percentage > 0.05: #If we have improvement of more than 5%
+            html = html +  '<td class="better">' + str(res1.percentage) + '</td>'
+        elif res1.percentage < -0.05: #We have a regression of more than 5%
+            html = html +  '<td class="worse">' + str(res1.percentage) + '</td>'
+        else:
+            html = html +  '<td class="unchanged">' + str(res1.percentage) + '</td>'
+
+        #Get comparison against the base version
+        filenam = global_config.testlogs + '/' + filenam #Get proper directory
+        res2 = compare_rev(filenam, global_config.basebranch, res1.revision, branch1=True)
+        html = html + '<td>' + str(res2.previous) + '</td>'
+
+        #Add fancy colours depending on the change
+        if res2.percentage > 0.05: #If we have improvement of more than 5%
+            html = html +  '<td class="better">' + str(res2.percentage) + '</td>'
+        elif res2.percentage < -0.05: #We have a regression of more than 5%
+            html = html +  '<td class="worse">' + str(res2.percentage) + '</td>'
+        else:
+            html = html +  '<td class="unchanged">' + str(res2.percentage) + '</td>'
+
+        #Add extra dates comparison dating from the beginning of time if they exist
+        past_dates = list(range(2, 8))
+        past_dates.append(14)
+        past_dates.append(365) # Get the 1 year ago day
+        linesdict = gather_necessary_lines(filenam, logLine2.date)
+
+        for days in past_dates:
+            act_date = get_prev_days(logLine2.date, days)
+            if linesdict[act_date][1] is not None:
+                #Index 1 holds the parsed line, index 0 only the day offset
+                logline_date = linesdict[act_date][1]
+                restemp = Result(logline_date.testname, logline_date.real, logLine2.real,\
+                logLine2.revision, logLine2.branch, logline_date.revision, logline_date.branch)
+                html = html + append_date_to_table(restemp)
+            else:
+                html = html + '<td>N/A</td><td>N/A</td>'
+
+        html = html + '</tr>' #End row
+        basebranch_info = '<text><b>Basebranch:</b> ' + res2.prevbranch +\
+        ' <b>Revision:</b> ' + res2.prevrev + '</text>'
+
+    #Write out the file
+    writeoutstr = HTML_HEADING + basebranch_info + TABLE_HEADING + html + HTML_ENDING
+    with open(path, 'w') as writefile:
+        writefile.write(writeoutstr)
+
+if __name__ == '__main__':
+    CONFIG = parse_testconfig(sys.argv[1]) #First argument: the testsuite config file
+    produce_html('index.html', CONFIG) #Relative path, so it lands in the working directory
diff --git a/contrib/moses-speedtest/runtests.py b/contrib/moses-speedtest/runtests.py
new file mode 100644
index 000000000..0978c8ef2
--- /dev/null
+++ b/contrib/moses-speedtest/runtests.py
@@ -0,0 +1,293 @@
+"""Given a config file, runs tests"""
+import os
+import subprocess
+import time
+from argparse import ArgumentParser
+from testsuite_common import processLogLine
+
+def parse_cmd():
+    """Builds the command line parser of the test suite and returns the
+    namespace with the parsed arguments (configfile, singletestdir,
+    revision and branch)."""
+    cmdline = ArgumentParser(
+        description="A python based speedtest suite for moses.")
+    cmdline.add_argument("-c", "--configfile", action="store", required=True,
+                         dest="configfile", help="Specify test config file")
+    #The help texts are kept exactly as they were, including the run of
+    #spaces the original line continuation left inside the -s help.
+    cmdline.add_argument("-s", "--singletest", action="store", default=None,
+                         dest="singletestdir",
+                         help="Single test name directory. Specify directory name,"
+                         "                not full path!")
+    cmdline.add_argument("-r", "--revision", action="store", default=None,
+                         dest="revision",
+                         help="Specify a specific revison for the test.")
+    cmdline.add_argument("-b", "--branch", action="store", default=None,
+                         dest="branch", help="Specify a branch for the test.")
+    return cmdline.parse_args()
+
+def repoinit(testconfig):
+    """Checks out testconfig.branch / testconfig.revision in testconfig.repo and returns the revision string."""
+    revision = ''
+    #Enter the repo; this changes the working directory of the whole process
+    os.chdir(testconfig.repo)
+    #Checkout specific branch, else maintain main branch
+    if testconfig.branch != 'master':
+        subprocess.call(['git', 'checkout', testconfig.branch])
+        rev, _ = subprocess.Popen(['git', 'rev-parse', 'HEAD'],\
+            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
+        revision = str(rev).replace("\\n'", '').replace("b'", '') #Strips the b'...\n' wrapper of the bytes repr
+    else:
+        subprocess.call(['git checkout master'], shell=True)
+
+    #Check out a specific revision if one is set. Else pull master and use its HEAD.
+    if testconfig.revision:
+        subprocess.call(['git', 'checkout', testconfig.revision])
+        revision = testconfig.revision
+    elif testconfig.branch == 'master':
+        subprocess.call(['git pull'], shell=True)
+        rev, _ = subprocess.Popen(['git rev-parse HEAD'], stdout=subprocess.PIPE,\
+            stderr=subprocess.PIPE, shell=True).communicate()
+        revision = str(rev).replace("\\n'", '').replace("b'", '') #Strips the b'...\n' wrapper of the bytes repr
+    
+    return revision
+
+class Configuration:
+    """Holds the settings of one test run: the paths and base version read
+    from the testsuite config plus the command line overrides."""
+    def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev):
+        self.repo, self.drop_caches = repo, drop_caches
+        self.tests, self.testlogs = tests, testlogs
+        self.basebranch, self.baserev = basebranch, baserev
+        #Set afterwards through additional_args() and set_revision()
+        self.singletest = None
+        self.revision = None
+        self.branch = 'master' # Default branch
+
+    def additional_args(self, singletest, revision, branch):
+        """Applies the command line arguments; a revision or branch that
+        is None keeps the value that is already set."""
+        self.singletest = singletest
+        if revision is not None:
+            self.revision = revision
+        if branch is not None:
+            self.branch = branch
+
+    def set_revision(self, revision):
+        """Sets the current revision that is being tested"""
+        self.revision = revision
+
+
+class Test:
+    """Describes a single test: its name, the command to time, the list of
+    LD_PRELOAD options and the variants (permutations) to run."""
+    def __init__(self, name, command, ldopts, permutations):
+        self.name, self.command = name, command
+        self.permutations = permutations
+        self.ldopts = ldopts.replace(' ', '').split(',') #Not tested yet
+
+def parse_configfile(conffile, testdir, moses_repo):
+    """Parses the config file of a single test and returns a Test named
+    after testdir. Everything after a '#' is a comment. Raises ValueError
+    for an unrecognized option."""
+    command, ldopts = '', ''
+    permutations = []
+    with open(conffile, 'r') as fileopen: #Closed even when ValueError is raised
+        for line in fileopen:
+            line = line.split('#')[0].strip() # Discard comments
+            if line == '':
+                continue # Discard lines with comments only and empty lines
+            #partition never fails, a lone option just gets empty arguments
+            opt, _, args = line.partition(' ')
+            args = args.strip() #An inline comment leaves spaces behind
+            if opt == 'Command:':
+                command = moses_repo + '/bin/' + args
+            elif opt == 'LDPRE:':
+                ldopts = args
+            elif opt == 'Variants:':
+                permutations = args.replace(' ', '').split(',')
+            else:
+                raise ValueError('Unrecognized option ' + opt)
+    #We use the testdir as the name.
+    return Test(testdir, command, ldopts, permutations)
+
+def parse_testconfig(conffile):
+    """Parses the config file for the whole testsuite and returns the
+    resulting Configuration. Everything after a '#' is a comment. Raises
+    ValueError for an unrecognized option."""
+    #Maps every option to the Configuration argument that it fills in
+    options = {
+        'MOSES_REPO_PATH:': 'repo',
+        'DROP_CACHES_COMM:': 'drop_caches',
+        'TEST_DIR:': 'tests',
+        'TEST_LOG_DIR:': 'testlogs',
+        'BASEBRANCH:': 'basebranch',
+        'BASEREV:': 'baserev',
+    }
+    values = dict.fromkeys(options.values(), '') #Unset options stay ''
+
+    with open(conffile, 'r') as fileopen: #Closed even on ValueError
+        for line in fileopen:
+            line = line.split('#')[0].strip() # Discard comments
+            if line == '':
+                continue # Discard lines with comments only and empty lines
+            #partition never fails, a lone option just gets empty arguments
+            opt, _, args = line.partition(' ')
+            if opt not in options:
+                raise ValueError('Unrecognized option ' + opt)
+            #Strip the value: spaces left in front of an inline comment
+            #would otherwise end up inside paths and branch names
+            values[options[opt]] = args.strip()
+    return Configuration(**values)
+
+def get_config():
+    """Builds the Configuration from the config file and the overrides given
+    on the command line, then stores the revision that repoinit() returns."""
+    cmdline = parse_cmd()
+    settings = parse_testconfig(cmdline.configfile)
+    settings.additional_args(cmdline.singletestdir, cmdline.revision, cmdline.branch)
+    settings.set_revision(repoinit(settings))
+    return settings
+
+def check_for_basever(testlogfile, basebranch):
+    """Returns True if the log file testlogfile has at least one line that
+    was recorded on the branch basebranch, False otherwise."""
+    with open(testlogfile, 'r') as filetoopen: #Closed on the early return too
+        for line in filetoopen:
+            if processLogLine(line).branch == basebranch:
+                return True
+    return False
+
+def split_time(filename):
+    """Splits the output of the time function into separate parts.
+    We will write time to file, because many programs output to
+    stderr which makes it difficult to get only the exact results we need.
+    Returns the tuple (real, user, system) of floats taken from the second
+    field of the first three lines of the file."""
+    #The with statement closes the file even if the output is malformed
+    with open(filename, 'r') as timefile:
+        realtime, usertime, systime = [float(timefile.readline().split()[1])
+                                       for _ in range(3)]
+    return (realtime, usertime, systime)
+
+
+def write_log(time_file, logname, config):
+    """Appends one result line to the logfile logname in config.testlogs.
+    The times are parsed before the log is opened, so a time file that
+    cannot be parsed does not leave behind an empty log file."""
+    realtime, usertime, systime = split_time(time_file) # Get the times in a nice form
+    date_run = time.strftime("%d.%m.%Y %H:%M:%S") # Get the time of the test
+    writestr = date_run + " " + config.revision + " Testname: " + logname +\
+    " RealTime: " + str(realtime) + " UserTime: " + str(usertime) +\
+    " SystemTime: " + str(systime) + " Branch: " + config.branch +'\n'
+    # Append everything to a log file.
+    with open(config.testlogs + '/' + logname, 'a') as log_write:
+        log_write.write(writestr)
+
+
+def execute_tests(testcase, cur_directory, config):
+    """Executes timed tests based on the config file: testcase.command is
+    run under `time` from the test directory for every variant listed in
+    testcase.permutations and each timing is logged with write_log."""
+    timefile = '/tmp/time_moses_tests'
+    def timed_run(prefix, logname, stderr=None):
+        """Times one run of the command and appends the result to logname"""
+        subprocess.Popen([prefix + 'time -p -o ' + timefile + ' ' +\
+            testcase.command], stdout=None, stderr=stderr,\
+            shell=True).communicate()
+        write_log(timefile, logname, config)
+
+    #Change to the current test directory
+    os.chdir(config.tests + '/' + cur_directory)
+    #Clear caches
+    subprocess.call(['sync'], shell=True)
+    subprocess.call([config.drop_caches], shell=True)
+    #Perform vanilla test and if a cached test exists - as well
+    print(testcase.name)
+    if 'vanilla' in testcase.permutations:
+        print(testcase.command)
+        timed_run('', testcase.name + '_vanilla', subprocess.PIPE)
+        if 'cached' in testcase.permutations:
+            timed_run('', testcase.name + '_vanilla_cached')
+
+    #Now perform LD_PRELOAD tests
+    if 'ldpre' in testcase.permutations:
+        for opt in testcase.ldopts:
+            subprocess.call(['sync'], shell=True)
+            subprocess.call([config.drop_caches], shell=True)
+            #'=' is required: 'LD_PRELOAD lib ...' would run a program named LD_PRELOAD
+            preload = 'LD_PRELOAD=' + opt + ' '
+            timed_run(preload, testcase.name + '_ldpre_' + opt)
+            if 'cached' in testcase.permutations:
+                timed_run(preload, testcase.name + '_ldpre_' + opt +'_cached')
+
+# Go through all the test directories and execute the tests
+if __name__ == '__main__':
+    CONFIG = get_config()
+    ALL_DIR = os.listdir(CONFIG.tests)
+
+    #We should first check if any of the tests is run for the first time.
+    #If some of them are run for the first time we should first get their
+    #time with the base version (usually the previous release)
+    FIRSTTIME = []
+    TESTLOGS = []
+    #Strip the variant suffixes from the log file names to get the test names
+    for listline in os.listdir(CONFIG.testlogs):
+        listline = listline.replace('_vanilla', '')
+        listline = listline.replace('_cached', '')
+        listline = listline.replace('_ldpre', '') #NOTE(review): '_ldpre_<opt>' logs keep their '_<opt>' part - confirm
+        TESTLOGS.append(listline)
+    for directory in ALL_DIR:
+        if directory not in TESTLOGS:
+            FIRSTTIME.append(directory)
+
+    #Sometimes even though we have the log files, we will need to rerun them
+    #against a base version, because we require a different baseversion (for
+    #example when a new version of Moses is released). Therefore we should
+    #check if the branch that we have as a base version is present in all
+    #of the log files.
+
+    for logfile in os.listdir(CONFIG.testlogs):
+        logfile_name = CONFIG.testlogs + '/' + logfile
+        if not check_for_basever(logfile_name, CONFIG.basebranch):
+            logfile = logfile.replace('_vanilla', '')
+            logfile = logfile.replace('_cached', '')
+            logfile = logfile.replace('_ldpre', '') #NOTE(review): same '_<opt>' leftover as above - confirm
+            FIRSTTIME.append(logfile)
+    FIRSTTIME = list(set(FIRSTTIME)) #Deduplicate
+
+    if FIRSTTIME != []:
+        #Create a new configuration for base version tests:
+        BASECONFIG = Configuration(CONFIG.repo, CONFIG.drop_caches,\
+            CONFIG.tests, CONFIG.testlogs, CONFIG.basebranch,\
+            CONFIG.baserev)
+        BASECONFIG.additional_args(None, CONFIG.baserev, CONFIG.basebranch)
+        #Set up the repository and get its revision:
+        REVISION = repoinit(BASECONFIG)
+        BASECONFIG.set_revision(REVISION)
+        #Build by running previous.sh from the repo directory
+        os.chdir(BASECONFIG.repo)
+        subprocess.call(['./previous.sh'], shell=True)
+
+        #Perform the tests; every name in FIRSTTIME is used as a test directory
+        for directory in FIRSTTIME:
+            cur_testcase = parse_configfile(BASECONFIG.tests + '/' + directory +\
+            '/config', directory, BASECONFIG.repo)
+            execute_tests(cur_testcase, directory, BASECONFIG)
+
+        #Reset back the repository to the normal configuration
+        repoinit(CONFIG)
+
+    #Builds moses by running previous.sh from the repo directory
+    os.chdir(CONFIG.repo)
+    subprocess.call(['./previous.sh'], shell=True)
+
+    if CONFIG.singletest:
+        TESTCASE = parse_configfile(CONFIG.tests + '/' +\
+            CONFIG.singletest + '/config', CONFIG.singletest, CONFIG.repo)
+        execute_tests(TESTCASE, CONFIG.singletest, CONFIG)
+    else:
+        for directory in ALL_DIR:
+            cur_testcase = parse_configfile(CONFIG.tests + '/' + directory +\
+            '/config', directory, CONFIG.repo)
+            execute_tests(cur_testcase, directory, CONFIG)
diff --git a/contrib/moses-speedtest/sys_drop_caches.py b/contrib/moses-speedtest/sys_drop_caches.py
new file mode 100644
index 000000000..d4796e090
--- /dev/null
+++ b/contrib/moses-speedtest/sys_drop_caches.py
@@ -0,0 +1,22 @@
+#!/usr/bin/spython
+from sys import argv, stderr, exit
+from os import linesep as ls
+procfile = "/proc/sys/vm/drop_caches"
+options = ["1","2","3"]
+flush_type = None
+try:
+    flush_type = argv[1][0:1] 
+    if not flush_type in options:
+        raise IndexError, "not in options"
+    with open(procfile, "w") as f:
+        f.write("%s%s" % (flush_type,ls))
+    exit(0)
+except IndexError, e:
+    stderr.write("Argument %s required.%s" % (options, ls))
+except IOError, e:
+    stderr.write("Error writing to file.%s" % ls)
+except StandardError, e:
+    stderr.write("Unknown Error.%s" % ls)
+
+exit(1)
+
diff --git a/contrib/moses-speedtest/test_config b/contrib/moses-speedtest/test_config
new file mode 100644
index 000000000..4a480f496
--- /dev/null
+++ b/contrib/moses-speedtest/test_config
@@ -0,0 +1,3 @@
+Command: moses -f ... -i fff #Looks for the command in the /bin directory of the repo specified in the testsuite_config
+LDPRE: ldpreloads #Comma separated LD_LIBRARY_PATH:/, 
+Variants: vanilla, cached, ldpre #Can't have cached without ldpre or vanilla
diff --git a/contrib/moses-speedtest/testsuite_common.py b/contrib/moses-speedtest/testsuite_common.py
new file mode 100644
index 000000000..be96f98b5
--- /dev/null
+++ b/contrib/moses-speedtest/testsuite_common.py
@@ -0,0 +1,54 @@
+"""Common functions of the testsuite"""
+import os
+#Colour constants
+class bcolors:
+    PURPLE = '\033[95m'
+    BLUE = '\033[94m'
+    GREEN = '\033[92m'
+    YELLOW = '\033[93m'
+    RED = '\033[91m'
+    ENDC = '\033[0m'
+
+class LogLine:
+    """A class to contain a single logfile line"""
+    def __init__(self, date, time, revision, testname, real, user, system, branch):
+        self.date = date
+        self.time = time
+        self.revision = revision
+        self.testname = testname
+        self.real = real
+        self.system = system
+        self.user = user
+        self.branch = branch
+
+class Result:
+    """A class to contain results of benchmarking"""
+    def __init__(self, testname, previous, current, revision, branch, prevrev, prevbranch):
+        self.testname = testname
+        self.previous = previous
+        self.current = current
+        self.change = previous - current
+        self.revision = revision
+        self.branch = branch
+        self.prevbranch = prevbranch
+        self.prevrev = prevrev
+        #Produce a percentage with fewer digits
+        self.percentage = float(format(1 - current/previous, '.4f'))
+
+def processLogLine(logline):
+    """Parses the log line into a nice datastructure"""
+    logline = logline.split()
+    log = LogLine(logline[0], logline[1], logline[2], logline[4],\
+        float(logline[6]), float(logline[8]), float(logline[10]), logline[12])
+    return log
+
+def getLastTwoLines(filename, logdir):
+    """Calls tail to fetch the last two lines of a log file (the two most recent runs)"""
+    try:
+        line1, line2 = os.popen("tail -n2 " + logdir + '/' + filename)
+    except ValueError: #Check for new tests
+        tempfile = open(logdir + '/' + filename)
+        line1 = tempfile.readline()
+        tempfile.close()
+        return (line1, '\n')
+    return (line1, line2)
diff --git a/contrib/moses-speedtest/testsuite_config b/contrib/moses-speedtest/testsuite_config
new file mode 100644
index 000000000..b6ad6181c
--- /dev/null
+++ b/contrib/moses-speedtest/testsuite_config
@@ -0,0 +1,5 @@
+MOSES_REPO_PATH: /home/moses-speedtest/moses-standard/mosesdecoder
+DROP_CACHES_COMM: sys_drop_caches 3
+TEST_DIR: /home/moses-speedtest/phrase_tables/tests
+TEST_LOG_DIR: /home/moses-speedtest/phrase_tables/testlogs
+BASEBRANCH: RELEASE-2.1.1
\ No newline at end of file

From 4b98495e789b47a221fdc5061459731d32e1f194 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Tue, 10 Jun 2014 17:27:20 +0100
Subject: [PATCH 25/27] don't load data when just showing weights

---
 moses/StaticData.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index 0340778ed..8109f245d 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -535,7 +535,9 @@ bool StaticData::LoadData(Parameter *parameter)
   NoCache();
   OverrideFeatures();
 
-  LoadFeatureFunctions();
+  if (!m_parameter->isParamSpecified("show-weights")) {
+    LoadFeatureFunctions();
+  }
 
   if (!LoadDecodeGraphs()) return false;
 

From 793aef6715862aef94756b1bc601e5f863f34438 Mon Sep 17 00:00:00 2001
From: XapaJIaMnu <nheart@gmail.com>
Date: Wed, 11 Jun 2014 10:24:44 +0100
Subject: [PATCH 26/27] Fix small oversight

---
 contrib/moses-speedtest/html_gen.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/contrib/moses-speedtest/html_gen.py b/contrib/moses-speedtest/html_gen.py
index 4564b9200..c8255e312 100644
--- a/contrib/moses-speedtest/html_gen.py
+++ b/contrib/moses-speedtest/html_gen.py
@@ -73,7 +73,7 @@ def gather_necessary_lines(logfile, date):
 
 def append_date_to_table(resline):
     """Appends past dates to the html"""
-    cur_html = '<td>' + resline.current + '</td>'
+    cur_html = '<td>' + str(resline.current) + '</td>'
 
     if resline.percentage > 0.05: #If we have improvement of more than 5%
         cur_html = cur_html +  '<td class="better">' + str(resline.percentage) + '</td>'
@@ -168,7 +168,7 @@ def produce_html(path, global_config):
         for days in past_dates:
             act_date = get_prev_days(logLine2.date, days)
             if linesdict[act_date][1] is not None:
-                logline_date = linesdict[act_date][0]
+                logline_date = linesdict[act_date][1]
                 restemp = Result(logline_date.testname, logline_date.real, logLine2.real,\
                 logLine2.revision, logLine2.branch, logline_date.revision, logline_date.branch)
                 html = html + append_date_to_table(restemp)

From 2f752fe83347f2766f5c1badc2a70662531e9b0d Mon Sep 17 00:00:00 2001
From: XapaJIaMnu <nheart@gmail.com>
Date: Wed, 11 Jun 2014 10:51:31 +0100
Subject: [PATCH 27/27] Truncate revision output

---
 contrib/moses-speedtest/html_gen.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/contrib/moses-speedtest/html_gen.py b/contrib/moses-speedtest/html_gen.py
index c8255e312..740b7bf91 100644
--- a/contrib/moses-speedtest/html_gen.py
+++ b/contrib/moses-speedtest/html_gen.py
@@ -135,8 +135,8 @@ def produce_html(path, global_config):
         res1 = Result(logLine1.testname, logLine1.real, logLine2.real,\
             logLine2.revision, logLine2.branch, logLine1.revision, logLine1.branch)
         html = html + '<tr><td>' + logLine2.date + '</td><td>' + logLine2.time + '</td><td>' +\
-        res1.testname + '</td><td>' + res1.revision + '</td><td>' + res1.branch + '</td><td>' +\
-        str(res1.current) + '</td><td>' + str(res1.previous) + '</td><td>' + res1.prevrev + '</td>'
+        res1.testname + '</td><td>' + res1.revision[:10] + '</td><td>' + res1.branch + '</td><td>' +\
+        str(res1.current) + '</td><td>' + str(res1.previous) + '</td><td>' + res1.prevrev[:10] + '</td>'
 
         #Add fancy colours depending on the change
         if res1.percentage > 0.05: #If we have improvement of more than 5%

Date	Time	Testname	Revision	Branch	Time	Prevtime	Prevrev	Change (%)	Time (Basebranch)	Change (%, Basebranch)	Time (Days -2)	Change (%, Days -2)	Time (Days -3)	Change (%, Days -3)	Time (Days -4)	Change (%, Days -4)	Time (Days -5)	Change (%, Days -5)	Time (Days -6)	Change (%, Days -6)	Time (Days -7)	Change (%, Days -7)	Time (Days -14)	Change (%, Days -14)	Time (Years -1)	Change (%, Years -1)
10.06.2014	10:27:57	ondisk_minreord_vanilla	169c3fce383bc66ae580884bfa72d60712beffef	master	21.36	21.49	169c3fce383bc66ae580884bfa72d60712beffef	0.006	25.89	0.1699	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:29:38	minpt_reord_vanilla_cached	169c3fce383bc66ae580884bfa72d60712beffef	master	9.73	9.52	169c3fce383bc66ae580884bfa72d60712beffef	-0.0221	12.2	0.2197	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:22:32	ondisk_hierarchical_vanilla_cached	169c3fce383bc66ae580884bfa72d60712beffef	master	25.73	25.77	169c3fce383bc66ae580884bfa72d60712beffef	0.0016	33.63	0.2337	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:22:06	ondisk_hierarchical_vanilla	169c3fce383bc66ae580884bfa72d60712beffef	master	83.2	82.6	169c3fce383bc66ae580884bfa72d60712beffef	-0.0073	127.59	0.3526	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:28:57	binary_reord_vanilla	169c3fce383bc66ae580884bfa72d60712beffef	master	24.54	24.85	169c3fce383bc66ae580884bfa72d60712beffef	0.0125	29.09	0.1458	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:28:08	ondisk_minreord_vanilla_cached	169c3fce383bc66ae580884bfa72d60712beffef	master	10.71	10.54	169c3fce383bc66ae580884bfa72d60712beffef	-0.0161	14.82	0.2888	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:30:00	binary_minreord_vanilla	169c3fce383bc66ae580884bfa72d60712beffef	master	20.82	20.77	169c3fce383bc66ae580884bfa72d60712beffef	-0.0024	25.77	0.194	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:27:35	score.hiero_vanilla_cached	169c3fce383bc66ae580884bfa72d60712beffef	master	131.37	130.63	169c3fce383bc66ae580884bfa72d60712beffef	-0.0057	141.85	0.0791	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:29:10	binary_reord_vanilla_cached	169c3fce383bc66ae580884bfa72d60712beffef	master	13.41	13.4	169c3fce383bc66ae580884bfa72d60712beffef	-0.0007	18.12	0.2605	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:29:28	minpt_reord_vanilla	169c3fce383bc66ae580884bfa72d60712beffef	master	17.46	17.37	169c3fce383bc66ae580884bfa72d60712beffef	-0.0052	20.0	0.1315	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:28:22	minpt_minreord_vanilla	169c3fce383bc66ae580884bfa72d60712beffef	master	13.75	13.56	169c3fce383bc66ae580884bfa72d60712beffef	-0.014	17.19	0.2112	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:22:59	ondisk_reord_vanilla	169c3fce383bc66ae580884bfa72d60712beffef	master	25.28	25.0	169c3fce383bc66ae580884bfa72d60712beffef	-0.0112	29.11	0.1412	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:28:31	minpt_minreord_vanilla_cached	169c3fce383bc66ae580884bfa72d60712beffef	master	8.63	8.6	169c3fce383bc66ae580884bfa72d60712beffef	-0.0035	11.78	0.2699	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:23:10	ondisk_reord_vanilla_cached	169c3fce383bc66ae580884bfa72d60712beffef	master	11.57	11.59	169c3fce383bc66ae580884bfa72d60712beffef	0.0017	15.4	0.2474	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:25:24	score.hiero_vanilla	169c3fce383bc66ae580884bfa72d60712beffef	master	132.33	130.02	169c3fce383bc66ae580884bfa72d60712beffef	-0.0178	141.35	0.0802	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
10.06.2014	10:30:12	binary_minreord_vanilla_cached	169c3fce383bc66ae580884bfa72d60712beffef	master	12.47	12.61	169c3fce383bc66ae580884bfa72d60712beffef	0.0111	17.89	0.2951	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A	N/A
Date	Time	Testname	Revision	Branch	Time	Prevtime	Prevrev	Change (%)	Time (Basebranch)	Change (%, Basebranch)	Time (Days -2)	Change (%, Days -2)	Time (Days -3)	Change (%, Days -3)	Time (Days -4)	Change (%, Days -4)
' + resline.current + '	' + str(resline.percentage) + '	' + str(resline.percentage) + '	' + str(resline.percentage) + '
' + logLine2.date + '	' + logLine2.time + '	' +\ + res1.testname + '	' + res1.revision + '	' + res1.branch + '	' +\ + str(res1.current) + '	' + str(res1.previous) + '	' + res1.prevrev + '	' + str(res1.percentage) + '	' + str(res1.percentage) + '	' + str(res1.percentage) + '	' + str(res2.previous) + '	' + str(res2.percentage) + '	' + str(res2.percentage) + '	' + str(res2.percentage) + '	N/A	N/A
' + resline.current + '	' + str(resline.current) + '	' + str(resline.percentage) + '
' + logLine2.date + '	' + logLine2.time + '	' +\ - res1.testname + '	' + res1.revision + '	' + res1.branch + '	' +\ - str(res1.current) + '	' + str(res1.previous) + '	' + res1.prevrev + '	' + res1.revision[:10] + '	' + res1.branch + '	' +\ + str(res1.current) + '	' + str(res1.previous) + '	' + res1.prevrev[:10] + '