Mirror of https://github.com/moses-smt/mosesdecoder.git (synced 2024-12-30 15:34:01 +03:00)
commit 216af91d1f
Merge branch 'master' of github.com:moses-smt/mosesdecoder
@@ -73,7 +73,7 @@ you're ready to install packages in non-standard paths:
 
 #For Boost:
 ./bootstrap.sh
-./b2 --prefix=$PREFIX --libdir=$PREFIX/lib64 link=static,shared threading=multi install
+./b2 --prefix=$PREFIX --libdir=$PREFIX/lib64 --layout=tagged link=static,shared threading=multi install
 
 --------------------------------------------------------------------------
bjam (13 changed lines)
@@ -1,13 +1,14 @@
 #!/bin/bash
 set -e
-if
-    which bjam >/dev/null 2>/dev/null && #Have a bjam in path
-    ! grep UFIHGUFIHBDJKNCFZXAEVA "$(which bjam)" >/dev/null && #bjam in path isn't this script
-    bjam --help >/dev/null 2>/dev/null && #bjam in path isn't broken (i.e. has boost-build)
-    bjam --version |grep "Boost.Build 201" >/dev/null 2>/dev/null #It's recent enough.
+if
+    bjam="$(which bjam 2>/dev/null)" && #exists
+    [ ${#bjam} != 0 ] && #paranoia about which printing nothing then returning true
+    ! grep UFIHGUFIHBDJKNCFZXAEVA "${bjam}" </dev/null >/dev/null && #bjam in path isn't this script
+    "${bjam}" --help >/dev/null 2>/dev/null && #bjam in path isn't broken (i.e. has boost-build)
+    "${bjam}" --version |grep "Boost.Build 201" >/dev/null 2>/dev/null #It's recent enough.
 then
   #Delegate to system bjam
-  exec bjam "$@"
+  exec "${bjam}" "$@"
 fi
 
 top="$(dirname "$0")"
@@ -1,90 +1,9 @@
-#include "lm/enumerate_vocab.hh"
-#include "lm/model.hh"
+#include "lm/ngram_query.hh"
-
-#include <cstdlib>
-#include <fstream>
-#include <iostream>
-#include <string>
-
-#include <ctype.h>
-#if !defined(_WIN32) && !defined(_WIN64)
-#include <sys/resource.h>
-#include <sys/time.h>
-#endif
-
-#if !defined(_WIN32) && !defined(_WIN64)
-float FloatSec(const struct timeval &tv) {
-  return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_usec) / 1000000000.0);
-}
-#endif
-
-void PrintUsage(const char *message) {
-#if !defined(_WIN32) && !defined(_WIN64)
-  struct rusage usage;
-  if (getrusage(RUSAGE_SELF, &usage)) {
-    perror("getrusage");
-    return;
-  }
-  std::cerr << message;
-  std::cerr << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n';
-
-  // Linux doesn't set memory usage :-(.
-  std::ifstream status("/proc/self/status", std::ios::in);
-  std::string line;
-  while (getline(status, line)) {
-    if (!strncmp(line.c_str(), "VmRSS:\t", 7)) {
-      std::cerr << "rss " << (line.c_str() + 7) << '\n';
-      break;
-    }
-  }
-#endif
-}
-
-template <class Model> void Query(const Model &model, bool sentence_context) {
-  PrintUsage("Loading statistics:\n");
-  typename Model::State state, out;
-  lm::FullScoreReturn ret;
-  std::string word;
-
-  while (std::cin) {
-    state = sentence_context ? model.BeginSentenceState() : model.NullContextState();
-    float total = 0.0;
-    bool got = false;
-    unsigned int oov = 0;
-    while (std::cin >> word) {
-      got = true;
-      lm::WordIndex vocab = model.GetVocabulary().Index(word);
-      if (vocab == 0) ++oov;
-      ret = model.FullScore(state, vocab, out);
-      total += ret.prob;
-      std::cout << word << '=' << vocab << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';
-      state = out;
-      char c;
-      while (true) {
-        c = std::cin.get();
-        if (!std::cin) break;
-        if (c == '\n') break;
-        if (!isspace(c)) {
-          std::cin.unget();
-          break;
-        }
-      }
-      if (c == '\n') break;
-    }
-    if (!got && !std::cin) break;
-    if (sentence_context) {
-      ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out);
-      total += ret.prob;
-      std::cout << "</s>=" << model.GetVocabulary().EndSentence() << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';
-    }
-    std::cout << "Total: " << total << " OOV: " << oov << '\n';
-  }
-  PrintUsage("After queries:\n");
-}
 
 template <class Model> void Query(const char *name) {
   lm::ngram::Config config;
-  Model model(name, config);
+  Model model(name, config, std::cin, std::cout);
   Query(model);
 }
 
@@ -100,19 +19,19 @@ int main(int argc, char *argv[]) {
   if (lm::ngram::RecognizeBinary(argv[1], model_type)) {
     switch(model_type) {
       case lm::ngram::HASH_PROBING:
-        Query<lm::ngram::ProbingModel>(argv[1], sentence_context);
+        Query<lm::ngram::ProbingModel>(argv[1], sentence_context, std::cin, std::cout);
         break;
       case lm::ngram::TRIE_SORTED:
-        Query<lm::ngram::TrieModel>(argv[1], sentence_context);
+        Query<lm::ngram::TrieModel>(argv[1], sentence_context, std::cin, std::cout);
         break;
       case lm::ngram::QUANT_TRIE_SORTED:
-        Query<lm::ngram::QuantTrieModel>(argv[1], sentence_context);
+        Query<lm::ngram::QuantTrieModel>(argv[1], sentence_context, std::cin, std::cout);
        break;
       case lm::ngram::ARRAY_TRIE_SORTED:
-        Query<lm::ngram::ArrayTrieModel>(argv[1], sentence_context);
+        Query<lm::ngram::ArrayTrieModel>(argv[1], sentence_context, std::cin, std::cout);
        break;
       case lm::ngram::QUANT_ARRAY_TRIE_SORTED:
-        Query<lm::ngram::QuantArrayTrieModel>(argv[1], sentence_context);
+        Query<lm::ngram::QuantArrayTrieModel>(argv[1], sentence_context, std::cin, std::cout);
        break;
       case lm::ngram::HASH_SORTED:
       default:
@@ -120,7 +39,7 @@ int main(int argc, char *argv[]) {
         abort();
     }
   } else {
-    Query<lm::ngram::ProbingModel>(argv[1], sentence_context);
+    Query<lm::ngram::ProbingModel>(argv[1], sentence_context, std::cin, std::cout);
   }
 
   PrintUsage("Total time including destruction:\n");
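A note for readers tracing the API change above: Query() now threads explicit istream/ostream references through every call, so a caller can choose its streams per invocation. Below is a minimal sketch of an equivalent caller, assuming only the kenlm types already named in this commit; the file name is hypothetical and this is not part of the commit itself.

// Sketch only: mirrors main()'s dispatch using the stream-taking Query()
// from the new lm/ngram_query.hh.
#include "lm/model.hh"
#include "lm/ngram_query.hh"

#include <iostream>

void QueryFile(const char *file) {  // e.g. "model.bin" (hypothetical)
  lm::ngram::Config config;
  lm::ngram::ModelType model_type;
  if (lm::ngram::RecognizeBinary(file, model_type) && model_type == lm::ngram::TRIE_SORTED) {
    // A binary file records its type; instantiate the matching template.
    lm::ngram::TrieModel model(file, config);
    Query(model, true, std::cin, std::cout);
  } else {
    // ARPA text (and probing binaries) load through the default probing model.
    lm::ngram::ProbingModel model(file, config);
    Query(model, true, std::cin, std::cout);
  }
}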
lm/ngram_query.hh (new file, 91 lines)
@@ -0,0 +1,91 @@
+#ifndef LM_NGRAM_QUERY__
+#define LM_NGRAM_QUERY__
+
+#include "lm/enumerate_vocab.hh"
+#include "lm/model.hh"
+
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include <ctype.h>
+#if !defined(_WIN32) && !defined(_WIN64)
+#include <sys/resource.h>
+#include <sys/time.h>
+#endif
+
+#if !defined(_WIN32) && !defined(_WIN64)
+float FloatSec(const struct timeval &tv) {
+  return static_cast<float>(tv.tv_sec) + (static_cast<float>(tv.tv_usec) / 1000000000.0);
+}
+#endif
+
+void PrintUsage(const char *message) {
+#if !defined(_WIN32) && !defined(_WIN64)
+  struct rusage usage;
+  if (getrusage(RUSAGE_SELF, &usage)) {
+    perror("getrusage");
+    return;
+  }
+  std::cerr << message;
+  std::cerr << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n';
+
+  // Linux doesn't set memory usage :-(.
+  std::ifstream status("/proc/self/status", std::ios::in);
+  std::string line;
+  while (getline(status, line)) {
+    if (!strncmp(line.c_str(), "VmRSS:\t", 7)) {
+      std::cerr << "rss " << (line.c_str() + 7) << '\n';
+      break;
+    }
+  }
+#endif
+}
+
+template <class Model> void Query(const Model &model, bool sentence_context, std::istream &inStream, std::ostream &outStream) {
+  PrintUsage("Loading statistics:\n");
+  typename Model::State state, out;
+  lm::FullScoreReturn ret;
+  std::string word;
+
+  while (inStream) {
+    state = sentence_context ? model.BeginSentenceState() : model.NullContextState();
+    float total = 0.0;
+    bool got = false;
+    unsigned int oov = 0;
+    while (inStream >> word) {
+      got = true;
+      lm::WordIndex vocab = model.GetVocabulary().Index(word);
+      if (vocab == 0) ++oov;
+      ret = model.FullScore(state, vocab, out);
+      total += ret.prob;
+      outStream << word << '=' << vocab << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';
+      state = out;
+      char c;
+      while (true) {
+        c = inStream.get();
+        if (!inStream) break;
+        if (c == '\n') break;
+        if (!isspace(c)) {
+          inStream.unget();
+          break;
+        }
+      }
+      if (c == '\n') break;
+    }
+    if (!got && !inStream) break;
+    if (sentence_context) {
+      ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out);
+      total += ret.prob;
+      outStream << "</s>=" << model.GetVocabulary().EndSentence() << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';
+    }
+    outStream << "Total: " << total << " OOV: " << oov << '\n';
+  }
+  PrintUsage("After queries:\n");
+}
+
+
+#endif // LM_NGRAM_QUERY__
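Because the templated Query() above takes its streams as parameters rather than hard-wiring std::cin and std::cout, it can be driven from in-memory buffers, which is handy for testing. A minimal sketch under the assumption that kenlm is on the include path; "test.arpa" is a hypothetical model file:

// Sketch only: exercises the header's Query() with string streams.
#include "lm/ngram_query.hh"

#include <iostream>
#include <sstream>

int main() {
  lm::ngram::Config config;
  lm::ngram::ProbingModel model("test.arpa", config);  // hypothetical LM file

  std::istringstream in("this is a test\n");
  std::ostringstream out;
  Query(model, true, in, out);  // writes word=id length prob ... Total/OOV lines

  std::cout << out.str();
  return 0;
}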
@@ -47,6 +47,99 @@ Data::~Data() {
   }
 }
 
+//ADDED BY TS
+void Data::remove_duplicates() {
+
+  size_t nSentences = featdata->size();
+  assert(scoredata->size() == nSentences);
+
+  for (size_t s=0; s < nSentences; s++) {
+
+    FeatureArray& feat_array = featdata->get(s);
+    ScoreArray& score_array = scoredata->get(s);
+
+    assert(feat_array.size() == score_array.size());
+
+    //serves as a hash-map:
+    std::map<double, std::vector<size_t> > lookup;
+
+    size_t end_pos = feat_array.size() - 1;
+
+    size_t nRemoved = 0;
+    for (size_t k=0; k <= end_pos; k++) {
+
+      const FeatureStats& cur_feats = feat_array.get(k);
+
+      double sum = 0.0;
+      for (size_t l=0; l < cur_feats.size(); l++)
+        sum += cur_feats.get(l);
+
+      if (lookup.find(sum) != lookup.end()) {
+
+        //std::cerr << "hit" << std::endl;
+
+        std::vector<size_t>& cur_list = lookup[sum];
+
+        size_t l=0;
+        for (l=0; l < cur_list.size(); l++) {
+
+          size_t j=cur_list[l];
+
+          if (cur_feats == feat_array.get(j)
+              && score_array.get(k) == score_array.get(j)) {
+
+            if (k < end_pos) {
+
+              feat_array.swap(k,end_pos);
+              score_array.swap(k,end_pos);
+
+              k--;
+            }
+
+            end_pos--;
+            nRemoved++;
+            break;
+          }
+        }
+
+        if (l == lookup[sum].size())
+          cur_list.push_back(k);
+      }
+      else
+        lookup[sum].push_back(k);
+
+      // for (size_t j=0; j < k; j++) {
+
+      //   if (feat_array.get(k) == feat_array.get(j)
+      //       && score_array.get(k) == score_array.get(j)) {
+
+      //     if (k < end_pos) {
+
+      //       feat_array.swap(k,end_pos);
+      //       score_array.swap(k,end_pos);
+
+      //       k--;
+      //     }
+
+      //     end_pos--;
+      //     nRemoved++;
+      //     break;
+      //   }
+      // }
+    }
+
+    std::cerr << "removed " << nRemoved << "/" << feat_array.size() << std::endl;
+
+    if (nRemoved > 0) {
+
+      feat_array.resize(end_pos+1);
+      score_array.resize(end_pos+1);
+    }
+  }
+}
+//END_ADDED
+
 
 void Data::loadnbest(const std::string &file)
 {
   TRACE_ERR("loading nbest from " << file << std::endl);
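The duplicate-removal strategy above is worth spelling out: the sum of a candidate's feature values acts as a cheap hash key, full feature and score vectors are compared only within a bucket, duplicates are swapped to the tail, and one resize() trims them off. A self-contained sketch of the same idea on plain vectors (hypothetical data, standard library only, not the mert classes):

#include <algorithm>
#include <iostream>
#include <map>
#include <numeric>
#include <vector>

// Bucket rows by their value sum, compare full rows only within a bucket,
// swap duplicates to the tail, then trim with a single resize -- mirroring
// Data::remove_duplicates().
void remove_duplicates(std::vector<std::vector<double> > &rows) {
  std::map<double, std::vector<size_t> > lookup;  // sum -> kept row indices
  size_t end_pos = rows.size();                   // one past the last kept row
  for (size_t k = 0; k < end_pos; ++k) {
    double sum = std::accumulate(rows[k].begin(), rows[k].end(), 0.0);
    std::vector<size_t> &bucket = lookup[sum];
    bool duplicate = false;
    for (size_t l = 0; l < bucket.size(); ++l) {
      if (rows[k] == rows[bucket[l]]) { duplicate = true; break; }
    }
    if (duplicate) {
      std::swap(rows[k], rows[--end_pos]);  // move duplicate behind end_pos
      --k;                                  // re-examine the swapped-in row
    } else {
      bucket.push_back(k);
    }
  }
  rows.resize(end_pos);
}

int main() {
  std::vector<std::vector<double> > rows;
  double a[] = {1, 2}, b[] = {3, 0}, c[] = {1, 2};
  rows.push_back(std::vector<double>(a, a + 2));
  rows.push_back(std::vector<double>(b, b + 2));
  rows.push_back(std::vector<double>(c, c + 2));  // duplicate of the first row
  remove_duplicates(rows);
  std::cout << "kept " << rows.size() << " rows\n";  // prints: kept 2 rows
}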
@@ -73,7 +73,7 @@ public:
   void mergeSparseFeatures();
 
   void loadnbest(const std::string &file);
 
 
   void load(const std::string &featfile,const std::string &scorefile) {
     featdata->load(featfile);
     scoredata->load(scorefile);
@@ -81,6 +81,10 @@ public:
     _sparse_flag = true;
   }
 
+  //ADDED BY TS
+  void remove_duplicates();
+  //END_ADDED
+
   void save(const std::string &featfile,const std::string &scorefile, bool bin=false) {
 
     if (bin) cerr << "Binary write mode is selected" << endl;
@@ -63,6 +63,16 @@ public:
     array_.push_back(e);
   }
 
+  //ADDED BY TS
+  void swap(size_t i, size_t j) {
+    std::swap(array_[i],array_[j]);
+  }
+
+  void resize(size_t new_size) {
+    array_.resize(std::min(new_size,array_.size()));
+  }
+  //END_ADDED
+
   void merge(FeatureArray& e);
 
   inline size_t size() const {
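One detail of the resize() added above: clamping with std::min means it can only shrink the array; a request larger than the current size is a no-op rather than growth with default-constructed entries, which is exactly what the duplicate-removal pass needs. A tiny illustration with a plain vector (hypothetical values):

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> v(5, 7);                  // five entries
  v.resize(std::min<size_t>(10, v.size()));  // clamped: still 5
  std::cout << v.size() << '\n';             // prints 5
  v.resize(std::min<size_t>(3, v.size()));   // shrinks to 3
  std::cout << v.size() << '\n';             // prints 3
}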
@@ -41,7 +41,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 class FileFormatException : public util::Exception
 {
   public:
-    explicit FileFormatException(const std::string filename, const std::string& line) {
+    explicit FileFormatException(const std::string& filename, const std::string& line) {
       *this << "Error in line \"" << line << "\" of " << filename;
     }
 };
@@ -68,7 +68,7 @@ class FeatureDataIterator :
 {
   public:
     FeatureDataIterator();
-    FeatureDataIterator(const std::string& filename);
+    explicit FeatureDataIterator(const std::string& filename);
 
     static FeatureDataIterator end() {
       return FeatureDataIterator();
@@ -89,5 +89,3 @@ class FeatureDataIterator :
 };
 
 #endif
-
-
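The two hunks above add explicit to single-argument constructors, which stops a std::string from silently converting into an iterator at call sites. A minimal illustration of the difference, using hypothetical stand-in types rather than the real classes:

#include <string>

struct ImplicitIter {
  ImplicitIter(const std::string &filename) {}  // converting constructor
};

struct ExplicitIter {
  explicit ExplicitIter(const std::string &filename) {}
};

void TakeImplicit(ImplicitIter it) {}
void TakeExplicit(ExplicitIter it) {}

int main() {
  std::string f = "features.dat";  // hypothetical file name
  TakeImplicit(f);                 // compiles: the string converts silently
  // TakeExplicit(f);              // error: explicit blocks the conversion
  TakeExplicit(ExplicitIter(f));   // the conversion must be spelled out
}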
@@ -218,3 +218,19 @@ ostream& operator<<(ostream& o, const FeatureStats& e)
 
   return o;
 }
+
+//ADEED_BY_TS
+bool operator==(const FeatureStats& f1, const FeatureStats& f2) {
+  size_t size = f1.size();
+
+  if (size != f2.size())
+    return false;
+
+  for (size_t k=0; k < size; k++) {
+    if (f1.get(k) != f2.get(k))
+      return false;
+  }
+
+  return true;
+}
+//END_ADDED
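Note that the equality added above is element-wise and exact, with no epsilon: it is meant to detect identical n-best entries, whose statistics are bit-for-bit equal, not to compare nearly-equal floats. A short sketch of the same semantics on plain arrays (hypothetical values):

#include <cstddef>
#include <iostream>

// Exact element-wise equality, as in the FeatureStats/ScoreStats operators.
bool EqualStats(const double *a, std::size_t na, const double *b, std::size_t nb) {
  if (na != nb) return false;
  for (std::size_t k = 0; k < na; k++)
    if (a[k] != b[k]) return false;  // exact comparison, no tolerance
  return true;
}

int main() {
  double x[] = {0.1, 0.2};
  double y[] = {0.1, 0.2};
  double z[] = {0.1, 0.2 + 1e-16};              // differs in the last bits
  std::cout << EqualStats(x, 2, y, 2) << '\n';  // prints 1
  std::cout << EqualStats(x, 2, z, 2) << '\n';  // prints 0
}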
@@ -134,4 +134,8 @@ public:
   friend ostream& operator<<(ostream& o, const FeatureStats& e);
 };
 
+//ADEED_BY_TS
+bool operator==(const FeatureStats& f1, const FeatureStats& f2);
+//END_ADDED
+
 #endif // FEATURE_STATS_H
@@ -62,6 +62,16 @@ public:
     array_.push_back(e);
   }
 
+  //ADDED BY TS
+  void swap(size_t i, size_t j) {
+    std::swap(array_[i],array_[j]);
+  }
+
+  void resize(size_t new_size) {
+    array_.resize(std::min(new_size,array_.size()));
+  }
+  //END_ADDED
+
   void merge(ScoreArray& e);
 
   inline std::string name() const {
@@ -43,7 +43,7 @@ class ScoreDataIterator :
 {
   public:
     ScoreDataIterator();
-    ScoreDataIterator(const std::string& filename);
+    explicit ScoreDataIterator(const std::string& filename);
 
     static ScoreDataIterator end() {
      return ScoreDataIterator();
@@ -62,6 +62,4 @@ class ScoreDataIterator :
     std::vector<ScoreDataItem> m_next;
 };
 
-
 #endif
-
|
@ -132,3 +132,19 @@ ostream& operator<<(ostream& o, const ScoreStats& e)
|
||||
o << e.get(i) << " ";
|
||||
return o;
|
||||
}
|
||||
|
||||
//ADDED_BY_TS
|
||||
bool operator==(const ScoreStats& s1, const ScoreStats& s2) {
|
||||
size_t size = s1.size();
|
||||
|
||||
if (size != s2.size())
|
||||
return false;
|
||||
|
||||
for (size_t k=0; k < size; k++) {
|
||||
if (s1.get(k) != s2.get(k))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
//END_ADDED
|
||||
|
@@ -100,4 +100,8 @@ public:
   friend ostream& operator<<(ostream& o, const ScoreStats& e);
 };
 
+//ADDED_BY_TS
+bool operator==(const ScoreStats& s1, const ScoreStats& s2);
+//END_ADDED
+
 #endif // SCORE_STATS_H
@@ -84,7 +84,7 @@ void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry
   } else if ( result.scoreAv() > tmp_result.scoreAv() ) {
     result = tmp_result;
   }
-
+  delete evaluation;
 }
 ostringstream stats;
 // multiplication by 100 in order to keep the average precision
@@ -182,6 +182,10 @@ int main(int argc, char** argv)
 
   PrintUserTime("Nbest entries loaded and scored");
 
+  //ADDED_BY_TS
+  data.remove_duplicates();
+  //END_ADDED
+
   if (binmode)
     cerr << "Binary write mode is selected" << endl;
   else
@@ -1,5 +1,5 @@
 /**
- * \description The is the main for the new version of the mert algorithm developed during the 2nd MT marathon
+ * \description This is the main for the new version of the mert algorithm developed during the 2nd MT marathon
 */
 
 #include <limits>
@@ -260,6 +260,7 @@ int main (int argc, char **argv)
   if(j<pdim) {
     cerr<<initfile<<":Too few minimum weights." << endl;
     cerr<<"error could not initialize start point with " << initfile << endl;
+    std::cerr << "j: " << j << ", pdim: " << pdim << std::endl;
     exit(3);
   }
   max.resize(pdim);
@@ -297,6 +298,10 @@ int main (int argc, char **argv)
     D.load(FeatureDataFiles.at(i), ScoreDataFiles.at(i));
   }
 
+  //ADDED_BY_TS
+  D.remove_duplicates();
+  //END_ADDED
+
   PrintUserTime("Data loaded");
 
   // starting point score over latest n-best, accumulative n-best
@@ -101,11 +101,11 @@ template <class Model> class LanguageModelKen : public LanguageModel {
     lm::WordIndex *end = indices + m_ngram->Order() - 1;
     int position = hypo.GetCurrTargetWordsRange().GetEndPos();
     for (; ; ++index, --position) {
-      if (index == end) return index;
       if (position == -1) {
         *index = m_ngram->GetVocabulary().BeginSentence();
         return index + 1;
       }
+      if (index == end) return index;
       *index = TranslateID(hypo.GetWord(position));
     }
   }
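The reordering above matters in exactly one corner case: when the context buffer fills in the same step that the start of the sentence is reached. Checking position == -1 first lets the begin-of-sentence marker occupy the final slot (end points at the last slot, so writing there is in bounds) instead of being dropped. A toy reproduction of the fixed loop with plain ints (hypothetical word IDs; BOS is 1), not the real Moses code:

#include <iostream>

// Backward context fill mirroring the corrected loop in LanguageModelKen.
int *Fill(const int *words, int position, int *index, int *end, int bos) {
  for (; ; ++index, --position) {
    if (position == -1) {            // sentence start: emit BOS, even at the edge
      *index = bos;
      return index + 1;
    }
    if (index == end) return index;  // buffer full before the sentence start
    *index = words[position];
  }
}

int main() {
  const int words[] = {42, 43};  // hypothetical IDs; words[0] is sentence-initial
  int buf[3];                    // order - 1 word slots plus one for BOS
  int *out = Fill(words, 1, buf, buf + 2, 1);
  for (int *p = buf; p != out; ++p) std::cout << *p << ' ';  // prints: 43 42 1
  std::cout << '\n';
}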
@@ -2,20 +2,20 @@ STDOUT_1=1020 14190 86273 FEATSTAT.run1
 STDOUT_2=1020 9060 26328 SCORESTAT.run1
 STDOUT_3=1020 14190 86273 FEATSTAT.run1.2
 STDOUT_4=1020 9060 26328 SCORESTAT.run1.2
-STDOUT_5=2020 28190 172503 FEATSTAT.run2
-STDOUT_6=2020 18060 52341 SCORESTAT.run2
-STDOUT_7=2020 28190 172503 FEATSTAT.run2.2
-STDOUT_8=2020 18060 52341 SCORESTAT.run2.2
-STDOUT_9=3020 42190 264672 FEATSTAT.run3
-STDOUT_10=3020 27060 77299 SCORESTAT.run3
-STDOUT_11=3020 42190 264672 FEATSTAT.run3.2
-STDOUT_12=3020 27060 77299 SCORESTAT.run3.2
-STDOUT_13=4020 56190 360150 FEATSTAT.run4
-STDOUT_14=4020 36060 103698 SCORESTAT.run4
-STDOUT_15=4020 56190 360150 FEATSTAT.run4.2
-STDOUT_16=4020 36060 103698 SCORESTAT.run4.2
-STDOUT_17=5020 70190 462892 FEATSTAT.run5
-STDOUT_18=5020 45060 129840 SCORESTAT.run5
-STDOUT_19=5020 70190 462892 FEATSTAT.run5.2
-STDOUT_20=5020 45060 129840 SCORESTAT.run5.2
-TOTAL_WALLTIME ~ 5
+STDOUT_5=2019 28176 172418 FEATSTAT.run2
+STDOUT_6=2019 18051 52315 SCORESTAT.run2
+STDOUT_7=2019 28176 172418 FEATSTAT.run2.2
+STDOUT_8=2019 18051 52315 SCORESTAT.run2.2
+STDOUT_9=3019 42176 264587 FEATSTAT.run3
+STDOUT_10=3019 27051 77273 SCORESTAT.run3
+STDOUT_11=3019 42176 264587 FEATSTAT.run3.2
+STDOUT_12=3019 27051 77273 SCORESTAT.run3.2
+STDOUT_13=3963 55392 355328 FEATSTAT.run4
+STDOUT_14=3963 35547 102216 SCORESTAT.run4
+STDOUT_15=3963 55392 355328 FEATSTAT.run4.2
+STDOUT_16=3963 35547 102216 SCORESTAT.run4.2
+STDOUT_17=4932 68958 455449 FEATSTAT.run5
+STDOUT_18=4932 44268 127552 SCORESTAT.run5
+STDOUT_19=4932 68958 455449 FEATSTAT.run5.2
+STDOUT_20=4932 44268 127552 SCORESTAT.run5.2
+TOTAL_WALLTIME ~ 2
(File diff suppressed because it is too large)
@@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
-wmt10-data = $working-dir/data
+wmt12-data = $working-dir/data
 
 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses
+decoder = $moses-bin-dir/moses
 
 # conversion of phrase table into binary on-disk format
-ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
+ttable-binarizer = $moses-bin-dir/processPhraseTable
 
 # conversion of rule table into binary on-disk format
-#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
 
 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@@ -95,7 +104,7 @@ max-sentence-length = 80
 
 ### raw corpus files (untokenized, but sentence aligned)
 #
-raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
+raw-stem = $wmt12-data/training/europarl-v7.$pair-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -112,10 +121,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
 #lowercased-stem =
 
 [CORPUS:nc]
-raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
+raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension
 
 [CORPUS:un] IGNORE
-raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
+raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
 
 #################################################################
 # LANGUAGE MODEL TRAINING
@@ -123,10 +132,15 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
 [LM]
 
 ### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
 #
+# srilm
 lm-training = $srilm-dir/ngram-count
 settings = "-interpolate -kndiscount -unk"
 
+# irstlm
+#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
+#settings = ""
+
 # order of the language model
 order = 5
 
 ### tool to be used for training randomized language model from scratch
@@ -138,27 +152,21 @@ order = 5
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
-
-#
-# if binarized, set type (default srilm; if binarized: irstlm)
-#
-# set to 8 when using kenlm
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 ### each language model to be used has its own section here
@@ -170,7 +178,7 @@ order = 5
 
 ### raw corpus (untokenized)
 #
-raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
+raw-corpus = $wmt12-data/training/europarl-v7.$output-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -182,13 +190,13 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
 #lm =
 
 [LM:nc]
-raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
 
 [LM:un] IGNORE
-raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension
 
 [LM:news] IGNORE
-raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
+raw-corpus = $wmt12-data/training/news.$output-extension.shuffled
 
 
 #################################################################
@@ -208,32 +216,36 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
 ### tuning set
 # you may use the same set that is used for mert tuning (reference set)
 #
-tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
+tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
 #raw-tuning =
 #tokenized-tuning =
 #factored-tuning =
 #lowercased-tuning =
 #split-tuning =
 
+### group language models for hierarchical interpolation
+# (flat interpolation is limited to 10 language models)
+#group = "first,second fourth,fifth"
+
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 #################################################################
 # TRANSLATION MODEL TRAINING
@@ -261,12 +273,18 @@ script = $moses-script-dir/training/train-model.perl
 #generation-factors = "word -> pos"
 #decoding-steps = "t0, g0"
 
+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
+
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduces
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5
 
 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@@ -355,18 +373,18 @@ score-settings = "--GoodTuring"
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"
 
 ### specify the corpus used for tuning
 # it should contain 1000s of sentences
 #
-input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
+input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
 #raw-input =
 #tokenized-input =
 #factorized-input =
 #input =
 #
-reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
+reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
 #raw-reference =
 #tokenized-reference =
 #factorized-reference =
@@ -394,14 +412,14 @@ decoder-settings = ""
 # and also point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config =
+#config-with-reused-weights =
 
 #########################################################
 ## RECASER: restore case, this part only trains the model
 
 [RECASING]
 
-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses
 
 ### training data
 # raw input needs to be still tokenized,
@@ -448,6 +466,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
 
 ### additional decoder settings
 # switches for the Moses decoder
+# common choices:
+# "-threads N" for multi-threading
+# "-mbr" for MBR decoding
+# "-drop-unknown" for dropping unknown source words
+# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
@@ -470,8 +493,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
 
 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =
@@ -502,11 +525,11 @@ report-segmentation = yes
 # further precision breakdown by factor
 #precision-by-coverage-factor = pos
 
-[EVALUATION:newstest2009]
+[EVALUATION:newstest2011]
 
 ### input data
 #
-input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
+input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
 # raw-input =
 # tokenized-input =
 # factorized-input =
@@ -514,7 +537,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
 
 ### reference data
 #
-reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
+reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
 # raw-reference =
 # tokenized-reference =
 # reference =
@@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
-wmt10-data = $working-dir/data
+wmt12-data = $working-dir/data
 
 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses
+decoder = $moses-bin-dir/moses
 
 # conversion of phrase table into binary on-disk format
-ttable-binarizer = $moses-src-dir/misc/processPhraseTable
+ttable-binarizer = $moses-bin-dir/processPhraseTable
 
 # conversion of rule table into binary on-disk format
-#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
 
 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@@ -95,7 +104,7 @@ max-sentence-length = 80
 
 ### raw corpus files (untokenized, but sentence aligned)
 #
-raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
+raw-stem = $wmt12-data/training/europarl-v7.$pair-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -112,10 +121,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
 #lowercased-stem =
 
 [CORPUS:nc]
-raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
+raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension
 
 [CORPUS:un] IGNORE
-raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
+raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
 
 #################################################################
 # LANGUAGE MODEL TRAINING
@@ -123,36 +132,41 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
 [LM]
 
 ### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
 #
+# srilm
 lm-training = $srilm-dir/ngram-count
 settings = "-interpolate -kndiscount -unk"
 
+# irstlm
+#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
+#settings = ""
+
 # order of the language model
 order = 5
 
 ### tool to be used for training randomized language model from scratch
 # (more commonly, a SRILM is trained)
 #
-#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
+#lm-binarizer = $moses-bin-dir/build_binary
 #type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 ### each language model to be used has its own section here
@@ -164,7 +178,7 @@ order = 5
 
 ### raw corpus (untokenized)
 #
-raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
+raw-corpus = $wmt12-data/training/europarl-v7.$output-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -176,19 +190,19 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
 #lm =
 
 [LM:nc]
-raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
 
 [LM:un] IGNORE
-raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension
 
 [LM:news] IGNORE
-raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
+raw-corpus = $wmt12-data/training/news.$output-extension.shuffled
 
 [LM:nc=pos]
 factors = "pos"
 order = 7
 settings = "-interpolate -unk"
-raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
@@ -207,32 +221,36 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
 ### tuning set
 # you may use the same set that is used for mert tuning (reference set)
 #
-tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
+tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
 #raw-tuning =
 #tokenized-tuning =
 #factored-tuning =
 #lowercased-tuning =
 #split-tuning =
 
+### group language models for hierarchical interpolation
+# (flat interpolation is limited to 10 language models)
+#group = "first,second fourth,fifth"
+
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
+#lm-binarizer = $moses-bin-dir/build_binary
 #type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 #################################################################
 # FACTOR DEFINITION
@@ -275,12 +293,18 @@ reordering-factors = "word -> word"
 #generation-factors =
 decoding-steps = "t0"
 
+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
+
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduces
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5
 
 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@@ -354,7 +378,7 @@ score-settings = "--GoodTuring"
 # point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config =
+#config-with-reused-weights =
 
 #####################################################
 ### TUNING: finding good weights for model components
@@ -369,18 +393,18 @@ score-settings = "--GoodTuring"
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"
 
 ### specify the corpus used for tuning
 # it should contain 1000s of sentences
 #
-input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
+input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
 #raw-input =
 #tokenized-input =
 #factorized-input =
 #input =
 #
-reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
+reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
 #raw-reference =
 #tokenized-reference =
 #factorized-reference =
@@ -415,7 +439,7 @@ decoder-settings = ""
 
 [RECASING]
 
-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses
 
 ### training data
 # raw input needs to be still tokenized,
@@ -462,6 +486,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
 
 ### additional decoder settings
 # switches for the Moses decoder
+# common choices:
+# "-threads N" for multi-threading
+# "-mbr" for MBR decoding
+# "-drop-unknown" for dropping unknown source words
+# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
@@ -484,8 +513,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
 
 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =
@@ -516,11 +545,11 @@ report-segmentation = yes
 # further precision breakdown by factor
 #precision-by-coverage-factor = pos
 
-[EVALUATION:newstest2009]
+[EVALUATION:newstest2011]
 
 ### input data
 #
-input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
+input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
 # raw-input =
 # tokenized-input =
 # factorized-input =
@@ -528,7 +557,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
 
 ### reference data
 #
-reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
+reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
 # raw-reference =
 # tokenized-reference =
 # reference =
@@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
-wmt10-data = $working-dir/data
+wmt12-data = $working-dir/data
 
 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses_chart
+decoder = $moses-bin-dir/moses_chart
 
 # conversion of phrase table into binary on-disk format
-#ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
+#ttable-binarizer = $moses-bin-dir/processPhraseTable
 
 # conversion of rule table into binary on-disk format
-ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
 
 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@@ -95,7 +104,7 @@ max-sentence-length = 80
 
 ### raw corpus files (untokenized, but sentence aligned)
 #
-raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
+raw-stem = $wmt12-data/training/europarl-v7.$pair-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -112,10 +121,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
 #lowercased-stem =
 
 [CORPUS:nc]
-raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
+raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension
 
 [CORPUS:un] IGNORE
-raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
+raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
 
 #################################################################
 # LANGUAGE MODEL TRAINING
@@ -123,36 +132,41 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
 [LM]
 
 ### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
 #
+# srilm
 lm-training = $srilm-dir/ngram-count
 settings = "-interpolate -kndiscount -unk"
 
+# irstlm
+#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
+#settings = ""
+
 # order of the language model
 order = 5
 
 ### tool to be used for training randomized language model from scratch
 # (more commonly, a SRILM is trained)
 #
-#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 ### each language model to be used has its own section here
@@ -164,7 +178,7 @@ order = 5
 
 ### raw corpus (untokenized)
 #
-raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
+raw-corpus = $wmt12-data/training/europarl-v7.$output-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -176,13 +190,13 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
 #lm =
 
 [LM:nc]
-raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
 
 [LM:un] IGNORE
-raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension
 
 [LM:news] IGNORE
-raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
+raw-corpus = $wmt12-data/training/news.$output-extension.shuffled
 
 
 #################################################################
@@ -202,32 +216,36 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
 ### tuning set
 # you may use the same set that is used for mert tuning (reference set)
 #
-tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
+tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
 #raw-tuning =
 #tokenized-tuning =
 #factored-tuning =
 #lowercased-tuning =
 #split-tuning =
 
+### group language models for hierarchical interpolation
+# (flat interpolation is limited to 10 language models)
+#group = "first,second fourth,fifth"
+
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 #################################################################
 # TRANSLATION MODEL TRAINING
@@ -255,12 +273,18 @@ script = $moses-script-dir/training/train-model.perl
 #generation-factors = "word -> pos"
 #decoding-steps = "t0, g0"
 
+### parallelization of data preparation step
+# the two directions of the data preparation can be run in parallel
+# comment out if not needed
+#
+parallel = yes
+
 ### pre-computation for giza++
 # giza++ has a more efficient data structure that needs to be
 # initialized with snt2cooc. if run in parallel, this may reduces
 # memory requirements. set here the number of parts
 #
-run-giza-in-parts = 5
+#run-giza-in-parts = 5
 
 ### symmetrization method to obtain word alignments from giza output
 # (commonly used: grow-diag-final-and)
@@ -334,7 +358,7 @@ score-settings = "--GoodTuring"
 # point to a configuration file that contains
 # pointers to all relevant model files
 #
-#config =
+#config-with-reused-weights =
 
 #####################################################
 ### TUNING: finding good weights for model components
@@ -349,18 +373,18 @@ score-settings = "--GoodTuring"
 ### tuning script to be used
 #
 tuning-script = $moses-script-dir/training/mert-moses.pl
-tuning-settings = "-mertdir $moses-src-dir/mert"
+tuning-settings = "-mertdir $moses-bin-dir"
 
 ### specify the corpus used for tuning
 # it should contain 1000s of sentences
 #
-input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
+input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
 #raw-input =
 #tokenized-input =
 #factorized-input =
 #input =
 #
-reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
+reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
 #raw-reference =
 #tokenized-reference =
 #factorized-reference =
@@ -395,7 +419,7 @@ decoder-settings = ""
 
 [RECASING]
 
-#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
+#decoder = $moses-bin-dir/moses
 
 ### training data
 # raw input needs to be still tokenized,
@@ -442,6 +466,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
 
 ### additional decoder settings
 # switches for the Moses decoder
+# common choices:
+# "-threads N" for multi-threading
+# "-mbr" for MBR decoding
+# "-drop-unknown" for dropping unknown source words
+# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
 #
 #decoder-settings = ""
@@ -464,8 +493,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension
 
 ### BLEU
 #
-nist-bleu = $moses-script-dir/generic/mteval-v12.pl
-nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
+nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
+nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
 #multi-bleu = $moses-script-dir/generic/multi-bleu.perl
 #ibm-bleu =
@@ -496,11 +525,11 @@ report-segmentation = yes
 # further precision breakdown by factor
 #precision-by-coverage-factor = pos
 
-[EVALUATION:newstest2009]
+[EVALUATION:newstest2011]
 
 ### input data
 #
-input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
+input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
 # raw-input =
 # tokenized-input =
 # factorized-input =
@@ -508,7 +537,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
 
 ### reference data
 #
-reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
+reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
 # raw-reference =
 # tokenized-reference =
 # reference =
@@ -18,25 +18,34 @@ pair-extension = fr-en
 # moses
 moses-src-dir = /home/pkoehn/moses
 #
+# moses binaries
+moses-bin-dir = $moses-src-dir/dist/bin
+#
 # moses scripts
-moses-script-dir = /home/pkoehn/moses/scripts
+moses-script-dir = $moses-src-dir/scripts
 #
 # srilm
 srilm-dir = $moses-src-dir/srilm/bin/i686
 #
+# irstlm
+irstlm-dir = $moses-src-dir/irstlm/bin
+#
+# randlm
+randlm-dir = $moses-src-dir/randlm/bin
+#
 # data
-wmt10-data = $working-dir/data
+wmt12-data = $working-dir/data
 
 ### basic tools
 #
 # moses decoder
-decoder = $moses-src-dir/dist/bin/moses_chart
+decoder = $moses-bin-dir/moses_chart
 
 # conversion of phrase table into binary on-disk format
-#ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
+#ttable-binarizer = $moses-bin-dir/processPhraseTable
 
 # conversion of rule table into binary on-disk format
-ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
+ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"
 
 # tokenizers - comment out if all your data is already tokenized
 input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@@ -99,7 +108,7 @@ max-sentence-length = 80
 
 ### raw corpus files (untokenized, but sentence aligned)
 #
-raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
+raw-stem = $wmt12-data/training/europarl-v7.$pair-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -116,10 +125,10 @@ raw-stem = $wmt10-data/training/europarl-v5.$pair-extension
 #lowercased-stem =
 
 [CORPUS:nc]
-raw-stem = $wmt10-data/training/news-commentary10.$pair-extension
+raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension
 
 [CORPUS:un] IGNORE
-raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
+raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
 
 #################################################################
 # LANGUAGE MODEL TRAINING
@@ -127,36 +136,41 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
 [LM]
 
 ### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
 #
+# srilm
 lm-training = $srilm-dir/ngram-count
 settings = "-interpolate -kndiscount -unk"
 
+# irstlm
+#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
+#settings = ""
+
 # order of the language model
 order = 5
 
 ### tool to be used for training randomized language model from scratch
 # (more commonly, a SRILM is trained)
 #
-#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 ### each language model to be used has its own section here
@@ -168,7 +182,7 @@ order = 5
 
 ### raw corpus (untokenized)
 #
-raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
+raw-corpus = $wmt12-data/training/europarl-v7.$output-extension
 
 ### tokenized corpus files (may contain long sentences)
 #
@@ -180,13 +194,13 @@ raw-corpus = $wmt10-data/training/europarl-v5.$output-extension
 #lm =
 
 [LM:nc]
-raw-corpus = $wmt10-data/training/news-commentary10.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension
 
 [LM:un] IGNORE
-raw-corpus = $wmt10-data/training/undoc.2000.$pair-extension.$output-extension
+raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension
 
 [LM:news] IGNORE
-raw-corpus = $wmt10-data/training/news.$output-extension.shuffled
+raw-corpus = $wmt12-data/training/news.$output-extension.shuffled
 
 
 #################################################################
@@ -206,32 +220,36 @@ script = $moses-script-dir/ems/support/interpolate-lm.perl
 ### tuning set
 # you may use the same set that is used for mert tuning (reference set)
 #
-tuning-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
+tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
 #raw-tuning =
 #tokenized-tuning =
 #factored-tuning =
 #lowercased-tuning =
 #split-tuning =
 
+### group language models for hierarchical interpolation
+# (flat interpolation is limited to 10 language models)
+#group = "first,second fourth,fifth"
+
 ### script to use for binary table format for irstlm or kenlm
 # (default: no binarization)
 
 # irstlm
-#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
+#lm-binarizer = $irstlm-dir/compile-lm
 
 # kenlm, also set type to 8
-#lm-binarizer = $moses-src-dir/dist/bin/build_binary
-#type = 8
+lm-binarizer = $moses-bin-dir/build_binary
+type = 8
 
 ### script to create quantized language model format (irstlm)
 # (default: no quantization)
 #
-#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
+#lm-quantizer = $irstlm-dir/quantize-lm
 
 ### script to use for converting into randomized table format
 # (default: no randomization)
 #
-#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
+#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
 
 #################################################################
 # TRANSLATION MODEL TRAINING
@@ -259,12 +277,18 @@ script = $moses-script-dir/training/train-model.perl
 #generation-factors = "word -> pos"
 #decoding-steps = "t0, g0"
 
+### parallelization of data preparation step
|
||||
# the two directions of the data preparation can be run in parallel
|
||||
# comment out if not needed
|
||||
#
|
||||
parallel = yes
|
||||
|
||||
### pre-computation for giza++
|
||||
# giza++ has a more efficient data structure that needs to be
|
||||
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
run-giza-in-parts = 5
#run-giza-in-parts = 5

### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
@ -338,7 +362,7 @@ score-settings = "--GoodTuring"
# point to a configuration file that contains
# pointers to all relevant model files
#
#config =
#config-with-reused-weights =

#####################################################
### TUNING: finding good weights for model components
@ -353,18 +377,18 @@ score-settings = "--GoodTuring"
### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-src-dir/mert"
tuning-settings = "-mertdir $moses-bin-dir"

### specify the corpus used for tuning
# it should contain 1000s of sentences
#
input-sgm = $wmt10-data/dev/news-test2008-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
reference-sgm = $wmt10-data/dev/news-test2008-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-reference =
#tokenized-reference =
#factorized-reference =
@ -399,7 +423,7 @@ decoder-settings = ""

[RECASING]

#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
#decoder = $moses-bin-dir/moses

### training data
# raw input needs to be still tokenized,
@ -446,6 +470,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl

### additional decoder settings
# switches for the Moses decoder
# common choices:
# "-threads N" for multi-threading
# "-mbr" for MBR decoding
# "-drop-unknown" for dropping unknown source words
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
#
#decoder-settings = ""

@ -468,8 +497,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension

### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =

@ -500,11 +529,11 @@ report-segmentation = yes
# further precision breakdown by factor
#precision-by-coverage-factor = pos

[EVALUATION:newstest2009]
[EVALUATION:newstest2011]

### input data
#
input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm
input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm
# raw-input =
# tokenized-input =
# factorized-input =
@ -512,7 +541,7 @@ input-sgm = $wmt10-data/dev/newstest2009-src.$input-extension.sgm

### reference data
#
reference-sgm = $wmt10-data/dev/newstest2009-ref.$output-extension.sgm
reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference =
# reference =
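For orientation, the kenlm lines above (lm-binarizer plus type = 8) boil down to a single binarization call of this shape; the file names here are illustrative, and type 8 is the KenLM entry written into the generated moses.ini:

    $moses-bin-dir/build_binary lm/interpolated-lm lm/binlm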
@ -18,25 +18,34 @@ pair-extension = fr-en
# moses
moses-src-dir = /home/pkoehn/moses
#
# moses binaries
moses-bin-dir = $moses-src-dir/dist/bin
#
# moses scripts
moses-script-dir = /home/pkoehn/moses/scripts
moses-script-dir = $moses-src-dir/scripts
#
# srilm
srilm-dir = $moses-src-dir/srilm/bin/i686
#
# irstlm
irstlm-dir = $moses-src-dir/irstlm/bin
#
# randlm
randlm-dir = $moses-src-dir/randlm/bin
#
# data
toy-data = $moses-script-dir/ems/example/data

### basic tools
#
# moses decoder
decoder = $moses-src-dir/dist/bin/moses
decoder = $moses-bin-dir/moses

# conversion of phrase table into binary on-disk format
ttable-binarizer = $moses-src-dir/dist/bin/processPhraseTable
ttable-binarizer = $moses-bin-dir/processPhraseTable

# conversion of rule table into binary on-disk format
#ttable-binarizer = "$moses-src-dir/dist/bin/CreateOnDiskPt 1 1 5 100 2"
#ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 5 100 2"

# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
@ -117,36 +126,41 @@ raw-stem = $toy-data/nc-5k
[LM]

### tool to be used for language model training
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
#
# srilm
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"

# irstlm
#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
#settings = ""

# order of the language model
order = 5

### tool to be used for training randomized language model from scratch
# (more commonly, a SRILM is trained)
#
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
#lm-binarizer = $irstlm-dir/compile-lm

# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#type = 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8

### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
#lm-quantizer = $irstlm-dir/quantize-lm

### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

### each language model to be used has its own section here

@ -193,25 +207,29 @@ raw-corpus = $toy-data/nc-5k.$output-extension
#lowercased-tuning =
#split-tuning =

### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm
#lm-binarizer = $irstlm-dir/compile-lm

# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/dist/bin/build_binary
#type = 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8

### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm
#lm-quantizer = $irstlm-dir/quantize-lm

### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

#################################################################
# TRANSLATION MODEL TRAINING
@ -239,12 +257,18 @@ script = $moses-script-dir/training/train-model.perl
#generation-factors = "word -> pos"
#decoding-steps = "t0, g0"

### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes

### pre-computation for giza++
# giza++ has a more efficient data structure that needs to be
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
run-giza-in-parts = 5
#run-giza-in-parts = 5

### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
@ -318,7 +342,7 @@ score-settings = "--GoodTuring"
# point to a configuration file that contains
# pointers to all relevant model files
#
#config =
#config-with-reused-weights =

#####################################################
### TUNING: finding good weights for model components
@ -333,7 +357,7 @@ weight-config = $toy-data/weight.ini
### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-src-dir/mert"
tuning-settings = "-mertdir $moses-bin-dir"

### specify the corpus used for tuning
# it should contain 1000s of sentences
@ -379,7 +403,7 @@ decoder-settings = ""

[RECASING]

#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
#decoder = $moses-bin-dir/moses

### training data
# raw input needs to be still tokenized,
@ -422,6 +446,11 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl

### additional decoder settings
# switches for the Moses decoder
# common choices:
# "-threads N" for multi-threading
# "-mbr" for MBR decoding
# "-drop-unknown" for dropping unknown source words
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"

@ -444,8 +473,8 @@ wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension

### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =
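The decoder-settings line above corresponds to a manual cube-pruning run of roughly this form (paths illustrative):

    $moses-bin-dir/moses -f model/moses.ini -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 < input.tokenized > output.tokenized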
@ -107,7 +107,7 @@ consolidate
default-name: truecaser/corpus
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
train
in: tokenized-stem
in: tokenized-stem
out: truecase-model
rerun-on-change: trainer
default-name: truecaser/truecase-model
@ -207,6 +207,7 @@ binarize
rerun-on-change: lm
default-name: lm/binlm
template: $lm-binarizer IN OUT
error: set kMaxOrder to at least this value

[INTERPOLATED-LM] single
tuning-from-sgm
@ -253,27 +254,26 @@ split-tuning
template: $output-splitter -model IN1.$output-extension < IN > OUT
interpolate
in: script split-tuning LM:lm
rerun-on-change: srilm-dir
rerun-on-change: srilm-dir group
out: lm
default-name: lm/interpolated-lm
randomize
in: lm
out: rlm
pass-unless: lm-randomizer
default-name: lm/rlm
default-name: lm/interpolated-rlm
quantize
in: rlm
out: qlm
pass-unless: lm-quantizer
default-name: lm/interpolated-qlm
template: $lm-quantizer IN OUT
binarize
in: qlm
out: binlm
pass-unless: lm-binarizer
rerun-on-change: lm
default-name: lm/interpolated-binlm
template: $lm-binarizer IN OUT
error: set kMaxOrder to at least this value

[TRAINING] single
consolidate
@ -370,17 +370,9 @@ build-generation-custom
ignore-unless: AND generation-factors generation-corpus
default-name: model/generation-table
create-config
in: reordering-table phrase-translation-table generation-table LM:binlm
out: config
ignore-if: use-hiero INTERPOLATED-LM:script
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings
default-name: model/moses.ini
error: Unknown option
create-config-interpolated-lm
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm LM:binlm
out: config
ignore-if: use-hiero
ignore-unless: INTERPOLATED-LM:script
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings
default-name: model/moses.ini
error: Unknown option
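As a sketch of how EMS expands these templates: for the interpolated-LM binarize step, template: $lm-binarizer IN OUT becomes a call like the following, with IN and OUT filled from the step's in/out file names (all file names illustrative):

    $moses-bin-dir/build_binary lm/interpolated-qlm lm/interpolated-binlm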
@ -934,7 +934,12 @@ sub define_step {
&define_training_create_config($i);
}
elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:interpolate') {
&define_training_interpolated_lm_interpolate($i);
&define_interpolated_lm_interpolate($i);
}
elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:binarize' ||
$DO_STEP[$i] eq 'INTERPOLATED-LM:quantize' ||
$DO_STEP[$i] eq 'INTERPOLATED-LM:randomize') {
&define_interpolated_lm_process($i);
}
elsif ($DO_STEP[$i] eq 'TUNING:factorize-input') {
&define_tuningevaluation_factorize($i);
@ -991,7 +996,10 @@ sub execute_steps {
while(1) {

# find steps to be done
for(my $i=0;$i<=$#DO_STEP;$i++) {
my $repeat_if_passed = 1;
while($repeat_if_passed) {
$repeat_if_passed = 0;
for(my $i=0;$i<=$#DO_STEP;$i++) {
next if (defined($DONE{$i}));
next if (defined($DO{$i}));
next if (defined($CRASHED{$i}));
@ -1000,10 +1008,19 @@ sub execute_steps {
foreach my $prev_step (@{$DEPENDENCY[$i]}) {
$doable = 0 if !defined($DONE{$prev_step});
}
$DO{$i} = 1 if $doable;
next unless $doable;
$DO{$i} = 1;

# immediately label pass steps as done
next unless defined($PASS{$i});
$DONE{$i} = 1;
delete($DO{$i});
$repeat_if_passed = 1;
}
}

print "number of steps doable or running: ".(scalar keys %DO)."\n";
foreach my $step (keys %DO) { print "\t".($DO{$step}==2?"running: ":"doable: ").$DO_STEP[$step]."\n"; }
return unless scalar keys %DO;

# execute new step
@ -1033,7 +1050,7 @@ sub execute_steps {
elsif ($CLUSTER || $active < $MAX_ACTIVE) {
$active++;
$DO{$i}++;
print "sh ($active)\n";
print "sh ($active active)\n";
sleep(5);
if (!fork) {
`sh $step >$step.STDOUT 2> $step.STDERR`;
@ -1275,7 +1292,8 @@ sub check_if_crashed {
foreach my $pattern (@{$ERROR{&defined_step_id($i)}},
'error','killed','core dumped','can\'t read',
'no such file or directory','unknown option',
'died at','exit code','permission denied') {
'died at','exit code','permission denied',
"Can't locate") {
if (/$pattern/i) {
my $not_error = 0;
if (defined($NOT_ERROR{&defined_step_id($i)})) {
@ -1769,11 +1787,11 @@ sub define_training_create_config {

# find out which language model files have been built
my @LM_SETS = &get_sets("LM");
my %INTERPOLATED_AWAY;
my %OUTPUT_FACTORS;
%OUTPUT_FACTORS = &get_factor_id("output") if &backoff_and_get("TRAINING:output-factors");

my $interpolated = &get("INTERPOLATED-LM:script"); # flag
if ($interpolated) {
if (&get("INTERPOLATED-LM:script")) {
my $type = 0;
# binarizing the lm?
$type = 1 if (&get("INTERPOLATED-LM:binlm") ||
@ -1783,23 +1801,32 @@ sub define_training_create_config {
&backoff_and_get("INTERPOLATED-LM:lm-randomizer"));

# manually set type
$type = &get("INTERPOLATED-LM:type") if (&get("INTERPOLATED-LM:type"));
$type = &get("INTERPOLATED-LM:type") if &get("INTERPOLATED-LM:type");

# order and factor inherited from individual LMs
my $set = shift @LM_SETS;
my $order = &check_backoff_and_get("LM:$set:order");
my $factor = 0;
if (&backoff_and_get("TRAINING:output-factors") &&
&backoff_and_get("LM:$set:factors")) {
$factor = $OUTPUT_FACTORS{&backoff_and_get("LM:$set:factors")};
}
$cmd .= "-lm $factor:$order:$LM[0]:$type ";
# go through each interpolated language model
my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
my $FACTOR = &backoff_and_get_array("TRAINING:output-factors");
foreach my $factor (keys %{$ILM_SETS}) {
foreach my $order (keys %{$$ILM_SETS{$factor}}) {
next unless scalar(@{$$ILM_SETS{$factor}{$order}}) > 1;
my $suffix = "";
$suffix = ".$$FACTOR[$factor]" if $icount > 1 && defined($FACTOR);
$suffix .= ".order$order" if $icount > 1;
$cmd .= "-lm $factor:$order:$LM[0]$suffix:$type ";
foreach my $id_set (@{$$ILM_SETS{$factor}{$order}}) {
my ($id,$set) = split(/ /,$id_set,2);
$INTERPOLATED_AWAY{$set} = 1;
}
}
else {
}
shift @LM;
}

die("ERROR: number of defined LM sets (".(scalar @LM_SETS).":".join(",",@LM_SETS).") and LM files (".(scalar @LM).":".join(",",@LM).") does not match")
unless scalar @LM == scalar @LM_SETS;
foreach my $lm (@LM) {
my $set = shift @LM_SETS;
next if defined($INTERPOLATED_AWAY{$set});
my $order = &check_backoff_and_get("LM:$set:order");
my $lm_file = "$lm";
my $type = 0; # default: SRILM
@ -1824,54 +1851,143 @@ sub define_training_create_config {
}

$cmd .= "-lm $factor:$order:$lm_file:$type ";
}
}

&create_step($step_id,$cmd);
}

sub define_training_interpolated_lm_interpolate {
sub define_interpolated_lm_interpolate {
my ($step_id) = @_;

my ($interpolated_lm,
$interpolation_script, $tuning, @LM)
= &get_output_and_input($step_id);
$interpolation_script, $tuning, @LM) = &get_output_and_input($step_id);
my $srilm_dir = &check_backoff_and_get("INTERPOLATED-LM:srilm-dir");
my $group = &get("INTERPOLATED-LM:group");

my $lm_list = "";
foreach (@LM) {
$lm_list .= $_.",";
}
chop($lm_list);
my $cmd = "";

# sanity checks on order and factors
my @LM_SETS = &get_sets("LM");
my %OUTPUT_FACTORS;
%OUTPUT_FACTORS = &get_factor_id("output")
if &backoff_and_get("TRAINING:output-factors");
my ($factor,$order);
foreach my $set (@LM_SETS) {
my $set_order = &check_backoff_and_get("LM:$set:order");
if (defined($order) && $order != $set_order) {
die("ERROR: language models have mismatching order - no interpolation possible!");
}
$order = $set_order;

if (&backoff_and_get("TRAINING:output-factors") &&
&backoff_and_get("LM:$set:factors")) {
my $set_factor = $OUTPUT_FACTORS{&backoff_and_get("LM:$set:factors")};
if (defined($factor) && $factor != $set_factor) {
die("ERROR: language models have mismatching factors - no interpolation possible!");
}
$factor = $set_factor;
}
# go through language models by factor and order
my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
foreach my $factor (keys %{$ILM_SETS}) {
foreach my $order (keys %{$$ILM_SETS{$factor}}) {
next unless scalar(@{$$ILM_SETS{$factor}{$order}}) > 1;

# get list of language model files
my $lm_list = "";
foreach my $id_set (@{$$ILM_SETS{$factor}{$order}}) {
my ($id,$set) = split(/ /,$id_set,2);
$lm_list .= $LM[$id].",";
}
chop($lm_list);

# if grouping, identify position in list
my $numbered_string = "";
if (defined($group)) {
my %POSITION;
foreach my $id_set (@{$$ILM_SETS{$factor}{$order}}) {
my ($id,$set) = split(/ /,$id_set,2);
$POSITION{$set} = scalar keys %POSITION;
}
my $group_string = $group;
$group_string =~ s/\s+/ /g;
$group_string =~ s/ *, */,/g;
$group_string =~ s/^ //;
$group_string =~ s/ $//;
$group_string .= " ";
while($group_string =~ /^([^ ,]+)([ ,]+)(.*)$/) {
die("ERROR: unknown set $1 in INTERPOLATED-LM:group definition")
if ! defined($POSITION{$1});
$numbered_string .= $POSITION{$1}.$2;
$group_string = $3;
}
chop($numbered_string);
}

my $FACTOR = &backoff_and_get_array("TRAINING:output-factors");
my $name = $interpolated_lm;
if ($icount > 1) {
$name .= ".$$FACTOR[$factor]" if defined($FACTOR);
$name .= ".order$order";
}
$cmd .= "$interpolation_script --tuning $tuning --name $name --srilm $srilm_dir --lm $lm_list";
$cmd .= " --group \"$numbered_string\"" if defined($group);
$cmd .= "\n";
}
}

my $cmd = "$interpolation_script --tuning $tuning --name $interpolated_lm --srilm $srilm_dir --lm $lm_list";

die("ERROR: Nothing to interpolate, remove interpolation step!") if $cmd eq "";
&create_step($step_id,$cmd);
}

sub define_interpolated_lm_process {
my ($step_id) = @_;

my ($processed_lm, $interpolatd_lm) = &get_output_and_input($step_id);
my ($module,$set,$stepname) = &deconstruct_name($DO_STEP[$step_id]);
my $tool = &check_backoff_and_get("INTERPOLATED-LM:lm-${stepname}r");
my $FACTOR = &backoff_and_get_array("TRAINING:output-factors");

# go through language models by factor and order
my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
my $cmd = "";
foreach my $factor (keys %{$ILM_SETS}) {
foreach my $order (keys %{$$ILM_SETS{$factor}}) {
next unless scalar(@{$$ILM_SETS{$factor}{$order}}) > 1;
my $suffix = "";
$suffix = ".$$FACTOR[$factor]" if $icount > 1 && defined($FACTOR);
$suffix .= ".order$order" if $icount > 1;
$cmd .= "$tool $interpolatd_lm$suffix $processed_lm$suffix\n";
}
}

&create_step($step_id,$cmd);
}

sub get_interpolated_lm_processed_names {
my ($processed_lm) = @_;
my @ILM_NAME;
my ($icount,$ILM_SETS) = &get_interpolated_lm_sets();
my $FACTOR = &backoff_and_get_array("TRAINING:output-factors");
foreach my $factor (keys %{$ILM_SETS}) {
foreach my $order (keys %{$$ILM_SETS{$factor}}) {
if (scalar(@{$$ILM_SETS{$factor}{$order}}) > 1) {
my $suffix = "";
$suffix = ".$$FACTOR[$factor]" if $icount > 1 && defined($FACTOR);
$suffix .= ".order$order" if $icount > 1;
push @ILM_NAME,"$processed_lm$suffix";
}
else {
push @ILM_NAME,"$processed_lm.".($FACTOR?"":".$$FACTOR[$factor]").".order$order";
}
}
}
return @ILM_NAME;
}

sub get_interpolated_lm_sets {
my %ILM_SETS;

my @LM_SETS = &get_sets("LM");
my %OUTPUT_FACTORS;
%OUTPUT_FACTORS = &get_factor_id("output") if &backoff_and_get("TRAINING:output-factors");

my $count=0;
my $icount=0;
foreach my $set (@LM_SETS) {
my $order = &check_backoff_and_get("LM:$set:order");

my $factor = 0;
if (&backoff_and_get("TRAINING:output-factors") &&
&backoff_and_get("LM:$set:factors")) {
$factor = $OUTPUT_FACTORS{&backoff_and_get("LM:$set:factors")};
}

push @{$ILM_SETS{$factor}{$order}}, ($count++)." ".$set;
$icount++ if scalar(@{$ILM_SETS{$factor}{$order}}) == 2;
}
return ($icount,\%ILM_SETS);
}

sub get_training_setting {
my ($step) = @_;
my $dir = &check_and_get("GENERAL:working-dir");
@ -1888,6 +2004,7 @@ sub get_training_setting {
my $source_syntax = &get("GENERAL:input-parser");
my $target_syntax = &get("GENERAL:output-parser");
my $score_settings = &get("TRAINING:score-settings");
my $parallel = &get("TRAINING:parallel");

my $xml = $source_syntax || $target_syntax;

@ -1909,6 +2026,7 @@ sub get_training_setting {
$cmd .= "-source-syntax " if $source_syntax;
$cmd .= "-glue-grammar " if $hierarchical;
$cmd .= "-score-options '".$score_settings."' " if $score_settings;
$cmd .= "-parallel " if $parallel;

# factored training
if (&backoff_and_get("TRAINING:input-factors")) {
@ -2261,12 +2379,13 @@ sub define_reporting_report {
### subs for step definition

sub get_output_and_input {
my ($step_id) = @_;
my ($step_id) = @_;

my $step = $DO_STEP[$step_id];
my $output = &get_default_file(&deconstruct_name($step));
my $step = $DO_STEP[$step_id];
my $output = &get_default_file(&deconstruct_name($step));

my @INPUT;
my @INPUT;
if (defined($USES_INPUT{$step_id})) {
for(my $i=0; $i<scalar @{$USES_INPUT{$step_id}}; $i++) {
# get name of input file needed
my $in_file = $USES_INPUT{$step_id}[$i];
@ -2298,7 +2417,8 @@ sub get_output_and_input {
push @INPUT,&get_specified_or_default_file(&deconstruct_name($in_file),
&deconstruct_name($prev_step));
}
return ($output,@INPUT);
}
return ($output,@INPUT);
}

sub define_template {
@ -2397,6 +2517,9 @@ sub define_template {
}
# input is defined as IN or IN0, IN1, IN2
else {
if ($cmd =~ /([^ANS])IN/ && scalar(@INPUT) == 0) {
die("ERROR: Step $step requires input from prior steps, but none defined.");
}
$cmd =~ s/([^ANS])IN(\d+)/$1$INPUT[$2]/g; # a bit trickier to
$cmd =~ s/([^ANS])IN/$1$INPUT[0]/g; # avoid matching TRAINING, RECASING
$cmd =~ s/^IN(\d+)/$INPUT[$2]/g;
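The per-bucket command string assembled by define_interpolated_lm_interpolate above comes out as one interpolate-lm.perl call per factor/order combination, roughly like this (all file names illustrative):

    interpolate-lm.perl --tuning tuning/input.lowercased --name lm/interpolated-lm --srilm /path/to/srilm/bin --lm lm/europarl.lm,lm/nc.lm,lm/news.lm --group "0,1 2"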
@ -12,13 +12,14 @@ binmode(STDERR, ":utf8");

my $SRILM = "/home/pkoehn/moses/srilm/bin/i686-m64";
my $TEMPDIR = "/tmp";
my ($TUNING,$LM,$NAME);
my ($TUNING,$LM,$NAME,$GROUP);

die("interpolate-lm.perl --tuning set --name out-lm --lm lm1,lm2,lm3 [--srilm srtilm-dir --tempdir tempdir]")
die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--srilm srilm-dir --tempdir tempdir --group \"0,1 2,3\"]")
unless &GetOptions('tuning=s' => \$TUNING,
'name=s' => \$NAME,
'srilm=s' => \$SRILM,
'tempdir=s' => \$TEMPDIR,
'group=s' => \$GROUP,
'lm=s' => \$LM);

# check and set default to unset parameters
@ -52,49 +53,109 @@ foreach my $lm (@LM) {
}
print STDERR "language models have order $order.\n";

my $tmp = tempdir(DIR=>$TEMPDIR);
# too many language models? group them first
if (!defined($GROUP) && scalar(@LM) > 10) {
print STDERR "more than 10, automatically grouping language models.\n";
my $num_groups = int(scalar(@LM)/10 + 0.99);
my $size_groups = int(scalar(@LM)/$num_groups + 0.99);

# compute perplexity
my $i = 0;
foreach my $lm (@LM) {
print STDERR "compute perplexity for $lm\n";
safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
$i++;
$GROUP = "";
for(my $i=0;$i<$num_groups;$i++) {
$GROUP .= " " unless $i==0;
for(my $j=0;$j<$size_groups;$j++) {
my $lm_i = $i*$size_groups+$j;
next if $lm_i >= scalar(@LM);
$GROUP .= "," unless $j==0;
$GROUP .= $lm_i;
}
}
print STDERR "groups: $GROUP\n";
}

# compute lambdas
print STDERR "computing lambdas...\n";
my $cmd = "$SRILM/compute-best-mix";
for(my $i=0;$i<scalar(@LM);$i++) {
$cmd .= " $tmp/iplm.$$.$i";
# normal interpolation
if (!defined($GROUP)) {
&interpolate($NAME,@LM);
exit;
}
my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
die "Failed to mix models: $mixerr" if $mixexitcode != 0;
my $mix = $mixout;
`rm $tmp/iplm.$$.*`;
$mix =~ /best lambda \(([\d\. ]+)\)/ || die("ERROR: computing lambdas failed: $mix");
my @LAMBDA = split(/ /,$1);

# create new language models
print STDERR "creating new language model...\n";
$i = 0;
$cmd = "$SRILM/ngram -unk -order $order -write-lm $NAME";
foreach my $lm (@LM) {
$cmd .= " -lm " if $i==0;
$cmd .= " -mix-lm " if $i==1;
$cmd .= " -mix-lm$i " if $i>1;
$cmd .= $lm;
$cmd .= " -lambda " if $i==0;
$cmd .= " -mix-lambda$i " if $i>1;
$cmd .= $LAMBDA[$i] if $i!=1;
$i++;
# group language models into sub-interpolated models
my %ALREADY;
my $g = 0;
my @SUB_NAME;
foreach my $subgroup (split(/ /,$GROUP)) {
my @SUB_LM;
foreach my $lm_i (split(/,/,$subgroup)) {
die("ERROR: LM id $lm_i in group definition out of range") if $lm_i >= scalar(@LM);
push @SUB_LM,$LM[$lm_i];
$ALREADY{$lm_i} = 1;
}
#if (scalar @SUB_NAME == 0 && scalar keys %ALREADY == scalar @LM) {
# print STDERR "WARNING: grouped all language models into one, perform normal interpolation\n";
# &interpolate($NAME,@LM);
# exit;
#}
my $name = $NAME.".group-".chr(97+($g++));
push @SUB_NAME,$name;
print STDERR "\n=== BUILDING SUB LM $name from\n\t".join("\n\t",@SUB_LM)."\n===\n\n";
&interpolate($name, @SUB_LM);
}
safesystem($cmd) or die "Failed.";
for(my $lm_i=0; $lm_i < scalar(@LM); $lm_i++) {
next if defined($ALREADY{$lm_i});
push @SUB_NAME, $LM[$lm_i];
}
print STDERR "\n=== BUILDING FINAL LM ===\n\n";
&interpolate($NAME, @SUB_NAME);

rmtree($tmp); # remove the temp dir
print STDERR "done.\n";
# main interpolation function
sub interpolate {
my ($name,@LM) = @_;

die("cannot interpolate more than 10 language models at once.")
if scalar(@LM) > 10;

my $tmp = tempdir(DIR=>$TEMPDIR);

# compute perplexity
my $i = 0;
foreach my $lm (@LM) {
print STDERR "compute perplexity for $lm\n";
safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
$i++;
}

# compute lambdas
print STDERR "computing lambdas...\n";
my $cmd = "$SRILM/compute-best-mix";
for(my $i=0;$i<scalar(@LM);$i++) {
$cmd .= " $tmp/iplm.$$.$i";
}
my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
die "Failed to mix models: $mixerr" if $mixexitcode != 0;
my $mix = $mixout;
`rm $tmp/iplm.$$.*`;
$mix =~ /best lambda \(([\d\. ]+)\)/ || die("ERROR: computing lambdas failed: $mix");
my @LAMBDA = split(/ /,$1);

# create new language model
print STDERR "creating new language model...\n";
$i = 0;
$cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
foreach my $lm (@LM) {
$cmd .= " -lm " if $i==0;
$cmd .= " -mix-lm " if $i==1;
$cmd .= " -mix-lm$i " if $i>1;
$cmd .= $lm;
$cmd .= " -lambda " if $i==0;
$cmd .= " -mix-lambda$i " if $i>1;
$cmd .= $LAMBDA[$i] if $i!=1;
$i++;
}
safesystem($cmd) or die "Failed.";

rmtree($tmp); # remove the temp dir
print STDERR "done.\n";
}

sub safesystem {
print STDERR "Executing: @_\n";
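Two usage notes on the grouping logic above. First, the automatic grouping is pure arithmetic: with 23 language models, $num_groups = int(23/10 + 0.99) = 3 and $size_groups = int(23/3 + 0.99) = 8, so the script reports:

    groups: 0,1,2,3,4,5,6,7 8,9,10,11,12,13,14,15 16,17,18,19,20,21,22

Second, an explicit grouping can be supplied on the command line, following the updated usage string (file names illustrative):

    interpolate-lm.perl --tuning tuning-set --name out-lm --lm lm0,lm1,lm2,lm3 --srilm /path/to/srilm/bin --group "0,1 2,3"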
@ -16,7 +16,7 @@ while(<WEIGHT>) {
if (/^\[weight\-(\S+)\]/) {
$current_weight = $1;
}
elsif ($current_weight && /^([\-\d\.]+)([Ee][+-]?[\d]+)?$/) {
elsif ($current_weight && /^(([\-\d\.]+)([Ee][+-]?[\d]+)?)$/) {
push @{$WEIGHT{$current_weight}},$1;
}
elsif (/^\[/) {
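The widened capture group matters for weights in scientific notation: with the old pattern, $1 held only the mantissa and the exponent was dropped. A quick check with a hypothetical weight value:

    echo "0.5E-2" | perl -ne 'print "$1\n" if /^(([\-\d\.]+)([Ee][+-]?[\d]+)?)$/'
    # prints 0.5E-2; the old pattern would have captured just 0.5 into $1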
@ -553,6 +553,7 @@ sub bleu_score {
my $score = 0;
my $iscore = 0;
my $len_score = min (0, 1-$shortest_ref_length/$tst_ngrams->[1]);
print "length ratio: ".($tst_ngrams->[1]/$shortest_ref_length)." ($tst_ngrams->[1]/$shortest_ref_length), penalty (log): $len_score\n";

for (my $j=1; $j<=$max_Ngram; $j++) {
if ($matching_ngrams->[$j] == 0) {
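A worked example of the new diagnostic line (numbers illustrative): for a 90-token test output against a 100-token shortest reference, $len_score = min(0, 1 - 100/90), about -0.1111, so the added print emits roughly:

    length ratio: 0.9 (90/100), penalty (log): -0.111111111111111

which corresponds to a brevity penalty of exp(-0.1111), about 0.89.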
1168	scripts/generic/mteval-v13a.pl (new executable file; diff suppressed because it is too large)
@ -3,9 +3,15 @@
# $Id$
use strict;

my $lowercase = 0;
if ($ARGV[0] eq "-lc") {
$lowercase = 1;
shift;
}

my $stem = $ARGV[0];
if (!defined $stem) {
print STDERR "usage: multi-bleu.pl reference < hypothesis\n";
print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
print STDERR "Reads the references from reference or reference0, reference1, ...\n";
exit(1);
}
@ -35,12 +41,14 @@ my(@CORRECT,@TOTAL,$length_translation,$length_reference);
my $s=0;
while(<STDIN>) {
chop;
$_ = lc if $lowercase;
my @WORD = split;
my %REF_NGRAM = ();
my $length_translation_this_sentence = scalar(@WORD);
my ($closest_diff,$closest_length) = (9999,9999);
foreach my $reference (@{$REF[$s]}) {
# print "$s $_ <=> $reference\n";
$reference = lc($reference) if $lowercase;
my @WORD = split(/ /,$reference);
my $length = scalar(@WORD);
my $diff = abs($length_translation_this_sentence-$length);
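Usage with the new switch, following the updated usage string (file names illustrative):

    multi-bleu.perl -lc reference < hypothesis

With -lc, both the hypothesis and every reference are lowercased before n-gram matching, giving a case-insensitive BLEU score.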
78	scripts/generic/trainlm-irst.perl (new executable file)
@ -0,0 +1,78 @@
#!/usr/bin/perl -w

# Compatible with sri LM-creating script, eg.
# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
# To use it in the EMS, add this to the [LM] section
# lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irst-dir"
# settings = ""
# Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section).
# It should point to the root of the LM toolkit, eg
# irst-dir = /Users/hieu/workspace/irstlm/trunk
# And make sure that $cores is defined, eg $cores = 8

use strict;
use FindBin qw($Bin);
use Getopt::Long;

my $order;
my $corpusPath;
my $lmPath;
my $cores = 2;
my $irstPath;
my $tempPath = "tmp";

GetOptions("order=s" => \$order,
"text=s" => \$corpusPath,
"lm=s" => \$lmPath,
"cores=s" => \$cores,
"irst-dir=s" => \$irstPath,
"temp-dir=s" => \$tempPath
) or exit 1;

die("ERROR: please set order") unless defined($order);
die("ERROR: please set text") unless defined($corpusPath);
die("ERROR: please set lm") unless defined($lmPath);
die("ERROR: please set irst-dir") unless defined($irstPath);

my $ext = ($corpusPath =~ m/([^.]+)$/)[0];
print "extension is $ext\n";

$tempPath .= "/irstlm-build-tmp.$$";
`mkdir -p $tempPath`;

my $cmd;
if ($ext eq "gz")
{
$cmd = "zcat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
}
else
{
$cmd = "cat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
}
print STDERR "EXECUTING $cmd\n";
`$cmd`;

$cmd = "IRSTLM=$irstPath/.. $irstPath/build-lm.sh -t $tempPath/stat4 -i \"gunzip -c $tempPath/monolingual.setagged.gz\" -n $order -p -o $tempPath/iarpa.gz -k $cores";
print STDERR "EXECUTING $cmd\n";
`$cmd`;

$ext = ($lmPath =~ m/([^.]+)$/)[0];
print "extension is $ext\n";

if ($ext eq "gz")
{
$cmd = "$irstPath/compile-lm $tempPath/iarpa.gz --text yes /dev/stdout | gzip -c > $lmPath";
}
else
{
$cmd = "$irstPath/compile-lm $tempPath/iarpa.gz --text yes $lmPath";
}

print STDERR "EXECUTING $cmd\n";
`$cmd`;

$cmd = "rm -rf $tempPath";
print STDERR "EXECUTING $cmd\n";
`$cmd`;

print STDERR "FINISH.\n";
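A typical standalone invocation of the new script, mirroring the options it declares (paths illustrative):

    trainlm-irst.perl -order 5 -text corpus.txt -lm lm.gz -cores 8 -irst-dir /path/to/irstlm -temp-dir /tmp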
@ -10,6 +10,7 @@

# Excerpts from revision history

# Dec 2011 update the script for the mert-moses.pl compatibility
# Sept 2011 multi-threaded mert (Barry Haddow)
# 3 Aug 2011 Added random directions, historic best, pairwise ranked (PK)
# Jul 2011 simplifications (Ondrej Bojar)
@ -47,9 +48,13 @@
# 13 Oct 2004 Use alternative decoders (DWC)
# Original version by Philipp Koehn

use strict;
use FindBin qw($Bin);
use File::Basename;
use File::Path;
use File::Spec;
use Cwd;

my $SCRIPTS_ROOTDIR = $Bin;
$SCRIPTS_ROOTDIR =~ s/\/training$//;
$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});
@ -82,12 +87,16 @@ my $minimum_required_change_in_weights = 0.00001;

my $verbose = 0;
my $usage = 0; # request for --help
my $___WORKING_DIR = "mert-work";

# If no working directory is specified,
# the default is set to `pwd`/mert-work
my $___WORKING_DIR = File::Spec->catfile(Cwd::getcwd(), "mert-work");
my $___DEV_F = undef; # required, input text to decode
my $___DEV_E = undef; # required, basename of files with references
my $___DECODER = undef; # required, pathname to the decoder executable
my $___CONFIG = undef; # required, pathname to startup ini file
my $___N_BEST_LIST_SIZE = 100;
my $___LATTICE_SAMPLES = 0;
my $queue_flags = "-hard"; # extra parameters for parallelizer
# the -l ws0ssmt was relevant only to JHU 2006 workshop
my $___JOBS = undef; # if parallel, number of jobs to use (undef or 0 -> serial)
@ -133,7 +142,6 @@ my $filtercmd = undef; # path to filter-model-given-input.pl
my $filterfile = undef;
my $qsubwrapper = undef;
my $moses_parallel_cmd = undef;
my $scorer_config = "BLEU:1";
my $old_sge = 0; # assume sge<6.0
my $___CONFIG_ORIG = undef; # pathname to startup ini file before filtering
my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on
@ -144,10 +152,10 @@ my $prev_aggregate_nbl_size = -1; # number of previous step to consider when loa
# -1 means all previous, i.e. from iteration 1
# 0 means no previous data, i.e. from actual iteration
# 1 means 1 previous data , i.e. from the actual iteration and from the previous one
# and so on
# and so on
my $maximum_iterations = 25;
my $scorer_config = undef ;

use strict;
use Getopt::Long;
GetOptions(
"working-dir=s" => \$___WORKING_DIR,
@ -157,6 +165,7 @@ GetOptions(
"decoder=s" => \$___DECODER,
"config=s" => \$___CONFIG,
"nbest=i" => \$___N_BEST_LIST_SIZE,
"lattice-samples=i" => \$___LATTICE_SAMPLES,
"queue-flags=s" => \$queue_flags,
"jobs=i" => \$___JOBS,
"decoder-flags=s" => \$___DECODER_FLAGS,
@ -191,8 +200,8 @@ GetOptions(
"pairwise-ranked" => \$___PAIRWISE_RANKED_OPTIMIZER,
"pro-starting-point" => \$___PRO_STARTING_POINT,
"historic-interpolation=f" => \$___HISTORIC_INTERPOLATION,
"threads=i" => \$__THREADS,
"sc-config=s" => \$scorer_config
"sc-config=s" => \$scorer_config,
"threads=i" => \$__THREADS
) or exit(1);

# the 4 required parameters can be supplied on the command line directly
@ -210,6 +219,7 @@ if ($usage || !defined $___DEV_F || !defined $___DEV_E || !defined $___DECODER |
Options:
--working-dir=mert-dir ... where all the files are created
--nbest=100 ... how big nbestlist to generate
--lattice-samples ... how many lattice samples (Chatterjee & Cancedda, emnlp 2010)
--jobs=N ... set this to anything to run moses in parallel
--mosesparallelcmd=STR ... use a different script instead of moses-parallel
--queue-flags=STRING ... anything you wish to pass to qsub, eg.
@ -276,7 +286,7 @@ Options:
--threads=NUMBER ... Use multi-threaded mert (must be compiled in).
--historic-interpolation ... Interpolate optimized weights with prior iterations' weight
(parameter sets factor [0;1] given to current weights)
--sc-config=STRING ... extra option to specify multiscoring.
--sc-config=\"METRIC1:WEIGHT1,METRIC2:WEIGHT2\" ... extra option to specify tuning with multiple metrics.
";
exit 1;
}
@ -284,7 +294,6 @@ Options:

# Check validity of input parameters and set defaults if needed

print STDERR "Using WORKING_DIR: $___WORKING_DIR\n";
print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n";

# path of script for filtering phrase tables and running the decoder
@ -308,9 +317,11 @@ if (!defined $mertdir) {

my $mert_extract_cmd = "$mertdir/extractor";
my $mert_mert_cmd = "$mertdir/mert";
my $mert_pro_cmd = "$mertdir/pro";

die "Not executable: $mert_extract_cmd" if ! -x $mert_extract_cmd;
die "Not executable: $mert_mert_cmd" if ! -x $mert_mert_cmd;
die "Not executable: $mert_pro_cmd" if ! -x $mert_pro_cmd;

my $pro_optimizer = "$mertdir/megam_i686.opt"; # or set to your installation
if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) {
@ -610,6 +621,8 @@ my $oldallsorted = undef;
my $allsorted = undef;

my $nbest_file=undef;
my $lsamp_file=undef; #Lattice samples
my $orig_nbest_file=undef; # replaced if lattice sampling

while(1) {
$run++;
@ -629,8 +642,20 @@ while(1) {
# skip running the decoder if the user wanted
if (!$skip_decoder) {
print "($run) run decoder to produce n-best lists\n";
$nbest_file = run_decoder($featlist, $run, $need_to_normalize);
($nbest_file,$lsamp_file) = run_decoder($featlist, $run, $need_to_normalize);
$need_to_normalize = 0;
if ($___LATTICE_SAMPLES) {
my $combined_file = "$nbest_file.comb";
safesystem("sort -k1,1n $nbest_file $lsamp_file > $combined_file") or
die("failed to merge nbest and lattice samples");
safesystem("gzip -f $nbest_file; gzip -f $lsamp_file") or
die "Failed to gzip nbests and lattice samples";
$orig_nbest_file = "$nbest_file.gz";
$orig_nbest_file = "$nbest_file.gz";
$lsamp_file = "$lsamp_file.gz";
$lsamp_file = "$lsamp_file.gz";
$nbest_file = "$combined_file";
}
safesystem("gzip -f $nbest_file") or die "Failed to gzip run*out";
$nbest_file = $nbest_file.".gz";
}
@ -648,9 +673,12 @@ while(1) {
my $base_score_file = "scores.dat";
my $feature_file = "run$run.${base_feature_file}";
my $score_file = "run$run.${base_score_file}";

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
my $cmd = "";

if (defined($scorer_config))
{
#process the multiple metric way
print STDERR "-- process the multiple metric way --\n";
my $scorer_name;
my $scorer_weight;
$scorer_config=~s/ //g;
@ -659,108 +687,153 @@ while(1) {
my $scorer_config_spec;
foreach $scorer_config_spec(@lists_scorer_config)
{
# print STDERR $scorer_config_spec."\n";
my @lists_scorer_config_spec=split(":",$scorer_config_spec);
$scorer_name=$lists_scorer_config_spec[0];
$scorer_weight=$lists_scorer_config_spec[1];
# print STDERR $scorer_name."\n";
# print STDERR $scorer_weight."\n";
$cmd = "$mert_extract_cmd $mert_extract_args --scfile $score_file.$scorer_name --ffile $feature_file.$scorer_name --sctype $scorer_name -r ".join(",", @references)." -n $nbest_file";
# print STDERR "LANCEMENT $scorer_name ********************************************\n";
|
||||
&submit_or_exec($cmd,"extract.out.$scorer_name","extract.err.$scorer_name");
|
||||
# print STDERR "FIN $scorer_name ************************************************** \n";
|
||||
# print STDERR "executing $cmd\n";
|
||||
|
||||
# print STDERR "\n";
|
||||
# safesystem("date");
|
||||
# print STDERR "\n";
|
||||
|
||||
# if (defined $___JOBS) {
|
||||
# safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=extract.out.$scorer_name -stderr=extract.err.$scorer_name" )
|
||||
# or die "$scorer_name Failed to submit extraction to queue (via $qsubwrapper)";
|
||||
# } else {
|
||||
# safesystem("$cmd > extract.out.$scorer_name 2> extract.err.$scorer_name") or die "$scorer_name Failed to do extraction of statistics.";
|
||||
# }
|
||||
|
||||
# print FILE "$scorer_name $scorer_weight $score_file.$scorer_name $feature_file.$scorer_name\n";
|
||||
}
|
||||
# print STDERR "CREATION INI\n";
|
||||
my @scorer_content;
my $fileIncrement=0;
open(FILE,">merge.init") || die ("File creation ERROR : merge.init");
my $minFileName="";
my $minFileSize;
my %scoreFileContent;
my %featureFileContent;
my $firstContent;
foreach $scorer_config_spec(@lists_scorer_config)
{
my @lists_scorer_config_spec=split(":",$scorer_config_spec);
$scorer_name=$lists_scorer_config_spec[0];
$scorer_weight=$lists_scorer_config_spec[1];
print FILE "$scorer_name $scorer_weight $score_file.$scorer_name $feature_file.$scorer_name\n";
my @tmp_content=`/bin/cat $score_file.$scorer_name`;
$scorer_content[$fileIncrement] = [ @tmp_content ];
my @tmp_scoreContent=`/bin/cat $score_file.$scorer_name`;
my @tmp_featContent=`/bin/cat $feature_file.$scorer_name`;
my $localIncrementFileContent=0;
my $fileContentInfo=0;
my $localIncrementInfo=0;
for ($localIncrementFileContent=0; $localIncrementFileContent<scalar(@tmp_scoreContent); $localIncrementFileContent++)
{
if (rindex($tmp_scoreContent[$localIncrementFileContent],"BEGIN")>-1)
{
my @split_local=split(" ",$tmp_scoreContent[$localIncrementFileContent]);
$fileContentInfo=$split_local[1];

$localIncrementInfo=0;
}
chomp($tmp_scoreContent[$localIncrementFileContent]);
chomp($tmp_featContent[$localIncrementFileContent]);
$scoreFileContent{$fileIncrement}{$fileContentInfo}{$localIncrementInfo}=$tmp_scoreContent[$localIncrementFileContent];
$featureFileContent{$fileIncrement}{$fileContentInfo}{$localIncrementInfo}=$tmp_featContent[$localIncrementFileContent];
$localIncrementInfo++;
}
if ($fileIncrement==0)
{
`/bin/cp $feature_file.$scorer_name $feature_file`;
$minFileSize=$localIncrementFileContent;
$minFileName=$scorer_name;
}
else
{
if ($minFileSize>$localIncrementFileContent)
{
$minFileSize=$localIncrementFileContent;
$minFileName=$scorer_name;
}
}
$fileIncrement++;
}
close(FILE);
# print STDERR "\n";
# safesystem("date");
# print STDERR "\n";
# print STDERR "ON VA RASSEMBLER dans $score_file\n";
|
||||
open(SCOREFILE,">$score_file") || die ("File creation ERROR : $score_file");
open(FEATUREFILE,">$feature_file") || die ("File creation ERROR : $feature_file");
my $newFileIncrement=0;
my $contentIncrement=0;
my $contentSize=scalar(@{$scorer_content[0]});
# print STDERR "TAILLE : ".$contentSize."|".$fileIncrement."\n";
|
||||
while ($contentIncrement< $contentSize)
|
||||
my @nbestSize;
|
||||
my $contentSize;
|
||||
my $lineScore="";
|
||||
my $lineFeature="";
|
||||
my $minSize;
|
||||
my $localContentIncrement=0;
|
||||
my @localContentSizeSize;
|
||||
my $scoreFileName;
|
||||
my $notFinished=1;
|
||||
my $scoreName=$minFileName;
|
||||
my $minInfoSize=-1;
|
||||
$fileIncrement=0;
|
||||
while (defined($scoreFileContent{$fileIncrement}{$contentIncrement}))
|
||||
{
|
||||
my $line="";
|
||||
$newFileIncrement=0;
|
||||
while($newFileIncrement< $fileIncrement)
|
||||
if ($localContentIncrement==0)
|
||||
{
|
||||
if (rindex($scorer_content[$newFileIncrement][$contentIncrement],"BEGIN")<0)
|
||||
{
|
||||
$line=$line." ".$scorer_content[$newFileIncrement][$contentIncrement];
|
||||
chomp($line);
|
||||
}
|
||||
else
|
||||
{
|
||||
my @split_line_input=split(" ",$scorer_content[$newFileIncrement][$contentIncrement]);
|
||||
my @split_line=split(" ",$line);
|
||||
foreach $fileIncrement(sort keys %scoreFileContent)
|
||||
{
|
||||
# process the score file
|
||||
my @tmp_split=split(" ",$scoreFileContent{$fileIncrement}{$contentIncrement}{$localContentIncrement});
|
||||
if ($minInfoSize==-1)
|
||||
{
|
||||
$minInfoSize=$tmp_split[2];
|
||||
}
|
||||
elsif ($minInfoSize>$tmp_split[2])
|
||||
{
|
||||
$minInfoSize=$tmp_split[2];
|
||||
}
|
||||
my @split_line=split(" ",$lineScore);
|
||||
if (scalar(@split_line)>0)
|
||||
{
|
||||
$split_line_input[3]=$split_line[3]+$split_line_input[3];
|
||||
$tmp_split[3]=$split_line[3]+$tmp_split[3];
|
||||
}
|
||||
$line=$split_line_input[0]." ".$split_line_input[1]." ".$split_line_input[2]." ".$split_line_input[3]." MERGE";
|
||||
$lineScore=$tmp_split[0]." ".$contentIncrement." ".$minInfoSize." ".$tmp_split[3]." MERGE";
|
||||
# process the feature file
|
||||
@tmp_split=split(" ",$featureFileContent{$fileIncrement}{$contentIncrement}{$localContentIncrement});
|
||||
$lineFeature=$tmp_split[0]." ".$contentIncrement." ".$minInfoSize." ".$tmp_split[3]." MERGE";
|
||||
|
||||
}
|
||||
$newFileIncrement++;
|
||||
$localContentIncrement++;
|
||||
}
|
||||
$line=~s/^[ ]+//g;
|
||||
$line=~s/[ ]+$//g;
|
||||
$line=~s/[ ]+/ /g;
# print STDERR $line."\n";
print SCOREFILE $line."\n";
$contentIncrement++;
else
{
LOOP_CONTENT: foreach $scoreName(sort keys %scoreFileContent)
{
if ((rindex($scoreFileContent{$fileIncrement}{$contentIncrement}{$localContentIncrement},"END")>-1) || ($minInfoSize < $localContentIncrement))
{
$lineScore="SCORES_TXT_END_0";
$lineFeature="FEATURES_TXT_END_0";
$localContentIncrement=0;
$contentIncrement++;
$minInfoSize=-1;
last LOOP_CONTENT;
}
else
{
$lineScore=$lineScore." ".$scoreFileContent{$fileIncrement}{$contentIncrement}{$localContentIncrement};
$lineFeature=$featureFileContent{$fileIncrement}{$contentIncrement}{$localContentIncrement};
}
}
if ($localContentIncrement!=0)
{
$localContentIncrement++;
}
}
$lineScore=~s/^[ ]+//g;
$lineScore=~s/[ ]+$//g;
$lineScore=~s/[ ]+/ /g;
$lineFeature=~s/^[ ]+//g;
$lineFeature=~s/[ ]+$//g;
$lineFeature=~s/[ ]+/ /g;
print SCOREFILE $lineScore."\n";
print FEATUREFILE $lineFeature."\n";
$lineScore="";
$lineFeature="";
}
close(SCOREFILE);
close(FEATUREFILE);
}
else
{
# continue with the classical way
$cmd = "$mert_extract_cmd $mert_extract_args --scfile $score_file --ffile $feature_file -r ".join(",", @references)." -n $nbest_file";
$cmd = create_extractor_script($cmd, $___WORKING_DIR);
&submit_or_exec($cmd,"extract.out","extract.err");
}
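
In this fallback branch the extract command is first wrapped by create_extractor_script (defined near the end of this diff) before submission, so what actually runs is a one-shot shell script along these lines (paths and file names illustrative):

    #!/bin/bash
    cd /path/to/mert-work
    extractor --scfile scores.dat --ffile features.dat -r ref.0,ref.1 -n run1.best100.out
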
close(SCOREFILE);
# `/bin/cp `

# $cmd="$mertdir/mergeWeights -c merge.init -s $score_file -f $feature_file";
# print STDERR "executing : $cmd\n";

# if (defined $___JOBS) {
# safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=mergeWeight.out.MERGE -stderr=mergeWeight.err.MERGE" )
# or die "MERGE Failed to submit extraction to queue (via $qsubwrapper)";
# } else {
# safesystem("$cmd > mergeWeight.out.MERGE 2> mergeWeight.err.MERGE") or die "MERGE Failed to do extraction of statistics.";
# }

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# my $cmd = "$mert_extract_cmd $mert_extract_args --scfile $score_file --ffile $feature_file -r ".join(",", @references)." -n $nbest_file";
# &submit_or_exec($cmd,"extract.out","extract.err");

# Create the initial weights file for mert: init.opt

my @MIN = @{$featlist->{"mins"}};
@ -785,10 +858,12 @@ while(1) {
$cmd = "$mert_mert_cmd -d $DIM $mert_mert_args";

my $mert_settings = " -n $___RANDOM_RESTARTS";
my $seed_settings = "";
if ($___PREDICTABLE_SEEDS) {
my $seed = $run * 1000;
$mert_settings .= " -r $seed";
$seed_settings .= " -r $seed";
}
$mert_settings .= $seed_settings;
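
With --predictable-seeds the optimizer seed is a deterministic function of the iteration ($run * 1000), so rerunning an iteration reproduces the same random restarts, e.g.:

    run 1:  mert ... -r 1000
    run 2:  mert ... -r 2000
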
if ($___RANDOM_DIRECTIONS) {
if ($___NUM_RANDOM_DIRECTIONS == 0) {
$mert_settings .= " -m 50";
@ -802,19 +877,25 @@ while(1) {
$mert_settings .= " --threads $__THREADS";
}

my $file_settings = "";
my $ffiles = "";
my $scfiles = "";
if (defined $prev_feature_file) {
$file_settings .= " --ffile $prev_feature_file,$feature_file";
$ffiles = "$prev_feature_file,$feature_file";
}
else{
$file_settings .= " --ffile $feature_file";
$ffiles = "$feature_file";
}
if (defined $prev_score_file) {
$file_settings .= " --scfile $prev_score_file,$score_file";
$scfiles = "$prev_score_file,$score_file";
}
else{
$file_settings .= " --scfile $score_file";
$scfiles = "$score_file";
}

my $file_settings = " --ffile $ffiles --scfile $scfiles";
my $pro_file_settings = "--ffile " . join( " --ffile ", split(/,/, $ffiles)) .
                        " --scfile " . join( " --scfile ", split(/,/, $scfiles));

if ($___START_WITH_HISTORIC_BESTS && defined $prev_init_file) {
$file_settings .= " --ifile $prev_init_file,run$run.$weights_in_file";
}
@ -826,13 +907,13 @@ while(1) {

# pro optimization
if ($___PAIRWISE_RANKED_OPTIMIZER) {
$cmd .= " --pro run$run.pro.data ; echo 'not used' > $weights_out_file; $pro_optimizer -fvals -maxi 30 -nobias binary run$run.pro.data";
$cmd = "$mert_pro_cmd $seed_settings $pro_file_settings -o run$run.pro.data ; echo 'not used' > $weights_out_file; $pro_optimizer -fvals -maxi 30 -nobias binary run$run.pro.data";
&submit_or_exec($cmd,$mert_outfile,$mert_logfile);
}
# first pro, then mert
elsif ($___PRO_STARTING_POINT) {
# run pro...
my $pro_cmd = $cmd." --pro run$run.pro.data ; $pro_optimizer -fvals -maxi 30 -nobias binary run$run.pro.data";
my $pro_cmd = "$mert_pro_cmd $seed_settings $pro_file_settings -o run$run.pro.data ; $pro_optimizer -fvals -maxi 30 -nobias binary run$run.pro.data";
&submit_or_exec($pro_cmd,"run$run.pro.out","run$run.pro.err");
# ... get results ...
my %dummy;
@ -858,9 +939,8 @@ while(1) {
chomp $extractFiles;
safesystem ("\\cp -f $extractFiles run$run.$extractFiles") or die;
}

# safesystem ("\\cp -f extract.err run$run.extract.err") or die;
# safesystem ("\\cp -f extract.out run$run.extract.out") or die;
safesystem ("\\cp -f $mert_outfile run$run.$mert_outfile") or die;
safesystem ("\\cp -f $mert_logfile run$run.$mert_logfile") or die;
safesystem ("touch $mert_logfile run$run.$mert_logfile") or die;
@ -985,7 +1065,7 @@ if (defined $allsorted){ safesystem ("\\rm -f $allsorted") or die; };
safesystem("\\cp -f $weights_in_file run$run.$weights_in_file") or die;
safesystem("\\cp -f $mert_logfile run$run.$mert_logfile") or die;

create_config($___CONFIG_ORIG, "./moses.ini", $featlist, $run, $devbleu);
create_config($___CONFIG_ORIG, "./moses.ini", $featlist, $run, $devbleu, $sparse_weights_file);

# just to be sure that we have the really last finished step marked
open F, "> finished_step.txt" or die "Can't mark finished step";
@ -1040,6 +1120,11 @@ sub run_decoder {
my ($featlist, $run, $need_to_normalize) = @_;
my $filename_template = "run%d.best$___N_BEST_LIST_SIZE.out";
my $filename = sprintf($filename_template, $run);
my $lsamp_filename = undef;
if ($___LATTICE_SAMPLES) {
my $lsamp_filename_template = "run%d.lsamp$___LATTICE_SAMPLES.out";
$lsamp_filename = sprintf($lsamp_filename_template, $run);
}
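
Assuming, say, an n-best list size of 100 and 50 lattice samples (illustrative settings), iteration 3 therefore writes:

    run3.best100.out    # n-best list
    run3.lsamp50.out    # lattice samples
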

# user-supplied parameters
print "params = $___DECODER_FLAGS\n";
@ -1060,23 +1145,28 @@ sub run_decoder {
$model_weights{$name} .= sprintf " %.6f", $vals[$i];
}
my $decoder_config = join(" ", values %model_weights);
$decoder_config .= " -weight-file run$run.sparse-weights" if -e "run$run.sparse-weights";
print STDERR "DECODER_CFG = $decoder_config\n";
print "decoder_config = $decoder_config\n";


# run the decoder
my $nBest_cmd = "-n-best-size $___N_BEST_LIST_SIZE";
my $decoder_cmd;
my $lsamp_cmd = "";
if ($___LATTICE_SAMPLES) {
$lsamp_cmd = " -lattice-samples $lsamp_filename $___LATTICE_SAMPLES ";
}

if (defined $___JOBS && $___JOBS > 0) {
$decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
$decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
} else {
$decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config -n-best-list $filename $___N_BEST_LIST_SIZE -input-file $___DEV_F > run$run.out";
$decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd -n-best-list $filename $___N_BEST_LIST_SIZE -input-file $___DEV_F > run$run.out";
}

safesystem($decoder_cmd) or die "The decoder died. CONFIG WAS $decoder_config \n";

sanity_check_order_of_lambdas($featlist, $filename);
return $filename;
return ($filename, $lsamp_filename);
}
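
Since run_decoder now returns a pair, callers must use list context; a minimal sketch of the adjusted call site (surrounding code not shown in this hunk):

    my ($nbest_file, $lsamp_file) = run_decoder($featlist, $run, $need_to_normalize);
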


@ -1374,3 +1464,20 @@ sub submit_or_exec {
safesystem("$cmd > $stdout 2> $stderr") or die "ERROR: Failed to run '$cmd'.";
}
}

sub create_extractor_script
{
my ($cmd, $outdir) = @_;
my $script_path = File::Spec->catfile($outdir, "extractor.sh");

open my $out, '>', $script_path
    or die "Couldn't open $script_path for writing: $!\n";
print $out "#!/bin/bash\n";
print $out "cd $outdir\n";
print $out "$cmd\n";
close($out);

`chmod +x $script_path`;

return $script_path;
}
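
The merging branch earlier in this diff uses the wrapper by substituting it for the raw command before submission:

    $cmd = create_extractor_script($cmd, $___WORKING_DIR);
    &submit_or_exec($cmd,"extract.out","extract.err");
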

@ -47,9 +47,13 @@
# 13 Oct 2004 Use alternative decoders (DWC)
# Original version by Philipp Koehn

use strict;
use FindBin qw($Bin);
use File::Basename;
use File::Path;
use File::Spec;
use Cwd;

my $SCRIPTS_ROOTDIR = $Bin;
$SCRIPTS_ROOTDIR =~ s/\/training$//;
$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});
@ -82,7 +86,10 @@ my $minimum_required_change_in_weights = 0.00001;

my $verbose = 0;
my $usage = 0; # request for --help
my $___WORKING_DIR = "mert-work";

# We assume that if you don't specify a working directory,
# the default is set to `pwd`/mert-work
my $___WORKING_DIR = File::Spec->catfile(Cwd::getcwd(), "mert-work");
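
The effect of the new default is an absolute path: if the script is launched from /home/user/tune (illustrative), then

    Cwd::getcwd()    returns  /home/user/tune
    $___WORKING_DIR  becomes  /home/user/tune/mert-work

instead of the bare relative "mert-work" used before.
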
my $___DEV_F = undef; # required, input text to decode
my $___DEV_E = undef; # required, basename of files with references
my $___DECODER = undef; # required, pathname to the decoder executable
@ -144,10 +151,9 @@ my $prev_aggregate_nbl_size = -1; # number of previous step to consider when loa
# -1 means all previous, i.e. from iteration 1
# 0 means no previous data, i.e. from actual iteration
# 1 means 1 previous data, i.e. from the actual iteration and from the previous one
# and so on
my $maximum_iterations = 25;

use strict;
use Getopt::Long;
GetOptions(
"working-dir=s" => \$___WORKING_DIR,
@ -1298,19 +1304,16 @@ sub submit_or_exec {
sub create_extractor_script()
{
my ($cmd, $outdir) = @_;
my $script_path = File::Spec->catfile($outdir, "extractor.sh");

my $script_path = $outdir."/extractor.sh";

open(OUT,"> $script_path")
or die "Can't write $script_path";
print OUT "#!/bin/bash\n";
print OUT "cd $outdir\n";
print OUT $cmd."\n";
close(OUT);
open my $out, '>', $script_path
    or die "Couldn't open $script_path for writing: $!\n";
print $out "#!/bin/bash\n";
print $out "cd $outdir\n";
print $out "$cmd\n";
close($out);

`chmod +x $script_path`;

return $script_path;
}