Merge ../mosesdecoder into perf_ff

This commit is contained in:
Ubuntu 2015-10-13 12:59:24 +00:00
commit 9e2024aa3c
38 changed files with 1892 additions and 181 deletions

View File

@ -54,7 +54,7 @@
# --static forces static linking (the default will fall
# back to shared)
#
# debug-symbols=on|off include (default) or exclude debugging
# debug-symbols=on|off include or exclude (default) debugging
# information also known as -g
# --notrace compiles without TRACE macros
#
@ -298,6 +298,8 @@ contrib/server//mosesserver
mm
rephraser
contrib/c++tokenizer//tokenizer
contrib/expected-bleu-training//train-expected-bleu
contrib/expected-bleu-training//prepare-expected-bleu-training
;

View File

@ -0,0 +1,223 @@
/*
Moses - statistical machine translation system
Copyright (C) 2005-2015 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "ExpectedBleuOptimizer.h"
namespace ExpectedBleuTraining
{
void ExpectedBleuOptimizer::AddTrainingInstance(const size_t nBestSizeCount,
const std::vector<float>& sBleu,
const std::vector<double>& overallScoreUntransformed,
const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
bool maintainUpdateSet)
{
// compute xBLEU
double sumUntransformedScores = 0.0;
for (std::vector<double>::const_iterator overallScoreUntransformedIt=overallScoreUntransformed.begin();
overallScoreUntransformedIt!=overallScoreUntransformed.end(); ++overallScoreUntransformedIt)
{
sumUntransformedScores += *overallScoreUntransformedIt;
}
double xBleu = 0.0;
assert(nBestSizeCount == overallScoreUntransformed.size());
std::vector<double> p;
for (size_t i=0; i<nBestSizeCount; ++i)
{
if (sumUntransformedScores != 0) {
p.push_back( overallScoreUntransformed[i] / sumUntransformedScores );
} else {
p.push_back( 0 );
}
xBleu += p.back() * sBleu[ i ];
}
for (size_t i=0; i<nBestSizeCount; ++i)
{
double D = sBleu[ i ] - xBleu;
for (boost::unordered_map<size_t, float>::const_iterator sparseScoreIt=sparseScore[i].begin();
sparseScoreIt!=sparseScore[i].end(); ++sparseScoreIt)
{
const size_t name = sparseScoreIt->first;
float N = sparseScoreIt->second;
if ( std::fpclassify( p[i] * N * D ) == FP_SUBNORMAL )
{
m_err << "Error: encountered subnormal value: p[i] * N * D= " << p[i] * N * D
<< " with p[i]= " << p[i] << " N= " << N << " D= " << D << '\n';
m_err.flush();
exit(1);
} else {
m_gradient[name] += p[i] * N * D;
if ( maintainUpdateSet )
{
m_updateSet.insert(name);
}
}
}
}
m_xBleu += xBleu;
}
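For reference, a restatement of what AddTrainingInstance accumulates (assuming the untransformed scores are exponentiated model scores, as in TrainExpectedBleu.cpp below): with

p_i = s_i / \sum_j s_j , \qquad \mathrm{xBLEU} = \sum_i p_i \, \mathrm{sBLEU}_i ,

the gradient contribution of sparse feature k with value N_{ik} in hypothesis i is

\partial \, \mathrm{xBLEU} / \partial \lambda_k = \sum_i p_i \, N_{ik} \, ( \mathrm{sBLEU}_i - \mathrm{xBLEU} ) ,

which is exactly the p[i] * N * D term added to m_gradient[name] above.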
void ExpectedBleuOptimizer::InitSGD(const std::vector<float>& sparseScalingFactor)
{
const size_t nFeatures = sparseScalingFactor.size();
m_previousSparseScalingFactor.resize(nFeatures);
memcpy(&m_previousSparseScalingFactor.at(0), &sparseScalingFactor.at(0), nFeatures * sizeof(float));
m_gradient.resize(nFeatures);
}
float ExpectedBleuOptimizer::UpdateSGD(std::vector<float>& sparseScalingFactor,
size_t batchSize,
bool useUpdateSet)
{
float xBleu = m_xBleu / batchSize;
// update sparse scaling factors
if (useUpdateSet) {
for (std::set<size_t>::const_iterator it = m_updateSet.begin(); it != m_updateSet.end(); ++it)
{
size_t name = *it;
UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
}
m_updateSet.clear();
} else {
for (size_t name=0; name<sparseScalingFactor.size(); ++name)
{
UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
}
}
m_xBleu = 0;
m_gradient.clear();
return xBleu;
}
void ExpectedBleuOptimizer::UpdateSingleScalingFactorSGD(size_t name,
std::vector<float>& sparseScalingFactor,
size_t batchSize)
{
// regularization
if ( m_regularizationParameter != 0 )
{
m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
} else {
// need to normalize by dividing by batchSize
m_gradient[name] /= batchSize;
}
// the actual update
sparseScalingFactor[name] += m_learningRate * m_gradient[name];
// discard scaling factors below a threshold
if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
{
sparseScalingFactor[name] = 0;
}
}
void ExpectedBleuOptimizer::InitRPROP(const std::vector<float>& sparseScalingFactor)
{
const size_t nFeatures = sparseScalingFactor.size();
m_previousSparseScalingFactor.resize(nFeatures);
memcpy(&m_previousSparseScalingFactor.at(0), &sparseScalingFactor.at(0), nFeatures * sizeof(float));
m_previousGradient.resize(nFeatures);
m_gradient.resize(nFeatures);
m_stepSize.resize(nFeatures, m_initialStepSize);
}
float ExpectedBleuOptimizer::UpdateRPROP(std::vector<float>& sparseScalingFactor,
const size_t batchSize)
{
float xBleu = m_xBleu / batchSize;
// update sparse scaling factors
for (size_t name=0; name<sparseScalingFactor.size(); ++name)
{
// Sum of gradients. All we need is the sign. Don't need to normalize by dividing by batchSize.
// regularization
if ( m_regularizationParameter != 0 )
{
m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
}
// step size
int sign = Sign(m_gradient[name]) * Sign(m_previousGradient[name]);
if (sign > 0) {
m_stepSize[name] *= m_increaseRate;
} else if (sign < 0) {
m_stepSize[name] *= m_decreaseRate;
}
if (m_stepSize[name] < m_minStepSize) {
m_stepSize[name] = m_minStepSize;
}
if (m_stepSize[name] > m_maxStepSize) {
m_stepSize[name] = m_maxStepSize;
}
// the actual update
m_previousGradient[name] = m_gradient[name];
if (sign >= 0) {
if (m_gradient[name] > 0) {
m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
sparseScalingFactor[name] += m_stepSize[name];
} else if (m_gradient[name] < 0) {
m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
sparseScalingFactor[name] -= m_stepSize[name];
}
} else {
sparseScalingFactor[name] = m_previousSparseScalingFactor[name];
// m_previousGradient[name] = 0;
}
// discard scaling factors below a threshold
if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
{
sparseScalingFactor[name] = 0;
}
}
m_xBleu = 0;
m_gradient.clear();
return xBleu;
}
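The step-size schedule above is standard RPROP, restated compactly (with \eta^{+} = m_increaseRate, \eta^{-} = m_decreaseRate):

\Delta_k \leftarrow \min( \Delta_k \cdot \eta^{+}, \Delta_{\max} ) \quad \text{if } g_k \cdot g_k^{\mathrm{prev}} > 0
\Delta_k \leftarrow \max( \Delta_k \cdot \eta^{-}, \Delta_{\min} ) \quad \text{if } g_k \cdot g_k^{\mathrm{prev}} < 0

and the scaling factor moves by \pm \Delta_k in the direction of sign(g_k); on a sign change the previous value is restored instead (backtracking).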
}

View File

@ -0,0 +1,117 @@
/*
Moses - statistical machine translation system
Copyright (C) 2005-2015 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#pragma once
#include <vector>
#include <set>
#include <boost/unordered_map.hpp>
#include "util/file_stream.hh"
namespace ExpectedBleuTraining
{
class ExpectedBleuOptimizer
{
public:
ExpectedBleuOptimizer(util::FileStream& err,
float learningRate=1,
float initialStepSize=0.001,
float decreaseRate=0.5,
float increaseRate=1.2,
float minStepSize=1e-7,
float maxStepSize=1,
float floorAbsScalingFactor=0,
float regularizationParameter=0)
: m_err(err)
, m_learningRate(learningRate)
, m_initialStepSize(initialStepSize)
, m_decreaseRate(decreaseRate)
, m_increaseRate(increaseRate)
, m_minStepSize(minStepSize)
, m_maxStepSize(maxStepSize)
, m_floorAbsScalingFactor(floorAbsScalingFactor)
, m_regularizationParameter(regularizationParameter)
, m_xBleu(0)
{ }
void AddTrainingInstance(const size_t nBestSizeCount,
const std::vector<float>& sBleu,
const std::vector<double>& overallScoreUntransformed,
const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
bool maintainUpdateSet = false);
void InitSGD(const std::vector<float>& sparseScalingFactor);
float UpdateSGD(std::vector<float>& sparseScalingFactor,
size_t batchSize,
bool useUpdateSet = false);
void InitRPROP(const std::vector<float>& sparseScalingFactor);
float UpdateRPROP(std::vector<float>& sparseScalingFactor,
const size_t batchSize);
protected:
util::FileStream& m_err;
// for SGD
const float m_learningRate;
// for RPROP
const float m_initialStepSize;
const float m_decreaseRate;
const float m_increaseRate;
const float m_minStepSize;
const float m_maxStepSize;
std::vector<float> m_previousSparseScalingFactor;
std::vector<float> m_previousGradient;
std::vector<float> m_gradient;
std::vector<float> m_stepSize;
// other
const float m_floorAbsScalingFactor;
const float m_regularizationParameter;
double m_xBleu;
std::set<size_t> m_updateSet;
void UpdateSingleScalingFactorSGD(size_t name,
std::vector<float>& sparseScalingFactor,
size_t batchSize);
inline int Sign(double x)
{
if (x > 0) return 1;
if (x < 0) return -1;
return 0;
}
};
}

View File

@ -0,0 +1,2 @@
exe prepare-expected-bleu-training : PrepareExpectedBleuTraining.cpp ../../util//kenutil ;
exe train-expected-bleu : TrainExpectedBleu.cpp ExpectedBleuOptimizer.cpp ../../util//kenutil ;
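Assuming the usual Moses bjam build (the invocation itself is an assumption; the target paths are the ones registered in the Jamroot above), the two tools can then be built with:

./bjam contrib/expected-bleu-training//prepare-expected-bleu-training contrib/expected-bleu-training//train-expected-bleu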

View File

@ -0,0 +1,222 @@
/*
Moses - statistical machine translation system
Copyright (C) 2005-2015 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <vector>
#include <string>
#include <sstream>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>
#include <boost/program_options.hpp>
#include "util/file_stream.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
namespace po = boost::program_options;
int main(int argc, char **argv)
{
util::FileStream err(2);
std::string filenameNBestListIn, filenameFeatureNamesOut, filenameIgnoreFeatureNames;
size_t maxNBestSize;
try {
po::options_description descr("Usage");
descr.add_options()
("help,h", "produce help message")
("n-best-list,n", po::value<std::string>(&filenameNBestListIn)->required(),
"input n-best list file")
("write-feature-names-file,f", po::value<std::string>(&filenameFeatureNamesOut)->required(),
"output file for mapping between feature names and indices")
("ignore-features-file,i", po::value<std::string>(&filenameIgnoreFeatureNames)->required(),
"input file containing list of feature names to be ignored")
("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
"limit of n-best list entries to be considered")
;
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, descr), vm);
if (vm.count("help")) {
std::ostringstream os;
os << descr;
std::cout << os.str() << '\n';
exit(0);
}
po::notify(vm);
} catch(std::exception& e) {
err << "Error: " << e.what() << '\n';
err.flush();
exit(1);
}
util::FilePiece ifsNBest(filenameNBestListIn.c_str());
util::FilePiece ifsIgnoreFeatureNames(filenameIgnoreFeatureNames.c_str());
util::scoped_fd fdFeatureNames(util::CreateOrThrow(filenameFeatureNamesOut.c_str()));
util::FileStream ofsFeatureNames(fdFeatureNames.get());
util::FileStream ofsNBest(1);
boost::unordered_set<std::string> ignoreFeatureNames;
StringPiece line;
while ( ifsIgnoreFeatureNames.ReadLineOrEOF(line) )
{
if ( !line.empty() ) {
util::TokenIter<util::AnyCharacter> item(line, " \t=");
if ( item != item.end() )
{
ignoreFeatureNames.insert(item->as_string());
err << "ignoring " << *item << '\n';
}
}
}
size_t maxFeatureNamesIdx = 0;
boost::unordered_map<std::string, size_t> featureNames;
size_t sentenceIndex = 0;
size_t nBestSizeCount = 0;
size_t globalIndex = 0;
while ( ifsNBest.ReadLineOrEOF(line) )
{
util::TokenIter<util::MultiCharacter> item(line, " ||| ");
if ( item == item.end() )
{
err << "Error: flawed content in " << filenameNBestListIn << '\n';
exit(1);
}
size_t sentenceIndexCurrent = atol( item->as_string().c_str() );
if ( sentenceIndex != sentenceIndexCurrent )
{
nBestSizeCount = 0;
sentenceIndex = sentenceIndexCurrent;
}
if ( nBestSizeCount < maxNBestSize )
{
// process n-best list entry
StringPiece scores;
StringPiece decoderScore;
for (size_t nItem=1; nItem<=3; ++nItem)
{
if ( ++item == item.end() ) {
err << "Error: flawed content in " << filenameNBestListIn << '\n';
exit(1);
}
if (nItem == 2) {
scores = *item;
}
if (nItem == 3) {
decoderScore = *item;
}
}
ofsNBest << sentenceIndex << ' '
<< decoderScore;
util::TokenIter<util::SingleCharacter> token(scores, ' ');
std::string featureNameCurrent("ERROR");
std::string featureNameCurrentBase("ERROR");
bool ignore = false;
int scoreComponentIndex = 0;
while ( token != token.end() )
{
if ( token->ends_with("=") )
{
scoreComponentIndex = 0;
featureNameCurrent = token->substr(0,token->size()-1).as_string();
size_t idx = featureNameCurrent.find_first_of('_');
if ( idx == std::string::npos ) {
featureNameCurrentBase = featureNameCurrent;
} else {
featureNameCurrentBase = featureNameCurrent.substr(0,idx+1);
}
ignore = false;
if ( ignoreFeatureNames.find(featureNameCurrentBase) != ignoreFeatureNames.end() )
{
ignore = true;
} else {
if ( (featureNameCurrent.compare(featureNameCurrentBase)) &&
(ignoreFeatureNames.find(featureNameCurrent) != ignoreFeatureNames.end()) )
{
ignore = true;
}
}
}
else
{
if ( !ignore )
{
float featureValueCurrent = atof( token->as_string().c_str() );
// components of multi-valued dense features are distinguished by a "+index" suffix
std::string featureNameWithIndex = featureNameCurrent;
if ( scoreComponentIndex > 0 )
{
std::ostringstream oss;
oss << '+' << scoreComponentIndex;
featureNameWithIndex.append(oss.str());
}
if ( featureValueCurrent != 0 )
{
boost::unordered_map<std::string, size_t>::iterator featureName = featureNames.find(featureNameWithIndex);
if ( featureName == featureNames.end() )
{
std::pair< boost::unordered_map<std::string, size_t>::iterator, bool> inserted =
featureNames.insert( std::make_pair(featureNameWithIndex, maxFeatureNamesIdx) );
++maxFeatureNamesIdx;
featureName = inserted.first;
}
ofsNBest << ' ' << featureName->second // feature name index
<< ' ' << *token; // feature value
}
++scoreComponentIndex;
}
}
++token;
}
ofsNBest << '\n';
++nBestSizeCount;
}
++globalIndex;
}
ofsFeatureNames << maxFeatureNamesIdx << '\n';
for (boost::unordered_map<std::string, size_t>::const_iterator featureNamesIt=featureNames.begin();
featureNamesIt!=featureNames.end(); ++featureNamesIt)
{
ofsFeatureNames << featureNamesIt->second << ' ' << featureNamesIt->first << '\n';
}
}
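To illustrate the transformation (hypothetical data; the feature names and values are made up): given a Moses n-best entry such as

0 ||| das haus ||| LM0= -12.3 TM0= -4.1 -2.7 ||| -15.2

the tool writes one line per hypothesis containing the sentence index, the decoder score, and index/value pairs for every non-zero, non-ignored feature:

0 -15.2 0 -12.3 1 -4.1 2 -2.7

Here LM0 was assigned index 0, the first TM0 component index 1, and the second component (named TM0+1) index 2; the index-to-name mapping goes to the feature names file, whose first line holds the total feature count.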

View File

@ -0,0 +1,379 @@
/*
Moses - statistical machine translation system
Copyright (C) 2005-2015 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "ExpectedBleuOptimizer.h"
#include "util/file_stream.hh"
#include "util/file_piece.hh"
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
#include <sstream>
#include <boost/program_options.hpp>
using namespace ExpectedBleuTraining;
namespace po = boost::program_options;
int main(int argc, char **argv) {
util::FileStream out(1);
util::FileStream err(2);
size_t maxNBestSize;
size_t iterationLimit;
std::string filenameSBleu, filenameNBestList, filenameFeatureNames, filenameInitialWeights;
bool ignoreDecoderScore;
float learningRate;
float initialStepSize;
float decreaseRate;
float increaseRate;
float minStepSize;
float maxStepSize;
float floorAbsScalingFactor;
float regularizationParameter;
bool printZeroWeights;
bool miniBatches;
std::string optimizerTypeStr;
size_t optimizerType = 0;
#define EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP 1
#define EXPECTED_BLEU_OPTIMIZER_TYPE_SGD 2
try {
po::options_description descr("Usage");
descr.add_options()
("help,h", "produce help message")
("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
"limit of n-best list entries to be considered for training")
("iterations,i", po::value<size_t>(&iterationLimit)->default_value(50),
"number of training iterations")
("sbleu-file,b", po::value<std::string>(&filenameSBleu)->required(),
"file containing sentence-level BLEU scores for all n-best list entries")
("prepared-n-best-list,n", po::value<std::string>(&filenameNBestList)->required(),
"input n-best list file, in prepared format for expected BLEU training")
("feature-name-file,f", po::value<std::string>(&filenameFeatureNames)->required(),
"file containing mapping between feature names and indices")
("initial-weights-file,w", po::value<std::string>(&filenameInitialWeights)->default_value(""),
"file containing start values for scaling factors (optional)")
("ignore-decoder-score", boost::program_options::value<bool>(&ignoreDecoderScore)->default_value(0),
"exclude decoder score from computation of posterior probability")
("regularization", boost::program_options::value<float>(&regularizationParameter)->default_value(0), // e.g. 1e-5
"regularization parameter; suggested value range: [1e-8,1e-5]")
("learning-rate", boost::program_options::value<float>(&learningRate)->default_value(1),
"learning rate for the SGD optimizer")
("floor", boost::program_options::value<float>(&floorAbsScalingFactor)->default_value(0), // e.g. 1e-7
"set scaling factor to 0 if below this absolute value after update")
("initial-step-size", boost::program_options::value<float>(&initialStepSize)->default_value(0.001), // TODO: try 0.01 and 0.1
"initial step size for the RPROP optimizer")
("decrease-rate", boost::program_options::value<float>(&decreaseRate)->default_value(0.5),
"decrease rate for the RPROP optimizer")
("increase-rate", boost::program_options::value<float>(&increaseRate)->default_value(1.2),
"increase rate for the RPROP optimizer")
("min-step-size", boost::program_options::value<float>(&minStepSize)->default_value(1e-7),
"minimum step size for the RPROP optimizer")
("max-step-size", boost::program_options::value<float>(&maxStepSize)->default_value(1),
"maximum step size for the RPROP optimizer")
("print-zero-weights", boost::program_options::value<bool>(&printZeroWeights)->default_value(0),
"output scaling factors even if they are trained to 0")
("optimizer", po::value<std::string>(&optimizerTypeStr)->default_value("RPROP"),
"optimizer type used for training (known algorithms: RPROP, SGD)")
("mini-batches", boost::program_options::value<bool>(&miniBatches)->default_value(0),
"update after every single sentence (SGD only)")
;
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, descr), vm);
if (vm.count("help")) {
std::ostringstream os;
os << descr;
out << os.str() << '\n';
out.flush();
exit(0);
}
po::notify(vm);
} catch(std::exception& e) {
err << "Error: " << e.what() << '\n';
err.flush();
exit(1);
}
if ( !optimizerTypeStr.compare("rprop") || !optimizerTypeStr.compare("RPROP") ) {
optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP;
} else if ( !optimizerTypeStr.compare("sgd") || !optimizerTypeStr.compare("SGD") ) {
optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_SGD;
} else {
err << "Error: unknown optimizer type: \"" << optimizerTypeStr << "\" (known optimizers: rprop, sgd) " << '\n';
err.flush();
exit(1);
}
util::FilePiece ifsFeatureNames(filenameFeatureNames.c_str());
StringPiece lineFeatureName;
if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) )
{
err << "Error: flawed content in " << filenameFeatureNames << '\n';
err.flush();
exit(1);
}
size_t maxFeatureNamesIdx = atol( lineFeatureName.as_string().c_str() );
std::vector<std::string> featureNames(maxFeatureNamesIdx);
boost::unordered_map<std::string, size_t> featureIndexes;
for (size_t i=0; i<maxFeatureNamesIdx; ++i)
{
if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) ) {
err << "Error: flawed content in " << filenameFeatureNames << '\n';
err.flush();
exit(1);
}
util::TokenIter<util::SingleCharacter> token(lineFeatureName, ' ');
size_t featureIndexCurrent = atol( token->as_string().c_str() );
token++;
featureNames[featureIndexCurrent] = token->as_string();
featureIndexes[token->as_string()] = featureIndexCurrent;
}
std::vector<float> sparseScalingFactor(maxFeatureNamesIdx);
std::vector< boost::unordered_map<size_t, float> > sparseScore(maxNBestSize);
// read initial weights, if any given
if ( filenameInitialWeights.length() != 0 )
{
util::FilePiece ifsInitialWeights(filenameInitialWeights.c_str());
StringPiece lineInitialWeight;
if ( !ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) ) {
err << "Error: flawed content in " << filenameInitialWeights << '\n';
err.flush();
exit(1);
}
do {
util::TokenIter<util::SingleCharacter> token(lineInitialWeight, ' ');
boost::unordered_map<std::string, size_t>::const_iterator found = featureIndexes.find(token->as_string());
if ( found == featureIndexes.end() ) {
err << "Error: flawed content in " << filenameInitialWeights << " (unkown feature name \"" << token->as_string() << "\")" << '\n';
err.flush();
exit(1);
}
token++;
sparseScalingFactor[found->second] = atof( token->as_string().c_str() );
} while ( ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) );
}
// train
ExpectedBleuOptimizer optimizer(err,
learningRate,
initialStepSize,
decreaseRate,
increaseRate,
minStepSize,
maxStepSize,
floorAbsScalingFactor,
regularizationParameter);
if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
{
optimizer.InitRPROP(sparseScalingFactor);
} else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
optimizer.InitSGD(sparseScalingFactor);
} else {
err << "Error: unknown optimizer type" << '\n';
err.flush();
exit(1);
}
for (size_t nIteration=1; nIteration<=iterationLimit; ++nIteration)
{
util::FilePiece ifsSBleu(filenameSBleu.c_str());
util::FilePiece ifsNBest(filenameNBestList.c_str());
out << "### ITERATION " << nIteration << '\n' << '\n';
size_t sentenceIndex = 0;
size_t batchSize = 0;
size_t nBestSizeCount = 0;
size_t globalIndex = 0;
StringPiece lineNBest;
std::vector<double> overallScoreUntransformed;
std::vector<float> sBleu;
float xBleu = 0;
// double expPrecisionCorrection = 0.0;
while ( ifsNBest.ReadLineOrEOF(lineNBest) )
{
util::TokenIter<util::SingleCharacter> token(lineNBest, ' ');
if ( token == token.end() )
{
err << "Error: flawed content in " << filenameNBestList << '\n';
err.flush();
exit(1);
}
size_t sentenceIndexCurrent = atol( token->as_string().c_str() );
token++;
if ( sentenceIndex != sentenceIndexCurrent )
{
if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
{
optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore );
} else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches );
if ( miniBatches ) {
xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
// out << "ITERATION " << nIteration << " SENTENCE " << sentenceIndex << " XBLEUSUM= " << xBleu << '\n';
// for (size_t i=0; i<sparseScalingFactor.size(); ++i)
// {
// if ( (sparseScalingFactor[i] != 0) || printZeroWeights )
// {
// out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
// }
// }
// out << '\n';
// out.flush();
}
} else {
err << "Error: unknown optimizer type" << '\n';
err.flush();
exit(1);
}
for (size_t i=0; i<nBestSizeCount; ++i) {
sparseScore[i].clear();
}
nBestSizeCount = 0;
overallScoreUntransformed.clear();
sBleu.clear();
sentenceIndex = sentenceIndexCurrent;
++batchSize;
}
StringPiece lineSBleu;
if ( !ifsSBleu.ReadLineOrEOF(lineSBleu) )
{
err << "Error: insufficient number of lines in " << filenameSBleu << '\n';
err.flush();
exit(1);
}
if ( nBestSizeCount < maxNBestSize )
{
// retrieve sBLEU
float sBleuCurrent = atof( lineSBleu.as_string().c_str() );
sBleu.push_back(sBleuCurrent);
// process n-best list entry
if ( token == token.end() )
{
err << "Error: flawed content in " << filenameNBestList << '\n';
err.flush();
exit(1);
}
double scoreCurrent = 0;
if ( !ignoreDecoderScore )
{
scoreCurrent = atof( token->as_string().c_str() ); // decoder score
}
token++;
// if ( nBestSizeCount == 0 ) // best translation (first n-best list entry for the current sentence / a new mini-batch)
// {
// expPrecisionCorrection = std::floor ( scoreCurrent ); // decoder score of first-best
// }
while (token != token.end())
{
size_t featureNameCurrent = atol( token->as_string().c_str() );
token++;
float featureValueCurrent = atof( token->as_string().c_str() );
sparseScore[nBestSizeCount].insert(std::make_pair(featureNameCurrent, featureValueCurrent));
scoreCurrent += sparseScalingFactor[featureNameCurrent] * featureValueCurrent;
token++;
}
// overallScoreUntransformed.push_back( std::exp(scoreCurrent - expPrecisionCorrection) );
overallScoreUntransformed.push_back( std::exp(scoreCurrent) );
++nBestSizeCount;
}
++globalIndex;
}
if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
{
optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore ); // last sentence in the corpus
xBleu = optimizer.UpdateRPROP( sparseScalingFactor, batchSize );
out << "xBLEU= " << xBleu << '\n';
} else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches ); // last sentence in the corpus
if ( miniBatches ) {
xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
xBleu /= batchSize;
} else {
xBleu = optimizer.UpdateSGD( sparseScalingFactor, batchSize );
}
out << "xBLEU= " << xBleu << '\n';
} else {
err << "Error: unknown optimizer type" << '\n';
err.flush();
exit(1);
}
for (size_t i=0; i<nBestSizeCount; ++i) {
sparseScore[i].clear();
}
nBestSizeCount = 0;
overallScoreUntransformed.clear();
sBleu.clear();
out << '\n';
for (size_t i=0; i<sparseScalingFactor.size(); ++i)
{
if ( (sparseScalingFactor[i] != 0) || printZeroWeights )
{
out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
}
}
out << '\n';
out.flush();
}
}
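A usage sketch under the options defined above (all file names are placeholders; writing the weight log to stdout matches the FileStream out(1) above):

./train-expected-bleu -n prepared.nbest -b sbleu.scores -f feature-names.txt --optimizer RPROP -i 50 > weights.log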

View File

@ -1,5 +1,5 @@
SALMDIR=/Users/hieuhoang/workspace/salm
FLAVOR?=o64
FLAVOR?=o32
INC=-I$(SALMDIR)/Src/Shared -I$(SALMDIR)/Src/SuffixArrayApplications -I$(SALMDIR)/Src/SuffixArrayApplications/SuffixArraySearch
OBJS=$(SALMDIR)/Distribution/Linux/Objs/Search/_SuffixArrayApplicationBase.$(FLAVOR) $(SALMDIR)/Distribution/Linux/Objs/Search/_SuffixArraySearchApplicationBase.$(FLAVOR) $(SALMDIR)/Distribution/Linux/Objs/Shared/_String.$(FLAVOR) $(SALMDIR)/Distribution/Linux/Objs/Shared/_IDVocabulary.$(FLAVOR)

View File

@ -110,13 +110,13 @@ public:
BackwardsEdge::BackwardsEdge(const BitmapContainer &prevBitmapContainer
, BitmapContainer &parent
, const TranslationOptionList &translations
, const SquareMatrix &futureScore,
, const SquareMatrix &futureScores,
const InputType& itype)
: m_initialized(false)
, m_prevBitmapContainer(prevBitmapContainer)
, m_parent(parent)
, m_translations(translations)
, m_futurescore(futureScore)
, m_futureScores(futureScores)
, m_seenPosition()
{
@ -195,6 +195,10 @@ BackwardsEdge::Initialize()
return;
}
const WordsBitmap &bm = m_hypotheses[0]->GetWordsBitmap();
const WordsRange &newRange = m_translations.Get(0)->GetSourceWordsRange();
m_futureScore = m_futureScores.CalcFutureScore2(bm, newRange.GetStartPos(), newRange.GetEndPos());
Hypothesis *expanded = CreateHypothesis(*m_hypotheses[0], *m_translations.Get(0));
m_parent.Enqueue(0, 0, expanded, this);
SetSeenPosition(0, 0);
@ -211,7 +215,7 @@ Hypothesis *BackwardsEdge::CreateHypothesis(const Hypothesis &hypothesis, const
IFVERBOSE(2) {
hypothesis.GetManager().GetSentenceStats().StopTimeBuildHyp();
}
newHypo->EvaluateWhenApplied(m_futurescore);
newHypo->EvaluateWhenApplied(m_futureScore);
return newHypo;
}

View File

@ -161,7 +161,8 @@ private:
const BitmapContainer &m_prevBitmapContainer;
BitmapContainer &m_parent;
const TranslationOptionList &m_translations;
const SquareMatrix &m_futurescore;
const SquareMatrix &m_futureScores;
float m_futureScore;
std::vector< const Hypothesis* > m_hypotheses;
boost::unordered_set< int > m_seenPosition;
@ -180,7 +181,7 @@ public:
BackwardsEdge(const BitmapContainer &prevBitmapContainer
, BitmapContainer &parent
, const TranslationOptionList &translations
, const SquareMatrix &futureScore,
, const SquareMatrix &futureScores,
const InputType& source);
~BackwardsEdge();

View File

@ -73,6 +73,7 @@
#include "moses/FF/VW/VWFeatureSourceBigrams.h"
#include "moses/FF/VW/VWFeatureSourceIndicator.h"
#include "moses/FF/VW/VWFeatureSourcePhraseInternal.h"
#include "moses/FF/VW/VWFeatureSourceSenseWindow.h"
#include "moses/FF/VW/VWFeatureSourceWindow.h"
#include "moses/FF/VW/VWFeatureTargetBigrams.h"
#include "moses/FF/VW/VWFeatureTargetIndicator.h"
@ -279,6 +280,7 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(VWFeatureSourceBigrams);
MOSES_FNAME(VWFeatureSourceIndicator);
MOSES_FNAME(VWFeatureSourcePhraseInternal);
MOSES_FNAME(VWFeatureSourceSenseWindow);
MOSES_FNAME(VWFeatureSourceWindow);
MOSES_FNAME(VWFeatureTargetBigrams);
MOSES_FNAME(VWFeatureTargetPhraseInternal);

View File

@ -0,0 +1,141 @@
#pragma once
#include <string>
#include <algorithm>
#include <boost/foreach.hpp>
#include "ThreadLocalByFeatureStorage.h"
#include "VWFeatureSource.h"
#include "moses/Util.h"
/*
* Produces features from factors in the following format:
* wordsense1:0.25^wordsense2:0.7^wordsense3:0.05
*
* This is useful e.g. for including different possible word senses as features weighted
* by their probability.
*
* By default, features are extracted from a small context window around the current
* phrase and from within the phrase.
*/
namespace Moses
{
class VWFeatureSourceSenseWindow : public VWFeatureSource
{
public:
VWFeatureSourceSenseWindow(const std::string &line)
: VWFeatureSource(line), m_tlsSenses(this), m_tlsForms(this), m_lexicalized(true), m_size(DEFAULT_WINDOW_SIZE) {
ReadParameters();
// Call this last
VWFeatureBase::UpdateRegister();
}
// precompute feature strings for each input sentence
virtual void InitializeForInput(ttasksptr const& ttask) {
InputType const& input = *(ttask->GetSource().get());
std::vector<WordSenses>& senses = *m_tlsSenses.GetStored();
std::vector<std::string>& forms = *m_tlsForms.GetStored();
senses.clear();
forms.clear();
senses.resize(input.GetSize());
forms.resize(input.GetSize());
for (size_t i = 0; i < input.GetSize(); i++) {
senses[i] = GetSenses(input, i);
forms[i] = m_lexicalized ? GetWordForm(input, i) + "^" : "";
}
}
void operator()(const InputType &input
, const InputPath &inputPath
, const WordsRange &sourceRange
, Discriminative::Classifier &classifier) const {
int begin = sourceRange.GetStartPos();
int end = sourceRange.GetEndPos() + 1;
int inputLen = input.GetSize();
const std::vector<WordSenses>& senses = *m_tlsSenses.GetStored();
const std::vector<std::string>& forms = *m_tlsForms.GetStored();
// before current phrase
for (int i = std::max(0, begin - m_size); i < begin; i++) {
BOOST_FOREACH(const Sense &sense, senses[i]) {
classifier.AddLabelIndependentFeature("snsb^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob);
classifier.AddLabelIndependentFeature("snsb^" + forms[i] + sense.m_label, sense.m_prob);
}
}
// within current phrase
for (int i = begin; i < end; i++) {
BOOST_FOREACH(const Sense &sense, senses[i]) {
classifier.AddLabelIndependentFeature("snsin^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob);
classifier.AddLabelIndependentFeature("snsin^" + forms[i] + sense.m_label, sense.m_prob);
}
}
// after current phrase
for (int i = end; i < std::min(end + m_size, inputLen); i++) {
BOOST_FOREACH(const Sense &sense, senses[i]) {
classifier.AddLabelIndependentFeature("snsa^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob);
classifier.AddLabelIndependentFeature("snsa^" + forms[i] + sense.m_label, sense.m_prob);
}
}
}
virtual void SetParameter(const std::string& key, const std::string& value) {
if (key == "size") {
m_size = Scan<size_t>(value);
} else if (key == "lexicalized") {
m_lexicalized = Scan<bool>(value);
} else {
VWFeatureSource::SetParameter(key, value);
}
}
private:
static const int DEFAULT_WINDOW_SIZE = 3;
struct Sense {
std::string m_label;
float m_prob;
};
typedef std::vector<Sense> WordSenses;
typedef ThreadLocalByFeatureStorage<std::vector<WordSenses> > TLSSenses;
typedef ThreadLocalByFeatureStorage<std::vector<std::string> > TLSWordForms;
TLSSenses m_tlsSenses; // for each input sentence, contains extracted senses and probs for each word
TLSWordForms m_tlsForms; // word forms for each input sentence
std::vector<Sense> GetSenses(const InputType &input, size_t pos) const {
std::string w = GetWord(input, pos);
std::vector<std::string> senseTokens = Tokenize(w, "^");
std::vector<Sense> out(senseTokens.size());
for (size_t i = 0; i < senseTokens.size(); i++) {
std::vector<std::string> senseColumns = Tokenize(senseTokens[i], ":");
if (senseColumns.size() != 2) {
UTIL_THROW2("VW :: bad format of sense distribution: " << senseTokens[i]);
}
out[i].m_label = senseColumns[0];
out[i].m_prob = Scan<float>(senseColumns[1]);
}
return out;
}
// assuming that word surface form is always factor 0, output the word form
inline std::string GetWordForm(const InputType &input, size_t pos) const {
return input.GetWord(pos).GetString(0).as_string();
}
bool m_lexicalized;
int m_size;
};
}
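Based on the SetParameter keys above, a moses.ini feature line would look roughly like this (the additional VW arguments the feature may require are omitted and are an assumption):

VWFeatureSourceSenseWindow size=3 lexicalized=true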

View File

@ -226,7 +226,7 @@ EvaluateWhenApplied(const StatelessFeatureFunction& slff)
*/
void
Hypothesis::
EvaluateWhenApplied(const SquareMatrix &futureScore)
EvaluateWhenApplied(float futureScore)
{
IFVERBOSE(2) {
m_manager.GetSentenceStats().StartTimeOtherScore();
@ -263,7 +263,7 @@ EvaluateWhenApplied(const SquareMatrix &futureScore)
}
// FUTURE COST
m_futureScore = futureScore.CalcFutureScore( m_sourceCompleted );
m_futureScore = futureScore;
// TOTAL
m_totalScore = m_currScoreBreakdown.GetWeightedScore() + m_futureScore;

View File

@ -146,7 +146,7 @@ public:
return m_currTargetWordsRange.GetNumWordsCovered();
}
void EvaluateWhenApplied(const SquareMatrix &futureScore);
void EvaluateWhenApplied(float futureScore);
int GetId()const {
return m_id;

View File

@ -248,14 +248,16 @@ ExpandAllHypotheses(const Hypothesis &hypothesis, size_t startPos, size_t endPos
// early discarding: check if hypothesis is too bad to build
// this idea is explained in (Moore&Quirk, MT Summit 2007)
float expectedScore = 0.0f;
const WordsBitmap &sourceCompleted = hypothesis.GetWordsBitmap();
float futureScore = m_transOptColl.GetFutureScore().CalcFutureScore2( sourceCompleted, startPos, endPos );
if (m_options.search.UseEarlyDiscarding()) {
// expected score is based on score of current hypothesis
expectedScore = hypothesis.GetScore();
// add new future score estimate
expectedScore +=
m_transOptColl.GetFutureScore()
.CalcFutureScore(hypothesis.GetWordsBitmap(), startPos, endPos);
expectedScore += futureScore;
}
// loop through all translation options
@ -264,7 +266,7 @@ ExpandAllHypotheses(const Hypothesis &hypothesis, size_t startPos, size_t endPos
if (!tol) return;
TranslationOptionList::const_iterator iter;
for (iter = tol->begin() ; iter != tol->end() ; ++iter) {
ExpandHypothesis(hypothesis, **iter, expectedScore);
ExpandHypothesis(hypothesis, **iter, expectedScore, futureScore);
}
}
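Restating the early-discarding criterion this refactoring serves (Moore & Quirk, MT Summit 2007): a candidate expansion of hypothesis h over span [startPos, endPos] is only built if

\mathrm{expectedScore} = \mathrm{score}(h) + \hat{F}( \mathrm{coverage}(h) \cup [\mathrm{startPos}, \mathrm{endPos}] )

clears the beam threshold; computing \hat{F} once per span (CalcFutureScore2) instead of once per translation option is what the new futureScore parameter passes down.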
@ -277,7 +279,10 @@ ExpandAllHypotheses(const Hypothesis &hypothesis, size_t startPos, size_t endPos
* \param expectedScore base score for early discarding
* (base hypothesis score plus future score estimation)
*/
void SearchNormal::ExpandHypothesis(const Hypothesis &hypothesis, const TranslationOption &transOpt, float expectedScore)
void SearchNormal::ExpandHypothesis(const Hypothesis &hypothesis,
const TranslationOption &transOpt,
float expectedScore,
float futureScore)
{
const StaticData &staticData = StaticData::Instance();
SentenceStats &stats = m_manager.GetSentenceStats();
@ -293,7 +298,7 @@ void SearchNormal::ExpandHypothesis(const Hypothesis &hypothesis, const Translat
stats.StopTimeBuildHyp();
}
if (newHypo==NULL) return;
newHypo->EvaluateWhenApplied(m_transOptColl.GetFutureScore());
newHypo->EvaluateWhenApplied(futureScore);
} else
// early discarding: check if hypothesis is too bad to build
{

View File

@ -44,8 +44,10 @@ protected:
ExpandAllHypotheses(const Hypothesis &hypothesis, size_t startPos, size_t endPos);
virtual void
ExpandHypothesis(const Hypothesis &hypothesis, const TranslationOption &transOpt,
float expectedScore);
ExpandHypothesis(const Hypothesis &hypothesis,
const TranslationOption &transOpt,
float expectedScore,
float futureScore);
public:
SearchNormal(Manager& manager, const InputType &source, const TranslationOptionCollection &transOptColl);

View File

@ -76,7 +76,7 @@ float SquareMatrix::CalcFutureScore( WordsBitmap const &bitmap ) const
* /param endPos end of the span that is added to the coverage
*/
float SquareMatrix::CalcFutureScore( WordsBitmap const &bitmap, size_t startPos, size_t endPos ) const
float SquareMatrix::CalcFutureScore2( WordsBitmap const &bitmap, size_t startPos, size_t endPos ) const
{
const size_t notInGap= numeric_limits<size_t>::max();
float futureScore = 0.0f;

View File

@ -62,7 +62,7 @@ public:
m_array[startPos * m_size + endPos] = value;
}
float CalcFutureScore( WordsBitmap const& ) const;
float CalcFutureScore( WordsBitmap const&, size_t startPos, size_t endPos ) const;
float CalcFutureScore2( WordsBitmap const&, size_t startPos, size_t endPos ) const;
TO_STRING();
};

View File

@ -98,11 +98,11 @@ size_t BlockHashIndex::GetFprint(const char* key) const
size_t BlockHashIndex::GetHash(size_t i, const char* key)
{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_mutex);
#endif
if(m_hashes[i] == 0)
LoadRange(i);
//#ifdef WITH_THREADS
// boost::mutex::scoped_lock lock(m_mutex);
//#endif
//if(m_hashes[i] == 0)
//LoadRange(i);
#ifdef HAVE_CMPH
size_t idx = cmph_search((cmph_t*)m_hashes[i], key, (cmph_uint32) strlen(key));
#else
@ -322,6 +322,7 @@ size_t BlockHashIndex::GetSize() const
void BlockHashIndex::KeepNLastRanges(float ratio, float tolerance)
{
/*
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_mutex);
#endif
@ -338,7 +339,7 @@ void BlockHashIndex::KeepNLastRanges(float ratio, float tolerance)
for(LastLoaded::reverse_iterator it = lastLoaded.rbegin() + size_t(n * (1 - tolerance));
it != lastLoaded.rend(); it++)
DropRange(it->second);
}
}*/
}
void BlockHashIndex::CalcHash(size_t current, void* source_void)

View File

@ -155,10 +155,12 @@ LexicalReorderingTableCompact::
Load(std::string filePath)
{
std::FILE* pFile = std::fopen(filePath.c_str(), "r");
if(m_inMemory)
UTIL_THROW_IF2(pFile == NULL, "File " << filePath << " could not be opened");
//if(m_inMemory)
m_hash.Load(pFile);
else
m_hash.LoadIndex(pFile);
//else
//m_hash.LoadIndex(pFile);
size_t read = 0;
read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, pFile);

View File

@ -26,6 +26,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <algorithm>
#include <sys/stat.h>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/thread/tss.hpp>
#include "PhraseDictionaryCompact.h"
#include "moses/FactorCollection.h"
@ -43,6 +44,8 @@ using namespace boost::algorithm;
namespace Moses
{
PhraseDictionaryCompact::SentenceCache PhraseDictionaryCompact::m_sentenceCache;
PhraseDictionaryCompact::PhraseDictionaryCompact(const std::string &line)
:PhraseDictionary(line, true)
,m_inMemory(true)
@ -75,12 +78,12 @@ void PhraseDictionaryCompact::Load()
std::FILE* pFile = std::fopen(tFilePath.c_str() , "r");
size_t indexSize;
if(m_inMemory)
//if(m_inMemory)
// Load source phrase index into memory
indexSize = m_hash.Load(pFile);
else
// else
// Keep source phrase index on disk
indexSize = m_hash.LoadIndex(pFile);
//indexSize = m_hash.LoadIndex(pFile);
size_t coderSize = m_phraseDecoder->Load(pFile);
@ -162,13 +165,9 @@ PhraseDictionaryCompact::~PhraseDictionaryCompact()
void PhraseDictionaryCompact::CacheForCleanup(TargetPhraseCollection* tpc)
{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_sentenceMutex);
PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
#else
PhraseCache &ref = m_sentenceCache;
#endif
ref.push_back(tpc);
if(!m_sentenceCache.get())
m_sentenceCache.reset(new PhraseCache());
m_sentenceCache->push_back(tpc);
}
void PhraseDictionaryCompact::AddEquivPhrase(const Phrase &source,
@ -176,23 +175,16 @@ void PhraseDictionaryCompact::AddEquivPhrase(const Phrase &source,
void PhraseDictionaryCompact::CleanUpAfterSentenceProcessing(const InputType &source)
{
if(!m_inMemory)
m_hash.KeepNLastRanges(0.01, 0.2);
if(!m_sentenceCache.get())
m_sentenceCache.reset(new PhraseCache());
m_phraseDecoder->PruneCache();
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_sentenceMutex);
PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
#else
PhraseCache &ref = m_sentenceCache;
#endif
for(PhraseCache::iterator it = ref.begin(); it != ref.end(); it++)
for(PhraseCache::iterator it = m_sentenceCache->begin();
it != m_sentenceCache->end(); it++)
delete *it;
PhraseCache temp;
temp.swap(ref);
temp.swap(*m_sentenceCache);
ReduceCache();
}

View File

@ -52,13 +52,8 @@ protected:
bool m_useAlignmentInfo;
typedef std::vector<TargetPhraseCollection*> PhraseCache;
#ifdef WITH_THREADS
boost::mutex m_sentenceMutex;
typedef std::map<boost::thread::id, PhraseCache> SentenceCache;
#else
typedef PhraseCache SentenceCache;
#endif
SentenceCache m_sentenceCache;
typedef boost::thread_specific_ptr<PhraseCache> SentenceCache;
static SentenceCache m_sentenceCache;
BlockHashIndex m_hash;
PhraseDecoder* m_phraseDecoder;

View File

@ -0,0 +1,32 @@
// $Id$
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "TargetPhraseCollectionCache.h"
namespace Moses
{
boost::thread_specific_ptr<TargetPhraseCollectionCache::CacheMap>
TargetPhraseCollectionCache::m_phraseCache;
}

View File

@ -26,12 +26,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <set>
#include <vector>
#ifdef WITH_THREADS
#ifdef BOOST_HAS_PTHREADS
#include <boost/thread/mutex.hpp>
#endif
#endif
#include <boost/thread/tss.hpp>
#include <boost/shared_ptr.hpp>
#include "moses/Phrase.h"
@ -63,12 +58,7 @@ private:
};
typedef std::map<Phrase, LastUsed> CacheMap;
CacheMap m_phraseCache;
#ifdef WITH_THREADS
boost::mutex m_mutex;
#endif
static boost::thread_specific_ptr<CacheMap> m_phraseCache;
public:
@ -80,31 +70,37 @@ public:
}
iterator Begin() {
return m_phraseCache.begin();
if(!m_phraseCache.get())
m_phraseCache.reset(new CacheMap());
return m_phraseCache->begin();
}
const_iterator Begin() const {
return m_phraseCache.begin();
if(!m_phraseCache.get())
m_phraseCache.reset(new CacheMap());
return m_phraseCache->begin();
}
iterator End() {
return m_phraseCache.end();
if(!m_phraseCache.get())
m_phraseCache.reset(new CacheMap());
return m_phraseCache->end();
}
const_iterator End() const {
return m_phraseCache.end();
if(!m_phraseCache.get())
m_phraseCache.reset(new CacheMap());
return m_phraseCache->end();
}
/** retrieve translations for source phrase from persistent cache **/
void Cache(const Phrase &sourcePhrase, TargetPhraseVectorPtr tpv,
size_t bitsLeft = 0, size_t maxRank = 0) {
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_mutex);
#endif
if(!m_phraseCache.get())
m_phraseCache.reset(new CacheMap());
// check if source phrase is already in cache
iterator it = m_phraseCache.find(sourcePhrase);
if(it != m_phraseCache.end())
iterator it = m_phraseCache->find(sourcePhrase);
if(it != m_phraseCache->end())
// if found, just update clock
it->second.m_clock = clock();
else {
@ -113,19 +109,17 @@ public:
TargetPhraseVectorPtr tpv_temp(new TargetPhraseVector());
tpv_temp->resize(maxRank);
std::copy(tpv->begin(), tpv->begin() + maxRank, tpv_temp->begin());
m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft);
(*m_phraseCache)[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft);
} else
m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft);
(*m_phraseCache)[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft);
}
}
std::pair<TargetPhraseVectorPtr, size_t> Retrieve(const Phrase &sourcePhrase) {
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_mutex);
#endif
iterator it = m_phraseCache.find(sourcePhrase);
if(it != m_phraseCache.end()) {
if(!m_phraseCache.get())
m_phraseCache.reset(new CacheMap());
iterator it = m_phraseCache->find(sourcePhrase);
if(it != m_phraseCache->end()) {
LastUsed &lu = it->second;
lu.m_clock = clock();
return std::make_pair(lu.m_tpv, lu.m_bitsLeft);
@ -135,34 +129,31 @@ public:
// if cache full, reduce
void Prune() {
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_mutex);
#endif
if(m_phraseCache.size() > m_max * (1 + m_tolerance)) {
if(!m_phraseCache.get())
m_phraseCache.reset(new CacheMap());
if(m_phraseCache->size() > m_max * (1 + m_tolerance)) {
typedef std::set<std::pair<clock_t, Phrase> > Cands;
Cands cands;
for(CacheMap::iterator it = m_phraseCache.begin();
it != m_phraseCache.end(); it++) {
for(CacheMap::iterator it = m_phraseCache->begin();
it != m_phraseCache->end(); it++) {
LastUsed &lu = it->second;
cands.insert(std::make_pair(lu.m_clock, it->first));
}
for(Cands::iterator it = cands.begin(); it != cands.end(); it++) {
const Phrase& p = it->second;
m_phraseCache.erase(p);
m_phraseCache->erase(p);
if(m_phraseCache.size() < (m_max * (1 - m_tolerance)))
if(m_phraseCache->size() < (m_max * (1 - m_tolerance)))
break;
}
}
}
void CleanUp() {
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_mutex);
#endif
m_phraseCache.clear();
if(!m_phraseCache.get())
m_phraseCache.reset(new CacheMap());
m_phraseCache->clear();
}
};
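The two caches above now share one pattern: replace a mutex-guarded map keyed by thread id with a lazily initialized boost::thread_specific_ptr. A minimal, self-contained sketch of that pattern (names are illustrative, not from the commit):

#include <boost/thread/tss.hpp>
#include <vector>

class PerThreadCache
{
public:
  typedef std::vector<int> Cache;
  // Each thread gets its own Cache, created on first access;
  // no mutex is needed because threads never share an instance.
  static Cache& Get() {
    if (!s_cache.get())
      s_cache.reset(new Cache());
    return *s_cache;
  }
private:
  static boost::thread_specific_ptr<Cache> s_cache;
};
boost::thread_specific_ptr<PerThreadCache::Cache> PerThreadCache::s_cache;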

View File

@ -14,7 +14,7 @@ print STDERR "Training OSM - Start\n".`date`;
my $ORDER = 5;
my $OUT_DIR = "/tmp/osm.$$";
my $___FACTOR_DELIMITER = "|";
my ($MOSES_SRC_DIR,$CORPUS_F,$CORPUS_E,$ALIGNMENT,$SRILM_DIR,$FACTOR,$LMPLZ);
my ($MOSES_SRC_DIR,$CORPUS_F,$CORPUS_E,$ALIGNMENT,$SRILM_DIR,$FACTOR,$LMPLZ,$DOMAIN,$TUNE,$INP_EXT,$OP_EXT);
my $cmd;
@ -29,6 +29,10 @@ die("ERROR: wrong syntax when invoking OSM-Train.perl")
'alignment=s' => \$ALIGNMENT,
'order=i' => \$ORDER,
'factor=s' => \$FACTOR,
'input-extension=s' => \$INP_EXT,
'output-extension=s' => \$OP_EXT,
'tune=s' => \$TUNE,
'domain=s' => \$DOMAIN,
'srilm-dir=s' => \$SRILM_DIR,
'lmplz=s' => \$LMPLZ,
'out-dir=s' => \$OUT_DIR);
@ -74,19 +78,172 @@ if (defined($FACTOR)) {
`ln -s $corpus_stem_f.$factor_val.$ext_f $OUT_DIR/$factor_val/f`;
`ln -s $corpus_stem_e.$factor_val.$ext_e $OUT_DIR/$factor_val/e`;
create_model($factor_val);
if (defined($TUNE) && defined($DOMAIN) && $factor_val eq "0-0")
{
die("ERROR: For Interpolated OSM model, you need SRILM")
unless -e $SRILM_DIR;
`mkdir $OUT_DIR/TUNE`;
`$MOSES_SRC_DIR/scripts/training/reduce-factors.perl --corpus $TUNE.$INP_EXT --reduced $OUT_DIR/TUNE/tune.$INP_EXT --factor 0`;
`$MOSES_SRC_DIR/scripts/training/reduce-factors.perl --corpus $TUNE.$OP_EXT --reduced $OUT_DIR/TUNE/tune.$OP_EXT --factor 0`;
create_interpolated_model($factor_val);
}
else
{
create_model($factor_val);
}
}
}
else {
`ln -s $CORPUS_F $OUT_DIR/f`;
`ln -s $CORPUS_E $OUT_DIR/e`;
create_model("");
if (defined($TUNE) && defined($DOMAIN))
{
die("ERROR: For Interpolated OSM model, you need SRILM")
unless -e $SRILM_DIR;
`mkdir $OUT_DIR/TUNE`;
`cp $TUNE.$INP_EXT $OUT_DIR/TUNE/tune.$INP_EXT`;
`cp $TUNE.$OP_EXT $OUT_DIR/TUNE/tune.$OP_EXT`;
create_interpolated_model("");
}
else
{
create_model("");
}
}
# create model
print "Training OSM - End".`date`;
sub read_domain_file{
open(my $fh, '<:encoding(UTF-8)', $DOMAIN)
or die "Could not open file '$DOMAIN' $!";
my @corpora;
while (my $row = <$fh>) {
chomp $row;
my ($num,$dom) = split(/\ /,$row);
push @corpora, $dom;
push @corpora, $num;
#print "$dom $num\n";
}
return @corpora;
}
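Given how read_domain_file parses it, the domain file is assumed to contain one sub-corpus per line as "last-line-number domain-name", with cumulative line counts (a hypothetical example):

500 news
1700 web

so the head/tail pipeline below can slice e, f and align into per-domain chunks.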
sub create_interpolated_model{
my ($factor_val) = @_;
my $fNum = 0;
my $dName;
my @corpora = read_domain_file();
my $i = 0;
while($i < scalar(@corpora))
{
$dName = "$OUT_DIR/$factor_val/$corpora[$i]";
$cmd = "mkdir $dName";
`$cmd`;
my $cal = $corpora[$i+1] - $fNum;
$cmd = "head -$corpora[$i+1] $OUT_DIR/$factor_val/e | tail -$cal > $dName/e";
`$cmd`;
$cmd = "head -$corpora[$i+1] $OUT_DIR/$factor_val/f | tail -$cal > $dName/f";
`$cmd`;
$cmd = "head -$corpora[$i+1] $OUT_DIR/align | tail -$cal > $dName/align";
`$cmd`;
#print STDERR "Flip Alignment\n";
#`$MOSES_SRC_DIR/scripts/OSM/flipAlignment.perl $dName/alignment > $dName/align`;
print STDERR "Extracting Singletons\n";
$cmd = "$MOSES_SRC_DIR/scripts/OSM/extract-singletons.perl $dName/e $dName/f $dName/align > $dName/Singletons";
print STDERR "Executing: $cmd\n";
`$cmd`;
print STDERR "Converting Bilingual Sentence Pair into Operation Corpus\n";
$cmd = "$MOSES_SRC_DIR/bin/generateSequences $dName/e $dName/f $dName/align $dName/Singletons > $dName/opCorpus";
print STDERR "Executing: $cmd\n";
`$cmd`;
print STDERR "Learning Operation Sequence Translation Model\n";
if (defined($SRILM_DIR)) {
$cmd = "$SRILM_DIR/ngram-count -kndiscount -order $ORDER -unk -text $dName/opCorpus -lm $dName/operationLM 2>> /dev/stderr";
print STDERR "Executing: $cmd\n";
`$cmd`;
}
else {
$cmd = "$LMPLZ -T $OUT_DIR --order $ORDER --text $dName/opCorpus --arpa $dName/operationLM --prune 0 0 1 2>> /dev/stderr";
print STDERR "Executing: $cmd\n";
`$cmd`;
}
print "$cmd\n";
$fNum = $corpora[$i+1];
$i = $i+2;
}
`$MOSES_SRC_DIR/scripts/OSM/flipAlignment.perl $TUNE.align > $OUT_DIR/TUNE/tune.align`;
print STDERR "Extracting Singletons\n";
$cmd = "$MOSES_SRC_DIR/scripts/OSM/extract-singletons.perl $OUT_DIR/TUNE/tune.$OP_EXT $OUT_DIR/TUNE/tune.$INP_EXT $OUT_DIR/TUNE/tune.align > $OUT_DIR/TUNE/Singletons";
print STDERR "Executing: $cmd\n";
`$cmd`;
print STDERR "Converting Bilingual Sentence Pair into Operation Corpus\n";
$cmd = "$MOSES_SRC_DIR/bin/generateSequences $OUT_DIR/TUNE/tune.$OP_EXT $OUT_DIR/TUNE/tune.$INP_EXT $OUT_DIR/TUNE/tune.align $OUT_DIR/TUNE/Singletons > $OUT_DIR/TUNE/tune.opCorpus";
print STDERR "Executing: $cmd\n";
`$cmd`;
print STDERR "Interpolating OSM Models\n";
$cmd = "$MOSES_SRC_DIR/scripts/ems/support/interpolate-lm.perl --tuning $OUT_DIR/TUNE/tune.opCorpus --name $OUT_DIR/$factor_val/operationLM --srilm $SRILM_DIR --lm ";
$i = 0;
$dName = "$OUT_DIR/$factor_val/$corpora[$i]/operationLM";
$cmd = $cmd . $dName;
$i = $i+2;
while($i < scalar(@corpora))
{
$cmd = $cmd . ",";
$dName = "$OUT_DIR/$factor_val/$corpora[$i]/operationLM";
$cmd = $cmd . $dName;
$i = $i+2;
}
print STDERR "Executing: $cmd\n";
`$cmd`;
print STDERR "Binarizing\n";
$cmd = "$MOSES_SRC_DIR/bin/build_binary $OUT_DIR/$factor_val/operationLM $OUT_DIR/$factor_val/operationLM.bin";
print STDERR "Executing: $cmd\n";
system($cmd) == 0 or die("system $cmd failed: $?");
}
sub create_model{
my ($factor_val) = @_;

View File

@ -391,6 +391,28 @@ alignment-symmetrization-method = grow-diag-final-and
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
#
# OR if you want to use it with SRILM
#
#operation-sequence-model-settings = "--srilm-dir /path-to-srilm/bin/i686-m64"
## Class-based Operation Sequence Model (OSM)
# If OSM is to be enabled with factors, add the factors as below.
# Durrani, Koehn, Schmid, Fraser (COLING, 2014).
# Investigating the Usefulness of Generalized Word Representations in SMT
#
#operation-sequence-model-settings = "--factor 0-0+1-1"
## Interpolated Operation Sequence Model (OSM)
# To enable the interpolated OSM model, specify the tune set as below.
# Durrani, Sajjad, Joty, Abdelali and Vogel (MT Summit, 2015).
# Using Joint Models for Domain Adaptation in Statistical Machine Translation
#
#interpolated-operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "--srilm-dir /path-to-srilm/bin/i686-m64 --tune /path-to-tune-folder/tune_file"
# Interpolated OSM can only be used with SRILM because of the interpolation script.
# If OSM training should be skipped, point to an existing OSM model:
#osm-model =

View File

@ -411,9 +411,30 @@ alignment-symmetrization-method = grow-diag-final-and
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
#
# OR if you want to use it with SRILM
#
#operation-sequence-model-settings = "--srilm-dir /path-to-srilm/bin/i686-m64"
## Class-based Operation Sequence Model (OSM)
# If OSM is to be enabled with factors, add the factors as below.
# Durrani, Koehn, Schmid, Fraser (COLING, 2014).
# Investigating the Usefulness of Generalized Word Representations in SMT
#
#operation-sequence-model-settings = "--factor 0-0+1-1"
## Interpolated Operation Sequence Model (OSM)
# To enable the interpolated OSM model, specify the tune set as below.
# Durrani, Sajjad, Joty, Abdelali and Vogel (MT Summit, 2015).
# Using Joint Models for Domain Adaptation in Statistical Machine Translation
#
#interpolated-operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "--srilm-dir /path-to-srilm/bin/i686-m64 --tune /path-to-tune-folder/tune_file"
# Interpolated OSM can only be used with SRILM because of the interpolation script.
# If OSM training should be skipped, point to an existing OSM model:
#osm-model =
### unsupervised transliteration module
# Durrani, Sajjad, Hoang and Koehn (EACL, 2014).
# "Integrating an Unsupervised Transliteration Model

View File

@ -373,8 +373,30 @@ alignment-symmetrization-method = grow-diag-final-and
#
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = ""
#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
#
# OR if you want to use it with SRILM
#
#operation-sequence-model-settings = "--srilm-dir /path-to-srilm/bin/i686-m64"
## Class-based Operation Sequence Model (OSM)
# If OSM is to be enabled with factors, add the factors as below.
# Durrani, Koehn, Schmid, Fraser (COLING, 2014).
# Investigating the Usefulness of Generalized Word Representations in SMT
#
#operation-sequence-model-settings = "--factor 0-0+1-1"
## Interpolated Operation Sequence Model (OSM)
# To enable the interpolated OSM model, specify the tune set as below.
# Durrani, Sajjad, Joty, Abdelali and Vogel (MT Summit, 2015).
# Using Joint Models for Domain Adaptation in Statistical Machine Translation
#
#interpolated-operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "--srilm-dir /path-to-srilm/bin/i686-m64 --tune /path-to-tune-folder/tune_file"
# Interpolated OSM can only be used with SRILM because of the interpolation script.
# If OSM training should be skipped, point to an existing OSM model:
#osm-model =

View File

@ -389,8 +389,30 @@ alignment-symmetrization-method = grow-diag-final-and
#
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = ""
#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
#
# OR if you want to use with SRILM
#
#operation-sequence-model-settings = "--srilm-dir /path-to-srilm/bin/i686-m64"
## Class-based Operation Sequence Model (OSM)
# If OSM is to be enabled with factors, add the factors as below.
# Durrani, Koehn, Schmid and Fraser (COLING, 2014).
# "Investigating the Usefulness of Generalized Word Representations in SMT"
#
#operation-sequence-model-settings = "--factor 0-0+1-1"
## Interpolated Operation Sequence Model (OSM)
# To enable the interpolated OSM, set the options below.
# Durrani, Sajjad, Joty, Abdelali and Vogel (MT Summit, 2015).
# "Using Joint Models for Domain Adaptation in Statistical Machine Translation"
#
#interpolated-operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "--srilm-dir /path-to-srilm/bin/i686-m64 --tune /path-to-tune-folder/tune_file"
# The interpolated OSM can only be used with SRILM, because of the interpolation script.
# If OSM training should be skipped, point to an existing OSM model.
#osm-model =

View File

@ -533,7 +533,7 @@ build-domains
in: CORPUS:post-split-factorized-stem
out: domains
default-name: model/domains
ignore-unless: domain-features mml-filter-corpora
ignore-unless: domain-features mml-filter-corpora interpolated-operation-sequence-model
template: $moses-script-dir/ems/support/build-domain-file-from-subcorpora.perl $input-extension IN > OUT
final-model: yes
mml-score
@ -702,7 +702,14 @@ build-osm
out: osm-model
ignore-unless: operation-sequence-model
rerun-on-change: operation-sequence-model training-options script giza-settings operation-sequence-model-settings
template: $moses-script-dir/OSM/OSM-Train.perl --corpus-f IN0.$input-extension --corpus-e IN0.$output-extension --alignment IN1.$alignment-symmetrization-method --order $operation-sequence-model-order --out-dir OUT --moses-src-dir $moses-src-dir $operation-sequence-model-settings
template: $moses-script-dir/OSM/OSM-Train.perl --corpus-f IN0.$input-extension --corpus-e IN0.$output-extension --alignment IN1.$alignment-symmetrization-method --order $operation-sequence-model-order --out-dir OUT --moses-src-dir $moses-src-dir --input-extension $input-extension --output-extension $output-extension $operation-sequence-model-settings
default-name: model/OSM
build-interpolated-osm
in: corpus word-alignment domains
out: osm-model
ignore-unless: interpolated-operation-sequence-model
rerun-on-change: interpolated-operation-sequence-model training-options script giza-settings operation-sequence-model-settings
template: $moses-script-dir/OSM/OSM-Train.perl --corpus-f IN0.$input-extension --corpus-e IN0.$output-extension --alignment IN1.$alignment-symmetrization-method --order $operation-sequence-model-order --out-dir OUT --moses-src-dir $moses-src-dir --input-extension $input-extension --output-extension $output-extension $operation-sequence-model-settings --domain IN2
default-name: model/OSM
build-transliteration-model
in: corpus word-alignment
@ -940,6 +947,21 @@ parse-input-devtest
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
ignore-unless: use-mira
template: $input-parser < IN > OUT
parse-relax-input
in: split-input
out: input
default-name: tuning/input.parse-relaxed
pass-unless: input-parse-relaxer
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
template: $input-parse-relaxer < IN > OUT
parse-relax-input-devtest
in: split-input-devtest
out: input-devtest
default-name: tuning/input.devtest.parse-relaxed
pass-unless: input-parse-relaxer
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
ignore-unless: use-mira
template: $input-parse-relaxer < IN > OUT
factorize-input
in: parsed-input
out: factorized-input
@ -1001,35 +1023,20 @@ truecase-input-devtest
ignore-unless: AND input-truecaser use-mira
template: $input-truecaser -model IN1.$input-extension < IN > OUT
split-input
in: truecased-input
in: truecased-input SPLITTER:splitter-model
out: split-input
rerun-on-change: input-splitter SPLITTER:splitter-model
rerun-on-change: input-splitter
default-name: tuning/input.split
pass-unless: input-splitter
template: $input-splitter -model SPLITTER:splitter-model.$input-extension < IN > OUT
template: $input-splitter -model IN1.$input-extension < IN > OUT
split-input-devtest
in: truecased-input-devtest
in: truecased-input-devtest SPLITTER:splitter-model
out: split-input-devtest
rerun-on-change: input-splitter
default-name: tuning/input.devtest.split
pass-unless: input-splitter
ignore-unless: use-mira
template: $input-splitter -model SPLITTER:splitter-model.$input-extension < IN > OUT
parse-relax-input
in: split-input
out: input
default-name: tuning/input.parse-relaxed
pass-unless: input-parse-relaxer
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
template: $input-parse-relaxer < IN > OUT
parse-relax-input-devtest
in: split-input-devtest
out: input-devtest
default-name: tuning/input.devtest.parse-relaxed
pass-unless: input-parse-relaxer
pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
ignore-unless: use-mira
template: $input-parse-relaxer < IN > OUT
template: $input-splitter -model IN1.$input-extension < IN > OUT
reference-from-sgm
in: reference-sgm input-sgm
out: raw-reference
@ -1269,12 +1276,11 @@ truecase-input
ignore-unless: input-truecaser
template: $input-truecaser -model IN1.$input-extension < IN > OUT
split-input
in: truecased-input
in: truecased-input SPLITTER:splitter-model
out: split-input
rerun-on-change: input-splitter SPLITTER:splitter-model
default-name: evaluation/input.split
pass-unless: input-splitter
template: $input-splitter -model SPLITTER:splitter-model.$input-extension < IN > OUT
template: $input-splitter -model IN1.$input-extension < IN > OUT
filter
in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table
out: filtered-dir

scripts/generic/multi_moses.py Executable file
View File

@ -0,0 +1,312 @@
#!/usr/bin/env python
# Written by Michael Denkowski
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
'''Parallelize decoding with multiple instances of moses on a local machine
To use with mert-moses.pl, activate --multi-moses and set the number of moses
instances and threads per instance with --decoder-flags='--threads P:T:E'
This script runs a specified number of moses instances, each using one or more
threads. The highest speed is generally seen with many single-threaded
instances while the lowest memory usage is seen with a single many-threaded
instance. It is recommended to use the maximum number of instances that will
fit into memory (up to the number of available CPUs) and distribute CPUs across
them equally. For example, a machine with 32 CPUs that can fit 3 copies of
moses into memory would use --threads 2:11:10 for 2 instances with 11 threads
each and an extra instance with 10 threads (3 instances total using all CPUs).
Memory mapped models can be shared by multiple processes and increase the number
of instances that can fit into memory:
Mmapped phrase tables (Ulrich Germann)
http://www.statmt.org/moses/?n=Advanced.Incremental#ntoc3
Mmapped language models (Kenneth Heafield)
http://www.statmt.org/moses/?n=FactoredTraining.BuildingLanguageModel#ntoc19
'''
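# Editor's note (illustrative, not part of the original script): the total
# thread budget for '--threads P:T:E' is P * T + E, so the docstring's
# example '--threads 2:11:10' occupies 2 * 11 + 10 = 32 CPUs across three
# moses processes.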
import collections
import gzip  # needed by gzopen() below
import os
import Queue
import signal
import subprocess
import sys
import threading
HELP = '''Multiple process decoding with Moses
Usage:
{} moses --config moses.ini [options] [decoder flags]
Options:
--threads P:T:E
P: Number of parallel instances to run
T: Number of threads per instance
E: Number of threads in optional extra instance
(default 1:1:0, overrides [threads] in moses.ini. Specifying T
and E is optional, e.g. --threads 16 starts 16 single-threaded
instances)
--n-best-list nbest.out N [distinct]: location and size of N-best list
--show-weights: for mert-moses.pl, just call moses and exit
Other options (decoder flags) are passed through to moses instances
'''
# Defaults
INPUT = sys.stdin
PROCS = 1
THREADS = 1
EXTRA = 0
DONE = threading.Event()
PID = os.getpid()
# A very long time, used as the Queue operation timeout: we don't actually
# want a timeout, but we do want interruptibility
# (https://bugs.python.org/issue1360)
NEVER = 60 * 60 * 24 * 365 * 1000
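# Editor's note: in Python 2, a Queue get()/put() with no timeout blocks in
# an uninterruptible lock acquire, so signals such as Ctrl-C are not handled
# until the call returns; passing a (huge) timeout makes the wait poll
# instead, keeping the threads responsive.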
# Single unit of computation: decode a line, output result, signal done
Task = collections.namedtuple('Task', ['id', 'line', 'out', 'event'])
def kill_main(msg):
'''kill -9 the main thread to stop everything immediately'''
sys.stderr.write('{}\n'.format(msg))
os.kill(PID, signal.SIGKILL)
def gzopen(f):
'''Open plain or gzipped text'''
return gzip.open(f, 'rb') if f.endswith('.gz') else open(f, 'r')
def run_instance(cmd_base, threads, tasks, n_best=False):
'''Run an instance of moses that processes tasks (input lines) from a
queue using a specified number of threads'''
cmd = cmd_base[:]
cmd.append('--threads')
cmd.append(str(threads))
try:
# Queue of tasks the instance is currently working on, limited to the number of
# threads. The queue should be kept full for optimal CPU usage.
work = Queue.Queue(maxsize=threads)
# Multi-threaded instance
moses = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
# Read and handle instance output as available
def handle_output():
while True:
# Output line triggers task completion
line = moses.stdout.readline()
# End of output (instance finished)
if not line:
break
task = work.get(timeout=NEVER)
if n_best:
# Read and copy lines until sentinel line, copy real line id
# id ||| hypothesis words ||| feature scores ||| total score
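# (Editor's sketch of one such line; feature names and values are
# hypothetical: "0 ||| the house ||| LM0= -12.3 TM0= -4.1 ||| -16.4")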
(first_i, rest) = line.split(' ||| ', 1)
task.out.append(' ||| '.join((task.id, rest)))
while True:
line = moses.stdout.readline()
(i, rest) = line.split(' ||| ', 1)
# Sentinel
if i != first_i:
break
task.out.append(' ||| '.join((task.id, rest)))
else:
task.out.append(line)
# Signal task done
task.event.set()
# Output thread
handler = threading.Thread(target=handle_output, args=())
# Daemon: will not keep the process alive after non-daemon threads exit
handler.setDaemon(True)
handler.start()
# Input thread: take tasks as they are available and add them to work
# queue. Stop when DONE encountered.
while True:
task = tasks.get(timeout=NEVER)
work.put(task, timeout=NEVER)
if task.event == DONE:
break
if n_best:
# Input line followed by blank line (sentinel)
moses.stdin.write(task.line)
moses.stdin.write('\n')
else:
moses.stdin.write(task.line)
# Cleanup
moses.stdin.close()
moses.wait()
handler.join()
except:
kill_main('Error with moses instance: see stderr')
def write_results(results, n_best=False, n_best_out=None):
'''Write out results (output lines) from a queue as they are populated'''
while True:
task = results.get(timeout=NEVER)
if task.event == DONE:
break
task.event.wait()
if n_best:
# Write top-best and N-best
# id ||| hypothesis words ||| feature scores ||| total score
top_best = task.out[0].split(' ||| ', 2)[1]
# Except don't write top-best if writing N-best to stdout "-"
if n_best_out != sys.stdout:
sys.stdout.write('{}\n'.format(top_best))
sys.stdout.flush()
for line in task.out:
n_best_out.write(line)
n_best_out.flush()
else:
sys.stdout.write(task.out[0])
sys.stdout.flush()
def main(argv):
# Defaults
moses_ini = None
input = INPUT
procs = PROCS
threads = THREADS
extra = EXTRA
n_best = False
n_best_file = None
n_best_size = None
n_best_distinct = False
n_best_out = None
show_weights = False
# Decoder command
cmd = argv[1:]
# Parse special options and remove from cmd
i = 1
while i < len(cmd):
if cmd[i] in ('-f', '-config', '--config'):
moses_ini = cmd[i + 1]
# Do not remove from cmd
i += 2
elif cmd[i] in ('-i', '-input-file', '--input-file'):
input = gzopen(cmd[i + 1])
cmd = cmd[:i] + cmd[i + 2:]
elif cmd[i] in ('-th', '-threads', '--threads'):
# P:T:E
args = cmd[i + 1].split(':')
procs = int(args[0])
if len(args) > 1:
threads = int(args[1])
if len(args) > 2:
extra = int(args[2])
cmd = cmd[:i] + cmd[i + 2:]
elif cmd[i] in ('-n-best-list', '--n-best-list'):
n_best = True
n_best_file = cmd[i + 1]
n_best_size = cmd[i + 2]
# Optional "distinct"
if i + 3 < len(cmd) and cmd[i + 3] == 'distinct':
n_best_distinct = True
cmd = cmd[:i] + cmd[i + 4:]
else:
cmd = cmd[:i] + cmd[i + 3:]
# Handled specially for mert-moses.pl
elif cmd[i] in ('-show-weights', '--show-weights'):
show_weights = True
# Do not remove from cmd
i += 1
else:
i += 1
# If mert-moses.pl passes -show-weights, just call moses
if show_weights:
sys.stdout.write(subprocess.check_output(cmd))
sys.stdout.flush()
sys.exit(0)
# Check inputs
if not (len(cmd) > 0 and moses_ini):
sys.stderr.write(HELP.format(os.path.basename(argv[0])))
sys.exit(2)
if not (os.path.isfile(cmd[0]) and os.access(cmd[0], os.X_OK)):
raise Exception('moses "{}" is not executable\n'.format(cmd[0]))
# Report settings
sys.stderr.write('Moses flags: {}\n'.format(' '.join('\'{}\''.format(s) if ' ' in s else s for s in cmd[1:])))
sys.stderr.write('Instances: {}\n'.format(procs))
sys.stderr.write('Threads per: {}\n'.format(threads))
if extra:
sys.stderr.write('Extra: {}\n'.format(extra))
if n_best:
sys.stderr.write('N-best list: {} ({}{})\n'.format(n_best_file, n_best_size, ', distinct' if n_best_distinct else ''))
# Task and result queues (buffer 8 * total threads input lines)
tasks = Queue.Queue(maxsize=(8 * ((procs * threads) + extra)))
results = Queue.Queue()
# N-best capture
if n_best:
cmd.append('--n-best-list')
cmd.append('-')
cmd.append(n_best_size)
if n_best_distinct:
cmd.append('distinct')
if n_best_file == '-':
n_best_out = sys.stdout
else:
n_best_out = open(n_best_file, 'w')
# Start instances
instances = []
for i in range(procs + (1 if extra else 0)):
t = threading.Thread(target=run_instance, args=(cmd, (threads if i < procs else extra), tasks, n_best))
instances.append(t)
# Daemon: will not keep the process alive after non-daemon threads exit
t.setDaemon(True)
t.start()
# Start results writer
writer = threading.Thread(target=write_results, args=(results, n_best, n_best_out))
writer.start()
# Main loop: queue task for each input line
id = 0
while True:
line = input.readline()
if not line:
break
# (line id, input line, output lines, "done" event)
task = Task(str(id), line, [], threading.Event())
results.put(task, timeout=NEVER)
tasks.put(task, timeout=NEVER)
id += 1
# Tell instances to exit
for t in instances:
tasks.put(Task(None, None, None, DONE), timeout=NEVER)
for t in instances:
t.join()
# Stop results writer
results.put(Task(None, None, None, DONE), timeout=NEVER)
writer.join()
# Cleanup
if n_best:
n_best_out.close()
if __name__ == '__main__':
try:
main(sys.argv)
except:
kill_main('Error with main I/O: see stderr')

View File

@ -52,9 +52,9 @@ parser.set_defaults(
ngram_size=14,
minibatch_size=1000,
noise=100,
hidden=750,
hidden=0,
input_embedding=150,
output_embedding=150,
output_embedding=750,
threads=1,
output_model="train.10k",
output_dir=None,

View File

@ -166,6 +166,10 @@ my $prev_aggregate_nbl_size = -1; # number of previous step to consider when loa
# and so on
my $maximum_iterations = 25;
# Multiple instance parallelization
my $___MULTI_MOSES = "$SCRIPTS_ROOTDIR/generic/multi_moses.py";
my $___USE_MULTI_MOSES = undef;
# Simulated post-editing
my $___MOSES_SIM_PE = "$SCRIPTS_ROOTDIR/generic/moses_sim_pe.py";
my $___DEV_SYMAL = undef;
@ -227,7 +231,8 @@ GetOptions(
"promix-training=s" => \$__PROMIX_TRAINING,
"promix-table=s" => \@__PROMIX_TABLES,
"threads=i" => \$__THREADS,
"spe-symal=s" => \$___DEV_SYMAL
"spe-symal=s" => \$___DEV_SYMAL,
"multi-moses" => \$___USE_MULTI_MOSES
) or exit(1);
# the 4 required parameters can be supplied on the command line directly
@ -325,6 +330,9 @@ Options:
(parameter sets factor [0;1] given to current weights)
--spe-symal=SYMAL ... Use simulated post-editing when decoding.
(SYMAL aligns input to refs)
--multi-moses ... Use multiple instances of moses instead of threads for decoding
(Use with --decoder-flags='-threads N' to get N instances, each of
which uses a single thread; overrides threads in moses.ini)
";
exit 1;
}
@ -1305,6 +1313,10 @@ sub run_decoder {
$decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG";
$decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
$decoder_cmd .= " $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F";
if (defined $___USE_MULTI_MOSES) {
# If requested, prefix full decoder command with multi-moses wrapper
$decoder_cmd = "$___MULTI_MOSES $decoder_cmd";
}
if (defined $___DEV_SYMAL) {
# If simulating post-editing, route command through moses_sim_pe.py
# Always use single (first) reference. Simulated post-editing undefined for multiple references.

View File

@ -42,41 +42,37 @@ template <class Derived> class FakeOStream {
return C().write(str.data(), str.size());
}
// For anything with ToStringBuf<T>::kBytes, define operator<< using ToString.
// This includes uint64_t, int64_t, uint32_t, int32_t, uint16_t, int16_t,
// float, double
// Handle integers by size and signedness.
private:
template <int Arg> struct EnableIfKludge {
template <class Arg> struct EnableIfKludge {
typedef Derived type;
};
template <class From, unsigned Length = sizeof(From), bool Signed = std::numeric_limits<From>::is_signed, bool IsInteger = std::numeric_limits<From>::is_integer> struct Coerce {};
template <class From> struct Coerce<From, 2, false, true> { typedef uint16_t To; };
template <class From> struct Coerce<From, 4, false, true> { typedef uint32_t To; };
template <class From> struct Coerce<From, 8, false, true> { typedef uint64_t To; };
template <class From> struct Coerce<From, 2, true, true> { typedef int16_t To; };
template <class From> struct Coerce<From, 4, true, true> { typedef int32_t To; };
template <class From> struct Coerce<From, 8, true, true> { typedef int64_t To; };
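// Editor's note: for example, Coerce<unsigned> selects To = uint32_t on
// common platforms (sizeof 4, unsigned, integer), so the operator<< below
// forwards the value to ToString as uint32_t.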
public:
template <class T> typename EnableIfKludge<ToStringBuf<T>::kBytes>::type &operator<<(const T value) {
return CallToString(value);
template <class From> typename EnableIfKludge<typename Coerce<From>::To>::type &operator<<(const From value) {
return CallToString(static_cast<typename Coerce<From>::To>(value));
}
/* clang on OS X appears to consider std::size_t aka unsigned long distinct
* from uint64_t. So this function makes clang work. gcc considers
* uint64_t and std::size_t the same (on 64-bit) so this isn't necessary.
* But it does no harm since gcc sees it as a specialization of the
* EnableIfKludge template.
* Also, delegating to *this << static_cast<uint64_t>(value) would loop
* indefinitely on gcc.
*/
Derived &operator<<(std::size_t value) { return CoerceToString(value); }
// union types will map to int, but don't pass the template magic above in gcc.
Derived &operator<<(int value) { return CoerceToString(value); }
// gcc considers these distinct from uint64_t
Derived &operator<<(unsigned long long value) { return CoerceToString(value); }
Derived &operator<<(signed long long value) { return CoerceToString(value); }
Derived &operator<<(long value) { return CoerceToString(value); }
// Character types that get copied as bytes instead of displayed as integers.
Derived &operator<<(char val) { return put(val); }
Derived &operator<<(signed char val) { return put(static_cast<char>(val)); }
Derived &operator<<(unsigned char val) { return put(static_cast<char>(val)); }
Derived &operator<<(bool val) { return put(val + '0'); }
// enums will fall back to int but are not caught by the template.
Derived &operator<<(int val) { return CallToString(static_cast<typename Coerce<int>::To>(val)); }
Derived &operator<<(float val) { return CallToString(val); }
Derived &operator<<(double val) { return CallToString(val); }
// This is here to catch all the other pointer types.
Derived &operator<<(const void *value) { return CallToString(value); }
// This is here because the above line also catches const char*.
@ -102,20 +98,6 @@ template <class Derived> class FakeOStream {
return *static_cast<const Derived*>(this);
}
template <class From, unsigned Length = sizeof(From), bool Signed = std::numeric_limits<From>::is_signed> struct Coerce {};
template <class From> struct Coerce<From, 2, false> { typedef uint16_t To; };
template <class From> struct Coerce<From, 4, false> { typedef uint32_t To; };
template <class From> struct Coerce<From, 8, false> { typedef uint64_t To; };
template <class From> struct Coerce<From, 2, true> { typedef int16_t To; };
template <class From> struct Coerce<From, 4, true> { typedef int32_t To; };
template <class From> struct Coerce<From, 8, true> { typedef int64_t To; };
template <class From> Derived &CoerceToString(const From value) {
return CallToString(static_cast<typename Coerce<From>::To>(value));
}
// This is separate to prevent an infinite loop if the compiler considers
// types the same (i.e. gcc std::size_t and uint64_t or uint32_t).
template <class T> Derived &CallToString(const T value) {

View File

@ -644,14 +644,16 @@ const char kHexDigits[] = "0123456789abcdef";
} // namespace
char *ToString(const void *v, char *to) {
// Apparently it's 0, not 0x0.
*to++ = '0';
*to++ = 'x';
// Fun fact: boost::lexical_cast under gcc/clang on Linux produces just "0" while clang on OS X produces "0x0"
// I happen to prefer 0x0.
if (!v) {
*to++ = '0';
return to;
}
*to++ = '0';
*to++ = 'x';
uintptr_t value = reinterpret_cast<uintptr_t>(v);
uint8_t shift = sizeof(void*) * 8 - 4;
for (; !(value >> shift); shift -= 4) {}
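// Editor's note: the loop above skips leading zero nibbles, so a pointer
// whose value is 0xbeef is rendered as "0xbeef" rather than zero-padded.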

View File

@ -15,7 +15,12 @@ template <class T> void TestValue(const T value) {
char buf[ToStringBuf<T>::kBytes];
StringPiece result(buf, ToString(value, buf) - buf);
BOOST_REQUIRE_GE(static_cast<std::size_t>(ToStringBuf<T>::kBytes), result.size());
BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), result);
if (value) {
BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), result);
} else {
// Platforms may print void * as either 0x0 or 0.
BOOST_CHECK(result == "0x0" || result == "0");
}
}
template <class T> void TestCorners() {
@ -33,7 +38,7 @@ BOOST_AUTO_TEST_CASE(Corners) {
TestCorners<int16_t>();
TestCorners<int32_t>();
TestCorners<int64_t>();
//TestCorners<const void*>();
TestCorners<const void*>();
}
template <class T> void TestAll() {
@ -64,7 +69,6 @@ BOOST_AUTO_TEST_CASE(Tens) {
}
BOOST_AUTO_TEST_CASE(Pointers) {
/*
for (uintptr_t i = 1; i < std::numeric_limits<uintptr_t>::max() / 10; i *= 10) {
TestValue((const void*)i);
}
@ -72,7 +76,6 @@ BOOST_AUTO_TEST_CASE(Pointers) {
TestValue((const void*)i);
TestValue((const void*)(i + 0xf00));
}
*/
}
}} // namespaces

View File

@ -296,6 +296,13 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
}
}
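// Editor's note: these raw accessors presumably expose the whole underlying
// bucket array, empty slots included, so callers iterating from RawBegin()
// to RawEnd() must skip unused buckets themselves.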
ConstIterator RawBegin() const {
return begin_;
}
ConstIterator RawEnd() const {
return end_;
}
private:
friend class AutoProbing<Entry, Hash, Equal>;
@ -379,6 +386,13 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
backend_.Clear();
}
ConstIterator RawBegin() const {
return backend_.RawBegin();
}
ConstIterator RawEnd() const {
return backend_.RawEnd();
}
private:
void DoubleIfNeeded() {
if (UTIL_LIKELY(Size() < threshold_))

View File

@ -54,4 +54,27 @@ BOOST_AUTO_TEST_CASE(EnumCase) {
TestEqual(EnumValue);
}
BOOST_AUTO_TEST_CASE(Strings) {
TestEqual("foo");
const char *a = "bar";
TestEqual(a);
StringPiece piece("abcdef");
TestEqual(piece);
TestEqual(StringPiece());
char non_const[3];
non_const[0] = 'b';
non_const[1] = 'c';
non_const[2] = 0;
std::string out;
StringStream(out) << "a" << non_const << 'c';
BOOST_CHECK_EQUAL("abcc", out);
// Now test as a separate object.
out.clear();
StringStream stream(out);
stream << "a" << non_const << 'c' << piece;
BOOST_CHECK_EQUAL("abccabcdef", out);
}
}} // namespaces

View File

@ -286,7 +286,7 @@ template <class Num> uint64_t ParseNum(const std::string &arg) {
return static_cast<uint64_t>(static_cast<double>(value) * static_cast<double>(mem) / 100.0);
}
if (after == "k") after == "K";
if (after == "k") after = "K";
std::string units("bKMGTPEZY");
std::string::size_type index = units.find(after[0]);
UTIL_THROW_IF_ARG(index == std::string::npos, SizeParseError, (arg), "the allowed suffixes are " << units << "%.");
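// Editor's note: with the fix above, a lowercase "k" suffix is normalized to
// "K" before the unit lookup, so e.g. "500k" parses the same as "500K".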