maximum expected BLEU trainer

Matthias Huck 2015-10-02 20:11:35 +01:00
parent 758b270447
commit 0fbb3973c3
6 changed files with 945 additions and 0 deletions

View File: Jamroot

@@ -298,6 +298,8 @@ contrib/server//mosesserver
mm
rephraser
contrib/c++tokenizer//tokenizer
contrib/expected-bleu-training//train-expected-bleu
contrib/expected-bleu-training//prepare-expected-bleu-training
;

View File: contrib/expected-bleu-training/ExpectedBleuOptimizer.cpp

@@ -0,0 +1,223 @@
/*
Moses - statistical machine translation system
Copyright (C) 2005-2015 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "ExpectedBleuOptimizer.h"
namespace ExpectedBleuTraining
{
void ExpectedBleuOptimizer::AddTrainingInstance(const size_t nBestSizeCount,
const std::vector<float>& sBleu,
const std::vector<double>& overallScoreUntransformed,
const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
bool maintainUpdateSet)
{
// compute xBLEU
double sumUntransformedScores = 0.0;
for (std::vector<double>::const_iterator overallScoreUntransformedIt=overallScoreUntransformed.begin();
overallScoreUntransformedIt!=overallScoreUntransformed.end(); ++overallScoreUntransformedIt)
{
sumUntransformedScores += *overallScoreUntransformedIt;
}
double xBleu = 0.0;
assert(nBestSizeCount == overallScoreUntransformed.size());
std::vector<double> p;
for (size_t i=0; i<nBestSizeCount; ++i)
{
if (sumUntransformedScores != 0) {
p.push_back( overallScoreUntransformed[i] / sumUntransformedScores );
} else {
p.push_back( 0 );
}
xBleu += p.back() * sBleu[ i ];
}
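// Accumulate the gradient of the expected BLEU objective w.r.t. each sparse
// scaling factor lambda_k. With p_i = exp(s_i) / sum_j exp(s_j) and
// overallScoreUntransformed[i] == exp(s_i), we have
//   xBLEU = sum_i p_i * sBLEU_i
//   d xBLEU / d lambda_k = sum_i p_i * N_ik * (sBLEU_i - xBLEU)
// where N_ik is the value of sparse feature k in entry i; the per-feature
// term p[i] * N * D below is exactly one summand of this gradient.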
for (size_t i=0; i<nBestSizeCount; ++i)
{
double D = sBleu[ i ] - xBleu;
for (boost::unordered_map<size_t, float>::const_iterator sparseScoreIt=sparseScore[i].begin();
sparseScoreIt!=sparseScore[i].end(); ++sparseScoreIt)
{
const size_t name = sparseScoreIt->first;
float N = sparseScoreIt->second;
if ( std::fpclassify( p[i] * N * D ) == FP_SUBNORMAL )
{
m_err << "Error: encountered subnormal value: p[i] * N * D= " << p[i] * N * D
<< " with p[i]= " << p[i] << " N= " << N << " D= " << D << '\n';
m_err.flush();
exit(1);
} else {
m_gradient[name] += p[i] * N * D;
if ( maintainUpdateSet )
{
m_updateSet.insert(name);
}
}
}
}
m_xBleu += xBleu;
}
void ExpectedBleuOptimizer::InitSGD(const std::vector<float>& sparseScalingFactor)
{
const size_t nFeatures = sparseScalingFactor.size();
m_previousSparseScalingFactor = sparseScalingFactor; // sizes the vector and copies all elements
m_gradient.resize(nFeatures);
}
float ExpectedBleuOptimizer::UpdateSGD(std::vector<float>& sparseScalingFactor,
size_t batchSize,
bool useUpdateSet)
{
float xBleu = m_xBleu / batchSize;
// update sparse scaling factors
if (useUpdateSet) {
for (std::set<size_t>::const_iterator it = m_updateSet.begin(); it != m_updateSet.end(); ++it)
{
size_t name = *it;
UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
}
m_updateSet.clear();
} else {
for (size_t name=0; name<sparseScalingFactor.size(); ++name)
{
UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
}
}
m_xBleu = 0;
m_gradient.assign(m_gradient.size(), 0); // reset accumulated gradients, keep the allocation for the next batch
return xBleu;
}
void ExpectedBleuOptimizer::UpdateSingleScalingFactorSGD(size_t name,
std::vector<float>& sparseScalingFactor,
size_t batchSize)
{
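// SGD update for a single scaling factor lambda, with learning rate
// eta = m_learningRate and regularization rho = m_regularizationParameter:
//   g = (1/xBLEU) * dxBLEU/dlambda - 2 * rho * lambda    if rho != 0
//   g = (1/batchSize) * dxBLEU/dlambda                   otherwise
//   lambda <- lambda + eta * g
// (xBLEU here is the batch-summed m_xBleu; lambda is floored to 0 afterwards
// if its magnitude drops below m_floorAbsScalingFactor.)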
// regularization
if ( m_regularizationParameter != 0 )
{
m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
} else {
// need to normalize by dividing by batchSize
m_gradient[name] /= batchSize;
}
// the actual update
sparseScalingFactor[name] += m_learningRate * m_gradient[name];
// discard scaling factors below a threshold
if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
{
sparseScalingFactor[name] = 0;
}
}
void ExpectedBleuOptimizer::InitRPROP(const std::vector<float>& sparseScalingFactor)
{
const size_t nFeatures = sparseScalingFactor.size();
m_previousSparseScalingFactor = sparseScalingFactor; // sizes the vector and copies all elements
m_previousGradient.resize(nFeatures);
m_gradient.resize(nFeatures);
m_stepSize.resize(nFeatures, m_initialStepSize);
}
float ExpectedBleuOptimizer::UpdateRPROP(std::vector<float>& sparseScalingFactor,
const size_t batchSize)
{
float xBleu = m_xBleu / batchSize;
// update sparse scaling factors
for (size_t name=0; name<sparseScalingFactor.size(); ++name)
{
// Sum of gradients. All we need is the sign. Don't need to normalize by dividing by batchSize.
// regularization
if ( m_regularizationParameter != 0 )
{
m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
}
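// RPROP step-size adaptation, per scaling factor:
//   Delta <- clip(Delta * c, [m_minStepSize, m_maxStepSize]), where
//   c = m_increaseRate if the gradient sign is unchanged,
//       m_decreaseRate if it flipped, and 1 if either sign is 0;
// then lambda <- lambda + sign(g) * Delta, except that after a sign flip
// the update is skipped and lambda reverts to its previous value.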
// step size
int sign = Sign(m_gradient[name]) * Sign(m_previousGradient[name]);
if (sign > 0) {
m_stepSize[name] *= m_increaseRate;
} else if (sign < 0) {
m_stepSize[name] *= m_decreaseRate;
}
if (m_stepSize[name] < m_minStepSize) {
m_stepSize[name] = m_minStepSize;
}
if (m_stepSize[name] > m_maxStepSize) {
m_stepSize[name] = m_maxStepSize;
}
// the actual update
m_previousGradient[name] = m_gradient[name];
if (sign >= 0) {
if (m_gradient[name] > 0) {
m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
sparseScalingFactor[name] += m_stepSize[name];
} else if (m_gradient[name] < 0) {
m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
sparseScalingFactor[name] -= m_stepSize[name];
}
} else {
sparseScalingFactor[name] = m_previousSparseScalingFactor[name];
// m_previousGradient[name] = 0;
}
// discard scaling factors below a threshold
if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
{
sparseScalingFactor[name] = 0;
}
}
m_xBleu = 0;
m_gradient.assign(m_gradient.size(), 0); // reset accumulated gradients, keep the allocation for the next batch
return xBleu;
}
}

View File: contrib/expected-bleu-training/ExpectedBleuOptimizer.h

@@ -0,0 +1,117 @@
/*
Moses - statistical machine translation system
Copyright (C) 2005-2015 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#pragma once
#include <vector>
#include <set>
#include <boost/unordered_map.hpp>
#include "util/file_stream.hh"
namespace ExpectedBleuTraining
{
class ExpectedBleuOptimizer
{
public:
ExpectedBleuOptimizer(util::FileStream& err,
float learningRate=1,
float initialStepSize=0.001,
float decreaseRate=0.5,
float increaseRate=1.2,
float minStepSize=1e-7,
float maxStepSize=1,
float floorAbsScalingFactor=0,
float regularizationParameter=0)
: m_err(err)
, m_learningRate(learningRate)
, m_initialStepSize(initialStepSize)
, m_decreaseRate(decreaseRate)
, m_increaseRate(increaseRate)
, m_minStepSize(minStepSize)
, m_maxStepSize(maxStepSize)
, m_floorAbsScalingFactor(floorAbsScalingFactor)
, m_regularizationParameter(regularizationParameter)
, m_xBleu(0)
{ }
void AddTrainingInstance(const size_t nBestSizeCount,
const std::vector<float>& sBleu,
const std::vector<double>& overallScoreUntransformed,
const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
bool maintainUpdateSet = false);
void InitSGD(const std::vector<float>& sparseScalingFactor);
float UpdateSGD(std::vector<float>& sparseScalingFactor,
size_t batchSize,
bool useUpdateSet = false);
void InitRPROP(const std::vector<float>& sparseScalingFactor);
float UpdateRPROP(std::vector<float>& sparseScalingFactor,
const size_t batchSize);
protected:
util::FileStream& m_err;
// for SGD
const float m_learningRate;
// for RPROP
const float m_initialStepSize;
const float m_decreaseRate;
const float m_increaseRate;
const float m_minStepSize;
const float m_maxStepSize;
std::vector<float> m_previousSparseScalingFactor;
std::vector<float> m_previousGradient;
std::vector<float> m_gradient;
std::vector<float> m_stepSize;
// other
const float m_floorAbsScalingFactor;
const float m_regularizationParameter;
double m_xBleu;
std::set<size_t> m_updateSet;
void UpdateSingleScalingFactorSGD(size_t name,
std::vector<float>& sparseScalingFactor,
size_t batchSize);
inline int Sign(double x)
{
if (x > 0) return 1;
if (x < 0) return -1;
return 0;
}
};
}

View File: contrib/expected-bleu-training/Jamfile

@@ -0,0 +1,2 @@
exe prepare-expected-bleu-training : PrepareExpectedBleuTraining.cpp ../../util//kenutil ;
exe train-expected-bleu : TrainExpectedBleu.cpp ExpectedBleuOptimizer.cpp ../../util//kenutil ;
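
Both targets are also registered in the top-level Jamroot (first hunk above), so they build together with the rest of Moses; from the source root, something like the following should work (a sketch, the exact invocation depends on the local Boost.Build setup):

./bjam contrib/expected-bleu-training//prepare-expected-bleu-training contrib/expected-bleu-training//train-expected-bleu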

View File: contrib/expected-bleu-training/PrepareExpectedBleuTraining.cpp

@@ -0,0 +1,222 @@
/*
Moses - statistical machine translation system
Copyright (C) 2005-2015 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <cstdlib>  // atof, atol, exit
#include <iostream> // std::cout
#include <sstream>
#include <string>
#include <vector>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>
#include <boost/program_options.hpp>
#include "util/file_stream.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
namespace po = boost::program_options;
int main(int argc, char **argv)
{
util::FileStream err(2);
std::string filenameNBestListIn, filenameFeatureNamesOut, filenameIgnoreFeatureNames;
size_t maxNBestSize;
try {
po::options_description descr("Usage");
descr.add_options()
("help,h", "produce help message")
("n-best-list,n", po::value<std::string>(&filenameNBestListIn)->required(),
"input n-best list file")
("write-feature-names-file,f", po::value<std::string>(&filenameFeatureNamesOut)->required(),
"output file for mapping between feature names and indices")
("ignore-features-file,i", po::value<std::string>(&filenameIgnoreFeatureNames)->required(),
"input file containing list of feature names to be ignored")
("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
"limit of n-best list entries to be considered")
;
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, descr), vm);
if (vm.count("help")) {
std::ostringstream os;
os << descr;
std::cout << os.str() << '\n';
exit(0);
}
po::notify(vm);
} catch(std::exception& e) {
err << "Error: " << e.what() << '\n';
err.flush();
exit(1);
}
util::FilePiece ifsNBest(filenameNBestListIn.c_str());
util::FilePiece ifsIgnoreFeatureNames(filenameIgnoreFeatureNames.c_str());
util::scoped_fd fdFeatureNames(util::CreateOrThrow(filenameFeatureNamesOut.c_str()));
util::FileStream ofsFeatureNames(fdFeatureNames.get());
util::FileStream ofsNBest(1);
boost::unordered_set<std::string> ignoreFeatureNames;
StringPiece line;
while ( ifsIgnoreFeatureNames.ReadLineOrEOF(line) )
{
if ( !line.empty() ) {
util::TokenIter<util::AnyCharacter> item(line, " \t=");
if ( item != item.end() )
{
ignoreFeatureNames.insert(item->as_string());
err << "ignoring " << *item << '\n'; // only report when a feature name was actually read
}
}
}
size_t maxFeatureNamesIdx = 0;
boost::unordered_map<std::string, size_t> featureNames;
size_t sentenceIndex = 0;
size_t nBestSizeCount = 0;
size_t globalIndex = 0;
while ( ifsNBest.ReadLineOrEOF(line) )
{
util::TokenIter<util::MultiCharacter> item(line, " ||| ");
if ( item == item.end() )
{
err << "Error: flawed content in " << filenameNBestListIn << '\n';
exit(1);
}
size_t sentenceIndexCurrent = atol( item->as_string().c_str() );
if ( sentenceIndex != sentenceIndexCurrent )
{
nBestSizeCount = 0;
sentenceIndex = sentenceIndexCurrent;
}
if ( nBestSizeCount < maxNBestSize )
{
// process n-best list entry
StringPiece scores;
StringPiece decoderScore;
for (size_t nItem=1; nItem<=3; ++nItem)
{
if ( ++item == item.end() ) {
err << "Error: flawed content in " << filenameNBestListIn << '\n';
exit(1);
}
if (nItem == 2) {
scores = *item;
}
if (nItem == 3) {
decoderScore = *item;
}
}
ofsNBest << sentenceIndex << ' '
<< decoderScore;
util::TokenIter<util::SingleCharacter> token(scores, ' ');
std::string featureNameCurrent("ERROR");
std::string featureNameCurrentBase("ERROR");
bool ignore = false;
int scoreComponentIndex = 0;
while ( token != token.end() )
{
if ( token->ends_with("=") )
{
scoreComponentIndex = 0;
featureNameCurrent = token->substr(0,token->size()-1).as_string();
size_t idx = featureNameCurrent.find_first_of('_');
if ( idx == std::string::npos ) {
featureNameCurrentBase = featureNameCurrent;
} else {
featureNameCurrentBase = featureNameCurrent.substr(0,idx+1);
}
ignore = false;
if ( ignoreFeatureNames.find(featureNameCurrentBase) != ignoreFeatureNames.end() )
{
ignore = true;
} else {
if ( (featureNameCurrent.compare(featureNameCurrentBase)) &&
(ignoreFeatureNames.find(featureNameCurrent) != ignoreFeatureNames.end()) )
{
ignore = true;
}
}
}
else
{
if ( !ignore )
{
float featureValueCurrent = atof( token->as_string().c_str() );
// distinguish the components of a multi-valued dense feature: "tm+1", "tm+2", ...
std::string featureNameComponent = featureNameCurrent;
if ( scoreComponentIndex > 0 )
{
std::ostringstream oss;
oss << '+' << scoreComponentIndex;
featureNameComponent.append(oss.str());
}
if ( featureValueCurrent != 0 )
{
boost::unordered_map<std::string, size_t>::iterator featureName = featureNames.find(featureNameComponent);
if ( featureName == featureNames.end() )
{
std::pair< boost::unordered_map<std::string, size_t>::iterator, bool> inserted =
featureNames.insert( std::make_pair(featureNameComponent, maxFeatureNamesIdx) );
++maxFeatureNamesIdx;
featureName = inserted.first;
}
ofsNBest << ' ' << featureName->second // feature name index
<< ' ' << *token; // feature value
}
++scoreComponentIndex;
}
}
++token;
}
ofsNBest << '\n';
++nBestSizeCount;
}
++globalIndex;
}
ofsFeatureNames << maxFeatureNamesIdx << '\n';
for (boost::unordered_map<std::string, size_t>::const_iterator featureNamesIt=featureNames.begin();
featureNamesIt!=featureNames.end(); ++featureNamesIt)
{
ofsFeatureNames << featureNamesIt->second << ' ' << featureNamesIt->first << '\n';
}
}
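
For reference, the tool writes the prepared n-best list to stdout, one line per retained n-best entry: sentence index and decoder score, followed by (feature index, feature value) pairs for every non-zero, non-ignored feature. The feature-names file starts with the feature count, then maps each index back to a name. A made-up illustration (all names and values hypothetical):

prepared n-best list (stdout):

0 -104.213 0 0.5 2 1
0 -104.878 0 0.5 1 1

feature-names file:

3
0 tm
1 tm+1
2 lm

train-expected-bleu consumes both of these files.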

View File: contrib/expected-bleu-training/TrainExpectedBleu.cpp

@@ -0,0 +1,379 @@
/*
Moses - statistical machine translation system
Copyright (C) 2005-2015 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "ExpectedBleuOptimizer.h"
#include "util/file_stream.hh"
#include "util/file_piece.hh"
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
#include <cmath>    // std::exp
#include <cstdlib>  // atof, atol, exit
#include <sstream>
#include <string>
#include <vector>
#include <boost/program_options.hpp>
using namespace ExpectedBleuTraining;
namespace po = boost::program_options;
int main(int argc, char **argv) {
util::FileStream out(1);
util::FileStream err(2);
size_t maxNBestSize;
size_t iterationLimit;
std::string filenameSBleu, filenameNBestList, filenameFeatureNames, filenameInitialWeights;
bool ignoreDecoderScore;
float learningRate;
float initialStepSize;
float decreaseRate;
float increaseRate;
float minStepSize;
float maxStepSize;
float floorAbsScalingFactor;
float regularizationParameter;
bool printZeroWeights;
bool miniBatches;
std::string optimizerTypeStr;
size_t optimizerType = 0;
#define EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP 1
#define EXPECTED_BLEU_OPTIMIZER_TYPE_SGD 2
try {
po::options_description descr("Usage");
descr.add_options()
("help,h", "produce help message")
("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
"limit of n-best list entries to be considered for training")
("iterations,i", po::value<size_t>(&iterationLimit)->default_value(50),
"number of training iterations")
("sbleu-file,b", po::value<std::string>(&filenameSBleu)->required(),
"file containing sentence-level BLEU scores for all n-best list entries")
("prepared-n-best-list,n", po::value<std::string>(&filenameNBestList)->required(),
"input n-best list file, in prepared format for expected BLEU training")
("feature-name-file,f", po::value<std::string>(&filenameFeatureNames)->required(),
"file containing mapping between feature names and indices")
("initial-weights-file,w", po::value<std::string>(&filenameInitialWeights)->default_value(""),
"file containing start values for scaling factors (optional)")
("ignore-decoder-score", boost::program_options::value<bool>(&ignoreDecoderScore)->default_value(0),
"exclude decoder score from computation of posterior probability")
("regularization", boost::program_options::value<float>(&regularizationParameter)->default_value(0), // e.g. 1e-5
"regularization parameter; suggested value range: [1e-8,1e-5]")
("learning-rate", boost::program_options::value<float>(&learningRate)->default_value(1),
"learning rate for the SGD optimizer")
("floor", boost::program_options::value<float>(&floorAbsScalingFactor)->default_value(0), // e.g. 1e-7
"set scaling factor to 0 if below this absolute value after update")
("initial-step-size", boost::program_options::value<float>(&initialStepSize)->default_value(0.001), // TODO: try 0.01 and 0.1
"initial step size for the RPROP optimizer")
("decrease-rate", boost::program_options::value<float>(&decreaseRate)->default_value(0.5),
"decrease rate for the RPROP optimizer")
("increase-rate", boost::program_options::value<float>(&increaseRate)->default_value(1.2),
"increase rate for the RPROP optimizer")
("min-step-size", boost::program_options::value<float>(&minStepSize)->default_value(1e-7),
"minimum step size for the RPROP optimizer")
("max-step-size", boost::program_options::value<float>(&maxStepSize)->default_value(1),
"maximum step size for the RPROP optimizer")
("print-zero-weights", boost::program_options::value<bool>(&printZeroWeights)->default_value(0),
"output scaling factors even if they are trained to 0")
("optimizer", po::value<std::string>(&optimizerTypeStr)->default_value("RPROP"),
"optimizer type used for training (known algorithms: RPROP, SGD)")
("mini-batches", boost::program_options::value<bool>(&miniBatches)->default_value(0),
"update after every single sentence (SGD only)")
;
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, descr), vm);
if (vm.count("help")) {
std::ostringstream os;
os << descr;
out << os.str() << '\n';
out.flush();
exit(0);
}
po::notify(vm);
} catch(std::exception& e) {
err << "Error: " << e.what() << '\n';
err.flush();
exit(1);
}
if ( !optimizerTypeStr.compare("rprop") || !optimizerTypeStr.compare("RPROP") ) {
optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP;
} else if ( !optimizerTypeStr.compare("sgd") || !optimizerTypeStr.compare("SGD") ) {
optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_SGD;
} else {
err << "Error: unknown optimizer type: \"" << optimizerTypeStr << "\" (known optimizers: rprop, sgd) " << '\n';
err.flush();
exit(1);
}
util::FilePiece ifsFeatureNames(filenameFeatureNames.c_str());
StringPiece lineFeatureName;
if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) )
{
err << "Error: flawed content in " << filenameFeatureNames << '\n';
err.flush();
exit(1);
}
size_t maxFeatureNamesIdx = atol( lineFeatureName.as_string().c_str() );
std::vector<std::string> featureNames(maxFeatureNamesIdx);
boost::unordered_map<std::string, size_t> featureIndexes;
for (size_t i=0; i<maxFeatureNamesIdx; ++i)
{
if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) ) {
err << "Error: flawed content in " << filenameFeatureNames << '\n';
err.flush();
exit(1);
}
util::TokenIter<util::SingleCharacter> token(lineFeatureName, ' ');
size_t featureIndexCurrent = atol( token->as_string().c_str() );
token++;
featureNames[featureIndexCurrent] = token->as_string();
featureIndexes[token->as_string()] = featureIndexCurrent;
}
std::vector<float> sparseScalingFactor(maxFeatureNamesIdx);
std::vector< boost::unordered_map<size_t, float> > sparseScore(maxNBestSize);
// read initial weights, if any given
if ( filenameInitialWeights.length() != 0 )
{
util::FilePiece ifsInitialWeights(filenameInitialWeights.c_str());
StringPiece lineInitialWeight;
if ( !ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) ) {
err << "Error: flawed content in " << filenameInitialWeights << '\n';
err.flush();
exit(1);
}
do {
util::TokenIter<util::SingleCharacter> token(lineInitialWeight, ' ');
boost::unordered_map<std::string, size_t>::const_iterator found = featureIndexes.find(token->as_string());
if ( found == featureIndexes.end() ) {
err << "Error: flawed content in " << filenameInitialWeights << " (unkown feature name \"" << token->as_string() << "\")" << '\n';
err.flush();
exit(1);
}
token++;
sparseScalingFactor[found->second] = atof( token->as_string().c_str() );
} while ( ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) );
}
// train
ExpectedBleuOptimizer optimizer(err,
learningRate,
initialStepSize,
decreaseRate,
increaseRate,
minStepSize,
maxStepSize,
floorAbsScalingFactor,
regularizationParameter);
if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
{
optimizer.InitRPROP(sparseScalingFactor);
} else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
optimizer.InitSGD(sparseScalingFactor);
} else {
err << "Error: unknown optimizer type" << '\n';
err.flush();
exit(1);
}
for (size_t nIteration=1; nIteration<=iterationLimit; ++nIteration)
{
util::FilePiece ifsSBleu(filenameSBleu.c_str());
util::FilePiece ifsNBest(filenameNBestList.c_str());
out << "### ITERATION " << nIteration << '\n' << '\n';
size_t sentenceIndex = 0;
size_t batchSize = 0;
size_t nBestSizeCount = 0;
size_t globalIndex = 0;
StringPiece lineNBest;
std::vector<double> overallScoreUntransformed;
std::vector<float> sBleu;
float xBleu = 0;
// double expPrecisionCorrection = 0.0;
while ( ifsNBest.ReadLineOrEOF(lineNBest) )
{
util::TokenIter<util::SingleCharacter> token(lineNBest, ' ');
if ( token == token.end() )
{
err << "Error: flawed content in " << filenameNBestList << '\n';
err.flush();
exit(1);
}
size_t sentenceIndexCurrent = atol( token->as_string().c_str() );
token++;
if ( sentenceIndex != sentenceIndexCurrent )
{
if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
{
optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore );
} else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches );
if ( miniBatches ) {
xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
// out << "ITERATION " << nIteration << " SENTENCE " << sentenceIndex << " XBLEUSUM= " << xBleu << '\n';
// for (size_t i=0; i<sparseScalingFactor.size(); ++i)
// {
// if ( (sparseScalingFactor[i] != 0) || printZeroWeights )
// {
// out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
// }
// }
// out << '\n';
// out.flush();
}
} else {
err << "Error: unknown optimizer type" << '\n';
err.flush();
exit(1);
}
for (size_t i=0; i<nBestSizeCount; ++i) {
sparseScore[i].clear();
}
nBestSizeCount = 0;
overallScoreUntransformed.clear();
sBleu.clear();
sentenceIndex = sentenceIndexCurrent;
++batchSize;
}
StringPiece lineSBleu;
if ( !ifsSBleu.ReadLineOrEOF(lineSBleu) )
{
err << "Error: insufficient number of lines in " << filenameSBleu << '\n';
err.flush();
exit(1);
}
if ( nBestSizeCount < maxNBestSize )
{
// retrieve sBLEU
float sBleuCurrent = atof( lineSBleu.as_string().c_str() );
sBleu.push_back(sBleuCurrent);
// process n-best list entry
if ( token == token.end() )
{
err << "Error: flawed content in " << filenameNBestList << '\n';
err.flush();
exit(1);
}
double scoreCurrent = 0;
if ( !ignoreDecoderScore )
{
scoreCurrent = atof( token->as_string().c_str() ); // decoder score
}
token++;
// if ( nBestSizeCount == 0 ) // best translation (first n-best list entry for the current sentence / a new mini-batch)
// {
// expPrecisionCorrection = std::floor ( scoreCurrent ); // decoder score of first-best
// }
while (token != token.end())
{
size_t featureNameCurrent = atol( token->as_string().c_str() );
token++;
float featureValueCurrent = atof( token->as_string().c_str() );
sparseScore[nBestSizeCount].insert(std::make_pair(featureNameCurrent, featureValueCurrent));
scoreCurrent += sparseScalingFactor[featureNameCurrent] * featureValueCurrent;
token++;
}
// overallScoreUntransformed.push_back( std::exp(scoreCurrent - expPrecisionCorrection) );
overallScoreUntransformed.push_back( std::exp(scoreCurrent) );
++nBestSizeCount;
}
++globalIndex;
}
++batchSize; // count the last sentence of the corpus, which never triggers the sentence-change branch above
if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
{
optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore ); // last sentence in the corpus
xBleu = optimizer.UpdateRPROP( sparseScalingFactor, batchSize );
out << "xBLEU= " << xBleu << '\n';
} else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches ); // last sentence in the corpus
if ( miniBatches ) {
xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
xBleu /= batchSize;
} else {
xBleu = optimizer.UpdateSGD( sparseScalingFactor, batchSize );
}
out << "xBLEU= " << xBleu << '\n';
} else {
err << "Error: unknown optimizer type" << '\n';
err.flush();
exit(1);
}
for (size_t i=0; i<nBestSizeCount; ++i) {
sparseScore[i].clear();
}
nBestSizeCount = 0;
overallScoreUntransformed.clear();
sBleu.clear();
out << '\n';
for (size_t i=0; i<sparseScalingFactor.size(); ++i)
{
if ( (sparseScalingFactor[i] != 0) || printZeroWeights )
{
out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
}
}
out << '\n';
out.flush();
}
}
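
Putting the pieces together, a typical run of the two tools might look like this (file names hypothetical; the options are the ones defined above):

prepare-expected-bleu-training \
-n run1.best100.out \
-f feature-names.txt \
-i ignore-features.txt \
-l 100 \
> prepared-nbest.txt

train-expected-bleu \
-b sbleu.txt \
-n prepared-nbest.txt \
-f feature-names.txt \
--optimizer RPROP \
-i 50 \
> training.log

The sBLEU file is expected to hold one sentence-level BLEU score per line, aligned with the lines of the prepared n-best list; the trained scaling factors appear on stdout as 'ITERATION <n> WEIGHT <name> <value>' lines.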