mirror of https://github.com/moses-smt/mosesdecoder.git (synced 2024-12-27 22:14:57 +03:00)

maximum expected BLEU trainer

This commit is contained in:
parent 758b270447
commit 0fbb3973c3
Jamroot (2 lines changed)

@@ -298,6 +298,8 @@ contrib/server//mosesserver
 mm
 rephraser
 contrib/c++tokenizer//tokenizer
+contrib/expected-bleu-training//train-expected-bleu
+contrib/expected-bleu-training//prepare-expected-bleu-training
 ;
contrib/expected-bleu-training/ExpectedBleuOptimizer.cpp (new file, 223 lines)

@@ -0,0 +1,223 @@
/*
 Moses - statistical machine translation system
 Copyright (C) 2005-2015 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/


#include "ExpectedBleuOptimizer.h"

#include <cassert>  // assert
#include <cmath>    // std::fpclassify, fabs
#include <cstdlib>  // exit
#include <cstring>  // memcpy


namespace ExpectedBleuTraining
{


void ExpectedBleuOptimizer::AddTrainingInstance(const size_t nBestSizeCount,
                                                const std::vector<float>& sBleu,
                                                const std::vector<double>& overallScoreUntransformed,
                                                const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
                                                bool maintainUpdateSet)
{
  // compute xBLEU
  double sumUntransformedScores = 0.0;
  for (std::vector<double>::const_iterator overallScoreUntransformedIt=overallScoreUntransformed.begin();
       overallScoreUntransformedIt!=overallScoreUntransformed.end(); ++overallScoreUntransformedIt)
  {
    sumUntransformedScores += *overallScoreUntransformedIt;
  }

  double xBleu = 0.0;
  assert(nBestSizeCount == overallScoreUntransformed.size());
  std::vector<double> p;
  for (size_t i=0; i<nBestSizeCount; ++i)
  {
    if (sumUntransformedScores != 0) {
      p.push_back( overallScoreUntransformed[i] / sumUntransformedScores );
    } else {
      p.push_back( 0 );
    }
    xBleu += p.back() * sBleu[ i ];
  }
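
  // A sketch of the underlying math: with the posterior
  // p_i = exp(score_i) / sum_j exp(score_j) and sentence-level BLEU b_i,
  //   xBLEU = sum_i p_i * b_i
  //   d xBLEU / d lambda_k = sum_i p_i * N_ik * (b_i - xBLEU),
  // where N_ik is the value of sparse feature k in hypothesis i.
  // The loop below accumulates exactly this per-feature term p[i] * N * D.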

  for (size_t i=0; i<nBestSizeCount; ++i)
  {
    double D = sBleu[ i ] - xBleu;
    for (boost::unordered_map<size_t, float>::const_iterator sparseScoreIt=sparseScore[i].begin();
         sparseScoreIt!=sparseScore[i].end(); ++sparseScoreIt)
    {
      const size_t name = sparseScoreIt->first;
      float N = sparseScoreIt->second;
      if ( std::fpclassify( p[i] * N * D ) == FP_SUBNORMAL )
      {
        m_err << "Error: encountered subnormal value: p[i] * N * D= " << p[i] * N * D
              << " with p[i]= " << p[i] << " N= " << N << " D= " << D << '\n';
        m_err.flush();
        exit(1);
      } else {
        m_gradient[name] += p[i] * N * D;
        if ( maintainUpdateSet )
        {
          m_updateSet.insert(name);
        }
      }
    }
  }

  m_xBleu += xBleu;
}


void ExpectedBleuOptimizer::InitSGD(const std::vector<float>& sparseScalingFactor)
{
  const size_t nFeatures = sparseScalingFactor.size();
  m_previousSparseScalingFactor.resize(nFeatures);
  memcpy(&m_previousSparseScalingFactor.at(0), &sparseScalingFactor.at(0), nFeatures * sizeof(float));
  m_gradient.resize(nFeatures);
}


float ExpectedBleuOptimizer::UpdateSGD(std::vector<float>& sparseScalingFactor,
                                       size_t batchSize,
                                       bool useUpdateSet)
{
  float xBleu = m_xBleu / batchSize;

  // update sparse scaling factors

  if (useUpdateSet) {

    // touch only the factors whose features actually fired in this batch
    for (std::set<size_t>::const_iterator it = m_updateSet.begin(); it != m_updateSet.end(); ++it)
    {
      size_t name = *it;
      UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
    }

    m_updateSet.clear();

  } else {

    for (size_t name=0; name<sparseScalingFactor.size(); ++name)
    {
      UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
    }

  }

  m_xBleu = 0;
  m_gradient.assign(m_gradient.size(), 0); // reset the accumulated gradient without shrinking the vector
  return xBleu;
}


void ExpectedBleuOptimizer::UpdateSingleScalingFactorSGD(size_t name,
                                                         std::vector<float>& sparseScalingFactor,
                                                         size_t batchSize)
{
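  // A note on normalization: with regularization enabled, the accumulated
  // gradient is divided by the accumulated xBLEU sum and an L2 penalty term
  // (the derivative of m_regularizationParameter * lambda^2) is subtracted;
  // without regularization, the gradient is averaged over the batch instead.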
  // regularization
  if ( m_regularizationParameter != 0 )
  {
    m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
  } else {
    // need to normalize by dividing by batchSize
    m_gradient[name] /= batchSize;
  }

  // the actual update
  sparseScalingFactor[name] += m_learningRate * m_gradient[name];

  // discard scaling factors below a threshold
  if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
  {
    sparseScalingFactor[name] = 0;
  }
}


void ExpectedBleuOptimizer::InitRPROP(const std::vector<float>& sparseScalingFactor)
{
  const size_t nFeatures = sparseScalingFactor.size();
  m_previousSparseScalingFactor.resize(nFeatures);
  memcpy(&m_previousSparseScalingFactor.at(0), &sparseScalingFactor.at(0), nFeatures * sizeof(float));
  m_previousGradient.resize(nFeatures);
  m_gradient.resize(nFeatures);
  m_stepSize.resize(nFeatures, m_initialStepSize);
}


float ExpectedBleuOptimizer::UpdateRPROP(std::vector<float>& sparseScalingFactor,
                                         const size_t batchSize)
{
  float xBleu = m_xBleu / batchSize;

  // update sparse scaling factors

  for (size_t name=0; name<sparseScalingFactor.size(); ++name)
  {
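    // RPROP in brief: each factor keeps its own step size, grown by
    // m_increaseRate while successive gradients agree in sign and shrunk by
    // m_decreaseRate on a sign flip; on a flip the factor is also reset to
    // its previous value (backtracking) and no step is taken this round.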
    // Sum of gradients. All we need is the sign. Don't need to normalize by dividing by batchSize.

    // regularization
    if ( m_regularizationParameter != 0 )
    {
      m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
    }

    // step size
    int sign = Sign(m_gradient[name]) * Sign(m_previousGradient[name]);
    if (sign > 0) {
      m_stepSize[name] *= m_increaseRate;
    } else if (sign < 0) {
      m_stepSize[name] *= m_decreaseRate;
    }
    if (m_stepSize[name] < m_minStepSize) {
      m_stepSize[name] = m_minStepSize;
    }
    if (m_stepSize[name] > m_maxStepSize) {
      m_stepSize[name] = m_maxStepSize;
    }

    // the actual update
    m_previousGradient[name] = m_gradient[name];
    if (sign >= 0) {
      if (m_gradient[name] > 0) {
        m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
        sparseScalingFactor[name] += m_stepSize[name];
      } else if (m_gradient[name] < 0) {
        m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
        sparseScalingFactor[name] -= m_stepSize[name];
      }
    } else {
      sparseScalingFactor[name] = m_previousSparseScalingFactor[name];
      // m_previousGradient[name] = 0;
    }

    // discard scaling factors below a threshold
    if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
    {
      sparseScalingFactor[name] = 0;
    }
  }

  m_xBleu = 0;
  m_gradient.assign(m_gradient.size(), 0); // reset the accumulated gradient without shrinking the vector
  return xBleu;
}


}

contrib/expected-bleu-training/ExpectedBleuOptimizer.h (new file, 117 lines)

@@ -0,0 +1,117 @@
/*
 Moses - statistical machine translation system
 Copyright (C) 2005-2015 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/


#pragma once

#include <cstddef> // size_t
#include <vector>
#include <set>
#include <boost/unordered_map.hpp>
#include "util/file_stream.hh"


namespace ExpectedBleuTraining
{
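
// Typical call sequence (as used by TrainExpectedBleu.cpp): construct with an
// error stream and the tuning hyper-parameters, call InitRPROP() or InitSGD()
// once with the initial scaling factors, then for each batch feed every
// sentence's n-best data to AddTrainingInstance() and apply the accumulated
// gradient with UpdateRPROP() or UpdateSGD().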

class ExpectedBleuOptimizer
{
public:

  ExpectedBleuOptimizer(util::FileStream& err,
                        float learningRate=1,
                        float initialStepSize=0.001,
                        float decreaseRate=0.5,
                        float increaseRate=1.2,
                        float minStepSize=1e-7,
                        float maxStepSize=1,
                        float floorAbsScalingFactor=0,
                        float regularizationParameter=0)
    : m_err(err)
    , m_learningRate(learningRate)
    , m_initialStepSize(initialStepSize)
    , m_decreaseRate(decreaseRate)
    , m_increaseRate(increaseRate)
    , m_minStepSize(minStepSize)
    , m_maxStepSize(maxStepSize)
    , m_floorAbsScalingFactor(floorAbsScalingFactor)
    , m_regularizationParameter(regularizationParameter)
    , m_xBleu(0)
  { }

  void AddTrainingInstance(const size_t nBestSizeCount,
                           const std::vector<float>& sBleu,
                           const std::vector<double>& overallScoreUntransformed,
                           const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
                           bool maintainUpdateSet = false);

  void InitSGD(const std::vector<float>& sparseScalingFactor);

  float UpdateSGD(std::vector<float>& sparseScalingFactor,
                  size_t batchSize,
                  bool useUpdateSet = false);

  void InitRPROP(const std::vector<float>& sparseScalingFactor);

  float UpdateRPROP(std::vector<float>& sparseScalingFactor,
                    const size_t batchSize);

protected:

  util::FileStream& m_err;

  // for SGD
  const float m_learningRate;

  // for RPROP
  const float m_initialStepSize;
  const float m_decreaseRate;
  const float m_increaseRate;
  const float m_minStepSize;
  const float m_maxStepSize;

  std::vector<float> m_previousSparseScalingFactor;
  std::vector<float> m_previousGradient;
  std::vector<float> m_gradient;
  std::vector<float> m_stepSize;

  // other
  const float m_floorAbsScalingFactor;
  const float m_regularizationParameter;

  double m_xBleu;

  std::set<size_t> m_updateSet;


  void UpdateSingleScalingFactorSGD(size_t name,
                                    std::vector<float>& sparseScalingFactor,
                                    size_t batchSize);


  inline int Sign(double x)
  {
    if (x > 0) return 1;
    if (x < 0) return -1;
    return 0;
  }
};

}

contrib/expected-bleu-training/Jamfile (new file, 2 lines)

@@ -0,0 +1,2 @@

exe prepare-expected-bleu-training : PrepareExpectedBleuTraining.cpp ../../util//kenutil ;
exe train-expected-bleu : TrainExpectedBleu.cpp ExpectedBleuOptimizer.cpp ../../util//kenutil ;
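# Both targets link KenLM's util library (kenutil), which provides the file and string utilities they use.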

contrib/expected-bleu-training/PrepareExpectedBleuTraining.cpp (new file, 222 lines)

@@ -0,0 +1,222 @@
/*
 Moses - statistical machine translation system
 Copyright (C) 2005-2015 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/


#include <vector>
#include <string>
#include <sstream>
#include <iostream> // std::cout
#include <cstdlib>  // atol, atof, exit
#include <boost/algorithm/string/predicate.hpp>
#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>
#include <boost/program_options.hpp>
#include "util/file_stream.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"


namespace po = boost::program_options;


int main(int argc, char **argv)
{
  util::FileStream err(2);

  std::string filenameNBestListIn, filenameFeatureNamesOut, filenameIgnoreFeatureNames;
  size_t maxNBestSize;

  try {

    po::options_description descr("Usage");
    descr.add_options()
    ("help,h", "produce help message")
    ("n-best-list,n", po::value<std::string>(&filenameNBestListIn)->required(),
     "input n-best list file")
    ("write-feature-names-file,f", po::value<std::string>(&filenameFeatureNamesOut)->required(),
     "output file for mapping between feature names and indices")
    ("ignore-features-file,i", po::value<std::string>(&filenameIgnoreFeatureNames)->required(),
     "input file containing list of feature names to be ignored")
    ("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
     "limit of n-best list entries to be considered")
    ;

    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, descr), vm);

    if (vm.count("help")) {
      std::ostringstream os;
      os << descr;
      std::cout << os.str() << '\n';
      exit(0);
    }

    po::notify(vm);

  } catch(std::exception& e) {

    err << "Error: " << e.what() << '\n';
    err.flush();
    exit(1);
  }

  util::FilePiece ifsNBest(filenameNBestListIn.c_str());
  util::FilePiece ifsIgnoreFeatureNames(filenameIgnoreFeatureNames.c_str());
  util::scoped_fd fdFeatureNames(util::CreateOrThrow(filenameFeatureNamesOut.c_str()));
  util::FileStream ofsFeatureNames(fdFeatureNames.get());
  util::FileStream ofsNBest(1);

  boost::unordered_set<std::string> ignoreFeatureNames;
  StringPiece line;

  while ( ifsIgnoreFeatureNames.ReadLineOrEOF(line) )
  {
    if ( !line.empty() ) {
      util::TokenIter<util::AnyCharacter> item(line, " \t=");
      if ( item != item.end() )
      {
        ignoreFeatureNames.insert(item->as_string());
        err << "ignoring " << *item << '\n';
      }
    }
  }

  size_t maxFeatureNamesIdx = 0;
  boost::unordered_map<std::string, size_t> featureNames;

  size_t sentenceIndex = 0;
  size_t nBestSizeCount = 0;
  size_t globalIndex = 0;
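
  // Each emitted line has the form (one n-best entry per line):
  //   <sentenceIndex> <decoderScore> [<featureIndex> <featureValue>]...
  // which is the "prepared" format read back by train-expected-bleu.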

  while ( ifsNBest.ReadLineOrEOF(line) )
  {
    util::TokenIter<util::MultiCharacter> item(line, " ||| ");

    if ( item == item.end() )
    {
      err << "Error: flawed content in " << filenameNBestListIn << '\n';
      err.flush();
      exit(1);
    }

    size_t sentenceIndexCurrent = atol( item->as_string().c_str() );

    if ( sentenceIndex != sentenceIndexCurrent )
    {
      nBestSizeCount = 0;
      sentenceIndex = sentenceIndexCurrent;
    }

    if ( nBestSizeCount < maxNBestSize )
    {
      // process n-best list entry

      StringPiece scores;
      StringPiece decoderScore;
      for (size_t nItem=1; nItem<=3; ++nItem)
      {
        if ( ++item == item.end() ) {
          err << "Error: flawed content in " << filenameNBestListIn << '\n';
          err.flush();
          exit(1);
        }
        if (nItem == 2) {
          scores = *item;
        }
        if (nItem == 3) {
          decoderScore = *item;
        }
      }

      ofsNBest << sentenceIndex << ' '
               << decoderScore;

      util::TokenIter<util::SingleCharacter> token(scores, ' ');
      std::string featureNameCurrent("ERROR");
      std::string featureNameCurrentBase("ERROR");
      bool ignore = false;
      int scoreComponentIndex = 0;

      while ( token != token.end() )
      {
        if ( token->ends_with("=") )
        {
          scoreComponentIndex = 0;
          featureNameCurrent = token->substr(0,token->size()-1).as_string();
          size_t idx = featureNameCurrent.find_first_of('_');
          if ( idx == std::string::npos ) {
            featureNameCurrentBase = featureNameCurrent;
          } else {
            featureNameCurrentBase = featureNameCurrent.substr(0,idx+1);
          }
          ignore = false;
          if ( ignoreFeatureNames.find(featureNameCurrentBase) != ignoreFeatureNames.end() )
          {
            ignore = true;
          } else {
            if ( (featureNameCurrent.compare(featureNameCurrentBase)) &&
                 (ignoreFeatureNames.find(featureNameCurrent) != ignoreFeatureNames.end()) )
            {
              ignore = true;
            }
          }
        }
        else
        {
          if ( !ignore )
          {
            float featureValueCurrent = atof( token->as_string().c_str() );
            if ( scoreComponentIndex > 0 )
            {
              // disambiguate the components of multi-valued dense features
              std::ostringstream oss;
              oss << scoreComponentIndex;
              featureNameCurrent.append("+");
              featureNameCurrent.append(oss.str());
            }
            if ( featureValueCurrent != 0 )
            {
              boost::unordered_map<std::string, size_t>::iterator featureName = featureNames.find(featureNameCurrent);

              if ( featureName == featureNames.end() )
              {
                std::pair< boost::unordered_map<std::string, size_t>::iterator, bool> inserted =
                  featureNames.insert( std::make_pair(featureNameCurrent, maxFeatureNamesIdx) );
                ++maxFeatureNamesIdx;
                featureName = inserted.first;
              }

              ofsNBest << ' ' << featureName->second // feature name index
                       << ' ' << *token;             // feature value
            }
            ++scoreComponentIndex;
          }
        }
        ++token;
      }
      ofsNBest << '\n';
      ++nBestSizeCount;
    }
    ++globalIndex;
  }

  ofsFeatureNames << maxFeatureNamesIdx << '\n';
  for (boost::unordered_map<std::string, size_t>::const_iterator featureNamesIt=featureNames.begin();
       featureNamesIt!=featureNames.end(); ++featureNamesIt)
  {
    ofsFeatureNames << featureNamesIt->second << ' ' << featureNamesIt->first << '\n';
  }

}

contrib/expected-bleu-training/TrainExpectedBleu.cpp (new file, 379 lines)

@@ -0,0 +1,379 @@
/*
 Moses - statistical machine translation system
 Copyright (C) 2005-2015 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/


#include "ExpectedBleuOptimizer.h"
#include "util/file_stream.hh"
#include "util/file_piece.hh"
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"

#include <sstream>
#include <cmath>   // std::exp
#include <cstdlib> // atol, atof, exit
#include <boost/program_options.hpp>

using namespace ExpectedBleuTraining;
namespace po = boost::program_options;


int main(int argc, char **argv) {

  util::FileStream out(1);
  util::FileStream err(2);

  size_t maxNBestSize;
  size_t iterationLimit;
  std::string filenameSBleu, filenameNBestList, filenameFeatureNames, filenameInitialWeights;

  bool ignoreDecoderScore;

  float learningRate;
  float initialStepSize;
  float decreaseRate;
  float increaseRate;
  float minStepSize;
  float maxStepSize;
  float floorAbsScalingFactor;
  float regularizationParameter;
  bool printZeroWeights;
  bool miniBatches;
  std::string optimizerTypeStr;
  size_t optimizerType = 0;
#define EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP 1
#define EXPECTED_BLEU_OPTIMIZER_TYPE_SGD 2

  try {

    po::options_description descr("Usage");
    descr.add_options()
    ("help,h", "produce help message")
    ("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
     "limit of n-best list entries to be considered for training")
    ("iterations,i", po::value<size_t>(&iterationLimit)->default_value(50),
     "number of training iterations")
    ("sbleu-file,b", po::value<std::string>(&filenameSBleu)->required(),
     "file containing sentence-level BLEU scores for all n-best list entries")
    ("prepared-n-best-list,n", po::value<std::string>(&filenameNBestList)->required(),
     "input n-best list file, in prepared format for expected BLEU training")
    ("feature-name-file,f", po::value<std::string>(&filenameFeatureNames)->required(),
     "file containing mapping between feature names and indices")
    ("initial-weights-file,w", po::value<std::string>(&filenameInitialWeights)->default_value(""),
     "file containing start values for scaling factors (optional)")
    ("ignore-decoder-score", po::value<bool>(&ignoreDecoderScore)->default_value(0),
     "exclude decoder score from computation of posterior probability")
    ("regularization", po::value<float>(&regularizationParameter)->default_value(0), // e.g. 1e-5
     "regularization parameter; suggested value range: [1e-8,1e-5]")
    ("learning-rate", po::value<float>(&learningRate)->default_value(1),
     "learning rate for the SGD optimizer")
    ("floor", po::value<float>(&floorAbsScalingFactor)->default_value(0), // e.g. 1e-7
     "set scaling factor to 0 if below this absolute value after update")
    ("initial-step-size", po::value<float>(&initialStepSize)->default_value(0.001), // TODO: try 0.01 and 0.1
     "initial step size for the RPROP optimizer")
    ("decrease-rate", po::value<float>(&decreaseRate)->default_value(0.5),
     "decrease rate for the RPROP optimizer")
    ("increase-rate", po::value<float>(&increaseRate)->default_value(1.2),
     "increase rate for the RPROP optimizer")
    ("min-step-size", po::value<float>(&minStepSize)->default_value(1e-7),
     "minimum step size for the RPROP optimizer")
    ("max-step-size", po::value<float>(&maxStepSize)->default_value(1),
     "maximum step size for the RPROP optimizer")
    ("print-zero-weights", po::value<bool>(&printZeroWeights)->default_value(0),
     "output scaling factors even if they are trained to 0")
    ("optimizer", po::value<std::string>(&optimizerTypeStr)->default_value("RPROP"),
     "optimizer type used for training (known algorithms: RPROP, SGD)")
    ("mini-batches", po::value<bool>(&miniBatches)->default_value(0),
     "update after every single sentence (SGD only)")
    ;

    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, descr), vm);

    if (vm.count("help")) {
      std::ostringstream os;
      os << descr;
      out << os.str() << '\n';
      out.flush();
      exit(0);
    }

    po::notify(vm);

  } catch(std::exception& e) {

    err << "Error: " << e.what() << '\n';
    err.flush();
    exit(1);
  }

  if ( !optimizerTypeStr.compare("rprop") || !optimizerTypeStr.compare("RPROP") ) {
    optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP;
  } else if ( !optimizerTypeStr.compare("sgd") || !optimizerTypeStr.compare("SGD") ) {
    optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_SGD;
  } else {
    err << "Error: unknown optimizer type: \"" << optimizerTypeStr << "\" (known optimizers: rprop, sgd)" << '\n';
    err.flush();
    exit(1);
  }
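
  // The feature-names file (as written by prepare-expected-bleu-training)
  // starts with the total number of features, followed by one
  // "<index> <name>" pair per line.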

  util::FilePiece ifsFeatureNames(filenameFeatureNames.c_str());

  StringPiece lineFeatureName;
  if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) )
  {
    err << "Error: flawed content in " << filenameFeatureNames << '\n';
    err.flush();
    exit(1);
  }
  size_t maxFeatureNamesIdx = atol( lineFeatureName.as_string().c_str() );

  std::vector<std::string> featureNames(maxFeatureNamesIdx);
  boost::unordered_map<std::string, size_t> featureIndexes;
  for (size_t i=0; i<maxFeatureNamesIdx; ++i)
  {
    if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) ) {
      err << "Error: flawed content in " << filenameFeatureNames << '\n';
      err.flush();
      exit(1);
    }
    util::TokenIter<util::SingleCharacter> token(lineFeatureName, ' ');
    size_t featureIndexCurrent = atol( token->as_string().c_str() );
    token++;
    featureNames[featureIndexCurrent] = token->as_string();
    featureIndexes[token->as_string()] = featureIndexCurrent;
  }


  std::vector<float> sparseScalingFactor(maxFeatureNamesIdx);
  std::vector< boost::unordered_map<size_t, float> > sparseScore(maxNBestSize);

  // read initial weights, if any given

  if ( filenameInitialWeights.length() != 0 )
  {
    util::FilePiece ifsInitialWeights(filenameInitialWeights.c_str());

    StringPiece lineInitialWeight;
    if ( !ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) ) {
      err << "Error: flawed content in " << filenameInitialWeights << '\n';
      err.flush();
      exit(1);
    }
    do {
      util::TokenIter<util::SingleCharacter> token(lineInitialWeight, ' ');
      boost::unordered_map<std::string, size_t>::const_iterator found = featureIndexes.find(token->as_string());
      if ( found == featureIndexes.end() ) {
        err << "Error: flawed content in " << filenameInitialWeights << " (unknown feature name \"" << token->as_string() << "\")" << '\n';
        err.flush();
        exit(1);
      }
      token++;
      sparseScalingFactor[found->second] = atof( token->as_string().c_str() );
    } while ( ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) );
  }

  // train

  ExpectedBleuOptimizer optimizer(err,
                                  learningRate,
                                  initialStepSize,
                                  decreaseRate,
                                  increaseRate,
                                  minStepSize,
                                  maxStepSize,
                                  floorAbsScalingFactor,
                                  regularizationParameter);

  if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
  {
    optimizer.InitRPROP(sparseScalingFactor);
  } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
    optimizer.InitSGD(sparseScalingFactor);
  } else {
    err << "Error: unknown optimizer type" << '\n';
    err.flush();
    exit(1);
  }

  for (size_t nIteration=1; nIteration<=iterationLimit; ++nIteration)
  {
    util::FilePiece ifsSBleu(filenameSBleu.c_str());
    util::FilePiece ifsNBest(filenameNBestList.c_str());

    out << "### ITERATION " << nIteration << '\n' << '\n';

    size_t sentenceIndex = 0;
    size_t batchSize = 0;
    size_t nBestSizeCount = 0;
    size_t globalIndex = 0;
    StringPiece lineNBest;
    std::vector<double> overallScoreUntransformed;
    std::vector<float> sBleu;
    float xBleu = 0;
    // double expPrecisionCorrection = 0.0;

    while ( ifsNBest.ReadLineOrEOF(lineNBest) )
    {
      util::TokenIter<util::SingleCharacter> token(lineNBest, ' ');

      if ( token == token.end() )
      {
        err << "Error: flawed content in " << filenameNBestList << '\n';
        err.flush();
        exit(1);
      }

      size_t sentenceIndexCurrent = atol( token->as_string().c_str() );
      token++;

      if ( sentenceIndex != sentenceIndexCurrent )
      {
        if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
        {
          optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore );
        } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
          optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches );

          if ( miniBatches ) {
            xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
            // out << "ITERATION " << nIteration << " SENTENCE " << sentenceIndex << " XBLEUSUM= " << xBleu << '\n';
            // for (size_t i=0; i<sparseScalingFactor.size(); ++i)
            // {
            //   if ( (sparseScalingFactor[i] != 0) || printZeroWeights )
            //   {
            //     out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
            //   }
            // }
            // out << '\n';
            // out.flush();
          }
        } else {
          err << "Error: unknown optimizer type" << '\n';
          err.flush();
          exit(1);
        }

        for (size_t i=0; i<nBestSizeCount; ++i) {
          sparseScore[i].clear();
        }
        nBestSizeCount = 0;
        overallScoreUntransformed.clear();
        sBleu.clear();
        sentenceIndex = sentenceIndexCurrent;
        ++batchSize;
      }

      StringPiece lineSBleu;
      if ( !ifsSBleu.ReadLineOrEOF(lineSBleu) )
      {
        err << "Error: insufficient number of lines in " << filenameSBleu << '\n';
        err.flush();
        exit(1);
      }

      if ( nBestSizeCount < maxNBestSize )
      {
        // retrieve sBLEU

        float sBleuCurrent = atof( lineSBleu.as_string().c_str() );
        sBleu.push_back(sBleuCurrent);

        // process n-best list entry

        if ( token == token.end() )
        {
          err << "Error: flawed content in " << filenameNBestList << '\n';
          err.flush();
          exit(1);
        }
        double scoreCurrent = 0;
        if ( !ignoreDecoderScore )
        {
          scoreCurrent = atof( token->as_string().c_str() ); // decoder score
        }
        token++;

        // if ( nBestSizeCount == 0 ) // best translation (first n-best list entry for the current sentence / a new mini-batch)
        // {
        //   expPrecisionCorrection = std::floor ( scoreCurrent ); // decoder score of first-best
        // }

        while (token != token.end())
        {
          size_t featureNameCurrent = atol( token->as_string().c_str() );
          token++;
          float featureValueCurrent = atof( token->as_string().c_str() );
          sparseScore[nBestSizeCount].insert(std::make_pair(featureNameCurrent, featureValueCurrent));
          scoreCurrent += sparseScalingFactor[featureNameCurrent] * featureValueCurrent;
          token++;
        }
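
        // The model score is exponentiated below to obtain an unnormalized
        // posterior; the disabled expPrecisionCorrection lines above sketch a
        // guard against overflow by subtracting the first-best score first.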

        // overallScoreUntransformed.push_back( std::exp(scoreCurrent - expPrecisionCorrection) );
        overallScoreUntransformed.push_back( std::exp(scoreCurrent) );

        ++nBestSizeCount;
      }
      ++globalIndex;
    }

    ++batchSize; // count the final sentence, which triggers no sentence-index transition

    if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
    {
      optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore ); // last sentence in the corpus
      xBleu = optimizer.UpdateRPROP( sparseScalingFactor, batchSize );
      out << "xBLEU= " << xBleu << '\n';
    } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
      optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches ); // last sentence in the corpus
      if ( miniBatches ) {
        xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
        xBleu /= batchSize;
      } else {
        xBleu = optimizer.UpdateSGD( sparseScalingFactor, batchSize );
      }
      out << "xBLEU= " << xBleu << '\n';
    } else {
      err << "Error: unknown optimizer type" << '\n';
      err.flush();
      exit(1);
    }

    for (size_t i=0; i<nBestSizeCount; ++i) {
      sparseScore[i].clear();
    }
    nBestSizeCount = 0;
    overallScoreUntransformed.clear();
    sBleu.clear();

    out << '\n';

    for (size_t i=0; i<sparseScalingFactor.size(); ++i)
    {
      if ( (sparseScalingFactor[i] != 0) || printZeroWeights )
      {
        out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
      }
    }

    out << '\n';
    out.flush();
  }

}