// mosesdecoder/contrib/expected-bleu-training/ExpectedBleuOptimizer.cpp

/*
Moses - statistical machine translation system
Copyright (C) 2005-2015 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "ExpectedBleuOptimizer.h"

// standard headers for assert, fpclassify/fabs, exit, memcpy, and std::fill
// used below; some may already be pulled in via ExpectedBleuOptimizer.h
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>

namespace ExpectedBleuTraining
{
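
// Accumulate one training instance: compute the expected BLEU (xBLEU) of the
// n-best list under the model's posterior and add its gradient w.r.t. the
// sparse feature weights to the running batch gradient.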
void ExpectedBleuOptimizer::AddTrainingInstance(const size_t nBestSizeCount,
const std::vector<float>& sBleu,
const std::vector<double>& overallScoreUntransformed,
const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
bool maintainUpdateSet)
{
// compute xBLEU
double sumUntransformedScores = 0.0;
for (std::vector<double>::const_iterator overallScoreUntransformedIt=overallScoreUntransformed.begin();
overallScoreUntransformedIt!=overallScoreUntransformed.end(); ++overallScoreUntransformedIt)
{
sumUntransformedScores += *overallScoreUntransformedIt;
}
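  // Normalize: p[i] = s_i / sum_j s_j is the posterior probability of
  // hypothesis i; xBLEU is then the expectation E_p[sBLEU].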
double xBleu = 0.0;
assert(nBestSizeCount == overallScoreUntransformed.size());
std::vector<double> p;
for (size_t i=0; i<nBestSizeCount; ++i)
{
if (sumUntransformedScores != 0) {
p.push_back( overallScoreUntransformed[i] / sumUntransformedScores );
} else {
p.push_back( 0 );
}
xBleu += p.back() * sBleu[ i ];
}
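  // Accumulate the gradient of xBLEU: assuming the untransformed overall
  // scores are exponentiated model scores, the exact derivative is
  //   d xBLEU / d lambda_k = sum_i p_i * N_ik * (sBLEU_i - xBLEU),
  // where N_ik is the value of sparse feature k in hypothesis i.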
for (size_t i=0; i<nBestSizeCount; ++i)
{
double D = sBleu[ i ] - xBleu;
for (boost::unordered_map<size_t, float>::const_iterator sparseScoreIt=sparseScore[i].begin();
sparseScoreIt!=sparseScore[i].end(); ++sparseScoreIt)
{
const size_t name = sparseScoreIt->first;
float N = sparseScoreIt->second;
if ( std::fpclassify( p[i] * N * D ) == FP_SUBNORMAL )
{
m_err << "Error: encountered subnormal value: p[i] * N * D= " << p[i] * N * D
<< " with p[i]= " << p[i] << " N= " << N << " D= " << D << '\n';
m_err.flush();
exit(1);
} else {
m_gradient[name] += p[i] * N * D;
if ( maintainUpdateSet )
{
m_updateSet.insert(name);
}
}
}
}
m_xBleu += xBleu; // accumulated over the batch; consumed and reset by UpdateSGD() / UpdateRPROP()
}
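
// Prepare for SGD: size the gradient accumulator (one entry per sparse
// feature) and keep a copy of the initial scaling factors.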
void ExpectedBleuOptimizer::InitSGD(const std::vector<float>& sparseScalingFactor)
{
  const size_t nFeatures = sparseScalingFactor.size();
  m_previousSparseScalingFactor.resize(nFeatures); // must be sized before it is written to
  memcpy(&m_previousSparseScalingFactor.at(0), &sparseScalingFactor.at(0), nFeatures * sizeof(float)); // memcpy counts bytes, not floats
  m_gradient.resize(nFeatures);
}
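
// Perform one SGD update of the sparse scaling factors from the gradient
// accumulated over the current batch; returns the batch-averaged xBLEU.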
float ExpectedBleuOptimizer::UpdateSGD(std::vector<float>& sparseScalingFactor,
size_t batchSize,
bool useUpdateSet)
{
float xBleu = m_xBleu / batchSize;
// update sparse scaling factors
if (useUpdateSet) {
for (std::set<size_t>::const_iterator it = m_updateSet.begin(); it != m_updateSet.end(); ++it)
{
size_t name = *it;
UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
}
m_updateSet.clear();
} else {
for (size_t name=0; name<sparseScalingFactor.size(); ++name)
{
UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
}
}
m_xBleu = 0;
// reset the gradient for the next batch; std::fill keeps the vector sized,
// whereas clear() would make the next accumulation write out of bounds
std::fill(m_gradient.begin(), m_gradient.end(), 0.0f);
return xBleu;
}
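
// Update a single scaling factor: normalize (or regularize) its accumulated
// gradient, take one gradient ascent step, and floor small factors to zero.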
void ExpectedBleuOptimizer::UpdateSingleScalingFactorSGD(size_t name,
std::vector<float>& sparseScalingFactor,
size_t batchSize)
{
// regularization: with an L2 penalty this corresponds to ascending
// log(xBLEU) - r * ||lambda||^2, i.e. the gradient is divided by the
// batch xBLEU and 2 * r * lambda is subtracted
if ( m_regularizationParameter != 0 )
{
m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
} else {
// need to normalize by dividing by batchSize
m_gradient[name] /= batchSize;
}
// the actual update
sparseScalingFactor[name] += m_learningRate * m_gradient[name];
// discard scaling factors below a threshold
if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
{
sparseScalingFactor[name] = 0;
}
}
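
// Prepare for RPROP: size the per-feature step sizes and gradient buffers,
// and back up the scaling factors for backtracking on a gradient sign change.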
void ExpectedBleuOptimizer::InitRPROP(const std::vector<float>& sparseScalingFactor)
{
const size_t nFeatures = sparseScalingFactor.size();
m_previousSparseScalingFactor.resize(nFeatures);
memcpy(&m_previousSparseScalingFactor.at(0), &sparseScalingFactor.at(0), nFeatures * sizeof(float)); // memcpy counts bytes, not floats
m_previousGradient.resize(nFeatures);
m_gradient.resize(nFeatures);
m_stepSize.resize(nFeatures, m_initialStepSize);
}
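
// Perform one RPROP update: per feature, grow the step size while the
// gradient keeps its sign, shrink it and backtrack when the sign flips, and
// step in the direction of the gradient; returns the batch-averaged xBLEU.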
float ExpectedBleuOptimizer::UpdateRPROP(std::vector<float>& sparseScalingFactor,
const size_t batchSize)
{
float xBleu = m_xBleu / batchSize;
// update sparse scaling factors
for (size_t name=0; name<sparseScalingFactor.size(); ++name)
{
// Sum of gradients: we only need its sign, so there is no need to normalize by batchSize.
// regularization: same log(xBLEU) - r * ||lambda||^2 objective as in the SGD update
if ( m_regularizationParameter != 0 )
{
m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
}
// adapt the step size: grow it while the gradient keeps its sign, shrink it on a sign change
int sign = Sign(m_gradient[name]) * Sign(m_previousGradient[name]);
if (sign > 0) {
m_stepSize[name] *= m_increaseRate;
} else if (sign < 0) {
m_stepSize[name] *= m_decreaseRate;
}
if (m_stepSize[name] < m_minStepSize) {
m_stepSize[name] = m_minStepSize;
}
if (m_stepSize[name] > m_maxStepSize) {
m_stepSize[name] = m_maxStepSize;
}
// the actual update
m_previousGradient[name] = m_gradient[name];
if (sign >= 0) {
if (m_gradient[name] > 0) {
m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
sparseScalingFactor[name] += m_stepSize[name];
} else if (m_gradient[name] < 0) {
m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
sparseScalingFactor[name] -= m_stepSize[name];
}
} else {
// the gradient changed sign: backtrack to this factor's previous value
sparseScalingFactor[name] = m_previousSparseScalingFactor[name];
// m_previousGradient[name] = 0;
}
// discard scaling factors below a threshold
if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
{
sparseScalingFactor[name] = 0;
}
}
m_xBleu = 0;
// reset the gradient for the next batch (std::fill keeps the vector sized)
std::fill(m_gradient.begin(), m_gradient.end(), 0.0f);
return xBleu;
}
} // namespace ExpectedBleuTraining
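
// A minimal usage sketch (hypothetical driver code; the constructor arguments
// and variable names below are illustrative assumptions, not part of this file;
// see ExpectedBleuOptimizer.h for the actual constructor):
//
//   using namespace ExpectedBleuTraining;
//   ExpectedBleuOptimizer optimizer(/* constructor parameters from the header */);
//   optimizer.InitSGD(sparseScalingFactor);
//   // for each batch:
//   //   for each input sentence, after decoding its n-best list:
//   //     optimizer.AddTrainingInstance(nBestSize, sBleu, scores, sparseScores, false);
//   //   float xBleu = optimizer.UpdateSGD(sparseScalingFactor, batchSize, false);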