/*
 Moses - statistical machine translation system
 Copyright (C) 2005-2015 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "ExpectedBleuOptimizer.h"

#include <cassert>
#include <cmath>
#include <cstdlib>

namespace ExpectedBleuTraining
{

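// Accumulate the gradient of expected BLEU (xBLEU) over one n-best list:
// each hypothesis i contributes with weight p[i], its model score normalized
// over the n-best list; sBleu[i] is its sentence-level BLEU score and
// sparseScore[i] holds its sparse feature values.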
void ExpectedBleuOptimizer::AddTrainingInstance(const size_t nBestSizeCount,
    const std::vector<float>& sBleu,
    const std::vector<double>& overallScoreUntransformed,
    const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
    bool maintainUpdateSet)
{
  // compute xBLEU
  double sumUntransformedScores = 0.0;
  for (std::vector<double>::const_iterator overallScoreUntransformedIt=overallScoreUntransformed.begin();
       overallScoreUntransformedIt!=overallScoreUntransformed.end(); ++overallScoreUntransformedIt)
  {
    sumUntransformedScores += *overallScoreUntransformedIt;
  }

  double xBleu = 0.0;
  assert(nBestSizeCount == overallScoreUntransformed.size());
  std::vector<double> p;
  for (size_t i=0; i<nBestSizeCount; ++i)
  {
    if (sumUntransformedScores != 0) {
      p.push_back( overallScoreUntransformed[i] / sumUntransformedScores );
    } else {
      p.push_back( 0 );
    }
    xBleu += p.back() * sBleu[ i ];
  }
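
  // Accumulate the gradient of xBLEU = sum_i p[i] * sBleu[i] with respect to
  // each sparse scaling factor. Assuming the untransformed scores are
  // exponentiated model scores, the partial derivative for feature 'name' is
  // sum_i p[i] * N * D, with N the feature value and D = sBleu[i] - xBleu.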
  for (size_t i=0; i<nBestSizeCount; ++i)
  {
    double D = sBleu[ i ] - xBleu;
    for (boost::unordered_map<size_t, float>::const_iterator sparseScoreIt=sparseScore[i].begin();
         sparseScoreIt!=sparseScore[i].end(); ++sparseScoreIt)
    {
      const size_t name = sparseScoreIt->first;
      float N = sparseScoreIt->second;
      if ( std::fpclassify( p[i] * N * D ) == FP_SUBNORMAL )
      {
        m_err << "Error: encountered subnormal value: p[i] * N * D = " << p[i] * N * D
              << " with p[i] = " << p[i] << " N = " << N << " D = " << D << '\n';
        m_err.flush();
        exit(1);
      } else {
        m_gradient[name] += p[i] * N * D;
        if ( maintainUpdateSet )
        {
          m_updateSet.insert(name);
        }
      }
    }
  }

  m_xBleu += xBleu;
}

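// Prepare for SGD: record the current scaling factors and size the gradient
// accumulator (one entry per sparse feature).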
void ExpectedBleuOptimizer::InitSGD(const std::vector<float>& sparseScalingFactor)
{
  const size_t nFeatures = sparseScalingFactor.size();
  // vector assignment sizes the target and copies all nFeatures elements
  m_previousSparseScalingFactor = sparseScalingFactor;
  m_gradient.resize(nFeatures);
}

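// Apply one SGD update to the sparse scaling factors and return the batch's
// average expected BLEU. With useUpdateSet, only features seen since the last
// update are touched.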
float ExpectedBleuOptimizer::UpdateSGD(std::vector<float>& sparseScalingFactor,
                                       size_t batchSize,
                                       bool useUpdateSet)
{
  float xBleu = m_xBleu / batchSize;

  // update sparse scaling factors

  if (useUpdateSet) {

    for (std::set<size_t>::const_iterator it = m_updateSet.begin(); it != m_updateSet.end(); ++it)
    {
      size_t name = *it;
      UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
    }

    m_updateSet.clear();

  } else {

    for (size_t name=0; name<sparseScalingFactor.size(); ++name)
    {
      UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
    }

  }

  m_xBleu = 0;
  // zero the accumulated gradient but keep the vector sized, so that
  // AddTrainingInstance can index it again in the next batch
  m_gradient.assign(m_gradient.size(), 0.0f);
  return xBleu;
}

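// Normalize the accumulated gradient for one feature (by the xBLEU sum when
// regularizing, by the batch size otherwise), take a gradient ascent step,
// and floor small scaling factors to zero.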
void ExpectedBleuOptimizer::UpdateSingleScalingFactorSGD(size_t name,
    std::vector<float>& sparseScalingFactor,
    size_t batchSize)
{
  // regularization
  if ( m_regularizationParameter != 0 )
  {
    m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
  } else {
    // need to normalize by dividing by batchSize
    m_gradient[name] /= batchSize;
  }

  // the actual update
  sparseScalingFactor[name] += m_learningRate * m_gradient[name];

  // discard scaling factors below a threshold
  if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
  {
    sparseScalingFactor[name] = 0;
  }
}

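// Prepare for RPROP: record the current scaling factors, allocate gradient
// buffers, and start every feature at the initial step size.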
void ExpectedBleuOptimizer::InitRPROP(const std::vector<float>& sparseScalingFactor)
{
  const size_t nFeatures = sparseScalingFactor.size();
  // vector assignment sizes the target and copies all nFeatures elements
  m_previousSparseScalingFactor = sparseScalingFactor;
  m_previousGradient.resize(nFeatures);
  m_gradient.resize(nFeatures);
  m_stepSize.resize(nFeatures, m_initialStepSize);
}

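// Apply one RPROP update: grow or shrink each feature's step size based on
// the sign agreement between the current and previous gradient, step in the
// direction of the gradient, and backtrack after a sign change. Returns the
// batch's average expected BLEU.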
float ExpectedBleuOptimizer::UpdateRPROP(std::vector<float>& sparseScalingFactor,
                                         const size_t batchSize)
{
  float xBleu = m_xBleu / batchSize;

  // update sparse scaling factors

  for (size_t name=0; name<sparseScalingFactor.size(); ++name)
  {
    // Sum of gradients. All we need is the sign; no need to normalize by dividing by batchSize.

    // regularization
    if ( m_regularizationParameter != 0 )
    {
      m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
    }

    // step size
    int sign = Sign(m_gradient[name]) * Sign(m_previousGradient[name]);
    if (sign > 0) {
      m_stepSize[name] *= m_increaseRate;
    } else if (sign < 0) {
      m_stepSize[name] *= m_decreaseRate;
    }
    if (m_stepSize[name] < m_minStepSize) {
      m_stepSize[name] = m_minStepSize;
    }
    if (m_stepSize[name] > m_maxStepSize) {
      m_stepSize[name] = m_maxStepSize;
    }

    // the actual update
    m_previousGradient[name] = m_gradient[name];
    if (sign >= 0) {
      if (m_gradient[name] > 0) {
        m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
        sparseScalingFactor[name] += m_stepSize[name];
      } else if (m_gradient[name] < 0) {
        m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
        sparseScalingFactor[name] -= m_stepSize[name];
      }
    } else {
      // sign change: revert to the previous value
      sparseScalingFactor[name] = m_previousSparseScalingFactor[name];
      // m_previousGradient[name] = 0;
    }

    // discard scaling factors below a threshold
    if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
    {
      sparseScalingFactor[name] = 0;
    }
  }

  m_xBleu = 0;
  // zero the accumulated gradient but keep the vector sized for the next batch
  m_gradient.assign(m_gradient.size(), 0.0f);
  return xBleu;
}

}