mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 07:42:21 +03:00
Update mira optimization code and merge Main.cpp
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/mira-mtm5@3652 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
1bd263c4ec
commit
85a71793a6
@ -101,7 +101,7 @@ namespace Mira {
|
||||
vector< ScoreComponentCollection>& featureValues,
|
||||
vector< float>& bleuScores )
|
||||
{
|
||||
StaticData &staticData = StaticData::InstanceNonConst();
|
||||
StaticData &staticData = StaticData::InstanceNonConst();
|
||||
|
||||
m_sentence = new Sentence(Input);
|
||||
stringstream in(source + "\n");
|
||||
|
262
mira/Main.cpp
262
mira/Main.cpp
@ -77,9 +77,11 @@ int main(int argc, char** argv) {
|
||||
vector<string> referenceFiles;
|
||||
size_t epochs;
|
||||
string learner;
|
||||
bool shuffle = true;
|
||||
bool shuffle = true; // TODO: parameterize?
|
||||
size_t mixFrequency;
|
||||
size_t weightDumpFrequency;
|
||||
size_t clippingScheme;
|
||||
float lowerBound, upperBound;
|
||||
po::options_description desc("Allowed options");
|
||||
desc.add_options()
|
||||
("help",po::value( &help )->zero_tokens()->default_value(false), "Print this help message and exit")
|
||||
@ -90,7 +92,11 @@ int main(int argc, char** argv) {
|
||||
("epochs,e", po::value<size_t>(&epochs)->default_value(1), "Number of epochs")
|
||||
("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
|
||||
("mix-frequency", po::value<size_t>(&mixFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi")
|
||||
("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights");
|
||||
("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights")
|
||||
("clipping-scheme,c", po::value<size_t>(&clippingScheme)->default_value(1), "Select clipping scheme for weight updates (1: equal 2: varied")
|
||||
("lower-bound,lb", po::value<float>(&lowerBound)->default_value(-0.01), "Lower bound for mira clipping scheme")
|
||||
("upper-bound,ub", po::value<float>(&upperBound)->default_value(0.01), "Upper bound for mira clipping scheme");
|
||||
|
||||
|
||||
po::options_description cmdline_options;
|
||||
cmdline_options.add(desc);
|
||||
@ -122,22 +128,6 @@ int main(int argc, char** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
//FIXME: Make these configurable
|
||||
float miraLowerBound = 0;
|
||||
float miraUpperBound = 1;
|
||||
|
||||
Optimiser* optimiser = NULL;
|
||||
if (learner == "mira") {
|
||||
cerr << "Optimising using Mira" << endl;
|
||||
optimiser = new MiraOptimiser(miraLowerBound, miraUpperBound);
|
||||
} else if (learner == "perceptron") {
|
||||
cerr << "Optimising using Perceptron" << endl;
|
||||
optimiser = new Perceptron();
|
||||
} else {
|
||||
cerr << "Error: Unknown optimiser: " << learner << endl;
|
||||
}
|
||||
|
||||
|
||||
//load input and references
|
||||
vector<string> inputSentences;
|
||||
if (!loadSentences(inputFile, inputSentences)) {
|
||||
@ -158,9 +148,25 @@ int main(int argc, char** argv) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
//initialise moses
|
||||
initMoses(mosesConfigFile, verbosity);//, argc, argv);
|
||||
MosesDecoder* decoder = new MosesDecoder(referenceSentences) ;
|
||||
ScoreComponentCollection startWeights = decoder->getWeights();
|
||||
|
||||
// print feature function and weights
|
||||
// TODO: scaling of feature functions
|
||||
// TODO: initialise weights equally
|
||||
const vector<const ScoreProducer*> featureFunctions = StaticData::Instance().GetTranslationSystem (TranslationSystem::DEFAULT).GetFeatureFunctions();
|
||||
for (size_t i = 0; i < featureFunctions.size(); ++i) {
|
||||
cout << "Feature functions: " << featureFunctions[i]->GetScoreProducerDescription() << ": " << featureFunctions[i]->GetNumScoreComponents() << endl;
|
||||
vector< float> weights = startWeights.GetScoresForProducer(featureFunctions[i]);
|
||||
cout << "weights: ";
|
||||
for (size_t j = 0; j < weights.size(); ++j) {
|
||||
cout << weights[j];
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
//Optionally shuffle the sentences
|
||||
vector<size_t> order;
|
||||
@ -189,120 +195,174 @@ int main(int argc, char** argv) {
|
||||
shard.resize(shardSize);
|
||||
copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
|
||||
|
||||
|
||||
Optimiser* optimiser = NULL;
|
||||
size_t n = 10; // size of n-best lists
|
||||
if (learner == "mira") {
|
||||
cerr << "Optimising using Mira" << endl;
|
||||
optimiser = new MiraOptimiser(n, clippingScheme, lowerBound, upperBound);
|
||||
} else if (learner == "perceptron") {
|
||||
cerr << "Optimising using Perceptron" << endl;
|
||||
optimiser = new Perceptron();
|
||||
} else {
|
||||
cerr << "Error: Unknown optimiser: " << learner << endl;
|
||||
}
|
||||
|
||||
//Main loop:
|
||||
ScoreComponentCollection cumulativeWeights;
|
||||
size_t modelHypoCount = 10;
|
||||
size_t hopeHypoCount = 10;
|
||||
size_t fearHypoCount = 10;
|
||||
ScoreComponentCollection cumulativeWeights; // collect weights per epoch to produce an average
|
||||
size_t iterations = 0;
|
||||
size_t epoch = 0;
|
||||
|
||||
time_t now = time(0); // get current time
|
||||
struct tm* tm = localtime(&now); // get struct filled out
|
||||
cout << "Start date/time: " << tm->tm_mon+1 << "/" << tm->tm_mday << "/" << tm->tm_year + 1900
|
||||
<< ", " << tm->tm_hour << ":" << tm->tm_min << ":" << tm->tm_sec << endl;
|
||||
|
||||
|
||||
for (size_t epoch = 0; epoch < epochs; ++epoch) {
|
||||
//TODO: batching
|
||||
size_t shardPosition = 0;
|
||||
for (vector<size_t>::const_iterator sid = shard.begin();
|
||||
sid != shard.end(); ++sid) {
|
||||
const string& input = inputSentences[*sid];
|
||||
const vector<string>& refs = referenceSentences[*sid];
|
||||
// TODO: stop MIRA when score on dev or tuning set does not improve further?
|
||||
for (size_t epoch = 1; epoch <= epochs; ++epoch) {
|
||||
|
||||
vector<vector<ScoreComponentCollection > > allScores(1);
|
||||
vector<vector<float> > allLosses(1);
|
||||
cout << "\nEpoch " << epoch << std::endl;
|
||||
cumulativeWeights.ZeroAll();
|
||||
|
||||
// compute sum in objective function after each epoch
|
||||
float maxSum = 0.0;
|
||||
|
||||
// MODEL
|
||||
decoder->getNBest(input,
|
||||
//TODO: batching
|
||||
size_t batchSize = 1;
|
||||
size_t batch = 0;
|
||||
size_t shardPosition = 0;
|
||||
for (vector<size_t>::const_iterator sid = shard.begin(); sid != shard.end(); ++sid) {
|
||||
const string& input = inputSentences[*sid];
|
||||
const vector<string>& refs = referenceSentences[*sid];
|
||||
cout << "Input sentence " << *sid << ": \"" << input << "\"" << std::endl;
|
||||
|
||||
// feature values for hypotheses i,j (matrix: batchSize x 3*n x featureValues)
|
||||
vector<vector<ScoreComponentCollection > > featureValues(batchSize);
|
||||
vector<vector<float> > bleuScores(batchSize);
|
||||
|
||||
// MODEL
|
||||
cout << "Run decoder to get nbest wrt model score" << std::endl;
|
||||
decoder->getNBest(input,
|
||||
*sid,
|
||||
modelHypoCount,
|
||||
n,
|
||||
0.0,
|
||||
1.0,
|
||||
allScores[0],
|
||||
allLosses[0]);
|
||||
featureValues[batch],
|
||||
bleuScores[batch]);
|
||||
|
||||
|
||||
// HOPE
|
||||
size_t oraclePos = allScores.size();
|
||||
vector<const Word*> oracle =
|
||||
decoder->getNBest(input,
|
||||
*sid,
|
||||
modelHypoCount,
|
||||
// HOPE
|
||||
cout << "Run decoder to get nbest hope translations" << std::endl;
|
||||
size_t oraclePos = featureValues[batch].size();
|
||||
vector<const Word*> oracle = decoder->getNBest(input,
|
||||
*sid,
|
||||
n,
|
||||
1.0,
|
||||
1.0,
|
||||
allScores[0],
|
||||
allLosses[0]);
|
||||
featureValues[batch],
|
||||
bleuScores[batch]);
|
||||
|
||||
ScoreComponentCollection oracleScores = allScores[0][oraclePos];
|
||||
float oracleLoss = allLosses[0][oraclePos];
|
||||
ScoreComponentCollection oracleFeatureValues = featureValues[batch][oraclePos];
|
||||
float oracleBleuScore = bleuScores[batch][oraclePos];
|
||||
|
||||
// FEAR
|
||||
decoder->getNBest(input,
|
||||
// FEAR
|
||||
cout << "Run decoder to get nbest fear translations" << std::endl;
|
||||
decoder->getNBest(input,
|
||||
*sid,
|
||||
modelHypoCount,
|
||||
n,
|
||||
-1.0,
|
||||
1.0,
|
||||
allScores[0],
|
||||
allLosses[0]);
|
||||
featureValues[batch],
|
||||
bleuScores[batch]);
|
||||
|
||||
//set loss for each sentence as oracleloss - rawsentenceloss
|
||||
for (size_t i = 0; i < allScores.size(); ++i) {
|
||||
for (size_t j = 0; j < allScores[i].size(); ++j) {
|
||||
allLosses[i][j] = oracleLoss - allLosses[i][j];
|
||||
}
|
||||
}
|
||||
// Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
|
||||
vector< vector<float> > losses(batchSize);
|
||||
for (size_t i = 0; i < batchSize; ++i) {
|
||||
for (size_t j = 0; j < bleuScores[i].size(); ++j) {
|
||||
losses[i].push_back(oracleBleuScore - bleuScores[i][j]);
|
||||
//cout << "loss[" << i << "," << j << "]" << endl;
|
||||
}
|
||||
}
|
||||
|
||||
// get weight vector and set weight for bleu feature to 0
|
||||
ScoreComponentCollection mosesWeights = decoder->getWeights();
|
||||
const vector<const ScoreProducer*> featureFunctions = StaticData::Instance().GetTranslationSystem (TranslationSystem::DEFAULT).GetFeatureFunctions();
|
||||
mosesWeights.Assign(featureFunctions.back(), 0);
|
||||
ScoreComponentCollection oldWeights(mosesWeights);
|
||||
|
||||
//run optimiser
|
||||
ScoreComponentCollection mosesWeights = decoder->getWeights();
|
||||
optimiser->updateWeights(mosesWeights
|
||||
, allScores
|
||||
, allLosses
|
||||
, oracleScores);
|
||||
|
||||
//update moses weights
|
||||
mosesWeights.L1Normalise();
|
||||
decoder->setWeights(mosesWeights);
|
||||
//run optimiser
|
||||
cout << "Run optimiser.." << endl;
|
||||
optimiser->updateWeights(mosesWeights, featureValues, losses, oracleFeatureValues);
|
||||
|
||||
//update moses weights
|
||||
mosesWeights.L1Normalise();
|
||||
decoder->setWeights(mosesWeights);
|
||||
|
||||
//history (for approx doc bleu)
|
||||
decoder->updateHistory(oracle);
|
||||
cumulativeWeights.PlusEquals(mosesWeights);
|
||||
decoder->cleanup();
|
||||
//history (for approx doc bleu)
|
||||
decoder->updateHistory(oracle);
|
||||
|
||||
++shardPosition;
|
||||
++iterations;
|
||||
cumulativeWeights.PlusEquals(mosesWeights);
|
||||
decoder->cleanup();
|
||||
|
||||
//mix weights?
|
||||
// Compute objective for all hypotheses of a training source sentence
|
||||
// add max(l_ij - Delta_ij * w') for check on objective
|
||||
float maxDiff = 0.0;
|
||||
for (size_t j = 0; j < 3*n; ++j) {
|
||||
ScoreComponentCollection featureDiff(oracleFeatureValues);
|
||||
featureDiff.MinusEquals(featureValues[batch][j]);
|
||||
float tmpMaxDiff = losses[batch][j] - featureDiff.InnerProduct(mosesWeights);
|
||||
if (tmpMaxDiff > maxDiff) {
|
||||
maxDiff = tmpMaxDiff;
|
||||
}
|
||||
}
|
||||
|
||||
maxSum += maxDiff;
|
||||
|
||||
++shardPosition;
|
||||
++iterations;
|
||||
|
||||
//mix weights?
|
||||
#ifdef MPI_ENABLE
|
||||
if (shardPosition % (shard.size() / mixFrequency) == 0) {
|
||||
ScoreComponentCollection averageWeights;
|
||||
VERBOSE(1, "Rank: " << rank << "Before mixing: " << mosesWeights << endl);
|
||||
mpi::reduce(world,mosesWeights,averageWeights,SCCPlus(),0);
|
||||
if (rank == 0) {
|
||||
averageWeights.MultiplyEquals(1.0f/size);
|
||||
VERBOSE(1, "After mixing: " << averageWeights << endl);
|
||||
}
|
||||
mpi::broadcast(world,averageWeights,0);
|
||||
decoder->setWeights(averageWeights);
|
||||
}
|
||||
if (shardPosition % (shard.size() / mixFrequency) == 0) {
|
||||
ScoreComponentCollection averageWeights;
|
||||
VERBOSE(1, "Rank: " << rank << "Before mixing: " << mosesWeights << endl);
|
||||
mpi::reduce(world,mosesWeights,averageWeights,SCCPlus(),0);
|
||||
if (rank == 0) {
|
||||
averageWeights.MultiplyEquals(1.0f/size);
|
||||
VERBOSE(1, "After mixing: " << averageWeights << endl);
|
||||
}
|
||||
|
||||
mpi::broadcast(world,averageWeights,0);
|
||||
decoder->setWeights(averageWeights);
|
||||
}
|
||||
#endif
|
||||
|
||||
//dump weights?
|
||||
if (shardPosition % (shard.size() / weightDumpFrequency) == 0) {
|
||||
ScoreComponentCollection totalWeights(cumulativeWeights);
|
||||
//dump weights?
|
||||
if (shardPosition % (shard.size() / weightDumpFrequency) == 0) {
|
||||
ScoreComponentCollection totalWeights(cumulativeWeights);
|
||||
#ifdef MPI_ENABLE
|
||||
//average across processes
|
||||
mpi::reduce(world,cumulativeWeights,totalWeights,SCCPlus(),0);
|
||||
//average across processes
|
||||
mpi::reduce(world,cumulativeWeights,totalWeights,SCCPlus(),0);
|
||||
#endif
|
||||
if (rank == 0) {
|
||||
cout << "WEIGHTS " << iterations << " ";
|
||||
totalWeights.L1Normalise();
|
||||
cout << totalWeights << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (rank == 0) {
|
||||
cout << "Total weights (" << iterations << ") ";
|
||||
totalWeights.L1Normalise();
|
||||
cout << totalWeights << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// how has the objective function changed?
|
||||
cout << "objective = " << maxSum << endl;
|
||||
}
|
||||
|
||||
|
||||
// take average of cumulative weights of last pass over all source sentences
|
||||
cumulativeWeights.MultiplyEquals(1.0f/inputSentences.size());
|
||||
|
||||
cerr << "Start weights: " << startWeights << endl;
|
||||
cerr << "Averaged new weights: " << cumulativeWeights << endl;
|
||||
|
||||
tm = localtime(&now); // get struct filled out
|
||||
cout << "End date/time: " << tm->tm_mon+1 << "/" << tm->tm_mday << "/" << tm->tm_year + 1900
|
||||
<< ", " << tm->tm_hour << ":" << tm->tm_min << ":" << tm->tm_sec;
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
@ -4,75 +4,187 @@ using namespace Moses;
|
||||
using namespace std;
|
||||
|
||||
namespace Mira {
|
||||
void MiraOptimiser::updateWeights(Moses::ScoreComponentCollection& weights,
|
||||
const std::vector< std::vector<Moses::ScoreComponentCollection> >& scores,
|
||||
const std::vector< std::vector<float> >& losses,
|
||||
const Moses::ScoreComponentCollection& oracleScores) {
|
||||
|
||||
for(unsigned batch = 0; batch < scores.size(); batch++) {
|
||||
void MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
|
||||
const vector< vector<ScoreComponentCollection> >& featureValues,
|
||||
const vector< vector<float> >& losses,
|
||||
const ScoreComponentCollection& oracleFeatureValues) {
|
||||
|
||||
Moses::ScoreComponentCollection oldWeights(weights);
|
||||
float maxTranslation = -1000.0; //what wrong with FLT_MIN ?!
|
||||
// TODO: do we need the oracle feature values?
|
||||
|
||||
for(unsigned analyseSentence = 0; analyseSentence < scores[batch].size(); analyseSentence++) {
|
||||
size_t numberOfUpdates = 0;
|
||||
|
||||
/* do this:
|
||||
for(unsigned score = 0; score < scores[batch][analyseSentence].size(); score++) {
|
||||
float currentScoreChange = oracleScores[score] - scores[batch][analyseSentence][score];
|
||||
scoreChange += currentScoreChange * weights[score];
|
||||
norm += currentScoreChange * currentScoreChange;
|
||||
}
|
||||
*/
|
||||
Moses::ScoreComponentCollection currentScoreColl = oracleScores;
|
||||
currentScoreColl.MinusEquals(scores[batch][analyseSentence]);
|
||||
currentScoreColl.MultiplyEquals(weights);
|
||||
float scoreChange = currentScoreColl.InnerProduct(weights);
|
||||
float norm = currentScoreColl.InnerProduct(currentScoreColl);
|
||||
cout << "Selected clipping scheme: " << m_clippingScheme << endl;
|
||||
cout << "lower bound: " << m_lowerBound << endl;
|
||||
cout << "upper bound: " << m_upperBound << endl;
|
||||
|
||||
float delta;
|
||||
if(norm == 0.0) //just in case... :-)
|
||||
delta = 0.0;
|
||||
else {
|
||||
|
||||
delta = (losses[batch][analyseSentence] - scoreChange) / norm;
|
||||
vector< float> alphas(3*m_n);
|
||||
for(size_t batch = 0; batch < featureValues.size(); batch++) {
|
||||
if (m_clippingScheme == 2) {
|
||||
// initialise alphas for each source (alpha for oracle translation = C, all other alphas = 0)
|
||||
for (size_t j = 0; j < 3*m_n; ++j) {
|
||||
if (j == m_n) {
|
||||
// oracle
|
||||
alphas[j] = m_upperBound;
|
||||
std::cout << "alpha " << j << ": " << alphas[j] << endl;
|
||||
}
|
||||
else {
|
||||
alphas[j] = 0;
|
||||
std::cout << "alpha " << j << ": " << alphas[j] << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//now get in shape
|
||||
if(delta > upperBound_)
|
||||
delta = upperBound_;
|
||||
else if(delta < lowerBound_)
|
||||
delta = lowerBound_;
|
||||
|
||||
cout << "scoreChange: " << scoreChange
|
||||
<< "\ndelta: " << delta
|
||||
<< "\nloss: " << losses[batch][analyseSentence] << endl;
|
||||
}
|
||||
|
||||
// do this: weights += delta * (oracleScores - scores[batch][analyseSentence])
|
||||
Moses::ScoreComponentCollection tempColl = oracleScores;
|
||||
tempColl.MinusEquals(scores[batch][analyseSentence]);
|
||||
tempColl.MultiplyEquals(delta);
|
||||
weights.MinusEquals(tempColl);
|
||||
// iterate over nbest lists of translations, feature list contains n*model, n*hope, n*fear)
|
||||
// Combinations for j and j': hope/fear, hope/model, model/fear?
|
||||
// Currently we compare each hope against each fear (10x10),
|
||||
// each hope against each model (10x10), each model against each fear translation (10x10)
|
||||
for (size_t j = 0; j < m_n; ++j) {
|
||||
size_t indexModel_j = j;
|
||||
size_t indexHope_j = j + m_n; // e_ij'
|
||||
size_t indexFear_j = j + 2*m_n; // e_ij
|
||||
|
||||
float tmp = losses[batch][analyseSentence] - oracleScores.InnerProduct(weights);
|
||||
if(tmp > maxTranslation)
|
||||
maxTranslation = tmp;
|
||||
|
||||
//calculate max. for criterion
|
||||
/*
|
||||
float sumWeightedFeatures = 0.0;
|
||||
for(unsigned score = 0; score < scores[analyseSentence]->size(); score++) {
|
||||
sumWeightedFeatures += oracleScores[score]*newWeights[score];
|
||||
}
|
||||
for (size_t k = 0; k < m_n; ++k) {
|
||||
size_t indexModel_k = k;
|
||||
size_t indexHope_k = k + m_n; // e_ij'
|
||||
size_t indexFear_k = k + 2*m_n; // e_ij
|
||||
|
||||
if((losses[analyseSentence] - sumWeightedFeatures) > maxTranslation_) {
|
||||
maxTranslation_ = losses[analyseSentence] - sumWeightedFeatures;
|
||||
}
|
||||
*/
|
||||
}
|
||||
oldWeights.MinusEquals(weights);
|
||||
float criterion = 0.5*oldWeights.InnerProduct(oldWeights) + 0.01*maxTranslation;
|
||||
cout << "criterion: " << criterion << endl;
|
||||
// Hypothesis pair hope/fear
|
||||
// Compute delta:
|
||||
cout << "\nComparing hope/fear (" << indexHope_j << "," << indexFear_k << ")" << endl;
|
||||
ScoreComponentCollection featureValueDiffs;
|
||||
float delta = computeDelta(currWeights, featureValues[batch], indexHope_j, indexFear_k, losses[batch], alphas, featureValueDiffs);
|
||||
|
||||
// update weight vector:
|
||||
if (delta != 0) {
|
||||
update(currWeights, featureValueDiffs, delta);
|
||||
++numberOfUpdates;
|
||||
}
|
||||
|
||||
// Hypothesis pair hope/model
|
||||
// Compute delta:
|
||||
cout << "\nComparing hope/model (" << indexHope_j << "," << indexModel_k << ")" << endl;
|
||||
featureValueDiffs.ZeroAll();
|
||||
delta = computeDelta(currWeights, featureValues[batch], indexHope_j, indexModel_k, losses[batch], alphas, featureValueDiffs);
|
||||
|
||||
// update weight vector:
|
||||
if (delta != 0) {
|
||||
update(currWeights, featureValueDiffs, delta);
|
||||
++numberOfUpdates;
|
||||
}
|
||||
|
||||
// Hypothesis pair model/fear
|
||||
// Compute delta:
|
||||
cout << "\nComparing model/fear (" << indexModel_j << "," << indexFear_k << ")" << endl;
|
||||
featureValueDiffs.ZeroAll();
|
||||
delta = computeDelta(currWeights, featureValues[batch], indexModel_j, indexFear_k, losses[batch], alphas, featureValueDiffs);
|
||||
|
||||
// update weight vector:
|
||||
if (delta != 0) {
|
||||
update(currWeights, featureValueDiffs, delta);
|
||||
++numberOfUpdates;
|
||||
}
|
||||
}
|
||||
|
||||
cout << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cout << "Number of updates: " << numberOfUpdates << endl;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute delta for weight update.
|
||||
* As part of this compute feature value differences
|
||||
* Dh_ij - Dh_ij' ---> h(e_ij') - h(e_ij)) --> h(hope) - h(fear)
|
||||
* which are used in the delta term and in the weight update term.
|
||||
*/
|
||||
float MiraOptimiser::computeDelta(ScoreComponentCollection& currWeights,
|
||||
const vector< ScoreComponentCollection>& featureValues,
|
||||
const size_t indexHope,
|
||||
const size_t indexFear,
|
||||
const vector< float>& losses,
|
||||
vector< float>& alphas,
|
||||
ScoreComponentCollection& featureValueDiffs) {
|
||||
|
||||
const ScoreComponentCollection featureValuesHope = featureValues[indexHope]; // hypothesis j'
|
||||
const ScoreComponentCollection featureValuesFear = featureValues[indexFear]; // hypothesis j
|
||||
|
||||
// compute delta
|
||||
float delta = 0.0;
|
||||
float diffOfModelScores = 0.0; // (Dh_ij - Dh_ij') * w' ---> (h(e_ij') - h(e_ij))) * w' (inner product)
|
||||
float squaredNorm = 0.0; // ||Dh_ij - Dh_ij'||^2 ---> sum over squares of elements of h(e_ij') - h(e_ij)
|
||||
|
||||
featureValueDiffs = featureValuesHope;
|
||||
featureValueDiffs.MinusEquals(featureValuesFear);
|
||||
cout << "feature value diffs: " << featureValueDiffs << endl;
|
||||
squaredNorm = featureValueDiffs.InnerProduct(featureValueDiffs);
|
||||
diffOfModelScores = featureValueDiffs.InnerProduct(currWeights);
|
||||
|
||||
if (squaredNorm == 0.0) {
|
||||
delta = 0.0;
|
||||
}
|
||||
else {
|
||||
// loss difference used to compute delta: (l_ij - l_ij') ---> B(e_ij') - B(e_ij)
|
||||
// TODO: simplify and use BLEU scores of hypotheses directly?
|
||||
float lossDiff = losses[indexFear] - losses[indexHope];
|
||||
delta = (lossDiff - diffOfModelScores) / squaredNorm;
|
||||
cout << "delta: " << delta << endl;
|
||||
cout << "loss diff - model diff: " << lossDiff << " - " << diffOfModelScores << endl;
|
||||
|
||||
// clipping
|
||||
switch (m_clippingScheme) {
|
||||
case 1:
|
||||
if (delta > m_upperBound) {
|
||||
cout << "clipping " << delta << " to " << m_upperBound << endl;
|
||||
delta = m_upperBound;
|
||||
}
|
||||
else if (delta < m_lowerBound) {
|
||||
cout << "clipping " << delta << " to " << m_lowerBound << endl;
|
||||
delta = m_lowerBound;
|
||||
}
|
||||
|
||||
// TODO: update
|
||||
//m_lowerBound += delta;
|
||||
//m_upperBound -= delta;
|
||||
//cout << "m_lowerBound = " << m_lowerBound << endl;
|
||||
//cout << "m_upperBound = " << m_upperBound << endl;
|
||||
|
||||
break;
|
||||
case 2:
|
||||
// fear translation: e_ij --> alpha_ij = alpha_ij + delta
|
||||
// hope translation: e_ij' --> alpha_ij' = alpha_ij' - delta
|
||||
// clipping interval: [-alpha_ij, alpha_ij']
|
||||
// clip delta
|
||||
cout << "Interval [" << (-1 * alphas[indexFear]) << "," << alphas[indexHope] << "]" << endl;
|
||||
if (delta > alphas[indexHope]) {
|
||||
cout << "clipping " << delta << " to " << alphas[indexHope] << endl;
|
||||
delta = alphas[indexHope];
|
||||
}
|
||||
else if (delta < (-1 * alphas[indexFear])) {
|
||||
cout << "clipping " << delta << " to " << (-1 * alphas[indexFear]) << endl;
|
||||
delta = (-1 * alphas[indexFear]);
|
||||
}
|
||||
|
||||
// update alphas
|
||||
alphas[indexHope] -= delta;
|
||||
alphas[indexFear] += delta;
|
||||
cout << "alpha[" << indexHope << "] = " << alphas[indexHope] << endl;
|
||||
cout << "alpha[" << indexFear << "] = " << alphas[indexFear] << endl;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return delta;
|
||||
}
|
||||
|
||||
/*
|
||||
* Update the weight vector according to delta and the feature value difference
|
||||
* w' = w' + delta * (Dh_ij - Dh_ij') ---> w' = w' + delta * (h(e_ij') - h(e_ij)))
|
||||
*/
|
||||
void MiraOptimiser::update(ScoreComponentCollection& currWeights, ScoreComponentCollection& featureValueDiffs, const float delta) {
|
||||
featureValueDiffs.MultiplyEquals(delta);
|
||||
currWeights.PlusEquals(featureValueDiffs);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -54,24 +54,43 @@ namespace Mira {
|
||||
const Moses::ScoreComponentCollection& oracleScores);
|
||||
};
|
||||
|
||||
|
||||
class MiraOptimiser : public Optimiser {
|
||||
public:
|
||||
MiraOptimiser(float lowerBound, float upperBound) :
|
||||
Optimiser(),
|
||||
lowerBound_(lowerBound),
|
||||
upperBound_(upperBound) { }
|
||||
MiraOptimiser() :
|
||||
Optimiser() { }
|
||||
|
||||
~MiraOptimiser() {}
|
||||
MiraOptimiser(size_t n, size_t clippingScheme, float lowerBound, float upperBound) :
|
||||
Optimiser(),
|
||||
m_n(n),
|
||||
m_clippingScheme(clippingScheme),
|
||||
m_lowerBound(lowerBound),
|
||||
m_upperBound(upperBound) { }
|
||||
|
||||
~MiraOptimiser() {}
|
||||
|
||||
virtual void updateWeights(Moses::ScoreComponentCollection& weights,
|
||||
const std::vector< std::vector<Moses::ScoreComponentCollection> >& scores,
|
||||
const std::vector< std::vector<float> >& losses,
|
||||
const Moses::ScoreComponentCollection& oracleScores);
|
||||
const std::vector< std::vector<Moses::ScoreComponentCollection> >& scores,
|
||||
const std::vector< std::vector<float> >& losses,
|
||||
const Moses::ScoreComponentCollection& oracleScores);
|
||||
float computeDelta(Moses::ScoreComponentCollection& currWeights,
|
||||
const std::vector< Moses::ScoreComponentCollection>& featureValues,
|
||||
const size_t indexHope,
|
||||
const size_t indexFear,
|
||||
const std::vector< float>& losses,
|
||||
std::vector< float>& alphas,
|
||||
Moses::ScoreComponentCollection& featureValueDiffs);
|
||||
void update(Moses::ScoreComponentCollection& currWeights, Moses::ScoreComponentCollection& featureValueDiffs, const float delta);
|
||||
|
||||
private:
|
||||
float lowerBound_;
|
||||
float upperBound_;
|
||||
// number of hypotheses used for each nbest list (number of hope, fear, best model translations)
|
||||
size_t m_n;
|
||||
|
||||
// clipping scheme for weight updates
|
||||
// 1: equal, 2: varied
|
||||
size_t m_clippingScheme;
|
||||
|
||||
float m_lowerBound;
|
||||
float m_upperBound;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -25,19 +25,18 @@ using namespace std;
|
||||
namespace Mira {
|
||||
|
||||
void Perceptron::updateWeights(ScoreComponentCollection& currWeights,
|
||||
const vector< vector<ScoreComponentCollection> >& scores,
|
||||
const vector<vector<float> >& losses,
|
||||
const ScoreComponentCollection& oracleScores)
|
||||
const vector< vector<ScoreComponentCollection> >& scores,
|
||||
const vector<vector<float> >& losses,
|
||||
const ScoreComponentCollection& oracleScores)
|
||||
{
|
||||
for (size_t i = 0; i < scores.size(); ++i) {
|
||||
for (size_t j = 0; j < scores[i].size(); ++j) {
|
||||
if (losses[i][j] > 0) {
|
||||
currWeights.MinusEquals(scores[i][j]);
|
||||
currWeights.PlusEquals(oracleScores);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < scores.size(); ++i) {
|
||||
for (size_t j = 0; j < scores[i].size(); ++j) {
|
||||
if (losses[i][j] > 0) {
|
||||
currWeights.MinusEquals(scores[i][j]);
|
||||
currWeights.PlusEquals(oracleScores);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user