For the hope-fear option, add constraints only between hope and fear translations, not between hope translations. Make the sizes of the hope and fear lists flexible with --hope-n and --fear-n.

git-svn-id: http://svn.statmt.org/repository/mira@3897 cc96ff50-19ce-11e0-b349-13d7f0bd23df
ehasler 2011-05-16 16:56:52 +00:00 committed by Ondrej Bojar
parent a177b58d18
commit 020c71216b
4 changed files with 533 additions and 130 deletions
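Before the diffs, a minimal sketch of the constraint scheme this commit introduces, using simplified stand-in types (hypothetical; the real code below works on Moses::ScoreComponentCollection and decoder-supplied BLEU scores). Each of the hope_n hope translations of a sentence is paired with each of its fear_n fear translations, so constraints relate hope candidates to fear candidates only, never two hope candidates to each other:

#include <cstddef>
#include <vector>

// Stand-in types for illustration only, not Moses classes.
struct Hypothesis {
    std::vector<float> features; // decoder feature values h(e)
    float bleu;                  // approximate sentence-level BLEU
};

struct Constraint {
    std::vector<float> featureDiff; // h(hope) - h(fear)
    float loss;                     // BLEU(hope) - BLEU(fear)
};

// One constraint per (hope, fear) pair: hope_n * fear_n per sentence,
// none between two hope (or two fear) translations.
std::vector<Constraint> makeHopeFearConstraints(
        const std::vector<Hypothesis>& hope,
        const std::vector<Hypothesis>& fear) {
    std::vector<Constraint> constraints;
    for (std::size_t j = 0; j < hope.size(); ++j) {
        for (std::size_t k = 0; k < fear.size(); ++k) {
            Constraint c;
            c.featureDiff.resize(hope[j].features.size());
            for (std::size_t d = 0; d < hope[j].features.size(); ++d) {
                c.featureDiff[d] = hope[j].features[d] - fear[k].features[d];
            }
            c.loss = hope[j].bleu - fear[k].bleu;
            constraints.push_back(c);
        }
    }
    return constraints;
}

This mirrors the nested j/k loops added to MiraOptimiser::updateWeightsHopeFear below.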

View File

@@ -144,7 +144,6 @@ int main(int argc, char** argv) {
float min_sentence_update;
size_t weightedLossFunction;
size_t n;
size_t nbest_first;
size_t batchSize;
bool distinctNbest;
bool onlyViolatedConstraints;
@@ -190,7 +189,8 @@ int main(int argc, char** argv) {
bool analytical_update;
bool perceptron_update;
bool hope_fear;
size_t constraints;
int hope_n;
int fear_n;
po::options_description desc("Allowed options");
desc.add_options()
("accumulate-most-violated-constraints", po::value<bool>(&accumulateMostViolatedConstraints)->default_value(false),"Accumulate most violated constraint per example")
@@ -206,7 +206,6 @@ int main(int argc, char** argv) {
("burn-in-input-file", po::value<string>(&burnInInputFile), "Input file for burn-in phase of BLEU history")
("burn-in-reference-files", po::value<vector<string> >(&burnInReferenceFiles), "Reference file for burn-in phase of BLEU history")
("config,f", po::value<string>(&mosesConfigFile), "Moses ini file")
("constraints", po::value<size_t>(&constraints)->default_value(1), "Number of constraints used for analytical update")
("control-updates", po::value<bool>(&controlUpdates)->default_value(true), "Ignore updates that increase number of violated constraints AND increase the error")
("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
("decr-learning-rate", po::value<float>(&decrease_learning_rate)->default_value(0),"Decrease learning rate by the given value after every epoch")
@@ -215,11 +214,13 @@ int main(int argc, char** argv) {
("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use nbest list with distinct translations in inference step")
("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
("epochs,e", po::value<size_t>(&epochs)->default_value(5), "Number of epochs")
("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
("hildreth", po::value<bool>(&hildreth)->default_value(true), "Use Hildreth's optimisation algorithm")
("history-of-1best", po::value<bool>(&historyOf1best)->default_value(0), "Use the 1best translation to update the history")
("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations (not model)")
("hope-n", po::value<int>(&hope_n)->default_value(-1), "Number of hope translations used")
("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
("learning-rate", po::value<float>(&learning_rate)->default_value(1), "Learning rate (fixed or flexible)")
@@ -236,8 +237,7 @@ int main(int argc, char** argv) {
("msf-step", po::value<float>(&marginScaleFactorStep)->default_value(0), "Decrease margin scale factor iteratively by the value provided")
("multiplyA", po::value<bool>(&multiplyA)->default_value(true), "Multiply A with outcome before passing to Hildreth")
("nbest,n", po::value<size_t>(&n)->default_value(10), "Number of translations in nbest list")
("nbest-first", po::value<size_t>(&nbest_first)->default_value(0), "Number of translations in nbest list in the first epoch")
("normalise", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
("normalise", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
("only-violated-constraints", po::value<bool>(&onlyViolatedConstraints)->default_value(false), "Add only violated constraints to the optimisation problem")
("past-and-current-constraints", po::value<bool>(&pastAndCurrentConstraints)->default_value(false), "Accumulate most violated constraint per example and use them along all current constraints")
("perceptron-update", po::value<bool>(&perceptron_update)->default_value(false), "Do a simple perceptron style update")
@@ -295,8 +295,9 @@ int main(int argc, char** argv) {
return 1;
}
if (nbest_first == 0) {
nbest_first = n;
if (hope_n == -1 && fear_n == -1) {
hope_n = n;
fear_n = n;
}
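// With --hope-n and --fear-n the two list sizes may differ; when neither
// flag is given, both inherit the generic --nbest size n (-1 acts as an
// "unset" sentinel). A hypothetical invocation with placeholder file names,
// assuming the built binary is named mira:
//   mira -f moses.ini -i source.txt -e 5 --hope-fear true --hope-n 10 --fear-n 30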
// load input and references
@@ -486,7 +487,6 @@ int main(int argc, char** argv) {
cerr << "msf-min: " << marginScaleFactorMin << endl;
cerr << "weighted-loss-function: " << weightedLossFunction << endl;
cerr << "nbest: " << n << endl;
cerr << "nbest-first: " << nbest_first << endl;
cerr << "batch-size: " << batchSize << endl;
cerr << "distinct-nbest: " << distinctNbest << endl;
cerr << "only-violated-constraints: " << onlyViolatedConstraints << endl;
@@ -523,6 +523,8 @@ int main(int argc, char** argv) {
cerr << "perceptron-update: " << perceptron_update << endl;
cerr << "analytical-update: " << analytical_update << endl;
cerr << "hope-fear: " << hope_fear << endl;
cerr << "hope-n: " << hope_n << endl;
cerr << "fear-n: " << fear_n << endl;
if (learner == "mira") {
cerr << "Optimising using Mira" << endl;
@@ -608,6 +610,12 @@ int main(int argc, char** argv) {
vector<vector<float> > bleuScores;
vector<vector<float> > dummyBleuScores;
// variables for hope-fear setting
vector<vector<ScoreComponentCollection> > featureValuesHope;
vector<vector<ScoreComponentCollection> > featureValuesFear;
vector<vector<float> > bleuScoresHope;
vector<vector<float> > bleuScoresFear;
// get moses weights
ScoreComponentCollection mosesWeights = decoder->getWeights();
cerr << "\nRank " << rank << ", next batch" << endl;
@@ -632,77 +640,22 @@ int main(int argc, char** argv) {
vector<ScoreComponentCollection> newFeatureValues;
vector<float> newBleuScores;
featureValues.push_back(newFeatureValues);
dummyFeatureValues.push_back(newFeatureValues);
bleuScores.push_back(newBleuScores);
dummyBleuScores.push_back(newBleuScores);
size_t pass_n = (epoch == 0)? nbest_first : n;
if (perceptron_update || analytical_update) {
if (constraints == 1) {
if (historyOf1best) {
// MODEL (for updating the history)
cerr << "Rank " << rank << ", run decoder to get " << 1 << "best wrt model score" << endl;
vector<const Word*> bestModel = decoder->getNBest(input, *sid, 1, 0.0, bleuScoreWeight,
dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
oneBests.push_back(bestModel);
cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << dummyBleuScores[batchPosition][0] << endl;
}
// HOPE
cerr << "Rank " << rank << ", run decoder to get 1best hope translations" << endl;
size_t oraclePos = dummyFeatureValues[batchPosition].size();
vector<const Word*> oracle = decoder->getNBest(input, *sid, 1, 1.0, bleuScoreWeight,
dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
distinctNbest, rank);
// needed for history
inputLengths.push_back(decoder->getCurrentInputLength());
ref_ids.push_back(*sid);
decoder->cleanup();
oracles.push_back(oracle);
cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << dummyBleuScores[batchPosition][oraclePos] << endl;
oracleFeatureValues.push_back(dummyFeatureValues[batchPosition][oraclePos]);
oracleBleuScores.push_back(dummyBleuScores[batchPosition][oraclePos]);
// clear dummies
dummyFeatureValues[batchPosition].clear();
dummyBleuScores[batchPosition].clear();
// FEAR
cerr << "Rank " << rank << ", run decoder to get 1best fear translations" << endl;
size_t fearPos = featureValues[batchPosition].size();
vector<const Word*> fear = decoder->getNBest(input, *sid, 1, -1.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScores[batchPosition][fearPos] << endl;
for (size_t i = 0; i < fear.size(); ++i) {
delete fear[i];
}
}
else {
// TODO:
}
if (hope_fear) {
featureValuesHope.push_back(newFeatureValues);
featureValuesFear.push_back(newFeatureValues);
bleuScoresHope.push_back(newBleuScores);
bleuScoresFear.push_back(newBleuScores);
}
else {
if (!hope_fear) {
// MODEL
cerr << "Rank " << rank << ", run decoder to get " << pass_n << "best wrt model score" << endl;
vector<const Word*> bestModel = decoder->getNBest(input, *sid, pass_n, 0.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
oneBests.push_back(bestModel);
// needed for calculating bleu of dev (1best translations) // todo:
all_ref_ids.push_back(*sid);
allBestModelScore.push_back(bestModel);
cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << bleuScores[batchPosition][0] << endl;
}
else if (historyOf1best) {
// MODEL (for updating the history only, using dummy vectors)
featureValues.push_back(newFeatureValues);
dummyFeatureValues.push_back(newFeatureValues);
bleuScores.push_back(newBleuScores);
dummyBleuScores.push_back(newBleuScores);
}
if (perceptron_update || analytical_update) {
if (historyOf1best) {
// MODEL (for updating the history)
cerr << "Rank " << rank << ", run decoder to get " << 1 << "best wrt model score" << endl;
vector<const Word*> bestModel = decoder->getNBest(input, *sid, 1, 0.0, bleuScoreWeight,
dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
@@ -713,33 +666,114 @@ int main(int argc, char** argv) {
}
// HOPE
cerr << "Rank " << rank << ", run decoder to get " << pass_n << "best hope translations" << endl;
size_t oraclePos = featureValues[batchPosition].size();
vector<const Word*> oracle = decoder->getNBest(input, *sid, pass_n, 1.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
cerr << "Rank " << rank << ", run decoder to get 1best hope translations" << endl;
size_t oraclePos = dummyFeatureValues[batchPosition].size();
vector<const Word*> oracle = decoder->getNBest(input, *sid, 1, 1.0, bleuScoreWeight,
dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
distinctNbest, rank);
// needed for history
inputLengths.push_back(decoder->getCurrentInputLength());
ref_ids.push_back(*sid);
decoder->cleanup();
oracles.push_back(oracle);
cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScores[batchPosition][oraclePos] << endl;
cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << dummyBleuScores[batchPosition][oraclePos] << endl;
oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
oracleFeatureValues.push_back(dummyFeatureValues[batchPosition][oraclePos]);
oracleBleuScores.push_back(dummyBleuScores[batchPosition][oraclePos]);
// clear dummies
dummyFeatureValues[batchPosition].clear();
dummyBleuScores[batchPosition].clear();
// FEAR
cerr << "Rank " << rank << ", run decoder to get " << pass_n << "best fear translations" << endl;
cerr << "Rank " << rank << ", run decoder to get 1best fear translations" << endl;
size_t fearPos = featureValues[batchPosition].size();
vector<const Word*> fear = decoder->getNBest(input, *sid, pass_n, -1.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
vector<const Word*> fear = decoder->getNBest(input, *sid, 1, -1.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScores[batchPosition][fearPos] << endl;
for (size_t i = 0; i < fear.size(); ++i) {
delete fear[i];
}
}
else {
if (hope_fear) {
if (historyOf1best) {
// MODEL (for updating the history only, using dummy vectors)
cerr << "Rank " << rank << ", run decoder to get " << 1 << "best wrt model score" << endl;
vector<const Word*> bestModel = decoder->getNBest(input, *sid, 1, 0.0, bleuScoreWeight,
dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
oneBests.push_back(bestModel);
cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << dummyBleuScores[batchPosition][0] << endl;
}
// HOPE
cerr << "Rank " << rank << ", run decoder to get " << hope_n << "best hope translations" << endl;
vector<const Word*> oracle = decoder->getNBest(input, *sid, hope_n, 1.0, bleuScoreWeight,
featureValuesHope[batchPosition], bleuScoresHope[batchPosition], true,
distinctNbest, rank);
// needed for history
inputLengths.push_back(decoder->getCurrentInputLength());
ref_ids.push_back(*sid);
decoder->cleanup();
oracles.push_back(oracle);
cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScoresHope[batchPosition][0] << endl;
// FEAR
cerr << "Rank " << rank << ", run decoder to get " << fear_n << "best fear translations" << endl;
vector<const Word*> fear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuScoreWeight,
featureValuesFear[batchPosition], bleuScoresFear[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScoresFear[batchPosition][0] << endl;
for (size_t i = 0; i < fear.size(); ++i) {
delete fear[i];
}
}
else {
// MODEL
cerr << "Rank " << rank << ", run decoder to get " << n << "best wrt model score" << endl;
vector<const Word*> bestModel = decoder->getNBest(input, *sid, n, 0.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
oneBests.push_back(bestModel);
// needed for calculating bleu of dev (1best translations) // todo:
all_ref_ids.push_back(*sid);
allBestModelScore.push_back(bestModel);
cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << bleuScores[batchPosition][0] << endl;
// HOPE
cerr << "Rank " << rank << ", run decoder to get " << n << "best hope translations" << endl;
size_t oraclePos = featureValues[batchPosition].size();
vector<const Word*> oracle = decoder->getNBest(input, *sid, n, 1.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
// needed for history
inputLengths.push_back(decoder->getCurrentInputLength());
ref_ids.push_back(*sid);
decoder->cleanup();
oracles.push_back(oracle);
cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScores[batchPosition][oraclePos] << endl;
oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
// FEAR
cerr << "Rank " << rank << ", run decoder to get " << n << "best fear translations" << endl;
size_t fearPos = featureValues[batchPosition].size();
vector<const Word*> fear = decoder->getNBest(input, *sid, n, -1.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScores[batchPosition][fearPos] << endl;
for (size_t i = 0; i < fear.size(); ++i) {
delete fear[i];
}
}
}
// cerr << "Rank " << rank << ", sentence " << *sid << ", best model Bleu (approximate sentence bleu): " << bleuScores[batchPosition][0] << endl;
// summedApproxBleu += bleuScores[batchPosition][0];
@@ -750,12 +784,13 @@ int main(int argc, char** argv) {
++shardPosition;
} // end of batch loop
// Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
vector<vector<float> > losses(actualBatchSize);
for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
losses[batchPosition].push_back(oracleBleuScores[batchPosition]
- bleuScores[batchPosition][j]);
if (!hope_fear) {
// Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
losses[batchPosition].push_back(oracleBleuScores[batchPosition] - bleuScores[batchPosition][j]);
}
}
}
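// Note: in the hope-fear path the losses vector stays empty here; the
// analogous quantity, BLEU(hope) - BLEU(fear) per constraint, is computed
// later inside updateWeightsHopeFear.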
@@ -766,11 +801,21 @@ int main(int argc, char** argv) {
if (logFeatureValues) {
for (size_t i = 0; i < featureValues.size(); ++i) {
for (size_t j = 0; j < featureValues[i].size(); ++j) {
featureValues[i][j].ApplyLog(baseOfLog);
if (hope_fear) {
for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
featureValuesHope[i][j].ApplyLog(baseOfLog);
}
for (size_t j = 0; j < featureValuesFear[i].size(); ++j) {
featureValuesFear[i][j].ApplyLog(baseOfLog);
}
}
else {
for (size_t j = 0; j < featureValues[i].size(); ++j) {
featureValues[i][j].ApplyLog(baseOfLog);
}
oracleFeatureValues[i].ApplyLog(baseOfLog);
}
}
}
@@ -786,12 +831,29 @@ int main(int argc, char** argv) {
// optionally print out the feature values
if (print_feature_values) {
cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
for (size_t i = 0; i < featureValues.size(); ++i) {
for (size_t j = 0; j < featureValues[i].size(); ++j) {
cerr << featureValues[i][j] << endl;
if (hope_fear) {
cerr << "hope: " << endl;
for (size_t i = 0; i < featureValuesHope.size(); ++i) {
for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
cerr << featureValuesHope[i][j] << endl;
}
}
cerr << "fear: " << endl;
for (size_t i = 0; i < featureValuesFear.size(); ++i) {
for (size_t j = 0; j < featureValuesFear[i].size(); ++j) {
cerr << featureValuesFear[i][j] << endl;
}
}
cerr << endl;
}
else {
for (size_t i = 0; i < featureValues.size(); ++i) {
for (size_t j = 0; j < featureValues[i].size(); ++j) {
cerr << featureValues[i][j] << endl;
}
}
cerr << endl;
}
cerr << endl;
}
// Run optimiser on batch:
@@ -818,9 +880,16 @@ int main(int argc, char** argv) {
learning_rate, max_sentence_update, rank, epoch, controlUpdates);
}
else {
update_status = optimiser->updateWeights(mosesWeights, featureValues,
losses, bleuScores, oracleFeatureValues, oracleBleuScores, ref_ids,
learning_rate, max_sentence_update, rank, epoch, updates_per_epoch, controlUpdates);
if (hope_fear) {
update_status = optimiser->updateWeightsHopeFear(mosesWeights,
featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, ref_ids,
learning_rate, max_sentence_update, rank, epoch, updates_per_epoch, controlUpdates);
}
else {
update_status = optimiser->updateWeights(mosesWeights, featureValues,
losses, bleuScores, oracleFeatureValues, oracleBleuScores, ref_ids,
learning_rate, max_sentence_update, rank, epoch, updates_per_epoch, controlUpdates);
}
}
if (update_status[0] == 1) {

View File

@@ -11,8 +11,12 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
const vector<vector<float> >& losses,
const vector<vector<float> >& bleuScores,
const vector<ScoreComponentCollection>& oracleFeatureValues,
const vector<float> oracleBleuScores, const vector<size_t> sentenceIds,
float learning_rate, float max_sentence_update, size_t rank, size_t epoch,
const vector<float> oracleBleuScores,
const vector<size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates) {
@@ -79,6 +83,7 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
featureValueDiff.MinusEquals(featureValues[i][j]);
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
if (modelScoreDiff == 0) {
cerr << "equal feature values, constraint skipped.." << endl;
continue;
}
@@ -312,6 +317,280 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
return statusPlus;
}
vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,
const std::vector<std::vector<float> >& bleuScoresFear,
const std::vector< size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates) {
// vector of feature values differences for all created constraints
vector<ScoreComponentCollection> featureValueDiffs;
vector<float> lossMinusModelScoreDiffs;
vector<float> all_losses;
// most violated constraint in batch
ScoreComponentCollection max_batch_featureValueDiff;
float max_batch_loss = -1;
float max_batch_lossMinusModelScoreDiff = -1;
// Make constraints for new hypothesis translations
float epsilon = 0.0001;
int violatedConstraintsBefore = 0;
float oldDistanceFromOptimum = 0;
// iterate over input sentences (1 (online) or more (batch))
for (size_t i = 0; i < featureValuesHope.size(); ++i) {
size_t sentenceId = sentenceIds[i];
// Pair all hope translations with all fear translations for one input sentence
for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
for (size_t k = 0; k < featureValuesFear[i].size(); ++k) {
ScoreComponentCollection featureValueDiff = featureValuesHope[i][j];
featureValueDiff.MinusEquals(featureValuesFear[i][k]);
cerr << "feature value diff: " << featureValueDiff << endl;
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
if (modelScoreDiff == 0) {
cerr << "equal feature values, constraint skipped.." << endl;
continue;
}
float loss = bleuScoresHope[i][j] - bleuScoresFear[i][k];
loss *= m_marginScaleFactor;
if (m_weightedLossFunction == 1) {
loss *= bleuScoresHope[i][j];
}
else if (m_weightedLossFunction == 2) {
loss *= log2(bleuScoresHope[i][j]);
}
else if (m_weightedLossFunction == 10) {
loss *= log10(bleuScoresHope[i][j]);
}
// check if constraint is violated
bool violated = false;
bool addConstraint = true;
float diff = loss - modelScoreDiff;
cerr << "constraint: " << modelScoreDiff << " >= " << loss << endl;
if (diff > (epsilon + m_precision)) {
violated = true;
cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << diff << " (loss: " << loss << ")" << endl;
}
else if (m_onlyViolatedConstraints) {
addConstraint = false;
}
float lossMinusModelScoreDiff = loss - modelScoreDiff;
if (violated) {
if (m_accumulateMostViolatedConstraints || m_pastAndCurrentConstraints) {
// find the most violated constraint per batch
if (lossMinusModelScoreDiff > max_batch_lossMinusModelScoreDiff) {
max_batch_lossMinusModelScoreDiff = lossMinusModelScoreDiff;
max_batch_featureValueDiff = featureValueDiff;
max_batch_loss = loss;
}
}
}
if (addConstraint && !m_accumulateMostViolatedConstraints) {
featureValueDiffs.push_back(featureValueDiff);
lossMinusModelScoreDiffs.push_back(lossMinusModelScoreDiff);
all_losses.push_back(loss);
if (violated) {
++violatedConstraintsBefore;
oldDistanceFromOptimum += diff;
}
}
}
}
}
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
cerr << "Rank " << rank << ", epoch " << epoch << ", number of current constraints: " << featureValueDiffs.size() << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", number of current violated constraints: " << violatedConstraintsBefore << endl;
}
if (m_max_number_oracles == 1) {
for (size_t k = 0; k < sentenceIds.size(); ++k) {
size_t sentenceId = sentenceIds[k];
m_oracles[sentenceId].clear();
}
}
size_t pastViolatedConstraints = 0;
// Add constraints from past iterations (BEFORE updating that list)
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
// add all past (most violated) constraints to the list of current constraints, computed with current weights!
for (size_t i = 0; i < m_featureValueDiffs.size(); ++i) {
float modelScoreDiff = m_featureValueDiffs[i].InnerProduct(currWeights);
// check if constraint is violated
bool violated = false;
bool addConstraint = true;
float diff = m_losses[i] - modelScoreDiff;
if (diff > (epsilon + m_precision)) {
violated = true;
cerr << "Rank " << rank << ", epoch " << epoch << ", past violation: " << diff << " (loss: " << m_losses[i] << ")" << endl;
}
else if (m_onlyViolatedConstraints) {
addConstraint = false;
}
if (addConstraint) {
featureValueDiffs.push_back(m_featureValueDiffs[i]);
lossMinusModelScoreDiffs.push_back(m_losses[i] - modelScoreDiff);
all_losses.push_back(m_losses[i]);
// cerr << "old constraint: " << modelScoreDiff << " >= " << m_losses[i] << endl;
if (violated) {
++violatedConstraintsBefore;
++pastViolatedConstraints;
oldDistanceFromOptimum += diff;
}
}
}
}
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
cerr << "Rank " << rank << ", epoch " << epoch << ", number of past constraints: " << m_featureValueDiffs.size() << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", number of past violated constraints: " << pastViolatedConstraints << endl;
}
// Add new most violated constraint to the list of current constraints
if (m_accumulateMostViolatedConstraints) {
if (max_batch_loss != -1) {
float modelScoreDiff = max_batch_featureValueDiff.InnerProduct(currWeights);
float diff = max_batch_loss - modelScoreDiff;
++violatedConstraintsBefore;
oldDistanceFromOptimum += diff;
featureValueDiffs.push_back(max_batch_featureValueDiff);
lossMinusModelScoreDiffs.push_back(max_batch_loss - modelScoreDiff);
all_losses.push_back(max_batch_loss);
// cerr << "new constraint: " << modelScoreDiff << " !>= " << max_batch_loss << endl;
}
}
// Update the list of accumulated most violated constraints
if (max_batch_loss != -1) {
bool updated = false;
for (size_t i = 0; i < m_featureValueDiffs.size(); ++i) {
float oldScore = m_featureValueDiffs[i].InnerProduct(currWeights);
float newScore = max_batch_featureValueDiff.InnerProduct(currWeights);
if (abs(oldScore-newScore) < epsilon) {
m_losses[i] = max_batch_loss;
updated = true;
break;
}
}
if (!updated) {
m_featureValueDiffs.push_back(max_batch_featureValueDiff);
m_losses.push_back(max_batch_loss);
}
}
// run optimisation: compute alphas for all given constraints
vector<float> alphas;
ScoreComponentCollection summedUpdate;
if (violatedConstraintsBefore > 0) {
cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " << featureValueDiffs.size() << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", number of violated constraints passed to optimizer: " << violatedConstraintsBefore << endl;
if (m_slack != 0) {
alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs, m_slack);
} else {
alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs);
}
// Update the weight vector according to the alphas and the feature value differences
// * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
float alpha = alphas[k];
cerr << "alpha: " << alpha << endl;
ScoreComponentCollection update(featureValueDiffs[k]);
update.MultiplyEquals(alpha);
// sum up update
summedUpdate.PlusEquals(update);
}
}
else {
cerr << "Rank " << rank << ", epoch " << epoch << ", check, no constraint violated for this batch" << endl;
vector<int> status(3);
status[0] = 1;
status[1] = 0;
status[2] = 0;
return status;
}
ScoreComponentCollection newWeights(currWeights);
newWeights.PlusEquals(summedUpdate);
// Sanity check: are there still violated constraints after optimisation?
int violatedConstraintsAfter = 0;
float newDistanceFromOptimum = 0;
for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
float modelScoreDiff = featureValueDiffs[i].InnerProduct(newWeights);
float loss = all_losses[i];
float diff = loss - modelScoreDiff;
if (diff > (epsilon + m_precision)) {
++violatedConstraintsAfter;
newDistanceFromOptimum += diff;
}
}
cerr << "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl;
if (controlUpdates && violatedConstraintsAfter > 0) {
float distanceChange = oldDistanceFromOptimum - newDistanceFromOptimum;
if ((violatedConstraintsBefore - violatedConstraintsAfter) <= 0 && distanceChange < 0) {
vector<int> statusPlus(3);
statusPlus[0] = -1;
statusPlus[1] = -1;
statusPlus[2] = -1;
return statusPlus;
}
}
// Apply learning rate (fixed or flexible)
if (learning_rate != 1) {
cerr << "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl;
summedUpdate.MultiplyEquals(learning_rate);
cerr << "Rank " << rank << ", epoch " << epoch << ", update after applying learning rate: " << summedUpdate << endl;
}
// Apply threshold scaling
if (max_sentence_update != -1) {
cerr << "Rank " << rank << ", epoch " << epoch << ", update before scaling to max-sentence-update: " << summedUpdate << endl;
summedUpdate.ThresholdScaling(max_sentence_update);
cerr << "Rank " << rank << ", epoch " << epoch << ", update after scaling to max-sentence-update: " << summedUpdate << endl;
}
// Apply update to weight vector or store it for later
if (updates_per_epoch > 0) {
m_accumulatedUpdates.PlusEquals(summedUpdate);
cerr << "Rank " << rank << ", epoch " << epoch << ", new accumulated updates:" << m_accumulatedUpdates << endl;
} else {
// apply update to weight vector
cerr << "Rank " << rank << ", epoch " << epoch << ", weights before update: " << currWeights << endl;
currWeights.PlusEquals(summedUpdate);
cerr << "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl;
}
vector<int> statusPlus(3);
statusPlus[0] = 0;
statusPlus[1] = violatedConstraintsBefore;
statusPlus[2] = violatedConstraintsAfter;
return statusPlus;
}
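// For orientation: the update above is a standard MIRA step. The objective
// is not spelled out in this diff, so the following is an assumed
// reconstruction from the Hildreth::optimise calls and the update-rule
// comment:
//
//   min_{w'}  1/2 * ||w' - w||^2 + C * sum_i xi_i
//   s.t.  w' . (h(hope_i) - h(fear_i)) >= loss_i - xi_i,   xi_i >= 0
//
// where C corresponds to m_slack (the slack-free call is used when
// m_slack == 0) and the dual solution yields the alphas applied as
// w' = w + sum_i alpha_i * featureValueDiffs[i].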
vector<int> MiraOptimiser::updateWeightsAnalytically(ScoreComponentCollection& currWeights,
ScoreComponentCollection& featureValues,
float loss,

View File

@@ -29,7 +29,7 @@ namespace Mira {
class Optimiser {
public:
Optimiser() {}
virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& weights,
virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& featureValues,
float loss,
Moses::ScoreComponentCollection& oracleFeatureValues,
@@ -40,24 +40,36 @@ namespace Mira {
size_t rank,
size_t epoch,
bool controlUpdates) = 0;
virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& weights,
virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValues,
const std::vector< std::vector<float> >& losses,
const std::vector<std::vector<float> >& bleuScores,
const std::vector< Moses::ScoreComponentCollection>& oracleFeatureValues,
const std::vector< float> oracleBleuScores,
const std::vector< size_t> sentenceId,
const std::vector< size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates) = 0;
virtual std::vector<int> updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,
const std::vector<std::vector<float> >& bleuScoresFear,
const std::vector< size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates) = 0;
};
class Perceptron : public Optimiser {
public:
virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& weights,
virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& featureValues,
float loss,
Moses::ScoreComponentCollection& oracleFeatureValues,
@@ -68,19 +80,31 @@ namespace Mira {
size_t rank,
size_t epoch,
bool controlUpdates);
virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& weights,
virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValues,
const std::vector< std::vector<float> >& losses,
const std::vector<std::vector<float> >& bleuScores,
const std::vector<Moses::ScoreComponentCollection>& oracleFeatureValues,
const std::vector< float> oracleBleuScores,
const std::vector< size_t> dummy,
const std::vector< size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates);
virtual std::vector<int> updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,
const std::vector<std::vector<float> >& bleuScoresFear,
const std::vector< size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates);
};
class MiraOptimiser : public Optimiser {
@@ -105,7 +129,7 @@ namespace Mira {
~MiraOptimiser() {}
virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& weights,
virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& featureValues,
float loss,
Moses::ScoreComponentCollection& oracleFeatureValues,
@@ -116,13 +140,25 @@ namespace Mira {
size_t rank,
size_t epoch,
bool controlUpdates);
virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& weights,
virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValues,
const std::vector< std::vector<float> >& losses,
const std::vector<std::vector<float> >& bleuScores,
const std::vector< Moses::ScoreComponentCollection>& oracleFeatureValues,
const std::vector< float> oracleBleuScores,
const std::vector< size_t> sentenceId,
const std::vector< size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates);
virtual std::vector<int> updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,
const std::vector<std::vector<float> >& bleuScoresFear,
const std::vector< size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,

View File

@@ -24,21 +24,40 @@ using namespace std;
namespace Mira {
vector<int> Perceptron::updateWeightsAnalytically(ScoreComponentCollection& currWeights,
ScoreComponentCollection& featureValues,
float loss,
ScoreComponentCollection& oracleFeatureValues,
float oracleBleuScore,
size_t sentenceId,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
bool controlUpdates) {
vector<int> status(1);
status[0] = 0;
return status;
}
vector<int> Perceptron::updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,
const std::vector<std::vector<float> >& bleuScoresFear,
const std::vector< size_t> sentenceId,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates) {
vector<int> status(1);
status[0] = 0;
return status;
}
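// Note: the perceptron optimiser does not implement hope-fear updates; this
// stub only satisfies the new pure-virtual interface without touching the
// weights.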
vector<int> Perceptron::updateWeights(ScoreComponentCollection& currWeights,
const vector< vector<ScoreComponentCollection> >& featureValues,