From 020c71216b721023e745995deb7dfe3d9e2f9046 Mon Sep 17 00:00:00 2001
From: ehasler
Date: Mon, 16 May 2011 16:56:52 +0000
Subject: [PATCH] For the hope-fear option, add constraints only between hope
 and fear translations, not between pairs of hope translations. Make the hope
 and fear n-best sizes flexible with --hope-n and --fear-n.

git-svn-id: http://svn.statmt.org/repository/mira@3897 cc96ff50-19ce-11e0-b349-13d7f0bd23df
---
 mira/Main.cpp          | 277 +++++++++++++++++++++++++---------------
 mira/MiraOptimiser.cpp | 283 ++++++++++++++++++++++++++++++++++++++++-
 mira/Optimiser.h       |  54 ++++++--
 mira/Perceptron.cpp    |  49 ++++---
 4 files changed, 533 insertions(+), 130 deletions(-)

diff --git a/mira/Main.cpp b/mira/Main.cpp
index ecce08a9a..2c7b46db1 100644
--- a/mira/Main.cpp
+++ b/mira/Main.cpp
@@ -144,7 +144,6 @@ int main(int argc, char** argv) {
   float min_sentence_update;
   size_t weightedLossFunction;
   size_t n;
-  size_t nbest_first;
   size_t batchSize;
   bool distinctNbest;
   bool onlyViolatedConstraints;
@@ -190,7 +189,8 @@ int main(int argc, char** argv) {
   bool analytical_update;
   bool perceptron_update;
   bool hope_fear;
-  size_t constraints;
+  int hope_n;
+  int fear_n;
   po::options_description desc("Allowed options");
   desc.add_options()
     ("accumulate-most-violated-constraints", po::value(&accumulateMostViolatedConstraints)->default_value(false), "Accumulate most violated constraint per example")
@@ -206,7 +206,6 @@ int main(int argc, char** argv) {
     ("burn-in-input-file", po::value(&burnInInputFile), "Input file for burn-in phase of BLEU history")
     ("burn-in-reference-files", po::value<vector<string> >(&burnInReferenceFiles), "Reference file for burn-in phase of BLEU history")
     ("config,f", po::value(&mosesConfigFile), "Moses ini file")
-    ("constraints", po::value(&constraints)->default_value(1), "Number of constraints used for analytical update")
     ("control-updates", po::value(&controlUpdates)->default_value(true), "Ignore updates that increase number of violated constraints AND increase the error")
     ("decoder-settings", po::value(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
     ("decr-learning-rate", po::value(&decrease_learning_rate)->default_value(0), "Decrease learning rate by the given value after every epoch")
@@ -215,11 +214,13 @@ int main(int argc, char** argv) {
     ("distinct-nbest", po::value(&distinctNbest)->default_value(true), "Use nbest list with distinct translations in inference step")
     ("weight-dump-frequency", po::value(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
     ("epochs,e", po::value(&epochs)->default_value(5), "Number of epochs")
+    ("fear-n", po::value(&fear_n)->default_value(-1), "Number of fear translations used")
     ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
     ("hildreth", po::value(&hildreth)->default_value(true), "Use Hildreth's optimisation algorithm")
     ("history-of-1best", po::value(&historyOf1best)->default_value(0), "Use the 1best translation to update the history")
     ("history-smoothing", po::value(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
     ("hope-fear", po::value(&hope_fear)->default_value(true), "Use only hope and fear translations (not model)")
+    ("hope-n", po::value(&hope_n)->default_value(-1), "Number of hope translations used")
     ("input-file,i", po::value(&inputFile), "Input file containing tokenised source")
     ("learner,l", po::value(&learner)->default_value("mira"), "Learning algorithm")
     ("learning-rate", po::value(&learning_rate)->default_value(1), "Learning rate (fixed or flexible)")
@@ -236,8 +237,7 @@ int main(int argc, char** argv) {
     ("msf-step", po::value(&marginScaleFactorStep)->default_value(0), "Decrease margin scale factor iteratively by the value provided")
     ("multiplyA", po::value(&multiplyA)->default_value(true), "Multiply A with outcome before passing to Hildreth")
     ("nbest,n", po::value(&n)->default_value(10), "Number of translations in nbest list")
-    ("nbest-first", po::value(&nbest_first)->default_value(0), "Number of translations in nbest list in the first epoch")
-    ("normalise", po::value(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
+    ("normalise", po::value(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
     ("only-violated-constraints", po::value(&onlyViolatedConstraints)->default_value(false), "Add only violated constraints to the optimisation problem")
     ("past-and-current-constraints", po::value(&pastAndCurrentConstraints)->default_value(false), "Accumulate most violated constraint per example and use them along all current constraints")
     ("perceptron-update", po::value(&perceptron_update)->default_value(false), "Do a simple perceptron style update")
@@ -295,8 +295,9 @@ int main(int argc, char** argv) {
     return 1;
   }
 
-  if (nbest_first == 0) {
-    nbest_first = n;
+  if (hope_n == -1 && fear_n == -1) {
+    hope_n = n;
+    fear_n = n;
   }
 
   // load input and references
@@ -486,7 +487,6 @@ int main(int argc, char** argv) {
   cerr << "msf-min: " << marginScaleFactorMin << endl;
   cerr << "weighted-loss-function: " << weightedLossFunction << endl;
   cerr << "nbest: " << n << endl;
-  cerr << "nbest-first: " << nbest_first << endl;
   cerr << "batch-size: " << batchSize << endl;
   cerr << "distinct-nbest: " << distinctNbest << endl;
   cerr << "only-violated-constraints: " << onlyViolatedConstraints << endl;
@@ -523,6 +523,8 @@ int main(int argc, char** argv) {
   cerr << "perceptron-update: " << perceptron_update << endl;
   cerr << "analytical-update: " << analytical_update << endl;
   cerr << "hope-fear: " << hope_fear << endl;
+  cerr << "hope-n: " << hope_n << endl;
+  cerr << "fear-n: " << fear_n << endl;
 
   if (learner == "mira") {
     cerr << "Optimising using Mira" << endl;
@@ -608,6 +610,12 @@ int main(int argc, char** argv) {
       vector<vector<float> > bleuScores;
       vector<vector<float> > dummyBleuScores;
 
+      // variables for hope-fear setting
+      vector<vector<ScoreComponentCollection> > featureValuesHope;
+      vector<vector<ScoreComponentCollection> > featureValuesFear;
+      vector<vector<float> > bleuScoresHope;
+      vector<vector<float> > bleuScoresFear;
+
       // get moses weights
       ScoreComponentCollection mosesWeights = decoder->getWeights();
       cerr << "\nRank " << rank << ", next batch" << endl;
@@ -632,77 +640,22 @@ int main(int argc, char** argv) {
         vector<ScoreComponentCollection> newFeatureValues;
         vector<float> newBleuScores;
-        featureValues.push_back(newFeatureValues);
-        dummyFeatureValues.push_back(newFeatureValues);
-        bleuScores.push_back(newBleuScores);
-        dummyBleuScores.push_back(newBleuScores);
-
-        size_t pass_n = (epoch == 0)? nbest_first : n;
-
-        if (perceptron_update || analytical_update) {
-          if (constraints == 1) {
-            if (historyOf1best) {
-              // MODEL (for updating the history)
-              cerr << "Rank " << rank << ", run decoder to get " << 1 << "best wrt model score" << endl;
-              vector<const Word*> bestModel = decoder->getNBest(input, *sid, 1, 0.0, bleuScoreWeight,
-                  dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
-                  distinctNbest, rank);
-              decoder->cleanup();
-              oneBests.push_back(bestModel);
-              cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << dummyBleuScores[batchPosition][0] << endl;
-            }
-
-            // HOPE
-            cerr << "Rank " << rank << ", run decoder to get 1best hope translations" << endl;
-            size_t oraclePos = dummyFeatureValues[batchPosition].size();
-            vector<const Word*> oracle = decoder->getNBest(input, *sid, 1, 1.0, bleuScoreWeight,
-                dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
-                distinctNbest, rank);
-            // needed for history
-            inputLengths.push_back(decoder->getCurrentInputLength());
-            ref_ids.push_back(*sid);
-            decoder->cleanup();
-            oracles.push_back(oracle);
-            cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << dummyBleuScores[batchPosition][oraclePos] << endl;
-
-            oracleFeatureValues.push_back(dummyFeatureValues[batchPosition][oraclePos]);
-            oracleBleuScores.push_back(dummyBleuScores[batchPosition][oraclePos]);
-            // clear dummies
-            dummyFeatureValues[batchPosition].clear();
-            dummyBleuScores[batchPosition].clear();
-
-            // FEAR
-            cerr << "Rank " << rank << ", run decoder to get 1best fear translations" << endl;
-            size_t fearPos = featureValues[batchPosition].size();
-            vector<const Word*> fear = decoder->getNBest(input, *sid, 1, -1.0, bleuScoreWeight,
-                featureValues[batchPosition], bleuScores[batchPosition], true,
-                distinctNbest, rank);
-            decoder->cleanup();
-            cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScores[batchPosition][fearPos] << endl;
-            for (size_t i = 0; i < fear.size(); ++i) {
-              delete fear[i];
-            }
-          }
-          else {
-            // TODO:
-          }
+        if (hope_fear) {
+          featureValuesHope.push_back(newFeatureValues);
+          featureValuesFear.push_back(newFeatureValues);
+          bleuScoresHope.push_back(newBleuScores);
+          bleuScoresFear.push_back(newBleuScores);
         }
         else {
-          if (!hope_fear) {
-            // MODEL
-            cerr << "Rank " << rank << ", run decoder to get " << pass_n << "best wrt model score" << endl;
-            vector<const Word*> bestModel = decoder->getNBest(input, *sid, pass_n, 0.0, bleuScoreWeight,
-                featureValues[batchPosition], bleuScores[batchPosition], true,
-                distinctNbest, rank);
-            decoder->cleanup();
-            oneBests.push_back(bestModel);
-            // needed for calculating bleu of dev (1best translations) // todo:
-            all_ref_ids.push_back(*sid);
-            allBestModelScore.push_back(bestModel);
-            cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << bleuScores[batchPosition][0] << endl;
-          }
-          else if (historyOf1best) {
-            // MODEL (for updating the history only, using dummy vectors)
+          featureValues.push_back(newFeatureValues);
+          dummyFeatureValues.push_back(newFeatureValues);
+          bleuScores.push_back(newBleuScores);
+          dummyBleuScores.push_back(newBleuScores);
+        }
+
+        if (perceptron_update || analytical_update) {
+          if (historyOf1best) {
+            // MODEL (for updating the history)
             cerr << "Rank " << rank << ", run decoder to get " << 1 << "best wrt model score" << endl;
             vector<const Word*> bestModel = decoder->getNBest(input, *sid, 1, 0.0, bleuScoreWeight,
                 dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
                 distinctNbest, rank);
             decoder->cleanup();
             oneBests.push_back(bestModel);
             cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << dummyBleuScores[batchPosition][0] << endl;
           }
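Judging from the call sites, getNBest's fourth argument scales sentence-level BLEU into the search objective: +1.0 extracts hope translations (high model score plus BLEU), 0.0 plain model score, and -1.0 fear translations (high model score minus BLEU). A sketch of that idea as a rescoring of a fixed candidate list, using plain score/BLEU pairs instead of real decoder states (Candidate and rankCandidates are illustrative names, not patch code):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// One candidate translation with its model score and approximate sentence BLEU.
struct Candidate { float modelScore; float bleu; };

// Return candidate indices sorted by modelScore + bleuObjectiveWeight * bleu.
// bleuObjectiveWeight = +1 ranks for hope, 0 for pure model score, -1 for fear.
std::vector<size_t> rankCandidates(const std::vector<Candidate>& cands,
                                   float bleuObjectiveWeight) {
  std::vector<size_t> order(cands.size());
  for (size_t i = 0; i < order.size(); ++i) order[i] = i;
  std::sort(order.begin(), order.end(), [&](size_t a, size_t b) {
    float sa = cands[a].modelScore + bleuObjectiveWeight * cands[a].bleu;
    float sb = cands[b].modelScore + bleuObjectiveWeight * cands[b].bleu;
    return sa > sb;
  });
  return order;
}

int main() {
  std::vector<Candidate> cands = { {1.0f, 0.20f}, {0.8f, 0.45f}, {0.9f, 0.10f} };
  std::cout << "hope 1best: " << rankCandidates(cands, 1.0f)[0] << std::endl;  // favours BLEU
  std::cout << "fear 1best: " << rankCandidates(cands, -1.0f)[0] << std::endl; // penalises BLEU
  return 0;
}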
@@ -713,33 +666,114 @@ int main(int argc, char** argv) {
           }
 
           // HOPE
-          cerr << "Rank " << rank << ", run decoder to get " << pass_n << "best hope translations" << endl;
-          size_t oraclePos = featureValues[batchPosition].size();
-          vector<const Word*> oracle = decoder->getNBest(input, *sid, pass_n, 1.0, bleuScoreWeight,
-              featureValues[batchPosition], bleuScores[batchPosition], true,
-              distinctNbest, rank);
+          cerr << "Rank " << rank << ", run decoder to get 1best hope translations" << endl;
+          size_t oraclePos = dummyFeatureValues[batchPosition].size();
+          vector<const Word*> oracle = decoder->getNBest(input, *sid, 1, 1.0, bleuScoreWeight,
+              dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
+              distinctNbest, rank);
           // needed for history
           inputLengths.push_back(decoder->getCurrentInputLength());
           ref_ids.push_back(*sid);
           decoder->cleanup();
           oracles.push_back(oracle);
-          cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScores[batchPosition][oraclePos] << endl;
+          cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << dummyBleuScores[batchPosition][oraclePos] << endl;
 
-          oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
-          oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
+          oracleFeatureValues.push_back(dummyFeatureValues[batchPosition][oraclePos]);
+          oracleBleuScores.push_back(dummyBleuScores[batchPosition][oraclePos]);
+          // clear dummies
+          dummyFeatureValues[batchPosition].clear();
+          dummyBleuScores[batchPosition].clear();
 
           // FEAR
-          cerr << "Rank " << rank << ", run decoder to get " << pass_n << "best fear translations" << endl;
+          cerr << "Rank " << rank << ", run decoder to get 1best fear translations" << endl;
           size_t fearPos = featureValues[batchPosition].size();
-          vector<const Word*> fear = decoder->getNBest(input, *sid, pass_n, -1.0, bleuScoreWeight,
-              featureValues[batchPosition], bleuScores[batchPosition], true,
-              distinctNbest, rank);
+          vector<const Word*> fear = decoder->getNBest(input, *sid, 1, -1.0, bleuScoreWeight,
+              featureValues[batchPosition], bleuScores[batchPosition], true,
+              distinctNbest, rank);
           decoder->cleanup();
           cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScores[batchPosition][fearPos] << endl;
           for (size_t i = 0; i < fear.size(); ++i) {
             delete fear[i];
           }
         }
+        else {
+          if (hope_fear) {
+            if (historyOf1best) {
+              // MODEL (for updating the history only, using dummy vectors)
+              cerr << "Rank " << rank << ", run decoder to get " << 1 << "best wrt model score" << endl;
+              vector<const Word*> bestModel = decoder->getNBest(input, *sid, 1, 0.0, bleuScoreWeight,
+                  dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
+                  distinctNbest, rank);
+              decoder->cleanup();
+              oneBests.push_back(bestModel);
+              cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << dummyBleuScores[batchPosition][0] << endl;
+            }
+
+            // HOPE
+            cerr << "Rank " << rank << ", run decoder to get " << hope_n << "best hope translations" << endl;
+            vector<const Word*> oracle = decoder->getNBest(input, *sid, hope_n, 1.0, bleuScoreWeight,
+                featureValuesHope[batchPosition], bleuScoresHope[batchPosition], true,
+                distinctNbest, rank);
+            // needed for history
+            inputLengths.push_back(decoder->getCurrentInputLength());
+            ref_ids.push_back(*sid);
+            decoder->cleanup();
+            oracles.push_back(oracle);
+            cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScoresHope[batchPosition][0] << endl;
+
+            // FEAR
+            cerr << "Rank " << rank << ", run decoder to get " << fear_n << "best fear translations" << endl;
+            vector<const Word*> fear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuScoreWeight,
+                featureValuesFear[batchPosition], bleuScoresFear[batchPosition], true,
+                distinctNbest, rank);
+            decoder->cleanup();
+            cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScoresFear[batchPosition][0] << endl;
+            for (size_t i = 0; i < fear.size(); ++i) {
+              delete fear[i];
+            }
+          }
+          else {
+            // MODEL
+            cerr << "Rank " << rank << ", run decoder to get " << n << "best wrt model score" << endl;
+            vector<const Word*> bestModel = decoder->getNBest(input, *sid, n, 0.0, bleuScoreWeight,
+                featureValues[batchPosition], bleuScores[batchPosition], true,
+                distinctNbest, rank);
+            decoder->cleanup();
+            oneBests.push_back(bestModel);
+            // needed for calculating bleu of dev (1best translations) // todo:
+            all_ref_ids.push_back(*sid);
+            allBestModelScore.push_back(bestModel);
+            cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << bleuScores[batchPosition][0] << endl;
+
+            // HOPE
+            cerr << "Rank " << rank << ", run decoder to get " << n << "best hope translations" << endl;
+            size_t oraclePos = featureValues[batchPosition].size();
+            vector<const Word*> oracle = decoder->getNBest(input, *sid, n, 1.0, bleuScoreWeight,
+                featureValues[batchPosition], bleuScores[batchPosition], true,
+                distinctNbest, rank);
+            // needed for history
+            inputLengths.push_back(decoder->getCurrentInputLength());
+            ref_ids.push_back(*sid);
+            decoder->cleanup();
+            oracles.push_back(oracle);
+            cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScores[batchPosition][oraclePos] << endl;
+
+            oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
+            oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
+
+            // FEAR
+            cerr << "Rank " << rank << ", run decoder to get " << n << "best fear translations" << endl;
+            size_t fearPos = featureValues[batchPosition].size();
+            vector<const Word*> fear = decoder->getNBest(input, *sid, n, -1.0, bleuScoreWeight,
+                featureValues[batchPosition], bleuScores[batchPosition], true,
+                distinctNbest, rank);
+            decoder->cleanup();
+            cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScores[batchPosition][fearPos] << endl;
+            for (size_t i = 0; i < fear.size(); ++i) {
+              delete fear[i];
+            }
+          }
+        }
 }
 
 //        cerr << "Rank " << rank << ", sentence " << *sid << ", best model Bleu (approximate sentence bleu): " << bleuScores[batchPosition][0] << endl;
 //        summedApproxBleu += bleuScores[batchPosition][0];
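The inputLengths/ref_ids/oracles bookkeeping above feeds the decoder's BLEU history, which is smoothed by the --history-smoothing factor (default 0.9 in the options). The exponential-decay update below is an assumption for illustration only; the real update lives in Moses' BleuScoreFeature and may differ in detail:

#include <iostream>

// Illustrative only: a single smoothed BLEU sufficient statistic, decayed by
// the history-smoothing factor before each new sentence is folded in. This
// form is an ASSUMPTION, not a transcription of BleuScoreFeature.
struct BleuHistory {
  float historySmoothing;  // e.g. 0.9, as in the --history-smoothing default
  float smoothedStat;      // one smoothed sufficient statistic

  void update(float newStat) {
    // older sentences decay geometrically, recent ones dominate
    smoothedStat = historySmoothing * smoothedStat + newStat;
  }
};

int main() {
  BleuHistory h = {0.9f, 0.0f};
  for (int i = 0; i < 3; ++i) h.update(1.0f);
  std::cout << "smoothed stat after 3 updates: " << h.smoothedStat << std::endl;
  return 0;
}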
@@ -750,12 +784,13 @@ int main(int argc, char** argv) {
         ++shardPosition;
       } // end of batch loop
 
-      // Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
       vector<vector<float> > losses(actualBatchSize);
-      for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
-        for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
-          losses[batchPosition].push_back(oracleBleuScores[batchPosition] -
-              bleuScores[batchPosition][j]);
+      if (!hope_fear) {
+        // Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
+        for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
+          for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
+            losses[batchPosition].push_back(oracleBleuScores[batchPosition] - bleuScores[batchPosition][j]);
+          }
         }
       }
@@ -766,11 +801,21 @@ int main(int argc, char** argv) {
       if (logFeatureValues) {
         for (size_t i = 0; i < featureValues.size(); ++i) {
-          for (size_t j = 0; j < featureValues[i].size(); ++j) {
-            featureValues[i][j].ApplyLog(baseOfLog);
+          if (hope_fear) {
+            for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
+              featureValuesHope[i][j].ApplyLog(baseOfLog);
+            }
+            for (size_t j = 0; j < featureValuesFear[i].size(); ++j) {
+              featureValuesFear[i][j].ApplyLog(baseOfLog);
+            }
           }
+          else {
+            for (size_t j = 0; j < featureValues[i].size(); ++j) {
+              featureValues[i][j].ApplyLog(baseOfLog);
+            }
-          oracleFeatureValues[i].ApplyLog(baseOfLog);
+            oracleFeatureValues[i].ApplyLog(baseOfLog);
+          }
         }
       }
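In hope-fear mode no loss vector is built here; per the commit message, the losses arise pairwise inside the optimiser as BLEU(hope_j) - BLEU(fear_k). A tiny sketch of that pairing for one sentence, with made-up BLEU values, showing why hope_n = 2 and fear_n = 3 yield six constraints and no hope-hope pairs:

#include <iostream>
#include <vector>

// Sketch: pairwise losses for one sentence in the hope-fear setting.
// loss(j,k) = BLEU(hope_j) - BLEU(fear_k), the margin the model should respect.
int main() {
  std::vector<float> bleuHope = {0.45f, 0.42f};        // hope_n = 2
  std::vector<float> bleuFear = {0.30f, 0.25f, 0.20f}; // fear_n = 3

  for (size_t j = 0; j < bleuHope.size(); ++j) {
    for (size_t k = 0; k < bleuFear.size(); ++k) {
      float loss = bleuHope[j] - bleuFear[k];
      std::cout << "loss(" << j << "," << k << ") = " << loss << std::endl;
    }
  }
  // 2 x 3 = 6 constraints; hope-hope and fear-fear pairs are never generated.
  return 0;
}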
@@ -786,12 +831,29 @@ int main(int argc, char** argv) {
       // optionally print out the feature values
       if (print_feature_values) {
         cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
-        for (size_t i = 0; i < featureValues.size(); ++i) {
-          for (size_t j = 0; j < featureValues[i].size(); ++j) {
-            cerr << featureValues[i][j] << endl;
+        if (hope_fear) {
+          cerr << "hope: " << endl;
+          for (size_t i = 0; i < featureValuesHope.size(); ++i) {
+            for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
+              cerr << featureValuesHope[i][j] << endl;
+            }
           }
+          cerr << "fear: " << endl;
+          for (size_t i = 0; i < featureValuesFear.size(); ++i) {
+            for (size_t j = 0; j < featureValuesFear[i].size(); ++j) {
+              cerr << featureValuesFear[i][j] << endl;
+            }
+          }
+          cerr << endl;
+        }
+        else {
+          for (size_t i = 0; i < featureValues.size(); ++i) {
+            for (size_t j = 0; j < featureValues[i].size(); ++j) {
+              cerr << featureValues[i][j] << endl;
+            }
+          }
+          cerr << endl;
         }
-        cerr << endl;
       }
 
       // Run optimiser on batch:
@@ -818,9 +880,16 @@ int main(int argc, char** argv) {
             learning_rate, max_sentence_update, rank, epoch, controlUpdates);
       }
       else {
-        update_status = optimiser->updateWeights(mosesWeights, featureValues,
-            losses, bleuScores, oracleFeatureValues, oracleBleuScores, ref_ids,
-            learning_rate, max_sentence_update, rank, epoch, updates_per_epoch, controlUpdates);
+        if (hope_fear) {
+          update_status = optimiser->updateWeightsHopeFear(mosesWeights,
+              featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, ref_ids,
+              learning_rate, max_sentence_update, rank, epoch, updates_per_epoch, controlUpdates);
+        }
+        else {
+          update_status = optimiser->updateWeights(mosesWeights, featureValues,
+              losses, bleuScores, oracleFeatureValues, oracleBleuScores, ref_ids,
+              learning_rate, max_sentence_update, rank, epoch, updates_per_epoch, controlUpdates);
+        }
       }
 
       if (update_status[0] == 1) {
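The update_status vector the optimiser returns is a small protocol: reading the return sites in this patch, element 0 is 1 when no constraint was violated (no update), -1 when control-updates rejected the step, and 0 on a successful update, with elements 1 and 2 carrying the violated-constraint counts before and after. A sketch of a caller-side interpretation (reportUpdateStatus is my name, not part of the patch):

#include <iostream>
#include <vector>

// Interprets the 3-element status vector returned by the MIRA updates in this
// patch (element meanings read off the return sites; the helper name is mine).
void reportUpdateStatus(const std::vector<int>& status) {
  if (status[0] == 1) {
    std::cout << "no constraint violated, no update" << std::endl;
  } else if (status[0] == -1) {
    std::cout << "update rejected by control-updates" << std::endl;
  } else {
    std::cout << "weights updated, violated constraints before: " << status[1]
              << ", after: " << status[2] << std::endl;
  }
}

int main() {
  std::vector<int> status(3);
  status[0] = 0; status[1] = 5; status[2] = 1;
  reportUpdateStatus(status);
  return 0;
}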
diff --git a/mira/MiraOptimiser.cpp b/mira/MiraOptimiser.cpp
index 55d076432..e410691fd 100644
--- a/mira/MiraOptimiser.cpp
+++ b/mira/MiraOptimiser.cpp
@@ -11,8 +11,12 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
     const vector<vector<float> >& losses,
     const vector<vector<float> >& bleuScores,
     const vector<ScoreComponentCollection>& oracleFeatureValues,
-    const vector<float> oracleBleuScores, const vector<size_t> sentenceIds,
-    float learning_rate, float max_sentence_update, size_t rank, size_t epoch,
+    const vector<float> oracleBleuScores,
+    const vector<size_t> sentenceIds,
+    float learning_rate,
+    float max_sentence_update,
+    size_t rank,
+    size_t epoch,
     int updates_per_epoch,
     bool controlUpdates) {
 
@@ -79,6 +83,7 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
       featureValueDiff.MinusEquals(featureValues[i][j]);
       float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
       if (modelScoreDiff == 0) {
+        cerr << "equal feature values, constraint skipped.." << endl;
         continue;
       }
 
@@ -312,6 +317,280 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
   return statusPlus;
 }
 
+vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
+    const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+    const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+    const std::vector<std::vector<float> >& bleuScoresHope,
+    const std::vector<std::vector<float> >& bleuScoresFear,
+    const std::vector<size_t> sentenceIds,
+    float learning_rate,
+    float max_sentence_update,
+    size_t rank,
+    size_t epoch,
+    int updates_per_epoch,
+    bool controlUpdates) {
+
+  // vector of feature value differences for all created constraints
+  vector<ScoreComponentCollection> featureValueDiffs;
+  vector<float> lossMinusModelScoreDiffs;
+  vector<float> all_losses;
+
+  // most violated constraint in batch
+  ScoreComponentCollection max_batch_featureValueDiff;
+  float max_batch_loss = -1;
+  float max_batch_lossMinusModelScoreDiff = -1;
+
+  // Make constraints for new hypothesis translations
+  float epsilon = 0.0001;
+  int violatedConstraintsBefore = 0;
+  float oldDistanceFromOptimum = 0;
+
+  // iterate over input sentences (1 (online) or more (batch))
+  for (size_t i = 0; i < featureValuesHope.size(); ++i) {
+    size_t sentenceId = sentenceIds[i];
+
+    // Pair all hope translations with all fear translations for one input sentence
+    for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
+      for (size_t k = 0; k < featureValuesFear[i].size(); ++k) {
+        ScoreComponentCollection featureValueDiff = featureValuesHope[i][j];
+        featureValueDiff.MinusEquals(featureValuesFear[i][k]);
+        cerr << "feature value diff: " << featureValueDiff << endl;
+        float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
+        if (modelScoreDiff == 0) {
+          cerr << "equal feature values, constraint skipped.." << endl;
+          continue;
+        }
+
+        float loss = bleuScoresHope[i][j] - bleuScoresFear[i][k];
+        loss *= m_marginScaleFactor;
+        if (m_weightedLossFunction == 1) {
+          loss *= bleuScoresHope[i][j];
+        }
+        else if (m_weightedLossFunction == 2) {
+          loss *= log2(bleuScoresHope[i][j]);
+        }
+        else if (m_weightedLossFunction == 10) {
+          loss *= log10(bleuScoresHope[i][j]);
+        }
+
+        // check if constraint is violated
+        bool violated = false;
+        bool addConstraint = true;
+        float diff = loss - modelScoreDiff;
+        cerr << "constraint: " << modelScoreDiff << " >= " << loss << endl;
+        if (diff > (epsilon + m_precision)) {
+          violated = true;
+          cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << diff << " (loss: " << loss << ")" << endl;
+        }
+        else if (m_onlyViolatedConstraints) {
+          addConstraint = false;
+        }
+
+        float lossMinusModelScoreDiff = loss - modelScoreDiff;
+        if (violated) {
+          if (m_accumulateMostViolatedConstraints || m_pastAndCurrentConstraints) {
+            // find the most violated constraint per batch
+            if (lossMinusModelScoreDiff > max_batch_lossMinusModelScoreDiff) {
+              max_batch_lossMinusModelScoreDiff = lossMinusModelScoreDiff;
+              max_batch_featureValueDiff = featureValueDiff;
+              max_batch_loss = loss;
+            }
+          }
+        }
+
+        if (addConstraint && !m_accumulateMostViolatedConstraints) {
+          featureValueDiffs.push_back(featureValueDiff);
+          lossMinusModelScoreDiffs.push_back(lossMinusModelScoreDiff);
+          all_losses.push_back(loss);
+
+          if (violated) {
+            ++violatedConstraintsBefore;
+            oldDistanceFromOptimum += diff;
+          }
+        }
+      }
+    }
+  }
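The double loop above is the heart of the change: every hope translation is paired with every fear translation of the same sentence, and each pair contributes one large-margin constraint w . (h(hope) - h(fear)) >= loss. A self-contained sketch of the same pairing and violation test, using plain std::vector<float> in place of ScoreComponentCollection (all values illustrative):

#include <cstddef>
#include <iostream>
#include <vector>

typedef std::vector<float> FeatVec;

float inner(const FeatVec& a, const FeatVec& b) {
  float s = 0;
  for (size_t i = 0; i < a.size(); ++i) s += a[i] * b[i];
  return s;
}

int main() {
  const float epsilon = 0.0001f;
  FeatVec w(2); w[0] = 0.5f; w[1] = -0.2f;

  // one sentence: 2 hope and 2 fear translations (feature vectors plus BLEU)
  std::vector<FeatVec> hope(2, FeatVec(2));
  hope[0][0] = 1.0f; hope[0][1] = 0.0f;
  hope[1][0] = 0.8f; hope[1][1] = 0.1f;
  std::vector<FeatVec> fear(2, FeatVec(2));
  fear[0][0] = 0.2f; fear[0][1] = 0.9f;
  fear[1][0] = 0.1f; fear[1][1] = 0.7f;
  float bleuHope[] = {0.45f, 0.40f};
  float bleuFear[] = {0.25f, 0.20f};

  // pair every hope with every fear translation
  for (size_t j = 0; j < hope.size(); ++j) {
    for (size_t k = 0; k < fear.size(); ++k) {
      FeatVec diff(w.size());
      for (size_t d = 0; d < diff.size(); ++d) diff[d] = hope[j][d] - fear[k][d];
      float modelScoreDiff = inner(diff, w);
      float loss = bleuHope[j] - bleuFear[k];
      // constraint: w . (h(hope) - h(fear)) >= loss; violated if margin falls short
      bool violated = (loss - modelScoreDiff) > epsilon;
      std::cout << "pair (" << j << "," << k << "): " << modelScoreDiff
                << (violated ? " < " : " >= ") << loss << std::endl;
    }
  }
  return 0;
}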
+
+  if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
+    cerr << "Rank " << rank << ", epoch " << epoch << ", number of current constraints: " << featureValueDiffs.size() << endl;
+    cerr << "Rank " << rank << ", epoch " << epoch << ", number of current violated constraints: " << violatedConstraintsBefore << endl;
+  }
+
+  if (m_max_number_oracles == 1) {
+    for (size_t k = 0; k < sentenceIds.size(); ++k) {
+      size_t sentenceId = sentenceIds[k];
+      m_oracles[sentenceId].clear();
+    }
+  }
+
+  size_t pastViolatedConstraints = 0;
+  // Add constraints from past iterations (BEFORE updating that list)
+  if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
+    // add all past (most violated) constraints to the list of current constraints, computed with current weights!
+    for (size_t i = 0; i < m_featureValueDiffs.size(); ++i) {
+      float modelScoreDiff = m_featureValueDiffs[i].InnerProduct(currWeights);
+
+      // check if constraint is violated
+      bool violated = false;
+      bool addConstraint = true;
+      float diff = m_losses[i] - modelScoreDiff;
+      if (diff > (epsilon + m_precision)) {
+        violated = true;
+        cerr << "Rank " << rank << ", epoch " << epoch << ", past violation: " << diff << " (loss: " << m_losses[i] << ")" << endl;
+      }
+      else if (m_onlyViolatedConstraints) {
+        addConstraint = false;
+      }
+
+      if (addConstraint) {
+        featureValueDiffs.push_back(m_featureValueDiffs[i]);
+        lossMinusModelScoreDiffs.push_back(m_losses[i] - modelScoreDiff);
+        all_losses.push_back(m_losses[i]);
+//        cerr << "old constraint: " << modelScoreDiff << " >= " << m_losses[i] << endl;
+
+        if (violated) {
+          ++violatedConstraintsBefore;
+          ++pastViolatedConstraints;
+          oldDistanceFromOptimum += diff;
+        }
+      }
+    }
+  }
+
+  if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
+    cerr << "Rank " << rank << ", epoch " << epoch << ", number of past constraints: " << m_featureValueDiffs.size() << endl;
+    cerr << "Rank " << rank << ", epoch " << epoch << ", number of past violated constraints: " << pastViolatedConstraints << endl;
+  }
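To make the violation test concrete with purely illustrative numbers: suppose a hope translation has BLEU 0.45 and a fear translation has BLEU 0.30, so loss = (0.45 - 0.30) * marginScaleFactor = 0.15 for a scale factor of 1. If the current weights give a model score difference w . (h(hope) - h(fear)) = 0.05, the constraint is violated by 0.15 - 0.05 = 0.10 > epsilon, and it enters the optimisation with lossMinusModelScoreDiff = 0.10. Stored past constraints are re-scored against the current weights in exactly the same way, which is why a constraint satisfied in an earlier epoch can re-enter once the weights have drifted.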
+
+  // Add new most violated constraint to the list of current constraints
+  if (m_accumulateMostViolatedConstraints) {
+    if (max_batch_loss != -1) {
+      float modelScoreDiff = max_batch_featureValueDiff.InnerProduct(currWeights);
+      float diff = max_batch_loss - modelScoreDiff;
+      ++violatedConstraintsBefore;
+      oldDistanceFromOptimum += diff;
+
+      featureValueDiffs.push_back(max_batch_featureValueDiff);
+      lossMinusModelScoreDiffs.push_back(max_batch_loss - modelScoreDiff);
+      all_losses.push_back(max_batch_loss);
+//      cerr << "new constraint: " << modelScoreDiff << " !>= " << max_batch_loss << endl;
+    }
+  }
+
+  // Update the list of accumulated most violated constraints
+  if (max_batch_loss != -1) {
+    bool updated = false;
+    for (size_t i = 0; i < m_featureValueDiffs.size(); ++i) {
+      float oldScore = m_featureValueDiffs[i].InnerProduct(currWeights);
+      float newScore = max_batch_featureValueDiff.InnerProduct(currWeights);
+      if (abs(oldScore - newScore) < epsilon) {
+        m_losses[i] = max_batch_loss;
+        updated = true;
+        break;
+      }
+    }
+
+    if (!updated) {
+      m_featureValueDiffs.push_back(max_batch_featureValueDiff);
+      m_losses.push_back(max_batch_loss);
+    }
+  }
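The block above maintains the cache of accumulated most-violated constraints: if the new constraint scores (almost) the same as a stored one under the current weights, only its loss is refreshed; otherwise it is appended. A stripped-down sketch of that cache (ConstraintCache and its member names are mine). Note the patch compares the scores with abs(), which can resolve to the integer overload on some compilers; the sketch uses std::fabs:

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

typedef std::vector<float> FeatVec;

float inner(const FeatVec& a, const FeatVec& b) {
  float s = 0;
  for (size_t i = 0; i < a.size(); ++i) s += a[i] * b[i];
  return s;
}

// Cache of accumulated most-violated constraints (feature diff plus loss).
struct ConstraintCache {
  std::vector<FeatVec> featureValueDiffs;
  std::vector<float> losses;

  // Mirror of the patch's update rule: if a stored diff scores (almost) the
  // same as the new one under the current weights, refresh its loss;
  // otherwise append a new entry.
  void addOrUpdate(const FeatVec& diff, float loss, const FeatVec& w, float epsilon) {
    float newScore = inner(diff, w);
    for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
      if (std::fabs(inner(featureValueDiffs[i], w) - newScore) < epsilon) {
        losses[i] = loss;
        return;
      }
    }
    featureValueDiffs.push_back(diff);
    losses.push_back(loss);
  }
};

int main() {
  ConstraintCache cache;
  FeatVec w(2); w[0] = 0.5f; w[1] = -0.2f;
  FeatVec d1(2); d1[0] = 0.8f; d1[1] = -0.9f;
  cache.addOrUpdate(d1, 0.15f, w, 0.0001f);
  cache.addOrUpdate(d1, 0.12f, w, 0.0001f);  // same diff: loss refreshed, not appended
  std::cout << "cached constraints: " << cache.featureValueDiffs.size()
            << ", loss[0] = " << cache.losses[0] << std::endl;
  return 0;
}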
+
+  // run optimisation: compute alphas for all given constraints
+  vector<float> alphas;
+  ScoreComponentCollection summedUpdate;
+  if (violatedConstraintsBefore > 0) {
+    cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " << featureValueDiffs.size() << endl;
+    cerr << "Rank " << rank << ", epoch " << epoch << ", number of violated constraints passed to optimizer: " << violatedConstraintsBefore << endl;
+    if (m_slack != 0) {
+      alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs, m_slack);
+    } else {
+      alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs);
+    }
+
+    // Update the weight vector according to the alphas and the feature value differences
+    // * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
+    for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
+      float alpha = alphas[k];
+      cerr << "alpha: " << alpha << endl;
+      ScoreComponentCollection update(featureValueDiffs[k]);
+      update.MultiplyEquals(alpha);
+
+      // sum up update
+      summedUpdate.PlusEquals(update);
+    }
+  }
+  else {
+    cerr << "Rank " << rank << ", epoch " << epoch << ", check, no constraint violated for this batch" << endl;
+    vector<int> status(3);
+    status[0] = 1;
+    status[1] = 0;
+    status[2] = 0;
+    return status;
+  }
+
+  ScoreComponentCollection newWeights(currWeights);
+  newWeights.PlusEquals(summedUpdate);
+
+  // Sanity check: are there still violated constraints after optimisation?
+  int violatedConstraintsAfter = 0;
+  float newDistanceFromOptimum = 0;
+  for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
+    float modelScoreDiff = featureValueDiffs[i].InnerProduct(newWeights);
+    float loss = all_losses[i];
+    float diff = loss - modelScoreDiff;
+    if (diff > (epsilon + m_precision)) {
+      ++violatedConstraintsAfter;
+      newDistanceFromOptimum += diff;
+    }
+  }
+  cerr << "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl;
+  cerr << "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl;
+
+  if (controlUpdates && violatedConstraintsAfter > 0) {
+    float distanceChange = oldDistanceFromOptimum - newDistanceFromOptimum;
+    if ((violatedConstraintsBefore - violatedConstraintsAfter) <= 0 && distanceChange < 0) {
+      vector<int> statusPlus(3);
+      statusPlus[0] = -1;
+      statusPlus[1] = -1;
+      statusPlus[2] = -1;
+      return statusPlus;
+    }
+  }
+
+  // Apply learning rate (fixed or flexible)
+  if (learning_rate != 1) {
+    cerr << "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl;
+    summedUpdate.MultiplyEquals(learning_rate);
+    cerr << "Rank " << rank << ", epoch " << epoch << ", update after applying learning rate: " << summedUpdate << endl;
+  }
+
+  // Apply threshold scaling
+  if (max_sentence_update != -1) {
+    cerr << "Rank " << rank << ", epoch " << epoch << ", update before scaling to max-sentence-update: " << summedUpdate << endl;
+    summedUpdate.ThresholdScaling(max_sentence_update);
+    cerr << "Rank " << rank << ", epoch " << epoch << ", update after scaling to max-sentence-update: " << summedUpdate << endl;
+  }
+
+  // Apply update to weight vector or store it for later
+  if (updates_per_epoch > 0) {
+    m_accumulatedUpdates.PlusEquals(summedUpdate);
+    cerr << "Rank " << rank << ", epoch " << epoch << ", new accumulated updates:" << m_accumulatedUpdates << endl;
+  } else {
+    // apply update to weight vector
+    cerr << "Rank " << rank << ", epoch " << epoch << ", weights before update: " << currWeights << endl;
+    currWeights.PlusEquals(summedUpdate);
+    cerr << "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl;
+  }
+
+  vector<int> statusPlus(3);
+  statusPlus[0] = 0;
+  statusPlus[1] = violatedConstraintsBefore;
+  statusPlus[2] = violatedConstraintsAfter;
+  return statusPlus;
+}
+
 vector<int> MiraOptimiser::updateWeightsAnalytically(ScoreComponentCollection& currWeights,
     ScoreComponentCollection& featureValues,
     float loss,
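Hildreth::optimise solves the quadratic program over all constraints jointly. For intuition, the single-constraint case has the textbook MIRA closed form alpha = min(C, max(0, (loss - w . diff) / ||diff||^2)), followed by w <- w + alpha * diff. A sketch of that special case, where C plays the role of m_slack (all values illustrative):

#include <cstddef>
#include <iostream>
#include <vector>

typedef std::vector<float> FeatVec;

float inner(const FeatVec& a, const FeatVec& b) {
  float s = 0;
  for (size_t i = 0; i < a.size(); ++i) s += a[i] * b[i];
  return s;
}

// Closed-form MIRA step for a single constraint w . diff >= loss.
// Hildreth's algorithm generalises this to many simultaneous constraints.
void miraSingleConstraintUpdate(FeatVec& w, const FeatVec& diff, float loss, float C) {
  float violation = loss - inner(diff, w);
  float sqNorm = inner(diff, diff);
  if (violation <= 0 || sqNorm == 0) return;  // constraint satisfied or degenerate
  float alpha = violation / sqNorm;
  if (C > 0 && alpha > C) alpha = C;          // slack caps the step size
  for (size_t i = 0; i < w.size(); ++i) w[i] += alpha * diff[i];
}

int main() {
  FeatVec w(2); w[0] = 0.05f; w[1] = -0.02f;
  FeatVec diff(2); diff[0] = 0.8f; diff[1] = -0.9f;
  miraSingleConstraintUpdate(w, diff, 0.15f, 0.01f);
  std::cout << "updated weights: " << w[0] << " " << w[1] << std::endl;
  return 0;
}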
diff --git a/mira/Optimiser.h b/mira/Optimiser.h
index 69607f75e..98967ff0f 100644
--- a/mira/Optimiser.h
+++ b/mira/Optimiser.h
@@ -29,7 +29,7 @@ namespace Mira {
   class Optimiser {
   public:
     Optimiser() {}
-    virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& weights,
+    virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
       Moses::ScoreComponentCollection& featureValues,
       float loss,
       Moses::ScoreComponentCollection& oracleFeatureValues,
@@ -40,24 +40,36 @@ namespace Mira {
       size_t rank,
       size_t epoch,
       bool controlUpdates) = 0;
-    virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& weights,
+    virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& currWeights,
       const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValues,
       const std::vector<std::vector<float> >& losses,
       const std::vector<std::vector<float> >& bleuScores,
       const std::vector<Moses::ScoreComponentCollection>& oracleFeatureValues,
       const std::vector<float> oracleBleuScores,
-      const std::vector<size_t> sentenceId,
+      const std::vector<size_t> sentenceIds,
       float learning_rate,
       float max_sentence_update,
       size_t rank,
       size_t epoch,
       int updates_per_epoch,
       bool controlUpdates) = 0;
+    virtual std::vector<int> updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
+      const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+      const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+      const std::vector<std::vector<float> >& bleuScoresHope,
+      const std::vector<std::vector<float> >& bleuScoresFear,
+      const std::vector<size_t> sentenceIds,
+      float learning_rate,
+      float max_sentence_update,
+      size_t rank,
+      size_t epoch,
+      int updates_per_epoch,
+      bool controlUpdates) = 0;
   };
 
   class Perceptron : public Optimiser {
   public:
-    virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& weights,
+    virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
       Moses::ScoreComponentCollection& featureValues,
       float loss,
       Moses::ScoreComponentCollection& oracleFeatureValues,
@@ -68,19 +80,31 @@ namespace Mira {
       size_t rank,
       size_t epoch,
       bool controlUpdates);
-    virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& weights,
+    virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& currWeights,
       const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValues,
      const std::vector<std::vector<float> >& losses,
       const std::vector<std::vector<float> >& bleuScores,
       const std::vector<Moses::ScoreComponentCollection>& oracleFeatureValues,
       const std::vector<float> oracleBleuScores,
-      const std::vector<size_t> dummy,
+      const std::vector<size_t> sentenceIds,
       float learning_rate,
       float max_sentence_update,
       size_t rank,
       size_t epoch,
       int updates_per_epoch,
       bool controlUpdates);
+    virtual std::vector<int> updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
+      const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+      const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+      const std::vector<std::vector<float> >& bleuScoresHope,
+      const std::vector<std::vector<float> >& bleuScoresFear,
+      const std::vector<size_t> sentenceIds,
+      float learning_rate,
+      float max_sentence_update,
+      size_t rank,
+      size_t epoch,
+      int updates_per_epoch,
+      bool controlUpdates);
   };
 
   class MiraOptimiser : public Optimiser {
@@ -105,7 +129,7 @@ namespace Mira {
 
     ~MiraOptimiser() {}
 
-    virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& weights,
+    virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
       Moses::ScoreComponentCollection& featureValues,
       float loss,
       Moses::ScoreComponentCollection& oracleFeatureValues,
@@ -116,13 +140,25 @@ namespace Mira {
       size_t rank,
       size_t epoch,
       bool controlUpdates);
-    virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& weights,
+    virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& currWeights,
       const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValues,
       const std::vector<std::vector<float> >& losses,
       const std::vector<std::vector<float> >& bleuScores,
       const std::vector<Moses::ScoreComponentCollection>& oracleFeatureValues,
       const std::vector<float> oracleBleuScores,
-      const std::vector<size_t> sentenceId,
+      const std::vector<size_t> sentenceIds,
+      float learning_rate,
+      float max_sentence_update,
+      size_t rank,
+      size_t epoch,
+      int updates_per_epoch,
+      bool controlUpdates);
+    virtual std::vector<int> updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
+      const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+      const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+      const std::vector<std::vector<float> >& bleuScoresHope,
+      const std::vector<std::vector<float> >& bleuScoresFear,
+      const std::vector<size_t> sentenceIds,
       float learning_rate,
       float max_sentence_update,
       size_t rank,
diff --git a/mira/Perceptron.cpp b/mira/Perceptron.cpp
index 153ee30ff..095efbe28 100644
--- a/mira/Perceptron.cpp
+++ b/mira/Perceptron.cpp
@@ -24,21 +24,40 @@ using namespace std;
 
 namespace Mira {
 
-  vector<int> Perceptron::updateWeightsAnalytically(ScoreComponentCollection& currWeights,
-    ScoreComponentCollection& featureValues,
-    float loss,
-    ScoreComponentCollection& oracleFeatureValues,
-    float oracleBleuScore,
-    size_t sentenceId,
-    float learning_rate,
-    float max_sentence_update,
-    size_t rank,
-    size_t epoch,
-    bool controlUpdates) {
-    vector<int> status(1);
-    status[0] = 0;
-    return status;
-  }
+vector<int> Perceptron::updateWeightsAnalytically(ScoreComponentCollection& currWeights,
+    ScoreComponentCollection& featureValues,
+    float loss,
+    ScoreComponentCollection& oracleFeatureValues,
+    float oracleBleuScore,
+    size_t sentenceId,
+    float learning_rate,
+    float max_sentence_update,
+    size_t rank,
+    size_t epoch,
+    bool controlUpdates) {
+
+  vector<int> status(1);
+  status[0] = 0;
+  return status;
+}
+
+vector<int> Perceptron::updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
+    const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+    const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+    const std::vector<std::vector<float> >& bleuScoresHope,
+    const std::vector<std::vector<float> >& bleuScoresFear,
+    const std::vector<size_t> sentenceId,
+    float learning_rate,
+    float max_sentence_update,
+    size_t rank,
+    size_t epoch,
+    int updates_per_epoch,
+    bool controlUpdates) {
+
+  vector<int> status(1);
+  status[0] = 0;
+  return status;
+}
 
 vector<int> Perceptron::updateWeights(ScoreComponentCollection& currWeights,
     const vector<vector<ScoreComponentCollection> >& featureValues,
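Perceptron::updateWeightsHopeFear above is left as a stub that returns status 0 without touching the weights. Purely as an illustration of what a perceptron-style counterpart could look like, and explicitly not what the patch implements: step toward the 1-best hope features and away from the 1-best fear features, scaled by the learning rate.

#include <cstddef>
#include <iostream>
#include <vector>

typedef std::vector<float> FeatVec;

// Hypothetical perceptron-style hope-fear step (NOT in the patch, which stubs
// this out): w += learning_rate * (h(hope_1best) - h(fear_1best)).
void perceptronHopeFearStep(FeatVec& w, const FeatVec& hopeFeatures,
                            const FeatVec& fearFeatures, float learningRate) {
  for (size_t i = 0; i < w.size(); ++i) {
    w[i] += learningRate * (hopeFeatures[i] - fearFeatures[i]);
  }
}

int main() {
  FeatVec w(2, 0.0f);
  FeatVec hope(2); hope[0] = 1.0f; hope[1] = 0.0f;
  FeatVec fear(2); fear[0] = 0.2f; fear[1] = 0.9f;
  perceptronHopeFearStep(w, hope, fear, 1.0f);
  std::cout << "w = " << w[0] << " " << w[1] << std::endl;  // 0.8 -0.9
  return 0;
}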