For the hope-fear option, add constraints only between hope and fear translations, not between hope translations. Make the sizes of the hope and fear lists flexible with --hope-n and --fear-n.

git-svn-id: http://svn.statmt.org/repository/mira@3897 cc96ff50-19ce-11e0-b349-13d7f0bd23df
ehasler 2011-05-16 16:56:52 +00:00 committed by Ondrej Bojar
parent a177b58d18
commit 020c71216b
4 changed files with 533 additions and 130 deletions
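Before the diffs, a minimal sketch of the constraint scheme this commit introduces, using simplified stand-in types (hypothetical; the real code below works on Moses::ScoreComponentCollection and decoder-supplied BLEU scores). Each of the hope_n hope translations of a sentence is paired with each of its fear_n fear translations, so constraints relate hope candidates to fear candidates only, never two hope candidates to each other:

#include <cstddef>
#include <vector>

// Stand-in types for illustration only, not Moses classes.
struct Hypothesis {
    std::vector<float> features; // decoder feature values h(e)
    float bleu;                  // approximate sentence-level BLEU
};

struct Constraint {
    std::vector<float> featureDiff; // h(hope) - h(fear)
    float loss;                     // BLEU(hope) - BLEU(fear)
};

// One constraint per (hope, fear) pair: hope_n * fear_n per sentence,
// none between two hope (or two fear) translations.
std::vector<Constraint> makeHopeFearConstraints(
        const std::vector<Hypothesis>& hope,
        const std::vector<Hypothesis>& fear) {
    std::vector<Constraint> constraints;
    for (std::size_t j = 0; j < hope.size(); ++j) {
        for (std::size_t k = 0; k < fear.size(); ++k) {
            Constraint c;
            c.featureDiff.resize(hope[j].features.size());
            for (std::size_t d = 0; d < hope[j].features.size(); ++d) {
                c.featureDiff[d] = hope[j].features[d] - fear[k].features[d];
            }
            c.loss = hope[j].bleu - fear[k].bleu;
            constraints.push_back(c);
        }
    }
    return constraints;
}

This mirrors the nested j/k loops added to MiraOptimiser::updateWeightsHopeFear below.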

View File

@@ -144,7 +144,6 @@ int main(int argc, char** argv) {
float min_sentence_update;
size_t weightedLossFunction;
size_t n;
size_t nbest_first;
size_t batchSize;
bool distinctNbest;
bool onlyViolatedConstraints;
@@ -190,7 +189,8 @@ int main(int argc, char** argv) {
bool analytical_update;
bool perceptron_update;
bool hope_fear;
size_t constraints;
int hope_n;
int fear_n;
po::options_description desc("Allowed options");
desc.add_options()
("accumulate-most-violated-constraints", po::value<bool>(&accumulateMostViolatedConstraints)->default_value(false),"Accumulate most violated constraint per example")
@@ -206,7 +206,6 @@ int main(int argc, char** argv) {
("burn-in-input-file", po::value<string>(&burnInInputFile), "Input file for burn-in phase of BLEU history")
("burn-in-reference-files", po::value<vector<string> >(&burnInReferenceFiles), "Reference file for burn-in phase of BLEU history")
("config,f", po::value<string>(&mosesConfigFile), "Moses ini file")
("constraints", po::value<size_t>(&constraints)->default_value(1), "Number of constraints used for analytical update")
("control-updates", po::value<bool>(&controlUpdates)->default_value(true), "Ignore updates that increase number of violated constraints AND increase the error")
("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
("decr-learning-rate", po::value<float>(&decrease_learning_rate)->default_value(0),"Decrease learning rate by the given value after every epoch")
@@ -215,11 +214,13 @@ int main(int argc, char** argv) {
("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use nbest list with distinct translations in inference step")
("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
("epochs,e", po::value<size_t>(&epochs)->default_value(5), "Number of epochs")
("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
("hildreth", po::value<bool>(&hildreth)->default_value(true), "Use Hildreth's optimisation algorithm")
("history-of-1best", po::value<bool>(&historyOf1best)->default_value(0), "Use the 1best translation to update the history")
("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations (not model)")
("hope-n", po::value<int>(&hope_n)->default_value(-1), "Number of hope translations used")
("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
("learning-rate", po::value<float>(&learning_rate)->default_value(1), "Learning rate (fixed or flexible)")
@@ -236,8 +237,7 @@ int main(int argc, char** argv) {
("msf-step", po::value<float>(&marginScaleFactorStep)->default_value(0), "Decrease margin scale factor iteratively by the value provided")
("multiplyA", po::value<bool>(&multiplyA)->default_value(true), "Multiply A with outcome before passing to Hildreth")
("nbest,n", po::value<size_t>(&n)->default_value(10), "Number of translations in nbest list")
("nbest-first", po::value<size_t>(&nbest_first)->default_value(0), "Number of translations in nbest list in the first epoch")
("normalise", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
("normalise", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
("only-violated-constraints", po::value<bool>(&onlyViolatedConstraints)->default_value(false), "Add only violated constraints to the optimisation problem")
("past-and-current-constraints", po::value<bool>(&pastAndCurrentConstraints)->default_value(false), "Accumulate most violated constraint per example and use them along all current constraints")
("perceptron-update", po::value<bool>(&perceptron_update)->default_value(false), "Do a simple perceptron style update")
@@ -295,8 +295,9 @@ int main(int argc, char** argv) {
return 1;
}
if (nbest_first == 0) {
nbest_first = n;
if (hope_n == -1 && fear_n == -1) {
hope_n = n;
fear_n = n;
}
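// With --hope-n and --fear-n the two list sizes may differ; when neither
// flag is given, both inherit the generic --nbest size n (-1 acts as an
// "unset" sentinel). A hypothetical invocation with placeholder file names,
// assuming the built binary is named mira:
//   mira -f moses.ini -i source.txt -e 5 --hope-fear true --hope-n 10 --fear-n 30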
// load input and references
@@ -486,7 +487,6 @@ int main(int argc, char** argv) {
cerr << "msf-min: " << marginScaleFactorMin << endl;
cerr << "weighted-loss-function: " << weightedLossFunction << endl;
cerr << "nbest: " << n << endl;
cerr << "nbest-first: " << nbest_first << endl;
cerr << "batch-size: " << batchSize << endl;
cerr << "distinct-nbest: " << distinctNbest << endl;
cerr << "only-violated-constraints: " << onlyViolatedConstraints << endl;
@@ -523,6 +523,8 @@ int main(int argc, char** argv) {
cerr << "perceptron-update: " << perceptron_update << endl;
cerr << "analytical-update: " << analytical_update << endl;
cerr << "hope-fear: " << hope_fear << endl;
cerr << "hope-n: " << hope_n << endl;
cerr << "fear-n: " << fear_n << endl;
if (learner == "mira") {
cerr << "Optimising using Mira" << endl;
@@ -608,6 +610,12 @@ int main(int argc, char** argv) {
vector<vector<float> > bleuScores;
vector<vector<float> > dummyBleuScores;
// variables for hope-fear setting
vector<vector<ScoreComponentCollection> > featureValuesHope;
vector<vector<ScoreComponentCollection> > featureValuesFear;
vector<vector<float> > bleuScoresHope;
vector<vector<float> > bleuScoresFear;
// get moses weights
ScoreComponentCollection mosesWeights = decoder->getWeights();
cerr << "\nRank " << rank << ", next batch" << endl;
@@ -632,77 +640,22 @@ int main(int argc, char** argv) {
vector<ScoreComponentCollection> newFeatureValues;
vector<float> newBleuScores;
featureValues.push_back(newFeatureValues);
dummyFeatureValues.push_back(newFeatureValues);
bleuScores.push_back(newBleuScores);
dummyBleuScores.push_back(newBleuScores);
size_t pass_n = (epoch == 0)? nbest_first : n;
if (perceptron_update || analytical_update) {
if (constraints == 1) {
if (historyOf1best) {
// MODEL (for updating the history)
cerr << "Rank " << rank << ", run decoder to get " << 1 << "best wrt model score" << endl;
vector<const Word*> bestModel = decoder->getNBest(input, *sid, 1, 0.0, bleuScoreWeight,
dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
oneBests.push_back(bestModel);
cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << dummyBleuScores[batchPosition][0] << endl;
}
// HOPE
cerr << "Rank " << rank << ", run decoder to get 1best hope translations" << endl;
size_t oraclePos = dummyFeatureValues[batchPosition].size();
vector<const Word*> oracle = decoder->getNBest(input, *sid, 1, 1.0, bleuScoreWeight,
dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
distinctNbest, rank);
// needed for history
inputLengths.push_back(decoder->getCurrentInputLength());
ref_ids.push_back(*sid);
decoder->cleanup();
oracles.push_back(oracle);
cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << dummyBleuScores[batchPosition][oraclePos] << endl;
oracleFeatureValues.push_back(dummyFeatureValues[batchPosition][oraclePos]);
oracleBleuScores.push_back(dummyBleuScores[batchPosition][oraclePos]);
// clear dummies
dummyFeatureValues[batchPosition].clear();
dummyBleuScores[batchPosition].clear();
// FEAR
cerr << "Rank " << rank << ", run decoder to get 1best fear translations" << endl;
size_t fearPos = featureValues[batchPosition].size();
vector<const Word*> fear = decoder->getNBest(input, *sid, 1, -1.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScores[batchPosition][fearPos] << endl;
for (size_t i = 0; i < fear.size(); ++i) {
delete fear[i];
}
}
else {
// TODO:
}
if (hope_fear) {
featureValuesHope.push_back(newFeatureValues);
featureValuesFear.push_back(newFeatureValues);
bleuScoresHope.push_back(newBleuScores);
bleuScoresFear.push_back(newBleuScores);
}
else {
if (!hope_fear) {
// MODEL
cerr << "Rank " << rank << ", run decoder to get " << pass_n << "best wrt model score" << endl;
vector<const Word*> bestModel = decoder->getNBest(input, *sid, pass_n, 0.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
oneBests.push_back(bestModel);
// needed for calculating bleu of dev (1best translations) // todo:
all_ref_ids.push_back(*sid);
allBestModelScore.push_back(bestModel);
cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << bleuScores[batchPosition][0] << endl;
}
else if (historyOf1best) {
// MODEL (for updating the history only, using dummy vectors)
featureValues.push_back(newFeatureValues);
dummyFeatureValues.push_back(newFeatureValues);
bleuScores.push_back(newBleuScores);
dummyBleuScores.push_back(newBleuScores);
}
if (perceptron_update || analytical_update) {
if (historyOf1best) {
// MODEL (for updating the history)
cerr << "Rank " << rank << ", run decoder to get " << 1 << "best wrt model score" << endl;
vector<const Word*> bestModel = decoder->getNBest(input, *sid, 1, 0.0, bleuScoreWeight,
dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
@@ -713,33 +666,114 @@ int main(int argc, char** argv) {
}
// HOPE
cerr << "Rank " << rank << ", run decoder to get " << pass_n << "best hope translations" << endl;
size_t oraclePos = featureValues[batchPosition].size();
vector<const Word*> oracle = decoder->getNBest(input, *sid, pass_n, 1.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
cerr << "Rank " << rank << ", run decoder to get 1best hope translations" << endl;
size_t oraclePos = dummyFeatureValues[batchPosition].size();
vector<const Word*> oracle = decoder->getNBest(input, *sid, 1, 1.0, bleuScoreWeight,
dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
distinctNbest, rank);
// needed for history
inputLengths.push_back(decoder->getCurrentInputLength());
ref_ids.push_back(*sid);
decoder->cleanup();
oracles.push_back(oracle);
cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScores[batchPosition][oraclePos] << endl;
cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << dummyBleuScores[batchPosition][oraclePos] << endl;
oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
oracleFeatureValues.push_back(dummyFeatureValues[batchPosition][oraclePos]);
oracleBleuScores.push_back(dummyBleuScores[batchPosition][oraclePos]);
// clear dummies
dummyFeatureValues[batchPosition].clear();
dummyBleuScores[batchPosition].clear();
// FEAR
cerr << "Rank " << rank << ", run decoder to get " << pass_n << "best fear translations" << endl;
cerr << "Rank " << rank << ", run decoder to get 1best fear translations" << endl;
size_t fearPos = featureValues[batchPosition].size();
vector<const Word*> fear = decoder->getNBest(input, *sid, pass_n, -1.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
vector<const Word*> fear = decoder->getNBest(input, *sid, 1, -1.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScores[batchPosition][fearPos] << endl;
for (size_t i = 0; i < fear.size(); ++i) {
delete fear[i];
}
}
else {
if (hope_fear) {
if (historyOf1best) {
// MODEL (for updating the history only, using dummy vectors)
cerr << "Rank " << rank << ", run decoder to get " << 1 << "best wrt model score" << endl;
vector<const Word*> bestModel = decoder->getNBest(input, *sid, 1, 0.0, bleuScoreWeight,
dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
oneBests.push_back(bestModel);
cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << dummyBleuScores[batchPosition][0] << endl;
}
// HOPE
cerr << "Rank " << rank << ", run decoder to get " << hope_n << "best hope translations" << endl;
vector<const Word*> oracle = decoder->getNBest(input, *sid, hope_n, 1.0, bleuScoreWeight,
featureValuesHope[batchPosition], bleuScoresHope[batchPosition], true,
distinctNbest, rank);
// needed for history
inputLengths.push_back(decoder->getCurrentInputLength());
ref_ids.push_back(*sid);
decoder->cleanup();
oracles.push_back(oracle);
cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScoresHope[batchPosition][0] << endl;
// FEAR
cerr << "Rank " << rank << ", run decoder to get " << fear_n << "best fear translations" << endl;
vector<const Word*> fear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuScoreWeight,
featureValuesFear[batchPosition], bleuScoresFear[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScoresFear[batchPosition][0] << endl;
for (size_t i = 0; i < fear.size(); ++i) {
delete fear[i];
}
}
else {
// MODEL
cerr << "Rank " << rank << ", run decoder to get " << n << "best wrt model score" << endl;
vector<const Word*> bestModel = decoder->getNBest(input, *sid, n, 0.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
oneBests.push_back(bestModel);
// needed for calculating bleu of dev (1best translations) // todo:
all_ref_ids.push_back(*sid);
allBestModelScore.push_back(bestModel);
cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << bleuScores[batchPosition][0] << endl;
// HOPE
cerr << "Rank " << rank << ", run decoder to get " << n << "best hope translations" << endl;
size_t oraclePos = featureValues[batchPosition].size();
vector<const Word*> oracle = decoder->getNBest(input, *sid, n, 1.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
// needed for history
inputLengths.push_back(decoder->getCurrentInputLength());
ref_ids.push_back(*sid);
decoder->cleanup();
oracles.push_back(oracle);
cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScores[batchPosition][oraclePos] << endl;
oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
// FEAR
cerr << "Rank " << rank << ", run decoder to get " << n << "best fear translations" << endl;
size_t fearPos = featureValues[batchPosition].size();
vector<const Word*> fear = decoder->getNBest(input, *sid, n, -1.0, bleuScoreWeight,
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank);
decoder->cleanup();
cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScores[batchPosition][fearPos] << endl;
for (size_t i = 0; i < fear.size(); ++i) {
delete fear[i];
}
}
}
// cerr << "Rank " << rank << ", sentence " << *sid << ", best model Bleu (approximate sentence bleu): " << bleuScores[batchPosition][0] << endl;
// summedApproxBleu += bleuScores[batchPosition][0];
@@ -750,12 +784,13 @@ int main(int argc, char** argv) {
++shardPosition;
} // end of batch loop
// Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
vector<vector<float> > losses(actualBatchSize);
for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
losses[batchPosition].push_back(oracleBleuScores[batchPosition]
- bleuScores[batchPosition][j]);
if (!hope_fear) {
// Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
losses[batchPosition].push_back(oracleBleuScores[batchPosition] - bleuScores[batchPosition][j]);
}
}
}
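// Note: in the hope-fear path the losses vector stays empty here; the
// analogous quantity, BLEU(hope) - BLEU(fear) per constraint, is computed
// later inside updateWeightsHopeFear.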
@@ -766,11 +801,21 @@ int main(int argc, char** argv) {
if (logFeatureValues) {
for (size_t i = 0; i < featureValues.size(); ++i) {
for (size_t j = 0; j < featureValues[i].size(); ++j) {
featureValues[i][j].ApplyLog(baseOfLog);
if (hope_fear) {
for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
featureValuesHope[i][j].ApplyLog(baseOfLog);
}
for (size_t j = 0; j < featureValuesFear[i].size(); ++j) {
featureValuesFear[i][j].ApplyLog(baseOfLog);
}
}
else {
for (size_t j = 0; j < featureValues[i].size(); ++j) {
featureValues[i][j].ApplyLog(baseOfLog);
}
oracleFeatureValues[i].ApplyLog(baseOfLog);
}
}
}
@@ -786,12 +831,29 @@ int main(int argc, char** argv) {
// optionally print out the feature values
if (print_feature_values) {
cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
for (size_t i = 0; i < featureValues.size(); ++i) {
for (size_t j = 0; j < featureValues[i].size(); ++j) {
cerr << featureValues[i][j] << endl;
if (hope_fear) {
cerr << "hope: " << endl;
for (size_t i = 0; i < featureValuesHope.size(); ++i) {
for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
cerr << featureValuesHope[i][j] << endl;
}
}
cerr << "fear: " << endl;
for (size_t i = 0; i < featureValuesFear.size(); ++i) {
for (size_t j = 0; j < featureValuesFear[i].size(); ++j) {
cerr << featureValuesFear[i][j] << endl;
}
}
cerr << endl;
}
else {
for (size_t i = 0; i < featureValues.size(); ++i) {
for (size_t j = 0; j < featureValues[i].size(); ++j) {
cerr << featureValues[i][j] << endl;
}
}
cerr << endl;
}
cerr << endl;
}
// Run optimiser on batch:
@@ -818,9 +880,16 @@ int main(int argc, char** argv) {
learning_rate, max_sentence_update, rank, epoch, controlUpdates);
}
else {
update_status = optimiser->updateWeights(mosesWeights, featureValues,
losses, bleuScores, oracleFeatureValues, oracleBleuScores, ref_ids,
learning_rate, max_sentence_update, rank, epoch, updates_per_epoch, controlUpdates);
if (hope_fear) {
update_status = optimiser->updateWeightsHopeFear(mosesWeights,
featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, ref_ids,
learning_rate, max_sentence_update, rank, epoch, updates_per_epoch, controlUpdates);
}
else {
update_status = optimiser->updateWeights(mosesWeights, featureValues,
losses, bleuScores, oracleFeatureValues, oracleBleuScores, ref_ids,
learning_rate, max_sentence_update, rank, epoch, updates_per_epoch, controlUpdates);
}
}
if (update_status[0] == 1) {

View File

@@ -11,8 +11,12 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
const vector<vector<float> >& losses,
const vector<vector<float> >& bleuScores,
const vector<ScoreComponentCollection>& oracleFeatureValues,
const vector<float> oracleBleuScores, const vector<size_t> sentenceIds,
float learning_rate, float max_sentence_update, size_t rank, size_t epoch,
const vector<float> oracleBleuScores,
const vector<size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates) {
@@ -79,6 +83,7 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
featureValueDiff.MinusEquals(featureValues[i][j]);
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
if (modelScoreDiff == 0) {
cerr << "equal feature values, constraint skipped.." << endl;
continue;
}
@@ -312,6 +317,280 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
return statusPlus;
}
vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,
const std::vector<std::vector<float> >& bleuScoresFear,
const std::vector< size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates) {
// vector of feature values differences for all created constraints
vector<ScoreComponentCollection> featureValueDiffs;
vector<float> lossMinusModelScoreDiffs;
vector<float> all_losses;
// most violated constraint in batch
ScoreComponentCollection max_batch_featureValueDiff;
float max_batch_loss = -1;
float max_batch_lossMinusModelScoreDiff = -1;
// Make constraints for new hypothesis translations
float epsilon = 0.0001;
int violatedConstraintsBefore = 0;
float oldDistanceFromOptimum = 0;
// iterate over input sentences (1 (online) or more (batch))
for (size_t i = 0; i < featureValuesHope.size(); ++i) {
size_t sentenceId = sentenceIds[i];
// Pair all hope translations with all fear translations for one input sentence
for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
for (size_t k = 0; k < featureValuesFear[i].size(); ++k) {
ScoreComponentCollection featureValueDiff = featureValuesHope[i][j];
featureValueDiff.MinusEquals(featureValuesFear[i][k]);
cerr << "feature value diff: " << featureValueDiff << endl;
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
if (modelScoreDiff == 0) {
cerr << "equal feature values, constraint skipped.." << endl;
continue;
}
float loss = bleuScoresHope[i][j] - bleuScoresFear[i][k];
loss *= m_marginScaleFactor;
if (m_weightedLossFunction == 1) {
loss *= bleuScoresHope[i][j];
}
else if (m_weightedLossFunction == 2) {
loss *= log2(bleuScoresHope[i][j]);
}
else if (m_weightedLossFunction == 10) {
loss *= log10(bleuScoresHope[i][j]);
}
// check if constraint is violated
bool violated = false;
bool addConstraint = true;
float diff = loss - modelScoreDiff;
cerr << "constraint: " << modelScoreDiff << " >= " << loss << endl;
if (diff > (epsilon + m_precision)) {
violated = true;
cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << diff << " (loss: " << loss << ")" << endl;
}
else if (m_onlyViolatedConstraints) {
addConstraint = false;
}
float lossMinusModelScoreDiff = loss - modelScoreDiff;
if (violated) {
if (m_accumulateMostViolatedConstraints || m_pastAndCurrentConstraints) {
// find the most violated constraint per batch
if (lossMinusModelScoreDiff > max_batch_lossMinusModelScoreDiff) {
max_batch_lossMinusModelScoreDiff = lossMinusModelScoreDiff;
max_batch_featureValueDiff = featureValueDiff;
max_batch_loss = loss;
}
}
}
if (addConstraint && !m_accumulateMostViolatedConstraints) {
featureValueDiffs.push_back(featureValueDiff);
lossMinusModelScoreDiffs.push_back(lossMinusModelScoreDiff);
all_losses.push_back(loss);
if (violated) {
++violatedConstraintsBefore;
oldDistanceFromOptimum += diff;
}
}
}
}
}
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
cerr << "Rank " << rank << ", epoch " << epoch << ", number of current constraints: " << featureValueDiffs.size() << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", number of current violated constraints: " << violatedConstraintsBefore << endl;
}
if (m_max_number_oracles == 1) {
for (size_t k = 0; k < sentenceIds.size(); ++k) {
size_t sentenceId = sentenceIds[k];
m_oracles[sentenceId].clear();
}
}
size_t pastViolatedConstraints = 0;
// Add constraints from past iterations (BEFORE updating that list)
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
// add all past (most violated) constraints to the list of current constraints, computed with current weights!
for (size_t i = 0; i < m_featureValueDiffs.size(); ++i) {
float modelScoreDiff = m_featureValueDiffs[i].InnerProduct(currWeights);
// check if constraint is violated
bool violated = false;
bool addConstraint = true;
float diff = m_losses[i] - modelScoreDiff;
if (diff > (epsilon + m_precision)) {
violated = true;
cerr << "Rank " << rank << ", epoch " << epoch << ", past violation: " << diff << " (loss: " << m_losses[i] << ")" << endl;
}
else if (m_onlyViolatedConstraints) {
addConstraint = false;
}
if (addConstraint) {
featureValueDiffs.push_back(m_featureValueDiffs[i]);
lossMinusModelScoreDiffs.push_back(m_losses[i] - modelScoreDiff);
all_losses.push_back(m_losses[i]);
// cerr << "old constraint: " << modelScoreDiff << " >= " << m_losses[i] << endl;
if (violated) {
++violatedConstraintsBefore;
++pastViolatedConstraints;
oldDistanceFromOptimum += diff;
}
}
}
}
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
cerr << "Rank " << rank << ", epoch " << epoch << ", number of past constraints: " << m_featureValueDiffs.size() << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", number of past violated constraints: " << pastViolatedConstraints << endl;
}
// Add new most violated constraint to the list of current constraints
if (m_accumulateMostViolatedConstraints) {
if (max_batch_loss != -1) {
float modelScoreDiff = max_batch_featureValueDiff.InnerProduct(currWeights);
float diff = max_batch_loss - modelScoreDiff;
++violatedConstraintsBefore;
oldDistanceFromOptimum += diff;
featureValueDiffs.push_back(max_batch_featureValueDiff);
lossMinusModelScoreDiffs.push_back(max_batch_loss - modelScoreDiff);
all_losses.push_back(max_batch_loss);
// cerr << "new constraint: " << modelScoreDiff << " !>= " << max_batch_loss << endl;
}
}
// Update the list of accumulated most violated constraints
if (max_batch_loss != -1) {
bool updated = false;
for (size_t i = 0; i < m_featureValueDiffs.size(); ++i) {
float oldScore = m_featureValueDiffs[i].InnerProduct(currWeights);
float newScore = max_batch_featureValueDiff.InnerProduct(currWeights);
if (abs(oldScore-newScore) < epsilon) {
m_losses[i] = max_batch_loss;
updated = true;
break;
}
}
if (!updated) {
m_featureValueDiffs.push_back(max_batch_featureValueDiff);
m_losses.push_back(max_batch_loss);
}
}
// run optimisation: compute alphas for all given constraints
vector<float> alphas;
ScoreComponentCollection summedUpdate;
if (violatedConstraintsBefore > 0) {
cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " << featureValueDiffs.size() << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", number of violated constraints passed to optimizer: " << violatedConstraintsBefore << endl;
if (m_slack != 0) {
alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs, m_slack);
} else {
alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs);
}
// Update the weight vector according to the alphas and the feature value differences
// * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
float alpha = alphas[k];
cerr << "alpha: " << alpha << endl;
ScoreComponentCollection update(featureValueDiffs[k]);
update.MultiplyEquals(alpha);
// sum up update
summedUpdate.PlusEquals(update);
}
}
else {
cerr << "Rank " << rank << ", epoch " << epoch << ", check, no constraint violated for this batch" << endl;
vector<int> status(3);
status[0] = 1;
status[1] = 0;
status[2] = 0;
return status;
}
ScoreComponentCollection newWeights(currWeights);
newWeights.PlusEquals(summedUpdate);
// Sanity check: are there still violated constraints after optimisation?
int violatedConstraintsAfter = 0;
float newDistanceFromOptimum = 0;
for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
float modelScoreDiff = featureValueDiffs[i].InnerProduct(newWeights);
float loss = all_losses[i];
float diff = loss - modelScoreDiff;
if (diff > (epsilon + m_precision)) {
++violatedConstraintsAfter;
newDistanceFromOptimum += diff;
}
}
cerr << "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl;
if (controlUpdates && violatedConstraintsAfter > 0) {
float distanceChange = oldDistanceFromOptimum - newDistanceFromOptimum;
if ((violatedConstraintsBefore - violatedConstraintsAfter) <= 0 && distanceChange < 0) {
vector<int> statusPlus(3);
statusPlus[0] = -1;
statusPlus[1] = -1;
statusPlus[2] = -1;
return statusPlus;
}
}
// Apply learning rate (fixed or flexible)
if (learning_rate != 1) {
cerr << "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl;
summedUpdate.MultiplyEquals(learning_rate);
cerr << "Rank " << rank << ", epoch " << epoch << ", update after applying learning rate: " << summedUpdate << endl;
}
// Apply threshold scaling
if (max_sentence_update != -1) {
cerr << "Rank " << rank << ", epoch " << epoch << ", update before scaling to max-sentence-update: " << summedUpdate << endl;
summedUpdate.ThresholdScaling(max_sentence_update);
cerr << "Rank " << rank << ", epoch " << epoch << ", update after scaling to max-sentence-update: " << summedUpdate << endl;
}
// Apply update to weight vector or store it for later
if (updates_per_epoch > 0) {
m_accumulatedUpdates.PlusEquals(summedUpdate);
cerr << "Rank " << rank << ", epoch " << epoch << ", new accumulated updates:" << m_accumulatedUpdates << endl;
} else {
// apply update to weight vector
cerr << "Rank " << rank << ", epoch " << epoch << ", weights before update: " << currWeights << endl;
currWeights.PlusEquals(summedUpdate);
cerr << "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl;
}
vector<int> statusPlus(3);
statusPlus[0] = 0;
statusPlus[1] = violatedConstraintsBefore;
statusPlus[2] = violatedConstraintsAfter;
return statusPlus;
}
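// For orientation: the update above is a standard MIRA step. The objective
// is not spelled out in this diff, so the following is an assumed
// reconstruction from the Hildreth::optimise calls and the update-rule
// comment:
//
//   min_{w'}  1/2 * ||w' - w||^2 + C * sum_i xi_i
//   s.t.  w' . (h(hope_i) - h(fear_i)) >= loss_i - xi_i,   xi_i >= 0
//
// where C corresponds to m_slack (the slack-free call is used when
// m_slack == 0) and the dual solution yields the alphas applied as
// w' = w + sum_i alpha_i * featureValueDiffs[i].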
vector<int> MiraOptimiser::updateWeightsAnalytically(ScoreComponentCollection& currWeights,
ScoreComponentCollection& featureValues,
float loss,

View File

@@ -29,7 +29,7 @@ namespace Mira {
class Optimiser {
public:
Optimiser() {}
virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& weights,
virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& featureValues,
float loss,
Moses::ScoreComponentCollection& oracleFeatureValues,
@@ -40,24 +40,36 @@ namespace Mira {
size_t rank,
size_t epoch,
bool controlUpdates) = 0;
virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& weights,
virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValues,
const std::vector< std::vector<float> >& losses,
const std::vector<std::vector<float> >& bleuScores,
const std::vector< Moses::ScoreComponentCollection>& oracleFeatureValues,
const std::vector< float> oracleBleuScores,
const std::vector< size_t> sentenceId,
const std::vector< size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates) = 0;
virtual std::vector<int> updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,
const std::vector<std::vector<float> >& bleuScoresFear,
const std::vector< size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates) = 0;
};
class Perceptron : public Optimiser {
public:
virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& weights,
virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& featureValues,
float loss,
Moses::ScoreComponentCollection& oracleFeatureValues,
@@ -68,19 +80,31 @@ namespace Mira {
size_t rank,
size_t epoch,
bool controlUpdates);
virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& weights,
virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValues,
const std::vector< std::vector<float> >& losses,
const std::vector<std::vector<float> >& bleuScores,
const std::vector<Moses::ScoreComponentCollection>& oracleFeatureValues,
const std::vector< float> oracleBleuScores,
const std::vector< size_t> dummy,
const std::vector< size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates);
virtual std::vector<int> updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,
const std::vector<std::vector<float> >& bleuScoresFear,
const std::vector< size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates);
};
class MiraOptimiser : public Optimiser {
@@ -105,7 +129,7 @@ namespace Mira {
~MiraOptimiser() {}
virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& weights,
virtual std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& featureValues,
float loss,
Moses::ScoreComponentCollection& oracleFeatureValues,
@@ -116,13 +140,25 @@ namespace Mira {
size_t rank,
size_t epoch,
bool controlUpdates);
virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& weights,
virtual std::vector<int> updateWeights(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValues,
const std::vector< std::vector<float> >& losses,
const std::vector<std::vector<float> >& bleuScores,
const std::vector< Moses::ScoreComponentCollection>& oracleFeatureValues,
const std::vector< float> oracleBleuScores,
const std::vector< size_t> sentenceId,
const std::vector< size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates);
virtual std::vector<int> updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,
const std::vector<std::vector<float> >& bleuScoresFear,
const std::vector< size_t> sentenceIds,
float learning_rate,
float max_sentence_update,
size_t rank,

View File

@@ -24,21 +24,40 @@ using namespace std;
namespace Mira {
vector<int> Perceptron::updateWeightsAnalytically(ScoreComponentCollection& currWeights,
ScoreComponentCollection& featureValues,
float loss,
ScoreComponentCollection& oracleFeatureValues,
float oracleBleuScore,
size_t sentenceId,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
bool controlUpdates) {
vector<int> status(1);
status[0] = 0;
return status;
}
vector<int> Perceptron::updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,
const std::vector<std::vector<float> >& bleuScoresFear,
const std::vector< size_t> sentenceId,
float learning_rate,
float max_sentence_update,
size_t rank,
size_t epoch,
int updates_per_epoch,
bool controlUpdates) {
vector<int> status(1);
status[0] = 0;
return status;
}
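// Note: the perceptron optimiser does not implement hope-fear updates; this
// stub only satisfies the new pure-virtual interface without touching the
// weights.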
vector<int> Perceptron::updateWeights(ScoreComponentCollection& currWeights,
const vector< vector<ScoreComponentCollection> >& featureValues,