Merge remote branch 'github/miramerge' into bjam

Conflicts:
	mira/Decoder.h
	mira/Main.cpp
	mira/Main.h
	moses-chart-cmd/src/IOWrapper.cpp
	moses-chart-cmd/src/Main.cpp
	moses-cmd/src/Main.cpp
	moses/src/BleuScoreFeature.cpp
	moses/src/BleuScoreFeature.h
	moses/src/TargetNgramFeature.h
Barry Haddow 2012-02-01 10:12:16 +00:00
commit 5a17ef82b3
12 changed files with 407 additions and 149 deletions

mira/Decoder.cpp

@@ -103,7 +103,7 @@ namespace Mira {
staticData.ReLoadBleuScoreFeatureParameter(bleuObjectiveWeight*bleuScoreWeight);
m_bleuScoreFeature->SetCurrentSourceLength((*m_sentence).GetSize());
m_bleuScoreFeature->SetCurrentReference(sentenceid);
m_bleuScoreFeature->SetCurrentShortestReference(sentenceid);
//run the decoder
m_manager = new Moses::Manager(*m_sentence, staticData.GetSearchAlgorithm(), &system);
@@ -124,6 +124,8 @@ namespace Mira {
Phrase bestPhrase = path.GetTargetPhrase();
if (iter != sentences.begin())
cerr << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", \"";
Phrase phrase = path.GetTargetPhrase();
for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
@@ -192,12 +194,12 @@ namespace Mira {
m_bleuScoreFeature->PrintHistory(out);
}
void MosesDecoder::printReferenceLength(const vector<size_t>& ref_ids) {
/* void MosesDecoder::printReferenceLength(const vector<size_t>& ref_ids) {
m_bleuScoreFeature->PrintReferenceLength(ref_ids);
}
}*/
size_t MosesDecoder::getReferenceLength(size_t ref_id) {
return m_bleuScoreFeature->GetReferenceLength(ref_id);
size_t MosesDecoder::getClosestReferenceLength(size_t ref_id, int hypoLength) {
return m_bleuScoreFeature->GetClosestReferenceLength(ref_id, hypoLength);
}
void MosesDecoder::setBleuParameters(bool scaleByInputLength, bool scaleByRefLength, bool scaleByAvgLength,

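The interface change above (getReferenceLength giving way to getClosestReferenceLength) follows from the move to multiple references per sentence: length ratios are now computed against the reference whose length is closest to the hypothesis, as in multi-reference BLEU. A minimal sketch of that selection, mirroring the BleuScoreFeature implementation further down (closestRefLength, refLens and hypoLen are illustrative names, not part of the commit):

#include <cstdlib>
#include <vector>

// Pick the reference length closest to the hypothesis length.
// Ties keep the earlier reference, matching GetClosestReferenceLength below.
size_t closestRefLength(const std::vector<size_t>& refLens, int hypoLen) {
    size_t best = refLens[0];
    int bestDist = std::abs(hypoLen - (int)refLens[0]);
    for (size_t i = 1; i < refLens.size(); ++i) {
        int d = std::abs(hypoLen - (int)refLens[i]);
        if (d < bestDist) { best = refLens[i]; bestDist = d; }
    }
    return best;
}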
mira/Decoder.h

@@ -68,12 +68,18 @@ class MosesDecoder {
void printBleuFeatureHistory(std::ostream& out);
void printReferenceLength(const std::vector<size_t>& ref_ids);
size_t getReferenceLength(size_t ref_id);
// void printReferenceLength(const std::vector<size_t>& ref_ids);
size_t getClosestReferenceLength(size_t ref_id, int hypoLength);
void setBleuParameters(bool scaleByInputLength, bool scaleByRefLength, bool scaleByAvgLength,
bool scaleByTargetLengthLinear, bool scaleByTargetLengthTrend,
float scaleByX, float historySmoothing, size_t scheme, float relax_BP);
Moses::ScoreComponentCollection getWeights();
void setWeights(const Moses::ScoreComponentCollection& weights);
void cleanup();
void cleanup();
void setCorrection(float correction) {
m_bleuScoreFeature->SetCorrection(correction);
}
private:
float getBleuScore(const Moses::ScoreComponentCollection& scores);

mira/Main.cpp

@@ -116,12 +116,15 @@ int main(int argc, char** argv) {
int threadcount;
size_t adapt_after_epoch;
size_t bleu_smoothing_scheme;
float max_length_deviation;
float max_length_dev_all;
float max_length_dev_hypos;
float max_length_dev_reference;
float max_length_dev_hope_ref;
float max_length_dev_fear_ref;
float relax_BP;
bool stabiliseLength;
bool delayUpdates;
float min_oracle_bleu;
bool correctScaling;
po::options_description desc("Allowed options");
desc.add_options()
("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
@@ -134,6 +137,7 @@ int main(int argc, char** argv) {
("bleu-smoothing-scheme", po::value<size_t>(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)")
("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
("core-weights", po::value<string>(&coreWeightFile), "Weight file containing the core weights (already tuned, have to be non-zero)")
("correct-scaling", po::value<bool>(&correctScaling)->default_value(false), "Try to correct for scaling issues")
("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
("decr-learning-rate", po::value<float>(&decrease_learning_rate)->default_value(0),"Decrease learning rate by the given value after every epoch")
("delay-updates", po::value<bool>(&delayUpdates)->default_value(false), "Delay all updates until the end of an epoch")
@@ -151,11 +155,13 @@ int main(int argc, char** argv) {
("log-feature-values", po::value<bool>(&logFeatureValues)->default_value(false), "Take log of feature values according to the given base.")
("margin-incr", po::value<float>(&margin_slack_incr)->default_value(0), "Increment margin slack after every epoch by this amount")
("margin-slack", po::value<float>(&margin_slack)->default_value(0), "Slack when comparing left and right hand side of constraints")
("max-length-deviation", po::value<float>(&max_length_deviation)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation between hope/fear translations and w.r.t. reference translations")
("max-length-dev-hypos", po::value<float>(&max_length_dev_hypos)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation between hop/fear translations")
("max-length-dev-reference", po::value<float>(&max_length_dev_reference)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation of hope/fear translations w.r.t. reference translations")
("max-length-dev-all", po::value<float>(&max_length_dev_all)->default_value(-1), "Make use of all 3 following options")
("max-length-dev-hypos", po::value<float>(&max_length_dev_hypos)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation between hope and fear translations")
("max-length-dev-hope-ref", po::value<float>(&max_length_dev_hope_ref)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation between hope and reference translations")
("max-length-dev-fear-ref", po::value<float>(&max_length_dev_fear_ref)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation between fear and reference translations")
("min-learning-rate", po::value<float>(&min_learning_rate)->default_value(0), "Set a minimum learning rate")
("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
("min-oracle-bleu", po::value<float>(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score")
("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(5), "How often per epoch to mix weights, when using mpi")
("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation")
@@ -352,6 +358,8 @@ int main(int argc, char** argv) {
cerr << "Error: Need to select an one of parameters --hope-fear/--model-hope-fear for mira update." << endl;
return 1;
}
if (historyOf1best || historyOfOracles)
sentenceLevelBleu = false;
if (!sentenceLevelBleu) {
if (!historyOf1best && !historyOfOracles) {
historyOf1best = true;
@@ -361,9 +369,10 @@ int main(int argc, char** argv) {
bleuScoreWeight_hope = bleuScoreWeight;
}
if (max_length_deviation != -1) {
max_length_dev_reference = max_length_deviation;
max_length_dev_hypos = max_length_deviation;
if (max_length_dev_all != -1) {
max_length_dev_hypos = max_length_dev_all;
max_length_dev_hope_ref = max_length_dev_all;
max_length_dev_fear_ref = max_length_dev_all;
}
#ifdef MPI_ENABLE
@@ -429,9 +438,10 @@ int main(int argc, char** argv) {
numberOfUpdatesThisEpoch = 0;
// Sum up weights over one epoch, final average uses weights from last epoch
if (!accumulateWeights) {
if (!accumulateWeights)
cumulativeWeights.ZeroAll();
}
delayedWeightUpdates.ZeroAll();
// number of weight dumps this epoch
size_t weightEpochDump = 0;
@@ -495,7 +505,8 @@ int main(int argc, char** argv) {
}
}
size_t reference_length = decoder->getReferenceLength(*sid);
size_t ref_length;
float avg_ref_length;
if (hope_fear || perceptron_update) {
// HOPE
cerr << "Rank " << rank << ", epoch " << epoch << ", " << hope_n << "best hope translations" << endl;
@@ -504,8 +515,11 @@ int main(int argc, char** argv) {
distinctNbest, rank, epoch);
size_t current_input_length = decoder->getCurrentInputLength();
decoder->cleanup();
float hope_length_ratio = (float)oracle.size()/reference_length;
ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
avg_ref_length = ref_length;
float hope_length_ratio = (float)oracle.size()/ref_length;
cerr << ", l-ratio hope: " << hope_length_ratio << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", current input length: " << current_input_length << endl;
vector<const Word*> bestModel;
if (historyOf1best || stabiliseLength) {
@@ -516,8 +530,9 @@ int main(int argc, char** argv) {
distinctNbest, rank, epoch);
decoder->cleanup();
cerr << endl;
ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
dev_hypothesis_length += bestModel.size();
dev_reference_length += reference_length;
dev_reference_length += ref_length;
}
// FEAR
@@ -526,7 +541,10 @@ int main(int argc, char** argv) {
featureValuesFear[batchPosition], bleuScoresFear[batchPosition], true,
distinctNbest, rank, epoch);
decoder->cleanup();
float fear_length_ratio = (float)fear.size()/reference_length;
ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
avg_ref_length += ref_length;
avg_ref_length /= 2;
float fear_length_ratio = (float)fear.size()/ref_length;
cerr << ", l-ratio fear: " << fear_length_ratio << endl;
for (size_t i = 0; i < fear.size(); ++i) {
delete fear[i];
@@ -537,11 +555,12 @@ int main(int argc, char** argv) {
float length_diff_fear = abs(1 - fear_length_ratio);
size_t length_diff_hope_fear = abs((int)oracle.size() - (int)fear.size());
cerr << "Rank " << rank << ", epoch " << epoch << ", abs-length hope-fear: " << length_diff_hope_fear << ", BLEU hope-fear: " << bleuScoresHope[batchPosition][0] - bleuScoresFear[batchPosition][0] << endl;
bool skip = false;
if (max_length_dev_reference != -1 && (length_diff_hope > max_length_dev_reference || length_diff_fear > max_length_dev_reference))
if (max_length_dev_hypos != -1 && (length_diff_hope_fear > avg_ref_length * max_length_dev_hypos))
skip = true;
if (max_length_dev_hypos != -1 && (length_diff_hope_fear > reference_length * max_length_dev_hypos))
if (max_length_dev_hope_ref != -1 && length_diff_hope > max_length_dev_hope_ref)
skip = true;
if (max_length_dev_fear_ref != -1 && length_diff_fear > max_length_dev_fear_ref)
skip = true;
if (skip) {
cerr << "Rank " << rank << ", epoch " << epoch << ", skip example (" << hope_length_ratio << ", " << fear_length_ratio << ", " << length_diff_hope_fear << ").. " << endl;
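To make the three checks concrete with illustrative numbers (not taken from the commit): with avg_ref_length = 20 and --max-length-dev-hypos 0.2, an example is skipped once hope and fear differ by more than 20 * 0.2 = 4 words; the reference-relative checks instead compare |1 - hypo_length/ref_length| directly against their thresholds, so --max-length-dev-hope-ref 0.1 skips examples whose hope translation deviates from the closest reference length by more than 10%.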
@@ -579,7 +598,8 @@ int main(int argc, char** argv) {
ref_ids.push_back(*sid);
decoder->cleanup();
oracles.push_back(oracle);
float hope_length_ratio = (float)oracle.size()/reference_length;
ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
float hope_length_ratio = (float)oracle.size()/ref_length;
cerr << ", l-ratio hope: " << hope_length_ratio << endl;
oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
@@ -592,11 +612,12 @@ int main(int argc, char** argv) {
distinctNbest, rank, epoch);
decoder->cleanup();
oneBests.push_back(bestModel);
float model_length_ratio = (float)bestModel.size()/reference_length;
ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
float model_length_ratio = (float)bestModel.size()/ref_length;
cerr << ", l-ratio model: " << model_length_ratio << endl;
if (stabiliseLength) {
dev_hypothesis_length += bestModel.size();
dev_reference_length += reference_length;
dev_reference_length += ref_length;
}
// FEAR
@@ -606,7 +627,8 @@ int main(int argc, char** argv) {
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank, epoch);
decoder->cleanup();
float fear_length_ratio = (float)fear.size()/reference_length;
ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
float fear_length_ratio = (float)fear.size()/ref_length;
cerr << ", l-ratio fear: " << fear_length_ratio << endl;
for (size_t i = 0; i < fear.size(); ++i) {
delete fear[i];
@@ -649,10 +671,9 @@ int main(int argc, char** argv) {
iter = featureFunctions.begin();
for (; iter != featureFunctions.end(); ++iter) {
if ((*iter)->GetScoreProducerWeightShortName() == "w") {
ignoreWPFeature(featureValues, (*iter));
ignoreWPFeature(featureValuesHope, (*iter));
ignoreWPFeature(featureValuesFear, (*iter));
break;
ignoreFeature(featureValues, (*iter));
ignoreFeature(featureValuesHope, (*iter));
ignoreFeature(featureValuesFear, (*iter));
}
}
}
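Alongside the rename, the loop above also drops the break statement, so every score producer whose weight short name is "w" now has its values zeroed rather than only the first match. An illustrative call, with wordPenaltyProducer standing in for one of the iterated feature functions:

// zero the word-penalty dimension in all stored hope n-best feature vectors (sketch)
ignoreFeature(featureValuesHope, wordPenaltyProducer);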
@@ -696,8 +717,16 @@ int main(int argc, char** argv) {
featureValuesHope, featureValuesFear, dummy1, dummy1, learning_rate, rank, epoch);
}
else if (hope_fear) {
update_status = optimiser->updateWeightsHopeFear(mosesWeights, weightUpdate,
featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, learning_rate, rank, epoch);
if (bleuScoresHope[0][0] >= min_oracle_bleu)
if (hope_n == 1 && fear_n ==1)
update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(mosesWeights, weightUpdate,
featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0], bleuScoresFear[0][0],
learning_rate, rank, epoch);
else
update_status = optimiser->updateWeightsHopeFear(mosesWeights, weightUpdate,
featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, learning_rate, rank, epoch);
else
update_status = -1;
}
else {
// model_hope_fear
@@ -709,31 +738,34 @@ int main(int argc, char** argv) {
if (update_status == 0) { // if weights were updated
// apply weight update
mosesWeights.PlusEquals(weightUpdate);
if (normaliseWeights) {
mosesWeights.L1Normalise();
if (delayUpdates) {
delayedWeightUpdates.PlusEquals(weightUpdate);
cerr << "\nRank " << rank << ", epoch " << epoch << ", keeping update: " << weightUpdate << endl;
++numberOfUpdatesThisEpoch;
}
else {
mosesWeights.PlusEquals(weightUpdate);
if (normaliseWeights)
mosesWeights.L1Normalise();
cumulativeWeights.PlusEquals(mosesWeights);
++numberOfUpdates;
++numberOfUpdatesThisEpoch;
if (averageWeights) {
ScoreComponentCollection averageWeights(cumulativeWeights);
if (accumulateWeights) {
averageWeights.DivideEquals(numberOfUpdates);
} else {
averageWeights.DivideEquals(numberOfUpdatesThisEpoch);
cumulativeWeights.PlusEquals(mosesWeights);
++numberOfUpdates;
++numberOfUpdatesThisEpoch;
if (averageWeights) {
ScoreComponentCollection averageWeights(cumulativeWeights);
if (accumulateWeights) {
averageWeights.DivideEquals(numberOfUpdates);
} else {
averageWeights.DivideEquals(numberOfUpdatesThisEpoch);
}
mosesWeights = averageWeights;
}
mosesWeights = averageWeights;
if (!delayUpdates)
// set new Moses weights
decoder->setWeights(mosesWeights);
}
if (delayUpdates)
delayedWeightUpdates.PlusEquals(weightUpdate);
else
// set new Moses weights
decoder->setWeights(mosesWeights);
}
// update history (for approximate document Bleu)
@@ -788,7 +820,7 @@ int main(int argc, char** argv) {
} // end mixing
// Dump weights?
if (evaluateModulo(shardPosition, dumping_base, actualBatchSize)) {
if (!delayUpdates && evaluateModulo(shardPosition, dumping_base, actualBatchSize)) {
ScoreComponentCollection tmpAverageWeights(cumulativeWeights);
bool proceed = false;
if (accumulateWeights) {
@@ -817,25 +849,25 @@ int main(int argc, char** argv) {
// normalise weights after averaging
if (normaliseWeights) {
mixedAverageWeights.L1Normalise();
mixedAverageWeights.L1Normalise();
}
// dump final average weights
ostringstream filename;
if (epoch < 10) {
filename << weightDumpStem << "_0" << epoch;
filename << weightDumpStem << "_0" << epoch;
} else {
filename << weightDumpStem << "_" << epoch;
filename << weightDumpStem << "_" << epoch;
}
if (weightDumpFrequency > 1) {
filename << "_" << weightEpochDump;
filename << "_" << weightEpochDump;
}
if (accumulateWeights) {
cerr << "\nMixed average weights (cumulative) during epoch " << epoch << ": " << mixedAverageWeights << endl;
cerr << "\nMixed average weights (cumulative) during epoch " << epoch << ": " << mixedAverageWeights << endl;
} else {
cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl;
cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl;
}
cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
@@ -847,12 +879,102 @@ int main(int argc, char** argv) {
} // end of shard loop, end of this epoch
if (correctScaling && epoch == 0) {
float averageRatio = ((MiraOptimiser*) optimiser)->getSumRatios();
averageRatio /= numberOfUpdatesThisEpoch;
cerr << "Rank " << rank << ", epoch " << epoch << ", average ratio: " << averageRatio << endl;
float correctionFactor = 0.9;
float mixedAverageRatio = 0;
float *sendbuf_float, *recvbuf_float;
sendbuf_float = (float *) malloc(sizeof(float));
recvbuf_float = (float *) malloc(sizeof(float));
#ifdef MPI_ENABLE
// average across processes
// mpi::reduce(world, averageRatio, mixedAverageRatio, SCCPlus(), 0);
sendbuf_float[0] = averageRatio;
recvbuf_float[0] = 0;
MPI_Reduce(sendbuf_float, recvbuf_float, 1, MPI_FLOAT, MPI_SUM, 0, world);
mixedAverageRatio = recvbuf_float[0];
if (rank == 0) {
mixedAverageRatio /= size;
mixedAverageRatio *= correctionFactor;
mpi::broadcast(world, mixedAverageRatio, 0);
}
#endif
#ifndef MPI_ENABLE
mixedAverageRatio = averageRatio;
mixedAverageRatio *= correctionFactor;
#endif
decoder->setCorrection(mixedAverageRatio);
cerr << "Rank " << rank << ", epoch " << epoch << ", setting scaling correction to " << mixedAverageRatio << "." << endl;
decoder->setWeights(initialWeights);
cerr << "Rank " << rank << ", epoch " << epoch << ", resetting decoder weights to initial weights." << endl;
}
if (delayUpdates) {
// apply all updates from this epoch to the weight vector
ScoreComponentCollection mosesWeights = decoder->getWeights();
cerr << "Rank " << rank << ", epoch " << epoch << ", delayed update, old moses weights: " << mosesWeights << endl;
mosesWeights.PlusEquals(delayedWeightUpdates);
cumulativeWeights.PlusEquals(mosesWeights);
decoder->setWeights(mosesWeights);
cerr << "Rank " << rank << ", epoch " << epoch << ", delayed update, new moses weights: " << mosesWeights << endl;
ScoreComponentCollection tmpAverageWeights(cumulativeWeights);
bool proceed = false;
if (accumulateWeights) {
if (numberOfUpdatesThisEpoch > 0) {
tmpAverageWeights.DivideEquals(epoch+1);
proceed = true;
}
}
else {
if (numberOfUpdatesThisEpoch > 0)
proceed = true;
}
if (proceed) {
#ifdef MPI_ENABLE
// average across processes
mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0);
#endif
#ifndef MPI_ENABLE
mixedAverageWeights = tmpAverageWeights;
#endif
if (rank == 0 && !weightDumpStem.empty()) {
// divide by number of processes
mixedAverageWeights.DivideEquals(size);
// normalise weights after averaging
if (normaliseWeights) {
mixedAverageWeights.L1Normalise();
}
// dump final average weights
ostringstream filename;
if (epoch < 10) {
filename << weightDumpStem << "_0" << epoch;
} else {
filename << weightDumpStem << "_" << epoch;
}
if (weightDumpFrequency > 1) {
filename << "_" << weightEpochDump;
}
if (accumulateWeights) {
cerr << "\nMixed average weights (cumulative) during epoch " << epoch << ": " << mixedAverageWeights << endl;
} else {
cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl;
}
cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
mixedAverageWeights.Save(filename.str());
++weightEpochDump;
}
}
}
if (stabiliseLength && !fixLength) {
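Stepping back to the correct-scaling block above: MPI reductions and broadcasts are collective operations, so the conventional pattern has every rank enter both calls, whereas the committed code invokes mpi::broadcast only on rank 0. A minimal sketch of the usual reduce-then-broadcast shape, assuming Boost.MPI as already used elsewhere in this file (an illustration, not the committed code):

#ifdef MPI_ENABLE
float mixedAverageRatio = 0;
// sum the per-process ratios onto rank 0
mpi::reduce(world, averageRatio, mixedAverageRatio, std::plus<float>(), 0);
if (rank == 0)
    mixedAverageRatio = mixedAverageRatio / size * correctionFactor;
// collective: every rank calls this, receiving rank 0's value
mpi::broadcast(world, mixedAverageRatio, 0);
#endif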
@@ -1050,10 +1172,10 @@ void ignoreCoreFeatures(vector<vector<ScoreComponentCollection> > &featureValues
}
}
void ignoreWPFeature(vector<vector<ScoreComponentCollection> > &featureValues, const ScoreProducer* sp) {
void ignoreFeature(vector<vector<ScoreComponentCollection> > &featureValues, const ScoreProducer* sp) {
for (size_t i = 0; i < featureValues.size(); ++i)
for (size_t j = 0; j < featureValues[i].size(); ++j)
// set WP feature to 0
// set feature to 0
featureValues[i][j].Assign(sp, 0);
}

mira/Main.h

@@ -47,7 +47,7 @@ bool loadWeights(const std::string& filename, StrFloatMap& coreWeightMap);
bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size);
void printFeatureValues(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues);
void ignoreCoreFeatures(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, StrFloatMap &coreWeightMap);
void ignoreWPFeature(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, const Moses::ScoreProducer* sp);
void ignoreFeature(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, const Moses::ScoreProducer* sp);
void takeLogs(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, size_t base);
void deleteTranslations(std::vector<std::vector<const Moses::Word*> > &translations);

mira/MiraOptimiser.cpp

@@ -107,7 +107,7 @@ size_t MiraOptimiser::updateWeights(
// * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
float alpha = alphas[k];
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl);
cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
ScoreComponentCollection update(featureValueDiffs[k]);
update.MultiplyEquals(alpha);
@@ -233,7 +233,7 @@ size_t MiraOptimiser::updateWeightsHopeFear(
addConstraint = false;
}
float lossMinusModelScoreDiff = loss - modelScoreDiff;
float lossMinusModelScoreDiff = loss - (modelScoreDiff + m_margin_slack);
if (addConstraint) {
featureValueDiffs.push_back(featureValueDiff);
lossMinusModelScoreDiffs.push_back(lossMinusModelScoreDiff);
@@ -264,7 +264,7 @@ size_t MiraOptimiser::updateWeightsHopeFear(
// * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
float alpha = alphas[k];
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl);
cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
ScoreComponentCollection update(featureValueDiffs[k]);
update.MultiplyEquals(alpha);
@@ -321,5 +321,96 @@ size_t MiraOptimiser::updateWeightsHopeFear(
return 0;
}
size_t MiraOptimiser::updateWeightsAnalytically(
ScoreComponentCollection& currWeights,
ScoreComponentCollection& weightUpdate,
ScoreComponentCollection& featureValuesHope,
ScoreComponentCollection& featureValuesFear,
float bleuScoreHope,
float bleuScoreFear,
float learning_rate,
size_t rank,
size_t epoch) {
float epsilon = 0.0001;
float oldDistanceFromOptimum = 0;
bool constraintViolatedBefore = false;
// cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope << endl;
// cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear << endl;
ScoreComponentCollection featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
cerr << "Rank " << rank << ", epoch " << epoch << ", hope - fear: " << featureValueDiff << endl;
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
float loss = bleuScoreHope - bleuScoreFear;
float diff = 0;
float ratio = (modelScoreDiff == 0) ? 0 : loss / modelScoreDiff;
cerr << "Rank " << rank << ", epoch " << epoch << ", ratio model/loss: " << ratio << endl;
if (loss > (modelScoreDiff + m_margin_slack)) {
diff = loss - (modelScoreDiff + m_margin_slack);
}
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " + " << m_margin_slack << " >= " << loss << " (current violation: " << diff << ")" << endl;
if (epoch == 0)
m_sum_ratios += ratio;
if (diff > epsilon) {
// constraint violated
oldDistanceFromOptimum += diff;
constraintViolatedBefore = true;
// compute alpha for given constraint: (loss - model score diff) / || feature value diff ||^2
// featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff)
// from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2}
float squaredNorm = featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm();
if (squaredNorm > 0) {
float alpha = diff / squaredNorm;
if (m_slack > 0 ) {
if (alpha > m_slack) {
alpha = m_slack;
}
else if (alpha < m_slack*(-1)) {
alpha = m_slack*(-1);
}
}
cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
featureValueDiff.MultiplyEquals(alpha);
weightUpdate.PlusEquals(featureValueDiff);
}
else {
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", no update because squared norm is 0" << endl);
}
}
if (!constraintViolatedBefore) {
// constraint satisfied, nothing to do
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint already satisfied" << endl;
return 1;
}
// sanity check: constraint still violated after optimisation?
/* ScoreComponentCollection newWeights(currWeights);
newWeights.PlusEquals(weightUpdate);
bool constraintViolatedAfter = false;
float newDistanceFromOptimum = 0;
featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
modelScoreDiff = featureValueDiff.InnerProduct(newWeights);
diff = loss - (modelScoreDiff + m_margin_slack);
// approximate comparison between floats!
if (diff > epsilon) {
constraintViolatedAfter = true;
newDistanceFromOptimum += (loss - modelScoreDiff);
}
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, constraint violated before? " << constraintViolatedBefore << ", after? " << constraintViolatedAfter << endl);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
*/
return 0;
}
}

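The updateWeightsAnalytically method added above solves the single-constraint MIRA problem in closed form. In the notation of the code, with \(\Delta h = h(\text{hope}) - h(\text{fear})\), \(\ell = \mathrm{BLEU}(\text{hope}) - \mathrm{BLEU}(\text{fear})\), margin slack \(m\) (m_margin_slack) and cap \(C\) (m_slack), the step is

\[ \alpha = \min\!\left(C,\; \frac{\big[\ell - (w^{\top}\Delta h + m)\big]_{+}}{\lVert \Delta h \rVert^{2}}\right), \qquad w \leftarrow w + \alpha\,\Delta h, \]

where \([x]_{+} = \max(0, x)\); this is the Crammer and Singer (2006) update referenced in the comments, applied to a single hope/fear pair.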
mira/Optimiser.h

@@ -67,7 +67,8 @@ namespace Mira {
m_slack(slack),
m_scale_margin(scale_margin),
m_scale_update(scale_update),
m_margin_slack(margin_slack) { }
m_margin_slack(margin_slack),
m_sum_ratios(0) { }
size_t updateWeights(Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& weightUpdate,
@@ -88,6 +89,15 @@ namespace Mira {
float learning_rate,
size_t rank,
size_t epoch);
size_t updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& weightUpdate,
Moses::ScoreComponentCollection& featureValuesHope,
Moses::ScoreComponentCollection& featureValuesFear,
float bleuScoresHope,
float bleuScoresFear,
float learning_rate,
size_t rank,
size_t epoch);
void setSlack(float slack) {
m_slack = slack;
@@ -97,6 +107,10 @@ namespace Mira {
m_margin_slack = margin_slack;
}
float getSumRatios() {
return m_sum_ratios;
}
private:
// add only violated constraints to the optimisation problem
@@ -112,6 +126,9 @@ namespace Mira {
// scale update with log 10 of oracle BLEU score
size_t m_scale_update;
// collect (loss diff/model score diff) ratios from first epoch
float m_sum_ratios;
};
}

moses-chart-cmd/src/IOWrapper.cpp

@@ -367,6 +367,12 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
std::string lastName = "";
// output stateful sparse features
const vector<const StatefulFeatureFunction*>& sff = system->GetStatefulFeatureFunctions();
for( size_t i=0; i<sff.size(); i++ )
if (sff[i]->GetNumScoreComponents() == ScoreProducer::unlimited)
OutputSparseFeatureScores( out, path, sff[i], lastName );
// translation components
const vector<PhraseDictionaryFeature*>& pds = system->GetPhraseDictionaries();
if (pds.size() > 0) {
@@ -408,19 +414,14 @@
}
}
// output sparse features
// output stateless sparse features
lastName = "";
const vector<const StatefulFeatureFunction*>& sff = system->GetStatefulFeatureFunctions();
for( size_t i=0; i<sff.size(); i++ )
if (sff[i]->GetNumScoreComponents() == ScoreProducer::unlimited)
OutputSparseFeatureScores( out, path, sff[i], lastName );
const vector<const StatelessFeatureFunction*>& slf = system->GetStatelessFeatureFunctions();
for( size_t i=0; i<slf.size(); i++ )
if (sff[i]->GetNumScoreComponents() == ScoreProducer::unlimited)
OutputSparseFeatureScores( out, path, slf[i], lastName );
// total
out << " ||| " << path.GetTotalScore();

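One detail worth flagging in the relocated stateless block above: the loop iterates over slf but tests sff[i]->GetNumScoreComponents(); presumably slf[i] was intended. A corrected sketch of that loop (an illustration, not the committed code):

for( size_t i=0; i<slf.size(); i++ )
if (slf[i]->GetNumScoreComponents() == ScoreProducer::unlimited)
OutputSparseFeatureScores( out, path, slf[i], lastName );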
moses-chart-cmd/src/Main.cpp

@@ -173,17 +173,13 @@ static void PrintFeatureWeight(const FeatureFunction* ff)
<< ff->GetScoreProducerWeightShortName() << " "
<< values[i] << endl;
}
}
static void PrintSparseFeatureWeight(const FeatureFunction* ff)
{
if (ff->GetNumScoreComponents() == ScoreProducer::unlimited) {
if (ff->GetSparseProducerWeight() == 1)
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " sparse" << endl;
else
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " " << ff->GetSparseProducerWeight() << endl;
else {
if (ff->GetSparseProducerWeight() == 1)
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " sparse" << endl;
else
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " " << ff->GetSparseProducerWeight() << endl;
}
}
@@ -208,9 +204,6 @@ static void ShowWeights()
for (size_t i = 0; i < slf.size(); ++i) {
PrintFeatureWeight(slf[i]);
}
for (size_t i = 0; i < sff.size(); ++i) {
PrintSparseFeatureWeight(sff[i]);
}
}

moses-cmd/src/Main.cpp

@@ -295,17 +295,13 @@ static void PrintFeatureWeight(const FeatureFunction* ff)
<< ff->GetScoreProducerWeightShortName() << " "
<< values[i] << endl;
}
}
static void PrintSparseFeatureWeight(const FeatureFunction* ff)
{
if (ff->GetNumScoreComponents() == ScoreProducer::unlimited) {
if (ff->GetSparseProducerWeight() == 1)
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " sparse" << endl;
else
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " " << ff->GetSparseProducerWeight() << endl;
else {
if (ff->GetSparseProducerWeight() == 1)
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " sparse" << endl;
else
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " " << ff->GetSparseProducerWeight() << endl;
}
}
@@ -322,7 +318,7 @@ static void ShowWeights()
PrintFeatureWeight(sff[i]);
}
for (size_t i = 0; i < slf.size(); ++i) {
PrintFeatureWeight(slf[i]);
PrintFeatureWeight(slf[i]);
}
for (size_t i = 0; i < pds.size(); ++i) {
PrintFeatureWeight(pds[i]);
@@ -330,9 +326,6 @@
for (size_t i = 0; i < gds.size(); ++i) {
PrintFeatureWeight(gds[i]);
}
for (size_t i = 0; i < sff.size(); ++i) {
PrintSparseFeatureWeight(sff[i]);
}
}
/** main function of the command line version of the decoder **/

moses/src/BleuScoreFeature.cpp

@@ -98,37 +98,47 @@ void BleuScoreFeature::SetBleuParameters(bool scaleByInputLength, bool scaleByRe
void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::string > >& refs)
{
m_refs.clear();
FactorCollection& fc = FactorCollection::Instance();
cerr << "Number of reference files: " << refs.size() << endl;
for (size_t file_id = 0; file_id < refs.size(); file_id++) {
for (size_t ref_id = 0; ref_id < refs[file_id].size(); ref_id++) {
const string& ref = refs[file_id][ref_id];
vector<string> refTokens = Tokenize(ref);
m_refs[ref_id] = pair<size_t,NGrams>();
pair<size_t,NGrams>& ref_pair = m_refs[ref_id];
ref_pair.first = refTokens.size();
for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) {
for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) {
Phrase ngram(1);
for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) {
const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]);
Word w;
w.SetFactor(0, f);
ngram.AddWord(w);
}
ref_pair.second[ngram] += 1;
}
}
}
}
FactorCollection& fc = FactorCollection::Instance();
for (size_t file_id = 0; file_id < refs.size(); file_id++) {
for (size_t ref_id = 0; ref_id < refs[file_id].size(); ref_id++) {
const string& ref = refs[file_id][ref_id];
vector<string> refTokens = Tokenize(ref);
if (file_id == 0)
m_refs[ref_id] = pair<vector<size_t>,NGrams>();
pair<vector<size_t>,NGrams>& ref_pair = m_refs[ref_id];
(ref_pair.first).push_back(refTokens.size());
for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) {
for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) {
Phrase ngram(1);
for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) {
const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]);
Word w;
w.SetFactor(0, f);
ngram.AddWord(w);
}
ref_pair.second[ngram] += 1;
}
}
}
}
// for (size_t i = 0; i < m_refs.size(); ++i) {
// cerr << "ref id " << i << ", number of entries: " << (m_refs[i].first).size() << endl;
// }
}
void BleuScoreFeature::SetCurrentSourceLength(size_t source_length) {
m_cur_source_length = source_length;
}
void BleuScoreFeature::SetCurrentReference(size_t ref_id) {
m_cur_ref_length = m_refs[ref_id].first;
void BleuScoreFeature::SetCurrentShortestReference(size_t ref_id) {
// look for shortest reference
int shortestRef = -1;
for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) {
if (shortestRef == -1 || (m_refs[ref_id].first)[i] < shortestRef)
shortestRef = (m_refs[ref_id].first)[i];
}
m_cur_ref_length = shortestRef;
m_cur_ref_ngrams = m_refs[ref_id].second;
}
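After this change each sentence id in m_refs carries one length per reference file plus an n-gram table pooled over all of its references, and SetCurrentShortestReference picks the smallest of those lengths for the brevity side of the feature. Schematically, with illustrative values:

// m_refs : unordered_map<size_t, pair<vector<size_t>, NGrams> >
// m_refs[sid].first   one length per reference file, e.g. {18, 20, 25}
// m_refs[sid].second  n-gram counts pooled over all references of sid
// SetCurrentShortestReference(sid) then sets m_cur_ref_length to 18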
@@ -163,15 +173,16 @@ void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) {
* Update history with a batch of translations
*/
void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch) {
for (size_t batchPosition = 0; batchPosition < hypos.size(); ++batchPosition){
Phrase phrase(hypos[batchPosition]);
for (size_t ref_id = 0; ref_id < hypos.size(); ++ref_id){
Phrase phrase(hypos[ref_id]);
std::vector< size_t > ngram_counts(BleuScoreState::bleu_order);
std::vector< size_t > ngram_matches(BleuScoreState::bleu_order);
// set current source and reference information for each oracle in the batch
size_t cur_source_length = sourceLengths[batchPosition];
size_t cur_ref_length = m_refs[ref_ids[batchPosition]].first;
NGrams cur_ref_ngrams = m_refs[ref_ids[batchPosition]].second;
size_t cur_source_length = sourceLengths[ref_id];
size_t hypo_length = hypos[ref_id].size();
size_t cur_ref_length = GetClosestReferenceLength(ref_ids[ref_id], hypo_length);
NGrams cur_ref_ngrams = m_refs[ref_ids[ref_id]].second;
cerr << "reference length: " << cur_ref_length << endl;
// compute vector c(e;{r_k}):
@@ -184,7 +195,7 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
m_match_history[i] += ngram_matches[i];
// do this for last position in batch
if (batchPosition == hypos.size() - 1) {
if (ref_id == hypos.size() - 1) {
m_count_history[i] *= m_historySmoothing;
m_match_history[i] *= m_historySmoothing;
}
@@ -192,11 +203,11 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
// update counts for reference and target length
m_source_length_history += cur_source_length;
m_target_length_history += hypos[batchPosition].size();
m_target_length_history += hypos[ref_id].size();
m_ref_length_history += cur_ref_length;
// do this for last position in batch
if (batchPosition == hypos.size() - 1) {
if (ref_id == hypos.size() - 1) {
cerr << "Rank " << rank << ", epoch " << epoch << " ,source length history: " << m_source_length_history << " --> " << m_source_length_history * m_historySmoothing << endl;
cerr << "Rank " << rank << ", epoch " << epoch << " ,target length history: " << m_target_length_history << " --> " << m_target_length_history * m_historySmoothing << endl;
m_source_length_history *= m_historySmoothing;
@@ -209,15 +220,24 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
/*
* Print batch of reference translations
*/
void BleuScoreFeature::PrintReferenceLength(const vector<size_t>& ref_ids) {
for (size_t batchPosition = 0; batchPosition < ref_ids.size(); ++batchPosition){
size_t cur_ref_length = m_refs[ref_ids[batchPosition]].first;
/*void BleuScoreFeature::PrintReferenceLength(const vector<size_t>& ref_ids) {
for (size_t ref_id = 0; ref_id < ref_ids.size(); ++ref_id){
size_t cur_ref_length = (m_refs[ref_ids[ref_id]].first)[0]; // TODO!!
cerr << "reference length: " << cur_ref_length << endl;
}
}
}*/
size_t BleuScoreFeature::GetReferenceLength(size_t ref_id) {
size_t cur_ref_length = m_refs[ref_id].first;
size_t BleuScoreFeature::GetClosestReferenceLength(size_t ref_id, int hypoLength) {
// look for closest reference
int currentDist = -1;
int closestRef = -1;
for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) {
if (closestRef == -1 || abs(hypoLength - (int)(m_refs[ref_id].first)[i]) < currentDist) {
closestRef = (m_refs[ref_id].first)[i];
currentDist = abs(hypoLength - (int)(m_refs[ref_id].first)[i]);
}
}
size_t cur_ref_length = closestRef;
return cur_ref_length;
}
@@ -446,7 +466,7 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
else if (m_scale_by_avg_length) {
precision *= (m_source_length_history + m_ref_length_history + m_cur_source_length + + m_cur_ref_length) / 2;
}
return precision*m_scale_by_x;
return (precision*m_scale_by_x)/m_correction;
}
const FFState* BleuScoreFeature::EmptyHypothesisState(const InputType& input) const

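As a worked example of the closest-length lookup above: with reference lengths {18, 20, 25} and a 19-word hypothesis, both 18 and 20 are one word away; because the comparison uses strict less-than, the earlier candidate wins and GetClosestReferenceLength returns 18 (illustrative numbers).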
moses/src/BleuScoreFeature.h

@@ -44,7 +44,7 @@ class BleuScoreFeature : public StatefulFeatureFunction {
public:
typedef boost::unordered_map< Phrase, size_t > NGrams;
typedef boost::unordered_map<size_t, std::pair<size_t,NGrams> > RefCounts;
typedef boost::unordered_map<size_t, std::pair<std::vector<size_t>,NGrams> > RefCounts;
typedef boost::unordered_map<size_t, NGrams> Matches;
BleuScoreFeature():
@@ -53,14 +53,15 @@ public:
m_match_history(BleuScoreState::bleu_order),
m_source_length_history(0),
m_target_length_history(0),
m_ref_length_history(0),
m_scale_by_input_length(true),
m_scale_by_ref_length(false),
m_scale_by_avg_length(false),
m_scale_by_x(1),
m_historySmoothing(0.7),
m_smoothing_scheme(PLUS_ONE) {}
m_smoothing_scheme(PLUS_ONE),
m_relax_BP(1),
m_correction(1) {}
std::string GetScoreProducerDescription() const
{
@@ -75,11 +76,12 @@ public:
void PrintHistory(std::ostream& out) const;
void LoadReferences(const std::vector< std::vector< std::string > > &);
void SetCurrentSourceLength(size_t);
void SetCurrentReference(size_t);
void SetCurrentShortestReference(size_t);
void UpdateHistory(const std::vector< const Word* >&);
void UpdateHistory(const std::vector< std::vector< const Word* > >& hypos, std::vector<size_t>& sourceLengths, std::vector<size_t>& ref_ids, size_t rank, size_t epoch);
void PrintReferenceLength(const std::vector<size_t>& ref_ids);
size_t GetReferenceLength(size_t ref_id);
size_t GetClosestReferenceLength(size_t ref_id, int hypoLength);
void SetBleuParameters(bool scaleByInputLength, bool scaleByRefLength, bool scaleByAvgLength,
bool scaleByTargetLengthLinear, bool scaleByTargetLengthTrend,
float scaleByX, float historySmoothing, size_t scheme, float relaxBP);
@@ -107,6 +109,14 @@ public:
float CalculateBleu(BleuScoreState*) const;
const FFState* EmptyHypothesisState(const InputType&) const;
void SetCorrection(float correction) {
m_correction = correction;
}
float GetCorrection() {
return m_correction;
}
private:
// counts for pseudo-document
std::vector< float > m_count_history;
@@ -145,6 +155,9 @@ private:
// relax application of the BP by setting a value between 0 and 1
float m_relax_BP;
// correct scaling issues
float m_correction;
};
} // Namespace.

moses/src/WordTranslationFeature.cpp

@@ -60,9 +60,9 @@ void WordTranslationFeature::Evaluate(const TargetPhrase& targetPhrase,
if (m_unrestricted || sourceExists || targetExists) {
// construct feature name
stringstream featureName;
featureName << (sourceExists ? sourceWord : "OTHER");
featureName << ((sourceExists||m_unrestricted) ? sourceWord : "OTHER");
featureName << "|";
featureName << (targetExists ? targetWord : "OTHER");
featureName << ((targetExists||m_unrestricted) ? targetWord : "OTHER");
accumulator->PlusEquals(this,featureName.str(),1);
}
}
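The effect of the change above: in unrestricted mode the feature name now uses the actual word pair even when neither word appears in the restricted source/target lists. For example (hypothetical words), a pair maison|house that previously fired as "OTHER|OTHER" now fires as "maison|house", since (sourceExists || m_unrestricted) and (targetExists || m_unrestricted) both hold.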