code clean-up, step 1

git-svn-id: http://svn.statmt.org/repository/mira@3918 cc96ff50-19ce-11e0-b349-13d7f0bd23df

commit 120be1df4f, parent 8e6c963041
@@ -32,8 +32,6 @@ using namespace Moses;
 
 namespace Mira {
 
-//Decoder::~Decoder() {}
-
 /**
  * Allocates a char* and copies string into it.
 **/
@@ -70,7 +68,7 @@ namespace Mira {
 
 MosesDecoder::MosesDecoder(bool scaleByInputLength, float historySmoothing)
   : m_manager(NULL) {
-  // force initialisation of the phrase dictionary (TODO: what for?)
+  // force initialisation of the phrase dictionary (TODO: why?)
   const StaticData &staticData = StaticData::Instance();
   m_sentence = new Sentence(Input);
   stringstream in("Initialising decoder..\n");
@@ -176,61 +174,6 @@ namespace Mira {
   return best;
 }
 
-vector<float> MosesDecoder::getBleuAndScore(const std::string& source,
-    size_t sentenceid,
-    float bleuObjectiveWeight,
-    float bleuScoreWeight,
-    bool distinct,
-    size_t rank,
-    size_t epoch)
-{
-  StaticData &staticData = StaticData::InstanceNonConst();
-
-  m_sentence = new Sentence(Input);
-  stringstream in(source + "\n");
-  const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder();
-  m_sentence->Read(in,inputFactorOrder);
-  const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
-
-  // set the weight for the bleu feature
-  ostringstream bleuWeightStr;
-  bleuWeightStr << (bleuObjectiveWeight * bleuScoreWeight);
-  PARAM_VEC bleuWeight(1,bleuWeightStr.str());
-
-  staticData.GetParameter()->OverwriteParam("weight-bl", bleuWeight);
-  staticData.ReLoadBleuScoreFeatureParameter();
-
-  m_bleuScoreFeature->SetCurrentSourceLength((*m_sentence).GetSize());
-  m_bleuScoreFeature->SetCurrentReference(sentenceid);
-
-  //run the decoder
-  m_manager = new Moses::Manager(*m_sentence, staticData.GetSearchAlgorithm(), &system);
-  m_manager->ProcessSentence();
-  TrellisPathList sentences;
-  m_manager->CalcNBest(1, sentences, distinct);
-
-  // read off the feature values and bleu scores for each sentence in the nbest list
-  Moses::TrellisPathList::const_iterator iter = sentences.begin();
-  vector<float> bleuAndScore;
-  const Moses::TrellisPath &path = **iter;
-  float bleuScore = getBleuScore(path.GetScoreBreakdown());
-  float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore);
-  bleuAndScore.push_back(bleuScore);
-  bleuAndScore.push_back(scoreWithoutBleu);
-
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", 1best translation: ");
-  Phrase phrase = path.GetTargetPhrase();
-  for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
-    const Word &word = phrase.GetWord(pos);
-    Word *newWord = new Word(word);
-    VERBOSE(1, *newWord);
-  }
-
-  VERBOSE(1, endl);
-
-  return bleuAndScore;
-}
-
 size_t MosesDecoder::getCurrentInputLength() {
   return (*m_sentence).GetSize();
 }
@@ -270,27 +213,5 @@ namespace Mira {
 void MosesDecoder::printReferenceLength(const vector<size_t>& ref_ids) {
   m_bleuScoreFeature->PrintReferenceLength(ref_ids);
 }
-
-vector<float> MosesDecoder::calculateBleuOfCorpus(const vector< vector< const Word*> >& words, vector<size_t>& ref_ids, size_t epoch, size_t rank) {
-  vector<float> bleu = m_bleuScoreFeature->CalculateBleuOfCorpus(words, ref_ids);
-  if (bleu.size() > 0) {
-    cerr << "\nRank " << rank << ", BLEU after epoch " << epoch << ": " << bleu[4]*100 << ", "
-        << bleu[0]*100 << "/" << bleu[1]*100 << "/" << bleu[2]*100 << "/" << bleu[3]*100 << " "
-        << "(BP=" << bleu[5] << ", " << "ratio=" << bleu[6] << ", "
-        << "hyp_len=" << bleu[7] << ", ref_len=" << bleu[8] << ")" << endl;
-    vector<float> bleuAndRatio(2);
-    bleuAndRatio[0] = bleu[4]*100;
-    bleuAndRatio[1] = bleu[6];
-    return bleuAndRatio;
-  }
-  else {
-    cerr << "\nRank " << rank << ", BLEU after epoch " << epoch << ": 0" << endl;
-    vector<float> bleuAndRatio(2);
-    bleuAndRatio[0] = 0;
-    bleuAndRatio[1] = 0;
-    return bleuAndRatio;
-  }
-}
-
 }
 
@@ -64,20 +64,12 @@ class MosesDecoder {
       bool distinct,
       size_t rank,
       size_t epoch);
-  std::vector<float> getBleuAndScore(const std::string& source,
-      size_t sentenceid,
-      float bleuObjectiveWeight,
-      float bleuScoreWeight,
-      bool distinct,
-      size_t rank,
-      size_t epoch);
   size_t getCurrentInputLength();
   void updateHistory(const std::vector<const Moses::Word*>& words);
   void updateHistory(const std::vector< std::vector< const Moses::Word*> >& words, std::vector<size_t>& sourceLengths, std::vector<size_t>& ref_ids, size_t rank, size_t epoch);
   void loadReferenceSentences(const std::vector<std::vector<std::string> >& refs);
   void printBleuFeatureHistory(std::ostream& out);
   void printReferenceLength(const std::vector<size_t>& ref_ids);
-  std::vector<float> calculateBleuOfCorpus(const std::vector< std::vector< const Moses::Word*> >& words, std::vector<size_t>& ref_ids, size_t epoch, size_t rank);
   Moses::ScoreComponentCollection getWeights();
   void setWeights(const Moses::ScoreComponentCollection& weights);
   void cleanup();
@@ -5,187 +5,6 @@ using namespace std;
 
 namespace Mira {
 
-vector<FValue> Hildreth::optimise (const vector<FVector>& a, const vector<FValue>& b) {
-
-  size_t i;
-  int max_iter = 10000;
-  float eps = 0.00000001;
-  float zero = 0.000000000001;
-
-  vector<FValue> alpha ( b.size() );
-  vector<FValue> F ( b.size() );
-  vector<FValue> kkt ( b.size() );
-
-  float max_kkt = -1e100;
-
-  size_t K = b.size();
-
-  float A[K][K];
-  bool is_computed[K];
-  for ( i = 0; i < K; i++ )
-  {
-    A[i][i] = a[i].inner_product(a[i]);
-    is_computed[i] = false;
-  }
-
-  int max_kkt_i = -1;
-
-
-  for ( i = 0; i < b.size(); i++ )
-  {
-    F[i] = b[i];
-    kkt[i] = F[i];
-    if ( kkt[i] > max_kkt )
-    {
-      max_kkt = kkt[i];
-      max_kkt_i = i;
-    }
-  }
-
-  int iter = 0;
-  FValue diff_alpha;
-  FValue try_alpha;
-  FValue add_alpha;
-
-  while ( max_kkt >= eps && iter < max_iter )
-  {
-
-    diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i];
-    try_alpha = alpha[max_kkt_i] + diff_alpha;
-    add_alpha = 0.0;
-
-    if ( try_alpha < 0.0 )
-      add_alpha = -1.0 * alpha[max_kkt_i];
-    else
-      add_alpha = diff_alpha;
-
-    alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha;
-
-    if ( !is_computed[max_kkt_i] )
-    {
-      for ( i = 0; i < K; i++ )
-      {
-        A[i][max_kkt_i] = a[i].inner_product(a[max_kkt_i] ); // for version 1
-        //A[i][max_kkt_i] = 0; // for version 1
-        is_computed[max_kkt_i] = true;
-      }
-    }
-
-    for ( i = 0; i < F.size(); i++ )
-    {
-      F[i] -= add_alpha * A[i][max_kkt_i];
-      kkt[i] = F[i];
-      if ( alpha[i] > zero )
-        kkt[i] = abs ( F[i] );
-    }
-    max_kkt = -1e100;
-    max_kkt_i = -1;
-    for ( i = 0; i < F.size(); i++ )
-      if ( kkt[i] > max_kkt )
-      {
-        max_kkt = kkt[i];
-        max_kkt_i = i;
-      }
-
-    iter++;
-  }
-
-  return alpha;
-}
-
-vector<FValue> Hildreth::optimise (const vector<FVector>& a, const vector<FValue>& b, FValue C) {
-
-  size_t i;
-  int max_iter = 10000;
-  FValue eps = 0.00000001;
-  FValue zero = 0.000000000001;
-
-  vector<FValue> alpha ( b.size() );
-  vector<FValue> F ( b.size() );
-  vector<FValue> kkt ( b.size() );
-
-  float max_kkt = -1e100;
-
-  size_t K = b.size();
-
-  float A[K][K];
-  bool is_computed[K];
-  for ( i = 0; i < K; i++ )
-  {
-    A[i][i] = a[i].inner_product(a[i]);
-    is_computed[i] = false;
-  }
-
-  int max_kkt_i = -1;
-
-
-  for ( i = 0; i < b.size(); i++ )
-  {
-    F[i] = b[i];
-    kkt[i] = F[i];
-    if ( kkt[i] > max_kkt )
-    {
-      max_kkt = kkt[i];
-      max_kkt_i = i;
-    }
-  }
-
-  int iter = 0;
-  FValue diff_alpha;
-  FValue try_alpha;
-  FValue add_alpha;
-
-  while ( max_kkt >= eps && iter < max_iter )
-  {
-
-    diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i];
-    try_alpha = alpha[max_kkt_i] + diff_alpha;
-    add_alpha = 0.0;
-
-    if ( try_alpha < 0.0 )
-      add_alpha = -1.0 * alpha[max_kkt_i];
-    else if (try_alpha > C)
-      add_alpha = C - alpha[max_kkt_i];
-    else
-      add_alpha = diff_alpha;
-
-    alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha;
-
-    if ( !is_computed[max_kkt_i] )
-    {
-      for ( i = 0; i < K; i++ )
-      {
-        A[i][max_kkt_i] = a[i].inner_product(a[max_kkt_i] ); // for version 1
-        //A[i][max_kkt_i] = 0; // for version 1
-        is_computed[max_kkt_i] = true;
-      }
-    }
-
-    for ( i = 0; i < F.size(); i++ )
-    {
-      F[i] -= add_alpha * A[i][max_kkt_i];
-      kkt[i] = F[i];
-      if (alpha[i] > C - zero)
-        kkt[i]=-kkt[i];
-      else if (alpha[i] > zero)
-        kkt[i] = abs(F[i]);
-
-    }
-    max_kkt = -1e100;
-    max_kkt_i = -1;
-    for ( i = 0; i < F.size(); i++ )
-      if ( kkt[i] > max_kkt )
-      {
-        max_kkt = kkt[i];
-        max_kkt_i = i;
-      }
-
-    iter++;
-  }
-
-  return alpha;
-}
-
 vector<FValue> Hildreth::optimise (const vector<ScoreComponentCollection>& a, const vector<FValue>& b) {
 
   size_t i;
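
[Note, not part of the commit: the two deleted FVector overloads of
Hildreth::optimise duplicate the ScoreComponentCollection overloads the commit
keeps, all implementing Hildreth's coordinate-ascent procedure for the dual of
the MIRA quadratic program. My reading of the removed loop, with a_k the
constraint (feature-difference) vectors and b_k the required margins, is that
it solves

    \max_{\alpha}\ \sum_k \alpha_k b_k - \tfrac{1}{2}\Bigl\lVert \sum_k \alpha_k a_k \Bigr\rVert^{2}
    \quad\text{s.t.}\quad \alpha_k \ge 0
    \ \ (\text{and } \alpha_k \le C \text{ in the slack-capped overload}),

where F[i] tracks the dual residual F_i = b_i - \sum_j \alpha_j (a_i \cdot a_j),
and each iteration updates the alpha with the largest KKT violation.]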
@@ -5,8 +5,6 @@ namespace Mira {
 
 class Hildreth {
 public :
-  static std::vector<Moses::FValue> optimise (const std::vector<Moses::FVector>& a, const std::vector<Moses::FValue>& b );
-  static std::vector<Moses::FValue> optimise (const std::vector<Moses::FVector>& a, const std::vector<Moses::FValue>& b, Moses::FValue C);
   static std::vector<Moses::FValue> optimise (const std::vector<Moses::ScoreComponentCollection>& a, const std::vector<Moses::FValue>& b );
   static std::vector<Moses::FValue> optimise (const std::vector<Moses::ScoreComponentCollection>& a, const std::vector<Moses::FValue>& b, Moses::FValue C);
 };
@@ -166,7 +166,6 @@ int main(int argc, char** argv) {
   string decoder_settings;
   float min_weight_change;
   float decrease_learning_rate;
-  bool devBleu;
   bool normaliseWeights;
   bool print_feature_values;
   bool historyOf1best;
@@ -178,7 +177,6 @@ int main(int argc, char** argv) {
   float bleuScoreWeight;
   float margin_slack;
   float margin_slack_incr;
-  bool analytical_update;
   bool perceptron_update;
   bool hope_fear;
   bool model_hope_fear;
@@ -189,7 +187,6 @@ int main(int argc, char** argv) {
   desc.add_options()
     ("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
     ("adapt-after-epoch", po::value<size_t>(&adapt_after_epoch)->default_value(0), "Index of epoch after which adaptive parameters will be adapted")
-    ("analytical-update", po::value<bool>(&analytical_update)->default_value(0), "Use one best lists and compute the update analytically")
     ("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
     ("base-of-log", po::value<size_t>(&baseOfLog)->default_value(10), "Base for log-ing feature values")
     ("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
@@ -201,9 +198,7 @@ int main(int argc, char** argv) {
     ("core-weights", po::value<string>(&coreWeightFile), "Weight file containing the core weights (already tuned, have to be non-zero)")
     ("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
     ("decr-learning-rate", po::value<float>(&decrease_learning_rate)->default_value(0),"Decrease learning rate by the given value after every epoch")
-    ("dev-bleu", po::value<bool>(&devBleu)->default_value(true), "Compute BLEU score of oracle translations of the whole tuning set")
     ("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use nbest list with distinct translations in inference step")
-    ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
     ("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
     ("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
     ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
@@ -214,12 +209,12 @@ int main(int argc, char** argv) {
     ("hope-n", po::value<int>(&hope_n)->default_value(-1), "Number of hope translations used")
     ("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
     ("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
-    ("margin-slack", po::value<float>(&margin_slack)->default_value(0), "Slack when comparing left and right hand side of constraints")
-    ("margin-incr", po::value<float>(&margin_slack_incr)->default_value(0), "Increment margin slack after every epoch by this amount")
-    ("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
     ("log-feature-values", po::value<bool>(&logFeatureValues)->default_value(false), "Take log of feature values according to the given base.")
+    ("margin-incr", po::value<float>(&margin_slack_incr)->default_value(0), "Increment margin slack after every epoch by this amount")
+    ("margin-slack", po::value<float>(&margin_slack)->default_value(0), "Slack when comparing left and right hand side of constraints")
     ("min-learning-rate", po::value<float>(&min_learning_rate)->default_value(0), "Set a minimum learning rate")
     ("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
+    ("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
     ("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(5), "How often per epoch to mix weights, when using mpi")
     ("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimization")
     ("nbest,n", po::value<size_t>(&n)->default_value(1), "Number of translations in nbest list")
@@ -229,6 +224,8 @@ int main(int argc, char** argv) {
     ("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
     ("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
     ("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale the BLEU score by a history of the input lengths")
+    ("scale-margin", po::value<size_t>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
+    ("scale-update", po::value<bool>(&scale_update)->default_value(false), "Scale the update by the Bleu score of the oracle translation")
     ("sentence-level-bleu", po::value<bool>(&sentenceLevelBleu)->default_value(true), "Use a sentences level bleu scoring function")
     ("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
     ("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimizer")
@@ -236,8 +233,7 @@ int main(int argc, char** argv) {
     ("slack-step", po::value<float>(&slack_step)->default_value(0), "Increase slack from epoch to epoch by the value provided")
     ("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
     ("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
-    ("scale-margin", po::value<size_t>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
-    ("scale-update", po::value<bool>(&scale_update)->default_value(false), "Scale the update by the Bleu score of the oracle translation")
+    ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
     ("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights");
 
   po::options_description cmdline_options;
@@ -355,42 +351,31 @@ int main(int argc, char** argv) {
     perceptron_update = true;
     model_hope_fear = false; // mira only
     hope_fear = false; // mira only
-    analytical_update = false; // mira only
   } else {
     cerr << "Error: Unknown optimiser: " << learner << endl;
     return 1;
   }
 
   // resolve parameter dependencies
-  if (perceptron_update || analytical_update) {
+  if (batchSize > 1 && perceptron_update) {
     batchSize = 1;
-    cerr << "Info: Setting batch size to 1 for perceptron/analytical update" << endl;
+    cerr << "Info: Setting batch size to 1 for perceptron update" << endl;
   }
 
   if (hope_n == -1 && fear_n == -1) {
     hope_n = n;
     fear_n = n;
   }
-  if ((model_hope_fear || analytical_update) && hope_fear) {
+  if (model_hope_fear && hope_fear) {
     hope_fear = false; // is true by default
   }
-  if (!hope_fear && !analytical_update) {
+  if (!hope_fear) {
     model_hope_fear = true;
   }
 
-  if (model_hope_fear && analytical_update) {
-    cerr << "Error: Must choose between model-hope-fear and analytical update" << endl;
-    return 1;
-  }
-
   if (!sentenceLevelBleu) {
     if (!historyOf1best && !historyOfOracles) {
       historyOf1best = true;
     }
   }
 
   if (burnIn && sentenceLevelBleu) {
     burnIn = false;
     cerr << "Info: Burn-in not needed when using sentence-level BLEU, deactivating burn-in." << endl;
@@ -545,7 +530,6 @@ int main(int argc, char** argv) {
   int sumStillViolatedConstraints_lastEpoch = 0;
   int sumConstraintChangeAbs;
   int sumConstraintChangeAbs_lastEpoch = 0;
-  // size_t sumBleuChangeAbs;
   float *sendbuf, *recvbuf;
   sendbuf = (float *) malloc(sizeof(float));
   recvbuf = (float *) malloc(sizeof(float));
@@ -553,7 +537,6 @@ int main(int argc, char** argv) {
     // sum of violated constraints
     sumStillViolatedConstraints = 0;
     sumConstraintChangeAbs = 0;
-    // sumBleuChangeAbs = 0;
 
     numberOfUpdatesThisEpoch = 0;
     // Sum up weights over one epoch, final average uses weights from last epoch
@@ -619,7 +602,7 @@ int main(int argc, char** argv) {
       dummyFeatureValues.push_back(newFeatureValues);
       dummyBleuScores.push_back(newBleuScores);
 
-      if (perceptron_update || analytical_update) {
+      if (perceptron_update) {
         if (historyOf1best) {
           // MODEL (for updating the history)
           cerr << "Rank " << rank << ", run decoder to get 1best wrt model score (for history)" << endl;
@@ -778,15 +761,6 @@ int main(int argc, char** argv) {
         }
       }
 
-      /* // get 1best model results with old weights
-      vector< vector <float > > bestModelOld_batch;
-      for (size_t i = 0; i < actualBatchSize; ++i) {
-        string& input = inputSentences[*current_sid_start + i];
-        vector <float> bestModelOld = decoder->getBleuAndScore(input, *current_sid_start + i, 0.0, bleuScoreWeight, distinctNbest, rank, epoch);
-        bestModelOld_batch.push_back(bestModelOld);
-        decoder->cleanup();
-      }*/
-
       // optionally print out the feature values
       if (print_feature_values) {
         cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
@@ -826,11 +800,6 @@ int main(int argc, char** argv) {
             featureValuesHope, featureValuesFear, dummy1, dummy1, dummy2,
             learning_rate, rank, epoch);
       }
-      else if (analytical_update) {
-        update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(mosesWeights,
-            featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0], bleuScoresFear[0][0],
-            ref_ids[0], learning_rate, rank, epoch);
-      }
       else {
         if (hope_fear) {
           if (coreWeightMap.size() > 0) {
@@ -900,17 +869,6 @@ int main(int argc, char** argv) {
       weightDifference.MinusEquals(oldWeights);
       VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weight difference: " << weightDifference << endl);
 
-      /* // get 1best model results with new weights (for each sentence in batch)
-      vector<float> bestModelNew;
-      for (size_t i = 0; i < actualBatchSize; ++i) {
-        string& input = inputSentences[*current_sid_start + i];
-        bestModelNew = decoder->getBleuAndScore(input, *current_sid_start + i, 0.0, bleuScoreWeight, distinctNbest, rank, epoch);
-        decoder->cleanup();
-        sumBleuChangeAbs += abs(bestModelOld_batch[i][0] - bestModelNew[0]);
-        VERBOSE(2, "Rank " << rank << ", epoch " << epoch << ", 1best model bleu, old: " << bestModelOld_batch[i][0] << ", new: " << bestModelNew[0] << endl);
-        VERBOSE(2, "Rank " << rank << ", epoch " << epoch << ", 1best model score, old: " << bestModelOld_batch[i][1] << ", new: " << bestModelNew[1] << endl);
-      }*/
-
       // update history (for approximate document Bleu)
       if (sentenceLevelBleu) {
         for (size_t i = 0; i < oracles.size(); ++i) {
@@ -110,7 +110,7 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
       ScoreComponentCollection update(featureValueDiffs[k]);
       update.MultiplyEquals(alpha);
 
-      // sum up update
+      // sum updates
       summedUpdate.PlusEquals(update);
     }
   }
@@ -122,24 +122,6 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
     return status;
   }
 
-  ScoreComponentCollection newWeights(currWeights);
-  newWeights.PlusEquals(summedUpdate);
-
-  // Sanity check: are there still violated constraints after optimisation?
-  int violatedConstraintsAfter = 0;
-  float newDistanceFromOptimum = 0;
-  for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
-    float modelScoreDiff = featureValueDiffs[i].InnerProduct(newWeights);
-    float loss = all_losses[i];
-    float diff = loss - (modelScoreDiff + m_margin_slack);
-    if (diff > epsilon) {
-      ++violatedConstraintsAfter;
-      newDistanceFromOptimum += diff;
-    }
-  }
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
-
   // apply learning rate
   if (learning_rate != 1) {
     VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl);
@@ -158,6 +140,21 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
   currWeights.PlusEquals(summedUpdate);
   VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl);
 
+  // Sanity check: are there still violated constraints after optimisation?
+  int violatedConstraintsAfter = 0;
+  float newDistanceFromOptimum = 0;
+  for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
+    float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
+    float loss = all_losses[i];
+    float diff = loss - (modelScoreDiff + m_margin_slack);
+    if (diff > epsilon) {
+      ++violatedConstraintsAfter;
+      newDistanceFromOptimum += diff;
+    }
+  }
+  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
+  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
+
   vector<int> status(2);
   status[0] = violatedConstraintsBefore;
   status[1] = violatedConstraintsAfter;
@@ -291,25 +288,7 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
     return status;
   }
 
-  ScoreComponentCollection newWeights(currWeights);
-  newWeights.PlusEquals(summedUpdate);
-
-  // Sanity check: are there still violated constraints after optimisation?
-  int violatedConstraintsAfter = 0;
-  float newDistanceFromOptimum = 0;
-  for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
-    float modelScoreDiff = featureValueDiffs[i].InnerProduct(newWeights);
-    float loss = all_losses[i];
-    float diff = loss - (modelScoreDiff + m_margin_slack);
-    if (diff > epsilon) {
-      ++violatedConstraintsAfter;
-      newDistanceFromOptimum += diff;
-    }
-  }
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
-
-  // Apply learning rate (fixed or flexible)
+  // apply learning rate
   if (learning_rate != 1) {
     VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl);
     summedUpdate.MultiplyEquals(learning_rate);
@@ -321,107 +300,27 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
   currWeights.PlusEquals(summedUpdate);
   VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl);
 
+  // Sanity check: are there still violated constraints after optimisation?
+  int violatedConstraintsAfter = 0;
+  float newDistanceFromOptimum = 0;
+  for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
+    float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
+    float loss = all_losses[i];
+    float diff = loss - (modelScoreDiff + m_margin_slack);
+    if (diff > epsilon) {
+      ++violatedConstraintsAfter;
+      newDistanceFromOptimum += diff;
+    }
+  }
+  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
+  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
+
   vector<int> statusPlus(2);
   statusPlus[0] = violatedConstraintsBefore;
   statusPlus[1] = violatedConstraintsAfter;
   return statusPlus;
 }
 
-vector<int> MiraOptimiser::updateWeightsAnalytically(ScoreComponentCollection& currWeights,
-    ScoreComponentCollection& featureValuesHope,
-    ScoreComponentCollection& featureValuesFear,
-    float bleuScoreHope,
-    float bleuScoreFear,
-    size_t sentenceId,
-    float learning_rate,
-    size_t rank,
-    size_t epoch) {
-
-  float epsilon = 0.0001;
-  float oldDistanceFromOptimum = 0;
-  bool constraintViolatedBefore = false;
-  ScoreComponentCollection weightUpdate;
-
-  // cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope << endl;
-  // cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear << endl;
-  ScoreComponentCollection featureValueDiff = featureValuesHope;
-  featureValueDiff.MinusEquals(featureValuesFear);
-  cerr << "Rank " << rank << ", epoch " << epoch << ", hope - fear: " << featureValueDiff << endl;
-  float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
-  float loss = bleuScoreHope - bleuScoreFear;
-  float diff = 0;
-  if (loss > (modelScoreDiff + m_margin_slack)) {
-    diff = loss - (modelScoreDiff + m_margin_slack);
-  }
-  cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " + " << m_margin_slack << " >= " << loss << " (current violation: " << diff << ")" << endl;
-
-  if (diff > epsilon) {
-    // constraint violated
-    oldDistanceFromOptimum += diff;
-    constraintViolatedBefore = true;
-
-    // compute alpha for given constraint: (loss - model score diff) / || feature value diff ||^2
-    // featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff)
-    // from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2}
-    float squaredNorm = featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm();
-
-    if (squaredNorm > 0) {
-      float alpha = diff / squaredNorm;
-      if (m_slack > 0 ) {
-        if (alpha > m_slack) {
-          alpha = m_slack;
-        }
-        else if (alpha < m_slack*(-1)) {
-          alpha = m_slack*(-1);
-        }
-      }
-
-      cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
-      featureValueDiff.MultiplyEquals(alpha);
-      weightUpdate.PlusEquals(featureValueDiff);
-    }
-    else {
-      VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", no update because squared norm is 0" << endl);
-    }
-  }
-
-  if (!constraintViolatedBefore) {
-    // constraint satisfied, nothing to do
-    cerr << "Rank " << rank << ", epoch " << epoch << ", constraint already satisfied" << endl;
-    vector<int> status(2);
-    status[0] = 0;
-    status[1] = 0;
-    return status;
-  }
-
-  // sanity check: constraint still violated after optimisation?
-  ScoreComponentCollection newWeights(currWeights);
-  newWeights.PlusEquals(weightUpdate);
-  bool constraintViolatedAfter = false;
-  float newDistanceFromOptimum = 0;
-  featureValueDiff = featureValuesHope;
-  featureValueDiff.MinusEquals(featureValuesFear);
-  modelScoreDiff = featureValueDiff.InnerProduct(newWeights);
-  diff = loss - (modelScoreDiff + m_margin_slack);
-  // approximate comparison between floats!
-  if (diff > epsilon) {
-    constraintViolatedAfter = true;
-    newDistanceFromOptimum += (loss - modelScoreDiff);
-  }
-
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, constraint violated before? " << constraintViolatedBefore << ", after? " << constraintViolatedAfter << endl);
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
-
-  // apply update to weight vector
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights before update: " << currWeights << endl);
-  currWeights.PlusEquals(weightUpdate);
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl);
-
-  vector<int> status(2);
-  status[0] = 1;
-  status[1] = constraintViolatedAfter ? 1 : 0;
-  return status;
-}
-
 }
 
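
[Note, not part of the commit: the deleted updateWeightsAnalytically performed
the single-constraint MIRA step of Crammer & Singer (2006), cited in the
removed comments. With x the hope-minus-fear feature difference,
\ell = BLEU(hope) - BLEU(fear), and C the cap held in m_slack, the update it
computed when the margin constraint was violated is

    \alpha = \min\!\Bigl(C,\ \frac{\ell - (w \cdot x + \mathrm{margin\_slack})}{\lVert x \rVert^{2}}\Bigr),
    \qquad w \leftarrow w + \alpha\, x .]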
@@ -67,15 +67,6 @@ namespace Mira {
       m_scale_update(scale_update),
       m_margin_slack(margin_slack) { }
 
-    std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
-        Moses::ScoreComponentCollection& featureValuesHope,
-        Moses::ScoreComponentCollection& featureValuesFear,
-        float bleuScoresHope,
-        float bleuScoresFear,
-        size_t sentenceId,
-        float learning_rate,
-        size_t rank,
-        size_t epoch);
     std::vector<int> updateWeights(Moses::ScoreComponentCollection& currWeights,
         const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValues,
         const std::vector<std::vector<float> >& losses,
@@ -117,6 +108,7 @@ namespace Mira {
     // scale update with log 10 of oracle BLEU score
     bool m_scale_update;
 
+    // slack when comparing losses to model scores
     float m_margin_slack;
   };
 }
@@ -45,7 +45,6 @@ vector<int> Perceptron::updateWeightsHopeFear(ScoreComponentCollection& currWeig
   vector<int> update_status;
   update_status.push_back(0);
   update_status.push_back(0);
-  update_status.push_back(0);
   return update_status;
 }
 
@@ -94,14 +94,12 @@ void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::strin
   for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) {
     for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) {
       Phrase ngram(Output);
-      //cerr << "start: " << end_idx-order << " end: " << end_idx << endl;
       for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) {
         const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]);
         Word w;
         w.SetFactor(0, f);
         ngram.AddWord(w);
       }
-      //cerr << "Ref: " << ngram << endl;
       ref_pair.second[ngram] += 1;
     }
   }
@@ -120,10 +118,10 @@ void BleuScoreFeature::SetCurrentReference(size_t ref_id) {
 }
 
 /*
- * Update the pseudo-document big_O after each translation of a source sentence.
- * (big_O is an exponentially-weighted moving average of vectors c(e;{r_k}))
- * big_O = 0.9 * (big_O + c(e_oracle))
- * big_O_f = 0.9 * (big_O_f + |f|) input length of document big_O
+ * Update the pseudo-document O after each translation of a source sentence.
+ * (O is an exponentially-weighted moving average of vectors c(e;{r_k}))
+ * O = m_historySmoothing * (O + c(e_oracle))
+ * O_f = m_historySmoothing * (O_f + |f|) input length of pseudo-document
  */
 void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) {
   Phrase phrase(Output, hypo);
|
|||||||
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
|
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
|
||||||
m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]);
|
m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]);
|
||||||
m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]);
|
m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]);
|
||||||
//cerr << "precisionHistory " << i + 1 << ": " << (m_match_history[i]/m_count_history[i]) << " (" << m_match_history[i] << "/" << m_count_history[i] << ")" << endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// update counts for reference and target length
|
// update counts for reference and target length
|
||||||
@ -148,7 +145,7 @@ void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Update history with a batch of oracle translations
|
* Update history with a batch of translations
|
||||||
*/
|
*/
|
||||||
void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch) {
|
void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch) {
|
||||||
for (size_t batchPosition = 0; batchPosition < hypos.size(); ++batchPosition){
|
for (size_t batchPosition = 0; batchPosition < hypos.size(); ++batchPosition){
|
||||||
@ -195,7 +192,7 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Update history with a batch of oracle translations
|
* Print batch of reference translations
|
||||||
*/
|
*/
|
||||||
void BleuScoreFeature::PrintReferenceLength(const vector<size_t>& ref_ids) {
|
void BleuScoreFeature::PrintReferenceLength(const vector<size_t>& ref_ids) {
|
||||||
for (size_t batchPosition = 0; batchPosition < ref_ids.size(); ++batchPosition){
|
for (size_t batchPosition = 0; batchPosition < ref_ids.size(); ++batchPosition){
|
||||||
@ -325,7 +322,6 @@ FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo,
|
|||||||
}
|
}
|
||||||
|
|
||||||
new_state->m_source_length = cur_hypo.GetWordsBitmap().GetSize();
|
new_state->m_source_length = cur_hypo.GetWordsBitmap().GetSize();
|
||||||
new_state->m_source_phrase_length = cur_hypo.GetCurrSourceWordsRange().GetNumWordsCovered(); // todo: delete
|
|
||||||
new_state->m_words = new_words.GetSubString(WordsRange(ctx_start_idx,
|
new_state->m_words = new_words.GetSubString(WordsRange(ctx_start_idx,
|
||||||
ctx_end_idx));
|
ctx_end_idx));
|
||||||
new_state->m_target_length += cur_hypo.GetTargetPhrase().GetSize();
|
new_state->m_target_length += cur_hypo.GetTargetPhrase().GetSize();
|
||||||
@ -337,7 +333,6 @@ FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo,
|
|||||||
|
|
||||||
// Calculate new bleu.
|
// Calculate new bleu.
|
||||||
new_bleu = CalculateBleu(new_state);
|
new_bleu = CalculateBleu(new_state);
|
||||||
//cerr << "NS: " << *new_state << " NB " << new_bleu << endl;
|
|
||||||
|
|
||||||
// Set score to new Bleu score
|
// Set score to new Bleu score
|
||||||
accumulator->PlusEquals(this, new_bleu - old_bleu);
|
accumulator->PlusEquals(this, new_bleu - old_bleu);
|
||||||
@ -396,82 +391,6 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
|
|||||||
return precision;
|
return precision;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<float> BleuScoreFeature::CalculateBleuOfCorpus(const vector< vector< const Word* > >& oracles, const vector<size_t>& ref_ids) {
|
|
||||||
// get ngram matches and counts for all oracle sentences and their references
|
|
||||||
vector<size_t> sumOfClippedNgramMatches(BleuScoreState::bleu_order);
|
|
||||||
vector<size_t> sumOfNgramCounts(BleuScoreState::bleu_order);
|
|
||||||
size_t ref_length = 0;
|
|
||||||
size_t target_length = 0;
|
|
||||||
|
|
||||||
for (size_t batchPosition = 0; batchPosition < oracles.size(); ++batchPosition){
|
|
||||||
Phrase phrase(Output, oracles[batchPosition]);
|
|
||||||
size_t ref_id = ref_ids[batchPosition];
|
|
||||||
size_t cur_ref_length = m_refs[ref_id].first;
|
|
||||||
NGrams cur_ref_ngrams = m_refs[ref_id].second;
|
|
||||||
|
|
||||||
ref_length += cur_ref_length;
|
|
||||||
target_length += oracles[batchPosition].size();
|
|
||||||
|
|
||||||
std::vector< size_t > ngram_counts(BleuScoreState::bleu_order);
|
|
||||||
std::vector< size_t > clipped_ngram_matches(BleuScoreState::bleu_order);
|
|
||||||
GetClippedNgramMatchesAndCounts(phrase, cur_ref_ngrams, ngram_counts, clipped_ngram_matches, 0);
|
|
||||||
|
|
||||||
// add clipped ngram matches and ngram counts to corpus sums
|
|
||||||
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
|
|
||||||
sumOfClippedNgramMatches[i] += clipped_ngram_matches[i];
|
|
||||||
sumOfNgramCounts[i] += ngram_counts[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!sumOfNgramCounts[0]) {
|
|
||||||
vector<float> empty(0);
|
|
||||||
return empty;
|
|
||||||
}
|
|
||||||
if (!sumOfClippedNgramMatches[0]) {
|
|
||||||
vector<float> empty(0);
|
|
||||||
return empty; // if we have no unigram matches, score should be 0
|
|
||||||
}
|
|
||||||
|
|
||||||
// calculate bleu score
|
|
||||||
float precision = 1.0;
|
|
||||||
|
|
||||||
vector<float> bleu;
|
|
||||||
// Calculate geometric mean of modified ngram precisions
|
|
||||||
// BLEU = BP * exp(SUM_1_4 1/4 * log p_n)
|
|
||||||
// = BP * 4th root(PRODUCT_1_4 p_n)
|
|
||||||
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
|
|
||||||
if (sumOfNgramCounts[i]) {
|
|
||||||
precision *= 1.0*sumOfClippedNgramMatches[i] / sumOfNgramCounts[i];
|
|
||||||
bleu.push_back(1.0*sumOfClippedNgramMatches[i] / sumOfNgramCounts[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// take geometric mean
|
|
||||||
precision = pow(precision, (float)1/4);
|
|
||||||
|
|
||||||
// Apply brevity penalty if applicable.
|
|
||||||
// BP = 1 if c > r
|
|
||||||
// BP = e^(1- r/c)) if c <= r
|
|
||||||
// where
|
|
||||||
// c: length of the candidate translation
|
|
||||||
// r: effective reference length (sum of best match lengths for each candidate sentence)
|
|
||||||
float BP;
|
|
||||||
if (target_length < ref_length) {
|
|
||||||
precision *= exp(1 - (1.0*ref_length/target_length));
|
|
||||||
BP = exp(1 - (1.0*ref_length/target_length));
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
BP = 1.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
bleu.push_back(precision);
|
|
||||||
bleu.push_back(BP);
|
|
||||||
bleu.push_back(1.0*target_length/ref_length);
|
|
||||||
bleu.push_back(target_length);
|
|
||||||
bleu.push_back(ref_length);
|
|
||||||
return bleu;
|
|
||||||
}
|
|
||||||
|
|
||||||
const FFState* BleuScoreFeature::EmptyHypothesisState(const InputType& input) const
|
const FFState* BleuScoreFeature::EmptyHypothesisState(const InputType& input) const
|
||||||
{
|
{
|
||||||
return new BleuScoreState();
|
return new BleuScoreState();
|
||||||
|
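
[Note, not part of the commit: the deleted CalculateBleuOfCorpus computed
corpus-level BLEU for a batch of oracle translations. Restated from the
comments in the removed code, with p_n the clipped n-gram precisions, c the
candidate length, and r the effective reference length:

    \mathrm{BLEU} = \mathrm{BP}\cdot\exp\Bigl(\sum_{n=1}^{4}\tfrac{1}{4}\log p_n\Bigr)
                  = \mathrm{BP}\cdot\Bigl(\prod_{n=1}^{4} p_n\Bigr)^{1/4},
    \qquad
    \mathrm{BP} = \begin{cases} 1 & c > r \\ e^{\,1-r/c} & c \le r \end{cases}

The returned vector held p_1..p_4, the final score, BP, the ratio c/r and the
two lengths, which is what the bleu[0]..bleu[8] indices in the removed
MosesDecoder::calculateBleuOfCorpus refer to.]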
@@ -29,8 +29,6 @@ private:
   size_t m_source_length;
   size_t m_target_length;
 
-  size_t m_source_phrase_length; // todo: delete
-
   // scaled reference length is needed for scoring incomplete hypotheses against reference translation
   float m_scaled_ref_length;
 
|
|||||||
m_target_length_history(0),
|
m_target_length_history(0),
|
||||||
m_ref_length_history(0),
|
m_ref_length_history(0),
|
||||||
m_scale_by_input_length(true),
|
m_scale_by_input_length(true),
|
||||||
m_historySmoothing(0.9) {}
|
m_historySmoothing(0.7) {}
|
||||||
|
|
||||||
BleuScoreFeature(bool scaleByInputLength, float historySmoothing):
|
BleuScoreFeature(bool scaleByInputLength, float historySmoothing):
|
||||||
StatefulFeatureFunction("BleuScore"),
|
StatefulFeatureFunction("BleuScore"),
|
||||||
@ -101,11 +99,10 @@ public:
|
|||||||
const FFState* prev_state,
|
const FFState* prev_state,
|
||||||
ScoreComponentCollection* accumulator) const;
|
ScoreComponentCollection* accumulator) const;
|
||||||
float CalculateBleu(BleuScoreState*) const;
|
float CalculateBleu(BleuScoreState*) const;
|
||||||
std::vector<float> CalculateBleuOfCorpus(const std::vector< std::vector< const Word* > >& hypos, const std::vector<size_t>& ref_ids);
|
|
||||||
const FFState* EmptyHypothesisState(const InputType&) const;
|
const FFState* EmptyHypothesisState(const InputType&) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// counts for pseudo-document big_O
|
// counts for pseudo-document
|
||||||
std::vector< float > m_count_history;
|
std::vector< float > m_count_history;
|
||||||
std::vector< float > m_match_history;
|
std::vector< float > m_match_history;
|
||||||
float m_source_length_history;
|
float m_source_length_history;
|
||||||
@ -117,9 +114,10 @@ private:
|
|||||||
NGrams m_cur_ref_ngrams;
|
NGrams m_cur_ref_ngrams;
|
||||||
size_t m_cur_ref_length;
|
size_t m_cur_ref_length;
|
||||||
|
|
||||||
// whether or not to scale the BLEU score by a history of the input size
|
// scale BLEU score by history of input size
|
||||||
bool m_scale_by_input_length;
|
bool m_scale_by_input_length;
|
||||||
|
|
||||||
|
// smoothing factor for history counts
|
||||||
float m_historySmoothing;
|
float m_historySmoothing;
|
||||||
};
|
};
|
||||||
|
|
||||||
|