From 406f55584e03c702e75c66fac6357a612e41bed8 Mon Sep 17 00:00:00 2001
From: evahasler
Date: Wed, 1 Dec 2010 18:09:49 +0000
Subject: [PATCH] introduce parameter --history-smoothing, add mpi_finalize statement

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/mira-mtm5@3752 1f5c12ca-751b-0410-a591-d2e778427230
---
 mira/Decoder.cpp               |  4 +--
 mira/Decoder.h                 |  2 +-
 mira/Main.cpp                  | 55 ++++++++++++++++++++++++++++++----
 mira/MiraOptimiser.cpp         | 16 +++++-----
 mira/Optimiser.h               |  2 +-
 moses/src/BleuScoreFeature.cpp | 30 +++++++++++--------
 moses/src/BleuScoreFeature.h   |  4 ++-
 7 files changed, 81 insertions(+), 32 deletions(-)

diff --git a/mira/Decoder.cpp b/mira/Decoder.cpp
index ccc59bcec..2e0f6dc97 100644
--- a/mira/Decoder.cpp
+++ b/mira/Decoder.cpp
@@ -66,14 +66,14 @@ namespace Mira {
     delete[] mosesargv;
   }
 
-  MosesDecoder::MosesDecoder(const vector<vector<string> >& refs, bool useScaledReference, bool scaleByInputLength, bool increaseBP)
+  MosesDecoder::MosesDecoder(const vector<vector<string> >& refs, bool useScaledReference, bool scaleByInputLength, bool increaseBP, float historySmoothing)
   : m_manager(NULL) {
     // force initialisation of the phrase dictionary
     const StaticData &staticData = StaticData::Instance();
     const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
 
     // Add the bleu feature
-    m_bleuScoreFeature = new BleuScoreFeature(useScaledReference, scaleByInputLength, increaseBP);
+    m_bleuScoreFeature = new BleuScoreFeature(useScaledReference, scaleByInputLength, increaseBP, historySmoothing);
     (const_cast<TranslationSystem&>(system)).AddFeatureFunction(m_bleuScoreFeature);
     m_bleuScoreFeature->LoadReferences(refs);
  }
diff --git a/mira/Decoder.h b/mira/Decoder.h
index f867713f2..b77c8eb63 100644
--- a/mira/Decoder.h
+++ b/mira/Decoder.h
@@ -50,7 +50,7 @@ void initMoses(const std::string& inifile, int debuglevel, int argc=0, char** a
 **/
 class MosesDecoder {
  public:
-  MosesDecoder(const std::vector<std::vector<std::string> >& refs, bool useScaledReference, bool scaleByInputLength, bool increaseBP);
+  MosesDecoder(const std::vector<std::vector<std::string> >& refs, bool useScaledReference, bool scaleByInputLength, bool increaseBP, float historySmoothing);
 
   //returns the best sentence
   std::vector getNBest(const std::string& source,
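Note for readers: --history-smoothing replaces the 0.9 constant that BleuScoreFeature previously hard-coded for its pseudo-document BLEU statistics (see the BleuScoreFeature.cpp hunks further down). The standalone sketch below, plain C++ with illustrative names rather than Moses code, shows the geometric decay the parameter controls: each history statistic is repeatedly replaced by gamma * (history + current), so older sentences fade out at rate gamma.

    #include <cstdio>

    // Standalone illustration of the history update in BleuScoreFeature:
    //   history = gamma * (history + current)
    // gamma corresponds to --history-smoothing; the values are made up.
    int main() {
        const float gamma = 0.9f;      // default of the new option
        float countHistory = 0.0f;     // e.g. a pseudo-document 1-gram count
        const float newCount = 20.0f;  // 1-gram count of each incoming sentence

        for (int sentence = 1; sentence <= 3; ++sentence) {
            countHistory = gamma * (countHistory + newCount);
            // prints 18.00, then 34.20, then 48.78
            std::printf("after sentence %d: %.2f\n", sentence, countHistory);
        }
        return 0;
    }

With gamma = 1.0 the history would grow without bound; smaller values keep it dominated by the statistics of the most recent sentences.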
diff --git a/mira/Main.cpp b/mira/Main.cpp
index c04b2625d..7bf5754cd 100644
--- a/mira/Main.cpp
+++ b/mira/Main.cpp
@@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 #include
 
 #ifdef MPI_ENABLE
+#include "mpi.h"
 #include <boost/mpi.hpp>
 namespace mpi = boost::mpi;
 #endif
@@ -88,6 +89,7 @@ int main(int argc, char** argv) {
   bool distinctNbest;
   bool onlyViolatedConstraints;
   bool accumulateWeights;
+  float historySmoothing;
   bool useScaledReference;
   bool scaleByInputLength;
   bool increaseBP;
@@ -114,6 +116,7 @@ int main(int argc, char** argv) {
     ("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(false), "Use nbest list with distinct translations in inference step")
     ("only-violated-constraints", po::value<bool>(&onlyViolatedConstraints)->default_value(false), "Add only violated constraints to the optimisation problem")
     ("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
+    ("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
     ("use-scaled-reference", po::value<bool>(&useScaledReference)->default_value(true), "Use scaled reference length for comparing target and reference length of phrases")
     ("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale the BLEU score by a history of the input lengths")
     ("increase-BP", po::value<bool>(&increaseBP)->default_value(false), "Increase penalty for short translations")
@@ -173,7 +176,7 @@ int main(int argc, char** argv) {
 
   // initialise Moses
   initMoses(mosesConfigFile, verbosity);//, argc, argv);
-  MosesDecoder* decoder = new MosesDecoder(referenceSentences, useScaledReference, scaleByInputLength, increaseBP);
+  MosesDecoder* decoder = new MosesDecoder(referenceSentences, useScaledReference, scaleByInputLength, increaseBP, historySmoothing);
   ScoreComponentCollection startWeights = decoder->getWeights();
   startWeights.L1Normalise();
   decoder->setWeights(startWeights);
@@ -243,6 +246,8 @@ int main(int argc, char** argv) {
   ScoreComponentCollection averageTotalWeights;
 
   // TODO: scaling of feature values for probabilistic features
+  vector< ScoreComponentCollection> list_of_delta_h;  // collect delta_h and loss for all examples of an epoch
+  vector< float> list_of_losses;
   for (size_t epoch = 0; epoch < epochs; ++epoch) {
     cerr << "\nEpoch " << epoch << endl;
     // Sum up weights over one epoch, final average uses weights from last epoch
@@ -385,6 +390,9 @@ int main(int argc, char** argv) {
         decoder->setWeights(mosesWeights);
 
         // update history (for approximate document bleu)
+        for (size_t i = 0; i < oracles.size(); ++i) {
+          cerr << "oracle length: " << oracles[i].size() << " ";
+        }
         decoder->updateHistory(oracles, inputLengths, ref_ids);
 
         // clean up oracle translations after updating history
@@ -410,6 +418,9 @@ int main(int argc, char** argv) {
           // new weights
           margin = featureDiff.InnerProduct(mosesWeights);
           lossMinusMargin_new += (losses[batchPosition][j] - margin);
+
+          list_of_delta_h.push_back(featureDiff);
+          list_of_losses.push_back(losses[batchPosition][j]);
         }
       }
 
@@ -431,13 +442,18 @@ int main(int argc, char** argv) {
 #ifdef MPI_ENABLE
       if (shardPosition % (shard.size() / mixFrequency) == 0) {
         ScoreComponentCollection averageWeights;
-        VERBOSE(1, "\nRank: " << rank << " \nBefore mixing: " << mosesWeights << endl);
+        if (rank == 0) {
+          cerr << "Rank 0, before mixing: " << mosesWeights << endl;
+        }
+
+        //VERBOSE(1, "\nRank: " << rank << " \nBefore mixing: " << mosesWeights << endl);
 
         // collect all weights in averageWeights and divide by number of processes
         mpi::reduce(world, mosesWeights, averageWeights, SCCPlus(), 0);
         if (rank == 0) {
           averageWeights.DivideEquals(size);
-          VERBOSE(1, "After mixing: " << averageWeights << endl);
+          //VERBOSE(1, "After mixing: " << averageWeights << endl);
+          cerr << "Rank 0, after mixing: " << averageWeights << endl;
 
           // normalise weights after averaging
           averageWeights.L1Normalise();
@@ -446,6 +462,14 @@ int main(int argc, char** argv) {
 
         // broadcast average weights from process 0
         mpi::broadcast(world, averageWeights, 0);
         decoder->setWeights(averageWeights);
+
+        // compute summed error after mixing weights
+        float summedError = 0.0;
+        for (size_t i = 0; i < list_of_delta_h.size(); ++i) {
+          summedError += (list_of_losses[i] - list_of_delta_h[i].InnerProduct(averageWeights));
+        }
+
+        cerr << "summed error after mixing weights: " << summedError << " (" << list_of_delta_h.size() << " examples)" << endl;
       }
 #endif
@@ -458,6 +482,13 @@ int main(int argc, char** argv) {
     else
       totalWeights.DivideEquals(iterationsThisEpoch);
 
+#ifdef MPI_ENABLE
+    if (rank == 0) {
+      cerr << "Rank 0, cumulative weights: " << cumulativeWeights << endl;
+      cerr << "Rank 0, total weights: " << totalWeights << endl;
+    }
+#endif
+
     // average across processes
 #ifdef MPI_ENABLE
     mpi::reduce(world, totalWeights, averageTotalWeights, SCCPlus(), 0);
@@ -465,6 +496,7 @@ int main(int argc, char** argv) {
       // average and normalise weights
       averageTotalWeights.DivideEquals(size);
       averageTotalWeights.L1Normalise();
+      cerr << "Rank 0, average total weights: " << averageTotalWeights << endl;
     }
 #endif
 #ifndef MPI_ENABLE
@@ -482,13 +514,24 @@ int main(int argc, char** argv) {
         averageTotalWeights.Save(filename.str());
         ++weightEpochDump;
       }
+
+      // compute summed error after dumping weights
+      float summedError = 0.0;
+      for (size_t i = 0; i < list_of_delta_h.size(); ++i) {
+        summedError += (list_of_losses[i] - list_of_delta_h[i].InnerProduct(averageTotalWeights));
+      }
+
+      cerr << "summed error after dumping weights: " << summedError << " (" << list_of_delta_h.size() << " examples)" << endl;
     }
   }
+
+  list_of_delta_h.clear();
+  list_of_losses.clear();
 }
 
-/*#ifdef MPI_ENABLE
-  mpi::finalize();
-#endif*/
+#ifdef MPI_ENABLE
+  MPI_Finalize();
+#endif
 
   cerr << "Average total weights: " << averageTotalWeights << endl;
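The bookkeeping added above stores one (delta_h, loss) pair per constraint seen during the epoch and, after mixing and after dumping weights, prints the summed error sum_i (loss_i - w . delta_h_i); positive terms correspond to constraints the current weights still violate. A minimal standalone sketch of that computation, with a plain float vector standing in for ScoreComponentCollection and its InnerProduct (illustrative stand-ins, not the Moses API):

    #include <cstdio>
    #include <vector>

    typedef std::vector<float> FeatureVector;  // stand-in for ScoreComponentCollection

    // Stand-in for ScoreComponentCollection::InnerProduct.
    static float innerProduct(const FeatureVector& a, const FeatureVector& b) {
        float sum = 0.0f;
        for (size_t i = 0; i < a.size(); ++i) sum += a[i] * b[i];
        return sum;
    }

    int main() {
        // One (delta_h, loss) pair per constraint, as collected in Main.cpp.
        std::vector<FeatureVector> list_of_delta_h;
        std::vector<float> list_of_losses;
        list_of_delta_h.push_back(FeatureVector(2, 0.5f));   // toy constraint 1
        list_of_losses.push_back(1.0f);
        list_of_delta_h.push_back(FeatureVector(2, -0.25f)); // toy constraint 2
        list_of_losses.push_back(0.4f);

        FeatureVector averageWeights(2, 1.0f);  // mixed/averaged weights

        float summedError = 0.0f;
        for (size_t i = 0; i < list_of_delta_h.size(); ++i)
            summedError += list_of_losses[i] - innerProduct(list_of_delta_h[i], averageWeights);

        // prints: summed error: 0.900000 (2 examples)
        std::printf("summed error: %f (%lu examples)\n",
                    summedError, (unsigned long)list_of_delta_h.size());
        return 0;
    }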
diff --git a/mira/MiraOptimiser.cpp b/mira/MiraOptimiser.cpp
index 0a49a28a9..29d05e3d5 100644
--- a/mira/MiraOptimiser.cpp
+++ b/mira/MiraOptimiser.cpp
@@ -23,8 +23,8 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
       ScoreComponentCollection featureValueDiff = oracleFeatureValues[i];
       featureValueDiff.MinusEquals(featureValues[i][j]);
       float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
-      cerr << "loss of hypothesis: " << losses[i][j] << endl;
-      cerr << "model score difference: " << modelScoreDiff << endl;
+      //cerr << "loss of hypothesis: " << losses[i][j] << endl;
+      //cerr << "model score difference: " << modelScoreDiff << endl;
       float loss = losses[i][j] * m_marginScaleFactor;
 
       bool addConstraint = true;
@@ -49,12 +49,12 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
     }
   }
 
-  cerr << "Number of constraints passed to optimiser: " << featureValueDiffs.size() << endl;
+  //cerr << "Number of constraints passed to optimiser: " << featureValueDiffs.size() << endl;
   if (violatedConstraintsBefore > 0) {
     // TODO: slack?
     // run optimisation
-    cerr << "\nNumber of violated constraints: " << violatedConstraintsBefore << endl;
+    //cerr << "\nNumber of violated constraints: " << violatedConstraintsBefore << endl;
     // compute deltas for all given constraints
     vector< float> alphas;
     if (m_regulariseHildrethUpdates) {
@@ -68,7 +68,7 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
     // * w' = w' + delta * Dh_ij ---> w' = w' + delta * (h(e*) - h(e_ij))
     float sumOfAlphas = 0;
     for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
-      cerr << "alpha " << k << ": " << alphas[k] << endl;
+      //cerr << "alpha " << k << ": " << alphas[k] << endl;
       sumOfAlphas += alphas[k];
 
       // compute update
@@ -78,7 +78,7 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
       currWeights.PlusEquals(featureValueDiffs[k]);
     }
 
-    cerr << "sum of alphas: " << sumOfAlphas << endl;
+    //cerr << "sum of alphas: " << sumOfAlphas << endl;
 
     // sanity check: how many constraints violated after optimisation?
     size_t violatedConstraintsAfter = 0;
@@ -92,11 +92,11 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
         ++violatedConstraintsAfter;
       }
 
-      cerr << "New model score difference: " << modelScoreDiff << endl;
+      //cerr << "New model score difference: " << modelScoreDiff << endl;
     }
   }
 
-  cerr << "Number of violated constraints after optimisation: " << violatedConstraintsAfter << endl;
+  //cerr << "Number of violated constraints after optimisation: " << violatedConstraintsAfter << endl;
   if (violatedConstraintsAfter > violatedConstraintsBefore) {
     cerr << "Increase: " << violatedConstraintsAfter - violatedConstraintsBefore << endl << endl;
   }
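The MiraOptimiser changes only comment out per-constraint logging; the update logic is untouched. The counts that are still reported come from the margin test: a constraint is violated when the (scaled) loss exceeds the model score difference w . (h(e*) - h(e_ij)). A standalone sketch of that test, with plain vectors instead of ScoreComponentCollection (illustrative names):

    #include <cstddef>
    #include <vector>

    typedef std::vector<float> FeatureVector;  // stand-in for ScoreComponentCollection

    // Counts constraints whose margin is still violated, i.e. where
    // loss > w . featureValueDiff, mirroring the before/after sanity
    // check in MiraOptimiser::updateWeights.
    size_t countViolated(const std::vector<FeatureVector>& featureValueDiffs,
                         const std::vector<float>& losses,
                         const FeatureVector& weights) {
        size_t violated = 0;
        for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
            float modelScoreDiff = 0.0f;
            for (size_t i = 0; i < weights.size(); ++i)
                modelScoreDiff += featureValueDiffs[k][i] * weights[i];
            if (losses[k] > modelScoreDiff)
                ++violated;
        }
        return violated;
    }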
diff --git a/mira/Optimiser.h b/mira/Optimiser.h
index fd78f7435..f7fd46bec 100644
--- a/mira/Optimiser.h
+++ b/mira/Optimiser.h
@@ -41,7 +41,7 @@ namespace Mira {
       const std::vector< std::vector<float> >& scores,
       const std::vector< std::vector<float> >& losses,
       const std::vector<float>& oracleScores)
-    {/* do nothing */}
+    { return 0; }
   };
 
   class Perceptron : public Optimiser {
diff --git a/moses/src/BleuScoreFeature.cpp b/moses/src/BleuScoreFeature.cpp
index db2d6ba69..78eb8674f 100644
--- a/moses/src/BleuScoreFeature.cpp
+++ b/moses/src/BleuScoreFeature.cpp
@@ -80,9 +80,10 @@ BleuScoreFeature::BleuScoreFeature():
   m_ref_length_history(0),
   m_use_scaled_reference(true),
   m_scale_by_input_length(true),
-  m_increase_BP(false) {}
+  m_increase_BP(false),
+  m_historySmoothing(0.9) {}
 
-BleuScoreFeature::BleuScoreFeature(bool useScaledReference, bool scaleByInputLength, bool increaseBP):
+BleuScoreFeature::BleuScoreFeature(bool useScaledReference, bool scaleByInputLength, bool increaseBP, float historySmoothing):
   StatefulFeatureFunction("BleuScore"),
   m_count_history(BleuScoreState::bleu_order),
   m_match_history(BleuScoreState::bleu_order),
@@ -91,7 +92,8 @@ BleuScoreFeature::BleuScoreFeature(bool useScaledReference, bool scaleByInputLen
   m_ref_length_history(0),
   m_use_scaled_reference(useScaledReference),
   m_scale_by_input_length(scaleByInputLength),
-  m_increase_BP(increaseBP) {}
+  m_increase_BP(increaseBP),
+  m_historySmoothing(historySmoothing) {}
 
 void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::string > >& refs)
 {
@@ -147,15 +149,15 @@ void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) {
 
   // update counts and matches for every ngram length with counts from hypo
   for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
-    m_count_history[i] = 0.9 * (m_count_history[i] + ngram_counts[i]);
-    m_match_history[i] = 0.9 * (m_match_history[i] + ngram_matches[i]);
+    m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]);
+    m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]);
     //cerr << "precisionHistory " << i + 1 << ": " << (m_match_history[i]/m_count_history[i]) << " (" << m_match_history[i] << "/" << m_count_history[i] << ")" << endl;
   }
 
   // update counts for reference and target length
-  m_source_length_history = 0.9 * (m_source_length_history + m_cur_source_length);
-  m_target_length_history = 0.9 * (m_target_length_history + hypo.size());
-  m_ref_length_history = 0.9 * (m_ref_length_history + m_cur_ref_length);
+  m_source_length_history = m_historySmoothing * (m_source_length_history + m_cur_source_length);
+  m_target_length_history = m_historySmoothing * (m_target_length_history + hypo.size());
+  m_ref_length_history = m_historySmoothing * (m_ref_length_history + m_cur_ref_length);
 }
 
 /*
@@ -171,6 +173,8 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
     size_t cur_source_length = sourceLengths[batchPosition];
     size_t cur_ref_length = m_refs[ref_ids[batchPosition]].first;
     NGrams cur_ref_ngrams = m_refs[ref_ids[batchPosition]].second;
+    cerr << "reference length: " << cur_ref_length << endl;
+
     // compute vector c(e;{r_k}):
     // vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k
     GetNgramMatchCounts(phrase, cur_ref_ngrams, ngram_counts, ngram_matches, 0);
@@ -182,8 +186,8 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
 
       // do this for last position in batch
       if (batchPosition == hypos.size() - 1) {
-        m_count_history[i] *= 0.9;
-        m_match_history[i] *= 0.9;
+        m_count_history[i] *= m_historySmoothing;
+        m_match_history[i] *= m_historySmoothing;
       }
     }
 
@@ -194,9 +198,9 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
 
       // do this for last position in batch
       if (batchPosition == hypos.size() - 1) {
-        m_source_length_history *= 0.9;
-        m_target_length_history *= 0.9;
-        m_ref_length_history *= 0.9;
+        m_source_length_history *= m_historySmoothing;
+        m_target_length_history *= m_historySmoothing;
+        m_ref_length_history *= m_historySmoothing;
       }
     }
   }
 }
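One subtlety in the BleuScoreFeature.cpp hunks: the single-sentence UpdateHistory applies m_historySmoothing while folding in each new hypothesis, whereas the batch variant decays the accumulated statistics only once, at the last batch position. The accumulation itself happens outside the hunks shown, so the += line below is an assumption about that surrounding code; the rest mirrors the decay logic above (illustrative names, plain C++):

    #include <cstddef>
    #include <vector>

    // Batch variant sketch: fold in raw counts for every hypothesis in the
    // batch, then multiply by the smoothing factor once per batch.
    void updateHistoryBatch(std::vector<float>& countHistory,
                            const std::vector<std::vector<float> >& batchCounts,
                            float historySmoothing) {
        for (size_t batchPosition = 0; batchPosition < batchCounts.size(); ++batchPosition) {
            for (size_t i = 0; i < countHistory.size(); ++i) {
                countHistory[i] += batchCounts[batchPosition][i];  // assumed accumulation
                if (batchPosition == batchCounts.size() - 1)
                    countHistory[i] *= historySmoothing;           // decay once, as in the hunk
            }
        }
    }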
diff --git a/moses/src/BleuScoreFeature.h b/moses/src/BleuScoreFeature.h
index 1400ce5a3..0e8768ee6 100644
--- a/moses/src/BleuScoreFeature.h
+++ b/moses/src/BleuScoreFeature.h
@@ -45,7 +45,7 @@ typedef std::map< Phrase, size_t > NGrams;
 class BleuScoreFeature : public StatefulFeatureFunction {
 public:
   BleuScoreFeature();
-  BleuScoreFeature(bool useScaledReference, bool scaleByInputLength, bool increaseBP);
+  BleuScoreFeature(bool useScaledReference, bool scaleByInputLength, bool increaseBP, float historySmoothing);
 
   std::string GetScoreProducerDescription() const
   {
@@ -94,6 +94,8 @@ private:
   // increase penalty for short translations
   bool m_increase_BP;
 
+  float m_historySmoothing;
+
   // counts for pseudo-document big_O
   std::vector< float > m_count_history;
   std::vector< float > m_match_history;
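On the mpi_finalize half of this commit: Boost.MPI does not expose a free mpi::finalize() function, which is presumably why the old block was commented out; the patch therefore includes mpi.h and calls MPI_Finalize() directly. One caveat: if Main.cpp already constructs a boost::mpi::environment (the mpi::reduce/mpi::broadcast calls on a world communicator suggest it does), that object's destructor calls MPI_Finalize itself, and finalizing twice is an MPI error. A minimal sketch of the environment-managed pattern, under that assumption, where no explicit MPI_Finalize() is needed:

    #include <iostream>
    #ifdef MPI_ENABLE
    #include <boost/mpi/environment.hpp>
    #include <boost/mpi/communicator.hpp>
    namespace mpi = boost::mpi;
    #endif

    int main(int argc, char** argv) {
    #ifdef MPI_ENABLE
        mpi::environment env(argc, argv);  // calls MPI_Init
        mpi::communicator world;           // defaults to MPI_COMM_WORLD
        std::cerr << "rank " << world.rank() << " of " << world.size() << std::endl;
    #endif
        // ... training loop, mpi::reduce / mpi::broadcast as in Main.cpp ...
        return 0;  // env's destructor runs here and calls MPI_Finalize
    }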