introduce parameter --history-smoothing, add mpi_finalize statement

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/mira-mtm5@3752 1f5c12ca-751b-0410-a591-d2e778427230
evahasler 2010-12-01 18:09:49 +00:00
parent da712f0eeb
commit 406f55584e
7 changed files with 81 additions and 32 deletions

View File

@@ -66,14 +66,14 @@ namespace Mira {
delete[] mosesargv;
}
-MosesDecoder::MosesDecoder(const vector<vector<string> >& refs, bool useScaledReference, bool scaleByInputLength, bool increaseBP)
+MosesDecoder::MosesDecoder(const vector<vector<string> >& refs, bool useScaledReference, bool scaleByInputLength, bool increaseBP, float historySmoothing)
: m_manager(NULL) {
// force initialisation of the phrase dictionary
const StaticData &staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
// Add the bleu feature
-m_bleuScoreFeature = new BleuScoreFeature(useScaledReference, scaleByInputLength, increaseBP);
+m_bleuScoreFeature = new BleuScoreFeature(useScaledReference, scaleByInputLength, increaseBP, historySmoothing);
(const_cast<TranslationSystem&>(system)).AddFeatureFunction(m_bleuScoreFeature);
m_bleuScoreFeature->LoadReferences(refs);
}

View File

@@ -50,7 +50,7 @@ void initMoses(const std::string& inifile, int debuglevel, int argc=0, char** a
**/
class MosesDecoder {
public:
-MosesDecoder(const std::vector<std::vector<std::string> >& refs, bool useScaledReference, bool scaleByInputLength, bool increaseBP);
+MosesDecoder(const std::vector<std::vector<std::string> >& refs, bool useScaledReference, bool scaleByInputLength, bool increaseBP, float historySmoothing);
//returns the best sentence
std::vector<const Moses::Word*> getNBest(const std::string& source,

View File

@@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <boost/program_options.hpp>
#ifdef MPI_ENABLE
+#include "mpi.h"
#include <boost/mpi.hpp>
namespace mpi = boost::mpi;
#endif
@@ -88,6 +89,7 @@ int main(int argc, char** argv) {
bool distinctNbest;
bool onlyViolatedConstraints;
bool accumulateWeights;
+float historySmoothing;
bool useScaledReference;
bool scaleByInputLength;
bool increaseBP;
@@ -114,6 +116,7 @@ int main(int argc, char** argv) {
("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(false), "Use nbest list with distinct translations in inference step")
("only-violated-constraints", po::value<bool>(&onlyViolatedConstraints)->default_value(false), "Add only violated constraints to the optimisation problem")
("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
+("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
("use-scaled-reference", po::value<bool>(&useScaledReference)->default_value(true), "Use scaled reference length for comparing target and reference length of phrases")
("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale the BLEU score by a history of the input lengths")
("increase-BP", po::value<bool>(&increaseBP)->default_value(false), "Increase penalty for short translations")
@@ -173,7 +176,7 @@ int main(int argc, char** argv) {
// initialise Moses
initMoses(mosesConfigFile, verbosity);//, argc, argv);
-MosesDecoder* decoder = new MosesDecoder(referenceSentences, useScaledReference, scaleByInputLength, increaseBP);
+MosesDecoder* decoder = new MosesDecoder(referenceSentences, useScaledReference, scaleByInputLength, increaseBP, historySmoothing);
ScoreComponentCollection startWeights = decoder->getWeights();
startWeights.L1Normalise();
decoder->setWeights(startWeights);
@@ -243,6 +246,8 @@ int main(int argc, char** argv) {
ScoreComponentCollection averageTotalWeights;
// TODO: scaling of feature values for probabilistic features
+vector< ScoreComponentCollection> list_of_delta_h; // collect delta_h and loss for all examples of an epoch
+vector< float> list_of_losses;
for (size_t epoch = 0; epoch < epochs; ++epoch) {
cerr << "\nEpoch " << epoch << endl;
// Sum up weights over one epoch, final average uses weights from last epoch
@@ -385,6 +390,9 @@ int main(int argc, char** argv) {
decoder->setWeights(mosesWeights);
// update history (for approximate document bleu)
+for (size_t i = 0; i < oracles.size(); ++i) {
+cerr << "oracle length: " << oracles[i].size() << " ";
+}
decoder->updateHistory(oracles, inputLengths, ref_ids);
// clean up oracle translations after updating history
@@ -410,6 +418,9 @@ int main(int argc, char** argv) {
// new weights
margin = featureDiff.InnerProduct(mosesWeights);
lossMinusMargin_new += (losses[batchPosition][j] - margin);
+list_of_delta_h.push_back(featureDiff);
+list_of_losses.push_back(losses[batchPosition][j]);
}
}
@@ -431,13 +442,18 @@ int main(int argc, char** argv) {
#ifdef MPI_ENABLE
if (shardPosition % (shard.size() / mixFrequency) == 0) {
ScoreComponentCollection averageWeights;
-VERBOSE(1, "\nRank: " << rank << " \nBefore mixing: " << mosesWeights << endl);
+if (rank == 0) {
+cerr << "Rank 0, before mixing: " << mosesWeights << endl;
+}
+//VERBOSE(1, "\nRank: " << rank << " \nBefore mixing: " << mosesWeights << endl);
// collect all weights in averageWeights and divide by number of processes
mpi::reduce(world, mosesWeights, averageWeights, SCCPlus(), 0);
if (rank == 0) {
averageWeights.DivideEquals(size);
-VERBOSE(1, "After mixing: " << averageWeights << endl);
+//VERBOSE(1, "After mixing: " << averageWeights << endl);
+cerr << "Rank 0, after mixing: " << averageWeights << endl;
// normalise weights after averaging
averageWeights.L1Normalise();
@@ -446,6 +462,14 @@ int main(int argc, char** argv) {
// broadcast average weights from process 0
mpi::broadcast(world, averageWeights, 0);
decoder->setWeights(averageWeights);
+// compute summed error after mixing weights
+float summedError = 0.0;
+for (size_t i = 0; i < list_of_delta_h.size(); ++i) {
+summedError += (list_of_losses[i] - list_of_delta_h[i].InnerProduct(averageWeights));
+}
+cerr << "summed error after mixing weights: " << summedError << " (" << list_of_delta_h.size() << " examples)" << endl;
}
#endif
@@ -458,6 +482,13 @@ int main(int argc, char** argv) {
else
totalWeights.DivideEquals(iterationsThisEpoch);
+#ifdef MPI_ENABLE
+if (rank == 0) {
+cerr << "Rank 0, cumulative weights: " << cumulativeWeights << endl;
+cerr << "Rank 0, total weights: " << totalWeights << endl;
+}
+#endif
// average across processes
#ifdef MPI_ENABLE
mpi::reduce(world, totalWeights, averageTotalWeights, SCCPlus(), 0);
@@ -465,6 +496,7 @@ int main(int argc, char** argv) {
// average and normalise weights
averageTotalWeights.DivideEquals(size);
averageTotalWeights.L1Normalise();
+cerr << "Rank 0, average total weights: " << averageTotalWeights << endl;
}
#endif
#ifndef MPI_ENABLE
@@ -482,13 +514,24 @@ int main(int argc, char** argv) {
averageTotalWeights.Save(filename.str());
++weightEpochDump;
}
+// compute summed error after dumping weights
+float summedError = 0.0;
+for (size_t i = 0; i < list_of_delta_h.size(); ++i) {
+summedError += (list_of_losses[i] - list_of_delta_h[i].InnerProduct(averageTotalWeights));
+}
+cerr << "summed error after dumping weights: " << summedError << " (" << list_of_delta_h.size() << " examples)" << endl;
}
}
+list_of_delta_h.clear();
+list_of_losses.clear();
}
/*#ifdef MPI_ENABLE
mpi::finalize();
#endif*/
+#ifdef MPI_ENABLE
+MPI_Finalize();
+#endif
cerr << "Average total weights: " << averageTotalWeights << endl;

View File

@@ -23,8 +23,8 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
ScoreComponentCollection featureValueDiff = oracleFeatureValues[i];
featureValueDiff.MinusEquals(featureValues[i][j]);
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
-cerr << "loss of hypothesis: " << losses[i][j] << endl;
-cerr << "model score difference: " << modelScoreDiff << endl;
+//cerr << "loss of hypothesis: " << losses[i][j] << endl;
+//cerr << "model score difference: " << modelScoreDiff << endl;
float loss = losses[i][j] * m_marginScaleFactor;
bool addConstraint = true;
@@ -49,12 +49,12 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
}
}
-cerr << "Number of constraints passed to optimiser: " << featureValueDiffs.size() << endl;
+//cerr << "Number of constraints passed to optimiser: " << featureValueDiffs.size() << endl;
if (violatedConstraintsBefore > 0) {
// TODO: slack?
// run optimisation
-cerr << "\nNumber of violated constraints: " << violatedConstraintsBefore << endl;
+//cerr << "\nNumber of violated constraints: " << violatedConstraintsBefore << endl;
// compute deltas for all given constraints
vector< float> alphas;
if (m_regulariseHildrethUpdates) {
@@ -68,7 +68,7 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
// * w' = w' + delta * Dh_ij ---> w' = w' + delta * (h(e*) - h(e_ij))
float sumOfAlphas = 0;
for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
-cerr << "alpha " << k << ": " << alphas[k] << endl;
+//cerr << "alpha " << k << ": " << alphas[k] << endl;
sumOfAlphas += alphas[k];
// compute update
@@ -78,7 +78,7 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
currWeights.PlusEquals(featureValueDiffs[k]);
}
-cerr << "sum of alphas: " << sumOfAlphas << endl;
+//cerr << "sum of alphas: " << sumOfAlphas << endl;
// sanity check: how many constraints violated after optimisation?
size_t violatedConstraintsAfter = 0;
@@ -92,11 +92,11 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
++violatedConstraintsAfter;
}
-cerr << "New model score difference: " << modelScoreDiff << endl;
+//cerr << "New model score difference: " << modelScoreDiff << endl;
}
}
-cerr << "Number of violated constraints after optimisation: " << violatedConstraintsAfter << endl;
+//cerr << "Number of violated constraints after optimisation: " << violatedConstraintsAfter << endl;
if (violatedConstraintsAfter > violatedConstraintsBefore) {
cerr << "Increase: " << violatedConstraintsAfter - violatedConstraintsBefore << endl << endl;
}

View File

@@ -41,7 +41,7 @@ namespace Mira {
const std::vector< std::vector<Moses::ScoreComponentCollection> >& scores,
const std::vector< std::vector<float> >& losses,
const std::vector<Moses::ScoreComponentCollection>& oracleScores)
-{/* do nothing */}
+{ return 0; }
};
class Perceptron : public Optimiser {
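A side note on the { return 0; } change above: updateWeights returns int (see int MiraOptimiser::updateWeights in the previous file), and letting control flow off the end of a value-returning function is undefined behaviour in C++, so the old {/* do nothing */} body was a genuine bug, not just style. A reduced illustration with hypothetical names, not the Moses classes:

// Reduced illustration of the fix (hypothetical names).
struct Optimiser {
  // The old body {/* do nothing */} fell off the end of an int-returning
  // function: undefined behaviour as soon as a caller reads the result.
  virtual int updateWeights() { return 0; }  // a dummy value makes the no-op safe
  virtual ~Optimiser() {}
};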

View File

@@ -80,9 +80,10 @@ BleuScoreFeature::BleuScoreFeature():
m_ref_length_history(0),
m_use_scaled_reference(true),
m_scale_by_input_length(true),
-m_increase_BP(false) {}
+m_increase_BP(false),
+m_historySmoothing(0.9) {}
-BleuScoreFeature::BleuScoreFeature(bool useScaledReference, bool scaleByInputLength, bool increaseBP):
+BleuScoreFeature::BleuScoreFeature(bool useScaledReference, bool scaleByInputLength, bool increaseBP, float historySmoothing):
StatefulFeatureFunction("BleuScore"),
m_count_history(BleuScoreState::bleu_order),
m_match_history(BleuScoreState::bleu_order),
@@ -91,7 +92,8 @@ BleuScoreFeature::BleuScoreFeature(bool useScaledReference, bool scaleByInputLen
m_ref_length_history(0),
m_use_scaled_reference(useScaledReference),
m_scale_by_input_length(scaleByInputLength),
-m_increase_BP(increaseBP) {}
+m_increase_BP(increaseBP),
+m_historySmoothing(historySmoothing) {}
void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::string > >& refs)
{
@@ -147,15 +149,15 @@ void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) {
// update counts and matches for every ngram length with counts from hypo
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
-m_count_history[i] = 0.9 * (m_count_history[i] + ngram_counts[i]);
-m_match_history[i] = 0.9 * (m_match_history[i] + ngram_matches[i]);
+m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]);
+m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]);
//cerr << "precisionHistory " << i + 1 << ": " << (m_match_history[i]/m_count_history[i]) << " (" << m_match_history[i] << "/" << m_count_history[i] << ")" << endl;
}
// update counts for reference and target length
-m_source_length_history = 0.9 * (m_source_length_history + m_cur_source_length);
-m_target_length_history = 0.9 * (m_target_length_history + hypo.size());
-m_ref_length_history = 0.9 * (m_ref_length_history + m_cur_ref_length);
+m_source_length_history = m_historySmoothing * (m_source_length_history + m_cur_source_length);
+m_target_length_history = m_historySmoothing * (m_target_length_history + hypo.size());
+m_ref_length_history = m_historySmoothing * (m_ref_length_history + m_cur_ref_length);
}
/*
@@ -171,6 +173,8 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
size_t cur_source_length = sourceLengths[batchPosition];
size_t cur_ref_length = m_refs[ref_ids[batchPosition]].first;
NGrams cur_ref_ngrams = m_refs[ref_ids[batchPosition]].second;
+cerr << "reference length: " << cur_ref_length << endl;
// compute vector c(e;{r_k}):
// vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k
GetNgramMatchCounts(phrase, cur_ref_ngrams, ngram_counts, ngram_matches, 0);
@@ -182,8 +186,8 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
// do this for last position in batch
if (batchPosition == hypos.size() - 1) {
-m_count_history[i] *= 0.9;
-m_match_history[i] *= 0.9;
+m_count_history[i] *= m_historySmoothing;
+m_match_history[i] *= m_historySmoothing;
}
}
@@ -194,9 +198,9 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
// do this for last position in batch
if (batchPosition == hypos.size() - 1) {
-m_source_length_history *= 0.9;
-m_target_length_history *= 0.9;
-m_ref_length_history *= 0.9;
+m_source_length_history *= m_historySmoothing;
+m_target_length_history *= m_historySmoothing;
+m_ref_length_history *= m_historySmoothing;
}
}
}
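Every change in this file replaces the hard-coded 0.9 with the new m_historySmoothing factor. The recurrence is an exponential decay of the pseudo-document statistics: history = gamma * (history + contribution), so each oracle sentence's counts fade geometrically in later updates, and --history-smoothing controls how quickly (small gamma forgets old sentences fast, gamma near 1 keeps a long history). A minimal standalone sketch of the recurrence, with illustrative values rather than Moses code:

#include <cstddef>
#include <cstdio>
#include <vector>

// One smoothing step: history <- gamma * (history + contribution).
// gamma plays the role of m_historySmoothing (--history-smoothing, default 0.9).
void smoothHistory(std::vector<float>& history,
                   const std::vector<float>& contribution, float gamma) {
  for (std::size_t i = 0; i < history.size(); ++i)
    history[i] = gamma * (history[i] + contribution[i]);
}

int main() {
  // hypothetical n-gram match counts (orders 1..4) of one oracle translation
  float newMatches[] = {12.0f, 8.0f, 5.0f, 3.0f};
  std::vector<float> matchHistory(4, 0.0f);
  std::vector<float> contribution(newMatches, newMatches + 4);
  for (int sentence = 0; sentence < 3; ++sentence) {
    smoothHistory(matchHistory, contribution, 0.9f);
    std::printf("after sentence %d: unigram match history = %.3f\n",
                sentence + 1, matchHistory[0]);
  }
  return 0;
}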

View File

@@ -45,7 +45,7 @@ typedef std::map< Phrase, size_t > NGrams;
class BleuScoreFeature : public StatefulFeatureFunction {
public:
BleuScoreFeature();
-BleuScoreFeature(bool useScaledReference, bool scaleByInputLength, bool increaseBP);
+BleuScoreFeature(bool useScaledReference, bool scaleByInputLength, bool increaseBP, float historySmoothing);
std::string GetScoreProducerDescription() const
{
@@ -94,6 +94,8 @@ private:
// increase penalty for short translations
bool m_increase_BP;
+float m_historySmoothing;
// counts for pseudo-document big_O
std::vector< float > m_count_history;
std::vector< float > m_match_history;
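For completeness: the smoothed ratios m_match_history[i] / m_count_history[i] are the n-gram precisions of the pseudo-document (the commented precisionHistory line in the .cpp diff prints exactly that), and the length histories feed the brevity penalty. Below is a hedged sketch of how such histories plausibly combine into an approximate document BLEU; the real combination lives in BleuScoreFeature code outside this diff.

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Sketch only: standard BLEU over the smoothed pseudo-document statistics.
float approximateBleu(const std::vector<float>& matchHistory,
                      const std::vector<float>& countHistory,
                      float refLengthHistory, float targetLengthHistory) {
  float logPrecisionSum = 0.0f;
  for (std::size_t i = 0; i < matchHistory.size(); ++i)
    logPrecisionSum += std::log(matchHistory[i] / countHistory[i]);
  // geometric mean of the n-gram precisions
  float precision = std::exp(logPrecisionSum / matchHistory.size());
  // brevity penalty from the smoothed target/reference length histories
  float bp = targetLengthHistory < refLengthHistory
                 ? std::exp(1.0f - refLengthHistory / targetLengthHistory)
                 : 1.0f;
  return bp * precision;
}

int main() {
  // hypothetical smoothed histories for n-gram orders 1..4
  float matches[] = {9.0f, 6.0f, 4.0f, 2.5f};
  float counts[] = {10.0f, 9.0f, 8.0f, 7.0f};
  std::vector<float> m(matches, matches + 4), c(counts, counts + 4);
  std::printf("approximate BLEU = %.4f\n", approximateBleu(m, c, 28.0f, 26.0f));
  return 0;
}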