From 406f55584e03c702e75c66fac6357a612e41bed8 Mon Sep 17 00:00:00 2001
From: evahasler
Date: Wed, 1 Dec 2010 18:09:49 +0000
Subject: [PATCH] introduce parameter --history-smoothing, add mpi_finalize statement

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/mira-mtm5@3752 1f5c12ca-751b-0410-a591-d2e778427230
---
 mira/Decoder.cpp               |  4 +--
 mira/Decoder.h                 |  2 +-
 mira/Main.cpp                  | 55 ++++++++++++++++++++++++++++++----
 mira/MiraOptimiser.cpp         | 16 +++++-----
 mira/Optimiser.h               |  2 +-
 moses/src/BleuScoreFeature.cpp | 30 +++++++++++--------
 moses/src/BleuScoreFeature.h   |  4 ++-
 7 files changed, 81 insertions(+), 32 deletions(-)

diff --git a/mira/Decoder.cpp b/mira/Decoder.cpp
index ccc59bcec..2e0f6dc97 100644
--- a/mira/Decoder.cpp
+++ b/mira/Decoder.cpp
@@ -66,14 +66,14 @@ namespace Mira {
     delete[] mosesargv;
   }
 
-  MosesDecoder::MosesDecoder(const vector<vector<string> >& refs, bool useScaledReference, bool scaleByInputLength, bool increaseBP)
+  MosesDecoder::MosesDecoder(const vector<vector<string> >& refs, bool useScaledReference, bool scaleByInputLength, bool increaseBP, float historySmoothing)
   : m_manager(NULL) {
     // force initialisation of the phrase dictionary
     const StaticData &staticData = StaticData::Instance();
     const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
 
     // Add the bleu feature
-    m_bleuScoreFeature = new BleuScoreFeature(useScaledReference, scaleByInputLength, increaseBP);
+    m_bleuScoreFeature = new BleuScoreFeature(useScaledReference, scaleByInputLength, increaseBP, historySmoothing);
     (const_cast<TranslationSystem&>(system)).AddFeatureFunction(m_bleuScoreFeature);
     m_bleuScoreFeature->LoadReferences(refs);
  }
diff --git a/mira/Decoder.h b/mira/Decoder.h
index f867713f2..b77c8eb63 100644
--- a/mira/Decoder.h
+++ b/mira/Decoder.h
@@ -50,7 +50,7 @@ void initMoses(const std::string& inifile, int debuglevel, int argc=0, char** a
 **/
 class MosesDecoder {
  public:
-  MosesDecoder(const std::vector<std::vector<std::string> >& refs, bool useScaledReference, bool scaleByInputLength, bool increaseBP);
+  MosesDecoder(const std::vector<std::vector<std::string> >& refs, bool useScaledReference, bool scaleByInputLength, bool increaseBP, float historySmoothing);
 
   //returns the best sentence
   std::vector getNBest(const std::string& source,
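Note for readers: --history-smoothing replaces the 0.9 constant that BleuScoreFeature previously hard-coded for its pseudo-document BLEU statistics (see the BleuScoreFeature.cpp hunks further down). The standalone sketch below, plain C++ with illustrative names rather than Moses code, shows the geometric decay the parameter controls: each history statistic is repeatedly replaced by gamma * (history + current), so older sentences fade out at rate gamma.

    #include <cstdio>

    // Standalone illustration of the history update in BleuScoreFeature:
    //   history = gamma * (history + current)
    // gamma corresponds to --history-smoothing; the values are made up.
    int main() {
        const float gamma = 0.9f;      // default of the new option
        float countHistory = 0.0f;     // e.g. a pseudo-document 1-gram count
        const float newCount = 20.0f;  // 1-gram count of each incoming sentence

        for (int sentence = 1; sentence <= 3; ++sentence) {
            countHistory = gamma * (countHistory + newCount);
            // prints 18.00, then 34.20, then 48.78
            std::printf("after sentence %d: %.2f\n", sentence, countHistory);
        }
        return 0;
    }

With gamma = 1.0 the history would grow without bound; smaller values keep it dominated by the statistics of the most recent sentences.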
diff --git a/mira/Main.cpp b/mira/Main.cpp
index c04b2625d..7bf5754cd 100644
--- a/mira/Main.cpp
+++ b/mira/Main.cpp
@@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 #include
 
 #ifdef MPI_ENABLE
+#include "mpi.h"
 #include <boost/mpi.hpp>
 namespace mpi = boost::mpi;
 #endif
@@ -88,6 +89,7 @@ int main(int argc, char** argv) {
   bool distinctNbest;
   bool onlyViolatedConstraints;
   bool accumulateWeights;
+  float historySmoothing;
   bool useScaledReference;
   bool scaleByInputLength;
   bool increaseBP;
@@ -114,6 +116,7 @@ int main(int argc, char** argv) {
     ("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(false), "Use nbest list with distinct translations in inference step")
     ("only-violated-constraints", po::value<bool>(&onlyViolatedConstraints)->default_value(false), "Add only violated constraints to the optimisation problem")
     ("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
+    ("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
     ("use-scaled-reference", po::value<bool>(&useScaledReference)->default_value(true), "Use scaled reference length for comparing target and reference length of phrases")
     ("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale the BLEU score by a history of the input lengths")
     ("increase-BP", po::value<bool>(&increaseBP)->default_value(false), "Increase penalty for short translations")
@@ -173,7 +176,7 @@ int main(int argc, char** argv) {
 
   // initialise Moses
   initMoses(mosesConfigFile, verbosity);//, argc, argv);
-  MosesDecoder* decoder = new MosesDecoder(referenceSentences, useScaledReference, scaleByInputLength, increaseBP);
+  MosesDecoder* decoder = new MosesDecoder(referenceSentences, useScaledReference, scaleByInputLength, increaseBP, historySmoothing);
   ScoreComponentCollection startWeights = decoder->getWeights();
   startWeights.L1Normalise();
   decoder->setWeights(startWeights);
@@ -243,6 +246,8 @@ int main(int argc, char** argv) {
   ScoreComponentCollection averageTotalWeights;
 
   // TODO: scaling of feature values for probabilistic features
+  vector< ScoreComponentCollection> list_of_delta_h;  // collect delta_h and loss for all examples of an epoch
+  vector< float> list_of_losses;
   for (size_t epoch = 0; epoch < epochs; ++epoch) {
     cerr << "\nEpoch " << epoch << endl;
     // Sum up weights over one epoch, final average uses weights from last epoch
@@ -385,6 +390,9 @@ int main(int argc, char** argv) {
         decoder->setWeights(mosesWeights);
 
         // update history (for approximate document bleu)
+        for (size_t i = 0; i < oracles.size(); ++i) {
+          cerr << "oracle length: " << oracles[i].size() << " ";
+        }
         decoder->updateHistory(oracles, inputLengths, ref_ids);
 
         // clean up oracle translations after updating history
@@ -410,6 +418,9 @@ int main(int argc, char** argv) {
           // new weights
           margin = featureDiff.InnerProduct(mosesWeights);
           lossMinusMargin_new += (losses[batchPosition][j] - margin);
+
+          list_of_delta_h.push_back(featureDiff);
+          list_of_losses.push_back(losses[batchPosition][j]);
         }
       }
 
@@ -431,13 +442,18 @@ int main(int argc, char** argv) {
 #ifdef MPI_ENABLE
       if (shardPosition % (shard.size() / mixFrequency) == 0) {
         ScoreComponentCollection averageWeights;
-        VERBOSE(1, "\nRank: " << rank << " \nBefore mixing: " << mosesWeights << endl);
+        if (rank == 0) {
+          cerr << "Rank 0, before mixing: " << mosesWeights << endl;
+        }
+
+        //VERBOSE(1, "\nRank: " << rank << " \nBefore mixing: " << mosesWeights << endl);
 
         // collect all weights in averageWeights and divide by number of processes
         mpi::reduce(world, mosesWeights, averageWeights, SCCPlus(), 0);
         if (rank == 0) {
           averageWeights.DivideEquals(size);
-          VERBOSE(1, "After mixing: " << averageWeights << endl);
+          //VERBOSE(1, "After mixing: " << averageWeights << endl);
+          cerr << "Rank 0, after mixing: " << averageWeights << endl;
 
           // normalise weights after averaging
           averageWeights.L1Normalise();
@@ -446,6 +462,14 @@ int main(int argc, char** argv) {
 
         // broadcast average weights from process 0
         mpi::broadcast(world, averageWeights, 0);
         decoder->setWeights(averageWeights);
+
+        // compute summed error after mixing weights
+        float summedError = 0.0;
+        for (size_t i = 0; i < list_of_delta_h.size(); ++i) {
+          summedError += (list_of_losses[i] - list_of_delta_h[i].InnerProduct(averageWeights));
+        }
+
+        cerr << "summed error after mixing weights: " << summedError << " (" << list_of_delta_h.size() << " examples)" << endl;
       }
 #endif
@@ -458,6 +482,13 @@ int main(int argc, char** argv) {
     else
       totalWeights.DivideEquals(iterationsThisEpoch);
 
+#ifdef MPI_ENABLE
+    if (rank == 0) {
+      cerr << "Rank 0, cumulative weights: " << cumulativeWeights << endl;
+      cerr << "Rank 0, total weights: " << totalWeights << endl;
+    }
+#endif
+
     // average across processes
 #ifdef MPI_ENABLE
     mpi::reduce(world, totalWeights, averageTotalWeights, SCCPlus(), 0);
@@ -465,6 +496,7 @@ int main(int argc, char** argv) {
       // average and normalise weights
       averageTotalWeights.DivideEquals(size);
       averageTotalWeights.L1Normalise();
+      cerr << "Rank 0, average total weights: " << averageTotalWeights << endl;
     }
 #endif
 #ifndef MPI_ENABLE
@@ -482,13 +514,24 @@ int main(int argc, char** argv) {
         averageTotalWeights.Save(filename.str());
         ++weightEpochDump;
       }
+
+      // compute summed error after dumping weights
+      float summedError = 0.0;
+      for (size_t i = 0; i < list_of_delta_h.size(); ++i) {
+        summedError += (list_of_losses[i] - list_of_delta_h[i].InnerProduct(averageTotalWeights));
+      }
+
+      cerr << "summed error after dumping weights: " << summedError << " (" << list_of_delta_h.size() << " examples)" << endl;
     }
   }
+
+  list_of_delta_h.clear();
+  list_of_losses.clear();
 }
 
-/*#ifdef MPI_ENABLE
-  mpi::finalize();
-#endif*/
+#ifdef MPI_ENABLE
+  MPI_Finalize();
+#endif
 
   cerr << "Average total weights: " << averageTotalWeights << endl;
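The bookkeeping added above stores one (delta_h, loss) pair per constraint seen during the epoch and, after mixing and after dumping weights, prints the summed error sum_i (loss_i - w . delta_h_i); positive terms correspond to constraints the current weights still violate. A minimal standalone sketch of that computation, with a plain float vector standing in for ScoreComponentCollection and its InnerProduct (illustrative stand-ins, not the Moses API):

    #include <cstdio>
    #include <vector>

    typedef std::vector<float> FeatureVector;  // stand-in for ScoreComponentCollection

    // Stand-in for ScoreComponentCollection::InnerProduct.
    static float innerProduct(const FeatureVector& a, const FeatureVector& b) {
        float sum = 0.0f;
        for (size_t i = 0; i < a.size(); ++i) sum += a[i] * b[i];
        return sum;
    }

    int main() {
        // One (delta_h, loss) pair per constraint, as collected in Main.cpp.
        std::vector<FeatureVector> list_of_delta_h;
        std::vector<float> list_of_losses;
        list_of_delta_h.push_back(FeatureVector(2, 0.5f));   // toy constraint 1
        list_of_losses.push_back(1.0f);
        list_of_delta_h.push_back(FeatureVector(2, -0.25f)); // toy constraint 2
        list_of_losses.push_back(0.4f);

        FeatureVector averageWeights(2, 1.0f);  // mixed/averaged weights

        float summedError = 0.0f;
        for (size_t i = 0; i < list_of_delta_h.size(); ++i)
            summedError += list_of_losses[i] - innerProduct(list_of_delta_h[i], averageWeights);

        // prints: summed error: 0.900000 (2 examples)
        std::printf("summed error: %f (%lu examples)\n",
                    summedError, (unsigned long)list_of_delta_h.size());
        return 0;
    }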
diff --git a/mira/MiraOptimiser.cpp b/mira/MiraOptimiser.cpp
index 0a49a28a9..29d05e3d5 100644
--- a/mira/MiraOptimiser.cpp
+++ b/mira/MiraOptimiser.cpp
@@ -23,8 +23,8 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
       ScoreComponentCollection featureValueDiff = oracleFeatureValues[i];
       featureValueDiff.MinusEquals(featureValues[i][j]);
       float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
-      cerr << "loss of hypothesis: " << losses[i][j] << endl;
-      cerr << "model score difference: " << modelScoreDiff << endl;
+      //cerr << "loss of hypothesis: " << losses[i][j] << endl;
+      //cerr << "model score difference: " << modelScoreDiff << endl;
       float loss = losses[i][j] * m_marginScaleFactor;
 
       bool addConstraint = true;
@@ -49,12 +49,12 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
     }
   }
 
-  cerr << "Number of constraints passed to optimiser: " << featureValueDiffs.size() << endl;
+  //cerr << "Number of constraints passed to optimiser: " << featureValueDiffs.size() << endl;
   if (violatedConstraintsBefore > 0) {
     // TODO: slack?
     // run optimisation
-    cerr << "\nNumber of violated constraints: " << violatedConstraintsBefore << endl;
+    //cerr << "\nNumber of violated constraints: " << violatedConstraintsBefore << endl;
     // compute deltas for all given constraints
     vector< float> alphas;
     if (m_regulariseHildrethUpdates) {
@@ -68,7 +68,7 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
     // * w' = w' + delta * Dh_ij ---> w' = w' + delta * (h(e*) - h(e_ij))
     float sumOfAlphas = 0;
     for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
-      cerr << "alpha " << k << ": " << alphas[k] << endl;
+      //cerr << "alpha " << k << ": " << alphas[k] << endl;
       sumOfAlphas += alphas[k];
 
       // compute update
@@ -78,7 +78,7 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
       currWeights.PlusEquals(featureValueDiffs[k]);
     }
 
-    cerr << "sum of alphas: " << sumOfAlphas << endl;
+    //cerr << "sum of alphas: " << sumOfAlphas << endl;
 
     // sanity check: how many constraints violated after optimisation?
     size_t violatedConstraintsAfter = 0;
@@ -92,11 +92,11 @@ int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
         ++violatedConstraintsAfter;
       }
 
-      cerr << "New model score difference: " << modelScoreDiff << endl;
+      //cerr << "New model score difference: " << modelScoreDiff << endl;
     }
   }
 
-  cerr << "Number of violated constraints after optimisation: " << violatedConstraintsAfter << endl;
+  //cerr << "Number of violated constraints after optimisation: " << violatedConstraintsAfter << endl;
   if (violatedConstraintsAfter > violatedConstraintsBefore) {
     cerr << "Increase: " << violatedConstraintsAfter - violatedConstraintsBefore << endl << endl;
   }
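The MiraOptimiser changes only comment out per-constraint logging; the update logic is untouched. The counts that are still reported come from the margin test: a constraint is violated when the (scaled) loss exceeds the model score difference w . (h(e*) - h(e_ij)). A standalone sketch of that test, with plain vectors instead of ScoreComponentCollection (illustrative names):

    #include <cstddef>
    #include <vector>

    typedef std::vector<float> FeatureVector;  // stand-in for ScoreComponentCollection

    // Counts constraints whose margin is still violated, i.e. where
    // loss > w . featureValueDiff, mirroring the before/after sanity
    // check in MiraOptimiser::updateWeights.
    size_t countViolated(const std::vector<FeatureVector>& featureValueDiffs,
                         const std::vector<float>& losses,
                         const FeatureVector& weights) {
        size_t violated = 0;
        for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
            float modelScoreDiff = 0.0f;
            for (size_t i = 0; i < weights.size(); ++i)
                modelScoreDiff += featureValueDiffs[k][i] * weights[i];
            if (losses[k] > modelScoreDiff)
                ++violated;
        }
        return violated;
    }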
diff --git a/mira/Optimiser.h b/mira/Optimiser.h
index fd78f7435..f7fd46bec 100644
--- a/mira/Optimiser.h
+++ b/mira/Optimiser.h
@@ -41,7 +41,7 @@ namespace Mira {
       const std::vector< std::vector<float> >& scores,
       const std::vector< std::vector<float> >& losses,
       const std::vector<float>& oracleScores)
-    {/* do nothing */}
+    { return 0; }
   };
 
   class Perceptron : public Optimiser {
diff --git a/moses/src/BleuScoreFeature.cpp b/moses/src/BleuScoreFeature.cpp
index db2d6ba69..78eb8674f 100644
--- a/moses/src/BleuScoreFeature.cpp
+++ b/moses/src/BleuScoreFeature.cpp
@@ -80,9 +80,10 @@ BleuScoreFeature::BleuScoreFeature():
   m_ref_length_history(0),
   m_use_scaled_reference(true),
   m_scale_by_input_length(true),
-  m_increase_BP(false) {}
+  m_increase_BP(false),
+  m_historySmoothing(0.9) {}
 
-BleuScoreFeature::BleuScoreFeature(bool useScaledReference, bool scaleByInputLength, bool increaseBP):
+BleuScoreFeature::BleuScoreFeature(bool useScaledReference, bool scaleByInputLength, bool increaseBP, float historySmoothing):
   StatefulFeatureFunction("BleuScore"),
   m_count_history(BleuScoreState::bleu_order),
   m_match_history(BleuScoreState::bleu_order),
@@ -91,7 +92,8 @@ BleuScoreFeature::BleuScoreFeature(bool useScaledReference, bool scaleByInputLen
   m_ref_length_history(0),
   m_use_scaled_reference(useScaledReference),
   m_scale_by_input_length(scaleByInputLength),
-  m_increase_BP(increaseBP) {}
+  m_increase_BP(increaseBP),
+  m_historySmoothing(historySmoothing) {}
 
 void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::string > >& refs)
 {
@@ -147,15 +149,15 @@ void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) {
 
   // update counts and matches for every ngram length with counts from hypo
   for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
-    m_count_history[i] = 0.9 * (m_count_history[i] + ngram_counts[i]);
-    m_match_history[i] = 0.9 * (m_match_history[i] + ngram_matches[i]);
+    m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]);
+    m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]);
     //cerr << "precisionHistory " << i + 1 << ": " << (m_match_history[i]/m_count_history[i]) << " (" << m_match_history[i] << "/" << m_count_history[i] << ")" << endl;
   }
 
   // update counts for reference and target length
-  m_source_length_history = 0.9 * (m_source_length_history + m_cur_source_length);
-  m_target_length_history = 0.9 * (m_target_length_history + hypo.size());
-  m_ref_length_history = 0.9 * (m_ref_length_history + m_cur_ref_length);
+  m_source_length_history = m_historySmoothing * (m_source_length_history + m_cur_source_length);
+  m_target_length_history = m_historySmoothing * (m_target_length_history + hypo.size());
+  m_ref_length_history = m_historySmoothing * (m_ref_length_history + m_cur_ref_length);
 }
 
 /*
@@ -171,6 +173,8 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
     size_t cur_source_length = sourceLengths[batchPosition];
     size_t cur_ref_length = m_refs[ref_ids[batchPosition]].first;
     NGrams cur_ref_ngrams = m_refs[ref_ids[batchPosition]].second;
+    cerr << "reference length: " << cur_ref_length << endl;
+
     // compute vector c(e;{r_k}):
     // vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k
     GetNgramMatchCounts(phrase, cur_ref_ngrams, ngram_counts, ngram_matches, 0);
@@ -182,8 +186,8 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
 
       // do this for last position in batch
       if (batchPosition == hypos.size() - 1) {
-        m_count_history[i] *= 0.9;
-        m_match_history[i] *= 0.9;
+        m_count_history[i] *= m_historySmoothing;
+        m_match_history[i] *= m_historySmoothing;
       }
     }
 
@@ -194,9 +198,9 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
 
       // do this for last position in batch
       if (batchPosition == hypos.size() - 1) {
-        m_source_length_history *= 0.9;
-        m_target_length_history *= 0.9;
-        m_ref_length_history *= 0.9;
+        m_source_length_history *= m_historySmoothing;
+        m_target_length_history *= m_historySmoothing;
+        m_ref_length_history *= m_historySmoothing;
       }
     }
   }
 }
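One subtlety in the BleuScoreFeature.cpp hunks: the single-sentence UpdateHistory applies m_historySmoothing while folding in each new hypothesis, whereas the batch variant decays the accumulated statistics only once, at the last batch position. The accumulation itself happens outside the hunks shown, so the += line below is an assumption about that surrounding code; the rest mirrors the decay logic above (illustrative names, plain C++):

    #include <cstddef>
    #include <vector>

    // Batch variant sketch: fold in raw counts for every hypothesis in the
    // batch, then multiply by the smoothing factor once per batch.
    void updateHistoryBatch(std::vector<float>& countHistory,
                            const std::vector<std::vector<float> >& batchCounts,
                            float historySmoothing) {
        for (size_t batchPosition = 0; batchPosition < batchCounts.size(); ++batchPosition) {
            for (size_t i = 0; i < countHistory.size(); ++i) {
                countHistory[i] += batchCounts[batchPosition][i];  // assumed accumulation
                if (batchPosition == batchCounts.size() - 1)
                    countHistory[i] *= historySmoothing;           // decay once, as in the hunk
            }
        }
    }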
diff --git a/moses/src/BleuScoreFeature.h b/moses/src/BleuScoreFeature.h
index 1400ce5a3..0e8768ee6 100644
--- a/moses/src/BleuScoreFeature.h
+++ b/moses/src/BleuScoreFeature.h
@@ -45,7 +45,7 @@ typedef std::map< Phrase, size_t > NGrams;
 class BleuScoreFeature : public StatefulFeatureFunction {
 public:
   BleuScoreFeature();
-  BleuScoreFeature(bool useScaledReference, bool scaleByInputLength, bool increaseBP);
+  BleuScoreFeature(bool useScaledReference, bool scaleByInputLength, bool increaseBP, float historySmoothing);
 
   std::string GetScoreProducerDescription() const
   {
@@ -94,6 +94,8 @@ private:
   // increase penalty for short translations
   bool m_increase_BP;
 
+  float m_historySmoothing;
+
   // counts for pseudo-document big_O
   std::vector< float > m_count_history;
   std::vector< float > m_match_history;
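On the mpi_finalize half of this commit: Boost.MPI does not expose a free mpi::finalize() function, which is presumably why the old block was commented out; the patch therefore includes mpi.h and calls MPI_Finalize() directly. One caveat: if Main.cpp already constructs a boost::mpi::environment (the mpi::reduce/mpi::broadcast calls on a world communicator suggest it does), that object's destructor calls MPI_Finalize itself, and finalizing twice is an MPI error. A minimal sketch of the environment-managed pattern, under that assumption, where no explicit MPI_Finalize() is needed:

    #include <iostream>
    #ifdef MPI_ENABLE
    #include <boost/mpi/environment.hpp>
    #include <boost/mpi/communicator.hpp>
    namespace mpi = boost::mpi;
    #endif

    int main(int argc, char** argv) {
    #ifdef MPI_ENABLE
        mpi::environment env(argc, argv);  // calls MPI_Init
        mpi::communicator world;           // defaults to MPI_COMM_WORLD
        std::cerr << "rank " << world.rank() << " of " << world.size() << std::endl;
    #endif
        // ... training loop, mpi::reduce / mpi::broadcast as in Main.cpp ...
        return 0;  // env's destructor runs here and calls MPI_Finalize
    }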