From 01c2b001b1008d94297008a0ce7e6682a7360b99 Mon Sep 17 00:00:00 2001 From: Arianna Bisazza Date: Thu, 21 Jun 2012 17:41:05 +0200 Subject: [PATCH] FIXED OPTION FOR FUTURE DISTORTION COST [Moore & Quirk 2007] Summary of changes 1) DistortionScoreProducer::CalculateDistortionScore The previous implementation had the following bugs: - wasn't correctly converting size_t to int - in the initial (empty) hypothesis, prefixEndPos was -2 instead of -1 - the number of words between phrases was always one too many 2) DistortionScoreProducer::Evaluate The new distortion state was assigned the first gap of the old hypothesis (it should be the current hypothesis's) 3) WordsRange::GetNumWordsBetween It returned one word too many. For instance, the number of words between [0..1] and [2..3] was 1, now it's 0. 4) Parameter.cpp, StaticData.cpp, StaticData.h Added binary option to activate future distortion cost (fdc) and corresponding StaticData's variable --- moses/src/DummyScoreProducers.cpp | 53 ++++++++++++++++++++----------- moses/src/Parameter.cpp | 1 + moses/src/StaticData.cpp | 3 ++ moses/src/StaticData.h | 5 +++ moses/src/WordsRange.h | 4 +-- 5 files changed, 46 insertions(+), 20 deletions(-) diff --git a/moses/src/DummyScoreProducers.cpp b/moses/src/DummyScoreProducers.cpp index 3abdbf2e0..6fbcda77d 100644 --- a/moses/src/DummyScoreProducers.cpp +++ b/moses/src/DummyScoreProducers.cpp @@ -61,28 +61,45 @@ std::string DistortionScoreProducer::GetScoreProducerWeightShortName(unsigned) c float DistortionScoreProducer::CalculateDistortionScore(const Hypothesis& hypo, const WordsRange &prev, const WordsRange &curr, const int FirstGap) const { - const int USE_OLD = 1; - if (USE_OLD) { + if(!StaticData::Instance().UseFutureDistortionCost()) { return - (float) hypo.GetInput().ComputeDistortionDistance(prev, curr); } + else { + /* Pay distortion score as soon as possible, from Moore and Quirk MT Summit 2007 + Definitions: + S : current source range + S' : last translated source phrase range + S'' : longest 
fully-translated initial segment + */ - // Pay distortion score as soon as possible, from Moore and Quirk MT Summit 2007 + int prefixEndPos = (int)FirstGap-1; + if((int)FirstGap==-1) + prefixEndPos = -1; - int prefixEndPos = FirstGap-1; - if ((int) curr.GetStartPos() == prefixEndPos+1) { - return 0; + // case1: S is adjacent to S'' => return 0 + if ((int) curr.GetStartPos() == prefixEndPos+1) { + IFVERBOSE(4) std::cerr<< "MQ07disto:case1" << std::endl; + return 0; + } + + // case2: S is to the left of S' => return 2(length(S)) + if ((int) curr.GetEndPos() < (int) prev.GetEndPos()) { + IFVERBOSE(4) std::cerr<< "MQ07disto:case2" << std::endl; + return (float) -2*(int)curr.GetNumWordsCovered(); + } + + // case3: S' is a subsequence of S'' => return 2(nbWordBetween(S,S'')+length(S)) + if ((int) prev.GetEndPos() <= prefixEndPos) { + IFVERBOSE(4) std::cerr<< "MQ07disto:case3" << std::endl; + int z = (int)curr.GetStartPos()-prefixEndPos - 1; + return (float) -2*(z + (int)curr.GetNumWordsCovered()); + } + + // case4: otherwise => return 2(nbWordBetween(S,S')+length(S)) + IFVERBOSE(4) std::cerr<< "MQ07disto:case4" << std::endl; + return (float) -2*((int)curr.GetNumWordsBetween(prev) + (int)curr.GetNumWordsCovered()); + } - - if ((int) curr.GetEndPos() < (int) prev.GetEndPos()) { - return (float) -2*curr.GetNumWordsCovered(); - } - - if ((int) prev.GetEndPos() <= prefixEndPos) { - int z = curr.GetStartPos()-prefixEndPos; - return (float) -2*(z + curr.GetNumWordsCovered()); - } - - return (float) -2*(curr.GetNumWordsBetween(prev) + curr.GetNumWordsCovered()); } size_t DistortionScoreProducer::GetNumInputScores() const @@ -104,7 +121,7 @@ FFState* DistortionScoreProducer::Evaluate( out->PlusEquals(this, distortionScore); DistortionState_traditional* res = new DistortionState_traditional( hypo.GetCurrSourceWordsRange(), - hypo.GetPrevHypo()->GetWordsBitmap().GetFirstGapPos()); + hypo.GetWordsBitmap().GetFirstGapPos()); return res; } diff --git a/moses/src/Parameter.cpp 
b/moses/src/Parameter.cpp index c1f470f21..97131b6c2 100644 --- a/moses/src/Parameter.cpp +++ b/moses/src/Parameter.cpp @@ -98,6 +98,7 @@ Parameter::Parameter() AddParam("monotone-at-punctuation", "mp", "do not reorder over punctuation"); AddParam("distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables"); AddParam("distortion", "configurations for each factorized/lexicalized reordering model."); + AddParam("future-distortion-cost", "fdc", "include estimate of future cost in the distortion penalty [Moore & Quirk 2007]. Default is no"); AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'ignore'"); AddParam("xml-brackets", "xb", "specify strings to be used as xml tags opening and closing, e.g. \"{{ }}\" (default \"< >\"). Avoid square brackets because of configuration file format. Valid only with text input mode" ); AddParam("minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation"); diff --git a/moses/src/StaticData.cpp b/moses/src/StaticData.cpp index 3b1a3ad10..764ff2803 100644 --- a/moses/src/StaticData.cpp +++ b/moses/src/StaticData.cpp @@ -359,6 +359,9 @@ bool StaticData::LoadData(Parameter *parameter) SetBooleanParameter(&m_cubePruningLazyScoring, "cube-pruning-lazy-scoring", false); + // include future distortion cost in distortion penalty + SetBooleanParameter( &m_useFutureDistortionCost, "future-distortion-cost", false ); + // unknown word processing SetBooleanParameter( &m_dropUnknown, "drop-unknown", false ); diff --git a/moses/src/StaticData.h b/moses/src/StaticData.h index cf3ba33d8..eb5a2a34f 100644 --- a/moses/src/StaticData.h +++ b/moses/src/StaticData.h @@ -97,12 +97,14 @@ protected: m_translationOptionThreshold, m_wordDeletionWeight; + // PhraseTrans, Generation & LanguageModelScore has multiple weights. 
int m_maxDistortion; // do it differently from old pharaoh // -ve = no limit on distortion // 0 = no disortion (monotone in old pharaoh) bool m_reorderingConstraint; //! use additional reordering constraints + bool m_useFutureDistortionCost; size_t m_maxHypoStackSize //! hypothesis-stack size that triggers pruning , m_minHypoStackDiversity //! minimum number of hypothesis in stack for each source word coverage @@ -338,6 +340,9 @@ public: bool UseEarlyDiscarding() const { return m_earlyDiscardingThreshold != -std::numeric_limits::infinity(); } + bool UseFutureDistortionCost() const { + return m_useFutureDistortionCost; + } float GetTranslationOptionThreshold() const { return m_translationOptionThreshold; } diff --git a/moses/src/WordsRange.h b/moses/src/WordsRange.h index 7191d259e..7470ec1ee 100644 --- a/moses/src/WordsRange.h +++ b/moses/src/WordsRange.h @@ -78,10 +78,10 @@ public: CHECK(!Overlap(x)); if (x.m_endPos < m_startPos) { - return m_startPos - x.m_endPos; + return m_startPos - x.m_endPos - 1; } - return x.m_startPos - m_endPos; + return x.m_startPos - m_endPos - 1; }