From b368085609b1d638de571e963295117003defc91 Mon Sep 17 00:00:00 2001 From: phikoehn Date: Thu, 15 Aug 2013 11:46:45 +0100 Subject: [PATCH] xml constraint --- Jamroot | 6 +-- moses/Parameter.cpp | 2 +- moses/StaticData.cpp | 3 +- moses/TranslationOptionCollection.cpp | 17 ++++++-- moses/TranslationOptionCollection.h | 3 ++ moses/TranslationOptionCollectionText.cpp | 47 ++++++++++++++++++++++- moses/TranslationOptionCollectionText.h | 2 +- moses/TypeDef.h | 9 +++-- scripts/ems/experiment.meta | 4 +- 9 files changed, 75 insertions(+), 18 deletions(-) diff --git a/Jamroot b/Jamroot index 1f782261c..01546e9e2 100644 --- a/Jamroot +++ b/Jamroot @@ -122,12 +122,12 @@ project : requirements ; #Add directories here if you want their incidental targets too (i.e. tests). -build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses-chart-cmd mira scripts regression-testing ; +build-projects lm util phrase-extract search moses moses/LM mert moses-cmd socket-moses-cmd moses-chart-cmd mira scripts regression-testing ; -alias programs : lm//programs moses-chart-cmd//moses_chart moses-cmd//programs OnDiskPt//CreateOnDiskPt OnDiskPt//queryOnDiskPt mert//programs misc//programs symal phrase-extract phrase-extract//lexical-reordering phrase-extract//extract-ghkm phrase-extract//pcfg-extract phrase-extract//pcfg-score biconcor mira//mira contrib/server//mosesserver ; +alias programs : lm//programs moses-chart-cmd//moses_chart moses-cmd//programs socket-moses-cmd//programs OnDiskPt//CreateOnDiskPt OnDiskPt//queryOnDiskPt mert//programs misc//programs symal phrase-extract phrase-extract//lexical-reordering phrase-extract//extract-ghkm phrase-extract//pcfg-extract phrase-extract//pcfg-score biconcor mira//mira contrib/server//mosesserver ; install-bin-libs programs ; -install-headers headers-base : [ path.glob-tree biconcor contrib lm mert misc moses-chart-cmd moses-cmd OnDiskPt phrase-extract symal util : *.hh *.h ] : . ; +install-headers headers-base : [ path.glob-tree biconcor contrib lm mert misc moses-chart-cmd moses-cmd socket-moses-cmd OnDiskPt phrase-extract symal util : *.hh *.h ] : . ; install-headers headers-moses : moses//headers-to-install : moses ; alias install : prefix-bin prefix-lib headers-base headers-moses ; diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 697e46b9d..c68d265c5 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -77,7 +77,7 @@ Parameter::Parameter() AddParam("distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables"); AddParam("distortion", "configurations for each factorized/lexicalized reordering model."); AddParam("early-distortion-cost", "edc", "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. Default is no"); - AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'ignore'"); + AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'constraint', 'ignore'"); AddParam("xml-brackets", "xb", "specify strings to be used as xml tags opening and closing, e.g. \"{{ }}\" (default \"< >\"). Avoid square brackets because of configuration file format. Valid only with text input mode" ); AddParam("minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation"); AddParam("lminimum-bayes-risk", "lmbr", "use lattice miminum Bayes risk to determine best translation"); diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index a80fe8f92..1954929f9 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -487,10 +487,11 @@ bool StaticData::LoadData(Parameter *parameter) if (m_parameter->GetParam("xml-input").size() == 0) m_xmlInputType = XmlPassThrough; else if (m_parameter->GetParam("xml-input")[0]=="exclusive") m_xmlInputType = XmlExclusive; else if (m_parameter->GetParam("xml-input")[0]=="inclusive") m_xmlInputType = XmlInclusive; + else if (m_parameter->GetParam("xml-input")[0]=="constraint") m_xmlInputType = XmlConstraint; else if (m_parameter->GetParam("xml-input")[0]=="ignore") m_xmlInputType = XmlIgnore; else if (m_parameter->GetParam("xml-input")[0]=="pass-through") m_xmlInputType = XmlPassThrough; else { - UserMessage::Add("invalid xml-input value, must be pass-through, exclusive, inclusive, or ignore"); + UserMessage::Add("invalid xml-input value, must be pass-through, exclusive, inclusive, constraint, or ignore"); return false; } diff --git a/moses/TranslationOptionCollection.cpp b/moses/TranslationOptionCollection.cpp index 4223934e3..02e5c3638 100644 --- a/moses/TranslationOptionCollection.cpp +++ b/moses/TranslationOptionCollection.cpp @@ -569,7 +569,6 @@ void TranslationOptionCollection::Sort() * called by CreateTranslationOptionsForRange() * \param startPos first position in input sentence * \param lastPos last position in input sentence - * \param adhereTableLimit whether phrase & generation table limits are adhered to */ bool TranslationOptionCollection::HasXmlOptionsOverlappingRange(size_t, size_t) const { @@ -577,6 +576,18 @@ bool TranslationOptionCollection::HasXmlOptionsOverlappingRange(size_t, size_t) //not implemented for base class } +/** Check if an option conflicts with any constraint XML options. Okay, if XML option is substring in source and target. + * by default, we don't support XML options. subclasses need to override this function. + * called by CreateTranslationOptionsForRange() + * \param startPos first position in input sentence + * \param lastPos last position in input sentence + */ +bool TranslationOptionCollection::ViolatesXmlOptionsConstraint(size_t, size_t, TranslationOption *) const +{ + return false; + //not implemented for base class +} + /** Populates the current Collection with XML options exactly covering the range specified. Default implementation does nothing. * called by CreateTranslationOptionsForRange() * \param startPos first position in input sentence @@ -588,9 +599,7 @@ void TranslationOptionCollection::CreateXmlOptionsForRange(size_t, size_t) }; - - -/** add translation option to the list +/** Add translation option to the list * \param translationOption translation option to be added */ void TranslationOptionCollection::Add(TranslationOption *translationOption) { diff --git a/moses/TranslationOptionCollection.h b/moses/TranslationOptionCollection.h index d8e839a50..cf69a2897 100644 --- a/moses/TranslationOptionCollection.h +++ b/moses/TranslationOptionCollection.h @@ -140,6 +140,9 @@ public: //!Check if this range has XML options virtual bool HasXmlOptionsOverlappingRange(size_t startPosition, size_t endPosition) const; + //! Check if a subsumed XML option constraint is satisfied + virtual bool ViolatesXmlOptionsConstraint(size_t startPosition, size_t endPosition, TranslationOption *transOpt) const; + //! Create xml-based translation options for the specific input span virtual void CreateXmlOptionsForRange(size_t startPosition, size_t endPosition); diff --git a/moses/TranslationOptionCollectionText.cpp b/moses/TranslationOptionCollectionText.cpp index f403c0fb6..7b01c2863 100644 --- a/moses/TranslationOptionCollectionText.cpp +++ b/moses/TranslationOptionCollectionText.cpp @@ -79,6 +79,51 @@ bool TranslationOptionCollectionText::HasXmlOptionsOverlappingRange(size_t start return source.XmlOverlap(startPosition,endPosition); } +/** + * Check if the given translation option violates a specified xml Option + */ +bool TranslationOptionCollectionText::ViolatesXmlOptionsConstraint(size_t startPosition, size_t endPosition, TranslationOption *transOpt) const +{ + // skip if there is no overlap + Sentence const& source=dynamic_cast(m_source); + if (!source.XmlOverlap(startPosition,endPosition)) { + return false; + } + // check for all sub spans + for(size_t start=startPosition; start<=endPosition; start++) { + for(size_t end=start; end<=endPosition; end++) { + // get list of xml options for the subspan + vector xmlOptions; + source.GetXmlTranslationOptions(xmlOptions,start,end); + if (xmlOptions.size() > 0) { + // check if any xml option matches + for(size_t i=0; iGetTargetPhrase(); + const TargetPhrase &xmlPhrase = xmlOptions[i]->GetTargetPhrase(); + // check this xml option (if shorter) + if (phrase.GetSize() >= xmlPhrase.GetSize()) { + // match may start in middle of phrase + for(size_t offset=0; offset <= phrase.GetSize()-xmlPhrase.GetSize(); offset++) { + bool match = true; + // match every word (only surface factor) + for(size_t wordPos=0; match && wordPos < xmlPhrase.GetSize(); wordPos++) { + if (phrase.GetFactor( wordPos+offset,0 )->Compare(*(xmlPhrase.GetFactor( wordPos,0 )))) { + match = false; + } + } + if (match) { + return false; // no violation if matching xml option found + } + } + } + } + return true; // there were xml options for this range, but none matched + } + } + } + return false; +} + /** * Create xml-based translation options for the specific input span */ @@ -140,5 +185,3 @@ void TranslationOptionCollectionText::CreateTranslationOptionsForRange( } - - diff --git a/moses/TranslationOptionCollectionText.h b/moses/TranslationOptionCollectionText.h index 6f5f46424..6ba5598ef 100644 --- a/moses/TranslationOptionCollectionText.h +++ b/moses/TranslationOptionCollectionText.h @@ -51,7 +51,7 @@ public: TranslationOptionCollectionText(Sentence const& input, size_t maxNoTransOptPerCoverage, float translationOptionThreshold); bool HasXmlOptionsOverlappingRange(size_t startPosition, size_t endPosition) const; - + bool ViolatesXmlOptionsConstraint(size_t startPosition, size_t endPosition, TranslationOption *transOpt) const; void CreateXmlOptionsForRange(size_t startPosition, size_t endPosition); void CreateTranslationOptions(); diff --git a/moses/TypeDef.h b/moses/TypeDef.h index af3a47b23..122c3d1dc 100644 --- a/moses/TypeDef.h +++ b/moses/TypeDef.h @@ -134,10 +134,11 @@ enum InputTypeEnum { }; enum XmlInputType { - XmlPassThrough = 0, - XmlIgnore = 1, - XmlExclusive = 2, - XmlInclusive = 3 + XmlPassThrough = 0, + XmlIgnore = 1, + XmlExclusive = 2, + XmlInclusive = 3, + XmlConstraint = 4 }; enum DictionaryFind { diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index fb385f47c..479b59086 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -515,8 +515,8 @@ build-osm in: corpus word-alignment out: osm-model ignore-unless: operation-sequence-model - rerun-on-change: operation-sequence-model training-options script giza-settings - template: $moses-script-dir/OSM/OSM-Train.sh IN0.$output-extension IN0.$input-extension IN1.$alignment-symmetrization-method $operation-sequence-model-order OUT $moses-src-dir $srilm-dir + rerun-on-change: operation-sequence-model training-options script giza-settings operation-sequence-model-settings + template: $moses-script-dir/OSM/OSM-Train.perl --corpus-f IN0.$input-extension --corpus-e IN0.$output-extension --alignment IN1.$alignment-symmetrization-method --order $operation-sequence-model-order --out-dir OUT --moses-src-dir $moses-src-dir --srilm-dir $srilm-dir $operation-sequence-model-settings default-name: model/OSM extract-phrases in: corpus-mml-postfilter=OR=word-alignment scored-corpus