diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index c2d8d3363..10db90d1b 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -42,6 +42,7 @@ #include "moses/FF/ControlRecombination.h" #include "moses/FF/ConstrainedDecoding.h" #include "moses/FF/SoftSourceSyntacticConstraintsFeature.h" +#include "moses/FF/TargetConstituentAdjacencyFeature.h" #include "moses/FF/TargetPreferencesFeature.h" #include "moses/FF/CoveredReferenceFeature.h" #include "moses/FF/TreeStructureFeature.h" @@ -264,6 +265,7 @@ FeatureRegistry::FeatureRegistry() MOSES_FNAME(CoveredReferenceFeature); MOSES_FNAME(SourceGHKMTreeInputMatchFeature); MOSES_FNAME(SoftSourceSyntacticConstraintsFeature); + MOSES_FNAME(TargetConstituentAdjacencyFeature); MOSES_FNAME(TargetPreferencesFeature); MOSES_FNAME(TreeStructureFeature); MOSES_FNAME(SoftMatchingFeature); diff --git a/moses/FF/TargetConstituentAdjacencyFeature.cpp b/moses/FF/TargetConstituentAdjacencyFeature.cpp new file mode 100644 index 000000000..e136b4b27 --- /dev/null +++ b/moses/FF/TargetConstituentAdjacencyFeature.cpp @@ -0,0 +1,189 @@ +#include "TargetConstituentAdjacencyFeature.h" +#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h" +#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h" +#include "moses/StaticData.h" +#include "moses/ScoreComponentCollection.h" +#include "moses/Hypothesis.h" +#include "moses/FactorCollection.h" +#include "moses/TreeInput.h" +#include + + +using namespace std; + +namespace Moses +{ + +size_t TargetConstituentAdjacencyFeatureState::hash() const +{ + if (m_recombine) { + return 0; + } + size_t ret = 0; + boost::hash_combine(ret, m_collection.size()); + for (std::map::const_iterator it=m_collection.begin(); + it!=m_collection.end(); ++it) { + boost::hash_combine(ret, it->first); + } + return ret; +}; + +bool TargetConstituentAdjacencyFeatureState::operator==(const FFState& other) const +{ + if (m_recombine) { + return true; + } + + if (this == &other) { + return true; + } + + const TargetConstituentAdjacencyFeatureState* otherState = + dynamic_cast(&other); + UTIL_THROW_IF2(otherState == NULL, "Wrong state type"); + + if (m_collection.size() != (otherState->m_collection).size()) { + return false; + } + std::map::const_iterator thisIt, otherIt; + for (thisIt=m_collection.begin(), otherIt=(otherState->m_collection).begin(); + thisIt!=m_collection.end(); ++thisIt, ++otherIt) { + if (thisIt->first != otherIt->first) { + return false; + } + } + return true; +}; + + +TargetConstituentAdjacencyFeature::TargetConstituentAdjacencyFeature(const std::string &line) + : StatefulFeatureFunction(2, line) + , m_featureVariant(0) + , m_recombine(false) +{ + VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ..."); + ReadParameters(); + VERBOSE(1, " Done." << std::endl); + VERBOSE(1, " Feature variant: " << m_featureVariant << "." << std::endl); +} + + +void TargetConstituentAdjacencyFeature::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "variant") { + m_featureVariant = Scan(value); + } else if (key == "recombine") { + m_recombine = Scan(value); + } else { + StatefulFeatureFunction::SetParameter(key, value); + } +} + + +FFState* TargetConstituentAdjacencyFeature::EvaluateWhenApplied( + const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const +{ + // dense scores + std::vector newScores(m_numScoreComponents,0); // m_numScoreComponents == 2 + + // state + const TargetConstituentAdjacencyFeatureState *prevState = static_cast(prev_state); + + // read TargetConstituentAdjacency property + const TargetPhrase &currTarPhr = cur_hypo.GetCurrTargetPhrase(); + FEATUREVERBOSE(2, "Phrase: " << currTarPhr << std::endl); + + if (const PhraseProperty *property = currTarPhr.GetProperty("TargetConstituentBoundariesLeft")) { + + const TargetConstituentBoundariesLeftPhraseProperty *targetConstituentBoundariesLeftPhraseProperty = + static_cast(property); + const TargetConstituentBoundariesLeftCollection& leftConstituentCollection = + targetConstituentBoundariesLeftPhraseProperty->GetCollection(); + float prob = 0; + size_t numMatch = 0; + size_t numOverall = 0; + + if ( !cur_hypo.GetPrevHypo()->GetPrevHypo() ) { + // previous hypothesis is initial, i.e. target sentence starts here + + ++numOverall; + FactorCollection &factorCollection = FactorCollection::Instance(); + const Factor* bosFactor = factorCollection.AddFactor("BOS_",false); + TargetConstituentBoundariesLeftCollection::const_iterator found = + leftConstituentCollection.find(bosFactor); + if ( found != leftConstituentCollection.end() ) { + ++numMatch; + prob += found->second; + } + + } else { + + const std::map& hypConstituentCollection = prevState->m_collection; + std::map::const_iterator iter1 = hypConstituentCollection.begin(); + std::map::const_iterator iter2 = leftConstituentCollection.begin(); + while ( iter1 != hypConstituentCollection.end() && iter2 != leftConstituentCollection.end() ) { + ++numOverall; + if ( iter1->first < iter2->first ) { + ++iter1; + } else if ( iter2->first < iter1->first ) { + ++iter2; + } else { + ++numMatch; + float currProb = iter1->second * iter2->second; + if (currProb > prob) + prob = currProb; + ++iter1; + ++iter2; + } + } + } + + if ( (numMatch == 0) || (prob == 0) ) { + ++newScores[1]; + } else { + if ( m_featureVariant == 1 ) { + newScores[0] += TransformScore(prob); + } else { + newScores[0] += TransformScore( (float)numMatch/numOverall ); + } + } + + } else { + + // abort with error message if the phrase does not translate an unknown word + UTIL_THROW_IF2(!currTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription() + << ": Missing TargetConstituentBoundariesLeft property."); + + ++newScores[1]; + + } + + TargetConstituentAdjacencyFeatureState *newState = new TargetConstituentAdjacencyFeatureState(m_recombine); + + if (const PhraseProperty *property = currTarPhr.GetProperty("TargetConstituentBoundariesRightAdjacent")) { + + const TargetConstituentBoundariesRightAdjacentPhraseProperty *targetConstituentBoundariesRightAdjacentPhraseProperty = + static_cast(property); + const TargetConstituentBoundariesLeftCollection& rightAdjacentConstituentCollection = targetConstituentBoundariesRightAdjacentPhraseProperty->GetCollection(); + + std::copy(rightAdjacentConstituentCollection.begin(), rightAdjacentConstituentCollection.end(), + std::inserter(newState->m_collection, newState->m_collection.begin())); + + } else { + + // abort with error message if the phrase does not translate an unknown word + UTIL_THROW_IF2(!currTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription() + << ": Missing TargetConstituentBoundariesRightAdjacent property."); + + } + + // add scores + accumulator->PlusEquals(this, newScores); + + return newState; +} + +} + diff --git a/moses/FF/TargetConstituentAdjacencyFeature.h b/moses/FF/TargetConstituentAdjacencyFeature.h new file mode 100644 index 000000000..f5305e0df --- /dev/null +++ b/moses/FF/TargetConstituentAdjacencyFeature.h @@ -0,0 +1,101 @@ +#pragma once + +#include +#include +#include +#include +#include "StatefulFeatureFunction.h" +#include "FFState.h" +#include "util/exception.hh" +#include + +namespace Moses +{ + +class TargetConstituentAdjacencyFeatureState : public FFState +{ + +public: + + friend class TargetConstituentAdjacencyFeature; + + TargetConstituentAdjacencyFeatureState(bool recombine) + : m_recombine(recombine) + {}; + + size_t hash() const; + + virtual bool operator==(const FFState& other) const; + +private: + + const bool m_recombine; + std::map m_collection; + +}; + + +class TargetConstituentAdjacencyFeature : public StatefulFeatureFunction +{ + +public: + + TargetConstituentAdjacencyFeature(const std::string &line); + + ~TargetConstituentAdjacencyFeature() + {}; + + bool IsUseable(const FactorMask &mask) const { + return true; + }; + + virtual const FFState* EmptyHypothesisState(const InputType &input) const { + return new TargetConstituentAdjacencyFeatureState(m_recombine); + }; + + void SetParameter(const std::string& key, const std::string& value); + + void Load(AllOptions::ptr const& opts) + {}; + + void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const + {}; + + void EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedFutureScore = NULL) const + {}; + + void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const + {}; + + FFState* EvaluateWhenApplied( + const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const; + + FFState* EvaluateWhenApplied( + const ChartHypothesis& cur_hypo, + int featureID, // used to index the state in the previous hypotheses + ScoreComponentCollection* accumulator) const { + UTIL_THROW2(GetScoreProducerDescription() << ": feature currently not implemented for chart-based decoding."); + return new TargetConstituentAdjacencyFeatureState(m_recombine); + }; + + +private: + + size_t m_featureVariant; + bool m_recombine; + +}; + +} + diff --git a/moses/Factor.h b/moses/Factor.h index f4bb2074d..dce10ac90 100644 --- a/moses/Factor.h +++ b/moses/Factor.h @@ -19,8 +19,7 @@ License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ -#ifndef moses_Factor_h -#define moses_Factor_h +#pragma once #include #include @@ -98,4 +97,4 @@ public: size_t hash_value(const Factor &f); } -#endif + diff --git a/moses/PP/Factory.cpp b/moses/PP/Factory.cpp index 72c927072..46ca7d362 100644 --- a/moses/PP/Factory.cpp +++ b/moses/PP/Factory.cpp @@ -11,6 +11,8 @@ #include "moses/PP/SpanLengthPhraseProperty.h" #include "moses/PP/NonTermContextProperty.h" #include "moses/PP/OrientationPhraseProperty.h" +#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h" +#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h" namespace Moses { @@ -58,6 +60,8 @@ PhrasePropertyFactory::PhrasePropertyFactory() MOSES_PNAME2("Counts", CountsPhraseProperty); MOSES_PNAME2("SourceLabels", SourceLabelsPhraseProperty); + MOSES_PNAME2("TargetConstituentBoundariesLeft", TargetConstituentBoundariesLeftPhraseProperty); + MOSES_PNAME2("TargetConstituentBoundariesRightAdjacent", TargetConstituentBoundariesRightAdjacentPhraseProperty); MOSES_PNAME2("TargetPreferences", TargetPreferencesPhraseProperty); MOSES_PNAME2("Tree",TreeStructurePhraseProperty); MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty); diff --git a/moses/PP/PhraseProperty.cpp b/moses/PP/PhraseProperty.cpp index 69e3c3374..4224e62dd 100644 --- a/moses/PP/PhraseProperty.cpp +++ b/moses/PP/PhraseProperty.cpp @@ -5,9 +5,14 @@ namespace Moses std::ostream& operator<<(std::ostream &out, const PhraseProperty &obj) { - out << "Base phrase property"; + obj.Print(out); return out; } +void PhraseProperty::Print(std::ostream &out) const +{ + out << "Base phrase property"; +} + } diff --git a/moses/PP/PhraseProperty.h b/moses/PP/PhraseProperty.h index 76c294481..eef5be688 100644 --- a/moses/PP/PhraseProperty.h +++ b/moses/PP/PhraseProperty.h @@ -28,6 +28,8 @@ public: protected: + virtual void Print(std::ostream& out) const; + std::string *m_value; }; diff --git a/moses/PP/TargetConstituentBoundariesLeftPhraseProperty.cpp b/moses/PP/TargetConstituentBoundariesLeftPhraseProperty.cpp new file mode 100644 index 000000000..e3a0917ea --- /dev/null +++ b/moses/PP/TargetConstituentBoundariesLeftPhraseProperty.cpp @@ -0,0 +1,63 @@ +#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h" +#include "moses/FactorCollection.h" +#include "moses/Util.h" +#include +#include +#include + +namespace Moses +{ + +void TargetConstituentBoundariesLeftPhraseProperty::ProcessValue(const std::string &value) +{ + FactorCollection &factorCollection = FactorCollection::Instance(); + std::vector tokens; + Tokenize(tokens, value, " "); + std::vector::const_iterator tokenIter = tokens.begin(); + while (tokenIter != tokens.end()) { + try { + + std::vector constituents; + Tokenize(constituents, *tokenIter, "<"); + ++tokenIter; + float count = std::atof( tokenIter->c_str() ); + ++tokenIter; + + std::set dedup; + + for ( std::vector::iterator constituentIter = constituents.begin(); + constituentIter != constituents.end(); ++constituentIter ) { + + const Factor* constituentFactor = factorCollection.AddFactor(*constituentIter,false); + + std::pair< std::set::iterator, bool > dedupIns = + dedup.insert(constituentFactor); + if ( dedupIns.second ) { + + std::pair< TargetConstituentBoundariesLeftCollection::iterator, bool > inserted = + m_constituentsCollection.insert(std::make_pair(constituentFactor,count)); + if ( !inserted.second ) { + (inserted.first)->second += count; + } + } + } + + } catch (const std::exception &e) { + UTIL_THROW2("TargetConstituentBoundariesLeftPhraseProperty: Read error. Flawed property? " << value); + } + } +}; + +void TargetConstituentBoundariesLeftPhraseProperty::Print(std::ostream& out) const +{ + for ( TargetConstituentBoundariesLeftCollection::const_iterator it = m_constituentsCollection.begin(); + it != m_constituentsCollection.end(); ++it ) { + if ( it != m_constituentsCollection.begin() ) { + out << " "; + } + out << *(it->first) << " " << it->second; + } +} + +} // namespace Moses + diff --git a/moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h b/moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h new file mode 100644 index 000000000..d9c629922 --- /dev/null +++ b/moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h @@ -0,0 +1,40 @@ +#pragma once + +#include "moses/PP/PhraseProperty.h" +#include "moses/Factor.h" +#include "util/exception.hh" +#include +#include + +namespace Moses +{ + +typedef std::map TargetConstituentBoundariesLeftCollection; + + +class TargetConstituentBoundariesLeftPhraseProperty : public PhraseProperty +{ +public: + TargetConstituentBoundariesLeftPhraseProperty() + {}; + + virtual void ProcessValue(const std::string &value); + + const TargetConstituentBoundariesLeftCollection &GetCollection() const { + return m_constituentsCollection; + }; + + virtual const std::string *GetValueString() const { + UTIL_THROW2("TargetConstituentBoundariesLeftPhraseProperty: value string not available in this phrase property"); + return NULL; + }; + +protected: + + virtual void Print(std::ostream& out) const; + + TargetConstituentBoundariesLeftCollection m_constituentsCollection; +}; + +} // namespace Moses + diff --git a/moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.cpp b/moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.cpp new file mode 100644 index 000000000..5bed2c764 --- /dev/null +++ b/moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.cpp @@ -0,0 +1,63 @@ +#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h" +#include "moses/FactorCollection.h" +#include "moses/Util.h" +#include +#include +#include + +namespace Moses +{ + +void TargetConstituentBoundariesRightAdjacentPhraseProperty::ProcessValue(const std::string &value) +{ + FactorCollection &factorCollection = FactorCollection::Instance(); + std::vector tokens; + Tokenize(tokens, value, " "); + std::vector::const_iterator tokenIter = tokens.begin(); + while (tokenIter != tokens.end()) { + try { + + std::vector constituents; + Tokenize(constituents, *tokenIter, "<"); + ++tokenIter; + float count = std::atof( tokenIter->c_str() ); + ++tokenIter; + + std::set dedup; + + for ( std::vector::iterator constituentIter = constituents.begin(); + constituentIter != constituents.end(); ++constituentIter ) { + + const Factor* constituentFactor = factorCollection.AddFactor(*constituentIter,false); + + std::pair< std::set::iterator, bool > dedupIns = + dedup.insert(constituentFactor); + if ( dedupIns.second ) { + + std::pair< TargetConstituentBoundariesRightAdjacentCollection::iterator, bool > inserted = + m_constituentsCollection.insert(std::make_pair(constituentFactor,count)); + if ( !inserted.second ) { + (inserted.first)->second += count; + } + } + } + + } catch (const std::exception &e) { + UTIL_THROW2("TargetConstituentBoundariesRightAdjacentPhraseProperty: Read error. Flawed property? " << value); + } + } +}; + +void TargetConstituentBoundariesRightAdjacentPhraseProperty::Print(std::ostream& out) const +{ + for ( TargetConstituentBoundariesRightAdjacentCollection::const_iterator it = m_constituentsCollection.begin(); + it != m_constituentsCollection.end(); ++it ) { + if ( it != m_constituentsCollection.begin() ) { + out << " "; + } + out << *(it->first) << " " << it->second; + } +} + +} // namespace Moses + diff --git a/moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h b/moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h new file mode 100644 index 000000000..79b5c71be --- /dev/null +++ b/moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h @@ -0,0 +1,40 @@ +#pragma once + +#include "moses/PP/PhraseProperty.h" +#include "moses/Factor.h" +#include "util/exception.hh" +#include +#include + +namespace Moses +{ + +typedef std::map TargetConstituentBoundariesRightAdjacentCollection; + + +class TargetConstituentBoundariesRightAdjacentPhraseProperty : public PhraseProperty +{ +public: + TargetConstituentBoundariesRightAdjacentPhraseProperty() + {}; + + virtual void ProcessValue(const std::string &value); + + const TargetConstituentBoundariesRightAdjacentCollection &GetCollection() const { + return m_constituentsCollection; + }; + + virtual const std::string *GetValueString() const { + UTIL_THROW2("TargetConstituentBoundariesRightAdjacentPhraseProperty: value string not available in this phrase property"); + return NULL; + }; + +protected: + + virtual void Print(std::ostream& out) const; + + TargetConstituentBoundariesRightAdjacentCollection m_constituentsCollection; +}; + +} // namespace Moses + diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp index 57821fe44..9a2884858 100644 --- a/phrase-extract/ExtractionPhrasePair.cpp +++ b/phrase-extract/ExtractionPhrasePair.cpp @@ -311,12 +311,14 @@ std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &ke std::ostringstream oss; for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin(); iter!=allPropertyValues->end(); ++iter) { - if (iter!=allPropertyValues->begin()) { + if (!(iter->first).empty()) { + if (iter!=allPropertyValues->begin()) { + oss << " "; + } + oss << iter->first; oss << " "; + oss << iter->second; } - oss << iter->first; - oss << " "; - oss << iter->second; } std::string allPropertyValuesString(oss.str()); diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h index 38e570b79..859ab92d7 100644 --- a/phrase-extract/PhraseExtractionOptions.h +++ b/phrase-extract/PhraseExtractionOptions.h @@ -50,6 +50,8 @@ private: bool onlyOutputSpanInfo; bool gzOutput; std::string instanceWeightsFile; //weights for each sentence + bool targetConstituentConstrainedFlag; + bool targetConstituentBoundariesFlag; bool flexScoreFlag; bool singleWordHeuristicFlag; @@ -73,6 +75,8 @@ public: includeSentenceIdFlag(false), onlyOutputSpanInfo(false), gzOutput(false), + targetConstituentConstrainedFlag(false), + targetConstituentBoundariesFlag(false), flexScoreFlag(false), singleWordHeuristicFlag(false), debug(false) { @@ -118,6 +122,12 @@ public: void initInstanceWeightsFile(const char* initInstanceWeightsFile) { instanceWeightsFile = std::string(initInstanceWeightsFile); } + void initTargetConstituentConstrainedFlag(const bool initTargetConstituentConstrainedFlag) { + targetConstituentConstrainedFlag = initTargetConstituentConstrainedFlag; + } + void initTargetConstituentBoundariesFlag(const bool initTargetConstituentBoundariesFlag) { + targetConstituentBoundariesFlag = initTargetConstituentBoundariesFlag; + } void initFlexScoreFlag(const bool initflexScoreFlag) { flexScoreFlag=initflexScoreFlag; } @@ -165,6 +175,12 @@ public: std::string getInstanceWeightsFile() const { return instanceWeightsFile; } + bool isTargetConstituentConstrainedFlag() const { + return targetConstituentConstrainedFlag; + } + bool isTargetConstituentBoundariesFlag() const { + return targetConstituentBoundariesFlag; + } bool isFlexScoreFlag() const { return flexScoreFlag; } diff --git a/phrase-extract/SyntaxNodeCollection.cpp b/phrase-extract/SyntaxNodeCollection.cpp index 70f52317e..2a321f1e2 100644 --- a/phrase-extract/SyntaxNodeCollection.cpp +++ b/phrase-extract/SyntaxNodeCollection.cpp @@ -47,6 +47,8 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos, SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos); m_nodes.push_back( newNode ); m_index[ startPos ][ endPos ].push_back( newNode ); + m_endPositionsIndex[ endPos ].push_back( newNode ); + m_startPositionsIndex[ startPos ].push_back( newNode ); // TODO: may not need this: access m_index by startPos and iterate over its InnerNodeIndex (= end positions)? m_numWords = std::max(endPos+1, m_numWords); return newNode; } @@ -70,6 +72,36 @@ const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes( return endIndex->second; } +bool SyntaxNodeCollection::HasNodeStartingAtPosition( int startPos ) const +{ + return GetNodesByStartPosition(startPos).size() > 0; +} + +const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByStartPosition( + int startPos ) const +{ + InnerNodeIndex::const_iterator startIndex = m_startPositionsIndex.find( startPos ); + if (startIndex == m_startPositionsIndex.end() ) + return m_emptyNode; + + return startIndex->second; +} + +bool SyntaxNodeCollection::HasNodeEndingAtPosition( int endPos ) const +{ + return GetNodesByEndPosition(endPos).size() > 0; +} + +const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByEndPosition( + int endPos ) const +{ + InnerNodeIndex::const_iterator endIndex = m_endPositionsIndex.find( endPos ); + if (endIndex == m_endPositionsIndex.end() ) + return m_emptyNode; + + return endIndex->second; +} + std::auto_ptr SyntaxNodeCollection::ExtractTree() { std::map nodeToTree; diff --git a/phrase-extract/SyntaxNodeCollection.h b/phrase-extract/SyntaxNodeCollection.h index ef0989cd0..83aa66bb4 100644 --- a/phrase-extract/SyntaxNodeCollection.h +++ b/phrase-extract/SyntaxNodeCollection.h @@ -50,6 +50,11 @@ public: //! Lookup the SyntaxNodes for a given span. const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const; + bool HasNodeStartingAtPosition( int startPos ) const; + const std::vector< SyntaxNode* >& GetNodesByStartPosition( int startPos ) const; + bool HasNodeEndingAtPosition( int endPos ) const; + const std::vector< SyntaxNode* >& GetNodesByEndPosition( int endPos ) const; + //! Get a vector of pointers to all SyntaxNodes (unordered). const std::vector< SyntaxNode* >& GetAllNodes() { return m_nodes; @@ -78,6 +83,9 @@ private: NodeIndex m_index; int m_numWords; std::vector< SyntaxNode* > m_emptyNode; + + InnerNodeIndex m_endPositionsIndex; + InnerNodeIndex m_startPositionsIndex; }; } // namespace MosesTraining diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index 0e77368f4..e4d074e15 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -1,11 +1,3 @@ -/* - * extract.cpp - * Modified by: Rohit Gupta CDAC, Mumbai, India - * on July 15, 2012 to implement parallel processing - * Modified by: Nadi Tomeh - LIMSI/CNRS - * Machine Translation Marathon 2010, Dublin - */ - #include #include #include @@ -20,11 +12,12 @@ #include #include -#include "SentenceAlignment.h" #include "tables-core.h" #include "InputFileStream.h" #include "OutputFileStream.h" #include "PhraseExtractionOptions.h" +#include "SentenceAlignmentWithSyntax.h" +#include "SyntaxNode.h" using namespace std; using namespace MosesTraining; @@ -46,14 +39,14 @@ typedef vector < HPhrase > HPhraseVector; // The key of the map is the English index and the value is a set of the source ones typedef map > HSentenceVertices; -REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, +REO_POS getOrientWordModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool, int, int, int, int, int, int, int, bool (*)(int, int), bool (*)(int, int)); -REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, +REO_POS getOrientPhraseModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool, int, int, int, int, int, int, int, bool (*)(int, int), bool (*)(int, int), const HSentenceVertices &, const HSentenceVertices &); -REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, +REO_POS getOrientHierModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool, int, int, int, int, int, int, int, bool (*)(int, int), bool (*)(int, int), const HSentenceVertices &, const HSentenceVertices &, @@ -69,7 +62,7 @@ bool ge(int, int); bool le(int, int); bool lt(int, int); -bool isAligned (SentenceAlignment &, int, int); +bool isAligned (SentenceAlignmentWithSyntax &, int, int); int sentenceOffset = 0; @@ -87,7 +80,7 @@ class ExtractTask { public: ExtractTask( - size_t id, SentenceAlignment &sentence, + size_t id, SentenceAlignmentWithSyntax &sentence, PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv, @@ -109,14 +102,17 @@ private: vector< string > m_extractedPhrasesSid; vector< string > m_extractedPhrasesContext; vector< string > m_extractedPhrasesContextInv; - void extractBase(SentenceAlignment &); - void extract(SentenceAlignment &); - void addPhrase(SentenceAlignment &, int, int, int, int, string &); + void extractBase(SentenceAlignmentWithSyntax &); + void extract(SentenceAlignmentWithSyntax &); + void addPhrase(const SentenceAlignmentWithSyntax &, int, int, int, int, const std::string &, const std::string &); void writePhrasesToFile(); - bool checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF); + bool checkPlaceholders (const SentenceAlignmentWithSyntax &sentence, int startE, int endE, int startF, int endF); bool isPlaceholder(const string &word); + bool checkTargetConstituentBoundaries(const SentenceAlignmentWithSyntax &sentence, + int startE, int endE, int startF, int endF, + std::string &phrasePropertiesString); - SentenceAlignment &m_sentence; + SentenceAlignmentWithSyntax &m_sentence; const PhraseExtractionOptions &m_options; Moses::OutputFileStream &m_extractFile; Moses::OutputFileStream &m_extractFileInv; @@ -133,7 +129,8 @@ int main(int argc, char* argv[]) if (argc < 6) { cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] "; - cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n"; + cerr << "| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename "; + cerr << "| --TargetConstituentConstrained | --TargetConstituentBoundaries ]" << std::endl; exit(1); } @@ -153,6 +150,10 @@ int main(int argc, char* argv[]) options.initOnlyOutputSpanInfo(true); } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) { options.initOrientationFlag(true); + } else if (strcmp(argv[i],"--TargetConstituentConstrained") == 0) { + options.initTargetConstituentConstrainedFlag(true); + } else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) { + options.initTargetConstituentBoundariesFlag(true); } else if (strcmp(argv[i],"--FlexibilityScore") == 0) { options.initFlexScoreFlag(true); } else if (strcmp(argv[i],"--SingleWordHeuristic") == 0) { @@ -280,6 +281,11 @@ int main(int argc, char* argv[]) extractFileContextInv.Open(fileNameExtractContextInv.c_str()); } + // stats on labels for glue grammar and unknown word label probabilities + set< string > targetLabelCollection, sourceLabelCollection; + map< string, int > targetTopLabelCollection, sourceTopLabelCollection; + const bool targetSyntax = true; + int i = sentenceOffset; string englishString, foreignString, alignmentString, weightString; @@ -295,7 +301,10 @@ int main(int argc, char* argv[]) getline(*iwFileP, weightString); } - SentenceAlignment sentence; + SentenceAlignmentWithSyntax sentence + (targetLabelCollection, sourceLabelCollection, + targetTopLabelCollection, sourceTopLabelCollection, + targetSyntax, false); // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; //az: output src, tgt, and alingment line if (options.isOnlyOutputSpanInfo()) { @@ -360,7 +369,7 @@ void ExtractTask::Run() } -void ExtractTask::extract(SentenceAlignment &sentence) +void ExtractTask::extract(SentenceAlignmentWithSyntax &sentence) { int countE = sentence.target.size(); int countF = sentence.source.size(); @@ -454,7 +463,15 @@ void ExtractTask::extract(SentenceAlignment &sentence) // if(m_options.isAllModelsOutputFlag()) // " | | "; } - addPhrase(sentence, startE, endE, startF, endF, orientationInfo); + std::string phrasePropertiesString; + bool doAdd = !m_options.isTargetConstituentBoundariesFlag(); + if (m_options.isTargetConstituentBoundariesFlag() || m_options.isTargetConstituentConstrainedFlag()) { + bool isTargetConstituentCovered = checkTargetConstituentBoundaries(sentence, startE, endE, startF, endF, phrasePropertiesString); + doAdd = doAdd || isTargetConstituentCovered; + } + if (doAdd) { + addPhrase(sentence, startE, endE, startF, endF, orientationInfo, phrasePropertiesString); + } } } } @@ -510,12 +527,20 @@ void ExtractTask::extract(SentenceAlignment &sentence) ((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " + ((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : ""); - addPhrase(sentence, startE, endE, startF, endF, orientationInfo); + std::string phrasePropertiesString; + bool doAdd = !m_options.isTargetConstituentBoundariesFlag(); + if (m_options.isTargetConstituentBoundariesFlag() || m_options.isTargetConstituentConstrainedFlag()) { + bool isTargetConstituentCovered = checkTargetConstituentBoundaries(sentence, startE, endE, startF, endF, phrasePropertiesString); + doAdd = doAdd || isTargetConstituentCovered; + } + if (doAdd) { + addPhrase(sentence, startE, endE, startF, endF, orientationInfo, phrasePropertiesString); + } } } } -REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType, +REO_POS getOrientWordModel(SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType, bool connectedLeftTop, bool connectedRightTop, int startF, int endF, int startE, int endE, int countF, int zero, int unit, bool (*ge)(int, int), bool (*lt)(int, int) ) @@ -541,7 +566,7 @@ REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelTyp } // to be called with countF-1 instead of countF -REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType, +REO_POS getOrientPhraseModel (SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType, bool connectedLeftTop, bool connectedRightTop, int startF, int endF, int startE, int endE, int countF, int zero, int unit, bool (*ge)(int, int), bool (*lt)(int, int), @@ -577,7 +602,7 @@ REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE model } // to be called with countF-1 instead of countF -REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType, +REO_POS getOrientHierModel (SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType, bool connectedLeftTop, bool connectedRightTop, int startF, int endF, int startE, int endE, int countF, int zero, int unit, bool (*ge)(int, int), bool (*lt)(int, int), @@ -629,7 +654,7 @@ REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelTy return UNKNOWN; } -bool isAligned ( SentenceAlignment &sentence, int fi, int ei ) +bool isAligned ( SentenceAlignmentWithSyntax &sentence, int fi, int ei ) { if (ei == -1 && fi == -1) return true; @@ -715,8 +740,138 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType) } return ""; } + -void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo) +bool ExtractTask::checkTargetConstituentBoundaries( const SentenceAlignmentWithSyntax &sentence, + int startE, int endE, int startF, int endF, + std::string &phrasePropertiesString) +{ + ostringstream outextractstrPhrasePropertyTargetConstituentBoundariesLeft; + + if (m_options.isTargetConstituentBoundariesFlag()) { + outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "{{TargetConstituentBoundariesLeft "; + } + + bool validTargetConstituentBoundaries = false; + bool outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = true; + + if (m_options.isTargetConstituentBoundariesFlag()) { + if (startE==0) { + outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false; + outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "BOS_"; + } + } + + if (!sentence.targetTree.HasNodeStartingAtPosition(startE)) { + + validTargetConstituentBoundaries = false; + + } else { + + const std::vector< SyntaxNode* >& startingNodes = sentence.targetTree.GetNodesByStartPosition(startE); + for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = startingNodes.rbegin(); iter != startingNodes.rend(); ++iter ) { + if ( (*iter)->end == endE ) { + validTargetConstituentBoundaries = true; + if (!m_options.isTargetConstituentBoundariesFlag()) { + break; + } + } + if (m_options.isTargetConstituentBoundariesFlag()) { + if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) { + outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false; + } else { + outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "<"; + } + outextractstrPhrasePropertyTargetConstituentBoundariesLeft << (*iter)->label; + } + } + } + + if (m_options.isTargetConstituentBoundariesFlag()) { + if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) { + outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "<"; + } + outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "}}"; + } + + + if (m_options.isTargetConstituentConstrainedFlag() && !validTargetConstituentBoundaries) { + // skip over all boundary punctuation and check again + bool relaxedValidTargetConstituentBoundaries = false; + int relaxedStartE = startE; + int relaxedEndE = endE; + const std::string punctuation = ",;.:!?"; + while ( (relaxedStartE < endE) && + (sentence.target[relaxedStartE].size() == 1) && + (punctuation.find(sentence.target[relaxedStartE].at(0)) != std::string::npos) ) { + ++relaxedStartE; + } + while ( (relaxedEndE > relaxedStartE) && + (sentence.target[relaxedEndE].size() == 1) && + (punctuation.find(sentence.target[relaxedEndE].at(0)) != std::string::npos) ) { + --relaxedEndE; + } + + if ( (relaxedStartE != startE) || (relaxedEndE !=endE) ) { + const std::vector< SyntaxNode* >& startingNodes = sentence.targetTree.GetNodesByStartPosition(relaxedStartE); + for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = startingNodes.rbegin(); + (iter != startingNodes.rend() && !relaxedValidTargetConstituentBoundaries); + ++iter ) { + if ( (*iter)->end == relaxedEndE ) { + relaxedValidTargetConstituentBoundaries = true; + } + } + } + + if (!relaxedValidTargetConstituentBoundaries) { + return false; + } + } + + + if (m_options.isTargetConstituentBoundariesFlag()) { + + ostringstream outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent; + outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "{{TargetConstituentBoundariesRightAdjacent "; + outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = true; + + if (endE==sentence.target.size()-1) { + + outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "EOS_"; + outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false; + + } else { + + const std::vector< SyntaxNode* >& adjacentNodes = sentence.targetTree.GetNodesByStartPosition(endE+1); + for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = adjacentNodes.rbegin(); iter != adjacentNodes.rend(); ++iter ) { + if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) { + outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false; + } else { + outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "<"; + } + outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << (*iter)->label; + } + } + + if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) { + outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "<"; + } + outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "}}"; + + phrasePropertiesString += " "; + phrasePropertiesString += outextractstrPhrasePropertyTargetConstituentBoundariesLeft.str(); + phrasePropertiesString += " "; + phrasePropertiesString += outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent.str(); + } + + return true; +} + + +void ExtractTask::addPhrase( const SentenceAlignmentWithSyntax &sentence, + int startE, int endE, int startF, int endF, + const std::string &orientationInfo, + const std::string &phrasePropertiesString) { // source // // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n"; @@ -746,11 +901,18 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, if (m_options.isTranslationFlag()) outextractstr << "||| "; if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; + // target for(int ei=startE; ei<=endE; ei++) { - if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " "; - if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " "; - if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " "; + + if (m_options.isTranslationFlag()) { + outextractstr << sentence.target[ei] << " "; + outextractstrInv << sentence.target[ei] << " "; + } + + if (m_options.isOrientationFlag()) { + outextractstrOrientation << sentence.target[ei] << " "; + } } if (m_options.isTranslationFlag()) outextractstr << "|||"; if (m_options.isTranslationFlag()) outextractstrInv << "||| "; @@ -792,7 +954,7 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, } } - + outextractstr << phrasePropertiesString; // generate two lines for every extracted phrase: // once with left, once with right context @@ -901,7 +1063,7 @@ void ExtractTask::writePhrasesToFile() // if proper conditioning, we need the number of times a source phrase occured -void ExtractTask::extractBase( SentenceAlignment &sentence ) +void ExtractTask::extractBase( SentenceAlignmentWithSyntax &sentence ) { ostringstream outextractFile; ostringstream outextractFileInv; @@ -935,7 +1097,7 @@ void ExtractTask::extractBase( SentenceAlignment &sentence ) } -bool ExtractTask::checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF) +bool ExtractTask::checkPlaceholders (const SentenceAlignmentWithSyntax &sentence, int startE, int endE, int startF, int endF) { for (size_t pos = startF; pos <= endF; ++pos) { const string &sourceWord = sentence.source[pos]; diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index 9095df01b..081ee8ef1 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -68,6 +68,7 @@ bool spanLength = false; bool ruleLength = false; bool nonTermContext = false; bool nonTermContextTarget = false; +bool targetConstituentBoundariesFlag = false; int countOfCounts[COC_MAX+1]; int totalDistinct = 0; @@ -286,6 +287,9 @@ int main(int argc, char* argv[]) } else if (strcmp(argv[i],"--NonTermContextTarget") == 0) { nonTermContextTarget = true; std::cerr << "non-term context (target)" << std::endl; + } else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) { + targetConstituentBoundariesFlag = true; + std::cerr << "including target constituent boundaries information" << std::endl; } else { featureArgs.push_back(argv[i]); ++i; @@ -957,6 +961,18 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair, } } + // target constituent boundaries + if (targetConstituentBoundariesFlag && !inverseFlag) { + const std::string targetConstituentBoundariesLeftValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesLeft"); + if (!targetConstituentBoundariesLeftValues.empty()) { + phraseTableFile << " {{TargetConstituentBoundariesLeft " << targetConstituentBoundariesLeftValues << "}}"; + } + const std::string targetConstituentBoundariesRightAdjacentValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesRightAdjacent"); + if (!targetConstituentBoundariesRightAdjacentValues.empty()) { + phraseTableFile << " {{TargetConstituentBoundariesRightAdjacent " << targetConstituentBoundariesRightAdjacentValues << "}}"; + } + } + phraseTableFile << std::endl; } diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 95c982cd7..6d0019838 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -2407,6 +2407,12 @@ sub define_training_extract_phrases { if (&get("TRAINING:ghkm-strip-bitpar-nonterminal-labels")) { $cmd .= "-ghkm-strip-bitpar-nonterminal-labels "; } + + } else { # !hierarchical-rule-set + + if (&get("TRAINING:target-constituent-boundaries")) { + $cmd .= "-target-constituent-boundaries "; + } } my $extract_settings = &get("TRAINING:extract-settings"); @@ -2464,6 +2470,12 @@ sub define_training_build_ttable { my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model","")); $cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file "; } + + } else { # !hierarchical-rule-set + + if (&get("TRAINING:target-constituent-boundaries")) { + $cmd .= "-target-constituent-boundaries "; + } } &create_step($step_id,$cmd); @@ -2678,6 +2690,10 @@ sub define_training_create_config { $cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file "; } + if (&get("TRAINING:target-constituent-boundaries")) { + $cmd .= "-target-constituent-boundaries "; + } + # sparse lexical features provide additional content for config file my @additional_ini_files; push (@additional_ini_files, "$sparse_lexical_features.ini") if $sparse_lexical_features; diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index d347aa6ec..7dc51a473 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -134,6 +134,7 @@ my($_EXTERNAL_BINDIR, $_LMODEL_OOV_FEATURE, $_NUM_LATTICE_FEATURES, $IGNORE, + $_TARGET_CONSTITUENT_BOUNDARIES, $_FLEXIBILITY_SCORE, $_FEATURE_LINES, $_WEIGHT_LINES, @@ -258,6 +259,7 @@ $_HELP = 1 'instance-weights-file=s' => \$_INSTANCE_WEIGHTS_FILE, 'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE, 'num-lattice-features=i' => \$_NUM_LATTICE_FEATURES, + 'target-constituent-boundaries' => \$_TARGET_CONSTITUENT_BOUNDARIES, 'flexibility-score' => \$_FLEXIBILITY_SCORE, 'config-add-feature-lines=s' => \$_FEATURE_LINES, 'config-add-weight-lines=s' => \$_WEIGHT_LINES, @@ -1607,6 +1609,7 @@ sub extract_phrase { $cmd .= " --GZOutput "; $cmd .= " --InstanceWeights $_INSTANCE_WEIGHTS_FILE " if defined $_INSTANCE_WEIGHTS_FILE; $cmd .= " --BaselineExtract $_BASELINE_EXTRACT" if defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT =~ /extract-parallel.perl/; + $cmd .= " --TargetConstituentBoundaries" if $_TARGET_CONSTITUENT_BOUNDARIES; $cmd .= " --FlexibilityScore" if $_FLEXIBILITY_SCORE; $cmd .= " --NoTTable" if $_MMSAPT; @@ -1764,9 +1767,10 @@ sub score_phrase_phrase_extract { $cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE); $cmd .= " --TargetSyntacticPreferences $_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE" if $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE); $cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE); + $cmd .= " --TargetConstituentBoundaries" if $_TARGET_CONSTITUENT_BOUNDARIES; + $cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE; $cmd .= " $DOMAIN" if $DOMAIN; $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS); - $cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE; # sorting if ($direction eq "e2f" || $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2) { @@ -2386,6 +2390,7 @@ sub create_ini { print INI " unknown-word-labels-file=$_UNKNOWN_WORD_LABEL_FILE" if defined($_UNKNOWN_WORD_LABEL_FILE); print INI "\n"; } + print INI "TargetConstituentAdjacencyFeature\n" if $_TARGET_CONSTITUENT_BOUNDARIES; print INI $feature_spec; print INI "\n# dense weights for feature functions\n"; @@ -2398,6 +2403,7 @@ sub create_ini { print INI "SoftSourceSyntacticConstraintsFeature0= -0.2 -0.2 -0.2 0.1 0.1 0.1\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE); print INI "PhraseOrientationFeature0= 0.05 0.05 0.05 0.05 0.05 0.05\n" if $_PHRASE_ORIENTATION; print INI "TargetPreferencesFeature0= 0.2 -0.2\n" if $_HIERARCHICAL && $_TARGET_SYNTAX && $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE); + print INI "TargetConstituentAdjacencyFeature0= 0.05 -0.1\n" if $_TARGET_CONSTITUENT_BOUNDARIES; print INI $weight_spec; close(INI); }