Option for target constituent constrained phrase extraction. TargetConstituentAdjacencyFeature.

This commit is contained in:
Matthias Huck 2016-02-12 17:46:57 +00:00
parent c75f9854e4
commit 1659d6b4c8
19 changed files with 810 additions and 44 deletions

View File

@ -42,6 +42,7 @@
#include "moses/FF/ControlRecombination.h" #include "moses/FF/ControlRecombination.h"
#include "moses/FF/ConstrainedDecoding.h" #include "moses/FF/ConstrainedDecoding.h"
#include "moses/FF/SoftSourceSyntacticConstraintsFeature.h" #include "moses/FF/SoftSourceSyntacticConstraintsFeature.h"
#include "moses/FF/TargetConstituentAdjacencyFeature.h"
#include "moses/FF/TargetPreferencesFeature.h" #include "moses/FF/TargetPreferencesFeature.h"
#include "moses/FF/CoveredReferenceFeature.h" #include "moses/FF/CoveredReferenceFeature.h"
#include "moses/FF/TreeStructureFeature.h" #include "moses/FF/TreeStructureFeature.h"
@ -264,6 +265,7 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(CoveredReferenceFeature); MOSES_FNAME(CoveredReferenceFeature);
MOSES_FNAME(SourceGHKMTreeInputMatchFeature); MOSES_FNAME(SourceGHKMTreeInputMatchFeature);
MOSES_FNAME(SoftSourceSyntacticConstraintsFeature); MOSES_FNAME(SoftSourceSyntacticConstraintsFeature);
MOSES_FNAME(TargetConstituentAdjacencyFeature);
MOSES_FNAME(TargetPreferencesFeature); MOSES_FNAME(TargetPreferencesFeature);
MOSES_FNAME(TreeStructureFeature); MOSES_FNAME(TreeStructureFeature);
MOSES_FNAME(SoftMatchingFeature); MOSES_FNAME(SoftMatchingFeature);

View File

@ -0,0 +1,189 @@
#include "TargetConstituentAdjacencyFeature.h"
#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h"
#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h"
#include "moses/StaticData.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/Hypothesis.h"
#include "moses/FactorCollection.h"
#include "moses/TreeInput.h"
#include <algorithm>
using namespace std;
namespace Moses
{
size_t TargetConstituentAdjacencyFeatureState::hash() const
{
if (m_recombine) {
return 0;
}
size_t ret = 0;
boost::hash_combine(ret, m_collection.size());
for (std::map<const Factor*, float>::const_iterator it=m_collection.begin();
it!=m_collection.end(); ++it) {
boost::hash_combine(ret, it->first);
}
return ret;
};
/**
 * State equality used for hypothesis recombination.
 *
 * Returns true unconditionally when m_recombine is set or when comparing an
 * object with itself. Otherwise two states are equal iff their collections
 * have the same size and the same sequence of key pointers; the float values
 * are not compared. Throws (via UTIL_THROW_IF2) if `other` is not a
 * TargetConstituentAdjacencyFeatureState.
 */
bool TargetConstituentAdjacencyFeatureState::operator==(const FFState& other) const
{
  if (m_recombine || this == &other) {
    return true;
  }

  const TargetConstituentAdjacencyFeatureState* otherState =
    dynamic_cast<const TargetConstituentAdjacencyFeatureState*>(&other);
  UTIL_THROW_IF2(otherState == NULL, "Wrong state type");

  const std::map<const Factor*, float>& mine = m_collection;
  const std::map<const Factor*, float>& theirs = otherState->m_collection;
  if (mine.size() != theirs.size()) {
    return false;
  }

  // sizes match, so both iterators reach their end together
  std::map<const Factor*, float>::const_iterator lhs = mine.begin();
  std::map<const Factor*, float>::const_iterator rhs = theirs.begin();
  while (lhs != mine.end()) {
    if (lhs->first != rhs->first) {
      return false;
    }
    ++lhs;
    ++rhs;
  }
  return true;
}
// Constructs the feature from its configuration line.
// Passes 2 (the number of dense score components) and the line to the
// StatefulFeatureFunction base, defaults to feature variant 0 with
// recombination disabled, and then calls ReadParameters()
// (presumably this dispatches key/value pairs to SetParameter() — confirm
// against the base class).
TargetConstituentAdjacencyFeature::TargetConstituentAdjacencyFeature(const std::string &line)
: StatefulFeatureFunction(2, line)
, m_featureVariant(0)
, m_recombine(false)
{
VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
ReadParameters();
VERBOSE(1, " Done." << std::endl);
// report the variant after the parameters have been read
VERBOSE(1, " Feature variant: " << m_featureVariant << "." << std::endl);
}
/**
 * Handles the feature-specific configuration keys.
 *
 *   variant    parsed as size_t into m_featureVariant
 *   recombine  parsed as bool into m_recombine
 *
 * Any other key is forwarded to StatefulFeatureFunction::SetParameter.
 */
void TargetConstituentAdjacencyFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "variant") {
    m_featureVariant = Scan<size_t>(value);
    return;
  }
  if (key == "recombine") {
    m_recombine = Scan<bool>(value);
    return;
  }
  // not one of ours: let the base class deal with it
  StatefulFeatureFunction::SetParameter(key, value);
}
/**
 * Scores the target phrase applied by cur_hypo against the state left behind
 * by the previous hypothesis (phrase-based decoding).
 *
 * The phrase's "TargetConstituentBoundariesLeft" collection is matched against
 * the previous state's collection (the previous phrase's
 * "TargetConstituentBoundariesRightAdjacent" collection), or against the
 * single label "BOS_" if the previous hypothesis is the initial one.
 *
 * Dense scores added to the accumulator:
 *   [0]  TransformScore(prob) for variant 1, TransformScore(numMatch/numOverall)
 *        otherwise; only when at least one label matched and prob is non-zero
 *   [1]  incremented by one when nothing matched, when prob is zero, or when
 *        the phrase has no TargetConstituentBoundariesLeft property
 *
 * A missing property is only tolerated if the first target word is OOV;
 * otherwise an exception is thrown.
 *
 * @param cur_hypo     hypothesis being scored
 * @param prev_state   state of the previous hypothesis; must be a
 *                     TargetConstituentAdjacencyFeatureState
 * @param accumulator  receives the two dense scores
 * @return newly allocated state holding a copy of the phrase's right-adjacent
 *         collection (empty if the property is missing on an OOV phrase)
 */
FFState* TargetConstituentAdjacencyFeature::EvaluateWhenApplied(
  const Hypothesis& cur_hypo,
  const FFState* prev_state,
  ScoreComponentCollection* accumulator) const
{
  // dense scores
  std::vector<float> newScores(m_numScoreComponents,0); // m_numScoreComponents == 2

  // state
  const TargetConstituentAdjacencyFeatureState *prevState =
    static_cast<const TargetConstituentAdjacencyFeatureState*>(prev_state);

  // read TargetConstituentAdjacency property
  const TargetPhrase &currTarPhr = cur_hypo.GetCurrTargetPhrase();

  FEATUREVERBOSE(2, "Phrase: " << currTarPhr << std::endl);

  if (const PhraseProperty *property = currTarPhr.GetProperty("TargetConstituentBoundariesLeft")) {

    const TargetConstituentBoundariesLeftPhraseProperty *targetConstituentBoundariesLeftPhraseProperty =
      static_cast<const TargetConstituentBoundariesLeftPhraseProperty*>(property);
    const TargetConstituentBoundariesLeftCollection& leftConstituentCollection =
      targetConstituentBoundariesLeftPhraseProperty->GetCollection();

    float prob = 0;
    size_t numMatch = 0;
    size_t numOverall = 0;

    if ( !cur_hypo.GetPrevHypo()->GetPrevHypo() ) {
      // previous hypothesis is initial, i.e. target sentence starts here

      ++numOverall;
      FactorCollection &factorCollection = FactorCollection::Instance();
      const Factor* bosFactor = factorCollection.AddFactor("BOS_",false);
      TargetConstituentBoundariesLeftCollection::const_iterator found =
        leftConstituentCollection.find(bosFactor);
      if ( found != leftConstituentCollection.end() ) {
        ++numMatch;
        prob += found->second;
      }

    } else {

      // Both maps are ordered by key pointer, so walk them in lockstep and
      // pick out the labels present in both.
      // NOTE(review): numOverall counts the steps of this walk (which stops
      // as soon as either map is exhausted), not the size of either map or
      // of their union - confirm this is the intended denominator.
      const std::map<const Factor*, float>& hypConstituentCollection = prevState->m_collection;
      std::map<const Factor*, float>::const_iterator iter1 = hypConstituentCollection.begin();
      std::map<const Factor*, float>::const_iterator iter2 = leftConstituentCollection.begin();
      while ( iter1 != hypConstituentCollection.end() && iter2 != leftConstituentCollection.end() ) {
        ++numOverall;
        if ( iter1->first < iter2->first ) {
          ++iter1;
        } else if ( iter2->first < iter1->first ) {
          ++iter2;
        } else {
          ++numMatch;
          // keep the best product over all matching labels
          float currProb = iter1->second * iter2->second;
          if (currProb > prob)
            prob = currProb;
          ++iter1;
          ++iter2;
        }
      }
    }

    if ( (numMatch == 0) || (prob == 0) ) {
      ++newScores[1];
    } else {
      if ( m_featureVariant == 1 ) {
        newScores[0] += TransformScore(prob);
      } else {
        newScores[0] += TransformScore( static_cast<float>(numMatch)/numOverall );
      }
    }

  } else {

    // abort with error message if the phrase does not translate an unknown word
    UTIL_THROW_IF2(!currTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription()
                   << ": Missing TargetConstituentBoundariesLeft property.");

    ++newScores[1];

  }

  // Collect the content of the next state in a local map first. Everything
  // that may throw (the missing-property check, the copy, the score update)
  // happens before the state object is allocated, so an exception can no
  // longer leak it.
  std::map<const Factor*, float> nextCollection;

  if (const PhraseProperty *property = currTarPhr.GetProperty("TargetConstituentBoundariesRightAdjacent")) {

    const TargetConstituentBoundariesRightAdjacentPhraseProperty *targetConstituentBoundariesRightAdjacentPhraseProperty =
      static_cast<const TargetConstituentBoundariesRightAdjacentPhraseProperty*>(property);
    const TargetConstituentBoundariesRightAdjacentCollection& rightAdjacentConstituentCollection =
      targetConstituentBoundariesRightAdjacentPhraseProperty->GetCollection();

    nextCollection.insert(rightAdjacentConstituentCollection.begin(),
                          rightAdjacentConstituentCollection.end());

  } else {

    // abort with error message if the phrase does not translate an unknown word
    UTIL_THROW_IF2(!currTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription()
                   << ": Missing TargetConstituentBoundariesRightAdjacent property.");

  }

  // add scores
  accumulator->PlusEquals(this, newScores);

  TargetConstituentAdjacencyFeatureState *newState = new TargetConstituentAdjacencyFeatureState(m_recombine);
  newState->m_collection.swap(nextCollection); // swap does not throw

  return newState;
}
}

View File

@ -0,0 +1,101 @@
#pragma once
#include <string>
#include <vector>
#include <set>
#include <iostream>
#include "StatefulFeatureFunction.h"
#include "FFState.h"
#include "util/exception.hh"
#include <stdint.h>
namespace Moses
{
/**
 * Decoder state of TargetConstituentAdjacencyFeature.
 *
 * Holds a map from label factors to float values; the feature fills it with
 * the right-adjacent constituent collection of the phrase that was applied
 * last. When constructed with recombine == true, hash() returns 0 and
 * operator== returns true for every state, so this feature never prevents
 * two hypotheses from being treated as equal.
 */
class TargetConstituentAdjacencyFeatureState : public FFState
{

public:

  // the feature fills m_collection directly
  friend class TargetConstituentAdjacencyFeature;

  // explicit: a bool must never silently convert into a state object
  explicit TargetConstituentAdjacencyFeatureState(bool recombine)
    : m_recombine(recombine)
  {}

  // hashes the size and the key pointers of m_collection (0 if m_recombine)
  size_t hash() const;

  // compares the key pointers of m_collection (always true if m_recombine)
  virtual bool operator==(const FFState& other) const;

private:

  const bool m_recombine;
  std::map<const Factor*, float> m_collection;

};
// Stateful feature function for phrase-based decoding that scores how well the
// "TargetConstituentBoundariesLeft" property of the current phrase matches the
// "TargetConstituentBoundariesRightAdjacent" property of the previous phrase.
// Produces two dense scores. Chart-based decoding is not supported.
class TargetConstituentAdjacencyFeature : public StatefulFeatureFunction
{
public:
// Reads the configuration line; recognised keys are handled in SetParameter().
TargetConstituentAdjacencyFeature(const std::string &line);
~TargetConstituentAdjacencyFeature()
{};
// The feature places no requirement on the factor mask.
bool IsUseable(const FactorMask &mask) const {
return true;
};
// State for the empty hypothesis: a newly allocated state with an empty collection.
virtual const FFState* EmptyHypothesisState(const InputType &input) const {
return new TargetConstituentAdjacencyFeatureState(m_recombine);
};
// Handles the keys "variant" and "recombine"; other keys go to the base class.
void SetParameter(const std::string& key, const std::string& value);
// Nothing to load.
void Load(AllOptions::ptr const& opts)
{};
// Intentionally empty: no score is computed in isolation.
void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{};
// Intentionally empty: no score is computed from source context.
void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{};
// Intentionally empty.
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const
{};
// Phrase-based scoring; adds the two dense scores to the accumulator and
// returns a newly allocated state for cur_hypo.
FFState* EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
// Chart-based decoding is not implemented: always raises an error.
FFState* EvaluateWhenApplied(
const ChartHypothesis& cur_hypo,
int featureID, // used to index the state in the previous hypotheses
ScoreComponentCollection* accumulator) const {
UTIL_THROW2(GetScoreProducerDescription() << ": feature currently not implemented for chart-based decoding.");
// presumably unreachable; kept so the function has a return statement
// (confirm that UTIL_THROW2 always throws)
return new TargetConstituentAdjacencyFeatureState(m_recombine);
};
private:
// Selects the value used for the first dense score: 1 scores the best
// product of matching label values, any other value scores
// numMatch/numOverall (see EvaluateWhenApplied).
size_t m_featureVariant;
// Passed to every state this feature creates; when true, states always
// compare equal.
bool m_recombine;
};
}

View File

@ -19,8 +19,7 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/ ***********************************************************************/
#ifndef moses_Factor_h #pragma once
#define moses_Factor_h
#include <ostream> #include <ostream>
#include <string> #include <string>
@ -98,4 +97,4 @@ public:
size_t hash_value(const Factor &f); size_t hash_value(const Factor &f);
} }
#endif

View File

@ -11,6 +11,8 @@
#include "moses/PP/SpanLengthPhraseProperty.h" #include "moses/PP/SpanLengthPhraseProperty.h"
#include "moses/PP/NonTermContextProperty.h" #include "moses/PP/NonTermContextProperty.h"
#include "moses/PP/OrientationPhraseProperty.h" #include "moses/PP/OrientationPhraseProperty.h"
#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h"
#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h"
namespace Moses namespace Moses
{ {
@ -58,6 +60,8 @@ PhrasePropertyFactory::PhrasePropertyFactory()
MOSES_PNAME2("Counts", CountsPhraseProperty); MOSES_PNAME2("Counts", CountsPhraseProperty);
MOSES_PNAME2("SourceLabels", SourceLabelsPhraseProperty); MOSES_PNAME2("SourceLabels", SourceLabelsPhraseProperty);
MOSES_PNAME2("TargetConstituentBoundariesLeft", TargetConstituentBoundariesLeftPhraseProperty);
MOSES_PNAME2("TargetConstituentBoundariesRightAdjacent", TargetConstituentBoundariesRightAdjacentPhraseProperty);
MOSES_PNAME2("TargetPreferences", TargetPreferencesPhraseProperty); MOSES_PNAME2("TargetPreferences", TargetPreferencesPhraseProperty);
MOSES_PNAME2("Tree",TreeStructurePhraseProperty); MOSES_PNAME2("Tree",TreeStructurePhraseProperty);
MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty); MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty);

View File

@ -5,9 +5,14 @@ namespace Moses
std::ostream& operator<<(std::ostream &out, const PhraseProperty &obj) std::ostream& operator<<(std::ostream &out, const PhraseProperty &obj)
{ {
out << "Base phrase property"; obj.Print(out);
return out; return out;
} }
// Default textual representation of a phrase property, written by operator<<.
// Virtual, so subclasses can override it to print their own content.
void PhraseProperty::Print(std::ostream &out) const
{
out << "Base phrase property";
}
} }

View File

@ -28,6 +28,8 @@ public:
protected: protected:
virtual void Print(std::ostream& out) const;
std::string *m_value; std::string *m_value;
}; };

View File

@ -0,0 +1,63 @@
#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h"
#include "moses/FactorCollection.h"
#include "moses/Util.h"
#include <iostream>
#include <queue>
#include <ostream>
namespace Moses
{
/**
 * Parses the property value into m_constituentsCollection.
 *
 * The value is a space-separated sequence of pairs "<labels> <count>", where
 * <labels> is one or more constituent labels joined by '<'. For every pair,
 * the count is added to the entry of each distinct label in the group (a
 * label repeated within one group is counted once).
 *
 * @param value  property value string
 * @throws util::Exception if a label group is not followed by a count, or if
 *         a std::exception is raised while processing a pair
 */
void TargetConstituentBoundariesLeftPhraseProperty::ProcessValue(const std::string &value)
{
  FactorCollection &factorCollection = FactorCollection::Instance();
  std::vector<std::string> tokens;
  Tokenize(tokens, value, " ");
  std::vector<std::string>::const_iterator tokenIter = tokens.begin();
  while (tokenIter != tokens.end()) {

    const std::string &labelsToken = *tokenIter;
    ++tokenIter;

    // A label group without a following count means the property is
    // malformed. This has to be checked explicitly: dereferencing the end
    // iterator is undefined behaviour, not an exception a catch could see.
    if (tokenIter == tokens.end()) {
      UTIL_THROW2("TargetConstituentBoundariesLeftPhraseProperty: Read error. Flawed property? " << value);
    }

    const std::string &countToken = *tokenIter;
    ++tokenIter;

    try {

      std::vector<std::string> constituents;
      Tokenize(constituents, labelsToken, "<");
      float count = std::atof( countToken.c_str() );

      // labels already seen within this group
      std::set<const Factor* > dedup;

      for ( std::vector<std::string>::iterator constituentIter = constituents.begin();
            constituentIter != constituents.end(); ++constituentIter ) {

        const Factor* constituentFactor = factorCollection.AddFactor(*constituentIter,false);

        std::pair< std::set<const Factor* >::iterator, bool > dedupIns =
          dedup.insert(constituentFactor);
        if ( dedupIns.second ) {

          // first occurrence in this group: create the entry or add to it
          std::pair< TargetConstituentBoundariesLeftCollection::iterator, bool > inserted =
            m_constituentsCollection.insert(std::make_pair(constituentFactor,count));
          if ( !inserted.second ) {
            (inserted.first)->second += count;
          }
        }
      }

    } catch (const std::exception &) {
      UTIL_THROW2("TargetConstituentBoundariesLeftPhraseProperty: Read error. Flawed property? " << value);
    }
  }
}
void TargetConstituentBoundariesLeftPhraseProperty::Print(std::ostream& out) const
{
for ( TargetConstituentBoundariesLeftCollection::const_iterator it = m_constituentsCollection.begin();
it != m_constituentsCollection.end(); ++it ) {
if ( it != m_constituentsCollection.begin() ) {
out << " ";
}
out << *(it->first) << " " << it->second;
}
}
} // namespace Moses

View File

@ -0,0 +1,40 @@
#pragma once
#include "moses/PP/PhraseProperty.h"
#include "moses/Factor.h"
#include "util/exception.hh"
#include <map>
#include <string>
namespace Moses
{
// Maps a constituent label (as a factor) to the float value accumulated for it
// by TargetConstituentBoundariesLeftPhraseProperty::ProcessValue.
typedef std::map<const Factor*, float> TargetConstituentBoundariesLeftCollection;
// Phrase property "TargetConstituentBoundariesLeft": a collection of
// constituent labels with associated values, parsed from the property string.
class TargetConstituentBoundariesLeftPhraseProperty : public PhraseProperty
{
public:
TargetConstituentBoundariesLeftPhraseProperty()
{};
// Parses the property string and fills the collection.
virtual void ProcessValue(const std::string &value);
// Read-only access to the parsed collection.
const TargetConstituentBoundariesLeftCollection &GetCollection() const {
return m_constituentsCollection;
};
// Not supported by this property: always raises an error.
virtual const std::string *GetValueString() const {
UTIL_THROW2("TargetConstituentBoundariesLeftPhraseProperty: value string not available in this phrase property");
// presumably unreachable; kept so the function has a return statement
return NULL;
};
protected:
// Prints the collection as space-separated "label value" pairs.
virtual void Print(std::ostream& out) const;
TargetConstituentBoundariesLeftCollection m_constituentsCollection;
};
} // namespace Moses

View File

@ -0,0 +1,63 @@
#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h"
#include "moses/FactorCollection.h"
#include "moses/Util.h"
#include <iostream>
#include <queue>
#include <ostream>
namespace Moses
{
/**
 * Parses the property value into m_constituentsCollection.
 *
 * The value is a space-separated sequence of pairs "<labels> <count>", where
 * <labels> is one or more constituent labels joined by '<'. For every pair,
 * the count is added to the entry of each distinct label in the group (a
 * label repeated within one group is counted once).
 *
 * @param value  property value string
 * @throws util::Exception if a label group is not followed by a count, or if
 *         a std::exception is raised while processing a pair
 */
void TargetConstituentBoundariesRightAdjacentPhraseProperty::ProcessValue(const std::string &value)
{
  FactorCollection &factorCollection = FactorCollection::Instance();
  std::vector<std::string> tokens;
  Tokenize(tokens, value, " ");
  std::vector<std::string>::const_iterator tokenIter = tokens.begin();
  while (tokenIter != tokens.end()) {

    const std::string &labelsToken = *tokenIter;
    ++tokenIter;

    // A label group without a following count means the property is
    // malformed. This has to be checked explicitly: dereferencing the end
    // iterator is undefined behaviour, not an exception a catch could see.
    if (tokenIter == tokens.end()) {
      UTIL_THROW2("TargetConstituentBoundariesRightAdjacentPhraseProperty: Read error. Flawed property? " << value);
    }

    const std::string &countToken = *tokenIter;
    ++tokenIter;

    try {

      std::vector<std::string> constituents;
      Tokenize(constituents, labelsToken, "<");
      float count = std::atof( countToken.c_str() );

      // labels already seen within this group
      std::set<const Factor* > dedup;

      for ( std::vector<std::string>::iterator constituentIter = constituents.begin();
            constituentIter != constituents.end(); ++constituentIter ) {

        const Factor* constituentFactor = factorCollection.AddFactor(*constituentIter,false);

        std::pair< std::set<const Factor* >::iterator, bool > dedupIns =
          dedup.insert(constituentFactor);
        if ( dedupIns.second ) {

          // first occurrence in this group: create the entry or add to it
          std::pair< TargetConstituentBoundariesRightAdjacentCollection::iterator, bool > inserted =
            m_constituentsCollection.insert(std::make_pair(constituentFactor,count));
          if ( !inserted.second ) {
            (inserted.first)->second += count;
          }
        }
      }

    } catch (const std::exception &) {
      UTIL_THROW2("TargetConstituentBoundariesRightAdjacentPhraseProperty: Read error. Flawed property? " << value);
    }
  }
}
void TargetConstituentBoundariesRightAdjacentPhraseProperty::Print(std::ostream& out) const
{
for ( TargetConstituentBoundariesRightAdjacentCollection::const_iterator it = m_constituentsCollection.begin();
it != m_constituentsCollection.end(); ++it ) {
if ( it != m_constituentsCollection.begin() ) {
out << " ";
}
out << *(it->first) << " " << it->second;
}
}
} // namespace Moses

View File

@ -0,0 +1,40 @@
#pragma once
#include "moses/PP/PhraseProperty.h"
#include "moses/Factor.h"
#include "util/exception.hh"
#include <map>
#include <string>
namespace Moses
{
// Maps a constituent label (as a factor) to the float value accumulated for it
// by TargetConstituentBoundariesRightAdjacentPhraseProperty::ProcessValue.
typedef std::map<const Factor*, float> TargetConstituentBoundariesRightAdjacentCollection;
// Phrase property "TargetConstituentBoundariesRightAdjacent": a collection of
// constituent labels with associated values, parsed from the property string.
class TargetConstituentBoundariesRightAdjacentPhraseProperty : public PhraseProperty
{
public:
TargetConstituentBoundariesRightAdjacentPhraseProperty()
{};
// Parses the property string and fills the collection.
virtual void ProcessValue(const std::string &value);
// Read-only access to the parsed collection.
const TargetConstituentBoundariesRightAdjacentCollection &GetCollection() const {
return m_constituentsCollection;
};
// Not supported by this property: always raises an error.
virtual const std::string *GetValueString() const {
UTIL_THROW2("TargetConstituentBoundariesRightAdjacentPhraseProperty: value string not available in this phrase property");
// presumably unreachable; kept so the function has a return statement
return NULL;
};
protected:
// Prints the collection as space-separated "label value" pairs.
virtual void Print(std::ostream& out) const;
TargetConstituentBoundariesRightAdjacentCollection m_constituentsCollection;
};
} // namespace Moses

View File

@ -311,12 +311,14 @@ std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &ke
std::ostringstream oss; std::ostringstream oss;
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin(); for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
iter!=allPropertyValues->end(); ++iter) { iter!=allPropertyValues->end(); ++iter) {
if (iter!=allPropertyValues->begin()) { if (!(iter->first).empty()) {
if (iter!=allPropertyValues->begin()) {
oss << " ";
}
oss << iter->first;
oss << " "; oss << " ";
oss << iter->second;
} }
oss << iter->first;
oss << " ";
oss << iter->second;
} }
std::string allPropertyValuesString(oss.str()); std::string allPropertyValuesString(oss.str());

View File

@ -50,6 +50,8 @@ private:
bool onlyOutputSpanInfo; bool onlyOutputSpanInfo;
bool gzOutput; bool gzOutput;
std::string instanceWeightsFile; //weights for each sentence std::string instanceWeightsFile; //weights for each sentence
bool targetConstituentConstrainedFlag;
bool targetConstituentBoundariesFlag;
bool flexScoreFlag; bool flexScoreFlag;
bool singleWordHeuristicFlag; bool singleWordHeuristicFlag;
@ -73,6 +75,8 @@ public:
includeSentenceIdFlag(false), includeSentenceIdFlag(false),
onlyOutputSpanInfo(false), onlyOutputSpanInfo(false),
gzOutput(false), gzOutput(false),
targetConstituentConstrainedFlag(false),
targetConstituentBoundariesFlag(false),
flexScoreFlag(false), flexScoreFlag(false),
singleWordHeuristicFlag(false), singleWordHeuristicFlag(false),
debug(false) { debug(false) {
@ -118,6 +122,12 @@ public:
void initInstanceWeightsFile(const char* initInstanceWeightsFile) { void initInstanceWeightsFile(const char* initInstanceWeightsFile) {
instanceWeightsFile = std::string(initInstanceWeightsFile); instanceWeightsFile = std::string(initInstanceWeightsFile);
} }
// Sets the flag that the extract tool enables for --TargetConstituentConstrained.
void initTargetConstituentConstrainedFlag(const bool initTargetConstituentConstrainedFlag) {
targetConstituentConstrainedFlag = initTargetConstituentConstrainedFlag;
}
// Sets the flag that the extract tool enables for --TargetConstituentBoundaries.
void initTargetConstituentBoundariesFlag(const bool initTargetConstituentBoundariesFlag) {
targetConstituentBoundariesFlag = initTargetConstituentBoundariesFlag;
}
void initFlexScoreFlag(const bool initflexScoreFlag) { void initFlexScoreFlag(const bool initflexScoreFlag) {
flexScoreFlag=initflexScoreFlag; flexScoreFlag=initflexScoreFlag;
} }
@ -165,6 +175,12 @@ public:
std::string getInstanceWeightsFile() const { std::string getInstanceWeightsFile() const {
return instanceWeightsFile; return instanceWeightsFile;
} }
// Returns the current value of targetConstituentConstrainedFlag (false unless set).
bool isTargetConstituentConstrainedFlag() const {
return targetConstituentConstrainedFlag;
}
// Returns the current value of targetConstituentBoundariesFlag (false unless set).
bool isTargetConstituentBoundariesFlag() const {
return targetConstituentBoundariesFlag;
}
bool isFlexScoreFlag() const { bool isFlexScoreFlag() const {
return flexScoreFlag; return flexScoreFlag;
} }

View File

@ -47,6 +47,8 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos); SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos);
m_nodes.push_back( newNode ); m_nodes.push_back( newNode );
m_index[ startPos ][ endPos ].push_back( newNode ); m_index[ startPos ][ endPos ].push_back( newNode );
m_endPositionsIndex[ endPos ].push_back( newNode );
m_startPositionsIndex[ startPos ].push_back( newNode ); // TODO: may not need this: access m_index by startPos and iterate over its InnerNodeIndex (= end positions)?
m_numWords = std::max(endPos+1, m_numWords); m_numWords = std::max(endPos+1, m_numWords);
return newNode; return newNode;
} }
@ -70,6 +72,36 @@ const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes(
return endIndex->second; return endIndex->second;
} }
bool SyntaxNodeCollection::HasNodeStartingAtPosition( int startPos ) const
{
return GetNodesByStartPosition(startPos).size() > 0;
}
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByStartPosition(
int startPos ) const
{
InnerNodeIndex::const_iterator startIndex = m_startPositionsIndex.find( startPos );
if (startIndex == m_startPositionsIndex.end() )
return m_emptyNode;
return startIndex->second;
}
bool SyntaxNodeCollection::HasNodeEndingAtPosition( int endPos ) const
{
return GetNodesByEndPosition(endPos).size() > 0;
}
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByEndPosition(
int endPos ) const
{
InnerNodeIndex::const_iterator endIndex = m_endPositionsIndex.find( endPos );
if (endIndex == m_endPositionsIndex.end() )
return m_emptyNode;
return endIndex->second;
}
std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree() std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
{ {
std::map<SyntaxNode *, SyntaxTree *> nodeToTree; std::map<SyntaxNode *, SyntaxTree *> nodeToTree;

View File

@ -50,6 +50,11 @@ public:
//! Lookup the SyntaxNodes for a given span. //! Lookup the SyntaxNodes for a given span.
const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const; const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
bool HasNodeStartingAtPosition( int startPos ) const;
const std::vector< SyntaxNode* >& GetNodesByStartPosition( int startPos ) const;
bool HasNodeEndingAtPosition( int endPos ) const;
const std::vector< SyntaxNode* >& GetNodesByEndPosition( int endPos ) const;
//! Get a vector of pointers to all SyntaxNodes (unordered). //! Get a vector of pointers to all SyntaxNodes (unordered).
const std::vector< SyntaxNode* >& GetAllNodes() { const std::vector< SyntaxNode* >& GetAllNodes() {
return m_nodes; return m_nodes;
@ -78,6 +83,9 @@ private:
NodeIndex m_index; NodeIndex m_index;
int m_numWords; int m_numWords;
std::vector< SyntaxNode* > m_emptyNode; std::vector< SyntaxNode* > m_emptyNode;
InnerNodeIndex m_endPositionsIndex;
InnerNodeIndex m_startPositionsIndex;
}; };
} // namespace MosesTraining } // namespace MosesTraining

View File

@ -1,11 +1,3 @@
/*
* extract.cpp
* Modified by: Rohit Gupta CDAC, Mumbai, India
* on July 15, 2012 to implement parallel processing
* Modified by: Nadi Tomeh - LIMSI/CNRS
* Machine Translation Marathon 2010, Dublin
*/
#include <cstdio> #include <cstdio>
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
@ -20,11 +12,12 @@
#include <vector> #include <vector>
#include <limits> #include <limits>
#include "SentenceAlignment.h"
#include "tables-core.h" #include "tables-core.h"
#include "InputFileStream.h" #include "InputFileStream.h"
#include "OutputFileStream.h" #include "OutputFileStream.h"
#include "PhraseExtractionOptions.h" #include "PhraseExtractionOptions.h"
#include "SentenceAlignmentWithSyntax.h"
#include "SyntaxNode.h"
using namespace std; using namespace std;
using namespace MosesTraining; using namespace MosesTraining;
@ -46,14 +39,14 @@ typedef vector < HPhrase > HPhraseVector;
// The key of the map is the English index and the value is a set of the source ones // The key of the map is the English index and the value is a set of the source ones
typedef map <int, set<int> > HSentenceVertices; typedef map <int, set<int> > HSentenceVertices;
REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, REO_POS getOrientWordModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int, int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int)); bool (*)(int, int), bool (*)(int, int));
REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, REO_POS getOrientPhraseModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int, int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int), bool (*)(int, int), bool (*)(int, int),
const HSentenceVertices &, const HSentenceVertices &); const HSentenceVertices &, const HSentenceVertices &);
REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, REO_POS getOrientHierModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int, int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int), bool (*)(int, int), bool (*)(int, int),
const HSentenceVertices &, const HSentenceVertices &, const HSentenceVertices &, const HSentenceVertices &,
@ -69,7 +62,7 @@ bool ge(int, int);
bool le(int, int); bool le(int, int);
bool lt(int, int); bool lt(int, int);
bool isAligned (SentenceAlignment &, int, int); bool isAligned (SentenceAlignmentWithSyntax &, int, int);
int sentenceOffset = 0; int sentenceOffset = 0;
@ -87,7 +80,7 @@ class ExtractTask
{ {
public: public:
ExtractTask( ExtractTask(
size_t id, SentenceAlignment &sentence, size_t id, SentenceAlignmentWithSyntax &sentence,
PhraseExtractionOptions &initoptions, PhraseExtractionOptions &initoptions,
Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFile,
Moses::OutputFileStream &extractFileInv, Moses::OutputFileStream &extractFileInv,
@ -109,14 +102,17 @@ private:
vector< string > m_extractedPhrasesSid; vector< string > m_extractedPhrasesSid;
vector< string > m_extractedPhrasesContext; vector< string > m_extractedPhrasesContext;
vector< string > m_extractedPhrasesContextInv; vector< string > m_extractedPhrasesContextInv;
void extractBase(SentenceAlignment &); void extractBase(SentenceAlignmentWithSyntax &);
void extract(SentenceAlignment &); void extract(SentenceAlignmentWithSyntax &);
void addPhrase(SentenceAlignment &, int, int, int, int, string &); void addPhrase(const SentenceAlignmentWithSyntax &, int, int, int, int, const std::string &, const std::string &);
void writePhrasesToFile(); void writePhrasesToFile();
bool checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF); bool checkPlaceholders (const SentenceAlignmentWithSyntax &sentence, int startE, int endE, int startF, int endF);
bool isPlaceholder(const string &word); bool isPlaceholder(const string &word);
bool checkTargetConstituentBoundaries(const SentenceAlignmentWithSyntax &sentence,
int startE, int endE, int startF, int endF,
std::string &phrasePropertiesString);
SentenceAlignment &m_sentence; SentenceAlignmentWithSyntax &m_sentence;
const PhraseExtractionOptions &m_options; const PhraseExtractionOptions &m_options;
Moses::OutputFileStream &m_extractFile; Moses::OutputFileStream &m_extractFile;
Moses::OutputFileStream &m_extractFileInv; Moses::OutputFileStream &m_extractFileInv;
@ -133,7 +129,8 @@ int main(int argc, char* argv[])
if (argc < 6) { if (argc < 6) {
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] "; cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n"; cerr << "| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ";
cerr << "| --TargetConstituentConstrained | --TargetConstituentBoundaries ]" << std::endl;
exit(1); exit(1);
} }
@ -153,6 +150,10 @@ int main(int argc, char* argv[])
options.initOnlyOutputSpanInfo(true); options.initOnlyOutputSpanInfo(true);
} else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) { } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
options.initOrientationFlag(true); options.initOrientationFlag(true);
} else if (strcmp(argv[i],"--TargetConstituentConstrained") == 0) {
options.initTargetConstituentConstrainedFlag(true);
} else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) {
options.initTargetConstituentBoundariesFlag(true);
} else if (strcmp(argv[i],"--FlexibilityScore") == 0) { } else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
options.initFlexScoreFlag(true); options.initFlexScoreFlag(true);
} else if (strcmp(argv[i],"--SingleWordHeuristic") == 0) { } else if (strcmp(argv[i],"--SingleWordHeuristic") == 0) {
@ -280,6 +281,11 @@ int main(int argc, char* argv[])
extractFileContextInv.Open(fileNameExtractContextInv.c_str()); extractFileContextInv.Open(fileNameExtractContextInv.c_str());
} }
// stats on labels for glue grammar and unknown word label probabilities
set< string > targetLabelCollection, sourceLabelCollection;
map< string, int > targetTopLabelCollection, sourceTopLabelCollection;
const bool targetSyntax = true;
int i = sentenceOffset; int i = sentenceOffset;
string englishString, foreignString, alignmentString, weightString; string englishString, foreignString, alignmentString, weightString;
@ -295,7 +301,10 @@ int main(int argc, char* argv[])
getline(*iwFileP, weightString); getline(*iwFileP, weightString);
} }
SentenceAlignment sentence; SentenceAlignmentWithSyntax sentence
(targetLabelCollection, sourceLabelCollection,
targetTopLabelCollection, sourceTopLabelCollection,
targetSyntax, false);
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
//az: output src, tgt, and alingment line //az: output src, tgt, and alingment line
if (options.isOnlyOutputSpanInfo()) { if (options.isOnlyOutputSpanInfo()) {
@ -360,7 +369,7 @@ void ExtractTask::Run()
} }
void ExtractTask::extract(SentenceAlignment &sentence) void ExtractTask::extract(SentenceAlignmentWithSyntax &sentence)
{ {
int countE = sentence.target.size(); int countE = sentence.target.size();
int countF = sentence.source.size(); int countF = sentence.source.size();
@ -454,7 +463,15 @@ void ExtractTask::extract(SentenceAlignment &sentence)
// if(m_options.isAllModelsOutputFlag()) // if(m_options.isAllModelsOutputFlag())
// " | | "; // " | | ";
} }
addPhrase(sentence, startE, endE, startF, endF, orientationInfo); std::string phrasePropertiesString;
bool doAdd = !m_options.isTargetConstituentBoundariesFlag();
if (m_options.isTargetConstituentBoundariesFlag() || m_options.isTargetConstituentConstrainedFlag()) {
bool isTargetConstituentCovered = checkTargetConstituentBoundaries(sentence, startE, endE, startF, endF, phrasePropertiesString);
doAdd = doAdd || isTargetConstituentCovered;
}
if (doAdd) {
addPhrase(sentence, startE, endE, startF, endF, orientationInfo, phrasePropertiesString);
}
} }
} }
} }
@ -510,12 +527,20 @@ void ExtractTask::extract(SentenceAlignment &sentence)
((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " + ((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : ""); ((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
addPhrase(sentence, startE, endE, startF, endF, orientationInfo); std::string phrasePropertiesString;
bool doAdd = !m_options.isTargetConstituentBoundariesFlag();
if (m_options.isTargetConstituentBoundariesFlag() || m_options.isTargetConstituentConstrainedFlag()) {
bool isTargetConstituentCovered = checkTargetConstituentBoundaries(sentence, startE, endE, startF, endF, phrasePropertiesString);
doAdd = doAdd || isTargetConstituentCovered;
}
if (doAdd) {
addPhrase(sentence, startE, endE, startF, endF, orientationInfo, phrasePropertiesString);
}
} }
} }
} }
REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType, REO_POS getOrientWordModel(SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
bool connectedLeftTop, bool connectedRightTop, bool connectedLeftTop, bool connectedRightTop,
int startF, int endF, int startE, int endE, int countF, int zero, int unit, int startF, int endF, int startE, int endE, int countF, int zero, int unit,
bool (*ge)(int, int), bool (*lt)(int, int) ) bool (*ge)(int, int), bool (*lt)(int, int) )
@ -541,7 +566,7 @@ REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelTyp
} }
// to be called with countF-1 instead of countF // to be called with countF-1 instead of countF
REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType, REO_POS getOrientPhraseModel (SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
bool connectedLeftTop, bool connectedRightTop, bool connectedLeftTop, bool connectedRightTop,
int startF, int endF, int startE, int endE, int countF, int zero, int unit, int startF, int endF, int startE, int endE, int countF, int zero, int unit,
bool (*ge)(int, int), bool (*lt)(int, int), bool (*ge)(int, int), bool (*lt)(int, int),
@ -577,7 +602,7 @@ REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE model
} }
// to be called with countF-1 instead of countF // to be called with countF-1 instead of countF
REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType, REO_POS getOrientHierModel (SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
bool connectedLeftTop, bool connectedRightTop, bool connectedLeftTop, bool connectedRightTop,
int startF, int endF, int startE, int endE, int countF, int zero, int unit, int startF, int endF, int startE, int endE, int countF, int zero, int unit,
bool (*ge)(int, int), bool (*lt)(int, int), bool (*ge)(int, int), bool (*lt)(int, int),
@ -629,7 +654,7 @@ REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelTy
return UNKNOWN; return UNKNOWN;
} }
bool isAligned ( SentenceAlignment &sentence, int fi, int ei ) bool isAligned ( SentenceAlignmentWithSyntax &sentence, int fi, int ei )
{ {
if (ei == -1 && fi == -1) if (ei == -1 && fi == -1)
return true; return true;
@ -715,8 +740,138 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
} }
return ""; return "";
} }
void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo) bool ExtractTask::checkTargetConstituentBoundaries( const SentenceAlignmentWithSyntax &sentence,
int startE, int endE, int startF, int endF,
std::string &phrasePropertiesString)
{
// Tests whether the target span [startE, endE] coincides with a node of
// sentence.targetTree and, when the TargetConstituentBoundaries option is on,
// appends two phrase properties to phrasePropertiesString:
//   {{TargetConstituentBoundariesLeft ...}}          labels of the nodes starting at startE
//   {{TargetConstituentBoundariesRightAdjacent ...}} labels of the nodes starting at endE+1
// Labels within one property are separated by "<".
//
// Returns false only when the TargetConstituentConstrained option is on and
// neither the span itself nor its punctuation-trimmed variant matches a node;
// in that case phrasePropertiesString is left untouched. Returns true otherwise.
// startF and endF are not read in this function.
ostringstream outextractstrPhrasePropertyTargetConstituentBoundariesLeft;
if (m_options.isTargetConstituentBoundariesFlag()) {
outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "{{TargetConstituentBoundariesLeft ";
}
bool validTargetConstituentBoundaries = false;
// Tracks whether anything has been written to the current property value yet;
// reused later for the right-adjacent property.
bool outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = true;
if (m_options.isTargetConstituentBoundariesFlag()) {
if (startE==0) {
// Span starts at the first target position: emit a "BOS_" marker. Clearing the
// "first" flag makes the first real label below get a "<" separator after it.
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "BOS_";
}
}
if (!sentence.targetTree.HasNodeStartingAtPosition(startE)) {
validTargetConstituentBoundaries = false;
} else {
const std::vector< SyntaxNode* >& startingNodes = sentence.targetTree.GetNodesByStartPosition(startE);
// Walk the nodes starting at startE in reverse of their stored order.
for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = startingNodes.rbegin(); iter != startingNodes.rend(); ++iter ) {
if ( (*iter)->end == endE ) {
validTargetConstituentBoundaries = true;
// Without property output, one exact match is all that is needed.
if (!m_options.isTargetConstituentBoundariesFlag()) {
break;
}
}
// With property output, every node starting at startE contributes its label,
// not only the ones ending at endE.
if (m_options.isTargetConstituentBoundariesFlag()) {
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
} else {
outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "<";
}
outextractstrPhrasePropertyTargetConstituentBoundariesLeft << (*iter)->label;
}
}
}
if (m_options.isTargetConstituentBoundariesFlag()) {
// Nothing was written (no BOS_ marker and no labels): emit a lone "<" as the value.
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "<";
}
outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "}}";
}
if (m_options.isTargetConstituentConstrainedFlag() && !validTargetConstituentBoundaries) {
// skip over all boundary punctuation and check again
bool relaxedValidTargetConstituentBoundaries = false;
int relaxedStartE = startE;
int relaxedEndE = endE;
const std::string punctuation = ",;.:!?";
// Only tokens of size 1 whose single character is in `punctuation` are trimmed.
// The start is advanced at most up to endE.
while ( (relaxedStartE < endE) &&
(sentence.target[relaxedStartE].size() == 1) &&
(punctuation.find(sentence.target[relaxedStartE].at(0)) != std::string::npos) ) {
++relaxedStartE;
}
// The end is pulled back at most down to the (possibly advanced) start.
while ( (relaxedEndE > relaxedStartE) &&
(sentence.target[relaxedEndE].size() == 1) &&
(punctuation.find(sentence.target[relaxedEndE].at(0)) != std::string::npos) ) {
--relaxedEndE;
}
// Re-check only if trimming changed the span; otherwise the result stays false.
if ( (relaxedStartE != startE) || (relaxedEndE !=endE) ) {
// NOTE(review): unlike the first lookup, this call is not guarded by
// HasNodeStartingAtPosition() - confirm GetNodesByStartPosition() is safe
// for positions at which no node starts.
const std::vector< SyntaxNode* >& startingNodes = sentence.targetTree.GetNodesByStartPosition(relaxedStartE);
for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = startingNodes.rbegin();
(iter != startingNodes.rend() && !relaxedValidTargetConstituentBoundaries);
++iter ) {
if ( (*iter)->end == relaxedEndE ) {
relaxedValidTargetConstituentBoundaries = true;
}
}
}
if (!relaxedValidTargetConstituentBoundaries) {
return false;
}
}
if (m_options.isTargetConstituentBoundariesFlag()) {
ostringstream outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent;
outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "{{TargetConstituentBoundariesRightAdjacent ";
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = true;
// NOTE(review): compares the int endE against size()-1; presumably size() is
// unsigned, so this relies on sentence.target being non-empty - confirm.
if (endE==sentence.target.size()-1) {
// Span ends at the last target position: emit "EOS_" instead of labels.
outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "EOS_";
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
} else {
// NOTE(review): also not guarded by HasNodeStartingAtPosition() - confirm.
const std::vector< SyntaxNode* >& adjacentNodes = sentence.targetTree.GetNodesByStartPosition(endE+1);
for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = adjacentNodes.rbegin(); iter != adjacentNodes.rend(); ++iter ) {
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
} else {
outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "<";
}
outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << (*iter)->label;
}
}
// No labels written and not at the last position: emit a lone "<" as the value.
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "<";
}
outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "}}";
// Append both properties, each preceded by a single space.
phrasePropertiesString += " ";
phrasePropertiesString += outextractstrPhrasePropertyTargetConstituentBoundariesLeft.str();
phrasePropertiesString += " ";
phrasePropertiesString += outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent.str();
}
return true;
}
void ExtractTask::addPhrase( const SentenceAlignmentWithSyntax &sentence,
int startE, int endE, int startF, int endF,
const std::string &orientationInfo,
const std::string &phrasePropertiesString)
{ {
// source // source
// // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n"; // // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
@ -746,11 +901,18 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
if (m_options.isTranslationFlag()) outextractstr << "||| "; if (m_options.isTranslationFlag()) outextractstr << "||| ";
if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
// target // target
for(int ei=startE; ei<=endE; ei++) { for(int ei=startE; ei<=endE; ei++) {
if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " "; if (m_options.isTranslationFlag()) {
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " "; outextractstr << sentence.target[ei] << " ";
outextractstrInv << sentence.target[ei] << " ";
}
if (m_options.isOrientationFlag()) {
outextractstrOrientation << sentence.target[ei] << " ";
}
} }
if (m_options.isTranslationFlag()) outextractstr << "|||"; if (m_options.isTranslationFlag()) outextractstr << "|||";
if (m_options.isTranslationFlag()) outextractstrInv << "||| "; if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
@ -792,7 +954,7 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
} }
} }
outextractstr << phrasePropertiesString;
// generate two lines for every extracted phrase: // generate two lines for every extracted phrase:
// once with left, once with right context // once with left, once with right context
@ -901,7 +1063,7 @@ void ExtractTask::writePhrasesToFile()
// if proper conditioning, we need the number of times a source phrase occurred
void ExtractTask::extractBase( SentenceAlignment &sentence ) void ExtractTask::extractBase( SentenceAlignmentWithSyntax &sentence )
{ {
ostringstream outextractFile; ostringstream outextractFile;
ostringstream outextractFileInv; ostringstream outextractFileInv;
@ -935,7 +1097,7 @@ void ExtractTask::extractBase( SentenceAlignment &sentence )
} }
bool ExtractTask::checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF) bool ExtractTask::checkPlaceholders (const SentenceAlignmentWithSyntax &sentence, int startE, int endE, int startF, int endF)
{ {
for (size_t pos = startF; pos <= endF; ++pos) { for (size_t pos = startF; pos <= endF; ++pos) {
const string &sourceWord = sentence.source[pos]; const string &sourceWord = sentence.source[pos];

View File

@ -68,6 +68,7 @@ bool spanLength = false;
bool ruleLength = false; bool ruleLength = false;
bool nonTermContext = false; bool nonTermContext = false;
bool nonTermContextTarget = false; bool nonTermContextTarget = false;
bool targetConstituentBoundariesFlag = false;
int countOfCounts[COC_MAX+1]; int countOfCounts[COC_MAX+1];
int totalDistinct = 0; int totalDistinct = 0;
@ -286,6 +287,9 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--NonTermContextTarget") == 0) { } else if (strcmp(argv[i],"--NonTermContextTarget") == 0) {
nonTermContextTarget = true; nonTermContextTarget = true;
std::cerr << "non-term context (target)" << std::endl; std::cerr << "non-term context (target)" << std::endl;
} else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) {
targetConstituentBoundariesFlag = true;
std::cerr << "including target constituent boundaries information" << std::endl;
} else { } else {
featureArgs.push_back(argv[i]); featureArgs.push_back(argv[i]);
++i; ++i;
@ -957,6 +961,18 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
} }
} }
// target constituent boundaries
if (targetConstituentBoundariesFlag && !inverseFlag) {
const std::string targetConstituentBoundariesLeftValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesLeft");
if (!targetConstituentBoundariesLeftValues.empty()) {
phraseTableFile << " {{TargetConstituentBoundariesLeft " << targetConstituentBoundariesLeftValues << "}}";
}
const std::string targetConstituentBoundariesRightAdjacentValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesRightAdjacent");
if (!targetConstituentBoundariesRightAdjacentValues.empty()) {
phraseTableFile << " {{TargetConstituentBoundariesRightAdjacent " << targetConstituentBoundariesRightAdjacentValues << "}}";
}
}
phraseTableFile << std::endl; phraseTableFile << std::endl;
} }

View File

@ -2407,6 +2407,12 @@ sub define_training_extract_phrases {
if (&get("TRAINING:ghkm-strip-bitpar-nonterminal-labels")) { if (&get("TRAINING:ghkm-strip-bitpar-nonterminal-labels")) {
$cmd .= "-ghkm-strip-bitpar-nonterminal-labels "; $cmd .= "-ghkm-strip-bitpar-nonterminal-labels ";
} }
} else { # !hierarchical-rule-set
if (&get("TRAINING:target-constituent-boundaries")) {
$cmd .= "-target-constituent-boundaries ";
}
} }
my $extract_settings = &get("TRAINING:extract-settings"); my $extract_settings = &get("TRAINING:extract-settings");
@ -2464,6 +2470,12 @@ sub define_training_build_ttable {
my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model","")); my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model",""));
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file "; $cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
} }
} else { # !hierarchical-rule-set
if (&get("TRAINING:target-constituent-boundaries")) {
$cmd .= "-target-constituent-boundaries ";
}
} }
&create_step($step_id,$cmd); &create_step($step_id,$cmd);
@ -2678,6 +2690,10 @@ sub define_training_create_config {
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file "; $cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
} }
if (&get("TRAINING:target-constituent-boundaries")) {
$cmd .= "-target-constituent-boundaries ";
}
# sparse lexical features provide additional content for config file # sparse lexical features provide additional content for config file
my @additional_ini_files; my @additional_ini_files;
push (@additional_ini_files, "$sparse_lexical_features.ini") if $sparse_lexical_features; push (@additional_ini_files, "$sparse_lexical_features.ini") if $sparse_lexical_features;

View File

@ -134,6 +134,7 @@ my($_EXTERNAL_BINDIR,
$_LMODEL_OOV_FEATURE, $_LMODEL_OOV_FEATURE,
$_NUM_LATTICE_FEATURES, $_NUM_LATTICE_FEATURES,
$IGNORE, $IGNORE,
$_TARGET_CONSTITUENT_BOUNDARIES,
$_FLEXIBILITY_SCORE, $_FLEXIBILITY_SCORE,
$_FEATURE_LINES, $_FEATURE_LINES,
$_WEIGHT_LINES, $_WEIGHT_LINES,
@ -258,6 +259,7 @@ $_HELP = 1
'instance-weights-file=s' => \$_INSTANCE_WEIGHTS_FILE, 'instance-weights-file=s' => \$_INSTANCE_WEIGHTS_FILE,
'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE, 'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE,
'num-lattice-features=i' => \$_NUM_LATTICE_FEATURES, 'num-lattice-features=i' => \$_NUM_LATTICE_FEATURES,
'target-constituent-boundaries' => \$_TARGET_CONSTITUENT_BOUNDARIES,
'flexibility-score' => \$_FLEXIBILITY_SCORE, 'flexibility-score' => \$_FLEXIBILITY_SCORE,
'config-add-feature-lines=s' => \$_FEATURE_LINES, 'config-add-feature-lines=s' => \$_FEATURE_LINES,
'config-add-weight-lines=s' => \$_WEIGHT_LINES, 'config-add-weight-lines=s' => \$_WEIGHT_LINES,
@ -1607,6 +1609,7 @@ sub extract_phrase {
$cmd .= " --GZOutput "; $cmd .= " --GZOutput ";
$cmd .= " --InstanceWeights $_INSTANCE_WEIGHTS_FILE " if defined $_INSTANCE_WEIGHTS_FILE; $cmd .= " --InstanceWeights $_INSTANCE_WEIGHTS_FILE " if defined $_INSTANCE_WEIGHTS_FILE;
$cmd .= " --BaselineExtract $_BASELINE_EXTRACT" if defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT =~ /extract-parallel.perl/; $cmd .= " --BaselineExtract $_BASELINE_EXTRACT" if defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT =~ /extract-parallel.perl/;
$cmd .= " --TargetConstituentBoundaries" if $_TARGET_CONSTITUENT_BOUNDARIES;
$cmd .= " --FlexibilityScore" if $_FLEXIBILITY_SCORE; $cmd .= " --FlexibilityScore" if $_FLEXIBILITY_SCORE;
$cmd .= " --NoTTable" if $_MMSAPT; $cmd .= " --NoTTable" if $_MMSAPT;
@ -1764,9 +1767,10 @@ sub score_phrase_phrase_extract {
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE); $cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
$cmd .= " --TargetSyntacticPreferences $_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE" if $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE); $cmd .= " --TargetSyntacticPreferences $_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE" if $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE);
$cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE); $cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE);
$cmd .= " --TargetConstituentBoundaries" if $_TARGET_CONSTITUENT_BOUNDARIES;
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
$cmd .= " $DOMAIN" if $DOMAIN; $cmd .= " $DOMAIN" if $DOMAIN;
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS); $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
# sorting # sorting
if ($direction eq "e2f" || $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2) { if ($direction eq "e2f" || $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2) {
@ -2386,6 +2390,7 @@ sub create_ini {
print INI " unknown-word-labels-file=$_UNKNOWN_WORD_LABEL_FILE" if defined($_UNKNOWN_WORD_LABEL_FILE); print INI " unknown-word-labels-file=$_UNKNOWN_WORD_LABEL_FILE" if defined($_UNKNOWN_WORD_LABEL_FILE);
print INI "\n"; print INI "\n";
} }
print INI "TargetConstituentAdjacencyFeature\n" if $_TARGET_CONSTITUENT_BOUNDARIES;
print INI $feature_spec; print INI $feature_spec;
print INI "\n# dense weights for feature functions\n"; print INI "\n# dense weights for feature functions\n";
@ -2398,6 +2403,7 @@ sub create_ini {
print INI "SoftSourceSyntacticConstraintsFeature0= -0.2 -0.2 -0.2 0.1 0.1 0.1\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE); print INI "SoftSourceSyntacticConstraintsFeature0= -0.2 -0.2 -0.2 0.1 0.1 0.1\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
print INI "PhraseOrientationFeature0= 0.05 0.05 0.05 0.05 0.05 0.05\n" if $_PHRASE_ORIENTATION; print INI "PhraseOrientationFeature0= 0.05 0.05 0.05 0.05 0.05 0.05\n" if $_PHRASE_ORIENTATION;
print INI "TargetPreferencesFeature0= 0.2 -0.2\n" if $_HIERARCHICAL && $_TARGET_SYNTAX && $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE); print INI "TargetPreferencesFeature0= 0.2 -0.2\n" if $_HIERARCHICAL && $_TARGET_SYNTAX && $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE);
print INI "TargetConstituentAdjacencyFeature0= 0.05 -0.1\n" if $_TARGET_CONSTITUENT_BOUNDARIES;
print INI $weight_spec; print INI $weight_spec;
close(INI); close(INI);
} }