mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-07-14 14:50:41 +03:00
Option for target constituent constrained phrase extraction. TargetConstituentAdjacencyFeature.
This commit is contained in:
parent
c75f9854e4
commit
1659d6b4c8
@ -42,6 +42,7 @@
|
||||
#include "moses/FF/ControlRecombination.h"
|
||||
#include "moses/FF/ConstrainedDecoding.h"
|
||||
#include "moses/FF/SoftSourceSyntacticConstraintsFeature.h"
|
||||
#include "moses/FF/TargetConstituentAdjacencyFeature.h"
|
||||
#include "moses/FF/TargetPreferencesFeature.h"
|
||||
#include "moses/FF/CoveredReferenceFeature.h"
|
||||
#include "moses/FF/TreeStructureFeature.h"
|
||||
@ -264,6 +265,7 @@ FeatureRegistry::FeatureRegistry()
|
||||
MOSES_FNAME(CoveredReferenceFeature);
|
||||
MOSES_FNAME(SourceGHKMTreeInputMatchFeature);
|
||||
MOSES_FNAME(SoftSourceSyntacticConstraintsFeature);
|
||||
MOSES_FNAME(TargetConstituentAdjacencyFeature);
|
||||
MOSES_FNAME(TargetPreferencesFeature);
|
||||
MOSES_FNAME(TreeStructureFeature);
|
||||
MOSES_FNAME(SoftMatchingFeature);
|
||||
|
189
moses/FF/TargetConstituentAdjacencyFeature.cpp
Normal file
189
moses/FF/TargetConstituentAdjacencyFeature.cpp
Normal file
@ -0,0 +1,189 @@
|
||||
#include "TargetConstituentAdjacencyFeature.h"
|
||||
#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h"
|
||||
#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h"
|
||||
#include "moses/StaticData.h"
|
||||
#include "moses/ScoreComponentCollection.h"
|
||||
#include "moses/Hypothesis.h"
|
||||
#include "moses/FactorCollection.h"
|
||||
#include "moses/TreeInput.h"
|
||||
#include <algorithm>
|
||||
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
size_t TargetConstituentAdjacencyFeatureState::hash() const
|
||||
{
|
||||
if (m_recombine) {
|
||||
return 0;
|
||||
}
|
||||
size_t ret = 0;
|
||||
boost::hash_combine(ret, m_collection.size());
|
||||
for (std::map<const Factor*, float>::const_iterator it=m_collection.begin();
|
||||
it!=m_collection.end(); ++it) {
|
||||
boost::hash_combine(ret, it->first);
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
|
||||
bool TargetConstituentAdjacencyFeatureState::operator==(const FFState& other) const
|
||||
{
|
||||
if (m_recombine) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (this == &other) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const TargetConstituentAdjacencyFeatureState* otherState =
|
||||
dynamic_cast<const TargetConstituentAdjacencyFeatureState*>(&other);
|
||||
UTIL_THROW_IF2(otherState == NULL, "Wrong state type");
|
||||
|
||||
if (m_collection.size() != (otherState->m_collection).size()) {
|
||||
return false;
|
||||
}
|
||||
std::map<const Factor*, float>::const_iterator thisIt, otherIt;
|
||||
for (thisIt=m_collection.begin(), otherIt=(otherState->m_collection).begin();
|
||||
thisIt!=m_collection.end(); ++thisIt, ++otherIt) {
|
||||
if (thisIt->first != otherIt->first) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
// Constructs the feature with two dense score components. Defaults are
// variant 0 and recombine == false; ReadParameters() is called afterwards
// (presumably this dispatches the configuration line to SetParameter --
// confirm against the base class). Progress is logged at verbosity level 1.
TargetConstituentAdjacencyFeature::TargetConstituentAdjacencyFeature(const std::string &line)
  : StatefulFeatureFunction(2, line)
  , m_featureVariant(0)
  , m_recombine(false)
{
  VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
  ReadParameters();
  VERBOSE(1, " Done." << std::endl
          << " Feature variant: " << m_featureVariant << "." << std::endl);
}
|
||||
|
||||
|
||||
void TargetConstituentAdjacencyFeature::SetParameter(const std::string& key, const std::string& value)
|
||||
{
|
||||
if (key == "variant") {
|
||||
m_featureVariant = Scan<size_t>(value);
|
||||
} else if (key == "recombine") {
|
||||
m_recombine = Scan<bool>(value);
|
||||
} else {
|
||||
StatefulFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Scores the newly applied target phrase against the previous hypothesis and
// builds the state for the extended hypothesis.
//
// The labels in the phrase's "TargetConstituentBoundariesLeft" property are
// matched against the labels stored in the previous state (or against the
// single label "BOS_" when the previous hypothesis is the initial one).
//
// Dense scores (two components):
//   [0] += TransformScore(prob) if variant == 1, otherwise
//          TransformScore(numMatch / numOverall); only when at least one label
//          matched and prob is non-zero.
//   [1] += 1 when nothing matched (or prob is zero), or when the phrase has no
//          "TargetConstituentBoundariesLeft" property.
//
// cur_hypo     hypothesis being scored; its previous hypothesis is dereferenced
//              without a null check.
// prev_state   state of the previous hypothesis; cast without a type check to
//              TargetConstituentAdjacencyFeatureState.
// accumulator  receives the dense scores.
//
// Returns a newly allocated state holding a copy of the phrase's
// "TargetConstituentBoundariesRightAdjacent" collection (empty if the property
// is absent).
// Aborts via UTIL_THROW_IF2 if either property is missing and the first target
// word is not an OOV.
FFState* TargetConstituentAdjacencyFeature::EvaluateWhenApplied(
  const Hypothesis& cur_hypo,
  const FFState* prev_state,
  ScoreComponentCollection* accumulator) const
{
  // dense scores
  std::vector<float> newScores(m_numScoreComponents,0); // m_numScoreComponents == 2

  // state
  const TargetConstituentAdjacencyFeatureState *prevState = static_cast<const TargetConstituentAdjacencyFeatureState*>(prev_state);

  // read TargetConstituentAdjacency property
  const TargetPhrase &currTarPhr = cur_hypo.GetCurrTargetPhrase();
  FEATUREVERBOSE(2, "Phrase: " << currTarPhr << std::endl);

  if (const PhraseProperty *property = currTarPhr.GetProperty("TargetConstituentBoundariesLeft")) {

    const TargetConstituentBoundariesLeftPhraseProperty *targetConstituentBoundariesLeftPhraseProperty =
      static_cast<const TargetConstituentBoundariesLeftPhraseProperty*>(property);
    const TargetConstituentBoundariesLeftCollection& leftConstituentCollection =
      targetConstituentBoundariesLeftPhraseProperty->GetCollection();
    float prob = 0;
    size_t numMatch = 0;
    size_t numOverall = 0;

    if ( !cur_hypo.GetPrevHypo()->GetPrevHypo() ) {
      // previous hypothesis is initial, i.e. target sentence starts here

      ++numOverall;
      FactorCollection &factorCollection = FactorCollection::Instance();
      const Factor* bosFactor = factorCollection.AddFactor("BOS_",false);
      TargetConstituentBoundariesLeftCollection::const_iterator found =
        leftConstituentCollection.find(bosFactor);
      if ( found != leftConstituentCollection.end() ) {
        ++numMatch;
        prob += found->second;
      }

    } else {

      // Both collections are maps ordered by Factor pointer, so the common
      // labels are found with a single merge-style pass.
      // NOTE(review): numOverall counts the steps of this pass; labels left
      // over in one collection once the other is exhausted are not counted.
      // Confirm that this is the intended denominator.
      const std::map<const Factor*, float>& hypConstituentCollection = prevState->m_collection;
      std::map<const Factor*, float>::const_iterator iter1 = hypConstituentCollection.begin();
      std::map<const Factor*, float>::const_iterator iter2 = leftConstituentCollection.begin();
      while ( iter1 != hypConstituentCollection.end() && iter2 != leftConstituentCollection.end() ) {
        ++numOverall;
        if ( iter1->first < iter2->first ) {
          ++iter1;
        } else if ( iter2->first < iter1->first ) {
          ++iter2;
        } else {
          ++numMatch;
          // keep the best product over all matching labels
          float currProb = iter1->second * iter2->second;
          if (currProb > prob)
            prob = currProb;
          ++iter1;
          ++iter2;
        }
      }
    }

    if ( (numMatch == 0) || (prob == 0) ) {
      ++newScores[1];
    } else {
      if ( m_featureVariant == 1 ) {
        newScores[0] += TransformScore(prob);
      } else {
        // numOverall >= numMatch > 0 here, so the division is safe
        newScores[0] += TransformScore( static_cast<float>(numMatch)/numOverall );
      }
    }

  } else {

    // abort with error message if the phrase does not translate an unknown word
    UTIL_THROW_IF2(!currTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription()
                   << ": Missing TargetConstituentBoundariesLeft property.");

    ++newScores[1];

  }

  // Look up and validate the right-adjacent property BEFORE allocating the new
  // state, so that the abort below cannot leak a freshly allocated state.
  const PhraseProperty *rightAdjacentProperty =
    currTarPhr.GetProperty("TargetConstituentBoundariesRightAdjacent");

  // abort with error message if the phrase does not translate an unknown word
  UTIL_THROW_IF2(rightAdjacentProperty == NULL && !currTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription()
                 << ": Missing TargetConstituentBoundariesRightAdjacent property.");

  // add scores
  accumulator->PlusEquals(this, newScores);

  TargetConstituentAdjacencyFeatureState *newState = new TargetConstituentAdjacencyFeatureState(m_recombine);

  if (rightAdjacentProperty != NULL) {
    const TargetConstituentBoundariesRightAdjacentPhraseProperty *targetConstituentBoundariesRightAdjacentPhraseProperty =
      static_cast<const TargetConstituentBoundariesRightAdjacentPhraseProperty*>(rightAdjacentProperty);
    const TargetConstituentBoundariesRightAdjacentCollection& rightAdjacentConstituentCollection =
      targetConstituentBoundariesRightAdjacentPhraseProperty->GetCollection();

    // the new state's map is empty, so a range insert is a plain copy
    newState->m_collection.insert(rightAdjacentConstituentCollection.begin(),
                                  rightAdjacentConstituentCollection.end());
  }

  return newState;
}
|
||||
|
||||
}
|
||||
|
101
moses/FF/TargetConstituentAdjacencyFeature.h
Normal file
101
moses/FF/TargetConstituentAdjacencyFeature.h
Normal file
@ -0,0 +1,101 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <iostream>
|
||||
#include "StatefulFeatureFunction.h"
|
||||
#include "FFState.h"
|
||||
#include "util/exception.hh"
|
||||
#include <stdint.h>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class TargetConstituentAdjacencyFeatureState : public FFState
|
||||
{
|
||||
|
||||
public:
|
||||
|
||||
friend class TargetConstituentAdjacencyFeature;
|
||||
|
||||
TargetConstituentAdjacencyFeatureState(bool recombine)
|
||||
: m_recombine(recombine)
|
||||
{};
|
||||
|
||||
size_t hash() const;
|
||||
|
||||
virtual bool operator==(const FFState& other) const;
|
||||
|
||||
private:
|
||||
|
||||
const bool m_recombine;
|
||||
std::map<const Factor*, float> m_collection;
|
||||
|
||||
};
|
||||
|
||||
|
||||
class TargetConstituentAdjacencyFeature : public StatefulFeatureFunction
|
||||
{
|
||||
|
||||
public:
|
||||
|
||||
TargetConstituentAdjacencyFeature(const std::string &line);
|
||||
|
||||
~TargetConstituentAdjacencyFeature()
|
||||
{};
|
||||
|
||||
bool IsUseable(const FactorMask &mask) const {
|
||||
return true;
|
||||
};
|
||||
|
||||
virtual const FFState* EmptyHypothesisState(const InputType &input) const {
|
||||
return new TargetConstituentAdjacencyFeatureState(m_recombine);
|
||||
};
|
||||
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
|
||||
void Load(AllOptions::ptr const& opts)
|
||||
{};
|
||||
|
||||
void EvaluateInIsolation(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{};
|
||||
|
||||
void EvaluateWithSourceContext(const InputType &input
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, const StackVec *stackVec
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore = NULL) const
|
||||
{};
|
||||
|
||||
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
||||
, const TranslationOptionList &translationOptionList) const
|
||||
{};
|
||||
|
||||
FFState* EvaluateWhenApplied(
|
||||
const Hypothesis& cur_hypo,
|
||||
const FFState* prev_state,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
|
||||
FFState* EvaluateWhenApplied(
|
||||
const ChartHypothesis& cur_hypo,
|
||||
int featureID, // used to index the state in the previous hypotheses
|
||||
ScoreComponentCollection* accumulator) const {
|
||||
UTIL_THROW2(GetScoreProducerDescription() << ": feature currently not implemented for chart-based decoding.");
|
||||
return new TargetConstituentAdjacencyFeatureState(m_recombine);
|
||||
};
|
||||
|
||||
|
||||
private:
|
||||
|
||||
size_t m_featureVariant;
|
||||
bool m_recombine;
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -19,8 +19,7 @@ License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#ifndef moses_Factor_h
|
||||
#define moses_Factor_h
|
||||
#pragma once
|
||||
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
@ -98,4 +97,4 @@ public:
|
||||
size_t hash_value(const Factor &f);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -11,6 +11,8 @@
|
||||
#include "moses/PP/SpanLengthPhraseProperty.h"
|
||||
#include "moses/PP/NonTermContextProperty.h"
|
||||
#include "moses/PP/OrientationPhraseProperty.h"
|
||||
#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h"
|
||||
#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -58,6 +60,8 @@ PhrasePropertyFactory::PhrasePropertyFactory()
|
||||
|
||||
MOSES_PNAME2("Counts", CountsPhraseProperty);
|
||||
MOSES_PNAME2("SourceLabels", SourceLabelsPhraseProperty);
|
||||
MOSES_PNAME2("TargetConstituentBoundariesLeft", TargetConstituentBoundariesLeftPhraseProperty);
|
||||
MOSES_PNAME2("TargetConstituentBoundariesRightAdjacent", TargetConstituentBoundariesRightAdjacentPhraseProperty);
|
||||
MOSES_PNAME2("TargetPreferences", TargetPreferencesPhraseProperty);
|
||||
MOSES_PNAME2("Tree",TreeStructurePhraseProperty);
|
||||
MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty);
|
||||
|
@ -5,9 +5,14 @@ namespace Moses
|
||||
|
||||
std::ostream& operator<<(std::ostream &out, const PhraseProperty &obj)
|
||||
{
|
||||
out << "Base phrase property";
|
||||
obj.Print(out);
|
||||
return out;
|
||||
}
|
||||
|
||||
void PhraseProperty::Print(std::ostream &out) const
|
||||
{
|
||||
out << "Base phrase property";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -28,6 +28,8 @@ public:
|
||||
|
||||
protected:
|
||||
|
||||
virtual void Print(std::ostream& out) const;
|
||||
|
||||
std::string *m_value;
|
||||
|
||||
};
|
||||
|
63
moses/PP/TargetConstituentBoundariesLeftPhraseProperty.cpp
Normal file
63
moses/PP/TargetConstituentBoundariesLeftPhraseProperty.cpp
Normal file
@ -0,0 +1,63 @@
|
||||
#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h"
|
||||
#include "moses/FactorCollection.h"
|
||||
#include "moses/Util.h"
|
||||
#include <iostream>
|
||||
#include <queue>
|
||||
#include <ostream>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
void TargetConstituentBoundariesLeftPhraseProperty::ProcessValue(const std::string &value)
|
||||
{
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
std::vector<std::string> tokens;
|
||||
Tokenize(tokens, value, " ");
|
||||
std::vector<std::string>::const_iterator tokenIter = tokens.begin();
|
||||
while (tokenIter != tokens.end()) {
|
||||
try {
|
||||
|
||||
std::vector<std::string> constituents;
|
||||
Tokenize(constituents, *tokenIter, "<");
|
||||
++tokenIter;
|
||||
float count = std::atof( tokenIter->c_str() );
|
||||
++tokenIter;
|
||||
|
||||
std::set<const Factor* > dedup;
|
||||
|
||||
for ( std::vector<std::string>::iterator constituentIter = constituents.begin();
|
||||
constituentIter != constituents.end(); ++constituentIter ) {
|
||||
|
||||
const Factor* constituentFactor = factorCollection.AddFactor(*constituentIter,false);
|
||||
|
||||
std::pair< std::set<const Factor* >::iterator, bool > dedupIns =
|
||||
dedup.insert(constituentFactor);
|
||||
if ( dedupIns.second ) {
|
||||
|
||||
std::pair< TargetConstituentBoundariesLeftCollection::iterator, bool > inserted =
|
||||
m_constituentsCollection.insert(std::make_pair(constituentFactor,count));
|
||||
if ( !inserted.second ) {
|
||||
(inserted.first)->second += count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} catch (const std::exception &e) {
|
||||
UTIL_THROW2("TargetConstituentBoundariesLeftPhraseProperty: Read error. Flawed property? " << value);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TargetConstituentBoundariesLeftPhraseProperty::Print(std::ostream& out) const
|
||||
{
|
||||
for ( TargetConstituentBoundariesLeftCollection::const_iterator it = m_constituentsCollection.begin();
|
||||
it != m_constituentsCollection.end(); ++it ) {
|
||||
if ( it != m_constituentsCollection.begin() ) {
|
||||
out << " ";
|
||||
}
|
||||
out << *(it->first) << " " << it->second;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Moses
|
||||
|
40
moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h
Normal file
40
moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h
Normal file
@ -0,0 +1,40 @@
|
||||
#pragma once
|
||||
|
||||
#include "moses/PP/PhraseProperty.h"
|
||||
#include "moses/Factor.h"
|
||||
#include "util/exception.hh"
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
typedef std::map<const Factor*, float> TargetConstituentBoundariesLeftCollection;
|
||||
|
||||
|
||||
class TargetConstituentBoundariesLeftPhraseProperty : public PhraseProperty
|
||||
{
|
||||
public:
|
||||
TargetConstituentBoundariesLeftPhraseProperty()
|
||||
{};
|
||||
|
||||
virtual void ProcessValue(const std::string &value);
|
||||
|
||||
const TargetConstituentBoundariesLeftCollection &GetCollection() const {
|
||||
return m_constituentsCollection;
|
||||
};
|
||||
|
||||
virtual const std::string *GetValueString() const {
|
||||
UTIL_THROW2("TargetConstituentBoundariesLeftPhraseProperty: value string not available in this phrase property");
|
||||
return NULL;
|
||||
};
|
||||
|
||||
protected:
|
||||
|
||||
virtual void Print(std::ostream& out) const;
|
||||
|
||||
TargetConstituentBoundariesLeftCollection m_constituentsCollection;
|
||||
};
|
||||
|
||||
} // namespace Moses
|
||||
|
@ -0,0 +1,63 @@
|
||||
#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h"
|
||||
#include "moses/FactorCollection.h"
|
||||
#include "moses/Util.h"
|
||||
#include <iostream>
|
||||
#include <queue>
|
||||
#include <ostream>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
void TargetConstituentBoundariesRightAdjacentPhraseProperty::ProcessValue(const std::string &value)
|
||||
{
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
std::vector<std::string> tokens;
|
||||
Tokenize(tokens, value, " ");
|
||||
std::vector<std::string>::const_iterator tokenIter = tokens.begin();
|
||||
while (tokenIter != tokens.end()) {
|
||||
try {
|
||||
|
||||
std::vector<std::string> constituents;
|
||||
Tokenize(constituents, *tokenIter, "<");
|
||||
++tokenIter;
|
||||
float count = std::atof( tokenIter->c_str() );
|
||||
++tokenIter;
|
||||
|
||||
std::set<const Factor* > dedup;
|
||||
|
||||
for ( std::vector<std::string>::iterator constituentIter = constituents.begin();
|
||||
constituentIter != constituents.end(); ++constituentIter ) {
|
||||
|
||||
const Factor* constituentFactor = factorCollection.AddFactor(*constituentIter,false);
|
||||
|
||||
std::pair< std::set<const Factor* >::iterator, bool > dedupIns =
|
||||
dedup.insert(constituentFactor);
|
||||
if ( dedupIns.second ) {
|
||||
|
||||
std::pair< TargetConstituentBoundariesRightAdjacentCollection::iterator, bool > inserted =
|
||||
m_constituentsCollection.insert(std::make_pair(constituentFactor,count));
|
||||
if ( !inserted.second ) {
|
||||
(inserted.first)->second += count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} catch (const std::exception &e) {
|
||||
UTIL_THROW2("TargetConstituentBoundariesRightAdjacentPhraseProperty: Read error. Flawed property? " << value);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TargetConstituentBoundariesRightAdjacentPhraseProperty::Print(std::ostream& out) const
|
||||
{
|
||||
for ( TargetConstituentBoundariesRightAdjacentCollection::const_iterator it = m_constituentsCollection.begin();
|
||||
it != m_constituentsCollection.end(); ++it ) {
|
||||
if ( it != m_constituentsCollection.begin() ) {
|
||||
out << " ";
|
||||
}
|
||||
out << *(it->first) << " " << it->second;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Moses
|
||||
|
@ -0,0 +1,40 @@
|
||||
#pragma once
|
||||
|
||||
#include "moses/PP/PhraseProperty.h"
|
||||
#include "moses/Factor.h"
|
||||
#include "util/exception.hh"
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
typedef std::map<const Factor*, float> TargetConstituentBoundariesRightAdjacentCollection;
|
||||
|
||||
|
||||
class TargetConstituentBoundariesRightAdjacentPhraseProperty : public PhraseProperty
|
||||
{
|
||||
public:
|
||||
TargetConstituentBoundariesRightAdjacentPhraseProperty()
|
||||
{};
|
||||
|
||||
virtual void ProcessValue(const std::string &value);
|
||||
|
||||
const TargetConstituentBoundariesRightAdjacentCollection &GetCollection() const {
|
||||
return m_constituentsCollection;
|
||||
};
|
||||
|
||||
virtual const std::string *GetValueString() const {
|
||||
UTIL_THROW2("TargetConstituentBoundariesRightAdjacentPhraseProperty: value string not available in this phrase property");
|
||||
return NULL;
|
||||
};
|
||||
|
||||
protected:
|
||||
|
||||
virtual void Print(std::ostream& out) const;
|
||||
|
||||
TargetConstituentBoundariesRightAdjacentCollection m_constituentsCollection;
|
||||
};
|
||||
|
||||
} // namespace Moses
|
||||
|
@ -311,12 +311,14 @@ std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &ke
|
||||
std::ostringstream oss;
|
||||
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
||||
iter!=allPropertyValues->end(); ++iter) {
|
||||
if (iter!=allPropertyValues->begin()) {
|
||||
if (!(iter->first).empty()) {
|
||||
if (iter!=allPropertyValues->begin()) {
|
||||
oss << " ";
|
||||
}
|
||||
oss << iter->first;
|
||||
oss << " ";
|
||||
oss << iter->second;
|
||||
}
|
||||
oss << iter->first;
|
||||
oss << " ";
|
||||
oss << iter->second;
|
||||
}
|
||||
|
||||
std::string allPropertyValuesString(oss.str());
|
||||
|
@ -50,6 +50,8 @@ private:
|
||||
bool onlyOutputSpanInfo;
|
||||
bool gzOutput;
|
||||
std::string instanceWeightsFile; //weights for each sentence
|
||||
bool targetConstituentConstrainedFlag;
|
||||
bool targetConstituentBoundariesFlag;
|
||||
bool flexScoreFlag;
|
||||
bool singleWordHeuristicFlag;
|
||||
|
||||
@ -73,6 +75,8 @@ public:
|
||||
includeSentenceIdFlag(false),
|
||||
onlyOutputSpanInfo(false),
|
||||
gzOutput(false),
|
||||
targetConstituentConstrainedFlag(false),
|
||||
targetConstituentBoundariesFlag(false),
|
||||
flexScoreFlag(false),
|
||||
singleWordHeuristicFlag(false),
|
||||
debug(false) {
|
||||
@ -118,6 +122,12 @@ public:
|
||||
void initInstanceWeightsFile(const char* initInstanceWeightsFile) {
|
||||
instanceWeightsFile = std::string(initInstanceWeightsFile);
|
||||
}
|
||||
void initTargetConstituentConstrainedFlag(const bool initTargetConstituentConstrainedFlag) {
|
||||
targetConstituentConstrainedFlag = initTargetConstituentConstrainedFlag;
|
||||
}
|
||||
void initTargetConstituentBoundariesFlag(const bool initTargetConstituentBoundariesFlag) {
|
||||
targetConstituentBoundariesFlag = initTargetConstituentBoundariesFlag;
|
||||
}
|
||||
void initFlexScoreFlag(const bool initflexScoreFlag) {
|
||||
flexScoreFlag=initflexScoreFlag;
|
||||
}
|
||||
@ -165,6 +175,12 @@ public:
|
||||
std::string getInstanceWeightsFile() const {
|
||||
return instanceWeightsFile;
|
||||
}
|
||||
bool isTargetConstituentConstrainedFlag() const {
|
||||
return targetConstituentConstrainedFlag;
|
||||
}
|
||||
bool isTargetConstituentBoundariesFlag() const {
|
||||
return targetConstituentBoundariesFlag;
|
||||
}
|
||||
bool isFlexScoreFlag() const {
|
||||
return flexScoreFlag;
|
||||
}
|
||||
|
@ -47,6 +47,8 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
|
||||
SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos);
|
||||
m_nodes.push_back( newNode );
|
||||
m_index[ startPos ][ endPos ].push_back( newNode );
|
||||
m_endPositionsIndex[ endPos ].push_back( newNode );
|
||||
m_startPositionsIndex[ startPos ].push_back( newNode ); // TODO: may not need this: access m_index by startPos and iterate over its InnerNodeIndex (= end positions)?
|
||||
m_numWords = std::max(endPos+1, m_numWords);
|
||||
return newNode;
|
||||
}
|
||||
@ -70,6 +72,36 @@ const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes(
|
||||
return endIndex->second;
|
||||
}
|
||||
|
||||
bool SyntaxNodeCollection::HasNodeStartingAtPosition( int startPos ) const
|
||||
{
|
||||
return GetNodesByStartPosition(startPos).size() > 0;
|
||||
}
|
||||
|
||||
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByStartPosition(
|
||||
int startPos ) const
|
||||
{
|
||||
InnerNodeIndex::const_iterator startIndex = m_startPositionsIndex.find( startPos );
|
||||
if (startIndex == m_startPositionsIndex.end() )
|
||||
return m_emptyNode;
|
||||
|
||||
return startIndex->second;
|
||||
}
|
||||
|
||||
bool SyntaxNodeCollection::HasNodeEndingAtPosition( int endPos ) const
|
||||
{
|
||||
return GetNodesByEndPosition(endPos).size() > 0;
|
||||
}
|
||||
|
||||
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByEndPosition(
|
||||
int endPos ) const
|
||||
{
|
||||
InnerNodeIndex::const_iterator endIndex = m_endPositionsIndex.find( endPos );
|
||||
if (endIndex == m_endPositionsIndex.end() )
|
||||
return m_emptyNode;
|
||||
|
||||
return endIndex->second;
|
||||
}
|
||||
|
||||
std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
|
||||
{
|
||||
std::map<SyntaxNode *, SyntaxTree *> nodeToTree;
|
||||
|
@ -50,6 +50,11 @@ public:
|
||||
//! Lookup the SyntaxNodes for a given span.
|
||||
const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
|
||||
|
||||
bool HasNodeStartingAtPosition( int startPos ) const;
|
||||
const std::vector< SyntaxNode* >& GetNodesByStartPosition( int startPos ) const;
|
||||
bool HasNodeEndingAtPosition( int endPos ) const;
|
||||
const std::vector< SyntaxNode* >& GetNodesByEndPosition( int endPos ) const;
|
||||
|
||||
//! Get a vector of pointers to all SyntaxNodes (unordered).
|
||||
const std::vector< SyntaxNode* >& GetAllNodes() {
|
||||
return m_nodes;
|
||||
@ -78,6 +83,9 @@ private:
|
||||
NodeIndex m_index;
|
||||
int m_numWords;
|
||||
std::vector< SyntaxNode* > m_emptyNode;
|
||||
|
||||
InnerNodeIndex m_endPositionsIndex;
|
||||
InnerNodeIndex m_startPositionsIndex;
|
||||
};
|
||||
|
||||
} // namespace MosesTraining
|
||||
|
@ -1,11 +1,3 @@
|
||||
/*
|
||||
* extract.cpp
|
||||
* Modified by: Rohit Gupta CDAC, Mumbai, India
|
||||
* on July 15, 2012 to implement parallel processing
|
||||
* Modified by: Nadi Tomeh - LIMSI/CNRS
|
||||
* Machine Translation Marathon 2010, Dublin
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
@ -20,11 +12,12 @@
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
|
||||
#include "SentenceAlignment.h"
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "OutputFileStream.h"
|
||||
#include "PhraseExtractionOptions.h"
|
||||
#include "SentenceAlignmentWithSyntax.h"
|
||||
#include "SyntaxNode.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace MosesTraining;
|
||||
@ -46,14 +39,14 @@ typedef vector < HPhrase > HPhraseVector;
|
||||
// The key of the map is the English index and the value is a set of the source ones
|
||||
typedef map <int, set<int> > HSentenceVertices;
|
||||
|
||||
REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
|
||||
REO_POS getOrientWordModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
|
||||
int, int, int, int, int, int, int,
|
||||
bool (*)(int, int), bool (*)(int, int));
|
||||
REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
|
||||
REO_POS getOrientPhraseModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
|
||||
int, int, int, int, int, int, int,
|
||||
bool (*)(int, int), bool (*)(int, int),
|
||||
const HSentenceVertices &, const HSentenceVertices &);
|
||||
REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
|
||||
REO_POS getOrientHierModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
|
||||
int, int, int, int, int, int, int,
|
||||
bool (*)(int, int), bool (*)(int, int),
|
||||
const HSentenceVertices &, const HSentenceVertices &,
|
||||
@ -69,7 +62,7 @@ bool ge(int, int);
|
||||
bool le(int, int);
|
||||
bool lt(int, int);
|
||||
|
||||
bool isAligned (SentenceAlignment &, int, int);
|
||||
bool isAligned (SentenceAlignmentWithSyntax &, int, int);
|
||||
|
||||
int sentenceOffset = 0;
|
||||
|
||||
@ -87,7 +80,7 @@ class ExtractTask
|
||||
{
|
||||
public:
|
||||
ExtractTask(
|
||||
size_t id, SentenceAlignment &sentence,
|
||||
size_t id, SentenceAlignmentWithSyntax &sentence,
|
||||
PhraseExtractionOptions &initoptions,
|
||||
Moses::OutputFileStream &extractFile,
|
||||
Moses::OutputFileStream &extractFileInv,
|
||||
@ -109,14 +102,17 @@ private:
|
||||
vector< string > m_extractedPhrasesSid;
|
||||
vector< string > m_extractedPhrasesContext;
|
||||
vector< string > m_extractedPhrasesContextInv;
|
||||
void extractBase(SentenceAlignment &);
|
||||
void extract(SentenceAlignment &);
|
||||
void addPhrase(SentenceAlignment &, int, int, int, int, string &);
|
||||
void extractBase(SentenceAlignmentWithSyntax &);
|
||||
void extract(SentenceAlignmentWithSyntax &);
|
||||
void addPhrase(const SentenceAlignmentWithSyntax &, int, int, int, int, const std::string &, const std::string &);
|
||||
void writePhrasesToFile();
|
||||
bool checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF);
|
||||
bool checkPlaceholders (const SentenceAlignmentWithSyntax &sentence, int startE, int endE, int startF, int endF);
|
||||
bool isPlaceholder(const string &word);
|
||||
bool checkTargetConstituentBoundaries(const SentenceAlignmentWithSyntax &sentence,
|
||||
int startE, int endE, int startF, int endF,
|
||||
std::string &phrasePropertiesString);
|
||||
|
||||
SentenceAlignment &m_sentence;
|
||||
SentenceAlignmentWithSyntax &m_sentence;
|
||||
const PhraseExtractionOptions &m_options;
|
||||
Moses::OutputFileStream &m_extractFile;
|
||||
Moses::OutputFileStream &m_extractFileInv;
|
||||
@ -133,7 +129,8 @@ int main(int argc, char* argv[])
|
||||
|
||||
if (argc < 6) {
|
||||
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
|
||||
cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n";
|
||||
cerr << "| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ";
|
||||
cerr << "| --TargetConstituentConstrained | --TargetConstituentBoundaries ]" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@ -153,6 +150,10 @@ int main(int argc, char* argv[])
|
||||
options.initOnlyOutputSpanInfo(true);
|
||||
} else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
|
||||
options.initOrientationFlag(true);
|
||||
} else if (strcmp(argv[i],"--TargetConstituentConstrained") == 0) {
|
||||
options.initTargetConstituentConstrainedFlag(true);
|
||||
} else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) {
|
||||
options.initTargetConstituentBoundariesFlag(true);
|
||||
} else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
|
||||
options.initFlexScoreFlag(true);
|
||||
} else if (strcmp(argv[i],"--SingleWordHeuristic") == 0) {
|
||||
@ -280,6 +281,11 @@ int main(int argc, char* argv[])
|
||||
extractFileContextInv.Open(fileNameExtractContextInv.c_str());
|
||||
}
|
||||
|
||||
// stats on labels for glue grammar and unknown word label probabilities
|
||||
set< string > targetLabelCollection, sourceLabelCollection;
|
||||
map< string, int > targetTopLabelCollection, sourceTopLabelCollection;
|
||||
const bool targetSyntax = true;
|
||||
|
||||
int i = sentenceOffset;
|
||||
|
||||
string englishString, foreignString, alignmentString, weightString;
|
||||
@ -295,7 +301,10 @@ int main(int argc, char* argv[])
|
||||
getline(*iwFileP, weightString);
|
||||
}
|
||||
|
||||
SentenceAlignment sentence;
|
||||
SentenceAlignmentWithSyntax sentence
|
||||
(targetLabelCollection, sourceLabelCollection,
|
||||
targetTopLabelCollection, sourceTopLabelCollection,
|
||||
targetSyntax, false);
|
||||
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
|
||||
//az: output src, tgt, and alignment line
|
||||
if (options.isOnlyOutputSpanInfo()) {
|
||||
@ -360,7 +369,7 @@ void ExtractTask::Run()
|
||||
|
||||
}
|
||||
|
||||
void ExtractTask::extract(SentenceAlignment &sentence)
|
||||
void ExtractTask::extract(SentenceAlignmentWithSyntax &sentence)
|
||||
{
|
||||
int countE = sentence.target.size();
|
||||
int countF = sentence.source.size();
|
||||
@ -454,7 +463,15 @@ void ExtractTask::extract(SentenceAlignment &sentence)
|
||||
// if(m_options.isAllModelsOutputFlag())
|
||||
// " | | ";
|
||||
}
|
||||
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
|
||||
std::string phrasePropertiesString;
|
||||
bool doAdd = !m_options.isTargetConstituentBoundariesFlag();
|
||||
if (m_options.isTargetConstituentBoundariesFlag() || m_options.isTargetConstituentConstrainedFlag()) {
|
||||
bool isTargetConstituentCovered = checkTargetConstituentBoundaries(sentence, startE, endE, startF, endF, phrasePropertiesString);
|
||||
doAdd = doAdd || isTargetConstituentCovered;
|
||||
}
|
||||
if (doAdd) {
|
||||
addPhrase(sentence, startE, endE, startF, endF, orientationInfo, phrasePropertiesString);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -510,12 +527,20 @@ void ExtractTask::extract(SentenceAlignment &sentence)
|
||||
((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
|
||||
((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
|
||||
|
||||
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
|
||||
std::string phrasePropertiesString;
|
||||
bool doAdd = !m_options.isTargetConstituentBoundariesFlag();
|
||||
if (m_options.isTargetConstituentBoundariesFlag() || m_options.isTargetConstituentConstrainedFlag()) {
|
||||
bool isTargetConstituentCovered = checkTargetConstituentBoundaries(sentence, startE, endE, startF, endF, phrasePropertiesString);
|
||||
doAdd = doAdd || isTargetConstituentCovered;
|
||||
}
|
||||
if (doAdd) {
|
||||
addPhrase(sentence, startE, endE, startF, endF, orientationInfo, phrasePropertiesString);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
|
||||
REO_POS getOrientWordModel(SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
|
||||
bool connectedLeftTop, bool connectedRightTop,
|
||||
int startF, int endF, int startE, int endE, int countF, int zero, int unit,
|
||||
bool (*ge)(int, int), bool (*lt)(int, int) )
|
||||
@ -541,7 +566,7 @@ REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelTyp
|
||||
}
|
||||
|
||||
// to be called with countF-1 instead of countF
|
||||
REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
|
||||
REO_POS getOrientPhraseModel (SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
|
||||
bool connectedLeftTop, bool connectedRightTop,
|
||||
int startF, int endF, int startE, int endE, int countF, int zero, int unit,
|
||||
bool (*ge)(int, int), bool (*lt)(int, int),
|
||||
@ -577,7 +602,7 @@ REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE model
|
||||
}
|
||||
|
||||
// to be called with countF-1 instead of countF
|
||||
REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
|
||||
REO_POS getOrientHierModel (SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
|
||||
bool connectedLeftTop, bool connectedRightTop,
|
||||
int startF, int endF, int startE, int endE, int countF, int zero, int unit,
|
||||
bool (*ge)(int, int), bool (*lt)(int, int),
|
||||
@ -629,7 +654,7 @@ REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelTy
|
||||
return UNKNOWN;
|
||||
}
|
||||
|
||||
bool isAligned ( SentenceAlignment &sentence, int fi, int ei )
|
||||
bool isAligned ( SentenceAlignmentWithSyntax &sentence, int fi, int ei )
|
||||
{
|
||||
if (ei == -1 && fi == -1)
|
||||
return true;
|
||||
@ -715,8 +740,138 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
|
||||
void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
|
||||
bool ExtractTask::checkTargetConstituentBoundaries( const SentenceAlignmentWithSyntax &sentence,
|
||||
int startE, int endE, int startF, int endF,
|
||||
std::string &phrasePropertiesString)
|
||||
{
|
||||
ostringstream outextractstrPhrasePropertyTargetConstituentBoundariesLeft;
|
||||
|
||||
if (m_options.isTargetConstituentBoundariesFlag()) {
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "{{TargetConstituentBoundariesLeft ";
|
||||
}
|
||||
|
||||
bool validTargetConstituentBoundaries = false;
|
||||
bool outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = true;
|
||||
|
||||
if (m_options.isTargetConstituentBoundariesFlag()) {
|
||||
if (startE==0) {
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "BOS_";
|
||||
}
|
||||
}
|
||||
|
||||
if (!sentence.targetTree.HasNodeStartingAtPosition(startE)) {
|
||||
|
||||
validTargetConstituentBoundaries = false;
|
||||
|
||||
} else {
|
||||
|
||||
const std::vector< SyntaxNode* >& startingNodes = sentence.targetTree.GetNodesByStartPosition(startE);
|
||||
for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = startingNodes.rbegin(); iter != startingNodes.rend(); ++iter ) {
|
||||
if ( (*iter)->end == endE ) {
|
||||
validTargetConstituentBoundaries = true;
|
||||
if (!m_options.isTargetConstituentBoundariesFlag()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (m_options.isTargetConstituentBoundariesFlag()) {
|
||||
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
|
||||
} else {
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "<";
|
||||
}
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesLeft << (*iter)->label;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (m_options.isTargetConstituentBoundariesFlag()) {
|
||||
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "<";
|
||||
}
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesLeft << "}}";
|
||||
}
|
||||
|
||||
|
||||
if (m_options.isTargetConstituentConstrainedFlag() && !validTargetConstituentBoundaries) {
|
||||
// skip over all boundary punctuation and check again
|
||||
bool relaxedValidTargetConstituentBoundaries = false;
|
||||
int relaxedStartE = startE;
|
||||
int relaxedEndE = endE;
|
||||
const std::string punctuation = ",;.:!?";
|
||||
while ( (relaxedStartE < endE) &&
|
||||
(sentence.target[relaxedStartE].size() == 1) &&
|
||||
(punctuation.find(sentence.target[relaxedStartE].at(0)) != std::string::npos) ) {
|
||||
++relaxedStartE;
|
||||
}
|
||||
while ( (relaxedEndE > relaxedStartE) &&
|
||||
(sentence.target[relaxedEndE].size() == 1) &&
|
||||
(punctuation.find(sentence.target[relaxedEndE].at(0)) != std::string::npos) ) {
|
||||
--relaxedEndE;
|
||||
}
|
||||
|
||||
if ( (relaxedStartE != startE) || (relaxedEndE !=endE) ) {
|
||||
const std::vector< SyntaxNode* >& startingNodes = sentence.targetTree.GetNodesByStartPosition(relaxedStartE);
|
||||
for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = startingNodes.rbegin();
|
||||
(iter != startingNodes.rend() && !relaxedValidTargetConstituentBoundaries);
|
||||
++iter ) {
|
||||
if ( (*iter)->end == relaxedEndE ) {
|
||||
relaxedValidTargetConstituentBoundaries = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!relaxedValidTargetConstituentBoundaries) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (m_options.isTargetConstituentBoundariesFlag()) {
|
||||
|
||||
ostringstream outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent;
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "{{TargetConstituentBoundariesRightAdjacent ";
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = true;
|
||||
|
||||
if (endE==sentence.target.size()-1) {
|
||||
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "EOS_";
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
|
||||
|
||||
} else {
|
||||
|
||||
const std::vector< SyntaxNode* >& adjacentNodes = sentence.targetTree.GetNodesByStartPosition(endE+1);
|
||||
for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = adjacentNodes.rbegin(); iter != adjacentNodes.rend(); ++iter ) {
|
||||
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
|
||||
} else {
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "<";
|
||||
}
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << (*iter)->label;
|
||||
}
|
||||
}
|
||||
|
||||
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "<";
|
||||
}
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent << "}}";
|
||||
|
||||
phrasePropertiesString += " ";
|
||||
phrasePropertiesString += outextractstrPhrasePropertyTargetConstituentBoundariesLeft.str();
|
||||
phrasePropertiesString += " ";
|
||||
phrasePropertiesString += outextractstrPhrasePropertyTargetConstituentBoundariesRightAdjacent.str();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void ExtractTask::addPhrase( const SentenceAlignmentWithSyntax &sentence,
|
||||
int startE, int endE, int startF, int endF,
|
||||
const std::string &orientationInfo,
|
||||
const std::string &phrasePropertiesString)
|
||||
{
|
||||
// source
|
||||
// // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
|
||||
@ -746,11 +901,18 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
|
||||
if (m_options.isTranslationFlag()) outextractstr << "||| ";
|
||||
if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
|
||||
|
||||
|
||||
// target
|
||||
for(int ei=startE; ei<=endE; ei++) {
|
||||
if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
|
||||
if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " ";
|
||||
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " ";
|
||||
|
||||
if (m_options.isTranslationFlag()) {
|
||||
outextractstr << sentence.target[ei] << " ";
|
||||
outextractstrInv << sentence.target[ei] << " ";
|
||||
}
|
||||
|
||||
if (m_options.isOrientationFlag()) {
|
||||
outextractstrOrientation << sentence.target[ei] << " ";
|
||||
}
|
||||
}
|
||||
if (m_options.isTranslationFlag()) outextractstr << "|||";
|
||||
if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
|
||||
@ -792,7 +954,7 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
outextractstr << phrasePropertiesString;
|
||||
|
||||
// generate two lines for every extracted phrase:
|
||||
// once with left, once with right context
|
||||
@ -901,7 +1063,7 @@ void ExtractTask::writePhrasesToFile()
|
||||
|
||||
// if proper conditioning, we need the number of times a source phrase occurred
|
||||
|
||||
void ExtractTask::extractBase( SentenceAlignment &sentence )
|
||||
void ExtractTask::extractBase( SentenceAlignmentWithSyntax &sentence )
|
||||
{
|
||||
ostringstream outextractFile;
|
||||
ostringstream outextractFileInv;
|
||||
@ -935,7 +1097,7 @@ void ExtractTask::extractBase( SentenceAlignment &sentence )
|
||||
}
|
||||
|
||||
|
||||
bool ExtractTask::checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF)
|
||||
bool ExtractTask::checkPlaceholders (const SentenceAlignmentWithSyntax &sentence, int startE, int endE, int startF, int endF)
|
||||
{
|
||||
for (size_t pos = startF; pos <= endF; ++pos) {
|
||||
const string &sourceWord = sentence.source[pos];
|
||||
|
@ -68,6 +68,7 @@ bool spanLength = false;
|
||||
bool ruleLength = false;
|
||||
bool nonTermContext = false;
|
||||
bool nonTermContextTarget = false;
|
||||
bool targetConstituentBoundariesFlag = false;
|
||||
|
||||
int countOfCounts[COC_MAX+1];
|
||||
int totalDistinct = 0;
|
||||
@ -286,6 +287,9 @@ int main(int argc, char* argv[])
|
||||
} else if (strcmp(argv[i],"--NonTermContextTarget") == 0) {
|
||||
nonTermContextTarget = true;
|
||||
std::cerr << "non-term context (target)" << std::endl;
|
||||
} else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) {
|
||||
targetConstituentBoundariesFlag = true;
|
||||
std::cerr << "including target constituent boundaries information" << std::endl;
|
||||
} else {
|
||||
featureArgs.push_back(argv[i]);
|
||||
++i;
|
||||
@ -957,6 +961,18 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
}
|
||||
}
|
||||
|
||||
// target constituent boundaries
|
||||
if (targetConstituentBoundariesFlag && !inverseFlag) {
|
||||
const std::string targetConstituentBoundariesLeftValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesLeft");
|
||||
if (!targetConstituentBoundariesLeftValues.empty()) {
|
||||
phraseTableFile << " {{TargetConstituentBoundariesLeft " << targetConstituentBoundariesLeftValues << "}}";
|
||||
}
|
||||
const std::string targetConstituentBoundariesRightAdjacentValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesRightAdjacent");
|
||||
if (!targetConstituentBoundariesRightAdjacentValues.empty()) {
|
||||
phraseTableFile << " {{TargetConstituentBoundariesRightAdjacent " << targetConstituentBoundariesRightAdjacentValues << "}}";
|
||||
}
|
||||
}
|
||||
|
||||
phraseTableFile << std::endl;
|
||||
}
|
||||
|
||||
|
@ -2407,6 +2407,12 @@ sub define_training_extract_phrases {
|
||||
if (&get("TRAINING:ghkm-strip-bitpar-nonterminal-labels")) {
|
||||
$cmd .= "-ghkm-strip-bitpar-nonterminal-labels ";
|
||||
}
|
||||
|
||||
} else { # !hierarchical-rule-set
|
||||
|
||||
if (&get("TRAINING:target-constituent-boundaries")) {
|
||||
$cmd .= "-target-constituent-boundaries ";
|
||||
}
|
||||
}
|
||||
|
||||
my $extract_settings = &get("TRAINING:extract-settings");
|
||||
@ -2464,6 +2470,12 @@ sub define_training_build_ttable {
|
||||
my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model",""));
|
||||
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
|
||||
}
|
||||
|
||||
} else { # !hierarchical-rule-set
|
||||
|
||||
if (&get("TRAINING:target-constituent-boundaries")) {
|
||||
$cmd .= "-target-constituent-boundaries ";
|
||||
}
|
||||
}
|
||||
|
||||
&create_step($step_id,$cmd);
|
||||
@ -2678,6 +2690,10 @@ sub define_training_create_config {
|
||||
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
|
||||
}
|
||||
|
||||
if (&get("TRAINING:target-constituent-boundaries")) {
|
||||
$cmd .= "-target-constituent-boundaries ";
|
||||
}
|
||||
|
||||
# sparse lexical features provide additional content for config file
|
||||
my @additional_ini_files;
|
||||
push (@additional_ini_files, "$sparse_lexical_features.ini") if $sparse_lexical_features;
|
||||
|
@ -134,6 +134,7 @@ my($_EXTERNAL_BINDIR,
|
||||
$_LMODEL_OOV_FEATURE,
|
||||
$_NUM_LATTICE_FEATURES,
|
||||
$IGNORE,
|
||||
$_TARGET_CONSTITUENT_BOUNDARIES,
|
||||
$_FLEXIBILITY_SCORE,
|
||||
$_FEATURE_LINES,
|
||||
$_WEIGHT_LINES,
|
||||
@ -258,6 +259,7 @@ $_HELP = 1
|
||||
'instance-weights-file=s' => \$_INSTANCE_WEIGHTS_FILE,
|
||||
'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE,
|
||||
'num-lattice-features=i' => \$_NUM_LATTICE_FEATURES,
|
||||
'target-constituent-boundaries' => \$_TARGET_CONSTITUENT_BOUNDARIES,
|
||||
'flexibility-score' => \$_FLEXIBILITY_SCORE,
|
||||
'config-add-feature-lines=s' => \$_FEATURE_LINES,
|
||||
'config-add-weight-lines=s' => \$_WEIGHT_LINES,
|
||||
@ -1607,6 +1609,7 @@ sub extract_phrase {
|
||||
$cmd .= " --GZOutput ";
|
||||
$cmd .= " --InstanceWeights $_INSTANCE_WEIGHTS_FILE " if defined $_INSTANCE_WEIGHTS_FILE;
|
||||
$cmd .= " --BaselineExtract $_BASELINE_EXTRACT" if defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT =~ /extract-parallel.perl/;
|
||||
$cmd .= " --TargetConstituentBoundaries" if $_TARGET_CONSTITUENT_BOUNDARIES;
|
||||
$cmd .= " --FlexibilityScore" if $_FLEXIBILITY_SCORE;
|
||||
$cmd .= " --NoTTable" if $_MMSAPT;
|
||||
|
||||
@ -1764,9 +1767,10 @@ sub score_phrase_phrase_extract {
|
||||
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||
$cmd .= " --TargetSyntacticPreferences $_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE" if $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE);
|
||||
$cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE);
|
||||
$cmd .= " --TargetConstituentBoundaries" if $_TARGET_CONSTITUENT_BOUNDARIES;
|
||||
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
|
||||
$cmd .= " $DOMAIN" if $DOMAIN;
|
||||
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
|
||||
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
|
||||
|
||||
# sorting
|
||||
if ($direction eq "e2f" || $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2) {
|
||||
@ -2386,6 +2390,7 @@ sub create_ini {
|
||||
print INI " unknown-word-labels-file=$_UNKNOWN_WORD_LABEL_FILE" if defined($_UNKNOWN_WORD_LABEL_FILE);
|
||||
print INI "\n";
|
||||
}
|
||||
print INI "TargetConstituentAdjacencyFeature\n" if $_TARGET_CONSTITUENT_BOUNDARIES;
|
||||
print INI $feature_spec;
|
||||
|
||||
print INI "\n# dense weights for feature functions\n";
|
||||
@ -2398,6 +2403,7 @@ sub create_ini {
|
||||
print INI "SoftSourceSyntacticConstraintsFeature0= -0.2 -0.2 -0.2 0.1 0.1 0.1\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||
print INI "PhraseOrientationFeature0= 0.05 0.05 0.05 0.05 0.05 0.05\n" if $_PHRASE_ORIENTATION;
|
||||
print INI "TargetPreferencesFeature0= 0.2 -0.2\n" if $_HIERARCHICAL && $_TARGET_SYNTAX && $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE);
|
||||
print INI "TargetConstituentAdjacencyFeature0= 0.05 -0.1\n" if $_TARGET_CONSTITUENT_BOUNDARIES;
|
||||
print INI $weight_spec;
|
||||
close(INI);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user