diff --git a/moses/src/TargetNgramFeature.cpp b/moses/src/TargetNgramFeature.cpp
index 4cb260f97..d0992d328 100644
--- a/moses/src/TargetNgramFeature.cpp
+++ b/moses/src/TargetNgramFeature.cpp
@@ -194,24 +194,30 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int
// loop over rule
bool makePrefix = false;
bool makeSuffix = false;
- bool beforeSubphrase = true;
- size_t terminalsBeforeSubphrase = 0;
- size_t terminalsAfterSubphrase = 0;
- for (size_t phrasePos = 0, wordPos = 0;
- phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize();
- phrasePos++)
+ bool collectForPrefix = true;
+ size_t prefixTerminals = 0;
+ size_t suffixTerminals = 0;
+ size_t totalTerminals = 0;
+ bool onlyTerminals = true;
+ bool prev_is_NT = false;
+ size_t prev_subPhraseLength = 0;
+ for (size_t phrasePos = 0; phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize(); phrasePos++)
{
// consult rule for either word or non-terminal
const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(phrasePos);
// cerr << "word: " << word << endl;
// regular word
- if (!word.IsNonTerminal())
- {
- if (phrasePos==0)
- makePrefix = true;
+ if (!word.IsNonTerminal()) {
+ prev_is_NT = false;
+ ++totalTerminals;
- contextFactor.push_back(&word);
+ if (phrasePos==0)
+ makePrefix = true;
+ if (phrasePos==cur_hypo.GetCurrTargetPhrase().GetSize()-1 || prev_is_NT)
+ makeSuffix = true;
+
+ contextFactor.push_back(&word);
// beginning of sentence symbol ?
if (word.GetString(GetFactorType(), false).compare("") == 0)
@@ -219,25 +225,30 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int
assert(phrasePos == 0);
delete lmState;
lmState = NewState( GetBeginSentenceState() );
-
- terminalsBeforeSubphrase++;
+ prefixTerminals++;
}
// end of sentence symbol ?
else if (word.GetString(GetFactorType(), false).compare("") == 0) {
- terminalsAfterSubphrase++;
+ suffixTerminals++;
}
// everything else
else {
string curr_ngram = word.GetString(GetFactorType(), false);
-// cerr << "ngram: " << curr_ngram << endl;
+// cerr << "ngram: " << curr_ngram << endl;
accumulator->PlusEquals(this,curr_ngram,1);
- }
+ if (collectForPrefix)
+ prefixTerminals++;
+ else
+ suffixTerminals++;
+ }
}
// non-terminal, add phrase from underlying hypothesis
else if (GetNGramOrder() > 1)
{
+ onlyTerminals = false;
+
// look up underlying hypothesis
size_t nonTermIndex = nonTermIndexMap[phrasePos];
const ChartHypothesis *prevHypo = cur_hypo.GetPrevHypo(nonTermIndex);
@@ -246,45 +257,56 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int
static_cast(prevHypo->GetFFState(featureID));
size_t subPhraseLength = prevState->GetNumTargetTerminals();
- if (subPhraseLength==1) {
- if (beforeSubphrase)
- terminalsBeforeSubphrase++;
- else
- terminalsAfterSubphrase++;
- }
- else {
- beforeSubphrase = false;
- }
// special case: rule starts with non-terminal -> copy everything
if (phrasePos == 0) {
- if (subPhraseLength == 1)
+ if (subPhraseLength == 1) {
makePrefix = true;
+ collectForPrefix = true;
+ prefixTerminals++;
- // get language model state
- delete lmState;
- lmState = NewState( prevState->GetRightContext() );
+ // get language model state
+ delete lmState;
+ lmState = NewState( prevState->GetRightContext() );
- // push suffix
-// cerr << "suffix of NT in the beginning" << endl;
- int suffixPos = prevState->GetSuffix().GetSize() - (GetNGramOrder()-1);
- if (suffixPos < 0) suffixPos = 0; // push all words if less than order
- for(;(size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++)
- {
- const Word &word = prevState->GetSuffix().GetWord(suffixPos);
-// cerr << "NT0 --> : " << word << endl;
- contextFactor.push_back(&word);
- wordPos++;
- }
+ const Word &word = prevState->GetSuffix().GetWord(0);
+// cerr << "NT0 --> : " << word << endl;
+ contextFactor.push_back(&word);
+ }
+ else {
+ // get language model state
+ delete lmState;
+ lmState = NewState( prevState->GetRightContext() );
+
+ // push suffix
+// cerr << "suffix of NT in the beginning" << endl;
+ collectForPrefix = false;
+ int suffixPos = prevState->GetSuffix().GetSize() - (GetNGramOrder()-1);
+ if (suffixPos < 0) suffixPos = 0; // push all words if less than order
+ for(;(size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++)
+ {
+ const Word &word = prevState->GetSuffix().GetWord(suffixPos);
+// cerr << "NT0 --> : " << word << endl;
+ contextFactor.push_back(&word);
+ }
+ }
}
// internal non-terminal
else
{
- if (subPhraseLength == 1 && phrasePos == cur_hypo.GetCurrTargetPhrase().GetSize()-1)
- makeSuffix = true;
+ if (subPhraseLength==1) {
+ if (collectForPrefix)
+ prefixTerminals++;
+ else
+ suffixTerminals++;
-// cerr << "prefix of subphrase for left context" << endl;
+ if (phrasePos == cur_hypo.GetCurrTargetPhrase().GetSize()-1)
+ makeSuffix = true;
+ }
+
+ cerr << "prefix of subphrase for left context" << endl;
+ collectForPrefix = true;
// score its prefix
for(size_t prefixPos = 0;
prefixPos < GetNGramOrder()-1 // up to LM order window
@@ -292,69 +314,108 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int
prefixPos++)
{
const Word &word = prevState->GetPrefix().GetWord(prefixPos);
-// cerr << "NT --> " << word << endl;
+ cerr << "NT --> " << word << endl;
contextFactor.push_back(&word);
}
bool next = false;
- if (phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize() - 1)
+ if (phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize() - 1) // there is something after this phrase
next = true;
// check if we are dealing with a large sub-phrase
- if (next && subPhraseLength > GetNGramOrder() - 1) // TODO: CHECK??
+ if (next && subPhraseLength > GetNGramOrder() - 1)
{
+ // cerr << "large sub phrase" << endl;
// clear up pending ngrams
- MakePrefixNgrams(contextFactor, accumulator, terminalsBeforeSubphrase);
+ MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
contextFactor.clear();
makePrefix = false;
makeSuffix = true;
+ collectForPrefix = false;
+ prefixTerminals = 0;
+ suffixTerminals = 0;
+
// cerr << "suffix of subphrase for right context (only if something is following)" << endl;
+ // copy language model state
+ delete lmState;
+ lmState = NewState( prevState->GetRightContext() );
- // copy language model state
- delete lmState;
- lmState = NewState( prevState->GetRightContext() );
+ // push its suffix
+ size_t remainingWords = subPhraseLength - (GetNGramOrder()-1);
+ if (remainingWords > GetNGramOrder()-1) {
+ // only what is needed for the history window
+ remainingWords = GetNGramOrder()-1;
+ }
+ for(size_t suffixPos = 0; suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) {
+ const Word &word = prevState->GetSuffix().GetWord(suffixPos);
+// cerr << "NT --> : " << word << endl;
+ contextFactor.push_back(&word);
+ }
+ }
+ // subphrase can be used as suffix and as prefix for the next part
+ else if (next && subPhraseLength == GetNGramOrder() - 1)
+ {
+ // clear up pending ngrams
+ MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
+ makePrefix = false;
+ makeSuffix = true;
+ collectForPrefix = false;
+ prefixTerminals = 0;
+ suffixTerminals = 0;
- // push its suffix
- size_t remainingWords = subPhraseLength - (GetNGramOrder()-1);
- if (remainingWords > GetNGramOrder()-1) {
- // only what is needed for the history window
- remainingWords = GetNGramOrder()-1;
- }
- for(size_t suffixPos = 0; suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) {
- const Word &word = prevState->GetSuffix().GetWord(suffixPos);
-// cerr << "NT --> : " << word << endl;
- contextFactor.push_back(&word);
- }
- wordPos += subPhraseLength;
+// cerr << "(reuse) suffix of subphrase for right context (only if something is following)" << endl;
+ // copy language model state
+ delete lmState;
+ lmState = NewState( prevState->GetRightContext() );
+ }
+ else if (prev_is_NT && prev_subPhraseLength > 1 && subPhraseLength > 1) {
+ // two NTs in a row: make transition
+ MakePrefixNgrams(contextFactor, accumulator, 1, GetNGramOrder()-2);
+ MakeSuffixNgrams(contextFactor, accumulator, 1, GetNGramOrder()-2);
+ makePrefix = false;
+ makeSuffix = false;
+ collectForPrefix = false;
+ prefixTerminals = 0;
+ suffixTerminals = 0;
+
+ // remove duplicates
+ string curr_ngram;
+ curr_ngram.append((*contextFactor[GetNGramOrder()-2]).GetString(GetFactorType(), false));
+ curr_ngram.append(":");
+ curr_ngram.append((*contextFactor[GetNGramOrder()-1]).GetString(GetFactorType(), false));
+ accumulator->MinusEquals(this,curr_ngram,1);
}
}
+ prev_is_NT = true;
+ prev_subPhraseLength = subPhraseLength;
}
}
if (GetNGramOrder() > 1) {
- if (makePrefix) {
- size_t terminals = beforeSubphrase? 1 : terminalsBeforeSubphrase;
- MakePrefixNgrams(contextFactor, accumulator, terminals);
- }
- if (makeSuffix) {
- size_t terminals = beforeSubphrase? 1 : terminalsAfterSubphrase;
- MakeSuffixNgrams(contextFactor, accumulator, terminals);
- }
+ if (onlyTerminals) {
+ MakePrefixNgrams(contextFactor, accumulator, totalTerminals-1);
+ }
+ else {
+ if (makePrefix)
+ MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
+ if (makeSuffix)
+ MakeSuffixNgrams(contextFactor, accumulator, suffixTerminals);
- // remove duplicates
- if (makePrefix && makeSuffix && (contextFactor.size() <= GetNGramOrder())) {
- string curr_ngram;
- for (size_t i = 0; i < contextFactor.size(); ++i) {
- curr_ngram.append((*contextFactor[i]).GetString(GetFactorType(), false));
- if (i < contextFactor.size()-1)
- curr_ngram.append(":");
- }
- accumulator->MinusEquals(this,curr_ngram,1);
- }
+ // remove duplicates
+ if (makePrefix && makeSuffix && (contextFactor.size() <= GetNGramOrder())) {
+ string curr_ngram;
+ for (size_t i = 0; i < contextFactor.size(); ++i) {
+ curr_ngram.append((*contextFactor[i]).GetString(GetFactorType(), false));
+ if (i < contextFactor.size()-1)
+ curr_ngram.append(":");
+ }
+ accumulator->MinusEquals(this,curr_ngram,1);
+ }
+ }
}
ret->Set(lmState);
-// cerr << endl;
+ cerr << endl;
return ret;
}
@@ -371,45 +432,48 @@ void TargetNgramFeature::ShiftOrPush(std::vector &contextFactor, co
}
}
-void TargetNgramFeature::MakePrefixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfStartPos) const {
+void TargetNgramFeature::MakePrefixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfStartPos, size_t offset) const {
string curr_ngram;
- size_t size = contextFactor.size();
- for (size_t k = 0; k < numberOfStartPos; ++k) {
- size_t max_length = (size < GetNGramOrder())? size: GetNGramOrder();
- for (size_t end = 1+k; end < max_length+k; ++end) {
- for (size_t i=k; i <= end; ++i) {
- if (i > k)
- curr_ngram.append(":");
- curr_ngram.append((*contextFactor[i]).GetString(GetFactorType(), false));
- }
- if (curr_ngram != "" && curr_ngram != "") {
-// cerr << "p-ngram: " << curr_ngram << endl;
- accumulator->PlusEquals(this,curr_ngram,1);
- }
- curr_ngram.clear();
- }
- }
+ size_t size = contextFactor.size();
+ for (size_t k = 0; k < numberOfStartPos; ++k) {
+ size_t max_end = (size < GetNGramOrder()+k+offset)? size: GetNGramOrder()+k+offset;
+ for (size_t end_pos = 1+k+offset; end_pos < max_end; ++end_pos) {
+// cerr << "start: " << k+offset << endl;
+// cerr << "end: " << end_pos << endl;
+ for (size_t i=k+offset; i <= end_pos; ++i) {
+ if (i > k+offset)
+ curr_ngram.append(":");
+ curr_ngram.append((*contextFactor[i]).GetString(GetFactorType(), false));
+ }
+ if (curr_ngram != "" && curr_ngram != "") {
+// cerr << "p-ngram: " << curr_ngram << endl;
+ accumulator->PlusEquals(this,curr_ngram,1);
+ }
+ curr_ngram.clear();
+ }
+ }
}
-void TargetNgramFeature::MakeSuffixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfEndPos) const {
- string curr_ngram;
- size_t size = contextFactor.size();
- for (size_t k = 0; k < numberOfEndPos; ++k) {
- size_t min_start = (size > GetNGramOrder())? (size - GetNGramOrder()): 0;
- size_t end = size-1;
- for (size_t start=min_start-k; start < end-k; ++start) {
- for (size_t j=start; j < size-k; ++j){
- curr_ngram.append((*contextFactor[j]).GetString(GetFactorType(), false));
- if (j < size-k-1)
- curr_ngram.append(":");
- }
- if (curr_ngram != "" && curr_ngram != "") {
-// cerr << "s-ngram: " << curr_ngram << endl;
- accumulator->PlusEquals(this,curr_ngram,1);
- }
- curr_ngram.clear();
- }
- }
+void TargetNgramFeature::MakeSuffixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfEndPos, size_t offset) const {
+ string curr_ngram;
+ size_t size = contextFactor.size();
+ for (size_t k = 0; k < numberOfEndPos; ++k) {
+ size_t end_pos = size-1-k-offset;
+ for (int start_pos=end_pos-1; (start_pos >= 0) && (end_pos-start_pos < GetNGramOrder()); --start_pos) {
+// cerr << "start: " << start_pos << endl;
+// cerr << "end: " << end_pos << endl;
+ for (size_t j=start_pos; j <= end_pos; ++j){
+ curr_ngram.append((*contextFactor[j]).GetString(GetFactorType(), false));
+ if (j < end_pos)
+ curr_ngram.append(":");
+ }
+ if (curr_ngram != "" && curr_ngram != "") {
+// cerr << "s-ngram: " << curr_ngram << endl;
+ accumulator->PlusEquals(this,curr_ngram,1);
+ }
+ curr_ngram.clear();
+ }
+ }
}
bool TargetNgramFeature::Load(const std::string &filePath, FactorType factorType, size_t nGramOrder) {
diff --git a/moses/src/TargetNgramFeature.h b/moses/src/TargetNgramFeature.h
index c02c5ef79..0bb7a391d 100644
--- a/moses/src/TargetNgramFeature.h
+++ b/moses/src/TargetNgramFeature.h
@@ -238,9 +238,9 @@ private:
void appendNgram(const Word& word, bool& skip, std::string& ngram) const;
void ShiftOrPush(std::vector &contextFactor, const Word &word) const;
void MakePrefixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator,
- size_t numberOfStartPos = 1) const;
+ size_t numberOfStartPos = 1, size_t offset = 0) const;
void MakeSuffixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator,
- size_t numberOfEndPos = 1) const;
+ size_t numberOfEndPos = 1, size_t offset = 0) const;
std::vector GetFactorType() const {
std::vector factorType;