2011-08-13 06:40:54 +04:00
|
|
|
|
#include <sstream>
|
2012-09-13 21:16:13 +04:00
|
|
|
|
#include <boost/algorithm/string.hpp>
|
2011-08-13 06:40:54 +04:00
|
|
|
|
#include "WordTranslationFeature.h"
|
2013-05-24 21:02:49 +04:00
|
|
|
|
#include "moses/Phrase.h"
|
|
|
|
|
#include "moses/TargetPhrase.h"
|
|
|
|
|
#include "moses/Hypothesis.h"
|
|
|
|
|
#include "moses/ChartHypothesis.h"
|
|
|
|
|
#include "moses/ScoreComponentCollection.h"
|
|
|
|
|
#include "moses/TranslationOption.h"
|
|
|
|
|
#include "moses/UserMessage.h"
|
2013-04-25 22:42:30 +04:00
|
|
|
|
#include "util/string_piece_hash.hh"
|
2013-06-09 23:23:10 +04:00
|
|
|
|
#include "util/exception.hh"
|
|
|
|
|
|
|
|
|
|
using namespace std;
|
2011-08-13 06:40:54 +04:00
|
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
|
namespace Moses
|
|
|
|
|
{
|
2011-08-13 06:40:54 +04:00
|
|
|
|
|
2013-01-03 01:26:56 +04:00
|
|
|
|
// Constructs the feature from its configuration line.
//
// Defaults set here: unrestricted vocabulary, simple word-pair features on,
// source context / target context / punctuation filtering / domain triggers
// off, and both factor types 0. ReadParameters() is then called so that the
// configuration line can override these (presumably by dispatching each
// key/value pair to SetParameter() -- confirm against the base class).
//
// When ignore-punctuation ends up enabled, a lookup table of punctuation and
// digit characters is filled; Evaluate()/EvaluateChart() consult it with the
// first char of each word.
WordTranslationFeature::WordTranslationFeature(const std::string &line)
  :StatelessFeatureFunction("WordTranslationFeature", 0, line)
  ,m_unrestricted(true)
  ,m_simple(true)
  ,m_sourceContext(false)
  ,m_targetContext(false)
  ,m_ignorePunctuation(false)
  ,m_domainTrigger(false)
{
  std::cerr << "Initializing word translation feature.. " << endl;

  // The factor types are only written by SetParameter() when the
  // configuration provides "input-factor"/"output-factor". Give them a
  // defined value first so that later reads never see an indeterminate
  // value. They are assigned in the body (not the initializer list) because
  // the member declaration order is not visible from this file.
  m_factorTypeSource = 0;
  m_factorTypeTarget = 0;

  ReadParameters();

  if (m_simple) std::cerr << "using simple word translations.. ";
  if (m_sourceContext) std::cerr << "using source context.. ";
  if (m_targetContext) std::cerr << "using target context.. ";
  if (m_domainTrigger) std::cerr << "using domain triggers.. ";

  // compile a list of punctuation characters
  if (m_ignorePunctuation) {
    std::cerr << "ignoring punctuation for triggers.. ";
    // The table is filled one char at a time, so any multi-byte character in
    // this literal contributes each of its bytes as a separate entry.
    char punctuation[] = "\"'!?¿·()#_,.:;•&@‑/\\0123456789~=";
    // sizeof - 1: skip the terminating NUL of the literal
    for (size_t i=0; i < sizeof(punctuation)-1; ++i) {
      m_punctuationHash[punctuation[i]] = 1;
    }
  }

  std::cerr << "done." << std::endl;

  // TODO not sure about this
  /*
  if (weight[0] != 1) {
    AddSparseProducer(wordTranslationFeature);
    cerr << "wt sparse producer weight: " << weight[0] << endl;
    if (m_mira)
      m_metaFeatureProducer = new MetaFeatureProducer("wt");
  }

  if (m_parameter->GetParam("report-sparse-features").size() > 0) {
    wordTranslationFeature->SetSparseFeatureReporting();
  }
  */
}
|
2011-08-13 06:40:54 +04:00
|
|
|
|
|
2013-06-20 16:25:02 +04:00
|
|
|
|
void WordTranslationFeature::SetParameter(const std::string& key, const std::string& value)
|
2013-06-10 21:11:55 +04:00
|
|
|
|
{
|
|
|
|
|
if (key == "input-factor") {
|
|
|
|
|
m_factorTypeSource = Scan<FactorType>(value);
|
|
|
|
|
} else if (key == "output-factor") {
|
|
|
|
|
m_factorTypeTarget = Scan<FactorType>(value);
|
|
|
|
|
} else if (key == "simple") {
|
|
|
|
|
m_simple = Scan<bool>(value);
|
|
|
|
|
} else if (key == "source-context") {
|
|
|
|
|
m_sourceContext = Scan<bool>(value);
|
|
|
|
|
} else if (key == "target-context") {
|
|
|
|
|
m_targetContext = Scan<bool>(value);
|
|
|
|
|
} else if (key == "ignore-punctuation") {
|
|
|
|
|
m_ignorePunctuation = Scan<bool>(value);
|
|
|
|
|
} else if (key == "domain-trigger") {
|
|
|
|
|
m_domainTrigger = Scan<bool>(value);
|
|
|
|
|
} else if (key == "texttype") {
|
|
|
|
|
//texttype = value; TODO not used
|
|
|
|
|
} else if (key == "source-path") {
|
|
|
|
|
m_filePathSource = value;
|
|
|
|
|
} else if (key == "target-path") {
|
|
|
|
|
m_filePathTarget = value;
|
|
|
|
|
} else {
|
2013-06-20 16:25:02 +04:00
|
|
|
|
StatelessFeatureFunction::SetParameter(key, value);
|
2013-06-10 21:11:55 +04:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2013-06-09 23:23:10 +04:00
|
|
|
|
void WordTranslationFeature::Load()
|
2011-08-13 06:40:54 +04:00
|
|
|
|
{
|
2013-06-09 23:23:10 +04:00
|
|
|
|
// load word list for restricted feature set
|
|
|
|
|
if (m_filePathSource.empty()) {
|
|
|
|
|
return;
|
|
|
|
|
} //else if (tokens.size() == 8) {
|
|
|
|
|
|
|
|
|
|
cerr << "loading word translation word lists from " << m_filePathSource << " and " << m_filePathTarget << endl;
|
2012-07-26 20:32:50 +04:00
|
|
|
|
if (m_domainTrigger) {
|
|
|
|
|
// domain trigger terms for each input document
|
2013-06-09 23:23:10 +04:00
|
|
|
|
ifstream inFileSource(m_filePathSource.c_str());
|
|
|
|
|
UTIL_THROW_IF(!inFileSource, util::Exception, "could not open file " << m_filePathSource);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
|
2012-07-26 20:32:50 +04:00
|
|
|
|
std::string line;
|
|
|
|
|
while (getline(inFileSource, line)) {
|
2013-05-29 21:16:15 +04:00
|
|
|
|
m_vocabDomain.resize(m_vocabDomain.size() + 1);
|
|
|
|
|
vector<string> termVector;
|
|
|
|
|
boost::split(termVector, line, boost::is_any_of("\t "));
|
|
|
|
|
for (size_t i=0; i < termVector.size(); ++i)
|
|
|
|
|
m_vocabDomain.back().insert(termVector[i]);
|
2012-07-26 20:32:50 +04:00
|
|
|
|
}
|
2013-05-29 21:16:15 +04:00
|
|
|
|
|
2012-07-26 20:32:50 +04:00
|
|
|
|
inFileSource.close();
|
2013-05-29 21:16:15 +04:00
|
|
|
|
} else {
|
2012-07-26 20:32:50 +04:00
|
|
|
|
// restricted source word vocabulary
|
2013-06-09 23:23:10 +04:00
|
|
|
|
ifstream inFileSource(m_filePathSource.c_str());
|
|
|
|
|
UTIL_THROW_IF(!inFileSource, util::Exception, "could not open file " << m_filePathSource);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
|
2012-07-26 20:32:50 +04:00
|
|
|
|
std::string line;
|
|
|
|
|
while (getline(inFileSource, line)) {
|
|
|
|
|
m_vocabSource.insert(line);
|
|
|
|
|
}
|
2013-05-29 21:16:15 +04:00
|
|
|
|
|
2012-07-26 20:32:50 +04:00
|
|
|
|
inFileSource.close();
|
2013-05-29 21:16:15 +04:00
|
|
|
|
|
2012-07-26 20:32:50 +04:00
|
|
|
|
// restricted target word vocabulary
|
2013-06-09 23:23:10 +04:00
|
|
|
|
ifstream inFileTarget(m_filePathTarget.c_str());
|
|
|
|
|
UTIL_THROW_IF(!inFileTarget, util::Exception, "could not open file " << m_filePathTarget);
|
2013-05-29 21:16:15 +04:00
|
|
|
|
|
2012-07-26 20:32:50 +04:00
|
|
|
|
while (getline(inFileTarget, line)) {
|
|
|
|
|
m_vocabTarget.insert(line);
|
|
|
|
|
}
|
2013-05-29 21:16:15 +04:00
|
|
|
|
|
2012-07-26 20:32:50 +04:00
|
|
|
|
inFileTarget.close();
|
2013-05-29 21:16:15 +04:00
|
|
|
|
|
2012-07-26 20:32:50 +04:00
|
|
|
|
m_unrestricted = false;
|
2011-08-13 06:40:54 +04:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2012-09-13 21:16:13 +04:00
|
|
|
|
void WordTranslationFeature::Evaluate
|
2013-05-29 21:16:15 +04:00
|
|
|
|
(const PhraseBasedFeatureContext& context,
|
|
|
|
|
ScoreComponentCollection* accumulator) const
|
2011-08-13 06:40:54 +04:00
|
|
|
|
{
|
2012-09-19 21:00:53 +04:00
|
|
|
|
const Sentence& input = static_cast<const Sentence&>(context.GetSource());
|
|
|
|
|
const TargetPhrase& targetPhrase = context.GetTargetPhrase();
|
2012-10-19 18:10:10 +04:00
|
|
|
|
const AlignmentInfo &alignment = targetPhrase.GetAlignTerm();
|
2011-08-13 06:40:54 +04:00
|
|
|
|
|
|
|
|
|
// process aligned words
|
|
|
|
|
for (AlignmentInfo::const_iterator alignmentPoint = alignment.begin(); alignmentPoint != alignment.end(); alignmentPoint++) {
|
2012-07-26 20:32:50 +04:00
|
|
|
|
const Phrase& sourcePhrase = targetPhrase.GetSourcePhrase();
|
|
|
|
|
int sourceIndex = alignmentPoint->first;
|
|
|
|
|
int targetIndex = alignmentPoint->second;
|
|
|
|
|
Word ws = sourcePhrase.GetWord(sourceIndex);
|
|
|
|
|
if (m_factorTypeSource == 0 && ws.IsNonTerminal()) continue;
|
|
|
|
|
Word wt = targetPhrase.GetWord(targetIndex);
|
|
|
|
|
if (m_factorTypeSource == 0 && wt.IsNonTerminal()) continue;
|
2013-04-25 22:42:30 +04:00
|
|
|
|
StringPiece sourceWord = ws.GetFactor(m_factorTypeSource)->GetString();
|
|
|
|
|
StringPiece targetWord = wt.GetFactor(m_factorTypeTarget)->GetString();
|
2012-03-20 23:20:32 +04:00
|
|
|
|
if (m_ignorePunctuation) {
|
2012-04-09 23:47:51 +04:00
|
|
|
|
// check if source or target are punctuation
|
2013-04-29 21:46:48 +04:00
|
|
|
|
char firstChar = sourceWord[0];
|
2012-03-20 23:20:32 +04:00
|
|
|
|
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
|
|
|
|
|
if(charIterator != m_punctuationHash.end())
|
2013-05-29 21:16:15 +04:00
|
|
|
|
continue;
|
2013-04-29 21:46:48 +04:00
|
|
|
|
firstChar = targetWord[0];
|
2012-03-20 23:20:32 +04:00
|
|
|
|
charIterator = m_punctuationHash.find( firstChar );
|
|
|
|
|
if(charIterator != m_punctuationHash.end())
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2012-03-07 21:56:29 +04:00
|
|
|
|
if (!m_unrestricted) {
|
2013-04-25 22:42:30 +04:00
|
|
|
|
if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end())
|
2013-05-29 21:16:15 +04:00
|
|
|
|
sourceWord = "OTHER";
|
2013-04-25 22:42:30 +04:00
|
|
|
|
if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end())
|
2013-05-29 21:16:15 +04:00
|
|
|
|
targetWord = "OTHER";
|
2012-03-07 21:56:29 +04:00
|
|
|
|
}
|
2012-04-09 23:47:51 +04:00
|
|
|
|
|
|
|
|
|
if (m_simple) {
|
2012-10-03 21:53:55 +04:00
|
|
|
|
// construct feature name
|
|
|
|
|
stringstream featureName;
|
2013-05-15 14:37:21 +04:00
|
|
|
|
featureName << m_description << "_";
|
2012-10-03 21:53:55 +04:00
|
|
|
|
featureName << sourceWord;
|
|
|
|
|
featureName << "~";
|
|
|
|
|
featureName << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(featureName.str(), 1);
|
2012-04-09 23:47:51 +04:00
|
|
|
|
}
|
2013-05-24 14:50:44 +04:00
|
|
|
|
if (m_domainTrigger && !m_sourceContext) {
|
2012-10-03 21:53:55 +04:00
|
|
|
|
const bool use_topicid = input.GetUseTopicId();
|
2013-05-24 14:50:44 +04:00
|
|
|
|
const bool use_topicid_prob = input.GetUseTopicIdAndProb();
|
2012-07-26 20:32:50 +04:00
|
|
|
|
if (use_topicid || use_topicid_prob) {
|
2013-05-29 21:16:15 +04:00
|
|
|
|
if(use_topicid) {
|
|
|
|
|
// use topicid as trigger
|
|
|
|
|
const long topicid = input.GetTopicId();
|
|
|
|
|
stringstream feature;
|
|
|
|
|
feature << m_description << "_";
|
|
|
|
|
if (topicid == -1)
|
|
|
|
|
feature << "unk";
|
|
|
|
|
else
|
|
|
|
|
feature << topicid;
|
|
|
|
|
|
|
|
|
|
feature << "_";
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
feature << "~";
|
|
|
|
|
feature << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(feature.str(), 1);
|
|
|
|
|
} else {
|
|
|
|
|
// use topic probabilities
|
|
|
|
|
const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
|
|
|
|
|
if (atol(topicid_prob[0].c_str()) == -1) {
|
|
|
|
|
stringstream feature;
|
|
|
|
|
feature << m_description << "_unk_";
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
feature << "~";
|
|
|
|
|
feature << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(feature.str(), 1);
|
|
|
|
|
} else {
|
|
|
|
|
for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
|
|
|
|
|
stringstream feature;
|
|
|
|
|
feature << m_description << "_";
|
|
|
|
|
feature << topicid_prob[i];
|
|
|
|
|
feature << "_";
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
feature << "~";
|
|
|
|
|
feature << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
// range over domain trigger words (keywords)
|
|
|
|
|
const long docid = input.GetDocumentId();
|
|
|
|
|
for (boost::unordered_set<std::string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
|
|
|
|
|
string sourceTrigger = *p;
|
|
|
|
|
stringstream feature;
|
|
|
|
|
feature << m_description << "_";
|
|
|
|
|
feature << sourceTrigger;
|
|
|
|
|
feature << "_";
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
feature << "~";
|
|
|
|
|
feature << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(feature.str(), 1);
|
|
|
|
|
}
|
2012-07-26 20:32:50 +04:00
|
|
|
|
}
|
|
|
|
|
}
|
2012-04-09 23:47:51 +04:00
|
|
|
|
if (m_sourceContext) {
|
2012-10-03 21:53:55 +04:00
|
|
|
|
size_t globalSourceIndex = context.GetTranslationOption().GetStartPos() + sourceIndex;
|
|
|
|
|
if (!m_domainTrigger && globalSourceIndex == 0) {
|
2013-05-29 21:16:15 +04:00
|
|
|
|
// add <s> trigger feature for source
|
|
|
|
|
stringstream feature;
|
|
|
|
|
feature << m_description << "_";
|
|
|
|
|
feature << "<s>,";
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
feature << "~";
|
|
|
|
|
feature << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(feature.str(), 1);
|
2012-10-03 21:53:55 +04:00
|
|
|
|
}
|
2013-05-24 14:50:44 +04:00
|
|
|
|
|
2012-10-03 21:53:55 +04:00
|
|
|
|
// range over source words to get context
|
|
|
|
|
for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
|
2013-05-29 21:16:15 +04:00
|
|
|
|
if (contextIndex == globalSourceIndex) continue;
|
|
|
|
|
StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString();
|
|
|
|
|
if (m_ignorePunctuation) {
|
|
|
|
|
// check if trigger is punctuation
|
|
|
|
|
char firstChar = sourceTrigger[0];
|
|
|
|
|
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
|
|
|
|
|
if(charIterator != m_punctuationHash.end())
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const long docid = input.GetDocumentId();
|
|
|
|
|
bool sourceTriggerExists = false;
|
|
|
|
|
if (m_domainTrigger)
|
|
|
|
|
sourceTriggerExists = FindStringPiece(m_vocabDomain[docid], sourceTrigger ) != m_vocabDomain[docid].end();
|
|
|
|
|
else if (!m_unrestricted)
|
|
|
|
|
sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
|
|
|
|
|
|
|
|
|
|
if (m_domainTrigger) {
|
|
|
|
|
if (sourceTriggerExists) {
|
|
|
|
|
stringstream feature;
|
|
|
|
|
feature << m_description << "_";
|
|
|
|
|
feature << sourceTrigger;
|
|
|
|
|
feature << "_";
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
feature << "~";
|
|
|
|
|
feature << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(feature.str(), 1);
|
|
|
|
|
}
|
|
|
|
|
} else if (m_unrestricted || sourceTriggerExists) {
|
|
|
|
|
stringstream feature;
|
|
|
|
|
feature << m_description << "_";
|
|
|
|
|
if (contextIndex < globalSourceIndex) {
|
|
|
|
|
feature << sourceTrigger;
|
|
|
|
|
feature << ",";
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
} else {
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
feature << ",";
|
|
|
|
|
feature << sourceTrigger;
|
|
|
|
|
}
|
|
|
|
|
feature << "~";
|
|
|
|
|
feature << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(feature.str(), 1);
|
|
|
|
|
}
|
2012-10-03 21:53:55 +04:00
|
|
|
|
}
|
2012-04-09 23:47:51 +04:00
|
|
|
|
}
|
|
|
|
|
if (m_targetContext) {
|
2012-09-13 21:16:13 +04:00
|
|
|
|
throw runtime_error("Can't use target words outside current translation option in a stateless feature");
|
|
|
|
|
/*
|
2013-05-29 21:16:15 +04:00
|
|
|
|
size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex;
|
|
|
|
|
if (globalTargetIndex == 0) {
|
|
|
|
|
// add <s> trigger feature for source
|
|
|
|
|
stringstream feature;
|
|
|
|
|
feature << "wt_";
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
feature << "~";
|
|
|
|
|
feature << "<s>,";
|
|
|
|
|
feature << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(feature.str(), 1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// range over target words (up to current position) to get context
|
|
|
|
|
for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) {
|
|
|
|
|
string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
|
|
|
|
|
if (m_ignorePunctuation) {
|
|
|
|
|
// check if trigger is punctuation
|
|
|
|
|
char firstChar = targetTrigger.at(0);
|
|
|
|
|
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
|
|
|
|
|
if(charIterator != m_punctuationHash.end())
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool targetTriggerExists = false;
|
|
|
|
|
if (!m_unrestricted)
|
|
|
|
|
targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
|
|
|
|
|
|
|
|
|
|
if (m_unrestricted || targetTriggerExists) {
|
|
|
|
|
stringstream feature;
|
|
|
|
|
feature << "wt_";
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
feature << "~";
|
|
|
|
|
feature << targetTrigger;
|
|
|
|
|
feature << ",";
|
|
|
|
|
feature << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(feature.str(), 1);
|
|
|
|
|
}
|
|
|
|
|
}*/
|
2012-04-09 23:47:51 +04:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2012-02-29 00:22:09 +04:00
|
|
|
|
|
2012-09-14 01:08:01 +04:00
|
|
|
|
void WordTranslationFeature::EvaluateChart(
|
2013-05-29 21:16:15 +04:00
|
|
|
|
const ChartBasedFeatureContext& context,
|
|
|
|
|
ScoreComponentCollection* accumulator) const
|
2012-04-09 23:47:51 +04:00
|
|
|
|
{
|
2012-09-21 14:56:01 +04:00
|
|
|
|
const TargetPhrase& targetPhrase = context.GetTargetPhrase();
|
2012-10-19 18:10:10 +04:00
|
|
|
|
const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm();
|
2012-04-09 23:47:51 +04:00
|
|
|
|
|
|
|
|
|
// process aligned words
|
2012-10-19 19:00:42 +04:00
|
|
|
|
for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); alignmentPoint++) {
|
2012-04-10 18:48:38 +04:00
|
|
|
|
const Phrase& sourcePhrase = targetPhrase.GetSourcePhrase();
|
|
|
|
|
int sourceIndex = alignmentPoint->first;
|
|
|
|
|
int targetIndex = alignmentPoint->second;
|
|
|
|
|
Word ws = sourcePhrase.GetWord(sourceIndex);
|
|
|
|
|
if (m_factorTypeSource == 0 && ws.IsNonTerminal()) continue;
|
|
|
|
|
Word wt = targetPhrase.GetWord(targetIndex);
|
|
|
|
|
if (m_factorTypeSource == 0 && wt.IsNonTerminal()) continue;
|
2013-04-25 22:42:30 +04:00
|
|
|
|
StringPiece sourceWord = ws.GetFactor(m_factorTypeSource)->GetString();
|
|
|
|
|
StringPiece targetWord = wt.GetFactor(m_factorTypeTarget)->GetString();
|
2012-04-09 23:47:51 +04:00
|
|
|
|
if (m_ignorePunctuation) {
|
|
|
|
|
// check if source or target are punctuation
|
2013-04-25 22:42:30 +04:00
|
|
|
|
char firstChar = sourceWord[0];
|
2012-04-09 23:47:51 +04:00
|
|
|
|
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
|
|
|
|
|
if(charIterator != m_punctuationHash.end())
|
2013-05-29 21:16:15 +04:00
|
|
|
|
continue;
|
2013-04-25 22:42:30 +04:00
|
|
|
|
firstChar = targetWord[0];
|
2012-04-09 23:47:51 +04:00
|
|
|
|
charIterator = m_punctuationHash.find( firstChar );
|
|
|
|
|
if(charIterator != m_punctuationHash.end())
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!m_unrestricted) {
|
2013-05-29 21:16:15 +04:00
|
|
|
|
if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end())
|
|
|
|
|
sourceWord = "OTHER";
|
|
|
|
|
if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end())
|
|
|
|
|
targetWord = "OTHER";
|
2012-04-09 23:47:51 +04:00
|
|
|
|
}
|
2013-05-29 21:16:15 +04:00
|
|
|
|
|
2012-04-09 23:47:51 +04:00
|
|
|
|
if (m_simple) {
|
2013-05-29 21:16:15 +04:00
|
|
|
|
// construct feature name
|
|
|
|
|
stringstream featureName;
|
|
|
|
|
featureName << m_description << "_";
|
|
|
|
|
//featureName << ((sourceExists||m_unrestricted) ? sourceWord : "OTHER");
|
|
|
|
|
featureName << sourceWord;
|
|
|
|
|
featureName << "~";
|
|
|
|
|
//featureName << ((targetExists||m_unrestricted) ? targetWord : "OTHER");
|
|
|
|
|
featureName << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(featureName.str(), 1);
|
2012-04-09 23:47:51 +04:00
|
|
|
|
}
|
2013-05-29 21:16:15 +04:00
|
|
|
|
/* if (m_sourceContext) {
|
|
|
|
|
size_t globalSourceIndex = cur_hypo.GetCurrSourceRange().GetStartPos() + sourceIndex;
|
|
|
|
|
if (globalSourceIndex == 0) {
|
|
|
|
|
// add <s> trigger feature for source
|
|
|
|
|
stringstream feature;
|
|
|
|
|
feature << "wt_";
|
|
|
|
|
feature << "<s>,";
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
feature << "~";
|
|
|
|
|
feature << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(feature.str(), 1);
|
|
|
|
|
cerr << feature.str() << endl;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// range over source words to get context
|
|
|
|
|
for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
|
|
|
|
|
if (contextIndex == globalSourceIndex) continue;
|
|
|
|
|
string sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString();
|
|
|
|
|
if (m_ignorePunctuation) {
|
|
|
|
|
// check if trigger is punctuation
|
|
|
|
|
char firstChar = sourceTrigger.at(0);
|
|
|
|
|
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
|
|
|
|
|
if(charIterator != m_punctuationHash.end())
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool sourceTriggerExists = false;
|
|
|
|
|
if (!m_unrestricted)
|
|
|
|
|
sourceTriggerExists = m_vocabSource.find( sourceTrigger ) != m_vocabSource.end();
|
|
|
|
|
|
|
|
|
|
if (m_unrestricted || sourceTriggerExists) {
|
|
|
|
|
stringstream feature;
|
|
|
|
|
feature << "wt_";
|
|
|
|
|
if (contextIndex < globalSourceIndex) {
|
|
|
|
|
feature << sourceTrigger;
|
|
|
|
|
feature << ",";
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
feature << ",";
|
|
|
|
|
feature << sourceTrigger;
|
|
|
|
|
}
|
|
|
|
|
feature << "~";
|
|
|
|
|
feature << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(feature.str(), 1);
|
|
|
|
|
cerr << feature.str() << endl;
|
|
|
|
|
}
|
|
|
|
|
}
|
2012-04-09 23:47:51 +04:00
|
|
|
|
}*/
|
2013-05-29 21:16:15 +04:00
|
|
|
|
/* if (m_targetContext) {
|
|
|
|
|
size_t globalTargetIndex = 0; // TODO
|
|
|
|
|
// size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex;
|
|
|
|
|
if (globalTargetIndex == 0) {
|
|
|
|
|
// add <s> trigger feature for source
|
|
|
|
|
stringstream feature;
|
|
|
|
|
feature << "wt_";
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
feature << "~";
|
|
|
|
|
feature << "<s>,";
|
|
|
|
|
feature << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(feature.str(), 1);
|
|
|
|
|
cerr << feature.str() << endl;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// range over target words (up to current position) to get context
|
|
|
|
|
for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) {
|
|
|
|
|
Phrase outputPhrase = cur_hypo.GetOutputPhrase();
|
|
|
|
|
string targetTrigger = outputPhrase.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
|
|
|
|
|
//string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
|
|
|
|
|
if (m_ignorePunctuation) {
|
|
|
|
|
// check if trigger is punctuation
|
|
|
|
|
char firstChar = targetTrigger.at(0);
|
|
|
|
|
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
|
|
|
|
|
if(charIterator != m_punctuationHash.end())
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool targetTriggerExists = false;
|
|
|
|
|
if (!m_unrestricted)
|
|
|
|
|
targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
|
|
|
|
|
|
|
|
|
|
if (m_unrestricted || targetTriggerExists) {
|
|
|
|
|
stringstream feature;
|
|
|
|
|
feature << "wt_";
|
|
|
|
|
feature << sourceWord;
|
|
|
|
|
feature << "~";
|
|
|
|
|
feature << targetTrigger;
|
|
|
|
|
feature << ",";
|
|
|
|
|
feature << targetWord;
|
|
|
|
|
accumulator->SparsePlusEquals(feature.str(), 1);
|
|
|
|
|
cerr << feature.str() << endl;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}*/
|
2012-04-09 23:47:51 +04:00
|
|
|
|
}
|
2012-09-21 18:00:24 +04:00
|
|
|
|
|
2012-02-29 00:22:09 +04:00
|
|
|
|
}
|
|
|
|
|
|
2013-05-30 15:41:08 +04:00
|
|
|
|
bool WordTranslationFeature::IsUseable(const FactorMask &mask) const
|
|
|
|
|
{
|
2013-05-30 15:51:40 +04:00
|
|
|
|
bool ret = mask[m_factorTypeTarget];
|
|
|
|
|
return ret;
|
2013-05-30 15:41:08 +04:00
|
|
|
|
}
|
|
|
|
|
|
2011-08-13 06:40:54 +04:00
|
|
|
|
}
|