mosesdecoder/moses/FF/WordTranslationFeature.cpp
2013-06-10 21:24:31 +01:00

515 lines
18 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <sstream>
#include <boost/algorithm/string.hpp>
#include "WordTranslationFeature.h"
#include "moses/Phrase.h"
#include "moses/TargetPhrase.h"
#include "moses/Hypothesis.h"
#include "moses/ChartHypothesis.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/TranslationOption.h"
#include "moses/UserMessage.h"
#include "util/string_piece_hash.hh"
#include "util/exception.hh"
using namespace std;
namespace Moses
{
WordTranslationFeature::WordTranslationFeature(const std::string &line)
:StatelessFeatureFunction("WordTranslationFeature", 0, line)
,m_unrestricted(true)
,m_simple(true)
,m_sourceContext(false)
,m_targetContext(false)
,m_ignorePunctuation(false)
,m_domainTrigger(false)
{
std::cerr << "Initializing word translation feature.. " << endl;
size_t ind = 0;
while (ind < m_args.size()) {
vector<string> &args = m_args[ind];
bool consumed = OverrideParameter(args[0], args[1]);
if (consumed) {
m_args.erase(m_args.begin() + ind);
} else {
++ind;
}
}
if (m_simple == 1) std::cerr << "using simple word translations.. ";
if (m_sourceContext == 1) std::cerr << "using source context.. ";
if (m_targetContext == 1) std::cerr << "using target context.. ";
if (m_domainTrigger == 1) std::cerr << "using domain triggers.. ";
// compile a list of punctuation characters
if (m_ignorePunctuation) {
std::cerr << "ignoring punctuation for triggers.. ";
char punctuation[] = "\"'!?¿·()#_,.:;•&@/\\0123456789~=";
for (size_t i=0; i < sizeof(punctuation)-1; ++i) {
m_punctuationHash[punctuation[i]] = 1;
}
}
std::cerr << "done." << std::endl;
// TODO not sure about this
/*
if (weight[0] != 1) {
AddSparseProducer(wordTranslationFeature);
cerr << "wt sparse producer weight: " << weight[0] << endl;
if (m_mira)
m_metaFeatureProducer = new MetaFeatureProducer("wt");
}
if (m_parameter->GetParam("report-sparse-features").size() > 0) {
wordTranslationFeature->SetSparseFeatureReporting();
}
*/
}
bool WordTranslationFeature::OverrideParameter(const std::string& key, const std::string& value)
{
if (key == "input-factor") {
m_factorTypeSource = Scan<FactorType>(value);
} else if (key == "output-factor") {
m_factorTypeTarget = Scan<FactorType>(value);
} else if (key == "simple") {
m_simple = Scan<bool>(value);
} else if (key == "source-context") {
m_sourceContext = Scan<bool>(value);
} else if (key == "target-context") {
m_targetContext = Scan<bool>(value);
} else if (key == "ignore-punctuation") {
m_ignorePunctuation = Scan<bool>(value);
} else if (key == "domain-trigger") {
m_domainTrigger = Scan<bool>(value);
} else if (key == "texttype") {
//texttype = value; TODO not used
} else if (key == "source-path") {
m_filePathSource = value;
} else if (key == "target-path") {
m_filePathTarget = value;
} else {
return false;
}
return true;
}
void WordTranslationFeature::Load()
{
// load word list for restricted feature set
if (m_filePathSource.empty()) {
return;
} //else if (tokens.size() == 8) {
cerr << "loading word translation word lists from " << m_filePathSource << " and " << m_filePathTarget << endl;
if (m_domainTrigger) {
// domain trigger terms for each input document
ifstream inFileSource(m_filePathSource.c_str());
UTIL_THROW_IF(!inFileSource, util::Exception, "could not open file " << m_filePathSource);
std::string line;
while (getline(inFileSource, line)) {
m_vocabDomain.resize(m_vocabDomain.size() + 1);
vector<string> termVector;
boost::split(termVector, line, boost::is_any_of("\t "));
for (size_t i=0; i < termVector.size(); ++i)
m_vocabDomain.back().insert(termVector[i]);
}
inFileSource.close();
} else {
// restricted source word vocabulary
ifstream inFileSource(m_filePathSource.c_str());
UTIL_THROW_IF(!inFileSource, util::Exception, "could not open file " << m_filePathSource);
std::string line;
while (getline(inFileSource, line)) {
m_vocabSource.insert(line);
}
inFileSource.close();
// restricted target word vocabulary
ifstream inFileTarget(m_filePathTarget.c_str());
UTIL_THROW_IF(!inFileTarget, util::Exception, "could not open file " << m_filePathTarget);
while (getline(inFileTarget, line)) {
m_vocabTarget.insert(line);
}
inFileTarget.close();
m_unrestricted = false;
}
}
void WordTranslationFeature::Evaluate
(const PhraseBasedFeatureContext& context,
ScoreComponentCollection* accumulator) const
{
const Sentence& input = static_cast<const Sentence&>(context.GetSource());
const TargetPhrase& targetPhrase = context.GetTargetPhrase();
const AlignmentInfo &alignment = targetPhrase.GetAlignTerm();
// process aligned words
for (AlignmentInfo::const_iterator alignmentPoint = alignment.begin(); alignmentPoint != alignment.end(); alignmentPoint++) {
const Phrase& sourcePhrase = targetPhrase.GetSourcePhrase();
int sourceIndex = alignmentPoint->first;
int targetIndex = alignmentPoint->second;
Word ws = sourcePhrase.GetWord(sourceIndex);
if (m_factorTypeSource == 0 && ws.IsNonTerminal()) continue;
Word wt = targetPhrase.GetWord(targetIndex);
if (m_factorTypeSource == 0 && wt.IsNonTerminal()) continue;
StringPiece sourceWord = ws.GetFactor(m_factorTypeSource)->GetString();
StringPiece targetWord = wt.GetFactor(m_factorTypeTarget)->GetString();
if (m_ignorePunctuation) {
// check if source or target are punctuation
char firstChar = sourceWord[0];
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
firstChar = targetWord[0];
charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
}
if (!m_unrestricted) {
if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end())
sourceWord = "OTHER";
if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end())
targetWord = "OTHER";
}
if (m_simple) {
// construct feature name
stringstream featureName;
featureName << m_description << "_";
featureName << sourceWord;
featureName << "~";
featureName << targetWord;
accumulator->SparsePlusEquals(featureName.str(), 1);
}
if (m_domainTrigger && !m_sourceContext) {
const bool use_topicid = input.GetUseTopicId();
const bool use_topicid_prob = input.GetUseTopicIdAndProb();
if (use_topicid || use_topicid_prob) {
if(use_topicid) {
// use topicid as trigger
const long topicid = input.GetTopicId();
stringstream feature;
feature << m_description << "_";
if (topicid == -1)
feature << "unk";
else
feature << topicid;
feature << "_";
feature << sourceWord;
feature << "~";
feature << targetWord;
accumulator->SparsePlusEquals(feature.str(), 1);
} else {
// use topic probabilities
const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
if (atol(topicid_prob[0].c_str()) == -1) {
stringstream feature;
feature << m_description << "_unk_";
feature << sourceWord;
feature << "~";
feature << targetWord;
accumulator->SparsePlusEquals(feature.str(), 1);
} else {
for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
stringstream feature;
feature << m_description << "_";
feature << topicid_prob[i];
feature << "_";
feature << sourceWord;
feature << "~";
feature << targetWord;
accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
}
}
}
} else {
// range over domain trigger words (keywords)
const long docid = input.GetDocumentId();
for (boost::unordered_set<std::string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
string sourceTrigger = *p;
stringstream feature;
feature << m_description << "_";
feature << sourceTrigger;
feature << "_";
feature << sourceWord;
feature << "~";
feature << targetWord;
accumulator->SparsePlusEquals(feature.str(), 1);
}
}
}
if (m_sourceContext) {
size_t globalSourceIndex = context.GetTranslationOption().GetStartPos() + sourceIndex;
if (!m_domainTrigger && globalSourceIndex == 0) {
// add <s> trigger feature for source
stringstream feature;
feature << m_description << "_";
feature << "<s>,";
feature << sourceWord;
feature << "~";
feature << targetWord;
accumulator->SparsePlusEquals(feature.str(), 1);
}
// range over source words to get context
for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
if (contextIndex == globalSourceIndex) continue;
StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString();
if (m_ignorePunctuation) {
// check if trigger is punctuation
char firstChar = sourceTrigger[0];
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
}
const long docid = input.GetDocumentId();
bool sourceTriggerExists = false;
if (m_domainTrigger)
sourceTriggerExists = FindStringPiece(m_vocabDomain[docid], sourceTrigger ) != m_vocabDomain[docid].end();
else if (!m_unrestricted)
sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
if (m_domainTrigger) {
if (sourceTriggerExists) {
stringstream feature;
feature << m_description << "_";
feature << sourceTrigger;
feature << "_";
feature << sourceWord;
feature << "~";
feature << targetWord;
accumulator->SparsePlusEquals(feature.str(), 1);
}
} else if (m_unrestricted || sourceTriggerExists) {
stringstream feature;
feature << m_description << "_";
if (contextIndex < globalSourceIndex) {
feature << sourceTrigger;
feature << ",";
feature << sourceWord;
} else {
feature << sourceWord;
feature << ",";
feature << sourceTrigger;
}
feature << "~";
feature << targetWord;
accumulator->SparsePlusEquals(feature.str(), 1);
}
}
}
if (m_targetContext) {
throw runtime_error("Can't use target words outside current translation option in a stateless feature");
/*
size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex;
if (globalTargetIndex == 0) {
// add <s> trigger feature for source
stringstream feature;
feature << "wt_";
feature << sourceWord;
feature << "~";
feature << "<s>,";
feature << targetWord;
accumulator->SparsePlusEquals(feature.str(), 1);
}
// range over target words (up to current position) to get context
for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) {
string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
if (m_ignorePunctuation) {
// check if trigger is punctuation
char firstChar = targetTrigger.at(0);
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
}
bool targetTriggerExists = false;
if (!m_unrestricted)
targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
if (m_unrestricted || targetTriggerExists) {
stringstream feature;
feature << "wt_";
feature << sourceWord;
feature << "~";
feature << targetTrigger;
feature << ",";
feature << targetWord;
accumulator->SparsePlusEquals(feature.str(), 1);
}
}*/
}
}
}
void WordTranslationFeature::EvaluateChart(
const ChartBasedFeatureContext& context,
ScoreComponentCollection* accumulator) const
{
const TargetPhrase& targetPhrase = context.GetTargetPhrase();
const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm();
// process aligned words
for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); alignmentPoint++) {
const Phrase& sourcePhrase = targetPhrase.GetSourcePhrase();
int sourceIndex = alignmentPoint->first;
int targetIndex = alignmentPoint->second;
Word ws = sourcePhrase.GetWord(sourceIndex);
if (m_factorTypeSource == 0 && ws.IsNonTerminal()) continue;
Word wt = targetPhrase.GetWord(targetIndex);
if (m_factorTypeSource == 0 && wt.IsNonTerminal()) continue;
StringPiece sourceWord = ws.GetFactor(m_factorTypeSource)->GetString();
StringPiece targetWord = wt.GetFactor(m_factorTypeTarget)->GetString();
if (m_ignorePunctuation) {
// check if source or target are punctuation
char firstChar = sourceWord[0];
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
firstChar = targetWord[0];
charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
}
if (!m_unrestricted) {
if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end())
sourceWord = "OTHER";
if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end())
targetWord = "OTHER";
}
if (m_simple) {
// construct feature name
stringstream featureName;
featureName << m_description << "_";
//featureName << ((sourceExists||m_unrestricted) ? sourceWord : "OTHER");
featureName << sourceWord;
featureName << "~";
//featureName << ((targetExists||m_unrestricted) ? targetWord : "OTHER");
featureName << targetWord;
accumulator->SparsePlusEquals(featureName.str(), 1);
}
/* if (m_sourceContext) {
size_t globalSourceIndex = cur_hypo.GetCurrSourceRange().GetStartPos() + sourceIndex;
if (globalSourceIndex == 0) {
// add <s> trigger feature for source
stringstream feature;
feature << "wt_";
feature << "<s>,";
feature << sourceWord;
feature << "~";
feature << targetWord;
accumulator->SparsePlusEquals(feature.str(), 1);
cerr << feature.str() << endl;
}
// range over source words to get context
for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
if (contextIndex == globalSourceIndex) continue;
string sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString();
if (m_ignorePunctuation) {
// check if trigger is punctuation
char firstChar = sourceTrigger.at(0);
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
}
bool sourceTriggerExists = false;
if (!m_unrestricted)
sourceTriggerExists = m_vocabSource.find( sourceTrigger ) != m_vocabSource.end();
if (m_unrestricted || sourceTriggerExists) {
stringstream feature;
feature << "wt_";
if (contextIndex < globalSourceIndex) {
feature << sourceTrigger;
feature << ",";
feature << sourceWord;
}
else {
feature << sourceWord;
feature << ",";
feature << sourceTrigger;
}
feature << "~";
feature << targetWord;
accumulator->SparsePlusEquals(feature.str(), 1);
cerr << feature.str() << endl;
}
}
}*/
/* if (m_targetContext) {
size_t globalTargetIndex = 0; // TODO
// size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex;
if (globalTargetIndex == 0) {
// add <s> trigger feature for source
stringstream feature;
feature << "wt_";
feature << sourceWord;
feature << "~";
feature << "<s>,";
feature << targetWord;
accumulator->SparsePlusEquals(feature.str(), 1);
cerr << feature.str() << endl;
}
// range over target words (up to current position) to get context
for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) {
Phrase outputPhrase = cur_hypo.GetOutputPhrase();
string targetTrigger = outputPhrase.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
//string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
if (m_ignorePunctuation) {
// check if trigger is punctuation
char firstChar = targetTrigger.at(0);
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
}
bool targetTriggerExists = false;
if (!m_unrestricted)
targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
if (m_unrestricted || targetTriggerExists) {
stringstream feature;
feature << "wt_";
feature << sourceWord;
feature << "~";
feature << targetTrigger;
feature << ",";
feature << targetWord;
accumulator->SparsePlusEquals(feature.str(), 1);
cerr << feature.str() << endl;
}
}
}*/
}
}
bool WordTranslationFeature::IsUseable(const FactorMask &mask) const
{
bool ret = mask[m_factorTypeTarget];
return ret;
}
}