// mosesdecoder/moses/FF/PhrasePairFeature.cpp
// (252 lines, 8.1 KiB, C++)
#include <boost/algorithm/string.hpp>
#include "PhrasePairFeature.h"
2013-05-24 21:02:49 +04:00
#include "moses/AlignmentInfo.h"
#include "moses/TargetPhrase.h"
#include "moses/Hypothesis.h"
#include "moses/TranslationOption.h"
#include "moses/InputPath.h"
#include "util/string_piece_hash.hh"
#include "util/exception.hh"
using namespace std;
2013-05-29 21:16:15 +04:00
namespace Moses
{
2013-01-15 19:04:32 +04:00
/**
 * Build the feature from its configuration line.
 *
 * The line is handed to the StatelessFeatureFunction base and then parsed
 * via ReadParameters(). Afterwards the active modes are reported on stderr
 * and, when punctuation is to be ignored, the punctuation lookup table is
 * filled.
 *
 * NOTE(review): the mode flags are read here without any initialisation
 * visible in this constructor; presumably they are given defaults elsewhere
 * or are always present on the configuration line -- TODO confirm.
 */
PhrasePairFeature::PhrasePairFeature(const std::string &line)
  :StatelessFeatureFunction(0, line)
{
  std::cerr << "Initializing PhrasePairFeature.." << std::endl;
  ReadParameters();

  if (m_simple == 1) {
    std::cerr << "using simple phrase pairs.. ";
  }
  if (m_sourceContext == 1) {
    std::cerr << "using source context.. ";
  }
  if (m_domainTrigger == 1) {
    std::cerr << "using domain triggers.. ";
  }

  if (!m_ignorePunctuation) {
    return;
  }

  // Register every byte of the list below (the terminating NUL excluded) as
  // punctuation. The table is keyed by single chars, so a character that is
  // encoded with several bytes contributes each of its bytes separately.
  std::cerr << "ignoring punctuation for triggers.. ";
  char punctuation[] = "\"'!?¿·()#_,.:;•&@/\\0123456789~=";
  const char *const listEnd = punctuation + (sizeof(punctuation) - 1);
  for (const char *cursor = punctuation; cursor != listEnd; ++cursor) {
    m_punctuationHash[*cursor] = 1;
  }
}
/**
 * Apply one key/value pair from the feature's configuration line.
 *
 * Recognised keys:
 *   input-factor        source factor used when building feature names
 *   output-factor       target factor used when building feature names
 *   unrestricted        boolean flag stored in m_unrestricted
 *   simple              enable plain phrase-pair features
 *   source-context      enable features triggered by source sentence words
 *   domain-trigger      enable features triggered by document/topic
 *   ignore-punctuation  skip punctuation when picking source triggers
 *   path                file read by Load() (stored in m_filePathSource)
 *
 * Any other key is forwarded to StatelessFeatureFunction::SetParameter.
 *
 * The file-path branch used to test for "ignore-punctuation" a second time,
 * which made it unreachable and left m_filePathSource impossible to set from
 * the configuration; it is now keyed on "path".
 */
void PhrasePairFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "input-factor") {
    m_sourceFactorId = Scan<FactorType>(value);
  } else if (key == "output-factor") {
    m_targetFactorId = Scan<FactorType>(value);
  } else if (key == "unrestricted") {
    m_unrestricted = Scan<bool>(value);
  } else if (key == "simple") {
    m_simple = Scan<bool>(value);
  } else if (key == "source-context") {
    m_sourceContext = Scan<bool>(value);
  } else if (key == "domain-trigger") {
    m_domainTrigger = Scan<bool>(value);
  } else if (key == "ignore-punctuation") {
    m_ignorePunctuation = Scan<bool>(value);
  } else if (key == "path") {
    m_filePathSource = value;
  } else {
    StatelessFeatureFunction::SetParameter(key, value);
  }
}
void PhrasePairFeature::Load()
{
if (m_domainTrigger) {
// domain trigger terms for each input document
ifstream inFileSource(m_filePathSource.c_str());
2013-11-23 00:27:46 +04:00
UTIL_THROW_IF2(!inFileSource, "could not open file " << m_filePathSource);
2013-05-29 21:16:15 +04:00
std::string line;
while (getline(inFileSource, line)) {
std::set<std::string> terms;
vector<string> termVector;
boost::split(termVector, line, boost::is_any_of("\t "));
2013-05-29 21:16:15 +04:00
for (size_t i=0; i < termVector.size(); ++i)
2013-01-15 19:04:32 +04:00
terms.insert(termVector[i]);
2013-05-29 21:16:15 +04:00
// add term set for current document
m_vocabDomain.push_back(terms);
}
2013-05-29 21:16:15 +04:00
inFileSource.close();
2013-05-29 21:16:15 +04:00
} else {
// restricted source word vocabulary
ifstream inFileSource(m_filePathSource.c_str());
2013-11-23 00:27:46 +04:00
UTIL_THROW_IF2(!inFileSource, "could not open file " << m_filePathSource);
2013-05-29 21:16:15 +04:00
std::string line;
while (getline(inFileSource, line)) {
m_vocabSource.insert(line);
}
2013-05-29 21:16:15 +04:00
inFileSource.close();
2013-05-29 21:16:15 +04:00
/* // restricted target word vocabulary
ifstream inFileTarget(filePathTarget.c_str());
if (!inFileTarget)
{
cerr << "could not open file " << filePathTarget << endl;
return false;
}
while (getline(inFileTarget, line)) {
m_vocabTarget.insert(line);
}
inFileTarget.close();*/
m_unrestricted = false;
}
}
2012-09-07 19:57:53 +04:00
void PhrasePairFeature::Evaluate(
2013-08-23 17:25:25 +04:00
const Hypothesis& hypo,
2013-05-29 21:16:15 +04:00
ScoreComponentCollection* accumulator) const
{
2013-08-23 17:25:25 +04:00
const TargetPhrase& target = hypo.GetCurrTargetPhrase();
const Phrase& source = hypo.GetTranslationOption().GetInputPath().GetPhrase();
if (m_simple) {
ostringstream namestr;
namestr << "pp_";
namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
namestr << ",";
namestr << sourceFactor->GetString();
}
namestr << "~";
namestr << target.GetWord(0).GetFactor(m_targetFactorId)->GetString();
for (size_t i = 1; i < target.GetSize(); ++i) {
const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId);
namestr << ",";
namestr << targetFactor->GetString();
}
2013-05-29 21:16:15 +04:00
accumulator->SparsePlusEquals(namestr.str(),1);
}
if (m_domainTrigger) {
2013-08-23 17:25:25 +04:00
const Sentence& input = static_cast<const Sentence&>(hypo.GetInput());
const bool use_topicid = input.GetUseTopicId();
const bool use_topicid_prob = input.GetUseTopicIdAndProb();
// compute pair
ostringstream pair;
pair << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
pair << ",";
pair << sourceFactor->GetString();
}
pair << "~";
pair << target.GetWord(0).GetFactor(m_targetFactorId)->GetString();
for (size_t i = 1; i < target.GetSize(); ++i) {
const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId);
pair << ",";
pair << targetFactor->GetString();
}
2013-05-29 21:16:15 +04:00
if (use_topicid || use_topicid_prob) {
if(use_topicid) {
2013-05-29 21:16:15 +04:00
// use topicid as trigger
const long topicid = input.GetTopicId();
stringstream feature;
feature << "pp_";
if (topicid == -1)
feature << "unk";
else
feature << topicid;
feature << "_";
feature << pair.str();
accumulator->SparsePlusEquals(feature.str(), 1);
} else {
// use topic probabilities
const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
if (atol(topicid_prob[0].c_str()) == -1) {
stringstream feature;
feature << "pp_unk_";
feature << pair.str();
accumulator->SparsePlusEquals(feature.str(), 1);
} else {
for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
stringstream feature;
feature << "pp_";
feature << topicid_prob[i];
feature << "_";
feature << pair.str();
accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
}
}
}
2013-05-29 21:16:15 +04:00
} else {
// range over domain trigger words
const long docid = input.GetDocumentId();
for (set<string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
2013-05-29 21:16:15 +04:00
string sourceTrigger = *p;
ostringstream namestr;
namestr << "pp_";
namestr << sourceTrigger;
namestr << "_";
namestr << pair.str();
accumulator->SparsePlusEquals(namestr.str(),1);
}
}
}
if (m_sourceContext) {
2013-08-23 17:25:25 +04:00
const Sentence& input = static_cast<const Sentence&>(hypo.GetInput());
2013-05-29 21:16:15 +04:00
// range over source words to get context
for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_sourceFactorId)->GetString();
if (m_ignorePunctuation) {
2013-05-29 21:16:15 +04:00
// check if trigger is punctuation
char firstChar = sourceTrigger[0];
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
}
2013-05-29 21:16:15 +04:00
bool sourceTriggerExists = false;
if (!m_unrestricted)
2013-05-29 21:16:15 +04:00
sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
if (m_unrestricted || sourceTriggerExists) {
2013-05-29 21:16:15 +04:00
ostringstream namestr;
namestr << "pp_";
namestr << sourceTrigger;
namestr << "~";
namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
for (size_t i = 1; i < source.GetSize(); ++i) {
const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
namestr << ",";
namestr << sourceFactor->GetString();
}
namestr << "~";
namestr << target.GetWord(0).GetFactor(m_targetFactorId)->GetString();
for (size_t i = 1; i < target.GetSize(); ++i) {
const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId);
namestr << ",";
namestr << targetFactor->GetString();
}
accumulator->SparsePlusEquals(namestr.str(),1);
}
}
}
}
bool PhrasePairFeature::IsUseable(const FactorMask &mask) const
{
2013-05-30 15:51:40 +04:00
bool ret = mask[m_targetFactorId];
return ret;
}
}