mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-11-10 10:59:21 +03:00
c1be2d47e5
Conflicts: moses/FF/Factory.cpp
223 lines
8.2 KiB
C++
223 lines
8.2 KiB
C++
#include <iostream>
|
|
|
|
#include "moses/ChartHypothesis.h"
|
|
#include "moses/ChartManager.h"
|
|
#include "moses/FactorCollection.h"
|
|
#include "moses/Sentence.h"
|
|
|
|
#include "util/exception.hh"
|
|
|
|
#include "SparseHieroReorderingFeature.h"
|
|
|
|
using namespace std;
|
|
|
|
namespace Moses
|
|
{
|
|
|
|
/**
 * Build the sparse hierarchical reordering feature from its configuration
 * line.
 *
 * Defaults set here: type SourceCombined, source and target factor 0, and no
 * vocabulary files. The keys this class handles itself are listed in
 * SetParameter(): input-factor, output-factor, input-vocab-file,
 * output-vocab-file and type.
 *
 * Original design notes (kept for reference; not all of these are wired up
 * in SetParameter):
 *   factor      - which factor the feature should apply to
 *   type        - what type of sparse reordering feature, e.g. block
 *                 (modelled on Matthias Huck's EAMT 2012 features)
 *   word        - which words to include, e.g. src_bdry, src_all, tgt_bdry
 *   vocab       - vocab file to limit the feature to
 *   orientation - e.g. lr, etc.
 */
SparseHieroReorderingFeature::SparseHieroReorderingFeature(const std::string &line)
  : StatelessFeatureFunction(0, line)
  , m_type(SourceCombined)
  , m_sourceFactor(0)
  , m_targetFactor(0)
  , m_sourceVocabFile("")
  , m_targetVocabFile("")
{
  std::cerr << "Constructing a Sparse Reordering feature" << std::endl;

  // NOTE(review): ReadParameters() is inherited; presumably it feeds the
  // key/value pairs of the configuration line to SetParameter() -- confirm.
  ReadParameters();

  // Placeholder factor returned by GetFactor() for out-of-vocabulary words.
  m_otherFactor = FactorCollection::Instance().AddFactor("##OTHER##");

  // Both calls are no-ops when the corresponding file name is still empty.
  LoadVocabulary(m_sourceVocabFile, m_sourceVocab);
  LoadVocabulary(m_targetVocabFile, m_targetVocab);
}
|
|
|
|
void SparseHieroReorderingFeature::SetParameter(const std::string& key, const std::string& value) {
|
|
if (key == "input-factor") {
|
|
m_sourceFactor = Scan<FactorType>(value);
|
|
} else if (key == "output-factor") {
|
|
m_targetFactor = Scan<FactorType>(value);
|
|
} else if (key == "input-vocab-file") {
|
|
m_sourceVocabFile = value;
|
|
} else if (key == "output-vocab-file") {
|
|
m_targetVocabFile = value;
|
|
} else if (key == "type") {
|
|
if (value == "SourceCombined") {
|
|
m_type = SourceCombined;
|
|
} else if (value == "SourceLeft") {
|
|
m_type = SourceLeft;
|
|
} else if (value == "SourceRight") {
|
|
m_type = SourceRight;
|
|
} else {
|
|
UTIL_THROW(util::Exception, "Unknown sparse reordering type " << value);
|
|
}
|
|
} else {
|
|
FeatureFunction::SetParameter(key, value);
|
|
}
|
|
}
|
|
|
|
void SparseHieroReorderingFeature::LoadVocabulary(const std::string& filename, Vocab& vocab)
|
|
{
|
|
if (filename.empty()) return;
|
|
ifstream in(filename.c_str());
|
|
UTIL_THROW_IF(!in, util::Exception, "Unable to open vocab file: " << filename);
|
|
string line;
|
|
while(getline(in,line)) {
|
|
vocab.insert(FactorCollection::Instance().AddFactor(line));
|
|
}
|
|
in.close();
|
|
}
|
|
|
|
/**
 * Look up the factor of `word` selected by `factorType`, restricted to
 * `vocab`.
 *
 * An empty `vocab` means "no restriction": the word's own factor is returned.
 * Otherwise the word's factor is returned only if it is a member of `vocab`;
 * any other word yields m_otherFactor.
 */
const Factor* SparseHieroReorderingFeature::GetFactor(const Word& word, const Vocab& vocab, FactorType factorType) const
{
  const Factor* const candidate = word.GetFactor(factorType);
  if (vocab.empty()) {
    return candidate;
  }
  return (vocab.find(candidate) != vocab.end()) ? candidate : m_otherFactor;
}
|
|
|
|
void SparseHieroReorderingFeature::EvaluateChart(
|
|
const ChartHypothesis& cur_hypo ,
|
|
ScoreComponentCollection* accumulator) const
|
|
{
|
|
// get index map for underlying hypotheses
|
|
//const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
|
|
// cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap();
|
|
|
|
//The Huck features. For a rule with source side:
|
|
// abXcdXef
|
|
//We first have to split into blocks:
|
|
// ab X cd X ef
|
|
//Then we extract features based in the boundary words of the neighbouring blocks
|
|
//For the block pair, we use the right word of the left block, and the left
|
|
//word of the right block.
|
|
|
|
//Need to get blocks, and their alignment. Each block has a word range (on the
|
|
// on the source), a non-terminal flag, and a set of alignment points in the target phrase
|
|
|
|
//We need to be able to map source word position to target word position, as
|
|
//much as possible (don't need interior of non-terminals). The alignment info
|
|
//objects just give us the mappings between *rule* positions. So if we can
|
|
//map source word position to source rule position, and target rule position
|
|
//to target word position, then we can map right through.
|
|
|
|
size_t sourceStart = cur_hypo.GetCurrSourceRange().GetStartPos();
|
|
size_t sourceSize = cur_hypo.GetCurrSourceRange().GetNumWordsCovered();
|
|
|
|
vector<WordsRange> sourceNTSpans;
|
|
for (size_t prevHypoId = 0; prevHypoId < cur_hypo.GetPrevHypos().size(); ++prevHypoId) {
|
|
sourceNTSpans.push_back(cur_hypo.GetPrevHypo(prevHypoId)->GetCurrSourceRange());
|
|
}
|
|
//put in source order. Is this necessary?
|
|
sort(sourceNTSpans.begin(), sourceNTSpans.end());
|
|
//cerr << "Source NTs: ";
|
|
//for (size_t i = 0; i < sourceNTSpans.size(); ++i) cerr << sourceNTSpans[i] << " ";
|
|
//cerr << endl;
|
|
|
|
typedef pair<WordsRange,bool> Block;//flag indicates NT
|
|
vector<Block> sourceBlocks;
|
|
sourceBlocks.push_back(Block(cur_hypo.GetCurrSourceRange(),false));
|
|
for (vector<WordsRange>::const_iterator i = sourceNTSpans.begin();
|
|
i != sourceNTSpans.end(); ++i) {
|
|
const WordsRange& prevHypoRange = *i;
|
|
Block lastBlock = sourceBlocks.back();
|
|
sourceBlocks.pop_back();
|
|
//split this range into before NT, NT and after NT
|
|
if (prevHypoRange.GetStartPos() > lastBlock.first.GetStartPos()) {
|
|
sourceBlocks.push_back(Block(WordsRange(lastBlock.first.GetStartPos(),prevHypoRange.GetStartPos()-1),false));
|
|
}
|
|
sourceBlocks.push_back(Block(prevHypoRange,true));
|
|
if (prevHypoRange.GetEndPos() < lastBlock.first.GetEndPos()) {
|
|
sourceBlocks.push_back(Block(WordsRange(prevHypoRange.GetEndPos()+1,lastBlock.first.GetEndPos()), false));
|
|
}
|
|
}
|
|
/*
|
|
cerr << "Source Blocks: ";
|
|
for (size_t i = 0; i < sourceBlocks.size(); ++i) cerr << sourceBlocks[i].first << " "
|
|
<< (sourceBlocks[i].second ? "NT" : "T") << " ";
|
|
cerr << endl;
|
|
*/
|
|
|
|
//Mapping from source word to target rule position
|
|
vector<size_t> sourceWordToTargetRulePos(sourceSize);
|
|
map<size_t,size_t> alignMap;
|
|
alignMap.insert(
|
|
cur_hypo.GetCurrTargetPhrase().GetAlignTerm().begin(),
|
|
cur_hypo.GetCurrTargetPhrase().GetAlignTerm().end());
|
|
alignMap.insert(
|
|
cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().begin(),
|
|
cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().end());
|
|
//vector<size_t> alignMapTerm = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm()
|
|
size_t sourceRulePos = 0;
|
|
//cerr << "SW->RP ";
|
|
for (vector<Block>::const_iterator sourceBlockIt = sourceBlocks.begin();
|
|
sourceBlockIt != sourceBlocks.end(); ++sourceBlockIt) {
|
|
for (size_t sourceWordPos = sourceBlockIt->first.GetStartPos();
|
|
sourceWordPos <= sourceBlockIt->first.GetEndPos(); ++sourceWordPos) {
|
|
sourceWordToTargetRulePos[sourceWordPos - sourceStart] = alignMap[sourceRulePos];
|
|
// cerr << sourceWordPos - sourceStart << "-" << alignMap[sourceRulePos] << " ";
|
|
if (! sourceBlockIt->second) {
|
|
//T
|
|
++sourceRulePos;
|
|
}
|
|
}
|
|
if ( sourceBlockIt->second) {
|
|
//NT
|
|
++sourceRulePos;
|
|
}
|
|
}
|
|
//cerr << endl;
|
|
|
|
//Iterate through block pairs
|
|
const Sentence& sentence =
|
|
dynamic_cast<const Sentence&>(cur_hypo.GetManager().GetSource());
|
|
//const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
|
|
for (size_t i = 0; i < sourceBlocks.size()-1; ++i) {
|
|
Block& leftSourceBlock = sourceBlocks[i];
|
|
Block& rightSourceBlock = sourceBlocks[i+1];
|
|
size_t sourceLeftBoundaryPos = leftSourceBlock.first.GetEndPos();
|
|
size_t sourceRightBoundaryPos = rightSourceBlock.first.GetStartPos();
|
|
const Word& sourceLeftBoundaryWord = sentence.GetWord(sourceLeftBoundaryPos);
|
|
const Word& sourceRightBoundaryWord = sentence.GetWord(sourceRightBoundaryPos);
|
|
sourceLeftBoundaryPos -= sourceStart;
|
|
sourceRightBoundaryPos -= sourceStart;
|
|
|
|
// Need to figure out where these map to on the target.
|
|
size_t targetLeftRulePos =
|
|
sourceWordToTargetRulePos[sourceLeftBoundaryPos];
|
|
size_t targetRightRulePos =
|
|
sourceWordToTargetRulePos[sourceRightBoundaryPos];
|
|
|
|
bool isMonotone = true;
|
|
if ((sourceLeftBoundaryPos < sourceRightBoundaryPos &&
|
|
targetLeftRulePos > targetRightRulePos) ||
|
|
((sourceLeftBoundaryPos > sourceRightBoundaryPos &&
|
|
targetLeftRulePos < targetRightRulePos)))
|
|
{
|
|
isMonotone = false;
|
|
}
|
|
stringstream buf;
|
|
buf << "h_"; //sparse reordering, Huck
|
|
if (m_type == SourceLeft || m_type == SourceCombined) {
|
|
buf << GetFactor(sourceLeftBoundaryWord,m_sourceVocab,m_sourceFactor)->GetString();
|
|
buf << "_";
|
|
}
|
|
if (m_type == SourceRight || m_type == SourceCombined) {
|
|
buf << GetFactor(sourceRightBoundaryWord,m_sourceVocab,m_sourceFactor)->GetString();
|
|
buf << "_";
|
|
}
|
|
buf << (isMonotone ? "M" : "S");
|
|
accumulator->PlusEquals(this,buf.str(), 1);
|
|
}
|
|
// cerr << endl;
|
|
}
|
|
|
|
|
|
}
|
|
|