mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 15:48:05 +03:00
OSM-Feature
This commit is contained in:
parent
bd95c2ccfe
commit
cf55ab6678
244
moses/FF/OSM-Feature/OpSequenceModel.cpp
Normal file
244
moses/FF/OSM-Feature/OpSequenceModel.cpp
Normal file
@ -0,0 +1,244 @@
|
||||
#include <fstream>
|
||||
#include "OpSequenceModel.h"
|
||||
#include "osmHyp.h"
|
||||
#include "util/check.hh"
|
||||
#include "moses/Util.h"
|
||||
#include "moses/OSM-Feature/osmHyp.h"
|
||||
|
||||
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
/**
 * Constructs the feature function under the name "OpSequenceModel".
 *
 * The second base-class argument (5) presumably is the number of score
 * components; it matches the five scores Evaluate() reports -- TODO confirm
 * against StatefulFeatureFunction.
 *
 * All data members are given defined values here; the real values are
 * installed later by Load() / readLanguageModel().
 */
OpSequenceModel::OpSequenceModel()
  : StatefulFeatureFunction("OpSequenceModel", 5)
  , ptrOp(NULL)      // no language model until readLanguageModel() runs
  , lmOrder(0)       // overwritten by Load()
  , unkOpProb(0.0f)  // overwritten by readLanguageModel()
{
}
|
||||
|
||||
void OpSequenceModel :: readLanguageModel(const char *lmFile)
|
||||
{
|
||||
|
||||
vector <int> numbers;
|
||||
int nonWordFlag = 0;
|
||||
string unkOp = "_TRANS_SLF_";
|
||||
ptrOp = new Api;
|
||||
ptrOp -> read_lm(lmFile,lmOrder);
|
||||
numbers.push_back(ptrOp->getLMID(const_cast <char *> (unkOp.c_str())));
|
||||
unkOpProb = ptrOp->contextProbN(numbers,nonWordFlag);
|
||||
|
||||
/*
|
||||
setlocale(LC_CTYPE, "");
|
||||
setlocale(LC_COLLATE, "");
|
||||
|
||||
Vocab *vocab = new Vocab;
|
||||
vocab->unkIsWord() = true; // vocabulary contains unknown word tag
|
||||
|
||||
LanguageModel = new Ngram( *vocab,order );
|
||||
assert(LanguageModel != 0);
|
||||
// LanguageModel->debugme(0);
|
||||
|
||||
File file( lmFile, "r" );
|
||||
if (!LanguageModel->read( file )) {
|
||||
cerr << "format error in lm file\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
file.close();
|
||||
*/
|
||||
}
|
||||
|
||||
|
||||
void OpSequenceModel::Load(const std::string &osmFeatureFile, const std::string &operationLM , int orderVal)
|
||||
{
|
||||
// load future cost
|
||||
lmOrder= orderVal;
|
||||
//vector <string> input;
|
||||
ifstream sr (osmFeatureFile.c_str());
|
||||
char* tmp;
|
||||
|
||||
CHECK(sr.is_open());
|
||||
|
||||
vector<FactorType> factorOrder;
|
||||
factorOrder.push_back(0);
|
||||
|
||||
string line;
|
||||
while (std::getline(sr, line))
|
||||
{
|
||||
std::vector<std::string> tokens;
|
||||
tokens = TokenizeMultiCharSeparator(line, "|||");
|
||||
CHECK(tokens.size() == 3);
|
||||
|
||||
Phrase source, target;
|
||||
source.CreateFromString(factorOrder, tokens[0], "|");
|
||||
target.CreateFromString(factorOrder, tokens[1], "|");
|
||||
|
||||
ParallelPhrase pp(source, target);
|
||||
Scores scores = Tokenize<float>(tokens[2], " ");
|
||||
m_futureCost[pp] = scores;
|
||||
// m_coll[pp] = scores;
|
||||
}
|
||||
|
||||
readLanguageModel(operationLM.c_str());
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * Scores the phrase pair applied by cur_hypo with the operation sequence
 * model and returns the resulting feature state.
 *
 * @param cur_hypo    hypothesis whose current phrase pair is scored.
 * @param prev_state  OSM state of the previous hypothesis (may be NULL, in
 *                    which case osmHypothesis keeps its default state).
 * @param accumulator receives the five feature scores via PlusEquals().
 * @return a newly allocated osmState produced by osmHypothesis::saveState().
 */
FFState* OpSequenceModel::Evaluate(
  const Hypothesis& cur_hypo,
  const FFState* prev_state,
  ScoreComponentCollection* accumulator) const
{
  const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase();
  const InputType &source = cur_hypo.GetManager().GetSource();

  const WordsRange &sourceRange = cur_hypo.GetCurrSourceWordsRange();
  const int startIndex = sourceRange.GetStartPos();
  const int endIndex = sourceRange.GetEndPos();

  // Flatten the word alignment into [src0, tgt0, src1, tgt1, ...], the
  // layout osmHypothesis::constructCepts() reads.
  const AlignmentInfo &align = target.GetAlignTerm();
  vector <int> alignments;
  for (AlignmentInfo::const_iterator iter = align.begin(); iter != align.end(); ++iter) {
    alignments.push_back(iter->first);
    alignments.push_back(iter->second);
  }

  // Work on a copy of the coverage in which the current phrase is still
  // uncovered, and collect the source words of the phrase.
  WordsBitmap myBitmap = cur_hypo.GetWordsBitmap();
  vector <string> mySourcePhrase;
  for (int i = startIndex; i <= endIndex; i++) {
    myBitmap.SetValue(i,0); // resetting coverage of this phrase ...
    mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString());
  }

  // Collect the target words; OOV words are replaced by the
  // self-translation marker.
  vector <string> myTargetPhrase;
  for (size_t i = 0; i < target.GetSize(); i++) {
    if (target.GetWord(i).IsOOV())
      myTargetPhrase.push_back("_TRANS_SLF_");
    else
      myTargetPhrase.push_back(target.GetWord(i).GetFactor(0)->GetString());
  }

  osmHypothesis obj;
  obj.setState(prev_state);
  obj.constructCepts(alignments,startIndex,endIndex,target.GetSize());
  obj.setPhrases(mySourcePhrase , myTargetPhrase);
  obj.computeOSMFeature(startIndex,myBitmap,*ptrOp,lmOrder);

  vector<float> scores(5);
  obj.populateScores(scores);
  accumulator->PlusEquals(this, scores);

  return obj.saveState();
}
|
||||
|
||||
/**
 * Chart-based evaluation entry point. This feature has no chart
 * implementation: the call terminates the process through abort().
 */
FFState* OpSequenceModel::EvaluateChart(
  const ChartHypothesis& /* cur_hypo */,
  int /* featureID - used to index the state in the previous hypotheses */,
  ScoreComponentCollection* /* accumulator */) const
{
  abort();
}
|
||||
|
||||
/**
 * Creates the state attached to the empty hypothesis: a default-constructed
 * osmState. A trace line is written to stderr on every call.
 *
 * @return newly allocated state.
 */
const FFState* OpSequenceModel::EmptyHypothesisState(const InputType & /* input */) const
{
  cerr << "OpSequenceModel::EmptyHypothesisState()" << endl;
  const FFState *initialState = new osmState();
  return initialState;
}
|
||||
|
||||
std::string OpSequenceModel::GetScoreProducerWeightShortName(unsigned idx) const
|
||||
{
|
||||
return "osm";
|
||||
}
|
||||
|
||||
std::vector<float> OpSequenceModel::GetFutureScores(const Phrase &source, const Phrase &target) const
|
||||
{
|
||||
ParallelPhrase pp(source, target);
|
||||
std::map<ParallelPhrase, Scores>::const_iterator iter;
|
||||
iter = m_futureCost.find(pp);
|
||||
//iter = m_coll.find(pp);
|
||||
if (iter == m_futureCost.end()) {
|
||||
vector<float> scores(5, 0);
|
||||
scores[0] = unkOpProb;
|
||||
return scores;
|
||||
}
|
||||
else {
|
||||
const vector<float> &scores = iter->second;
|
||||
return scores;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
58
moses/FF/OSM-Feature/OpSequenceModel.h
Normal file
58
moses/FF/OSM-Feature/OpSequenceModel.h
Normal file
@ -0,0 +1,58 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include "moses/FeatureFunction.h"
|
||||
#include "Ngram.h"
|
||||
#include "moses/Manager.h"
|
||||
#include "moses/OSM-Feature/osmHyp.h"
|
||||
#include "moses/OSM-Feature/SRILM-API.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class OpSequenceModel : public StatefulFeatureFunction
|
||||
{
|
||||
public:
|
||||
|
||||
//LM *LanguageModel;
|
||||
Api * ptrOp;
|
||||
int lmOrder;
|
||||
float unkOpProb;
|
||||
|
||||
OpSequenceModel();
|
||||
|
||||
void readLanguageModel(const char *);
|
||||
void Load(const std::string &osmFeatureFile, const std::string &operationLM , int orderVal);
|
||||
|
||||
FFState* Evaluate(
|
||||
const Hypothesis& cur_hypo,
|
||||
const FFState* prev_state,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
|
||||
virtual FFState* EvaluateChart(
|
||||
const ChartHypothesis& /* cur_hypo */,
|
||||
int /* featureID - used to index the state in the previous hypotheses */,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
|
||||
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
|
||||
|
||||
virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const;
|
||||
|
||||
std::vector<float> GetFutureScores(const Phrase &source, const Phrase &target) const;
|
||||
|
||||
protected:
|
||||
typedef std::pair<Phrase, Phrase> ParallelPhrase;
|
||||
typedef std::vector<float> Scores;
|
||||
std::map<ParallelPhrase, Scores> m_futureCost;
|
||||
|
||||
std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
|
||||
std::set <int> targetNullWords;
|
||||
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
} // namespace
|
175
moses/FF/OSM-Feature/SRILM-API.cpp
Normal file
175
moses/FF/OSM-Feature/SRILM-API.cpp
Normal file
@ -0,0 +1,175 @@
|
||||
#include "SRILM-API.h"
|
||||
#include "Ngram.h"
|
||||
|
||||
|
||||
// Starts without a language model; read_lm() installs one.
Api :: Api()
  : LanguageModel(NULL)
{
}
|
||||
|
||||
// Releases the language model, if one was loaded.
// NOTE(review): the Vocab allocated in read_lm() is not freed here.
Api :: ~Api()
{
  delete LanguageModel;   // deleting NULL is a no-op
}
|
||||
|
||||
int Api :: getLMID(char* toBeChecked)
|
||||
{
|
||||
|
||||
VocabString words[11];
|
||||
unsigned len = LanguageModel->vocab.parseWords(toBeChecked, words, 10);
|
||||
|
||||
if (len < 1) {
|
||||
cerr << "Error: in input file!\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
VocabString last = words[len-1];
|
||||
VocabIndex index = LanguageModel->vocab.getIndex(last,LanguageModel->vocab.unkIndex());
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
/**
 * Returns LanguageModel->wordProb() for the last id in numbers, given the
 * preceding ids as context.
 *
 * @param numbers     vocabulary ids, oldest first; the last entry is the
 *                    predicted word. Must not be empty.
 * @param nonWordFlag unused by this function.
 *
 * The context buffer is sized from the input, so any number of ids is
 * accepted (a fixed 11-entry array would overflow on longer inputs).
 */
double Api :: contextProbN (vector <int> numbers, int & nonWordFlag)
{
  const VocabIndex last = numbers[numbers.size()-1];

  // wordProb() expects the context most-recent-first, terminated by
  // Vocab_None.
  vector <VocabIndex> context;
  context.reserve(numbers.size());

  for (int i = numbers.size()-2; i >= 0; i--) {
    context.push_back(numbers[i]);
  }

  context.push_back(Vocab_None);

  return LanguageModel->wordProb(last, &context[0]);
}
|
||||
|
||||
/**
 * Returns the context length reported by LanguageModel->contextID() for
 * the last id in numbers, given the preceding ids as context.
 *
 * @param numbers vocabulary ids, oldest first; the last entry is the
 *                predicted word. Must not be empty.
 *
 * The context buffer is sized from the input, so any number of ids is
 * accepted (a fixed 11-entry array would overflow on longer inputs).
 */
unsigned Api :: backOffLength (vector <int> numbers)
{
  const VocabIndex last = numbers[numbers.size()-1];
  unsigned length = 0;

  // contextID() expects the context most-recent-first, terminated by
  // Vocab_None.
  vector <VocabIndex> context;
  context.reserve(numbers.size());

  for (int i = numbers.size()-2; i >= 0; i--) {
    context.push_back(numbers[i]);
  }

  context.push_back(Vocab_None);

  LanguageModel->contextID(last, &context[0], length);
  return length;
}
|
||||
|
||||
double Api :: contextProb (char * toBeChecked, int & nonWordFlag)
|
||||
{
|
||||
|
||||
|
||||
//read_lm(languageModel,order);
|
||||
VocabString words[11];
|
||||
|
||||
unsigned len = LanguageModel->vocab.parseWords(toBeChecked, words, 10);
|
||||
|
||||
if (len < 1) {
|
||||
cerr << "Error: in input file!\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
VocabString last = words[len-1];
|
||||
|
||||
words[len-1] = 0;
|
||||
// reverse N-gram prefix to obtain context
|
||||
|
||||
VocabIndex index = LanguageModel->vocab.getIndex(last);
|
||||
|
||||
|
||||
if(index == Vocab_None)
|
||||
{
|
||||
nonWordFlag=1;
|
||||
|
||||
}
|
||||
|
||||
LanguageModel->vocab.reverse( words );
|
||||
|
||||
// double cost= pow(10,lm_logprobContext(last, words ));
|
||||
double cost= lm_logprobContext(last, words);
|
||||
|
||||
return cost;
|
||||
|
||||
}
|
||||
|
||||
double Api :: sentProb (char * toBeChecked)
|
||||
{
|
||||
|
||||
|
||||
//read_lm(languageModel,order);
|
||||
VocabString sentence[15];
|
||||
unsigned len = LanguageModel->vocab.parseWords(toBeChecked, sentence, 15);
|
||||
|
||||
|
||||
if (len < 1)
|
||||
{
|
||||
cerr << "Error: in input file!\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//printf("%lf\n", exp(lm_logprobSent(sentence)));
|
||||
//cout<<lm_logprobSent(sentence)<<endl;
|
||||
return pow(10,lm_logprobSent(sentence));
|
||||
}
|
||||
|
||||
void Api :: read_lm(const char *lmFile,int order)
|
||||
{
|
||||
|
||||
|
||||
setlocale(LC_CTYPE, "");
|
||||
setlocale(LC_COLLATE, "");
|
||||
|
||||
Vocab *vocab = new Vocab;
|
||||
vocab->unkIsWord() = true; /* vocabulary contains unknown word tag */
|
||||
|
||||
LanguageModel = new Ngram( *vocab,order );
|
||||
assert(LanguageModel != 0);
|
||||
// LanguageModel->debugme(0);
|
||||
|
||||
File file( lmFile, "r" );
|
||||
if (!LanguageModel->read( file )) {
|
||||
cerr << "format error in lm file\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
file.close();
|
||||
|
||||
|
||||
}
|
||||
|
||||
float Api :: lm_logprobSent( const VocabString *sentence )
|
||||
|
||||
{
|
||||
TextStats obj;
|
||||
return LanguageModel->sentenceProb(sentence, obj);
|
||||
}
|
||||
|
||||
|
||||
float Api :: lm_logprobContext( const VocabString word, const VocabString *context )
|
||||
{
|
||||
return LanguageModel->wordProb( word, context );
|
||||
}
|
||||
|
||||
|
31
moses/FF/OSM-Feature/SRILM-API.h
Normal file
31
moses/FF/OSM-Feature/SRILM-API.h
Normal file
@ -0,0 +1,31 @@
|
||||
#pragma once
|
||||
|
||||
#include "Ngram.h"
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
class Api
|
||||
{
|
||||
|
||||
public:
|
||||
|
||||
Api();
|
||||
~Api();
|
||||
void read_lm(const char *,int);
|
||||
float lm_logprobContext( const VocabString word, const VocabString *context );
|
||||
float lm_logprobSent( const VocabString *sentence );
|
||||
double contextProb(char *, int & );
|
||||
double contextProbN (std::vector <int> , int &);
|
||||
unsigned backOffLength (std::vector <int>);
|
||||
|
||||
double sentProb(char *) ;
|
||||
int getLMID(char *);
|
||||
|
||||
private :
|
||||
|
||||
LM *LanguageModel;
|
||||
|
||||
};
|
||||
|
||||
|
690
moses/FF/OSM-Feature/osmHyp.cpp
Normal file
690
moses/FF/OSM-Feature/osmHyp.cpp
Normal file
@ -0,0 +1,690 @@
|
||||
#include "osmHyp.h"
|
||||
#include <sstream>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
// Initial state: both positions at 0 and a history holding only the
// sentence-start marker.
osmState::osmState()
  : j(0)
  , E(0)
  , history(1, std::string("<s>"))
{
}
|
||||
|
||||
void osmState::saveState(int jVal, int eVal, vector <string> & histVal , map <int , string> & gapVal)
|
||||
{
|
||||
history.clear();
|
||||
gap.clear();
|
||||
gap = gapVal;
|
||||
history = histVal;
|
||||
j = jVal;
|
||||
E = eVal;
|
||||
}
|
||||
|
||||
int osmState::Compare(const FFState& otherBase) const
|
||||
{
|
||||
const osmState &other = static_cast<const osmState&>(otherBase);
|
||||
if (j != other.j)
|
||||
return (j < other.j) ? -1 : +1;
|
||||
if (E != other.E)
|
||||
return (E < other.E) ? -1 : +1;
|
||||
if (gap != other.gap)
|
||||
return (gap < other.gap) ? -1 : +1;
|
||||
if (history != other.history)
|
||||
return (history < other.history) ? -1 : +1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void osmState :: print() const
|
||||
{
|
||||
|
||||
for (int i = 0; i< delHistory.size(); i++)
|
||||
{
|
||||
cerr<<delHistory[i]<<" ";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
std::string osmState :: getName() const
|
||||
{
|
||||
|
||||
print();
|
||||
return "done";
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
// Starts with zeroed positions, counters and probability; the history and
// gap containers start empty.
osmHypothesis :: osmHypothesis()
  : j(0)
  , E(0)
  , gapCount(0)
  , deletionCount(0)
  , openGapCount(0)
  , gapWidth(0)
  , opProb(0)
{
}
|
||||
|
||||
void osmHypothesis :: setState(const FFState* prev_state)
|
||||
{
|
||||
|
||||
if(prev_state != NULL)
|
||||
{
|
||||
|
||||
|
||||
j = static_cast <const osmState *> (prev_state)->getJ();
|
||||
E = static_cast <const osmState *> (prev_state)->getE();
|
||||
history = static_cast <const osmState *> (prev_state)->getHistory();
|
||||
gap = static_cast <const osmState *> (prev_state)->getGap();
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Packs the current positions, history and gap map into a newly allocated
 * osmState, together with the operations generated for this hypothesis.
 *
 * @return the new state.
 */
osmState * osmHypothesis :: saveState()
{
  osmState * snapshot = new osmState;
  snapshot->saveState(j, E, history, gap);
  snapshot->saveDelHistory(operations);
  return snapshot;
}
|
||||
|
||||
int osmHypothesis :: isTranslationOperation(int x)
|
||||
{
|
||||
if (operations[x].find("_JMP_BCK_") != -1)
|
||||
return 0;
|
||||
|
||||
if (operations[x].find("_JMP_FWD_") != -1)
|
||||
return 0;
|
||||
|
||||
if (operations[x].find("_CONT_CEPT_") != -1)
|
||||
return 0;
|
||||
|
||||
if (operations[x].find("_INS_GAP_") != -1)
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
void osmHypothesis :: removeReorderingOperations()
|
||||
{
|
||||
gapCount = 0;
|
||||
deletionCount = 0;
|
||||
openGapCount = 0;
|
||||
gapWidth = 0;
|
||||
//cout<<"I came here"<<endl;
|
||||
|
||||
std::vector <std::string> tupleSequence;
|
||||
|
||||
for (int x = 0; x < operations.size(); x++)
|
||||
{
|
||||
// cout<<operations[x]<<endl;
|
||||
|
||||
if(isTranslationOperation(x) == 1)
|
||||
{
|
||||
tupleSequence.push_back(operations[x]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
operations.clear();
|
||||
operations = tupleSequence;
|
||||
}
|
||||
|
||||
/**
 * Scores the generated operations with the operation language model.
 *
 * Each operation is scored given the operations before it (starting from
 * the stored history), using a window of at most `order` entries; the
 * scores are summed into opProb. Afterwards `history` is updated so that it
 * holds the most recent operations for the next hypothesis.
 *
 * @param ptrOp language model wrapper.
 * @param order LM order.
 */
void osmHypothesis :: calculateOSMProb(Api & ptrOp , int order)
{
  opProb = 0;
  int nonWordFlag = 0;

  // Vocabulary ids of the new operations ...
  vector <int> opIds;
  for (size_t i = 0; i < operations.size(); i++) {
    opIds.push_back(ptrOp.getLMID(const_cast <char *> (operations[i].c_str())));
  }

  // ... and of the history carried over from the previous state.
  vector <int> window;
  for (size_t i = 0; i < history.size(); i++) {
    window.push_back(ptrOp.getLMID(const_cast <char *> (history[i].c_str())));
  }

  for (size_t i = 0; i < operations.size(); i++) {
    window.push_back(opIds[i]);
    history.push_back(operations[i]);

    // Slide the window: drop the oldest entry once it exceeds the order.
    if (window.size() > static_cast<size_t>(order)) {
      window.erase(window.begin());
      history.erase(history.begin());
    }

    const double stepScore = ptrOp.contextProbN(window, nonWordFlag);
    opProb = opProb + stepScore;
  }

  // Drop one more entry if the history is longer than order-1.
  if (history.size() > static_cast<size_t>(order - 1)) {
    history.erase(history.begin());
  }
}
|
||||
|
||||
|
||||
// Returns the index of the first entry equal to 0 in coverageVector,
// or -1 when there is none.
int osmHypothesis :: firstOpenGap(vector <int> & coverageVector)
{
  const int total = coverageVector.size();

  for (int position = 0; position < total; position++) {
    if (coverageVector[position] == 0) {
      return position;
    }
  }

  return -1;
}
|
||||
|
||||
// Formats an integer as decimal text using a string stream.
string osmHypothesis :: intToString(int num)
{
  std::ostringstream formatter;
  formatter << num;
  const string text = formatter.str();
  return text;
}
|
||||
|
||||
void osmHypothesis :: generateOperations(int & startIndex , int j1 , int contFlag , WordsBitmap & coverageVector , string english , string german , set <int> & targetNullWords , vector <string> & currF)
|
||||
{
|
||||
|
||||
int gFlag = 0;
|
||||
int gp = 0;
|
||||
int ans;
|
||||
|
||||
|
||||
if ( j < j1) // j1 is the index of the source word we are about to generate ...
|
||||
{
|
||||
//if(coverageVector[j]==0) // if source word at j is not generated yet ...
|
||||
if(coverageVector.GetValue(j)==0) // if source word at j is not generated yet ...
|
||||
{
|
||||
operations.push_back("_INS_GAP_");
|
||||
gFlag++;
|
||||
gap[j]="Unfilled";
|
||||
}
|
||||
if (j == E)
|
||||
{
|
||||
j = j1;
|
||||
}
|
||||
else
|
||||
{
|
||||
operations.push_back("_JMP_FWD_");
|
||||
j=E;
|
||||
}
|
||||
}
|
||||
|
||||
if (j1 < j)
|
||||
{
|
||||
// if(j < E && coverageVector[j]==0)
|
||||
if(j < E && coverageVector.GetValue(j)==0)
|
||||
{
|
||||
operations.push_back("_INS_GAP_");
|
||||
gFlag++;
|
||||
gap[j]="Unfilled";
|
||||
}
|
||||
|
||||
j=closestGap(gap,j1,gp);
|
||||
operations.push_back("_JMP_BCK_"+ intToString(gp));
|
||||
|
||||
//cout<<"I am j "<<j<<endl;
|
||||
//cout<<"I am j1 "<<j1<<endl;
|
||||
|
||||
if(j==j1)
|
||||
gap[j]="Filled";
|
||||
}
|
||||
|
||||
if (j < j1)
|
||||
{
|
||||
operations.push_back("_INS_GAP_");
|
||||
gap[j] = "Unfilled";
|
||||
gFlag++;
|
||||
j=j1;
|
||||
}
|
||||
|
||||
if(contFlag == 0) // First words of the multi-word cept ...
|
||||
{
|
||||
|
||||
if(english == "_TRANS_SLF_") // Unknown word ...
|
||||
{
|
||||
operations.push_back("_TRANS_SLF_");
|
||||
}
|
||||
else
|
||||
{
|
||||
operations.push_back("_TRANS_" + english + "_TO_" + german);
|
||||
}
|
||||
|
||||
//ans = firstOpenGap(coverageVector);
|
||||
ans = coverageVector.GetFirstGapPos();
|
||||
|
||||
if (ans != -1)
|
||||
gapWidth += j - ans;
|
||||
|
||||
}
|
||||
else if (contFlag == 2)
|
||||
{
|
||||
|
||||
operations.push_back("_INS_" + german);
|
||||
ans = coverageVector.GetFirstGapPos();
|
||||
|
||||
if (ans != -1)
|
||||
gapWidth += j - ans;
|
||||
deletionCount++;
|
||||
}
|
||||
else
|
||||
{
|
||||
operations.push_back("_CONT_CEPT_");
|
||||
}
|
||||
|
||||
//coverageVector[j]=1;
|
||||
coverageVector.SetValue(j,1);
|
||||
j+=1;
|
||||
|
||||
if(E<j)
|
||||
E=j;
|
||||
|
||||
if (gFlag > 0)
|
||||
gapCount++;
|
||||
|
||||
openGapCount += getOpenGaps();
|
||||
|
||||
//if (coverageVector[j] == 0 && targetNullWords.find(j) != targetNullWords.end())
|
||||
if (coverageVector.GetValue(j) == 0 && targetNullWords.find(j) != targetNullWords.end())
|
||||
{
|
||||
j1 = j;
|
||||
german = currF[j1-startIndex];
|
||||
english = "_INS_";
|
||||
generateOperations(startIndex, j1, 2 , coverageVector , english , german , targetNullWords , currF);
|
||||
}
|
||||
|
||||
//print();
|
||||
}
|
||||
|
||||
void osmHypothesis :: print()
|
||||
{
|
||||
for (int i = 0; i< operations.size(); i++)
|
||||
{
|
||||
cerr<<operations[i]<<" ";
|
||||
|
||||
}
|
||||
|
||||
cerr<<endl<<endl;
|
||||
|
||||
cerr<<"Operation Probability "<<opProb<<endl;
|
||||
cerr<<"Gap Count "<<gapCount<<endl;
|
||||
cerr<<"Open Gap Count "<<openGapCount<<endl;
|
||||
cerr<<"Gap Width "<<gapWidth<<endl;
|
||||
cerr<<"Deletion Count "<<deletionCount<<endl;
|
||||
|
||||
cerr<<"_______________"<<endl;
|
||||
}
|
||||
|
||||
/**
 * Finds the gap to jump back to when source position j1 is to be generated.
 *
 * Gaps are visited from the highest position downwards and only entries
 * marked "Unfilled" are counted. If there is an unfilled gap exactly at j1,
 * it is chosen; otherwise the unfilled gap closest to, and to the left of,
 * j1 is chosen (gaps at a distance of 1172 or more are never selected).
 *
 * @param gap gap map (position -> "Unfilled"/"Filled").
 * @param j1  source position about to be generated.
 * @param gp  out: 1-based count, from the right, of the chosen gap among
 *            the unfilled ones; 0 when no gap is chosen.
 * @return position of the chosen gap, or -1 when none qualifies (including
 *         an empty map).
 */
int osmHypothesis :: closestGap(map <int,string> gap, int j1, int & gp)
{
  int bestDistance = 1172;
  int bestPosition = -1;
  int unfilledSeen = 0;
  gp = 0;

  // Reverse iteration is well-defined for an empty map as well.
  for (map <int,string> :: const_reverse_iterator entry = gap.rbegin(); entry != gap.rend(); ++entry) {
    if (entry->second != "Unfilled") {
      continue;
    }

    unfilledSeen++;

    if (entry->first == j1) {
      gp = unfilledSeen;
      return j1;
    }

    // Only gaps left of j1 qualify, so the distance is j1 - position.
    if (entry->first < j1 && j1 - entry->first < bestDistance) {
      bestDistance = j1 - entry->first;
      bestPosition = entry->first;
      gp = unfilledSeen;
    }
  }

  return bestPosition;
}
|
||||
|
||||
|
||||
|
||||
int osmHypothesis :: getOpenGaps()
|
||||
{
|
||||
map <int,string> :: iterator iter;
|
||||
|
||||
int nd = 0;
|
||||
for (iter = gap.begin(); iter!=gap.end(); iter++)
|
||||
{
|
||||
if(iter->second == "Unfilled")
|
||||
nd++;
|
||||
}
|
||||
|
||||
return nd;
|
||||
|
||||
}
|
||||
|
||||
void osmHypothesis :: generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes)
|
||||
{
|
||||
|
||||
operations.push_back("_DEL_" + english);
|
||||
currTargetIndex++;
|
||||
|
||||
while(doneTargetIndexes.find(currTargetIndex) != doneTargetIndexes.end())
|
||||
{
|
||||
currTargetIndex++;
|
||||
}
|
||||
|
||||
if (sourceNullWords.find(currTargetIndex) != sourceNullWords.end())
|
||||
{
|
||||
english = currE[currTargetIndex];
|
||||
generateDeleteOperations(english,currTargetIndex,doneTargetIndexes);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void osmHypothesis :: computeOSMFeature(int startIndex , WordsBitmap & coverageVector , Api & ptrOp, int order)
|
||||
{
|
||||
|
||||
set <int> doneTargetIndexes;
|
||||
set <int> eSide;
|
||||
set <int> fSide;
|
||||
set <int> :: iterator iter;
|
||||
string english;
|
||||
string source;
|
||||
int j1;
|
||||
int start = 0;
|
||||
int targetIndex = 0;
|
||||
doneTargetIndexes.clear();
|
||||
|
||||
|
||||
if (targetNullWords.size() != 0) // Source words to be deleted in the start of this phrase ...
|
||||
{
|
||||
iter = targetNullWords.begin();
|
||||
|
||||
if (*iter == startIndex)
|
||||
{
|
||||
|
||||
j1 = startIndex;
|
||||
source = currF[j1-startIndex];
|
||||
english = "_INS_";
|
||||
generateOperations(startIndex, j1, 2 , coverageVector , english , source , targetNullWords , currF);
|
||||
}
|
||||
}
|
||||
|
||||
if (sourceNullWords.find(targetIndex) != sourceNullWords.end()) // first word has to be deleted ...
|
||||
{
|
||||
english = currE[targetIndex];
|
||||
generateDeleteOperations(english,targetIndex, doneTargetIndexes);
|
||||
}
|
||||
|
||||
|
||||
for (int i = 0; i < ceptsInPhrase.size(); i++)
|
||||
{
|
||||
source = "";
|
||||
english = "";
|
||||
|
||||
fSide = ceptsInPhrase[i].first;
|
||||
eSide = ceptsInPhrase[i].second;
|
||||
|
||||
iter = eSide.begin();
|
||||
targetIndex = *iter;
|
||||
english += currE[*iter];
|
||||
iter++;
|
||||
|
||||
for (; iter != eSide.end(); iter++)
|
||||
{
|
||||
if(*iter == targetIndex+1)
|
||||
targetIndex++;
|
||||
else
|
||||
doneTargetIndexes.insert(*iter);
|
||||
|
||||
english += "^_^";
|
||||
english += currE[*iter];
|
||||
}
|
||||
|
||||
iter = fSide.begin();
|
||||
source += currF[*iter];
|
||||
iter++;
|
||||
|
||||
for (; iter != fSide.end(); iter++)
|
||||
{
|
||||
source += "^_^";
|
||||
source += currF[*iter];
|
||||
}
|
||||
|
||||
iter = fSide.begin();
|
||||
j1 = *iter + startIndex;
|
||||
iter++;
|
||||
|
||||
generateOperations(startIndex, j1, 0 , coverageVector , english , source , targetNullWords , currF);
|
||||
|
||||
|
||||
for (; iter != fSide.end(); iter++)
|
||||
{
|
||||
j1 = *iter + startIndex;
|
||||
generateOperations(startIndex, j1, 1 , coverageVector , english , source , targetNullWords , currF);
|
||||
}
|
||||
|
||||
targetIndex++; // Check whether the next target word is unaligned ...
|
||||
|
||||
while(doneTargetIndexes.find(targetIndex) != doneTargetIndexes.end())
|
||||
{
|
||||
targetIndex++;
|
||||
}
|
||||
|
||||
if(sourceNullWords.find(targetIndex) != sourceNullWords.end())
|
||||
{
|
||||
english = currE[targetIndex];
|
||||
generateDeleteOperations(english,targetIndex, doneTargetIndexes);
|
||||
}
|
||||
}
|
||||
|
||||
//removeReorderingOperations();
|
||||
calculateOSMProb(ptrOp, order);
|
||||
//print();
|
||||
|
||||
}
|
||||
|
||||
/**
 * Grows eSide/fSide to the full set of target/source indexes that are
 * connected through the alignment maps.
 *
 * Each round adds to fSide every source index linked (via tS) to a member
 * of eSide, then adds to eSide every target index linked (via sT) to a
 * member of fSide. Rounds repeat until eSide stops growing.
 *
 * @param eSide in/out: target indexes of the cept.
 * @param fSide in/out: source indexes of the cept.
 * @param tS    target index -> aligned source indexes.
 * @param sT    source index -> aligned target indexes.
 */
void osmHypothesis :: getMeCepts ( set <int> & eSide , set <int> & fSide , map <int , vector <int> > & tS , map <int , vector <int> > & sT)
{
  set <int> :: iterator member;
  int sizeBefore;

  do {
    sizeBefore = eSide.size();

    for (member = eSide.begin(); member != eSide.end(); member++) {
      const vector <int> & linkedSources = tS[*member];
      for (size_t k = 0; k < linkedSources.size(); k++) {
        fSide.insert(linkedSources[k]);
      }
    }

    for (member = fSide.begin(); member != fSide.end(); member++) {
      const vector <int> & linkedTargets = sT[*member];
      for (size_t k = 0; k < linkedTargets.size(); k++) {
        eSide.insert(linkedTargets[k]);
      }
    }
  } while (eSide.size() > sizeBefore);
}
|
||||
|
||||
/**
 * Derives cepts and unaligned words from the word alignment of the current
 * phrase pair.
 *
 * Fills targetNullWords with the (absolute) source positions that have no
 * alignment, sourceNullWords with the target indexes that have none, and
 * appends one (source set, target set) pair per cept to ceptsInPhrase.
 *
 * @param align              flat list [src0, tgt0, src1, tgt1, ...] of
 *                           phrase-internal alignment points.
 * @param startIndex         source position of the first phrase word.
 * @param endIndex           source position of the last phrase word.
 * @param targetPhraseLength number of target words.
 */
void osmHypothesis :: constructCepts(vector <int> & align , int startIndex , int endIndex, int targetPhraseLength)
{
  std::map <int , vector <int> > sT;   // source index -> target indexes
  std::map <int , vector <int> > tS;   // target index -> source indexes

  for (int i = 0; i < align.size(); i+=2) {
    const int src = align[i];
    const int tgt = align[i+1];
    tS[tgt].push_back(src);
    sT[src].push_back(tgt);
  }

  // What are unaligned source words in this phrase ...
  for (int i = startIndex; i <= endIndex; i++) {
    if (sT.count(i-startIndex) == 0) {
      targetNullWords.insert(i);
    }
  }

  // What are unaligned target words in this phrase ...
  for (int i = 0; i < targetPhraseLength; i++) {
    if (tS.count(i) == 0) {
      sourceNullWords.insert(i);
    }
  }

  // Peel off one cept at a time, seeded with the smallest remaining target
  // index, and remove its members from both maps.
  std::set <int> eSide;
  std::set <int> fSide;
  std::set <int> :: const_iterator member;

  while (!tS.empty() && !sT.empty()) {
    eSide.clear();
    fSide.clear();
    eSide.insert(tS.begin()->first);

    getMeCepts(eSide, fSide, tS , sT);

    for (member = eSide.begin(); member != eSide.end(); ++member) {
      tS.erase(*member);
    }

    for (member = fSide.begin(); member != fSide.end(); ++member) {
      sT.erase(*member);
    }

    ceptsInPhrase.push_back(make_pair (fSide , eSide));
  }
}
|
||||
|
||||
/**
 * Replaces the contents of `scores` with the five feature values, in this
 * order: operation probability, gap width, gap count, open gap count,
 * deletion count.
 */
void osmHypothesis :: populateScores(vector <float> & scores)
{
  const float values[5] = {
    static_cast<float>(opProb),
    static_cast<float>(gapWidth),
    static_cast<float>(gapCount),
    static_cast<float>(openGapCount),
    static_cast<float>(deletionCount)
  };

  scores.assign(values, values + 5);
}
|
||||
|
||||
|
||||
} // namespace
|
||||
|
89
moses/FF/OSM-Feature/osmHyp.h
Normal file
89
moses/FF/OSM-Feature/osmHyp.h
Normal file
@ -0,0 +1,89 @@
|
||||
#pragma once
|
||||
|
||||
# include "SRILM-API.h"
|
||||
# include "moses/FFState.h"
|
||||
# include "moses/Manager.h"
|
||||
# include <set>
|
||||
# include <map>
|
||||
# include <string>
|
||||
# include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class osmState : public FFState
|
||||
{
|
||||
public:
|
||||
osmState();
|
||||
int Compare(const FFState& other) const;
|
||||
void saveState(int jVal, int eVal, vector <string> & hist , map <int , string> & gapVal);
|
||||
int getJ()const {return j;}
|
||||
int getE()const {return E;}
|
||||
map <int , string> getGap() const { return gap;}
|
||||
vector <string> getHistory()const {return history;}
|
||||
void print() const;
|
||||
std::string getName() const;
|
||||
void saveDelHistory(vector <string> & histVal){delHistory = histVal;}
|
||||
|
||||
protected:
|
||||
int j, E;
|
||||
std::map <int,std::string> gap;
|
||||
std::vector <std::string> history;
|
||||
std::vector <std::string> delHistory;
|
||||
};
|
||||
|
||||
class osmHypothesis
|
||||
{
|
||||
|
||||
private:
|
||||
|
||||
std::vector <std::string> history;
|
||||
std::vector <std::string> operations; // List of operations required to generated this hyp ...
|
||||
std::map <int,std::string> gap; // Maintains gap history ...
|
||||
int j; // Position after the last source word generated ...
|
||||
int E; // Position after the right most source word so far generated ...
|
||||
|
||||
int gapCount; // Number of gaps inserted ...
|
||||
int deletionCount;
|
||||
int openGapCount;
|
||||
int gapWidth;
|
||||
double opProb;
|
||||
|
||||
vector <string> currE;
|
||||
vector <string> currF;
|
||||
vector < pair < set <int> , set <int> > > ceptsInPhrase;
|
||||
set <int> targetNullWords;
|
||||
set <int> sourceNullWords;
|
||||
|
||||
int closestGap(std::map <int,std::string> gap,int j1, int & gp);
|
||||
int firstOpenGap(std::vector <int> & coverageVector);
|
||||
std::string intToString(int);
|
||||
int getOpenGaps();
|
||||
int isTranslationOperation(int j);
|
||||
void removeReorderingOperations();
|
||||
|
||||
void getMeCepts ( set <int> & eSide , set <int> & fSide , map <int , vector <int> > & tS , map <int , vector <int> > & sT);
|
||||
|
||||
public:
|
||||
|
||||
osmHypothesis();
|
||||
~osmHypothesis(){};
|
||||
void generateOperations(int & startIndex, int j1 , int contFlag , WordsBitmap & coverageVector , std::string english , std::string german , std::set <int> & targetNullWords , std::vector <std::string> & currF);
|
||||
void generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes);
|
||||
void calculateOSMProb(Api & opPtr , int order);
|
||||
void computeOSMFeature(int startIndex , WordsBitmap & coverageVector , Api & ptrOp, int order);
|
||||
void constructCepts(vector <int> & align , int startIndex , int endIndex, int targetPhraseLength);
|
||||
void setPhrases(vector <string> & val1 , vector <string> & val2){currF = val1; currE = val2;}
|
||||
void setState(const FFState* prev_state);
|
||||
osmState * saveState();
|
||||
void print();
|
||||
void populateScores(vector <float> & scores);
|
||||
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user