mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-28 14:32:38 +03:00
Merge github.com:moses-smt/mosesdecoder into hieu_opt_input2
This commit is contained in:
commit
a602e2052f
@ -31,11 +31,11 @@ const char REFLEN_CLOSEST[] = "closest";
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
|
||||
|
||||
BleuDocScorer::BleuDocScorer(const string& config)
|
||||
: BleuScorer("BLEUDOC", config),
|
||||
m_ref_length_type(CLOSEST)
|
||||
: BleuScorer("BLEUDOC", config),
|
||||
m_ref_length_type(CLOSEST)
|
||||
{
|
||||
const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
|
||||
if (reflen == REFLEN_AVERAGE) {
|
||||
@ -63,41 +63,40 @@ bool BleuDocScorer::OpenReferenceStream(istream* is, size_t file_id)
|
||||
|
||||
if (line.find("<doc docid") != std::string::npos) { // new document
|
||||
doc_id++;
|
||||
m_references.push_back(new ScopedVector<Reference>());
|
||||
m_references.push_back(new ScopedVector<Reference>());
|
||||
sid = 0;
|
||||
}
|
||||
else if (line.find("<seg") != std::string::npos) { //new sentence
|
||||
} else if (line.find("<seg") != std::string::npos) { //new sentence
|
||||
int start = line.find_first_of('>') + 1;
|
||||
std::string trans = line.substr(start, line.find_last_of('<')-start);
|
||||
trans = preprocessSentence(trans);
|
||||
|
||||
if (file_id == 0) {
|
||||
Reference* ref = new Reference;
|
||||
m_references[doc_id]->push_back(ref); // Take ownership of the Reference object.
|
||||
Reference* ref = new Reference;
|
||||
m_references[doc_id]->push_back(ref); // Take ownership of the Reference object.
|
||||
}
|
||||
|
||||
if (m_references[doc_id]->size() <= sid) {
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
NgramCounts counts;
|
||||
size_t length = CountNgrams(trans, counts, kBleuNgramOrder);
|
||||
|
||||
|
||||
//for any counts larger than those already there, merge them in
|
||||
for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
|
||||
const NgramCounts::Key& ngram = ci->first;
|
||||
const NgramCounts::Value newcount = ci->second;
|
||||
|
||||
NgramCounts::Value oldcount = 0;
|
||||
m_references[doc_id]->get().at(sid)->get_counts()->Lookup(ngram, &oldcount);
|
||||
if (newcount > oldcount) {
|
||||
m_references[doc_id]->get().at(sid)->get_counts()->operator[](ngram) = newcount;
|
||||
}
|
||||
const NgramCounts::Key& ngram = ci->first;
|
||||
const NgramCounts::Value newcount = ci->second;
|
||||
|
||||
NgramCounts::Value oldcount = 0;
|
||||
m_references[doc_id]->get().at(sid)->get_counts()->Lookup(ngram, &oldcount);
|
||||
if (newcount > oldcount) {
|
||||
m_references[doc_id]->get().at(sid)->get_counts()->operator[](ngram) = newcount;
|
||||
}
|
||||
}
|
||||
//add in the length
|
||||
|
||||
m_references[doc_id]->get().at(sid)->push_back(length);
|
||||
m_references[doc_id]->get().at(sid)->push_back(length);
|
||||
if (sid > 0 && sid % 100 == 0) {
|
||||
TRACE_ERR(".");
|
||||
TRACE_ERR(".");
|
||||
}
|
||||
++sid;
|
||||
}
|
||||
@ -127,14 +126,14 @@ void BleuDocScorer::prepareStats(size_t sid, const string& text, ScoreStats& ent
|
||||
|
||||
//precision on each ngram type
|
||||
for (NgramCounts::const_iterator testcounts_it = testcounts.begin();
|
||||
testcounts_it != testcounts.end(); ++testcounts_it) {
|
||||
testcounts_it != testcounts.end(); ++testcounts_it) {
|
||||
const NgramCounts::Value guess = testcounts_it->second;
|
||||
const size_t len = testcounts_it->first.size();
|
||||
NgramCounts::Value correct = 0;
|
||||
|
||||
|
||||
NgramCounts::Value v = 0;
|
||||
if (m_references[sid]->get().at(i)->get_counts()->Lookup(testcounts_it->first, &v)) {
|
||||
correct = min(v, guess);
|
||||
correct = min(v, guess);
|
||||
}
|
||||
stats[len * 2 - 2] += correct;
|
||||
stats[len * 2 - 1] += guess;
|
||||
@ -143,13 +142,13 @@ void BleuDocScorer::prepareStats(size_t sid, const string& text, ScoreStats& ent
|
||||
const int reference_len = CalcReferenceLength(sid, i, length);
|
||||
stats.push_back(reference_len);
|
||||
|
||||
//ADD stats to totStats
|
||||
std::transform(stats.begin(), stats.end(), totStats.begin(),
|
||||
totStats.begin(), std::plus<int>());
|
||||
//ADD stats to totStats
|
||||
std::transform(stats.begin(), stats.end(), totStats.begin(),
|
||||
totStats.begin(), std::plus<int>());
|
||||
}
|
||||
entry.set(totStats);
|
||||
entry.set(totStats);
|
||||
}
|
||||
|
||||
|
||||
std::vector<std::string> BleuDocScorer::splitDoc(const std::string& text)
|
||||
{
|
||||
std::vector<std::string> res;
|
||||
@ -188,18 +187,18 @@ statscore_t BleuDocScorer::calculateScore(const vector<int>& comps) const
|
||||
int BleuDocScorer::CalcReferenceLength(size_t doc_id, size_t sentence_id, size_t length)
|
||||
{
|
||||
switch (m_ref_length_type) {
|
||||
case AVERAGE:
|
||||
return m_references[doc_id]->get().at(sentence_id)->CalcAverage();
|
||||
break;
|
||||
case CLOSEST:
|
||||
return m_references[doc_id]->get().at(sentence_id)->CalcClosest(length);
|
||||
break;
|
||||
case SHORTEST:
|
||||
return m_references[doc_id]->get().at(sentence_id)->CalcShortest();
|
||||
break;
|
||||
default:
|
||||
cerr << "unknown reference types." << endl;
|
||||
exit(1);
|
||||
case AVERAGE:
|
||||
return m_references[doc_id]->get().at(sentence_id)->CalcAverage();
|
||||
break;
|
||||
case CLOSEST:
|
||||
return m_references[doc_id]->get().at(sentence_id)->CalcClosest(length);
|
||||
break;
|
||||
case SHORTEST:
|
||||
return m_references[doc_id]->get().at(sentence_id)->CalcShortest();
|
||||
break;
|
||||
default:
|
||||
cerr << "unknown reference types." << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -29,7 +29,7 @@ public:
|
||||
virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
|
||||
virtual statscore_t calculateScore(const std::vector<int>& comps) const;
|
||||
|
||||
int CalcReferenceLength(std::size_t doc_id, std::size_t sentence_id, std::size_t length);
|
||||
int CalcReferenceLength(std::size_t doc_id, std::size_t sentence_id, std::size_t length);
|
||||
|
||||
// NOTE: this function is used for unit testing.
|
||||
virtual bool OpenReferenceStream(std::istream* is, std::size_t file_id);
|
||||
|
@ -67,7 +67,7 @@ public:
|
||||
// NOTE: this function is used for unit testing.
|
||||
virtual bool OpenReferenceStream(std::istream* is, std::size_t file_id);
|
||||
|
||||
//private:
|
||||
//private:
|
||||
protected:
|
||||
ReferenceLengthType m_ref_length_type;
|
||||
|
||||
@ -76,7 +76,7 @@ protected:
|
||||
|
||||
// constructor used by subclasses
|
||||
BleuScorer(const std::string& name, const std::string& config): StatisticsBasedScorer(name,config) {}
|
||||
|
||||
|
||||
// no copying allowed
|
||||
BleuScorer(const BleuScorer&);
|
||||
BleuScorer& operator=(const BleuScorer&);
|
||||
|
@ -51,12 +51,12 @@ int main(int argc, char **argv)
|
||||
const_cast<std::vector<std::string>&>(parameter->GetParam("factor-delimiter")).resize(1, "||dummy_string||");
|
||||
const_cast<std::vector<std::string>&>(parameter->GetParam("input-factors")).resize(1, "0");
|
||||
const_cast<std::vector<std::string>&>(parameter->GetParam("verbose")).resize(1, "0");
|
||||
const_cast<std::vector<std::string>&>(parameter->GetParam("weight-w")).resize(1, "0");
|
||||
const_cast<std::vector<std::string>&>(parameter->GetParam("weight-d")).resize(1, "0");
|
||||
//const_cast<std::vector<std::string>&>(parameter->GetParam("weight-w")).resize(1, "0");
|
||||
//const_cast<std::vector<std::string>&>(parameter->GetParam("weight-d")).resize(1, "0");
|
||||
|
||||
StaticData::InstanceNonConst().LoadData(parameter);
|
||||
|
||||
PhraseDictionaryCompact pdc("input-factor=0 output-factor=0 num-features=5 path=" + ttable);
|
||||
PhraseDictionaryCompact pdc("PhraseDictionaryCompact input-factor=0 output-factor=0 num-features=5 path=" + ttable);
|
||||
pdc.Load();
|
||||
|
||||
std::string line;
|
||||
|
@ -11,7 +11,7 @@ namespace Moses
|
||||
{
|
||||
|
||||
OpSequenceModel::OpSequenceModel(const std::string &line)
|
||||
:StatefulFeatureFunction("OpSequenceModel", 5, line )
|
||||
:StatefulFeatureFunction("OpSequenceModel", 5, line )
|
||||
{
|
||||
ReadParameters();
|
||||
}
|
||||
@ -19,29 +19,29 @@ OpSequenceModel::OpSequenceModel(const std::string &line)
|
||||
void OpSequenceModel :: readLanguageModel(const char *lmFile)
|
||||
{
|
||||
|
||||
string unkOp = "_TRANS_SLF_";
|
||||
string unkOp = "_TRANS_SLF_";
|
||||
|
||||
|
||||
/*
|
||||
|
||||
// Code for SRILM
|
||||
/*
|
||||
|
||||
vector <int> numbers;
|
||||
// Code for SRILM
|
||||
|
||||
vector <int> numbers;
|
||||
int nonWordFlag = 0;
|
||||
|
||||
ptrOp = new Api;
|
||||
ptrOp -> read_lm(lmFile,lmOrder);
|
||||
numbers.push_back(ptrOp->getLMID(const_cast <char *> (unkOp.c_str())));
|
||||
unkOpProb = ptrOp->contextProbN(numbers,nonWordFlag);
|
||||
|
||||
*/
|
||||
|
||||
// Code to load KenLM
|
||||
ptrOp = new Api;
|
||||
ptrOp -> read_lm(lmFile,lmOrder);
|
||||
numbers.push_back(ptrOp->getLMID(const_cast <char *> (unkOp.c_str())));
|
||||
unkOpProb = ptrOp->contextProbN(numbers,nonWordFlag);
|
||||
|
||||
OSM = new Model(m_lmPath.c_str());
|
||||
State startState = OSM->NullContextState();
|
||||
State endState;
|
||||
unkOpProb = OSM->Score(startState,OSM->GetVocabulary().Index(unkOp),endState);
|
||||
*/
|
||||
|
||||
// Code to load KenLM
|
||||
|
||||
OSM = new Model(m_lmPath.c_str());
|
||||
State startState = OSM->NullContextState();
|
||||
State endState;
|
||||
unkOpProb = OSM->Score(startState,OSM->GetVocabulary().Index(unkOp),endState);
|
||||
}
|
||||
|
||||
|
||||
@ -85,58 +85,55 @@ void OpSequenceModel::Load()
|
||||
|
||||
|
||||
void OpSequenceModel:: Evaluate(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{
|
||||
|
||||
osmHypothesis obj;
|
||||
obj.setState(OSM->NullContextState());
|
||||
WordsBitmap myBitmap(source.GetSize());
|
||||
vector <string> mySourcePhrase;
|
||||
vector <string> myTargetPhrase;
|
||||
vector<float> scores(5);
|
||||
vector <int> alignments;
|
||||
int startIndex = 0;
|
||||
int endIndex = source.GetSize();
|
||||
osmHypothesis obj;
|
||||
obj.setState(OSM->NullContextState());
|
||||
WordsBitmap myBitmap(source.GetSize());
|
||||
vector <string> mySourcePhrase;
|
||||
vector <string> myTargetPhrase;
|
||||
vector<float> scores(5);
|
||||
vector <int> alignments;
|
||||
int startIndex = 0;
|
||||
int endIndex = source.GetSize();
|
||||
|
||||
const AlignmentInfo &align = targetPhrase.GetAlignTerm();
|
||||
AlignmentInfo::const_iterator iter;
|
||||
const AlignmentInfo &align = targetPhrase.GetAlignTerm();
|
||||
AlignmentInfo::const_iterator iter;
|
||||
|
||||
|
||||
for (iter = align.begin(); iter != align.end(); ++iter)
|
||||
{
|
||||
alignments.push_back(iter->first);
|
||||
alignments.push_back(iter->second);
|
||||
}
|
||||
for (iter = align.begin(); iter != align.end(); ++iter) {
|
||||
alignments.push_back(iter->first);
|
||||
alignments.push_back(iter->second);
|
||||
}
|
||||
|
||||
for (int i = 0; i < targetPhrase.GetSize(); i++)
|
||||
{
|
||||
if (targetPhrase.GetWord(i).IsOOV())
|
||||
myTargetPhrase.push_back("_TRANS_SLF_");
|
||||
else
|
||||
myTargetPhrase.push_back(targetPhrase.GetWord(i).GetFactor(0)->GetString().as_string());
|
||||
}
|
||||
for (int i = 0; i < targetPhrase.GetSize(); i++) {
|
||||
if (targetPhrase.GetWord(i).IsOOV())
|
||||
myTargetPhrase.push_back("_TRANS_SLF_");
|
||||
else
|
||||
myTargetPhrase.push_back(targetPhrase.GetWord(i).GetFactor(0)->GetString().as_string());
|
||||
}
|
||||
|
||||
for (int i = 0; i < source.GetSize(); i++)
|
||||
{
|
||||
mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString().as_string());
|
||||
}
|
||||
|
||||
obj.setPhrases(mySourcePhrase , myTargetPhrase);
|
||||
obj.constructCepts(alignments,startIndex,endIndex-1,targetPhrase.GetSize());
|
||||
obj.computeOSMFeature(startIndex,myBitmap);
|
||||
obj.calculateOSMProb(*OSM);
|
||||
obj.populateScores(scores);
|
||||
estimatedFutureScore.PlusEquals(this, scores);
|
||||
for (int i = 0; i < source.GetSize(); i++) {
|
||||
mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString().as_string());
|
||||
}
|
||||
|
||||
obj.setPhrases(mySourcePhrase , myTargetPhrase);
|
||||
obj.constructCepts(alignments,startIndex,endIndex-1,targetPhrase.GetSize());
|
||||
obj.computeOSMFeature(startIndex,myBitmap);
|
||||
obj.calculateOSMProb(*OSM);
|
||||
obj.populateScores(scores);
|
||||
estimatedFutureScore.PlusEquals(this, scores);
|
||||
|
||||
}
|
||||
|
||||
|
||||
FFState* OpSequenceModel::Evaluate(
|
||||
const Hypothesis& cur_hypo,
|
||||
const FFState* prev_state,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
const Hypothesis& cur_hypo,
|
||||
const FFState* prev_state,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{
|
||||
const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase();
|
||||
const WordsBitmap &bitmap = cur_hypo.GetWordsBitmap();
|
||||
@ -159,83 +156,81 @@ FFState* OpSequenceModel::Evaluate(
|
||||
|
||||
//cerr << source <<endl;
|
||||
|
||||
// int a = sourceRange.GetStartPos();
|
||||
// cerr << source.GetWord(a);
|
||||
// int a = sourceRange.GetStartPos();
|
||||
// cerr << source.GetWord(a);
|
||||
//cerr <<a<<endl;
|
||||
|
||||
//const Sentence &sentence = static_cast<const Sentence&>(curr_hypo.GetManager().GetSource());
|
||||
|
||||
|
||||
const WordsRange & sourceRange = cur_hypo.GetCurrSourceWordsRange();
|
||||
int startIndex = sourceRange.GetStartPos();
|
||||
int endIndex = sourceRange.GetEndPos();
|
||||
const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm();
|
||||
osmState * statePtr;
|
||||
const WordsRange & sourceRange = cur_hypo.GetCurrSourceWordsRange();
|
||||
int startIndex = sourceRange.GetStartPos();
|
||||
int endIndex = sourceRange.GetEndPos();
|
||||
const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm();
|
||||
osmState * statePtr;
|
||||
|
||||
vector <int> alignments;
|
||||
vector <int> alignments;
|
||||
|
||||
|
||||
|
||||
AlignmentInfo::const_iterator iter;
|
||||
AlignmentInfo::const_iterator iter;
|
||||
|
||||
for (iter = align.begin(); iter != align.end(); ++iter) {
|
||||
//cerr << iter->first << "----" << iter->second << " ";
|
||||
alignments.push_back(iter->first);
|
||||
alignments.push_back(iter->second);
|
||||
}
|
||||
|
||||
|
||||
//cerr<<bitmap<<endl;
|
||||
//cerr<<startIndex<<" "<<endIndex<<endl;
|
||||
|
||||
|
||||
for (int i = startIndex; i <= endIndex; i++)
|
||||
{
|
||||
myBitmap.SetValue(i,0); // resetting coverage of this phrase ...
|
||||
mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString().as_string());
|
||||
// cerr<<mySourcePhrase[i]<<endl;
|
||||
for (iter = align.begin(); iter != align.end(); ++iter) {
|
||||
//cerr << iter->first << "----" << iter->second << " ";
|
||||
alignments.push_back(iter->first);
|
||||
alignments.push_back(iter->second);
|
||||
}
|
||||
|
||||
for (int i = 0; i < target.GetSize(); i++)
|
||||
{
|
||||
|
||||
if (target.GetWord(i).IsOOV())
|
||||
myTargetPhrase.push_back("_TRANS_SLF_");
|
||||
else
|
||||
myTargetPhrase.push_back(target.GetWord(i).GetFactor(0)->GetString().as_string());
|
||||
//cerr<<bitmap<<endl;
|
||||
//cerr<<startIndex<<" "<<endIndex<<endl;
|
||||
|
||||
|
||||
for (int i = startIndex; i <= endIndex; i++) {
|
||||
myBitmap.SetValue(i,0); // resetting coverage of this phrase ...
|
||||
mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString().as_string());
|
||||
// cerr<<mySourcePhrase[i]<<endl;
|
||||
}
|
||||
|
||||
for (int i = 0; i < target.GetSize(); i++) {
|
||||
|
||||
if (target.GetWord(i).IsOOV())
|
||||
myTargetPhrase.push_back("_TRANS_SLF_");
|
||||
else
|
||||
myTargetPhrase.push_back(target.GetWord(i).GetFactor(0)->GetString().as_string());
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
//cerr<<myBitmap<<endl;
|
||||
|
||||
obj.setState(prev_state);
|
||||
obj.constructCepts(alignments,startIndex,endIndex,target.GetSize());
|
||||
obj.setPhrases(mySourcePhrase , myTargetPhrase);
|
||||
obj.computeOSMFeature(startIndex,myBitmap);
|
||||
obj.computeOSMFeature(startIndex,myBitmap);
|
||||
obj.calculateOSMProb(*OSM);
|
||||
obj.populateScores(scores);
|
||||
|
||||
/*
|
||||
if (bitmap.GetFirstGapPos() == NOT_FOUND)
|
||||
{
|
||||
/*
|
||||
if (bitmap.GetFirstGapPos() == NOT_FOUND)
|
||||
{
|
||||
|
||||
int xx;
|
||||
cerr<<bitmap<<endl;
|
||||
int a = bitmap.GetFirstGapPos();
|
||||
obj.print();
|
||||
cin>>xx;
|
||||
}
|
||||
*/
|
||||
int xx;
|
||||
cerr<<bitmap<<endl;
|
||||
int a = bitmap.GetFirstGapPos();
|
||||
obj.print();
|
||||
cin>>xx;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
vector<float> scores(5);
|
||||
scores[0] = 0.343423f;
|
||||
scores[1] = 1.343423f;
|
||||
scores[2] = 2.343423f;
|
||||
scores[3] = 3.343423f;
|
||||
scores[4] = 4.343423f;
|
||||
*/
|
||||
/*
|
||||
vector<float> scores(5);
|
||||
scores[0] = 0.343423f;
|
||||
scores[1] = 1.343423f;
|
||||
scores[2] = 2.343423f;
|
||||
scores[3] = 3.343423f;
|
||||
scores[4] = 4.343423f;
|
||||
*/
|
||||
|
||||
accumulator->PlusEquals(this, scores);
|
||||
|
||||
@ -245,7 +240,7 @@ FFState* OpSequenceModel::Evaluate(
|
||||
|
||||
|
||||
//return statePtr;
|
||||
// return NULL;
|
||||
// return NULL;
|
||||
}
|
||||
|
||||
FFState* OpSequenceModel::EvaluateChart(
|
||||
@ -276,29 +271,28 @@ std::vector<float> OpSequenceModel::GetFutureScores(const Phrase &source, const
|
||||
ParallelPhrase pp(source, target);
|
||||
std::map<ParallelPhrase, Scores>::const_iterator iter;
|
||||
iter = m_futureCost.find(pp);
|
||||
//iter = m_coll.find(pp);
|
||||
//iter = m_coll.find(pp);
|
||||
if (iter == m_futureCost.end()) {
|
||||
vector<float> scores(5, 0);
|
||||
scores[0] = unkOpProb;
|
||||
return scores;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
const vector<float> &scores = iter->second;
|
||||
return scores;
|
||||
return scores;
|
||||
}
|
||||
}
|
||||
|
||||
void OpSequenceModel::SetParameter(const std::string& key, const std::string& value)
|
||||
{
|
||||
if (key == "feature-path") {
|
||||
m_featurePath = value;
|
||||
} else if (key == "path") {
|
||||
m_lmPath = value;
|
||||
} else if (key == "order") {
|
||||
lmOrder = Scan<int>(value);
|
||||
} else {
|
||||
StatefulFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
if (key == "feature-path") {
|
||||
m_featurePath = value;
|
||||
} else if (key == "path") {
|
||||
m_lmPath = value;
|
||||
} else if (key == "order") {
|
||||
lmOrder = Scan<int>(value);
|
||||
} else {
|
||||
StatefulFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
@ -16,26 +16,26 @@ class OpSequenceModel : public StatefulFeatureFunction
|
||||
{
|
||||
public:
|
||||
|
||||
|
||||
lm::ngram::Model * OSM;
|
||||
|
||||
int lmOrder;
|
||||
float unkOpProb;
|
||||
|
||||
OpSequenceModel(const std::string &line);
|
||||
lm::ngram::Model * OSM;
|
||||
|
||||
void readLanguageModel(const char *);
|
||||
void Load();
|
||||
int lmOrder;
|
||||
float unkOpProb;
|
||||
|
||||
FFState* Evaluate(
|
||||
const Hypothesis& cur_hypo,
|
||||
const FFState* prev_state,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
OpSequenceModel(const std::string &line);
|
||||
|
||||
void Evaluate(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const;
|
||||
void readLanguageModel(const char *);
|
||||
void Load();
|
||||
|
||||
FFState* Evaluate(
|
||||
const Hypothesis& cur_hypo,
|
||||
const FFState* prev_state,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
|
||||
void Evaluate(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const;
|
||||
|
||||
virtual FFState* EvaluateChart(
|
||||
const ChartHypothesis& /* cur_hypo */,
|
||||
@ -49,17 +49,18 @@ public:
|
||||
std::vector<float> GetFutureScores(const Phrase &source, const Phrase &target) const;
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
|
||||
bool IsUseable(const FactorMask &mask) const
|
||||
{ return true; }
|
||||
bool IsUseable(const FactorMask &mask) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
typedef std::pair<Phrase, Phrase> ParallelPhrase;
|
||||
typedef std::vector<float> Scores;
|
||||
std::map<ParallelPhrase, Scores> m_futureCost;
|
||||
typedef std::pair<Phrase, Phrase> ParallelPhrase;
|
||||
typedef std::vector<float> Scores;
|
||||
std::map<ParallelPhrase, Scores> m_futureCost;
|
||||
|
||||
std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
|
||||
std::set <int> targetNullWords;
|
||||
std::string m_featurePath, m_lmPath;
|
||||
std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
|
||||
std::set <int> targetNullWords;
|
||||
std::string m_featurePath, m_lmPath;
|
||||
|
||||
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -17,15 +17,23 @@ public:
|
||||
osmState(const lm::ngram::State & val);
|
||||
int Compare(const FFState& other) const;
|
||||
void saveState(int jVal, int eVal, std::map <int , std::string> & gapVal);
|
||||
int getJ()const {return j;}
|
||||
int getE()const {return E;}
|
||||
std::map <int , std::string> getGap() const { return gap;}
|
||||
int getJ()const {
|
||||
return j;
|
||||
}
|
||||
int getE()const {
|
||||
return E;
|
||||
}
|
||||
std::map <int , std::string> getGap() const {
|
||||
return gap;
|
||||
}
|
||||
|
||||
lm::ngram::State getLMState() const {return lmState;}
|
||||
lm::ngram::State getLMState() const {
|
||||
return lmState;
|
||||
}
|
||||
|
||||
void print() const;
|
||||
std::string getName() const;
|
||||
|
||||
|
||||
protected:
|
||||
int j, E;
|
||||
std::map <int,std::string> gap;
|
||||
@ -35,51 +43,56 @@ protected:
|
||||
class osmHypothesis
|
||||
{
|
||||
|
||||
private:
|
||||
|
||||
|
||||
std::vector <std::string> operations; // List of operations required to generated this hyp ...
|
||||
std::map <int,std::string> gap; // Maintains gap history ...
|
||||
int j; // Position after the last source word generated ...
|
||||
int E; // Position after the right most source word so far generated ...
|
||||
lm::ngram::State lmState; // KenLM's Model State ...
|
||||
private:
|
||||
|
||||
int gapCount; // Number of gaps inserted ...
|
||||
int deletionCount;
|
||||
int openGapCount;
|
||||
int gapWidth;
|
||||
double opProb;
|
||||
|
||||
std::vector <std::string> currE;
|
||||
std::vector <std::string> currF;
|
||||
std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
|
||||
std::set <int> targetNullWords;
|
||||
std::set <int> sourceNullWords;
|
||||
std::vector <std::string> operations; // List of operations required to generated this hyp ...
|
||||
std::map <int,std::string> gap; // Maintains gap history ...
|
||||
int j; // Position after the last source word generated ...
|
||||
int E; // Position after the right most source word so far generated ...
|
||||
lm::ngram::State lmState; // KenLM's Model State ...
|
||||
|
||||
int closestGap(std::map <int,std::string> gap,int j1, int & gp);
|
||||
int firstOpenGap(std::vector <int> & coverageVector);
|
||||
std::string intToString(int);
|
||||
int getOpenGaps();
|
||||
int isTranslationOperation(int j);
|
||||
void removeReorderingOperations();
|
||||
int gapCount; // Number of gaps inserted ...
|
||||
int deletionCount;
|
||||
int openGapCount;
|
||||
int gapWidth;
|
||||
double opProb;
|
||||
|
||||
void getMeCepts ( std::set <int> & eSide , std::set <int> & fSide , std::map <int , std::vector <int> > & tS , std::map <int , std::vector <int> > & sT);
|
||||
std::vector <std::string> currE;
|
||||
std::vector <std::string> currF;
|
||||
std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
|
||||
std::set <int> targetNullWords;
|
||||
std::set <int> sourceNullWords;
|
||||
|
||||
public:
|
||||
int closestGap(std::map <int,std::string> gap,int j1, int & gp);
|
||||
int firstOpenGap(std::vector <int> & coverageVector);
|
||||
std::string intToString(int);
|
||||
int getOpenGaps();
|
||||
int isTranslationOperation(int j);
|
||||
void removeReorderingOperations();
|
||||
|
||||
osmHypothesis();
|
||||
~osmHypothesis(){};
|
||||
void generateOperations(int & startIndex, int j1 , int contFlag , WordsBitmap & coverageVector , std::string english , std::string german , std::set <int> & targetNullWords , std::vector <std::string> & currF);
|
||||
void generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes);
|
||||
void calculateOSMProb(lm::ngram::Model & ptrOp);
|
||||
void computeOSMFeature(int startIndex , WordsBitmap & coverageVector);
|
||||
void constructCepts(std::vector <int> & align , int startIndex , int endIndex, int targetPhraseLength);
|
||||
void setPhrases(std::vector <std::string> & val1 , std::vector <std::string> & val2){currF = val1; currE = val2;}
|
||||
void setState(const FFState* prev_state);
|
||||
osmState * saveState();
|
||||
void print();
|
||||
void populateScores(std::vector <float> & scores);
|
||||
void setState(const lm::ngram::State & val){lmState = val;}
|
||||
void getMeCepts ( std::set <int> & eSide , std::set <int> & fSide , std::map <int , std::vector <int> > & tS , std::map <int , std::vector <int> > & sT);
|
||||
|
||||
public:
|
||||
|
||||
osmHypothesis();
|
||||
~osmHypothesis() {};
|
||||
void generateOperations(int & startIndex, int j1 , int contFlag , WordsBitmap & coverageVector , std::string english , std::string german , std::set <int> & targetNullWords , std::vector <std::string> & currF);
|
||||
void generateDeleteOperations(std::string english, int currTargetIndex, std::set <int> doneTargetIndexes);
|
||||
void calculateOSMProb(lm::ngram::Model & ptrOp);
|
||||
void computeOSMFeature(int startIndex , WordsBitmap & coverageVector);
|
||||
void constructCepts(std::vector <int> & align , int startIndex , int endIndex, int targetPhraseLength);
|
||||
void setPhrases(std::vector <std::string> & val1 , std::vector <std::string> & val2) {
|
||||
currF = val1;
|
||||
currE = val2;
|
||||
}
|
||||
void setState(const FFState* prev_state);
|
||||
osmState * saveState();
|
||||
void print();
|
||||
void populateScores(std::vector <float> & scores);
|
||||
void setState(const lm::ngram::State & val) {
|
||||
lmState = val;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
@ -383,7 +383,7 @@ LanguageModel *ConstructKenLM(const std::string &description, const std::string
|
||||
try {
|
||||
lm::ngram::ModelType model_type;
|
||||
if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
|
||||
|
||||
|
||||
switch(model_type) {
|
||||
case lm::ngram::PROBING:
|
||||
return new LanguageModelKen<lm::ngram::ProbingModel>(description, line, file, factorType, lazy);
|
||||
|
@ -694,9 +694,9 @@ bool StaticData::LoadData(Parameter *parameter)
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "OpSequenceModel") {
|
||||
OpSequenceModel* model = new OpSequenceModel(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
OpSequenceModel* model = new OpSequenceModel(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "PhrasePenalty") {
|
||||
PhrasePenalty* model = new PhrasePenalty(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
|
@ -190,7 +190,7 @@ std::string PhraseDecoder::MakeSourceKey(std::string &source)
|
||||
return source + m_separator;
|
||||
}
|
||||
|
||||
TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &sourcePhrase, bool topLevel)
|
||||
TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &sourcePhrase, bool topLevel, bool eval)
|
||||
{
|
||||
|
||||
// Not using TargetPhraseCollection avoiding "new" operator
|
||||
@ -234,7 +234,7 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &
|
||||
|
||||
// Decompress and decode target phrase collection
|
||||
TargetPhraseVectorPtr decodedPhraseColl =
|
||||
DecodeCollection(tpv, encodedBitStream, sourcePhrase, topLevel);
|
||||
DecodeCollection(tpv, encodedBitStream, sourcePhrase, topLevel, eval);
|
||||
|
||||
return decodedPhraseColl;
|
||||
} else
|
||||
@ -243,7 +243,7 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &
|
||||
|
||||
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
|
||||
TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
|
||||
const Phrase &sourcePhrase, bool topLevel)
|
||||
const Phrase &sourcePhrase, bool topLevel, bool eval)
|
||||
{
|
||||
|
||||
bool extending = tpv->size();
|
||||
@ -397,7 +397,8 @@ TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
|
||||
|
||||
if(scores.size() == m_numScoreComponent) {
|
||||
targetPhrase->GetScoreBreakdown().Assign(&m_phraseDictionary, scores);
|
||||
targetPhrase->Evaluate(sourcePhrase);
|
||||
if(eval)
|
||||
targetPhrase->Evaluate(sourcePhrase);
|
||||
|
||||
if(m_containsAlignmentInfo)
|
||||
state = Alignment;
|
||||
|
@ -131,12 +131,13 @@ public:
|
||||
size_t Load(std::FILE* in);
|
||||
|
||||
TargetPhraseVectorPtr CreateTargetPhraseCollection(const Phrase &sourcePhrase,
|
||||
bool topLevel = false);
|
||||
bool topLevel = false, bool eval = true);
|
||||
|
||||
TargetPhraseVectorPtr DecodeCollection(TargetPhraseVectorPtr tpv,
|
||||
BitWrapper<> &encodedBitStream,
|
||||
const Phrase &sourcePhrase,
|
||||
bool topLevel);
|
||||
bool topLevel,
|
||||
bool eval);
|
||||
|
||||
void PruneCache();
|
||||
};
|
||||
|
@ -117,7 +117,7 @@ PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) c
|
||||
|
||||
// Retrieve target phrase collection from phrase table
|
||||
TargetPhraseVectorPtr decodedPhraseColl
|
||||
= m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
|
||||
= m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, true);
|
||||
|
||||
if(decodedPhraseColl != NULL && decodedPhraseColl->size()) {
|
||||
TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl));
|
||||
@ -130,7 +130,6 @@ PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) c
|
||||
std::nth_element(tpv->begin(), nth, tpv->end(), CompareTargetPhrase());
|
||||
for(TargetPhraseVector::iterator it = tpv->begin(); it != nth; it++) {
|
||||
TargetPhrase *tp = new TargetPhrase(*it);
|
||||
cerr << *tp << endl;
|
||||
phraseColl->Add(tp);
|
||||
}
|
||||
|
||||
@ -152,7 +151,7 @@ PhraseDictionaryCompact::GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase
|
||||
return TargetPhraseVectorPtr();
|
||||
|
||||
// Retrieve target phrase collection from phrase table
|
||||
return m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
|
||||
return m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, false);
|
||||
}
|
||||
|
||||
PhraseDictionaryCompact::~PhraseDictionaryCompact()
|
||||
|
@ -38,7 +38,7 @@ bool operator<(const PackedItem &pi1, const PackedItem &pi2)
|
||||
}
|
||||
|
||||
std::string PhraseTableCreator::m_phraseStopSymbol = "__SPECIAL_STOP_SYMBOL__";
|
||||
std::string PhraseTableCreator::m_separator = " ||| ";
|
||||
std::string PhraseTableCreator::m_separator = "|||";
|
||||
|
||||
PhraseTableCreator::PhraseTableCreator(std::string inPath,
|
||||
std::string outPath,
|
||||
@ -332,12 +332,12 @@ void PhraseTableCreator::CreateRankHash()
|
||||
|
||||
inline std::string PhraseTableCreator::MakeSourceKey(std::string &source)
|
||||
{
|
||||
return source + m_separator;
|
||||
return source + " " + m_separator + " ";
|
||||
}
|
||||
|
||||
inline std::string PhraseTableCreator::MakeSourceTargetKey(std::string &source, std::string &target)
|
||||
{
|
||||
return source + m_separator + target + m_separator;
|
||||
return source + " " + m_separator + " " + target + " " + m_separator + " ";
|
||||
}
|
||||
|
||||
void PhraseTableCreator::EncodeTargetPhrases()
|
||||
@ -1034,17 +1034,24 @@ void RankingTask::operator()()
|
||||
for(size_t i = 0; i < lines.size(); i++) {
|
||||
std::vector<std::string> tokens;
|
||||
Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
|
||||
|
||||
if(tokens.size() < 3) {
|
||||
|
||||
for(std::vector<std::string>::iterator it = tokens.begin(); it != tokens.end(); it++)
|
||||
*it = Moses::Trim(*it);
|
||||
|
||||
if(tokens.size() < 4) {
|
||||
std::cerr << "Error: It seems the following line has a wrong format:" << std::endl;
|
||||
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
|
||||
abort();
|
||||
}
|
||||
if(tokens.size() == 3 && m_creator.m_warnMe) {
|
||||
std::cerr << "Warning: It seems the following line contains no alignment information, " << std::endl;
|
||||
std::cerr << "but you are using PREnc encoding which makes use of alignment data. " << std::endl;
|
||||
std::cerr << "Better use -encoding None or disable this warning with -no-warnings ." << std::endl;
|
||||
|
||||
if(tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) {
|
||||
std::cerr << "Error: It seems the following line contains no alignment information, " << std::endl;
|
||||
std::cerr << "but you are using ";
|
||||
std::cerr << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc");
|
||||
std::cerr << " encoding which makes use of alignment data. " << std::endl;
|
||||
std::cerr << "Use -encoding None" << std::endl;
|
||||
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
std::vector<float> scores = Tokenize<float>(tokens[2]);
|
||||
@ -1125,18 +1132,23 @@ void EncodingTask::operator()()
|
||||
std::vector<std::string> tokens;
|
||||
Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
|
||||
|
||||
for(std::vector<std::string>::iterator it = tokens.begin(); it != tokens.end(); it++)
|
||||
*it = Moses::Trim(*it);
|
||||
|
||||
if(tokens.size() < 3) {
|
||||
std::cerr << "Error: It seems the following line has a wrong format:" << std::endl;
|
||||
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
|
||||
abort();
|
||||
}
|
||||
if(tokens.size() == 3 && m_creator.m_coding != PhraseTableCreator::None && m_creator.m_warnMe) {
|
||||
std::cerr << "Warning: It seems the following line contains no alignment information, " << std::endl;
|
||||
|
||||
if(tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) {
|
||||
std::cerr << "Error: It seems the following line contains no alignment information, " << std::endl;
|
||||
std::cerr << "but you are using ";
|
||||
std::cerr << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc");
|
||||
std::cerr << " encoding which makes use of alignment data. " << std::endl;
|
||||
std::cerr << "Better use -encoding None or disable this warning with -no-warnings." << std::endl;
|
||||
std::cerr << "Use -encoding None" << std::endl;
|
||||
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
size_t ownRank = 0;
|
||||
|
@ -59,8 +59,7 @@ public:
|
||||
/** deep copy */
|
||||
Word(const Word &copy)
|
||||
:m_isNonTerminal(copy.m_isNonTerminal)
|
||||
,m_isOOV(copy.m_isOOV)
|
||||
{
|
||||
,m_isOOV(copy.m_isOOV) {
|
||||
std::memcpy(m_factorArray, copy.m_factorArray, sizeof(FactorArray));
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user