mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-17 14:17:13 +03:00
Reorganization of phrase scorers in Mmsapt.
This commit is contained in:
parent
c3db1a3a67
commit
1a3d7d3266
@ -23,6 +23,7 @@
|
||||
#include "ug_typedefs.h"
|
||||
#include "tpt_pickler.h"
|
||||
#include "ug_bitext.h"
|
||||
#include "../mmsapt_phrase_scorers.h"
|
||||
#include "ug_lexical_phrase_scorer2.h"
|
||||
|
||||
using namespace std;
|
||||
@ -44,7 +45,7 @@ float lbsmooth = .005;
|
||||
|
||||
PScorePfwd<Token> calc_pfwd;
|
||||
PScorePbwd<Token> calc_pbwd;
|
||||
PScoreLex<Token> calc_lex;
|
||||
PScoreLex<Token> calc_lex(1.0);
|
||||
PScoreWP<Token> apply_wp;
|
||||
vector<float> fweights;
|
||||
|
||||
@ -129,7 +130,7 @@ int main(int argc, char* argv[])
|
||||
bt.setDefaultSampleSize(max_samples);
|
||||
|
||||
size_t i;
|
||||
i = calc_pfwd.init(0,.05);
|
||||
i = calc_pfwd.init(0,.05,'g');
|
||||
i = calc_pbwd.init(i,.05);
|
||||
i = calc_lex.init(i,base+L1+"-"+L2+".lex");
|
||||
i = apply_wp.init(i);
|
||||
|
@ -47,15 +47,22 @@ namespace Moses
|
||||
}
|
||||
#endif
|
||||
|
||||
vector<string> const&
|
||||
Mmsapt::
|
||||
GetFeatureNames() const
|
||||
{
|
||||
return m_feature_names;
|
||||
}
|
||||
|
||||
Mmsapt::
|
||||
Mmsapt(string const& line)
|
||||
// : PhraseDictionary("Mmsapt",line), ofactor(1,0)
|
||||
: PhraseDictionary(line)
|
||||
, m_lex_alpha(1.0)
|
||||
, withLogCountFeatures(false)
|
||||
, withPfwd(true), withPbwd(true)
|
||||
, withCoherence(true)
|
||||
, m_pfwd_features("g"), withPbwd(true), poolCounts(true)
|
||||
, ofactor(1,0)
|
||||
, m_tpc_ctr(0)
|
||||
// default values chosen for bwd probability
|
||||
{
|
||||
this->init(line);
|
||||
}
|
||||
@ -101,52 +108,56 @@ namespace Moses
|
||||
assert(L1.size());
|
||||
assert(L2.size());
|
||||
|
||||
m = param.find("pfwd_denom");
|
||||
m = param.find("pfwd-denom");
|
||||
m_pfwd_denom = m != param.end() ? m->second[0] : 's';
|
||||
|
||||
|
||||
m = param.find("smooth");
|
||||
m_lbop_parameter = m != param.end() ? atof(m->second.c_str()) : .05;
|
||||
|
||||
m = param.find("max-samples");
|
||||
m_default_sample_size = m != param.end() ? atoi(m->second.c_str()) : 1000;
|
||||
|
||||
m = param.find("logcnt-features");
|
||||
if (m != param.end())
|
||||
if ((m = param.find("logcnt-features")) != param.end())
|
||||
withLogCountFeatures = m->second != "0";
|
||||
|
||||
m = param.find("pfwd");
|
||||
if (m != param.end())
|
||||
withPfwd = m->second != "0";
|
||||
|
||||
m = param.find("pbwd");
|
||||
if (m != param.end())
|
||||
if ((m = param.find("coh")) != param.end())
|
||||
withCoherence = m->second != "0";
|
||||
|
||||
if ((m = param.find("pfwd")) != param.end())
|
||||
m_pfwd_features = (m->second == "0" ? "" : m->second);
|
||||
|
||||
if (m_pfwd_features == "1")
|
||||
m_pfwd_features[0] = m_pfwd_denom;
|
||||
|
||||
if ((m = param.find("pbwd")) != param.end())
|
||||
withPbwd = m->second != "0";
|
||||
|
||||
if ((m = param.find("lexalpha")) != param.end())
|
||||
m_lex_alpha = atof(m->second.c_str());
|
||||
|
||||
m = param.find("workers");
|
||||
m_workers = m != param.end() ? atoi(m->second.c_str()) : 8;
|
||||
m_workers = min(m_workers,24UL);
|
||||
|
||||
m = param.find("limit");
|
||||
if (m != param.end()) m_tableLimit = atoi(m->second.c_str());
|
||||
if ((m = param.find("limit")) != param.end())
|
||||
m_tableLimit = atoi(m->second.c_str());
|
||||
|
||||
m = param.find("cache-size");
|
||||
m_history.reserve(m != param.end()?max(1000,atoi(m->second.c_str())):10000);
|
||||
// in plain language: cache size is at least 1000, and 10,000 by default
|
||||
// this cache keeps track of the most frequently used target phrase collections
|
||||
// even when not actively in use
|
||||
|
||||
this->m_numScoreComponents = atoi(param["num-features"].c_str());
|
||||
|
||||
// num_features = 0;
|
||||
m = param.find("ifactor");
|
||||
input_factor = m != param.end() ? atoi(m->second.c_str()) : 0;
|
||||
|
||||
poolCounts = true;
|
||||
m = param.find("extra");
|
||||
if (m != param.end())
|
||||
{
|
||||
extra_data = m->second;
|
||||
// cerr << "have extra data" << endl;
|
||||
}
|
||||
// keeps track of the most frequently used target phrase collections
|
||||
// (to keep them cached even when not actively in use)
|
||||
|
||||
if ((m = param.find("extra")) != param.end())
|
||||
extra_data = m->second;
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
@ -175,6 +186,49 @@ namespace Moses
|
||||
// cerr << "Loaded " << btdyn->T1->size() << " sentence pairs" << endl;
|
||||
}
|
||||
|
||||
size_t
|
||||
Mmsapt::
|
||||
add_corpus_specific_features
|
||||
(vector<sptr<pscorer > >& ffvec, size_t num_feats)
|
||||
{
|
||||
float const lbop = m_lbop_parameter; // just for code readability below
|
||||
// for the time being, we assume that all phrase probability features
|
||||
// use the same confidence parameter for lower-bound-estimation
|
||||
for (size_t i = 0; i < m_pfwd_features.size(); ++i)
|
||||
{
|
||||
UTIL_THROW_IF2(m_pfwd_features[i] != 'g' &&
|
||||
m_pfwd_features[i] != 'r' &&
|
||||
m_pfwd_features[i] != 's',
|
||||
"Can't handle pfwd feature type '"
|
||||
<< m_pfwd_features[i] << "'.");
|
||||
sptr<PScorePfwd<Token> > ff(new PScorePfwd<Token>());
|
||||
size_t k = num_feats;
|
||||
num_feats = ff->init(num_feats,lbop,m_pfwd_features[i]);
|
||||
for (;k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
|
||||
ffvec.push_back(ff);
|
||||
}
|
||||
|
||||
if (withPbwd)
|
||||
{
|
||||
sptr<PScorePbwd<Token> > ff(new PScorePbwd<Token>());
|
||||
size_t k = num_feats;
|
||||
num_feats = ff->init(num_feats,lbop);
|
||||
for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
|
||||
ffvec.push_back(ff);
|
||||
}
|
||||
|
||||
if (withLogCountFeatures)
|
||||
{
|
||||
sptr<PScoreLogCounts<Token> > ff(new PScoreLogCounts<Token>());
|
||||
size_t k = num_feats;
|
||||
num_feats = ff->init(num_feats);
|
||||
for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
|
||||
ffvec.push_back(ff);
|
||||
}
|
||||
|
||||
return num_feats;
|
||||
}
|
||||
|
||||
void
|
||||
Mmsapt::
|
||||
Load()
|
||||
@ -184,44 +238,52 @@ namespace Moses
|
||||
btfix.setDefaultSampleSize(m_default_sample_size);
|
||||
|
||||
size_t num_feats = 0;
|
||||
// TO DO: should we use different lbop parameters
|
||||
// for the relative-frequency based features?
|
||||
|
||||
if (withLogCountFeatures) num_feats = add_logcounts_fix.init(num_feats);
|
||||
|
||||
float const lbop = m_lbop_parameter; // just for code readability below
|
||||
if (withPfwd) num_feats = calc_pfwd_fix.init(num_feats,lbop,m_pfwd_denom);
|
||||
if (withPbwd) num_feats = calc_pbwd_fix.init(num_feats,lbop);
|
||||
// lexical scores are currently always active
|
||||
sptr<PScoreLex<Token> > ff(new PScoreLex<Token>(m_lex_alpha));
|
||||
size_t k = num_feats;
|
||||
num_feats = ff->init(num_feats, bname + L1 + "-" + L2 + ".lex");
|
||||
for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
|
||||
m_active_ff_common.push_back(ff);
|
||||
|
||||
// currently always active by default; may (should) change later
|
||||
num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
|
||||
|
||||
// if (this->m_numScoreComponents%2) // a bit of a hack, for backwards compatibility
|
||||
// num_feats = apply_pp.init(num_feats);
|
||||
|
||||
if (num_feats < this->m_numScoreComponents)
|
||||
if (withCoherence)
|
||||
{
|
||||
poolCounts = false;
|
||||
if (withLogCountFeatures) num_feats = add_logcounts_dyn.init(num_feats);
|
||||
if (withPfwd) num_feats = calc_pfwd_dyn.init(num_feats,lbop,m_pfwd_denom);
|
||||
if (withPbwd) num_feats = calc_pbwd_dyn.init(num_feats,lbop);
|
||||
sptr<PScoreCoherence<Token> > ff(new PScoreCoherence<Token>());
|
||||
size_t k = num_feats;
|
||||
num_feats = ff->init(num_feats);
|
||||
for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
|
||||
m_active_ff_common.push_back(ff);
|
||||
}
|
||||
|
||||
if (num_feats != this->m_numScoreComponents)
|
||||
{
|
||||
ostringstream buf;
|
||||
buf << "At " << __FILE__ << ":" << __LINE__
|
||||
<< ": number of feature values provided by Phrase table"
|
||||
<< " does not match number specified in Moses config file!";
|
||||
throw buf.str().c_str();
|
||||
}
|
||||
// cerr << "MMSAPT provides " << num_feats << " features at "
|
||||
// << __FILE__ << ":" << __LINE__ << endl;
|
||||
|
||||
num_feats = add_corpus_specific_features(m_active_ff_fix,num_feats);
|
||||
// cerr << num_feats << "/" << this->m_numScoreComponents
|
||||
// << " at " << __FILE__ << ":" << __LINE__ << endl;
|
||||
poolCounts = poolCounts && num_feats == this->m_numScoreComponents;
|
||||
if (!poolCounts)
|
||||
num_feats = add_corpus_specific_features(m_active_ff_dyn, num_feats);
|
||||
|
||||
#if 0
|
||||
cerr << "MMSAPT provides " << num_feats << " features at "
|
||||
<< __FILE__ << ":" << __LINE__ << endl;
|
||||
BOOST_FOREACH(string const& fname, m_feature_names)
|
||||
cerr << fname << endl;
|
||||
#endif
|
||||
UTIL_THROW_IF2(num_feats != this->m_numScoreComponents,
|
||||
"At " << __FILE__ << ":" << __LINE__
|
||||
<< ": number of feature values provided by Phrase table ("
|
||||
<< num_feats << ") does not match number specified in "
|
||||
<< "Moses config file (" << this->m_numScoreComponents
|
||||
<< ")!\n";);
|
||||
|
||||
|
||||
btdyn.reset(new imBitext<Token>(btfix.V1, btfix.V2,m_default_sample_size));
|
||||
btdyn->num_workers = this->m_workers;
|
||||
if (extra_data.size()) load_extra_data(extra_data);
|
||||
|
||||
if (extra_data.size())
|
||||
{
|
||||
load_extra_data(extra_data);
|
||||
}
|
||||
|
||||
#if 0
|
||||
// currently not used
|
||||
LexicalPhraseScorer2<Token>::table_t & COOC = calc_lex.scorer.COOC;
|
||||
typedef LexicalPhraseScorer2<Token>::table_t::Cell cell_t;
|
||||
@ -230,7 +292,8 @@ namespace Moses
|
||||
for (cell_t const* c = COOC[r].start; c < COOC[r].stop; ++c)
|
||||
wlex21[c->id].push_back(r);
|
||||
COOCraw.open(bname + L1 + "-" + L2 + ".coc");
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
@ -283,20 +346,28 @@ namespace Moses
|
||||
{
|
||||
PhrasePair pp;
|
||||
pp.init(pid1, stats, this->m_numScoreComponents);
|
||||
// if (this->m_numScoreComponents%2)
|
||||
// apply_pp(bt,pp);
|
||||
pstats::trg_map_t::const_iterator t;
|
||||
for (t = stats.trg.begin(); t != stats.trg.end(); ++t)
|
||||
{
|
||||
pp.update(t->first,t->second);
|
||||
calc_lex(bt,pp);
|
||||
if (withPfwd) calc_pfwd_fix(bt,pp);
|
||||
if (withPbwd) calc_pbwd_fix(bt,pp);
|
||||
if (withLogCountFeatures) add_logcounts_fix(bt,pp);
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
|
||||
(*ff)(bt,pp);
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
|
||||
(*ff)(bt,pp);
|
||||
tpcoll->Add(createTargetPhrase(src,bt,pp));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Mmsapt::
|
||||
ScorePPfix(bitext::PhrasePair& pp) const
|
||||
{
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
|
||||
(*ff)(btfix,pp);
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
|
||||
(*ff)(btfix,pp);
|
||||
}
|
||||
|
||||
// process phrase stats from a single parallel corpus
|
||||
bool
|
||||
Mmsapt::
|
||||
@ -318,8 +389,6 @@ namespace Moses
|
||||
pp.init(pid1b, *statsb, this->m_numScoreComponents);
|
||||
else return false; // throw "no stats for pooling available!";
|
||||
|
||||
// if (this->m_numScoreComponents%2)
|
||||
// apply_pp(bta,pp);
|
||||
pstats::trg_map_t::const_iterator b;
|
||||
pstats::trg_map_t::iterator a;
|
||||
if (statsb)
|
||||
@ -344,10 +413,10 @@ namespace Moses
|
||||
b->second);
|
||||
}
|
||||
else pp.update(b->first,b->second);
|
||||
calc_lex(btb,pp);
|
||||
if (withPfwd) calc_pfwd_fix(btb,pp);
|
||||
if (withPbwd) calc_pbwd_fix(btb,pp);
|
||||
if (withLogCountFeatures) add_logcounts_fix(btb,pp);
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
|
||||
(*ff)(btb,pp);
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
|
||||
(*ff)(btb,pp);
|
||||
tpcoll->Add(createTargetPhrase(src,btb,pp));
|
||||
}
|
||||
}
|
||||
@ -377,20 +446,19 @@ namespace Moses
|
||||
#endif
|
||||
|
||||
UTIL_THROW_IF2(pp.raw2 == 0,
|
||||
"OOPS"
|
||||
<< bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: "
|
||||
"OOPS" << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: "
|
||||
<< bta.T2->pid2str(bta.V2.get(),pp.p2) << ": "
|
||||
<< pp.raw1 << " " << pp.sample1 << " "
|
||||
<< pp.good1 << " " << pp.joint << " "
|
||||
<< pp.raw2);
|
||||
calc_lex(bta,pp);
|
||||
if (withPfwd) calc_pfwd_fix(bta,pp);
|
||||
if (withPbwd) calc_pbwd_fix(bta,pp);
|
||||
if (withLogCountFeatures) add_logcounts_fix(bta,pp);
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
|
||||
(*ff)(bta,pp);
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
|
||||
(*ff)(bta,pp);
|
||||
tpcoll->Add(createTargetPhrase(src,bta,pp));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// process phrase stats from a single parallel corpus
|
||||
@ -398,75 +466,81 @@ namespace Moses
|
||||
Mmsapt::
|
||||
combine_pstats
|
||||
(Phrase const& src,
|
||||
uint64_t const pid1a,
|
||||
pstats * statsa,
|
||||
Bitext<Token> const & bta,
|
||||
uint64_t const pid1b,
|
||||
pstats const* statsb,
|
||||
Bitext<Token> const & btb,
|
||||
TargetPhraseCollection* tpcoll
|
||||
) const
|
||||
uint64_t const pid1a, pstats * statsa, Bitext<Token> const & bta,
|
||||
uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb,
|
||||
TargetPhraseCollection* tpcoll) const
|
||||
{
|
||||
PhrasePair ppfix,ppdyn,pool;
|
||||
// ppfix: counts from btfix
|
||||
// ppdyn: counts from btdyn
|
||||
// pool: pooled counts from both
|
||||
Word w;
|
||||
if (statsa) ppfix.init(pid1a,*statsa,this->m_numScoreComponents);
|
||||
if (statsb) ppdyn.init(pid1b,*statsb,this->m_numScoreComponents);
|
||||
pstats::trg_map_t::const_iterator b;
|
||||
pstats::trg_map_t::iterator a;
|
||||
|
||||
if (statsb)
|
||||
{
|
||||
pool.init(pid1b,*statsb,0);
|
||||
// if (this->m_numScoreComponents%2)
|
||||
// apply_pp(btb,ppdyn);
|
||||
for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
|
||||
{
|
||||
ppdyn.update(b->first,b->second);
|
||||
if (withPfwd) calc_pfwd_dyn(btb,ppdyn);
|
||||
if (withPbwd) calc_pbwd_dyn(btb,ppdyn);
|
||||
if (withLogCountFeatures) add_logcounts_dyn(btb,ppdyn);
|
||||
calc_lex(btb,ppdyn);
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
|
||||
(*ff)(btb,ppdyn);
|
||||
|
||||
uint32_t sid,off,len;
|
||||
parse_pid(b->first, sid, off, len);
|
||||
Token const* x = bta.T2->sntStart(sid) + off;
|
||||
TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len);
|
||||
|
||||
if (m.size() && statsa &&
|
||||
((a = statsa->trg.find(m.getPid()))
|
||||
!= statsa->trg.end()))
|
||||
((a = statsa->trg.find(m.getPid())) != statsa->trg.end()))
|
||||
{
|
||||
// phrase pair found also in btfix
|
||||
ppfix.update(a->first,a->second);
|
||||
if (withPfwd) calc_pfwd_fix(bta,ppfix,&ppdyn.fvals);
|
||||
if (withPbwd) calc_pbwd_fix(bta,ppfix,&ppdyn.fvals);
|
||||
if (withLogCountFeatures) add_logcounts_fix(bta,ppfix,&ppdyn.fvals);
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
|
||||
(*ff)(bta,ppfix,&ppdyn.fvals);
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
|
||||
(*ff)(bta,ppfix,&ppdyn.fvals);
|
||||
a->second.invalidate();
|
||||
}
|
||||
else
|
||||
{
|
||||
if (m.size())
|
||||
pool.update(b->first,m.approxOccurrenceCount(),
|
||||
b->second);
|
||||
else
|
||||
// phrase pair was not found in btfix
|
||||
|
||||
// ... but the source phrase was
|
||||
if (m.size())
|
||||
pool.update(b->first,m.approxOccurrenceCount(), b->second);
|
||||
|
||||
// ... and not even the source phrase
|
||||
else
|
||||
pool.update(b->first,b->second);
|
||||
if (withPfwd) calc_pfwd_fix(btb,pool,&ppdyn.fvals);
|
||||
if (withPbwd) calc_pbwd_fix(btb,pool,&ppdyn.fvals);
|
||||
if (withLogCountFeatures) add_logcounts_fix(btb,pool,&ppdyn.fvals);
|
||||
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
|
||||
(*ff)(btb,pool,&ppdyn.fvals);
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
|
||||
(*ff)(btb,pool,&ppdyn.fvals);
|
||||
|
||||
}
|
||||
|
||||
tpcoll->Add(createTargetPhrase(src,btb,ppdyn));
|
||||
}
|
||||
}
|
||||
|
||||
// now deal with all phrase pairs that are ONLY in btfix
|
||||
// (the ones that are in both were dealt with above)
|
||||
if (statsa)
|
||||
{
|
||||
pool.init(pid1a,*statsa,0);
|
||||
// if (this->m_numScoreComponents%2)
|
||||
// apply_pp(bta,ppfix);
|
||||
for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
|
||||
{
|
||||
if (!a->second.valid()) continue; // done above
|
||||
ppfix.update(a->first,a->second);
|
||||
if (withPfwd) calc_pfwd_fix(bta,ppfix);
|
||||
if (withPbwd) calc_pbwd_fix(bta,ppfix);
|
||||
if (withLogCountFeatures) add_logcounts_fix(bta,ppfix);
|
||||
calc_lex(bta,ppfix);
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
|
||||
(*ff)(bta,ppfix);
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
|
||||
(*ff)(bta,ppfix);
|
||||
|
||||
if (btb.I2)
|
||||
{
|
||||
@ -480,9 +554,8 @@ namespace Moses
|
||||
pool.update(a->first,a->second);
|
||||
}
|
||||
else pool.update(a->first,a->second);
|
||||
if (withPfwd) calc_pfwd_dyn(bta,pool,&ppfix.fvals);
|
||||
if (withPbwd) calc_pbwd_dyn(bta,pool,&ppfix.fvals);
|
||||
if (withLogCountFeatures) add_logcounts_dyn(bta,pool,&ppfix.fvals);
|
||||
BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
|
||||
(*ff)(btb,pool,&ppfix.fvals);
|
||||
}
|
||||
if (ppfix.p2)
|
||||
tpcoll->Add(createTargetPhrase(src,bta,ppfix));
|
||||
@ -490,92 +563,6 @@ namespace Moses
|
||||
return (statsa || statsb);
|
||||
}
|
||||
|
||||
// // phrase statistics combination treating the two knowledge
|
||||
// // sources separately with backoff to pooling when only one
|
||||
// // of the two knowledge sources contains the phrase pair in
|
||||
// // question
|
||||
// void
|
||||
// Mmsapt::
|
||||
// process_pstats(uint64_t const mypid1,
|
||||
// uint64_t const otpid1,
|
||||
// pstats const& mystats, // my phrase stats
|
||||
// pstats const* otstats, // other phrase stats
|
||||
// Bitext<Token> const & mybt, // my bitext
|
||||
// Bitext<Token> const * otbt, // other bitext
|
||||
// PhraseScorer<Token> const& mypfwd,
|
||||
// PhraseScorer<Token> const& mypbwd,
|
||||
// PhraseScorer<Token> const* otpfwd,
|
||||
// PhraseScorer<Token> const* otpbwd,
|
||||
// TargetPhraseCollection* tpcoll)
|
||||
// {
|
||||
// boost::unordered_map<uint64_t,jstats>::const_iterator t;
|
||||
// vector<FactorType> ofact(1,0);
|
||||
// PhrasePair mypp,otpp,combo;
|
||||
// mypp.init(mypid1, mystats, this->m_numScoreComponents);
|
||||
// if (otstats)
|
||||
// {
|
||||
// otpp.init(otpid1, *otstats, 0);
|
||||
// combo.init(otpid1, mystats, *otstats, 0);
|
||||
// }
|
||||
// else combo = mypp;
|
||||
|
||||
// for (t = mystats.trg.begin(); t != mystats.trg.end(); ++t)
|
||||
// {
|
||||
// if (!t->second.valid()) continue;
|
||||
// // we dealt with this phrase pair already;
|
||||
// // see j->second.invalidate() below;
|
||||
// uint32_t sid,off,len; parse_pid(t->first,sid,off,len);
|
||||
|
||||
// mypp.update(t->first,t->second);
|
||||
// apply_pp(mybt,mypp);
|
||||
// calc_lex (mybt,mypp);
|
||||
// mypfwd(mybt,mypp);
|
||||
// mypbwd(mybt,mypp);
|
||||
|
||||
// if (otbt) // it's a dynamic phrase table
|
||||
// {
|
||||
// assert(otpfwd);
|
||||
// assert(otpbwd);
|
||||
// boost::unordered_map<uint64_t,jstats>::iterator j;
|
||||
|
||||
// // look up the current target phrase in the other bitext
|
||||
// Token const* x = mybt.T2->sntStart(sid) + off;
|
||||
// TSA<TOKEN>::tree_iterator m(otbt->I2.get(),x,x+len);
|
||||
// if (otstats // source phrase exists in other bitext
|
||||
// && m.size() // target phrase exists in other bitext
|
||||
// && ((j = otstats->trg.find(m.getPid()))
|
||||
// != otstats->trg.end())) // phrase pair found in other bitext
|
||||
// {
|
||||
// otpp.update(j->first,j->second);
|
||||
// j->second.invalidate(); // mark the phrase pair as seen
|
||||
// otpfwd(*otbt,otpp,&mypp.fvals);
|
||||
// otpbwd(*otbt,otpp,&mypp.fvals);
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// if (m.size()) // target phrase seen in other bitext, but not the phrase pair
|
||||
// combo.update(t->first,m.approxOccurrenceCount(),t->second);
|
||||
// else
|
||||
// combo.update(t->first,t->second);
|
||||
// (*otpfwd)(mybt,combo,&mypp.fvals);
|
||||
// (*otpbwd)(mybt,combo,&mypp.fvals);
|
||||
// }
|
||||
// }
|
||||
|
||||
// // now add the phrase pair to the TargetPhraseCollection:
|
||||
// TargetPhrase* tp = new TargetPhrase();
|
||||
// for (size_t k = off; k < stop; ++k)
|
||||
// {
|
||||
// StringPiece wrd = (*mybt.V2)[x[k].id()];
|
||||
// Word w; w.CreateFromString(Output,ofact,wrd,false);
|
||||
// tp->AddWord(w);
|
||||
// }
|
||||
// tp->GetScoreBreakdown().Assign(this,mypp.fvals);
|
||||
// tp->Evaluate(src);
|
||||
// tpcoll->Add(tp);
|
||||
// }
|
||||
// }
|
||||
|
||||
Mmsapt::
|
||||
TargetPhraseCollectionWrapper::
|
||||
TargetPhraseCollectionWrapper(size_t r, uint64_t k)
|
||||
|
@ -29,6 +29,7 @@
|
||||
#include <map>
|
||||
|
||||
#include "moses/TranslationModel/PhraseDictionary.h"
|
||||
#include "mmsapt_phrase_scorers.h"
|
||||
|
||||
// TO DO:
|
||||
// - make lexical phrase scorer take additions to the "dynamic overlay" into account
|
||||
@ -51,6 +52,7 @@ namespace Moses
|
||||
typedef mmBitext<Token> mmbitext;
|
||||
typedef imBitext<Token> imbitext;
|
||||
typedef TSA<Token> tsa;
|
||||
typedef PhraseScorer<Token> pscorer;
|
||||
private:
|
||||
mmbitext btfix;
|
||||
sptr<imbitext> btdyn;
|
||||
@ -58,30 +60,48 @@ namespace Moses
|
||||
string L1;
|
||||
string L2;
|
||||
float m_lbop_parameter;
|
||||
float m_lex_alpha;
|
||||
// alpha parameter for lexical smoothing (joint+alpha)/(marg + alpha)
|
||||
// must be > 0 if dynamic
|
||||
size_t m_default_sample_size;
|
||||
size_t m_workers; // number of worker threads for sampling the bitexts
|
||||
|
||||
// deprecated!
|
||||
char m_pfwd_denom; // denominator for computation of fwd phrase score:
|
||||
// 'r' - divide by raw count
|
||||
// 's' - divide by sample count
|
||||
// 'g' - divide by number of "good" (i.e. coherent) samples
|
||||
// size_t num_features;
|
||||
|
||||
size_t input_factor;
|
||||
size_t output_factor; // we can actually return entire Tokens!
|
||||
|
||||
bool withLogCountFeatures; // add logs of counts as features?
|
||||
bool withCoherence;
|
||||
string m_pfwd_features; // which pfwd functions to use
|
||||
vector<string> m_feature_names; // names of features activated
|
||||
vector<sptr<pscorer > > m_active_ff_fix; // activated feature functions (fix)
|
||||
vector<sptr<pscorer > > m_active_ff_dyn; // activated feature functions (dyn)
|
||||
vector<sptr<pscorer > > m_active_ff_common; // activated feature functions (common, i.e. not corpus-specific: lexical and coherence scorers)
|
||||
|
||||
size_t
|
||||
add_corpus_specific_features
|
||||
(vector<sptr<pscorer > >& ffvec, size_t num_feats);
|
||||
|
||||
// built-in feature functions
|
||||
PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn;
|
||||
PScorePbwd<Token> calc_pbwd_fix, calc_pbwd_dyn;
|
||||
PScoreLex<Token> calc_lex; // this one I'd like to see as an external ff eventually
|
||||
// PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn;
|
||||
// PScorePbwd<Token> calc_pbwd_fix, calc_pbwd_dyn;
|
||||
// PScoreLex<Token> calc_lex; // this one I'd like to see as an external ff eventually
|
||||
// PScorePP<Token> apply_pp; // apply phrase penalty
|
||||
PScoreLogCounts<Token> add_logcounts_fix;
|
||||
PScoreLogCounts<Token> add_logcounts_dyn;
|
||||
// PScoreLogCounts<Token> add_logcounts_fix;
|
||||
// PScoreLogCounts<Token> add_logcounts_dyn;
|
||||
void init(string const& line);
|
||||
mutable boost::mutex lock;
|
||||
bool withPbwd;
|
||||
bool poolCounts;
|
||||
bool withLogCountFeatures; // add logs of counts as features?
|
||||
bool withPfwd,withPbwd;
|
||||
vector<FactorType> ofactor;
|
||||
|
||||
|
||||
|
||||
public:
|
||||
// typedef boost::unordered_map<uint64_t, sptr<TargetPhraseCollection> > tpcoll_cache_t;
|
||||
class TargetPhraseCollectionWrapper
|
||||
@ -207,6 +227,12 @@ namespace Moses
|
||||
bool
|
||||
PrefixExists(Phrase const& phrase) const;
|
||||
|
||||
vector<string> const&
|
||||
GetFeatureNames() const;
|
||||
|
||||
void
|
||||
ScorePPfix(bitext::PhrasePair& pp) const;
|
||||
|
||||
private:
|
||||
};
|
||||
} // end namespace
|
||||
|
@ -127,6 +127,7 @@ namespace Moses
|
||||
Alignment::
|
||||
show(ostream& out, PhraseAlnHyp const& ah)
|
||||
{
|
||||
#if 0
|
||||
LexicalPhraseScorer2<Token>::table_t const&
|
||||
COOCjnt = PT.calc_lex.scorer.COOC;
|
||||
|
||||
@ -164,6 +165,7 @@ namespace Moses
|
||||
// << " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
|
||||
// << " other: " << obwdj[po_other]<<"/"<<obwdm[po_other]
|
||||
// << "]" << endl;
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
@ -283,9 +285,7 @@ namespace Moses
|
||||
psiter R = tpid2span.find(y->first);
|
||||
if (R == tpid2span.end()) continue;
|
||||
pp.update(y->first, y->second);
|
||||
PT.calc_lex(PT.btfix,pp);
|
||||
PT.calc_pfwd_fix(PT.btfix,pp);
|
||||
PT.calc_pbwd_fix(PT.btfix,pp);
|
||||
PT.ScorePPfix(pp);
|
||||
pp.eval(PT.feature_weights);
|
||||
PP.push_back(pp);
|
||||
BOOST_FOREACH(span const& sspan, L->second)
|
||||
@ -329,6 +329,7 @@ namespace Moses
|
||||
BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]);
|
||||
sptr<vector<int> > aln;
|
||||
return aln;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user