mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
486 lines
14 KiB
C++
486 lines
14 KiB
C++
#include "mmsapt.h"
|
|
#include <boost/foreach.hpp>
|
|
#include <boost/tokenizer.hpp>
|
|
|
|
namespace Moses
|
|
{
|
|
using namespace bitext;
|
|
using namespace std;
|
|
using namespace boost;
|
|
|
|
void
|
|
parseLine(string const& line, map<string,string> & params)
|
|
{
|
|
char_separator<char> sep("; ");
|
|
tokenizer<char_separator<char> > tokens(line,sep);
|
|
BOOST_FOREACH(string const& t,tokens)
|
|
{
|
|
size_t i = t.find_first_not_of(" =");
|
|
size_t j = t.find_first_of(" =",i+1);
|
|
size_t k = t.find_first_not_of(" =",j+1);
|
|
assert(i != string::npos);
|
|
assert(k != string::npos);
|
|
params[t.substr(i,j)] = t.substr(k);
|
|
}
|
|
}
|
|
|
|
// Construct from an explicit description plus the configuration line.
// Both are forwarded to the PhraseDictionary base class, /ofactor/ is
// constructed with (1,0), and the line is then parsed by init().
Mmsapt::
Mmsapt(string const& description, string const& line)
  : PhraseDictionary(description, line)
  , ofactor(1,0)
{
  init(line);
}
|
|
|
|
// Construct from the configuration line only; the description passed to
// the PhraseDictionary base class is the fixed string "Mmsapt". /ofactor/
// is constructed with (1,0) and the line is then parsed by init().
Mmsapt::
Mmsapt(string const& line)
  : PhraseDictionary("Mmsapt", line)
  , ofactor(1,0)
{
  init(line);
}
|
|
|
|
void
|
|
Mmsapt::
|
|
init(string const& line)
|
|
{
|
|
map<string,string> param;
|
|
parseLine(line,param);
|
|
bname = param["base"];
|
|
L1 = param["L1"];
|
|
L2 = param["L2"];
|
|
assert(bname.size());
|
|
assert(L1.size());
|
|
assert(L2.size());
|
|
map<string,string>::const_iterator m;
|
|
m = param.find("smooth");
|
|
lbop_parameter = m != param.end() ? atof(m->second.c_str()) : .05;
|
|
m = param.find("max-samples");
|
|
default_sample_size = m != param.end() ? atoi(m->second.c_str()) : 1000;
|
|
this->m_numScoreComponents = atoi(param["num-features"].c_str());
|
|
// num_features = 0;
|
|
m = param.find("ifactor");
|
|
input_factor = m != param.end() ? atoi(m->second.c_str()) : 0;
|
|
poolCounts = true;
|
|
}
|
|
|
|
void
|
|
Mmsapt::
|
|
Load()
|
|
{
|
|
btfix.open(bname, L1, L2);
|
|
size_t num_feats;
|
|
// TO DO: should we use different lbop parameters
|
|
// for the relative-frequency based features?
|
|
num_feats = calc_pfwd_fix.init(0,lbop_parameter);
|
|
num_feats = calc_pbwd_fix.init(num_feats,lbop_parameter);
|
|
num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
|
|
num_feats = apply_pp.init(num_feats);
|
|
if (num_feats < this->m_numScoreComponents)
|
|
{
|
|
poolCounts = false;
|
|
num_feats = calc_pfwd_dyn.init(num_feats,lbop_parameter);
|
|
num_feats = calc_pbwd_dyn.init(num_feats,lbop_parameter);
|
|
}
|
|
btdyn.reset(new imBitext<Token>(btfix.V1, btfix.V2));
|
|
if (num_feats != this->m_numScoreComponents)
|
|
{
|
|
ostringstream buf;
|
|
buf << "At " << __FILE__ << ":" << __LINE__
|
|
<< ": number of feature values provided by Phrase table"
|
|
<< " does not match number specified in Moses config file!";
|
|
throw buf.str().c_str();
|
|
}
|
|
// cerr << "MMSAPT provides " << num_feats << " features at "
|
|
// << __FILE__ << ":" << __LINE__ << endl;
|
|
}
|
|
|
|
void
|
|
Mmsapt::
|
|
add(string const& s1, string const& s2, string const& a)
|
|
{
|
|
vector<string> S1(1,s1);
|
|
vector<string> S2(1,s2);
|
|
vector<string> ALN(1,a);
|
|
boost::lock_guard<boost::mutex> guard(this->lock);
|
|
btdyn = btdyn->add(S1,S2,ALN);
|
|
}
|
|
|
|
|
|
// Build a TargetPhrase for the phrase pair /pp/.
//
// The target phrase id pp.p2 is decoded into sentence id, offset and
// length; the corresponding tokens are read from bt.T2 and converted to
// strings via bt.V2. The feature values pp.fvals are assigned to the
// score breakdown and the phrase is evaluated against /src/.
//
// Returns a TargetPhrase allocated with new; the callers in this file
// pass it straight to TargetPhraseCollection::Add().
TargetPhrase*
Mmsapt::
createTargetPhrase(Phrase const& src,
                   Bitext<Token> const& bt,
                   PhrasePair const& pp) const
{
  Word w; // deliberately reused for every token, as before
  uint32_t sid, off, len;
  TargetPhrase* tp = new TargetPhrase();
  parse_pid(pp.p2, sid, off, len);

  Token const* const first = bt.T2->sntStart(sid) + off;
  Token const* const stop  = first + len;
  for (Token const* tok = first; tok != stop; ++tok)
    {
      StringPiece wrd = (*bt.V2)[tok->id()];
      w.CreateFromString(Output,ofactor,wrd,false);
      tp->AddWord(w);
    }

  tp->GetScoreBreakdown().Assign(this, pp.fvals);
  tp->Evaluate(src);
  return tp;
}
|
|
|
|
// process phrase stats from a single parallel corpus
|
|
void
|
|
Mmsapt::
|
|
process_pstats
|
|
(Phrase const& src,
|
|
uint64_t const pid1,
|
|
pstats const& stats,
|
|
Bitext<Token> const & bt,
|
|
TargetPhraseCollection* tpcoll
|
|
) const
|
|
{
|
|
PhrasePair pp;
|
|
pp.init(pid1, stats, this->m_numScoreComponents);
|
|
apply_pp(bt,pp);
|
|
boost::unordered_map<uint64_t,jstats>::const_iterator t;
|
|
for (t = stats.trg.begin(); t != stats.trg.end(); ++t)
|
|
{
|
|
pp.update(t->first,t->second);
|
|
calc_lex(bt,pp);
|
|
calc_pfwd_fix(bt,pp);
|
|
calc_pbwd_fix(bt,pp);
|
|
tpcoll->Add(createTargetPhrase(src,bt,pp));
|
|
}
|
|
}
|
|
|
|
// process phrase stats from a single parallel corpus
|
|
bool
|
|
Mmsapt::
|
|
pool_pstats(Phrase const& src,
|
|
uint64_t const pid1a,
|
|
pstats * statsa,
|
|
Bitext<Token> const & bta,
|
|
uint64_t const pid1b,
|
|
pstats const* statsb,
|
|
Bitext<Token> const & btb,
|
|
TargetPhraseCollection* tpcoll) const
|
|
{
|
|
PhrasePair pp;
|
|
if (statsa && statsb)
|
|
pp.init(pid1b, *statsa, *statsb, this->m_numScoreComponents);
|
|
else if (statsa)
|
|
pp.init(pid1b, *statsa, this->m_numScoreComponents);
|
|
else if (statsb)
|
|
pp.init(pid1b, *statsb, this->m_numScoreComponents);
|
|
else return false; // throw "no stats for pooling available!";
|
|
|
|
apply_pp(bta,pp);
|
|
boost::unordered_map<uint64_t,jstats>::const_iterator b;
|
|
boost::unordered_map<uint64_t,jstats>::iterator a;
|
|
if (statsb)
|
|
{
|
|
for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
|
|
{
|
|
uint32_t sid,off,len;
|
|
parse_pid(b->first, sid, off, len);
|
|
Token const* x = bta.T2->sntStart(sid) + off;
|
|
TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len);
|
|
if (m.size() == len)
|
|
{
|
|
;
|
|
if (statsa && ((a = statsa->trg.find(m.getPid()))
|
|
!= statsa->trg.end()))
|
|
{
|
|
pp.update(b->first,a->second,b->second);
|
|
a->second.invalidate();
|
|
}
|
|
else
|
|
pp.update(b->first,m.approxOccurrenceCount(),
|
|
b->second);
|
|
}
|
|
else pp.update(b->first,b->second);
|
|
calc_lex(btb,pp);
|
|
calc_pfwd_fix(btb,pp);
|
|
calc_pbwd_fix(btb,pp);
|
|
tpcoll->Add(createTargetPhrase(src,btb,pp));
|
|
}
|
|
}
|
|
if (!statsa) return statsb != NULL;
|
|
for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
|
|
{
|
|
uint32_t sid,off,len;
|
|
if (!a->second.valid()) continue;
|
|
parse_pid(a->first, sid, off, len);
|
|
if (btb.T2)
|
|
{
|
|
Token const* x = btb.T2->sntStart(sid) + off;
|
|
TSA<Token>::tree_iterator m(btb.I2.get(), x, x+len);
|
|
if (m.size() == len)
|
|
pp.update(a->first,m.approxOccurrenceCount(),a->second);
|
|
else
|
|
pp.update(a->first,a->second);
|
|
}
|
|
else
|
|
pp.update(a->first,a->second);
|
|
calc_lex(bta,pp);
|
|
calc_pfwd_fix(bta,pp);
|
|
calc_pbwd_fix(bta,pp);
|
|
tpcoll->Add(createTargetPhrase(src,bta,pp));
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
// process phrase stats from a single parallel corpus
|
|
bool
|
|
Mmsapt::
|
|
combine_pstats
|
|
(Phrase const& src,
|
|
uint64_t const pid1a,
|
|
pstats * statsa,
|
|
Bitext<Token> const & bta,
|
|
uint64_t const pid1b,
|
|
pstats const* statsb,
|
|
Bitext<Token> const & btb,
|
|
TargetPhraseCollection* tpcoll
|
|
) const
|
|
{
|
|
PhrasePair ppfix,ppdyn,pool;
|
|
Word w;
|
|
if (statsa) ppfix.init(pid1a,*statsa,this->m_numScoreComponents);
|
|
if (statsb) ppdyn.init(pid1b,*statsb,this->m_numScoreComponents);
|
|
boost::unordered_map<uint64_t,jstats>::const_iterator b;
|
|
boost::unordered_map<uint64_t,jstats>::iterator a;
|
|
if (statsb)
|
|
{
|
|
pool.init(pid1b,*statsb,0);
|
|
apply_pp(btb,ppdyn);
|
|
for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
|
|
{
|
|
ppdyn.update(b->first,b->second);
|
|
calc_pfwd_dyn(btb,ppdyn);
|
|
calc_pbwd_dyn(btb,ppdyn);
|
|
calc_lex(btb,ppdyn);
|
|
|
|
uint32_t sid,off,len;
|
|
parse_pid(b->first, sid, off, len);
|
|
Token const* x = bta.T2->sntStart(sid) + off;
|
|
TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len);
|
|
if (m.size() && statsa &&
|
|
((a = statsa->trg.find(m.getPid()))
|
|
!= statsa->trg.end()))
|
|
{
|
|
ppfix.update(a->first,a->second);
|
|
calc_pfwd_fix(bta,ppfix,&ppdyn.fvals);
|
|
calc_pbwd_fix(btb,ppfix,&ppdyn.fvals);
|
|
a->second.invalidate();
|
|
}
|
|
else
|
|
{
|
|
if (m.size())
|
|
pool.update(b->first,m.approxOccurrenceCount(),
|
|
b->second);
|
|
else
|
|
pool.update(b->first,b->second);
|
|
calc_pfwd_fix(btb,pool,&ppdyn.fvals);
|
|
calc_pbwd_fix(btb,pool,&ppdyn.fvals);
|
|
}
|
|
tpcoll->Add(createTargetPhrase(src,btb,ppdyn));
|
|
}
|
|
}
|
|
if (statsa)
|
|
{
|
|
pool.init(pid1a,*statsa,0);
|
|
apply_pp(bta,ppfix);
|
|
for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
|
|
{
|
|
if (!a->second.valid()) continue; // done above
|
|
ppfix.update(a->first,a->second);
|
|
calc_pfwd_fix(bta,ppfix);
|
|
calc_pbwd_fix(bta,ppfix);
|
|
calc_lex(bta,ppfix);
|
|
|
|
uint32_t sid,off,len;
|
|
parse_pid(a->first, sid, off, len);
|
|
Token const* x = btb.T2->sntStart(sid) + off;
|
|
TSA<Token>::tree_iterator m(btb.I2.get(),x,x+len);
|
|
if (m.size())
|
|
pool.update(a->first,m.approxOccurrenceCount(),a->second);
|
|
else
|
|
pool.update(a->first,a->second);
|
|
calc_pfwd_dyn(bta,pool,&ppfix.fvals);
|
|
calc_pbwd_dyn(bta,pool,&ppfix.fvals);
|
|
}
|
|
tpcoll->Add(createTargetPhrase(src,bta,ppfix));
|
|
}
|
|
return (statsa || statsb);
|
|
}
|
|
|
|
// // phrase statistics combination treating the two knowledge
|
|
// // sources separately with backoff to pooling when only one
|
|
// // of the two knowledge sources contains the phrase pair in
|
|
// // question
|
|
// void
|
|
// Mmsapt::
|
|
// process_pstats(uint64_t const mypid1,
|
|
// uint64_t const otpid1,
|
|
// pstats const& mystats, // my phrase stats
|
|
// pstats const* otstats, // other phrase stats
|
|
// Bitext<Token> const & mybt, // my bitext
|
|
// Bitext<Token> const * otbt, // other bitext
|
|
// PhraseScorer<Token> const& mypfwd,
|
|
// PhraseScorer<Token> const& mypbwd,
|
|
// PhraseScorer<Token> const* otpfwd,
|
|
// PhraseScorer<Token> const* otpbwd,
|
|
// TargetPhraseCollection* tpcoll)
|
|
// {
|
|
// boost::unordered_map<uint64_t,jstats>::const_iterator t;
|
|
// vector<FactorType> ofact(1,0);
|
|
// PhrasePair mypp,otpp,combo;
|
|
// mypp.init(mypid1, mystats, this->m_numScoreComponents);
|
|
// if (otstats)
|
|
// {
|
|
// otpp.init(otpid1, *otstats, 0);
|
|
// combo.init(otpid1, mystats, *otstats, 0);
|
|
// }
|
|
// else combo = mypp;
|
|
|
|
// for (t = mystats.trg.begin(); t != mystats.trg.end(); ++t)
|
|
// {
|
|
// if (!t->second.valid()) continue;
|
|
// // we dealt with this phrase pair already;
|
|
// // see j->second.invalidate() below;
|
|
// uint32_t sid,off,len; parse_pid(t->first,sid,off,len);
|
|
|
|
// mypp.update(t->first,t->second);
|
|
// apply_pp(mybt,mypp);
|
|
// calc_lex (mybt,mypp);
|
|
// mypfwd(mybt,mypp);
|
|
// mypbwd(mybt,mypp);
|
|
|
|
// if (otbt) // it's a dynamic phrase table
|
|
// {
|
|
// assert(otpfwd);
|
|
// assert(otpbwd);
|
|
// boost::unordered_map<uint64_t,jstats>::iterator j;
|
|
|
|
// // look up the current target phrase in the other bitext
|
|
// Token const* x = mybt.T2->sntStart(sid) + off;
|
|
// TSA<TOKEN>::tree_iterator m(otbt->I2.get(),x,x+len);
|
|
// if (otstats // source phrase exists in other bitext
|
|
// && m.size() // target phrase exists in other bitext
|
|
// && ((j = otstats->trg.find(m.getPid()))
|
|
// != otstats->trg.end())) // phrase pair found in other bitext
|
|
// {
|
|
// otpp.update(j->first,j->second);
|
|
// j->second.invalidate(); // mark the phrase pair as seen
|
|
// otpfwd(*otbt,otpp,&mypp.fvals);
|
|
// otpbwd(*otbt,otpp,&mypp.fvals);
|
|
// }
|
|
// else
|
|
// {
|
|
// if (m.size()) // target phrase seen in other bitext, but not the phrase pair
|
|
// combo.update(t->first,m.approxOccurrenceCount(),t->second);
|
|
// else
|
|
// combo.update(t->first,t->second);
|
|
// (*otpfwd)(mybt,combo,&mypp.fvals);
|
|
// (*otpbwd)(mybt,combo,&mypp.fvals);
|
|
// }
|
|
// }
|
|
|
|
// // now add the phrase pair to the TargetPhraseCollection:
|
|
// TargetPhrase* tp = new TargetPhrase();
|
|
// for (size_t k = off; k < stop; ++k)
|
|
// {
|
|
// StringPiece wrd = (*mybt.V2)[x[k].id()];
|
|
// Word w; w.CreateFromString(Output,ofact,wrd,false);
|
|
// tp->AddWord(w);
|
|
// }
|
|
// tp->GetScoreBreakdown().Assign(this,mypp.fvals);
|
|
// tp->Evaluate(src);
|
|
// tpcoll->Add(tp);
|
|
// }
|
|
// }
|
|
|
|
// This is not the most efficient way of phrase lookup!
|
|
TargetPhraseCollection const*
|
|
Mmsapt::
|
|
GetTargetPhraseCollectionLEGACY(const Phrase& src) const
|
|
{
|
|
TargetPhraseCollection* ret = new TargetPhraseCollection();
|
|
|
|
// Reserve a local copy of the dynamic bitext in its current form. /btdyn/
|
|
// is set to a new copy of the dynamic bitext every time a sentence pair
|
|
// is added. /dyn/ keeps the old bitext around as long as we need it.
|
|
sptr<imBitext<Token> > dyn;
|
|
{ // braces are needed for scoping mutex lock guard!
|
|
boost::lock_guard<boost::mutex> guard(this->lock);
|
|
dyn = btdyn;
|
|
}
|
|
|
|
vector<id_type> sphrase(src.GetSize());
|
|
for (size_t i = 0; i < src.GetSize(); ++i)
|
|
{
|
|
Factor const* f = src.GetFactor(i,input_factor);
|
|
id_type wid = (*btfix.V1)[f->ToString()];
|
|
sphrase[i] = wid;
|
|
}
|
|
|
|
TSA<Token>::tree_iterator mfix(btfix.I1.get()), mdyn(dyn->I1.get());
|
|
for (size_t i = 0; mfix.size() == i && i < sphrase.size(); ++i)
|
|
mfix.extend(sphrase[i]);
|
|
|
|
if (dyn->I1.get())
|
|
{
|
|
for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i)
|
|
mdyn.extend(sphrase[i]);
|
|
}
|
|
|
|
sptr<pstats> sfix,sdyn;
|
|
if (mfix.size() == sphrase.size())
|
|
{
|
|
// do we need this lock here?
|
|
// Is it used here to control the total number of running threads???
|
|
boost::lock_guard<boost::mutex> guard(this->lock);
|
|
sfix = btfix.lookup(mfix);
|
|
}
|
|
if (mdyn.size() == sphrase.size())
|
|
sdyn = dyn->lookup(mdyn);
|
|
if (poolCounts)
|
|
{
|
|
if (!pool_pstats(src, mfix.getPid(),sfix.get(),btfix,
|
|
mdyn.getPid(),sdyn.get(),*dyn,ret))
|
|
return NULL;
|
|
}
|
|
else if (!combine_pstats(src, mfix.getPid(),sfix.get(),btfix,
|
|
mdyn.getPid(),sdyn.get(),*dyn,ret))
|
|
return NULL;
|
|
ret->NthElement(m_tableLimit);
|
|
#if 0
|
|
sort(ret->begin(), ret->end(), CompareTargetPhrase());
|
|
cout << "SOURCE PHRASE: " << src << endl;
|
|
size_t i = 0;
|
|
for (TargetPhraseCollection::iterator r = ret->begin(); r != ret->end(); ++r)
|
|
{
|
|
cout << ++i << " " << **r << endl;
|
|
}
|
|
#endif
|
|
return ret;
|
|
}
|
|
|
|
// Chart-based rule lookup is not implemented for this phrase table:
// the function never returns and always throws a char const* message.
ChartRuleLookupManager*
Mmsapt::
CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &)
{
  char const* const not_supported
    = "CreateRuleLookupManager is currently not supported in Mmsapt!";
  throw not_supported;
}
|
|
|
|
template<typename Token>
|
|
void
|
|
fill_token_seq(TokenIndex& V, string const& line, vector<Token>& dest)
|
|
{
|
|
istringstream buf(line); string w;
|
|
while (buf>>w) dest.push_back(Token(V[w]));
|
|
}
|
|
|
|
|
|
}
|