Put alignment functionality into a separate class. Not working yet --- work in progress!

This commit is contained in:
Ulrich Germann 2014-03-10 12:03:27 +00:00
parent ff4ce426e7
commit aa8ba7d9a7

View File

@ -76,10 +76,25 @@ namespace Moses
if (next->s1 > e1) return po_jfwd;
return po_other;
}
float
dprob_fwd(PhraseAlnHyp const& next)
{
return pp.dfwd[po_fwd(&next)];
}
float
dprob_bwd(PhraseAlnHyp const& prev)
{
return pp.dbwd[po_bwd(&prev)];
}
};
class Alignment
{
typedef L2R_Token<SimpleWordId> Token;
typedef TSA<Token> tsa;
typedef pair<uint32_t, uint32_t> span;
typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID
typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t;
@ -89,24 +104,68 @@ namespace Moses
vector<id_type> s,t;
pidmap_t sspan2pid, tspan2pid; // span -> phrase ID
pid2span_t spid2span,tpid2span;
vector<vector<sptr<pstats> > > spstats(s.size());
vector<vector<sptr<pstats> > > spstats;
vector<PhrasePair> PP;
// position-independent phrase pair info
public:
vector<PhraseAlnHyp> PAH;
vector<vector<int> > tpos2ahyp(t.size());
vector<vector<int> > tpos2ahyp;
// maps from target start positions to PhraseAlnHyps starting at
// that position
sptr<pstats> getPstats(span const& sspan);
void fill_tspan_maps();
void fill_sspan_maps();
void show(ostream& out);
public:
Alignment(Mmsapt const& pt, string const& src, string const& trg);
void show(ostream& out);
void show(ostream& out, PhraseAlnHyp const& ah);
};
void
Alignment::
show(ostream& out, PhraseAlnHyp const& ah)
{
LexicalPhraseScorer2<Token>::table_t const&
COOCjnt = PT.calc_lex.scorer.COOC;
out << setw(10) << exp(ah.score) << " "
<< PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2)
<< " <=> "
<< PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1);
vector<uchar> const& a = ah.pp.aln;
// BOOST_FOREACH(int x,a) cout << "[" << x << "] ";
for (size_t u = 0; u+1 < a.size(); u += 2)
out << " " << int(a[u+1]) << "-" << int(a[u]);
if (ah.e2-ah.s2 == 1 and ah.e1-ah.s1 == 1)
out << " " << COOCjnt[s[ah.s1]][t[ah.s2]]
<< "/" << PT.COOCraw[s[ah.s1]][t[ah.s2]]
<< "=" << float(COOCjnt[s[ah.s1]][t[ah.s2]])/PT.COOCraw[s[ah.s1]][t[ah.s2]];
out << endl;
// float const* ofwdj = ah.pp.dfwd;
// float const* obwdj = ah.pp.dbwd;
// uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
// uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
// out << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
// << " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
// << " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
// << " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
// << " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
// << " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
// << " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other]
// << "]" << endl
// << " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
// << " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
// << " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
// << " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
// << " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
// << " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
// << " other: " << obwdj[po_other]<<"/"<<obwdm[po_other]
// << "]" << endl;
}
void
Alignment::
show(ostream& out)
@ -117,38 +176,7 @@ namespace Moses
VectorIndexSorter<PhraseAlnHyp> foo(PAH);
sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo);
for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h)
{
PhraseAlnHyp const& ah = PAH[tpos2ahyp[s2][h]];
// pstats const & s = *ah.ps;
out << setw(10) << exp(ah.score) << " "
<< btfix.T2->pid2str(btfix.V2.get(), ah.pp.p2)
<< " <=> "
<< btfix.T1->pid2str(btfix.V1.get(), ah.pp.p1);
vector<uchar> const& a = ah.pp.aln;
for (size_t u = 0; u +1 < a.size(); ++u)
out << " " << int(a[u+1]) << "-" << int(a[u]);
out << endl;
float const* ofwdj = ah.pp.dfwd;
float const* obwdj = ah.pp.dbwd;
uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
out << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
<< " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
<< " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
<< " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
<< " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
<< " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
<< " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other]
<< "]" << endl
<< " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
<< " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
<< " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
<< " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
<< " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
<< " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
<< " other: " << obwdj[po_other]<<"/"<<obwdm[po_other]
<< "]" << endl;
}
show(out,PAH[tpos2ahyp[s2][h]]);
}
}
@ -187,7 +215,7 @@ namespace Moses
spstats.resize(s.size());
for (size_t i = 0; i < s.size(); ++i)
{
tsa::tree_iterator m(btfix.I1.get());
tsa::tree_iterator m(PT.btfix.I1.get());
for (size_t k = i; k < s.size() && m.extend(s[k]); ++k)
{
uint64_t pid = m.getPid();
@ -199,7 +227,13 @@ namespace Moses
int y = p->second[0].second-1;
spstats[i].push_back(spstats[x][y-x]);
}
else spstats[i].push_back(btfix.lookup(m));
else
{
spstats[i].push_back(PT.btfix.lookup(m));
cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " "
<< spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt
<< endl;
}
spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
}
}
@ -211,9 +245,28 @@ namespace Moses
{
PT.btfix.V1->fillIdSeq(src,s);
PT.btfix.V2->fillIdSeq(trg,t);
// LexicalPhraseScorer2<Token>::table_t const& COOC = PT.calc_lex.scorer.COOC;
// BOOST_FOREACH(id_type i, t)
// {
// cout << (*PT.btfix.V2)[i];
// if (i < PT.wlex21.size())
// {
// BOOST_FOREACH(id_type k, PT.wlex21[i])
// {
// size_t j = COOC[k][i];
// size_t m1 = COOC.m1(k);
// size_t m2 = COOC.m2(i);
// if (j*1000 > m1 && j*1000 > m2)
// cout << " " << (*PT.btfix.V1)[k];
// }
// }
// cout << endl;
// }
fill_tspan_maps();
fill_sspan_maps();
tpos2ahyp.resize(t.size());
// now fill the association score table
PAH.reserve(1000000);
typedef pid2span_t::iterator psiter;
@ -223,21 +276,21 @@ namespace Moses
int i = L->second[0].first;
int k = L->second[0].second - i -1;
sptr<pstats> ps = spstats[i][k];
PhrasePair pp; pp.init(L->first,*ps, this->m_numScoreComponents);
PhrasePair pp; pp.init(L->first,*ps, PT.m_numScoreComponents);
jStatsTable & J = ps->trg;
for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y)
{
psiter R = tpid2span.find(y->first);
if (R == tpid2span.end()) continue;
pp.update(y->first, y->second);
calc_lex(btfix,pp);
calc_pfwd_fix(btfix,pp);
calc_pbwd_fix(btfix,pp);
pp.eval(this->feature_weights);
PT.calc_lex(PT.btfix,pp);
PT.calc_pfwd_fix(PT.btfix,pp);
PT.calc_pbwd_fix(PT.btfix,pp);
pp.eval(PT.feature_weights);
PP.push_back(pp);
BOOST_FOREACH(span const& sspan, L->second)
{
BOOST_FOREACH(xspan const& tspan, R->second)
BOOST_FOREACH(span const& tspan, R->second)
{
tpos2ahyp[tspan.first].push_back(PAH.size());
PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan));
@ -254,11 +307,14 @@ namespace Moses
{
if ((PAH[edge].scov & PAH[next].scov).count())
return -1;
int ret = PAH.size();
PAH.push_back(PAH[next]);
PhraseAlnHyp & h = PAH.back();
h.prev = edge;
h.scov |= PAH[edge].scov;
h.score += log(
h.score += log(PAH[edge].dprob_fwd(PAH[next]));
h.score += log(PAH[next].dprob_bwd(PAH[edge]));
return ret;
}
sptr<vector<int> >
@ -267,7 +323,10 @@ namespace Moses
{
// For the time being, we consult only the fixed bitext.
// We might also consider the dynamic bitext. => TO DO.
Alignment A(*this,src,trg);
VectorIndexSorter<PhraseAlnHyp> foo(A.PAH);
vector<size_t> o; foo.GetOrder(o);
BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]);
sptr<vector<int> > aln;
return aln;
}