mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-21 08:07:14 +03:00
Routine check-in.
This commit is contained in:
parent
2b19b71095
commit
81ed9937e1
@ -413,7 +413,11 @@ namespace Moses
|
||||
ushort const L1, ushort const R1, // limits of previous phrase
|
||||
ushort & s1, ushort & e1, ushort& e2) // start/end src; end trg
|
||||
{
|
||||
if (a1[seed].size() == 0) return false;
|
||||
if (a2[s2].size() == 0)
|
||||
{
|
||||
cout << __FILE__ << ":" << __LINE__ << endl;
|
||||
return false;
|
||||
}
|
||||
bitvector done1(a1.size());
|
||||
bitvector done2(a2.size());
|
||||
vector <pair<ushort,ushort> > agenda;
|
||||
@ -423,7 +427,11 @@ namespace Moses
|
||||
agenda.push_back(pair<ushort,ushort>(2,s2));
|
||||
e2 = s2;
|
||||
s1 = e1 = a2[s2].front();
|
||||
if (s1 >= L1 && s1 < R1) return false;
|
||||
if (s1 >= L1 && s1 < R1)
|
||||
{
|
||||
cout << __FILE__ << ":" << __LINE__ << endl;
|
||||
return false;
|
||||
}
|
||||
agenda.push_back(pair<ushort,ushort>(2,s2));
|
||||
while (agenda.size())
|
||||
{
|
||||
@ -435,7 +443,11 @@ namespace Moses
|
||||
done1.set(p);
|
||||
BOOST_FOREACH(ushort i, a1[p])
|
||||
{
|
||||
if (i < s2) return false;
|
||||
if (i < s2)
|
||||
{
|
||||
// cout << __FILE__ << ":" << __LINE__ << endl;
|
||||
return false;
|
||||
}
|
||||
if (done2[i]) continue;
|
||||
for (;e2 <= i;++e2)
|
||||
if (!done2[e2])
|
||||
@ -447,16 +459,22 @@ namespace Moses
|
||||
done2.set(p);
|
||||
BOOST_FOREACH(ushort i, a2[p])
|
||||
{
|
||||
if ((e1 < L1 && i >= L1) || (s1 >= R1 && i < R1) || (i >= L1 && i < R1))
|
||||
{
|
||||
// cout << __FILE__ << ":" << __LINE__ << " "
|
||||
// << L1 << "-" << R1 << " " << i << " "
|
||||
// << s1 << "-" << e1<< endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (e1 < i)
|
||||
{
|
||||
if (i >= L1) return false;
|
||||
for (; e1 <= i; ++e1)
|
||||
if (!done1[e1])
|
||||
agenda.push_back(pair<ushort,ushort>(1,e1));
|
||||
}
|
||||
else
|
||||
else if (s1 > i)
|
||||
{
|
||||
if (i <= R1) return false;
|
||||
for (; i <= s1; ++i)
|
||||
if (!done1[i])
|
||||
agenda.push_back(pair<ushort,ushort>(1,i));
|
||||
@ -466,6 +484,7 @@ namespace Moses
|
||||
}
|
||||
++e1;
|
||||
++e2;
|
||||
return true;
|
||||
}
|
||||
// s1 = seed;
|
||||
// e1 = seed;
|
||||
@ -547,6 +566,36 @@ namespace Moses
|
||||
// return true;
|
||||
// }
|
||||
|
||||
/// Debugging aid: print a word alignment matrix to stdout.
///
/// @param a1    for each row position j, the list of column positions that
///              j is aligned to (taken by value, as in the original
///              interface)
/// @param len2  number of columns to print
/// @param b1,e1 row range [b1,e1) of the highlighted rectangle
/// @param b2,e2 column range [b2,e2) of the highlighted rectangle
///
/// Output format: a "b1-e1 b2-e2" line, a header with the last digit of
/// each column index, one line per row, and a closing rule of 90 dashes.
/// Inside the highlighted rectangle, aligned cells are shown as 'x' and
/// empty cells as '-'; outside of it, as 'o' and '.', respectively.
void
print_amatrix(std::vector<std::vector<ushort> > a1, uint32_t len2,
              ushort b1, ushort e1, ushort b2, ushort e2)
{
  // M[r][c] != 0 <=> row r is aligned to column c
  std::vector<std::vector<char> > M(a1.size(), std::vector<char>(len2, 0));
  // size_t counters: a ushort counter would wrap around (and never
  // terminate) for more than 65535 rows
  for (std::size_t j = 0; j < a1.size(); ++j)
    {
      for (std::size_t x = 0; x < a1[j].size(); ++x)
        {
          ushort const k = a1[j][x];
          // ignore links that point beyond the requested matrix width
          // instead of writing out of range
          if (k < len2) M[j][k] = 1;
        }
    }

  std::cout << b1 << "-" << e1 << " " << b2 << "-" << e2 << std::endl;

  // Column header; the 4-character indent matches the width of the row
  // prefix below (setw(3) plus one space), so digits line up with cells.
  std::cout << "    ";
  for (std::size_t c = 0; c < len2; ++c)
    std::cout << c%10;
  std::cout << std::endl;

  for (std::size_t r = 0; r < M.size(); ++r)
    {
      std::cout << std::setw(3) << r << " ";
      bool const row_inside = (b1 <= r) && (r < e1);
      for (std::size_t c = 0; c < M[r].size(); ++c)
        {
          bool const inside = row_inside && (b2 <= c) && (c < e2);
          if (inside)
            std::cout << (M[r][c] ? 'x' : '-');
          else
            std::cout << (M[r][c] ? 'o' : '.');
        }
      std::cout << std::endl;
    }
  std::cout << std::string(90,'-') << std::endl;
}
|
||||
|
||||
|
||||
PhraseOrientation
|
||||
find_po_fwd(vector<vector<ushort> >& a1,
|
||||
vector<vector<ushort> >& a2,
|
||||
@ -559,13 +608,14 @@ namespace Moses
|
||||
if (n2 == a2.size())
|
||||
return po_last;
|
||||
|
||||
ushort ns1,ns2,ne1,ne2;
|
||||
bool OK = expand_phrase_pair(a1,a2,n2,b1,e1-1,ns1,ne1,ne2);
|
||||
|
||||
if (!OK) return po_other;
|
||||
ushort ns1,ne1,ne2;
|
||||
if (!expand_phrase_pair(a1,a2,n2,b1,e1,ns1,ne1,ne2))
|
||||
{
|
||||
return po_other;
|
||||
}
|
||||
if (ns1 >= e1)
|
||||
{
|
||||
for (ushort j = e1; j < ns2; ++j)
|
||||
for (ushort j = e1; j < ns1; ++j)
|
||||
if (a1[j].size()) return po_jfwd;
|
||||
return po_mono;
|
||||
}
|
||||
@ -587,12 +637,9 @@ namespace Moses
|
||||
int p2 = b2-1;
|
||||
while (p2 >= 0 && !a2[p2].size()) --p2;
|
||||
if (p2 < 0) return po_first;
|
||||
ushort ps1,ps2,pe1,pe2;
|
||||
bool OK = expand_phrase_pair(a2,a1,p2,
|
||||
0, b2-1,
|
||||
0, a1.size()-1,
|
||||
ps2,pe2,ps1,pe1);
|
||||
if (!OK) return po_other;
|
||||
ushort ps1,pe1,pe2;
|
||||
if (!expand_phrase_pair(a1,a2,p2,b1,e1,ps1,pe1,pe2))
|
||||
return po_other;
|
||||
|
||||
if (pe1 < b1)
|
||||
{
|
||||
@ -606,7 +653,6 @@ namespace Moses
|
||||
if (a1[j].size()) return po_jbwd;
|
||||
return po_swap;
|
||||
}
|
||||
return po_other;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1221,7 +1221,8 @@ namespace Moses {
|
||||
if (!ag)
|
||||
{
|
||||
ag.reset(new agenda(*this));
|
||||
ag->add_workers(20);
|
||||
// ag->add_workers(20);
|
||||
ag->add_workers(1);
|
||||
}
|
||||
}
|
||||
typedef boost::unordered_map<uint64_t,sptr<pstats> > pcache_t;
|
||||
|
@ -36,6 +36,8 @@ namespace Moses
|
||||
{
|
||||
s1 = sspan.first; e1 = sspan.second;
|
||||
s2 = tspan.first; e2 = tspan.second;
|
||||
for (size_t i = s1; i < e1; ++i)
|
||||
scov.set(i);
|
||||
}
|
||||
|
||||
bool operator<(PhraseAlnHyp const& other) const
|
||||
@ -76,45 +78,113 @@ namespace Moses
|
||||
}
|
||||
};
|
||||
|
||||
sptr<vector<int> >
|
||||
Mmsapt::
|
||||
align(string const& src, string const& trg) const
|
||||
class Alignment
|
||||
{
|
||||
// For the time being, we consult only the fixed bitext.
|
||||
// We might also consider the dynamic bitext. => TO DO.
|
||||
typedef pair<uint32_t, uint32_t> span;
|
||||
typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID
|
||||
typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t;
|
||||
typedef boost::unordered_map<uint64_t,jstats> jStatsTable;
|
||||
|
||||
Mmsapt const& PT;
|
||||
vector<id_type> s,t;
|
||||
btfix.V1->fillIdSeq(src,s);
|
||||
btfix.V2->fillIdSeq(trg,t);
|
||||
|
||||
vector<vector<sptr<pstats> > >
|
||||
M1(s.size(),vector<sptr<pstats> >(s.size())),
|
||||
M2(t.size(),vector<sptr<pstats> >(t.size()));
|
||||
|
||||
// get a pool of target phrase ids
|
||||
typedef vector<vector<uint64_t> > pidmap_t;
|
||||
pidmap_t sspan2pid(s.size(),vector<uint64_t>(s.size(),0));
|
||||
pidmap_t tspan2pid(t.size(),vector<uint64_t>(t.size(),0));
|
||||
typedef boost::unordered_map<uint64_t,vector<pair<uint32_t,uint32_t> > >
|
||||
pid2span_t;
|
||||
pidmap_t sspan2pid, tspan2pid; // span -> phrase ID
|
||||
pid2span_t spid2span,tpid2span;
|
||||
vector<vector<sptr<pstats> > > spstats(s.size());
|
||||
|
||||
// Fill the lookup maps for target span to target phrase id
|
||||
// and vice versa:
|
||||
vector<PhrasePair> PP;
|
||||
// position-independent phrase pair info
|
||||
vector<PhraseAlnHyp> PAH;
|
||||
vector<vector<int> > tpos2ahyp(t.size());
|
||||
// maps from target start positions to PhraseAlnHyps starting at
|
||||
// that position
|
||||
|
||||
sptr<pstats> getPstats(span const& sspan);
|
||||
void fill_tspan_maps();
|
||||
void fill_sspan_maps();
|
||||
void show(ostream& out);
|
||||
public:
|
||||
Alignment(Mmsapt const& pt, string const& src, string const& trg);
|
||||
|
||||
};
|
||||
|
||||
void
|
||||
Alignment::
|
||||
show(ostream& out)
|
||||
{
|
||||
// show what we have so far ...
|
||||
for (size_t s2 = 0; s2 < t.size(); ++s2)
|
||||
{
|
||||
VectorIndexSorter<PhraseAlnHyp> foo(PAH);
|
||||
sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo);
|
||||
for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h)
|
||||
{
|
||||
PhraseAlnHyp const& ah = PAH[tpos2ahyp[s2][h]];
|
||||
// pstats const & s = *ah.ps;
|
||||
out << setw(10) << exp(ah.score) << " "
|
||||
<< btfix.T2->pid2str(btfix.V2.get(), ah.pp.p2)
|
||||
<< " <=> "
|
||||
<< btfix.T1->pid2str(btfix.V1.get(), ah.pp.p1);
|
||||
vector<uchar> const& a = ah.pp.aln;
|
||||
for (size_t u = 0; u +1 < a.size(); ++u)
|
||||
out << " " << int(a[u+1]) << "-" << int(a[u]);
|
||||
out << endl;
|
||||
float const* ofwdj = ah.pp.dfwd;
|
||||
float const* obwdj = ah.pp.dbwd;
|
||||
uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
|
||||
uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
|
||||
out << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
|
||||
<< " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
|
||||
<< " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
|
||||
<< " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
|
||||
<< " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
|
||||
<< " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
|
||||
<< " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other]
|
||||
<< "]" << endl
|
||||
<< " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
|
||||
<< " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
|
||||
<< " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
|
||||
<< " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
|
||||
<< " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
|
||||
<< " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
|
||||
<< " other: " << obwdj[po_other]<<"/"<<obwdm[po_other]
|
||||
<< "]" << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the phrase statistics stored for a source span.
//
// The entry for sspan is kept at
//   spstats[sspan.first][sspan.second - sspan.first - 1].
// Returns a null pointer if no such slot exists: the start position is
// outside spstats, the span is empty or inverted, or the row for that
// start position is shorter than the span requires.
sptr<pstats>
Alignment::
getPstats(span const& sspan)
{
  // Reject spans that cannot be stored explicitly, rather than indexing
  // out of range or relying on unsigned wrap-around of the subtraction.
  if (sspan.first >= spstats.size() || sspan.second <= sspan.first)
    return sptr<pstats>();

  vector<sptr<pstats> > const& row = spstats[sspan.first];
  size_t const k = sspan.second - sspan.first - 1;
  if (k >= row.size())
    return sptr<pstats>();
  return row[k];
}
|
||||
|
||||
void
|
||||
Alignment::
|
||||
fill_tspan_maps()
|
||||
{
|
||||
tspan2pid.assign(t.size(),vector<uint64_t>(t.size(),0));
|
||||
for (size_t i = 0; i < t.size(); ++i)
|
||||
{
|
||||
tsa::tree_iterator m(btfix.I2.get());
|
||||
tsa::tree_iterator m(PT.btfix.I2.get());
|
||||
for (size_t k = i; k < t.size() && m.extend(t[k]); ++k)
|
||||
{
|
||||
uint64_t pid = m.getPid();
|
||||
tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
|
||||
tspan2pid[i][k] = pid;
|
||||
}
|
||||
} // done filling lookup maps
|
||||
|
||||
// Fill the lookup maps for the source side, and gather
|
||||
// phrase statistics from the bitext
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Alignment::
|
||||
fill_sspan_maps()
|
||||
{
|
||||
sspan2pid.assign(s.size(),vector<uint64_t>(s.size(),0));
|
||||
spstats.resize(s.size());
|
||||
for (size_t i = 0; i < s.size(); ++i)
|
||||
{
|
||||
tsa::tree_iterator m(btfix.I1.get());
|
||||
@ -133,23 +203,27 @@ namespace Moses
|
||||
spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Alignment::
|
||||
Alignment(Mmsapt const& pt, string const& src, string const& trg)
|
||||
: PT(pt)
|
||||
{
|
||||
PT.btfix.V1->fillIdSeq(src,s);
|
||||
PT.btfix.V2->fillIdSeq(trg,t);
|
||||
fill_tspan_maps();
|
||||
fill_sspan_maps();
|
||||
|
||||
// now fill the association score table
|
||||
vector<PhrasePair> PP;
|
||||
vector<PhrasePair> cands;
|
||||
vector<PhraseAlnHyp> PAH; PAH.reserve(1000000);
|
||||
vector<vector<int> > tpos2ahyp(t.size());
|
||||
PAH.reserve(1000000);
|
||||
typedef pid2span_t::iterator psiter;
|
||||
typedef boost::unordered_map<uint64_t,jstats> jStatsTable;
|
||||
typedef pair<uint32_t,uint32_t> xspan;
|
||||
for (psiter L = spid2span.begin(); L != spid2span.end(); ++L)
|
||||
{
|
||||
if (!L->second.size()) continue; // should never happen anyway
|
||||
int i = L->second[0].first;
|
||||
int k = L->second[0].second - i -1;
|
||||
sptr<pstats> ps = spstats[i][k];
|
||||
PhrasePair pp;
|
||||
pp.init(L->first,*ps, this->m_numScoreComponents);
|
||||
PhrasePair pp; pp.init(L->first,*ps, this->m_numScoreComponents);
|
||||
jStatsTable & J = ps->trg;
|
||||
for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y)
|
||||
{
|
||||
@ -161,7 +235,7 @@ namespace Moses
|
||||
calc_pbwd_fix(btfix,pp);
|
||||
pp.eval(this->feature_weights);
|
||||
PP.push_back(pp);
|
||||
BOOST_FOREACH(xspan const& sspan, L->second)
|
||||
BOOST_FOREACH(span const& sspan, L->second)
|
||||
{
|
||||
BOOST_FOREACH(xspan const& tspan, R->second)
|
||||
{
|
||||
@ -171,84 +245,29 @@ namespace Moses
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// show what we have so far ...
|
||||
for (size_t s2 = 0; s2 < t.size(); ++s2)
|
||||
{
|
||||
VectorIndexSorter<PhraseAlnHyp> foo(PAH);
|
||||
sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo);
|
||||
for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h)
|
||||
{
|
||||
PhraseAlnHyp const& ah = PAH[tpos2ahyp[s2][h]];
|
||||
// pstats const & s = *ah.ps;
|
||||
cout << setw(10) << exp(ah.score) << " "
|
||||
<< btfix.T2->pid2str(btfix.V2.get(), ah.pp.p2)
|
||||
<< " <=> "
|
||||
<< btfix.T1->pid2str(btfix.V1.get(), ah.pp.p1);
|
||||
vector<uchar> const& a = ah.pp.aln;
|
||||
for (size_t u = 0; u +1 < a.size(); ++u)
|
||||
cout << " " << int(a[u+1]) << "-" << int(a[u]);
|
||||
cout << endl;
|
||||
float const* ofwdj = ah.pp.dfwd;
|
||||
float const* obwdj = ah.pp.dbwd;
|
||||
uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
|
||||
uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
|
||||
cout << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
|
||||
<< " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
|
||||
<< " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
|
||||
<< " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
|
||||
<< " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
|
||||
<< " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
|
||||
<< "]" << endl
|
||||
<< " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
|
||||
<< " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
|
||||
<< " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
|
||||
<< " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
|
||||
<< " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
|
||||
<< " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
|
||||
<< "]" << endl;
|
||||
}
|
||||
}
|
||||
// sort(cands.begin(), cands.end(), PPgreater());
|
||||
// for (size_t i = 0; i < cands.size(); ++i)
|
||||
// {
|
||||
// PhrasePair c = cands[i];
|
||||
// pair<uint32_t,uint32_t> & loc = spid2span[c.p1][0];
|
||||
// int x = loc.first;
|
||||
// int y = loc.second-1;
|
||||
// pstats& s = *spstats[x][y-x];
|
||||
// cout << setw(10) << exp(c.score) << " "
|
||||
// << btfix.T1->pid2str(btfix.V1.get(), c.p1) << " <=> "
|
||||
// << btfix.T2->pid2str(btfix.V2.get(), c.p2) << endl
|
||||
// << " [first: " << s.ofwd[po_first]
|
||||
// << " last: " << s.ofwd[po_last]
|
||||
// << " mono: " << s.ofwd[po_mono]
|
||||
// << " jfwd: " << s.ofwd[po_jfwd]
|
||||
// << " swap: " << s.ofwd[po_swap]
|
||||
// << " jbwd: " << s.ofwd[po_jbwd]
|
||||
// << "]" << endl
|
||||
// << " [first: " << s.obwd[po_first]
|
||||
// << " last: " << s.obwd[po_last]
|
||||
// << " mono: " << s.obwd[po_mono]
|
||||
// << " jfwd: " << s.obwd[po_jfwd]
|
||||
// << " swap: " << s.obwd[po_swap]
|
||||
// << " jbwd: " << s.obwd[po_jbwd]
|
||||
// << "]" << endl;
|
||||
// }
|
||||
}
|
||||
|
||||
|
||||
// boost::unordered_set<uint64_t, sptr<pstats> > smap; // target phrase ids
|
||||
// vector<vector<sptr<pstats> > > M(s.size(),vector<sptr<pstats> >(t.size());
|
||||
// for (size_t i = 0; i < s.size(); ++i)
|
||||
// {
|
||||
|
||||
// tsa::tree_iterator m(btfix.I1.get(),&s[i],&(*s.end()),false);
|
||||
// while (m.size())
|
||||
// for (size_t k = 1; k <= m.size(); ++k)
|
||||
// cout << m.str(btfix.V1.get(),0,k) << " "
|
||||
// << m.approxOccurrenceCount(k-1)
|
||||
// << endl;
|
||||
// }
|
||||
|
||||
// Try to extend alignment hypothesis PAH[edge] with hypothesis PAH[next].
//
// If the coverage bit sets (scov) of the two hypotheses share any bit, the
// extension is rejected: the function returns -1 and PAH is left unchanged.
// Otherwise a copy of PAH[next] is appended to PAH, its prev field is set to
// edge, and the coverage of PAH[edge] is merged into the copy's coverage.
//
// NOTE(review): this definition is incomplete as committed. The score update
// below breaks off in the middle of a log( call, and there is no return
// statement on the success path. Presumably the index of the newly appended
// hypothesis is meant to be returned -- confirm with the author before
// completing it.
int
|
||||
extend(vector<PhraseAlnHyp> & PAH, int edge, int next)
|
||||
{
|
||||
// overlapping coverage => the two hypotheses cannot be combined
if ((PAH[edge].scov & PAH[next].scov).count())
|
||||
return -1;
|
||||
PAH.push_back(PAH[next]);
|
||||
PhraseAlnHyp & h = PAH.back();
|
||||
h.prev = edge;
|
||||
h.scov |= PAH[edge].scov;
|
||||
// NOTE(review): unfinished statement -- the argument of log( is missing.
h.score += log(
|
||||
}
|
||||
|
||||
// Placeholder for aligning src with trg.
//
// No alignment is computed yet: both arguments are ignored and a
// default-constructed (empty) pointer is returned.
//
// Plan, carried over from the earlier notes in this function: consult only
// the fixed bitext for the time being; consulting the dynamic bitext as
// well is still TO DO.
sptr<vector<int> >
Mmsapt::
align(string const& src, string const& trg) const
{
  return sptr<vector<int> >();
}
|
||||
|
Loading…
Reference in New Issue
Block a user