From 81ed9937e1452ff5c67ce97b46684c2803048076 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 5 Mar 2014 11:53:05 +0000 Subject: [PATCH] Routine check-in. --- moses/TranslationModel/UG/mm/ug_bitext.cc | 82 ++++++-- moses/TranslationModel/UG/mm/ug_bitext.h | 3 +- moses/TranslationModel/UG/mmsapt_align.cc | 239 ++++++++++++---------- 3 files changed, 195 insertions(+), 129 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc index dc29c7c6c..8f095340a 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext.cc @@ -413,7 +413,11 @@ namespace Moses ushort const L1, ushort const R1, // limits of previous phrase ushort & s1, ushort & e1, ushort& e2) // start/end src; end trg { - if (a1[seed].size() == 0) return false; + if (a2[s2].size() == 0) + { + cout << __FILE__ << ":" << __LINE__ << endl; + return false; + } bitvector done1(a1.size()); bitvector done2(a2.size()); vector > agenda; @@ -423,7 +427,11 @@ namespace Moses agenda.push_back(pair(2,s2)); e2 = s2; s1 = e1 = a2[s2].front(); - if (s1 >= L1 && s1 < R1) return false; + if (s1 >= L1 && s1 < R1) + { + cout << __FILE__ << ":" << __LINE__ << endl; + return false; + } agenda.push_back(pair(2,s2)); while (agenda.size()) { @@ -435,7 +443,11 @@ namespace Moses done1.set(p); BOOST_FOREACH(ushort i, a1[p]) { - if (i < s2) return false; + if (i < s2) + { + // cout << __FILE__ << ":" << __LINE__ << endl; + return false; + } if (done2[i]) continue; for (;e2 <= i;++e2) if (!done2[e2]) @@ -447,16 +459,22 @@ namespace Moses done2.set(p); BOOST_FOREACH(ushort i, a2[p]) { + if ((e1 < L1 && i >= L1) || (s1 >= R1 && i < R1) || (i >= L1 && i < R1)) + { + // cout << __FILE__ << ":" << __LINE__ << " " + // << L1 << "-" << R1 << " " << i << " " + // << s1 << "-" << e1<< endl; + return false; + } + if (e1 < i) { - if (i >= L1) return false; for (; e1 <= i; ++e1) if (!done1[e1]) agenda.push_back(pair(1,e1)); } - else + else if (s1 > i) { - if (i <= R1) return false; for (; i <= s1; ++i) if (!done1[i]) agenda.push_back(pair(1,i)); @@ -466,6 +484,7 @@ namespace Moses } ++e1; ++e2; + return true; } // s1 = seed; // e1 = seed; @@ -547,6 +566,36 @@ namespace Moses // return true; // } + void + print_amatrix(vector > a1, uint32_t len2, + ushort b1, ushort e1, ushort b2, ushort e2) + { + vector M(a1.size(),bitvector(len2)); + for (ushort j = 0; j < a1.size(); ++j) + { + BOOST_FOREACH(ushort k, a1[j]) + M[j].set(k); + } + cout << b1 << "-" << e1 << " " << b2 << "-" << e2 << endl; + cout << " "; + for (size_t c = 0; c < len2;++c) + cout << c%10; + cout << endl; + for (size_t r = 0; r < M.size(); ++r) + { + cout << setw(3) << r << " "; + for (size_t c = 0; c < M[r].size(); ++c) + { + if ((b1 <= r) && (r < e1) && b2 <= c && c < e2) + cout << (M[r][c] ? 'x' : '-'); + else cout << (M[r][c] ? 'o' : '.'); + } + cout << endl; + } + cout << string(90,'-') << endl; + } + + PhraseOrientation find_po_fwd(vector >& a1, vector >& a2, @@ -559,13 +608,14 @@ namespace Moses if (n2 == a2.size()) return po_last; - ushort ns1,ns2,ne1,ne2; - bool OK = expand_phrase_pair(a1,a2,n2,b1,e1-1,ns1,ne1,ne2); - - if (!OK) return po_other; + ushort ns1,ne1,ne2; + if (!expand_phrase_pair(a1,a2,n2,b1,e1,ns1,ne1,ne2)) + { + return po_other; + } if (ns1 >= e1) { - for (ushort j = e1; j < ns2; ++j) + for (ushort j = e1; j < ns1; ++j) if (a1[j].size()) return po_jfwd; return po_mono; } @@ -587,12 +637,9 @@ namespace Moses int p2 = b2-1; while (p2 >= 0 && !a2[p2].size()) --p2; if (p2 < 0) return po_first; - ushort ps1,ps2,pe1,pe2; - bool OK = expand_phrase_pair(a2,a1,p2, - 0, b2-1, - 0, a1.size()-1, - ps2,pe2,ps1,pe1); - if (!OK) return po_other; + ushort ps1,pe1,pe2; + if (!expand_phrase_pair(a1,a2,p2,b1,e1,ps1,pe1,pe2)) + return po_other; if (pe1 < b1) { @@ -606,7 +653,6 @@ namespace Moses if (a1[j].size()) return po_jbwd; return po_swap; } - return po_other; } } } diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 1e378e7e0..2a0452df3 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -1221,7 +1221,8 @@ namespace Moses { if (!ag) { ag.reset(new agenda(*this)); - ag->add_workers(20); + // ag->add_workers(20); + ag->add_workers(1); } } typedef boost::unordered_map > pcache_t; diff --git a/moses/TranslationModel/UG/mmsapt_align.cc b/moses/TranslationModel/UG/mmsapt_align.cc index 96c2311d0..473e57d86 100644 --- a/moses/TranslationModel/UG/mmsapt_align.cc +++ b/moses/TranslationModel/UG/mmsapt_align.cc @@ -36,6 +36,8 @@ namespace Moses { s1 = sspan.first; e1 = sspan.second; s2 = tspan.first; e2 = tspan.second; + for (size_t i = s1; i < e1; ++i) + scov.set(i); } bool operator<(PhraseAlnHyp const& other) const @@ -76,45 +78,113 @@ namespace Moses } }; - sptr > - Mmsapt:: - align(string const& src, string const& trg) const + class Alignment { - // For the time being, we consult only the fixed bitext. - // We might also consider the dynamic bitext. => TO DO. + typedef pair span; + typedef vector > pidmap_t; // span -> phrase ID + typedef boost::unordered_map > pid2span_t; + typedef boost::unordered_map jStatsTable; + Mmsapt const& PT; vector s,t; - btfix.V1->fillIdSeq(src,s); - btfix.V2->fillIdSeq(trg,t); - - vector > > - M1(s.size(),vector >(s.size())), - M2(t.size(),vector >(t.size())); - - // get a pool of target phrase ids - typedef vector > pidmap_t; - pidmap_t sspan2pid(s.size(),vector(s.size(),0)); - pidmap_t tspan2pid(t.size(),vector(t.size(),0)); - typedef boost::unordered_map > > - pid2span_t; + pidmap_t sspan2pid, tspan2pid; // span -> phrase ID pid2span_t spid2span,tpid2span; vector > > spstats(s.size()); - // Fill the lookup maps for target span to target phrase id - // and vice versa: + vector PP; + // position-independent phrase pair info + vector PAH; + vector > tpos2ahyp(t.size()); + // maps from target start positions to PhraseAlnHyps starting at + // that position + + sptr getPstats(span const& sspan); + void fill_tspan_maps(); + void fill_sspan_maps(); + void show(ostream& out); + public: + Alignment(Mmsapt const& pt, string const& src, string const& trg); + + }; + + void + Alignment:: + show(ostream& out) + { + // show what we have so far ... + for (size_t s2 = 0; s2 < t.size(); ++s2) + { + VectorIndexSorter foo(PAH); + sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo); + for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h) + { + PhraseAlnHyp const& ah = PAH[tpos2ahyp[s2][h]]; + // pstats const & s = *ah.ps; + out << setw(10) << exp(ah.score) << " " + << btfix.T2->pid2str(btfix.V2.get(), ah.pp.p2) + << " <=> " + << btfix.T1->pid2str(btfix.V1.get(), ah.pp.p1); + vector const& a = ah.pp.aln; + for (size_t u = 0; u +1 < a.size(); ++u) + out << " " << int(a[u+1]) << "-" << int(a[u]); + out << endl; + float const* ofwdj = ah.pp.dfwd; + float const* obwdj = ah.pp.dbwd; + uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd; + uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd; + out << " [first: " << ofwdj[po_first]<<"/"< + Alignment:: + getPstats(span const& sspan) + { + size_t k = sspan.second - sspan.first - 1; + if (k < spstats[sspan.first].size()) + return spstats[sspan.first][k]; + else return sptr(); + } + + void + Alignment:: + fill_tspan_maps() + { + tspan2pid.assign(t.size(),vector(t.size(),0)); for (size_t i = 0; i < t.size(); ++i) { - tsa::tree_iterator m(btfix.I2.get()); + tsa::tree_iterator m(PT.btfix.I2.get()); for (size_t k = i; k < t.size() && m.extend(t[k]); ++k) { uint64_t pid = m.getPid(); tpid2span[pid].push_back(pair(i,k+1)); tspan2pid[i][k] = pid; } - } // done filling lookup maps - - // Fill the lookup maps for the source side, and gather - // phrase statistics from the bitext + } + } + + void + Alignment:: + fill_sspan_maps() + { + sspan2pid.assign(s.size(),vector(s.size(),0)); + spstats.resize(s.size()); for (size_t i = 0; i < s.size(); ++i) { tsa::tree_iterator m(btfix.I1.get()); @@ -133,23 +203,27 @@ namespace Moses spid2span[pid].push_back(pair(i,k+1)); } } + } + + Alignment:: + Alignment(Mmsapt const& pt, string const& src, string const& trg) + : PT(pt) + { + PT.btfix.V1->fillIdSeq(src,s); + PT.btfix.V2->fillIdSeq(trg,t); + fill_tspan_maps(); + fill_sspan_maps(); // now fill the association score table - vector PP; - vector cands; - vector PAH; PAH.reserve(1000000); - vector > tpos2ahyp(t.size()); + PAH.reserve(1000000); typedef pid2span_t::iterator psiter; - typedef boost::unordered_map jStatsTable; - typedef pair xspan; for (psiter L = spid2span.begin(); L != spid2span.end(); ++L) { if (!L->second.size()) continue; // should never happen anyway int i = L->second[0].first; int k = L->second[0].second - i -1; sptr ps = spstats[i][k]; - PhrasePair pp; - pp.init(L->first,*ps, this->m_numScoreComponents); + PhrasePair pp; pp.init(L->first,*ps, this->m_numScoreComponents); jStatsTable & J = ps->trg; for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y) { @@ -161,7 +235,7 @@ namespace Moses calc_pbwd_fix(btfix,pp); pp.eval(this->feature_weights); PP.push_back(pp); - BOOST_FOREACH(xspan const& sspan, L->second) + BOOST_FOREACH(span const& sspan, L->second) { BOOST_FOREACH(xspan const& tspan, R->second) { @@ -171,84 +245,29 @@ namespace Moses } } } - - // show what we have so far ... - for (size_t s2 = 0; s2 < t.size(); ++s2) - { - VectorIndexSorter foo(PAH); - sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo); - for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h) - { - PhraseAlnHyp const& ah = PAH[tpos2ahyp[s2][h]]; - // pstats const & s = *ah.ps; - cout << setw(10) << exp(ah.score) << " " - << btfix.T2->pid2str(btfix.V2.get(), ah.pp.p2) - << " <=> " - << btfix.T1->pid2str(btfix.V1.get(), ah.pp.p1); - vector const& a = ah.pp.aln; - for (size_t u = 0; u +1 < a.size(); ++u) - cout << " " << int(a[u+1]) << "-" << int(a[u]); - cout << endl; - float const* ofwdj = ah.pp.dfwd; - float const* obwdj = ah.pp.dbwd; - uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd; - uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd; - cout << " [first: " << ofwdj[po_first]<<"/"< & loc = spid2span[c.p1][0]; - // int x = loc.first; - // int y = loc.second-1; - // pstats& s = *spstats[x][y-x]; - // cout << setw(10) << exp(c.score) << " " - // << btfix.T1->pid2str(btfix.V1.get(), c.p1) << " <=> " - // << btfix.T2->pid2str(btfix.V2.get(), c.p2) << endl - // << " [first: " << s.ofwd[po_first] - // << " last: " << s.ofwd[po_last] - // << " mono: " << s.ofwd[po_mono] - // << " jfwd: " << s.ofwd[po_jfwd] - // << " swap: " << s.ofwd[po_swap] - // << " jbwd: " << s.ofwd[po_jbwd] - // << "]" << endl - // << " [first: " << s.obwd[po_first] - // << " last: " << s.obwd[po_last] - // << " mono: " << s.obwd[po_mono] - // << " jfwd: " << s.obwd[po_jfwd] - // << " swap: " << s.obwd[po_swap] - // << " jbwd: " << s.obwd[po_jbwd] - // << "]" << endl; - // } + } - // boost::unordered_set > smap; // target phrase ids - // vector > > M(s.size(),vector >(t.size()); - // for (size_t i = 0; i < s.size(); ++i) - // { - - // tsa::tree_iterator m(btfix.I1.get(),&s[i],&(*s.end()),false); - // while (m.size()) - // for (size_t k = 1; k <= m.size(); ++k) - // cout << m.str(btfix.V1.get(),0,k) << " " - // << m.approxOccurrenceCount(k-1) - // << endl; - // } + + int + extend(vector & PAH, int edge, int next) + { + if ((PAH[edge].scov & PAH[next].scov).count()) + return -1; + PAH.push_back(PAH[next]); + PhraseAlnHyp & h = PAH.back(); + h.prev = edge; + h.scov |= PAH[edge].scov; + h.score += log( + } + + sptr > + Mmsapt:: + align(string const& src, string const& trg) const + { + // For the time being, we consult only the fixed bitext. + // We might also consider the dynamic bitext. => TO DO. + sptr > aln; return aln; }