diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc index 3536de31d..6a5d71e6b 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext.cc @@ -45,12 +45,14 @@ namespace Moses pstats:: add(uint64_t pid, float const w, vector const& a, - uint32_t const cnt2) + uint32_t const cnt2, + uint32_t fwd_o, + uint32_t bwd_o) { this->lock.lock(); jstats& entry = this->trg[pid]; this->lock.unlock(); - entry.add(w,a,cnt2); + entry.add(w,a,cnt2,fwd_o,bwd_o); if (this->good < entry.rcnt()) { this->lock.lock(); @@ -65,6 +67,8 @@ namespace Moses jstats() : my_rcnt(0), my_wcnt(0), my_cnt2(0) { + ofwd[0] = ofwd[1] = ofwd[2] = ofwd[3] = ofwd[4] = ofwd[5] = ofwd[6] = 0; + obwd[0] = obwd[1] = obwd[2] = obwd[3] = obwd[4] = obwd[5] = obwd[6] = 0; my_aln.reserve(1); } @@ -74,11 +78,33 @@ namespace Moses my_rcnt = other.rcnt(); my_wcnt = other.wcnt(); my_aln = other.aln(); + for (int i = po_first; i <= po_other; i++) + { + ofwd[i] = other.ofwd[i]; + obwd[i] = other.obwd[i]; + } } + uint32_t + jstats:: + dcnt_fwd(PhraseOrientation const idx) const + { + assert(idx <= po_other); + return ofwd[idx]; + } + + uint32_t + jstats:: + dcnt_bwd(PhraseOrientation const idx) const + { + assert(idx <= po_other); + return obwd[idx]; + } + void jstats:: - add(float w, vector const& a, uint32_t const cnt2) + add(float w, vector const& a, uint32_t const cnt2, + uint32_t fwd_orient, uint32_t bwd_orient) { boost::lock_guard lk(this->lock); my_rcnt += 1; @@ -95,6 +121,8 @@ namespace Moses if (my_aln[i].first > my_aln[i/2].first) push_heap(my_aln.begin(),my_aln.begin()+i+1); } + ++ofwd[fwd_orient]; + ++obwd[bwd_orient]; } uint32_t @@ -159,7 +187,30 @@ namespace Moses return this->score > other.score; } - PhrasePair::PhrasePair() {} + PhrasePair:: + PhrasePair() {} + + PhrasePair:: + PhrasePair(PhrasePair const& o) + : p1(o.p1), + p2(o.p2), + raw1(o.raw1), + raw2(o.raw2), + sample1(o.sample1), + sample2(o.sample2), + 
good1(o.good1), + good2(o.good2), + joint(o.joint), + fvals(o.fvals), + aln(o.aln), + score(o.score) + { + for (size_t i = 0; i <= po_other; ++i) + { + dfwd[i] = o.dfwd[i]; + dbwd[i] = o.dbwd[i]; + } + } void PhrasePair:: @@ -208,6 +259,19 @@ namespace Moses assert(js.aln().size()); if (js.aln().size()) aln = js.aln()[0].second; + float total_fwd = 0, total_bwd = 0; + for (int i = po_first; i <= po_other; i++) + { + PhraseOrientation po = static_cast(i); + total_fwd += js.dcnt_fwd(po)+1; + total_bwd += js.dcnt_bwd(po)+1; + } + for (int i = po_first; i <= po_other; i++) + { + PhraseOrientation po = static_cast(i); + dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd; + dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd; + } return *this; } @@ -223,6 +287,12 @@ namespace Moses aln = js1.aln()[0].second; else if (js2.aln().size()) aln = js2.aln()[0].second; + for (int i = po_first; i <= po_other; i++) /* inclusive bound: slot po_other must be filled too, as in the other update() overloads */ + { + PhraseOrientation po = static_cast(i); + dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other); + dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other); + } return *this; } @@ -238,6 +308,12 @@ namespace Moses assert(js.aln().size()); if (js.aln().size()) aln = js.aln()[0].second; + for (int i = po_first; i <= po_other; i++) + { + PhraseOrientation po = static_cast(i); + dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other); + dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other); + } return *this; } @@ -331,24 +407,53 @@ namespace Moses bool expand_phrase_pair - (vector >& a1, + (vector >& a1, vector >& a2, - ushort const seed, - ushort const L1, // hard left limit source - ushort const R1, // hard right limit source - ushort const L2, // hard left limit target - ushort const R2, // hard right limit target - ushort & s1, ushort & e1, // start/end src phrase - ushort & s2, ushort & e2) // start/end trg phrase + ushort const s2, // next word on in target side + ushort const L1, ushort const R1, // limits of previous phrase + ushort & 
s1, ushort & e1, ushort& e2) // start/end src; end trg { if (a1[seed].size() == 0) return false; - assert(L1 <= seed); - assert(R1 > seed); bitvector done1(a1.size()); bitvector done2(a2.size()); vector > agenda; + // x.first: side (1 or 2) + // x.second: word position agenda.reserve(a1.size() + a2.size()); - agenda.push_back(pair(seed,0)); + agenda.push_back(pair(2,s2)); + e2 = s2; + s1 = e1 = a2[s2].front(); + if (s1 >= L1 && s1 < R1) return false; + for (size_t i = 1; i < a2[s2].size(); ++i) + { + ushort p = a2[s2][i]; + if (s1 >= R1) + { if (p < R1) return false; } + else if (e1 < L1) + { if (p > L1) return false; } + if (s1 > p) s1 = p; + if (e1 < p) e1 = p; + } + done2.set(s2); + + + while (agenda.size()) + { + ushort side = agenda.back().first; + ushort p = agenda.back().second; + agenda.pop_back(); + if (side == 1) + { + done1.set(p); + BOOST_FOREACH(ushort i, a1[p]) + { + if (i < seed) return false; + if (done2[i]) continue; + for (;e2 <= i;++e2) + if (!done2[e2]) + agenda.push_back(pair(2,e2)); + if (e2 < i) e2 = i; + } s1 = seed; e1 = seed; s2 = e2 = a1[seed].front(); @@ -437,13 +542,16 @@ namespace Moses { size_t n2 = e2; while (n2 < a2.size() && a2[n2].size() == 0) ++n2; - if (n2 == a2.size()) return po_last; + + if (n2 == a2.size()) + return po_last; ushort ns1,ns2,ne1,ne2; bool OK = expand_phrase_pair(a2,a1,n2, e2, a2.size()-1, 0, a1.size()-1, ns2,ne2,ns1,ne1); + if (!OK) return po_other; if (ns1 >= e1) { diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 94ace59bd..1e378e7e0 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -56,13 +56,13 @@ namespace Moses { enum PhraseOrientation { - po_first=0, - po_mono=1, - po_jfwd=2, - po_swap=3, - po_jbwd=4, - po_last=5, - po_other=6 + po_first, + po_mono, + po_jfwd, + po_swap, + po_jbwd, + po_last, + po_other }; PhraseOrientation @@ -102,6 +102,7 @@ namespace Moses { float my_wcnt; // weighted count 
uint32_t my_cnt2; vector > > my_aln; + uint32_t ofwd[7], obwd[7]; public: jstats(); jstats(jstats const& other); @@ -110,9 +111,12 @@ namespace Moses { float wcnt() const; vector > > const & aln() const; - void add(float w, vector const& a, uint32_t const cnt2); + void add(float w, vector const& a, uint32_t const cnt2, + uint32_t fwd_orient, uint32_t bwd_orient); void invalidate(); bool valid(); + uint32_t dcnt_fwd(PhraseOrientation const idx) const; + uint32_t dcnt_bwd(PhraseOrientation const idx) const; }; struct @@ -127,7 +131,7 @@ namespace Moses { size_t sum_pairs; size_t in_progress; // keeps track of how many threads are currently working on this - uint32_t ofwd[7], obwd[7]; + uint32_t ofwd[po_other+1], obwd[po_other+1]; typename boost::unordered_map trg; pstats(); @@ -135,8 +139,12 @@ namespace Moses { void register_worker(); size_t count_workers() { return in_progress; } - bool add(uint64_t const pid, float const w, - vector const& a, uint32_t const cnt2); + bool + add(uint64_t const pid, + float const w, + vector const& a, + uint32_t const cnt2, + uint32_t fwd_o, uint32_t bwd_o); }; class @@ -146,12 +154,15 @@ namespace Moses { uint64_t p1, p2; uint32_t raw1,raw2,sample1,sample2,good1,good2,joint; vector fvals; + float dfwd[po_other+1]; + float dbwd[po_other+1]; vector aln; // float avlex12,avlex21; // average lexical probs (Moses std) // float znlex1,znlex2; // zens-ney lexical smoothing // float colex1,colex2; // based on raw lexical occurrences float score; PhrasePair(); + PhrasePair(PhrasePair const& o); bool operator<(PhrasePair const& other) const; bool operator>(PhrasePair const& other) const; bool operator<=(PhrasePair const& other) const; @@ -161,13 +172,15 @@ namespace Moses { void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2, size_t const numfeats); - PhrasePair const& update(uint64_t const pid2, jstats const& js); - PhrasePair const& update(uint64_t const pid2, - jstats const& js1, - jstats const& js2); - PhrasePair const& 
update(uint64_t const pid2, - size_t const raw2extra, - jstats const& js); + PhrasePair const& + update(uint64_t const pid2, jstats const& js); + + PhrasePair const& + update(uint64_t const pid2, jstats const& js1, jstats const& js2); + + PhrasePair const& + update(uint64_t const pid2, size_t const raw2extra, jstats const& js); + float eval(vector const& w); }; @@ -355,9 +368,6 @@ namespace Moses { }; - - - template class Bitext { @@ -660,7 +670,8 @@ namespace Moses { for (size_t i = e1; i <= e2; ++i) { if (!j->stats->add(b->getPid(),sample_weight,aln, - b->approxOccurrenceCount())) + b->approxOccurrenceCount(), + po_fwd,po_bwd)) { for (size_t z = 0; z < j->len; ++z) { diff --git a/moses/TranslationModel/UG/mmsapt_align.cc b/moses/TranslationModel/UG/mmsapt_align.cc index 64e209c3b..96c2311d0 100644 --- a/moses/TranslationModel/UG/mmsapt_align.cc +++ b/moses/TranslationModel/UG/mmsapt_align.cc @@ -18,59 +18,70 @@ namespace Moses Mmsapt:: setWeights(vector const & w) { - assert(w.size() == this->numScoreComponents); + assert(w.size() == this->m_numScoreComponents); this->feature_weights = w; } struct PhraseAlnHyp { - int s1,e1,s2,e2; - uint64_t p1,p2; - float pscore; - sptr ps; - jstats const* js; - PhraseAlnHyp(PhrasePair& pp, + PhrasePair pp; + ushort s1,e1,s2,e2; // start and end positions + int prev; // preceding alignment hypothesis + float score; + bitvector scov; // source coverage + PhraseAlnHyp(PhrasePair const& ppx, int slen, pair const& sspan, - pair const& tspan, - sptr const& ps_, - jstats const* js_) - : js(js_) + pair const& tspan) + : pp(ppx), prev(-1), score(ppx.score), scov(slen) { - s1 = sspan.first; - e1 = sspan.second; - s2 = tspan.first; - e2 = tspan.second; - p1 = pp.p1; - p2 = pp.p2; - pscore = pp.score; - ps = ps_; - } - - PhraseAlnHyp(PhraseAlnHyp const& other) - : s1(other.s1), e1(other.e1), s2(other.s2), e2(other.e2) - , p1(other.p1), p2(other.p2), pscore(other.pscore), js(other.js) - { - ps = other.ps; + s1 = sspan.first; e1 = 
sspan.second; + s2 = tspan.first; e2 = tspan.second; } bool operator<(PhraseAlnHyp const& other) const { - return this->pscore < other.pscore; + return this->score < other.score; } bool operator>(PhraseAlnHyp const& other) const { - return this->pscore > other.pscore; + return this->score > other.score; + } + + PhraseOrientation + po_bwd(PhraseAlnHyp const* prev) const + { + if (s2 == 0) return po_first; + assert(prev); + assert(prev->e2 <= s2); + if (prev->e2 < s2) return po_other; + if (prev->e1 == s1) return po_mono; + if (prev->e1 < s1) return po_jfwd; + if (prev->s1 == e1) return po_swap; + if (prev->s1 > e1) return po_jbwd; + return po_other; + } + + PhraseOrientation + po_fwd(PhraseAlnHyp const* next) const + { + if (!next) return po_last; + assert(next->s2 >= e2); + if (next->s2 > e2) return po_other; /* target-side gap before the next phrase => other, mirroring the gap test in po_bwd */ + if (next->e1 == s1) return po_swap; + if (next->e1 < s1) return po_jbwd; + if (next->s1 == e1) return po_mono; + if (next->s1 > e1) return po_jfwd; + return po_other; } }; - sptr > Mmsapt:: align(string const& src, string const& trg) const { // For the time being, we consult only the fixed bitext. - // We might also consider the dynamic bitext. TO DO. + // We might also consider the dynamic bitext. => TO DO. 
vector s,t; btfix.V1->fillIdSeq(src,s); @@ -89,6 +100,8 @@ namespace Moses pid2span_t spid2span,tpid2span; vector > > spstats(s.size()); + // Fill the lookup maps for target span to target phrase id + // and vice versa: for (size_t i = 0; i < t.size(); ++i) { tsa::tree_iterator m(btfix.I2.get()); @@ -98,8 +111,10 @@ namespace Moses tpid2span[pid].push_back(pair(i,k+1)); tspan2pid[i][k] = pid; } - } + } // done filling lookup maps + // Fill the lookup maps for the source side, and gather + // phrase statistics from the bitext for (size_t i = 0; i < s.size(); ++i) { tsa::tree_iterator m(btfix.I1.get()); @@ -122,75 +137,75 @@ namespace Moses // now fill the association score table vector PP; vector cands; - vector > ahyps(t.size()); - for (pid2span_t::iterator x = spid2span.begin(); - x != spid2span.end(); ++x) + vector PAH; PAH.reserve(1000000); + vector > tpos2ahyp(t.size()); + typedef pid2span_t::iterator psiter; + typedef boost::unordered_map jStatsTable; + typedef pair xspan; + for (psiter L = spid2span.begin(); L != spid2span.end(); ++L) { - int i = x->second[0].first; - int k = x->second[0].second - i -1; + if (!L->second.size()) continue; // should never happen anyway + int i = L->second[0].first; + int k = L->second[0].second - i -1; sptr ps = spstats[i][k]; - boost::unordered_map & j = ps->trg; - typedef boost::unordered_map::iterator jiter; PhrasePair pp; - pp.init(x->first,*ps, this->m_numScoreComponents); - for (jiter y = j.begin(); y != j.end(); ++y) + pp.init(L->first,*ps, this->m_numScoreComponents); + jStatsTable & J = ps->trg; + for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y) { - pid2span_t::iterator z = tpid2span.find(y->first); - if (z != tpid2span.end()) + psiter R = tpid2span.find(y->first); + if (R == tpid2span.end()) continue; + pp.update(y->first, y->second); + calc_lex(btfix,pp); + calc_pfwd_fix(btfix,pp); + calc_pbwd_fix(btfix,pp); + pp.eval(this->feature_weights); + PP.push_back(pp); + BOOST_FOREACH(xspan const& sspan, 
L->second) { - pp.update(y->first, y->second); - calc_lex(btfix,pp); - calc_pfwd_fix(btfix,pp); - calc_pbwd_fix(btfix,pp); - pp.eval(this->feature_weights); - for (size_t js = 0; js < x->second.size(); ++js) + BOOST_FOREACH(xspan const& tspan, R->second) { - pair const& sspan = x->second[js]; - for (size_t jt = 0; jt < z->second.size(); ++jt) - { - pair & tspan = z->second[jt]; - PhraseAlnHyp pah(pp,sspan,tspan,ps,&y->second); - ahyps[tspan.first].push_back(pah); - } + tpos2ahyp[tspan.first].push_back(PAH.size()); + PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan)); } - // cands.push_back(pp); - // pair ss = x->second[0]; - // pair ts = tpid2span[y->first][0]; - // cout << btfix.V1->toString(&s[ss.first],&s[ss.second]) - // << " <=> " - // << btfix.V2->toString(&t[ts.first],&t[ts.second]) - // << endl; } } } + + // show what we have so far ... for (size_t s2 = 0; s2 < t.size(); ++s2) { - sort(ahyps[s2].begin(), ahyps[s2].end(), greater()); - for (size_t h = 0; h < ahyps[s2].size(); ++h) + VectorIndexSorter foo(PAH); + sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo); + for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h) { - PhraseAlnHyp const& ah = ahyps[s2][h]; - pstats const & s = *ah.ps; - cout << setw(10) << exp(ah.pscore) << " " - << btfix.T2->pid2str(btfix.V2.get(), ah.p2) + PhraseAlnHyp const& ah = PAH[tpos2ahyp[s2][h]]; + // pstats const & s = *ah.ps; + cout << setw(10) << exp(ah.score) << " " + << btfix.T2->pid2str(btfix.V2.get(), ah.pp.p2) << " <=> " - << btfix.T1->pid2str(btfix.V1.get(), ah.p1); - vector const& a = ah.js->aln()[0].second; + << btfix.T1->pid2str(btfix.V1.get(), ah.pp.p1); + vector const& a = ah.pp.aln; for (size_t u = 0; u +1 < a.size(); ++u) cout << " " << int(a[u+1]) << "-" << int(a[u]); cout << endl; - cout << " [first: " << s.ofwd[po_first] - << " last: " << s.ofwd[po_last] - << " mono: " << s.ofwd[po_mono] - << " jfwd: " << s.ofwd[po_jfwd] - << " swap: " << s.ofwd[po_swap] - << " jbwd: " << s.ofwd[po_jbwd] + float const* 
ofwdj = ah.pp.dfwd; + float const* obwdj = ah.pp.dbwd; + uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd; + uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd; + cout << " [first: " << ofwdj[po_first]<<"/"<