From a87a9ff2070d32410ce2e22468de0e2f8cd5086b Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 30 Aug 2014 07:28:47 +0100 Subject: [PATCH] Moved class PhrasePair back to ug_bitext. Moved function expand() from mmsapt.cc to ug_bitext.h. Added new lookup function to class Bitext. Bug fixes related to inverse lookup in class Bitext. --- moses/TranslationModel/UG/mm/ug_bitext.h | 535 +++++++++++++++---- moses/TranslationModel/UG/mm/ug_phrasepair.h | 244 +-------- moses/TranslationModel/UG/mmsapt.cpp | 43 +- 3 files changed, 463 insertions(+), 359 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 01d8187d6..9d80d32fa 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -47,6 +47,8 @@ #include "ug_corpus_token.h" #include "tpt_pickler.h" #include "ug_lexical_phrase_scorer2.h" +#include "ug_phrasepair.h" +#include "ug_lru_cache.h" #define PSTATS_CACHE_THRESHOLD 50 @@ -57,6 +59,7 @@ namespace Moses { namespace bitext { template class Bitext; + template class PhrasePair; using namespace ugdiss; template class Bitext; @@ -160,6 +163,246 @@ namespace Moses { }; + template + string + toString(TokenIndex const& V, Token const* x, size_t const len) + { + if (!len) return ""; + UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!"); + ostringstream buf; + buf << V[x->id()]; + size_t i = 1; + for (x = x->next(); x && i < len; ++i, x = x->next()) + buf << " " << V[x->id()]; + UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!"); + return buf.str(); + } + + template + class + PhrasePair + { + public: + class Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; }; + Token const* start1; + Token const* start2; + uint32_t len1; + uint32_t len2; + uint64_t p1, p2; + uint32_t raw1,raw2,sample1,sample2,good1,good2,joint; + vector fvals; + float dfwd[po_other+1]; // distortion counts // counts or probs? + float dbwd[po_other+1]; // distortion counts + vector aln; + float score; + bool inverse; + PhrasePair() { }; + PhrasePair(PhrasePair const& o); + + PhrasePair const& operator+=(PhrasePair const& other); + + bool operator<(PhrasePair const& other) const; + bool operator>(PhrasePair const& other) const; + bool operator<=(PhrasePair const& other) const; + bool operator>=(PhrasePair const& other) const; + + void init(); + void init(uint64_t const pid1, bool is_inverse, + Token const* x, uint32_t const len, + pstats const* ps = NULL, size_t const numfeats=0); + + // void init(uint64_t const pid1, pstats const& ps, size_t const numfeats); + // void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2, + // size_t const numfeats); + + // PhrasePair const& + // update(uint64_t const pid2, size_t r2 = 0); + + PhrasePair const& + update(uint64_t const pid2, Token const* x, + uint32_t const len, jstats const& js); + + // PhrasePair const& + // update(uint64_t const pid2, jstats const& js1, jstats const& js2); + + // PhrasePair const& + // update(uint64_t const pid2, size_t const raw2extra, jstats const& js); + + // float + // eval(vector const& w); + + class SortByTargetIdSeq + { + public: + int cmp(PhrasePair const& a, PhrasePair const& b) const; + bool operator()(PhrasePair const& a, PhrasePair const& b) const; + }; + }; + + template + void + PhrasePair:: + init(uint64_t const pid1, bool is_inverse, Token const* x, uint32_t const len, + pstats const* ps, size_t const numfeats) + { + inverse = is_inverse; + start1 = x; len1 = len; + p1 = pid1; + p2 = 0; + if (ps) + { + raw1 = ps->raw_cnt; + sample1 = ps->sample_cnt; + good1 = ps->good; + } + else raw1 = sample1 = good1 = 0; + joint = 0; + good2 = 0; + sample2 = 0; + raw2 = 0; + fvals.resize(numfeats); + } + + template + PhrasePair const& + PhrasePair:: + update(uint64_t const pid2, + Token const* x, uint32_t const len, jstats const& js) + { + p2 = pid2; + start2 = x; len2 = len; + raw2 = js.cnt2(); + joint = js.rcnt(); + assert(js.aln().size()); + if (js.aln().size()) + aln = js.aln()[0].second; + float total_fwd = 0, total_bwd = 0; + for (int i = po_first; i <= po_other; i++) + { + PhraseOrientation po = static_cast(i); + total_fwd += js.dcnt_fwd(po)+1; + total_bwd += js.dcnt_bwd(po)+1; + } + + // should we do that here or leave the raw counts? + for (int i = po_first; i <= po_other; i++) + { + PhraseOrientation po = static_cast(i); + dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd; + dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd; + } + + return *this; + } + + template + bool + PhrasePair:: + operator<(PhrasePair const& other) const + { return this->score < other.score; } + + template + bool + PhrasePair:: + operator>(PhrasePair const& other) const + { return this->score > other.score; } + + template + bool + PhrasePair:: + operator<=(PhrasePair const& other) const + { return this->score <= other.score; } + + template + bool + PhrasePair:: + operator>=(PhrasePair const& other) const + { return this->score >= other.score; } + + template + PhrasePair const& + PhrasePair:: + operator+=(PhrasePair const& o) + { + raw1 += o.raw1; + raw2 += o.raw2; + sample1 += o.sample1; + sample2 += o.sample2; + good1 += o.good1; + good2 += o.good2; + joint += o.joint; + return *this; + } + + template + PhrasePair:: + PhrasePair(PhrasePair const& o) + : start1(o.start1) + , start2(o.start2) + , len1(o.len1) + , len2(o.len2) + , p1(o.p1) + , p2(o.p2) + , raw1(o.raw1) + , raw2(o.raw2) + , sample1(o.sample1) + , sample2(o.sample2) + , good1(o.good1) + , good2(o.good2) + , joint(o.joint) + , fvals(o.fvals) + , aln(o.aln) + , score(o.score) + , inverse(o.inverse) + { + for (size_t i = 0; i <= po_other; ++i) + { + dfwd[i] = o.dfwd[i]; + dbwd[i] = o.dbwd[i]; + } + } + + template + int + PhrasePair:: + SortByTargetIdSeq:: + cmp(PhrasePair const& a, PhrasePair const& b) const + { + size_t i = 0; + Token const* x = a.start2; + Token const* y = b.start2; + while (i < a.len2 && i < b.len2 && x->id() == y->id()) + { + x = x->next(); + y = y->next(); + ++i; + } + if (i == a.len2 && i == b.len2) return 0; + if (i == a.len2) return -1; + if (i == b.len2) return 1; + return x->id() < y->id() ? -1 : 1; + } + + template + bool + PhrasePair:: + SortByTargetIdSeq:: + operator()(PhrasePair const& a, PhrasePair const& b) const + { + return this->cmp(a,b) < 0; + } + + template + void + PhrasePair:: + init() + { + inverse = false; + len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0; + start1 = start2 = NULL; + p1 = p2 = 0; + } + template class Bitext { @@ -210,9 +453,14 @@ namespace Moses { #endif mutable pcache_t cache1,cache2; protected: + typedef typename + lru_cache::LRU_Cache > > + pplist_cache_t; + size_t default_sample_size; size_t num_workers; size_t m_pstats_cache_threshold; + mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2; private: sptr prep2(iter const& phrase, size_t const max_sample) const; @@ -235,6 +483,14 @@ namespace Moses { // sptr lookup(Phrase const& phrase, size_t factor) const; sptr lookup(iter const& phrase) const; sptr lookup(iter const& phrase, size_t const max_sample) const; + + void + lookup(vector const& snt, TSA& idx, + vector > > > >& dest, + vector >* pidmap = NULL, + typename PhrasePair::Scorer* scorer=NULL, + bool multithread=true) const; + void prep(iter const& phrase) const; void setDefaultSampleSize(size_t const max_samples); @@ -487,7 +743,8 @@ namespace Moses { } else if (!ag.bt.find_trg_phr_bounds (sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd, - NULL,NULL,true)) + // NULL,NULL,true)) + &aln,NULL,true)) continue; j->stats->lock.lock(); j->stats->good += 1; @@ -495,7 +752,8 @@ namespace Moses { ++j->stats->ofwd[po_fwd]; ++j->stats->obwd[po_bwd]; j->stats->lock.unlock(); - for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2) + // for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2) + for (size_t k = 1; k < aln.size(); k += 2) aln[k] += s2 - s1; Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid); float sample_weight = 1./((s2-s1+1)*(e2-e1+1)); @@ -567,8 +825,10 @@ namespace Moses { #endif } } - if (j->fwd && s < s2) - for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2) + // if (j->fwd && s < s2) + // for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2) + if (s < s2) + for (size_t k = 1; k < aln.size(); k += 2) --aln[k]; } // j->stats->lock.unlock(); @@ -584,7 +844,8 @@ namespace Moses { ~job() { if (stats) stats.reset(); - --active; + try { --active; } catch (...) {} + // counter may not exist any more at destruction time } template @@ -981,9 +1242,18 @@ namespace Moses { assert(T2); assert(Tx); - bitvector forbidden((flip ? T1 : T2)->sntLen(sid)); - size_t slen1 = (*T1).sntLen(sid); - size_t slen2 = (*T2).sntLen(sid); + size_t slen1,slen2; + if (flip) + { + slen1 = T2->sntLen(sid); + slen2 = T1->sntLen(sid); + } + else + { + slen1 = T1->sntLen(sid); + slen2 = T2->sntLen(sid); + } + bitvector forbidden(slen2); if (full_alignment) { if (slen1*slen2 > full_alignment->size()) @@ -1002,17 +1272,11 @@ namespace Moses { if (flip) { p = binread(p,trg); assert(p= slen1 || trg >= slen2) - { - ostringstream buf; - buf << "Alignment range error at sentence " << sid << "!" << endl - << src << "/" << slen1 << " " << trg << "/" << slen2 << endl; - cerr << buf.str() << endl; - UTIL_THROW(util::Exception, buf.str().c_str()); - } - + UTIL_THROW_IF2((src >= slen1 || trg >= slen2), + "Alignment range error at sentence " << sid << "!\n" + << src << "/" << slen1 << " " << + trg << "/" << slen2); + if (src < start || src >= stop) forbidden.set(trg); else @@ -1022,22 +1286,11 @@ namespace Moses { } if (core_alignment) { - if (flip) - { - aln1[trg].push_back(src); - aln2[src].push_back(trg); - } - else - { - aln1[src].push_back(trg); - aln2[trg].push_back(src); - } + aln1[src].push_back(trg); + aln2[trg].push_back(src); } if (full_alignment) - { - if (flip) full_alignment->set(trg*slen2 + src); - else full_alignment->set(src*slen2 + trg); - } + full_alignment->set(src*slen2 + trg); } for (size_t i = lft; i <= rgt; ++i) @@ -1051,67 +1304,17 @@ namespace Moses { if (core_alignment) { core_alignment->clear(); - if (flip) + for (size_t i = start; i < stop; ++i) { - for (size_t i = lft; i <= rgt; ++i) + BOOST_FOREACH(ushort x, aln1[i]) { - sort(aln1[i].begin(),aln1[i].end()); - BOOST_FOREACH(ushort x, aln1[i]) - { - core_alignment->push_back(i-lft); - core_alignment->push_back(x-start); - } + core_alignment->push_back(i-start); + core_alignment->push_back(x-lft); } } - else - { - for (size_t i = start; i < stop; ++i) - { - BOOST_FOREACH(ushort x, aln1[i]) - { - core_alignment->push_back(i-start); - core_alignment->push_back(x-lft); - } - } - } - // now determine fwd and bwd phrase orientation - if (flip) - { - po_fwd = find_po_fwd(aln2,aln1,start,stop,s1,e2); - po_bwd = find_po_bwd(aln2,aln1,start,stop,s1,e2); - } - else - { - po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2); - po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2); - } -#if 0 - // if (e1 - s1 > 3) - { - lock_guard guard(this->lock); - Token const* t1 = T1->sntStart(sid); - Token const* t2 = T2->sntStart(sid); - cout << "[" << start << ":" << stop << "] => [" - << s1 << ":" << s2 << ":" - << e1 << ":" << e2 << "]" << endl; - for (size_t k = start; k < stop; ++k) - cout << k-start << "." << (*V1)[t1[k].id()] << " "; - cout << endl; - for (size_t k = s1; k < e2;) - { - if (k == s2) cout << "["; - cout << int(k)-int(s2) << "." << (*V2)[t2[k].id()]; - if (++k == e1) cout << "] "; - else cout << " "; - } - cout << endl; - for (size_t k = 0; k < core_alignment->size(); k += 2) - cout << int((*core_alignment)[k]) << "-" << int((*core_alignment)[k+1]) << " "; - cout << "\n" << __FILE__ << ":" << __LINE__ << endl; - - } -#endif + po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2); + po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2); } return lft <= rgt; } @@ -1143,9 +1346,10 @@ namespace Moses { max_sample == this->default_sample_size && phrase.approxOccurrenceCount() > m_pstats_cache_threshold) { - // need to test what a good caching threshold is + // still need to test what a good caching threshold is // is caching here the cause of the apparent memory leak in - // confusion network decoding ???? + // confusion network decoding ???? No, it isn't. + // That was because of naive, brute-force input path generation. uint64_t pid = phrase.getPid(); pcache_t & cache(phrase.root == &(*this->I1) ? cache1 : cache2); pcache_t::value_type entry(pid,sptr()); @@ -1170,6 +1374,124 @@ namespace Moses { return ret; } + // worker for scoring and sorting phrase table entries in parallel + template + class pstats2pplist + { + Ttrack const& m_other; + sptr m_pstats; + vector >& m_pplist; + typename PhrasePair::Scorer const* m_scorer; + PhrasePair m_pp; + Token const* m_token; + size_t m_len; + uint64_t m_pid1; + bool m_is_inverse; + public: + + // CONSTRUCTOR + pstats2pplist(typename TSA::tree_iterator const& m, + Ttrack const& other, + sptr const& ps, + vector >& dest, + typename PhrasePair::Scorer const* scorer) + : m_other(other) + , m_pstats(ps) + , m_pplist(dest) + , m_scorer(scorer) + , m_token(m.getToken(0)) + , m_len(m.size()) + , m_pid1(m.getPid()) + , m_is_inverse(false) + { } + + // WORKER + void + operator()() + { + // wait till all statistics have been collected + boost::unique_lock lock(m_pstats->lock); + while (m_pstats->in_progress) + m_pstats->ready.wait(lock); + + m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0); + + // convert pstats entries to phrase pairs + pstats::trg_map_t::iterator a; + for (a = m_pstats->trg.begin(); a != m_pstats->trg.end(); ++a) + { + uint32_t sid,off,len; + parse_pid(a->first, sid, off, len); + m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second); + m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),m_pp.joint); + size_t J = m_pp.joint<<7; // hard coded threshold of 1/128 + if (m_pp.good1 > J || m_pp.good2 > J) continue; + if (m_scorer) + { + (*m_scorer)(m_pp); + } + m_pplist.push_back(m_pp); + } + greater > sorter; + if (m_scorer) sort(m_pplist.begin(), m_pplist.end(),sorter); + } + }; + + template + void + Bitext:: + lookup(vector const& snt, TSA& idx, + vector > > > >& dest, + vector >* pidmap, + typename PhrasePair::Scorer* scorer, + bool multithread) const + { + typedef vector > > > > ret_t; + + dest.clear(); + dest.resize(snt.size()); + if (pidmap) { pidmap->clear(); pidmap->resize(snt.size()); } + + // collect statistics in parallel, then build PT entries as + // the sampling finishes + bool fwd = &idx == I1.get(); + vector workers; // background threads doing the lookup + pplist_cache_t& C = (fwd ? m_pplist_cache1 : m_pplist_cache2); + if (C.capacity() < 100000) C.reserve(100000); + for (size_t i = 0; i < snt.size(); ++i) + { + dest[i].reserve(snt.size()-i); + typename TSA::tree_iterator m(&idx); + for (size_t k = i; k < snt.size() && m.extend(snt[k].id()); ++k) + { + uint64_t key = m.getPid(); + if (pidmap) (*pidmap)[i].push_back(key); + sptr > > pp = C.get(key); + if (pp) + dest[i].push_back(pp); + else + { + pp.reset(new vector >()); + C.set(key,pp); + dest[i].push_back(pp); + sptr x = prep2(m, this->default_sample_size); + pstats2pplist w(m,*(fwd?T2:T1),x,*pp,scorer); + if (multithread) + { + boost::thread* t = new boost::thread(w); + workers.push_back(t); + } + else w(); + } + } + } + for (size_t w = 0; w < workers.size(); ++w) + { + workers[w]->join(); + delete workers[w]; + } + } + template sptr Bitext:: @@ -1242,6 +1564,37 @@ namespace Moses { agenda:: job::active; + + template + void + expand(typename Bitext::iter const& m, + Bitext const& bt, + pstats const& ps, vector >& dest) + { + bool fwd = m.root == bt.I1.get(); + dest.reserve(ps.trg.size()); + PhrasePair pp; + pp.init(m.getPid(), !fwd, m.getToken(0), m.size(), &ps, 0); + // cout << HERE << " " << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << endl; + pstats::trg_map_t::const_iterator a; + for (a = ps.trg.begin(); a != ps.trg.end(); ++a) + { + uint32_t sid,off,len; + parse_pid(a->first, sid, off, len); + pp.update(a->first, (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off, + len, a->second); + dest.push_back(pp); + } +#if 0 + typename PhrasePair::SortByTargetIdSeq sorter; + sort(dest.begin(), dest.end(),sorter); + BOOST_FOREACH(PhrasePair const& p, dest) + cout << toString (*(fwd ? bt.V1 : bt.V2),p.start1,p.len1) << " ::: " + << toString (*(fwd ? bt.V2 : bt.V1),p.start2,p.len2) << " " + << p.joint << endl; +#endif + } + } // end of namespace bitext } // end of namespace moses #endif diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h index 8cd43dc18..a966d00dc 100644 --- a/moses/TranslationModel/UG/mm/ug_phrasepair.h +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -1,243 +1,13 @@ //-*- c++ -*- #pragma once -#include "ug_bitext.h" -using namespace ugdiss; -using namespace std; +// using namespace ugdiss; +// using namespace std; -namespace Moses { - namespace bitext - { - - template - string - toString(TokenIndex const& V, Token const* x, size_t const len) - { - if (!len) return ""; - UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!"); - ostringstream buf; - buf << V[x->id()]; - size_t i = 1; - for (x = x->next(); x && i < len; ++i, x = x->next()) - buf << " " << V[x->id()]; - UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!"); - return buf.str(); - } - - template - class - PhrasePair - { - public: - Token const* start1; - Token const* start2; - uint32_t len1; - uint32_t len2; - // uint64_t p1, p2; - uint32_t raw1,raw2,sample1,sample2,good1,good2,joint; - vector fvals; - float dfwd[po_other+1]; // distortion counts // counts or probs? - float dbwd[po_other+1]; // distortion counts - vector aln; - float score; - PhrasePair() { }; - PhrasePair(PhrasePair const& o); - - PhrasePair const& operator+=(PhrasePair const& other); - - bool operator<(PhrasePair const& other) const; - bool operator>(PhrasePair const& other) const; - bool operator<=(PhrasePair const& other) const; - bool operator>=(PhrasePair const& other) const; - - void init(); - void init(Token const* x, uint32_t const len, - pstats const* ps = NULL, size_t const numfeats=0); - - // void init(uint64_t const pid1, pstats const& ps, size_t const numfeats); - // void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2, - // size_t const numfeats); - - // PhrasePair const& - // update(uint64_t const pid2, size_t r2 = 0); - - PhrasePair const& - update(Token const* x, uint32_t const len, jstats const& js); - - // PhrasePair const& - // update(uint64_t const pid2, jstats const& js1, jstats const& js2); - - // PhrasePair const& - // update(uint64_t const pid2, size_t const raw2extra, jstats const& js); - - // float - // eval(vector const& w); - - class SortByTargetIdSeq - { - public: - int cmp(PhrasePair const& a, PhrasePair const& b) const; - bool operator()(PhrasePair const& a, PhrasePair const& b) const; - }; - }; - - template - void - PhrasePair:: - init(Token const* x, uint32_t const len, - pstats const* ps, size_t const numfeats) - { - start1 = x; len1 = len; - // p1 = pid1; - // p2 = 0; - if (ps) - { - raw1 = ps->raw_cnt; - sample1 = ps->sample_cnt; - good1 = ps->good; - } - else raw1 = sample1 = good1 = 0; - joint = 0; - good2 = 0; - sample2 = 0; - raw2 = 0; - fvals.resize(numfeats); - } - - template - PhrasePair const& - PhrasePair:: - update(Token const* x, uint32_t const len, jstats const& js) - { - // p2 = pid2; - start2 = x; len2 = len; - raw2 = js.cnt2(); - joint = js.rcnt(); - assert(js.aln().size()); - if (js.aln().size()) - aln = js.aln()[0].second; - float total_fwd = 0, total_bwd = 0; - for (int i = po_first; i <= po_other; i++) - { - PhraseOrientation po = static_cast(i); - total_fwd += js.dcnt_fwd(po)+1; - total_bwd += js.dcnt_bwd(po)+1; - } - - // should we do that here or leave the raw counts? - for (int i = po_first; i <= po_other; i++) - { - PhraseOrientation po = static_cast(i); - dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd; - dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd; - } - - return *this; - } - - template - bool - PhrasePair:: - operator<(PhrasePair const& other) const - { return this->score < other.score; } - - template - bool - PhrasePair:: - operator>(PhrasePair const& other) const - { return this->score > other.score; } - - template - bool - PhrasePair:: - operator<=(PhrasePair const& other) const - { return this->score <= other.score; } - - template - bool - PhrasePair:: - operator>=(PhrasePair const& other) const - { return this->score >= other.score; } - - template - PhrasePair const& - PhrasePair:: - operator+=(PhrasePair const& o) - { - raw1 += o.raw1; - raw2 += o.raw2; - sample1 += o.sample1; - sample2 += o.sample2; - good1 += o.good1; - good2 += o.good2; - joint += o.joint; - return *this; - } - - template - PhrasePair:: - PhrasePair(PhrasePair const& o) - : start1(o.start1) - , start2(o.start2) - , len1(o.len1) - , len2(o.len2) - , raw1(o.raw1) - , raw2(o.raw2) - , sample1(o.sample1) - , sample2(o.sample2) - , good1(o.good1) - , good2(o.good2) - , joint(o.joint) - , fvals(o.fvals) - , aln(o.aln) - , score(o.score) - { - for (size_t i = 0; i <= po_other; ++i) - { - dfwd[i] = o.dfwd[i]; - dbwd[i] = o.dbwd[i]; - } - } - - template - int - PhrasePair:: - SortByTargetIdSeq:: - cmp(PhrasePair const& a, PhrasePair const& b) const - { - size_t i = 0; - Token const* x = a.start2; - Token const* y = b.start2; - while (i < a.len2 && i < b.len2 && x->id() == y->id()) - { - x = x->next(); - y = y->next(); - ++i; - } - if (i == a.len2 && i == b.len2) return 0; - if (i == a.len2) return -1; - if (i == b.len2) return 1; - return x->id() < y->id() ? -1 : 1; - } - - template - bool - PhrasePair:: - SortByTargetIdSeq:: - operator()(PhrasePair const& a, PhrasePair const& b) const - { - return this->cmp(a,b) < 0; - } - - template - void - PhrasePair:: - init() - { - len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0; - start1 = start2 = NULL; - } +// namespace Moses { +// namespace bitext +// { - } // namespace bitext -} // namespace Moses +// } // namespace bitext +// } // namespace Moses diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index e59d4c61a..459c64fa1 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -796,34 +796,6 @@ namespace Moses assert(this->refCount == 0); } - template - void - expand(typename Bitext::iter const& m, - Bitext const& bt, - pstats const& ps, vector >& dest) - { - dest.reserve(ps.trg.size()); - PhrasePair pp; - pp.init(m.getToken(0), m.size(), &ps, 0); - // cout << HERE << " " << toString(*(bt.V1),pp.start1,pp.len1) << endl; - pstats::trg_map_t::const_iterator a; - for (a = ps.trg.begin(); a != ps.trg.end(); ++a) - { - uint32_t sid,off,len; - parse_pid(a->first, sid, off, len); - pp.update(bt.T2->sntStart(sid)+off, len, a->second); - dest.push_back(pp); - } - typename PhrasePair::SortByTargetIdSeq sorter; - sort(dest.begin(), dest.end(),sorter); -#if 0 - BOOST_FOREACH(PhrasePair const& p, dest) - cout << toString (*bt.V1,p.start1,p.len1) << " ::: " - << toString (*bt.V2,p.start2,p.len2) << " " - << p.joint << endl; -#endif - } - // This is not the most efficient way of phrase lookup! TargetPhraseCollection const* Mmsapt:: @@ -889,9 +861,18 @@ namespace Moses if (mdyn.size() == sphrase.size()) sdyn = dyn->lookup(mdyn); vector > ppfix,ppdyn; - if (sfix) expand(mfix, btfix, *sfix, ppfix); - if (sdyn) expand(mdyn, *dyn, *sdyn, ppdyn); - + PhrasePair::SortByTargetIdSeq sort_by_tgt_id; + if (sfix) + { + expand(mfix, btfix, *sfix, ppfix); + sort(ppfix.begin(), ppfix.end(),sort_by_tgt_id); + } + if (sdyn) + { + expand(mdyn, *dyn, *sdyn, ppdyn); + sort(ppdyn.begin(), ppdyn.end(),sort_by_tgt_id); + } + // now we have two lists of Phrase Pairs, let's merge them TargetPhraseCollectionWrapper* ret; ret = new TargetPhraseCollectionWrapper(revision,phrasekey);