Moved class PhrasePair back to ug_bitext.

Moved function expand() from mmsapt.cc to ug_bitext.h.
Added new lookup function to class Bitext.
Bug fixes related to inverse lookup in class Bitext.
This commit is contained in:
Ulrich Germann 2014-08-30 07:28:47 +01:00
parent 015d690b6f
commit a87a9ff207
3 changed files with 463 additions and 359 deletions

View File

@ -47,6 +47,8 @@
#include "ug_corpus_token.h"
#include "tpt_pickler.h"
#include "ug_lexical_phrase_scorer2.h"
#include "ug_phrasepair.h"
#include "ug_lru_cache.h"
#define PSTATS_CACHE_THRESHOLD 50
@ -57,6 +59,7 @@ namespace Moses {
namespace bitext
{
template<typename TKN> class Bitext;
template<typename TKN> class PhrasePair;
using namespace ugdiss;
template<typename TKN> class Bitext;
@ -160,6 +163,246 @@ namespace Moses {
};
// Render the first `len` tokens of the chain starting at `x` as a
// space-separated string, looking up each token id in `V`.
// Returns "" when len is 0. An error is raised through UTIL_THROW_IF2 when
// `x` is NULL or the chain ends before `len` tokens have been written.
template<typename Token>
string
toString(TokenIndex const& V, Token const* x, size_t const len)
{
  if (!len) return "";
  UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");

  ostringstream out;
  out << V[x->id()]; // first token goes in without a leading blank

  size_t i = 1;      // number of tokens written so far
  x = x->next();
  while (x && i < len)
    {
      out << " " << V[x->id()];
      ++i;
      x = x->next();
    }

  // the chain must have supplied exactly len tokens
  UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!");
  return out.str();
}
template<typename Token>
class
PhrasePair
{
public:
class Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; };
Token const* start1;
Token const* start2;
uint32_t len1;
uint32_t len2;
uint64_t p1, p2;
uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
vector<float> fvals;
float dfwd[po_other+1]; // distortion counts // counts or probs?
float dbwd[po_other+1]; // distortion counts
vector<uchar> aln;
float score;
bool inverse;
PhrasePair() { };
PhrasePair(PhrasePair const& o);
PhrasePair const& operator+=(PhrasePair const& other);
bool operator<(PhrasePair const& other) const;
bool operator>(PhrasePair const& other) const;
bool operator<=(PhrasePair const& other) const;
bool operator>=(PhrasePair const& other) const;
void init();
void init(uint64_t const pid1, bool is_inverse,
Token const* x, uint32_t const len,
pstats const* ps = NULL, size_t const numfeats=0);
// void init(uint64_t const pid1, pstats const& ps, size_t const numfeats);
// void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
// size_t const numfeats);
// PhrasePair const&
// update(uint64_t const pid2, size_t r2 = 0);
PhrasePair const&
update(uint64_t const pid2, Token const* x,
uint32_t const len, jstats const& js);
// PhrasePair const&
// update(uint64_t const pid2, jstats const& js1, jstats const& js2);
// PhrasePair const&
// update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
// float
// eval(vector<float> const& w);
class SortByTargetIdSeq
{
public:
int cmp(PhrasePair const& a, PhrasePair const& b) const;
bool operator()(PhrasePair const& a, PhrasePair const& b) const;
};
};
// Set up side 1 of the phrase pair and zero everything that belongs to
// side 2 or to the pair as a whole.
//   pid1       phrase id stored in p1
//   is_inverse stored in the inverse flag
//   x, len     start token and length of the side-1 phrase
//   ps         optional statistics for the side-1 phrase; if NULL, the
//              side-1 counts are set to 0
//   numfeats   new size of fvals
template<typename Token>
void
PhrasePair<Token>::
init(uint64_t const pid1, bool is_inverse, Token const* x, uint32_t const len,
     pstats const* ps, size_t const numfeats)
{
  // side 1: identity ...
  p1      = pid1;
  start1  = x;
  len1    = len;
  inverse = is_inverse;

  // ... and counts (all zero when no statistics were supplied)
  raw1    = ps ? ps->raw_cnt    : 0;
  sample1 = ps ? ps->sample_cnt : 0;
  good1   = ps ? ps->good       : 0;

  // side 2 and joint counts are filled in later by update()
  p2      = 0;
  raw2    = 0;
  sample2 = 0;
  good2   = 0;
  joint   = 0;

  fvals.resize(numfeats);
}
// Fill in side 2 of the phrase pair from joint statistics.
//   pid2    phrase id stored in p2
//   x, len  start token and length of the side-2 phrase
//   js      joint statistics: supplies raw2 (cnt2), joint (rcnt), the
//           alignment and the forward/backward orientation counts
// Returns *this.
template<typename Token>
PhrasePair<Token> const&
PhrasePair<Token>::
update(uint64_t const pid2,
       Token const* x, uint32_t const len, jstats const& js)
{
  p2 = pid2;
  start2 = x; len2 = len;
  raw2 = js.cnt2();
  joint = js.rcnt();

  // Take the first alignment listed in js (presumably the most frequent
  // one -- confirm against jstats).
  assert(js.aln().size());
  if (js.aln().size())
    aln = js.aln()[0].second;
  else
    // With asserts compiled out we can get here; a PhrasePair is typically
    // re-used for many target phrases, so drop the alignment left over from
    // the previous update() instead of attributing it to this phrase.
    aln.clear();

  // add-one smoothed totals over all orientation classes
  float total_fwd = 0, total_bwd = 0;
  for (int i = po_first; i <= po_other; i++)
    {
      PhraseOrientation po = static_cast<PhraseOrientation>(i);
      total_fwd += js.dcnt_fwd(po)+1;
      total_bwd += js.dcnt_bwd(po)+1;
    }

  // should we do that here or leave the raw counts?
  // dfwd / dbwd end up holding smoothed relative frequencies
  for (int i = po_first; i <= po_other; i++)
    {
      PhraseOrientation po = static_cast<PhraseOrientation>(i);
      dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
      dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
    }
  return *this;
}
// True iff this phrase pair has a strictly lower score than `other`.
template<typename Token>
bool
PhrasePair<Token>::
operator<(PhrasePair const& other) const
{
  return other.score > score;
}
// True iff this phrase pair has a strictly higher score than `other`.
template<typename Token>
bool
PhrasePair<Token>::
operator>(PhrasePair const& other) const
{
  return other.score < score;
}
// True iff this phrase pair's score is lower than or equal to that of `other`.
template<typename Token>
bool
PhrasePair<Token>::
operator<=(PhrasePair const& other) const
{
  return other.score >= score;
}
// True iff this phrase pair's score is higher than or equal to that of `other`.
template<typename Token>
bool
PhrasePair<Token>::
operator>=(PhrasePair const& other) const
{
  return other.score <= score;
}
// Add the occurrence counts of `o` to the counts of this phrase pair.
// Only the seven count fields are touched; phrase pointers, ids, alignment,
// orientation statistics, feature values and score stay as they are.
template<typename Token>
PhrasePair<Token> const&
PhrasePair<Token>::
operator+=(PhrasePair const& o)
{
  // side 1
  raw1    += o.raw1;
  sample1 += o.sample1;
  good1   += o.good1;

  // side 2
  raw2    += o.raw2;
  sample2 += o.sample2;
  good2   += o.good2;

  // both sides together
  joint   += o.joint;

  return *this;
}
// Copy constructor: member-wise copy of every field, including the two
// fixed-size orientation arrays.
template<typename Token>
PhrasePair<Token>::
PhrasePair(PhrasePair<Token> const& o)
  : start1(o.start1),   start2(o.start2)
  , len1(o.len1),       len2(o.len2)
  , p1(o.p1),           p2(o.p2)
  , raw1(o.raw1),       raw2(o.raw2)
  , sample1(o.sample1), sample2(o.sample2)
  , good1(o.good1),     good2(o.good2)
  , joint(o.joint)
  , fvals(o.fvals)
  , aln(o.aln)
  , score(o.score)
  , inverse(o.inverse)
{
  // plain arrays cannot go into the initializer list; copy them by element
  size_t const num_orientations = po_other + 1;
  for (size_t k = 0; k < num_orientations; ++k)
    dfwd[k] = o.dfwd[k];
  for (size_t k = 0; k < num_orientations; ++k)
    dbwd[k] = o.dbwd[k];
}
// Three-way lexicographic comparison of the side-2 token id sequences of
// a and b. Returns 0 if the sequences are identical, -1 if a sorts before b
// (including the case that a is a proper prefix of b), and 1 otherwise.
template<typename Token>
int
PhrasePair<Token>::
SortByTargetIdSeq::
cmp(PhrasePair const& a, PhrasePair const& b) const
{
  Token const* ta = a.start2;
  Token const* tb = b.start2;

  // skip the common prefix
  size_t pos = 0;
  for (; pos < a.len2 && pos < b.len2; ++pos)
    {
      if (ta->id() != tb->id()) break;
      ta = ta->next();
      tb = tb->next();
    }

  bool const a_exhausted = (pos == a.len2);
  bool const b_exhausted = (pos == b.len2);
  if (a_exhausted) return b_exhausted ? 0 : -1; // equal, or a is a prefix of b
  if (b_exhausted) return 1;                    // b is a prefix of a

  // first position where the two phrases differ decides
  return ta->id() < tb->id() ? -1 : 1;
}
// "Less than" predicate for sorting: true iff cmp() ranks a before b.
template<typename Token>
bool
PhrasePair<Token>::
SortByTargetIdSeq::
operator()(PhrasePair const& a, PhrasePair const& b) const
{
  int const order = this->cmp(a,b);
  return order < 0;
}
// Reset the phrase pair: no phrases, zero ids, zero counts, not inverse.
// fvals, aln, the orientation arrays and score are not touched.
template<typename Token>
void
PhrasePair<Token>::
init()
{
  // phrase pointers, lengths and ids
  start1 = NULL;
  start2 = NULL;
  len1 = 0;
  len2 = 0;
  p1 = 0;
  p2 = 0;

  // counts
  raw1 = 0;
  raw2 = 0;
  sample1 = 0;
  sample2 = 0;
  good1 = 0;
  good2 = 0;
  joint = 0;

  inverse = false;
}
template<typename TKN>
class Bitext
{
@ -210,9 +453,14 @@ namespace Moses {
#endif
mutable pcache_t cache1,cache2;
protected:
typedef typename
lru_cache::LRU_Cache<uint64_t, vector<PhrasePair<Token> > >
pplist_cache_t;
size_t default_sample_size;
size_t num_workers;
size_t m_pstats_cache_threshold;
mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
private:
sptr<pstats>
prep2(iter const& phrase, size_t const max_sample) const;
@ -235,6 +483,14 @@ namespace Moses {
// sptr<pstats> lookup(Phrase const& phrase, size_t factor) const;
sptr<pstats> lookup(iter const& phrase) const;
sptr<pstats> lookup(iter const& phrase, size_t const max_sample) const;
void
lookup(vector<Token> const& snt, TSA<Token>& idx,
vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest,
vector<vector<uint64_t> >* pidmap = NULL,
typename PhrasePair<Token>::Scorer* scorer=NULL,
bool multithread=true) const;
void prep(iter const& phrase) const;
void setDefaultSampleSize(size_t const max_samples);
@ -487,7 +743,8 @@ namespace Moses {
}
else if (!ag.bt.find_trg_phr_bounds
(sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd,
NULL,NULL,true))
// NULL,NULL,true))
&aln,NULL,true))
continue;
j->stats->lock.lock();
j->stats->good += 1;
@ -495,7 +752,8 @@ namespace Moses {
++j->stats->ofwd[po_fwd];
++j->stats->obwd[po_bwd];
j->stats->lock.unlock();
for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2)
// for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2)
for (size_t k = 1; k < aln.size(); k += 2)
aln[k] += s2 - s1;
Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid);
float sample_weight = 1./((s2-s1+1)*(e2-e1+1));
@ -567,8 +825,10 @@ namespace Moses {
#endif
}
}
if (j->fwd && s < s2)
for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2)
// if (j->fwd && s < s2)
// for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2)
if (s < s2)
for (size_t k = 1; k < aln.size(); k += 2)
--aln[k];
}
// j->stats->lock.unlock();
@ -584,7 +844,8 @@ namespace Moses {
~job()
{
if (stats) stats.reset();
--active;
try { --active; } catch (...) {}
// counter may not exist any more at destruction time
}
template<typename Token>
@ -981,9 +1242,18 @@ namespace Moses {
assert(T2);
assert(Tx);
bitvector forbidden((flip ? T1 : T2)->sntLen(sid));
size_t slen1 = (*T1).sntLen(sid);
size_t slen2 = (*T2).sntLen(sid);
size_t slen1,slen2;
if (flip)
{
slen1 = T2->sntLen(sid);
slen2 = T1->sntLen(sid);
}
else
{
slen1 = T1->sntLen(sid);
slen2 = T2->sntLen(sid);
}
bitvector forbidden(slen2);
if (full_alignment)
{
if (slen1*slen2 > full_alignment->size())
@ -1002,17 +1272,11 @@ namespace Moses {
if (flip) { p = binread(p,trg); assert(p<x); p = binread(p,src); }
else { p = binread(p,src); assert(p<x); p = binread(p,trg); }
// cerr << sid << " " << src << "/" << slen1 << " " << trg << "/"
// << slen2 << endl;
if (src >= slen1 || trg >= slen2)
{
ostringstream buf;
buf << "Alignment range error at sentence " << sid << "!" << endl
<< src << "/" << slen1 << " " << trg << "/" << slen2 << endl;
cerr << buf.str() << endl;
UTIL_THROW(util::Exception, buf.str().c_str());
}
UTIL_THROW_IF2((src >= slen1 || trg >= slen2),
"Alignment range error at sentence " << sid << "!\n"
<< src << "/" << slen1 << " " <<
trg << "/" << slen2);
if (src < start || src >= stop)
forbidden.set(trg);
else
@ -1022,22 +1286,11 @@ namespace Moses {
}
if (core_alignment)
{
if (flip)
{
aln1[trg].push_back(src);
aln2[src].push_back(trg);
}
else
{
aln1[src].push_back(trg);
aln2[trg].push_back(src);
}
aln1[src].push_back(trg);
aln2[trg].push_back(src);
}
if (full_alignment)
{
if (flip) full_alignment->set(trg*slen2 + src);
else full_alignment->set(src*slen2 + trg);
}
full_alignment->set(src*slen2 + trg);
}
for (size_t i = lft; i <= rgt; ++i)
@ -1051,67 +1304,17 @@ namespace Moses {
if (core_alignment)
{
core_alignment->clear();
if (flip)
for (size_t i = start; i < stop; ++i)
{
for (size_t i = lft; i <= rgt; ++i)
BOOST_FOREACH(ushort x, aln1[i])
{
sort(aln1[i].begin(),aln1[i].end());
BOOST_FOREACH(ushort x, aln1[i])
{
core_alignment->push_back(i-lft);
core_alignment->push_back(x-start);
}
core_alignment->push_back(i-start);
core_alignment->push_back(x-lft);
}
}
else
{
for (size_t i = start; i < stop; ++i)
{
BOOST_FOREACH(ushort x, aln1[i])
{
core_alignment->push_back(i-start);
core_alignment->push_back(x-lft);
}
}
}
// now determine fwd and bwd phrase orientation
if (flip)
{
po_fwd = find_po_fwd(aln2,aln1,start,stop,s1,e2);
po_bwd = find_po_bwd(aln2,aln1,start,stop,s1,e2);
}
else
{
po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2);
po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2);
}
#if 0
// if (e1 - s1 > 3)
{
lock_guard<mutex> guard(this->lock);
Token const* t1 = T1->sntStart(sid);
Token const* t2 = T2->sntStart(sid);
cout << "[" << start << ":" << stop << "] => ["
<< s1 << ":" << s2 << ":"
<< e1 << ":" << e2 << "]" << endl;
for (size_t k = start; k < stop; ++k)
cout << k-start << "." << (*V1)[t1[k].id()] << " ";
cout << endl;
for (size_t k = s1; k < e2;)
{
if (k == s2) cout << "[";
cout << int(k)-int(s2) << "." << (*V2)[t2[k].id()];
if (++k == e1) cout << "] ";
else cout << " ";
}
cout << endl;
for (size_t k = 0; k < core_alignment->size(); k += 2)
cout << int((*core_alignment)[k]) << "-" << int((*core_alignment)[k+1]) << " ";
cout << "\n" << __FILE__ << ":" << __LINE__ << endl;
}
#endif
po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2);
po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2);
}
return lft <= rgt;
}
@ -1143,9 +1346,10 @@ namespace Moses {
max_sample == this->default_sample_size &&
phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
{
// need to test what a good caching threshold is
// still need to test what a good caching threshold is
// is caching here the cause of the apparent memory leak in
// confusion network decoding ????
// confusion network decoding ???? No, it isn't.
// That was because of naive, brute-force input path generation.
uint64_t pid = phrase.getPid();
pcache_t & cache(phrase.root == &(*this->I1) ? cache1 : cache2);
pcache_t::value_type entry(pid,sptr<pstats>());
@ -1170,6 +1374,124 @@ namespace Moses {
return ret;
}
// worker for scoring and sorting phrase table entries in parallel
//
// Functor that waits until the statistics for one phrase are complete,
// converts every entry of the pstats target map into a PhrasePair, applies
// the optional scorer, and (if a scorer was given) sorts the resulting list
// by descending score. Results are appended to the vector given at
// construction time.
template<typename Token>
class pstats2pplist
{
  Ttrack<Token> const& m_other; // track on the other side of the bitext
  sptr<pstats> m_pstats;        // statistics to convert
  vector<PhrasePair<Token> >& m_pplist; // destination list
  typename PhrasePair<Token>::Scorer const* m_scorer; // may be NULL
  PhrasePair<Token> m_pp;       // scratch object re-used for every entry
  Token const* m_token;         // first token of the looked-up phrase
  size_t m_len;                 // length of the looked-up phrase
  uint64_t m_pid1;              // phrase id of the looked-up phrase
  // NOTE(review): always false here, whereas expand() passes !fwd to
  // PhrasePair::init() -- confirm that this is intended for lookups that
  // run on the second index.
  bool m_is_inverse;
public:

  // CONSTRUCTOR
  //   m      iterator positioned on the phrase that was looked up
  //   other  track in which the counterparts of that phrase live
  //   ps     statistics for the phrase (may still be in progress)
  //   dest   list that receives the phrase pairs
  //   scorer optional scorer; NULL means no scoring and no sorting
  pstats2pplist(typename TSA<Token>::tree_iterator const& m,
                Ttrack<Token> const& other,
                sptr<pstats> const& ps,
                vector<PhrasePair<Token> >& dest,
                typename PhrasePair<Token>::Scorer const* scorer)
    : m_other(other)
    , m_pstats(ps)
    , m_pplist(dest)
    , m_scorer(scorer)
    , m_token(m.getToken(0))
    , m_len(m.size())
    , m_pid1(m.getPid())
    , m_is_inverse(false)
  { }

  // WORKER
  void
  operator()()
  {
    // wait till all statistics have been collected
    boost::unique_lock<boost::mutex> lock(m_pstats->lock);
    while (m_pstats->in_progress)
      m_pstats->ready.wait(lock);

    m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0);

    // convert pstats entries to phrase pairs
    pstats::trg_map_t::iterator a;
    for (a = m_pstats->trg.begin(); a != m_pstats->trg.end(); ++a)
      {
        uint32_t sid,off,len;
        parse_pid(a->first, sid, off, len);
        m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second);

        // Estimate good2 by scaling raw2 with the good1/raw1 ratio, but
        // never below the joint count. Guard against raw1 == 0 so that we
        // never convert inf/NaN to an integer.
        uint32_t const estimate = m_pp.raw1
          ? uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1)
          : 0;
        m_pp.good2 = max(estimate, m_pp.joint);

        // hard coded threshold of 1/128; widen before shifting so that
        // large joint counts cannot overflow 32 bits
        uint64_t const J = uint64_t(m_pp.joint) << 7;
        if (m_pp.good1 > J || m_pp.good2 > J) continue;
        if (m_scorer)
          {
            (*m_scorer)(m_pp);
          }
        m_pplist.push_back(m_pp);
      }
    // best-scoring entries first; without a scorer the order is left as is
    greater<PhrasePair<Token> > sorter;
    if (m_scorer) sort(m_pplist.begin(), m_pplist.end(),sorter);
  }
};
// Look up all phrases of a sentence that can be matched in idx.
//   snt         the sentence
//   idx         index to search; compared against I1 to decide which of
//               the two directions (and which cache) is used
//   dest        output; dest[i][j] is the phrase pair list for the phrase
//               that starts at position i and has length j+1
//   pidmap      optional output with the same layout, holding phrase ids
//   scorer      optional scorer handed on to the conversion workers
//   multithread if true, each uncached phrase is converted in its own
//               thread; all threads are joined before this function returns
// Lists found in the cache are returned as they are; new lists are
// registered in the cache before they are filled.
template<typename Token>
void
Bitext<Token>::
lookup(vector<Token> const& snt, TSA<Token>& idx,
       vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest,
       vector<vector<uint64_t> >* pidmap,
       typename PhrasePair<Token>::Scorer* scorer,
       bool multithread) const
{
  dest.clear();
  dest.resize(snt.size());
  if (pidmap) { pidmap->clear(); pidmap->resize(snt.size()); }

  // collect statistics in parallel, then build PT entries as
  // the sampling finishes
  bool fwd = &idx == I1.get();
  vector<boost::thread*> workers; // background threads doing the lookup
  pplist_cache_t& C = (fwd ? m_pplist_cache1 : m_pplist_cache2);
  if (C.capacity() < 100000) C.reserve(100000);

  try
    {
      for (size_t i = 0; i < snt.size(); ++i)
        {
          dest[i].reserve(snt.size()-i);
          typename TSA<Token>::tree_iterator m(&idx);
          // extend the match one token at a time until the index has
          // no continuation
          for (size_t k = i; k < snt.size() && m.extend(snt[k].id()); ++k)
            {
              uint64_t key = m.getPid();
              if (pidmap) (*pidmap)[i].push_back(key);
              sptr<vector<PhrasePair<Token> > > pp = C.get(key);
              if (pp)
                {
                  dest[i].push_back(pp);
                  continue;
                }
              pp.reset(new vector<PhrasePair<Token> >());
              C.set(key,pp);
              dest[i].push_back(pp);
              sptr<pstats> x = prep2(m, this->default_sample_size);
              pstats2pplist<Token> w(m,*(fwd?T2:T1),x,*pp,scorer);
              if (multithread)
                {
                  // reserve the slot first so that the thread object
                  // cannot leak if the vector fails to grow
                  workers.push_back(static_cast<boost::thread*>(NULL));
                  workers.back() = new boost::thread(w);
                }
              else w();
            }
        }
    }
  catch (...)
    {
      // Do not leave threads running (or leak them) when something above
      // throws: they write into lists that the caller is about to lose.
      for (size_t w = 0; w < workers.size(); ++w)
        {
          if (!workers[w]) continue;
          try { workers[w]->join(); } catch (...) {}
          delete workers[w];
        }
      throw;
    }

  for (size_t w = 0; w < workers.size(); ++w)
    {
      workers[w]->join();
      delete workers[w];
    }
}
template<typename Token>
sptr<pstats>
Bitext<Token>::
@ -1242,6 +1564,37 @@ namespace Moses {
agenda::
job::active;
// Convert the target map of ps into a list of phrase pairs, appended to dest
// in the iteration order of ps.trg (no sorting is done here).
//   m    iterator positioned on the phrase that ps belongs to
//   bt   the bitext; m.root is compared against bt.I1 to determine the
//        direction, which selects the track the counterparts are read from
//        and sets the inverse flag of the resulting pairs
//   ps   statistics for the phrase
//   dest receives one PhrasePair per entry in ps.trg
template<typename Token>
void
expand(typename Bitext<Token>::iter const& m,
       Bitext<Token> const& bt,
       pstats const& ps, vector<PhrasePair<Token> >& dest)
{
  bool const fwd = (m.root == bt.I1.get());
  dest.reserve(ps.trg.size());

  // one scratch object; only its side-2 part changes from entry to entry
  PhrasePair<Token> pp;
  pp.init(m.getPid(), !fwd, m.getToken(0), m.size(), &ps, 0);
  // cout << HERE << " " << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << endl;

  for (pstats::trg_map_t::const_iterator t = ps.trg.begin();
       t != ps.trg.end(); ++t)
    {
      // the key encodes where one occurrence of the counterpart can be found
      uint32_t sid,off,len;
      parse_pid(t->first, sid, off, len);
      Token const* trg_start = (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off;
      pp.update(t->first, trg_start, len, t->second);
      dest.push_back(pp);
    }
#if 0
  typename PhrasePair<Token>::SortByTargetIdSeq sorter;
  sort(dest.begin(), dest.end(),sorter);
  BOOST_FOREACH(PhrasePair<Token> const& p, dest)
    cout << toString (*(fwd ? bt.V1 : bt.V2),p.start1,p.len1) << " ::: "
         << toString (*(fwd ? bt.V2 : bt.V1),p.start2,p.len2) << " "
         << p.joint << endl;
#endif
}
} // end of namespace bitext
} // end of namespace moses
#endif

View File

@ -1,243 +1,13 @@
//-*- c++ -*-
#pragma once
#include "ug_bitext.h"
using namespace ugdiss;
using namespace std;
// using namespace ugdiss;
// using namespace std;
namespace Moses {
namespace bitext
{
template<typename Token>
string
toString(TokenIndex const& V, Token const* x, size_t const len)
{
if (!len) return "";
UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");
ostringstream buf;
buf << V[x->id()];
size_t i = 1;
for (x = x->next(); x && i < len; ++i, x = x->next())
buf << " " << V[x->id()];
UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!");
return buf.str();
}
template<typename Token>
class
PhrasePair
{
public:
Token const* start1;
Token const* start2;
uint32_t len1;
uint32_t len2;
// uint64_t p1, p2;
uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
vector<float> fvals;
float dfwd[po_other+1]; // distortion counts // counts or probs?
float dbwd[po_other+1]; // distortion counts
vector<uchar> aln;
float score;
PhrasePair() { };
PhrasePair(PhrasePair const& o);
PhrasePair const& operator+=(PhrasePair const& other);
bool operator<(PhrasePair const& other) const;
bool operator>(PhrasePair const& other) const;
bool operator<=(PhrasePair const& other) const;
bool operator>=(PhrasePair const& other) const;
void init();
void init(Token const* x, uint32_t const len,
pstats const* ps = NULL, size_t const numfeats=0);
// void init(uint64_t const pid1, pstats const& ps, size_t const numfeats);
// void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
// size_t const numfeats);
// PhrasePair const&
// update(uint64_t const pid2, size_t r2 = 0);
PhrasePair const&
update(Token const* x, uint32_t const len, jstats const& js);
// PhrasePair const&
// update(uint64_t const pid2, jstats const& js1, jstats const& js2);
// PhrasePair const&
// update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
// float
// eval(vector<float> const& w);
class SortByTargetIdSeq
{
public:
int cmp(PhrasePair const& a, PhrasePair const& b) const;
bool operator()(PhrasePair const& a, PhrasePair const& b) const;
};
};
template<typename Token>
void
PhrasePair<Token>::
init(Token const* x, uint32_t const len,
pstats const* ps, size_t const numfeats)
{
start1 = x; len1 = len;
// p1 = pid1;
// p2 = 0;
if (ps)
{
raw1 = ps->raw_cnt;
sample1 = ps->sample_cnt;
good1 = ps->good;
}
else raw1 = sample1 = good1 = 0;
joint = 0;
good2 = 0;
sample2 = 0;
raw2 = 0;
fvals.resize(numfeats);
}
template<typename Token>
PhrasePair<Token> const&
PhrasePair<Token>::
update(Token const* x, uint32_t const len, jstats const& js)
{
// p2 = pid2;
start2 = x; len2 = len;
raw2 = js.cnt2();
joint = js.rcnt();
assert(js.aln().size());
if (js.aln().size())
aln = js.aln()[0].second;
float total_fwd = 0, total_bwd = 0;
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
total_fwd += js.dcnt_fwd(po)+1;
total_bwd += js.dcnt_bwd(po)+1;
}
// should we do that here or leave the raw counts?
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
}
return *this;
}
template<typename Token>
bool
PhrasePair<Token>::
operator<(PhrasePair const& other) const
{ return this->score < other.score; }
template<typename Token>
bool
PhrasePair<Token>::
operator>(PhrasePair const& other) const
{ return this->score > other.score; }
template<typename Token>
bool
PhrasePair<Token>::
operator<=(PhrasePair const& other) const
{ return this->score <= other.score; }
template<typename Token>
bool
PhrasePair<Token>::
operator>=(PhrasePair const& other) const
{ return this->score >= other.score; }
template<typename Token>
PhrasePair<Token> const&
PhrasePair<Token>::
operator+=(PhrasePair const& o)
{
raw1 += o.raw1;
raw2 += o.raw2;
sample1 += o.sample1;
sample2 += o.sample2;
good1 += o.good1;
good2 += o.good2;
joint += o.joint;
return *this;
}
template<typename Token>
PhrasePair<Token>::
PhrasePair(PhrasePair<Token> const& o)
: start1(o.start1)
, start2(o.start2)
, len1(o.len1)
, len2(o.len2)
, raw1(o.raw1)
, raw2(o.raw2)
, sample1(o.sample1)
, sample2(o.sample2)
, good1(o.good1)
, good2(o.good2)
, joint(o.joint)
, fvals(o.fvals)
, aln(o.aln)
, score(o.score)
{
for (size_t i = 0; i <= po_other; ++i)
{
dfwd[i] = o.dfwd[i];
dbwd[i] = o.dbwd[i];
}
}
template<typename Token>
int
PhrasePair<Token>::
SortByTargetIdSeq::
cmp(PhrasePair const& a, PhrasePair const& b) const
{
size_t i = 0;
Token const* x = a.start2;
Token const* y = b.start2;
while (i < a.len2 && i < b.len2 && x->id() == y->id())
{
x = x->next();
y = y->next();
++i;
}
if (i == a.len2 && i == b.len2) return 0;
if (i == a.len2) return -1;
if (i == b.len2) return 1;
return x->id() < y->id() ? -1 : 1;
}
template<typename Token>
bool
PhrasePair<Token>::
SortByTargetIdSeq::
operator()(PhrasePair const& a, PhrasePair const& b) const
{
return this->cmp(a,b) < 0;
}
template<typename Token>
void
PhrasePair<Token>::
init()
{
len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
start1 = start2 = NULL;
}
// namespace Moses {
// namespace bitext
// {
} // namespace bitext
} // namespace Moses
// } // namespace bitext
// } // namespace Moses

View File

@ -796,34 +796,6 @@ namespace Moses
assert(this->refCount == 0);
}
template<typename Token>
void
expand(typename Bitext<Token>::iter const& m,
Bitext<Token> const& bt,
pstats const& ps, vector<PhrasePair<Token> >& dest)
{
dest.reserve(ps.trg.size());
PhrasePair<Token> pp;
pp.init(m.getToken(0), m.size(), &ps, 0);
// cout << HERE << " " << toString(*(bt.V1),pp.start1,pp.len1) << endl;
pstats::trg_map_t::const_iterator a;
for (a = ps.trg.begin(); a != ps.trg.end(); ++a)
{
uint32_t sid,off,len;
parse_pid(a->first, sid, off, len);
pp.update(bt.T2->sntStart(sid)+off, len, a->second);
dest.push_back(pp);
}
typename PhrasePair<Token>::SortByTargetIdSeq sorter;
sort(dest.begin(), dest.end(),sorter);
#if 0
BOOST_FOREACH(PhrasePair<Token> const& p, dest)
cout << toString (*bt.V1,p.start1,p.len1) << " ::: "
<< toString (*bt.V2,p.start2,p.len2) << " "
<< p.joint << endl;
#endif
}
// This is not the most efficient way of phrase lookup!
TargetPhraseCollection const*
Mmsapt::
@ -889,9 +861,18 @@ namespace Moses
if (mdyn.size() == sphrase.size()) sdyn = dyn->lookup(mdyn);
vector<PhrasePair<Token> > ppfix,ppdyn;
if (sfix) expand(mfix, btfix, *sfix, ppfix);
if (sdyn) expand(mdyn, *dyn, *sdyn, ppdyn);
PhrasePair<Token>::SortByTargetIdSeq sort_by_tgt_id;
if (sfix)
{
expand(mfix, btfix, *sfix, ppfix);
sort(ppfix.begin(), ppfix.end(),sort_by_tgt_id);
}
if (sdyn)
{
expand(mdyn, *dyn, *sdyn, ppdyn);
sort(ppdyn.begin(), ppdyn.end(),sort_by_tgt_id);
}
// now we have two lists of Phrase Pairs, let's merge them
TargetPhraseCollectionWrapper* ret;
ret = new TargetPhraseCollectionWrapper(revision,phrasekey);