Routine check-in.

This commit is contained in:
Ulrich Germann 2014-03-03 12:13:41 +00:00
parent 3f678fa689
commit 6c37b8d252
3 changed files with 254 additions and 120 deletions

View File

@ -45,12 +45,14 @@ namespace Moses
pstats::
add(uint64_t pid, float const w,
vector<uchar> const& a,
uint32_t const cnt2)
uint32_t const cnt2,
uint32_t fwd_o,
uint32_t bwd_o)
{
this->lock.lock();
jstats& entry = this->trg[pid];
this->lock.unlock();
entry.add(w,a,cnt2);
entry.add(w,a,cnt2,fwd_o,bwd_o);
if (this->good < entry.rcnt())
{
this->lock.lock();
@ -65,6 +67,8 @@ namespace Moses
jstats()
: my_rcnt(0), my_wcnt(0), my_cnt2(0)
{
ofwd[0] = ofwd[1] = ofwd[2] = ofwd[3] = ofwd[4] = ofwd[5] = ofwd[6] = 0;
obwd[0] = obwd[1] = obwd[2] = obwd[3] = obwd[4] = obwd[5] = obwd[6] = 0;
my_aln.reserve(1);
}
@ -74,11 +78,33 @@ namespace Moses
my_rcnt = other.rcnt();
my_wcnt = other.wcnt();
my_aln = other.aln();
for (int i = po_first; i <= po_other; i++)
{
ofwd[i] = other.ofwd[i];
obwd[i] = other.obwd[i];
}
}
uint32_t
jstats::
dcnt_fwd(PhraseOrientation const idx) const
{
  // Read accessor for the forward orientation counters, i.e. the
  // slots that add() bumps via its fwd_orient argument.
  // idx must not exceed po_other, the last slot of the array.
  assert(idx <= po_other);
  uint32_t const* const counters = ofwd;
  return counters[idx];
}
uint32_t
jstats::
dcnt_bwd(PhraseOrientation const idx) const
{
  // Read accessor for the backward orientation counters, i.e. the
  // slots that add() bumps via its bwd_orient argument.
  // idx must not exceed po_other, the last slot of the array.
  assert(idx <= po_other);
  uint32_t const* const counters = obwd;
  return counters[idx];
}
void
jstats::
add(float w, vector<uchar> const& a, uint32_t const cnt2)
add(float w, vector<uchar> const& a, uint32_t const cnt2,
uint32_t fwd_orient, uint32_t bwd_orient)
{
boost::lock_guard<boost::mutex> lk(this->lock);
my_rcnt += 1;
@ -95,6 +121,8 @@ namespace Moses
if (my_aln[i].first > my_aln[i/2].first)
push_heap(my_aln.begin(),my_aln.begin()+i+1);
}
++ofwd[fwd_orient];
++obwd[bwd_orient];
}
uint32_t
@ -159,7 +187,30 @@ namespace Moses
return this->score > other.score;
}
PhrasePair::PhrasePair() {}
// Default constructor: start from an all-zero record.
// The scalar counts, the score and the orientation distributions are
// zeroed explicitly so that a default-constructed PhrasePair can be
// copied (the copy constructor reads every one of these members)
// without touching indeterminate values. fvals and aln are vectors
// and default-construct to empty.
PhrasePair::
PhrasePair()
  : p1(0), p2(0),
    raw1(0), raw2(0),
    sample1(0), sample2(0),
    good1(0), good2(0),
    joint(0),
    score(0)
{
  for (size_t i = 0; i <= po_other; ++i)
    dfwd[i] = dbwd[i] = 0;
}
// Copy constructor: member-wise copy of all fields of o. The two
// fixed-size orientation arrays (dfwd, dbwd) are copied element by
// element in the body.
PhrasePair::
PhrasePair(PhrasePair const& o)
  : p1(o.p1), p2(o.p2),
    raw1(o.raw1), raw2(o.raw2),
    sample1(o.sample1), sample2(o.sample2),
    good1(o.good1), good2(o.good2),
    joint(o.joint),
    fvals(o.fvals),
    aln(o.aln),
    score(o.score)
{
  // one slot per orientation class, po_other being the last index
  size_t const numSlots = po_other + 1;
  for (size_t k = 0; k != numSlots; ++k)
    {
      dfwd[k] = o.dfwd[k];
      dbwd[k] = o.dbwd[k];
    }
}
void
PhrasePair::
@ -208,6 +259,19 @@ namespace Moses
assert(js.aln().size());
if (js.aln().size())
aln = js.aln()[0].second;
float total_fwd = 0, total_bwd = 0;
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
total_fwd += js.dcnt_fwd(po)+1;
total_bwd += js.dcnt_bwd(po)+1;
}
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
}
return *this;
}
@ -223,6 +287,12 @@ namespace Moses
aln = js1.aln()[0].second;
else if (js2.aln().size())
aln = js2.aln()[0].second;
// Fill the add-one smoothed orientation estimates from the combined
// counts of js1 and js2. The loop must include po_other itself (<=),
// as the other orientation loops in this file do; with a strict <
// the last slot of dfwd/dbwd would be left unset.
for (int i = po_first; i <= po_other; i++)
  {
    PhraseOrientation po = static_cast<PhraseOrientation>(i);
    dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
    dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
  }
return *this;
}
@ -238,6 +308,12 @@ namespace Moses
assert(js.aln().size());
if (js.aln().size())
aln = js.aln()[0].second;
for (int i = po_first; i <= po_other; i++)
{
PhraseOrientation po = static_cast<PhraseOrientation>(i);
dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
}
return *this;
}
@ -331,24 +407,53 @@ namespace Moses
bool
expand_phrase_pair
(vector<vector<ushort> >& a1,
(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
ushort const seed,
ushort const L1, // hard left limit source
ushort const R1, // hard right limit source
ushort const L2, // hard left limit target
ushort const R2, // hard right limit target
ushort & s1, ushort & e1, // start/end src phrase
ushort & s2, ushort & e2) // start/end trg phrase
ushort const s2, // next word on in target side
ushort const L1, ushort const R1, // limits of previous phrase
ushort & s1, ushort & e1, ushort& e2) // start/end src; end trg
{
if (a1[seed].size() == 0) return false;
assert(L1 <= seed);
assert(R1 > seed);
bitvector done1(a1.size());
bitvector done2(a2.size());
vector <pair<ushort,ushort> > agenda;
// x.first: side (1 or 2)
// x.second: word position
agenda.reserve(a1.size() + a2.size());
agenda.push_back(pair<ushort,ushort>(seed,0));
agenda.push_back(pair<ushort,ushort>(2,s2));
e2 = s2;
s1 = e1 = a2[s2].front();
if (s1 >= L1 && s1 < R1) return false;
for (size_t i = 1; i < a2[s2].size(); ++i)
{
ushort p = a2[s2][i];
if (s1 >= R1)
{ if (p < R1) return false; }
else if (e1 < L1)
{ if (p > L1) return false; }
if (s1 > p) s1 = p;
if (e1 < p) e1 = p;
}
done2.set(s2);
while (agenda.size())
{
ushort side = agenda.back().first;
ushort p = agenda.back().second;
agenda.pop_back();
if (side == 1)
{
done1.set(p);
BOOST_FOREACH(ushort i, a1[p])
{
if (i < seed) return false;
if (done2[i]) continue;
for (;e2 <= i;++e2)
if (!done2[e2])
agenda.push_back(pair<ushort,ushort>(2,e2));
if (e2 < i) e2 = i;
}
s1 = seed;
e1 = seed;
s2 = e2 = a1[seed].front();
@ -437,13 +542,16 @@ namespace Moses
{
size_t n2 = e2;
while (n2 < a2.size() && a2[n2].size() == 0) ++n2;
if (n2 == a2.size()) return po_last;
if (n2 == a2.size())
return po_last;
ushort ns1,ns2,ne1,ne2;
bool OK = expand_phrase_pair(a2,a1,n2,
e2, a2.size()-1,
0, a1.size()-1,
ns2,ne2,ns1,ne1);
if (!OK) return po_other;
if (ns1 >= e1)
{

View File

@ -56,13 +56,13 @@ namespace Moses {
enum PhraseOrientation
{
po_first=0,
po_mono=1,
po_jfwd=2,
po_swap=3,
po_jbwd=4,
po_last=5,
po_other=6
po_first,
po_mono,
po_jfwd,
po_swap,
po_jbwd,
po_last,
po_other
};
PhraseOrientation
@ -102,6 +102,7 @@ namespace Moses {
float my_wcnt; // weighted count
uint32_t my_cnt2;
vector<pair<size_t, vector<uchar> > > my_aln;
uint32_t ofwd[7], obwd[7];
public:
jstats();
jstats(jstats const& other);
@ -110,9 +111,12 @@ namespace Moses {
float wcnt() const;
vector<pair<size_t, vector<uchar> > > const & aln() const;
void add(float w, vector<uchar> const& a, uint32_t const cnt2);
void add(float w, vector<uchar> const& a, uint32_t const cnt2,
uint32_t fwd_orient, uint32_t bwd_orient);
void invalidate();
bool valid();
uint32_t dcnt_fwd(PhraseOrientation const idx) const;
uint32_t dcnt_bwd(PhraseOrientation const idx) const;
};
struct
@ -127,7 +131,7 @@ namespace Moses {
size_t sum_pairs;
size_t in_progress; // keeps track of how many threads are currently working on this
uint32_t ofwd[7], obwd[7];
uint32_t ofwd[po_other+1], obwd[po_other+1];
typename boost::unordered_map<uint64_t, jstats> trg;
pstats();
@ -135,8 +139,12 @@ namespace Moses {
void register_worker();
size_t count_workers() { return in_progress; }
bool add(uint64_t const pid, float const w,
vector<uchar> const& a, uint32_t const cnt2);
bool
add(uint64_t const pid,
float const w,
vector<uchar> const& a,
uint32_t const cnt2,
uint32_t fwd_o, uint32_t bwd_o);
};
class
@ -146,12 +154,15 @@ namespace Moses {
uint64_t p1, p2;
uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
vector<float> fvals;
float dfwd[po_other+1];
float dbwd[po_other+1];
vector<uchar> aln;
// float avlex12,avlex21; // average lexical probs (Moses std)
// float znlex1,znlex2; // zens-ney lexical smoothing
// float colex1,colex2; // based on raw lexical occurrences
float score;
PhrasePair();
PhrasePair(PhrasePair const& o);
bool operator<(PhrasePair const& other) const;
bool operator>(PhrasePair const& other) const;
bool operator<=(PhrasePair const& other) const;
@ -161,13 +172,15 @@ namespace Moses {
void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
size_t const numfeats);
PhrasePair const& update(uint64_t const pid2, jstats const& js);
PhrasePair const& update(uint64_t const pid2,
jstats const& js1,
jstats const& js2);
PhrasePair const& update(uint64_t const pid2,
size_t const raw2extra,
jstats const& js);
PhrasePair const&
update(uint64_t const pid2, jstats const& js);
PhrasePair const&
update(uint64_t const pid2, jstats const& js1, jstats const& js2);
PhrasePair const&
update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
float eval(vector<float> const& w);
};
@ -355,9 +368,6 @@ namespace Moses {
};
template<typename TKN>
class Bitext
{
@ -660,7 +670,8 @@ namespace Moses {
for (size_t i = e1; i <= e2; ++i)
{
if (!j->stats->add(b->getPid(),sample_weight,aln,
b->approxOccurrenceCount()))
b->approxOccurrenceCount(),
po_fwd,po_bwd))
{
for (size_t z = 0; z < j->len; ++z)
{

View File

@ -18,59 +18,70 @@ namespace Moses
Mmsapt::
setWeights(vector<float> const & w)
{
assert(w.size() == this->numScoreComponents);
assert(w.size() == this->m_numScoreComponents);
this->feature_weights = w;
}
struct PhraseAlnHyp
{
int s1,e1,s2,e2;
uint64_t p1,p2;
float pscore;
sptr<pstats> ps;
jstats const* js;
PhraseAlnHyp(PhrasePair& pp,
PhrasePair pp;
ushort s1,e1,s2,e2; // start and end positions
int prev; // preceding alignment hypothesis
float score;
bitvector scov; // source coverage
PhraseAlnHyp(PhrasePair const& ppx, int slen,
pair<uint32_t,uint32_t> const& sspan,
pair<uint32_t,uint32_t> const& tspan,
sptr<pstats> const& ps_,
jstats const* js_)
: js(js_)
pair<uint32_t,uint32_t> const& tspan)
: pp(ppx), prev(-1), score(ppx.score), scov(slen)
{
s1 = sspan.first;
e1 = sspan.second;
s2 = tspan.first;
e2 = tspan.second;
p1 = pp.p1;
p2 = pp.p2;
pscore = pp.score;
ps = ps_;
}
PhraseAlnHyp(PhraseAlnHyp const& other)
: s1(other.s1), e1(other.e1), s2(other.s2), e2(other.e2)
, p1(other.p1), p2(other.p2), pscore(other.pscore), js(other.js)
{
ps = other.ps;
s1 = sspan.first; e1 = sspan.second;
s2 = tspan.first; e2 = tspan.second;
}
bool operator<(PhraseAlnHyp const& other) const
{
return this->pscore < other.pscore;
return this->score < other.score;
}
bool operator>(PhraseAlnHyp const& other) const
{
return this->pscore > other.pscore;
return this->score > other.score;
}
// Classify the orientation of this hypothesis relative to the
// hypothesis *prev that precedes it on the target side.
// Returns po_first when this phrase starts at target position 0
// (prev is not consulted in that case); otherwise prev must be
// non-null and must not end after this phrase's target start.
PhraseOrientation
po_bwd(PhraseAlnHyp const* prev) const
{
  if (!s2) return po_first;
  assert(prev);
  assert(prev->e2 <= s2);

  PhraseOrientation verdict = po_other; // also covers source overlap
  if (prev->e2 < s2)
    verdict = po_other;   // target-side gap between prev and this
  else if (prev->e1 == s1)
    verdict = po_mono;    // prev ends exactly where this starts (source)
  else if (prev->e1 < s1)
    verdict = po_jfwd;    // prev ends before this starts, not adjacent
  else if (prev->s1 == e1)
    verdict = po_swap;    // prev starts exactly where this ends (source)
  else if (prev->s1 > e1)
    verdict = po_jbwd;    // prev starts after this ends, not adjacent
  return verdict;
}
// Classify the orientation of this hypothesis relative to the
// hypothesis *next that follows it on the target side.
// Returns po_last when there is no following hypothesis (next is
// null); otherwise next must not start before this phrase's target
// end.
PhraseOrientation
po_fwd(PhraseAlnHyp const* next) const
{
  if (!next) return po_last;
  assert(next->s2 >= e2);

  PhraseOrientation verdict = po_other; // also covers source overlap
  // NOTE(review): this test can only fire when the assert above is
  // compiled out; po_bwd tests for a target-side *gap* at this point,
  // so '>' may have been intended here -- confirm with the author.
  if (next->s2 < e2)
    verdict = po_other;
  else if (next->e1 == s1)
    verdict = po_swap;    // next ends exactly where this starts (source)
  else if (next->e1 < s1)
    verdict = po_jbwd;    // next ends before this starts, not adjacent
  else if (next->s1 == e1)
    verdict = po_mono;    // next starts exactly where this ends (source)
  else if (next->s1 > e1)
    verdict = po_jfwd;    // next starts after this ends, not adjacent
  return verdict;
}
};
sptr<vector<int> >
Mmsapt::
align(string const& src, string const& trg) const
{
// For the time being, we consult only the fixed bitext.
// We might also consider the dynamic bitext. TO DO.
// We might also consider the dynamic bitext. => TO DO.
vector<id_type> s,t;
btfix.V1->fillIdSeq(src,s);
@ -89,6 +100,8 @@ namespace Moses
pid2span_t spid2span,tpid2span;
vector<vector<sptr<pstats> > > spstats(s.size());
// Fill the lookup maps for target span to target phrase id
// and vice versa:
for (size_t i = 0; i < t.size(); ++i)
{
tsa::tree_iterator m(btfix.I2.get());
@ -98,8 +111,10 @@ namespace Moses
tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
tspan2pid[i][k] = pid;
}
}
} // done filling lookup maps
// Fill the lookup maps for the source side, and gather
// phrase statistics from the bitext
for (size_t i = 0; i < s.size(); ++i)
{
tsa::tree_iterator m(btfix.I1.get());
@ -122,75 +137,75 @@ namespace Moses
// now fill the association score table
vector<PhrasePair> PP;
vector<PhrasePair> cands;
vector<vector<PhraseAlnHyp> > ahyps(t.size());
for (pid2span_t::iterator x = spid2span.begin();
x != spid2span.end(); ++x)
vector<PhraseAlnHyp> PAH; PAH.reserve(1000000);
vector<vector<int> > tpos2ahyp(t.size());
typedef pid2span_t::iterator psiter;
typedef boost::unordered_map<uint64_t,jstats> jStatsTable;
typedef pair<uint32_t,uint32_t> xspan;
for (psiter L = spid2span.begin(); L != spid2span.end(); ++L)
{
int i = x->second[0].first;
int k = x->second[0].second - i -1;
if (!L->second.size()) continue; // should never happen anyway
int i = L->second[0].first;
int k = L->second[0].second - i -1;
sptr<pstats> ps = spstats[i][k];
boost::unordered_map<uint64_t,jstats> & j = ps->trg;
typedef boost::unordered_map<uint64_t,jstats>::iterator jiter;
PhrasePair pp;
pp.init(x->first,*ps, this->m_numScoreComponents);
for (jiter y = j.begin(); y != j.end(); ++y)
pp.init(L->first,*ps, this->m_numScoreComponents);
jStatsTable & J = ps->trg;
for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y)
{
pid2span_t::iterator z = tpid2span.find(y->first);
if (z != tpid2span.end())
psiter R = tpid2span.find(y->first);
if (R == tpid2span.end()) continue;
pp.update(y->first, y->second);
calc_lex(btfix,pp);
calc_pfwd_fix(btfix,pp);
calc_pbwd_fix(btfix,pp);
pp.eval(this->feature_weights);
PP.push_back(pp);
BOOST_FOREACH(xspan const& sspan, L->second)
{
pp.update(y->first, y->second);
calc_lex(btfix,pp);
calc_pfwd_fix(btfix,pp);
calc_pbwd_fix(btfix,pp);
pp.eval(this->feature_weights);
for (size_t js = 0; js < x->second.size(); ++js)
BOOST_FOREACH(xspan const& tspan, R->second)
{
pair<uint32_t,uint32_t> const& sspan = x->second[js];
for (size_t jt = 0; jt < z->second.size(); ++jt)
{
pair<uint32_t,uint32_t> & tspan = z->second[jt];
PhraseAlnHyp pah(pp,sspan,tspan,ps,&y->second);
ahyps[tspan.first].push_back(pah);
}
tpos2ahyp[tspan.first].push_back(PAH.size());
PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan));
}
// cands.push_back(pp);
// pair<uint32_t,uint32_t> ss = x->second[0];
// pair<uint32_t,uint32_t> ts = tpid2span[y->first][0];
// cout << btfix.V1->toString(&s[ss.first],&s[ss.second])
// << " <=> "
// << btfix.V2->toString(&t[ts.first],&t[ts.second])
// << endl;
}
}
}
// show what we have so far ...
for (size_t s2 = 0; s2 < t.size(); ++s2)
{
sort(ahyps[s2].begin(), ahyps[s2].end(), greater<PhraseAlnHyp>());
for (size_t h = 0; h < ahyps[s2].size(); ++h)
VectorIndexSorter<PhraseAlnHyp> foo(PAH);
sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo);
for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h)
{
PhraseAlnHyp const& ah = ahyps[s2][h];
pstats const & s = *ah.ps;
cout << setw(10) << exp(ah.pscore) << " "
<< btfix.T2->pid2str(btfix.V2.get(), ah.p2)
PhraseAlnHyp const& ah = PAH[tpos2ahyp[s2][h]];
// pstats const & s = *ah.ps;
cout << setw(10) << exp(ah.score) << " "
<< btfix.T2->pid2str(btfix.V2.get(), ah.pp.p2)
<< " <=> "
<< btfix.T1->pid2str(btfix.V1.get(), ah.p1);
vector<uchar> const& a = ah.js->aln()[0].second;
<< btfix.T1->pid2str(btfix.V1.get(), ah.pp.p1);
vector<uchar> const& a = ah.pp.aln;
for (size_t u = 0; u +1 < a.size(); ++u)
cout << " " << int(a[u+1]) << "-" << int(a[u]);
cout << endl;
cout << " [first: " << s.ofwd[po_first]
<< " last: " << s.ofwd[po_last]
<< " mono: " << s.ofwd[po_mono]
<< " jfwd: " << s.ofwd[po_jfwd]
<< " swap: " << s.ofwd[po_swap]
<< " jbwd: " << s.ofwd[po_jbwd]
float const* ofwdj = ah.pp.dfwd;
float const* obwdj = ah.pp.dbwd;
uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
cout << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
<< " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
<< " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
<< " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
<< " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
<< " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
<< "]" << endl
<< " [first: " << s.obwd[po_first]
<< " last: " << s.obwd[po_last]
<< " mono: " << s.obwd[po_mono]
<< " jfwd: " << s.obwd[po_jfwd]
<< " swap: " << s.obwd[po_swap]
<< " jbwd: " << s.obwd[po_jbwd]
<< " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
<< " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
<< " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
<< " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
<< " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
<< " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
<< "]" << endl;
}
}