Added bias to bitext sampling.

This commit is contained in:
Ulrich Germann 2014-09-06 03:39:23 +01:00
parent cef6460981
commit a86d49fc88
8 changed files with 285 additions and 33 deletions

View File

@ -172,6 +172,7 @@ build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses
if [ option.get "with-mm" : : "yes" ]
{
alias mm :
moses/TranslationModel/UG//spe-check-coverage3
moses/TranslationModel/UG//spe-check-coverage2
moses/TranslationModel/UG//ptable-lookup
moses/TranslationModel/UG//sim-pe

View File

@ -65,6 +65,16 @@ $(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/util//kenutil
;
# Executable spe-check-coverage3, built from spe-check-coverage3.cc and
# linked against the Moses, UG (generic, mm, mmsapt), Boost and kenutil
# targets listed below.
exe spe-check-coverage3 :
spe-check-coverage3.cc
$(TOP)/moses//moses
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/util//kenutil
;
install $(PREFIX)/bin : try-align try-align2 ;
fakelib mmsapt : [ glob *.cpp mmsapt*.cc sapt*.cc ] ;

View File

@ -11,7 +11,9 @@ namespace Moses
namespace bitext
{
#if UG_BITEXT_TRACK_ACTIVE_THREADS
ThreadSafeCounter pstats::active;
#endif
pstats::
pstats()
@ -23,15 +25,15 @@ namespace Moses
{
ofwd[0] = ofwd[1] = ofwd[2] = ofwd[3] = ofwd[4] = ofwd[5] = ofwd[6] = 0;
obwd[0] = obwd[1] = obwd[2] = obwd[3] = obwd[4] = obwd[5] = obwd[6] = 0;
// if (++active%5 == 0)
// cerr << size_t(active) << " active pstats at " << __FILE__ << ":" << __LINE__ << endl;
}
pstats::
~pstats()
{
#if UG_BITEXT_TRACK_ACTIVE_THREADS
// counter may not exist any more at destruction time, so try ... catch
try { --active; } catch (...) {}
// counter may not exist any more at destruction time
#endif
}
void

View File

@ -15,6 +15,10 @@
//
// - use multiple agendas for better load balancing and to avoid
// competition for locks
//
#define UG_BITEXT_TRACK_ACTIVE_THREADS 0
#include <string>
#include <vector>
@ -133,7 +137,10 @@ namespace Moses {
struct
pstats
{
#if UG_BITEXT_TRACK_ACTIVE_THREADS
static ThreadSafeCounter active;
#endif
boost::mutex lock; // for parallel gathering of stats
boost::condition_variable ready; // consumers can wait for this data structure to be ready.
@ -463,7 +470,8 @@ namespace Moses {
mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
private:
sptr<pstats>
prep2(iter const& phrase, size_t const max_sample) const;
prep2(iter const& phrase, size_t const max_sample,
vector<float> const* const bias) const;
public:
Bitext(size_t const max_sample =1000,
size_t const xnum_workers =16);
@ -481,17 +489,19 @@ namespace Moses {
virtual void open(string const base, string const L1, string const L2) = 0;
// sptr<pstats> lookup(Phrase const& phrase, size_t factor) const;
sptr<pstats> lookup(iter const& phrase) const;
sptr<pstats> lookup(iter const& phrase, size_t const max_sample) const;
sptr<pstats> lookup(iter const& phrase,vector<float> const* const bias=NULL) const;
sptr<pstats> lookup(iter const& phrase, size_t const max_sample,
vector<float> const* const bias) const;
void
lookup(vector<Token> const& snt, TSA<Token>& idx,
vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest,
vector<vector<uint64_t> >* pidmap = NULL,
typename PhrasePair<Token>::Scorer* scorer=NULL,
vector<float> const* const bias=NULL,
bool multithread=true) const;
void prep(iter const& phrase) const;
void prep(iter const& phrase, vector<float> const* const bias) const;
void setDefaultSampleSize(size_t const max_samples);
size_t getDefaultSampleSize() const;
@ -576,7 +586,9 @@ namespace Moses {
boost::mutex lock;
class job
{
#if UG_BITEXT_TRACK_ACTIVE_THREADS
static ThreadSafeCounter active;
#endif
boost::mutex lock;
friend class agenda;
boost::taus88 rnd; // every job has its own pseudo random generator
@ -594,10 +606,13 @@ namespace Moses {
size_t len; // phrase length
bool fwd; // if true, source phrase is L1
sptr<pstats> stats; // stores statistics collected during sampling
vector<float> const* bias; // sentence-level bias for sampling
bool step(uint64_t & sid, uint64_t & offset); // select another occurrence
bool done() const;
job(typename TSA<Token>::tree_iterator const& m,
sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd);
sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd,
vector<float> const* const bias);
~job();
};
public:
@ -622,7 +637,9 @@ namespace Moses {
sptr<pstats>
add_job(typename TSA<Token>::tree_iterator const& phrase,
size_t const max_samples);
size_t const max_samples,
vector<float> const* const bias);
sptr<job> get_job();
};
@ -641,6 +658,8 @@ namespace Moses {
next = root->readOffset(next,stop,offset);
boost::lock_guard<boost::mutex> sguard(stats->lock);
if (stats->raw_cnt == ctr) ++stats->raw_cnt;
if (bias && bias->at(sid) == 0)
return false;
stats->sample_cnt++;
}
else
@ -654,14 +673,21 @@ namespace Moses {
boost::lock_guard<boost::mutex> sguard(stats->lock);
if (stats->raw_cnt == ctr) ++stats->raw_cnt;
size_t scalefac = (stats->raw_cnt - ctr++);
size_t rnum = scalefac*(rnd()/(rnd.max()+1.));
size_t rnum = scalefac * (rnd()/(rnd.max()+1.));
size_t th = (bias == NULL ? max_samples
: bias->at(sid) * bias->size() * max_samples);
#if 0
cerr << rnum << "/" << scalefac << " vs. "
<< max_samples - stats->good << " ("
<< max_samples << " - " << stats->good << ")"
<< endl;
<< " th=" << th;
if (bias)
cerr << " with bias " << bias->at(sid)
<< " => " << bias->at(sid) * bias->size();
else cerr << " without bias";
cerr << endl;
#endif
if (rnum < max_samples - stats->good)
if (rnum + stats->good < th)
{
stats->sample_cnt++;
ret = true;
@ -743,8 +769,7 @@ namespace Moses {
}
else if (!ag.bt.find_trg_phr_bounds
(sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd,
// NULL,NULL,true))
&aln,NULL,true))
&aln,NULL,true)) // NULL,NULL,true))
continue;
j->stats->lock.lock();
j->stats->good += 1;
@ -844,7 +869,9 @@ namespace Moses {
~job()
{
if (stats) stats.reset();
#if UG_BITEXT_TRACK_ACTIVE_THREADS
try { --active; } catch (...) {}
#endif
// counter may not exist any more at destruction time
}
@ -853,7 +880,8 @@ namespace Moses {
agenda::
job::
job(typename TSA<Token>::tree_iterator const& m,
sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd)
sptr<TSA<Token> > const& r, size_t maxsmpl,
bool isfwd, vector<float> const* const sntbias)
: rnd(0)
, rnddenom(rnd.max() + 1.)
, min_diverse(10)
@ -865,12 +893,15 @@ namespace Moses {
, ctr(0)
, len(m.size())
, fwd(isfwd)
, bias(sntbias)
{
stats.reset(new pstats());
stats->raw_cnt = m.approxOccurrenceCount();
#if UG_BITEXT_TRACK_ACTIVE_THREADS
// if (++active%5 == 0)
++active;
// cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl;
#endif
}
template<typename Token>
@ -878,12 +909,12 @@ namespace Moses {
Bitext<Token>::
agenda::
add_job(typename TSA<Token>::tree_iterator const& phrase,
size_t const max_samples)
size_t const max_samples, vector<float> const* const bias)
{
boost::unique_lock<boost::mutex> lk(this->lock);
static boost::posix_time::time_duration nodelay(0,0,0,0);
bool fwd = phrase.root == bt.I1.get();
sptr<job> j(new job(phrase, fwd ? bt.I1 : bt.I2, max_samples, fwd));
sptr<job> j(new job(phrase, fwd ? bt.I1 : bt.I2, max_samples, fwd, bias));
j->stats->register_worker();
joblist.push_back(j);
@ -1322,15 +1353,16 @@ namespace Moses {
template<typename Token>
void
Bitext<Token>::
prep(iter const& phrase) const
prep(iter const& phrase, vector<float> const* const bias) const
{
prep2(phrase, this->default_sample_size);
prep2(phrase, this->default_sample_size,bias);
}
template<typename Token>
sptr<pstats>
Bitext<Token>::
prep2(iter const& phrase, size_t const max_sample) const
prep2(iter const& phrase, size_t const max_sample,
vector<float> const* const bias) const
{
boost::lock_guard<boost::mutex> guard(this->lock);
if (!ag)
@ -1343,7 +1375,7 @@ namespace Moses {
#if 1
// use pcache only for plain sentence input
if (StaticData::Instance().GetInputType() == SentenceInput &&
max_sample == this->default_sample_size &&
max_sample == this->default_sample_size && bias == NULL &&
phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
{
// still need to test what a good caching threshold is
@ -1360,7 +1392,7 @@ namespace Moses {
// cerr << "NEW FREQUENT PHRASE: "
// << phrase.str(V1.get()) << " " << phrase.approxOccurrenceCount()
// << " at " << __FILE__ << ":" << __LINE__ << endl;
foo.first->second = ag->add_job(phrase, max_sample);
foo.first->second = ag->add_job(phrase, max_sample,NULL);
assert(foo.first->second);
}
assert(foo.first->second);
@ -1369,7 +1401,7 @@ namespace Moses {
}
else
#endif
ret = ag->add_job(phrase, max_sample);
ret = ag->add_job(phrase, max_sample,bias);
assert(ret);
return ret;
}
@ -1443,8 +1475,8 @@ namespace Moses {
lookup(vector<Token> const& snt, TSA<Token>& idx,
vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest,
vector<vector<uint64_t> >* pidmap,
typename PhrasePair<Token>::Scorer* scorer,
bool multithread) const
typename PhrasePair<Token>::Scorer* scorer,
vector<float> const* const bias, bool multithread) const
{
typedef vector<vector<sptr<vector<PhrasePair<Token> > > > > ret_t;
@ -1474,7 +1506,7 @@ namespace Moses {
pp.reset(new vector<PhrasePair<Token> >());
C.set(key,pp);
dest[i].push_back(pp);
sptr<pstats> x = prep2(m, this->default_sample_size);
sptr<pstats> x = prep2(m, this->default_sample_size,bias);
pstats2pplist<Token> w(m,*(fwd?T2:T1),x,*pp,scorer);
if (multithread)
{
@ -1495,9 +1527,9 @@ namespace Moses {
template<typename Token>
sptr<pstats>
Bitext<Token>::
lookup(iter const& phrase) const
lookup(iter const& phrase, vector<float> const* const bias) const
{
sptr<pstats> ret = prep2(phrase, this->default_sample_size);
sptr<pstats> ret = prep2(phrase, this->default_sample_size, bias);
assert(ret);
boost::lock_guard<boost::mutex> guard(this->lock);
if (this->num_workers <= 1)
@ -1514,7 +1546,8 @@ namespace Moses {
template<typename Token>
sptr<pstats>
Bitext<Token>::
lookup(iter const& phrase, size_t const max_sample) const
lookup(iter const& phrase, size_t const max_sample,
vector<float> const* const bias) const
{
sptr<pstats> ret = prep2(phrase, max_sample);
boost::lock_guard<boost::mutex> guard(this->lock);
@ -1558,12 +1591,13 @@ namespace Moses {
return (max_samples && stats->good >= max_samples) || next == stop;
}
#if UG_BITEXT_TRACK_ACTIVE_THREADS
template<typename TKN>
ThreadSafeCounter
Bitext<TKN>::
agenda::
job::active;
#endif
template<typename Token>
void

View File

@ -1072,6 +1072,13 @@ namespace Moses
// Convenience overload: forwards to the two-argument PrefixExists() with a
// NULL bias pointer, i.e. without a sentence-level sampling bias.
bool
Mmsapt::
PrefixExists(Moses::Phrase const& phrase) const
{
return PrefixExists(phrase,NULL);
}
bool
Mmsapt::
PrefixExists(Moses::Phrase const& phrase, vector<float> const* const bias) const
{
if (phrase.GetSize() == 0) return false;
vector<id_type> myphrase;
@ -1080,7 +1087,7 @@ namespace Moses
TSA<Token>::tree_iterator mfix(btfix.I1.get(),&myphrase[0],myphrase.size());
if (mfix.size() == myphrase.size())
{
btfix.prep(mfix);
btfix.prep(mfix,bias);
// cerr << phrase << " " << mfix.approxOccurrenceCount() << endl;
return true;
}
@ -1096,7 +1103,8 @@ namespace Moses
{
for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i)
mdyn.extend(myphrase[i]);
if (mdyn.size() == myphrase.size()) dyn->prep(mdyn);
// let's assume a uniform bias over the foreground corpus
if (mdyn.size() == myphrase.size()) dyn->prep(mdyn,NULL);
}
return mdyn.size() == myphrase.size();
}

View File

@ -264,6 +264,9 @@ namespace Moses
ProvidesPrefixCheck() const;
/// return true if prefix /phrase/ exists
bool
PrefixExists(Phrase const& phrase, vector<float> const* const bias) const;
bool
PrefixExists(Phrase const& phrase) const;

View File

@ -0,0 +1,194 @@
#include "mmsapt.h"
#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
#include "moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h"
#include <boost/foreach.hpp>
#include <boost/format.hpp>
#include <boost/tokenizer.hpp>
#include <boost/shared_ptr.hpp>
#include <algorithm>
#include <iostream>
#include <sstream>
using namespace Moses;
using namespace bitext;
using namespace std;
using namespace boost;
// Token type used for all token sequences in this tool.
typedef L2R_Token<SimpleWordId> Token;
// Bitext type of the background corpus /bg/ below.
typedef mmBitext<Token> mmbitext;
// Bitext type of the foreground corpus that main() builds from text files.
typedef imBitext<Token> imbitext;
// Iterator type used to walk the bitext indices (I1 / I2).
typedef Bitext<Token>::iter iter;
// Background bitext; main() opens it from the first three command line arguments.
mmbitext bg;
// Lines of the foreground source, target and alignment files; main() fills
// them via read_data(), one vector entry per line.
vector<string> src,trg,aln;
void
show(ostream& out, iter& f)
{
iter b(bg.I2.get(),f.getToken(0),f.size());
if (b.size() == f.size())
out << setw(12) << int(round(b.approxOccurrenceCount()));
else
out << string(12,' ');
out << " " << setw(5) << int(round(f.approxOccurrenceCount())) << " ";
out << f.str(bg.V1.get()) << endl;
}
// Recursively walk the subtree below /f/ (down / over / up) and call show()
// on the way back up for every non-empty node whose occurrence count is
// larger than 1 and larger than the reference count described below.
// On return, /f/ is back at the position it had on entry.
void
dump(ostream& out, iter& f)
{
  // Reference count: the node's own count by default ...
  float ref_cnt = 0;
  if (f.size())
    ref_cnt = f.approxOccurrenceCount();

  bool const has_children = f.down();
  if (has_children)
    {
      // ... replaced by the count of the first child if there is one.
      ref_cnt = f.approxOccurrenceCount();
      // visit the first child and then every sibling reachable via over()
      for (bool more = true; more; more = f.over())
        dump(out,f);
      f.up();
    }

  if (!f.size())
    return;
  if (!(ref_cnt < f.approxOccurrenceCount()))
    return;
  if (f.approxOccurrenceCount() > 1)
    show(out,f);
}
// Read the text file /fname/ line by line and append every line (without
// its line terminator) to /dest/. Entries already in /dest/ are kept.
// If the file cannot be opened, a warning is written to stderr and /dest/
// is left untouched, so that a missing input file does not go unnoticed.
void
read_data(string fname, vector<string>& dest)
{
  ifstream in(fname.c_str());
  if (!in)
    {
      cerr << "WARNING: could not open '" << fname << "' for reading." << endl;
      return;
    }
  string line;
  while (getline(in,line))
    dest.push_back(line);
  // no explicit close(): the ifstream destructor closes the file
}
// Write the sentence /snt/ to /out/ on a single line, one entry per token:
//   <position>:<word>[<comma-separated alignment links>]
// /V/ maps token ids to word strings; /a[i]/ holds the positions that token
// /i/ is linked to. /a/ must have at least snt.size() entries.
// All output goes to /out/ (previously the function wrote to cout and
// ignored the stream it was given).
void
show_snt(ostream& out, TokenIndex const& V, vector<Token> const& snt,
         vector<vector<int> > const& a)
{
  for (size_t i = 0; i < snt.size(); ++i)
    {
      out << format("%d:%s[") % i % V[snt[i].id()];
      for (size_t k = 0; k < a[i].size(); ++k)
        out << (k?",":"") << a[i][k];
      out << "] ";
    }
  out << endl;
}
// Debugging aid: print sentence pair /sid/ of the foreground data to cout.
// Output: the raw alignment line, the parsed alignment points, and then the
// source and the target sentence, each annotated with its alignment links.
// Alignment points are read from aln[sid] as "<int><char><int>" triples
// (e.g. "0-1"). Points that refer to positions outside the tokenized
// sentences are reported on stderr and skipped instead of being used as
// vector indices; an out-of-range /sid/ is reported and nothing is printed.
void show_pair(size_t const sid)
{
  if (sid >= src.size() || sid >= trg.size() || sid >= aln.size())
    {
      cerr << "show_pair: sentence id " << sid << " is out of range." << endl;
      return;
    }
  vector<Token> s,t;
  fill_token_seq(*bg.V1,src[sid],s);
  fill_token_seq(*bg.V2,trg[sid],t);
  vector<vector<int> > a1(s.size()),a2(t.size());
  istringstream buf(aln[sid]);
  cout << aln[sid] << endl;
  int i,k; char c;
  while (buf >> i >> c >> k)
    {
      // the indices come straight from the alignment file: validate them
      bool const i_ok = i >= 0 && size_t(i) < a1.size();
      bool const k_ok = k >= 0 && size_t(k) < a2.size();
      if (!i_ok || !k_ok)
        {
          cerr << "show_pair: ignoring out-of-range alignment point "
               << i << "-" << k << " in sentence pair " << sid << endl;
          continue;
        }
      a1[i].push_back(k);
      a2[k].push_back(i);
      cout << i << "-" << k << " ";
    }
  cout << endl;
  show_snt(cout,*bg.V1,s,a1);
  show_snt(cout,*bg.V2,t,a2);
}
// Usage: <bg base name> <L1> <L2> <fg base name>
//
// Opens the background bitext /bg/ from the first three arguments and builds
// a foreground bitext from the text files <fg base>.<L1>, <fg base>.<L2> and
// <fg base>.symal. Then, for every foreground sentence, it
//  - sets that sentence's entry in the bias vector to 0 for the duration of
//    the lookup (all other sentences carry the same weight 1/(N-1));
//    NOTE(review): presumably this excludes the sentence itself from
//    sampling (leave-one-out) -- confirm against Bitext::agenda::job::step;
//  - looks the sentence up in the foreground (with bias) and the background
//    (without bias);
//  - prints, for every phrase start position, the foreground phrases whose
//    background count is low or that are missing from the background, and
//    the foreground phrase pairs together with background statistics where
//    those are available.
int main(int argc, char* argv[])
{
  if (argc < 5)
    {
      cerr << "usage: " << argv[0]
           << " <bg base name> <L1> <L2> <fg base name>"
           << endl;
      exit(1);
    }
  bg.open(argv[1],argv[2],argv[3]);
  sptr<imbitext> fg(new imbitext(bg.V1,bg.V2));
  string base = argv[4];
  // make sure the base name ends in '.'; the empty() test keeps rbegin()
  // from being dereferenced on an empty string
  if (base.empty() || *base.rbegin() != '.') base += '.';
  string srcfile = base + argv[2];
  string trgfile = base + argv[3];
  string alnfile = base + "symal";
  read_data(srcfile,src);
  read_data(trgfile,trg);
  read_data(alnfile,aln);
  fg = fg->add(src,trg,aln);

  // Uniform weight over all sentences but one. With fewer than two
  // sentences there is nothing to distribute the weight over, so use 0
  // rather than dividing by zero.
  float const loo_weight = src.size() > 1 ? 1./(src.size()-1) : 0;
  vector<float> bias(src.size(),loo_weight);
  for (size_t sid = 0; sid < src.size(); ++sid)
    {
      bias[sid] = 0; // restored at the end of this iteration
      // cout << src[sid] << endl << trg[sid] << endl;
      // show_pair(sid);
      vector<Token> snt;
      fill_token_seq(*bg.V1,src[sid],snt);
      vector<vector<sptr<vector<PhrasePair<Token> > > > > FG,BG;
      fg->lookup(snt,*fg->I1,FG,NULL,NULL,&bias,true);
      // BG is not read below; the call is kept as in the original code
      bg.lookup(snt,*bg.I1,BG,NULL,NULL,NULL,true);
      for (size_t i = 0; i < snt.size(); ++i)
        {
          // m0 / m1 track the phrase starting at position i in the
          // foreground / background source-side index
          Bitext<Token>::iter m0(fg->I1.get());
          Bitext<Token>::iter m1(bg.I1.get());
          for (size_t k = 0; k < FG[i].size(); ++k)
            {
              if (!m0.extend(snt[i+k].id())) break;
              if (k && m0.approxOccurrenceCount() < 2) break;
              // m1 is only extended while it has kept up with m0; report
              // phrases that are rare in or absent from the background
              if (m1.size() == k && (!m1.extend(snt[i+k].id()) ||
                                     m1.approxOccurrenceCount() < 25))
                {
                  cout << toString((*fg->V1), m0.getToken(0), m0.size()) << " "
                       << int(m0.approxOccurrenceCount());
                  if (m1.size() == k + 1)
                    cout << " "<< int(m1.approxOccurrenceCount());
                  else if (m1.size())
                    cout << " ["<< int(m1.approxOccurrenceCount()) << "]";
                  else
                    cout << " NEW!";
                  cout << endl;
                }
              if (m0.approxOccurrenceCount() < 2) break;
              BOOST_FOREACH(PhrasePair<Token> const& pp, *FG[i][k])
                {
                  if (pp.joint < 2) continue;
                  sptr<pstats> bgstats;
                  jstats const* bgjstats = NULL;
                  Bitext<Token>::iter m2(bg.I2.get(), pp.start2, pp.len2);
                  // skip phrase pairs with a very frequent side in the background
                  if (m1.approxOccurrenceCount() > 5000 ||
                      m2.approxOccurrenceCount() > 5000)
                    continue;
                  if (m1.size() == pp.len1 && m2.size() == pp.len2)
                    {
                      // both sides exist in the background: fetch joint stats
                      bgstats = bg.lookup(m1,NULL);
                      if (bgstats)
                        {
                          pstats::trg_map_t::const_iterator mx;
                          mx = bgstats->trg.find(m2.getPid());
                          if (mx != bgstats->trg.end())
                            bgjstats = &mx->second;
                        }
                    }
                  cout << toString(*fg->V1, pp.start1, pp.len1) << " ::: "
                       << toString(*fg->V2, pp.start2, pp.len2) << " "
                       << format("[%u/%u/%u]") % pp.good1 % pp.joint % pp.good2;
                  if (bgjstats)
                    cout << " " << (format("[%u/%u/%u]")
                                    % bgstats->good % bgjstats->rcnt()
                                    % (bgjstats->cnt2() * bgstats->good
                                       / bgstats->raw_cnt));
                  else if (m1.size() == pp.len1)
                    cout << " " << int(m1.approxOccurrenceCount());
                  cout << endl;
                }
            }
        }
      bias[sid] = loo_weight;
    }
  exit(0);
}

View File

@ -729,7 +729,7 @@ int main(int argc, char* argv[])
for (size_t i = 0; i < A.size(); ++i)
{
cout << (*BT.V2)[snt1[i].id()] << ": ";
cout << (*BT.V1)[snt1[i].id()] << ": ";
for (size_t k=A[i].find_first(); k < A[i].size(); k=A[i].find_next(k))
cout << boost::format(" %d:%s") % k % (*BT.V2)[snt2[k].id()];
cout << endl;