Added bias to bitext sampling.

2024-09-17 14:17:13 +03:00 · 2014-09-06 03:39:23 +01:00 · 2014-09-06 03:39:23 +01:00 · a86d49fc88
commit a86d49fc88
parent cef6460981
8 changed files with 285 additions and 33 deletions
--- a/1
+++ b/1
@ -172,6 +172,7 @@ build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses
 if [ option.get "with-mm" : : "yes" ]
 {
 alias mm :  
+  moses/TranslationModel/UG//spe-check-coverage3
  moses/TranslationModel/UG//spe-check-coverage2
  moses/TranslationModel/UG//ptable-lookup 
  moses/TranslationModel/UG//sim-pe 
--- a/moses/TranslationModel/UG/Jamfile
+++ b/moses/TranslationModel/UG/Jamfile
@ -65,6 +65,16 @@ $(TOP)/moses/TranslationModel/UG//mmsapt
 $(TOP)/util//kenutil 
 ; 

+exe spe-check-coverage3 : 
+spe-check-coverage3.cc 
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic 
+$(TOP)//boost_iostreams 
+$(TOP)//boost_program_options 
+$(TOP)/moses/TranslationModel/UG/mm//mm 
+$(TOP)/moses/TranslationModel/UG//mmsapt 
+$(TOP)/util//kenutil 
+; 
 install $(PREFIX)/bin : try-align try-align2 ; 

 fakelib mmsapt : [ glob *.cpp mmsapt*.cc sapt*.cc ] ;
--- a/moses/TranslationModel/UG/mm/ug_bitext.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext.cc
@ -11,7 +11,9 @@ namespace Moses
  namespace bitext 
  {

+#if UG_BITEXT_TRACK_ACTIVE_THREADS
    ThreadSafeCounter pstats::active;
+#endif
    
    pstats::
    pstats()
@ -23,15 +25,15 @@ namespace Moses
    {
      ofwd[0] = ofwd[1] = ofwd[2] = ofwd[3] = ofwd[4] = ofwd[5] = ofwd[6] = 0;
      obwd[0] = obwd[1] = obwd[2] = obwd[3] = obwd[4] = obwd[5] = obwd[6] = 0;
-      // if (++active%5 == 0) 
-      // cerr << size_t(active) << " active pstats at " << __FILE__ << ":" << __LINE__ << endl;
    }

    pstats::
    ~pstats()
    {
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+      // counter may not exist any more at destruction time, so try ... catch
      try { --active; } catch (...) {} 
-      // counter may not exist any more at destruction time
+#endif
    }

    void
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@ -15,6 +15,10 @@
 //
 // - use multiple agendas for better load balancing and to avoid 
 //   competition for locks
+// 
+
+
+#define UG_BITEXT_TRACK_ACTIVE_THREADS 0

 #include <string>
 #include <vector>
@ -133,7 +137,10 @@ namespace Moses {
    struct 
    pstats
    {
+
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
      static ThreadSafeCounter active;
+#endif
      boost::mutex lock;               // for parallel gathering of stats
      boost::condition_variable ready; // consumers can wait for this data structure to be ready.
      
@ -463,7 +470,8 @@ namespace Moses {
      mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
    private:
      sptr<pstats> 
-	prep2(iter const& phrase, size_t const max_sample) const;
+      prep2(iter const& phrase, size_t const max_sample,
+	    vector<float> const* const bias) const;
    public:
      Bitext(size_t const max_sample =1000, 
 	     size_t const xnum_workers =16);
@ -481,17 +489,19 @@ namespace Moses {
      virtual void open(string const base, string const L1, string const L2) = 0;
      
      // sptr<pstats> lookup(Phrase const& phrase, size_t factor) const;
-      sptr<pstats> lookup(iter const& phrase) const;
-      sptr<pstats> lookup(iter const& phrase, size_t const max_sample) const;
+      sptr<pstats> lookup(iter const& phrase,vector<float> const* const bias=NULL) const;
+      sptr<pstats> lookup(iter const& phrase, size_t const max_sample,
+			  vector<float> const* const bias) const;

      void
      lookup(vector<Token> const& snt, TSA<Token>& idx, 
 	     vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest,
 	     vector<vector<uint64_t> >* pidmap = NULL,
 	     typename PhrasePair<Token>::Scorer* scorer=NULL, 
+	     vector<float> const* const bias=NULL,
 	     bool multithread=true) const;

-      void prep(iter const& phrase) const;
+      void prep(iter const& phrase, vector<float> const* const bias) const;

      void   setDefaultSampleSize(size_t const max_samples);
      size_t getDefaultSampleSize() const;
@ -576,7 +586,9 @@ namespace Moses {
      boost::mutex lock; 
      class job 
      {
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
 	static ThreadSafeCounter active;
+#endif
 	boost::mutex lock; 
 	friend class agenda;
 	boost::taus88 rnd;  // every job has its own pseudo random generator 
@ -594,10 +606,13 @@ namespace Moses {
 	size_t             len; // phrase length
 	bool               fwd; // if true, source phrase is L1 
 	sptr<pstats>     stats; // stores statistics collected during sampling
+	vector<float> const* bias; // sentence-level bias for sampling
+
 	bool step(uint64_t & sid, uint64_t & offset); // select another occurrence
 	bool done() const;
 	job(typename TSA<Token>::tree_iterator const& m, 
-	    sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd);
+	    sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd, 
+	    vector<float> const* const bias);
 	~job();
      };
    public:      
@ -622,7 +637,9 @@ namespace Moses {

      sptr<pstats> 
      add_job(typename TSA<Token>::tree_iterator const& phrase, 
-	      size_t const max_samples);
+	      size_t const max_samples, 
+	      vector<float> const* const bias);
+
      sptr<job> get_job();
    };
    
@ -641,6 +658,8 @@ namespace Moses {
 	  next = root->readOffset(next,stop,offset);
 	  boost::lock_guard<boost::mutex> sguard(stats->lock);
 	  if (stats->raw_cnt == ctr) ++stats->raw_cnt;
+	  if (bias && bias->at(sid) == 0)
+	    return false;
 	  stats->sample_cnt++;
 	}
      else 
@ -654,14 +673,21 @@ namespace Moses {
 		boost::lock_guard<boost::mutex> sguard(stats->lock); 
 		if (stats->raw_cnt == ctr) ++stats->raw_cnt;
 		size_t scalefac = (stats->raw_cnt - ctr++);
-		size_t rnum = scalefac*(rnd()/(rnd.max()+1.));
+		size_t rnum = scalefac * (rnd()/(rnd.max()+1.));
+		size_t th = (bias == NULL ? max_samples
+			     : bias->at(sid) * bias->size() * max_samples);
 #if 0
 		cerr << rnum << "/" << scalefac << " vs. " 
 		     << max_samples - stats->good << " ("
 		     << max_samples << " - " << stats->good << ")" 
-		     << endl;
+		     << " th=" << th;
+		if (bias) 
+		  cerr << " with bias " << bias->at(sid) 
+		       << " => " << bias->at(sid) * bias->size();
+		else cerr << " without bias";
+		cerr << endl;
 #endif
-		if (rnum < max_samples - stats->good)
+		if (rnum + stats->good < th)
 		  {
 		    stats->sample_cnt++;
 		    ret = true;
@ -743,8 +769,7 @@ namespace Moses {
 		}
 	      else if (!ag.bt.find_trg_phr_bounds
 		       (sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd,
-			// NULL,NULL,true))
-			&aln,NULL,true))
+			&aln,NULL,true)) // NULL,NULL,true))
 		continue;
 	      j->stats->lock.lock(); 
 	      j->stats->good += 1; 
@ -844,7 +869,9 @@ namespace Moses {
    ~job()
    {
      if (stats) stats.reset();
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
      try { --active; } catch (...) {} 
+#endif
      // counter may not exist any more at destruction time
    }

@ -853,7 +880,8 @@ namespace Moses {
    agenda::
    job::
    job(typename TSA<Token>::tree_iterator const& m, 
-	sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd)
+	sptr<TSA<Token> > const& r, size_t maxsmpl, 
+	bool isfwd, vector<float> const* const sntbias)
      : rnd(0)
      , rnddenom(rnd.max() + 1.)
      , min_diverse(10)
@ -865,12 +893,15 @@ namespace Moses {
      , ctr(0)
      , len(m.size())
      , fwd(isfwd)
+      , bias(sntbias)
    {
      stats.reset(new pstats());
      stats->raw_cnt = m.approxOccurrenceCount();
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
      // if (++active%5 == 0) 
      ++active;
      // cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl;
+#endif
    }

    template<typename Token>
@ -878,12 +909,12 @@ namespace Moses {
    Bitext<Token>::
    agenda::
    add_job(typename TSA<Token>::tree_iterator const& phrase, 
-	    size_t const max_samples)
+	    size_t const max_samples, vector<float> const* const bias)
    {
      boost::unique_lock<boost::mutex> lk(this->lock);
      static boost::posix_time::time_duration nodelay(0,0,0,0); 
      bool fwd = phrase.root == bt.I1.get();
-      sptr<job> j(new job(phrase, fwd ? bt.I1 : bt.I2, max_samples, fwd));
+      sptr<job> j(new job(phrase, fwd ? bt.I1 : bt.I2, max_samples, fwd, bias));
      j->stats->register_worker();
      
      joblist.push_back(j);
@ -1322,15 +1353,16 @@ namespace Moses {
    template<typename Token>
    void
    Bitext<Token>::
-    prep(iter const& phrase) const
+    prep(iter const& phrase, vector<float> const* const bias) const
    {
-      prep2(phrase, this->default_sample_size);
+      prep2(phrase, this->default_sample_size,bias);
    }

    template<typename Token>
    sptr<pstats> 
    Bitext<Token>::
-    prep2(iter const& phrase, size_t const max_sample) const
+    prep2(iter const& phrase, size_t const max_sample, 
+	  vector<float> const* const bias) const
    {
      boost::lock_guard<boost::mutex> guard(this->lock);
      if (!ag) 
@ -1343,7 +1375,7 @@ namespace Moses {
 #if 1
      // use pcache only for plain sentence input
      if (StaticData::Instance().GetInputType() == SentenceInput && 
-	  max_sample == this->default_sample_size && 
+	  max_sample == this->default_sample_size && bias == NULL && 
 	  phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
      	{
 	  // still need to test what a good caching threshold is
@ -1360,7 +1392,7 @@ namespace Moses {
 	      // cerr << "NEW FREQUENT PHRASE: "
 	      // << phrase.str(V1.get()) << " " << phrase.approxOccurrenceCount()  
 	      // << " at " << __FILE__ << ":" << __LINE__ << endl;
-	      foo.first->second = ag->add_job(phrase, max_sample);
+	      foo.first->second = ag->add_job(phrase, max_sample,NULL);
 	      assert(foo.first->second);
 	    }
 	  assert(foo.first->second);
@ -1369,7 +1401,7 @@ namespace Moses {
 	}
      else 
 #endif
-	ret = ag->add_job(phrase, max_sample);
+	ret = ag->add_job(phrase, max_sample,bias);
      assert(ret);
      return ret;
    }
@ -1443,8 +1475,8 @@ namespace Moses {
    lookup(vector<Token> const& snt, TSA<Token>& idx, 
 	   vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest,
 	   vector<vector<uint64_t> >* pidmap,
-	   typename PhrasePair<Token>::Scorer* scorer, 
-	   bool multithread) const
+	   typename PhrasePair<Token>::Scorer* scorer,
+	   vector<float> const* const bias, bool multithread) const
    {
      typedef vector<vector<sptr<vector<PhrasePair<Token> > > > > ret_t;
      
@ -1474,7 +1506,7 @@ namespace Moses {
 		  pp.reset(new vector<PhrasePair<Token> >());
 		  C.set(key,pp);
 		  dest[i].push_back(pp);
-		  sptr<pstats> x = prep2(m, this->default_sample_size);
+		  sptr<pstats> x = prep2(m, this->default_sample_size,bias);
 		  pstats2pplist<Token> w(m,*(fwd?T2:T1),x,*pp,scorer);
 		  if (multithread) 
 		    {
@ -1495,9 +1527,9 @@ namespace Moses {
    template<typename Token>
    sptr<pstats> 
    Bitext<Token>::
-    lookup(iter const& phrase) const
+    lookup(iter const& phrase, vector<float> const* const bias) const
    {
-      sptr<pstats> ret = prep2(phrase, this->default_sample_size);
+      sptr<pstats> ret = prep2(phrase, this->default_sample_size, bias);
      assert(ret);
      boost::lock_guard<boost::mutex> guard(this->lock);
      if (this->num_workers <= 1)
@ -1514,7 +1546,8 @@ namespace Moses {
    template<typename Token>
    sptr<pstats> 
    Bitext<Token>::
-    lookup(iter const& phrase, size_t const max_sample) const
+    lookup(iter const& phrase, size_t const max_sample,
+	   vector<float> const* const bias) const
    {
-      sptr<pstats> ret = prep2(phrase, max_sample);
+      sptr<pstats> ret = prep2(phrase, max_sample, bias); // forward the bias: prep2() has no default for it
      boost::lock_guard<boost::mutex> guard(this->lock);
@ -1558,12 +1591,13 @@ namespace Moses {
      return (max_samples && stats->good >= max_samples) || next == stop; 
    }

+#if UG_BITEXT_TRACK_ACTIVE_THREADS
    template<typename TKN>
    ThreadSafeCounter 
    Bitext<TKN>::
    agenda::
    job::active;
-
+#endif

    template<typename Token>
    void 
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@ -1072,6 +1072,13 @@ namespace Moses
  bool
  Mmsapt::
  PrefixExists(Moses::Phrase const& phrase) const
+  {
+    return PrefixExists(phrase,NULL); 
+  }
+
+  bool
+  Mmsapt::
+  PrefixExists(Moses::Phrase const& phrase, vector<float> const* const bias) const
  {
    if (phrase.GetSize() == 0) return false;
    vector<id_type> myphrase; 
@ -1080,7 +1087,7 @@ namespace Moses
    TSA<Token>::tree_iterator mfix(btfix.I1.get(),&myphrase[0],myphrase.size());
    if (mfix.size() == myphrase.size()) 
      {
-	btfix.prep(mfix);
+	btfix.prep(mfix,bias);
 	// cerr << phrase << " " << mfix.approxOccurrenceCount() << endl;
 	return true;
      }
@ -1096,7 +1103,8 @@ namespace Moses
      {
 	for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i)
 	  mdyn.extend(myphrase[i]);
-	if (mdyn.size() == myphrase.size()) dyn->prep(mdyn);
+	// let's assume a uniform bias over the foreground corpus
+	if (mdyn.size() == myphrase.size()) dyn->prep(mdyn,NULL);
      }
    return mdyn.size() == myphrase.size();
  }
--- a/moses/TranslationModel/UG/mmsapt.h
+++ b/moses/TranslationModel/UG/mmsapt.h
@ -264,6 +264,9 @@ namespace Moses
    ProvidesPrefixCheck() const;
    
    /// return true if prefix /phrase/ exists
+    bool
+    PrefixExists(Phrase const& phrase, vector<float> const* const bias) const;
+
    bool
    PrefixExists(Phrase const& phrase) const;

--- a/moses/TranslationModel/UG/spe-check-coverage3.cc
+++ b/moses/TranslationModel/UG/spe-check-coverage3.cc
@ -0,0 +1,194 @@
+#include "mmsapt.h"
+#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
+#include "moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h"
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+
+using namespace Moses;
+using namespace bitext;
+using namespace std;
+using namespace boost;
+
+typedef L2R_Token<SimpleWordId> Token;
+typedef mmBitext<Token> mmbitext; // type of the background bitext bg
+typedef imBitext<Token> imbitext; // type of the foreground bitext built in main()
+typedef Bitext<Token>::iter iter;
+
+mmbitext bg; // background bitext; opened in main() from argv[1..3]
+vector<string> src,trg,aln; // lines of <fg base>.<L1>, <fg base>.<L2>, <fg base>.symal (filled in main())
+
+// Write one report line for phrase /f/ to /out/:
+//  - the rounded occurrence count of an iterator built over bg.I2 from
+//    f's tokens, right-aligned in 12 columns, or 12 blanks if that
+//    iterator does not have the same size as /f/;
+//  - the rounded occurrence count of /f/ itself (width 5);
+//  - /f/ rendered as a string via bg.V1.
+void 
+show(ostream& out, iter& f)
+{
+  iter other(bg.I2.get(), f.getToken(0), f.size());
+  bool const same_size = (other.size() == f.size());
+  if (!same_size)
+    out << string(12,' ');
+  else
+    {
+      int const other_cnt = int(round(other.approxOccurrenceCount()));
+      out << setw(12) << other_cnt;
+    }
+  int const own_cnt = int(round(f.approxOccurrenceCount()));
+  out << " " << setw(5) << own_cnt << " ";
+  out << f.str(bg.V1.get()) << endl;
+}
+
+
+// Recursively traverse everything reachable from /f/ via down()/over()
+// and call show() on each non-empty node whose occurrence count is
+// greater than 1 and greater than the reference count /ref/ (see below).
+// /f/ is moved during the traversal and moved back with up().
+void 
+dump(ostream& out, iter& f)
+{
+  // reference count: the node's own count, unless we can descend
+  float ref = 0;
+  if (f.size()) ref = f.approxOccurrenceCount();
+
+  bool const descended = f.down();
+  if (descended)
+    {
+      // count as reported right after the successful down()
+      ref = f.approxOccurrenceCount();
+      for (bool more = true; more; more = f.over())
+	dump(out,f);
+      f.up();
+    }
+
+  if (!f.size()) return;
+  if (ref < f.approxOccurrenceCount() && f.approxOccurrenceCount() > 1)
+    show(out,f);
+}
+
+
+// Read the text file /fname/ line by line and append every line to /dest/.
+// If the file cannot be opened, a warning is written to cerr and /dest/
+// is left unchanged (previously this failure was silent and looked like
+// an empty file to the caller).
+void 
+read_data(string fname, vector<string>& dest)
+{
+  ifstream in(fname.c_str());
+  if (!in)
+    {
+      cerr << "WARNING: cannot open '" << fname << "' for reading" << endl;
+      return;
+    }
+  string line;
+  while (getline(in,line)) dest.push_back(line);
+  // no explicit close(): the ifstream destructor releases the file
+}
+
+// Write sentence /snt/ to /out/ as a sequence of items of the form
+// "<position>:<word>[<k1>,<k2>,...] " followed by a newline. Words are
+// looked up in /V/ by token id; a[i] holds the integers printed inside
+// the brackets for token i, so /a/ needs at least snt.size() entries.
+// All output goes to /out/ (it used to go to cout regardless of /out/).
+void 
+show_snt(ostream& out, TokenIndex const& V, vector<Token> const& snt, 
+	 vector<vector<int> > const& a)
+{
+  for (size_t i = 0; i < snt.size(); ++i)
+    {
+      vector<int> const& links = a[i];
+      out << format("%d:%s[") % i % V[snt[i].id()];
+      for (size_t k = 0; k < links.size(); ++k) 
+	out << (k?",":"") << links[k];
+      out << "] ";
+    }
+  out << endl;
+}
+
+
+// Print sentence pair /sid/ to cout: the raw alignment line, the parsed
+// alignment links, and then both sentences with their per-token links
+// (via show_snt). /sid/ must be a valid index into src, trg and aln.
+void show_pair(size_t const sid)
+{
+  vector<Token> s,t; 
+  fill_token_seq(*bg.V1,src[sid],s);
+  fill_token_seq(*bg.V2,trg[sid],t);
+  vector<vector<int> > a1(s.size()),a2(t.size());
+  istringstream buf(aln[sid]);
+  cout << aln[sid] << endl;
+  int i,k; char c;
+  // each link is read as <int><char><int>; the separator is not checked
+  while (buf >> i >> c >> k)
+    {
+      // the indices come straight from the alignment file: never use
+      // them to index a1/a2 without a range check
+      if (i < 0 || size_t(i) >= a1.size() || k < 0 || size_t(k) >= a2.size())
+	{
+	  cerr << "WARNING: alignment link " << i << "-" << k 
+	       << " is out of range in sentence pair " << sid << endl;
+	  continue;
+	}
+      a1[i].push_back(k);
+      a2[k].push_back(i);
+      cout << i << "-" << k << " ";
+    }
+  cout << endl;
+  show_snt(cout,*bg.V1,s,a1);
+  show_snt(cout,*bg.V2,t,a2);
+}
+
+int main(int argc, char* argv[]) // for each fg source sentence, print fg phrase (pair) statistics next to bg counts
+{
+  if (argc < 5) // four arguments are required
+    {
+      cerr << "usage: " << argv[0] 
+	   << " <bg base name> <L1> <L2> <fg base name>" 
+	   << endl; 
+      exit(1);
+    }
+  bg.open(argv[1],argv[2],argv[3]); // open the background bitext
+  sptr<imbitext> fg(new imbitext(bg.V1,bg.V2)); // foreground bitext, constructed with bg's V1 and V2
+  string base = argv[4];
+  if (*base.rbegin() != '.') base += '.'; // make sure base ends in '.'
+  string srcfile = base + argv[2];
+  string trgfile = base + argv[3];
+  string alnfile = base + "symal";
+  read_data(srcfile,src);
+  read_data(trgfile,trg);
+  read_data(alnfile,aln);
+  fg = fg->add(src,trg,aln); // fg is replaced by whatever add() returns
+
+  vector<float> bias(src.size(),1./(src.size()-1)); // weight 1/(N-1) per sentence; NOTE(review): N <= 1 is not handled
+  for (size_t sid = 0; sid < src.size(); ++sid)
+    {
+      bias[sid] = 0; // zero weight for the current sentence; restored at the end of the loop body
+      // cout << src[sid] << endl << trg[sid] << endl;
+      // show_pair(sid);
+      vector<Token> snt; 
+      fill_token_seq(*bg.V1,src[sid],snt);
+      vector<vector<sptr<vector<PhrasePair<Token> > > > > FG,BG;
+      fg->lookup(snt,*fg->I1,FG,NULL,NULL,&bias,true); // foreground lookup, with bias
+      bg.lookup(snt,*bg.I1,BG,NULL,NULL,NULL,true); // background lookup, no bias; NOTE(review): BG is not read below
+      set<sptr<vector<PhrasePair<Token> > > > seen; // NOTE(review): never used below
+      for (size_t i = 0; i < snt.size(); ++i)
+	{
+	  Bitext<Token>::iter m0(fg->I1.get()); // grown token by token over fg->I1
+	  Bitext<Token>::iter m1(bg.I1.get()); // grown in parallel over bg.I1 for as long as it keeps up
+	  for (size_t k = 0; k < FG[i].size(); ++k)
+	    {
+	      if (!m0.extend(snt[i+k].id())) break;
+	      if (k && m0.approxOccurrenceCount() < 2) break;
+	      if (m1.size() == k && (!m1.extend(snt[i+k].id()) || 
+				     m1.approxOccurrenceCount() < 25)) // bg cannot follow, or bg count below 25: report the phrase
+		{
+		  cout << toString((*fg->V1), m0.getToken(0), m0.size()) << " "
+		       << int(m0.approxOccurrenceCount());
+		  if (m1.size() == k + 1)
+		    cout  << " "<< int(m1.approxOccurrenceCount());
+		  else if (m1.size())
+		    cout  << " ["<< int(m1.approxOccurrenceCount()) << "]";
+		  else
+		    cout << " NEW!";
+		  cout << endl;
+		}
+	      if (m0.approxOccurrenceCount() < 2) break;
+	      BOOST_FOREACH(PhrasePair<Token> const& pp, *FG[i][k])
+		{
+		  if (pp.joint < 2) continue; // skip phrase pairs with pp.joint below 2
+		  sptr<pstats> bgstats;
+		  jstats const* bgjstats = NULL;
+		  Bitext<Token>::iter m2(bg.I2.get(), pp.start2, pp.len2);
+		  if (m1.approxOccurrenceCount() > 5000 || 
+		      m2.approxOccurrenceCount() > 5000) 
+		    continue; // skip pairs whose m1 or m2 count exceeds 5000
+		  if (m1.size() == pp.len1 && m2.size() == pp.len2)
+		    {
+		      bgstats = bg.lookup(m1,NULL); // bg statistics for m1, without bias
+		      if (bgstats)
+			{
+			  pstats::trg_map_t::const_iterator mx;
+			  mx = bgstats->trg.find(m2.getPid());
+			  if (mx != bgstats->trg.end())
+			    bgjstats = &mx->second;
+			}
+		    }
+		  cout << toString(*fg->V1, pp.start1, pp.len1) << " ::: "
+		       << toString(*fg->V2, pp.start2, pp.len2) << " "
+		       << format("[%u/%u/%u]") % pp.good1 % pp.joint % pp.good2;
+		  if (bgjstats) 
+		    cout << " " << (format("[%u/%u/%u]") 
+				    % bgstats->good % bgjstats->rcnt() 
+				    % (bgjstats->cnt2() * bgstats->good
+				       / bgstats->raw_cnt));
+		  else if (m1.size() == pp.len1)
+		    cout << " " << int(m1.approxOccurrenceCount());
+		  cout << endl;
+
+		}
+	    }
+	}
+      bias[sid] = 1./(src.size()-1); // restore the weight of sentence sid
+    }
+  exit(0);
+}
+  
+  
+
--- a/moses/TranslationModel/UG/try-align2.cc
+++ b/moses/TranslationModel/UG/try-align2.cc
@ -729,7 +729,7 @@ int main(int argc, char* argv[])

      for (size_t i = 0; i < A.size(); ++i)
 	{
-	  cout << (*BT.V2)[snt1[i].id()] << ": ";
+	  cout << (*BT.V1)[snt1[i].id()] << ": ";
 	  for (size_t k=A[i].find_first(); k < A[i].size(); k=A[i].find_next(k))
 	    cout << boost::format(" %d:%s") % k % (*BT.V2)[snt2[k].id()];
 	  cout << endl;