#include "mmsapt.h" #include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h" #include "moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h" #include #include #include #include #include #include #include using namespace Moses; using namespace bitext; using namespace std; using namespace boost; typedef L2R_Token Token; typedef mmBitext mmbitext; typedef imBitext imbitext; typedef Bitext::iter iter; mmbitext bg; vector src,trg,aln; void show(ostream& out, iter& f) { iter b(bg.I2.get(),f.getToken(0),f.size()); if (b.size() == f.size()) out << setw(12) << int(round(b.approxOccurrenceCount())); else out << string(12,' '); out << " " << setw(5) << int(round(f.approxOccurrenceCount())) << " "; out << f.str(bg.V1.get()) << endl; } void dump(ostream& out, iter& f) { float cnt = f.size() ? f.approxOccurrenceCount() : 0; if (f.down()) { cnt = f.approxOccurrenceCount(); do { dump(out,f); } while (f.over()); f.up(); } if (f.size() && cnt < f.approxOccurrenceCount() && f.approxOccurrenceCount() > 1) show(out,f); } void read_data(string fname, vector& dest) { ifstream in(fname.c_str()); string line; while (getline(in,line)) dest.push_back(line); in.close(); } void show_snt(ostream& out, TokenIndex const& V, vector const& snt, vector > const& a) { for (size_t i = 0; i < snt.size(); ++i) { cout << format("%d:%s[") % i % V[snt[i].id()]; for (size_t k = 0; k < a[i].size(); ++k) cout << (k?",":"") << a[i][k]; cout << "] "; } cout << endl; } void show_pair(size_t const sid) { vector s,t; fill_token_seq(*bg.V1,src[sid],s); fill_token_seq(*bg.V2,trg[sid],t); vector > a1(s.size()),a2(t.size()); istringstream buf(aln[sid]); cout << aln[sid] << endl; int i,k; char c; while (buf >> i >> c >> k) { a1[i].push_back(k); a2[k].push_back(i); cout << i << "-" << k << " "; } cout << endl; show_snt(cout,*bg.V1,s,a1); show_snt(cout,*bg.V2,t,a2); } int main(int argc, char* argv[]) { if (argc < 5) { cerr << "usage: " << argv[0] << " " << endl; exit(1); } bg.open(argv[1],argv[2],argv[3]); sptr fg(new imbitext(bg.V1,bg.V2)); string base = argv[4]; if (*base.rbegin() != '.') base += '.'; string srcfile = base + argv[2]; string trgfile = base + argv[3]; string alnfile = base + "symal"; read_data(srcfile,src); read_data(trgfile,trg); read_data(alnfile,aln); fg = fg->add(src,trg,aln); vector bias(src.size(),1./(src.size()-1)); for (size_t sid = 0; sid < src.size(); ++sid) { bias[sid] = 0; // cout << src[sid] << endl << trg[sid] << endl; // show_pair(sid); vector snt; fill_token_seq(*bg.V1,src[sid],snt); vector > > > > FG,BG; fg->lookup(snt,*fg->I1,FG,NULL,NULL,&bias,true); bg.lookup(snt,*bg.I1,BG,NULL,NULL,NULL,true); set > > > seen; for (size_t i = 0; i < snt.size(); ++i) { Bitext::iter m0(fg->I1.get()); Bitext::iter m1(bg.I1.get()); for (size_t k = 0; k < FG[i].size(); ++k) { if (!m0.extend(snt[i+k].id())) break; if (k && m0.approxOccurrenceCount() < 2) break; if (m1.size() == k && (!m1.extend(snt[i+k].id()) || m1.approxOccurrenceCount() < 25)) { cout << toString((*fg->V1), m0.getToken(0), m0.size()) << " " << int(m0.approxOccurrenceCount()); if (m1.size() == k + 1) cout << " "<< int(m1.approxOccurrenceCount()); else if (m1.size()) cout << " ["<< int(m1.approxOccurrenceCount()) << "]"; else cout << " NEW!"; cout << endl; } if (m0.approxOccurrenceCount() < 2) break; BOOST_FOREACH(PhrasePair const& pp, *FG[i][k]) { if (pp.joint < 2) continue; sptr bgstats; jstats const* bgjstats = NULL; Bitext::iter m2(bg.I2.get(), pp.start2, pp.len2); if (m1.approxOccurrenceCount() > 5000 || m2.approxOccurrenceCount() > 5000) continue; if (m1.size() == pp.len1 && m2.size() == pp.len2) { bgstats = bg.lookup(m1,NULL); if (bgstats) { pstats::trg_map_t::const_iterator mx; mx = bgstats->trg.find(m2.getPid()); if (mx != bgstats->trg.end()) bgjstats = &mx->second; } } cout << toString(*fg->V1, pp.start1, pp.len1) << " ::: " << toString(*fg->V2, pp.start2, pp.len2) << " " << format("[%u/%u/%u]") % pp.good1 % pp.joint % pp.good2; if (bgjstats) cout << " " << (format("[%u/%u/%u]") % bgstats->good % bgjstats->rcnt() % (bgjstats->cnt2() * bgstats->good / bgstats->raw_cnt)); else if (m1.size() == pp.len1) cout << " " << int(m1.approxOccurrenceCount()); cout << endl; } } } bias[sid] = 1./(src.size()-1); } exit(0); }