diff --git a/Jamroot b/Jamroot
index 7390aff61..21a3710c9 100644
--- a/Jamroot
+++ b/Jamroot
@@ -149,6 +149,7 @@ if [ option.get "with-mm" : : "yes" ]
 moses/TranslationModel/UG/mm//mam_verify
 moses/TranslationModel/UG/mm//custom-pt
 moses/TranslationModel/UG/mm//mmlex-build
+moses/TranslationModel/UG/mm//mmlex-lookup
 moses/TranslationModel/UG/mm//mtt-count-words
 moses/TranslationModel/UG/mm//calc-coverage
 moses/TranslationModel/UG//try-align
diff --git a/moses/TranslationModel/UG/mm/Jamfile b/moses/TranslationModel/UG/mm/Jamfile
index 4bafed7c0..e05275c92 100644
--- a/moses/TranslationModel/UG/mm/Jamfile
+++ b/moses/TranslationModel/UG/mm/Jamfile
@@ -9,6 +9,15 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
 $(TOP)/util//kenutil
 ;
 
+exe mmlex-lookup :
+mmlex-lookup.cc
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_program_options
+$(TOP)/moses/TranslationModel/UG/mm//mm
+$(TOP)/util//kenutil
+;
+
 exe mtt-count-words :
 mtt-count-words.cc
 $(TOP)/moses/TranslationModel/UG/generic//generic
@@ -90,6 +99,7 @@ symal2mam
 mam2symal
 custom-pt
 mmlex-build
+mmlex-lookup
 mam_verify
 calc-coverage
 ;
diff --git a/moses/TranslationModel/UG/mm/mmlex-lookup.cc b/moses/TranslationModel/UG/mm/mmlex-lookup.cc
new file mode 100644
index 000000000..14d839edf
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/mmlex-lookup.cc
@@ -0,0 +1,170 @@
+// -*- c++ -*-
+// Program to look up word cooccurrence counts in a counts lexicon
+// stored in the format for mm2dTable (ug_mm_2d_table.h) and print
+// them for a source word, a target word, a word pair, or all rows.
+//
+// (c) 2010-2012 Ulrich Germann
+
+// to do: multi-threading
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
+#include "ug_mm_2d_table.h"
+#include "ug_mm_ttrack.h"
+#include "ug_corpus_token.h"
+
+using namespace std;
+using namespace ugdiss;
+using namespace boost::math;
+
+typedef mm2dTable LEX_t;
+typedef SimpleWordId Token;
+
+// DECLARATIONS
+void interpret_args(int ac, char* av[]);
+
+string swrd,twrd,L1,L2,bname;
+TokenIndex V1,V2;
+LEX_t LEX;
+
+
+// Print the lexicon row for source word id r: a header line with V1[r]
+// and LEX.m1(r), then one line per cell of the row, ordered with
+// LEX_t::Cell::SortDescendingByValue. Each cell line shows val/m1(r),
+// val/m2(cell id), the target word V2[cell id], and "val/m2(cell id)".
+void
+lookup_source(ostream& out, id_type r)
+{
+  vector foo(LEX[r].start,LEX[r].stop);
+  sort(foo.begin(),foo.end(),LEX_t::Cell::SortDescendingByValue());
+  out << V1[r] << " " << LEX.m1(r) << endl;
+  BOOST_FOREACH(LEX_t::Cell const& c, foo)
+    {
+      out << setw(10) << float(c.val)/LEX.m1(r) << " "
+          << setw(10) << float(c.val)/LEX.m2(c.id) << " "
+          << V2[c.id] << " " << c.val << "/" << LEX.m2(c.id) << endl;
+    }
+}
+
+// Print the lexicon column for target word id c. The column is collected
+// by probing LEX[r][c] for every row r and keeping the non-zero entries;
+// output mirrors lookup_source() with the roles of m1 and m2 swapped.
+void
+lookup_target(ostream& out, id_type c)
+{
+  vector foo;
+  LEX_t::Cell cell;
+  for (size_t r = 0; r < LEX.numRows; ++r)
+    {
+      size_t j = LEX[r][c];
+      if (j)
+        {
+          cell.id = r;
+          cell.val = j;
+          foo.push_back(cell);
+        }
+    }
+  sort(foo.begin(),foo.end(),LEX_t::Cell::SortDescendingByValue());
+  out << V2[c] << " " << LEX.m2(c) << endl;
+  BOOST_FOREACH(LEX_t::Cell const& r, foo)
+    {
+      out << setw(10) << float(r.val)/LEX.m2(c) << " "
+          << setw(10) << float(r.val)/LEX.m1(r.id) << " "
+          << V1[r.id] << " " << r.val << "/" << LEX.m1(r.id) << endl;
+    }
+}
+
+// Print every row of the lexicon via lookup_source(), then a blank line.
+void
+dump(ostream& out)
+{
+  for (size_t r = 0; r < LEX.numRows; ++r)
+    lookup_source(out,r);
+  out << endl;
+}
+
+
+// Open the vocabularies bname+L1+".tdx" and bname+L2+".tdx" and the
+// lexicon bname+L1+"-"+L2+".lex", then dispatch on the looked-up word
+// ids: both ids != 1 -> one line with m1 / joint count / m2; only the
+// source id != 1 -> lookup_source(); only the target id != 1 ->
+// lookup_target(); otherwise dump().
+// NOTE(review): id 1 is presumably what TokenIndex returns for a word it
+// does not contain (including an unset option) -- confirm.
+int
+main(int argc, char* argv[])
+{
+  interpret_args(argc,argv);
+  // Append '.' unless bname already ends in '/' or '.'.
+  // NOTE(review): assumes bname is non-empty; *rbegin() on an empty
+  // string is undefined behaviour.
+  char c = *bname.rbegin();
+  if (c != '/' && c != '.') bname += '.';
+  V1.open(bname+L1+".tdx");
+  V2.open(bname+L2+".tdx");
+  LEX.open(bname+L1+"-"+L2+".lex");
+
+  cout.precision(2);
+  id_type swid = V1[swrd];
+  id_type twid = V2[twrd];
+  if (swid != 1 && twid != 1)
+    {
+      cout << swrd << " " << twrd << " "
+           << LEX.m1(swid) << " / "
+           << LEX[swid][twid] << " / "
+           << LEX.m2(twid) << endl;
+    }
+  else if (swid != 1)
+    lookup_source(cout,swid);
+  else if (twid != 1)
+    lookup_target(cout,twid);
+  else
+    dump(cout);
+}
+
+// Declare the command line options (-s/--source, -t/--target, and the
+// positional arguments bname, L1, L2, bound to the globals swrd, twrd,
+// bname, L1, L2 respectively) and hand them to get_options().
+void
+interpret_args(int ac, char* av[])
+{
+  namespace po=boost::program_options;
+  po::variables_map vm;
+  po::options_description o("Options");
+  po::options_description h("Hidden Options");
+  po::positional_options_description a;
+
+  o.add_options()
+    ("help,h", "print this message")
+    ("source,s",po::value(&swrd),"source word")
+    ("target,t",po::value(&twrd),"target word")
+    ;
+
+  h.add_options()
+    ("bname", po::value(&bname), "base name")
+    ("L1", po::value(&L1),"L1 tag")
+    ("L2", po::value(&L2),"L2 tag")
+    ;
+  a.add("bname",1);
+  a.add("L1",1);
+  a.add("L2",1);
+  get_options(ac,av,h.add(o),a,vm,"cfg");
+
+}
+
+