mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-25 12:52:29 +03:00
Added mmlex-lookup utility program.
This commit is contained in:
parent
ef974cd6ad
commit
c91fb5cc84
1
Jamroot
1
Jamroot
@ -149,6 +149,7 @@ if [ option.get "with-mm" : : "yes" ]
|
||||
moses/TranslationModel/UG/mm//mam_verify
|
||||
moses/TranslationModel/UG/mm//custom-pt
|
||||
moses/TranslationModel/UG/mm//mmlex-build
|
||||
moses/TranslationModel/UG/mm//mmlex-lookup
|
||||
moses/TranslationModel/UG/mm//mtt-count-words
|
||||
moses/TranslationModel/UG/mm//calc-coverage
|
||||
moses/TranslationModel/UG//try-align
|
||||
|
@ -9,6 +9,15 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
|
||||
$(TOP)/util//kenutil
|
||||
;
|
||||
|
||||
exe mmlex-lookup :
|
||||
mmlex-lookup.cc
|
||||
$(TOP)/moses/TranslationModel/UG/generic//generic
|
||||
$(TOP)//boost_iostreams
|
||||
$(TOP)//boost_program_options
|
||||
$(TOP)/moses/TranslationModel/UG/mm//mm
|
||||
$(TOP)/util//kenutil
|
||||
;
|
||||
|
||||
exe mtt-count-words :
|
||||
mtt-count-words.cc
|
||||
$(TOP)/moses/TranslationModel/UG/generic//generic
|
||||
@ -90,6 +99,7 @@ symal2mam
|
||||
mam2symal
|
||||
custom-pt
|
||||
mmlex-build
|
||||
mmlex-lookup
|
||||
mam_verify
|
||||
calc-coverage
|
||||
;
|
||||
|
149
moses/TranslationModel/UG/mm/mmlex-lookup.cc
Normal file
149
moses/TranslationModel/UG/mm/mmlex-lookup.cc
Normal file
@ -0,0 +1,149 @@
|
||||
// -*- c++ -*-
|
||||
// Program to extract word cooccurrence counts from a memory-mapped
|
||||
// word-aligned bitext stores the counts lexicon in the format for
|
||||
// mm2dTable<uint32_t> (ug_mm_2d_table.h)
|
||||
//
|
||||
// (c) 2010-2012 Ulrich Germann
|
||||
|
||||
// to do: multi-threading
|
||||
|
||||
#include <queue>
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <iterator>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
#include <boost/dynamic_bitset.hpp>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/foreach.hpp>
|
||||
#include <boost/thread.hpp>
|
||||
#include <boost/math/distributions/binomial.hpp>
|
||||
#include <boost/unordered_map.hpp>
|
||||
#include <boost/unordered_set.hpp>
|
||||
|
||||
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
|
||||
#include "ug_mm_2d_table.h"
|
||||
#include "ug_mm_ttrack.h"
|
||||
#include "ug_corpus_token.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace ugdiss;
|
||||
using namespace boost::math;
|
||||
|
||||
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t;
|
||||
typedef SimpleWordId Token;
|
||||
|
||||
// DECLARATIONS
|
||||
void interpret_args(int ac, char* av[]);
|
||||
|
||||
string swrd,twrd,L1,L2,bname;
|
||||
TokenIndex V1,V2;
|
||||
LEX_t LEX;
|
||||
|
||||
|
||||
void
|
||||
lookup_source(ostream& out, id_type r)
|
||||
{
|
||||
vector<LEX_t::Cell> foo(LEX[r].start,LEX[r].stop);
|
||||
sort(foo.begin(),foo.end(),LEX_t::Cell::SortDescendingByValue());
|
||||
out << V1[r] << " " << LEX.m1(r) << endl;
|
||||
BOOST_FOREACH(LEX_t::Cell const& c, foo)
|
||||
{
|
||||
out << setw(10) << float(c.val)/LEX.m1(r) << " "
|
||||
<< setw(10) << float(c.val)/LEX.m2(c.id) << " "
|
||||
<< V2[c.id] << " " << c.val << "/" << LEX.m2(c.id) << endl;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
lookup_target(ostream& out, id_type c)
|
||||
{
|
||||
vector<LEX_t::Cell> foo;
|
||||
LEX_t::Cell cell;
|
||||
for (size_t r = 0; r < LEX.numRows; ++r)
|
||||
{
|
||||
size_t j = LEX[r][c];
|
||||
if (j)
|
||||
{
|
||||
cell.id = r;
|
||||
cell.val = j;
|
||||
foo.push_back(cell);
|
||||
}
|
||||
}
|
||||
sort(foo.begin(),foo.end(),LEX_t::Cell::SortDescendingByValue());
|
||||
out << V2[c] << " " << LEX.m2(c) << endl;
|
||||
BOOST_FOREACH(LEX_t::Cell const& r, foo)
|
||||
{
|
||||
out << setw(10) << float(r.val)/LEX.m2(c) << " "
|
||||
<< setw(10) << float(r.val)/LEX.m1(r.id) << " "
|
||||
<< V1[r.id] << " " << r.val << "/" << LEX.m1(r.id) << endl;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
dump(ostream& out)
|
||||
{
|
||||
for (size_t r = 0; r < LEX.numRows; ++r)
|
||||
lookup_source(out,r);
|
||||
out << endl;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
main(int argc, char* argv[])
|
||||
{
|
||||
interpret_args(argc,argv);
|
||||
char c = *bname.rbegin();
|
||||
if (c != '/' && c != '.') bname += '.';
|
||||
V1.open(bname+L1+".tdx");
|
||||
V2.open(bname+L2+".tdx");
|
||||
LEX.open(bname+L1+"-"+L2+".lex");
|
||||
|
||||
cout.precision(2);
|
||||
id_type swid = V1[swrd];
|
||||
id_type twid = V2[twrd];
|
||||
if (swid != 1 && twid != 1)
|
||||
{
|
||||
cout << swrd << " " << twrd << " "
|
||||
<< LEX.m1(swid) << " / "
|
||||
<< LEX[swid][twid] << " / "
|
||||
<< LEX.m2(twid) << endl;
|
||||
}
|
||||
else if (swid != 1)
|
||||
lookup_source(cout,swid);
|
||||
else if (twid != 1)
|
||||
lookup_target(cout,twid);
|
||||
else
|
||||
dump(cout);
|
||||
}
|
||||
|
||||
void
|
||||
interpret_args(int ac, char* av[])
|
||||
{
|
||||
namespace po=boost::program_options;
|
||||
po::variables_map vm;
|
||||
po::options_description o("Options");
|
||||
po::options_description h("Hidden Options");
|
||||
po::positional_options_description a;
|
||||
|
||||
o.add_options()
|
||||
("help,h", "print this message")
|
||||
("source,s",po::value<string>(&swrd),"source word")
|
||||
("target,t",po::value<string>(&swrd),"target word")
|
||||
;
|
||||
|
||||
h.add_options()
|
||||
("bname", po::value<string>(&bname), "base name")
|
||||
("L1", po::value<string>(&L1),"L1 tag")
|
||||
("L2", po::value<string>(&L2),"L2 tag")
|
||||
;
|
||||
a.add("bname",1);
|
||||
a.add("L1",1);
|
||||
a.add("L2",1);
|
||||
get_options(ac,av,h.add(o),a,vm,"cfg");
|
||||
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user