diff --git a/moses/TranslationModel/UG/check-coverage5.cc b/moses/TranslationModel/UG/check-coverage5.cc new file mode 100644 index 000000000..549eb7b21 --- /dev/null +++ b/moses/TranslationModel/UG/check-coverage5.cc @@ -0,0 +1,126 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- + +// read a text from stdin, report percentage of n-grams covered + +#include +#include +#include +#include +#include +#include +#include "mm/ug_bitext.h" +#include "generic/file_io/ug_stream.h" +#include +#include +#include "mm/ug_bitext_sampler.h" + +#include +#include + +// #include "LSA.h" + +namespace po=boost::program_options; +using namespace Moses; +using namespace sapt; +using namespace std; +using namespace boost; + +typedef sapt::L2R_Token Token; +typedef mmTtrack ttrack_t; + +size_t ngram_size; +string bname; +vector ifiles; + +void interpret_args(int ac, char* av[]); + + +void +dump(mmTSA::tree_iterator& m, TokenIndex& V) +{ + if (m.size()) cout << m.str(NULL) << endl; + if (m.size()) cout << m.str(&V) << endl; + if (m.down()) + { + do { dump(m, V); } while (m.over()); + m.up(); + } +} + +int +main(int argc, char* argv[]) +{ + interpret_args(argc,argv); + TokenIndex V; + V.open(bname+".tdx"); V.setDynamic(true); V.iniReverseIndex(); + boost::shared_ptr > T(new mmTtrack); + T->open(bname+".mct"); + mmTSA I; I.open(bname+".sfa", T); + + string line; + BOOST_FOREACH(string const& file, ifiles) + { + size_t total_ngrams=0; + float matched_ngrams=0; + ifstream in(file.c_str()); + while(getline(in,line)) + { + // cout << line << endl; + vector snt; + V.fillIdSeq(line,snt); + if (snt.size() < ngram_size) continue; + total_ngrams += snt.size() - ngram_size + 1; + for (size_t i = 0; i + ngram_size <= snt.size(); ++i) + // for (size_t i = 0; i < snt.size(); ++i) + { + mmTSA::tree_iterator m(&I); + size_t stop = min(snt.size(), i+ngram_size); + size_t k = i; + while (k < stop && m.extend(snt[k])) ++k; + // cout << i << " " << k-i << " " << m.str(&V) << endl; + if (k - i == ngram_size) + ++matched_ngrams; + } + } + printf ("%5.1f%% matched %zu-grams (%.0f/%zu): %s\n", + (100 * matched_ngrams / total_ngrams), ngram_size, + matched_ngrams, total_ngrams, file.c_str()); + } +} + +void +interpret_args(int ac, char* av[]) +{ + po::variables_map vm; + po::options_description o("Options"); + o.add_options() + + ("help,h", "print this message") + ("ngram-size,n", po::value(&ngram_size)->default_value(5), + "sample size") + ; + + po::options_description h("Hidden Options"); + h.add_options() + ("bname", po::value(&bname), "base name of corpus") + ("ifiles", po::value >(&ifiles), "input files") + ; + + h.add(o); + po::positional_options_description a; + a.add("bname",1); + a.add("ifiles",-1); + + po::store(po::command_line_parser(ac,av) + .options(h) + .positional(a) + .run(),vm); + po::notify(vm); + if (vm.count("help")) + { + std::cout << "\nusage:\n\t" << av[0] + << " [options] " << std::endl; + std::cout << o << std::endl; + exit(0); + } +}