mosesdecoder/moses/TranslationModel/UG/fuzzy.cc
2015-06-03 12:55:58 +01:00

178 lines
4.8 KiB
C++

// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*-
#include <boost/program_options.hpp>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/format.hpp>
#include "mm/ug_bitext.h"
#include "mm/tpt_typedefs.h"
#include "mm/ug_prime_sampling1.h"
#include "generic/sorting/VectorIndexSorter.h"
#include "generic/sorting/NBestList.h"
#include <string>
using namespace std;
using namespace Moses;
using namespace Moses::bitext;
namespace po=boost::program_options;
using namespace boost::algorithm;
typedef L2R_Token<SimpleWordId> Token;
typedef mmBitext<Token> mmbitext;
typedef Bitext<Token>::tsa tsa;
typedef imTtrack<Token> imttrack;
typedef imTSA<Token> imtsa;
string bname, bname1, bname2, L1, L2, Q1, Q2;
size_t maxhits;
void interpret_args(int ac, char* av[]);
TokenIndex V1;
TokenIndex V2;
sptr<mmTtrack<Token> > C1;
sptr<mmTtrack<Token> > C2;
mmTSA<Token> I1;
void
open_bitext()
{
C1.reset(new mmTtrack<Token>);
if (L2.size())
{
bname1 = bname + L1 + ".";
bname2 = bname + L2 + ".";
}
else if (L1.size())
{
bname1 = bname;
bname2 = L1;
}
else bname1 = bname;
if (bname2.size()) C2.reset(new mmTtrack<Token>);
C1->open(bname1+"mct");
I1.open(bname1+"sfa", C1);
V1.open(bname1+"tdx");
V1.setDynamic(true);
if (bname2.size())
{
C2->open(bname2+"mct");
V2.open(bname2+"tdx");
}
}
sptr<imttrack>
read_input()
{
sptr<vector<vector<Token> > > crp(new vector<vector<Token> >);
crp->reserve(1000);
string line;
while (getline(cin,line))
{
crp->push_back(vector<Token>());
fill_token_seq(V1, line, crp->back());
}
sptr<imttrack> ret(new imttrack (crp));
return ret;
}
sptr<NBestList<uint32_t, VectorIndexSorter<float> > >
nbest(TSA<Token>::tree_iterator const& r, vector<float> const& hits,
vector<float>& score, VectorIndexSorter<float>& sorter,
size_t const nbest_size)
{
typedef NBestList<uint32_t, VectorIndexSorter<float> > nbest_list_t;
sptr<nbest_list_t> ret(new nbest_list_t(nbest_size, sorter));
bitvector mycheck(hits.size());
tsa::ArrayEntry I(r.lower_bound(-1));
char const* stop = r.upper_bound(-1);
while (I.next < stop)
{
r.root->readEntry(I.next,I);
if (mycheck[I.sid]) continue;
score[I.sid] = hits[I.sid] / r.root->getCorpus()->sntLen(I.sid);
ret->add(I.sid);
mycheck.set(I.sid);
}
return ret;
}
int main(int argc, char* argv[])
{
interpret_args(argc, argv);
open_bitext();
sptr<imttrack> icrp = read_input();
imtsa newIdx(icrp,NULL);
sptr<SentenceBias> hits = prime_sampling1(I1, newIdx, 1000);
vector<float> score(hits->size());
VectorIndexSorter<float> sorter(score);
for (size_t s = 0; s < icrp->size(); ++s)
{
size_t stop = icrp->sntLen(s);
Token const* t = icrp->sntStart(s);
cout << string(80,'-') << "\n" << toString(V1, t, stop) << endl;
for (size_t i = 0; i < stop; ++i)
{
TSA<Token>::tree_iterator r(&I1);
for (size_t k = i; k < stop && r.extend(t[k].id()); ++k)
{
if (r.ca() < 3) continue;
cout << "\n" << r.str(&V1) << " " << int(r.ca()) << endl;
if (r.ca() > 10000) continue;
sptr<NBestList<uint32_t, VectorIndexSorter<float> > > top;
top = nbest(r, *hits, score, sorter, 5);
for (size_t n = 0; n < top->size(); ++n)
{
cout << "[" << n << ": " << score[(*top)[n]]
<< " (" << (*hits)[(*top)[n]] << "/" << C1->sntLen((*top)[n]) << ")]\n"
<< toString(V1, C1->sntStart((*top)[n]), C1->sntLen((*top)[n])) << "\n";
if (C2) cout << toString(V2, C2->sntStart((*top)[n]), C2->sntLen((*top)[n])) << "\n";
cout << endl;
}
}
}
}
}
void
interpret_args(int ac, char* av[])
{
po::variables_map vm;
po::options_description o("Options");
o.add_options()
("help,h", "print this message")
("maxhits,n", po::value<size_t>(&maxhits)->default_value(25),
"max. number of hits")
("q1", po::value<string>(&Q1), "query in L1")
("q2", po::value<string>(&Q2), "query in L2")
;
po::options_description h("Hidden Options");
h.add_options()
("bname", po::value<string>(&bname), "base name of corpus")
("L1", po::value<string>(&L1), "L1 tag")
("L2", po::value<string>(&L2), "L2 tag")
;
h.add(o);
po::positional_options_description a;
a.add("bname",1);
a.add("L1",1);
a.add("L2",1);
po::store(po::command_line_parser(ac,av)
.options(h)
.positional(a)
.run(),vm);
po::notify(vm);
if (vm.count("help"))
{
cout << "\nusage:\n\t" << av[0]
<< " [options] [--q1=<L1string>] [--q2=<L2string>]" << endl;
cout << o << endl;
exit(0);
}
}