mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-27 03:49:57 +03:00
152 lines
3.7 KiB
C++
152 lines
3.7 KiB
C++
#include <boost/program_options.hpp>
|
|
#include "mm/ug_bitext.h"
|
|
#include <string>
|
|
|
|
using namespace std;
|
|
using namespace Moses;
|
|
using namespace sapt;
|
|
|
|
namespace po=boost::program_options;
|
|
typedef L2R_Token<SimpleWordId> Token;
|
|
typedef mmBitext<Token> mmbitext;
|
|
typedef Bitext<Token>::tsa tsa;
|
|
|
|
string bname, L1, L2, Q1, Q2;
|
|
size_t maxhits;
|
|
void interpret_args(int ac, char* av[]);
|
|
|
|
|
|
void
|
|
write_sentence
|
|
(Ttrack<Token> const& T, uint32_t const sid, TokenIndex const& V, ostream& out)
|
|
{
|
|
Token const* t = T.sntStart(sid);
|
|
Token const* e = T.sntEnd(sid);
|
|
// size_t i = 0;
|
|
while (t < e)
|
|
{
|
|
// out << i++ << ":";
|
|
out << V[t->id()];
|
|
if (++t < e) out << " ";
|
|
}
|
|
}
|
|
|
|
bool
|
|
fill(string const& query, TSA<Token> const& tsa,
|
|
TokenIndex const& V, bitvector& v)
|
|
{
|
|
v.resize(tsa.getCorpus()->size());
|
|
Bitext<Token>::iter m(&tsa);
|
|
istringstream buf(query); string w;
|
|
while (buf >> w)
|
|
if (!m.extend(V[w]))
|
|
return false;
|
|
m.markSentences(v);
|
|
return true;
|
|
}
|
|
|
|
|
|
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
interpret_args(argc, argv);
|
|
if (Q1.empty() && Q2.empty()) exit(0);
|
|
|
|
boost::shared_ptr<mmbitext> B(new mmbitext); string w;
|
|
B->open(bname, L1, L2);
|
|
|
|
Bitext<Token>::iter m1(B->I1.get(), *B->V1, Q1);
|
|
if (Q1.size() && m1.size() == 0) exit(0);
|
|
|
|
Bitext<Token>::iter m2(B->I2.get(), *B->V2, Q2);
|
|
if (Q2.size() && m2.size() == 0) exit(0);
|
|
|
|
bitvector check(B->T1->size());
|
|
if (Q1.size() == 0 || Q2.size() == 0) check.set();
|
|
else (m2.markSentences(check));
|
|
|
|
Bitext<Token>::iter& m = m1.size() ? m1 : m2;
|
|
char const* x = m.lower_bound(-1);
|
|
char const* stop = m.upper_bound(-1);
|
|
uint64_t sid;
|
|
ushort off;
|
|
boost::taus88 rnd;
|
|
size_t N = m.approxOccurrenceCount();
|
|
maxhits = min(N, maxhits);
|
|
size_t k = 0; // selected
|
|
for (size_t i = 0; x < stop; ++i)
|
|
{
|
|
x = m.root->readSid(x,stop,sid);
|
|
x = m.root->readOffset(x,stop,off);
|
|
|
|
if (!check[sid]) continue;
|
|
size_t r = (N - i) * rnd()/(rnd.max()+1.) + k;
|
|
if (maxhits != N && r >= maxhits) continue;
|
|
++k;
|
|
|
|
size_t s1,s2,e1,e2; int po_fwd=-1,po_bwd=-1;
|
|
std::vector<unsigned char> caln;
|
|
// cout << sid << " " << B->docname(sid) << std::endl;
|
|
if (!B->find_trg_phr_bounds(sid, off, off+m.size(),
|
|
s1,s2,e1,e2,po_fwd,po_bwd,
|
|
&caln, NULL, &m == &m2))
|
|
{
|
|
// cout << "alignment failure" << std::endl;
|
|
}
|
|
|
|
std::cout << sid << " " << B->sid2docname(sid)
|
|
<< " dfwd=" << po_fwd << " dbwd=" << po_bwd
|
|
<< "\n";
|
|
|
|
write_sentence(*B->T1, sid, *B->V1, std::cout); std::cout << "\n";
|
|
write_sentence(*B->T2, sid, *B->V2, std::cout); std::cout << "\n";
|
|
B->write_yawat_alignment(sid,
|
|
m1.size() ? &m1 : NULL,
|
|
m2.size() ? &m2 : NULL, std::cout);
|
|
std::cout << std::endl;
|
|
|
|
}
|
|
}
|
|
|
|
void
|
|
interpret_args(int ac, char* av[])
|
|
{
|
|
po::variables_map vm;
|
|
po::options_description o("Options");
|
|
o.add_options()
|
|
|
|
("help,h", "print this message")
|
|
("maxhits,n", po::value<size_t>(&maxhits)->default_value(25),
|
|
"max. number of hits")
|
|
("q1", po::value<string>(&Q1), "query in L1")
|
|
("q2", po::value<string>(&Q2), "query in L2")
|
|
;
|
|
|
|
po::options_description h("Hidden Options");
|
|
h.add_options()
|
|
("bname", po::value<string>(&bname), "base name of corpus")
|
|
("L1", po::value<string>(&L1), "L1 tag")
|
|
("L2", po::value<string>(&L2), "L2 tag")
|
|
;
|
|
|
|
h.add(o);
|
|
po::positional_options_description a;
|
|
a.add("bname",1);
|
|
a.add("L1",1);
|
|
a.add("L2",1);
|
|
|
|
po::store(po::command_line_parser(ac,av)
|
|
.options(h)
|
|
.positional(a)
|
|
.run(),vm);
|
|
po::notify(vm);
|
|
if (vm.count("help"))
|
|
{
|
|
std::cout << "\nusage:\n\t" << av[0]
|
|
<< " [options] [--q1=<L1string>] [--q2=<L2string>]" << std::endl;
|
|
std::cout << o << std::endl;
|
|
exit(0);
|
|
}
|
|
}
|