mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-29 23:12:41 +03:00
More analysis tools; work in progress.
This commit is contained in:
parent
b40bc81811
commit
dc8ad89945
67
moses/TranslationModel/UG/check-coverage2.cc
Normal file
67
moses/TranslationModel/UG/check-coverage2.cc
Normal file
@ -0,0 +1,67 @@
|
||||
// for each word in the input, keep track of the longest matching ngram covering it
|
||||
#include <boost/foreach.hpp>
|
||||
#include <boost/format.hpp>
|
||||
#include <boost/tokenizer.hpp>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include "mm/ug_bitext.h"
|
||||
#include "generic/file_io/ug_stream.h"
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
|
||||
using namespace Moses;
|
||||
using namespace sapt;
|
||||
using namespace std;
|
||||
using namespace boost;
|
||||
|
||||
typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
|
||||
typedef mmBitext<Token> bitext_t;
|
||||
|
||||
struct mycmp
|
||||
{
|
||||
bool operator() (pair<string,uint32_t> const& a,
|
||||
pair<string,uint32_t> const& b) const
|
||||
{
|
||||
return a.second > b.second;
|
||||
}
|
||||
};
|
||||
|
||||
string
|
||||
basename(string const path, string const suffix)
|
||||
{
|
||||
size_t p = path.find_last_of("/");
|
||||
size_t k = path.size() - suffix.size();
|
||||
cout << path << " " << suffix << endl;
|
||||
cout << path.substr(0,p) << " " << path.substr(k) << endl;
|
||||
return path.substr(p, suffix == &path[k] ? k-p : path.size() - p);
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bitext_t B;
|
||||
B.open(argv[1],argv[2],argv[3]);
|
||||
B.V1->setDynamic(true);
|
||||
string line;
|
||||
string ifile = argv[4];
|
||||
string docname = basename(ifile, string(".") + argv[2] + ".gz");
|
||||
boost::iostreams::filtering_istream in;
|
||||
ugdiss::open_input_stream(ifile,in);
|
||||
while(getline(in,line))
|
||||
{
|
||||
cout << line << " [" << docname << "]" << endl;
|
||||
vector<id_type> snt;
|
||||
B.V1->fillIdSeq(line,snt);
|
||||
vector<size_t> match(snt.size(),0);
|
||||
for (size_t i = 0; i < snt.size(); ++i)
|
||||
{
|
||||
bitext_t::iter m(B.I1.get());
|
||||
for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k);
|
||||
for (size_t j = 0; j < m.size(); ++j)
|
||||
|
||||
match[i+j] = max(match[i+j], m.size());
|
||||
}
|
||||
for (size_t i = 0; i < snt.size(); ++i)
|
||||
cout << setw(3) << match[i] << " " << (*B.V1)[snt[i]] << endl;
|
||||
}
|
||||
}
|
70
moses/TranslationModel/UG/check-coverage3.cc
Normal file
70
moses/TranslationModel/UG/check-coverage3.cc
Normal file
@ -0,0 +1,70 @@
|
||||
// #include "mmsapt.h"
|
||||
// #include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
|
||||
// #include "moses/TranslationTask.h"
|
||||
#include <boost/foreach.hpp>
|
||||
#include <boost/format.hpp>
|
||||
#include <boost/tokenizer.hpp>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include "mm/ug_bitext.h"
|
||||
#include "generic/file_io/ug_stream.h"
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include "mm/ug_bitext_sampler.h"
|
||||
|
||||
using namespace Moses;
|
||||
using namespace sapt;
|
||||
using namespace std;
|
||||
using namespace boost;
|
||||
|
||||
typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
|
||||
typedef mmBitext<Token> bitext_t;
|
||||
|
||||
struct mycmp
|
||||
{
|
||||
bool operator() (pair<string,uint32_t> const& a,
|
||||
pair<string,uint32_t> const& b) const
|
||||
{
|
||||
return a.second > b.second;
|
||||
}
|
||||
};
|
||||
|
||||
string
|
||||
basename(string const path, string const suffix)
|
||||
{
|
||||
size_t p = path.find_last_of("/");
|
||||
size_t k = path.size() - suffix.size();
|
||||
cout << path << " " << suffix << endl;
|
||||
cout << path.substr(0,p) << " " << path.substr(k) << endl;
|
||||
return path.substr(p, suffix == &path[k] ? k-p : path.size() - p);
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
boost::intrusive_ptr<bitext_t> B(new bitext_t);
|
||||
B->open(argv[1],argv[2],argv[3]);
|
||||
string line;
|
||||
string ifile = argv[4];
|
||||
string docname = basename(ifile, string(".") + argv[2] + ".gz");
|
||||
boost::iostreams::filtering_istream in;
|
||||
ugdiss::open_input_stream(ifile,in);
|
||||
while(getline(in,line))
|
||||
{
|
||||
cout << line << " [" << docname << "]" << endl;
|
||||
vector<id_type> snt;
|
||||
B->V1->fillIdSeq(line,snt);
|
||||
for (size_t i = 0; i < snt.size(); ++i)
|
||||
{
|
||||
bitext_t::iter m(B->I1.get());
|
||||
for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
|
||||
{
|
||||
SPTR<SamplingBias const> zilch;
|
||||
BitextSampler<Token> s(B.get(), m, zilch, 1000, 1000,
|
||||
sapt::random_sampling);
|
||||
s();
|
||||
cout << m.size() << " " << s.stats()->trg.size() << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user