Initial check-in.

This commit is contained in:
Ulrich Germann 2014-02-08 17:50:26 +00:00
parent 584626a767
commit 784654c831
2 changed files with 133 additions and 0 deletions

View File

@ -0,0 +1,69 @@
// count words in a memory-mapped corpus
#include "ug_mm_ttrack.h"
#include "tpt_tokenindex.h"
#include "ug_corpus_token.h"
#include <string>
#include <vector>
#include <cassert>
#include <boost/unordered_map.hpp>
#include <boost/foreach.hpp>
#include <iomanip>
#include "ug_typedefs.h"
#include "tpt_pickler.h"
// #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
// #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
// #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include <algorithm>
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
// Token type stored in the corpus track: L2R_Token over SimpleWordId.
typedef L2R_Token<SimpleWordId> Token;
// typedef mmTSA<Token>::tree_iterator iter;
// Counter keyed by a pair of size_t values.
// NOTE(review): not referenced anywhere in the visible code.
typedef boost::unordered_map<pair<size_t,size_t>,size_t> phrase_counter_t;
// NOTE(review): not referenced anywhere in the visible code.
#define CACHING_THRESHOLD 1000
mmTtrack<Token> T; // token track; main() opens it from bname + ".mct"
TokenIndex V; // vocabulary; main() opens it from bname + ".tdx"
// mmTSA<Token> I; // suffix arrays
// Command line parser; defined after main().
void interpret_args(int ac, char* av[]);
string bname; // base name of the corpus files; filled in by interpret_args()
bool echo; // NOTE(review): never read or written in the visible code
int main(int argc, char* argv[])
{
interpret_args(argc,argv);
T.open(bname+".mct");
V.open(bname+".tdx");
vector<size_t> cnt(V.ksize(),0);
for (size_t sid = 0; sid < T.size(); ++sid)
{
Token const* stop = T.sntEnd(sid);
for (Token const* t = T.sntStart(sid); t < stop; ++cnt[(t++)->id()]);
}
for (size_t wid = 2; wid < V.ksize(); ++wid)
cout << V[wid] << " " << cnt[wid] << endl;
exit(0);
}
void
interpret_args(int ac, char* av[])
{
namespace po=boost::program_options;
po::variables_map vm;
po::options_description o("Options");
po::options_description h("Hidden Options");
po::positional_options_description a;
o.add_options()
("help,h", "print this message")
;
h.add_options()
("bname", po::value<string>(&bname), "base name")
;
a.add("bname",1);
get_options(ac,av,h.add(o),a,vm);
}

View File

@ -0,0 +1,64 @@
// -*- c++ -*-
// test program for dynamic tsas
#include <boost/program_options.hpp>
#include <boost/program_options/options_description.hpp>
#include <boost/program_options/parsers.hpp>
#include <boost/program_options/variables_map.hpp>
#include <boost/iostreams/device/mapped_file.hpp>
#include <iostream>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <vector>
#include <string>
#include <sys/types.h>
#include <sys/wait.h>
#include "ug_conll_record.h"
#include "tpt_tokenindex.h"
#include "ug_mm_ttrack.h"
#include "tpt_pickler.h"
#include "ug_deptree.h"
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "ug_im_ttrack.h"
#include "ug_bitext.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
using namespace boost;
using namespace Moses::bitext;
// NOTE(review): the alias po is not used in the visible code.
namespace po=boost::program_options;
// Token type of the in-memory bitext built in main().
typedef L2R_Token<SimpleWordId> L2R;
int main()
{
sptr<imBitext<L2R> > bt(new imBitext<L2R>());
string s1,s2,aln;
vector<string> S1,S2,ALN;
while (getline(cin,s1) && getline(cin,s2) && getline(cin,aln))
{
S1.push_back(s1);
S2.push_back(s2);
ALN.push_back(aln);
}
bt = bt->add(S1,S2,ALN);
TSA<L2R>::tree_iterator m(bt->I2.get());
m.down();
do {
char const* p = m.lower_bound(-1);
tsa::ArrayEntry I(p);
do {
m.root->readEntry(I.next,I);
L2R const* stop = m.root->getCorpus()->sntEnd(I.sid);
for (L2R const* t = m.root->getCorpus()->getToken(I); t < stop; ++t)
cout << (*bt->V2)[t->id()] << " ";
cout << endl;
} while (I.next < m.upper_bound(-1));
} while (m.over());
}