More analysis tools; work in progress.

2024-12-29 23:12:41 +03:00 · 2015-10-30 14:32:12 +00:00 · 2015-10-30 14:32:12 +00:00 · dc8ad89945
commit dc8ad89945
parent b40bc81811
2 changed files with 137 additions and 0 deletions
--- a/moses/TranslationModel/UG/check-coverage2.cc
+++ b/moses/TranslationModel/UG/check-coverage2.cc
@ -0,0 +1,67 @@
+// for each word in the input, keep track of the longest matching ngram covering it
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <iostream>
+#include "mm/ug_bitext.h"
+#include "generic/file_io/ug_stream.h"
+#include <string>
+#include <sstream>
+
+using namespace Moses;
+using namespace sapt;
+using namespace std;
+using namespace boost;
+
+typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
+typedef mmBitext<Token> bitext_t;
+
+// Strict ordering predicate on (string, uint32_t) pairs that looks only at
+// the numeric member: lhs goes before rhs when lhs.second is the larger one,
+// so sorting with it yields descending .second.
+struct mycmp
+{
+  bool operator() (pair<string,uint32_t> const& lhs,
+                   pair<string,uint32_t> const& rhs) const
+  {
+    return rhs.second < lhs.second;
+  }
+};
+
+/**
+ * Return the file-name component of @p path with @p suffix removed when the
+ * name ends in it (in the spirit of the POSIX `basename path suffix` utility).
+ *
+ * - Only '/' is treated as a directory separator; the separator itself is
+ *   not part of the result ("/dir/doc.en.gz", ".en.gz" -> "doc").
+ * - A path without any '/' is taken to be a bare file name.
+ * - A suffix that is longer than the file name, or that does not match its
+ *   tail, leaves the name untouched.
+ *
+ * @param path    path to take the file name from
+ * @param suffix  trailing text to strip from the file name; may be empty
+ * @return the file name without directory part and without a matching suffix
+ *
+ * Side effect: two trace lines (the arguments, then the directory part and
+ * the compared tail of the path) are written to stdout.
+ */
+string
+basename(string const path, string const suffix)
+{
+  size_t const slash = path.find_last_of("/");
+  // find_last_of() yields npos when there is no separator; handing that to
+  // substr() as a start position would throw std::out_of_range.
+  size_t const start = (slash == string::npos) ? 0 : slash + 1;
+  string const name = path.substr(start);
+
+  // Test the size first so the unsigned subtraction below can never wrap.
+  bool const strip = name.size() >= suffix.size()
+    && name.compare(name.size() - suffix.size(), string::npos, suffix) == 0;
+
+  // Work-in-progress trace output, kept on stdout.
+  cout << path << " " << suffix << endl;
+  cout << path.substr(0, slash) << " "
+       << (suffix.size() <= path.size()
+           ? path.substr(path.size() - suffix.size())
+           : path)
+       << endl;
+
+  return strip ? name.substr(0, name.size() - suffix.size()) : name;
+}
+
+/**
+ * For every line of the input file, print the line followed by one row per
+ * token giving the length of the longest n-gram match (found by extending an
+ * iterator over B.I1) that covers that token.
+ *
+ * Command line, as consumed below:
+ *   argv[1..3]  handed unchanged to bitext_t::open(); argv[2] is also used
+ *               to build the ".<argv[2]>.gz" suffix stripped from the input
+ *               file name (presumably base name and the two language tags --
+ *               confirm against mmBitext::open)
+ *   argv[4]     input file, opened through ugdiss::open_input_stream()
+ *
+ * @return 1 when too few arguments are given, 0 otherwise
+ */
+int main(int argc, char* argv[])
+{
+  // All four arguments are dereferenced unconditionally below; building a
+  // std::string from a missing (null) argv entry is undefined behaviour.
+  if (argc < 5)
+    {
+      cerr << "usage: " << (argc > 0 ? argv[0] : "check-coverage2")
+           << " <base> <L1> <L2> <input-file>" << endl;
+      return 1;
+    }
+
+  bitext_t B;
+  B.open(argv[1],argv[2],argv[3]);
+  // NOTE(review): presumably lets the vocabulary hand out ids for words it
+  // has not seen before -- confirm against the TokenIndex implementation.
+  B.V1->setDynamic(true);
+  string line;
+  string ifile = argv[4];
+  string docname = basename(ifile, string(".") + argv[2] + ".gz");
+  boost::iostreams::filtering_istream in;
+  ugdiss::open_input_stream(ifile,in);
+  while(getline(in,line))
+    {
+      cout << line << " [" << docname << "]" << endl;
+      vector<id_type> snt;
+      B.V1->fillIdSeq(line,snt);
+      // match[p] = length of the longest match found so far that covers p
+      vector<size_t> match(snt.size(),0);
+      for (size_t i = 0; i < snt.size(); ++i)
+        {
+          // Start a fresh iterator and extend it token by token from
+          // position i until the sentence ends or extend() fails.
+          bitext_t::iter m(B.I1.get());
+          for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
+            ;
+          // Each position spanned by this match keeps the larger length.
+          for (size_t j = 0; j < m.size(); ++j)
+            match[i+j] = max(match[i+j], m.size());
+        }
+      for (size_t i = 0; i < snt.size(); ++i)
+        cout << setw(3) << match[i] << " " << (*B.V1)[snt[i]] << endl;
+    }
+  return 0;
+}
--- a/moses/TranslationModel/UG/check-coverage3.cc
+++ b/moses/TranslationModel/UG/check-coverage3.cc
@ -0,0 +1,70 @@
+// #include "mmsapt.h"
+// #include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
+// #include "moses/TranslationTask.h"
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <iostream>
+#include "mm/ug_bitext.h"
+#include "generic/file_io/ug_stream.h"
+#include <string>
+#include <sstream>
+#include "mm/ug_bitext_sampler.h"
+
+using namespace Moses;
+using namespace sapt;
+using namespace std;
+using namespace boost;
+
+typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
+typedef mmBitext<Token> bitext_t;
+
+// Strict ordering predicate on (string, uint32_t) pairs that looks only at
+// the numeric member: lhs goes before rhs when lhs.second is the larger one,
+// so sorting with it yields descending .second.
+struct mycmp
+{
+  bool operator() (pair<string,uint32_t> const& lhs,
+                   pair<string,uint32_t> const& rhs) const
+  {
+    return rhs.second < lhs.second;
+  }
+};
+
+/**
+ * Return the file-name component of @p path with @p suffix removed when the
+ * name ends in it (in the spirit of the POSIX `basename path suffix` utility).
+ *
+ * - Only '/' is treated as a directory separator; the separator itself is
+ *   not part of the result ("/dir/doc.en.gz", ".en.gz" -> "doc").
+ * - A path without any '/' is taken to be a bare file name.
+ * - A suffix that is longer than the file name, or that does not match its
+ *   tail, leaves the name untouched.
+ *
+ * @param path    path to take the file name from
+ * @param suffix  trailing text to strip from the file name; may be empty
+ * @return the file name without directory part and without a matching suffix
+ *
+ * Side effect: two trace lines (the arguments, then the directory part and
+ * the compared tail of the path) are written to stdout.
+ */
+string
+basename(string const path, string const suffix)
+{
+  size_t const slash = path.find_last_of("/");
+  // find_last_of() yields npos when there is no separator; handing that to
+  // substr() as a start position would throw std::out_of_range.
+  size_t const start = (slash == string::npos) ? 0 : slash + 1;
+  string const name = path.substr(start);
+
+  // Test the size first so the unsigned subtraction below can never wrap.
+  bool const strip = name.size() >= suffix.size()
+    && name.compare(name.size() - suffix.size(), string::npos, suffix) == 0;
+
+  // Work-in-progress trace output, kept on stdout.
+  cout << path << " " << suffix << endl;
+  cout << path.substr(0, slash) << " "
+       << (suffix.size() <= path.size()
+           ? path.substr(path.size() - suffix.size())
+           : path)
+       << endl;
+
+  return strip ? name.substr(0, name.size() - suffix.size()) : name;
+}
+
+/**
+ * For every line of the input file and every start position in it, extend an
+ * iterator over B->I1 one token at a time; after each successful extension
+ * run a BitextSampler on the current match and print the match length
+ * together with the size of the sampler's `trg` statistics.
+ *
+ * Command line, as consumed below:
+ *   argv[1..3]  handed unchanged to bitext_t::open(); argv[2] is also used
+ *               to build the ".<argv[2]>.gz" suffix stripped from the input
+ *               file name (presumably base name and the two language tags --
+ *               confirm against mmBitext::open)
+ *   argv[4]     input file, opened through ugdiss::open_input_stream()
+ *
+ * @return 1 when too few arguments are given, 0 otherwise
+ */
+int main(int argc, char* argv[])
+{
+  // All four arguments are dereferenced unconditionally below; building a
+  // std::string from a missing (null) argv entry is undefined behaviour.
+  if (argc < 5)
+    {
+      cerr << "usage: " << (argc > 0 ? argv[0] : "check-coverage3")
+           << " <base> <L1> <L2> <input-file>" << endl;
+      return 1;
+    }
+
+  boost::intrusive_ptr<bitext_t> B(new bitext_t);
+  B->open(argv[1],argv[2],argv[3]);
+  string line;
+  string ifile = argv[4];
+  string docname = basename(ifile, string(".") + argv[2] + ".gz");
+  boost::iostreams::filtering_istream in;
+  ugdiss::open_input_stream(ifile,in);
+  while(getline(in,line))
+    {
+      cout << line << " [" << docname << "]" << endl;
+      vector<id_type> snt;
+      B->V1->fillIdSeq(line,snt);
+      for (size_t i = 0; i < snt.size(); ++i)
+        {
+          bitext_t::iter m(B->I1.get());
+          // The loop body runs once per successful extension, i.e. once for
+          // every matching n-gram that starts at position i.
+          for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
+            {
+              // Null bias pointer: no SamplingBias object is passed in.
+              SPTR<SamplingBias const> zilch;
+              // NOTE(review): the two 1000s are presumably sample-size
+              // limits -- confirm against the BitextSampler constructor.
+              BitextSampler<Token> s(B.get(), m, zilch, 1000, 1000,
+                                     sapt::random_sampling);
+              s();
+              cout << m.size() << " " << s.stats()->trg.size() << endl;
+            }
+        }
+    }
+  return 0;
+}