// for each word in the input, keep track of the longest matching ngram covering it #include #include #include #include #include #include #include "mm/ug_bitext.h" #include "generic/file_io/ug_stream.h" #include #include using namespace Moses; using namespace sapt; using namespace std; using namespace boost; typedef sapt::L2R_Token Token; typedef mmBitext bitext_t; struct mycmp { bool operator() (pair const& a, pair const& b) const { return a.second > b.second; } }; string basename(string const path, string const suffix) { size_t p = path.find_last_of("/"); size_t k = path.size() - suffix.size(); cout << path << " " << suffix << endl; cout << path.substr(0,p) << " " << path.substr(k) << endl; return path.substr(p, suffix == &path[k] ? k-p : path.size() - p); } int main(int argc, char* argv[]) { bitext_t B; B.open(argv[1],argv[2],argv[3]); B.V1->setDynamic(true); string line; string ifile = argv[4]; string docname = basename(ifile, string(".") + argv[2] + ".gz"); boost::iostreams::filtering_istream in; ugdiss::open_input_stream(ifile,in); while(getline(in,line)) { cout << line << " [" << docname << "]" << endl; vector snt; B.V1->fillIdSeq(line,snt); vector match(snt.size(),0); for (size_t i = 0; i < snt.size(); ++i) { bitext_t::iter m(B.I1.get()); for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k); for (size_t j = 0; j < m.size(); ++j) match[i+j] = max(match[i+j], m.size()); } for (size_t i = 0; i < snt.size(); ++i) cout << setw(3) << match[i] << " " << (*B.V1)[snt[i]] << endl; } }