diff --git a/compile.sh b/compile.sh index 01f86bc12..10de8c406 100755 --- a/compile.sh +++ b/compile.sh @@ -4,5 +4,5 @@ set -e -o pipefail opt=$(pwd)/opt -./bjam --with-irstlm=$opt --with-boost=$opt --with-cmph=$opt --with-xmlrpc-c=$opt --with-mm --with-probing-pt -j$(getconf _NPROCESSORS_ONLN) $@ +./bjam --with-irstlm=$opt/irstlm-5.80.08 --with-boost=$opt --with-cmph=$opt --with-xmlrpc-c=$opt --with-mm --with-probing-pt -j$(getconf _NPROCESSORS_ONLN) $@ diff --git a/contrib/Makefiles/install-dependencies.gmake b/contrib/Makefiles/install-dependencies.gmake index bdbe7e43a..ce29e5172 100644 --- a/contrib/Makefiles/install-dependencies.gmake +++ b/contrib/Makefiles/install-dependencies.gmake @@ -31,7 +31,7 @@ BUILD_DIR = $(CWD)/opt/build/${URL} # you can also specify specific prefixes for different packages: XMLRPC_PREFIX ?= ${PREFIX} CMPH_PREFIX ?= ${PREFIX} -IRSTLM_PREFIX ?= ${PREFIX} +IRSTLM_PREFIX ?= ${PREFIX}/irstlm-5.80.08 BOOST_PREFIX ?= ${PREFIX} # currently, the full enchilada means xmlrpc-c, cmph, irstlm, boost diff --git a/moses/TranslationModel/UG/check-coverage3.cc b/moses/TranslationModel/UG/check-coverage3.cc index b41ca9025..5eb18164a 100644 --- a/moses/TranslationModel/UG/check-coverage3.cc +++ b/moses/TranslationModel/UG/check-coverage3.cc @@ -13,6 +13,8 @@ #include #include "mm/ug_bitext_sampler.h" +#include +namespace po=boost::program_options; using namespace Moses; using namespace sapt; using namespace std; @@ -21,6 +23,13 @@ using namespace boost; typedef sapt::L2R_Token Token; typedef mmBitext bitext_t; +size_t topN; +string docname; +string reference_file; +string domain_name; +string bname, L1, L2; +string ifile; + struct mycmp { bool operator() (pair const& a, @@ -30,36 +39,70 @@ struct mycmp } }; + + +void interpret_args(int ac, char* av[]); + string -basename(string const path, string const suffix) +basename(string const path) { size_t p = path.find_last_of("/"); - size_t k = path.size() - suffix.size(); - cout << path << " " << suffix << endl; - cout << path.substr(0,p) << " " << path.substr(k) << endl; - return path.substr(p+1, suffix == &path[k] ? k-p-1 : path.size() - p); + string dot = "."; + size_t k = path.find((dot + L1),p+1); + if (k == string::npos) k = path.find(dot + L1 + ".gz"); + if (k == string::npos) return path.substr(p+1); + return path.substr(p+1, k-p-1); +} + +void +print_evidence_list(bitext_t const& B, std::map const& indoc) +{ + typedef std::map::const_iterator iter; + typedef pair item; + vector where; + where.reserve(indoc.size()); + + for (iter d = indoc.begin(); d != indoc.end(); ++d) + where.push_back(item(d->second, B.docid2name(d->first))); + sort(where.begin(),where.end(),greater()); + BOOST_FOREACH(item const& doc, where) + if (domain_name == doc.second) + cout << (boost::format("\t\t%4d ! %s") % doc.first % doc.second) << endl; + else + cout << (boost::format("\t\t%4d %s") % doc.first % doc.second) << endl; } int main(int argc, char* argv[]) { - boost::intrusive_ptr B(new bitext_t); - B->open(argv[1],argv[2],argv[3]); - string line; - string ifile = argv[4]; - string docname = basename(ifile, string(".") + argv[2] + ".gz"); - id_type docid = B->docname2docid(docname); - boost::iostreams::filtering_istream in; + boost::shared_ptr B(new bitext_t); + interpret_args(argc,argv); + + B->open(bname, L1, L2); + string line, refline; + if (domain_name == "" && ifile != "-") + domain_name = basename(ifile); + + id_type docid = B->docname2docid(domain_name); + boost::iostreams::filtering_istream in, ref; ugdiss::open_input_stream(ifile,in); + if (reference_file.size()) + ugdiss::open_input_stream(reference_file,ref); + while(getline(in,line)) { - cout << line << " [" << docname << "]" << endl; + if (reference_file.size()) getline(ref, refline); + cout << string(80,'-') << endl; + cout << " [" << domain_name << "]" << endl; + cout << line << endl; + if (refline.size()) cout << refline << endl; + cout << string(80,'-') << endl; vector snt; B->V1->fillIdSeq(line,snt); for (size_t i = 0; i < snt.size(); ++i) { bitext_t::iter m(B->I1.get()); for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k); - for (size_t num_occurrences = m.ca(); m.size(); m.up()) + for (size_t num_occurrences = 0; m.size(); m.up()) { if (size_t(m.ca()) == num_occurrences) continue; num_occurrences = m.ca(); @@ -68,40 +111,45 @@ int main(int argc, char* argv[]) sapt::random_sampling); s(); if (s.stats()->trg.size() == 0) continue; - // if (s.stats()->indoc[docname] > 10) continue; sapt::pstats::indoc_map_t::const_iterator d = s.stats()->indoc.find(docid); size_t indoccnt = d != s.stats()->indoc.end() ? d->second : 0; - cout << m.size() << " : " << m.str(B->V1.get()) << " (" + cout << m.str(B->V1.get()) << " (" << s.stats()->trg.size() << " entries; " << indoccnt << "/" << s.stats()->good - << " samples in domain)" << endl; + << " samples in domain; " << num_occurrences + << " occ.)" << endl; vector > ppairs; PhrasePair::SortDescendingByJointCount sorter; expand(m,*B,*s.stats(),ppairs,NULL); sort(ppairs.begin(),ppairs.end(),sorter); boost::format fmt("%4d/%d/%d |%s| (%4.2f : %4.2f)"); + size_t ctr = 0; + bool skipped_some = false; BOOST_FOREACH(PhrasePair& ppair, ppairs) { - if (ppair.joint * 100 < ppair.good1) break; + if (++ctr > topN && ppair.indoc.find(docid) == ppair.indoc.end()) + { + skipped_some = true; + continue; + } + if (skipped_some) + { + cout << string(17,' ') << "..." << endl; + skipped_some = false; + } + // if (ppair.joint * 100 < ppair.good1) break; ppair.good2 = ppair.raw2 * float(ppair.good1)/ppair.raw1; ppair.good2 = max(ppair.good2, ppair.joint); -#if 0 +#if 1 cout << "\t" << (fmt % ppair.joint % ppair.good1 % ppair.good2 % B->T2->pid2str(B->V2.get(),ppair.p2) % (float(ppair.joint)/ppair.good1) % (float(ppair.joint)/ppair.good2) ) << "\n"; - typedef std::map::const_iterator iter; - for (iter d = ppair.indoc.begin(); d != ppair.indoc.end(); ++d) - { - // if (d != ppair.indoc.begin()) cout << "; "; - cout << (boost::format("\t\t%4d %s") % d->second - % B->docid2name(d->first)) - << endl; - } + print_evidence_list(*B, ppair.indoc); cout << endl; #else cout << "\t" @@ -126,3 +174,48 @@ int main(int argc, char* argv[]) } } } + +void +interpret_args(int ac, char* av[]) +{ + po::variables_map vm; + po::options_description o("Options"); + o.add_options() + + ("help,h", "print this message") + ("top,n", po::value(&topN)->default_value(5), + "max. number of entries to show") + ("domain,D", po::value(&domain_name), + "domain name (when reading from stdin)") + ("reference,r", po::value(&reference_file), + "reference file") + ; + + po::options_description h("Hidden Options"); + h.add_options() + ("bname", po::value(&bname), "base name of corpus") + ("L1", po::value(&L1), "L1 tag") + ("L2", po::value(&L2), "L2 tag") + ("input", po::value(&ifile), "input file") + ; + + h.add(o); + po::positional_options_description a; + a.add("bname",1); + a.add("L1",1); + a.add("L2",1); + a.add("input",1); + + po::store(po::command_line_parser(ac,av) + .options(h) + .positional(a) + .run(),vm); + po::notify(vm); + if (vm.count("help")) + { + std::cout << "\nusage:\n\t" << av[0] + << " [options] " << std::endl; + std::cout << o << std::endl; + exit(0); + } +} diff --git a/moses/TranslationModel/UG/ptable-lookup-corpus.cc b/moses/TranslationModel/UG/ptable-lookup-corpus.cc index 6bc515658..8818570d4 100644 --- a/moses/TranslationModel/UG/ptable-lookup-corpus.cc +++ b/moses/TranslationModel/UG/ptable-lookup-corpus.cc @@ -29,6 +29,13 @@ int main(int argc, char const* argv[]) // Only lookup each phrase once unordered_set seen; + string context_weight_spec; + params.SetParameter(context_weight_spec,"context-weights",string("")); + boost::shared_ptr scope(new ContextScope); + boost::shared_ptr none; + if (context_weight_spec.size()) + scope->SetContextWeights(context_weight_spec); + string line; while (true) { // Input line @@ -57,7 +64,8 @@ int main(int argc, char const* argv[]) // Setup task for phrase boost::shared_ptr ttask; - ttask = TranslationTask::create(phrase); + ttask = TranslationTask::create(phrase, none, scope); + // Support model combinations (PhraseDictionaryGroup) BOOST_FOREACH(PhraseDictionary* p, PhraseDictionary::GetColl()) { p->InitializeForInput(ttask); diff --git a/moses/TranslationModel/UG/sapt_pscore_logcnt.h b/moses/TranslationModel/UG/sapt_pscore_logcnt.h index d079a0af8..592d86866 100644 --- a/moses/TranslationModel/UG/sapt_pscore_logcnt.h +++ b/moses/TranslationModel/UG/sapt_pscore_logcnt.h @@ -56,7 +56,7 @@ namespace sapt { if (m_specs.find("j") != std::string::npos) (*dest)[i++] = log(pp.joint); if (m_specs.find("r2") != std::string::npos) - (*dest)[++i] = log(pp.raw2); + (*dest)[i] = log(pp.raw2); } }; } // namespace sapt diff --git a/moses/TranslationModel/UG/util/tokenindex.dump.cc b/moses/TranslationModel/UG/util/tokenindex.dump.cc index 0e885630f..01dfeb03e 100644 --- a/moses/TranslationModel/UG/util/tokenindex.dump.cc +++ b/moses/TranslationModel/UG/util/tokenindex.dump.cc @@ -12,7 +12,7 @@ #include using namespace std; -using namespace ugdiss; +using namespace sapt; int main(int argc,char* argv[]) { diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index 173ad5d40..c8571ffce 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -235,6 +235,10 @@ check(std::map const& param, { std::map::const_iterator m = param.find(key); if(m == param.end()) return false; + + if (m->second.type() == xmlrpc_c::value::TYPE_BOOLEAN) + return xmlrpc_c::value_boolean(m->second); + std::string val = string(xmlrpc_c::value_string(m->second)); if(val == "true" || val == "True" || val == "TRUE" || val == "1") return true; return false; diff --git a/run-regtests.sh b/run-regtests.sh index 843ee3a94..f2c02aaa8 100755 --- a/run-regtests.sh +++ b/run-regtests.sh @@ -13,7 +13,7 @@ eval set -- "$args" noserver=false; full=false; j=$(getconf _NPROCESSORS_ONLN) -irstlm=$opt +irstlm=$opt/irstlm-5.80.08 boost=$opt cmph=$opt xmlrpc=--with-xmlrpc-c\=$opt