mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-30 15:34:01 +03:00
Merge ../mosesdecoder into perf_moses2
This commit is contained in:
commit
c9a07dd25c
@ -4,5 +4,5 @@
|
|||||||
|
|
||||||
set -e -o pipefail
|
set -e -o pipefail
|
||||||
opt=$(pwd)/opt
|
opt=$(pwd)/opt
|
||||||
./bjam --with-irstlm=$opt --with-boost=$opt --with-cmph=$opt --with-xmlrpc-c=$opt --with-mm --with-probing-pt -j$(getconf _NPROCESSORS_ONLN) $@
|
./bjam --with-irstlm=$opt/irstlm-5.80.08 --with-boost=$opt --with-cmph=$opt --with-xmlrpc-c=$opt --with-mm --with-probing-pt -j$(getconf _NPROCESSORS_ONLN) $@
|
||||||
|
|
||||||
|
@ -31,7 +31,7 @@ BUILD_DIR = $(CWD)/opt/build/${URL}
|
|||||||
# you can also specify specific prefixes for different packages:
|
# you can also specify specific prefixes for different packages:
|
||||||
XMLRPC_PREFIX ?= ${PREFIX}
|
XMLRPC_PREFIX ?= ${PREFIX}
|
||||||
CMPH_PREFIX ?= ${PREFIX}
|
CMPH_PREFIX ?= ${PREFIX}
|
||||||
IRSTLM_PREFIX ?= ${PREFIX}
|
IRSTLM_PREFIX ?= ${PREFIX}/irstlm-5.80.08
|
||||||
BOOST_PREFIX ?= ${PREFIX}
|
BOOST_PREFIX ?= ${PREFIX}
|
||||||
|
|
||||||
# currently, the full enchilada means xmlrpc-c, cmph, irstlm, boost
|
# currently, the full enchilada means xmlrpc-c, cmph, irstlm, boost
|
||||||
|
@ -13,6 +13,8 @@
|
|||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include "mm/ug_bitext_sampler.h"
|
#include "mm/ug_bitext_sampler.h"
|
||||||
|
|
||||||
|
#include <boost/program_options.hpp>
|
||||||
|
namespace po=boost::program_options;
|
||||||
using namespace Moses;
|
using namespace Moses;
|
||||||
using namespace sapt;
|
using namespace sapt;
|
||||||
using namespace std;
|
using namespace std;
|
||||||
@ -21,6 +23,13 @@ using namespace boost;
|
|||||||
typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
|
typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
|
||||||
typedef mmBitext<Token> bitext_t;
|
typedef mmBitext<Token> bitext_t;
|
||||||
|
|
||||||
|
size_t topN;
|
||||||
|
string docname;
|
||||||
|
string reference_file;
|
||||||
|
string domain_name;
|
||||||
|
string bname, L1, L2;
|
||||||
|
string ifile;
|
||||||
|
|
||||||
struct mycmp
|
struct mycmp
|
||||||
{
|
{
|
||||||
bool operator() (pair<string,uint32_t> const& a,
|
bool operator() (pair<string,uint32_t> const& a,
|
||||||
@ -30,36 +39,70 @@ struct mycmp
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void interpret_args(int ac, char* av[]);
|
||||||
|
|
||||||
string
|
string
|
||||||
basename(string const path, string const suffix)
|
basename(string const path)
|
||||||
{
|
{
|
||||||
size_t p = path.find_last_of("/");
|
size_t p = path.find_last_of("/");
|
||||||
size_t k = path.size() - suffix.size();
|
string dot = ".";
|
||||||
cout << path << " " << suffix << endl;
|
size_t k = path.find((dot + L1),p+1);
|
||||||
cout << path.substr(0,p) << " " << path.substr(k) << endl;
|
if (k == string::npos) k = path.find(dot + L1 + ".gz");
|
||||||
return path.substr(p+1, suffix == &path[k] ? k-p-1 : path.size() - p);
|
if (k == string::npos) return path.substr(p+1);
|
||||||
|
return path.substr(p+1, k-p-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
print_evidence_list(bitext_t const& B, std::map<uint32_t, uint32_t> const& indoc)
|
||||||
|
{
|
||||||
|
typedef std::map<uint32_t, uint32_t>::const_iterator iter;
|
||||||
|
typedef pair<size_t,string> item;
|
||||||
|
vector<item> where;
|
||||||
|
where.reserve(indoc.size());
|
||||||
|
|
||||||
|
for (iter d = indoc.begin(); d != indoc.end(); ++d)
|
||||||
|
where.push_back(item(d->second, B.docid2name(d->first)));
|
||||||
|
sort(where.begin(),where.end(),greater<item>());
|
||||||
|
BOOST_FOREACH(item const& doc, where)
|
||||||
|
if (domain_name == doc.second)
|
||||||
|
cout << (boost::format("\t\t%4d ! %s") % doc.first % doc.second) << endl;
|
||||||
|
else
|
||||||
|
cout << (boost::format("\t\t%4d %s") % doc.first % doc.second) << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char* argv[])
|
int main(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
boost::intrusive_ptr<bitext_t> B(new bitext_t);
|
boost::shared_ptr<bitext_t> B(new bitext_t);
|
||||||
B->open(argv[1],argv[2],argv[3]);
|
interpret_args(argc,argv);
|
||||||
string line;
|
|
||||||
string ifile = argv[4];
|
B->open(bname, L1, L2);
|
||||||
string docname = basename(ifile, string(".") + argv[2] + ".gz");
|
string line, refline;
|
||||||
id_type docid = B->docname2docid(docname);
|
if (domain_name == "" && ifile != "-")
|
||||||
boost::iostreams::filtering_istream in;
|
domain_name = basename(ifile);
|
||||||
|
|
||||||
|
id_type docid = B->docname2docid(domain_name);
|
||||||
|
boost::iostreams::filtering_istream in, ref;
|
||||||
ugdiss::open_input_stream(ifile,in);
|
ugdiss::open_input_stream(ifile,in);
|
||||||
|
if (reference_file.size())
|
||||||
|
ugdiss::open_input_stream(reference_file,ref);
|
||||||
|
|
||||||
while(getline(in,line))
|
while(getline(in,line))
|
||||||
{
|
{
|
||||||
cout << line << " [" << docname << "]" << endl;
|
if (reference_file.size()) getline(ref, refline);
|
||||||
|
cout << string(80,'-') << endl;
|
||||||
|
cout << " [" << domain_name << "]" << endl;
|
||||||
|
cout << line << endl;
|
||||||
|
if (refline.size()) cout << refline << endl;
|
||||||
|
cout << string(80,'-') << endl;
|
||||||
vector<id_type> snt;
|
vector<id_type> snt;
|
||||||
B->V1->fillIdSeq(line,snt);
|
B->V1->fillIdSeq(line,snt);
|
||||||
for (size_t i = 0; i < snt.size(); ++i)
|
for (size_t i = 0; i < snt.size(); ++i)
|
||||||
{
|
{
|
||||||
bitext_t::iter m(B->I1.get());
|
bitext_t::iter m(B->I1.get());
|
||||||
for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k);
|
for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k);
|
||||||
for (size_t num_occurrences = m.ca(); m.size(); m.up())
|
for (size_t num_occurrences = 0; m.size(); m.up())
|
||||||
{
|
{
|
||||||
if (size_t(m.ca()) == num_occurrences) continue;
|
if (size_t(m.ca()) == num_occurrences) continue;
|
||||||
num_occurrences = m.ca();
|
num_occurrences = m.ca();
|
||||||
@ -68,40 +111,45 @@ int main(int argc, char* argv[])
|
|||||||
sapt::random_sampling);
|
sapt::random_sampling);
|
||||||
s();
|
s();
|
||||||
if (s.stats()->trg.size() == 0) continue;
|
if (s.stats()->trg.size() == 0) continue;
|
||||||
// if (s.stats()->indoc[docname] > 10) continue;
|
|
||||||
sapt::pstats::indoc_map_t::const_iterator d
|
sapt::pstats::indoc_map_t::const_iterator d
|
||||||
= s.stats()->indoc.find(docid);
|
= s.stats()->indoc.find(docid);
|
||||||
size_t indoccnt = d != s.stats()->indoc.end() ? d->second : 0;
|
size_t indoccnt = d != s.stats()->indoc.end() ? d->second : 0;
|
||||||
cout << m.size() << " : " << m.str(B->V1.get()) << " ("
|
cout << m.str(B->V1.get()) << " ("
|
||||||
<< s.stats()->trg.size() << " entries; "
|
<< s.stats()->trg.size() << " entries; "
|
||||||
<< indoccnt << "/" << s.stats()->good
|
<< indoccnt << "/" << s.stats()->good
|
||||||
<< " samples in domain)" << endl;
|
<< " samples in domain; " << num_occurrences
|
||||||
|
<< " occ.)" << endl;
|
||||||
vector<PhrasePair<Token> > ppairs;
|
vector<PhrasePair<Token> > ppairs;
|
||||||
PhrasePair<Token>::SortDescendingByJointCount sorter;
|
PhrasePair<Token>::SortDescendingByJointCount sorter;
|
||||||
expand(m,*B,*s.stats(),ppairs,NULL);
|
expand(m,*B,*s.stats(),ppairs,NULL);
|
||||||
sort(ppairs.begin(),ppairs.end(),sorter);
|
sort(ppairs.begin(),ppairs.end(),sorter);
|
||||||
boost::format fmt("%4d/%d/%d |%s| (%4.2f : %4.2f)");
|
boost::format fmt("%4d/%d/%d |%s| (%4.2f : %4.2f)");
|
||||||
|
size_t ctr = 0;
|
||||||
|
bool skipped_some = false;
|
||||||
BOOST_FOREACH(PhrasePair<Token>& ppair, ppairs)
|
BOOST_FOREACH(PhrasePair<Token>& ppair, ppairs)
|
||||||
{
|
{
|
||||||
if (ppair.joint * 100 < ppair.good1) break;
|
if (++ctr > topN && ppair.indoc.find(docid) == ppair.indoc.end())
|
||||||
|
{
|
||||||
|
skipped_some = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (skipped_some)
|
||||||
|
{
|
||||||
|
cout << string(17,' ') << "..." << endl;
|
||||||
|
skipped_some = false;
|
||||||
|
}
|
||||||
|
// if (ppair.joint * 100 < ppair.good1) break;
|
||||||
ppair.good2 = ppair.raw2 * float(ppair.good1)/ppair.raw1;
|
ppair.good2 = ppair.raw2 * float(ppair.good1)/ppair.raw1;
|
||||||
ppair.good2 = max(ppair.good2, ppair.joint);
|
ppair.good2 = max(ppair.good2, ppair.joint);
|
||||||
|
|
||||||
#if 0
|
#if 1
|
||||||
cout << "\t"
|
cout << "\t"
|
||||||
<< (fmt % ppair.joint % ppair.good1 % ppair.good2
|
<< (fmt % ppair.joint % ppair.good1 % ppair.good2
|
||||||
% B->T2->pid2str(B->V2.get(),ppair.p2)
|
% B->T2->pid2str(B->V2.get(),ppair.p2)
|
||||||
% (float(ppair.joint)/ppair.good1)
|
% (float(ppair.joint)/ppair.good1)
|
||||||
% (float(ppair.joint)/ppair.good2)
|
% (float(ppair.joint)/ppair.good2)
|
||||||
) << "\n";
|
) << "\n";
|
||||||
typedef std::map<uint32_t, uint32_t>::const_iterator iter;
|
print_evidence_list(*B, ppair.indoc);
|
||||||
for (iter d = ppair.indoc.begin(); d != ppair.indoc.end(); ++d)
|
|
||||||
{
|
|
||||||
// if (d != ppair.indoc.begin()) cout << "; ";
|
|
||||||
cout << (boost::format("\t\t%4d %s") % d->second
|
|
||||||
% B->docid2name(d->first))
|
|
||||||
<< endl;
|
|
||||||
}
|
|
||||||
cout << endl;
|
cout << endl;
|
||||||
#else
|
#else
|
||||||
cout << "\t"
|
cout << "\t"
|
||||||
@ -126,3 +174,48 @@ int main(int argc, char* argv[])
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
interpret_args(int ac, char* av[])
|
||||||
|
{
|
||||||
|
po::variables_map vm;
|
||||||
|
po::options_description o("Options");
|
||||||
|
o.add_options()
|
||||||
|
|
||||||
|
("help,h", "print this message")
|
||||||
|
("top,n", po::value<size_t>(&topN)->default_value(5),
|
||||||
|
"max. number of entries to show")
|
||||||
|
("domain,D", po::value<string>(&domain_name),
|
||||||
|
"domain name (when reading from stdin)")
|
||||||
|
("reference,r", po::value<string>(&reference_file),
|
||||||
|
"reference file")
|
||||||
|
;
|
||||||
|
|
||||||
|
po::options_description h("Hidden Options");
|
||||||
|
h.add_options()
|
||||||
|
("bname", po::value<string>(&bname), "base name of corpus")
|
||||||
|
("L1", po::value<string>(&L1), "L1 tag")
|
||||||
|
("L2", po::value<string>(&L2), "L2 tag")
|
||||||
|
("input", po::value<string>(&ifile), "input file")
|
||||||
|
;
|
||||||
|
|
||||||
|
h.add(o);
|
||||||
|
po::positional_options_description a;
|
||||||
|
a.add("bname",1);
|
||||||
|
a.add("L1",1);
|
||||||
|
a.add("L2",1);
|
||||||
|
a.add("input",1);
|
||||||
|
|
||||||
|
po::store(po::command_line_parser(ac,av)
|
||||||
|
.options(h)
|
||||||
|
.positional(a)
|
||||||
|
.run(),vm);
|
||||||
|
po::notify(vm);
|
||||||
|
if (vm.count("help"))
|
||||||
|
{
|
||||||
|
std::cout << "\nusage:\n\t" << av[0]
|
||||||
|
<< " [options] <model file stem> <L1> <L2> <input file>" << std::endl;
|
||||||
|
std::cout << o << std::endl;
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -29,6 +29,13 @@ int main(int argc, char const* argv[])
|
|||||||
// Only lookup each phrase once
|
// Only lookup each phrase once
|
||||||
unordered_set<string> seen;
|
unordered_set<string> seen;
|
||||||
|
|
||||||
|
string context_weight_spec;
|
||||||
|
params.SetParameter(context_weight_spec,"context-weights",string(""));
|
||||||
|
boost::shared_ptr<ContextScope> scope(new ContextScope);
|
||||||
|
boost::shared_ptr<IOWrapper> none;
|
||||||
|
if (context_weight_spec.size())
|
||||||
|
scope->SetContextWeights(context_weight_spec);
|
||||||
|
|
||||||
string line;
|
string line;
|
||||||
while (true) {
|
while (true) {
|
||||||
// Input line
|
// Input line
|
||||||
@ -57,7 +64,8 @@ int main(int argc, char const* argv[])
|
|||||||
|
|
||||||
// Setup task for phrase
|
// Setup task for phrase
|
||||||
boost::shared_ptr<TranslationTask> ttask;
|
boost::shared_ptr<TranslationTask> ttask;
|
||||||
ttask = TranslationTask::create(phrase);
|
ttask = TranslationTask::create(phrase, none, scope);
|
||||||
|
|
||||||
// Support model combinations (PhraseDictionaryGroup)
|
// Support model combinations (PhraseDictionaryGroup)
|
||||||
BOOST_FOREACH(PhraseDictionary* p, PhraseDictionary::GetColl()) {
|
BOOST_FOREACH(PhraseDictionary* p, PhraseDictionary::GetColl()) {
|
||||||
p->InitializeForInput(ttask);
|
p->InitializeForInput(ttask);
|
||||||
|
@ -56,7 +56,7 @@ namespace sapt {
|
|||||||
if (m_specs.find("j") != std::string::npos)
|
if (m_specs.find("j") != std::string::npos)
|
||||||
(*dest)[i++] = log(pp.joint);
|
(*dest)[i++] = log(pp.joint);
|
||||||
if (m_specs.find("r2") != std::string::npos)
|
if (m_specs.find("r2") != std::string::npos)
|
||||||
(*dest)[++i] = log(pp.raw2);
|
(*dest)[i] = log(pp.raw2);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
} // namespace sapt
|
} // namespace sapt
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace ugdiss;
|
using namespace sapt;
|
||||||
int
|
int
|
||||||
main(int argc,char* argv[])
|
main(int argc,char* argv[])
|
||||||
{
|
{
|
||||||
|
@ -235,6 +235,10 @@ check(std::map<std::string, xmlrpc_c::value> const& param,
|
|||||||
{
|
{
|
||||||
std::map<std::string, xmlrpc_c::value>::const_iterator m = param.find(key);
|
std::map<std::string, xmlrpc_c::value>::const_iterator m = param.find(key);
|
||||||
if(m == param.end()) return false;
|
if(m == param.end()) return false;
|
||||||
|
|
||||||
|
if (m->second.type() == xmlrpc_c::value::TYPE_BOOLEAN)
|
||||||
|
return xmlrpc_c::value_boolean(m->second);
|
||||||
|
|
||||||
std::string val = string(xmlrpc_c::value_string(m->second));
|
std::string val = string(xmlrpc_c::value_string(m->second));
|
||||||
if(val == "true" || val == "True" || val == "TRUE" || val == "1") return true;
|
if(val == "true" || val == "True" || val == "TRUE" || val == "1") return true;
|
||||||
return false;
|
return false;
|
||||||
|
@ -13,7 +13,7 @@ eval set -- "$args"
|
|||||||
noserver=false;
|
noserver=false;
|
||||||
full=false;
|
full=false;
|
||||||
j=$(getconf _NPROCESSORS_ONLN)
|
j=$(getconf _NPROCESSORS_ONLN)
|
||||||
irstlm=$opt
|
irstlm=$opt/irstlm-5.80.08
|
||||||
boost=$opt
|
boost=$opt
|
||||||
cmph=$opt
|
cmph=$opt
|
||||||
xmlrpc=--with-xmlrpc-c\=$opt
|
xmlrpc=--with-xmlrpc-c\=$opt
|
||||||
|
Loading…
Reference in New Issue
Block a user