Work in progress.

This commit is contained in:
Ulrich Germann 2015-11-20 17:07:17 +00:00
parent 7b8a875efb
commit 1fa81806c1

View File

@ -37,7 +37,7 @@ basename(string const path, string const suffix)
size_t k = path.size() - suffix.size();
cout << path << " " << suffix << endl;
cout << path.substr(0,p) << " " << path.substr(k) << endl;
return path.substr(p, suffix == &path[k] ? k-p : path.size() - p);
return path.substr(p+1, suffix == &path[k] ? k-p-1 : path.size() - p);
}
int main(int argc, char* argv[])
@ -47,6 +47,7 @@ int main(int argc, char* argv[])
string line;
string ifile = argv[4];
string docname = basename(ifile, string(".") + argv[2] + ".gz");
id_type docid = B->docname2docid(docname);
boost::iostreams::filtering_istream in;
ugdiss::open_input_stream(ifile,in);
while(getline(in,line))
@ -57,13 +58,70 @@ int main(int argc, char* argv[])
for (size_t i = 0; i < snt.size(); ++i)
{
bitext_t::iter m(B->I1.get());
for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k);
for (size_t num_occurrences = m.ca(); m.size(); m.up())
{
if (size_t(m.ca()) == num_occurrences) continue;
num_occurrences = m.ca();
SPTR<SamplingBias const> zilch;
BitextSampler<Token> s(B.get(), m, zilch, 1000, 1000,
sapt::random_sampling);
s();
cout << m.size() << " " << s.stats()->trg.size() << endl;
if (s.stats()->trg.size() == 0) continue;
// if (s.stats()->indoc[docname] > 10) continue;
sapt::pstats::indoc_map_t::const_iterator d
= s.stats()->indoc.find(docid);
size_t indoccnt = d != s.stats()->indoc.end() ? d->second : 0;
cout << m.size() << " : " << m.str(B->V1.get()) << " ("
<< s.stats()->trg.size() << " entries; "
<< indoccnt << "/" << s.stats()->good
<< " samples in domain)" << endl;
vector<PhrasePair<Token> > ppairs;
PhrasePair<Token>::SortDescendingByJointCount sorter;
expand(m,*B,*s.stats(),ppairs,NULL);
sort(ppairs.begin(),ppairs.end(),sorter);
boost::format fmt("%4d/%d/%d |%s| (%4.2f : %4.2f)");
BOOST_FOREACH(PhrasePair<Token>& ppair, ppairs)
{
if (ppair.joint * 100 < ppair.good1) break;
ppair.good2 = ppair.raw2 * float(ppair.good1)/ppair.raw1;
ppair.good2 = max(ppair.good2, ppair.joint);
#if 0
cout << "\t"
<< (fmt % ppair.joint % ppair.good1 % ppair.good2
% B->T2->pid2str(B->V2.get(),ppair.p2)
% (float(ppair.joint)/ppair.good1)
% (float(ppair.joint)/ppair.good2)
) << "\n";
typedef std::map<uint32_t, uint32_t>::const_iterator iter;
for (iter d = ppair.indoc.begin(); d != ppair.indoc.end(); ++d)
{
// if (d != ppair.indoc.begin()) cout << "; ";
cout << (boost::format("\t\t%4d %s") % d->second
% B->docid2name(d->first))
<< endl;
}
cout << endl;
#else
cout << "\t"
<< (fmt % ppair.joint % ppair.good1 % ppair.good2
% B->T2->pid2str(B->V2.get(),ppair.p2)
% (float(ppair.joint)/ppair.good1)
% (float(ppair.joint)/ppair.good2)
) << " [";
typedef std::map<uint32_t, uint32_t>::const_iterator iter;
for (iter d = ppair.indoc.begin(); d != ppair.indoc.end(); ++d)
{
if (d != ppair.indoc.begin()) cout << "; ";
cout << (boost::format("%s: %d") % B->docid2name(d->first)
% d->second) ;
}
cout << "]" << endl;
#endif
}
}
}
}