mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-11 19:27:11 +03:00
Merge ../master into sync-with-mmt
This commit is contained in:
commit
894644d615
@ -37,7 +37,7 @@ basename(string const path, string const suffix)
|
||||
size_t k = path.size() - suffix.size();
|
||||
cout << path << " " << suffix << endl;
|
||||
cout << path.substr(0,p) << " " << path.substr(k) << endl;
|
||||
return path.substr(p, suffix == &path[k] ? k-p : path.size() - p);
|
||||
return path.substr(p+1, suffix == &path[k] ? k-p-1 : path.size() - p);
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
@ -47,6 +47,7 @@ int main(int argc, char* argv[])
|
||||
string line;
|
||||
string ifile = argv[4];
|
||||
string docname = basename(ifile, string(".") + argv[2] + ".gz");
|
||||
id_type docid = B->docname2docid(docname);
|
||||
boost::iostreams::filtering_istream in;
|
||||
ugdiss::open_input_stream(ifile,in);
|
||||
while(getline(in,line))
|
||||
@ -57,13 +58,70 @@ int main(int argc, char* argv[])
|
||||
for (size_t i = 0; i < snt.size(); ++i)
|
||||
{
|
||||
bitext_t::iter m(B->I1.get());
|
||||
for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
|
||||
for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k);
|
||||
for (size_t num_occurrences = m.ca(); m.size(); m.up())
|
||||
{
|
||||
if (size_t(m.ca()) == num_occurrences) continue;
|
||||
num_occurrences = m.ca();
|
||||
SPTR<SamplingBias const> zilch;
|
||||
BitextSampler<Token> s(B.get(), m, zilch, 1000, 1000,
|
||||
sapt::random_sampling);
|
||||
s();
|
||||
cout << m.size() << " " << s.stats()->trg.size() << endl;
|
||||
if (s.stats()->trg.size() == 0) continue;
|
||||
// if (s.stats()->indoc[docname] > 10) continue;
|
||||
sapt::pstats::indoc_map_t::const_iterator d
|
||||
= s.stats()->indoc.find(docid);
|
||||
size_t indoccnt = d != s.stats()->indoc.end() ? d->second : 0;
|
||||
cout << m.size() << " : " << m.str(B->V1.get()) << " ("
|
||||
<< s.stats()->trg.size() << " entries; "
|
||||
<< indoccnt << "/" << s.stats()->good
|
||||
<< " samples in domain)" << endl;
|
||||
vector<PhrasePair<Token> > ppairs;
|
||||
PhrasePair<Token>::SortDescendingByJointCount sorter;
|
||||
expand(m,*B,*s.stats(),ppairs,NULL);
|
||||
sort(ppairs.begin(),ppairs.end(),sorter);
|
||||
boost::format fmt("%4d/%d/%d |%s| (%4.2f : %4.2f)");
|
||||
BOOST_FOREACH(PhrasePair<Token>& ppair, ppairs)
|
||||
{
|
||||
if (ppair.joint * 100 < ppair.good1) break;
|
||||
ppair.good2 = ppair.raw2 * float(ppair.good1)/ppair.raw1;
|
||||
ppair.good2 = max(ppair.good2, ppair.joint);
|
||||
|
||||
#if 0
|
||||
cout << "\t"
|
||||
<< (fmt % ppair.joint % ppair.good1 % ppair.good2
|
||||
% B->T2->pid2str(B->V2.get(),ppair.p2)
|
||||
% (float(ppair.joint)/ppair.good1)
|
||||
% (float(ppair.joint)/ppair.good2)
|
||||
) << "\n";
|
||||
typedef std::map<uint32_t, uint32_t>::const_iterator iter;
|
||||
for (iter d = ppair.indoc.begin(); d != ppair.indoc.end(); ++d)
|
||||
{
|
||||
// if (d != ppair.indoc.begin()) cout << "; ";
|
||||
cout << (boost::format("\t\t%4d %s") % d->second
|
||||
% B->docid2name(d->first))
|
||||
<< endl;
|
||||
}
|
||||
cout << endl;
|
||||
#else
|
||||
cout << "\t"
|
||||
<< (fmt % ppair.joint % ppair.good1 % ppair.good2
|
||||
% B->T2->pid2str(B->V2.get(),ppair.p2)
|
||||
% (float(ppair.joint)/ppair.good1)
|
||||
% (float(ppair.joint)/ppair.good2)
|
||||
) << " [";
|
||||
typedef std::map<uint32_t, uint32_t>::const_iterator iter;
|
||||
for (iter d = ppair.indoc.begin(); d != ppair.indoc.end(); ++d)
|
||||
{
|
||||
if (d != ppair.indoc.begin()) cout << "; ";
|
||||
cout << (boost::format("%s: %d") % B->docid2name(d->first)
|
||||
% d->second) ;
|
||||
}
|
||||
cout << "]" << endl;
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -217,17 +217,42 @@ namespace sapt
|
||||
write_yawat_alignment
|
||||
( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const;
|
||||
|
||||
std::string docname(id_type const sid) const;
|
||||
|
||||
std::string sid2docname(id_type const sid) const;
|
||||
std::string docid2name(id_type const sid) const;
|
||||
int docname2docid(std::string const& name) const;
|
||||
|
||||
std::vector<id_type> const* sid2did() const;
|
||||
int sid2did(uint32_t sid) const;
|
||||
};
|
||||
|
||||
#include "ug_bitext_agenda.h"
|
||||
|
||||
template<typename Token>
|
||||
int
|
||||
Bitext<Token>::
|
||||
docname2docid(std::string const& name) const
|
||||
{
|
||||
std::map<std::string,id_type>::const_iterator m;
|
||||
m = m_docname2docid.find(name);
|
||||
if (m != m_docname2docid.end()) return m->second;
|
||||
return -1;
|
||||
}
|
||||
|
||||
template<typename Token>
|
||||
std::string
|
||||
Bitext<Token>::
|
||||
docname(id_type const sid) const
|
||||
docid2name(id_type const did) const
|
||||
{
|
||||
if (did < m_docname.size())
|
||||
return m_docname[did];
|
||||
else
|
||||
return (boost::format("%d") % did).str();
|
||||
}
|
||||
|
||||
template<typename Token>
|
||||
std::string
|
||||
Bitext<Token>::
|
||||
sid2docname(id_type const sid) const
|
||||
{
|
||||
if (sid < m_sid2docid->size() && (*m_sid2docid)[sid] < m_docname.size())
|
||||
return m_docname[(*m_sid2docid)[sid]];
|
||||
@ -243,6 +268,17 @@ namespace sapt
|
||||
return m_sid2docid.get();
|
||||
}
|
||||
|
||||
template<typename Token>
|
||||
int
|
||||
Bitext<Token>::
|
||||
sid2did(uint32_t sid) const
|
||||
{
|
||||
if (m_sid2docid)
|
||||
return m_sid2docid->at(sid);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
template<typename Token>
|
||||
SPTR<SentenceBias>
|
||||
Bitext<Token>::
|
||||
|
@ -275,7 +275,7 @@ consider_sample(TokenPosition const& p)
|
||||
bitvector full_aln(100*100);
|
||||
PhraseExtractionRecord
|
||||
rec(p.sid, p.offset, p.offset + m_plen, !m_fwd, &aln, &full_aln);
|
||||
int docid = m_bias ? m_bias->GetClass(p.sid) : -1;
|
||||
int docid = m_bias ? m_bias->GetClass(p.sid) : m_bitext->sid2did(p.sid);
|
||||
if (!m_bitext->find_trg_phr_bounds(rec))
|
||||
{ // no good, probably because phrase is not coherent
|
||||
m_stats->count_sample(docid, 0, rec.po_fwd, rec.po_bwd);
|
||||
|
Loading…
Reference in New Issue
Block a user