2010-01-30 13:11:35 +03:00
|
|
|
/*
|
|
|
|
* File: lexdecom.cpp
|
|
|
|
* Author: Felipe Sánchez-Martínez, Universitat d'Alacant <fsanchez@dlsi.ua.es>
|
|
|
|
*
|
|
|
|
* Created on 2010/01/27
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "lexdecom.h"
|
|
|
|
#include "scorer-impl.h"
|
|
|
|
|
|
|
|
#include <iostream>
|
|
|
|
#include <fstream>
|
|
|
|
|
|
|
|
PhraseScorer*
|
2011-02-24 16:57:11 +03:00
|
|
|
LexicalDecompositionPhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf)
|
|
|
|
{
|
2010-01-30 13:11:35 +03:00
|
|
|
|
|
|
|
if(argv[argp] == NULL)
|
|
|
|
usage();
|
|
|
|
|
|
|
|
String lwfile(argv[argp++]);
|
|
|
|
|
|
|
|
return new LexicalDecompositionPhraseScorer(ptf.get_phrase_table(), reverse, lwfile, argv, argp, ptf);
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
LexicalDecompositionPhraseScorer::LexicalDecompositionPhraseScorer(PhraseTable &pd, bool reverse,
|
|
|
|
const String &weightfile, const char *argv[], int &argp,
|
|
|
|
const PhraseScorerFactory &ptf) :
|
|
|
|
PhraseScorer(pd, reverse)
|
|
|
|
{
|
2010-01-30 13:11:35 +03:00
|
|
|
|
|
|
|
black_box_scorer = AbsoluteDiscountPhraseScorer::create_scorer(argv, argp, reverse, ptf);
|
|
|
|
|
|
|
|
std::ifstream wfile(weightfile.c_str());
|
|
|
|
|
|
|
|
//Code copied from LexicalWeightPhraseScorer; it should be factored !! -- 2010/01/27
|
|
|
|
|
|
|
|
std::cerr<<"Reading lexical weights from '"<<weightfile<<"' ... ";
|
|
|
|
|
|
|
|
while(!wfile.eof()) {
|
|
|
|
if(wfile.fail()) {
|
|
|
|
std::cerr << "Problem reading file: " << weightfile << std::endl;
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
String src, tgt;
|
|
|
|
Score weight;
|
|
|
|
|
|
|
|
wfile >> src >> tgt >> weight;
|
|
|
|
Count src_id = PhraseText::index_word(src);
|
|
|
|
Count tgt_id = PhraseText::index_word(tgt);
|
|
|
|
weight_map_.insert(std::make_pair(std::make_pair(src_id, tgt_id), weight));
|
|
|
|
}
|
|
|
|
|
|
|
|
wfile.close();
|
|
|
|
std::cerr<<"done."<<std::endl;
|
|
|
|
}
|
|
|
|
|
|
|
|
Score
|
2011-02-24 16:57:11 +03:00
|
|
|
LexicalDecompositionPhraseScorer::get_weight(const String &s_src, const String &s_tgt) const
|
|
|
|
{
|
2010-01-30 13:11:35 +03:00
|
|
|
//Code copied from LexicalWeightPhraseScorer; it should be factored !! -- 2010/01/27
|
|
|
|
|
|
|
|
Count src = PhraseText::index_word(s_src);
|
|
|
|
Count tgt = PhraseText::index_word(s_tgt);
|
|
|
|
return get_weight(src, tgt);
|
|
|
|
}
|
|
|
|
|
|
|
|
inline Score
|
2011-02-24 16:57:11 +03:00
|
|
|
LexicalDecompositionPhraseScorer::get_weight(Count src, Count tgt) const
|
|
|
|
{
|
2010-01-30 13:11:35 +03:00
|
|
|
//Code copied from LexicalWeightPhraseScorer; it should be factored !! -- 2010/01/27
|
|
|
|
|
|
|
|
WeightMapType_::const_iterator it = weight_map_.find(std::make_pair(src, tgt));
|
|
|
|
if(it == weight_map_.end())
|
|
|
|
return 0.00001; // default value copied from Philipp Koehn's scorer
|
|
|
|
return it->second;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2011-02-24 16:57:11 +03:00
|
|
|
LexicalDecompositionPhraseScorer::do_score_phrases()
|
|
|
|
{
|
2010-01-30 13:11:35 +03:00
|
|
|
|
|
|
|
//Estimate p(J|I) = p(src_len|tgt_len)
|
|
|
|
|
|
|
|
black_box_scorer->score_phrases();
|
|
|
|
|
|
|
|
std::cerr<<"LexicalDecompositionPhraseScorer::do_score_phrases"<<std::endl;
|
|
|
|
|
|
|
|
std::map<unsigned, std::map<unsigned, Count> > count_srclen_tgtlen;
|
|
|
|
std::map<unsigned, Count> total_tgtlen;
|
|
|
|
|
|
|
|
for(PhraseTable::iterator it = phrase_table_.begin(); it != phrase_table_.end(); it++) {
|
|
|
|
const PhrasePairInfo &ppair = *it;
|
|
|
|
PhraseInfo &src = phrase_table_.get_src_phrase(ppair.get_src());
|
|
|
|
PhraseInfo &tgt = phrase_table_.get_tgt_phrase(ppair.get_tgt());
|
|
|
|
unsigned src_len = src.get_phrase().size();
|
|
|
|
unsigned tgt_len = tgt.get_phrase().size();
|
2011-02-24 16:57:11 +03:00
|
|
|
|
2010-01-30 13:11:35 +03:00
|
|
|
/*//Debug code
|
|
|
|
for (unsigned i=0; i<src_len; i++)
|
|
|
|
std::cerr<<src.get_phrase().word(i)<<" ";
|
|
|
|
std::cerr<<"-> "<<src_len<<" ||| ";
|
|
|
|
for (unsigned i=0; i<tgt_len; i++)
|
|
|
|
std::cerr<<tgt.get_phrase().word(i)<<" ";
|
|
|
|
std::cerr<<"-> "<<tgt_len<<std::endl;
|
|
|
|
*/
|
|
|
|
|
|
|
|
count_srclen_tgtlen[src_len][tgt_len]+=ppair.get_count();
|
|
|
|
total_tgtlen[tgt_len]+=ppair.get_count();
|
|
|
|
}
|
|
|
|
|
|
|
|
std::map<unsigned, std::map<unsigned, Count> >::iterator its;
|
|
|
|
std::map<unsigned, Count>::iterator itt;
|
|
|
|
|
|
|
|
for (its=count_srclen_tgtlen.begin(); its!=count_srclen_tgtlen.end(); its++) {
|
|
|
|
unsigned src_len=its->first;
|
|
|
|
|
|
|
|
for(itt=its->second.begin(); itt!=its->second.end(); itt++) {
|
|
|
|
unsigned tgt_len=itt->first;
|
|
|
|
Count cnt=itt->second;
|
|
|
|
prob_srclen_tgtlen_[src_len][tgt_len] = static_cast<Score>(cnt)/static_cast<Score>(total_tgtlen[tgt_len]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
Score
|
|
|
|
LexicalDecompositionPhraseScorer::get_noisy_or_combination(Count src_word, PhraseInfo &tgt_phrase)
|
|
|
|
{
|
2010-01-30 13:11:35 +03:00
|
|
|
|
|
|
|
Score sc=1.0;
|
|
|
|
|
|
|
|
unsigned tgt_len=tgt_phrase.get_phrase().size();
|
|
|
|
|
|
|
|
for(unsigned i=0; i<tgt_len; i++) {
|
|
|
|
Count tgt_word=tgt_phrase.get_phrase()[i];
|
|
|
|
sc *= (1.0 - get_weight(src_word, tgt_word));
|
|
|
|
}
|
|
|
|
|
|
|
|
return (1.0 - sc);
|
|
|
|
}
|
|
|
|
|
|
|
|
Score
|
2011-02-24 16:57:11 +03:00
|
|
|
LexicalDecompositionPhraseScorer::do_get_score(const PhraseTable::const_iterator &it)
|
|
|
|
{
|
2010-01-30 13:11:35 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
The implementation of this method relies on the asumption that the
|
|
|
|
smoothed probabilities produced by AbsoluteDiscountPhraseScorer
|
|
|
|
are deficient.
|
|
|
|
*/
|
|
|
|
|
|
|
|
PhraseInfo &src_phrase = phrase_table_.get_src_phrase(it->get_src());
|
|
|
|
PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
|
|
|
|
|
|
|
|
unsigned src_len=src_phrase.get_phrase().size();
|
|
|
|
unsigned tgt_len=tgt_phrase.get_phrase().size();
|
|
|
|
|
|
|
|
Score prod=1.0;
|
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
for(unsigned j=0; j<src_len; j++)
|
2010-01-30 13:11:35 +03:00
|
|
|
prod *= get_noisy_or_combination(src_phrase.get_phrase()[j], tgt_phrase);
|
|
|
|
|
|
|
|
Score lambda= static_cast<Score>(black_box_scorer->get_discount()) *
|
2011-02-24 16:57:11 +03:00
|
|
|
tgt_phrase.get_distinct() / tgt_phrase.get_count();
|
2010-01-30 13:11:35 +03:00
|
|
|
|
|
|
|
Score ret_value = black_box_scorer->get_score(it) + (lambda * prod * prob_srclen_tgtlen_[src_len][tgt_len]);
|
|
|
|
|
|
|
|
/*
|
|
|
|
//Debug code
|
|
|
|
for (unsigned i=0; i<src_len; i++)
|
|
|
|
std::cerr<<src_phrase.get_phrase().word(i)<<" ";
|
|
|
|
std::cerr<<" ||| ";
|
|
|
|
for (unsigned i=0; i<tgt_len; i++)
|
|
|
|
std::cerr<<tgt_phrase.get_phrase().word(i)<<" ";
|
|
|
|
std::cerr<<"==> discount: "<<black_box_scorer->get_discount()<<"; black box score: "<<black_box_scorer->get_score(it)
|
|
|
|
<<"; lambda: "<<lambda<<"; prod: "
|
|
|
|
<<prod<<"; prob_srclen_tgtlen_["<<src_len<<"]["<<tgt_len<<"]: "<<prob_srclen_tgtlen_[src_len][tgt_len]
|
|
|
|
<<"; count: "<<it->get_count()<<"; score: "<<ret_value;
|
|
|
|
std::cerr<<std::endl;
|
2011-02-24 16:57:11 +03:00
|
|
|
*/
|
2010-01-30 13:11:35 +03:00
|
|
|
|
|
|
|
return ret_value;
|
|
|
|
}
|