Kneser-Ney and modified Kneser-Ney smoothing implementation.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2837 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
pasmargo 2010-02-02 18:14:01 +00:00
parent 63bdf3b602
commit 275c06d9e7
3 changed files with 198 additions and 16 deletions

View File

@ -106,8 +106,12 @@ protected:
PhraseText phrase_;
Score *data_;
Count n1_;
Count n2_;
Count n3plus_;
// Constructs the bookkeeping record for one phrase: stores the phrase text,
// zeroes the counters and allocates data_size_ Score slots from the shared
// DataStorage<Score> instance.
// NOTE(review): two initializer-list lines follow because this is a diff view
// (the hunk header above reads "-106,8 +106,12"); presumably the first is the
// removed line and the second, which also zeroes n1_/n2_/n3plus_, is the one
// that is actually in the file -- confirm against the real header.
PhraseInfo(Count data_size, const String &phrase) :
data_size_(data_size), count_(0), distinct_(0), phrase_(phrase) {
data_size_(data_size), count_(0), distinct_(0), phrase_(phrase), n1_(0), n2_(0), n3plus_(0){
data_ = DataStorage<Score>::get_instance().alloc(data_size_);
}
@ -134,13 +138,38 @@ public:
return distinct_;
}
// Adds one to distinct_, the counter returned by the getter just above.
// NOTE(review): the signature appears twice because the diff view shows both
// the removed and the added spelling of the same line; the change is
// whitespace-only.
void inc_distinct() {
void inc_distinct(){
distinct_++;
}
// Read-only access to the stored phrase text.
const PhraseText &get_phrase() const {
	return this->phrase_;
}
// Bumps the n1_ counter by one.
void inc_n1() {
	++n1_;
}
// Returns the current value of the n1_ counter.
// Declared const (like get_phrase()) so it can be called through a
// const PhraseInfo reference.
Count get_n1() const {
	return n1_;
}
// Bumps the n2_ counter by one.
void inc_n2() {
	++n2_;
}
// Returns the current value of the n2_ counter.
// Declared const (like get_phrase()) so it can be called through a
// const PhraseInfo reference.
Count get_n2() const {
	return n2_;
}
// Bumps the n3plus_ counter by one.
void inc_n3plus() {
	++n3plus_;
}
// Returns the current value of the n3plus_ counter.
// Declared const (like get_phrase()) so it can be called through a
// const PhraseInfo reference.
Count get_n3plus() const {
	return n3plus_;
}
};
inline std::ostream &operator<<(std::ostream &os, const PhraseInfo &pt) {
@ -161,7 +190,6 @@ protected:
boost::object_pool<PhraseInfo> phrase_info_pool_;
Count data_size_;
public:
typedef ListType_::iterator iterator;
typedef ListType_::const_iterator const_iterator;
@ -197,6 +225,7 @@ public:
// Number of entries currently held in list_.
size_type size() const {
	const size_type entries = list_.size();
	return entries;
}
};
class PhraseAlignment {

View File

@ -46,6 +46,42 @@ public:
static PhraseScorer *create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf);
};
// Phrase scorer implementing Kneser-Ney smoothing with a single discount.
// Instances are created through create_scorer(); the constructor is private.
class KNDiscount1PhraseScorer : public PhraseScorer {
private:
	// NOTE(review): total_distinct_ is not referenced by the member functions
	// visible in this change -- confirm whether it is still needed.
	Count total_distinct_;
	// Discount subtracted from every phrase-pair count; set by do_score_phrases().
	Score discount_;
	// Normaliser of the lower-order distribution; set by do_score_phrases().
	Count total_count_;

	// All statistics start at zero so that the object never holds
	// indeterminate values before do_score_phrases() has run.
	explicit KNDiscount1PhraseScorer(PhraseTable &pd, bool reverse) :
		PhraseScorer(pd, reverse), total_distinct_(0), discount_(0), total_count_(0) {}

	virtual void do_score_phrases();
	virtual Score do_get_score(const PhraseTable::const_iterator &it);

public:
	static PhraseScorer *create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf);
};
// Phrase scorer implementing modified Kneser-Ney smoothing, which uses
// separate discounts for phrase pairs seen once, twice, and three or more
// times. Instances are created through create_scorer(); the constructor is
// private.
class KNDiscount3PhraseScorer : public PhraseScorer {
private:
	// Discounts for counts 1, 2 and >= 3; set by do_score_phrases().
	Score discount1_;
	Score discount2_;
	Score discount3plus_;

	// Table-wide totals used to normalise the lower-order distribution;
	// set by do_score_phrases().
	Count total_distinct_n1_;
	Count total_distinct_n2_;
	Count total_distinct_n3plus_;

	// All statistics start at zero so that the object never holds
	// indeterminate values before do_score_phrases() has run.
	explicit KNDiscount3PhraseScorer(PhraseTable &pd, bool reverse) :
		PhraseScorer(pd, reverse),
		discount1_(0), discount2_(0), discount3plus_(0),
		total_distinct_n1_(0), total_distinct_n2_(0), total_distinct_n3plus_(0) {}

	virtual void do_score_phrases();
	virtual Score do_get_score(const PhraseTable::const_iterator &it);

public:
	static PhraseScorer *create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf);
};
class LexicalWeightPhraseScorer : public PhraseScorer {
private:
typedef std::map<std::pair<Count,Count>,Score> WeightMapType_;

View File

@ -20,6 +20,8 @@ const std::vector<String> &PhraseScorerFactory::scorer_list() {
list.push_back("ml - maximum likelihood score (relative frequency)");
list.push_back("wittenbell - Witten-Bell smoothing");
list.push_back("absdiscount - absolute discounting");
list.push_back("kndiscount1 - Knesser-Ney discounting");
list.push_back("kndiscount3 - modified Knesser-Ney discounting");
list.push_back("lexweights <weightfile> - lexical weights (Koehn et al., NAACL 2003)");
#ifdef ENABLE_CHANNEL_SCORER
list.push_back("channel <sigma> <srclm> <tgtlm> - channel adaptation");
@ -42,6 +44,10 @@ PhraseScorer *PhraseScorerFactory::create_scorer(const char *argv[], int &argp,
return WittenBellPhraseScorer::create_scorer(argv, argp, reverse, *this);
else if(!strcmp(arg, "absdiscount"))
return AbsoluteDiscountPhraseScorer::create_scorer(argv, argp, reverse, *this);
else if(!strcmp(arg, "kndiscount1"))
return KNDiscount1PhraseScorer::create_scorer(argv, argp, reverse, *this);
else if(!strcmp(arg, "kndiscount3"))
return KNDiscount3PhraseScorer::create_scorer(argv, argp, reverse, *this);
else if(!strcmp(arg, "lexweights"))
return LexicalWeightPhraseScorer::create_scorer(argv, argp, reverse, *this);
#ifdef ENABLE_CHANNEL_SCORER
@ -94,9 +100,11 @@ Score WittenBellPhraseScorer::do_get_score(const PhraseTable::const_iterator &it
return static_cast<Score>(it->get_count()) / (tgt_phrase.get_count() + tgt_phrase.get_distinct());
}
// Factory entry point: builds an absolute-discounting scorer on the factory's
// phrase table. This scorer takes no extra command-line arguments, so argv
// and argp are left untouched.
PhraseScorer *AbsoluteDiscountPhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf) {
	PhraseScorer *scorer = new AbsoluteDiscountPhraseScorer(ptf.get_phrase_table(), reverse);
	return scorer;
}
// p(s|t) = (c(s,t) - beta) / c(t) <-- absolute discounting
void AbsoluteDiscountPhraseScorer::do_score_phrases() {
Count n1 = 0, n2 = 0;
@ -121,16 +129,125 @@ inline Score AbsoluteDiscountPhraseScorer::get_discount() {
}
// Absolute-discounting score of one phrase pair:
//   (c(s,t) - discount) / c(t)
Score AbsoluteDiscountPhraseScorer::do_get_score(const PhraseTable::const_iterator &it) {
	/*
	 * Do not renormalise here: LexicalDecompositionPhraseScorer relies on the
	 * assumption that the smoothed probabilities produced by this method are
	 * deficient.
	 */
	PhraseInfo &target = phrase_table_.get_tgt_phrase(it->get_tgt());
	const Score discounted_count = it->get_count() - discount_;
	return discounted_count / target.get_count();
}
// Factory entry point: builds a single-discount Kneser-Ney scorer on the
// factory's phrase table. This scorer takes no extra command-line arguments,
// so argv and argp are left untouched.
PhraseScorer *KNDiscount1PhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf) {
	PhraseScorer *scorer = new KNDiscount1PhraseScorer(ptf.get_phrase_table(), reverse);
	return scorer;
}
void KNDiscount1PhraseScorer::do_score_phrases() {
Count n1 = 0, n2 = 0;
Count total_count = 0;
for(PhraseTable::iterator it = phrase_table_.begin(); it != phrase_table_.end(); ++it) {
PhrasePairInfo ppinfo = *it;
PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
total_count += tgt_phrase.get_count();
Count c = ppinfo.get_count();
switch(c) {
case 1:
n1++;
break;
case 2:
n2++;
}
}
discount_ = static_cast<Score>(n1) / (n1 + 2*n2);
total_count_ = static_cast<Count>(total_count);
}
// Kneser-Ney score of one phrase pair with a single discount D:
//   (c(s,t) - D) / c(t)  +  (D * distinct(t) / c(t)) * (c(s) / total_count_)
// The first term is the discounted relative frequency, the second
// redistributes the discounted mass according to the lower-order distribution.
Score KNDiscount1PhraseScorer::do_get_score(const PhraseTable::const_iterator &it) {
	PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
	PhraseInfo &src_phrase = phrase_table_.get_src_phrase(it->get_src());

	const Score tgt_count = static_cast<Score>(tgt_phrase.get_count());
	const Score discounted = (it->get_count() - discount_) / tgt_count;
	const Score backoff_weight = discount_ * tgt_phrase.get_distinct() / tgt_count;

	// Count appears to be an integral type (other scorers cast to Score before
	// dividing), so dividing two Counts directly would truncate this
	// probability to 0. Divide in Score precision, and treat an empty total as
	// "no lower-order mass" rather than dividing by zero.
	const Score lower_order = (total_count_ != 0)
		? static_cast<Score>(src_phrase.get_count()) / static_cast<Score>(total_count_)
		: static_cast<Score>(0);

	return discounted + backoff_weight * lower_order;
}
// Factory entry point: builds a modified Kneser-Ney scorer on the factory's
// phrase table. This scorer takes no extra command-line arguments, so argv
// and argp are left untouched.
PhraseScorer *KNDiscount3PhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf) {
	PhraseScorer *scorer = new KNDiscount3PhraseScorer(ptf.get_phrase_table(), reverse);
	return scorer;
}
void KNDiscount3PhraseScorer::do_score_phrases() {
Count n1 = 0, n2 = 0, n3 = 0, n4 = 0;
Count total_count = 0; //total number of source or target phrases (including repetitions)
Count total_distinct_n1 = 0; //sum_{s} n1plus(s,*)
Count total_distinct_n2 = 0;
Count total_distinct_n3plus = 0;
Score y;
for(PhraseTable::iterator it = phrase_table_.begin(); it != phrase_table_.end(); ++it) {
PhrasePairInfo ppinfo = *it;
PhraseInfo &src_phrase = phrase_table_.get_src_phrase(it->get_src());
PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
total_count += src_phrase.get_count();
Count c = ppinfo.get_count();
switch(c) {
case 1:
n1++;
tgt_phrase.inc_n1();
src_phrase.inc_n1();
total_distinct_n1++;
break;
case 2:
n2++;
tgt_phrase.inc_n2();
src_phrase.inc_n2();
total_distinct_n2++;
break;
case 3:
n3++;
tgt_phrase.inc_n3plus();
src_phrase.inc_n3plus();
total_distinct_n3plus++;
break;
case 4:
n4++;
tgt_phrase.inc_n3plus();
src_phrase.inc_n3plus();
total_distinct_n3plus++;
}
}
y = (Score)(n1) / (n1 + 2*n2);
discount1_ = static_cast<Score> (1) - (2)*(y)*(n2 / n1);
discount2_ = static_cast<Score> (2) - (3)*(y)*(n3 / n2);
discount3plus_ = static_cast<Score> (3) - (4)*(y)*(n4 / n3);
total_distinct_n1_ = static_cast<Count>(total_distinct_n1);
total_distinct_n2_ = static_cast<Count>(total_distinct_n2);
total_distinct_n3plus_ = static_cast<Count>(total_distinct_n3plus);
}
// Modified Kneser-Ney score of one phrase pair:
//   (c(s,t) - D_k) / c(t)  +  norm(t) * (n_k(s) / N_k)
// where k is the count class of the pair (1, 2, or 3+), D_k the matching
// discount, n_k(s) the source phrase's counter for that class, N_k the
// table-wide total for that class, and
//   norm(t) = (D1*n1(t) + D2*n2(t) + D3+*n3plus(t)) / c(t).
Score KNDiscount3PhraseScorer::do_get_score(const PhraseTable::const_iterator &it) {
	PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
	PhraseInfo &src_phrase = phrase_table_.get_src_phrase(it->get_src());

	const Score tgt_count = static_cast<Score>(tgt_phrase.get_count());
	const Score norm = (discount1_ * tgt_phrase.get_n1()
		+ discount2_ * tgt_phrase.get_n2()
		+ discount3plus_ * tgt_phrase.get_n3plus()) / tgt_count;

	// Select the discount and the lower-order statistics for this count class.
	const Count c = it->get_count();
	Score discount;
	Count src_class_count;
	Count total_class_count;
	switch(c) {
	case 1:
		discount = discount1_;
		src_class_count = src_phrase.get_n1();
		total_class_count = total_distinct_n1_;
		break;
	case 2:
		discount = discount2_;
		src_class_count = src_phrase.get_n2();
		total_class_count = total_distinct_n2_;
		break;
	default:
		discount = discount3plus_;
		src_class_count = src_phrase.get_n3plus();
		total_class_count = total_distinct_n3plus_;
		break;
	}

	// Count appears to be an integral type, so dividing two Counts directly
	// would truncate this probability to 0 and would crash on an empty class.
	// Divide in Score precision and treat an empty class as "no lower-order
	// mass".
	const Score lower_order = (total_class_count != 0)
		? static_cast<Score>(src_class_count) / static_cast<Score>(total_class_count)
		: static_cast<Score>(0);

	return (c - discount) / tgt_count + norm * lower_order;
}
PhraseScorer *LexicalWeightPhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf) {
bool overall_max = true;