mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-28 22:45:50 +03:00
Kneser-Ney and modified Kneser-Ney smoothing implementation.
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2837 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
63bdf3b602
commit
275c06d9e7
@ -105,9 +105,13 @@ protected:
|
||||
Count distinct_;
|
||||
PhraseText phrase_;
|
||||
Score *data_;
|
||||
|
||||
Count n1_;
|
||||
Count n2_;
|
||||
Count n3plus_;
|
||||
|
||||
// Creates the record for one phrase. Every counter (count_, distinct_ and the
// n1_/n2_/n3plus_ tallies) starts at zero; the data_ buffer is requested from
// DataStorage<Score>::get_instance() with data_size_ as the size argument.
PhraseInfo(Count data_size, const String &phrase) :
    data_size_(data_size),
    count_(0),
    distinct_(0),
    phrase_(phrase),
    n1_(0),
    n2_(0),
    n3plus_(0)
{
    // Done in the body, after the initializer list has set data_size_.
    data_ = DataStorage<Score>::get_instance().alloc(data_size_);
}
|
||||
|
||||
@ -129,18 +133,43 @@ public:
|
||||
// Adds one to the count_ member of this phrase.
void inc_count() {
    ++count_;
}
|
||||
|
||||
|
||||
// Returns the current value of the distinct_ counter.
Count get_distinct() const {
|
||||
return distinct_;
|
||||
}
|
||||
|
||||
void inc_distinct() {
|
||||
void inc_distinct(){
|
||||
distinct_++;
|
||||
}
|
||||
|
||||
// Returns a read-only reference to the stored phrase_ member.
const PhraseText &get_phrase() const {
|
||||
return phrase_;
|
||||
}
|
||||
|
||||
// Adds one to n1_. KNDiscount3PhraseScorer::do_score_phrases() calls this for
// each phrase pair with count 1 that contains this phrase.
void inc_n1() {
    ++n1_;
}
|
||||
|
||||
// Returns n1_, the tally maintained by inc_n1().
// Declared const, like get_distinct(), so it can be called on a const PhraseInfo.
Count get_n1() const {
    return n1_;
}
|
||||
|
||||
// Adds one to n2_. KNDiscount3PhraseScorer::do_score_phrases() calls this for
// each phrase pair with count 2 that contains this phrase.
void inc_n2() {
    ++n2_;
}
|
||||
|
||||
// Returns n2_, the tally maintained by inc_n2().
// Declared const, like get_distinct(), so it can be called on a const PhraseInfo.
Count get_n2() const {
    return n2_;
}
|
||||
|
||||
// Adds one to n3plus_, the "three or more" tally that
// KNDiscount3PhraseScorer::do_score_phrases() maintains for this phrase.
void inc_n3plus() {
    ++n3plus_;
}
|
||||
|
||||
// Returns n3plus_, the tally maintained by inc_n3plus().
// Declared const, like get_distinct(), so it can be called on a const PhraseInfo.
Count get_n3plus() const {
    return n3plus_;
}
|
||||
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const PhraseInfo &pt) {
|
||||
@ -161,7 +190,6 @@ protected:
|
||||
boost::object_pool<PhraseInfo> phrase_info_pool_;
|
||||
|
||||
Count data_size_;
|
||||
|
||||
public:
|
||||
typedef ListType_::iterator iterator;
|
||||
typedef ListType_::const_iterator const_iterator;
|
||||
@ -197,6 +225,7 @@ public:
|
||||
// Reports the element count of the underlying list_ container.
size_type size() const {
|
||||
return list_.size();
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
class PhraseAlignment {
|
||||
|
@ -42,10 +42,46 @@ private:
|
||||
virtual Score do_get_score(const PhraseTable::const_iterator &it);
|
||||
|
||||
public:
|
||||
Score get_discount();
|
||||
Score get_discount();
|
||||
static PhraseScorer *create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf);
|
||||
};
|
||||
|
||||
class KNDiscount1PhraseScorer : public PhraseScorer {
|
||||
private:
|
||||
Count total_distinct_;
|
||||
Score discount_;
|
||||
Count total_count_;
|
||||
|
||||
explicit KNDiscount1PhraseScorer(PhraseTable &pd, bool reverse) :
|
||||
PhraseScorer(pd, reverse) {}
|
||||
|
||||
virtual void do_score_phrases();
|
||||
virtual Score do_get_score(const PhraseTable::const_iterator &it);
|
||||
|
||||
public:
|
||||
static PhraseScorer *create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf);
|
||||
};
|
||||
|
||||
class KNDiscount3PhraseScorer : public PhraseScorer {
|
||||
private:
|
||||
Score discount1_;
|
||||
Score discount2_;
|
||||
Score discount3plus_;
|
||||
|
||||
Count total_distinct_n1_;
|
||||
Count total_distinct_n2_;
|
||||
Count total_distinct_n3plus_;
|
||||
|
||||
explicit KNDiscount3PhraseScorer(PhraseTable &pd, bool reverse) :
|
||||
PhraseScorer(pd, reverse) {}
|
||||
|
||||
virtual void do_score_phrases();
|
||||
virtual Score do_get_score(const PhraseTable::const_iterator &it);
|
||||
|
||||
public:
|
||||
static PhraseScorer *create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf);
|
||||
};
|
||||
|
||||
class LexicalWeightPhraseScorer : public PhraseScorer {
|
||||
private:
|
||||
typedef std::map<std::pair<Count,Count>,Score> WeightMapType_;
|
||||
|
@ -20,12 +20,14 @@ const std::vector<String> &PhraseScorerFactory::scorer_list() {
|
||||
list.push_back("ml - maximum likelihood score (relative frequency)");
|
||||
list.push_back("wittenbell - Witten-Bell smoothing");
|
||||
list.push_back("absdiscount - absolute discounting");
|
||||
list.push_back("kndiscount1 - Knesser-Ney discounting");
|
||||
list.push_back("kndiscount3 - modified Knesser-Ney discounting");
|
||||
list.push_back("lexweights <weightfile> - lexical weights (Koehn et al., NAACL 2003)");
|
||||
#ifdef ENABLE_CHANNEL_SCORER
|
||||
list.push_back("channel <sigma> <srclm> <tgtlm> - channel adaptation");
|
||||
#endif
|
||||
list.push_back("const <constant> - constant phrase penalty");
|
||||
list.push_back("lexdecomp <weightfile> - lexical decomposition smoothing");
|
||||
list.push_back("lexdecomp <weightfile> - lexical decomposition smoothing");
|
||||
}
|
||||
|
||||
return list;
|
||||
@ -42,6 +44,10 @@ PhraseScorer *PhraseScorerFactory::create_scorer(const char *argv[], int &argp,
|
||||
return WittenBellPhraseScorer::create_scorer(argv, argp, reverse, *this);
|
||||
else if(!strcmp(arg, "absdiscount"))
|
||||
return AbsoluteDiscountPhraseScorer::create_scorer(argv, argp, reverse, *this);
|
||||
else if(!strcmp(arg, "kndiscount1"))
|
||||
return KNDiscount1PhraseScorer::create_scorer(argv, argp, reverse, *this);
|
||||
else if(!strcmp(arg, "kndiscount3"))
|
||||
return KNDiscount3PhraseScorer::create_scorer(argv, argp, reverse, *this);
|
||||
else if(!strcmp(arg, "lexweights"))
|
||||
return LexicalWeightPhraseScorer::create_scorer(argv, argp, reverse, *this);
|
||||
#ifdef ENABLE_CHANNEL_SCORER
|
||||
@ -50,8 +56,8 @@ PhraseScorer *PhraseScorerFactory::create_scorer(const char *argv[], int &argp,
|
||||
#endif
|
||||
else if(!strcmp(arg, "const"))
|
||||
return ConstantPhraseScorer::create_scorer(argv, argp, reverse, *this);
|
||||
else if (!strcmp(arg, "lexdecomp"))
|
||||
return LexicalDecompositionPhraseScorer::create_scorer(argv, argp, reverse, *this);
|
||||
else if (!strcmp(arg, "lexdecomp"))
|
||||
return LexicalDecompositionPhraseScorer::create_scorer(argv, argp, reverse, *this);
|
||||
else {
|
||||
std::cerr << "Unknown phrase scorer type: " << arg << std::endl << std::endl;
|
||||
usage();
|
||||
@ -94,9 +100,11 @@ Score WittenBellPhraseScorer::do_get_score(const PhraseTable::const_iterator &it
|
||||
return static_cast<Score>(it->get_count()) / (tgt_phrase.get_count() + tgt_phrase.get_distinct());
|
||||
}
|
||||
|
||||
|
||||
// Factory hook for the "absdiscount" scorer. argv and argp belong to the
// common create_scorer signature; this scorer does not read them.
// Returns a scorer allocated with new, bound to the factory's phrase table.
PhraseScorer *AbsoluteDiscountPhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf) {
    PhraseScorer *const scorer = new AbsoluteDiscountPhraseScorer(ptf.get_phrase_table(), reverse);
    return scorer;
}
|
||||
// p(s|t) = (c(s,t) - beta) / c(t) <-- absolute discounting
|
||||
|
||||
void AbsoluteDiscountPhraseScorer::do_score_phrases() {
|
||||
Count n1 = 0, n2 = 0;
|
||||
@ -113,24 +121,133 @@ void AbsoluteDiscountPhraseScorer::do_score_phrases() {
|
||||
}
|
||||
}
|
||||
|
||||
discount_ = static_cast<Score>(n1) / (n1 + 2*n2);
|
||||
discount_ = static_cast<Score>(n1) / (n1 + 2*n2);
|
||||
}
|
||||
|
||||
// Returns the discount value that do_score_phrases() stored in discount_.
// Deliberately not 'inline': the class declares this as a public member, and
// an inline function defined only in this implementation file could not be
// linked from any other translation unit that calls it.
Score AbsoluteDiscountPhraseScorer::get_discount() {
    return discount_;
}
|
||||
|
||||
// p(s|t) = (c(s,t) - beta) / c(t), where beta is discount_ and c(t) is the
// count of the target phrase of the pair that 'it' points at.
Score AbsoluteDiscountPhraseScorer::do_get_score(const PhraseTable::const_iterator &it) {
    /*
     The implementation of LexicalDecompositionPhraseScorer relies
     on the assumption that the smoothed probabilities produced by
     this method are deficient.
    */
    PhraseInfo &target = phrase_table_.get_tgt_phrase(it->get_tgt());
    const Score discounted = it->get_count() - discount_;
    return discounted / target.get_count();
}
|
||||
|
||||
// Factory hook for the "kndiscount1" scorer. argv and argp belong to the
// common create_scorer signature; this scorer does not read them.
// Returns a scorer allocated with new, bound to the factory's phrase table.
PhraseScorer *KNDiscount1PhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf) {
    PhraseScorer *const scorer = new KNDiscount1PhraseScorer(ptf.get_phrase_table(), reverse);
    return scorer;
}
|
||||
|
||||
|
||||
void KNDiscount1PhraseScorer::do_score_phrases() {
|
||||
Count n1 = 0, n2 = 0;
|
||||
Count total_count = 0;
|
||||
|
||||
for(PhraseTable::iterator it = phrase_table_.begin(); it != phrase_table_.end(); ++it) {
|
||||
PhrasePairInfo ppinfo = *it;
|
||||
PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
|
||||
total_count += tgt_phrase.get_count();
|
||||
|
||||
Count c = ppinfo.get_count();
|
||||
switch(c) {
|
||||
case 1:
|
||||
n1++;
|
||||
break;
|
||||
case 2:
|
||||
n2++;
|
||||
}
|
||||
}
|
||||
|
||||
discount_ = static_cast<Score>(n1) / (n1 + 2*n2);
|
||||
total_count_ = static_cast<Count>(total_count);
|
||||
|
||||
}
|
||||
|
||||
// p(s|t) = (c(s,t) - D) / c(t)  +  (D * distinct(t) / c(t)) * (c(s) / total)
// where D is discount_, distinct(t) is tgt_phrase.get_distinct() and total is
// total_count_, both statistics having been set by do_score_phrases().
Score KNDiscount1PhraseScorer::do_get_score(const PhraseTable::const_iterator &it) {
    PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
    PhraseInfo &src_phrase = phrase_table_.get_src_phrase(it->get_src());

    const Score tgt_count = static_cast<Score>(tgt_phrase.get_count());

    // Discounted relative frequency of the pair.
    const Score discounted = (it->get_count() - discount_) / tgt_count;

    // Weight given to the second term.
    const Score backoff_weight = discount_ * tgt_phrase.get_distinct() / tgt_count;

    // Both operands are integer counts: convert before dividing, otherwise the
    // quotient is truncated (to 0 whenever c(s) < total) and the whole
    // second term vanishes.
    const Score lower_order = static_cast<Score>(src_phrase.get_count()) / total_count_;

    return discounted + backoff_weight * lower_order;
}
|
||||
|
||||
// Factory hook for the "kndiscount3" scorer. argv and argp belong to the
// common create_scorer signature; this scorer does not read them.
// Returns a scorer allocated with new, bound to the factory's phrase table.
PhraseScorer *KNDiscount3PhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf) {
    PhraseScorer *const scorer = new KNDiscount3PhraseScorer(ptf.get_phrase_table(), reverse);
    return scorer;
}
|
||||
|
||||
|
||||
void KNDiscount3PhraseScorer::do_score_phrases() {
|
||||
Count n1 = 0, n2 = 0, n3 = 0, n4 = 0;
|
||||
Count total_count = 0; //total number of source or target phrases (including repetitions)
|
||||
Count total_distinct_n1 = 0; //sum_{s} n1plus(s,*)
|
||||
Count total_distinct_n2 = 0;
|
||||
Count total_distinct_n3plus = 0;
|
||||
Score y;
|
||||
|
||||
for(PhraseTable::iterator it = phrase_table_.begin(); it != phrase_table_.end(); ++it) {
|
||||
PhrasePairInfo ppinfo = *it;
|
||||
PhraseInfo &src_phrase = phrase_table_.get_src_phrase(it->get_src());
|
||||
PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
|
||||
|
||||
total_count += src_phrase.get_count();
|
||||
|
||||
Count c = ppinfo.get_count();
|
||||
switch(c) {
|
||||
case 1:
|
||||
n1++;
|
||||
tgt_phrase.inc_n1();
|
||||
src_phrase.inc_n1();
|
||||
total_distinct_n1++;
|
||||
break;
|
||||
case 2:
|
||||
n2++;
|
||||
tgt_phrase.inc_n2();
|
||||
src_phrase.inc_n2();
|
||||
total_distinct_n2++;
|
||||
break;
|
||||
case 3:
|
||||
n3++;
|
||||
tgt_phrase.inc_n3plus();
|
||||
src_phrase.inc_n3plus();
|
||||
total_distinct_n3plus++;
|
||||
break;
|
||||
case 4:
|
||||
n4++;
|
||||
tgt_phrase.inc_n3plus();
|
||||
src_phrase.inc_n3plus();
|
||||
total_distinct_n3plus++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
y = (Score)(n1) / (n1 + 2*n2);
|
||||
discount1_ = static_cast<Score> (1) - (2)*(y)*(n2 / n1);
|
||||
discount2_ = static_cast<Score> (2) - (3)*(y)*(n3 / n2);
|
||||
discount3plus_ = static_cast<Score> (3) - (4)*(y)*(n4 / n3);
|
||||
total_distinct_n1_ = static_cast<Count>(total_distinct_n1);
|
||||
total_distinct_n2_ = static_cast<Count>(total_distinct_n2);
|
||||
total_distinct_n3plus_ = static_cast<Count>(total_distinct_n3plus);
|
||||
}
|
||||
|
||||
// p(s|t) = (c(s,t) - D_k) / c(t)  +  norm(t) * (n_k(s) / total_k)
// where k is the count class of the pair (1, 2, or everything else),
// D_k the matching discount, n_k(s) the source phrase's tally for that class,
// total_k the table-wide total for that class, and
//   norm(t) = (D1*n1(t) + D2*n2(t) + D3+*n3plus(t)) / c(t).
Score KNDiscount3PhraseScorer::do_get_score(const PhraseTable::const_iterator &it) {
    PhraseInfo &tgt_phrase = phrase_table_.get_tgt_phrase(it->get_tgt());
    PhraseInfo &src_phrase = phrase_table_.get_src_phrase(it->get_src());

    const Count c = it->get_count();
    const Score tgt_count = static_cast<Score>(tgt_phrase.get_count());

    const Score norm = (discount1_ * tgt_phrase.get_n1()
                        + discount2_ * tgt_phrase.get_n2()
                        + discount3plus_ * tgt_phrase.get_n3plus()) / tgt_count;

    // The tallies and totals are integer counts: convert before dividing,
    // otherwise the quotient is truncated and the second term is lost.
    Score discount;
    Score lower_order;
    switch(c) {
    case 1:
        discount = discount1_;
        lower_order = static_cast<Score>(src_phrase.get_n1()) / total_distinct_n1_;
        break;
    case 2:
        discount = discount2_;
        lower_order = static_cast<Score>(src_phrase.get_n2()) / total_distinct_n2_;
        break;
    default:
        discount = discount3plus_;
        lower_order = static_cast<Score>(src_phrase.get_n3plus()) / total_distinct_n3plus_;
        break;
    }

    return (c - discount) / tgt_count + norm * lower_order;
}
|
||||
|
||||
PhraseScorer *LexicalWeightPhraseScorer::create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf) {
|
||||
bool overall_max = true;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user