From 9e61900fad22c87341b4690c202c57f083eb1402 Mon Sep 17 00:00:00 2001 From: mfederico Date: Mon, 4 Aug 2008 13:06:52 +0000 Subject: [PATCH] Fixed bug concerning the handling of the oov penalty with IRSTLM Now, the penalty for out-of-vocabulary words is specified by the parameter -lmodel-dub: dictionary upper bounds of language models For instance, if you set lmodel-dub to 1000000 (1M) and your actual vocabulary is, let us say, 200000 (200K), then the LM probability of the OOV word-class is divided by 800000 (800K), i.e. 1M-200K. You have to make sure that lmodel-dub is always larger than the LM dictionary. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1870 1f5c12ca-751b-0410-a591-d2e778427230 --- moses/src/LanguageModelIRST.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/moses/src/LanguageModelIRST.cpp b/moses/src/LanguageModelIRST.cpp index 4631396da..d3e09705c 100644 --- a/moses/src/LanguageModelIRST.cpp +++ b/moses/src/LanguageModelIRST.cpp @@ -118,7 +118,7 @@ bool LanguageModelIRST::Load(const std::string &filePath, m_lmtb->init_statecache(); m_lmtb->init_lmtcaches(m_lmtb->maxlevel()>2?m_lmtb->maxlevel()-1:2); - m_lmtb->set_dictionary_upperbound(m_lmtb_dub); + m_lmtb->setlogOOVpenalty(m_lmtb_dub); return true; } @@ -202,6 +202,14 @@ float LanguageModelIRST::GetValue(const vector &contextFactor, Stat } float prob = m_lmtb->clprob(*m_lmtb_ng); + + //apply OOV penalty if the n-gram starts with an OOV word + //in a following version this will be integrated into the + //irstlm library + + if (*m_lmtb_ng->wordp(1) == m_lmtb->dict->oovcode()) + prob-=m_lmtb->getlogOOVpenalty(); + return TransformIRSTScore(prob);