ken lm integration

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3570 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
hieuhoang1972 2010-09-26 22:29:01 +00:00
parent a82c2d5531
commit 559a5e3ece
7 changed files with 54 additions and 215 deletions

View File

@ -237,6 +237,7 @@
LIBRARY_SEARCH_PATHS = (
../irstlm/lib/i386,
../srilm/lib/macosx,
../kenlm/lm,
);
OTHER_LDFLAGS = (
"-lz",
@ -246,6 +247,7 @@
"-loolm",
"-lflm",
"-llattice",
"-lkenlm",
);
PRODUCT_NAME = CreateOnDisk;
};
@ -261,6 +263,7 @@
LIBRARY_SEARCH_PATHS = (
../irstlm/lib/i386,
../srilm/lib/macosx,
../kenlm/lm,
);
OTHER_LDFLAGS = (
"-lz",
@ -270,6 +273,7 @@
"-loolm",
"-lflm",
"-llattice",
"-lkenlm",
);
PRODUCT_NAME = CreateOnDisk;
};

View File

@ -17,28 +17,28 @@
isa = PBXContainerItemProxy;
containerPortal = 1EF455C81227C4D60022403A /* moses.xcodeproj */;
proxyType = 2;
remoteGlobalIDString = D2AAC046055464E500DB518D /* libmoses.a */;
remoteGlobalIDString = D2AAC046055464E500DB518D;
remoteInfo = moses;
};
1EF455D71227C50C0022403A /* PBXContainerItemProxy */ = {
isa = PBXContainerItemProxy;
containerPortal = 1EF455D31227C50C0022403A /* OnDiskPt.xcodeproj */;
proxyType = 2;
remoteGlobalIDString = D2AAC046055464E500DB518D /* libOnDiskPt.a */;
remoteGlobalIDString = D2AAC046055464E500DB518D;
remoteInfo = OnDiskPt;
};
1EF456211227C8A30022403A /* PBXContainerItemProxy */ = {
isa = PBXContainerItemProxy;
containerPortal = 1EF455C81227C4D60022403A /* moses.xcodeproj */;
proxyType = 1;
remoteGlobalIDString = D2AAC045055464E500DB518D /* moses */;
remoteGlobalIDString = D2AAC045055464E500DB518D;
remoteInfo = moses;
};
1EF456231227C8A80022403A /* PBXContainerItemProxy */ = {
isa = PBXContainerItemProxy;
containerPortal = 1EF455D31227C50C0022403A /* OnDiskPt.xcodeproj */;
proxyType = 1;
remoteGlobalIDString = D2AAC045055464E500DB518D /* OnDiskPt */;
remoteGlobalIDString = D2AAC045055464E500DB518D;
remoteInfo = OnDiskPt;
};
/* End PBXContainerItemProxy section */
@ -246,6 +246,7 @@
LIBRARY_SEARCH_PATHS = (
../irstlm/lib/i386,
../srilm/lib/macosx,
../kenlm/lm,
);
OTHER_LDFLAGS = (
"-lflm",
@ -254,6 +255,7 @@
"-ldstruct",
"-lz",
"-lirstlm",
"-lkenlm",
);
PREBINDING = NO;
PRODUCT_NAME = processLexicalTable;
@ -273,6 +275,7 @@
LIBRARY_SEARCH_PATHS = (
../irstlm/lib/i386,
../srilm/lib/macosx,
../kenlm/lm,
);
OTHER_LDFLAGS = (
"-lflm",
@ -281,6 +284,7 @@
"-ldstruct",
"-lz",
"-lirstlm",
"-lkenlm",
);
PREBINDING = NO;
PRODUCT_NAME = processLexicalTable;

View File

@ -342,6 +342,7 @@
LIBRARY_SEARCH_PATHS = (
../irstlm/lib/i386,
../srilm/lib/macosx,
../kenlm/lm,
);
OTHER_LDFLAGS = (
"-lz",
@ -351,6 +352,7 @@
"-loolm",
"-lflm",
"-llattice",
"-lkenlm",
);
PRODUCT_NAME = "moses-chart-cmd";
};
@ -367,6 +369,7 @@
LIBRARY_SEARCH_PATHS = (
../irstlm/lib/i386,
../srilm/lib/macosx,
../kenlm/lm,
);
OTHER_LDFLAGS = (
"-lz",
@ -376,6 +379,7 @@
"-loolm",
"-lflm",
"-llattice",
"-lkenlm",
);
PRODUCT_NAME = "moses-chart-cmd";
};

View File

@ -271,6 +271,7 @@
LIBRARY_SEARCH_PATHS = (
../irstlm/lib/i386,
../srilm/lib/macosx,
../kenlm/lm,
);
OTHER_LDFLAGS = (
"-lflm",
@ -279,6 +280,7 @@
"-ldstruct",
"-lz",
"-lirstlm",
"-lkenlm",
);
PREBINDING = NO;
PRODUCT_NAME = "moses-cmd";
@ -306,6 +308,7 @@
LIBRARY_SEARCH_PATHS = (
../irstlm/lib/i386,
../srilm/lib/macosx,
../kenlm/lm,
);
OTHER_LDFLAGS = (
"-lflm",
@ -314,6 +317,7 @@
"-ldstruct",
"-lz",
"-lirstlm",
"-lkenlm",
);
PREBINDING = NO;
PRODUCT_NAME = "moses-cmd";
@ -333,6 +337,7 @@
LIBRARY_SEARCH_PATHS = (
../irstlm/lib/i386,
../srilm/lib/macosx,
../kenlm/lm,
);
OTHER_LDFLAGS = (
"-lflm",
@ -341,6 +346,7 @@
"-ldstruct",
"-lz",
"-lirstlm",
"-lkenlm",
);
PREBINDING = NO;
PRODUCT_NAME = "moses-cmd";

View File

@ -44,20 +44,12 @@ namespace Moses
// Construct the KenLM-backed single-factor language model.
// Score registration is delegated to the LanguageModelSingleFactor base.
// NOTE(review): m_lmtb (legacy IRSTLM table pointer, per the header) starts
// null and m_lmtb_dub holds the dictionary upper bound; both look like
// leftovers from the IRSTLM implementation this commit replaces — confirm.
LanguageModelKen::LanguageModelKen(bool registerScore, ScoreIndexManager &scoreIndexManager, int dub)
:LanguageModelSingleFactor(registerScore, scoreIndexManager)
,m_lmtb(0),m_lmtb_dub(dub)
{
}
// Release all model resources.
// NOTE(review): m_lmtb is initialised to 0 in the constructor; if Load()
// was never called, m_lmtb->reset_mmap() below dereferences a null
// pointer on non-Windows builds — verify Load() is guaranteed to run first.
LanguageModelKen::~LanguageModelKen()
{
#ifndef WIN32
TRACE_ERR( "reset mmap\n");
m_lmtb->reset_mmap();
#endif
// delete on a null pointer is safe, so these are fine even when unloaded.
delete m_lmtb;
delete m_lmtb_ng;
delete m_ngram;
}
// Load the language model from filePath for the given factor and order.
// NOTE(review): this span is a commit-diff rendering that interleaves the
// REMOVED IRSTLM loading code with the ADDED KenLM construction near the
// end (m_ngram = new lm::ngram::Model(...)); it is not one coherent
// revision of the function and will not compile as shown.
@ -65,189 +57,45 @@ bool LanguageModelKen::Load(const std::string &filePath,
FactorType factorType,
size_t nGramOrder)
{
// Separators allowed between the LM filename and an optional map filename.
const char *SepString = " \t\n";
cerr << "In LanguageModelKen::Load: nGramOrder = " << nGramOrder << "\n";
FactorCollection &factorCollection = FactorCollection::Instance();
m_factorType = factorType;
m_nGramOrder = nGramOrder;
// get name of LM file and, if any, of the micro-macro map file
char *filenamesOrig = strdup(filePath.c_str());
char *filenames = filenamesOrig;
m_filePath = strsep(&filenames, SepString);
// Open the input file (possibly gzipped)
InputFileStream inp(m_filePath);
if (filenames) {
// case LMfile + MAPfile: create an object of lmmacro class and load both LM file and map
cerr << "Loading LM file + MAP\n";
m_mapFilePath = strsep(&filenames, SepString);
if (!FileExists(m_mapFilePath)) {
cerr << "ERROR: Map file <" << m_mapFilePath << "> does not exist\n";
// strdup'd buffer must be freed on every exit path.
free(filenamesOrig);
return false;
}
InputFileStream inpMap(m_mapFilePath);
m_lmtb = new lmmacro(m_filePath, inp, inpMap);
} else {
// case (standard) LMfile only: create an object of lmtable
cerr << "Loading LM file (no MAP)\n";
m_lmtb = (lmtable *)new lmtable;
// Load the (possibly binary) model
#ifdef WIN32
m_lmtb->load(inp); //don't use memory map
#else
// ".mm" suffix selects memory-mapped loading on non-Windows builds.
if (m_filePath.compare(m_filePath.size()-3,3,".mm")==0)
m_lmtb->load(inp,m_filePath.c_str(),NULL,1);
else
m_lmtb->load(inp,m_filePath.c_str(),NULL,0);
#endif
}
m_lmtb_ng=new ngram(m_lmtb->getDict()); // ngram of words/micro tags
m_lmtb_size=m_lmtb->maxlevel();
// LM can be ok, just outputs warnings
// Mauro: in the original, the following two instructions are wrongly switched:
m_unknownId = m_lmtb->getDict()->oovcode(); // at the level of micro tags
CreateFactors(factorCollection);
VERBOSE(1, "Ken: m_unknownId=" << m_unknownId << std::endl);
//install caches
m_lmtb->init_probcache();
m_lmtb->init_statecache();
m_lmtb->init_lmtcaches(m_lmtb->maxlevel()>2?m_lmtb->maxlevel()-1:2);
if (m_lmtb_dub >0) m_lmtb->setlogOOVpenalty(m_lmtb_dub);
free(filenamesOrig);
// ADDED by this commit: build the KenLM model directly from the path.
m_ngram = new lm::ngram::Model(filePath.c_str());
return true;
}
// Map Moses factor ids to LM dictionary codes, registering every LM word
// with the FactorCollection and recording sentence start/end symbols.
// NOTE(review): iterates the IRSTLM dictionary (m_lmtb) — this appears to
// be pre-KenLM code retained in the diff; confirm against the post-commit
// version of the file.
void LanguageModelKen::CreateFactors(FactorCollection &factorCollection)
{ // add factors which have srilm id
// code copied & paste from SRI LM class. should do template function
std::map<size_t, int> lmIdMap;
size_t maxFactorId = 0; // to create lookup vector later on
dict_entry *entry;
dictionary_iter iter(m_lmtb->getDict()); // at the level of micro tags
while ( (entry = iter.next()) != NULL)
{
// One Moses factor per dictionary entry; remember the largest id so the
// lookup vector below can be sized in one shot.
size_t factorId = factorCollection.AddFactor(Output, m_factorType, entry->word)->GetId();
lmIdMap[factorId] = entry->code;
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
}
size_t factorId;
// Sentence-start (<s>) and sentence-end (</s>) get explicit entries.
m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
factorId = m_sentenceStart->GetId();
m_lmtb_sentenceStart=lmIdMap[factorId] = GetLmID(BOS_);
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
m_sentenceStartArray[m_factorType] = m_sentenceStart;
m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
factorId = m_sentenceEnd->GetId();
m_lmtb_sentenceEnd=lmIdMap[factorId] = GetLmID(EOS_);
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
m_sentenceEndArray[m_factorType] = m_sentenceEnd;
// add to lookup vector in object
// Unseen factor ids default to the OOV code.
m_lmIdLookup.resize(maxFactorId+1);
fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_unknownId);
map<size_t, int>::iterator iterMap;
for (iterMap = lmIdMap.begin() ; iterMap != lmIdMap.end() ; ++iterMap)
{
m_lmIdLookup[iterMap->first] = iterMap->second;
}
}
// Translate a surface string into the LM dictionary's integer code.
// Unknown strings map to the dictionary's OOV code (per IRSTLM encode()).
int LanguageModelKen::GetLmID( const std::string &str ) const
{
return m_lmtb->getDict()->encode( str.c_str() ); // at the level of micro tags
}
/* get score of n-gram. n-gram should not be bigger than m_nGramOrder
 * Specific implementation can return State and len data to be used in hypothesis pruning
 * \param contextFactor n-gram to be scored
 * \param finalState state used by LM. Return arg
 * \param len ???
 */
// NOTE(review): this span is a merged diff — removed IRSTLM scoring lines
// and added KenLM stubs are interleaved (duplicate `size_t count` and
// `prob` declarations, FOR_KEN placeholders). It will not compile as
// rendered; consult the post-commit file for the real body.
float LanguageModelKen::GetValue(const vector<const Word*> &contextFactor, State* finalState, unsigned int* len) const
{
unsigned int dummy;
if (!len) { len = &dummy; }
FactorType factorType = GetFactorType();
size_t count = contextFactor.size();
assert(count <= GetNGramOrder());
if (count == 0)
{
// NOTE(review): assigning NULL to the local pointer `finalState` does
// not reach the caller — pre-existing quirk, kept as-is.
finalState = NULL;
return 0;
}
// set up context
size_t count = contextFactor.size();
m_lmtb_ng->size=0;
// Pad short contexts with sentence-end/sentence-start symbols.
if (count< (size_t)(m_lmtb_size-1)) m_lmtb_ng->pushc(m_lmtb_sentenceEnd);
if (count< (size_t)m_lmtb_size) m_lmtb_ng->pushc(m_lmtb_sentenceStart);
for (size_t i = 0 ; i < count ; i++)
const vector<int> ngramId(count);
for (size_t i = 0 ; i < count - 1 ; i++)
{
//int lmId = GetLmID((*contextFactor[i])[factorType]);
#ifdef DEBUG
cout << "i=" << i << " -> " << (*contextFactor[i])[factorType]->GetString() << "\n";
#endif
int lmId = GetLmID((*contextFactor[i])[factorType]->GetString());
// cerr << (*contextFactor[i])[factorType]->GetString() << " = " << lmId;
m_lmtb_ng->pushc(lmId);
const Factor *factor = contextFactor[i]->GetFactor(factorType);
const string &word = factor->GetString();
//ngramId[i] = StringToId(word); FOR_KEN
}
if (finalState){
*finalState=(State *)m_lmtb->cmaxsuffptr(*m_lmtb_ng);
// back off stats not currently available
*len = 0;
}
float prob = m_lmtb->clprob(*m_lmtb_ng);
float prob;
//prob = m_ngram.GetScore(ngramId); FOR_KEN
return TransformLMScore(prob);
}
/** Decide whether LM caches should be flushed after a sentence.
 *
 *  \param sentences_done               Sentences processed so far; the
 *                                      sentinel size_t(-1) forces cleanup.
 *  \param m_lmcache_cleanup_threshold  Flush every N sentences; 0 disables
 *                                      periodic cleanup entirely.
 *  \return true when a cache cleanup should run now.
 *
 *  NOTE(review): the DELETEME prefix marks this helper as intended to be
 *  temporary; the name is preserved so existing callers keep linking.
 */
bool DELETEMELMCacheCleanup(size_t sentences_done, size_t m_lmcache_cleanup_threshold)
{
  // The original compared the unsigned parameter against the literal -1,
  // which only matches via implicit conversion and draws -Wsign-compare;
  // make the sentinel comparison explicit instead.
  if (sentences_done == static_cast<size_t>(-1)) return true;
  // Guard against modulo-by-zero: a zero threshold means "never clean up".
  return m_lmcache_cleanup_threshold != 0
      && sentences_done % m_lmcache_cleanup_threshold == 0;
}
// Reset LM caches periodically, every GetLMCacheCleanupThreshold()
// sentences (as decided by DELETEMELMCacheCleanup).
// NOTE(review): the header hunk in this same commit redefines this method
// as an empty inline body, so this out-of-line version appears to be the
// REMOVED side of the diff — confirm against the post-commit file.
void LanguageModelKen::CleanUpAfterSentenceProcessing()
{
const StaticData &staticData = StaticData::Instance();
// Function-local counter persists across calls to number the sentences.
static int sentenceCount = 0;
sentenceCount++;
size_t lmcache_cleanup_threshold = staticData.GetLMCacheCleanupThreshold();
if (DELETEMELMCacheCleanup(sentenceCount, lmcache_cleanup_threshold)){
TRACE_ERR( "reset caches\n");
m_lmtb->reset_caches();
}
}
// Per-sentence initialisation hook; only bumps a trace counter when
// TRACE_CACHE is enabled, otherwise a no-op.
// NOTE(review): the header hunk in this commit replaces this with an empty
// inline body, so this looks like the REMOVED side of the diff — confirm.
void LanguageModelKen::InitializeBeforeSentenceProcessing(){
//nothing to do
#ifdef TRACE_CACHE
m_lmtb->sentence_id++;
#endif
}
}

View File

@ -28,42 +28,18 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "TypeDef.h"
#include "Util.h"
#include "LanguageModelSingleFactor.h"
class lmtable; // Ken lm table
class lmmacro; // Ken lm for macro tags
class ngram;
#include "../../kenlm/lm/ngram.hh"
namespace Moses
{
class Phrase;
/** Implementation of single factor LM using Ken's code.
* This is the default LM for Moses and is available from the same sourceforge repository
*/
// Single-factor language model backed by KenLM.
// NOTE(review): this class body is a merged diff — the IRSTLM members
// (m_lmtb, m_lmtb_ng, ...) are the REMOVED side and lm::ngram::Model
// *m_ngram is the ADDED side; the duplicated method declarations below the
// embedded hunk marker would not compile as rendered.
class LanguageModelKen : public LanguageModelSingleFactor
{
protected:
std::vector<int> m_lmIdLookup;
lmtable* m_lmtb;
ngram* m_lmtb_ng;
int m_unknownId;
int m_lmtb_sentenceStart; //lmtb symbol to initialize ngram with
int m_lmtb_sentenceEnd; //lmtb symbol to initialize ngram with
int m_lmtb_size; //max ngram stored in the table
int m_lmtb_dub; //dictionary upper bound
std::string m_mapFilePath;
// float GetValue(LmId wordId, ngram *context) const;
void CreateFactors(FactorCollection &factorCollection);
int GetLmID( const std::string &str ) const;
// Fast path: factor ids outside the lookup vector map to the OOV code.
int GetLmID( const Factor *factor ) const{
size_t factorId = factor->GetId();
return ( factorId >= m_lmIdLookup.size()) ? m_unknownId : m_lmIdLookup[factorId];
};
// ADDED by this commit: owning pointer to the KenLM model.
lm::ngram::Model *m_ngram;
public:
LanguageModelKen(bool registerScore, ScoreIndexManager &scoreIndexManager, int dub);
@ -74,14 +50,11 @@ public:
virtual float GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL, unsigned int* len=0) const;
void CleanUpAfterSentenceProcessing();
void InitializeBeforeSentenceProcessing();
void CleanUpAfterSentenceProcessing() {}
void InitializeBeforeSentenceProcessing() {}
void set_dictionary_upperbound(int dub){ m_lmtb_size=dub ;
//m_lmtb->set_dictionary_upperbound(dub);
};
};
}
#endif

View File

@ -229,10 +229,10 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord,s
isDigit = s.find_first_of("0123456789");
if (isDigit == string::npos)
isDigit = 0;
else
if (isDigit == 1)
isDigit = 1;
else
isDigit = 0;
// modify the starting bitmap
}