mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-27 03:49:57 +03:00
ken lm integration
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3570 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
a82c2d5531
commit
559a5e3ece
@ -237,6 +237,7 @@
|
||||
LIBRARY_SEARCH_PATHS = (
|
||||
../irstlm/lib/i386,
|
||||
../srilm/lib/macosx,
|
||||
../kenlm/lm,
|
||||
);
|
||||
OTHER_LDFLAGS = (
|
||||
"-lz",
|
||||
@ -246,6 +247,7 @@
|
||||
"-loolm",
|
||||
"-lflm",
|
||||
"-llattice",
|
||||
"-lkenlm",
|
||||
);
|
||||
PRODUCT_NAME = CreateOnDisk;
|
||||
};
|
||||
@ -261,6 +263,7 @@
|
||||
LIBRARY_SEARCH_PATHS = (
|
||||
../irstlm/lib/i386,
|
||||
../srilm/lib/macosx,
|
||||
../kenlm/lm,
|
||||
);
|
||||
OTHER_LDFLAGS = (
|
||||
"-lz",
|
||||
@ -270,6 +273,7 @@
|
||||
"-loolm",
|
||||
"-lflm",
|
||||
"-llattice",
|
||||
"-lkenlm",
|
||||
);
|
||||
PRODUCT_NAME = CreateOnDisk;
|
||||
};
|
||||
|
@ -17,28 +17,28 @@
|
||||
isa = PBXContainerItemProxy;
|
||||
containerPortal = 1EF455C81227C4D60022403A /* moses.xcodeproj */;
|
||||
proxyType = 2;
|
||||
remoteGlobalIDString = D2AAC046055464E500DB518D /* libmoses.a */;
|
||||
remoteGlobalIDString = D2AAC046055464E500DB518D;
|
||||
remoteInfo = moses;
|
||||
};
|
||||
1EF455D71227C50C0022403A /* PBXContainerItemProxy */ = {
|
||||
isa = PBXContainerItemProxy;
|
||||
containerPortal = 1EF455D31227C50C0022403A /* OnDiskPt.xcodeproj */;
|
||||
proxyType = 2;
|
||||
remoteGlobalIDString = D2AAC046055464E500DB518D /* libOnDiskPt.a */;
|
||||
remoteGlobalIDString = D2AAC046055464E500DB518D;
|
||||
remoteInfo = OnDiskPt;
|
||||
};
|
||||
1EF456211227C8A30022403A /* PBXContainerItemProxy */ = {
|
||||
isa = PBXContainerItemProxy;
|
||||
containerPortal = 1EF455C81227C4D60022403A /* moses.xcodeproj */;
|
||||
proxyType = 1;
|
||||
remoteGlobalIDString = D2AAC045055464E500DB518D /* moses */;
|
||||
remoteGlobalIDString = D2AAC045055464E500DB518D;
|
||||
remoteInfo = moses;
|
||||
};
|
||||
1EF456231227C8A80022403A /* PBXContainerItemProxy */ = {
|
||||
isa = PBXContainerItemProxy;
|
||||
containerPortal = 1EF455D31227C50C0022403A /* OnDiskPt.xcodeproj */;
|
||||
proxyType = 1;
|
||||
remoteGlobalIDString = D2AAC045055464E500DB518D /* OnDiskPt */;
|
||||
remoteGlobalIDString = D2AAC045055464E500DB518D;
|
||||
remoteInfo = OnDiskPt;
|
||||
};
|
||||
/* End PBXContainerItemProxy section */
|
||||
@ -246,6 +246,7 @@
|
||||
LIBRARY_SEARCH_PATHS = (
|
||||
../irstlm/lib/i386,
|
||||
../srilm/lib/macosx,
|
||||
../kenlm/lm,
|
||||
);
|
||||
OTHER_LDFLAGS = (
|
||||
"-lflm",
|
||||
@ -254,6 +255,7 @@
|
||||
"-ldstruct",
|
||||
"-lz",
|
||||
"-lirstlm",
|
||||
"-lkenlm",
|
||||
);
|
||||
PREBINDING = NO;
|
||||
PRODUCT_NAME = processLexicalTable;
|
||||
@ -273,6 +275,7 @@
|
||||
LIBRARY_SEARCH_PATHS = (
|
||||
../irstlm/lib/i386,
|
||||
../srilm/lib/macosx,
|
||||
../kenlm/lm,
|
||||
);
|
||||
OTHER_LDFLAGS = (
|
||||
"-lflm",
|
||||
@ -281,6 +284,7 @@
|
||||
"-ldstruct",
|
||||
"-lz",
|
||||
"-lirstlm",
|
||||
"-lkenlm",
|
||||
);
|
||||
PREBINDING = NO;
|
||||
PRODUCT_NAME = processLexicalTable;
|
||||
|
@ -342,6 +342,7 @@
|
||||
LIBRARY_SEARCH_PATHS = (
|
||||
../irstlm/lib/i386,
|
||||
../srilm/lib/macosx,
|
||||
../kenlm/lm,
|
||||
);
|
||||
OTHER_LDFLAGS = (
|
||||
"-lz",
|
||||
@ -351,6 +352,7 @@
|
||||
"-loolm",
|
||||
"-lflm",
|
||||
"-llattice",
|
||||
"-lkenlm",
|
||||
);
|
||||
PRODUCT_NAME = "moses-chart-cmd";
|
||||
};
|
||||
@ -367,6 +369,7 @@
|
||||
LIBRARY_SEARCH_PATHS = (
|
||||
../irstlm/lib/i386,
|
||||
../srilm/lib/macosx,
|
||||
../kenlm/lm,
|
||||
);
|
||||
OTHER_LDFLAGS = (
|
||||
"-lz",
|
||||
@ -376,6 +379,7 @@
|
||||
"-loolm",
|
||||
"-lflm",
|
||||
"-llattice",
|
||||
"-lkenlm",
|
||||
);
|
||||
PRODUCT_NAME = "moses-chart-cmd";
|
||||
};
|
||||
|
@ -271,6 +271,7 @@
|
||||
LIBRARY_SEARCH_PATHS = (
|
||||
../irstlm/lib/i386,
|
||||
../srilm/lib/macosx,
|
||||
../kenlm/lm,
|
||||
);
|
||||
OTHER_LDFLAGS = (
|
||||
"-lflm",
|
||||
@ -279,6 +280,7 @@
|
||||
"-ldstruct",
|
||||
"-lz",
|
||||
"-lirstlm",
|
||||
"-lkenlm",
|
||||
);
|
||||
PREBINDING = NO;
|
||||
PRODUCT_NAME = "moses-cmd";
|
||||
@ -306,6 +308,7 @@
|
||||
LIBRARY_SEARCH_PATHS = (
|
||||
../irstlm/lib/i386,
|
||||
../srilm/lib/macosx,
|
||||
../kenlm/lm,
|
||||
);
|
||||
OTHER_LDFLAGS = (
|
||||
"-lflm",
|
||||
@ -314,6 +317,7 @@
|
||||
"-ldstruct",
|
||||
"-lz",
|
||||
"-lirstlm",
|
||||
"-lkenlm",
|
||||
);
|
||||
PREBINDING = NO;
|
||||
PRODUCT_NAME = "moses-cmd";
|
||||
@ -333,6 +337,7 @@
|
||||
LIBRARY_SEARCH_PATHS = (
|
||||
../irstlm/lib/i386,
|
||||
../srilm/lib/macosx,
|
||||
../kenlm/lm,
|
||||
);
|
||||
OTHER_LDFLAGS = (
|
||||
"-lflm",
|
||||
@ -341,6 +346,7 @@
|
||||
"-ldstruct",
|
||||
"-lz",
|
||||
"-lirstlm",
|
||||
"-lkenlm",
|
||||
);
|
||||
PREBINDING = NO;
|
||||
PRODUCT_NAME = "moses-cmd";
|
||||
|
@ -44,20 +44,12 @@ namespace Moses
|
||||
|
||||
LanguageModelKen::LanguageModelKen(bool registerScore, ScoreIndexManager &scoreIndexManager, int dub)
|
||||
:LanguageModelSingleFactor(registerScore, scoreIndexManager)
|
||||
,m_lmtb(0),m_lmtb_dub(dub)
|
||||
{
|
||||
}
|
||||
|
||||
LanguageModelKen::~LanguageModelKen()
|
||||
{
|
||||
|
||||
#ifndef WIN32
|
||||
TRACE_ERR( "reset mmap\n");
|
||||
m_lmtb->reset_mmap();
|
||||
#endif
|
||||
|
||||
delete m_lmtb;
|
||||
delete m_lmtb_ng;
|
||||
delete m_ngram;
|
||||
}
|
||||
|
||||
|
||||
@ -65,189 +57,45 @@ bool LanguageModelKen::Load(const std::string &filePath,
|
||||
FactorType factorType,
|
||||
size_t nGramOrder)
|
||||
{
|
||||
const char *SepString = " \t\n";
|
||||
cerr << "In LanguageModelKen::Load: nGramOrder = " << nGramOrder << "\n";
|
||||
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
|
||||
m_factorType = factorType;
|
||||
m_nGramOrder = nGramOrder;
|
||||
|
||||
// get name of LM file and, if any, of the micro-macro map file
|
||||
char *filenamesOrig = strdup(filePath.c_str());
|
||||
char *filenames = filenamesOrig;
|
||||
m_filePath = strsep(&filenames, SepString);
|
||||
|
||||
// Open the input file (possibly gzipped)
|
||||
InputFileStream inp(m_filePath);
|
||||
|
||||
if (filenames) {
|
||||
// case LMfile + MAPfile: create an object of lmmacro class and load both LM file and map
|
||||
cerr << "Loading LM file + MAP\n";
|
||||
m_mapFilePath = strsep(&filenames, SepString);
|
||||
if (!FileExists(m_mapFilePath)) {
|
||||
cerr << "ERROR: Map file <" << m_mapFilePath << "> does not exist\n";
|
||||
free(filenamesOrig);
|
||||
return false;
|
||||
}
|
||||
InputFileStream inpMap(m_mapFilePath);
|
||||
m_lmtb = new lmmacro(m_filePath, inp, inpMap);
|
||||
|
||||
|
||||
} else {
|
||||
// case (standard) LMfile only: create an object of lmtable
|
||||
cerr << "Loading LM file (no MAP)\n";
|
||||
m_lmtb = (lmtable *)new lmtable;
|
||||
|
||||
// Load the (possibly binary) model
|
||||
#ifdef WIN32
|
||||
m_lmtb->load(inp); //don't use memory map
|
||||
#else
|
||||
if (m_filePath.compare(m_filePath.size()-3,3,".mm")==0)
|
||||
m_lmtb->load(inp,m_filePath.c_str(),NULL,1);
|
||||
else
|
||||
m_lmtb->load(inp,m_filePath.c_str(),NULL,0);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
m_lmtb_ng=new ngram(m_lmtb->getDict()); // ngram of words/micro tags
|
||||
m_lmtb_size=m_lmtb->maxlevel();
|
||||
|
||||
// LM can be ok, just outputs warnings
|
||||
|
||||
// Mauro: in the original, the following two instructions are wrongly switched:
|
||||
m_unknownId = m_lmtb->getDict()->oovcode(); // at the level of micro tags
|
||||
CreateFactors(factorCollection);
|
||||
|
||||
VERBOSE(1, "Ken: m_unknownId=" << m_unknownId << std::endl);
|
||||
|
||||
//install caches
|
||||
m_lmtb->init_probcache();
|
||||
m_lmtb->init_statecache();
|
||||
m_lmtb->init_lmtcaches(m_lmtb->maxlevel()>2?m_lmtb->maxlevel()-1:2);
|
||||
|
||||
if (m_lmtb_dub >0) m_lmtb->setlogOOVpenalty(m_lmtb_dub);
|
||||
|
||||
free(filenamesOrig);
|
||||
return true;
|
||||
}
|
||||
|
||||
void LanguageModelKen::CreateFactors(FactorCollection &factorCollection)
|
||||
{ // add factors which have srilm id
|
||||
// code copied & paste from SRI LM class. should do template function
|
||||
std::map<size_t, int> lmIdMap;
|
||||
size_t maxFactorId = 0; // to create lookup vector later on
|
||||
|
||||
dict_entry *entry;
|
||||
dictionary_iter iter(m_lmtb->getDict()); // at the level of micro tags
|
||||
while ( (entry = iter.next()) != NULL)
|
||||
{
|
||||
size_t factorId = factorCollection.AddFactor(Output, m_factorType, entry->word)->GetId();
|
||||
lmIdMap[factorId] = entry->code;
|
||||
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
||||
}
|
||||
|
||||
size_t factorId;
|
||||
|
||||
m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
|
||||
factorId = m_sentenceStart->GetId();
|
||||
m_lmtb_sentenceStart=lmIdMap[factorId] = GetLmID(BOS_);
|
||||
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
||||
m_sentenceStartArray[m_factorType] = m_sentenceStart;
|
||||
|
||||
m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
|
||||
factorId = m_sentenceEnd->GetId();
|
||||
m_lmtb_sentenceEnd=lmIdMap[factorId] = GetLmID(EOS_);
|
||||
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
|
||||
m_sentenceEndArray[m_factorType] = m_sentenceEnd;
|
||||
|
||||
// add to lookup vector in object
|
||||
m_lmIdLookup.resize(maxFactorId+1);
|
||||
|
||||
fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_unknownId);
|
||||
|
||||
map<size_t, int>::iterator iterMap;
|
||||
for (iterMap = lmIdMap.begin() ; iterMap != lmIdMap.end() ; ++iterMap)
|
||||
{
|
||||
m_lmIdLookup[iterMap->first] = iterMap->second;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
int LanguageModelKen::GetLmID( const std::string &str ) const
|
||||
{
|
||||
return m_lmtb->getDict()->encode( str.c_str() ); // at the level of micro tags
|
||||
m_ngram = new lm::ngram::Model(filePath.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
/* get score of n-gram. n-gram should not be bigger than m_nGramOrder
|
||||
* Specific implementation can return State and len data to be used in hypothesis pruning
|
||||
* \param contextFactor n-gram to be scored
|
||||
* \param finalState state used by LM. Return arg
|
||||
* \param len ???
|
||||
*/
|
||||
float LanguageModelKen::GetValue(const vector<const Word*> &contextFactor, State* finalState, unsigned int* len) const
|
||||
{
|
||||
unsigned int dummy;
|
||||
if (!len) { len = &dummy; }
|
||||
FactorType factorType = GetFactorType();
|
||||
|
||||
// set up context
|
||||
FactorType factorType = GetFactorType();
|
||||
size_t count = contextFactor.size();
|
||||
|
||||
m_lmtb_ng->size=0;
|
||||
if (count< (size_t)(m_lmtb_size-1)) m_lmtb_ng->pushc(m_lmtb_sentenceEnd);
|
||||
if (count< (size_t)m_lmtb_size) m_lmtb_ng->pushc(m_lmtb_sentenceStart);
|
||||
|
||||
for (size_t i = 0 ; i < count ; i++)
|
||||
assert(count <= GetNGramOrder());
|
||||
if (count == 0)
|
||||
{
|
||||
//int lmId = GetLmID((*contextFactor[i])[factorType]);
|
||||
#ifdef DEBUG
|
||||
cout << "i=" << i << " -> " << (*contextFactor[i])[factorType]->GetString() << "\n";
|
||||
#endif
|
||||
int lmId = GetLmID((*contextFactor[i])[factorType]->GetString());
|
||||
// cerr << (*contextFactor[i])[factorType]->GetString() << " = " << lmId;
|
||||
m_lmtb_ng->pushc(lmId);
|
||||
finalState = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (finalState){
|
||||
*finalState=(State *)m_lmtb->cmaxsuffptr(*m_lmtb_ng);
|
||||
// back off stats not currently available
|
||||
*len = 0;
|
||||
|
||||
// set up context
|
||||
const vector<int> ngramId(count);
|
||||
for (size_t i = 0 ; i < count - 1 ; i++)
|
||||
{
|
||||
const Factor *factor = contextFactor[i]->GetFactor(factorType);
|
||||
const string &word = factor->GetString();
|
||||
|
||||
//ngramId[i] = StringToId(word); FOR_KEN
|
||||
}
|
||||
|
||||
float prob = m_lmtb->clprob(*m_lmtb_ng);
|
||||
|
||||
float prob;
|
||||
//prob = m_ngram.GetScore(ngramId); FOR_KEN
|
||||
|
||||
return TransformLMScore(prob);
|
||||
}
|
||||
|
||||
|
||||
bool DELETEMELMCacheCleanup(size_t sentences_done, size_t m_lmcache_cleanup_threshold)
|
||||
{
|
||||
if (sentences_done==-1) return true;
|
||||
if (m_lmcache_cleanup_threshold)
|
||||
if (sentences_done % m_lmcache_cleanup_threshold == 0)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
void LanguageModelKen::CleanUpAfterSentenceProcessing()
|
||||
{
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
static int sentenceCount = 0;
|
||||
sentenceCount++;
|
||||
|
||||
size_t lmcache_cleanup_threshold = staticData.GetLMCacheCleanupThreshold();
|
||||
|
||||
if (DELETEMELMCacheCleanup(sentenceCount, lmcache_cleanup_threshold)){
|
||||
TRACE_ERR( "reset caches\n");
|
||||
m_lmtb->reset_caches();
|
||||
}
|
||||
}
|
||||
|
||||
void LanguageModelKen::InitializeBeforeSentenceProcessing(){
|
||||
//nothing to do
|
||||
#ifdef TRACE_CACHE
|
||||
m_lmtb->sentence_id++;
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -28,43 +28,19 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "TypeDef.h"
|
||||
#include "Util.h"
|
||||
#include "LanguageModelSingleFactor.h"
|
||||
|
||||
class lmtable; // Ken lm table
|
||||
class lmmacro; // Ken lm for macro tags
|
||||
class ngram;
|
||||
#include "../../kenlm/lm/ngram.hh"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
class Phrase;
|
||||
|
||||
/** Implementation of single factor LM using Ken's code.
|
||||
* This is the default LM for Moses and is available from the same sourceforge repository
|
||||
*/
|
||||
class LanguageModelKen : public LanguageModelSingleFactor
|
||||
{
|
||||
protected:
|
||||
std::vector<int> m_lmIdLookup;
|
||||
lmtable* m_lmtb;
|
||||
ngram* m_lmtb_ng;
|
||||
lm::ngram::Model *m_ngram;
|
||||
|
||||
int m_unknownId;
|
||||
int m_lmtb_sentenceStart; //lmtb symbols to initialize ngram with
|
||||
int m_lmtb_sentenceEnd; //lmt symbol to initialize ngram with
|
||||
int m_lmtb_size; //max ngram stored in the table
|
||||
int m_lmtb_dub; //dictionary upperboud
|
||||
|
||||
std::string m_mapFilePath;
|
||||
|
||||
// float GetValue(LmId wordId, ngram *context) const;
|
||||
|
||||
void CreateFactors(FactorCollection &factorCollection);
|
||||
int GetLmID( const std::string &str ) const;
|
||||
|
||||
int GetLmID( const Factor *factor ) const{
|
||||
size_t factorId = factor->GetId();
|
||||
return ( factorId >= m_lmIdLookup.size()) ? m_unknownId : m_lmIdLookup[factorId];
|
||||
};
|
||||
|
||||
public:
|
||||
LanguageModelKen(bool registerScore, ScoreIndexManager &scoreIndexManager, int dub);
|
||||
~LanguageModelKen();
|
||||
@ -74,14 +50,11 @@ public:
|
||||
|
||||
virtual float GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL, unsigned int* len=0) const;
|
||||
|
||||
void CleanUpAfterSentenceProcessing();
|
||||
void InitializeBeforeSentenceProcessing();
|
||||
void CleanUpAfterSentenceProcessing() {}
|
||||
void InitializeBeforeSentenceProcessing() {}
|
||||
|
||||
void set_dictionary_upperbound(int dub){ m_lmtb_size=dub ;
|
||||
//m_lmtb->set_dictionary_upperbound(dub);
|
||||
};
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -229,10 +229,10 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord,s
|
||||
|
||||
|
||||
isDigit = s.find_first_of("0123456789");
|
||||
if (isDigit == string::npos)
|
||||
isDigit = 0;
|
||||
else
|
||||
if (isDigit == 1)
|
||||
isDigit = 1;
|
||||
else
|
||||
isDigit = 0;
|
||||
// modify the starting bitmap
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user