diff --git a/CreateOnDisk/CreateOnDisk.xcodeproj/project.pbxproj b/CreateOnDisk/CreateOnDisk.xcodeproj/project.pbxproj index 43c9d80cc..874962405 100644 --- a/CreateOnDisk/CreateOnDisk.xcodeproj/project.pbxproj +++ b/CreateOnDisk/CreateOnDisk.xcodeproj/project.pbxproj @@ -237,6 +237,7 @@ LIBRARY_SEARCH_PATHS = ( ../irstlm/lib/i386, ../srilm/lib/macosx, + ../kenlm/lm, ); OTHER_LDFLAGS = ( "-lz", @@ -246,6 +247,7 @@ "-loolm", "-lflm", "-llattice", + "-lkenlm", ); PRODUCT_NAME = CreateOnDisk; }; @@ -261,6 +263,7 @@ LIBRARY_SEARCH_PATHS = ( ../irstlm/lib/i386, ../srilm/lib/macosx, + ../kenlm/lm, ); OTHER_LDFLAGS = ( "-lz", @@ -270,6 +273,7 @@ "-loolm", "-lflm", "-llattice", + "-lkenlm", ); PRODUCT_NAME = CreateOnDisk; }; diff --git a/misc/misc.xcodeproj/project.pbxproj b/misc/misc.xcodeproj/project.pbxproj index ff0132d18..a1f1cd5ce 100644 --- a/misc/misc.xcodeproj/project.pbxproj +++ b/misc/misc.xcodeproj/project.pbxproj @@ -17,28 +17,28 @@ isa = PBXContainerItemProxy; containerPortal = 1EF455C81227C4D60022403A /* moses.xcodeproj */; proxyType = 2; - remoteGlobalIDString = D2AAC046055464E500DB518D /* libmoses.a */; + remoteGlobalIDString = D2AAC046055464E500DB518D; remoteInfo = moses; }; 1EF455D71227C50C0022403A /* PBXContainerItemProxy */ = { isa = PBXContainerItemProxy; containerPortal = 1EF455D31227C50C0022403A /* OnDiskPt.xcodeproj */; proxyType = 2; - remoteGlobalIDString = D2AAC046055464E500DB518D /* libOnDiskPt.a */; + remoteGlobalIDString = D2AAC046055464E500DB518D; remoteInfo = OnDiskPt; }; 1EF456211227C8A30022403A /* PBXContainerItemProxy */ = { isa = PBXContainerItemProxy; containerPortal = 1EF455C81227C4D60022403A /* moses.xcodeproj */; proxyType = 1; - remoteGlobalIDString = D2AAC045055464E500DB518D /* moses */; + remoteGlobalIDString = D2AAC045055464E500DB518D; remoteInfo = moses; }; 1EF456231227C8A80022403A /* PBXContainerItemProxy */ = { isa = PBXContainerItemProxy; containerPortal = 1EF455D31227C50C0022403A /* OnDiskPt.xcodeproj */; proxyType = 1; - remoteGlobalIDString = D2AAC045055464E500DB518D /* OnDiskPt */; + remoteGlobalIDString = D2AAC045055464E500DB518D; remoteInfo = OnDiskPt; }; /* End PBXContainerItemProxy section */ @@ -246,6 +246,7 @@ LIBRARY_SEARCH_PATHS = ( ../irstlm/lib/i386, ../srilm/lib/macosx, + ../kenlm/lm, ); OTHER_LDFLAGS = ( "-lflm", @@ -254,6 +255,7 @@ "-ldstruct", "-lz", "-lirstlm", + "-lkenlm", ); PREBINDING = NO; PRODUCT_NAME = processLexicalTable; @@ -273,6 +275,7 @@ LIBRARY_SEARCH_PATHS = ( ../irstlm/lib/i386, ../srilm/lib/macosx, + ../kenlm/lm, ); OTHER_LDFLAGS = ( "-lflm", @@ -281,6 +284,7 @@ "-ldstruct", "-lz", "-lirstlm", + "-lkenlm", ); PREBINDING = NO; PRODUCT_NAME = processLexicalTable; diff --git a/moses-chart-cmd/moses-chart-cmd.xcodeproj/project.pbxproj b/moses-chart-cmd/moses-chart-cmd.xcodeproj/project.pbxproj index 1b1aafc41..509d2a5e6 100644 --- a/moses-chart-cmd/moses-chart-cmd.xcodeproj/project.pbxproj +++ b/moses-chart-cmd/moses-chart-cmd.xcodeproj/project.pbxproj @@ -342,6 +342,7 @@ LIBRARY_SEARCH_PATHS = ( ../irstlm/lib/i386, ../srilm/lib/macosx, + ../kenlm/lm, ); OTHER_LDFLAGS = ( "-lz", @@ -351,6 +352,7 @@ "-loolm", "-lflm", "-llattice", + "-lkenlm", ); PRODUCT_NAME = "moses-chart-cmd"; }; @@ -367,6 +369,7 @@ LIBRARY_SEARCH_PATHS = ( ../irstlm/lib/i386, ../srilm/lib/macosx, + ../kenlm/lm, ); OTHER_LDFLAGS = ( "-lz", @@ -376,6 +379,7 @@ "-loolm", "-lflm", "-llattice", + "-lkenlm", ); PRODUCT_NAME = "moses-chart-cmd"; }; diff --git a/moses-cmd/moses-cmd.xcodeproj/project.pbxproj b/moses-cmd/moses-cmd.xcodeproj/project.pbxproj index 1275342a8..30fcde57e 100644 --- a/moses-cmd/moses-cmd.xcodeproj/project.pbxproj +++ b/moses-cmd/moses-cmd.xcodeproj/project.pbxproj @@ -271,6 +271,7 @@ LIBRARY_SEARCH_PATHS = ( ../irstlm/lib/i386, ../srilm/lib/macosx, + ../kenlm/lm, ); OTHER_LDFLAGS = ( "-lflm", @@ -279,6 +280,7 @@ "-ldstruct", "-lz", "-lirstlm", + "-lkenlm", ); PREBINDING = NO; PRODUCT_NAME = "moses-cmd"; @@ -306,6 +308,7 @@ LIBRARY_SEARCH_PATHS = ( ../irstlm/lib/i386, ../srilm/lib/macosx, + ../kenlm/lm, ); OTHER_LDFLAGS = ( "-lflm", @@ -314,6 +317,7 @@ "-ldstruct", "-lz", "-lirstlm", + "-lkenlm", ); PREBINDING = NO; PRODUCT_NAME = "moses-cmd"; @@ -333,6 +337,7 @@ LIBRARY_SEARCH_PATHS = ( ../irstlm/lib/i386, ../srilm/lib/macosx, + ../kenlm/lm, ); OTHER_LDFLAGS = ( "-lflm", @@ -341,6 +346,7 @@ "-ldstruct", "-lz", "-lirstlm", + "-lkenlm", ); PREBINDING = NO; PRODUCT_NAME = "moses-cmd"; diff --git a/moses/src/LanguageModelKen.cpp b/moses/src/LanguageModelKen.cpp index 50c3ccfac..3faa6d9af 100644 --- a/moses/src/LanguageModelKen.cpp +++ b/moses/src/LanguageModelKen.cpp @@ -44,20 +44,12 @@ namespace Moses LanguageModelKen::LanguageModelKen(bool registerScore, ScoreIndexManager &scoreIndexManager, int dub) :LanguageModelSingleFactor(registerScore, scoreIndexManager) -,m_lmtb(0),m_lmtb_dub(dub) { } LanguageModelKen::~LanguageModelKen() { - -#ifndef WIN32 - TRACE_ERR( "reset mmap\n"); - m_lmtb->reset_mmap(); -#endif - - delete m_lmtb; - delete m_lmtb_ng; + delete m_ngram; } @@ -65,189 +57,45 @@ bool LanguageModelKen::Load(const std::string &filePath, FactorType factorType, size_t nGramOrder) { - const char *SepString = " \t\n"; cerr << "In LanguageModelKen::Load: nGramOrder = " << nGramOrder << "\n"; - - FactorCollection &factorCollection = FactorCollection::Instance(); - - m_factorType = factorType; - m_nGramOrder = nGramOrder; - - // get name of LM file and, if any, of the micro-macro map file - char *filenamesOrig = strdup(filePath.c_str()); - char *filenames = filenamesOrig; - m_filePath = strsep(&filenames, SepString); - - // Open the input file (possibly gzipped) - InputFileStream inp(m_filePath); - - if (filenames) { - // case LMfile + MAPfile: create an object of lmmacro class and load both LM file and map - cerr << "Loading LM file + MAP\n"; - m_mapFilePath = strsep(&filenames, SepString); - if (!FileExists(m_mapFilePath)) { - cerr << "ERROR: Map file <" << m_mapFilePath << "> does not exist\n"; - free(filenamesOrig); - return false; - } - InputFileStream inpMap(m_mapFilePath); - m_lmtb = new lmmacro(m_filePath, inp, inpMap); - - - } else { - // case (standard) LMfile only: create an object of lmtable - cerr << "Loading LM file (no MAP)\n"; - m_lmtb = (lmtable *)new lmtable; - - // Load the (possibly binary) model -#ifdef WIN32 - m_lmtb->load(inp); //don't use memory map -#else - if (m_filePath.compare(m_filePath.size()-3,3,".mm")==0) - m_lmtb->load(inp,m_filePath.c_str(),NULL,1); - else - m_lmtb->load(inp,m_filePath.c_str(),NULL,0); -#endif - - } - - m_lmtb_ng=new ngram(m_lmtb->getDict()); // ngram of words/micro tags - m_lmtb_size=m_lmtb->maxlevel(); - - // LM can be ok, just outputs warnings - - // Mauro: in the original, the following two instructions are wrongly switched: - m_unknownId = m_lmtb->getDict()->oovcode(); // at the level of micro tags - CreateFactors(factorCollection); - - VERBOSE(1, "Ken: m_unknownId=" << m_unknownId << std::endl); - - //install caches - m_lmtb->init_probcache(); - m_lmtb->init_statecache(); - m_lmtb->init_lmtcaches(m_lmtb->maxlevel()>2?m_lmtb->maxlevel()-1:2); - - if (m_lmtb_dub >0) m_lmtb->setlogOOVpenalty(m_lmtb_dub); - - free(filenamesOrig); - return true; -} - -void LanguageModelKen::CreateFactors(FactorCollection &factorCollection) -{ // add factors which have srilm id - // code copied & paste from SRI LM class. should do template function - std::map lmIdMap; - size_t maxFactorId = 0; // to create lookup vector later on - - dict_entry *entry; - dictionary_iter iter(m_lmtb->getDict()); // at the level of micro tags - while ( (entry = iter.next()) != NULL) - { - size_t factorId = factorCollection.AddFactor(Output, m_factorType, entry->word)->GetId(); - lmIdMap[factorId] = entry->code; - maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; - } - - size_t factorId; - - m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_); - factorId = m_sentenceStart->GetId(); - m_lmtb_sentenceStart=lmIdMap[factorId] = GetLmID(BOS_); - maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; - m_sentenceStartArray[m_factorType] = m_sentenceStart; - - m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_); - factorId = m_sentenceEnd->GetId(); - m_lmtb_sentenceEnd=lmIdMap[factorId] = GetLmID(EOS_); - maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; - m_sentenceEndArray[m_factorType] = m_sentenceEnd; - - // add to lookup vector in object - m_lmIdLookup.resize(maxFactorId+1); - - fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_unknownId); - - map::iterator iterMap; - for (iterMap = lmIdMap.begin() ; iterMap != lmIdMap.end() ; ++iterMap) - { - m_lmIdLookup[iterMap->first] = iterMap->second; - } - - -} - -int LanguageModelKen::GetLmID( const std::string &str ) const -{ - return m_lmtb->getDict()->encode( str.c_str() ); // at the level of micro tags + m_ngram = new lm::ngram::Model(filePath.c_str()); + return true; } + /* get score of n-gram. n-gram should not be bigger than m_nGramOrder + * Specific implementation can return State and len data to be used in hypothesis pruning + * \param contextFactor n-gram to be scored + * \param finalState state used by LM. Return arg + * \param len ??? + */ float LanguageModelKen::GetValue(const vector &contextFactor, State* finalState, unsigned int* len) const { - unsigned int dummy; - if (!len) { len = &dummy; } - FactorType factorType = GetFactorType(); - // set up context + FactorType factorType = GetFactorType(); size_t count = contextFactor.size(); - - m_lmtb_ng->size=0; - if (count< (size_t)(m_lmtb_size-1)) m_lmtb_ng->pushc(m_lmtb_sentenceEnd); - if (count< (size_t)m_lmtb_size) m_lmtb_ng->pushc(m_lmtb_sentenceStart); - - for (size_t i = 0 ; i < count ; i++) + assert(count <= GetNGramOrder()); + if (count == 0) { - //int lmId = GetLmID((*contextFactor[i])[factorType]); -#ifdef DEBUG - cout << "i=" << i << " -> " << (*contextFactor[i])[factorType]->GetString() << "\n"; -#endif - int lmId = GetLmID((*contextFactor[i])[factorType]->GetString()); - // cerr << (*contextFactor[i])[factorType]->GetString() << " = " << lmId; - m_lmtb_ng->pushc(lmId); + finalState = NULL; + return 0; } - - if (finalState){ - *finalState=(State *)m_lmtb->cmaxsuffptr(*m_lmtb_ng); - // back off stats not currently available - *len = 0; + + // set up context + const vector ngramId(count); + for (size_t i = 0 ; i < count - 1 ; i++) + { + const Factor *factor = contextFactor[i]->GetFactor(factorType); + const string &word = factor->GetString(); + + //ngramId[i] = StringToId(word); FOR_KEN } - float prob = m_lmtb->clprob(*m_lmtb_ng); - + float prob; + //prob = m_ngram.GetScore(ngramId); FOR_KEN return TransformLMScore(prob); } -bool DELETEMELMCacheCleanup(size_t sentences_done, size_t m_lmcache_cleanup_threshold) -{ - if (sentences_done==-1) return true; - if (m_lmcache_cleanup_threshold) - if (sentences_done % m_lmcache_cleanup_threshold == 0) - return true; - return false; -} - - -void LanguageModelKen::CleanUpAfterSentenceProcessing() -{ - const StaticData &staticData = StaticData::Instance(); - static int sentenceCount = 0; - sentenceCount++; - - size_t lmcache_cleanup_threshold = staticData.GetLMCacheCleanupThreshold(); - - if (DELETEMELMCacheCleanup(sentenceCount, lmcache_cleanup_threshold)){ - TRACE_ERR( "reset caches\n"); - m_lmtb->reset_caches(); - } -} - -void LanguageModelKen::InitializeBeforeSentenceProcessing(){ - //nothing to do -#ifdef TRACE_CACHE - m_lmtb->sentence_id++; -#endif -} - } diff --git a/moses/src/LanguageModelKen.h b/moses/src/LanguageModelKen.h index 58552a8b6..dfcf1820f 100644 --- a/moses/src/LanguageModelKen.h +++ b/moses/src/LanguageModelKen.h @@ -28,43 +28,19 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "TypeDef.h" #include "Util.h" #include "LanguageModelSingleFactor.h" - -class lmtable; // Ken lm table -class lmmacro; // Ken lm for macro tags -class ngram; +#include "../../kenlm/lm/ngram.hh" namespace Moses { class Phrase; /** Implementation of single factor LM using Ken's code. -* This is the default LM for Moses and is available from the same sourceforge repository */ class LanguageModelKen : public LanguageModelSingleFactor { protected: - std::vector m_lmIdLookup; - lmtable* m_lmtb; - ngram* m_lmtb_ng; + lm::ngram::Model *m_ngram; - int m_unknownId; - int m_lmtb_sentenceStart; //lmtb symbols to initialize ngram with - int m_lmtb_sentenceEnd; //lmt symbol to initialize ngram with - int m_lmtb_size; //max ngram stored in the table - int m_lmtb_dub; //dictionary upperboud - - std::string m_mapFilePath; - -// float GetValue(LmId wordId, ngram *context) const; - - void CreateFactors(FactorCollection &factorCollection); - int GetLmID( const std::string &str ) const; - - int GetLmID( const Factor *factor ) const{ - size_t factorId = factor->GetId(); - return ( factorId >= m_lmIdLookup.size()) ? m_unknownId : m_lmIdLookup[factorId]; - }; - public: LanguageModelKen(bool registerScore, ScoreIndexManager &scoreIndexManager, int dub); ~LanguageModelKen(); @@ -74,14 +50,11 @@ public: virtual float GetValue(const std::vector &contextFactor, State* finalState = NULL, unsigned int* len=0) const; - void CleanUpAfterSentenceProcessing(); - void InitializeBeforeSentenceProcessing(); + void CleanUpAfterSentenceProcessing() {} + void InitializeBeforeSentenceProcessing() {} - void set_dictionary_upperbound(int dub){ m_lmtb_size=dub ; -//m_lmtb->set_dictionary_upperbound(dub); }; }; -} #endif diff --git a/moses/src/TranslationOptionCollection.cpp b/moses/src/TranslationOptionCollection.cpp index 36b6cf37c..0d2c1192a 100644 --- a/moses/src/TranslationOptionCollection.cpp +++ b/moses/src/TranslationOptionCollection.cpp @@ -229,10 +229,10 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord,s isDigit = s.find_first_of("0123456789"); - if (isDigit == string::npos) - isDigit = 0; - else + if (isDigit == 1) isDigit = 1; + else + isDigit = 0; // modify the starting bitmap }