From 34a0e9b3a2a13b890631b98ce540d0fb003d6e74 Mon Sep 17 00:00:00 2001 From: bojar Date: Tue, 20 Apr 2010 15:25:52 +0000 Subject: [PATCH] support for SRILM's factored language models, implemented by Michal Richter git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3147 1f5c12ca-751b-0410-a591-d2e778427230 --- configure.in | 4 +- moses/src/LanguageModelFactory.cpp | 6 + moses/src/LanguageModelParallelBackoff.cpp | 304 +++++++++++++++++++++ moses/src/LanguageModelParallelBackoff.h | 99 +++++++ moses/src/Makefile.am | 1 + moses/src/TypeDef.h | 1 + 6 files changed, 413 insertions(+), 2 deletions(-) create mode 100644 moses/src/LanguageModelParallelBackoff.cpp create mode 100644 moses/src/LanguageModelParallelBackoff.h diff --git a/configure.in b/configure.in index b324a77d4..72714b7b0 100644 --- a/configure.in +++ b/configure.in @@ -131,12 +131,12 @@ then if test "x$with_srilm_dynamic" != 'xyes' then - LIB_SRILM="-loolm -ldstruct -lmisc" + LIB_SRILM="-loolm -ldstruct -lmisc -lflm" # ROOT/lib/i686-m64/liboolm.a # ROOT/lib/i686-m64/libdstruct.a # ROOT/lib/i686-m64/libmisc.a MY_ARCH=`${with_srilm}/sbin/machine-type` - LDFLAGS="$LDFLAGS -L${with_srilm}/lib/${MY_ARCH}" + LDFLAGS="$LDFLAGS -L${with_srilm}/lib/${MY_ARCH} -L${with_srilm}/flm/obj/${MY_ARCH}" LIBS="$LIBS $LIB_SRILM" FMTLIBS="$FMTLIBS liboolm.a libdstruct.a libmisc.a" else diff --git a/moses/src/LanguageModelFactory.cpp b/moses/src/LanguageModelFactory.cpp index 55f4bc9ef..7ddff7c5a 100644 --- a/moses/src/LanguageModelFactory.cpp +++ b/moses/src/LanguageModelFactory.cpp @@ -43,6 +43,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "LanguageModelInternal.h" #include "LanguageModelSkip.h" #include "LanguageModelJoint.h" +#include "LanguageModelParallelBackoff.h" using namespace std; @@ -109,6 +110,11 @@ namespace LanguageModelFactory , scoreIndexManager); #endif break; + case ParallelBackoff: + #ifdef LM_SRI + lm = new LanguageModelParallelBackoff(true, scoreIndexManager); + #endif + break; case Internal: #ifdef LM_INTERNAL lm = new LanguageModelInternal(true, scoreIndexManager); diff --git a/moses/src/LanguageModelParallelBackoff.cpp b/moses/src/LanguageModelParallelBackoff.cpp new file mode 100644 index 000000000..f432e85aa --- /dev/null +++ b/moses/src/LanguageModelParallelBackoff.cpp @@ -0,0 +1,304 @@ +// $Id: LanguageModelJoint.cpp 886 2006-10-17 11:07:17Z hieuhoang1972 $ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once + +#include "LanguageModelParallelBackoff.h" +#include "File.h" +#include "TypeDef.h" +#include "Util.h" +#include "FNgramSpecs.h" +#include "FNgramStats.h" +#include "FactoredVocab.h" +#include "FNgram.h" +#include "wmatrix.h" +#include "Vocab.h" + +using namespace std; + +namespace Moses +{ + + LanguageModelParallelBackoff::LanguageModelParallelBackoff(bool registerScore, ScoreIndexManager &scoreIndexManager) + :LanguageModelMultiFactor(registerScore, scoreIndexManager) + { + /// + } + + LanguageModelParallelBackoff::~LanguageModelParallelBackoff() + { + /// + } + + +bool LanguageModelParallelBackoff::Load(const std::string &filePath, const std::vector &factorTypes, float weight, size_t nGramOrder) + { + + cerr << "Loading Language Model Parallel Backoff!!!\n"; + widMatrix = new ::WidMatrix(); + m_factorTypes = FactorMask(factorTypes); + m_srilmVocab = new ::FactoredVocab(); + //assert(m_srilmVocab != 0); + + fnSpecs = 0; + File f(filePath.c_str(),"r"); + fnSpecs = new ::FNgramSpecs(f,*m_srilmVocab, 0/*debug*/); + + cerr << "Loaded fnSpecs!\n"; + + m_srilmVocab->unkIsWord() = true; + m_srilmVocab->nullIsWord() = true; + m_srilmVocab->toLower() = false; + + FNgramStats *factoredStats = new FNgramStats(*m_srilmVocab, *fnSpecs); + + factoredStats->debugme(2); + + cerr << "Factored stats\n"; + + FNgram* fngramLM = new FNgram(*m_srilmVocab,*fnSpecs); + assert(fngramLM != 0); + + cerr << "FNgram object created\n"; + + fngramLM->skipOOVs = false; + + if (!factoredStats->read()) { + cerr << "error reading in counts in factor file\n"; + exit(1); + } + + cerr << "Factored stats read!\n"; + + factoredStats->estimateDiscounts(); + factoredStats->computeCardinalityFunctions(); + factoredStats->sumCounts(); + + cerr << "Another three operations made!\n"; + + if (!fngramLM->read()) { + cerr << "format error in lm file\n"; + exit(1); + } + + cerr << "fngramLM reads!\n"; + + m_weight = weight; + m_filePath = filePath; + m_nGramOrder= nGramOrder; + + m_factorTypesOrdered= factorTypes; + + m_unknownId = m_srilmVocab->unkIndex(); + + cerr << "m_unknowdId = " << m_unknownId << endl; + + m_srilmModel = fngramLM; + + cerr << "Create factors...\n"; + + CreateFactors(); + + cerr << "Factors created! \n"; + FactorCollection &factorCollection = FactorCollection::Instance(); + + /*for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index) + { + FactorType factorType = m_factorTypesOrdered[index]; + m_sentenceStartArray[factorType] = factorCollection.AddFactor(Output, factorType, BOS_); + + + m_sentenceEndArray[factorType] = factorCollection.AddFactor(Output, factorType, EOS_); + + //factorIdStart = m_sentenceStartArray[factorType]->GetId(); + //factorIdEnd = m_sentenceEndArray[factorType]->GetId(); + + for (size_t i = 0; i < 10; i++) + { + lmIdMap[factorIdStart * 10 + i] = GetLmID(BOS_); + lmIdMap[factorIdEnd * 10 + i] = GetLmID(EOS_); + } + + //(*lmIdMap)[factorIdStart * 10 + index] = GetLmID(BOS_); + //(*lmIdMap)[factorIdEnd * 10 + index] = GetLmID(EOS_); + + }*/ + + + } + +VocabIndex LanguageModelParallelBackoff::GetLmID( const std::string &str ) const +{ + return m_srilmVocab->getIndex( str.c_str(), m_unknownId ); +} + +VocabIndex LanguageModelParallelBackoff::GetLmID( const Factor *factor, size_t ft ) const +{ + + size_t factorId = factor->GetId(); + if ( lmIdMap->find( factorId * 10 + ft ) != lmIdMap->end() ) + { + return lmIdMap->find( factorId * 10 + ft )->second; + } + else + { + return m_unknownId; + } + +} + +void LanguageModelParallelBackoff::CreateFactors() +{ + + // add factors which have srilm id + FactorCollection &factorCollection = FactorCollection::Instance(); + + lmIdMap = new std::map(); + + + VocabString str; + VocabIter iter(*m_srilmVocab); + + iter.init(); + + size_t pomFactorTypeNum = 0; + + + while ( (str = iter.next()) != NULL) + { + + if ((str[0] < 'a' || str[0] > 'k') && str[0] != 'W') + { + continue; + } + VocabIndex lmId = GetLmID(str); + pomFactorTypeNum = str[0] - 'a'; + + size_t factorId = factorCollection.AddFactor(Output, m_factorTypesOrdered[pomFactorTypeNum], &(str[2]) )->GetId(); + (*lmIdMap)[factorId * 10 + pomFactorTypeNum] = lmId; + } + + size_t factorIdStart; + size_t factorIdEnd; + + // sentence markers + for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index) + { + FactorType factorType = m_factorTypesOrdered[index]; + m_sentenceStartArray[index] = factorCollection.AddFactor(Output, factorType, BOS_); + + + m_sentenceEndArray[index] = factorCollection.AddFactor(Output, factorType, EOS_); + + factorIdStart = m_sentenceStartArray[index]->GetId(); + factorIdEnd = m_sentenceEndArray[index]->GetId(); + + /*for (size_t i = 0; i < 10; i++) + { + lmIdMap[factorIdStart * 10 + i] = GetLmID(BOS_); + lmIdMap[factorIdEnd * 10 + i] = GetLmID(EOS_); + }*/ + + (*lmIdMap)[factorIdStart * 10 + index] = GetLmID(BOS_); + (*lmIdMap)[factorIdEnd * 10 + index] = GetLmID(EOS_); + + cerr << "BOS_:" << GetLmID(BOS_) << ", EOS_:" << GetLmID(EOS_) << endl; + + } + + m_wtid = GetLmID("W-"); + m_wtbid = GetLmID("W-"); + m_wteid = GetLmID("W-"); + + cerr << "W- index: " << m_wtid << endl; + cerr << "W- index: " << m_wtbid << endl; + cerr << "W- index: " << m_wteid << endl; + + +} + + float LanguageModelParallelBackoff::GetValue(const std::vector &contextFactor, State* finalState, unsigned int* len) const + { + + static WidMatrix widMatrix; + + for (int i=0;iwordProb( widMatrix, contextFactor.size() - 1, contextFactor.size() ); + return FloorScore(TransformLMScore(p)); + + /*if (contextFactor.size() == 0) + { + return 0; + } + + for (size_t currPos = 0 ; currPos < m_nGramOrder ; ++currPos ) + { + const Word &word = *contextFactor[currPos]; + + for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index) + { + FactorType factorType = m_factorTypesOrdered[index]; + const Factor *factor = word[factorType]; + + (*widMatrix)[currPos][index] = GetLmID(factor, index); + + } + + } + + float p = m_srilmModel->wordProb( (*widMatrix), m_nGramOrder - 1, m_nGramOrder ); + return FloorScore(TransformLMScore(p)); */ + } + +} + diff --git a/moses/src/LanguageModelParallelBackoff.h b/moses/src/LanguageModelParallelBackoff.h new file mode 100644 index 000000000..b1c7ca997 --- /dev/null +++ b/moses/src/LanguageModelParallelBackoff.h @@ -0,0 +1,99 @@ +// $Id: LanguageModelJoint.h 1897 2008-10-08 23:51:26Z hieuhoang1972 $ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once + +#include +#include +#include +#include + +#include "LanguageModelMultiFactor.h" +#include "Word.h" +#include "Factor.h" +#include "FactorTypeSet.h" +#include "FactorCollection.h" +#include "Phrase.h" + +#include "FNgramSpecs.h" +#include "FNgramStats.h" +#include "FactoredVocab.h" +#include "FNgram.h" +#include "wmatrix.h" +#include "Vocab.h" + + + + +using namespace std; + +//class FactoredVocab; +//class FNgram; +//class WidMatrix; + + +namespace Moses +{ + + + + +/** LM of multiple factors. A simple extension of single factor LM - factors backoff together. + * Rather slow as this uses string concatenation/split +*/ +class LanguageModelParallelBackoff : public LanguageModelMultiFactor +{ +protected: + std::vector m_factorTypesOrdered; + + FactoredVocab *m_srilmVocab; + FNgram *m_srilmModel; + VocabIndex m_unknownId; + VocabIndex m_wtid; + VocabIndex m_wtbid; + VocabIndex m_wteid; + FNgramSpecs* fnSpecs; + //std::vector m_lmIdLookup; + std::map* lmIdMap; + std::fstream* debugStream; + + WidMatrix *widMatrix; + +public: + LanguageModelParallelBackoff(bool registerScore, ScoreIndexManager &scoreIndexManager); + + + ~LanguageModelParallelBackoff(); + + bool Load(const std::string &filePath, const std::vector &factorTypes, float weight, size_t nGramOrder); + +VocabIndex GetLmID( const std::string &str ) const; + +VocabIndex GetLmID( const Factor *factor, FactorType ft ) const; + +void CreateFactors(); + +virtual float GetValue(const std::vector &contextFactor, State* finalState = 0, unsigned int* len = 0) const; + + +}; + +} diff --git a/moses/src/Makefile.am b/moses/src/Makefile.am index 06de1c8fe..85ee70f24 100644 --- a/moses/src/Makefile.am +++ b/moses/src/Makefile.am @@ -150,6 +150,7 @@ libmoses_la_SOURCES = \ LanguageModelRemote.cpp \ LanguageModelSingleFactor.cpp \ LanguageModelSkip.cpp \ + LanguageModelParallelBackoff.cpp \ TrellisPath.cpp \ TrellisPathCollection.cpp \ LexicalReordering.cpp \ diff --git a/moses/src/TypeDef.h b/moses/src/TypeDef.h index cf3a1f940..f53acb75b 100644 --- a/moses/src/TypeDef.h +++ b/moses/src/TypeDef.h @@ -151,6 +151,7 @@ enum LMImplementation ,Internal = 4 ,RandLM = 5 ,Remote = 6 + ,ParallelBackoff = 7 };