diff --git a/misc/.cproject b/misc/.cproject index 653abad65..405dd3c13 100644 --- a/misc/.cproject +++ b/misc/.cproject @@ -19,35 +19,37 @@ - + - - - - - - @@ -59,15 +61,15 @@ - - @@ -76,6 +78,8 @@ + + @@ -93,35 +97,37 @@ - + - - - - - - @@ -133,22 +139,22 @@ - - - @@ -157,6 +163,8 @@ + + diff --git a/misc/.project b/misc/.project index 350898510..e67d03462 100644 --- a/misc/.project +++ b/misc/.project @@ -11,14 +11,6 @@ org.eclipse.cdt.managedbuilder.core.genmakebuilder clean,full,incremental, - - org.eclipse.cdt.make.core.cleanBuildTarget - clean - - - org.eclipse.cdt.make.core.enableCleanBuild - true - ?name? @@ -28,44 +20,52 @@ true - org.eclipse.cdt.make.core.stopOnError - true - - - org.eclipse.cdt.make.core.buildCommand - make - - - org.eclipse.cdt.make.core.contents - org.eclipse.cdt.make.core.activeConfigSettings - - - org.eclipse.cdt.make.core.buildLocation - ${workspace_loc:/misc/Debug} - - - org.eclipse.cdt.make.core.useDefaultBuildCmd - true - - - org.eclipse.cdt.make.core.enableAutoBuild - false - - - org.eclipse.cdt.make.core.enableFullBuild - true + org.eclipse.cdt.make.core.autoBuildTarget + all org.eclipse.cdt.make.core.buildArguments + + org.eclipse.cdt.make.core.buildCommand + make + + + org.eclipse.cdt.make.core.buildLocation + ${workspace_loc:/misc/Release} + + + org.eclipse.cdt.make.core.cleanBuildTarget + clean + + + org.eclipse.cdt.make.core.contents + org.eclipse.cdt.make.core.activeConfigSettings + + + org.eclipse.cdt.make.core.enableAutoBuild + false + + + org.eclipse.cdt.make.core.enableCleanBuild + true + + + org.eclipse.cdt.make.core.enableFullBuild + true + org.eclipse.cdt.make.core.fullBuildTarget all - org.eclipse.cdt.make.core.autoBuildTarget - all + org.eclipse.cdt.make.core.stopOnError + true + + + org.eclipse.cdt.make.core.useDefaultBuildCmd + true diff --git a/moses-cmd/.cproject b/moses-cmd/.cproject index 6da3c27a6..cc8bee3fd 100644 --- a/moses-cmd/.cproject +++ b/moses-cmd/.cproject @@ -36,6 +36,7 @@ + + + + @@ -84,33 +90,35 @@ - + - - - - @@ -123,6 +131,10 @@ + + + + diff --git a/moses/src/LanguageModelFactory.cpp b/moses/src/LanguageModelFactory.cpp index b259eef5f..4684d6d43 100644 --- a/moses/src/LanguageModelFactory.cpp +++ b/moses/src/LanguageModelFactory.cpp @@ -32,6 +32,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #ifdef LM_IRST # include "LanguageModelIRST.h" #endif +#ifdef LM_RAND +# include "LanguageModelRandLM.h" +#endif #include "LanguageModelInternal.h" #include "LanguageModelSkip.h" @@ -44,7 +47,7 @@ namespace LanguageModelFactory { LanguageModel* CreateLanguageModel(LMImplementation lmImplementation - , const std::vector &factorTypes + , const std::vector &factorTypes , size_t nGramOrder , const std::string &languageModelFile , float weight @@ -54,6 +57,13 @@ namespace LanguageModelFactory LanguageModel *lm = NULL; switch (lmImplementation) { + case RandLM: + #ifdef LM_RAND + lm = new LanguageModelRandLM(true, + scoreIndexManager); + #endif + break; + case SRI: #ifdef LM_SRI lm = new LanguageModelSRI(true, scoreIndexManager); @@ -94,7 +104,7 @@ namespace LanguageModelFactory #endif break; } - + if (lm == NULL) { UserMessage::Add("Language model type unknown. Probably not compiled into library"); @@ -109,7 +119,7 @@ namespace LanguageModelFactory delete lm; lm = NULL; } - break; + break; case MultiFactor: if (! static_cast(lm)->Load(languageModelFile, factorTypes, weight, nGramOrder)) { @@ -119,7 +129,7 @@ namespace LanguageModelFactory break; } } - + return lm; } } diff --git a/moses/src/LanguageModelRandLM.cpp b/moses/src/LanguageModelRandLM.cpp new file mode 100644 index 000000000..805878e0c --- /dev/null +++ b/moses/src/LanguageModelRandLM.cpp @@ -0,0 +1,114 @@ +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include +#include +#include +#include + +#include "LanguageModelRandLM.h" +#include "FactorCollection.h" +#include "Phrase.h" +#include "InputFileStream.h" +#include "StaticData.h" + +namespace Moses +{ + +bool LanguageModelRandLM::Load(const std::string &filePath, FactorType factorType, float weight, + size_t nGramOrder) { + cerr << "Loading LanguageModelRandLM..." << endl; + FactorCollection &factorCollection = FactorCollection::Instance(); + m_filePath = filePath; + m_factorType = factorType; + m_weight = weight; + m_nGramOrder = nGramOrder; + int cache_MB = 50; // increase cache size + m_lm = randlm::RandLM::initRandLM(filePath, nGramOrder, cache_MB); + assert(m_lm != NULL); + // get special word ids + m_oov_id = m_lm->getWordID(m_lm->getOOV()); + CreateFactors(factorCollection); + return true; +} + +void LanguageModelRandLM::CreateFactors(FactorCollection &factorCollection) { // add factors which have randlm id + // code copied & paste from SRI LM class. should do template function + // first get all bf vocab in map + std::map randlm_ids_map; // map from factor id -> randlm id + size_t maxFactorId = 0; // to create lookup vector later on + for(std::map::const_iterator vIter = m_lm->vocabStart(); + vIter != m_lm->vocabEnd(); vIter++){ + // get word from randlm vocab and associate with (new) factor id + size_t factorId=factorCollection.AddFactor(Output,m_factorType,vIter->first)->GetId(); + randlm_ids_map[factorId] = vIter->second; + maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; + } + // add factors for BOS and EOS and store bf word ids + size_t factorId; + m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, m_lm->getBOS()); + factorId = m_sentenceStart->GetId(); + maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; + m_sentenceStartArray[m_factorType] = m_sentenceStart; + + m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, m_lm->getEOS()); + factorId = m_sentenceEnd->GetId(); + maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; + m_sentenceEndArray[m_factorType] = m_sentenceEnd; + + // add to lookup vector in object + m_randlm_ids_vec.resize(maxFactorId+1); + // fill with OOV code + fill(m_randlm_ids_vec.begin(), m_randlm_ids_vec.end(), m_oov_id); + + for (map::const_iterator iter = randlm_ids_map.begin(); + iter != randlm_ids_map.end() ; ++iter) + m_randlm_ids_vec[iter->first] = iter->second; + +} + +randlm::WordID LanguageModelRandLM::GetLmID( const std::string &str ) const { + return m_lm->getWordID(str); +} + +float LanguageModelRandLM::GetValue(const vector &contextFactor, + State* finalState, unsigned int* len) const { + unsigned int dummy; // is this needed ? + if (!len) { len = &dummy; } + FactorType factorType = GetFactorType(); + // set up context + randlm::WordID ngram[MAX_NGRAM_SIZE]; + int count = contextFactor.size(); + for (int i = 0 ; i < count ; i++) { + ngram[i] = GetLmID((*contextFactor[i])[factorType]); + //std::cerr << m_lm->getWord(ngram[i]) << " "; + } + int found = 0; + float logprob = FloorScore(TransformSRIScore(m_lm->getProb(&ngram[0], count, &found, finalState))); + *len = 0; // not available + //if (finalState) + // std::cerr << " = " << logprob << "(" << *finalState << ", " << *len <<")"<< std::endl; + //else + // std::cerr << " = " << logprob << std::endl; + return logprob; +} + +} + + diff --git a/moses/src/LanguageModelRandLM.h b/moses/src/LanguageModelRandLM.h new file mode 100644 index 000000000..015e1aad4 --- /dev/null +++ b/moses/src/LanguageModelRandLM.h @@ -0,0 +1,65 @@ +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once + +#include +#include +#include "Factor.h" +#include "Util.h" +#include "LanguageModelSingleFactor.h" +#include "RandLM.h" + +class randlm::RandLM; + +namespace Moses +{ +class Factor; +class Phrase; + +// RandLM wrapper (single factor LM) + +class LanguageModelRandLM : public LanguageModelSingleFactor { +public: + LanguageModelRandLM(bool registerScore, ScoreIndexManager &scoreIndexManager) + : LanguageModelSingleFactor(registerScore, scoreIndexManager), m_lm(0) {} + bool Load(const std::string &filePath, FactorType factorType, float weight, size_t nGramOrder); + virtual float GetValue(const std::vector &contextFactor, State* finalState = NULL, unsigned int* len=0) const; + ~LanguageModelRandLM() { + delete m_lm; + } + void CleanUpAfterSentenceProcessing() { + m_lm->clearCaches(); // clear caches + } + void InitializeBeforeSentenceProcessing() {} // nothing to do + protected: + std::vector m_randlm_ids_vec; + randlm::RandLM* m_lm; + randlm::WordID m_oov_id; + void CreateFactors(FactorCollection &factorCollection); + randlm::WordID GetLmID( const std::string &str ) const; + randlm::WordID GetLmID( const Factor *factor ) const{ + size_t factorId = factor->GetId(); + return ( factorId >= m_randlm_ids_vec.size()) ? m_oov_id : m_randlm_ids_vec[factorId]; + }; + +}; + +} + diff --git a/moses/src/Makefile.am b/moses/src/Makefile.am index 52c630f0e..70cf94b67 100644 --- a/moses/src/Makefile.am +++ b/moses/src/Makefile.am @@ -104,6 +104,10 @@ if IRST_LM libmoses_a_SOURCES += LanguageModelIRST.cpp endif +if RAND_LM +libmoses_a_SOURCES += LanguageModelRandLM.cpp +endif + if INTERNAL_LM libmoses_a_SOURCES += LanguageModelInternal.cpp \ NGramCollection.cpp \ diff --git a/moses/src/TypeDef.h b/moses/src/TypeDef.h index da704ec58..e986301d9 100644 --- a/moses/src/TypeDef.h +++ b/moses/src/TypeDef.h @@ -34,7 +34,7 @@ namespace Moses #ifndef BOS_ #define BOS_ "" //Beginning of sentence symbol #endif -#ifndef EOS_ +#ifndef EOS_ #define EOS_ "" //End of sentence symbol #endif @@ -55,7 +55,7 @@ const float LOWEST_SCORE = -100.0f; const float DEFAULT_BEAM_WIDTH = 0.00001f; const size_t DEFAULT_VERBOSE_LEVEL = 1; -///////////////////////////////////////////////// +///////////////////////////////////////////////// // for those using autoconf/automake #if HAVE_CONFIG_H #include "config.h" @@ -72,10 +72,14 @@ const size_t DEFAULT_VERBOSE_LEVEL = 1; # define LM_IRST 1 # endif -#endif -///////////////////////////////////////////////// +# ifdef HAVE_RANDLM +# define LM_RAND 1 +# endif -// enums. +#endif +///////////////////////////////////////////////// + +// enums. // must be 0, 1, 2, ..., unless otherwise stated // can only be 2 at the moment @@ -84,7 +88,7 @@ const int NUM_LANGUAGES = 2; const size_t MAX_NUM_FACTORS = 4; enum FactorDirection -{ +{ Input, //! Source factors Output //! Target factors }; @@ -114,21 +118,23 @@ namespace DistortionOrientationType { Monotone, //distinguish only between monotone and non-monotone as possible orientations Msd //further separate non-monotone into swapped and discontinuous - }; + }; } enum LMType -{ +{ SingleFactor ,MultiFactor }; enum LMImplementation -{ - SRI = 0 - ,IRST = 1 - ,Skip = 2 +{ + SRI = 0 + ,IRST = 1 + ,Skip = 2 ,Joint = 3 ,Internal = 4 + ,RandLM = 5 + }; @@ -148,13 +154,13 @@ enum XmlInputType }; enum DictionaryFind -{ +{ Best = 0 ,All = 1 }; enum SearchAlgorithm -{ +{ Normal = 0 ,CubePruning = 1 ,CubeGrowing = 2