2010-09-22 02:43:29 +04:00
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
***********************************************************************/
2010-10-27 21:50:40 +04:00
# include <cstring>
2010-10-28 05:05:04 +04:00
# include <iostream>
2011-10-12 23:49:27 +04:00
# include <memory>
2015-03-28 16:09:03 +03:00
# include <cstdlib>
2013-02-05 03:10:12 +04:00
# include <boost/shared_ptr.hpp>
2015-02-02 17:25:02 +03:00
# include <boost/lexical_cast.hpp>
2013-02-05 03:10:12 +04:00
2010-10-28 05:05:04 +04:00
# include "lm/binary_format.hh"
2010-10-27 21:50:40 +04:00
# include "lm/enumerate_vocab.hh"
2011-10-13 16:33:05 +04:00
# include "lm/left.hh"
2010-11-06 03:40:16 +03:00
# include "lm/model.hh"
2014-01-13 18:37:05 +04:00
# include "util/exception.hh"
2015-02-02 17:25:02 +03:00
# include "util/tokenize_piece.hh"
2015-10-02 22:12:39 +03:00
# include "util/string_stream.hh"
2010-09-22 02:43:29 +04:00
2012-11-12 23:56:18 +04:00
# include "Ken.h"
# include "Base.h"
2013-05-24 21:02:49 +04:00
# include "moses/FF/FFState.h"
2012-11-12 23:56:18 +04:00
# include "moses/TypeDef.h"
# include "moses/Util.h"
# include "moses/FactorCollection.h"
# include "moses/Phrase.h"
# include "moses/InputFileStream.h"
# include "moses/StaticData.h"
# include "moses/ChartHypothesis.h"
2012-11-15 22:24:21 +04:00
# include "moses/Incremental.h"
2015-11-16 21:58:14 +03:00
# include "moses/Syntax/SHyperedge.h"
2014-11-04 16:13:56 +03:00
# include "moses/Syntax/SVertex.h"
2011-10-13 16:33:05 +04:00
2010-09-22 02:43:29 +04:00
using namespace std ;
2013-05-29 21:16:15 +04:00
namespace Moses
{
namespace
{
2010-11-06 03:40:16 +03:00
struct KenLMState : public FFState {
2011-02-24 16:14:42 +03:00
lm : : ngram : : State state ;
2015-10-16 15:53:33 +03:00
virtual size_t hash ( ) const {
size_t ret = hash_value ( state ) ;
return ret ;
2015-10-08 19:14:08 +03:00
}
2015-10-16 15:53:33 +03:00
virtual bool operator = = ( const FFState & o ) const {
const KenLMState & other = static_cast < const KenLMState & > ( o ) ;
bool ret = state = = other . state ;
2015-10-14 13:16:37 +03:00
return ret ;
2015-10-08 20:14:52 +03:00
}
2015-10-08 19:14:08 +03:00
2010-11-06 03:40:16 +03:00
} ;
2013-05-29 21:16:15 +04:00
class MappingBuilder : public lm : : EnumerateVocab
{
2011-10-12 23:49:27 +04:00
public :
MappingBuilder ( FactorCollection & factorCollection , std : : vector < lm : : WordIndex > & mapping )
: m_factorCollection ( factorCollection ) , m_mapping ( mapping ) { }
void Add ( lm : : WordIndex index , const StringPiece & str ) {
2011-10-13 17:32:14 +04:00
std : : size_t factorId = m_factorCollection . AddFactor ( str ) - > GetId ( ) ;
2011-10-12 23:49:27 +04:00
if ( m_mapping . size ( ) < = factorId ) {
// 0 is <unk> :-)
m_mapping . resize ( factorId + 1 ) ;
}
m_mapping [ factorId ] = index ;
}
private :
FactorCollection & m_factorCollection ;
std : : vector < lm : : WordIndex > & m_mapping ;
} ;
2013-08-27 23:55:07 +04:00
} // namespace
2013-09-27 12:35:24 +04:00
2016-02-20 03:07:48 +03:00
/**
 * (Re)loads the n-gram model from `file`.
 *
 * Clears the factor-id -> LM-word-index table, then constructs a new Model
 * whose vocabulary enumeration repopulates that table through a
 * MappingBuilder. KenLM messages go to std::cerr when m_verbosity >= 1 and
 * are disabled otherwise.
 *
 * @param file         path handed to the Model constructor
 * @param load_method  stored in the KenLM config's load_method field
 */
template <class Model> void LanguageModelKen<Model>::LoadModel(const std::string &file, util::LoadMethod load_method)
{
  m_lmIdLookup.clear();

  lm::ngram::Config config;
  config.messages = (this->m_verbosity >= 1) ? &std::cerr : NULL;

  // `builder` only needs to live for the duration of the Model construction
  // below, which is where it is used through config.enumerate_vocab.
  MappingBuilder builder(FactorCollection::Instance(), m_lmIdLookup);
  config.enumerate_vocab = &builder;
  config.load_method = load_method;

  m_ngram.reset(new Model(file.c_str(), config));
  VERBOSE(2, " LanguageModelKen " << m_description << " reset to " << file << " \n ");
}
2016-02-20 03:07:48 +03:00
/**
 * Builds the feature from its configuration line and loads the model.
 *
 * @param line         configuration line forwarded to the LanguageModel base
 * @param file         model file passed to LoadModel()
 * @param factorType   factor the language model is applied to
 * @param load_method  loading strategy passed to LoadModel()
 */
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method)
  : LanguageModel(line),
    m_beginSentenceFactor(FactorCollection::Instance().AddFactor(BOS_)),
    m_factorType(factorType)
{
  ReadParameters();
  LoadModel(file, load_method);
}
2017-01-02 21:57:52 +03:00
/**
 * Default constructor: sets up the feature on factor 0 and reads its
 * parameters, but does not load any model (LoadModel() is not called).
 */
template <class Model> LanguageModelKen<Model>::LanguageModelKen()
  : LanguageModel(" KENLM "),
    m_beginSentenceFactor(FactorCollection::Instance().AddFactor(BOS_)),
    m_factorType(0)
{
  ReadParameters();
}
2013-02-02 00:23:36 +04:00
/**
 * Copy constructor. The base is re-created from the source's argument line;
 * the model handle, begin-of-sentence factor, factor type and id lookup
 * table are copied member by member.
 */
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const LanguageModelKen<Model> &copy_from)
  : LanguageModel(copy_from.GetArgLine())
  , m_ngram(copy_from.m_ngram)
  // TODO: don't copy this.
  , m_beginSentenceFactor(copy_from.m_beginSentenceFactor)
  , m_factorType(copy_from.m_factorType)
  , m_lmIdLookup(copy_from.m_lmIdLookup)
{
}
2013-08-27 23:55:07 +04:00
/**
 * Creates the state for an empty hypothesis: a newly allocated KenLMState
 * holding the model's begin-of-sentence state. The input is not used.
 *
 * @return raw pointer to the new state, allocated with `new`
 */
template <class Model> const FFState *LanguageModelKen<Model>::EmptyHypothesisState(const InputType & /*input*/) const
{
  KenLMState *initial = new KenLMState();
  initial->state = m_ngram->BeginSentenceState();
  return initial;
}
2013-05-29 21:16:15 +04:00
/**
 * Scores a phrase in isolation.
 *
 * @param phrase      phrase to score; may contain non-terminals
 * @param fullScore   out: transformed score of the whole phrase
 * @param ngramScore  out: transformed score accumulated after the first
 *                    min(Order() - 1, phrase size) positions
 * @param oovCount    out: number of terminals whose LM index is 0
 *
 * All three outputs are zeroed first and stay zero for an empty phrase.
 */
template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  const size_t phraseSize = phrase.GetSize();
  if (phraseSize == 0) return;

  // RuleScore needs a ChartState to write into; its content is not used here.
  lm::ngram::ChartState unusedChartState;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, unusedChartState);

  // A leading begin-of-sentence factor is consumed through BeginSentence()
  // rather than being scored as an ordinary terminal.
  size_t pos = 0;
  if (phrase.GetWord(0).GetFactor(m_factorType) == m_beginSentenceFactor) {
    scorer.BeginSentence();
    pos = 1;
  }

  const size_t contextSize = m_ngram->Order() - 1;
  const size_t boundary = std::min(contextSize, phraseSize);

  // First stretch: positions before the n-gram boundary.
  while (pos < boundary) {
    const Word &word = phrase.GetWord(pos);
    if (word.IsNonTerminal()) {
      // A non-terminal ends the current run: bank its score and start over.
      fullScore += scorer.Finish();
      scorer.Reset();
    } else {
      const lm::WordIndex index = TranslateID(word);
      scorer.Terminal(index);
      if (index == 0) ++oovCount;
    }
    ++pos;
  }
  const float scoreBeforeBoundary = fullScore + scorer.Finish();

  // Second stretch: the remainder of the phrase, handled the same way.
  while (pos < phraseSize) {
    const Word &word = phrase.GetWord(pos);
    if (word.IsNonTerminal()) {
      fullScore += scorer.Finish();
      scorer.Reset();
    } else {
      const lm::WordIndex index = TranslateID(word);
      scorer.Terminal(index);
      if (index == 0) ++oovCount;
    }
    ++pos;
  }
  fullScore += scorer.Finish();

  ngramScore = TransformLMScore(fullScore - scoreBeforeBoundary);
  fullScore = TransformLMScore(fullScore);
}
2014-07-10 02:41:08 +04:00
/**
 * Scores the target words covered by `hypo`, starting from the previous
 * LM state.
 *
 * @param hypo  hypothesis whose current target range is scored
 * @param ps    previous state; downcast to KenLMState without a runtime check
 * @param out   receives the transformed score (plus a 0 OOV slot when the
 *              OOV feature is enabled)
 * @return      newly allocated KenLMState, released from the auto_ptr to
 *              the caller
 */
template <class Model> FFState *LanguageModelKen<Model>::EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
{
  const lm::ngram::State &prevState = static_cast<const KenLMState &>(*ps).state;
  std::auto_ptr<KenLMState> result(new KenLMState());

  // Nothing was added on the target side: carry the incoming state forward.
  if (hypo.GetCurrTargetLength() == 0) {
    result->state = prevState;
    return result.release();
  }

  // Half-open range [first, stop) of target positions, STL style.
  const std::size_t first = hypo.GetCurrTargetWordsRange().GetStartPos();
  const std::size_t stop = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
  // Only the first Order() - 1 words are scored one at a time.
  const std::size_t scoredStop = std::min(stop, first + m_ngram->Order() - 1);

  // Two state buffers are alternated: `cur` holds the latest output state,
  // `next` is where the following Score() call writes.
  typename Model::State scratch;
  typename Model::State *cur = &result->state;
  typename Model::State *next = &scratch;

  std::size_t pos = first;
  float score = m_ngram->Score(prevState, TranslateID(hypo.GetWord(pos)), *cur);
  for (++pos; pos < scoredStop; ++pos) {
    score += m_ngram->Score(*cur, TranslateID(hypo.GetWord(pos)), *next);
    std::swap(cur, next);
  }

  if (hypo.IsSourceCompleted()) {
    // Score end of sentence.
    std::vector<lm::WordIndex> context(m_ngram->Order() - 1);
    const lm::WordIndex *contextEnd = LastIDs(hypo, &context.front());
    score += m_ngram->FullScoreForgotState(&context.front(), contextEnd, m_ngram->GetVocabulary().EndSentence(), result->state).prob;
  } else if (scoredStop < stop) {
    // Get state after adding a long phrase.
    std::vector<lm::WordIndex> context(m_ngram->Order() - 1);
    const lm::WordIndex *contextEnd = LastIDs(hypo, &context.front());
    m_ngram->GetState(&context.front(), contextEnd, result->state);
  } else if (cur != &result->state) {
    // Short enough phrase that we can just reuse the state; it ended up in
    // the scratch buffer, so copy it into the result.
    result->state = *cur;
  }

  score = TransformLMScore(score);

  if (OOVFeatureEnabled()) {
    std::vector<float> scores(2);
    scores[0] = score;
    scores[1] = 0.0;
    out->PlusEquals(this, scores);
  } else {
    out->PlusEquals(this, score);
  }

  return result.release();
}
2013-05-29 21:16:15 +04:00
class LanguageModelChartStateKenLM : public FFState
{
public :
LanguageModelChartStateKenLM ( ) { }
2011-10-12 23:49:27 +04:00
2013-05-29 21:16:15 +04:00
const lm : : ngram : : ChartState & GetChartState ( ) const {
return m_state ;
}
lm : : ngram : : ChartState & GetChartState ( ) {
return m_state ;
}
2011-10-12 23:49:27 +04:00
2015-10-16 15:53:33 +03:00
size_t hash ( ) const {
size_t ret = hash_value ( m_state ) ;
return ret ;
2015-10-08 19:14:08 +03:00
}
2015-10-16 15:53:33 +03:00
virtual bool operator = = ( const FFState & o ) const {
const LanguageModelChartStateKenLM & other = static_cast < const LanguageModelChartStateKenLM & > ( o ) ;
bool ret = m_state = = other . m_state ;
return ret ;
2015-10-08 20:25:10 +03:00
}
2015-10-08 19:14:08 +03:00
2013-05-29 21:16:15 +04:00
private :
lm : : ngram : : ChartState m_state ;
2011-10-12 23:49:27 +04:00
} ;
2014-07-10 02:54:16 +04:00
/**
 * Scores the target side of a chart hypothesis's rule.
 *
 * Terminals are scored directly; each non-terminal contributes the chart
 * state stored by the corresponding previous hypothesis under `featureID`.
 * The first score this feature has in the translation option is subtracted
 * before the result is added to `accumulator`.
 *
 * @param hypo         chart hypothesis being evaluated
 * @param featureID    index used to fetch this feature's state from
 *                     previous hypotheses
 * @param accumulator  receives the score (plus a 0 OOV slot when the OOV
 *                     feature is enabled)
 * @return             newly allocated LanguageModelChartStateKenLM
 */
template <class Model> FFState *LanguageModelKen<Model>::EvaluateWhenApplied(const ChartHypothesis &hypo, int featureID, ScoreComponentCollection *accumulator) const
{
  LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM();
  lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState->GetChartState());

  const TargetPhrase &target = hypo.GetCurrTargetPhrase();
  const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
    target.GetAlignNonTerm().GetNonTermIndexMap();
  const size_t size = target.GetSize();

  // Special cases for the first word.
  size_t pos = 0;
  if (size > 0) {
    const Word &firstWord = target.GetWord(0);
    if (firstWord.GetFactor(m_factorType) == m_beginSentenceFactor) {
      // Begin of sentence
      ruleScore.BeginSentence();
      pos = 1;
    } else if (firstWord.IsNonTerminal()) {
      // Non-terminal is first so we can copy instead of rescoring.
      const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[0]);
      const lm::ngram::ChartState &prevState =
        static_cast<const LanguageModelChartStateKenLM *>(prevHypo->GetFFState(featureID))->GetChartState();
      ruleScore.BeginNonTerminal(prevState);
      pos = 1;
    }
  }

  // Remaining symbols of the rule.
  for (; pos < size; ++pos) {
    const Word &word = target.GetWord(pos);
    if (!word.IsNonTerminal()) {
      ruleScore.Terminal(TranslateID(word));
      continue;
    }
    const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[pos]);
    const lm::ngram::ChartState &prevState =
      static_cast<const LanguageModelChartStateKenLM *>(prevHypo->GetFFState(featureID))->GetChartState();
    ruleScore.NonTerminal(prevState);
  }

  float score = TransformLMScore(ruleScore.Finish());
  score -= hypo.GetTranslationOption().GetScores().GetScoresForProducer(this)[0];

  if (OOVFeatureEnabled()) {
    std::vector<float> scores(2);
    scores[0] = score;
    scores[1] = 0.0;
    accumulator->PlusEquals(this, scores);
  } else {
    accumulator->PlusEquals(this, score);
  }
  return newState;
}
2014-11-04 16:13:56 +03:00
/**
 * Scores the target side of a syntax hyperedge's translation.
 *
 * Terminals are scored directly; each non-terminal contributes the chart
 * state stored on the corresponding tail vertex under `featureID`. The
 * first score this feature has in the target phrase's score breakdown is
 * subtracted before the result is added to `accumulator`.
 *
 * @param hyperedge    hyperedge being evaluated
 * @param featureID    index of this feature's state in a vertex's states
 * @param accumulator  receives the score (plus a 0 OOV slot when the OOV
 *                     feature is enabled)
 * @return             newly allocated LanguageModelChartStateKenLM
 */
template <class Model> FFState *LanguageModelKen<Model>::EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge, int featureID, ScoreComponentCollection *accumulator) const
{
  LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM();
  lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState->GetChartState());

  const TargetPhrase &target = *hyperedge.label.translation;
  const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
    target.GetAlignNonTerm().GetNonTermIndexMap2();
  const size_t size = target.GetSize();

  // Special cases for the first word.
  size_t pos = 0;
  if (size > 0) {
    const Word &firstWord = target.GetWord(0);
    if (firstWord.GetFactor(m_factorType) == m_beginSentenceFactor) {
      // Begin of sentence
      ruleScore.BeginSentence();
      pos = 1;
    } else if (firstWord.IsNonTerminal()) {
      // Non-terminal is first so we can copy instead of rescoring.
      const Syntax::SVertex *pred = hyperedge.tail[nonTermIndexMap[0]];
      const lm::ngram::ChartState &prevState =
        static_cast<const LanguageModelChartStateKenLM *>(pred->states[featureID])->GetChartState();
      ruleScore.BeginNonTerminal(prevState);
      pos = 1;
    }
  }

  // Remaining symbols of the rule.
  for (; pos < size; ++pos) {
    const Word &word = target.GetWord(pos);
    if (!word.IsNonTerminal()) {
      ruleScore.Terminal(TranslateID(word));
      continue;
    }
    const Syntax::SVertex *pred = hyperedge.tail[nonTermIndexMap[pos]];
    const lm::ngram::ChartState &prevState =
      static_cast<const LanguageModelChartStateKenLM *>(pred->states[featureID])->GetChartState();
    ruleScore.NonTerminal(prevState);
  }

  float score = TransformLMScore(ruleScore.Finish());
  score -= target.GetScoreBreakdown().GetScoresForProducer(this)[0];

  if (OOVFeatureEnabled()) {
    std::vector<float> scores(2);
    scores[0] = score;
    scores[1] = 0.0;
    accumulator->PlusEquals(this, scores);
  } else {
    accumulator->PlusEquals(this, score);
  }
  return newState;
}
2013-09-27 12:35:24 +04:00
/**
 * Hands the loaded model and the factor-id -> LM-word-index table to the
 * incremental search manager through its LMCallback().
 */
template <class Model> void LanguageModelKen<Model>::IncrementalCallback(Incremental::Manager &manager) const
{
  const Model &model = *m_ngram;
  manager.LMCallback(model, m_lmIdLookup);
}
2013-09-27 12:35:24 +04:00
2013-10-13 09:59:05 +04:00
/**
 * Writes, for every word of `phrase`, the matched n-gram length and the
 * transformed probability, scoring from the begin-of-sentence state.
 * Words whose LM index is 0 are additionally tagged as unknown.
 *
 * The opening and closing markers are always written as a pair. Previously
 * an empty phrase returned right after the opening marker, leaving the
 * output unterminated.
 *
 * @param out     stream the report is appended to
 * @param phrase  phrase to report on; may be empty
 */
template <class Model> void LanguageModelKen<Model>::ReportHistoryOrder(std::ostream &out, const Phrase &phrase) const
{
  out << " |lm=( ";

  // Two state buffers are alternated: FullScore() reads *state0 and writes
  // *state1, then the roles are swapped for the next word.
  typename Model::State aux_state;
  typename Model::State start_of_sentence_state = m_ngram->BeginSentenceState();
  typename Model::State *state0 = &start_of_sentence_state;
  typename Model::State *state1 = &aux_state;

  const std::size_t size = phrase.GetSize();
  for (std::size_t position = 0; position < size; ++position) {
    const lm::WordIndex idx = TranslateID(phrase.GetWord(position));
    lm::FullScoreReturn ret(m_ngram->FullScore(*state0, idx, *state1));
    if (position) out << " , ";
    out << static_cast<int>(ret.ngram_length) << " : " << TransformLMScore(ret.prob);
    if (idx == 0) out << " :unk ";
    std::swap(state0, state1);
  }

  out << " )| ";
}
2013-05-30 15:41:08 +04:00
/**
 * Reports whether this feature can be used with the given factor mask,
 * i.e. whether the bit for this model's factor type is set in `mask`.
 */
template <class Model>
bool LanguageModelKen<Model>::IsUseable(const FactorMask &mask) const
{
  return mask[m_factorType];
}
2015-02-14 21:01:12 +03:00
/* Explicit instantiations of LanguageModelKen.  They make the compiler emit
 * the non-inline member functions of each instantiation in this file;
 * without them, depending on the compiler, those functions may be missing
 * at link time.
 */
template class LanguageModelKen<lm::ngram::ProbingModel>;
template class LanguageModelKen<lm::ngram::RestProbingModel>;
template class LanguageModelKen<lm::ngram::TrieModel>;
template class LanguageModelKen<lm::ngram::ArrayTrieModel>;
template class LanguageModelKen<lm::ngram::QuantTrieModel>;
template class LanguageModelKen<lm::ngram::QuantArrayTrieModel>;
2015-02-06 14:53:25 +03:00
/**
 * Parses a KENLM configuration line and constructs the language model.
 *
 * The line is split on single spaces. The first token (the feature name)
 * is skipped and every remaining token must have the form `name=value`.
 * Recognised names:
 *   - factor   factor type the model applies to
 *   - order    accepted and ignored
 *   - path     model file
 *   - lazyken  deprecated; 0/false or 1/true
 *   - load     lazy, populate_or_lazy, populate_or_read (or populate),
 *              read, parallel_read
 * Any other argument is forwarded unchanged, as `name=value`, on the line
 * handed to the four-argument ConstructKenLM().
 *
 * Tokens are produced by splitting on ' ', so neither a name nor a value
 * can contain a space. The '=' separator and the keyword literals are
 * therefore written without surrounding blanks; padded literals could
 * never match. Diagnostic message text is left as it was.
 *
 * @param lineOrig  full configuration line
 * @return          model built by the four-argument ConstructKenLM()
 * @throws          via UTIL_THROW_IF2 / UTIL_THROW2 on a token without '='
 *                  or an unrecognised lazyken / load value
 */
LanguageModel *ConstructKenLM(const std::string &lineOrig)
{
  FactorType factorType = 0;
  std::string filePath;
  util::LoadMethod load_method = util::POPULATE_OR_READ;

  util::TokenIter<util::SingleCharacter, true> argument(lineOrig, ' ');
  ++argument; // KENLM

  // Line rebuilt for the base class, in the same `NAME key=value ...` shape
  // that is parsed here.
  util::StringStream line;
  line << "KENLM";

  for (; argument; ++argument) {
    const char *const tokenBegin = argument->data();
    const char *const tokenEnd = tokenBegin + argument->size();
    const char *equals = std::find(tokenBegin, tokenEnd, '=');
    UTIL_THROW_IF2(equals == tokenEnd,
                   " Expected = in KenLM argument " << *argument);

    StringPiece name(tokenBegin, equals - tokenBegin);
    StringPiece value(equals + 1, tokenEnd - equals - 1);

    if (name == "factor") {
      factorType = boost::lexical_cast<FactorType>(value);
    } else if (name == "order") {
      // Ignored
    } else if (name == "path") {
      filePath.assign(value.data(), value.size());
    } else if (name == "lazyken") {
      // deprecated: use load instead.
      if (value == "0" || value == "false") {
        load_method = util::POPULATE_OR_READ;
      } else if (value == "1" || value == "true") {
        load_method = util::LAZY;
      } else {
        UTIL_THROW2(" Can't parse lazyken argument " << value << " . Also, lazyken is deprecated. Use load with one of the arguments lazy, populate_or_lazy, populate_or_read, read, or parallel_read. ");
      }
    } else if (name == "load") {
      if (value == "lazy") {
        load_method = util::LAZY;
      } else if (value == "populate_or_lazy") {
        load_method = util::POPULATE_OR_LAZY;
      } else if (value == "populate_or_read" || value == "populate") {
        load_method = util::POPULATE_OR_READ;
      } else if (value == "read") {
        load_method = util::READ;
      } else if (value == "parallel_read") {
        load_method = util::PARALLEL_READ;
      } else {
        UTIL_THROW2(" Unknown KenLM load method " << value);
      }
    } else {
      // pass to base class to interpret
      line << " " << name << "=" << value;
    }
  }

  return ConstructKenLM(line.str(), filePath, factorType, load_method);
}
2016-02-20 03:07:48 +03:00
/**
 * Instantiates the LanguageModelKen specialisation that matches the model
 * file's binary format.
 *
 * @param line         configuration line forwarded to the model constructor
 * @param file         model file to inspect and load
 * @param factorType   factor the model applies to
 * @param load_method  loading strategy forwarded to the model constructor
 * @return             newly allocated language model
 * @throws             via UTIL_THROW2 when the file is recognised as binary
 *                     but reports a model type not handled below
 */
LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method)
{
  lm::ngram::ModelType model_type;

  // Not recognised as a binary file: use the probing model.
  if (!lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
    return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, load_method);
  }

  switch (model_type) {
  case lm::ngram::PROBING:
    return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, load_method);
  case lm::ngram::REST_PROBING:
    return new LanguageModelKen<lm::ngram::RestProbingModel>(line, file, factorType, load_method);
  case lm::ngram::TRIE:
    return new LanguageModelKen<lm::ngram::TrieModel>(line, file, factorType, load_method);
  case lm::ngram::QUANT_TRIE:
    return new LanguageModelKen<lm::ngram::QuantTrieModel>(line, file, factorType, load_method);
  case lm::ngram::ARRAY_TRIE:
    return new LanguageModelKen<lm::ngram::ArrayTrieModel>(line, file, factorType, load_method);
  case lm::ngram::QUANT_ARRAY_TRIE:
    return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(line, file, factorType, load_method);
  default:
    UTIL_THROW2(" Unrecognized kenlm model type " << model_type);
  }
}
2010-09-22 02:43:29 +04:00
}