2010-07-18 03:23:09 +04:00
// $Id$
2010-04-20 19:25:52 +04:00
/***********************************************************************
Moses - factored phrase - based language decoder
Copyright ( C ) 2006 University of Edinburgh
This library is free software ; you can redistribute it and / or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation ; either
version 2.1 of the License , or ( at your option ) any later version .
This library is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
Lesser General Public License for more details .
You should have received a copy of the GNU Lesser General Public
License along with this library ; if not , write to the Free Software
Foundation , Inc . , 51 Franklin Street , Fifth Floor , Boston , MA 02110 - 1301 USA
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2012-11-14 18:18:53 +04:00
# include "ParallelBackoff.h"
2011-11-18 18:10:15 +04:00
# include <vector>
# include <string>
# include <sstream>
# include <fstream>
2012-11-14 18:18:53 +04:00
# include "MultiFactor.h"
# include "moses/Word.h"
# include "moses/Factor.h"
# include "moses/FactorTypeSet.h"
# include "moses/FactorCollection.h"
# include "moses/Phrase.h"
# include "moses/TypeDef.h"
# include "moses/Util.h"
2011-11-18 18:10:15 +04:00
2013-08-28 20:39:47 +04:00
// By default, SRILM defines a function called zopen.
2013-09-27 12:35:24 +04:00
//
2013-08-28 20:39:47 +04:00
// However, on Mac OS X (and possibly other BSDs),
// <stdio.h> already defines a zopen function.
//
// To resolve this conflict, SRILM checks to see if HAVE_ZOPEN is defined.
// If it is, SRILM will rename its zopen function as my_zopen.
//
2013-09-27 12:35:24 +04:00
// So, before importing any SRILM headers,
2013-08-28 20:39:47 +04:00
// it is important to define HAVE_ZOPEN if we are on an Apple OS:
//
# ifdef __APPLE__
# define HAVE_ZOPEN
# endif
2010-04-20 19:25:52 +04:00
# include "FNgramSpecs.h"
# include "FNgramStats.h"
# include "FactoredVocab.h"
# include "FNgram.h"
# include "wmatrix.h"
# include "Vocab.h"
2011-11-18 18:10:15 +04:00
# include "File.h"
2010-04-20 19:25:52 +04:00
using namespace std ;
namespace Moses
{
2011-11-18 18:10:15 +04:00
namespace
{
class LanguageModelParallelBackoff : public LanguageModelMultiFactor
{
private :
std : : vector < FactorType > m_factorTypesOrdered ;
FactoredVocab * m_srilmVocab ;
FNgram * m_srilmModel ;
VocabIndex m_unknownId ;
VocabIndex m_wtid ;
VocabIndex m_wtbid ;
VocabIndex m_wteid ;
FNgramSpecs < FNgramCount > * fnSpecs ;
//std::vector<VocabIndex> m_lmIdLookup;
std : : map < size_t , VocabIndex > * lmIdMap ;
std : : fstream * debugStream ;
WidMatrix * widMatrix ;
public :
2013-02-21 02:44:11 +04:00
LanguageModelParallelBackoff ( const std : : string & line )
2013-10-29 22:59:53 +04:00
: LanguageModelMultiFactor ( line ) {
2013-06-10 21:11:55 +04:00
}
2013-02-21 02:44:11 +04:00
2011-11-18 18:10:15 +04:00
~ LanguageModelParallelBackoff ( ) ;
bool Load ( const std : : string & filePath , const std : : vector < FactorType > & factorTypes , size_t nGramOrder ) ;
VocabIndex GetLmID ( const std : : string & str ) const ;
VocabIndex GetLmID ( const Factor * factor , FactorType ft ) const ;
void CreateFactors ( ) ;
LMResult GetValueForgotState ( const std : : vector < const Word * > & contextFactor , FFState & outState ) const ;
const FFState * GetNullContextState ( ) const ;
const FFState * GetBeginSentenceState ( ) const ;
FFState * NewState ( const FFState * from ) const ;
} ;
2011-02-24 16:14:42 +03:00
LanguageModelParallelBackoff : : ~ LanguageModelParallelBackoff ( )
{
///
}
2010-04-20 19:25:52 +04:00
2010-08-10 17:12:00 +04:00
bool LanguageModelParallelBackoff : : Load ( const std : : string & filePath , const std : : vector < FactorType > & factorTypes , size_t nGramOrder )
2011-02-24 16:14:42 +03:00
{
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
cerr < < " Loading Language Model Parallel Backoff!!! \n " ;
widMatrix = new : : WidMatrix ( ) ;
m_factorTypes = FactorMask ( factorTypes ) ;
m_srilmVocab = new : : FactoredVocab ( ) ;
//assert(m_srilmVocab != 0);
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
fnSpecs = 0 ;
File f ( filePath . c_str ( ) , " r " ) ;
fnSpecs = new : : FNgramSpecs < FNgramCount > ( f , * m_srilmVocab , 0 /*debug*/ ) ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
cerr < < " Loaded fnSpecs! \n " ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
m_srilmVocab - > unkIsWord ( ) = true ;
m_srilmVocab - > nullIsWord ( ) = true ;
m_srilmVocab - > toLower ( ) = false ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
FNgramStats * factoredStats = new FNgramStats ( * m_srilmVocab , * fnSpecs ) ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
factoredStats - > debugme ( 2 ) ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
cerr < < " Factored stats \n " ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
FNgram * fngramLM = new FNgram ( * m_srilmVocab , * fnSpecs ) ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
cerr < < " FNgram object created \n " ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
fngramLM - > skipOOVs = false ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
if ( ! factoredStats - > read ( ) ) {
cerr < < " error reading in counts in factor file \n " ;
exit ( 1 ) ;
}
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
cerr < < " Factored stats read! \n " ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
factoredStats - > estimateDiscounts ( ) ;
factoredStats - > computeCardinalityFunctions ( ) ;
factoredStats - > sumCounts ( ) ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
cerr < < " Another three operations made! \n " ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
if ( ! fngramLM - > read ( ) ) {
cerr < < " format error in lm file \n " ;
exit ( 1 ) ;
}
cerr < < " fngramLM reads! \n " ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
m_filePath = filePath ;
m_nGramOrder = nGramOrder ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
m_factorTypesOrdered = factorTypes ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
m_unknownId = m_srilmVocab - > unkIndex ( ) ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
cerr < < " m_unknowdId = " < < m_unknownId < < endl ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
m_srilmModel = fngramLM ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
cerr < < " Create factors... \n " ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
CreateFactors ( ) ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
cerr < < " Factors created! \n " ;
2011-02-24 19:17:38 +03:00
//FactorCollection &factorCollection = FactorCollection::Instance();
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
/*for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index)
{
FactorType factorType = m_factorTypesOrdered [ index ] ;
m_sentenceStartArray [ factorType ] = factorCollection . AddFactor ( Output , factorType , BOS_ ) ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
m_sentenceEndArray [ factorType ] = factorCollection . AddFactor ( Output , factorType , EOS_ ) ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
//factorIdStart = m_sentenceStartArray[factorType]->GetId();
//factorIdEnd = m_sentenceEndArray[factorType]->GetId();
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
for ( size_t i = 0 ; i < 10 ; i + + )
{
lmIdMap [ factorIdStart * 10 + i ] = GetLmID ( BOS_ ) ;
lmIdMap [ factorIdEnd * 10 + i ] = GetLmID ( EOS_ ) ;
}
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
//(*lmIdMap)[factorIdStart * 10 + index] = GetLmID(BOS_);
//(*lmIdMap)[factorIdEnd * 10 + index] = GetLmID(EOS_);
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
} */
return true ;
}
2010-04-20 19:25:52 +04:00
/**
 * Looks a surface string up in the SRILM vocabulary.
 *
 * @param str string to look up
 * @return whatever m_srilmVocab->getIndex() yields for str when given
 *         m_unknownId as its second argument
 */
VocabIndex LanguageModelParallelBackoff::GetLmID(const std::string &str) const
{
  const char *const token = str.c_str();
  const VocabIndex fallbackId = m_unknownId;
  return m_srilmVocab->getIndex(token, fallbackId);
}
/**
 * Translates a Moses factor into its SRILM vocabulary index.
 *
 * @param factor factor to translate; must not be null (it is dereferenced)
 * @param ft     position of the factor's type within m_factorTypesOrdered,
 *               i.e. the same position CreateFactors() used when it built
 *               the map key
 * @return the mapped SRILM index, or m_unknownId when the pair was never
 *         registered
 */
VocabIndex LanguageModelParallelBackoff::GetLmID(const Factor *factor, size_t ft) const
{
  // Key layout matches CreateFactors(): factor id * 10 + factor position.
  const size_t key = factor->GetId() * 10 + ft;

  // One lookup instead of find-then-find-again.
  const std::map<size_t, VocabIndex>::iterator hit = lmIdMap->find(key);
  if (hit == lmIdMap->end()) {
    return m_unknownId;
  }
  return hit->second;
}
void LanguageModelParallelBackoff : : CreateFactors ( )
2011-02-24 16:14:42 +03:00
{
// add factors which have srilm id
FactorCollection & factorCollection = FactorCollection : : Instance ( ) ;
lmIdMap = new std : : map < size_t , VocabIndex > ( ) ;
VocabString str ;
VocabIter iter ( * m_srilmVocab ) ;
iter . init ( ) ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
size_t pomFactorTypeNum = 0 ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
while ( ( str = iter . next ( ) ) ! = NULL ) {
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
if ( ( str [ 0 ] < ' a ' | | str [ 0 ] > ' k ' ) & & str [ 0 ] ! = ' W ' ) {
continue ;
}
VocabIndex lmId = GetLmID ( str ) ;
pomFactorTypeNum = str [ 0 ] - ' a ' ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
size_t factorId = factorCollection . AddFactor ( Output , m_factorTypesOrdered [ pomFactorTypeNum ] , & ( str [ 2 ] ) ) - > GetId ( ) ;
( * lmIdMap ) [ factorId * 10 + pomFactorTypeNum ] = lmId ;
}
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
size_t factorIdStart ;
size_t factorIdEnd ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
// sentence markers
for ( size_t index = 0 ; index < m_factorTypesOrdered . size ( ) ; + + index ) {
FactorType factorType = m_factorTypesOrdered [ index ] ;
2013-02-21 15:30:54 +04:00
m_sentenceStartWord [ index ] = factorCollection . AddFactor ( Output , factorType , BOS_ ) ;
2010-04-20 19:25:52 +04:00
2013-02-21 15:30:54 +04:00
m_sentenceEndWord [ index ] = factorCollection . AddFactor ( Output , factorType , EOS_ ) ;
2010-04-20 19:25:52 +04:00
2013-02-21 15:30:54 +04:00
factorIdStart = m_sentenceStartWord [ index ] - > GetId ( ) ;
factorIdEnd = m_sentenceEndWord [ index ] - > GetId ( ) ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
/*for (size_t i = 0; i < 10; i++)
{
lmIdMap [ factorIdStart * 10 + i ] = GetLmID ( BOS_ ) ;
lmIdMap [ factorIdEnd * 10 + i ] = GetLmID ( EOS_ ) ;
} */
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
( * lmIdMap ) [ factorIdStart * 10 + index ] = GetLmID ( BOS_ ) ;
( * lmIdMap ) [ factorIdEnd * 10 + index ] = GetLmID ( EOS_ ) ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
cerr < < " BOS_: " < < GetLmID ( BOS_ ) < < " , EOS_: " < < GetLmID ( EOS_ ) < < endl ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
}
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
m_wtid = GetLmID ( " W-<unk> " ) ;
m_wtbid = GetLmID ( " W-<s> " ) ;
m_wteid = GetLmID ( " W-</s> " ) ;
2010-04-20 19:25:52 +04:00
2011-02-24 16:14:42 +03:00
cerr < < " W-<unk> index: " < < m_wtid < < endl ;
cerr < < " W-<s> index: " < < m_wtbid < < endl ;
cerr < < " W-</s> index: " < < m_wteid < < endl ;
2010-04-20 19:25:52 +04:00
}
2011-03-08 02:21:09 +03:00
/**
 * Scores the last word of contextFactor given the preceding words.
 *
 * Each context word becomes one row of an SRILM wid matrix: column 0 holds a
 * word-level tag (sentence start, sentence end, or the generic unknown tag)
 * and column j + 1 holds the SRILM id of the word's j-th configured factor.
 *
 * @param contextFactor words of the n-gram, oldest first; the last one is
 *                      the word being scored
 * @return score: FloorScore(TransformLMScore(wordProb(...)));
 *         unknown: whether column 0 of the last row equals m_unknownId.
 *         For an empty context the result is score 0 and unknown false,
 *         mirroring the retired implementation, which returned 0 there.
 *
 * The second parameter (the outgoing state) is not used.
 */
LMResult LanguageModelParallelBackoff::GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState & /*outState */) const
{
  LMResult ret;

  // Without this guard "size() - 1" below wraps around for an empty context.
  const size_t numWords = contextFactor.size();
  if (numWords == 0) {
    ret.score = 0;
    ret.unknown = false;
    return ret;
  }

  // Scratch matrix kept in static storage and reused by every call.
  // NOTE(review): concurrent callers would share this buffer.
  static WidMatrix wids;

  const size_t numFactors = m_factorTypesOrdered.size();

  // SRILM ids of the first factor of the sentence-boundary words; they do not
  // change while the rows are filled, so look them up once.
  const VocabIndex bosFactorLmId = GetLmID(m_sentenceStartWord[0], 0);
  const VocabIndex eosFactorLmId = GetLmID(m_sentenceEndWord[0], 0);

  for (size_t i = 0; i < numWords; i++) {
    ::memset(wids[i], 0, (numFactors + 1) * sizeof(VocabIndex));

    const Word &word = *contextFactor[i];
    for (size_t j = 0; j < numFactors; j++) {
      const Factor *factor = word[m_factorTypesOrdered[j]];
      if (factor == NULL)
        wids[i][j + 1] = 0;
      else
        wids[i][j + 1] = GetLmID(factor, j);
    }

    // Column 0: derive the word-level tag from the first factor.
    if (wids[i][1] == bosFactorLmId) {
      wids[i][0] = m_wtbid;
    } else if (wids[i][1] == eosFactorLmId) {
      wids[i][0] = m_wteid;
    } else {
      wids[i][0] = m_wtid;
    }
  }

  ret.score = m_srilmModel->wordProb(wids, numWords - 1, numWords);
  ret.score = FloorScore(TransformLMScore(ret.score));
  ret.unknown = (wids[numWords - 1][0] == m_unknownId);
  return ret;
}
// The old version did not initialize finalState like it should. Technically that makes the behavior undefined, so it's not clear what else to do here.
2011-02-24 19:17:38 +03:00
/**
 * This model produces no state objects.
 *
 * @return always a null pointer; the argument is ignored
 */
FFState *LanguageModelParallelBackoff::NewState(const FFState * /*from*/) const
{
  FFState *const noState = NULL;
  return noState;
}
2011-08-24 20:44:59 +04:00
/**
 * State for an empty context.
 *
 * @return always a null pointer
 */
const FFState *LanguageModelParallelBackoff::GetNullContextState() const
{
  const FFState *const noState = NULL;
  return noState;
}
2011-08-24 20:44:59 +04:00
/**
 * State at the beginning of a sentence.
 *
 * @return always a null pointer
 */
const FFState *LanguageModelParallelBackoff::GetBeginSentenceState() const
{
  const FFState *const noState = NULL;
  return noState;
}
2011-11-18 18:10:15 +04:00
}
2010-04-20 19:25:52 +04:00
}