// $Id$
// vim:tabstop=2

/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <string>
#include <cassert>
#include "PhraseDictionaryMemory.h"
#include "DecodeStepTranslation.h"
#include "DecodeStepGeneration.h"
#include "GenerationDictionary.h"
#include "DummyScoreProducers.h"
#include "StaticData.h"
#include "Util.h"
#include "FactorCollection.h"
#include "Timer.h"
#include "LanguageModelSingleFactor.h"
#include "LanguageModelMultiFactor.h"
#include "LanguageModelFactory.h"
#include "LexicalReordering.h"
#include "GlobalLexicalModel.h"
#include "SentenceStats.h"
#include "PhraseDictionaryTreeAdaptor.h"
#include "UserMessage.h"
#include "TranslationOption.h"
#include "DecodeGraph.h"
#include "InputFileStream.h"

using namespace std;

namespace Moses
{
static size_t CalcMax(size_t x, const vector<size_t>& y)
{
  size_t max = x;
  for (vector<size_t>::const_iterator i = y.begin(); i != y.end(); ++i)
    if (*i > max) max = *i;
  return max;
}

static size_t CalcMax(size_t x, const vector<size_t>& y, const vector<size_t>& z)
{
  size_t max = x;
  for (vector<size_t>::const_iterator i = y.begin(); i != y.end(); ++i)
    if (*i > max) max = *i;
  for (vector<size_t>::const_iterator i = z.begin(); i != z.end(); ++i)
    if (*i > max) max = *i;
  return max;
}
StaticData StaticData::s_instance;

StaticData::StaticData()
  : m_fLMsLoaded(false)
  , m_inputType(SentenceInput)
  , m_numInputScores(0)
  , m_distortionScoreProducer(0)
  , m_wpProducer(0)
  , m_isDetailedTranslationReportingEnabled(false)
  , m_onlyDistinctNBest(false)
  , m_computeLMBackoffStats(false)
  , m_factorDelimiter("|") // default delimiter between factors
  , m_isAlwaysCreateDirectTranslationOption(false)
  , m_sourceStartPosMattersForRecombination(false)
  , m_numLinkParams(1)
{
  m_maxFactorIdx[0] = 0;  // source side
  m_maxFactorIdx[1] = 0;  // target side

  // memory pools
  Phrase::InitializeMemPool();
}
bool StaticData::LoadData(Parameter *parameter)
{
  ResetUserTime();
  m_parameter = parameter;

  // verbose level
  m_verboseLevel = 1;
  if (m_parameter->GetParam("verbose").size() == 1)
  {
    m_verboseLevel = Scan<size_t>(m_parameter->GetParam("verbose")[0]);
  }

  // to cube or not to cube
  m_searchAlgorithm = (m_parameter->GetParam("search-algorithm").size() > 0) ?
    (SearchAlgorithm) Scan<size_t>(m_parameter->GetParam("search-algorithm")[0]) : Normal;

  if (m_searchAlgorithm == ChartDecoding)
    LoadChartDecodingParameters();
  else
    LoadPhraseBasedParameters();

  // input type has to be specified BEFORE loading the phrase tables!
  if (m_parameter->GetParam("inputtype").size())
    m_inputType = (InputTypeEnum) Scan<int>(m_parameter->GetParam("inputtype")[0]);
  std::string s_it = "text input";
  if (m_inputType == 1) { s_it = "confusion net"; }
  if (m_inputType == 2) { s_it = "word lattice"; }
  VERBOSE(2, "input type is: " << s_it << "\n");

  if (m_parameter->GetParam("recover-input-path").size()) {
    m_recoverPath = Scan<bool>(m_parameter->GetParam("recover-input-path")[0]);
    if (m_recoverPath && m_inputType == SentenceInput) {
      TRACE_ERR("--recover-input-path should only be used with confusion net or word lattice input!\n");
      m_recoverPath = false;
    }
  }

  // factor delimiter
  if (m_parameter->GetParam("factor-delimiter").size() > 0) {
    m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
  }
  // word-to-word alignment
  SetBooleanParameter(&m_UseAlignmentInfo, "use-alignment-info", false);
  SetBooleanParameter(&m_PrintAlignmentInfo, "print-alignment-info", false);
  SetBooleanParameter(&m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false);

  SetBooleanParameter(&m_outputHypoScore, "output-hypo-score", false);

  if (!m_UseAlignmentInfo && m_PrintAlignmentInfo) {
    TRACE_ERR("--print-alignment-info should only be used together with \"--use-alignment-info true\". Forcing it to false.\n");
    m_PrintAlignmentInfo = false;
  }
  if (!m_UseAlignmentInfo && m_PrintAlignmentInfoNbest) {
    TRACE_ERR("--print-alignment-info-in-n-best should only be used together with \"--use-alignment-info true\". Forcing it to false.\n");
    m_PrintAlignmentInfoNbest = false;
  }

  // n-best
  if (m_parameter->GetParam("n-best-list").size() >= 2)
  {
    m_nBestFilePath = m_parameter->GetParam("n-best-list")[0];
    m_nBestSize = Scan<size_t>(m_parameter->GetParam("n-best-list")[1]);
    m_onlyDistinctNBest = (m_parameter->GetParam("n-best-list").size() > 2
                           && m_parameter->GetParam("n-best-list")[2] == "distinct");
  }
  else if (m_parameter->GetParam("n-best-list").size() == 1) {
    UserMessage::Add(string("ERROR: wrong format for switch -n-best-list file size"));
    return false;
  }
  else
  {
    m_nBestSize = 0;
  }
  if (m_parameter->GetParam("n-best-factor").size() > 0)
  {
    m_nBestFactor = Scan<size_t>(m_parameter->GetParam("n-best-factor")[0]);
  }
  else {
    m_nBestFactor = 20;
  }
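
  // Illustrative [n-best-list] setting as parsed above (file name
  // hypothetical):
  //   [n-best-list]
  //   nbest.out 100 distinct
  // i.e. output file, list size, and an optional third "distinct" token that
  // keeps only pairwise-distinct target strings.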
  // word graph
  if (m_parameter->GetParam("output-word-graph").size() == 2)
    m_outputWordGraph = true;
  else
    m_outputWordGraph = false;

  // search graph
  if (m_parameter->GetParam("output-search-graph").size() > 0)
  {
    if (m_parameter->GetParam("output-search-graph").size() != 1) {
      UserMessage::Add(string("ERROR: wrong format for switch -output-search-graph file"));
      return false;
    }
    m_outputSearchGraph = true;
  }
  // ... in extended format
  else if (m_parameter->GetParam("output-search-graph-extended").size() > 0)
  {
    if (m_parameter->GetParam("output-search-graph-extended").size() != 1) {
      UserMessage::Add(string("ERROR: wrong format for switch -output-search-graph-extended file"));
      return false;
    }
    m_outputSearchGraph = true;
    m_outputSearchGraphExtended = true;
  }
  else
    m_outputSearchGraph = false;

#ifdef HAVE_PROTOBUF
  if (m_parameter->GetParam("output-search-graph-pb").size() > 0)
  {
    if (m_parameter->GetParam("output-search-graph-pb").size() != 1) {
      UserMessage::Add(string("ERROR: wrong format for switch -output-search-graph-pb path"));
      return false;
    }
    m_outputSearchGraphPB = true;
  }
  else
    m_outputSearchGraphPB = false;
#endif
  // include feature names in the n-best list
  SetBooleanParameter(&m_labeledNBestList, "labeled-n-best-list", true);
  // include word alignment in the n-best list
  SetBooleanParameter(&m_nBestIncludesAlignment, "include-alignment-in-n-best", false);
  // printing source phrase spans
  SetBooleanParameter(&m_reportSegmentation, "report-segmentation", false);
  // print all factors of output translations
  SetBooleanParameter(&m_reportAllFactors, "report-all-factors", false);
  // print all factors of output translations in the n-best list
  SetBooleanParameter(&m_reportAllFactorsNBest, "report-all-factors-in-n-best", false);

  // caching of translation options (only supported for sentence input)
  if (m_inputType == SentenceInput)
  {
    SetBooleanParameter(&m_useTransOptCache, "use-persistent-cache", true);
    m_transOptCacheMaxSize = (m_parameter->GetParam("persistent-cache-size").size() > 0)
      ? Scan<size_t>(m_parameter->GetParam("persistent-cache-size")[0]) : DEFAULT_MAX_TRANS_OPT_CACHE_SIZE;
  }
  else
  {
    m_useTransOptCache = false;
  }

  // input factors
  const vector<string>& inputFactorVector = m_parameter->GetParam("input-factors");
  for (size_t i = 0; i < inputFactorVector.size(); i++)
  {
    m_inputFactorOrder.push_back(Scan<FactorType>(inputFactorVector[i]));
  }
  if (m_inputFactorOrder.empty())
  {
    UserMessage::Add(string("no input factor specified in config file"));
    return false;
  }

  // output factors
  const vector<string>& outputFactorVector = m_parameter->GetParam("output-factors");
  for (size_t i = 0; i < outputFactorVector.size(); i++)
  {
    m_outputFactorOrder.push_back(Scan<FactorType>(outputFactorVector[i]));
  }
  if (m_outputFactorOrder.empty())
  { // default: output factor 0
    m_outputFactorOrder.push_back(0);
  }

  // source word deletion
  SetBooleanParameter(&m_wordDeletionEnabled, "phrase-drop-allowed", false);

  // disable discarding
  SetBooleanParameter(&m_disableDiscarding, "disable-discarding", false);
  // print all derivations
  SetBooleanParameter(&m_printAllDerivations, "print-all-derivations", false);
  // additional output
  SetBooleanParameter(&m_isDetailedTranslationReportingEnabled,
                      "translation-details", false);

  SetBooleanParameter(&m_computeLMBackoffStats, "lmstats", false);
  if (m_computeLMBackoffStats &&
      !m_isDetailedTranslationReportingEnabled) {
    VERBOSE(1, "-lmstats implies -translation-details, enabling" << std::endl);
    m_isDetailedTranslationReportingEnabled = true;
  }

  // score weights
  m_weightWordPenalty = Scan<float>(m_parameter->GetParam("weight-w")[0]);
  m_wpProducer = new WordPenaltyProducer(m_scoreIndexManager);
  m_allWeights.push_back(m_weightWordPenalty);

  m_weightUnknownWord = (m_parameter->GetParam("weight-u").size() > 0) ? Scan<float>(m_parameter->GetParam("weight-u")[0]) : 1;
  m_unknownWordPenaltyProducer = new UnknownWordPenaltyProducer(m_scoreIndexManager);
  m_allWeights.push_back(m_weightUnknownWord);

  // reordering constraints
  m_maxDistortion = (m_parameter->GetParam("distortion-limit").size() > 0) ?
    Scan<int>(m_parameter->GetParam("distortion-limit")[0])
    : -1;
  SetBooleanParameter(&m_reorderingConstraint, "monotone-at-punctuation", false);

  // settings for pruning
  m_maxHypoStackSize = (m_parameter->GetParam("stack").size() > 0)
    ? Scan<size_t>(m_parameter->GetParam("stack")[0]) : DEFAULT_MAX_HYPOSTACK_SIZE;
  m_minHypoStackDiversity = 0;
  if (m_parameter->GetParam("stack-diversity").size() > 0) {
    if (m_maxDistortion > 15) {
      UserMessage::Add("stack diversity > 0 is not allowed for distortion limits larger than 15");
      return false;
    }
    if (m_inputType == WordLatticeInput) {
      UserMessage::Add("stack diversity > 0 is not allowed for lattice input");
      return false;
    }
    m_minHypoStackDiversity = Scan<size_t>(m_parameter->GetParam("stack-diversity")[0]);
  }

  m_beamWidth = (m_parameter->GetParam("beam-threshold").size() > 0) ?
    TransformScore(Scan<float>(m_parameter->GetParam("beam-threshold")[0]))
    : TransformScore(DEFAULT_BEAM_WIDTH);
  m_earlyDiscardingThreshold = (m_parameter->GetParam("early-discarding-threshold").size() > 0) ?
    TransformScore(Scan<float>(m_parameter->GetParam("early-discarding-threshold")[0]))
    : TransformScore(DEFAULT_EARLY_DISCARDING_THRESHOLD);
  m_translationOptionThreshold = (m_parameter->GetParam("translation-option-threshold").size() > 0) ?
    TransformScore(Scan<float>(m_parameter->GetParam("translation-option-threshold")[0]))
    : TransformScore(DEFAULT_TRANSLATION_OPTION_THRESHOLD);

  m_maxNoTransOptPerCoverage = (m_parameter->GetParam("max-trans-opt-per-coverage").size() > 0)
    ? Scan<size_t>(m_parameter->GetParam("max-trans-opt-per-coverage")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE;
  m_maxNoPartTransOpt = (m_parameter->GetParam("max-partial-trans-opt").size() > 0)
    ? Scan<size_t>(m_parameter->GetParam("max-partial-trans-opt")[0]) : DEFAULT_MAX_PART_TRANS_OPT_SIZE;
  m_maxPhraseLength = (m_parameter->GetParam("max-phrase-length").size() > 0)
    ? Scan<size_t>(m_parameter->GetParam("max-phrase-length")[0]) : DEFAULT_MAX_PHRASE_LENGTH;
  m_cubePruningPopLimit = (m_parameter->GetParam("cube-pruning-pop-limit").size() > 0)
    ? Scan<size_t>(m_parameter->GetParam("cube-pruning-pop-limit")[0]) : DEFAULT_CUBE_PRUNING_POP_LIMIT;
  m_cubePruningDiversity = (m_parameter->GetParam("cube-pruning-diversity").size() > 0)
    ? Scan<size_t>(m_parameter->GetParam("cube-pruning-diversity")[0]) : DEFAULT_CUBE_PRUNING_DIVERSITY;

  // unknown word processing
  SetBooleanParameter(&m_dropUnknown, "drop-unknown", false);

  // minimum Bayes risk decoding
  SetBooleanParameter(&m_mbr, "minimum-bayes-risk", false);
  m_mbrSize = (m_parameter->GetParam("mbr-size").size() > 0) ?
    Scan<size_t>(m_parameter->GetParam("mbr-size")[0]) : 200;
  m_mbrScale = (m_parameter->GetParam("mbr-scale").size() > 0) ?
    Scan<float>(m_parameter->GetParam("mbr-scale")[0]) : 1.0f;

  // lattice MBR
  SetBooleanParameter(&m_useLatticeMBR, "lminimum-bayes-risk", false);
  if (m_useLatticeMBR)
    m_mbr = true; // lattice MBR implies MBR

  m_lmbrPruning = (m_parameter->GetParam("lmbr-pruning-factor").size() > 0) ?
    Scan<size_t>(m_parameter->GetParam("lmbr-pruning-factor")[0]) : 30;
  m_lmbrThetas = Scan<float>(m_parameter->GetParam("lmbr-thetas"));
  SetBooleanParameter(&m_useLatticeHypSetForLatticeMBR, "lattice-hypo-set", false);
  m_lmbrPrecision = (m_parameter->GetParam("lmbr-p").size() > 0) ?
    Scan<float>(m_parameter->GetParam("lmbr-p")[0]) : 0.8f;
  m_lmbrPRatio = (m_parameter->GetParam("lmbr-r").size() > 0) ?
    Scan<float>(m_parameter->GetParam("lmbr-r")[0]) : 0.6f;
  m_lmbrMapWeight = (m_parameter->GetParam("lmbr-map-weight").size() > 0) ?
    Scan<float>(m_parameter->GetParam("lmbr-map-weight")[0]) : 0.0f;

  // time-out: stop translating a sentence after the given number of seconds
  m_timeout_threshold = (m_parameter->GetParam("time-out").size() > 0) ?
    Scan<size_t>(m_parameter->GetParam("time-out")[0]) : -1;
  m_timeout = (GetTimeoutThreshold() == -1) ? false : true;
  // read in constraint decoding file, if provided
  if (m_parameter->GetParam("constraint").size()) {
    if (m_parameter->GetParam("search-algorithm").size() > 0
        && Scan<size_t>(m_parameter->GetParam("search-algorithm")[0]) != 0) {
      cerr << "Can use -constraint only with stack-based search (-search-algorithm 0)" << endl;
      exit(1);
    }
    m_constraintFileName = m_parameter->GetParam("constraint")[0];

    InputFileStream constraintFile(m_constraintFileName);
    std::string line;
    long sentenceID = -1;
    while (getline(constraintFile, line))
    {
      vector<string> vecStr = Tokenize(line, "\t");
      if (vecStr.size() == 1) {
        // no explicit ID: number the sentences consecutively from 0
        sentenceID++;
        Phrase phrase(Output);
        phrase.CreateFromString(GetOutputFactorOrder(), vecStr[0], GetFactorDelimiter());
        m_constraints.insert(make_pair(sentenceID, phrase));
      }
      else if (vecStr.size() == 2) {
        // explicit sentence ID in the first column
        sentenceID = Scan<long>(vecStr[0]);
        Phrase phrase(Output);
        phrase.CreateFromString(GetOutputFactorOrder(), vecStr[1], GetFactorDelimiter());
        m_constraints.insert(make_pair(sentenceID, phrase));
      }
      else {
        assert(false);
      }
    }
  }
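
  // Illustrative constraint file as parsed above (contents hypothetical):
  //   this is the forced translation of sentence 0
  //   7<TAB>a forced translation for sentence 7
  // A single column is just the target phrase; two tab-separated columns give
  // an explicit sentence ID followed by the target phrase.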
  // use of XML in input
  if (m_parameter->GetParam("xml-input").size() == 0) m_xmlInputType = XmlPassThrough;
  else if (m_parameter->GetParam("xml-input")[0] == "exclusive") m_xmlInputType = XmlExclusive;
  else if (m_parameter->GetParam("xml-input")[0] == "inclusive") m_xmlInputType = XmlInclusive;
  else if (m_parameter->GetParam("xml-input")[0] == "ignore") m_xmlInputType = XmlIgnore;
  else if (m_parameter->GetParam("xml-input")[0] == "pass-through") m_xmlInputType = XmlPassThrough;
  else {
    UserMessage::Add("invalid xml-input value, must be pass-through, exclusive, inclusive, or ignore");
    return false;
  }

  if (!LoadLexicalReorderingModel()) return false;
  if (!LoadLanguageModels()) return false;
  if (!LoadGenerationTables()) return false;
  if (!LoadPhraseTables()) return false;
  if (!LoadGlobalLexicalModel()) return false;

  m_scoreIndexManager.InitFeatureNames();
  if (m_parameter->GetParam("weight-file").size() > 0) {
    UserMessage::Add("ERROR: weight-file option is broken\n");
    abort();
    //if (m_parameter->GetParam("weight-file").size() != 1) {
    //  UserMessage::Add(string("ERROR: weight-file takes a single parameter"));
    //  return false;
    //}
    //string fnam = m_parameter->GetParam("weight-file")[0];
    //m_scoreIndexManager.InitWeightVectorFromFile(fnam, &m_allWeights);
  }

  return true;
}
void StaticData::SetBooleanParameter(bool *parameter, string parameterName, bool defaultValue)
{
  // default value if nothing is specified
  *parameter = defaultValue;
  if (!m_parameter->isParamSpecified(parameterName))
  {
    return;
  }

  // if parameter is just specified as, e.g. "-parameter", set it to true
  if (m_parameter->GetParam(parameterName).size() == 0)
  {
    *parameter = true;
  }
  // if parameter is specified as "-parameter true" or "-parameter false"
  else if (m_parameter->GetParam(parameterName).size() == 1)
  {
    *parameter = Scan<bool>(m_parameter->GetParam(parameterName)[0]);
  }
}
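
// For reference, the forms SetBooleanParameter handles (switch name
// hypothetical): "-my-flag" alone sets the parameter to true; "-my-flag true"
// or "-my-flag false" sets it explicitly; if the switch is absent entirely,
// the supplied default is kept.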
StaticData::~StaticData()
{
  RemoveAllInColl(m_phraseDictionary);
  RemoveAllInColl(m_generationDictionary);
  RemoveAllInColl(m_reorderModels);
  RemoveAllInColl(m_globalLexicalModels);

  // delete cached translation options
  map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*, clock_t> >::iterator iterCache;
  for (iterCache = m_transOptCache.begin(); iterCache != m_transOptCache.end(); ++iterCache)
  {
    TranslationOptionList *transOptList = iterCache->second.first;
    delete transOptList;
  }

  // small score producers
  delete m_distortionScoreProducer;
  delete m_wpProducer;
  delete m_unknownWordPenaltyProducer;

  //delete m_parameter;

  // memory pools
  Phrase::FinalizeMemPool();
}
bool StaticData::LoadLexicalReorderingModel()
{
  VERBOSE(1, "Loading lexical distortion models...");
  const vector<string> fileStr = m_parameter->GetParam("distortion-file");
  const vector<string> weightsStr = m_parameter->GetParam("weight-d");

  std::vector<float> weights;
  size_t w = 1; // cur weight; weight-d[0] is the distortion penalty weight
                // consumed in LoadPhraseBasedParameters()
  // get weight values
  VERBOSE(1, "have " << fileStr.size() << " models" << std::endl);
  for (size_t j = 0; j < weightsStr.size(); ++j) {
    weights.push_back(Scan<float>(weightsStr[j]));
  }

  // load all models
  for (size_t i = 0; i < fileStr.size(); ++i)
  {
    vector<string> spec = Tokenize<string>(fileStr[i], " ");
    if (spec.size() != 4) {
      UserMessage::Add("Invalid Lexical Reordering Model Specification: " + fileStr[i]);
      return false;
    }

    // spec[0] = factor map
    // spec[1] = name
    // spec[2] = num weights
    // spec[3] = fileName

    // decode factor map
    vector<FactorType> input, output;
    vector<string> inputfactors = Tokenize(spec[0], "-");
    if (inputfactors.size() == 2) {
      input  = Tokenize<FactorType>(inputfactors[0], ",");
      output = Tokenize<FactorType>(inputfactors[1], ",");
    } else if (inputfactors.size() == 1) {
      // if only one side is given, assume it refers to the output (e) side
      output = Tokenize<FactorType>(inputfactors[0], ",");
    } else {
      // format error
      return false;
    }

    string modelType = spec[1];

    // decode num weights and fetch weights from array
    std::vector<float> mweights;
    size_t numWeights = atoi(spec[2].c_str());
    for (size_t k = 0; k < numWeights; ++k, ++w)
    {
      if (w >= weights.size()) {
        UserMessage::Add("Lexicalized distortion model: Not enough weights, add to [weight-d]");
        return false;
      } else
        mweights.push_back(weights[w]);
    }

    string filePath = spec[3];

    m_reorderModels.push_back(new LexicalReordering(input, output, modelType, filePath, mweights));
  }
  return true;
}
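
// For reference, an illustrative [distortion-file] line as parsed above
// (model type and path are assumptions, not taken from this file):
//   0-0 msd-bidirectional-fe 6 /path/to/reordering-table.gz
// i.e. factor map, model name, number of weights, file name; the weights
// themselves are drawn from [weight-d], after the distortion penalty weight.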

bool StaticData::LoadGlobalLexicalModel()
{
  const vector<float>& weight = Scan<float>(m_parameter->GetParam("weight-lex"));
  const vector<string>& file = m_parameter->GetParam("global-lexical-file");

  if (weight.size() != file.size())
  {
    std::cerr << "number of weights and models for the global lexical model does not match ("
              << weight.size() << " != " << file.size() << ")" << std::endl;
    return false;
  }

  for (size_t i = 0; i < weight.size(); i++)
  {
    vector<string> spec = Tokenize<string>(file[i], " ");
    if (spec.size() != 2)
    {
      std::cerr << "wrong global lexical model specification: " << file[i] << endl;
      return false;
    }
    vector<string> factors = Tokenize(spec[0], "-");
    if (factors.size() != 2)
    {
      std::cerr << "wrong factor definition for global lexical model: " << spec[0] << endl;
      return false;
    }
    vector<FactorType> inputFactors = Tokenize<FactorType>(factors[0], ",");
    vector<FactorType> outputFactors = Tokenize<FactorType>(factors[1], ",");
    m_globalLexicalModels.push_back(new GlobalLexicalModel(spec[1], weight[i], inputFactors, outputFactors));
  }
  return true;
}
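
// For reference, an illustrative [global-lexical-file] line as parsed above
// (path hypothetical):
//   0-0 /path/to/global-lexical-model.gz
// i.e. an input-output factor pair and a model file; one weight per model is
// read from [weight-lex].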

bool StaticData::LoadLanguageModels()
{
  if (m_parameter->GetParam("lmodel-file").size() > 0)
  {
    // weights
    vector<float> weightAll = Scan<float>(m_parameter->GetParam("weight-l"));
    for (size_t i = 0; i < weightAll.size(); i++)
    {
      m_allWeights.push_back(weightAll[i]);
    }

    // dictionary upper-bounds for all IRST LMs
    vector<int> LMdub = Scan<int>(m_parameter->GetParam("lmodel-dub"));
    if (m_parameter->GetParam("lmodel-dub").size() == 0) {
      for (size_t i = 0; i < m_parameter->GetParam("lmodel-file").size(); i++)
        LMdub.push_back(0);
    }

    // initialize n-gram order for each factor. populated only by factored LM
    const vector<string>& lmVector = m_parameter->GetParam("lmodel-file");
    for (size_t i = 0; i < lmVector.size(); i++)
    {
      vector<string> token = Tokenize(lmVector[i]);
      if (token.size() != 4 && token.size() != 5)
      {
        UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'");
        return false;
      }
      // type = implementation: SRI, IRST etc.
      LMImplementation lmImplementation = static_cast<LMImplementation>(Scan<int>(token[0]));
      // factorType: 0 = surface, 1 = POS, 2 = stem, 3 = morphology, etc.
      vector<FactorType> factorTypes = Tokenize<FactorType>(token[1], ",");
      // nGramOrder: 2 = bigram, 3 = trigram, etc.
      size_t nGramOrder = Scan<int>(token[2]);
      string& languageModelFile = token[3];
      if (token.size() == 5) {
        if (lmImplementation == IRST)
          languageModelFile += " " + token[4];
        else {
          UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'");
          return false;
        }
      }
      IFVERBOSE(1)
        PrintUserTime(string("Start loading LanguageModel ") + languageModelFile);

      LanguageModel *lm = LanguageModelFactory::CreateLanguageModel(
          lmImplementation
          , factorTypes
          , nGramOrder
          , languageModelFile
          , weightAll[i]
          , m_scoreIndexManager
          , LMdub[i]);
      if (lm == NULL)
      {
        UserMessage::Add("no LM created. We probably don't have it compiled");
        return false;
      }

      m_languageModel.Add(lm);
    }
  }
  // flag indicating that language models were loaded,
  // since phrase table loading requires their presence
  m_fLMsLoaded = true;
  IFVERBOSE(1)
    PrintUserTime("Finished loading LanguageModels");
  return true;
}
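
// For reference, an illustrative [lmodel-file] line as parsed above, following
// the format named in the error message (enum value and path hypothetical):
//   0 0 3 /path/to/trigram.lm.gz
// i.e. LM-TYPE (implementation enum), FACTOR-TYPE, NGRAM-ORDER, filePath, plus
// an optional fifth mapFilePath field accepted only for IRSTLM.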

bool StaticData::LoadGenerationTables()
{
  if (m_parameter->GetParam("generation-file").size() > 0)
  {
    const vector<string>& generationVector = m_parameter->GetParam("generation-file");
    const vector<float>& weight = Scan<float>(m_parameter->GetParam("weight-generation"));

    IFVERBOSE(1)
    {
      TRACE_ERR("weight-generation: ");
      for (size_t i = 0; i < weight.size(); i++)
      {
        TRACE_ERR(weight[i] << "\t");
      }
      TRACE_ERR(endl);
    }
    size_t currWeightNum = 0;

    for (size_t currDict = 0; currDict < generationVector.size(); currDict++)
    {
      vector<string> token = Tokenize(generationVector[currDict]);
      vector<FactorType> input = Tokenize<FactorType>(token[0], ",")
        , output = Tokenize<FactorType>(token[1], ",");
      m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], input, output);
      string filePath;
      size_t numFeatures;

      numFeatures = Scan<size_t>(token[2]);
      filePath = token[3];
      if (!FileExists(filePath) && FileExists(filePath + ".gz")) {
        filePath += ".gz";
      }

      VERBOSE(1, filePath << endl);

      m_generationDictionary.push_back(new GenerationDictionary(numFeatures, m_scoreIndexManager));
      assert(m_generationDictionary.back() && "could not create GenerationDictionary");
      if (!m_generationDictionary.back()->Load(input
                                               , output
                                               , filePath
                                               , Output))
      {
        delete m_generationDictionary.back();
        return false;
      }
      for (size_t i = 0; i < numFeatures; i++) {
        assert(currWeightNum < weight.size());
        m_allWeights.push_back(weight[currWeightNum++]);
      }
    }
    if (currWeightNum != weight.size()) {
      TRACE_ERR("[WARNING] config file has " << weight.size() << " generation weights listed, but the configuration for generation files indicates there should be " << currWeightNum << "!\n");
    }
  }
  return true;
}
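
// For reference, an illustrative [generation-file] line as parsed above
// (path hypothetical):
//   0 1 2 /path/to/generation.0-1.gz
// i.e. input factors, output factors, number of features, file path; a ".gz"
// suffix is tried automatically when the plain file does not exist.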

bool StaticData::LoadPhraseTables()
{
  VERBOSE(2, "About to LoadPhraseTables" << endl);

  // language models must be loaded prior to loading phrase tables
  assert(m_fLMsLoaded);
  // load phrase translation tables
  if (m_parameter->GetParam("ttable-file").size() > 0)
  {
    // weights
    vector<float> weightAll = Scan<float>(m_parameter->GetParam("weight-t"));

    const vector<string>& translationVector = m_parameter->GetParam("ttable-file");
    vector<size_t> maxTargetPhrase = Scan<size_t>(m_parameter->GetParam("ttable-limit"));

    size_t index = 0;
    size_t weightAllOffset = 0;
    for (size_t currDict = 0; currDict < translationVector.size(); currDict++)
    {
      vector<string> token = Tokenize(translationVector[currDict]);
      assert(token.size() >= 5);

      // characteristics of the phrase table
      PhraseTableImplementation implementation = (PhraseTableImplementation) Scan<int>(token[0]);
      vector<FactorType> input = Tokenize<FactorType>(token[1], ",")
        , output = Tokenize<FactorType>(token[2], ",");
      m_maxFactorIdx[0] = CalcMax(m_maxFactorIdx[0], input);
      m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], output);
      m_maxNumFactors = std::max(m_maxFactorIdx[0], m_maxFactorIdx[1]) + 1;
      size_t numScoreComponent = Scan<size_t>(token[3]);
      string filePath = token[4];

      assert(weightAll.size() >= weightAllOffset + numScoreComponent);

      // weights for this phrase dictionary
      // first InputScores (if any), then translation scores
      vector<float> weight;

      if (currDict == 0 && (m_inputType == ConfusionNetworkInput || m_inputType == WordLatticeInput))
      { // TODO: work out what assumption the confusion network makes about
        // phrase table output that restricts it to binary files. This is a hack.
        m_numInputScores = m_parameter->GetParam("weight-i").size();
        for (unsigned k = 0; k < m_numInputScores; ++k)
          weight.push_back(Scan<float>(m_parameter->GetParam("weight-i")[k]));

        if (m_parameter->GetParam("link-param-count").size())
          m_numLinkParams = Scan<size_t>(m_parameter->GetParam("link-param-count")[0]);

        // print some info about this interaction:
        if (m_numLinkParams == m_numInputScores) {
          VERBOSE(1, "specified equal numbers of link parameters and insertion weights, not using non-epsilon 'real' word link count.\n");
        } else if ((m_numLinkParams + 1) == m_numInputScores) {
          VERBOSE(1, "WARN: " << m_numInputScores << " insertion weights found and only " << m_numLinkParams << " link parameters specified, applying non-epsilon 'real' word link count for last feature weight.\n");
        } else {
          stringstream strme;
          strme << "You specified " << m_numInputScores
                << " input weights (weight-i), but you specified " << m_numLinkParams << " link parameters (link-param-count)!";
          UserMessage::Add(strme.str());
          return false;
        }
      }
      if (!m_inputType) {
        // text input (SentenceInput == 0) carries no input scores
        m_numInputScores = 0;
      }
      // this number changes depending on the phrase table: only table 0 carries the input scores
      size_t tableInputScores = (currDict == 0 ? m_numInputScores : 0);

      for (size_t currScore = 0; currScore < numScoreComponent; currScore++)
        weight.push_back(weightAll[weightAllOffset + currScore]);

      if (weight.size() - tableInputScores != numScoreComponent)
      {
        stringstream strme;
        strme << "Your phrase table has " << numScoreComponent
              << " scores, but you specified " << (weight.size() - tableInputScores) << " weights!";
        UserMessage::Add(strme.str());
        return false;
      }

      weightAllOffset += numScoreComponent;
      numScoreComponent += tableInputScores;

      string targetPath, alignmentsFile;
      if (implementation == SuffixArray)
      {
        targetPath = token[5];
        alignmentsFile = token[6];
      }

      assert(numScoreComponent == weight.size());
      std::copy(weight.begin(), weight.end(), std::back_inserter(m_allWeights));

      IFVERBOSE(1)
        PrintUserTime(string("Start loading PhraseTable ") + filePath);
      VERBOSE(1, "filePath: " << filePath << endl);

      PhraseDictionaryFeature *pdf = new PhraseDictionaryFeature(
          implementation
          , numScoreComponent
          , (currDict == 0 ? m_numInputScores : 0)
          , input
          , output
          , filePath
          , weight
          , maxTargetPhrase[index]
          , targetPath, alignmentsFile);

      m_phraseDictionary.push_back(pdf);

      index++;
    }
  }

  IFVERBOSE(1)
    PrintUserTime("Finished loading phrase tables");
  return true;
}
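
// For reference, an illustrative [ttable-file] line as parsed above
// (implementation enum value and path hypothetical):
//   0 0 0 5 /path/to/phrase-table.0-0.gz
// i.e. implementation, input factors, output factors, number of score
// components, file path; SuffixArray tables carry two extra fields
// (target path and alignments file).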

void StaticData::LoadNonTerminals()
{
  string defaultNonTerminals;

  if (m_parameter->GetParam("non-terminals").size() == 0)
  {
    defaultNonTerminals = "X";
  }
  else
  {
    vector<std::string> tokens = Tokenize(m_parameter->GetParam("non-terminals")[0]);
    defaultNonTerminals = tokens[0];
  }

  FactorCollection& factorCollection = FactorCollection::Instance();

  m_inputDefaultNonTerminal.SetIsNonTerminal(true);
  const Factor *sourceFactor = factorCollection.AddFactor(Input, 0, defaultNonTerminals);
  m_inputDefaultNonTerminal.SetFactor(0, sourceFactor);

  m_outputDefaultNonTerminal.SetIsNonTerminal(true);
  const Factor *targetFactor = factorCollection.AddFactor(Output, 0, defaultNonTerminals);
  m_outputDefaultNonTerminal.SetFactor(0, targetFactor);

  // for unknown words
  if (m_parameter->GetParam("unknown-lhs").size() == 0)
  {
    UnknownLHSEntry entry(defaultNonTerminals, 0.0f);
    m_unknownLHS.push_back(entry);
  }
  else
  {
    const string& filePath = m_parameter->GetParam("unknown-lhs")[0];
    InputFileStream inStream(filePath);
    string line;
    while (getline(inStream, line))
    {
      vector<string> tokens = Tokenize(line);
      assert(tokens.size() == 2);
      UnknownLHSEntry entry(tokens[0], Scan<float>(tokens[1]));
      m_unknownLHS.push_back(entry);
    }
  }
}
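
// For reference, an illustrative unknown-lhs file as parsed above (labels and
// scores hypothetical; the float is presumably a probability):
//   X 0.7
//   NP 0.3
// Each line pairs a non-terminal label with a score for assigning that label
// to an unknown word.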

void StaticData::LoadChartDecodingParameters()
{
  LoadNonTerminals();

  // source label overlap
  if (m_parameter->GetParam("source-label-overlap").size() > 0)
  {
    m_sourceLabelOverlap = (SourceLabelOverlap) Scan<int>(m_parameter->GetParam("source-label-overlap")[0]);
  }
  else
  {
    m_sourceLabelOverlap = SourceLabelOverlapAdd;
  }

  m_ruleLimit = (m_parameter->GetParam("rule-limit").size() > 0)
    ? Scan<size_t>(m_parameter->GetParam("rule-limit")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE;
}

void StaticData::LoadPhraseBasedParameters()
{
  const vector<string> distortionWeights = m_parameter->GetParam("weight-d");
  m_weightDistortion = Scan<float>(distortionWeights[0]);
  m_distortionScoreProducer = new DistortionScoreProducer(m_scoreIndexManager);
  m_allWeights.push_back(m_weightDistortion);
}

vector<DecodeGraph*> StaticData::GetDecodeStepVL(const InputType& source) const
{
  vector<DecodeGraph*> decodeGraphs;

  // mapping
  const vector<string>& mappingVector = m_parameter->GetParam("mapping");
  const vector<size_t>& maxChartSpans = Scan<size_t>(m_parameter->GetParam("max-chart-span"));

  DecodeStep *prev = 0;
  size_t prevDecodeGraphInd = 0;
  for (size_t i = 0; i < mappingVector.size(); i++)
  {
    vector<string> token = Tokenize(mappingVector[i]);
    size_t decodeGraphInd;
    DecodeType decodeType;
    size_t index;
    if (token.size() == 2)
    {
      decodeGraphInd = 0;
      decodeType = token[0] == "T" ? Translate : Generate;
      index = Scan<size_t>(token[1]);
    }
    else if (token.size() == 3)
    { // for specifying multiple translation models
      decodeGraphInd = Scan<size_t>(token[0]);
      // the decode graph index can only increment by one
      assert(decodeGraphInd == prevDecodeGraphInd || decodeGraphInd == prevDecodeGraphInd + 1);
      if (decodeGraphInd > prevDecodeGraphInd)
      {
        prev = NULL;
      }
      decodeType = token[1] == "T" ? Translate : Generate;
      index = Scan<size_t>(token[2]);
    }
    else
    {
      UserMessage::Add("Malformed mapping!");
      assert(false);
    }

    DecodeStep *decodeStep = NULL;
    switch (decodeType) {
      case Translate:
        if (index >= m_phraseDictionary.size())
        {
          stringstream strme;
          strme << "No phrase dictionary with index "
                << index << " available!";
          UserMessage::Add(strme.str());
          assert(false);
        }
        decodeStep = new DecodeStepTranslation(m_phraseDictionary[index]->GetDictionary(source), prev);
        break;
      case Generate:
        if (index >= m_generationDictionary.size())
        {
          stringstream strme;
          strme << "No generation dictionary with index "
                << index << " available!";
          UserMessage::Add(strme.str());
          assert(false);
        }
        decodeStep = new DecodeStepGeneration(m_generationDictionary[index], prev);
        break;
      case InsertNullFertilityWord:
        assert(!"Please implement NullFertilityInsertion.");
        break;
    }

    assert(decodeStep);
    if (decodeGraphs.size() < decodeGraphInd + 1)
    { // instantiate a new decode graph; chart decoding also needs a max chart span
      DecodeGraph *decodeGraph;
      if (m_searchAlgorithm == ChartDecoding)
      {
        size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN;
        decodeGraph = new DecodeGraph(decodeGraphs.size(), maxChartSpan);
      }
      else
      {
        decodeGraph = new DecodeGraph(decodeGraphs.size());
      }
      decodeGraphs.push_back(decodeGraph);
    }
    decodeGraphs[decodeGraphInd]->Add(decodeStep);
    prev = decodeStep;
    prevDecodeGraphInd = decodeGraphInd;
  }

  return decodeGraphs;
}
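
// For reference, illustrative [mapping] entries as parsed above (indices refer
// to the phrase/generation tables loaded earlier):
//   0 T 0    <- decode graph 0: translation step using phrase table 0
//   0 G 0    <- ...followed by a generation step using generation table 0
//   1 T 1    <- decode graph 1: translation step using phrase table 1
// The two-token form "T 0" is shorthand for a step in decode graph 0.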

#include "PhraseDictionary.h"

void StaticData::CleanUpAfterSentenceProcessing() const
{
  for (size_t i = 0; i < m_phraseDictionary.size(); ++i)
  {
    PhraseDictionaryFeature& phraseDictionaryFeature = *m_phraseDictionary[i];
    PhraseDictionary& phraseDictionary = *phraseDictionaryFeature.GetDictionary();
    phraseDictionary.CleanUp();
  }

  for (size_t i = 0; i < m_generationDictionary.size(); ++i)
    m_generationDictionary[i]->CleanUp();

  // something LMs could do after each sentence
  LMList::const_iterator iterLM;
  for (iterLM = m_languageModel.begin(); iterLM != m_languageModel.end(); ++iterLM)
  {
    LanguageModel& languageModel = **iterLM;
    languageModel.CleanUpAfterSentenceProcessing();
  }
}

/** initialize the translation and language models for this sentence
    (includes loading of translation table entries on demand, if
    binary format is used) */
void StaticData::InitializeBeforeSentenceProcessing(InputType const& in) const
{
  for (size_t i = 0; i < m_reorderModels.size(); ++i) {
    m_reorderModels[i]->InitializeForInput(in);
  }
  for (size_t i = 0; i < m_globalLexicalModels.size(); ++i) {
    m_globalLexicalModels[i]->InitializeForInput((Sentence const&) in);
  }
  // something LMs could do before translating a sentence
  LMList::const_iterator iterLM;
  for (iterLM = m_languageModel.begin(); iterLM != m_languageModel.end(); ++iterLM)
  {
    LanguageModel& languageModel = **iterLM;
    languageModel.InitializeBeforeSentenceProcessing();
  }
}

void StaticData::SetWeightsForScoreProducer(const ScoreProducer* sp, const std::vector<float>& weights)
{
  const size_t id = sp->GetScoreBookkeepingID();
  const size_t begin = m_scoreIndexManager.GetBeginIndex(id);
  const size_t end = m_scoreIndexManager.GetEndIndex(id);
  assert(end - begin == weights.size());
  if (m_allWeights.size() < end)
    m_allWeights.resize(end);

  std::vector<float>::const_iterator weightIter = weights.begin();
  for (size_t i = begin; i < end; i++)
    m_allWeights[i] = *weightIter++;
}

const TranslationOptionList* StaticData::FindTransOptListInCache(const DecodeGraph& decodeGraph, const Phrase& sourcePhrase) const
{
  std::pair<size_t, Phrase> key(decodeGraph.GetPosition(), sourcePhrase);
#ifdef WITH_THREADS
  boost::mutex::scoped_lock lock(m_transOptCacheMutex);
#endif
  std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*, clock_t> >::iterator iter
    = m_transOptCache.find(key);
  if (iter == m_transOptCache.end())
    return NULL;
  iter->second.second = clock(); // update last used time
  return iter->second.first;
}

void StaticData::ReduceTransOptCache() const
{
  if (m_transOptCache.size() <= m_transOptCacheMaxSize) return; // not full
  clock_t t = clock();

  // find cutoff for last used time
  priority_queue<clock_t> lastUsedTimes;
  std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*, clock_t> >::iterator iter;
  iter = m_transOptCache.begin();
  while (iter != m_transOptCache.end())
  {
    lastUsedTimes.push(iter->second.second);
    iter++;
  }
  // pop the most recent times off the max-heap; the element left on top
  // becomes the cutoff, so that roughly half the cache survives
  for (size_t i = 0; i < lastUsedTimes.size() - m_transOptCacheMaxSize / 2; i++)
    lastUsedTimes.pop();
  clock_t cutoffLastUsedTime = lastUsedTimes.top();

  // remove all old entries
  iter = m_transOptCache.begin();
  while (iter != m_transOptCache.end())
  {
    if (iter->second.second < cutoffLastUsedTime)
    {
      std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*, clock_t> >::iterator iterRemove = iter++;
      delete iterRemove->second.first;
      m_transOptCache.erase(iterRemove);
    }
    else iter++;
  }
  VERBOSE(2, "Reduced persistent translation option cache in " << ((clock() - t) / (float)CLOCKS_PER_SEC) << " seconds." << std::endl);
}

void StaticData::AddTransOptListToCache(const DecodeGraph& decodeGraph, const Phrase& sourcePhrase, const TranslationOptionList& transOptList) const
{
  std::pair<size_t, Phrase> key(decodeGraph.GetPosition(), sourcePhrase);
  TranslationOptionList* storedTransOptList = new TranslationOptionList(transOptList);
#ifdef WITH_THREADS
  boost::mutex::scoped_lock lock(m_transOptCacheMutex);
#endif
  m_transOptCache[key] = make_pair(storedTransOptList, clock());
  ReduceTransOptCache();
}

}