2008-06-11 14:52:57 +04:00
// $Id$
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase - based language decoder
Copyright ( C ) 2006 University of Edinburgh
This library is free software ; you can redistribute it and / or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation ; either
version 2.1 of the License , or ( at your option ) any later version .
This library is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
Lesser General Public License for more details .
You should have received a copy of the GNU Lesser General Public
License along with this library ; if not , write to the Free Software
Foundation , Inc . , 51 Franklin Street , Fifth Floor , Boston , MA 02110 - 1301 USA
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
#include <algorithm>
#include <cassert>
#include <string>
# include "PhraseDictionaryMemory.h"
# include "DecodeStepTranslation.h"
# include "DecodeStepGeneration.h"
# include "GenerationDictionary.h"
# include "DummyScoreProducers.h"
# include "StaticData.h"
# include "Util.h"
# include "FactorCollection.h"
# include "Timer.h"
# include "LanguageModelFactory.h"
# include "LexicalReordering.h"
2009-05-26 23:30:35 +04:00
# include "GlobalLexicalModel.h"
2008-06-11 14:52:57 +04:00
# include "SentenceStats.h"
2011-05-11 02:02:25 +04:00
# include "PhraseBoundaryFeature.h"
2010-04-26 18:56:06 +04:00
# include "PhraseDictionary.h"
2011-03-22 17:33:16 +03:00
# include "PhrasePairFeature.h"
2011-08-06 18:10:43 +04:00
# include "PhraseLengthFeature.h"
2008-06-11 14:52:57 +04:00
# include "UserMessage.h"
# include "TranslationOption.h"
2010-09-17 17:36:03 +04:00
# include "TargetBigramFeature.h"
2008-06-11 14:52:57 +04:00
# include "DecodeGraph.h"
2008-06-19 03:14:09 +04:00
# include "InputFileStream.h"
2010-09-16 19:45:56 +04:00
# include "BleuScoreFeature.h"
2010-09-17 18:25:08 +04:00
# include "ScoreComponentCollection.h"
2008-06-11 14:52:57 +04:00
using namespace std ;
2008-10-09 03:51:26 +04:00
namespace Moses
{
2008-06-11 14:52:57 +04:00
/** Return the maximum of x and all elements of y.
 *  Uses std::max_element instead of a hand-rolled scan loop;
 *  an empty y simply yields x. */
static size_t CalcMax(size_t x, const std::vector<size_t> &y)
{
  std::vector<size_t>::const_iterator it = std::max_element(y.begin(), y.end());
  return (it == y.end()) ? x : std::max(x, *it);
}
/** Return the maximum of x and all elements of the two vectors y and z.
 *  Same contract as the two-vector-free overload; empty vectors are
 *  ignored. Rewritten with std::max_element instead of two raw loops. */
static size_t CalcMax(size_t x, const std::vector<size_t> &y, const std::vector<size_t> &z)
{
  size_t best = x;
  std::vector<size_t>::const_iterator it = std::max_element(y.begin(), y.end());
  if (it != y.end())
    best = std::max(best, *it);
  it = std::max_element(z.begin(), z.end());
  if (it != z.end())
    best = std::max(best, *it);
  return best;
}
// The process-wide singleton instance.
StaticData StaticData::s_instance;

/** Construct the singleton with safe defaults.
 *  All optional feature-function pointers start NULL (they are created
 *  on demand in LoadData); flags default to off and the factor
 *  delimiter to "|". */
StaticData::StaticData()
  : m_targetBigramFeature(NULL)
  , m_phraseBoundaryFeature(NULL)
  , m_phrasePairFeature(NULL)
  , m_phraseLengthFeature(NULL)
  , m_numLinkParams(1)
  , m_fLMsLoaded(false)
  , m_sourceStartPosMattersForRecombination(false)
  , m_inputType(SentenceInput)
  , m_numInputScores(0)
  , m_bleuScoreFeature(NULL)
  , m_detailedTranslationReportingFilePath()
  , m_onlyDistinctNBest(false)
  , m_factorDelimiter("|") // default delimiter between factors
  , m_isAlwaysCreateDirectTranslationOption(false)
{
  // no factors seen yet on either side
  m_maxFactorIdx[0] = 0; // source side
  m_maxFactorIdx[1] = 0; // target side

  // memory pools
  Phrase::InitializeMemPool();
}
2010-05-06 15:26:52 +04:00
2008-06-11 14:52:57 +04:00
bool StaticData : : LoadData ( Parameter * parameter )
{
ResetUserTime ( ) ;
m_parameter = parameter ;
// verbose level
m_verboseLevel = 1 ;
if ( m_parameter - > GetParam ( " verbose " ) . size ( ) = = 1 )
{
m_verboseLevel = Scan < size_t > ( m_parameter - > GetParam ( " verbose " ) [ 0 ] ) ;
}
2010-04-08 21:16:10 +04:00
// to cube or not to cube
m_searchAlgorithm = ( m_parameter - > GetParam ( " search-algorithm " ) . size ( ) > 0 ) ?
( SearchAlgorithm ) Scan < size_t > ( m_parameter - > GetParam ( " search-algorithm " ) [ 0 ] ) : Normal ;
if ( m_searchAlgorithm = = ChartDecoding )
LoadChartDecodingParameters ( ) ;
else
LoadPhraseBasedParameters ( ) ;
2008-06-11 14:52:57 +04:00
// input type has to be specified BEFORE loading the phrase tables!
if ( m_parameter - > GetParam ( " inputtype " ) . size ( ) )
m_inputType = ( InputTypeEnum ) Scan < int > ( m_parameter - > GetParam ( " inputtype " ) [ 0 ] ) ;
std : : string s_it = " text input " ;
if ( m_inputType = = 1 ) { s_it = " confusion net " ; }
if ( m_inputType = = 2 ) { s_it = " word lattice " ; }
VERBOSE ( 2 , " input type is: " < < s_it < < " \n " ) ;
if ( m_parameter - > GetParam ( " recover-input-path " ) . size ( ) ) {
m_recoverPath = Scan < bool > ( m_parameter - > GetParam ( " recover-input-path " ) [ 0 ] ) ;
if ( m_recoverPath & & m_inputType = = SentenceInput ) {
TRACE_ERR ( " --recover-input-path should only be used with confusion net or word lattice input! \n " ) ;
m_recoverPath = false ;
}
}
2010-08-10 17:12:00 +04:00
2008-06-11 14:52:57 +04:00
// factor delimiter
if ( m_parameter - > GetParam ( " factor-delimiter " ) . size ( ) > 0 ) {
m_factorDelimiter = m_parameter - > GetParam ( " factor-delimiter " ) [ 0 ] ;
}
2010-05-04 01:39:23 +04:00
SetBooleanParameter ( & m_continuePartialTranslation , " continue-partial-translation " , false ) ;
2008-09-12 22:09:06 +04:00
//word-to-word alignment
SetBooleanParameter ( & m_UseAlignmentInfo , " use-alignment-info " , false ) ;
SetBooleanParameter ( & m_PrintAlignmentInfo , " print-alignment-info " , false ) ;
SetBooleanParameter ( & m_PrintAlignmentInfoNbest , " print-alignment-info-in-n-best " , false ) ;
2010-04-08 21:16:10 +04:00
SetBooleanParameter ( & m_outputHypoScore , " output-hypo-score " , false ) ;
2008-09-12 22:09:06 +04:00
if ( ! m_UseAlignmentInfo & & m_PrintAlignmentInfo ) {
TRACE_ERR ( " --print-alignment-info should only be used together with \" --use-alignment-info true \" . Continue forcing to false. \n " ) ;
m_PrintAlignmentInfo = false ;
}
if ( ! m_UseAlignmentInfo & & m_PrintAlignmentInfoNbest ) {
TRACE_ERR ( " --print-alignment-info-in-n-best should only be used together with \" --use-alignment-info true \" . Continue forcing to false. \n " ) ;
m_PrintAlignmentInfoNbest = false ;
}
2008-06-11 14:52:57 +04:00
// n-best
if ( m_parameter - > GetParam ( " n-best-list " ) . size ( ) > = 2 )
{
m_nBestFilePath = m_parameter - > GetParam ( " n-best-list " ) [ 0 ] ;
m_nBestSize = Scan < size_t > ( m_parameter - > GetParam ( " n-best-list " ) [ 1 ] ) ;
m_onlyDistinctNBest = ( m_parameter - > GetParam ( " n-best-list " ) . size ( ) > 2 & & m_parameter - > GetParam ( " n-best-list " ) [ 2 ] = = " distinct " ) ;
}
else if ( m_parameter - > GetParam ( " n-best-list " ) . size ( ) = = 1 ) {
UserMessage : : Add ( string ( " ERROR: wrong format for switch -n-best-list file size " ) ) ;
return false ;
}
else
{
m_nBestSize = 0 ;
}
if ( m_parameter - > GetParam ( " n-best-factor " ) . size ( ) > 0 )
{
m_nBestFactor = Scan < size_t > ( m_parameter - > GetParam ( " n-best-factor " ) [ 0 ] ) ;
}
else {
m_nBestFactor = 20 ;
}
// word graph
if ( m_parameter - > GetParam ( " output-word-graph " ) . size ( ) = = 2 )
m_outputWordGraph = true ;
else
m_outputWordGraph = false ;
// search graph
if ( m_parameter - > GetParam ( " output-search-graph " ) . size ( ) > 0 )
{
if ( m_parameter - > GetParam ( " output-search-graph " ) . size ( ) ! = 1 ) {
UserMessage : : Add ( string ( " ERROR: wrong format for switch -output-search-graph file " ) ) ;
return false ;
}
m_outputSearchGraph = true ;
}
2010-01-28 18:32:04 +03:00
// ... in extended format
else if ( m_parameter - > GetParam ( " output-search-graph-extended " ) . size ( ) > 0 )
{
if ( m_parameter - > GetParam ( " output-search-graph-extended " ) . size ( ) ! = 1 ) {
UserMessage : : Add ( string ( " ERROR: wrong format for switch -output-search-graph-extended file " ) ) ;
return false ;
}
m_outputSearchGraph = true ;
m_outputSearchGraphExtended = true ;
}
else
2008-06-11 14:52:57 +04:00
m_outputSearchGraph = false ;
2008-09-24 20:48:23 +04:00
# ifdef HAVE_PROTOBUF
if ( m_parameter - > GetParam ( " output-search-graph-pb " ) . size ( ) > 0 )
{
if ( m_parameter - > GetParam ( " output-search-graph-pb " ) . size ( ) ! = 1 ) {
UserMessage : : Add ( string ( " ERROR: wrong format for switch -output-search-graph-pb path " ) ) ;
return false ;
}
m_outputSearchGraphPB = true ;
}
2009-01-01 21:16:54 +03:00
else
2008-09-24 20:48:23 +04:00
m_outputSearchGraphPB = false ;
# endif
2008-06-11 14:52:57 +04:00
// include feature names in the n-best list
SetBooleanParameter ( & m_labeledNBestList , " labeled-n-best-list " , true ) ;
// include word alignment in the n-best list
SetBooleanParameter ( & m_nBestIncludesAlignment , " include-alignment-in-n-best " , false ) ;
// printing source phrase spans
SetBooleanParameter ( & m_reportSegmentation , " report-segmentation " , false ) ;
// print all factors of output translations
SetBooleanParameter ( & m_reportAllFactors , " report-all-factors " , false ) ;
2010-02-18 13:54:33 +03:00
// print all factors of output translations
SetBooleanParameter ( & m_reportAllFactorsNBest , " report-all-factors-in-n-best " , false ) ;
2011-08-07 04:58:56 +04:00
// caching of translation options
2008-06-11 14:52:57 +04:00
if ( m_inputType = = SentenceInput )
{
SetBooleanParameter ( & m_useTransOptCache , " use-persistent-cache " , true ) ;
2009-01-01 21:16:54 +03:00
m_transOptCacheMaxSize = ( m_parameter - > GetParam ( " persistent-cache-size " ) . size ( ) > 0 )
? Scan < size_t > ( m_parameter - > GetParam ( " persistent-cache-size " ) [ 0 ] ) : DEFAULT_MAX_TRANS_OPT_CACHE_SIZE ;
2008-06-11 14:52:57 +04:00
}
else
{
m_useTransOptCache = false ;
}
2010-09-14 16:49:07 +04:00
SetBooleanParameter ( & m_enableOnlineCommand , " enable-online-command " , false ) ;
if ( m_enableOnlineCommand = = true ) {
VERBOSE ( 1 , " Online commands are enabled. \n " ) ;
VERBOSE ( 1 , " Cache for translation options is disabled. \n " ) ;
m_useTransOptCache = false ;
}
2008-06-11 14:52:57 +04:00
//input factors
const vector < string > & inputFactorVector = m_parameter - > GetParam ( " input-factors " ) ;
for ( size_t i = 0 ; i < inputFactorVector . size ( ) ; i + + )
{
m_inputFactorOrder . push_back ( Scan < FactorType > ( inputFactorVector [ i ] ) ) ;
}
if ( m_inputFactorOrder . empty ( ) )
{
UserMessage : : Add ( string ( " no input factor specified in config file " ) ) ;
return false ;
}
//output factors
const vector < string > & outputFactorVector = m_parameter - > GetParam ( " output-factors " ) ;
for ( size_t i = 0 ; i < outputFactorVector . size ( ) ; i + + )
{
m_outputFactorOrder . push_back ( Scan < FactorType > ( outputFactorVector [ i ] ) ) ;
}
if ( m_outputFactorOrder . empty ( ) )
{ // default. output factor 0
m_outputFactorOrder . push_back ( 0 ) ;
}
//source word deletion
SetBooleanParameter ( & m_wordDeletionEnabled , " phrase-drop-allowed " , false ) ;
2010-03-07 10:57:48 +03:00
//Disable discarding
SetBooleanParameter ( & m_disableDiscarding , " disable-discarding " , false ) ;
//Print All Derivations
SetBooleanParameter ( & m_printAllDerivations , " print-all-derivations " , false ) ;
2008-06-11 14:52:57 +04:00
// additional output
2010-05-08 19:51:59 +04:00
if ( m_parameter - > isParamSpecified ( " translation-details " ) )
{
const vector < string > & args = m_parameter - > GetParam ( " translation-details " ) ;
if ( args . size ( ) = = 1 )
{
m_detailedTranslationReportingFilePath = args [ 0 ] ;
}
else
{
UserMessage : : Add ( string ( " the translation-details option requires exactly one filename argument " ) ) ;
return false ;
}
}
2008-06-11 14:52:57 +04:00
2010-08-10 17:12:00 +04:00
// word penalties
for ( size_t i = 0 ; i < m_parameter - > GetParam ( " weight-w " ) . size ( ) ; + + i ) {
float weightWordPenalty = Scan < float > ( m_parameter - > GetParam ( " weight-w " ) [ i ] ) ;
2010-10-07 02:06:49 +04:00
m_wordPenaltyProducers . push_back ( new WordPenaltyProducer ( ) ) ;
SetWeight ( m_wordPenaltyProducers . back ( ) , weightWordPenalty ) ;
2010-08-10 17:12:00 +04:00
}
float weightUnknownWord = ( m_parameter - > GetParam ( " weight-u " ) . size ( ) > 0 ) ? Scan < float > ( m_parameter - > GetParam ( " weight-u " ) [ 0 ] ) : 1 ;
2010-10-07 02:06:49 +04:00
m_unknownWordPenaltyProducer = new UnknownWordPenaltyProducer ( ) ;
SetWeight ( m_unknownWordPenaltyProducer , weightUnknownWord ) ;
2010-04-08 21:16:10 +04:00
2009-05-26 23:30:35 +04:00
// reordering constraints
2008-06-11 14:52:57 +04:00
m_maxDistortion = ( m_parameter - > GetParam ( " distortion-limit " ) . size ( ) > 0 ) ?
Scan < int > ( m_parameter - > GetParam ( " distortion-limit " ) [ 0 ] )
: - 1 ;
2008-11-24 20:30:37 +03:00
SetBooleanParameter ( & m_reorderingConstraint , " monotone-at-punctuation " , false ) ;
2008-12-13 15:08:55 +03:00
// settings for pruning
m_maxHypoStackSize = ( m_parameter - > GetParam ( " stack " ) . size ( ) > 0 )
? Scan < size_t > ( m_parameter - > GetParam ( " stack " ) [ 0 ] ) : DEFAULT_MAX_HYPOSTACK_SIZE ;
2009-05-26 23:30:35 +04:00
m_minHypoStackDiversity = 0 ;
if ( m_parameter - > GetParam ( " stack-diversity " ) . size ( ) > 0 ) {
if ( m_maxDistortion > 15 ) {
UserMessage : : Add ( " stack diversity > 0 is not allowed for distortion limits larger than 15 " ) ;
return false ;
}
if ( m_inputType = = WordLatticeInput ) {
UserMessage : : Add ( " stack diversity > 0 is not allowed for lattice input " ) ;
return false ;
}
m_minHypoStackDiversity = Scan < size_t > ( m_parameter - > GetParam ( " stack-diversity " ) [ 0 ] ) ;
}
2008-06-11 14:52:57 +04:00
m_beamWidth = ( m_parameter - > GetParam ( " beam-threshold " ) . size ( ) > 0 ) ?
TransformScore ( Scan < float > ( m_parameter - > GetParam ( " beam-threshold " ) [ 0 ] ) )
: TransformScore ( DEFAULT_BEAM_WIDTH ) ;
2009-05-26 23:30:35 +04:00
m_earlyDiscardingThreshold = ( m_parameter - > GetParam ( " early-discarding-threshold " ) . size ( ) > 0 ) ?
TransformScore ( Scan < float > ( m_parameter - > GetParam ( " early-discarding-threshold " ) [ 0 ] ) )
: TransformScore ( DEFAULT_EARLY_DISCARDING_THRESHOLD ) ;
m_translationOptionThreshold = ( m_parameter - > GetParam ( " translation-option-threshold " ) . size ( ) > 0 ) ?
TransformScore ( Scan < float > ( m_parameter - > GetParam ( " translation-option-threshold " ) [ 0 ] ) )
: TransformScore ( DEFAULT_TRANSLATION_OPTION_THRESHOLD ) ;
2008-06-11 14:52:57 +04:00
m_maxNoTransOptPerCoverage = ( m_parameter - > GetParam ( " max-trans-opt-per-coverage " ) . size ( ) > 0 )
? Scan < size_t > ( m_parameter - > GetParam ( " max-trans-opt-per-coverage " ) [ 0 ] ) : DEFAULT_MAX_TRANS_OPT_SIZE ;
m_maxNoPartTransOpt = ( m_parameter - > GetParam ( " max-partial-trans-opt " ) . size ( ) > 0 )
? Scan < size_t > ( m_parameter - > GetParam ( " max-partial-trans-opt " ) [ 0 ] ) : DEFAULT_MAX_PART_TRANS_OPT_SIZE ;
m_maxPhraseLength = ( m_parameter - > GetParam ( " max-phrase-length " ) . size ( ) > 0 )
? Scan < size_t > ( m_parameter - > GetParam ( " max-phrase-length " ) [ 0 ] ) : DEFAULT_MAX_PHRASE_LENGTH ;
m_cubePruningPopLimit = ( m_parameter - > GetParam ( " cube-pruning-pop-limit " ) . size ( ) > 0 )
? Scan < size_t > ( m_parameter - > GetParam ( " cube-pruning-pop-limit " ) [ 0 ] ) : DEFAULT_CUBE_PRUNING_POP_LIMIT ;
m_cubePruningDiversity = ( m_parameter - > GetParam ( " cube-pruning-diversity " ) . size ( ) > 0 )
? Scan < size_t > ( m_parameter - > GetParam ( " cube-pruning-diversity " ) [ 0 ] ) : DEFAULT_CUBE_PRUNING_DIVERSITY ;
2009-05-26 23:30:35 +04:00
// unknown word processing
2008-06-11 14:52:57 +04:00
SetBooleanParameter ( & m_dropUnknown , " drop-unknown " , false ) ;
// minimum Bayes risk decoding
SetBooleanParameter ( & m_mbr , " minimum-bayes-risk " , false ) ;
2010-02-03 13:23:32 +03:00
m_mbrSize = ( m_parameter - > GetParam ( " mbr-size " ) . size ( ) > 0 ) ?
2008-06-11 14:52:57 +04:00
Scan < size_t > ( m_parameter - > GetParam ( " mbr-size " ) [ 0 ] ) : 200 ;
m_mbrScale = ( m_parameter - > GetParam ( " mbr-scale " ) . size ( ) > 0 ) ?
Scan < float > ( m_parameter - > GetParam ( " mbr-scale " ) [ 0 ] ) : 1.0f ;
2010-02-03 13:23:32 +03:00
//lattice mbr
SetBooleanParameter ( & m_useLatticeMBR , " lminimum-bayes-risk " , false ) ;
2010-04-12 13:51:29 +04:00
if ( m_useLatticeMBR & & m_mbr ) {
cerr < < " Errror: Cannot use both n-best mbr and lattice mbr together " < < endl ;
exit ( 1 ) ;
}
if ( m_useLatticeMBR ) m_mbr = true ;
2010-02-09 14:37:33 +03:00
m_lmbrPruning = ( m_parameter - > GetParam ( " lmbr-pruning-factor " ) . size ( ) > 0 ) ?
2010-02-03 13:23:32 +03:00
Scan < size_t > ( m_parameter - > GetParam ( " lmbr-pruning-factor " ) [ 0 ] ) : 30 ;
m_lmbrThetas = Scan < float > ( m_parameter - > GetParam ( " lmbr-thetas " ) ) ;
2010-02-03 14:20:20 +03:00
SetBooleanParameter ( & m_useLatticeHypSetForLatticeMBR , " lattice-hypo-set " , false ) ;
2010-02-03 22:46:35 +03:00
m_lmbrPrecision = ( m_parameter - > GetParam ( " lmbr-p " ) . size ( ) > 0 ) ?
Scan < float > ( m_parameter - > GetParam ( " lmbr-p " ) [ 0 ] ) : 0.8f ;
m_lmbrPRatio = ( m_parameter - > GetParam ( " lmbr-r " ) . size ( ) > 0 ) ?
Scan < float > ( m_parameter - > GetParam ( " lmbr-r " ) [ 0 ] ) : 0.6f ;
2010-03-14 23:23:17 +03:00
m_lmbrMapWeight = ( m_parameter - > GetParam ( " lmbr-map-weight " ) . size ( ) > 0 ) ?
Scan < float > ( m_parameter - > GetParam ( " lmbr-map-weight " ) [ 0 ] ) : 0.0f ;
2010-04-08 21:16:10 +04:00
2010-04-12 13:51:29 +04:00
//consensus decoding
SetBooleanParameter ( & m_useConsensusDecoding , " consensus-decoding " , false ) ;
if ( m_useConsensusDecoding & & m_mbr ) {
cerr < < " Error: Cannot use consensus decoding together with mbr " < < endl ;
exit ( 1 ) ;
}
if ( m_useConsensusDecoding ) m_mbr = true ;
2010-02-03 13:23:32 +03:00
2009-05-26 23:30:35 +04:00
m_timeout_threshold = ( m_parameter - > GetParam ( " time-out " ) . size ( ) > 0 ) ?
2008-06-11 14:52:57 +04:00
Scan < size_t > ( m_parameter - > GetParam ( " time-out " ) [ 0 ] ) : - 1 ;
2010-08-10 17:12:00 +04:00
m_timeout = ( GetTimeoutThreshold ( ) = = ( size_t ) - 1 ) ? false : true ;
2008-06-11 14:52:57 +04:00
2010-04-23 19:01:06 +04:00
m_lmcache_cleanup_threshold = ( m_parameter - > GetParam ( " clean-lm-cache " ) . size ( ) > 0 ) ?
Scan < size_t > ( m_parameter - > GetParam ( " clean-lm-cache " ) [ 0 ] ) : 1 ;
2008-06-19 03:14:09 +04:00
// Read in constraint decoding file, if provided
2010-01-29 20:11:17 +03:00
if ( m_parameter - > GetParam ( " constraint " ) . size ( ) ) {
if ( m_parameter - > GetParam ( " search-algorithm " ) . size ( ) > 0
2010-01-29 20:11:34 +03:00
& & Scan < size_t > ( m_parameter - > GetParam ( " search-algorithm " ) [ 0 ] ) ! = 0 ) {
cerr < < " Can use -constraint only with stack-based search (-search-algorithm 0) " < < endl ;
exit ( 1 ) ;
}
2008-06-19 03:14:09 +04:00
m_constraintFileName = m_parameter - > GetParam ( " constraint " ) [ 0 ] ;
2010-01-29 20:11:34 +03:00
InputFileStream constraintFile ( m_constraintFileName ) ;
2008-06-19 03:14:09 +04:00
2010-01-29 20:11:34 +03:00
std : : string line ;
2009-01-26 19:14:38 +03:00
2010-01-29 20:11:34 +03:00
long sentenceID = - 1 ;
while ( getline ( constraintFile , line ) )
{
vector < string > vecStr = Tokenize ( line , " \t " ) ;
if ( vecStr . size ( ) = = 1 ) {
sentenceID + + ;
Phrase phrase ( Output ) ;
phrase . CreateFromString ( GetOutputFactorOrder ( ) , vecStr [ 0 ] , GetFactorDelimiter ( ) ) ;
m_constraints . insert ( make_pair ( sentenceID , phrase ) ) ;
}
else if ( vecStr . size ( ) = = 2 ) {
sentenceID = Scan < long > ( vecStr [ 0 ] ) ;
Phrase phrase ( Output ) ;
phrase . CreateFromString ( GetOutputFactorOrder ( ) , vecStr [ 1 ] , GetFactorDelimiter ( ) ) ;
m_constraints . insert ( make_pair ( sentenceID , phrase ) ) ;
}
else {
assert ( false ) ;
}
2009-01-26 19:14:38 +03:00
}
2008-06-19 03:14:09 +04:00
}
2009-01-01 21:16:54 +03:00
// use of xml in input
2008-06-11 14:52:57 +04:00
if ( m_parameter - > GetParam ( " xml-input " ) . size ( ) = = 0 ) m_xmlInputType = XmlPassThrough ;
else if ( m_parameter - > GetParam ( " xml-input " ) [ 0 ] = = " exclusive " ) m_xmlInputType = XmlExclusive ;
else if ( m_parameter - > GetParam ( " xml-input " ) [ 0 ] = = " inclusive " ) m_xmlInputType = XmlInclusive ;
else if ( m_parameter - > GetParam ( " xml-input " ) [ 0 ] = = " ignore " ) m_xmlInputType = XmlIgnore ;
else if ( m_parameter - > GetParam ( " xml-input " ) [ 0 ] = = " pass-through " ) m_xmlInputType = XmlPassThrough ;
else {
UserMessage : : Add ( " invalid xml-input value, must be pass-through, exclusive, inclusive, or ignore " ) ;
return false ;
}
if ( ! LoadLexicalReorderingModel ( ) ) return false ;
if ( ! LoadLanguageModels ( ) ) return false ;
if ( ! LoadGenerationTables ( ) ) return false ;
if ( ! LoadPhraseTables ( ) ) return false ;
2009-05-26 23:30:35 +04:00
if ( ! LoadGlobalLexicalModel ( ) ) return false ;
2010-09-14 20:25:33 +04:00
if ( ! LoadDecodeGraphs ( ) ) return false ;
if ( ! LoadReferences ( ) ) return false ;
2010-10-15 19:19:17 +04:00
if ( ! LoadDiscrimLMFeature ( ) ) return false ;
2011-03-22 17:33:16 +03:00
if ( ! LoadPhrasePairFeature ( ) ) return false ;
2011-05-11 02:02:25 +04:00
if ( ! LoadPhraseBoundaryFeature ( ) ) return false ;
2011-08-06 18:10:43 +04:00
if ( ! LoadPhraseLengthFeature ( ) ) return false ;
2010-08-10 17:12:00 +04:00
2011-08-07 04:58:56 +04:00
// report individual sparse features in n-best list
if ( m_parameter - > GetParam ( " report-sparse-features " ) . size ( ) > 0 ) {
for ( size_t i = 0 ; i < m_parameter - > GetParam ( " report-sparse-features " ) . size ( ) ; i + + ) {
const std : : string & name = m_parameter - > GetParam ( " report-sparse-features " ) [ i ] ;
2011-08-07 05:57:41 +04:00
if ( m_targetBigramFeature & & name . compare ( m_targetBigramFeature - > GetScoreProducerWeightShortName ( ) ) = = 0 )
m_targetBigramFeature - > SetSparseFeatureReporting ( ) ;
if ( m_phrasePairFeature & & name . compare ( m_phrasePairFeature - > GetScoreProducerWeightShortName ( ) ) = = 0 )
m_phrasePairFeature - > SetSparseFeatureReporting ( ) ;
if ( m_phraseBoundaryFeature & & name . compare ( m_phraseBoundaryFeature - > GetScoreProducerWeightShortName ( ) ) = = 0 )
m_phraseBoundaryFeature - > SetSparseFeatureReporting ( ) ;
2011-08-07 04:58:56 +04:00
if ( m_phraseLengthFeature & & name . compare ( m_phraseLengthFeature - > GetScoreProducerWeightShortName ( ) ) = = 0 )
m_phraseLengthFeature - > SetSparseFeatureReporting ( ) ;
}
}
2010-08-10 17:12:00 +04:00
//configure the translation systems with these tables
vector < string > tsConfig = m_parameter - > GetParam ( " translation-systems " ) ;
if ( ! tsConfig . size ( ) ) {
//use all models in default system.
2010-09-07 14:54:04 +04:00
tsConfig . push_back ( TranslationSystem : : DEFAULT + " R * D * L * G * " ) ;
2010-08-10 17:12:00 +04:00
}
if ( m_wordPenaltyProducers . size ( ) ! = tsConfig . size ( ) ) {
UserMessage : : Add ( string ( " Mismatch between number of word penalties and number of translation systems " ) ) ;
return false ;
}
if ( m_searchAlgorithm = = ChartDecoding ) {
//insert some null distortion score producers
m_distortionScoreProducers . assign ( tsConfig . size ( ) , NULL ) ;
} else {
if ( m_distortionScoreProducers . size ( ) ! = tsConfig . size ( ) ) {
UserMessage : : Add ( string ( " Mismatch between number of distortion scores and number of translation systems " ) ) ;
return false ;
}
}
for ( size_t i = 0 ; i < tsConfig . size ( ) ; + + i ) {
vector < string > config = Tokenize ( tsConfig [ i ] ) ;
if ( config . size ( ) % 2 ! = 1 ) {
UserMessage : : Add ( string ( " Incorrect number of fields in Translation System config. Should be an odd number " ) ) ;
}
m_translationSystems . insert ( pair < string , TranslationSystem > ( config [ 0 ] ,
TranslationSystem ( config [ 0 ] , m_wordPenaltyProducers [ i ] , m_unknownWordPenaltyProducer , m_distortionScoreProducers [ i ] ) ) ) ;
for ( size_t j = 1 ; j < config . size ( ) ; j + = 2 ) {
const string & id = config [ j ] ;
const string & tables = config [ j + 1 ] ;
set < size_t > tableIds ;
if ( tables ! = " * " ) {
//selected tables
vector < string > tableIdStrings = Tokenize ( tables , " , " ) ;
vector < size_t > tableIdList ;
Scan < size_t > ( tableIdList , tableIdStrings ) ;
copy ( tableIdList . begin ( ) , tableIdList . end ( ) , inserter ( tableIds , tableIds . end ( ) ) ) ;
}
if ( id = = " D " ) {
for ( size_t k = 0 ; k < m_decodeGraphs . size ( ) ; + + k ) {
if ( ! tableIds . size ( ) | | tableIds . find ( k ) ! = tableIds . end ( ) ) {
VERBOSE ( 2 , " Adding decoder graph " < < k < < " to translation system " < < config [ 0 ] < < endl ) ;
m_translationSystems . find ( config [ 0 ] ) - > second . AddDecodeGraph ( m_decodeGraphs [ k ] ) ;
}
}
} else if ( id = = " R " ) {
for ( size_t k = 0 ; k < m_reorderModels . size ( ) ; + + k ) {
if ( ! tableIds . size ( ) | | tableIds . find ( k ) ! = tableIds . end ( ) ) {
m_translationSystems . find ( config [ 0 ] ) - > second . AddReorderModel ( m_reorderModels [ k ] ) ;
VERBOSE ( 2 , " Adding reorder table " < < k < < " to translation system " < < config [ 0 ] < < endl ) ;
}
}
} else if ( id = = " G " ) {
for ( size_t k = 0 ; k < m_globalLexicalModels . size ( ) ; + + k ) {
if ( ! tableIds . size ( ) | | tableIds . find ( k ) ! = tableIds . end ( ) ) {
m_translationSystems . find ( config [ 0 ] ) - > second . AddGlobalLexicalModel ( m_globalLexicalModels [ k ] ) ;
VERBOSE ( 2 , " Adding global lexical model " < < k < < " to translation system " < < config [ 0 ] < < endl ) ;
}
}
} else if ( id = = " L " ) {
size_t lmid = 0 ;
for ( LMList : : const_iterator k = m_languageModel . begin ( ) ; k ! = m_languageModel . end ( ) ; + + k , + + lmid ) {
if ( ! tableIds . size ( ) | | tableIds . find ( lmid ) ! = tableIds . end ( ) ) {
m_translationSystems . find ( config [ 0 ] ) - > second . AddLanguageModel ( * k ) ;
VERBOSE ( 2 , " Adding language model " < < lmid < < " to translation system " < < config [ 0 ] < < endl ) ;
}
}
} else {
UserMessage : : Add ( string ( " Incorrect translation system identifier: " ) + id ) ;
return false ;
}
}
//Instigate dictionary loading
m_translationSystems . find ( config [ 0 ] ) - > second . ConfigDictionaries ( ) ;
//Add any other features here.
2010-09-14 20:25:33 +04:00
if ( m_bleuScoreFeature ) {
m_translationSystems . find ( config [ 0 ] ) - > second . AddFeatureFunction ( m_bleuScoreFeature ) ;
}
2010-10-15 19:19:17 +04:00
if ( m_targetBigramFeature ) {
m_translationSystems . find ( config [ 0 ] ) - > second . AddFeatureFunction ( m_targetBigramFeature ) ;
}
2011-03-22 17:33:16 +03:00
if ( m_phrasePairFeature ) {
m_translationSystems . find ( config [ 0 ] ) - > second . AddFeatureFunction ( m_phrasePairFeature ) ;
}
2011-05-11 02:02:25 +04:00
if ( m_phraseBoundaryFeature ) {
m_translationSystems . find ( config [ 0 ] ) - > second . AddFeatureFunction ( m_phraseBoundaryFeature ) ;
}
2011-08-06 18:10:43 +04:00
if ( m_phraseLengthFeature ) {
m_translationSystems . find ( config [ 0 ] ) - > second . AddFeatureFunction ( m_phraseLengthFeature ) ;
}
2010-08-10 17:12:00 +04:00
}
2011-08-07 04:58:56 +04:00
2010-10-15 19:19:17 +04:00
//Load extra feature weights
//NB: These are common to all translation systems (at the moment!)
vector < string > extraWeightConfig = m_parameter - > GetParam ( " weight-file " ) ;
if ( extraWeightConfig . size ( ) )
{
if ( extraWeightConfig . size ( ) > 1 )
{
UserMessage : : Add ( " Only one argument should be supplied for weight-file " ) ;
return false ;
}
ScoreComponentCollection extraWeights ;
if ( ! extraWeights . Load ( extraWeightConfig [ 0 ] ) )
{
UserMessage : : Add ( " Unable to load weights from " + extraWeightConfig [ 0 ] ) ;
return false ;
}
m_allWeights . PlusEquals ( extraWeights ) ;
}
2008-06-11 14:52:57 +04:00
return true ;
}
void StaticData : : SetBooleanParameter ( bool * parameter , string parameterName , bool defaultValue )
{
// default value if nothing is specified
* parameter = defaultValue ;
if ( ! m_parameter - > isParamSpecified ( parameterName ) )
{
return ;
}
// if parameter is just specified as, e.g. "-parameter" set it true
if ( m_parameter - > GetParam ( parameterName ) . size ( ) = = 0 )
{
* parameter = true ;
}
// if paramter is specified "-parameter true" or "-parameter false"
else if ( m_parameter - > GetParam ( parameterName ) . size ( ) = = 1 )
{
* parameter = Scan < bool > ( m_parameter - > GetParam ( parameterName ) [ 0 ] ) ;
}
}
2010-10-07 02:06:49 +04:00
void StaticData : : SetWeight ( const ScoreProducer * sp , float weight )
{
m_allWeights . Assign ( sp , weight ) ;
}
void StaticData : : SetWeights ( const ScoreProducer * sp , const std : : vector < float > & weights )
{
m_allWeights . Assign ( sp , weights ) ;
}
2008-06-11 14:52:57 +04:00
/** Tear down the singleton: free all owned model/feature objects, the
 *  cached translation-option lists, and the phrase memory pool. */
StaticData::~StaticData()
{
  // owned model collections
  RemoveAllInColl(m_phraseDictionary);
  RemoveAllInColl(m_generationDictionary);
  RemoveAllInColl(m_reorderModels);
  RemoveAllInColl(m_globalLexicalModels);
  RemoveAllInColl(m_decodeGraphs);
  RemoveAllInColl(m_wordPenaltyProducers);
  RemoveAllInColl(m_distortionScoreProducers);
  m_languageModel.CleanUp();

  // delete cached translation-option lists
  map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*, clock_t> >::iterator iterCache;
  for (iterCache = m_transOptCache.begin(); iterCache != m_transOptCache.end(); ++iterCache) {
    TranslationOptionList *transOptList = iterCache->second.first;
    delete transOptList;
  }

  // small score producers
  delete m_unknownWordPenaltyProducer;
  delete m_targetBigramFeature;
  delete m_phrasePairFeature;
  delete m_phraseBoundaryFeature;
  delete m_phraseLengthFeature;

  //delete m_parameter;

  // memory pools
  Phrase::FinalizeMemPool();
}
bool StaticData : : LoadLexicalReorderingModel ( )
{
2010-01-28 15:12:57 +03:00
VERBOSE ( 1 , " Loading lexical distortion models... " ) ;
const vector < string > fileStr = m_parameter - > GetParam ( " distortion-file " ) ;
2010-08-10 17:12:00 +04:00
bool hasWeightlr = ( m_parameter - > GetParam ( " weight-lr " ) . size ( ) ! = 0 ) ;
vector < string > weightsStr ;
if ( hasWeightlr ) {
weightsStr = m_parameter - > GetParam ( " weight-lr " ) ;
} else {
weightsStr = m_parameter - > GetParam ( " weight-d " ) ;
}
2010-01-28 15:12:57 +03:00
std : : vector < float > weights ;
size_t w = 1 ; //cur weight
2010-08-10 17:12:00 +04:00
if ( hasWeightlr ) {
w = 0 ; // if reading from weight-lr, don't have to count first as distortion penalty
}
2010-01-28 15:12:57 +03:00
size_t f = 0 ; //cur file
//get weights values
VERBOSE ( 1 , " have " < < fileStr . size ( ) < < " models " < < std : : endl ) ;
for ( size_t j = 0 ; j < weightsStr . size ( ) ; + + j ) {
weights . push_back ( Scan < float > ( weightsStr [ j ] ) ) ;
2008-06-11 14:52:57 +04:00
}
2010-01-28 15:12:57 +03:00
//load all models
for ( size_t i = 0 ; i < fileStr . size ( ) ; + + i )
{
vector < string > spec = Tokenize < string > ( fileStr [ f ] , " " ) ;
+ + f ; //mark file as consumed
if ( spec . size ( ) ! = 4 ) {
UserMessage : : Add ( " Invalid Lexical Reordering Model Specification: " + fileStr [ f ] ) ;
return false ;
}
// spec[0] = factor map
// spec[1] = name
// spec[2] = num weights
// spec[3] = fileName
// decode factor map
vector < FactorType > input , output ;
vector < string > inputfactors = Tokenize ( spec [ 0 ] , " - " ) ;
if ( inputfactors . size ( ) = = 2 ) {
input = Tokenize < FactorType > ( inputfactors [ 0 ] , " , " ) ;
output = Tokenize < FactorType > ( inputfactors [ 1 ] , " , " ) ;
} else if ( inputfactors . size ( ) = = 1 ) {
//if there is only one side assume it is on e side... why?
output = Tokenize < FactorType > ( inputfactors [ 0 ] , " , " ) ;
} else {
//format error
return false ;
}
string modelType = spec [ 1 ] ;
// decode num weights and fetch weights from array
std : : vector < float > mweights ;
size_t numWeights = atoi ( spec [ 2 ] . c_str ( ) ) ;
for ( size_t k = 0 ; k < numWeights ; + + k , + + w )
{
if ( w > = weights . size ( ) ) {
UserMessage : : Add ( " Lexicalized distortion model: Not enough weights, add to [weight-d] " ) ;
return false ;
} else
mweights . push_back ( weights [ w ] ) ;
}
string filePath = spec [ 3 ] ;
m_reorderModels . push_back ( new LexicalReordering ( input , output , modelType , filePath , mweights ) ) ;
2008-06-11 14:52:57 +04:00
}
2010-01-28 15:12:57 +03:00
return true ;
2008-06-11 14:52:57 +04:00
}
2010-01-28 15:12:57 +03:00
2009-05-26 23:30:35 +04:00
bool StaticData : : LoadGlobalLexicalModel ( )
{
const vector < float > & weight = Scan < float > ( m_parameter - > GetParam ( " weight-lex " ) ) ;
const vector < string > & file = m_parameter - > GetParam ( " global-lexical-file " ) ;
if ( weight . size ( ) ! = file . size ( ) )
{
std : : cerr < < " number of weights and models for the global lexical model does not match ( "
< < weight . size ( ) < < " != " < < file . size ( ) < < " ) " < < std : : endl ;
return false ;
}
for ( size_t i = 0 ; i < weight . size ( ) ; i + + )
{
vector < string > spec = Tokenize < string > ( file [ i ] , " " ) ;
if ( spec . size ( ) ! = 2 )
{
std : : cerr < < " wrong global lexical model specification: " < < file [ i ] < < endl ;
return false ;
}
vector < string > factors = Tokenize ( spec [ 0 ] , " - " ) ;
if ( factors . size ( ) ! = 2 )
{
std : : cerr < < " wrong factor definition for global lexical model: " < < spec [ 0 ] < < endl ;
return false ;
}
vector < FactorType > inputFactors = Tokenize < FactorType > ( factors [ 0 ] , " , " ) ;
vector < FactorType > outputFactors = Tokenize < FactorType > ( factors [ 1 ] , " , " ) ;
2010-10-07 02:06:49 +04:00
m_globalLexicalModels . push_back ( new GlobalLexicalModel ( spec [ 1 ] , inputFactors , outputFactors ) ) ;
SetWeight ( m_globalLexicalModels . back ( ) , weight [ i ] ) ;
2009-05-26 23:30:35 +04:00
}
return true ;
}
2008-06-11 14:52:57 +04:00
// Creates every language model listed in [lmodel-file], sharing already-loaded
// models by file spec, registers each with m_languageModel and assigns its
// weight from [weight-l]. Returns false when a spec is malformed or the LM
// implementation is unavailable. Sets m_fLMsLoaded on success (phrase table
// loading requires LMs to be present first).
bool StaticData : : LoadLanguageModels ( )
{
if ( m_parameter - > GetParam ( " lmodel-file " ) . size ( ) > 0 )
{
// weights
vector < float > weightAll = Scan < float > ( m_parameter - > GetParam ( " weight-l " ) ) ;
// dictionary upper-bounds for all IRST LMs; default to 0 (unlimited) per LM
// when [lmodel-dub] is absent
vector < int > LMdub = Scan < int > ( m_parameter - > GetParam ( " lmodel-dub " ) ) ;
if ( m_parameter - > GetParam ( " lmodel-dub " ) . size ( ) = = 0 ) {
for ( size_t i = 0 ; i < m_parameter - > GetParam ( " lmodel-file " ) . size ( ) ; i + + )
LMdub . push_back ( 0 ) ;
}
// initialize n-gram order for each factor. populated only by factored lm
const vector < string > & lmVector = m_parameter - > GetParam ( " lmodel-file " ) ;
2010-08-10 17:12:00 +04:00
//prevent language models from being loaded twice
// keyed by the full config line, so two identical specs share one model
map < string , LanguageModel * > languageModelsLoaded ;
2008-06-11 14:52:57 +04:00
for ( size_t i = 0 ; i < lmVector . size ( ) ; i + + )
{
2010-08-10 17:12:00 +04:00
LanguageModel * lm = NULL ;
if ( languageModelsLoaded . find ( lmVector [ i ] ) ! = languageModelsLoaded . end ( ) ) {
2011-01-05 16:49:44 +03:00
// spec seen before: make a lightweight copy that shares the loaded data
// (presumably LanguageModel's copy constructor shares the underlying
// implementation -- TODO confirm against LanguageModel.h)
lm = new LanguageModel (
( languageModelsLoaded [ lmVector [ i ] ] ) ) ;
2010-08-10 17:12:00 +04:00
} else {
// first time this spec appears: parse and load it from scratch
vector < string > token = Tokenize ( lmVector [ i ] ) ;
if ( token . size ( ) ! = 4 & & token . size ( ) ! = 5 )
{
UserMessage : : Add ( " Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]' " ) ;
return false ;
}
// type = implementation, SRI, IRST etc
LMImplementation lmImplementation = static_cast < LMImplementation > ( Scan < int > ( token [ 0 ] ) ) ;
// factorType = 0 = Surface, 1 = POS, 2 = Stem, 3 = Morphology, etc
vector < FactorType > factorTypes = Tokenize < FactorType > ( token [ 1 ] , " , " ) ;
// nGramOrder = 2 = bigram, 3 = trigram, etc
size_t nGramOrder = Scan < int > ( token [ 2 ] ) ;
string & languageModelFile = token [ 3 ] ;
// optional 5th field: IRSTLM map file, appended to the path (IRST only)
if ( token . size ( ) = = 5 ) {
if ( lmImplementation = = IRST )
languageModelFile + = " " + token [ 4 ] ;
else {
UserMessage : : Add ( " Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]' " ) ;
return false ;
}
}
IFVERBOSE ( 1 )
PrintUserTime ( string ( " Start loading LanguageModel " ) + languageModelFile ) ;
lm = LanguageModelFactory : : CreateLanguageModel (
lmImplementation
, factorTypes
, nGramOrder
, languageModelFile
, LMdub [ i ] ) ;
if ( lm = = NULL )
{
// factory returns NULL when this implementation wasn't compiled in
UserMessage : : Add ( " no LM created. We probably don't have it compiled " ) ;
return false ;
}
// remember the loaded model so later identical specs can share it
languageModelsLoaded [ lmVector [ i ] ] = lm ;
2008-06-11 14:52:57 +04:00
}
2010-04-08 21:16:10 +04:00
m_languageModel . Add ( lm ) ;
2010-10-07 02:06:49 +04:00
SetWeight ( lm , weightAll [ i ] ) ;
2008-06-11 14:52:57 +04:00
}
}
// flag indicating that language models were loaded,
// since phrase table loading requires their presence
m_fLMsLoaded = true ;
IFVERBOSE ( 1 )
PrintUserTime ( " Finished loading LanguageModels " ) ;
return true ;
}
bool StaticData : : LoadGenerationTables ( )
{
if ( m_parameter - > GetParam ( " generation-file " ) . size ( ) > 0 )
{
const vector < string > & generationVector = m_parameter - > GetParam ( " generation-file " ) ;
const vector < float > & weight = Scan < float > ( m_parameter - > GetParam ( " weight-generation " ) ) ;
IFVERBOSE ( 1 )
{
TRACE_ERR ( " weight-generation: " ) ;
for ( size_t i = 0 ; i < weight . size ( ) ; i + + )
{
TRACE_ERR ( weight [ i ] < < " \t " ) ;
}
TRACE_ERR ( endl ) ;
}
size_t currWeightNum = 0 ;
for ( size_t currDict = 0 ; currDict < generationVector . size ( ) ; currDict + + )
{
vector < string > token = Tokenize ( generationVector [ currDict ] ) ;
vector < FactorType > input = Tokenize < FactorType > ( token [ 0 ] , " , " )
, output = Tokenize < FactorType > ( token [ 1 ] , " , " ) ;
m_maxFactorIdx [ 1 ] = CalcMax ( m_maxFactorIdx [ 1 ] , input , output ) ;
string filePath ;
size_t numFeatures ;
numFeatures = Scan < size_t > ( token [ 2 ] ) ;
filePath = token [ 3 ] ;
if ( ! FileExists ( filePath ) & & FileExists ( filePath + " .gz " ) ) {
filePath + = " .gz " ;
}
VERBOSE ( 1 , filePath < < endl ) ;
2010-10-07 02:06:49 +04:00
m_generationDictionary . push_back ( new GenerationDictionary ( numFeatures , input , output ) ) ;
2008-06-11 14:52:57 +04:00
assert ( m_generationDictionary . back ( ) & & " could not create GenerationDictionary " ) ;
2010-08-10 17:12:00 +04:00
if ( ! m_generationDictionary . back ( ) - > Load ( filePath , Output ) )
2008-06-11 14:52:57 +04:00
{
delete m_generationDictionary . back ( ) ;
return false ;
}
2010-10-07 02:06:49 +04:00
vector < float > gdWeights ;
2008-06-11 14:52:57 +04:00
for ( size_t i = 0 ; i < numFeatures ; i + + ) {
assert ( currWeightNum < weight . size ( ) ) ;
2010-10-07 02:06:49 +04:00
gdWeights . push_back ( weight [ currWeightNum + + ] ) ;
2008-06-11 14:52:57 +04:00
}
2010-10-07 02:06:49 +04:00
SetWeights ( m_generationDictionary . back ( ) , gdWeights ) ;
2008-06-11 14:52:57 +04:00
}
if ( currWeightNum ! = weight . size ( ) ) {
TRACE_ERR ( " [WARNING] config file has " < < weight . size ( ) < < " generation weights listed, but the configuration for generation files indicates there should be " < < currWeightNum < < " ! \n " ) ;
}
}
return true ;
}
2010-08-10 17:12:00 +04:00
/* Doesn't load phrase tables any more. Just creates the features. */
2008-06-11 14:52:57 +04:00
// Creates one PhraseDictionaryFeature per [ttable-file] entry (the tables
// themselves may be loaded lazily, per the comment above). Handles the legacy
// 4-field format, the per-table ttable-limit, and -- for the first table under
// confusion-network / word-lattice input -- the extra input scores from
// [weight-i]. Returns false on any malformed specification or weight mismatch.
bool StaticData : : LoadPhraseTables ( )
{
2010-08-10 17:12:00 +04:00
VERBOSE ( 2 , " Creating phrase table features " < < endl ) ;
2008-06-11 14:52:57 +04:00
// language models must be loaded prior to loading phrase tables
assert ( m_fLMsLoaded ) ;
// load phrase translation tables
if ( m_parameter - > GetParam ( " ttable-file " ) . size ( ) > 0 )
{
// weights
vector < float > weightAll = Scan < float > ( m_parameter - > GetParam ( " weight-t " ) ) ;
const vector < string > & translationVector = m_parameter - > GetParam ( " ttable-file " ) ;
vector < size_t > maxTargetPhrase = Scan < size_t > ( m_parameter - > GetParam ( " ttable-limit " ) ) ;
2008-12-13 15:08:55 +03:00
2010-04-22 12:42:32 +04:00
// a single ttable-limit is broadcast to every table; otherwise there must
// be at least one limit per table
if ( maxTargetPhrase . size ( ) = = 1 & & translationVector . size ( ) > 1 ) {
VERBOSE ( 1 , " Using uniform ttable-limit of " < < maxTargetPhrase [ 0 ] < < " for all translation tables. " < < endl ) ;
for ( size_t i = 1 ; i < translationVector . size ( ) ; i + + )
maxTargetPhrase . push_back ( maxTargetPhrase [ 0 ] ) ;
} else if ( maxTargetPhrase . size ( ) ! = 1 & & maxTargetPhrase . size ( ) < translationVector . size ( ) ) {
stringstream strme ;
strme < < " You specified " < < translationVector . size ( ) < < " translation tables, but only " < < maxTargetPhrase . size ( ) < < " ttable-limits. " ;
UserMessage : : Add ( strme . str ( ) ) ;
return false ;
}
2008-06-11 14:52:57 +04:00
size_t index = 0 ;
size_t weightAllOffset = 0 ; // next unconsumed position in weightAll
2010-05-13 18:12:05 +04:00
bool oldFileFormat = false ;
2008-06-11 14:52:57 +04:00
for ( size_t currDict = 0 ; currDict < translationVector . size ( ) ; currDict + + )
{
vector < string > token = Tokenize ( translationVector [ currDict ] ) ;
2010-05-13 18:12:05 +04:00
// legacy 4-field lines lack the leading implementation-type field;
// detected only on the first table and then assumed for all
if ( currDict = = 0 & & token . size ( ) = = 4 )
{
VERBOSE ( 1 , " Warning: Phrase table specification in old 4-field format. Assuming binary phrase tables (type 1)! " < < endl ) ;
oldFileFormat = true ;
}
2010-08-10 17:12:00 +04:00
if ( ( ! oldFileFormat & & token . size ( ) < 5 ) | | ( oldFileFormat & & token . size ( ) ! = 4 ) )
2010-05-13 18:12:05 +04:00
{
UserMessage : : Add ( " invalid phrase table specification " ) ;
return false ;
}
PhraseTableImplementation implementation = ( PhraseTableImplementation ) Scan < int > ( token [ 0 ] ) ;
// shift the 4 legacy fields right by one and synthesize the type field,
// so the rest of the loop can index the new 5-field layout uniformly
if ( oldFileFormat )
{
token . push_back ( token [ 3 ] ) ;
token [ 3 ] = token [ 2 ] ;
token [ 2 ] = token [ 1 ] ;
token [ 1 ] = token [ 0 ] ;
token [ 0 ] = " 1 " ;
implementation = Binary ;
} else
implementation = ( PhraseTableImplementation ) Scan < int > ( token [ 0 ] ) ;
2010-04-09 00:50:22 +04:00
assert ( token . size ( ) > = 5 ) ;
2008-06-11 14:52:57 +04:00
//characteristics of the phrase table
2010-04-08 21:16:10 +04:00
vector < FactorType > input = Tokenize < FactorType > ( token [ 1 ] , " , " )
, output = Tokenize < FactorType > ( token [ 2 ] , " , " ) ;
2008-06-11 14:52:57 +04:00
m_maxFactorIdx [ 0 ] = CalcMax ( m_maxFactorIdx [ 0 ] , input ) ;
m_maxFactorIdx [ 1 ] = CalcMax ( m_maxFactorIdx [ 1 ] , output ) ;
m_maxNumFactors = std : : max ( m_maxFactorIdx [ 0 ] , m_maxFactorIdx [ 1 ] ) + 1 ;
2010-04-08 21:16:10 +04:00
size_t numScoreComponent = Scan < size_t > ( token [ 3 ] ) ;
string filePath = token [ 4 ] ;
2008-06-11 14:52:57 +04:00
assert ( weightAll . size ( ) > = weightAllOffset + numScoreComponent ) ;
// weights for this phrase dictionary
// first InputScores (if any), then translation scores
vector < float > weight ;
2010-04-08 21:16:10 +04:00
if ( currDict = = 0 & & ( m_inputType = = ConfusionNetworkInput | | m_inputType = = WordLatticeInput ) )
2008-06-11 14:52:57 +04:00
{ // TODO. find what the assumptions made by confusion network about phrase table output which makes
// it only work with binrary file. This is a hack
2009-02-05 20:37:09 +03:00
2008-06-11 14:52:57 +04:00
m_numInputScores = m_parameter - > GetParam ( " weight-i " ) . size ( ) ;
for ( unsigned k = 0 ; k < m_numInputScores ; + + k )
2009-02-05 20:37:09 +03:00
weight . push_back ( Scan < float > ( m_parameter - > GetParam ( " weight-i " ) [ k ] ) ) ;
if ( m_parameter - > GetParam ( " link-param-count " ) . size ( ) )
m_numLinkParams = Scan < size_t > ( m_parameter - > GetParam ( " link-param-count " ) [ 0 ] ) ;
//print some info about this interaction:
// link params may be one fewer than input scores; anything else is an error
if ( m_numLinkParams = = m_numInputScores ) {
VERBOSE ( 1 , " specified equal numbers of link parameters and insertion weights, not using non-epsilon 'real' word link count. \n " ) ;
} else if ( ( m_numLinkParams + 1 ) = = m_numInputScores ) {
VERBOSE ( 1 , " WARN: " < < m_numInputScores < < " insertion weights found and only " < < m_numLinkParams < < " link parameters specified, applying non-epsilon 'real' word link count for last feature weight. \n " ) ;
} else {
stringstream strme ;
strme < < " You specified " < < m_numInputScores
< < " input weights (weight-i), but you specified " < < m_numLinkParams < < " link parameters (link-param-count)! " ;
UserMessage : : Add ( strme . str ( ) ) ;
return false ;
}
2008-06-11 14:52:57 +04:00
}
2009-02-05 20:37:09 +03:00
// sentence input (m_inputType == 0) carries no input scores at all
if ( ! m_inputType ) {
2008-06-11 14:52:57 +04:00
m_numInputScores = 0 ;
}
2009-02-05 20:37:09 +03:00
//this number changes depending on what phrase table we're talking about: only 0 has the weights on it
size_t tableInputScores = ( currDict = = 0 ? m_numInputScores : 0 ) ;
2008-06-11 14:52:57 +04:00
for ( size_t currScore = 0 ; currScore < numScoreComponent ; currScore + + )
weight . push_back ( weightAll [ weightAllOffset + currScore ] ) ;
2009-02-05 20:37:09 +03:00
if ( weight . size ( ) - tableInputScores ! = numScoreComponent )
2008-06-11 14:52:57 +04:00
{
stringstream strme ;
strme < < " Your phrase table has " < < numScoreComponent
2009-02-05 20:37:09 +03:00
< < " scores, but you specified " < < ( weight . size ( ) - tableInputScores ) < < " weights! " ;
2009-02-05 20:37:09 +03:00
2008-06-11 14:52:57 +04:00
UserMessage : : Add ( strme . str ( ) ) ;
return false ;
}
weightAllOffset + = numScoreComponent ;
2009-02-05 20:37:09 +03:00
// from here on numScoreComponent counts input scores too
numScoreComponent + = tableInputScores ;
2010-04-08 21:57:38 +04:00
// NOTE(review): token[5]/token[6] are read without a size check -- a
// SuffixArray line with fewer than 7 fields would read out of bounds
string targetPath , alignmentsFile ;
if ( implementation = = SuffixArray )
{
targetPath = token [ 5 ] ;
alignmentsFile = token [ 6 ] ;
}
2009-02-05 20:37:09 +03:00
2008-06-11 14:52:57 +04:00
assert ( numScoreComponent = = weight . size ( ) ) ;
2010-08-10 17:12:00 +04:00
//This is needed for regression testing, but the phrase table
//might not really be loading here
2008-06-11 14:52:57 +04:00
IFVERBOSE ( 1 )
PrintUserTime ( string ( " Start loading PhraseTable " ) + filePath ) ;
2010-04-09 00:50:22 +04:00
VERBOSE ( 1 , " filePath: " < < filePath < < endl ) ;
2010-10-07 02:06:49 +04:00
PhraseDictionaryFeature * pdf = new PhraseDictionaryFeature (
implementation
, numScoreComponent
, ( currDict = = 0 ? m_numInputScores : 0 )
, input
, output
, filePath
, weight
, maxTargetPhrase [ index ]
, targetPath , alignmentsFile ) ;
2010-04-08 21:16:10 +04:00
2010-10-07 02:06:49 +04:00
m_phraseDictionary . push_back ( pdf ) ;
SetWeights ( m_phraseDictionary . back ( ) , weight ) ;
2010-04-08 21:16:10 +04:00
2008-06-11 14:52:57 +04:00
index + + ;
}
}
IFVERBOSE ( 1 )
PrintUserTime ( " Finished loading phrase tables " ) ;
return true ;
}
2010-04-08 21:16:10 +04:00
void StaticData : : LoadNonTerminals ( )
{
string defaultNonTerminals ;
if ( m_parameter - > GetParam ( " non-terminals " ) . size ( ) = = 0 )
{
defaultNonTerminals = " X " ;
}
else
{
vector < std : : string > tokens = Tokenize ( m_parameter - > GetParam ( " non-terminals " ) [ 0 ] ) ;
defaultNonTerminals = tokens [ 0 ] ;
}
FactorCollection & factorCollection = FactorCollection : : Instance ( ) ;
m_inputDefaultNonTerminal . SetIsNonTerminal ( true ) ;
const Factor * sourceFactor = factorCollection . AddFactor ( Input , 0 , defaultNonTerminals ) ;
m_inputDefaultNonTerminal . SetFactor ( 0 , sourceFactor ) ;
m_outputDefaultNonTerminal . SetIsNonTerminal ( true ) ;
const Factor * targetFactor = factorCollection . AddFactor ( Output , 0 , defaultNonTerminals ) ;
m_outputDefaultNonTerminal . SetFactor ( 0 , targetFactor ) ;
// for unknwon words
if ( m_parameter - > GetParam ( " unknown-lhs " ) . size ( ) = = 0 )
{
UnknownLHSEntry entry ( defaultNonTerminals , 0.0f ) ;
m_unknownLHS . push_back ( entry ) ;
}
else
{
const string & filePath = m_parameter - > GetParam ( " unknown-lhs " ) [ 0 ] ;
InputFileStream inStream ( filePath ) ;
string line ;
while ( getline ( inStream , line ) )
{
vector < string > tokens = Tokenize ( line ) ;
assert ( tokens . size ( ) = = 2 ) ;
UnknownLHSEntry entry ( tokens [ 0 ] , Scan < float > ( tokens [ 1 ] ) ) ;
m_unknownLHS . push_back ( entry ) ;
}
}
}
void StaticData : : LoadChartDecodingParameters ( )
{
LoadNonTerminals ( ) ;
// source label overlap
if ( m_parameter - > GetParam ( " source-label-overlap " ) . size ( ) > 0 )
{
m_sourceLabelOverlap = ( SourceLabelOverlap ) Scan < int > ( m_parameter - > GetParam ( " source-label-overlap " ) [ 0 ] ) ;
}
else
{
m_sourceLabelOverlap = SourceLabelOverlapAdd ;
}
m_ruleLimit = ( m_parameter - > GetParam ( " rule-limit " ) . size ( ) > 0 )
? Scan < size_t > ( m_parameter - > GetParam ( " rule-limit " ) [ 0 ] ) : DEFAULT_MAX_TRANS_OPT_SIZE ;
}
void StaticData : : LoadPhraseBasedParameters ( )
{
const vector < string > distortionWeights = m_parameter - > GetParam ( " weight-d " ) ;
2010-08-10 17:12:00 +04:00
size_t distortionWeightCount = distortionWeights . size ( ) ;
//if there's a lex-reordering model, and no separate weight set, then
//take just one of these weights for linear distortion
if ( ! m_parameter - > GetParam ( " weight-lr " ) . size ( ) & & m_parameter - > GetParam ( " distortion-file " ) . size ( ) ) {
distortionWeightCount = 1 ;
}
for ( size_t i = 0 ; i < distortionWeightCount ; + + i ) {
float weightDistortion = Scan < float > ( distortionWeights [ i ] ) ;
2010-10-07 02:06:49 +04:00
m_distortionScoreProducers . push_back ( new DistortionScoreProducer ( ) ) ;
SetWeight ( m_distortionScoreProducers . back ( ) , weightDistortion ) ;
2010-08-10 17:12:00 +04:00
}
2010-04-08 21:16:10 +04:00
}
2010-08-10 17:12:00 +04:00
// Builds the decode graphs from the [mapping] parameter. Each mapping line is
// either "T|G <index>" (graph 0 implied) or "<graph> T|G <index>", where T/G
// selects a translation or generation step over the dictionary with the given
// index. Steps sharing a graph index are chained via `prev`; graph indices
// must be contiguous and non-decreasing. Chart decoding graphs additionally
// get a max-chart-span from [max-chart-span]. Malformed input aborts via
// assert/UserMessage. Always returns true when it returns at all.
bool StaticData : : LoadDecodeGraphs ( ) {
2008-06-11 14:52:57 +04:00
const vector < string > & mappingVector = m_parameter - > GetParam ( " mapping " ) ;
2010-04-08 21:16:10 +04:00
const vector < size_t > & maxChartSpans = Scan < size_t > ( m_parameter - > GetParam ( " max-chart-span " ) ) ;
2008-06-11 14:52:57 +04:00
// previous step in the current graph; reset to NULL on each new graph
DecodeStep * prev = 0 ;
2010-04-08 21:16:10 +04:00
size_t prevDecodeGraphInd = 0 ;
2008-06-11 14:52:57 +04:00
for ( size_t i = 0 ; i < mappingVector . size ( ) ; i + + )
{
vector < string > token = Tokenize ( mappingVector [ i ] ) ;
2010-04-08 21:16:10 +04:00
size_t decodeGraphInd ;
2008-06-11 14:52:57 +04:00
DecodeType decodeType ;
size_t index ;
if ( token . size ( ) = = 2 )
{
2010-04-08 21:16:10 +04:00
// two-field form: implicit graph 0
decodeGraphInd = 0 ;
2008-06-11 14:52:57 +04:00
decodeType = token [ 0 ] = = " T " ? Translate : Generate ;
index = Scan < size_t > ( token [ 1 ] ) ;
}
else if ( token . size ( ) = = 3 )
2010-04-08 21:16:10 +04:00
{ // For specifying multiple translation model
decodeGraphInd = Scan < size_t > ( token [ 0 ] ) ;
2008-06-11 14:52:57 +04:00
//the vectorList index can only increment by one
2010-04-08 21:16:10 +04:00
assert ( decodeGraphInd = = prevDecodeGraphInd | | decodeGraphInd = = prevDecodeGraphInd + 1 ) ;
if ( decodeGraphInd > prevDecodeGraphInd )
2008-06-11 14:52:57 +04:00
{
// starting a new graph: the first step there has no predecessor
prev = NULL ;
}
decodeType = token [ 1 ] = = " T " ? Translate : Generate ;
index = Scan < size_t > ( token [ 2 ] ) ;
}
else
{
UserMessage : : Add ( " Malformed mapping! " ) ;
2009-08-07 20:47:54 +04:00
assert ( false ) ;
2008-06-11 14:52:57 +04:00
}
2010-04-08 21:16:10 +04:00
DecodeStep * decodeStep = NULL ;
2008-06-11 14:52:57 +04:00
switch ( decodeType ) {
case Translate :
if ( index > = m_phraseDictionary . size ( ) )
{
stringstream strme ;
strme < < " No phrase dictionary with index "
< < index < < " available! " ;
UserMessage : : Add ( strme . str ( ) ) ;
2009-08-07 20:47:54 +04:00
assert ( false ) ;
2008-06-11 14:52:57 +04:00
}
2010-08-10 17:12:00 +04:00
decodeStep = new DecodeStepTranslation ( m_phraseDictionary [ index ] , prev ) ;
2008-06-11 14:52:57 +04:00
break ;
case Generate :
if ( index > = m_generationDictionary . size ( ) )
{
stringstream strme ;
strme < < " No generation dictionary with index "
< < index < < " available! " ;
UserMessage : : Add ( strme . str ( ) ) ;
2009-08-07 20:47:54 +04:00
assert ( false ) ;
2008-06-11 14:52:57 +04:00
}
decodeStep = new DecodeStepGeneration ( m_generationDictionary [ index ] , prev ) ;
break ;
case InsertNullFertilityWord :
assert ( ! " Please implement NullFertilityInsertion. " ) ;
break ;
}
2010-04-08 21:16:10 +04:00
2008-06-11 14:52:57 +04:00
assert ( decodeStep ) ;
2010-08-10 17:12:00 +04:00
// first step of a graph we haven't created yet: allocate the graph
if ( m_decodeGraphs . size ( ) < decodeGraphInd + 1 )
2008-06-11 14:52:57 +04:00
{
2010-04-08 21:16:10 +04:00
DecodeGraph * decodeGraph ;
if ( m_searchAlgorithm = = ChartDecoding )
{
// chart graphs carry a span limit (default DEFAULT_MAX_CHART_SPAN)
size_t maxChartSpan = ( decodeGraphInd < maxChartSpans . size ( ) ) ? maxChartSpans [ decodeGraphInd ] : DEFAULT_MAX_CHART_SPAN ;
2010-08-10 17:12:00 +04:00
decodeGraph = new DecodeGraph ( m_decodeGraphs . size ( ) , maxChartSpan ) ;
2010-04-08 21:16:10 +04:00
}
else
{
2010-08-10 17:12:00 +04:00
decodeGraph = new DecodeGraph ( m_decodeGraphs . size ( ) ) ;
2010-04-08 21:16:10 +04:00
}
2010-08-10 17:12:00 +04:00
m_decodeGraphs . push_back ( decodeGraph ) ; // TODO max chart span
2008-06-11 14:52:57 +04:00
}
2010-04-08 21:16:10 +04:00
2010-08-10 17:12:00 +04:00
m_decodeGraphs [ decodeGraphInd ] - > Add ( decodeStep ) ;
2008-06-11 14:52:57 +04:00
prev = decodeStep ;
2010-04-08 21:16:10 +04:00
prevDecodeGraphInd = decodeGraphInd ;
2008-06-11 14:52:57 +04:00
}
2010-08-10 17:12:00 +04:00
return true ;
2008-06-11 14:52:57 +04:00
}
2010-09-14 20:25:33 +04:00
bool StaticData : : LoadReferences ( ) {
vector < string > bleuWeightStr = m_parameter - > GetParam ( " weight-b " ) ;
vector < string > referenceFiles = m_parameter - > GetParam ( " references " ) ;
if ( ( ! referenceFiles . size ( ) & & bleuWeightStr . size ( ) ) | | ( referenceFiles . size ( ) & & ! bleuWeightStr . size ( ) ) ) {
UserMessage : : Add ( " You cannot use the bleu feature without references, and vice-versa " ) ;
return false ;
}
2010-09-14 20:55:33 +04:00
if ( ! referenceFiles . size ( ) ) {
return true ;
}
2010-09-14 20:25:33 +04:00
if ( bleuWeightStr . size ( ) > 1 ) {
UserMessage : : Add ( " Can only specify one weight for the bleu feature " ) ;
return false ;
}
float bleuWeight = Scan < float > ( bleuWeightStr [ 0 ] ) ;
m_bleuScoreFeature = new BleuScoreFeature ( ) ;
2010-10-07 02:06:49 +04:00
SetWeight ( m_bleuScoreFeature , bleuWeight ) ;
2010-09-14 20:25:33 +04:00
vector < vector < string > > references ( referenceFiles . size ( ) ) ;
for ( size_t i = 0 ; i < referenceFiles . size ( ) ; + + i ) {
ifstream in ( referenceFiles [ i ] . c_str ( ) ) ;
if ( ! in ) {
stringstream strme ;
strme < < " Unable to load references from " < < referenceFiles [ i ] ;
UserMessage : : Add ( strme . str ( ) ) ;
return false ;
}
string line ;
while ( getline ( in , line ) ) {
references . back ( ) . push_back ( line ) ;
}
if ( i > 0 ) {
if ( references [ i ] . size ( ) ! = references [ i - 1 ] . size ( ) ) {
UserMessage : : Add ( " Reference files are of different lengths " ) ;
return false ;
}
}
in . close ( ) ;
}
2010-09-16 16:49:57 +04:00
//Set the references in the bleu feature
m_bleuScoreFeature - > LoadReferences ( references ) ;
m_bleuScoreFeature - > SetCurrentReference ( 0 ) ; //TODO: Temporary, for testing
2010-09-14 20:25:33 +04:00
return true ;
}
2010-10-15 19:19:17 +04:00
bool StaticData : : LoadDiscrimLMFeature ( )
2010-09-17 17:36:03 +04:00
{
2011-08-06 18:10:43 +04:00
// only load if specified
2010-10-15 19:19:17 +04:00
const vector < string > & wordFile = m_parameter - > GetParam ( " discrim-lmodel-file " ) ;
2010-09-17 18:43:38 +04:00
if ( wordFile . empty ( ) )
2010-10-15 19:19:17 +04:00
{
2010-09-17 18:43:38 +04:00
return true ;
2010-10-15 19:19:17 +04:00
}
2010-09-17 17:36:03 +04:00
if ( wordFile . size ( ) ! = 1 ) {
2010-10-21 17:23:53 +04:00
UserMessage : : Add ( " Can only have one discrim-lmodel-file " ) ;
2010-09-17 17:36:03 +04:00
return false ;
}
2011-08-06 18:10:43 +04:00
2010-10-15 19:19:17 +04:00
vector < string > tokens = Tokenize ( wordFile [ 0 ] ) ;
2011-03-01 13:18:50 +03:00
if ( tokens . size ( ) ! = 2 & & tokens . size ( ) ! = 3 ) {
2011-08-06 18:10:43 +04:00
UserMessage : : Add ( " Format of discriminative language model parameter is <order> [factor] <filename> " ) ;
2010-10-15 19:19:17 +04:00
return false ;
}
2011-08-06 18:10:43 +04:00
2010-10-15 19:19:17 +04:00
size_t order = Scan < size_t > ( tokens [ 0 ] ) ;
if ( order ! = 2 ) {
UserMessage : : Add ( " Only bigrams are supported by the discriminative LM " ) ;
return false ;
}
2011-03-01 13:18:50 +03:00
FactorType factorId = 0 ;
string filename = tokens [ 1 ] ;
if ( tokens . size ( ) = = 3 ) {
factorId = Scan < size_t > ( tokens [ 1 ] ) ;
filename = tokens [ 2 ] ;
}
2010-10-15 19:19:17 +04:00
2011-03-01 13:18:50 +03:00
m_targetBigramFeature = new TargetBigramFeature ( factorId ) ;
cerr < < " loading from " < < filename < < endl ;
if ( ! m_targetBigramFeature - > Load ( filename ) ) {
UserMessage : : Add ( " Unable to load word list from file " + filename ) ;
2010-09-17 17:36:03 +04:00
return false ;
}
return true ;
}
2008-06-11 14:52:57 +04:00
2011-05-11 02:02:25 +04:00
bool StaticData : : LoadPhraseBoundaryFeature ( )
{
const vector < string > & phraseBoundarySourceFactors =
m_parameter - > GetParam ( " phrase-boundary-source-feature " ) ;
const vector < string > & phraseBoundaryTargetFactors =
m_parameter - > GetParam ( " phrase-boundary-target-feature " ) ;
if ( phraseBoundarySourceFactors . size ( ) = = 0 & & phraseBoundaryTargetFactors . size ( ) = = 0 ) {
return true ;
}
if ( phraseBoundarySourceFactors . size ( ) > 1 ) {
UserMessage : : Add ( " Need to specify comma separated list of source factors for phrase boundary " ) ;
return false ;
}
if ( phraseBoundaryTargetFactors . size ( ) > 1 ) {
UserMessage : : Add ( " Need to specify comma separated list of target factors for phrase boundary " ) ;
return false ;
}
FactorList sourceFactors ;
FactorList targetFactors ;
if ( phraseBoundarySourceFactors . size ( ) ) {
sourceFactors = Tokenize < FactorType > ( phraseBoundarySourceFactors [ 0 ] , " , " ) ;
}
if ( phraseBoundaryTargetFactors . size ( ) ) {
targetFactors = Tokenize < FactorType > ( phraseBoundaryTargetFactors [ 0 ] , " , " ) ;
}
//cerr << "source "; for (size_t i = 0; i < sourceFactors.size(); ++i) cerr << sourceFactors[i] << " "; cerr << endl;
//cerr << "target "; for (size_t i = 0; i < targetFactors.size(); ++i) cerr << targetFactors[i] << " "; cerr << endl;
m_phraseBoundaryFeature = new PhraseBoundaryFeature ( sourceFactors , targetFactors ) ;
return true ;
}
2011-03-22 17:33:16 +03:00
bool StaticData : : LoadPhrasePairFeature ( )
{
const vector < string > & phrasePairFactors =
m_parameter - > GetParam ( " phrase-pair-feature " ) ;
if ( phrasePairFactors . size ( ) = = 0 ) return true ;
2011-03-23 23:42:34 +03:00
if ( phrasePairFactors . size ( ) ! = 1 ) {
UserMessage : : Add ( " Need to specify source and target factors for phrase pair feature " ) ;
return false ;
}
vector < string > tokens = Tokenize ( phrasePairFactors [ 0 ] ) ;
if ( tokens . size ( ) ! = 2 ) {
2011-03-22 17:33:16 +03:00
UserMessage : : Add ( " Need to specify source and target factors for phrase pair feature " ) ;
return false ;
}
2011-03-23 23:42:34 +03:00
size_t sourceFactorId = Scan < FactorType > ( tokens [ 0 ] ) ;
size_t targetFactorId = Scan < FactorType > ( tokens [ 1 ] ) ;
2011-03-22 17:33:16 +03:00
m_phrasePairFeature = new PhrasePairFeature ( sourceFactorId , targetFactorId ) ;
return true ;
}
2011-08-06 18:10:43 +04:00
bool StaticData : : LoadPhraseLengthFeature ( )
{
if ( m_parameter - > isParamSpecified ( " phrase-length-feature " ) ) {
m_phraseLengthFeature = new PhraseLengthFeature ( ) ;
}
return true ;
}
2008-10-14 23:25:18 +04:00
// Looks up the persistent translation-option cache, keyed by (decode-graph
// position, source phrase). Returns NULL on a miss; on a hit, refreshes the
// entry's last-used timestamp (used by ReduceTransOptCache for eviction) and
// returns the cached list. Thread-safe when built WITH_THREADS (guarded by
// m_transOptCacheMutex).
const TranslationOptionList * StaticData : : FindTransOptListInCache ( const DecodeGraph & decodeGraph , const Phrase & sourcePhrase ) const
2008-06-11 14:52:57 +04:00
{
2009-08-07 20:47:54 +04:00
std : : pair < size_t , Phrase > key ( decodeGraph . GetPosition ( ) , sourcePhrase ) ;
# ifdef WITH_THREADS
boost : : mutex : : scoped_lock lock ( m_transOptCacheMutex ) ;
# endif
std : : map < std : : pair < size_t , Phrase > , std : : pair < TranslationOptionList * , clock_t > > : : iterator iter
2008-10-14 23:25:18 +04:00
= m_transOptCache . find ( key ) ;
2008-06-11 14:52:57 +04:00
if ( iter = = m_transOptCache . end ( ) )
return NULL ;
2009-01-01 21:16:54 +03:00
iter - > second . second = clock ( ) ; // update last used time
return iter - > second . first ;
}
void StaticData : : ReduceTransOptCache ( ) const
{
if ( m_transOptCache . size ( ) < = m_transOptCacheMaxSize ) return ; // not full
clock_t t = clock ( ) ;
// find cutoff for last used time
priority_queue < clock_t > lastUsedTimes ;
2009-08-07 20:47:54 +04:00
std : : map < std : : pair < size_t , Phrase > , std : : pair < TranslationOptionList * , clock_t > > : : iterator iter ;
2009-01-01 21:16:54 +03:00
iter = m_transOptCache . begin ( ) ;
while ( iter ! = m_transOptCache . end ( ) )
{
lastUsedTimes . push ( iter - > second . second ) ;
iter + + ;
}
for ( size_t i = 0 ; i < lastUsedTimes . size ( ) - m_transOptCacheMaxSize / 2 ; i + + )
lastUsedTimes . pop ( ) ;
clock_t cutoffLastUsedTime = lastUsedTimes . top ( ) ;
2008-06-11 14:52:57 +04:00
2009-01-01 21:16:54 +03:00
// remove all old entries
iter = m_transOptCache . begin ( ) ;
while ( iter ! = m_transOptCache . end ( ) )
{
if ( iter - > second . second < cutoffLastUsedTime )
{
2009-08-07 20:47:54 +04:00
std : : map < std : : pair < size_t , Phrase > , std : : pair < TranslationOptionList * , clock_t > > : : iterator iterRemove = iter + + ;
2009-01-01 21:16:54 +03:00
delete iterRemove - > second . first ;
m_transOptCache . erase ( iterRemove ) ;
}
else iter + + ;
}
VERBOSE ( 2 , " Reduced persistent translation option cache in " < < ( ( clock ( ) - t ) / ( float ) CLOCKS_PER_SEC ) < < " seconds. " < < std : : endl ) ;
2008-06-11 14:52:57 +04:00
}
2008-10-14 23:25:18 +04:00
// Stores a copy of transOptList in the persistent cache under
// (decode-graph position, source phrase), stamped with the current clock()
// for LRU-style eviction, then trims the cache via ReduceTransOptCache().
// No-op when caching is disabled (m_transOptCacheMaxSize == 0). The cache
// takes ownership of the copied list. Thread-safe when built WITH_THREADS.
void StaticData : : AddTransOptListToCache ( const DecodeGraph & decodeGraph , const Phrase & sourcePhrase , const TranslationOptionList & transOptList ) const
{
2011-04-13 14:38:27 +04:00
if ( m_transOptCacheMaxSize = = 0 ) return ;
2009-08-07 20:47:54 +04:00
std : : pair < size_t , Phrase > key ( decodeGraph . GetPosition ( ) , sourcePhrase ) ;
2009-01-01 21:16:54 +03:00
TranslationOptionList * storedTransOptList = new TranslationOptionList ( transOptList ) ;
2009-08-07 20:47:54 +04:00
# ifdef WITH_THREADS
boost : : mutex : : scoped_lock lock ( m_transOptCacheMutex ) ;
# endif
2009-01-01 21:16:54 +03:00
m_transOptCache [ key ] = make_pair ( storedTransOptList , clock ( ) ) ;
ReduceTransOptCache ( ) ;
2008-10-14 23:25:18 +04:00
}
2010-09-14 13:42:37 +04:00
void StaticData : : ReLoadParameter ( )
{
m_verboseLevel = 1 ;
if ( m_parameter - > GetParam ( " verbose " ) . size ( ) = = 1 )
{
m_verboseLevel = Scan < size_t > ( m_parameter - > GetParam ( " verbose " ) [ 0 ] ) ;
}
// check whether "weight-u" is already set
if ( m_parameter - > isParamShortNameSpecified ( " u " ) )
{
if ( m_parameter - > GetParamShortName ( " u " ) . size ( ) < 1 ) {
PARAM_VEC w ( 1 , " 1.0 " ) ;
m_parameter - > OverwriteParamShortName ( " u " , w ) ;
}
}
//loop over all ScoreProducer to update weights
2010-09-14 14:13:06 +04:00
const TranslationSystem & transSystem = GetTranslationSystem ( TranslationSystem : : DEFAULT ) ;
2010-09-14 13:42:37 +04:00
std : : vector < const ScoreProducer * > : : const_iterator iterSP ;
2010-09-14 14:13:06 +04:00
for ( iterSP = transSystem . GetFeatureFunctions ( ) . begin ( ) ; iterSP ! = transSystem . GetFeatureFunctions ( ) . end ( ) ; + + iterSP )
2010-09-14 13:42:37 +04:00
{
2010-09-14 20:25:33 +04:00
std : : string paramShortName = ( * iterSP ) - > GetScoreProducerWeightShortName ( ) ;
vector < float > Weights = Scan < float > ( m_parameter - > GetParamShortName ( paramShortName ) ) ;
2010-09-14 13:42:37 +04:00
if ( paramShortName = = " d " ) { //basic distortion model takes the first weight
2010-09-14 20:25:33 +04:00
if ( ( * iterSP ) - > GetScoreProducerDescription ( ) = = " Distortion " ) {
Weights . resize ( 1 ) ; //take only the first element
2010-09-14 13:42:37 +04:00
} else { //lexicalized reordering model takes the other
2010-09-14 20:25:33 +04:00
Weights . erase ( Weights . begin ( ) ) ; //remove the first element
2010-09-14 13:42:37 +04:00
}
// std::cerr << "this is the Distortion Score Producer -> " << (*iterSP)->GetScoreProducerDescription() << std::cerr;
// std::cerr << "this is the Distortion Score Producer; it has " << (*iterSP)->GetNumScoreComponents() << " weights"<< std::cerr;
2010-09-14 19:05:56 +04:00
// std::cerr << Weights << std::endl;
2010-09-14 13:42:37 +04:00
}
2010-09-17 16:37:30 +04:00
else if ( paramShortName = = " tm " )
{
continue ;
}
2010-10-07 02:06:49 +04:00
SetWeights ( * iterSP , Weights ) ;
2010-09-17 16:37:30 +04:00
}
2010-09-14 13:42:37 +04:00
// std::cerr << "There are " << m_phraseDictionary.size() << " m_phraseDictionaryfeatures" << std::endl;
const vector < float > WeightsTM = Scan < float > ( m_parameter - > GetParamShortName ( " tm " ) ) ;
// std::cerr << "WeightsTM: " << WeightsTM << std::endl;
const vector < float > WeightsLM = Scan < float > ( m_parameter - > GetParamShortName ( " lm " ) ) ;
// std::cerr << "WeightsLM: " << WeightsLM << std::endl;
size_t index_WeightTM = 0 ;
2010-09-14 14:13:06 +04:00
for ( size_t i = 0 ; i < transSystem . GetPhraseDictionaries ( ) . size ( ) ; + + i )
2010-09-14 13:42:37 +04:00
{
PhraseDictionaryFeature & phraseDictionaryFeature = * m_phraseDictionary [ i ] ;
// std::cerr << "phraseDictionaryFeature.GetNumScoreComponents():" << phraseDictionaryFeature.GetNumScoreComponents() << std::endl;
// std::cerr << "phraseDictionaryFeature.GetNumInputScores():" << phraseDictionaryFeature.GetNumInputScores() << std::endl;
vector < float > tmp_weights ;
for ( size_t j = 0 ; j < phraseDictionaryFeature . GetNumScoreComponents ( ) ; + + j )
tmp_weights . push_back ( WeightsTM [ index_WeightTM + + ] ) ;
// std::cerr << tmp_weights << std::endl;
2010-10-07 02:06:49 +04:00
SetWeights ( & phraseDictionaryFeature , tmp_weights ) ;
2010-09-14 13:42:37 +04:00
}
}
2010-09-17 18:25:08 +04:00
2010-11-24 20:06:54 +03:00
void StaticData : : ReLoadBleuScoreFeatureParameter ( )
{
//loop over all ScoreProducer to update weights
const TranslationSystem & transSystem = GetTranslationSystem ( TranslationSystem : : DEFAULT ) ;
std : : vector < const ScoreProducer * > : : const_iterator iterSP ;
for ( iterSP = transSystem . GetFeatureFunctions ( ) . begin ( ) ; iterSP ! = transSystem . GetFeatureFunctions ( ) . end ( ) ; + + iterSP )
{
std : : string paramShortName = ( * iterSP ) - > GetScoreProducerWeightShortName ( ) ;
vector < float > Weights = Scan < float > ( m_parameter - > GetParamShortName ( paramShortName ) ) ;
if ( paramShortName = = " bl " ) {
SetWeights ( * iterSP , Weights ) ;
}
}
}
2010-09-17 18:25:08 +04:00
// ScoreComponentCollection StaticData::GetAllWeightsScoreComponentCollection() const {}
// in ScoreComponentCollection.h
2010-09-14 13:42:37 +04:00
2008-10-09 03:51:26 +04:00
}