/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2010 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <algorithm>
#include <cstdlib>
#include <ctime>
#include <string>
#include <vector>
#include <map>

#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>

#ifdef MPI_ENABLE
#include <boost/mpi.hpp>
namespace mpi = boost::mpi;
#endif
#include "Main.h"
#include "Optimiser.h"
#include "Hildreth.h"
#include "HypothesisQueue.h"
#include "moses/StaticData.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/ThreadPool.h"
#include "mert/BleuScorer.h"
#include "moses/FeatureVector.h"
#include "moses/FF/WordTranslationFeature.h"
#include "moses/FF/PhrasePairFeature.h"
#include "moses/FF/WordPenaltyProducer.h"
#include "moses/LM/Base.h"
#include "util/random.hh"

using namespace Mira;
using namespace std;
using namespace Moses;
namespace po = boost::program_options;
int main(int argc, char** argv)
{
  util::rand_init();

  size_t rank = 0;
  size_t size = 1;
#ifdef MPI_ENABLE
  mpi::environment env(argc, argv);
  mpi::communicator world;
  rank = world.rank();
  size = world.size();
#endif
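  // Under MPI, every process runs this same main() on its own shard of the
  // training data; weights are mixed across processes at regular intervals
  // (see --mixing-frequency below). Without MPI, rank 0 is the only worker.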

  bool help;
  int verbosity;
  string mosesConfigFile;
  string inputFile;
  vector<string> referenceFiles;
  vector<string> mosesConfigFilesFolds, inputFilesFolds, referenceFilesFolds;
  // string coreWeightFile, startWeightFile;
  size_t epochs;
  string learner;
  bool shuffle;
  size_t mixingFrequency;
  size_t weightDumpFrequency;
  string weightDumpStem;
  bool scale_margin;
  bool scale_update;
  size_t n;
  size_t batchSize;
  bool distinctNbest;
  bool accumulateWeights;
  float historySmoothing;
  bool scaleByInputLength, scaleByAvgInputLength;
  bool scaleByInverseLength, scaleByAvgInverseLength;
  float scaleByX;
  float slack;
  bool averageWeights;
  bool weightConvergence;
  float learning_rate;
  float mira_learning_rate;
  float perceptron_learning_rate;
  string decoder_settings;
  float min_weight_change;
  bool normaliseWeights, normaliseMargin;
  bool print_feature_values;
  bool historyBleu;
  bool sentenceBleu;
  bool perceptron_update;
  bool hope_fear;
  bool model_hope_fear;
  size_t hope_n, fear_n;
  size_t bleu_smoothing_scheme;
  float min_oracle_bleu;
  float minBleuRatio, maxBleuRatio;
  bool boost;
  bool decode_hope, decode_fear, decode_model;
  string decode_filename;
  bool batchEqualsShard;
  bool sparseAverage, dumpMixedWeights, sparseNoAverage;
  int featureCutoff;
  bool pruneZeroWeights;
  bool printFeatureCounts, printNbestWithFeatures;
  bool avgRefLength;
  bool print_weights, print_core_weights, debug_model, scale_lm, scale_wp;
  float scale_lm_factor, scale_wp_factor;
  bool kbest;
  string moses_src;
  float sigmoidParam;
  float bleuWeight, bleuWeight_hope, bleuWeight_fear;
  bool bleu_weight_lm;
  float bleu_weight_lm_factor;
  bool l1_regularize, l2_regularize, l1_reg_sparse, l2_reg_sparse;
  float l1_lambda, l2_lambda;
  bool most_violated, most_violated_reg, all_violated, max_bleu_diff;
  bool feature_confidence, signed_counts;
  float decay_core, decay_sparse, core_r0, sparse_r0;
  float bleu_weight_fear_factor;
  bool hildreth;
  float add2lm;
  // compute real sentence Bleu scores on complete translations, disable Bleu feature
  bool realBleu, disableBleuFeature;
  bool rescaleSlack;
  bool makePairs;
  bool debug;
  bool reg_on_every_mix;
  size_t continue_epoch;
  bool modelPlusBleu, simpleHistoryBleu;
  po::options_description desc("Allowed options");
  desc.add_options()
  ("continue-epoch", po::value<size_t>(&continue_epoch)->default_value(0), "Continue an interrupted experiment from this epoch on")
  ("freq-reg", po::value<bool>(&reg_on_every_mix)->default_value(false), "Regularize after every weight mixing")
  ("l1sparse", po::value<bool>(&l1_reg_sparse)->default_value(true), "L1-regularization for sparse weights only")
  ("l2sparse", po::value<bool>(&l2_reg_sparse)->default_value(true), "L2-regularization for sparse weights only")
  ("mv-reg", po::value<bool>(&most_violated_reg)->default_value(false), "Regularize most violated constraint")
  ("most-violated", po::value<bool>(&most_violated)->default_value(false), "Add most violated constraint")
  ("all-violated", po::value<bool>(&all_violated)->default_value(false), "Add all violated constraints")
  ("feature-confidence", po::value<bool>(&feature_confidence)->default_value(false), "Confidence-weighted learning")
  ("signed-counts", po::value<bool>(&signed_counts)->default_value(false), "Use signed feature counts for CWL")
  ("dbg", po::value<bool>(&debug)->default_value(true), "More debug output")
  ("make-pairs", po::value<bool>(&makePairs)->default_value(true), "Make pairs of hypotheses for 1slack")
  ("debug", po::value<bool>(&debug)->default_value(true), "More debug output")
  ("rescale-slack", po::value<bool>(&rescaleSlack)->default_value(false), "Rescale slack in 1-slack formulation")
  ("add2lm", po::value<float>(&add2lm)->default_value(0.0), "Add the specified amount to all LM weights")
  ("hildreth", po::value<bool>(&hildreth)->default_value(false), "Prefer Hildreth over analytical update")
  ("model-plus-bleu", po::value<bool>(&modelPlusBleu)->default_value(false), "Use the sum of model score and +/- Bleu to select hope and fear translations")
  ("simple-history-bleu", po::value<bool>(&simpleHistoryBleu)->default_value(false), "Simple history Bleu")
  ("bleu-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective")
  ("bw-hope", po::value<float>(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope")
  ("bw-fear", po::value<float>(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear")
  ("core-r0", po::value<float>(&core_r0)->default_value(1.0), "Start learning rate for core features")
  ("sparse-r0", po::value<float>(&sparse_r0)->default_value(1.0), "Start learning rate for sparse features")
  ("decay-core", po::value<float>(&decay_core)->default_value(0.01), "Decay for core feature learning rate")
  ("decay-sparse", po::value<float>(&decay_sparse)->default_value(0.01), "Decay for sparse feature learning rate")
  ("tie-bw-to-lm", po::value<bool>(&bleu_weight_lm)->default_value(true), "Make Bleu weight depend on LM weight")
  ("bw-lm-factor", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make Bleu weight depend on LM weight by this factor")
  ("bw-factor-fear", po::value<float>(&bleu_weight_fear_factor)->default_value(1.0), "Multiply fear weight by this factor")
  ("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
  ("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
  ("avg-ref-length", po::value<bool>(&avgRefLength)->default_value(false), "Use average reference length instead of shortest for BLEU score feature")
  ("batch-equals-shard", po::value<bool>(&batchEqualsShard)->default_value(false), "Batch size is equal to shard size (purely batch)")
  ("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is sent to the optimiser for weight adjustments")
  ("bleu-smoothing-scheme", po::value<size_t>(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)")
  ("boost", po::value<bool>(&boost)->default_value(false), "Apply boosting factor to updates on misranked candidates")
  ("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
  ("configs-folds", po::value<vector<string> >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold")
  ("debug-model", po::value<bool>(&debug_model)->default_value(false), "Get best model translation for debugging purposes")
  ("decode-hope", po::value<bool>(&decode_hope)->default_value(false), "Decode dev input set according to hope objective")
  ("decode-fear", po::value<bool>(&decode_fear)->default_value(false), "Decode dev input set according to fear objective")
  ("decode-model", po::value<bool>(&decode_model)->default_value(false), "Decode dev input set according to normal objective")
  ("decode-filename", po::value<string>(&decode_filename), "Filename for Bleu objective translations")
  ("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
  ("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use n-best list with distinct translations in inference step")
  ("dump-mixed-weights", po::value<bool>(&dumpMixedWeights)->default_value(false), "Dump mixed weights instead of averaged weights")
  ("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
  ("feature-cutoff", po::value<int>(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features")
  ("fear-n", po::value<size_t>(&fear_n)->default_value(1), "Number of fear translations used")
  ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
  ("history-bleu", po::value<bool>(&historyBleu)->default_value(false), "Use 1best translations to update the history")
  ("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
  ("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)")
  ("hope-n", po::value<size_t>(&hope_n)->default_value(2), "Number of hope translations used")
  ("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
  ("input-files-folds", po::value<vector<string> >(&inputFilesFolds), "Input files containing tokenised source, one for each fold")
  ("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
  ("l1-lambda", po::value<float>(&l1_lambda)->default_value(0.0001), "Lambda for l1-regularization (w_i +/- lambda)")
  ("l2-lambda", po::value<float>(&l2_lambda)->default_value(0.01), "Lambda for l2-regularization (w_i * (1 - lambda))")
  ("l1-reg", po::value<bool>(&l1_regularize)->default_value(false), "L1-regularization")
  ("l2-reg", po::value<bool>(&l2_regularize)->default_value(false), "L2-regularization")
  ("min-bleu-ratio", po::value<float>(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear")
  ("max-bleu-ratio", po::value<float>(&maxBleuRatio)->default_value(-1), "Set a maximum BLEU ratio between hope and fear")
  ("max-bleu-diff", po::value<bool>(&max_bleu_diff)->default_value(true), "Select hope/fear with maximum Bleu difference")
  ("min-oracle-bleu", po::value<float>(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score")
  ("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.0001), "Set minimum weight change for stopping criterion")
  ("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
  ("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(10), "How often per epoch to mix weights, when using mpi")
  ("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation")
  ("moses-src", po::value<string>(&moses_src)->default_value(""), "Moses source directory")
  ("nbest,n", po::value<size_t>(&n)->default_value(30), "Number of translations in n-best list")
  ("normalise-weights", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
  ("normalise-margin", po::value<bool>(&normaliseMargin)->default_value(false), "Normalise the margin: squash between 0 and 1")
  ("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
  ("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
  ("print-feature-counts", po::value<bool>(&printFeatureCounts)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
  ("print-nbest-with-features", po::value<bool>(&printNbestWithFeatures)->default_value(false), "Print out feature values, print nbest list with features after 1st epoch")
  ("print-weights", po::value<bool>(&print_weights)->default_value(false), "Print out current weights")
  ("print-core-weights", po::value<bool>(&print_core_weights)->default_value(true), "Print out current core weights")
  ("prune-zero-weights", po::value<bool>(&pruneZeroWeights)->default_value(false), "Prune zero-valued sparse feature weights")
  ("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
  ("reference-files-folds", po::value<vector<string> >(&referenceFilesFolds), "Reference translation files for training, one for each fold")
  ("kbest", po::value<bool>(&kbest)->default_value(true), "Select hope/fear pairs from a list of nbest translations")
  ("scale-by-inverse-length", po::value<bool>(&scaleByInverseLength)->default_value(false), "Scale BLEU by (history of) inverse input length")
  ("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale BLEU by (history of) input length")
  ("scale-by-avg-input-length", po::value<bool>(&scaleByAvgInputLength)->default_value(false), "Scale BLEU by average input length")
  ("scale-by-avg-inverse-length", po::value<bool>(&scaleByAvgInverseLength)->default_value(false), "Scale BLEU by average inverse input length")
  ("scale-by-x", po::value<float>(&scaleByX)->default_value(0.1), "Scale the BLEU score by value x")
  ("scale-lm", po::value<bool>(&scale_lm)->default_value(true), "Scale the language model feature")
  ("scale-factor-lm", po::value<float>(&scale_lm_factor)->default_value(0.5), "Scale the language model feature by this factor")
  ("scale-wp", po::value<bool>(&scale_wp)->default_value(false), "Scale the word penalty feature")
  ("scale-factor-wp", po::value<float>(&scale_wp_factor)->default_value(2), "Scale the word penalty feature by this factor")
  ("scale-margin", po::value<bool>(&scale_margin)->default_value(false), "Scale the margin by the Bleu score of the oracle translation")
  ("scale-update", po::value<bool>(&scale_update)->default_value(false), "Scale the update by the Bleu score of the oracle translation")
  ("sentence-level-bleu", po::value<bool>(&sentenceBleu)->default_value(true), "Use a sentence-level Bleu scoring function")
  ("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
  ("sigmoid-param", po::value<float>(&sigmoidParam)->default_value(1), "y=sigmoidParam is the axis that this sigmoid approaches")
  ("slack", po::value<float>(&slack)->default_value(0.05), "Use slack in optimiser")
  ("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes")
  ("sparse-no-average", po::value<bool>(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum")
  ("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
  ("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
  ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(2), "How often per epoch to dump weights (mpi)")
  ("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights");

  po::options_description cmdline_options;
  cmdline_options.add(desc);
  po::variables_map vm;
  po::store(po::command_line_parser(argc, argv).options(cmdline_options).run(), vm);
  po::notify(vm);

  if (help) {
    std::cout << "Usage: " + string(argv[0])
              + " -f mosesini-file -i input-file -r reference-file(s) [options]" << std::endl;
    std::cout << desc << std::endl;
    return 0;
  }

  const StaticData &staticData = StaticData::Instance();

  bool trainWithMultipleFolds = false;
  if (mosesConfigFilesFolds.size() > 0 || inputFilesFolds.size() > 0 || referenceFilesFolds.size() > 0) {
    if (rank == 0)
      cerr << "Training with " << mosesConfigFilesFolds.size() << " folds" << endl;
    trainWithMultipleFolds = true;
  }

  if (dumpMixedWeights && (mixingFrequency != weightDumpFrequency)) {
    cerr << "Set mixing frequency = weight dump frequency for dumping mixed weights!" << endl;
    exit(1);
  }

  if ((sparseAverage || sparseNoAverage) && averageWeights) {
    cerr << "Parameters --sparse-average 1/--sparse-no-average 1 and --average-weights 1 are incompatible (not implemented)" << endl;
    exit(1);
  }

  if (trainWithMultipleFolds) {
    if (!mosesConfigFilesFolds.size()) {
      cerr << "Error: No moses ini files specified for training with folds" << endl;
      exit(1);
    }
    if (!inputFilesFolds.size()) {
      cerr << "Error: No input files specified for training with folds" << endl;
      exit(1);
    }
    if (!referenceFilesFolds.size()) {
      cerr << "Error: No reference files specified for training with folds" << endl;
      exit(1);
    }
  } else {
    if (mosesConfigFile.empty()) {
      cerr << "Error: No moses ini file specified" << endl;
      return 1;
    }
    if (inputFile.empty()) {
      cerr << "Error: No input file specified" << endl;
      return 1;
    }
    if (!referenceFiles.size()) {
      cerr << "Error: No reference files specified" << endl;
      return 1;
    }
  }

  // load input and references
  vector<string> inputSentences;
  size_t inputSize = trainWithMultipleFolds ? inputFilesFolds.size() : 0;
  size_t refSize = trainWithMultipleFolds ? referenceFilesFolds.size() : referenceFiles.size();
  vector<vector<string> > inputSentencesFolds(inputSize);
  vector<vector<string> > referenceSentences(refSize);

  // number of cores for each fold
  size_t coresPerFold = 0, myFold = 0;
  if (trainWithMultipleFolds) {
    if (mosesConfigFilesFolds.size() > size) {
      cerr << "Number of cores has to be a multiple of the number of folds" << endl;
      exit(1);
    }
    coresPerFold = size / mosesConfigFilesFolds.size();
    if (size % coresPerFold > 0) {
      cerr << "Number of cores has to be a multiple of the number of folds" << endl;
      exit(1);
    }

    if (rank == 0)
      cerr << "Number of cores per fold: " << coresPerFold << endl;
    myFold = rank / coresPerFold;
    cerr << "Rank " << rank << ", my fold: " << myFold << endl;
  }
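  // Example: with size=8 MPI processes and 4 folds, coresPerFold is 2 and
  // rank 5 works on fold 5/2 = 2. The checks above reject setups where the
  // processes cannot be split evenly across the folds.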
  // NOTE: we do not actually need the references here, because we are reading them in from StaticData
  if (trainWithMultipleFolds) {
    if (!loadSentences(inputFilesFolds[myFold], inputSentencesFolds[myFold])) {
      cerr << "Error: Failed to load input sentences from " << inputFilesFolds[myFold] << endl;
      exit(1);
    }
    VERBOSE(1, "Rank " << rank << " reading inputs from " << inputFilesFolds[myFold] << endl);
    if (!loadSentences(referenceFilesFolds[myFold], referenceSentences[myFold])) {
      cerr << "Error: Failed to load reference sentences from " << referenceFilesFolds[myFold] << endl;
      exit(1);
    }
    if (referenceSentences[myFold].size() != inputSentencesFolds[myFold].size()) {
      cerr << "Error: Input file length (" << inputSentencesFolds[myFold].size() << ") != ("
           << referenceSentences[myFold].size() << ") reference file length (rank " << rank << ")" << endl;
      exit(1);
    }
    VERBOSE(1, "Rank " << rank << " reading references from " << referenceFilesFolds[myFold] << endl);
  } else {
    if (!loadSentences(inputFile, inputSentences)) {
      cerr << "Error: Failed to load input sentences from " << inputFile << endl;
      return 1;
    }
    for (size_t i = 0; i < referenceFiles.size(); ++i) {
      if (!loadSentences(referenceFiles[i], referenceSentences[i])) {
        cerr << "Error: Failed to load reference sentences from "
             << referenceFiles[i] << endl;
        return 1;
      }
      if (referenceSentences[i].size() != inputSentences.size()) {
        cerr << "Error: Input file length (" << inputSentences.size() << ") != ("
             << referenceSentences[i].size() << ") length of reference file " << i
             << endl;
        return 1;
      }
    }
  }

  if (scaleByAvgInputLength || scaleByInverseLength || scaleByAvgInverseLength)
    scaleByInputLength = false;

  if (historyBleu || simpleHistoryBleu) {
    sentenceBleu = false;
    cerr << "Using history Bleu. " << endl;
  }

  if (kbest) {
    realBleu = true;
    disableBleuFeature = true;
    cerr << "Use kbest lists and real Bleu scores, disable Bleu feature.." << endl;
  }

  // initialise Moses
  // add references to initialize Bleu feature
  boost::trim(decoder_settings);
  decoder_settings += " -mira -n-best-list - " + boost::lexical_cast<string>(n) + " distinct";

  vector<string> decoder_params;
  boost::split(decoder_params, decoder_settings, boost::is_any_of("\t "));

  // bleu feature
  decoder_params.push_back("-feature-add");
  decoder_settings = "BleuScoreFeature tuneable=false references=";
  if (trainWithMultipleFolds) {
    decoder_settings += referenceFilesFolds[myFold];
  } else {
    decoder_settings += referenceFiles[0];
    for (size_t i = 1; i < referenceFiles.size(); ++i) {
      decoder_settings += ",";
      decoder_settings += referenceFiles[i];
    }
  }
  decoder_params.push_back(decoder_settings);
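  // At this point the decoder arguments contain, for example (with n=30 and
  // a single reference file; the path is illustrative only):
  //   -mira -n-best-list - 30 distinct -feature-add
  //   "BleuScoreFeature tuneable=false references=dev.reference"
  // The feature line is pushed as one argument so its spaces survive the split.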

  string configFile = trainWithMultipleFolds ? mosesConfigFilesFolds[myFold] : mosesConfigFile;
  VERBOSE(1, "Rank " << rank << " reading config file from " << configFile << endl);
  MosesDecoder* decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
  decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength,
                             scaleByInverseLength, scaleByAvgInverseLength,
                             scaleByX, historySmoothing, bleu_smoothing_scheme, simpleHistoryBleu);
  bool chartDecoding = staticData.IsChart();

  // Optionally shuffle the sentences
  vector<size_t> order;
  if (trainWithMultipleFolds) {
    for (size_t i = 0; i < inputSentencesFolds[myFold].size(); ++i) {
      order.push_back(i);
    }
  } else {
    if (rank == 0) {
      for (size_t i = 0; i < inputSentences.size(); ++i) {
        order.push_back(i);
      }
    }
  }
  // initialise optimizer
  Optimiser* optimiser = NULL;
  if (learner == "mira") {
    if (rank == 0) {
      cerr << "Optimising using Mira" << endl;
      cerr << "slack: " << slack << ", learning rate: " << mira_learning_rate << endl;
      if (normaliseMargin)
        cerr << "sigmoid parameter: " << sigmoidParam << endl;
    }
    optimiser = new MiraOptimiser(slack, scale_margin, scale_update, boost, normaliseMargin, sigmoidParam);
    learning_rate = mira_learning_rate;
    perceptron_update = false;
  } else if (learner == "perceptron") {
    if (rank == 0) {
      cerr << "Optimising using Perceptron" << endl;
    }
    optimiser = new Perceptron();
    learning_rate = perceptron_learning_rate;
    perceptron_update = true;
    model_hope_fear = false; // mira only
    hope_fear = false; // mira only
    n = 1;
    hope_n = 1;
    fear_n = 1;
  } else {
    cerr << "Error: Unknown optimiser: " << learner << endl;
    return 1;
  }
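  // MIRA solves a slack-constrained margin problem per update (analytically,
  // or via Hildreth's algorithm when --hildreth is set); the perceptron
  // fallback uses a single hope/fear pair with a fixed learning rate, which
  // is why n, hope_n and fear_n are forced to 1 above.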

  // resolve parameter dependencies
  if (batchSize > 1 && perceptron_update) {
    batchSize = 1;
    cerr << "Info: Setting batch size to 1 for perceptron update" << endl;
  }

  if (hope_n == 0)
    hope_n = n;
  if (fear_n == 0)
    fear_n = n;

  if (model_hope_fear || kbest)
    hope_fear = false; // is true by default
  if (learner == "mira" && !(hope_fear || model_hope_fear || kbest)) {
    cerr << "Error: Need to select one of parameters --hope-fear/--model-hope-fear/--kbest for mira update." << endl;
    return 1;
  }

#ifdef MPI_ENABLE
  if (!trainWithMultipleFolds)
    mpi::broadcast(world, order, 0);
#endif

  // Create shards according to the number of processes used
  vector<size_t> shard;
  if (trainWithMultipleFolds) {
    size_t shardSize = order.size() / coresPerFold;
    size_t shardStart = (size_t)(shardSize * (rank % coresPerFold));
    size_t shardEnd = shardStart + shardSize;
    if (rank % coresPerFold == coresPerFold - 1) { // last rank of each fold
      shardEnd = order.size();
      shardSize = shardEnd - shardStart;
    }
    VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
    VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
    shard.resize(shardSize);
    copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
    batchSize = 1;
  } else {
    size_t shardSize = order.size() / size;
    size_t shardStart = (size_t)(shardSize * rank);
    size_t shardEnd = (size_t)(shardSize * (rank + 1));
    if (rank == size - 1) {
      shardEnd = order.size();
      shardSize = shardEnd - shardStart;
    }
    VERBOSE(1, "Rank: " << rank << " Shard size: " << shardSize << endl);
    VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
    shard.resize(shardSize);
    copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
    if (batchEqualsShard)
      batchSize = shardSize;
  }
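  // Example (no folds): 1002 sentences across size=4 processes gives
  // shardSize=250; rank 2 copies order[500..750), and the last rank absorbs
  // the remainder, taking order[750..1002) so that no sentence is dropped.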

  // get reference to feature functions
  // const vector<FeatureFunction*> &featureFunctions = FeatureFunction::GetFeatureFunctions();
  ScoreComponentCollection initialWeights = decoder->getWeights();

  if (add2lm != 0) {
    const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
    for (size_t i = 0; i < statefulFFs.size(); ++i) {
      const StatefulFeatureFunction *ff = statefulFFs[i];
      const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
      if (lm) {
        float lmWeight = initialWeights.GetScoreForProducer(lm) + add2lm;
        initialWeights.Assign(lm, lmWeight);
        cerr << "Rank " << rank << ", add " << add2lm << " to lm weight." << endl;
      }
    }
  }

  if (normaliseWeights) {
    initialWeights.L1Normalise();
    cerr << "Rank " << rank << ", normalised initial weights: " << initialWeights << endl;
  }
  decoder->setWeights(initialWeights);

  // set bleu weight relative to the language model weight(s): by default
  // bleu_weight_lm_factor (2.0) times the sum of the absolute LM weights
  if (bleu_weight_lm) {
    float lmSum = 0;
    const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
    for (size_t i = 0; i < statefulFFs.size(); ++i) {
      const StatefulFeatureFunction *ff = statefulFFs[i];
      const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
      if (lm) {
        lmSum += abs(initialWeights.GetScoreForProducer(lm));
      }
    }
    bleuWeight = lmSum * bleu_weight_lm_factor;
    if (!kbest) cerr << "Set bleu weight to lm weight * " << bleu_weight_lm_factor << endl;
  }

  // bleu weights can be set separately for hope and fear; otherwise they are both set to 'lm weight * bleu_weight_lm_factor'
  if (bleuWeight_hope == -1) {
    bleuWeight_hope = bleuWeight;
  }
  if (bleuWeight_fear == -1) {
    bleuWeight_fear = bleuWeight;
  }
  bleuWeight_fear *= bleu_weight_fear_factor;
  if (!kbest) {
    cerr << "Bleu weight: " << bleuWeight << endl;
    cerr << "Bleu weight fear: " << bleuWeight_fear << endl;
  }
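  // Worked example: one LM with weight 0.5 and the default --bw-lm-factor of 2
  // gives bleuWeight = |0.5| * 2 = 1.0; hope and fear inherit that value
  // unless overridden with --bw-hope/--bw-fear, and the fear weight is then
  // multiplied by --bw-factor-fear (1.0 by default, i.e. unchanged).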

  if (decode_hope || decode_fear || decode_model) {
    size_t decode = 1;
    if (decode_fear) decode = 2;
    if (decode_model) decode = 3;
    decodeHopeOrFear(rank, size, decode, decode_filename, inputSentences, decoder, n, bleuWeight);
  }

  // Main loop:
  ScoreComponentCollection cumulativeWeights; // collect weights per epoch to produce an average
  ScoreComponentCollection cumulativeWeightsBinary;
  size_t numberOfUpdates = 0;
  size_t numberOfUpdatesThisEpoch = 0;

  time_t now;
  time(&now);
  cerr << "Rank " << rank << ", " << ctime(&now);

  float avgInputLength = 0;
  float sumOfInputs = 0;
  size_t numberOfInputs = 0;

  ScoreComponentCollection mixedWeights;
  ScoreComponentCollection mixedWeightsPrevious;
  ScoreComponentCollection mixedWeightsBeforePrevious;
  ScoreComponentCollection mixedAverageWeights;
  ScoreComponentCollection mixedAverageWeightsPrevious;
  ScoreComponentCollection mixedAverageWeightsBeforePrevious;

  bool stop = false;
  // int sumStillViolatedConstraints;
  float epsilon = 0.0001;

  // Variables for feature confidence
  ScoreComponentCollection confidenceCounts, mixedConfidenceCounts, featureLearningRates;
  featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0); // initialise core learning rates
  cerr << "Initial learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;

  for (size_t epoch = continue_epoch; epoch < epochs && !stop; ++epoch) {
    if (shuffle) {
      if (trainWithMultipleFolds || rank == 0) {
        cerr << "Rank " << rank << ", epoch " << epoch << ", shuffling input sentences.." << endl;
        RandomIndex rindex;
        random_shuffle(order.begin(), order.end(), rindex);
      }

#ifdef MPI_ENABLE
      if (!trainWithMultipleFolds)
        mpi::broadcast(world, order, 0);
#endif

      // redo shards
      if (trainWithMultipleFolds) {
        size_t shardSize = order.size() / coresPerFold;
        size_t shardStart = (size_t)(shardSize * (rank % coresPerFold));
        size_t shardEnd = shardStart + shardSize;
        if (rank % coresPerFold == coresPerFold - 1) { // last rank of each fold
          shardEnd = order.size();
          shardSize = shardEnd - shardStart;
        }
        VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
        VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
        shard.resize(shardSize);
        copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
        batchSize = 1;
      } else {
        size_t shardSize = order.size() / size;
        size_t shardStart = (size_t)(shardSize * rank);
        size_t shardEnd = (size_t)(shardSize * (rank + 1));
        if (rank == size - 1) {
          shardEnd = order.size();
          shardSize = shardEnd - shardStart;
        }
        VERBOSE(1, "Shard size: " << shardSize << endl);
        VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
        shard.resize(shardSize);
        copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
        if (batchEqualsShard)
          batchSize = shardSize;
      }
    }

    // sum of violated constraints in an epoch
    // sumStillViolatedConstraints = 0;

    numberOfUpdatesThisEpoch = 0;
    // Sum up weights over one epoch, final average uses weights from last epoch
    if (!accumulateWeights) {
      cumulativeWeights.ZeroAll();
      cumulativeWeightsBinary.ZeroAll();
    }

    // number of weight dumps this epoch
    size_t weightMixingThisEpoch = 0;
    size_t weightEpochDump = 0;

    size_t shardPosition = 0;
    vector<size_t>::const_iterator sid = shard.begin();
    while (sid != shard.end()) {
      // feature values for hypotheses i,j (matrix: batchSize x 3*n x featureValues)
      vector<vector<ScoreComponentCollection> > featureValues;
      vector<vector<float> > bleuScores;
      vector<vector<float> > modelScores;

      // variables for hope-fear/perceptron setting
      vector<vector<ScoreComponentCollection> > featureValuesHope;
      vector<vector<ScoreComponentCollection> > featureValuesFear;
      vector<vector<float> > bleuScoresHope;
      vector<vector<float> > bleuScoresFear;
      vector<vector<float> > modelScoresHope;
      vector<vector<float> > modelScoresFear;

      // get moses weights
      ScoreComponentCollection mosesWeights = decoder->getWeights();
      VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", weights: " << mosesWeights << endl);

      if (historyBleu || simpleHistoryBleu) {
        decoder->printBleuFeatureHistory(cerr);
      }

      // BATCHING: produce nbest lists for all input sentences in batch
      vector<float> oracleBleuScores;
      vector<float> oracleModelScores;
      vector<vector<const Word*> > oneBests;
      vector<ScoreComponentCollection> oracleFeatureValues;
      vector<size_t> inputLengths;
      vector<size_t> ref_ids;
      size_t actualBatchSize = 0;

      size_t examples_in_batch = 0;
      bool skip_example = false;
      for (size_t batchPosition = 0; batchPosition < batchSize && sid
           != shard.end(); ++batchPosition) {
        string input;
        if (trainWithMultipleFolds)
          input = inputSentencesFolds[myFold][*sid];
        else
          input = inputSentences[*sid];

        Moses::Sentence *sentence = new Sentence();
        stringstream in(input + "\n");
        const vector<FactorType> inputFactorOrder = staticData.GetInputFactorOrder();
        sentence->Read(in, inputFactorOrder);
        cerr << "\nRank " << rank << ", epoch " << epoch << ", input sentence " << *sid << ": \"";
        sentence->Print(cerr);
        cerr << "\"" << " (batch pos " << batchPosition << ")" << endl;
        size_t current_input_length = (*sentence).GetSize();

        if (epoch == 0 && (scaleByAvgInputLength || scaleByAvgInverseLength)) {
          sumOfInputs += current_input_length;
          ++numberOfInputs;
          avgInputLength = sumOfInputs / numberOfInputs;
          decoder->setAvgInputLength(avgInputLength);
          cerr << "Rank " << rank << ", epoch 0, average input length: " << avgInputLength << endl;
        }

        vector<ScoreComponentCollection> newFeatureValues;
        vector<float> newScores;
        if (model_hope_fear) {
          featureValues.push_back(newFeatureValues);
          bleuScores.push_back(newScores);
          modelScores.push_back(newScores);
        }
        if (hope_fear || perceptron_update) {
          featureValuesHope.push_back(newFeatureValues);
          featureValuesFear.push_back(newFeatureValues);
          bleuScoresHope.push_back(newScores);
          bleuScoresFear.push_back(newScores);
          modelScoresHope.push_back(newScores);
          modelScoresFear.push_back(newScores);
          if (historyBleu || simpleHistoryBleu || debug_model) {
            featureValues.push_back(newFeatureValues);
            bleuScores.push_back(newScores);
            modelScores.push_back(newScores);
          }
        }
        if (kbest) {
          // for decoding
          featureValues.push_back(newFeatureValues);
          bleuScores.push_back(newScores);
          modelScores.push_back(newScores);

          // for storing selected examples
          featureValuesHope.push_back(newFeatureValues);
          featureValuesFear.push_back(newFeatureValues);
          bleuScoresHope.push_back(newScores);
          bleuScoresFear.push_back(newScores);
          modelScoresHope.push_back(newScores);
          modelScoresFear.push_back(newScores);
        }
        size_t ref_length;
        float avg_ref_length;

        if (print_weights)
          cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: " << mosesWeights << endl;
        if (print_core_weights) {
          cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: ";
          mosesWeights.PrintCoreFeatures();
          cerr << endl;
        }

        // check LM weight
        const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
        for (size_t i = 0; i < statefulFFs.size(); ++i) {
          const StatefulFeatureFunction *ff = statefulFFs[i];
          const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
          if (lm) {
            float lmWeight = mosesWeights.GetScoreForProducer(lm);
            cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight: " << lmWeight << endl;
            if (lmWeight <= 0) {
              cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: language model weight should never be <= 0." << endl;
              mosesWeights.Assign(lm, 0.1);
              cerr << "Rank " << rank << ", epoch " << epoch << ", assign lm weight of 0.1" << endl;
            }
          }
        }
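        // Rationale for the reset above (an assumption, not stated in the
        // code): hope/fear decoding adds a signed Bleu term to the model
        // score, and with a non-positive LM weight the search loses its
        // fluency pressure, so the weight is clamped back to a small positive
        // value (0.1) rather than aborting training.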
        // select inference scheme
        cerr << "Rank " << rank << ", epoch " << epoch << ", real Bleu? " << realBleu << endl;
        if (hope_fear || perceptron_update) {
          // HOPE
          cerr << "Rank " << rank << ", epoch " << epoch << ", " << hope_n <<
               " best hope translations" << endl;
          vector<vector<const Word*> > outputHope = decoder->getNBest(input, *sid, hope_n, 1.0, bleuWeight_hope,
              featureValuesHope[batchPosition], bleuScoresHope[batchPosition], modelScoresHope[batchPosition],
              1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
          vector<const Word*> oracle = outputHope[0];
          decoder->cleanup(chartDecoding);
          ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
          avg_ref_length = ref_length;
          float hope_length_ratio = (float)oracle.size() / ref_length;
          cerr << endl;

          // count sparse features occurring in hope translation
          featureValuesHope[batchPosition][0].IncrementSparseHopeFeatures();

          vector<const Word*> bestModel;
          if (debug_model || historyBleu || simpleHistoryBleu) {
            // MODEL (for updating the history only, using dummy vectors)
            cerr << "Rank " << rank << ", epoch " << epoch << ", 1best wrt model score (debug or history)" << endl;
            vector<vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
                featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
                1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
            bestModel = outputModel[0];
            decoder->cleanup(chartDecoding);
            cerr << endl;
            ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
          }

          // FEAR
          //float fear_length_ratio = 0;
          float bleuRatioHopeFear = 0;
          //int fearSize = 0;
          cerr << "Rank " << rank << ", epoch " << epoch << ", " << fear_n << " best fear translations" << endl;
          vector<vector<const Word*> > outputFear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuWeight_fear,
              featureValuesFear[batchPosition], bleuScoresFear[batchPosition], modelScoresFear[batchPosition],
              1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
          vector<const Word*> fear = outputFear[0];
          decoder->cleanup(chartDecoding);
          ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
          avg_ref_length += ref_length;
          avg_ref_length /= 2;
          //fear_length_ratio = (float)fear.size()/ref_length;
          //fearSize = (int)fear.size();
          cerr << endl;
          for (size_t i = 0; i < fear.size(); ++i)
            delete fear[i];

          // count sparse features occurring in fear translation
          featureValuesFear[batchPosition][0].IncrementSparseFearFeatures();

          // Bleu-related example selection
          bool skip = false;
          bleuRatioHopeFear = bleuScoresHope[batchPosition][0] / bleuScoresFear[batchPosition][0];
          if (minBleuRatio != -1 && bleuRatioHopeFear < minBleuRatio)
            skip = true;
          if (maxBleuRatio != -1 && bleuRatioHopeFear > maxBleuRatio)
            skip = true;

          // sanity check
          if (historyBleu || simpleHistoryBleu) {
            if (bleuScores[batchPosition][0] > bleuScoresHope[batchPosition][0] &&
                modelScores[batchPosition][0] > modelScoresHope[batchPosition][0]) {
              if (abs(bleuScores[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon &&
                  abs(modelScores[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
                cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: MODEL translation better than HOPE translation." << endl;
                skip = true;
              }
            }
            if (bleuScoresFear[batchPosition][0] > bleuScores[batchPosition][0] &&
                modelScoresFear[batchPosition][0] > modelScores[batchPosition][0]) {
              if (abs(bleuScoresFear[batchPosition][0] - bleuScores[batchPosition][0]) > epsilon &&
                  abs(modelScoresFear[batchPosition][0] - modelScores[batchPosition][0]) > epsilon) {
                cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than MODEL translation." << endl;
                skip = true;
              }
            }
          }
          if (bleuScoresFear[batchPosition][0] > bleuScoresHope[batchPosition][0]) {
            if (abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon) {
              // check if it's an error or a warning
              skip = true;
              if (modelScoresFear[batchPosition][0] > modelScoresHope[batchPosition][0] && abs(modelScoresFear[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
                cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" << endl;
              } else {
                cerr << "Rank " << rank << ", epoch " << epoch << ", WARNING: FEAR translation has better Bleu than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" << endl;
              }
            }
          }

          if (skip) {
            cerr << "Rank " << rank << ", epoch " << epoch << ", skip example (" << hope_length_ratio << ", " << bleuRatioHopeFear << ").." << endl;
            featureValuesHope[batchPosition].clear();
            featureValuesFear[batchPosition].clear();
            bleuScoresHope[batchPosition].clear();
            bleuScoresFear[batchPosition].clear();
            if (historyBleu || simpleHistoryBleu || debug_model) {
              featureValues[batchPosition].clear();
              bleuScores[batchPosition].clear();
            }
          } else {
            examples_in_batch++;

            // needed for history
            if (historyBleu || simpleHistoryBleu) {
              inputLengths.push_back(current_input_length);
              ref_ids.push_back(*sid);
              oneBests.push_back(bestModel);
            }
          }
        }
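        // The sign of the Bleu objective weight passed to getNBest above is
        // what distinguishes the lists: +1.0 decodes towards hope (model
        // score plus scaled Bleu), -1.0 towards fear (model score minus
        // scaled Bleu), and 0.0 ranks by plain model score.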
        if (model_hope_fear) {
          cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << " best hope translations" << endl;
          size_t oraclePos = featureValues[batchPosition].size();
          decoder->getNBest(input, *sid, n, 1.0, bleuWeight_hope,
                            featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
                            0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
          //vector<const Word*> oracle = outputHope[0];
          // needed for history
          inputLengths.push_back(current_input_length);
          ref_ids.push_back(*sid);
          decoder->cleanup(chartDecoding);
          //ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
          //float hope_length_ratio = (float)oracle.size()/ref_length;
          cerr << endl;

          oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
          oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
          oracleModelScores.push_back(modelScores[batchPosition][oraclePos]);

          // MODEL
          cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << " best wrt model score" << endl;
          if (historyBleu || simpleHistoryBleu) {
            vector<vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
                bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
                modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
            vector<const Word*> bestModel = outputModel[0];
            oneBests.push_back(bestModel);
            inputLengths.push_back(current_input_length);
            ref_ids.push_back(*sid);
          } else {
            decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
                              featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
                              0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
          }
          decoder->cleanup(chartDecoding);
          //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
          //float model_length_ratio = (float)bestModel.size()/ref_length;
          cerr << endl;

          // FEAR
          cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << " best fear translations" << endl;
          decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear,
                            featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
                            0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
          decoder->cleanup(chartDecoding);
          //ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
          //float fear_length_ratio = (float)fear.size()/ref_length;
          examples_in_batch++;
        }
        if (kbest) {
          // MODEL
          cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << " best wrt model score" << endl;
          if (historyBleu || simpleHistoryBleu) {
            vector<vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
                bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
                modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
            vector<const Word*> bestModel = outputModel[0];
            oneBests.push_back(bestModel);
            inputLengths.push_back(current_input_length);
            ref_ids.push_back(*sid);
          } else {
            decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
                              featureValues[batchPosition], bleuScores[batchPosition],
                              modelScores[batchPosition], 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
          }
          decoder->cleanup(chartDecoding);
          //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
          //float model_length_ratio = (float)bestModel.size()/ref_length;
          cerr << endl;
          examples_in_batch++;

          HypothesisQueue queueHope(hope_n);
          HypothesisQueue queueFear(fear_n);
          cerr << endl;
          if (most_violated || all_violated) {
            float bleuHope = -1000;
            float bleuFear = 1000;
            int indexHope = -1;
            int indexFear = -1;

            vector<float> bleuHopeList;
            vector<float> bleuFearList;
            vector<float> indexHopeList;
            vector<float> indexFearList;

            if (most_violated)
              cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint" << endl;
            else if (all_violated)
              cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints";
            else
              cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope";

            // find best hope, then find fear that violates our constraint most
            for (size_t i = 0; i < bleuScores[batchPosition].size(); ++i) {
              if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) { // equal bleu scores
                if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) {
                  if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) {
                    // better model score
                    bleuHope = bleuScores[batchPosition][i];
                    indexHope = i;
                  }
                }
              } else if (bleuScores[batchPosition][i] > bleuHope) { // better than current best
                bleuHope = bleuScores[batchPosition][i];
                indexHope = i;
              }
            }

            float currentViolation = 0;
            for (size_t i = 0; i < bleuScores[batchPosition].size(); ++i) {
              float bleuDiff = bleuHope - bleuScores[batchPosition][i];
              float modelDiff = modelScores[batchPosition][indexHope] - modelScores[batchPosition][i];
              if ((bleuDiff > epsilon) && (modelDiff < bleuDiff)) {
                float diff = bleuDiff - modelDiff;
                if (diff > epsilon) {
                  if (all_violated) {
                    cerr << ".. adding pair";
                    bleuHopeList.push_back(bleuHope);
                    bleuFearList.push_back(bleuScores[batchPosition][i]);
                    indexHopeList.push_back(indexHope);
                    indexFearList.push_back(i);
                  } else if (most_violated && diff > currentViolation) {
                    currentViolation = diff;
                    bleuFear = bleuScores[batchPosition][i];
                    indexFear = i;
                    cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (" << modelDiff << " >= " << bleuDiff << ")" << endl;
                  }
                }
              }
            }
            if (most_violated) {
              if (currentViolation > 0) {
                cerr << "Rank " << rank << ", epoch " << epoch << ", adding pair with violation " << currentViolation << endl;
                cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
                bleuScoresHope[batchPosition].push_back(bleuHope);
                bleuScoresFear[batchPosition].push_back(bleuFear);
                featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
                featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
                float modelScoreHope = modelScores[batchPosition][indexHope];
                float modelScoreFear = modelScores[batchPosition][indexFear];
                if (most_violated_reg) {
                  // halve the violation by widening the model score gap:
                  // a quarter of the violation is added on each side
                  float reg = currentViolation / 4;
                  modelScoreHope += abs(reg);
                  modelScoreFear -= abs(reg);
                  float newViolation = (bleuHope - bleuFear) - (modelScoreHope - modelScoreFear);
                  cerr << "Rank " << rank << ", epoch " << epoch << ", regularized violation: " << newViolation << endl;
                }
                modelScoresHope[batchPosition].push_back(modelScoreHope);
                modelScoresFear[batchPosition].push_back(modelScoreFear);
                featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
                featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
              } else {
                cerr << "Rank " << rank << ", epoch " << epoch << ", no violated constraint found." << endl;
                skip_example = 1;
              }
            } else cerr << endl;
          }
          if (max_bleu_diff) {
            cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff from list: " << bleuScores[batchPosition].size() << endl;
            for (size_t i = 0; i < bleuScores[batchPosition].size(); ++i) {
              float hopeScore = bleuScores[batchPosition][i];
              if (modelPlusBleu) hopeScore += modelScores[batchPosition][i];
              BleuIndexPair hope(hopeScore, i);
              queueHope.Push(hope);

              float fearScore = -1 * (bleuScores[batchPosition][i]);
              if (modelPlusBleu) fearScore += modelScores[batchPosition][i];
              BleuIndexPair fear(fearScore, i);
              queueFear.Push(fear);
            }
            skip_example = 0;
          }
          cerr << endl;
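
          // Pop the top hope_n hypotheses (highest BLEU, optionally plus
          // model score) and the top fear_n hypotheses (lowest BLEU, since
          // fearScore negates BLEU), then form every hope x fear combination
          // as a training pair.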
          vector<BleuIndexPair> hopeList, fearList;
          for (size_t i = 0; i < hope_n && !queueHope.Empty(); ++i) hopeList.push_back(queueHope.Pop());
          for (size_t i = 0; i < fear_n && !queueFear.Empty(); ++i) fearList.push_back(queueFear.Pop());
          for (size_t i = 0; i < hopeList.size(); ++i) {
            //float bleuHope = hopeList[i].first;
            size_t indexHope = hopeList[i].second;
            float bleuHope = bleuScores[batchPosition][indexHope];
            for (size_t j = 0; j < fearList.size(); ++j) {
              //float bleuFear = -1*(fearList[j].first);
              size_t indexFear = fearList[j].second;
              float bleuFear = bleuScores[batchPosition][indexFear];
              cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
              bleuScoresHope[batchPosition].push_back(bleuHope);
              bleuScoresFear[batchPosition].push_back(bleuFear);
              featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
              featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
              float modelScoreHope = modelScores[batchPosition][indexHope];
              float modelScoreFear = modelScores[batchPosition][indexFear];
              modelScoresHope[batchPosition].push_back(modelScoreHope);
              modelScoresFear[batchPosition].push_back(modelScoreFear);
              featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
              featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
            }
          }
          if (!makePairs)
            cerr << "Rank " << rank << ", epoch " << epoch << " summing up hope and fear vectors, no pairs" << endl;
        }

        // next input sentence
        ++sid;
        ++actualBatchSize;
        ++shardPosition;
      } // end of batch loop

      if (examples_in_batch == 0 || (kbest && skip_example)) {
        cerr << "Rank " << rank << ", epoch " << epoch << ", batch is empty." << endl;
      } else {
        vector<vector<float> > losses(actualBatchSize);
        if (model_hope_fear) {
          // Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
          for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
            for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
              losses[batchPosition].push_back(oracleBleuScores[batchPosition] - bleuScores[batchPosition][j]);
            }
          }
        }

        // set weight for bleu feature to 0 before optimizing
        vector<FeatureFunction*>::const_iterator iter;
        const vector<FeatureFunction*> &featureFunctions2 = FeatureFunction::GetFeatureFunctions();
        for (iter = featureFunctions2.begin(); iter != featureFunctions2.end(); ++iter) {
          if ((*iter)->GetScoreProducerDescription() == "BleuScoreFeature") {
            mosesWeights.Assign(*iter, 0);
            break;
          }
        }
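        // The BLEU feature only biases n-best extraction towards hope/fear
        // hypotheses; zeroing its weight keeps it out of the model score
        // that the optimiser updates.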
        // scale LM feature (to avoid rapid changes)
        if (scale_lm) {
          cerr << "scale lm" << endl;
          const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
          for (size_t i = 0; i < statefulFFs.size(); ++i) {
            const StatefulFeatureFunction *ff = statefulFFs[i];
            const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
            if (lm) {
              // scale down score
              if (model_hope_fear) {
                scaleFeatureScore(lm, scale_lm_factor, featureValues, rank, epoch);
              } else {
                scaleFeatureScore(lm, scale_lm_factor, featureValuesHope, rank, epoch);
                scaleFeatureScore(lm, scale_lm_factor, featureValuesFear, rank, epoch);
              }
            }
          }
        }

        // scale WP
        if (scale_wp) {
          // scale up weight
          WordPenaltyProducer &wp = WordPenaltyProducer::InstanceNonConst();

          // scale down score
          if (model_hope_fear) {
            scaleFeatureScore(&wp, scale_wp_factor, featureValues, rank, epoch);
          } else {
            scaleFeatureScore(&wp, scale_wp_factor, featureValuesHope, rank, epoch);
            scaleFeatureScore(&wp, scale_wp_factor, featureValuesFear, rank, epoch);
          }
        }
        // print out the feature values
        if (print_feature_values) {
          cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
          if (model_hope_fear) printFeatureValues(featureValues);
          else {
            cerr << "hope: " << endl;
            printFeatureValues(featureValuesHope);
            cerr << "fear: " << endl;
            printFeatureValues(featureValuesFear);
          }
        }

        // apply learning rates to feature vectors before optimization
        if (feature_confidence) {
          cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decays " << decay_core << "/" << decay_sparse << ": " << featureLearningRates << endl;
          if (model_hope_fear) {
            applyPerFeatureLearningRates(featureValues, featureLearningRates, sparse_r0);
          } else {
            applyPerFeatureLearningRates(featureValuesHope, featureLearningRates, sparse_r0);
            applyPerFeatureLearningRates(featureValuesFear, featureLearningRates, sparse_r0);
          }
        } else {
          // apply fixed learning rates
          cerr << "Rank " << rank << ", epoch " << epoch << ", apply fixed learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
          if (core_r0 != 1.0 || sparse_r0 != 1.0) {
            if (model_hope_fear) {
              applyLearningRates(featureValues, core_r0, sparse_r0);
            } else {
              applyLearningRates(featureValuesHope, core_r0, sparse_r0);
              applyLearningRates(featureValuesFear, core_r0, sparse_r0);
            }
          }
        }
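        // Pre-scaling the feature values makes the uniform update below
        // behave like a per-feature learning rate: with feature_confidence,
        // rates decay for features that were updated often, loosely in the
        // spirit of AdaGrad-style adaptive updates.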
        // Run optimiser on batch:
        VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl);
        size_t update_status = 1;
        ScoreComponentCollection weightUpdate;
        if (perceptron_update) {
          vector<vector<float> > dummy1;
          update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
                          featureValuesFear, dummy1, dummy1, dummy1, dummy1, learning_rate, rank, epoch);
        } else if (hope_fear) {
          if (bleuScoresHope[0][0] >= min_oracle_bleu) {
            if (hope_n == 1 && fear_n == 1 && batchSize == 1 && !hildreth) {
              update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate,
                              featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0],
                              bleuScoresFear[0][0], modelScoresHope[0][0], modelScoresFear[0][0], learning_rate, rank, epoch);
            } else
              update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
                              featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
                              modelScoresFear, learning_rate, rank, epoch);
          } else
            update_status = 1;
        } else if (kbest) {
          if (batchSize == 1 && featureValuesHope[0].size() == 1 && !hildreth) {
            cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
            cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
            update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(
                              weightUpdate, featureValuesHope[0][0], featureValuesFear[0][0],
                              bleuScoresHope[0][0], bleuScoresFear[0][0], modelScoresHope[0][0],
                              modelScoresFear[0][0], learning_rate, rank, epoch);
          } else {
            cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
            cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
            update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
                            featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
                            modelScoresFear, learning_rate, rank, epoch);
          }
        } else {
          // model_hope_fear
          update_status = ((MiraOptimiser*) optimiser)->updateWeights(weightUpdate,
                          featureValues, losses, bleuScores, modelScores, oracleFeatureValues,
                          oracleBleuScores, oracleModelScores, learning_rate, rank, epoch);
        }
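        // Three update paths: a plain perceptron step, a closed-form MIRA
        // update when there is exactly one hope/fear pair (no QP needed),
        // and the general optimiser, which solves the constrained QP over
        // all pairs (Hildreth's algorithm) for batches or multiple pairs.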
        // sumStillViolatedConstraints += update_status;

        if (update_status == 0) { // if weights were updated
          // apply weight update
          if (debug)
            cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl;

          if (feature_confidence) {
            // update confidence counts based on weight update
            confidenceCounts.UpdateConfidenceCounts(weightUpdate, signed_counts);
            // update feature learning rates
            featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0);
          }

          // apply weight update to Moses weights
          mosesWeights.PlusEquals(weightUpdate);
          if (normaliseWeights)
            mosesWeights.L1Normalise();

          cumulativeWeights.PlusEquals(mosesWeights);
          if (sparseAverage) {
            ScoreComponentCollection binary;
            binary.SetToBinaryOf(mosesWeights);
            cumulativeWeightsBinary.PlusEquals(binary);
          }

          ++numberOfUpdates;
          ++numberOfUpdatesThisEpoch;
          if (averageWeights) {
            ScoreComponentCollection averageWeights(cumulativeWeights);
            if (accumulateWeights) {
              averageWeights.DivideEquals(numberOfUpdates);
            } else {
              averageWeights.DivideEquals(numberOfUpdatesThisEpoch);
            }
            mosesWeights = averageWeights;
          }

          // set new Moses weights
          decoder->setWeights(mosesWeights);
          //cerr << "Rank " << rank << ", epoch " << epoch << ", new weights: " << mosesWeights << endl;
        }

        // update history (for approximate document Bleu)
        if (historyBleu || simpleHistoryBleu) {
          for (size_t i = 0; i < oneBests.size(); ++i)
            cerr << "Rank " << rank << ", epoch " << epoch << ", update history with 1best length: " << oneBests[i].size() << " ";
          decoder->updateHistory(oneBests, inputLengths, ref_ids, rank, epoch);
          deleteTranslations(oneBests);
        }
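        // The history approximates document-level BLEU from the running
        // 1-best translations, so sentence-level BLEU is smoothed with
        // statistics of previously translated sentences.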
      } // END TRANSLATE AND UPDATE BATCH

      // size of all shards except for the last one
      size_t generalShardSize;
      if (trainWithMultipleFolds)
        generalShardSize = order.size() / coresPerFold;
      else
        generalShardSize = order.size() / size;

      size_t mixing_base = mixingFrequency == 0 ? 0 : generalShardSize / mixingFrequency;
      size_t dumping_base = weightDumpFrequency == 0 ? 0 : generalShardSize / weightDumpFrequency;
      bool mix = evaluateModulo(shardPosition, mixing_base, actualBatchSize);
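      // Example: a shard of 1000 sentences with mixingFrequency 5 gives
      // mixing_base 200, so weights are mixed whenever the shard position
      // passes a multiple of 200 (see evaluateModulo below).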

      // mix weights?
      if (mix) {
#ifdef MPI_ENABLE
        cerr << "Rank " << rank << ", epoch " << epoch << ", mixing weights.." << endl;
        // collect all weights in mixedWeights and divide by number of processes
        mpi::reduce(world, mosesWeights, mixedWeights, SCCPlus(), 0);
        // mix confidence counts
        //mpi::reduce(world, confidenceCounts, mixedConfidenceCounts, SCCPlus(), 0);
        ScoreComponentCollection totalBinary;
        if (sparseAverage) {
          ScoreComponentCollection binary;
          binary.SetToBinaryOf(mosesWeights);
          mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
        }
        if (rank == 0) {
          // divide by number of processes
          if (sparseNoAverage)
            mixedWeights.CoreDivideEquals(size); // average only core weights
          else if (sparseAverage)
            mixedWeights.DivideEquals(totalBinary);
          else
            mixedWeights.DivideEquals(size);
          // divide confidence counts
          //mixedConfidenceCounts.DivideEquals(size);
          // normalise weights after averaging
          if (normaliseWeights) {
            mixedWeights.L1Normalise();
          }
          ++weightMixingThisEpoch;

          if (pruneZeroWeights) {
            size_t pruned = mixedWeights.PruneZeroWeightFeatures();
            cerr << "Rank " << rank << ", epoch " << epoch << ", "
                 << pruned << " zero-weighted features pruned from mixedWeights." << endl;
            pruned = cumulativeWeights.PruneZeroWeightFeatures();
            cerr << "Rank " << rank << ", epoch " << epoch << ", "
                 << pruned << " zero-weighted features pruned from cumulativeWeights." << endl;
          }

          if (featureCutoff != -1 && weightMixingThisEpoch == mixingFrequency) {
            size_t pruned = mixedWeights.PruneSparseFeatures(featureCutoff);
            cerr << "Rank " << rank << ", epoch " << epoch << ", "
                 << pruned << " features pruned from mixedWeights." << endl;
            pruned = cumulativeWeights.PruneSparseFeatures(featureCutoff);
            cerr << "Rank " << rank << ", epoch " << epoch << ", "
                 << pruned << " features pruned from cumulativeWeights." << endl;
          }
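
          // L1 regularisation drives small weights to exactly zero (and
          // prunes them); L2 shrinks all weights towards zero proportionally
          // without zeroing them.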
          if (weightMixingThisEpoch == mixingFrequency || reg_on_every_mix) {
            if (l1_regularize) {
              size_t pruned;
              if (l1_reg_sparse)
                pruned = mixedWeights.SparseL1Regularize(l1_lambda);
              else
                pruned = mixedWeights.L1Regularize(l1_lambda);
              cerr << "Rank " << rank << ", epoch " << epoch << ", "
                   << "l1-reg. on mixedWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
            }
            if (l2_regularize) {
              if (l2_reg_sparse)
                mixedWeights.SparseL2Regularize(l2_lambda);
              else
                mixedWeights.L2Regularize(l2_lambda);
              cerr << "Rank " << rank << ", epoch " << epoch << ", "
                   << "l2-reg. on mixedWeights with lambda=" << l2_lambda << endl;
            }
          }
        }

        // broadcast average weights from process 0
        mpi::broadcast(world, mixedWeights, 0);
        decoder->setWeights(mixedWeights);
        mosesWeights = mixedWeights;

        // broadcast summed confidence counts
        //mpi::broadcast(world, mixedConfidenceCounts, 0);
        //confidenceCounts = mixedConfidenceCounts;
#endif
#ifndef MPI_ENABLE
        //cerr << "\nRank " << rank << ", no mixing, weights: " << mosesWeights << endl;
        mixedWeights = mosesWeights;
#endif
      } // end mixing

      // Dump weights?
      if (trainWithMultipleFolds || weightEpochDump == weightDumpFrequency) {
        // dump mixed weights at end of every epoch to enable continuing a crashed experiment
        // (for jackknife every time the weights are mixed)
        ostringstream filename;
        if (epoch < 10)
          filename << weightDumpStem << "_mixed_0" << epoch;
        else
          filename << weightDumpStem << "_mixed_" << epoch;
        if (weightDumpFrequency > 1)
          filename << "_" << weightEpochDump;
        mixedWeights.Save(filename.str());
        cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
      }
      if (dumpMixedWeights) {
        if (mix && rank == 0 && !weightDumpStem.empty()) {
          // dump mixed weights instead of average weights
          ostringstream filename;
          if (epoch < 10)
            filename << weightDumpStem << "_0" << epoch;
          else
            filename << weightDumpStem << "_" << epoch;
          if (weightDumpFrequency > 1)
            filename << "_" << weightEpochDump;
          cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
          mixedWeights.Save(filename.str());
          ++weightEpochDump;
        }
      } else {
        if (evaluateModulo(shardPosition, dumping_base, actualBatchSize)) {
          cerr << "Rank " << rank << ", epoch " << epoch << ", dump weights.. (pos: " << shardPosition << ", base: " << dumping_base << ")" << endl;
          ScoreComponentCollection tmpAverageWeights(cumulativeWeights);
          bool proceed = false;
          if (accumulateWeights) {
            if (numberOfUpdates > 0) {
              tmpAverageWeights.DivideEquals(numberOfUpdates);
              proceed = true;
            }
          } else {
            if (numberOfUpdatesThisEpoch > 0) {
              if (sparseNoAverage) // average only core weights
                tmpAverageWeights.CoreDivideEquals(numberOfUpdatesThisEpoch);
              else if (sparseAverage)
                tmpAverageWeights.DivideEquals(cumulativeWeightsBinary);
              else
                tmpAverageWeights.DivideEquals(numberOfUpdatesThisEpoch);
              proceed = true;
            }
          }
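
          // accumulateWeights averages over every update since training
          // began; otherwise only this epoch's updates count. With
          // sparseAverage, each sparse weight is divided by the number of
          // updates that actually touched it (cumulativeWeightsBinary)
          // rather than by the global update count.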
          if (proceed) {
#ifdef MPI_ENABLE
            // average across processes
            mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0);
            ScoreComponentCollection totalBinary;
            if (sparseAverage) {
              ScoreComponentCollection binary;
              binary.SetToBinaryOf(mosesWeights);
              mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
            }
#endif
#ifndef MPI_ENABLE
            mixedAverageWeights = tmpAverageWeights;
            //FIXME: What to do for the non-MPI version?
            ScoreComponentCollection totalBinary;
#endif
            if (rank == 0 && !weightDumpStem.empty()) {
              // divide by number of processes
              if (sparseNoAverage)
                mixedAverageWeights.CoreDivideEquals(size); // average only core weights
              else if (sparseAverage)
                mixedAverageWeights.DivideEquals(totalBinary);
              else
                mixedAverageWeights.DivideEquals(size);

              // normalise weights after averaging
              if (normaliseWeights) {
                mixedAverageWeights.L1Normalise();
              }

              // dump final average weights
              ostringstream filename;
              if (epoch < 10) {
                filename << weightDumpStem << "_0" << epoch;
              } else {
                filename << weightDumpStem << "_" << epoch;
              }
              if (weightDumpFrequency > 1) {
                filename << "_" << weightEpochDump;
              }

              /*if (accumulateWeights) {
                cerr << "\nMixed average weights (cumulative) during epoch " << epoch << ": " << mixedAverageWeights << endl;
              } else {
                cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl;
              }*/
              cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
              mixedAverageWeights.Save(filename.str());
              ++weightEpochDump;

              if (weightEpochDump == weightDumpFrequency) {
                if (l1_regularize) {
                  size_t pruned = mixedAverageWeights.SparseL1Regularize(l1_lambda);
                  cerr << "Rank " << rank << ", epoch " << epoch << ", "
                       << "l1-reg. on mixedAverageWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
                }
                if (l2_regularize) {
                  mixedAverageWeights.SparseL2Regularize(l2_lambda);
                  cerr << "Rank " << rank << ", epoch " << epoch << ", "
                       << "l2-reg. on mixedAverageWeights with lambda=" << l2_lambda << endl;
                }
                if (l1_regularize || l2_regularize) {
                  filename << "_reg";
                  cerr << "Dumping regularized mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
                  mixedAverageWeights.Save(filename.str());
                }
              }

              if (weightEpochDump == weightDumpFrequency && printFeatureCounts) {
                // print out all features with counts
                stringstream s1, s2;
                s1 << "sparse_feature_hope_counts" << "_" << epoch;
                s2 << "sparse_feature_fear_counts" << "_" << epoch;
                ofstream sparseFeatureCountsHope(s1.str().c_str());
                ofstream sparseFeatureCountsFear(s2.str().c_str());
                mixedAverageWeights.PrintSparseHopeFeatureCounts(sparseFeatureCountsHope);
                mixedAverageWeights.PrintSparseFearFeatureCounts(sparseFeatureCountsFear);
                sparseFeatureCountsHope.close();
                sparseFeatureCountsFear.close();
              }
            }
          }
        } // end dumping
      } // end if dump
    } // end of shard loop, end of this epoch
    cerr << "Rank " << rank << ", epoch " << epoch << ", end of epoch.." << endl;

    if (historyBleu || simpleHistoryBleu) {
      cerr << "Bleu feature history after epoch " << epoch << endl;
      decoder->printBleuFeatureHistory(cerr);
    }
    // cerr << "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints: " << sumStillViolatedConstraints << endl;

    // Check whether there were any weight updates during this epoch
    size_t sumUpdates;
    size_t *sendbuf_uint, *recvbuf_uint;
    sendbuf_uint = (size_t*) malloc(sizeof(size_t));
    recvbuf_uint = (size_t*) malloc(sizeof(size_t));
#ifdef MPI_ENABLE
    sendbuf_uint[0] = numberOfUpdatesThisEpoch;
    recvbuf_uint[0] = 0;
    MPI_Reduce(sendbuf_uint, recvbuf_uint, 1, MPI_UNSIGNED, MPI_SUM, 0, world);
    sumUpdates = recvbuf_uint[0];
#endif
#ifndef MPI_ENABLE
    sumUpdates = numberOfUpdatesThisEpoch;
#endif
    if (rank == 0 && sumUpdates == 0) {
      cerr << "\nNo weight updates during this epoch.. stopping." << endl;
      stop = true;
#ifdef MPI_ENABLE
      mpi::broadcast(world, stop, 0);
#endif
    }

    if (!stop) {
      // Test if weights have converged
      if (weightConvergence) {
        bool reached = true;
        if (rank == 0 && (epoch >= 2)) {
          ScoreComponentCollection firstDiff, secondDiff;
          if (dumpMixedWeights) {
            firstDiff = mixedWeights;
            firstDiff.MinusEquals(mixedWeightsPrevious);
            secondDiff = mixedWeights;
            secondDiff.MinusEquals(mixedWeightsBeforePrevious);
          } else {
            firstDiff = mixedAverageWeights;
            firstDiff.MinusEquals(mixedAverageWeightsPrevious);
            secondDiff = mixedAverageWeights;
            secondDiff.MinusEquals(mixedAverageWeightsBeforePrevious);
          }
          VERBOSE(1, "Average weight changes since previous epoch: " << firstDiff << " (max: " << firstDiff.GetLInfNorm() << ")" << endl);
          VERBOSE(1, "Average weight changes since before previous epoch: " << secondDiff << " (max: " << secondDiff.GetLInfNorm() << ")" << endl << endl);

          // check whether stopping criterion has been reached
          // (both difference vectors must have all weight changes smaller than min_weight_change)
          if (firstDiff.GetLInfNorm() >= min_weight_change)
            reached = false;
          if (secondDiff.GetLInfNorm() >= min_weight_change)
            reached = false;
          if (reached) {
            // stop MIRA
            stop = true;
            cerr << "\nWeights have converged after epoch " << epoch << ".. stopping MIRA." << endl;
            ScoreComponentCollection dummy;
            ostringstream endfilename;
            endfilename << "stopping";
            dummy.Save(endfilename.str());
          }
        }

        mixedWeightsBeforePrevious = mixedWeightsPrevious;
        mixedWeightsPrevious = mixedWeights;
        mixedAverageWeightsBeforePrevious = mixedAverageWeightsPrevious;
        mixedAverageWeightsPrevious = mixedAverageWeights;
#ifdef MPI_ENABLE
        mpi::broadcast(world, stop, 0);
#endif
      } //end if (weightConvergence)
    }
  } // end of epoch loop

#ifdef MPI_ENABLE
  MPI_Finalize();
#endif

  time(&now);
  cerr << "Rank " << rank << ", " << ctime(&now);

  if (rank == 0) {
    ScoreComponentCollection dummy;
    ostringstream endfilename;
    endfilename << "finished";
    dummy.Save(endfilename.str());
  }

  delete decoder;
  exit(0);
}

bool loadSentences(const string &filename, vector<string> &sentences)
{
  ifstream in(filename.c_str());
  if (!in)
    return false;
  string line;
  while (getline(in, line))
    sentences.push_back(line);
  return true;
}

bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size)
{
  if (mix_or_dump_base == 0) return false;
  if (actual_batch_size > 1) {
    bool mix_or_dump = false;
    size_t numberSubtracts = actual_batch_size;
    do {
      if (shard_position % mix_or_dump_base == 0) {
        mix_or_dump = true;
        break;
      }
      --shard_position;
      --numberSubtracts;
    } while (numberSubtracts > 0);
    return mix_or_dump;
  } else {
    return ((shard_position % mix_or_dump_base) == 0);
  }
}
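// Example: with mix_or_dump_base = 100 and actual_batch_size = 4, shard
// positions 100 to 103 all return true, because stepping back through the
// batch reaches the multiple 100; position 104 returns false.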

void printFeatureValues(vector<vector<ScoreComponentCollection> > &featureValues)
{
  for (size_t i = 0; i < featureValues.size(); ++i) {
    for (size_t j = 0; j < featureValues[i].size(); ++j) {
      cerr << featureValues[i][j] << endl;
    }
  }
  cerr << endl;
}

void deleteTranslations(vector<vector<const Word*> > &translations)
{
  for (size_t i = 0; i < translations.size(); ++i) {
    for (size_t j = 0; j < translations[i].size(); ++j) {
      delete translations[i][j];
    }
  }
}
2013-05-29 21:16:15 +04:00
void decodeHopeOrFear ( size_t rank , size_t size , size_t decode , string filename , vector < string > & inputSentences , MosesDecoder * decoder , size_t n , float bleuWeight )
{
2012-10-01 23:36:52 +04:00
if ( decode = = 1 )
cerr < < " Rank " < < rank < < " , decoding dev input set according to hope objective.. " < < endl ;
else if ( decode = = 2 )
cerr < < " Rank " < < rank < < " , decoding dev input set according to fear objective.. " < < endl ;
else
cerr < < " Rank " < < rank < < " , decoding dev input set according to normal objective.. " < < endl ;
2013-05-29 21:16:15 +04:00
2012-10-01 23:36:52 +04:00
// Create shards according to the number of processes used
vector < size_t > order ;
for ( size_t i = 0 ; i < inputSentences . size ( ) ; + + i )
order . push_back ( i ) ;
2013-05-29 21:16:15 +04:00
2012-10-01 23:36:52 +04:00
vector < size_t > shard ;
float shardSize = ( float ) ( order . size ( ) ) / size ;
size_t shardStart = ( size_t ) ( shardSize * rank ) ;
size_t shardEnd = ( size_t ) ( shardSize * ( rank + 1 ) ) ;
if ( rank = = size - 1 ) {
shardEnd = inputSentences . size ( ) ;
shardSize = shardEnd - shardStart ;
}
VERBOSE ( 1 , " Rank " < < rank < < " , shard start: " < < shardStart < < " Shard end: " < < shardEnd < < endl ) ;
VERBOSE ( 1 , " Rank " < < rank < < " , shard size: " < < shardSize < < endl ) ;
shard . resize ( shardSize ) ;
copy ( order . begin ( ) + shardStart , order . begin ( ) + shardEnd , shard . begin ( ) ) ;
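  // Example: 1003 sentences on 4 processes gives shardSize 250.75, so ranks
  // 0-2 get sentences [0,250), [250,501), [501,752) and the last rank takes
  // the remainder, [752,1003).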

  // open files for writing
  stringstream fname;
  fname << filename << ".rank" << rank;
  filename = fname.str();
  ostringstream filename_nbest;
  filename_nbest << filename << "." << n << "best";
  ofstream out(filename.c_str());
  ofstream nbest_out((filename_nbest.str()).c_str());
  if (!out) {
    ostringstream msg;
    msg << "Unable to open " << fname.str();
    throw runtime_error(msg.str());
  }
  if (!nbest_out) {
    ostringstream msg;
    msg << "Unable to open " << filename_nbest.str();
    throw runtime_error(msg.str());
  }
  for (size_t i = 0; i < shard.size(); ++i) {
    size_t sid = shard[i];
    string &input = inputSentences[sid];

    vector<vector<ScoreComponentCollection> > dummyFeatureValues;
    vector<vector<float> > dummyBleuScores;
    vector<vector<float> > dummyModelScores;

    vector<ScoreComponentCollection> newFeatureValues;
    vector<float> newScores;
    dummyFeatureValues.push_back(newFeatureValues);
    dummyBleuScores.push_back(newScores);
    dummyModelScores.push_back(newScores);

    // factor > 0 adds BLEU to the decoder score (hope),
    // factor < 0 subtracts it (fear), 0 decodes normally
    float factor = 0.0;
    if (decode == 1) factor = 1.0;
    if (decode == 2) factor = -1.0;
    cerr << "Rank " << rank << ", translating sentence " << sid << endl;
    bool realBleu = false;
    vector<vector<const Word*> > nbestOutput = decoder->getNBest(input, sid, n, factor, bleuWeight, dummyFeatureValues[0],
        dummyBleuScores[0], dummyModelScores[0], n, realBleu, true, false, rank, 0, "");
    cerr << endl;
    decoder->cleanup(StaticData::Instance().IsChart());

    for (size_t i = 0; i < nbestOutput.size(); ++i) {
      vector<const Word*> output = nbestOutput[i];
      stringstream translation;
      for (size_t k = 0; k < output.size(); ++k) {
        Word *w = const_cast<Word*>(output[k]);
        translation << w->GetString(0);
        translation << " ";
      }
      if (i == 0)
        out << translation.str() << endl;
      nbest_out << sid << " ||| " << translation.str() << " ||| " << dummyFeatureValues[0][i] <<
                " ||| " << dummyModelScores[0][i] << " ||| sBleu=" << dummyBleuScores[0][i] << endl;
    }
  }

  out.close();
  nbest_out.close();
  cerr << "Closing files " << filename << " and " << filename_nbest.str() << endl;

#ifdef MPI_ENABLE
  MPI_Finalize();
#endif

  time_t now;
  time(&now);
  cerr << "Rank " << rank << ", " << ctime(&now);

  delete decoder;
  exit(0);
}

void applyLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, float core_r0, float sparse_r0)
{
  for (size_t i = 0; i < featureValues.size(); ++i)      // each item in batch
    for (size_t j = 0; j < featureValues[i].size(); ++j) // each item in nbest
      featureValues[i][j].MultiplyEquals(core_r0, sparse_r0);
}
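// Example: core_r0 = 0.5 and sparse_r0 = 0.1 halve every core feature value
// and scale every sparse feature value to a tenth, which shrinks the
// corresponding components of the weight update by the same factors.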

void applyPerFeatureLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, ScoreComponentCollection featureLearningRates, float sparse_r0)
{
  for (size_t i = 0; i < featureValues.size(); ++i)      // each item in batch
    for (size_t j = 0; j < featureValues[i].size(); ++j) // each item in nbest
      featureValues[i][j].MultiplyEqualsBackoff(featureLearningRates, sparse_r0);
}

void scaleFeatureScore(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch)
{
  string name = sp->GetScoreProducerDescription();

  // scale the score of this feature in every hypothesis of the batch
  float featureScore;
  for (size_t i = 0; i < featureValues.size(); ++i) {      // each item in batch
    for (size_t j = 0; j < featureValues[i].size(); ++j) { // each item in nbest
      featureScore = featureValues[i][j].GetScoreForProducer(sp);
      featureValues[i][j].Assign(sp, featureScore * scaling_factor);
      //cerr << "Rank " << rank << ", epoch " << epoch << ", " << name << " score scaled from " << featureScore << " to " << featureScore*scaling_factor << endl;
    }
  }
}
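// scaleFeatureScore handles single-valued producers; scaleFeatureScores
// below does the same for producers with multiple score components.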

void scaleFeatureScores(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch)
{
  string name = sp->GetScoreProducerDescription();

  // scale all score components of this feature in every hypothesis of the batch
  for (size_t i = 0; i < featureValues.size(); ++i) {      // each item in batch
    for (size_t j = 0; j < featureValues[i].size(); ++j) { // each item in nbest
      vector<float> featureScores = featureValues[i][j].GetScoresForProducer(sp);
      for (size_t k = 0; k < featureScores.size(); ++k)
        featureScores[k] *= scaling_factor;
      featureValues[i][j].Assign(sp, featureScores);
      //cerr << "Rank " << rank << ", epoch " << epoch << ", " << name << " scores scaled by " << scaling_factor << endl;
    }
  }
}