#!/usr/bin/perl -w

# $Id$
# Usage:
# mert-moses.pl <foreign> <english> <decoder-executable> <decoder-config>
# For other options see below or run 'mert-moses.pl --help'

# Notes:
# <foreign> and <english> should be raw text files, one sentence per line
# <english> can be a prefix, in which case the files <english>0, <english>1, etc. are used

# Excerpts from revision history

# Sept 2011   multi-threaded mert (Barry Haddow)
# 3 Aug 2011  Added random directions, historic best, pairwise ranked (PK)
# Jul 2011    simplifications (Ondrej Bojar)
#             -- rely on moses' -show-weights instead of parsing moses.ini
#                ... so moses is also run once *before* mert starts, checking
#                the model to some extent
#             -- got rid of the 'triples' mess;
#                use --range to supply bounds for random starting values:
#                --range tm:-3..3 --range lm:-3..3
# 5 Aug 2009  Handling with different reference length policies (shortest, average, closest) for BLEU
#             and case-sensitive/insensitive evaluation (Nicola Bertoldi)
# 5 Jun 2008  Forked previous version to support new mert implementation.
# 13 Feb 2007 Better handling of default values for lambda, now works with multiple
#             models and lexicalized reordering
# 11 Oct 2006 Handle different input types through parameter --inputtype=[0|1]
#             (0 for text, 1 for confusion network, default is 0) (Nicola Bertoldi)
# 10 Oct 2006 Allow skip of filtering of phrase tables (--no-filter-phrase-table)
#             useful if binary phrase tables are used (Nicola Bertoldi)
# 28 Aug 2006 Use either closest or average or shortest (default) reference
#             length as effective reference length
#             Use either normalization or not (default) of texts (Nicola Bertoldi)
# 31 Jul 2006 move gzip run*.out to avoid failure with restartings
#             adding default paths
# 29 Jul 2006 run-filter, score-nbest and mert run on the queue (Nicola; Ondrej had to type it in again)
# 28 Jul 2006 attempt at foolproof usage, strong checking of input validity, merged the parallel and nonparallel version (Ondrej Bojar)
# 27 Jul 2006 adding the safesystem() function to handle with process failure
# 22 Jul 2006 fixed a bug about handling relative path of configuration file (Nicola Bertoldi)
# 21 Jul 2006 adapted for Moses-in-parallel (Nicola Bertoldi)
# 18 Jul 2006 adapted for Moses and cleaned up (PK)
# 21 Jan 2005 unified various versions, thorough cleanup (DWC)
#             now indexing accumulated n-best list solely by feature vectors
# 14 Dec 2004 reimplemented find_threshold_points in C (NMD)
# 25 Oct 2004 Use either average or shortest (default) reference
#             length as effective reference length (DWC)
# 13 Oct 2004 Use alternative decoders (DWC)
# Original version by Philipp Koehn
use strict;

use FindBin qw($RealBin);
use File::Basename;
use File::Path;
use File::Spec;
use Cwd;

# Locate the Moses scripts root: default to the directory holding this
# script (stripping a trailing /training), but let the environment
# variable SCRIPTS_ROOTDIR override it.
my $SCRIPTS_ROOTDIR = $RealBin;
$SCRIPTS_ROOTDIR =~ s/\/training$//;
$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});

my $minimum_required_change_in_weights = 0.00001;
# stop if no lambda changes more than this

my $verbose = 0;
my $usage   = 0; # request for --help

# We assume that if you don't specify a working directory,
# the default is set to `pwd`/mert-work
my $___WORKING_DIR = File::Spec->catfile(Cwd::getcwd(), "mert-work");
my $___DEV_F   = undef; # required, input text to decode
my $___DEV_E   = undef; # required, basename of files with references
my $___DECODER = undef; # required, pathname to the decoder executable
my $___CONFIG  = undef; # required, pathname to startup ini file

my $___N_BEST_LIST_SIZE = 100;
my $___LATTICE_SAMPLES  = 0;
my $queue_flags = "-hard"; # extra parameters for parallelizer
# the -l ws0ssmt was relevant only to JHU 2006 workshop
my $___JOBS = undef; # if parallel, number of jobs to use (undef or 0 -> serial)
my $___DECODER_FLAGS = ""; # additional parameters to pass to the decoder
my $continue     = 0; # should we try to continue from the last saved step?
my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert)
my $___FILTER_PHRASE_TABLE = 1; # filter phrase table

my $___PREDICTABLE_SEEDS = 0;
my $___START_WITH_HISTORIC_BESTS = 0; # use best settings from all previous iterations as starting points [Foster&Kuhn,2009]
my $___RANDOM_DIRECTIONS = 0; # search in random directions only
my $___NUM_RANDOM_DIRECTIONS = 0; # number of random directions, also works with default optimizer [Cer&al.,2008]
my $___RANDOM_RESTARTS = 20;
my $___RETURN_BEST_DEV = 0; # return the best weights according to dev, not the last

# Flags related to PRO (Hopkins & May, 2011)
my $___PAIRWISE_RANKED_OPTIMIZER = 0; # flag to enable PRO.
my $___PRO_STARTING_POINT = 0; # get a starting point from pairwise ranked optimizer
my $___HISTORIC_INTERPOLATION = 0; # interpolate optimize weights with previous iteration's weights [Hopkins&May,2011,5.4.3]

# MegaM's options for PRO optimization.
# TODO: Should we also add these values to options of this script?
my $megam_default_options = "-fvals -maxi 30 -nobias binary";

# Flags related to Batch MIRA (Cherry & Foster, 2012)
my $___BATCH_MIRA = 0; # flag to enable batch MIRA

# Hypergraph mira
my $___HG_MIRA = 0;

# Train phrase model mixture weights with PRO (Haddow, NAACL 2012)
my $__PROMIX_TRAINING = undef; # Location of main script (contrib/promix/main.py)
# The phrase tables. These should be gzip text format.
my @__PROMIX_TABLES;

# used to filter output
my $__REMOVE_SEGMENTATION = "$SCRIPTS_ROOTDIR/ems/support/remove-segmentation-markup.perl";

my $__THREADS = 0;

# Parameter for effective reference length when computing BLEU score
# Default is to use shortest reference
# Use "--shortest" to use shortest reference length
# Use "--average" to use average reference length
# Use "--closest" to use closest reference length
# Only one between --shortest, --average and --closest can be set
# If more than one choice the default (--shortest) is used
my $___SHORTEST = 0;
my $___AVERAGE  = 0;
my $___CLOSEST  = 0;

# Use "--nocase" to compute case-insensitive scores
my $___NOCASE = 0;

# Use "--nonorm" to non normalize translation before computing scores
my $___NONORM = 0;

# set 0 if input type is text, set 1 if input type is confusion network, set 3 if input type is parse tree
my $___INPUTTYPE;

my $mertdir       = undef; # path to new mert directory
my $mertargs      = undef; # args to pass through to mert & extractor
my $mertmertargs  = undef; # args to pass through to mert only
my $extractorargs = undef; # args to pass through to extractor only
my $proargs       = undef; # args to pass through to pro only

# Args to pass through to batch mira only. This flag is useful to
# change MIRA's hyperparameters such as regularization parameter C,
# BLEU decay factor, and the number of iterations of MIRA.
my $batch_mira_args = undef;

my $filtercmd  = undef; # path to filter-model-given-input.pl
my $filterfile = undef;
my $qsubwrapper = undef;
my $moses_parallel_cmd = undef;
my $old_sge = 0; # assume sge<6.0
my $___CONFIG_ORIG = undef; # pathname to startup ini file before filtering

my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on
                                  # if undef work on all features
                                  # (others are fixed to the starting values)
my $___RANGES = undef;
my $___USE_CONFIG_WEIGHTS_FIRST = 0; # use weights in configuration file for first iteration
my $prev_aggregate_nbl_size = -1; # number of previous steps to consider when loading data (default =-1)
                                  # -1 means all previous, i.e. from iteration 1
                                  #  0 means no previous data, i.e. from actual iteration
                                  #  1 means 1 previous data, i.e. from the actual iteration and from the previous one
                                  # and so on
my $maximum_iterations = 25;

# Simulated post-editing
my $___MOSES_SIM_PE = "$SCRIPTS_ROOTDIR/generic/moses_sim_pe.py";
my $___DEV_SYMAL    = undef;
my $dev_symal_abs   = undef;
my $working_dir_abs = undef;
use Getopt::Long;
# Parse command-line options into the file-scope option globals.
GetOptions(
  "working-dir=s" => \$___WORKING_DIR,
  "input=s"       => \$___DEV_F,
  "inputtype=i"   => \$___INPUTTYPE,
  "refs=s"        => \$___DEV_E,
  "decoder=s"     => \$___DECODER,
  "config=s"      => \$___CONFIG,
  "nbest=i"       => \$___N_BEST_LIST_SIZE,
  "lattice-samples=i" => \$___LATTICE_SAMPLES,
  "queue-flags=s" => \$queue_flags,
  "jobs=i"        => \$___JOBS,
  "decoder-flags=s" => \$___DECODER_FLAGS,
  "continue"      => \$continue,
  "skip-decoder"  => \$skip_decoder,
  "shortest"      => \$___SHORTEST,
  "average"       => \$___AVERAGE,
  "closest"       => \$___CLOSEST,
  "nocase"        => \$___NOCASE,
  "nonorm"        => \$___NONORM,
  "help"          => \$usage,
  "verbose"       => \$verbose,
  "mertdir=s"     => \$mertdir,
  "mertargs=s"    => \$mertargs,
  "extractorargs=s" => \$extractorargs,
  "proargs=s"     => \$proargs,
  "mertmertargs=s" => \$mertmertargs,
  "rootdir=s"     => \$SCRIPTS_ROOTDIR,
  "filtercmd=s"   => \$filtercmd,  # allow to override the default location
  "filterfile=s"  => \$filterfile, # input to filtering script (useful for lattices/confnets)
  "qsubwrapper=s" => \$qsubwrapper, # allow to override the default location
  "mosesparallelcmd=s" => \$moses_parallel_cmd, # allow to override the default location
  "old-sge" => \$old_sge, # passed to moses-parallel
  "filter-phrase-table!" => \$___FILTER_PHRASE_TABLE, # (dis)allow of phrase tables
  "predictable-seeds" => \$___PREDICTABLE_SEEDS, # make random restarts deterministic
  "historic-bests" => \$___START_WITH_HISTORIC_BESTS, # use best settings from all previous iterations as starting points
  "random-directions" => \$___RANDOM_DIRECTIONS, # search only in random directions
  "number-of-random-directions=i" => \$___NUM_RANDOM_DIRECTIONS, # number of random directions
  "random-restarts=i" => \$___RANDOM_RESTARTS, # number of random restarts
  "return-best-dev" => \$___RETURN_BEST_DEV, # return the best weights according to dev, not the last
  "activate-features=s" => \$___ACTIVATE_FEATURES, # comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values)
  "range=s@" => \$___RANGES,
  "use-config-weights-for-first-run" => \$___USE_CONFIG_WEIGHTS_FIRST, # use the weights in the configuration file when running the decoder for the first time
  "prev-aggregate-nbestlist=i" => \$prev_aggregate_nbl_size, # number of previous steps to consider when loading data (default =-1, i.e. all previous)
  "maximum-iterations=i" => \$maximum_iterations,
  "pairwise-ranked" => \$___PAIRWISE_RANKED_OPTIMIZER,
  "pro-starting-point" => \$___PRO_STARTING_POINT,
  "historic-interpolation=f" => \$___HISTORIC_INTERPOLATION,
  "batch-mira" => \$___BATCH_MIRA,
  "hg-mira" => \$___HG_MIRA,
  "batch-mira-args=s" => \$batch_mira_args,
  "promix-training=s" => \$__PROMIX_TRAINING,
  "promix-table=s" => \@__PROMIX_TABLES,
  "threads=i" => \$__THREADS,
  "spe-symal=s" => \$___DEV_SYMAL
) or exit(1);

# the 4 required parameters can be supplied on the command line directly
# or using the --options
if (scalar @ARGV == 4) {
  # required parameters: input_file references_basename decoder_executable
  $___DEV_F   = shift;
  $___DEV_E   = shift;
  $___DECODER = shift;
  $___CONFIG  = shift;
}
# Print the usage summary and exit when --help was given or any of the
# four required parameters is missing.
if ($usage || !defined $___DEV_F || !defined $___DEV_E || !defined $___DECODER || !defined $___CONFIG) {
  print STDERR "usage: $0 input-text references decoder-executable decoder.ini
Options:
  --working-dir=mert-dir ... where all the files are created
  --nbest=100            ... how big nbestlist to generate
  --lattice-samples      ... how many lattice samples (Chatterjee & Cancedda, emnlp 2010)
  --jobs=N               ... set this to anything to run moses in parallel
  --mosesparallelcmd=STR ... use a different script instead of moses-parallel
  --queue-flags=STRING   ... anything you with to pass to qsub, eg.
                             '-l ws06osssmt=true'. The default is: '-hard'
                             To reset the parameters, please use
                             --queue-flags=' '
                             (i.e. a space between the quotes).
  --decoder-flags=STRING ... extra parameters for the decoder
  --continue             ... continue from the last successful iteration
  --skip-decoder         ... skip the decoder run for the first time,
                             assuming that we got interrupted during
                             optimization
  --shortest --average --closest
                         ... Use shortest/average/closest reference length
                             as effective reference length (mutually exclusive)
  --nocase               ... Do not preserve case information; i.e.
                             case-insensitive evaluation (default is false).
  --nonorm               ... Do not use text normalization (flag is not active,
                             i.e. text is NOT normalized)
  --filtercmd=STRING     ... path to filter-model-given-input.pl
  --filterfile=STRING    ... path to alternative to input-text for filtering
                             model. useful for lattice decoding
  --rootdir=STRING       ... where do helpers reside (if not given explicitly)
  --mertdir=STRING       ... path to new mert implementation
  --mertargs=STRING      ... extra args for both extractor and mert
  --extractorargs=STRING ... extra args for extractor only
  --mertmertargs=STRING  ... extra args for mert only
  --scorenbestcmd=STRING ... path to score-nbest.py
  --old-sge              ... passed to parallelizers, assume Grid Engine <6.0
  --inputtype=[0|1|2]    ... Handle different input types: (0 for text,
                             1 for confusion network, 2 for lattices,
                             default is 0)
  --no-filter-phrase-table ... disallow filtering of phrase tables
                              (useful if binary phrase tables are available)
  --random-restarts=INT  ... number of random restarts (default: 20)
  --predictable-seeds    ... provide predictable seeds to mert so that random
                             restarts are the same on every run
  --range=tm:0..1,-1..1  ... specify min and max value for some features
                             --range can be repeated as needed.
                             The order of the various --range specifications
                             is important only within a feature name.
                             E.g.:
                               --range=tm:0..1,-1..1 --range=tm:0..2
                             is identical to:
                               --range=tm:0..1,-1..1,0..2
                             but not to:
                               --range=tm:0..2 --range=tm:0..1,-1..1
  --activate-features=STRING  ... comma-separated list of features to optimize,
                                  others are fixed to the starting values
                                  default: optimize all features
                                  example: tm_0,tm_4,d_0
  --prev-aggregate-nbestlist=INT ... number of previous step to consider when
                                     loading data (default = $prev_aggregate_nbl_size)
                                    -1 means all previous, i.e. from iteration 1
                                     0 means no previous data, i.e. only the
                                       current iteration
                                     N means this and N previous iterations

  --maximum-iterations=ITERS ... Maximum number of iterations. Default: $maximum_iterations
  --return-best-dev          ... Return the weights according to dev bleu, instead of returning
                                 the last iteration
  --random-directions               ... search only in random directions
  --number-of-random-directions=int ... number of random directions
                                        (also works with regular optimizer, default: 0)
  --pairwise-ranked         ... Use PRO for optimisation (Hopkins and May, emnlp 2011)
  --pro-starting-point      ... Use PRO to get a starting point for MERT
  --batch-mira              ... Use Batch MIRA for optimisation (Cherry and Foster, NAACL 2012)
  --hg-mira                 ... Use hypergraph MIRA, ie batch mira with hypergraphs instead of kbests.
  --batch-mira-args=STRING  ... args to pass through to batch/hg MIRA. This flag is useful to
                                change MIRA's hyperparameters such as regularization parameter C,
                                BLEU decay factor, and the number of iterations of MIRA.
  --promix-training=STRING  ... PRO-based mixture model training (Haddow, NAACL 2013)
  --promix-tables=STRING    ... Phrase tables for PRO-based mixture model training.
  --threads=NUMBER          ... Use multi-threaded mert (must be compiled in).
  --historic-interpolation  ... Interpolate optimized weights with prior iterations' weight
                                (parameter sets factor [0;1] given to current weights)
  --spe-symal=SYMAL         ... Use simulated post-editing when decoding.
                                (SYMAL aligns input to refs)
";
  exit 1;
}
# Check validity of input parameters and set defaults if needed
print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n";

# path of script for filtering phrase tables and running the decoder
$filtercmd = File::Spec->catfile($SCRIPTS_ROOTDIR, "training", "filter-model-given-input.pl") if !defined $filtercmd;

# WHY ... ! ___FILTER_PHRASE_TABLE ??? This doesn't make sense! [UG]
# if ( ! -x $filtercmd && ! $___FILTER_PHRASE_TABLE) {
if (! -x $filtercmd && $___FILTER_PHRASE_TABLE) {
  warn "Filtering command not found: $filtercmd.";
  warn "Use --filtercmd=PATH to specify a valid one or --no-filter-phrase-table";
  exit 1;
}

$qsubwrapper = File::Spec->catfile($SCRIPTS_ROOTDIR, "generic", "qsub-wrapper.pl") if !defined $qsubwrapper;

$moses_parallel_cmd = File::Spec->catfile($SCRIPTS_ROOTDIR, "generic", "moses-parallel.pl")
  if !defined $moses_parallel_cmd;

# Default mert tools directory: <scripts-root>/../bin
if (!defined $mertdir) {
  $mertdir = File::Spec->catfile(File::Basename::dirname($SCRIPTS_ROOTDIR), "bin");
  die "mertdir does not exist: $mertdir" if ! -x $mertdir;
  print STDERR "Assuming --mertdir=$mertdir\n";
}

my $mert_extract_cmd = File::Spec->catfile($mertdir, "extractor");
my $mert_mert_cmd    = File::Spec->catfile($mertdir, "mert");
my $mert_pro_cmd     = File::Spec->catfile($mertdir, "pro");
my $mert_mira_cmd    = File::Spec->catfile($mertdir, "kbmira");
my $mert_eval_cmd    = File::Spec->catfile($mertdir, "evaluator");

die "Not executable: $mert_extract_cmd" if ! -x $mert_extract_cmd;
die "Not executable: $mert_mert_cmd"    if ! -x $mert_mert_cmd;
die "Not executable: $mert_pro_cmd"     if ! -x $mert_pro_cmd;
die "Not executable: $mert_mira_cmd"    if ! -x $mert_mira_cmd;
die "Not executable: $mert_eval_cmd"    if ! -x $mert_eval_cmd;

my $pro_optimizer = File::Spec->catfile($mertdir, "megam_i686.opt"); # or set to your installation

# PRO needs the megam optimizer; try to download and install it on demand.
if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) {
  print "Could not find $pro_optimizer, installing it in $mertdir\n";
  my $megam_url = "http://hal3.name/megam";
  if (&is_mac_osx()) {
    die "Error: Sorry for Mac OS X users! Please get the source code of megam and compile by hand. Please see $megam_url for details.";
  }
  `cd $mertdir; wget $megam_url/megam_i686.opt.gz;`;
  `gunzip $pro_optimizer.gz`;
  `chmod +x $pro_optimizer`;
  die("ERROR: Installation of megam_i686.opt failed! Install by hand from $megam_url") unless -x $pro_optimizer;
}

# Sanity checks for PRO-based mixture-model (promix) training.
if ($__PROMIX_TRAINING) {
  die "Not executable $__PROMIX_TRAINING" unless -x $__PROMIX_TRAINING;
  die "For promix training, specify the tables using --promix-table arguments" unless @__PROMIX_TABLES;
  die "For mixture model, need at least 2 tables" unless scalar(@__PROMIX_TABLES) > 1;

  for my $TABLE (@__PROMIX_TABLES) {
    die "Phrase table $TABLE not found" unless -r $TABLE;
  }
  die "To use promix training, need to specify a filter and binarisation command" unless $filtercmd =~ /Binarizer/;
}
# If no generic mert args were given, fall back to the batch-mira args
# (when present) so --batch-mira-args alone still takes effect.
if (!defined $mertargs) {
  if (defined $batch_mira_args) {
    $mertargs = $batch_mira_args;
  } else {
    $mertargs = "";
  }
}

# Pull any --scconfig value out of $mertargs; it is handled separately
# and re-appended to the extractor/mert argument strings below.
my $scconfig = undef;
if ($mertargs =~ /\-\-scconfig(?:\s+|=)(.+?)(\s|$)/) {
  $scconfig = $1;
  $scconfig =~ s/\,/ /g;
  $mertargs =~ s/\-\-scconfig(?:\s+|=)(.+?)(\s|$)//;
}

# Likewise extract --sctype (scorer type); default is BLEU.
my $sctype = "--sctype BLEU";
if ($mertargs =~ /(\-\-sctype(?:\s+|=).+?)(\s|$)/) {
  $sctype = $1;
  $mertargs =~ s/(\-\-sctype(?:\s+|=)+.+?)(\s|$)//;
}

# handling reference length strategy
$scconfig .= &setup_reference_length_type();

# handling case-insensitive flag
$scconfig .= &setup_case_config();

$scconfig =~ s/^\s+//;
$scconfig =~ s/\s+$//;
$scconfig =~ s/\s+/,/g;

$scconfig = "--scconfig $scconfig" if ($scconfig);

my $mert_extract_args = "$sctype $scconfig";

$extractorargs = "" unless $extractorargs;
$mert_extract_args .= " $extractorargs";

$mertmertargs = "" if !defined $mertmertargs;
$proargs      = "" unless $proargs;

my $mert_mert_args = "$mertargs $mertmertargs";
$mert_mert_args =~ s/\-+(binary|b)\b//;
$mert_mert_args .= " $scconfig";
if ($___ACTIVATE_FEATURES) {
  $mert_mert_args .= " -o \"$___ACTIVATE_FEATURES\"";
}

my ($just_cmd_filtercmd, $x) = split(/ /, $filtercmd);
die "Not executable: $just_cmd_filtercmd" if $___FILTER_PHRASE_TABLE && ! -x $just_cmd_filtercmd;
die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_parallel_cmd;
die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper;
die "Not executable: $___DECODER" if ! -x $___DECODER;
# Resolve the dev input file to an absolute path and verify it exists.
my $input_abs = ensure_full_path($___DEV_F);
die "File not found: $___DEV_F (interpreted as $input_abs)." if ! -e $input_abs;
$___DEV_F = $input_abs;

# Option to pass to qsubwrapper and moses-parallel
my $pass_old_sge = $old_sge ? "-old-sge" : "";

my $decoder_abs = ensure_full_path($___DECODER);
die "File not executable: $___DECODER (interpreted as $decoder_abs)."
  if ! -x $decoder_abs;
$___DECODER = $decoder_abs;

my $ref_abs = ensure_full_path($___DEV_E);
# check if English dev set (reference translations) exist and store a list of all references
my @references;
if (-e $ref_abs) {
  push @references, $ref_abs;
} else {
  # if multiple file, get a full list of the files
  my $part = 0;
  if (! -e $ref_abs . "0" && -e $ref_abs . ".ref0") {
    $ref_abs .= ".ref";
  }
  while (-e $ref_abs . $part) {
    push @references, $ref_abs . $part;
    $part++;
  }
  die("Reference translations not found: $___DEV_E (interpreted as $ref_abs)") unless $part;
}

my $config_abs = ensure_full_path($___CONFIG);
die "File not found: $___CONFIG (interpreted as $config_abs)." if ! -e $config_abs;
$___CONFIG = $config_abs;

# moses should use our config
if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
    || $___DECODER_FLAGS =~ /(^|\s)-(ttable-file|t) /
    || $___DECODER_FLAGS =~ /(^|\s)-(distortion-file) /
    || $___DECODER_FLAGS =~ /(^|\s)-(generation-file) /
    || $___DECODER_FLAGS =~ /(^|\s)-(lmodel-file) /
    || $___DECODER_FLAGS =~ /(^|\s)-(global-lexical-file) /
   ) {
  die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files.";
}

# Paths needed for simulated post-editing
$working_dir_abs = ensure_full_path($___WORKING_DIR);
if (defined $___DEV_SYMAL) {
  $dev_symal_abs = ensure_full_path($___DEV_SYMAL);
}

# as weights are normalized in the next steps (by cmert)
# normalize initial LAMBDAs, too
my $need_to_normalize = 1;

# store current directory and create the working directory (if needed)
my $cwd = Cwd::getcwd();

mkpath($___WORKING_DIR);
# open local scope
{
# chdir to the working directory
chdir($___WORKING_DIR) or die "Can't chdir to $___WORKING_DIR";

# Fixed file names used throughout the optimization loop.
my $mert_outfile = "mert.out";
my $mert_logfile = "mert.log";
my $weights_in_file = "init.opt";
my $weights_out_file = "weights.txt";
my $finished_step_file = "finished_step.txt";

# set start run
my $start_run = 1;
my $bestpoint = undef;
my $devbleu = undef;
my $sparse_weights_file = undef;

# Comma-separated lists of data files carried over from previous iterations.
my $prev_feature_file = undef;
my $prev_score_file = undef;
my $prev_init_file = undef;
my @allnbests;

# If we're doing promix training, need to make sure the appropriate
# tables are in place
my @_PROMIX_TABLES_BIN;
if ($__PROMIX_TRAINING) {
  print STDERR "Training mixture model using promix\n";
  for (my $i = 0; $i < scalar(@__PROMIX_TABLES); ++$i) {
    # Create filtered, binarised tables
    my $filtered_config = "moses_$i.ini";
    substitute_ttable($___CONFIG, $filtered_config, $__PROMIX_TABLES[$i]);
    # TODO: Remove reordering table from config, as we don't need to filter
    # and binarise it.
    my $filtered_path = "filtered_$i";
    my $___FILTER_F = $___DEV_F;
    $___FILTER_F = $filterfile if (defined $filterfile);
    my $cmd = "$filtercmd ./$filtered_path $filtered_config $___FILTER_F";
    submit_or_exec($cmd, "filterphrases_$i.out", "filterphrases_$i.err");
    push @_PROMIX_TABLES_BIN, "$filtered_path/phrase-table.0-0.1.1";
  }
}
if ($___FILTER_PHRASE_TABLE) {
  my $outdir = "filtered";
  if (-e "$outdir/moses.ini") {
    print STDERR "Assuming the tables are already filtered, reusing $outdir/moses.ini\n";
  } else {
    # filter the phrase tables with respect to input, use --decoder-flags
    print STDERR "filtering the phrase tables... " . `date`;
    my $___FILTER_F = $___DEV_F;
    $___FILTER_F = $filterfile if (defined $filterfile);
    my $cmd = "$filtercmd ./$outdir $___CONFIG $___FILTER_F";
    submit_or_exec($cmd, "filterphrases.out", "filterphrases.err");
  }

  # make a backup copy of startup ini filepath
  $___CONFIG_ORIG = $___CONFIG;
  # the decoder should now use the filtered model
  $___CONFIG = "$outdir/moses.ini";
} else {
  # do not filter phrase tables (useful if binary phrase tables are available)
  # use the original configuration file
  $___CONFIG_ORIG = $___CONFIG;
}

# we run moses to check validity of moses.ini and to obtain all the feature
# names
my $featlist = get_featlist_from_moses($___CONFIG);
$featlist = insert_ranges_to_featlist($featlist, $___RANGES);

# Mark which features are disabled.  Feature names may repeat, so the n-th
# occurrence of a name is matched against "name_n" in --activate-features.
if (defined $___ACTIVATE_FEATURES) {
  my %enabled = map { ($_, 1) } split /[, ]+/, $___ACTIVATE_FEATURES;
  my %cnt;
  for (my $i = 0; $i < scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    $cnt{$name} = 0 if !defined $cnt{$name};
    $featlist->{"enabled"}->[$i] = $enabled{$name . "_" . $cnt{$name}};
    $cnt{$name}++;
  }
} else {
  # all enabled
  for (my $i = 0; $i < scalar(@{$featlist->{"names"}}); $i++) {
    $featlist->{"enabled"}->[$i] = 1;
  }
}

print STDERR "MERT starting values and ranges for random generation:\n";
for (my $i = 0; $i < scalar(@{$featlist->{"names"}}); $i++) {
  my $name = $featlist->{"names"}->[$i];
  my $val = $featlist->{"values"}->[$i];
  my $min = $featlist->{"mins"}->[$i];
  my $max = $featlist->{"maxs"}->[$i];
  my $enabled = $featlist->{"enabled"}->[$i];
  printf STDERR " %5s = %7.3f", $name, $val;
  if ($enabled) {
    printf STDERR " (%5.2f .. %5.2f)\n", $min, $max;
  } else {
    print STDERR " --- inactive, not optimized ---\n";
  }
}
if ($continue) {
  # getting the last finished step
  print STDERR "Trying to continue an interrupted optimization.\n";
  open my $fh, '<', $finished_step_file or die "$finished_step_file: $!";
  my $step = <$fh>;
  chomp $step;
  close $fh;

  print STDERR "Last finished step is $step\n";

  # getting the first needed step
  my $firststep;
  if ($prev_aggregate_nbl_size == -1) {
    $firststep = 1;
  } else {
    $firststep = $step - $prev_aggregate_nbl_size + 1;
    $firststep = ($firststep > 0) ? $firststep : 1;
  }

  # checking if all needed data are available
  if ($firststep <= $step) {
    print STDERR "First previous needed data index is $firststep\n";
    print STDERR "Checking whether all needed data (from step $firststep to step $step) are available\n";

    for (my $prevstep = $firststep; $prevstep <= $step; $prevstep++) {
      print STDERR "Checking whether data of step $prevstep are available\n";
      if (! -e "run$prevstep.features.dat") {
        die "Can't start from step $step, because run$prevstep.features.dat was not found!";
      } else {
        if (defined $prev_feature_file) {
          $prev_feature_file = "${prev_feature_file},run$prevstep.features.dat";
        } else {
          $prev_feature_file = "run$prevstep.features.dat";
        }
      }
      if (! -e "run$prevstep.scores.dat") {
        die "Can't start from step $step, because run$prevstep.scores.dat was not found!";
      } else {
        if (defined $prev_score_file) {
          $prev_score_file = "${prev_score_file},run$prevstep.scores.dat";
        } else {
          $prev_score_file = "run$prevstep.scores.dat";
        }
      }
      if (! -e "run$prevstep.${weights_in_file}") {
        die "Can't start from step $step, because run$prevstep.${weights_in_file} was not found!";
      } else {
        if (defined $prev_init_file) {
          $prev_init_file = "${prev_init_file},run$prevstep.${weights_in_file}";
        } else {
          $prev_init_file = "run$prevstep.${weights_in_file}";
        }
      }
    }
    # The last step additionally needs its weights, mert log and n-best list.
    if (! -e "run$step.weights.txt") {
      die "Can't start from step $step, because run$step.weights.txt was not found!";
    }
    if (! -e "run$step.$mert_logfile") {
      die "Can't start from step $step, because run$step.$mert_logfile was not found!";
    }
    if (! -e "run$step.best$___N_BEST_LIST_SIZE.out.gz") {
      die "Can't start from step $step, because run$step.best$___N_BEST_LIST_SIZE.out.gz was not found!";
    }
    print STDERR "All needed data are available\n";
    print STDERR "Loading information from last step ($step)\n";

    my %dummy; # sparse features
    ($bestpoint, $devbleu) = get_weights_from_mert("run$step.$mert_outfile", "run$step.$mert_logfile", scalar @{$featlist->{"names"}}, \%dummy);
    die "Failed to parse mert.log, missed Best point there."
      if !defined $bestpoint || !defined $devbleu;
    print "($step) BEST at $step $bestpoint => $devbleu at " . `date`;
    my @newweights = split /\s+/, $bestpoint;

    # Sanity check: order of lambdas must match
    if (!$___HG_MIRA) {
      sanity_check_order_of_lambdas($featlist,
        "gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |");
    } else {
      print STDERR "WARN: No sanity check of order of features in hypergraph mira\n";
    }

    # update my cache of lambda values
    $featlist->{"values"} = \@newweights;
  } else {
    print STDERR "No previous data are needed\n";
  }
  $start_run = $step + 1;
}
###### MERT MAIN LOOP

my $run = $start_run - 1;

my $oldallsorted = undef;
my $allsorted = undef;
my $nbest_file = undef;
my $lsamp_file = undef;       # Lattice samples
my $hypergraph_dir = undef;
my $orig_nbest_file = undef;  # replaced if lattice sampling
# For mixture modelling
my @promix_weights;
my $num_mixed_phrase_features;
my $interpolated_config;
my $uninterpolated_config;    # backup of config without interpolated ttable

while (1) {
  $run++;
  if ($maximum_iterations && $run > $maximum_iterations) {
    print "Maximum number of iterations exceeded - stopping\n";
    last;
  }
  print "run $run start at " . `date`;

  if ($__PROMIX_TRAINING) {
    # Need to create an ini file for the interpolated phrase table
    if (!@promix_weights) {
      # Create initial weights, distributing evenly between tables
      # total number of weights is 1 less than number of phrase features, multiplied
      # by the number of tables
      $num_mixed_phrase_features = (grep { $_ eq 'tm' } @{$featlist->{"names"}}) - 1;

      @promix_weights = (1.0 / scalar(@__PROMIX_TABLES)) x
        ($num_mixed_phrase_features * scalar(@__PROMIX_TABLES));
    }

    # backup orig config, so we always add the table into it
    $uninterpolated_config = $___CONFIG unless $uninterpolated_config;

    # Interpolation: build the table spec "interpolate 1:tbl0 1:tbl1 ..."
    my $interpolated_phrase_table = "interpolate";
    for my $itable (@_PROMIX_TABLES_BIN) {
      $interpolated_phrase_table .= " 1:$itable";
    }

    # Create an ini file for the interpolated phrase table
    $interpolated_config = "moses.interpolated.ini";
    substitute_ttable($uninterpolated_config, $interpolated_config, $interpolated_phrase_table, "99");

    # Append the multimodel weights.
    # (3-arg open with a lexical handle replaces the old unchecked-mode
    # bareword open(ITABLE,">>$file").)
    open my $itable_fh, '>>', $interpolated_config
      or die "Failed to append weights to $interpolated_config";
    print $itable_fh "\n";
    print $itable_fh "[weight-t-multimodel]\n";
    # Weights are laid out table-major: all features of table 0, then table 1, ...
    for my $iweight (@promix_weights) {
      print $itable_fh $iweight . "\n";
    }
    close $itable_fh;

    # the decoder should now use the interpolated model
    $___CONFIG = "$interpolated_config";
  }
2008-06-10 13:07:20 +04:00
# run beamdecoder with option to output nbestlists
# the end result should be (1) @NBEST_LIST, a list of lists; (2) @SCORE, a list of lists of lists
# In case something dies later, we might wish to have a copy
2012-05-04 10:40:50 +04:00
create_config ( $ ___CONFIG , "./run$run.moses.ini" , $ featlist , $ run , ( defined $ devbleu ? $ devbleu : "--not-estimated--" ) , $ sparse_weights_file ) ;
2008-06-10 13:07:20 +04:00
2012-06-13 07:02:04 +04:00
# Save dense weights to simplify best dev recovery
{
my $ densefile = "run$run.dense" ;
my @ vals = @ { $ featlist - > { "values" } } ;
my @ names = @ { $ featlist - > { "names" } } ;
open my $ denseout , '>' , $ densefile or die "Can't write $densefile (WD now $___WORKING_DIR)" ;
for ( my $ i = 0 ; $ i < scalar ( @ { $ featlist - > { "names" } } ) ; $ i + + ) {
2013-05-17 19:13:24 +04:00
print $ denseout "$names[$i]= $vals[$i]\n" ;
2012-06-13 07:02:04 +04:00
}
close $ denseout ;
}
2011-09-07 20:37:33 +04:00
# skip running the decoder if the user wanted
2012-05-04 10:40:50 +04:00
if ( ! $ skip_decoder ) {
print "($run) run decoder to produce n-best lists\n" ;
2014-05-23 00:20:14 +04:00
( $ nbest_file , $ lsamp_file , $ hypergraph_dir ) = run_decoder ( $ featlist , $ run , $ need_to_normalize ) ;
2012-05-04 10:40:50 +04:00
$ need_to_normalize = 0 ;
if ( $ ___LATTICE_SAMPLES ) {
my $ combined_file = "$nbest_file.comb" ;
safesystem ( "sort -k1,1n $nbest_file $lsamp_file > $combined_file" ) or
2011-10-05 00:45:47 +04:00
die ( "failed to merge nbest and lattice samples" ) ;
2012-05-04 10:40:50 +04:00
safesystem ( "gzip -f $nbest_file; gzip -f $lsamp_file" ) or
2011-10-05 00:45:47 +04:00
die "Failed to gzip nbests and lattice samples" ;
2012-05-04 10:40:50 +04:00
$ orig_nbest_file = "$nbest_file.gz" ;
$ orig_nbest_file = "$nbest_file.gz" ;
$ lsamp_file = "$lsamp_file.gz" ;
$ lsamp_file = "$lsamp_file.gz" ;
$ nbest_file = "$combined_file" ;
}
2014-05-23 00:20:14 +04:00
safesystem ( "gzip -f $nbest_file" ) or die "Failed to gzip run*out" unless $ ___HG_MIRA ;
2012-05-04 10:40:50 +04:00
$ nbest_file = $ nbest_file . ".gz" ;
2012-05-01 23:03:44 +04:00
} else {
2012-05-04 10:40:50 +04:00
$ nbest_file = "run$run.best$___N_BEST_LIST_SIZE.out.gz" ;
print "skipped decoder run $run\n" ;
$ skip_decoder = 0 ;
$ need_to_normalize = 0 ;
2008-06-10 13:07:20 +04:00
}
2014-05-23 00:20:14 +04:00
# extract score statistics and features from the nbest lists
print STDERR "Scoring the nbestlist.\n" ;
2008-12-30 20:33:16 +03:00
2014-05-23 00:20:14 +04:00
my $ base_feature_file = "features.dat" ;
my $ base_score_file = "scores.dat" ;
my $ feature_file = "run$run.${base_feature_file}" ;
my $ score_file = "run$run.${base_score_file}" ;
2011-10-10 08:28:55 +04:00
2014-05-23 00:20:14 +04:00
my $ cmd = "$mert_extract_cmd $mert_extract_args --scfile $score_file --ffile $feature_file -r " . join ( "," , @ references ) . " -n $nbest_file" ;
if ( ! $ ___HG_MIRA ) {
$ cmd . = " -d" if $ __PROMIX_TRAINING ; # Allow duplicates
# remove segmentation
$ cmd . = " -l $__REMOVE_SEGMENTATION" if $ __PROMIX_TRAINING ;
$ cmd = & create_extractor_script ( $ cmd , $ ___WORKING_DIR ) ;
& submit_or_exec ( $ cmd , "extract.out" , "extract.err" ) ;
}
2008-06-10 13:07:20 +04:00
2011-08-17 13:15:19 +04:00
# Create the initial weights file for mert: init.opt
2012-05-04 10:40:50 +04:00
my @ MIN = @ { $ featlist - > { "mins" } } ;
my @ MAX = @ { $ featlist - > { "maxs" } } ;
2011-08-17 13:15:19 +04:00
my @ CURR = @ { $ featlist - > { "values" } } ;
my @ NAME = @ { $ featlist - > { "names" } } ;
2012-05-01 22:46:36 +04:00
2012-05-02 00:07:14 +04:00
open my $ out , '>' , $ weights_in_file or die "Can't write $weights_in_file (WD now $___WORKING_DIR)" ;
2012-05-04 10:40:50 +04:00
print $ out join ( " " , @ CURR ) . "\n" ;
print $ out join ( " " , @ MIN ) . "\n" ; # this is where we could pass MINS
print $ out join ( " " , @ MAX ) . "\n" ; # this is where we could pass MAXS
2012-05-02 00:07:14 +04:00
close $ out ;
2011-08-17 13:15:19 +04:00
# print join(" ", @NAME)."\n";
2012-05-01 22:46:36 +04:00
2008-06-10 13:07:20 +04:00
# make a backup copy labelled with this run number
safesystem ( "\\cp -f $weights_in_file run$run.$weights_in_file" ) or die ;
my $ DIM = scalar ( @ CURR ) ; # number of lambdas
# run mert
2011-09-16 15:55:49 +04:00
$ cmd = "$mert_mert_cmd -d $DIM $mert_mert_args" ;
2012-05-01 22:46:36 +04:00
2011-09-16 15:55:49 +04:00
my $ mert_settings = " -n $___RANDOM_RESTARTS" ;
2011-11-16 21:36:04 +04:00
my $ seed_settings = "" ;
2009-02-25 22:31:17 +03:00
if ( $ ___PREDICTABLE_SEEDS ) {
2012-05-04 10:40:50 +04:00
my $ seed = $ run * 1000 ;
$ seed_settings . = " -r $seed" ;
2009-02-25 22:31:17 +03:00
}
2011-11-16 21:36:04 +04:00
$ mert_settings . = $ seed_settings ;
2011-07-23 04:24:45 +04:00
if ( $ ___RANDOM_DIRECTIONS ) {
if ( $ ___NUM_RANDOM_DIRECTIONS == 0 ) {
2011-09-16 15:55:49 +04:00
$ mert_settings . = " -m 50" ;
2011-07-23 04:24:45 +04:00
}
2011-09-16 15:55:49 +04:00
$ mert_settings . = " -t random-direction" ;
2011-07-23 04:24:45 +04:00
}
if ( $ ___NUM_RANDOM_DIRECTIONS ) {
2011-09-16 15:55:49 +04:00
$ mert_settings . = " -m $___NUM_RANDOM_DIRECTIONS" ;
}
if ( $ __THREADS ) {
$ mert_settings . = " --threads $__THREADS" ;
2011-07-23 04:24:45 +04:00
}
2008-12-30 20:33:16 +03:00
2011-11-16 21:36:04 +04:00
my $ ffiles = "" ;
my $ scfiles = "" ;
2012-05-01 23:03:44 +04:00
2008-12-30 20:33:16 +03:00
if ( defined $ prev_feature_file ) {
2011-11-16 21:36:04 +04:00
$ ffiles = "$prev_feature_file,$feature_file" ;
2012-05-01 23:03:44 +04:00
} else {
2011-11-16 21:36:04 +04:00
$ ffiles = "$feature_file" ;
2008-12-30 20:33:16 +03:00
}
2012-05-01 23:03:44 +04:00
2008-12-30 20:33:16 +03:00
if ( defined $ prev_score_file ) {
2011-11-16 21:36:04 +04:00
$ scfiles = "$prev_score_file,$score_file" ;
2012-05-01 23:03:44 +04:00
} else {
2011-11-16 21:36:04 +04:00
$ scfiles = "$score_file" ;
2008-12-30 20:33:16 +03:00
}
2011-11-16 21:36:04 +04:00
2012-05-29 21:38:57 +04:00
my $ mira_settings = "" ;
2014-05-23 00:20:14 +04:00
if ( ( $ ___BATCH_MIRA || $ ___HG_MIRA ) && $ batch_mira_args ) {
2012-06-02 14:55:56 +04:00
$ mira_settings . = "$batch_mira_args " ;
}
2014-07-21 14:43:37 +04:00
#$mira_settings .= " --dense-init run$run.$weights_in_file";
$ mira_settings . = " --dense-init run$run.dense" ;
2012-05-29 21:38:57 +04:00
if ( - e "run$run.sparse-weights" ) {
$ mira_settings . = " --sparse-init run$run.sparse-weights" ;
}
2011-11-16 21:36:04 +04:00
my $ file_settings = " --ffile $ffiles --scfile $scfiles" ;
2012-05-04 10:40:50 +04:00
my $ pro_file_settings = "--ffile " . join ( " --ffile " , split ( /,/ , $ ffiles ) ) .
" --scfile " . join ( " --scfile " , split ( /,/ , $ scfiles ) ) ;
2012-05-01 22:46:36 +04:00
2012-07-31 20:17:10 +04:00
push @ allnbests , $ nbest_file ;
2013-02-22 01:40:01 +04:00
my $ promix_file_settings =
2012-07-31 20:17:10 +04:00
"--scfile " . join ( " --scfile " , split ( /,/ , $ scfiles ) ) .
" --nbest " . join ( " --nbest " , @ allnbests ) ;
2011-07-23 04:24:45 +04:00
if ( $ ___START_WITH_HISTORIC_BESTS && defined $ prev_init_file ) {
2011-09-16 15:55:49 +04:00
$ file_settings . = " --ifile $prev_init_file,run$run.$weights_in_file" ;
2012-05-04 10:40:50 +04:00
} else {
2011-09-16 15:55:49 +04:00
$ file_settings . = " --ifile run$run.$weights_in_file" ;
2011-07-23 04:24:45 +04:00
}
2009-10-01 20:07:13 +04:00
2011-09-16 15:55:49 +04:00
$ cmd . = $ file_settings ;
2011-09-07 12:08:35 +04:00
2012-01-13 20:52:15 +04:00
my % sparse_weights ; # sparse features
2012-05-03 03:29:39 +04:00
my $ pro_optimizer_cmd = "$pro_optimizer $megam_default_options run$run.pro.data" ;
2012-05-04 10:40:50 +04:00
if ( $ ___PAIRWISE_RANKED_OPTIMIZER ) { # pro optimization
2013-02-18 15:11:20 +04:00
$ cmd = "$mert_pro_cmd $proargs $seed_settings $pro_file_settings -o run$run.pro.data ; echo 'not used' > $weights_out_file; $pro_optimizer_cmd" ;
2012-05-04 10:40:50 +04:00
& submit_or_exec ( $ cmd , $ mert_outfile , $ mert_logfile ) ;
} elsif ( $ ___PRO_STARTING_POINT ) { # First, run pro, then mert
2011-09-16 15:55:49 +04:00
# run pro...
2013-02-18 15:11:20 +04:00
my $ pro_cmd = "$mert_pro_cmd $proargs $seed_settings $pro_file_settings -o run$run.pro.data ; $pro_optimizer_cmd" ;
2012-05-04 10:40:50 +04:00
& submit_or_exec ( $ pro_cmd , "run$run.pro.out" , "run$run.pro.err" ) ;
2011-09-16 15:55:49 +04:00
# ... get results ...
2013-02-22 01:40:01 +04:00
( $ bestpoint , $ devbleu ) = & get_weights_from_mert ( "run$run.pro.out" , "run$run.pro.err" , scalar @ { $ featlist - > { "names" } } , \ % sparse_weights , \ @ promix_weights ) ;
2012-01-13 20:52:15 +04:00
# Get the pro outputs ready for mert. Add the weight ranges,
# and a weight and range for the single sparse feature
$ cmd =~ s/--ifile (\S+)/--ifile run$run.init.pro/ ;
open ( MERT_START , $ 1 ) ;
2011-09-16 15:55:49 +04:00
open ( PRO_START , ">run$run.init.pro" ) ;
2012-01-13 20:52:15 +04:00
print PRO_START $ bestpoint . " 1\n" ;
my $ mert_line = <MERT_START> ;
$ mert_line = <MERT_START> ;
chomp $ mert_line ;
print PRO_START $ mert_line . " 0\n" ;
$ mert_line = <MERT_START> ;
chomp $ mert_line ;
print PRO_START $ mert_line . " 1\n" ;
2011-09-16 15:55:49 +04:00
close ( PRO_START ) ;
2012-01-13 20:52:15 +04:00
# Write the sparse weights to file so mert can use them
open ( SPARSE_WEIGHTS , ">run$run.merge-weights" ) ;
foreach my $ fname ( keys % sparse_weights ) {
print SPARSE_WEIGHTS "$fname $sparse_weights{$fname}\n" ;
}
close ( SPARSE_WEIGHTS ) ;
$ cmd = $ cmd . " --sparse-weights run$run.merge-weights" ;
2011-09-16 15:55:49 +04:00
# ... and run mert
$ cmd =~ s/(--ifile \S+)/$1,run$run.init.pro/ ;
2012-05-04 10:40:50 +04:00
& submit_or_exec ( $ cmd . $ mert_settings , $ mert_outfile , $ mert_logfile ) ;
2012-05-29 21:38:57 +04:00
} elsif ( $ ___BATCH_MIRA ) { # batch MIRA optimization
safesystem ( "echo 'not used' > $weights_out_file" ) or die ;
$ cmd = "$mert_mira_cmd $mira_settings $seed_settings $pro_file_settings -o $mert_outfile" ;
& submit_or_exec ( $ cmd , "run$run.mira.out" , $ mert_logfile ) ;
2014-05-23 00:20:14 +04:00
} elsif ( $ ___HG_MIRA ) {
safesystem ( "echo 'not used' > $weights_out_file" ) or die ;
$ mira_settings . = " --type hypergraph " ;
$ mira_settings . = join ( " " , map { "--reference $_" } @ references ) ;
$ mira_settings . = " --hgdir $hypergraph_dir " ;
2014-05-27 11:55:05 +04:00
#$mira_settings .= "--verbose ";
2014-05-23 00:20:14 +04:00
$ cmd = "$mert_mira_cmd $mira_settings $seed_settings -o $mert_outfile" ;
& submit_or_exec ( $ cmd , "run$run.mira.out" , $ mert_logfile ) ;
2013-02-22 01:40:01 +04:00
} elsif ( $ __PROMIX_TRAINING ) {
# PRO trained mixture model
2012-10-31 18:45:15 +04:00
safesystem ( "echo 'not used' > $weights_out_file" ) or die ;
2013-02-22 01:40:01 +04:00
$ cmd = "$__PROMIX_TRAINING $promix_file_settings" ;
2012-10-31 18:45:15 +04:00
$ cmd . = " -t mix " ;
2013-02-22 01:40:01 +04:00
$ cmd . = join ( " " , map { "-p $_" } @ _PROMIX_TABLES_BIN ) ;
2012-10-31 18:45:15 +04:00
$ cmd . = " -i $___DEV_F" ;
2013-02-25 13:36:58 +04:00
print "Starting promix optimisation at " . `date` ;
2012-10-31 18:45:15 +04:00
& submit_or_exec ( $ cmd , "$mert_outfile" , $ mert_logfile ) ;
2013-02-25 13:36:58 +04:00
print "Finished promix optimisation at " . `date` ;
2012-05-04 10:40:50 +04:00
} else { # just mert
& submit_or_exec ( $ cmd . $ mert_settings , $ mert_outfile , $ mert_logfile ) ;
2012-10-31 18:45:15 +04:00
}
2011-09-16 15:55:49 +04:00
2008-06-10 13:07:20 +04:00
die "Optimization failed, file $weights_out_file does not exist or is empty"
if ! - s $ weights_out_file ;
2011-09-07 20:37:33 +04:00
# backup copies
2014-05-23 00:20:14 +04:00
if ( ! $ ___HG_MIRA ) {
safesystem ( "\\cp -f extract.err run$run.extract.err" ) or die ;
safesystem ( "\\cp -f extract.out run$run.extract.out" ) or die ;
}
2012-05-04 10:40:50 +04:00
safesystem ( "\\cp -f $mert_outfile run$run.$mert_outfile" ) or die ;
safesystem ( "\\cp -f $mert_logfile run$run.$mert_logfile" ) or die ;
safesystem ( "touch $mert_logfile run$run.$mert_logfile" ) or die ;
safesystem ( "\\cp -f $weights_out_file run$run.$weights_out_file" ) or die ; # this one is needed for restarts, too
2013-03-15 20:13:33 +04:00
if ( $ __PROMIX_TRAINING ) {
safesystem ( "\\cp -f $interpolated_config run$run.$interpolated_config" ) or die ;
}
2008-06-10 13:07:20 +04:00
print "run $run end at " . `date` ;
2013-02-22 01:40:01 +04:00
( $ bestpoint , $ devbleu ) = & get_weights_from_mert ( "run$run.$mert_outfile" , "run$run.$mert_logfile" , scalar @ { $ featlist - > { "names" } } , \ % sparse_weights , \ @ promix_weights ) ;
2012-01-13 20:52:15 +04:00
my $ merge_weight = 0 ;
2014-05-26 14:03:28 +04:00
if ( $ __PROMIX_TRAINING ) {
print "New mixture weights: " . join ( " " , @ promix_weights ) . "\n" ;
}
2011-09-07 20:37:33 +04:00
2008-06-10 13:07:20 +04:00
die "Failed to parse mert.log, missed Best point there."
if ! defined $ bestpoint || ! defined $ devbleu ;
2011-09-16 15:55:49 +04:00
2008-06-10 13:07:20 +04:00
print "($run) BEST at $run: $bestpoint => $devbleu at " . `date` ;
2011-08-17 13:15:19 +04:00
# update my cache of lambda values
2008-06-10 13:07:20 +04:00
my @ newweights = split /\s+/ , $ bestpoint ;
2012-01-13 20:52:15 +04:00
if ( $ ___PRO_STARTING_POINT ) {
$ merge_weight = pop @ newweights ;
}
2011-09-16 15:55:49 +04:00
# interpolate with prior's interation weight, if historic-interpolation is specified
if ( $ ___HISTORIC_INTERPOLATION > 0 && $ run > 3 ) {
my % historic_sparse_weights ;
if ( - e "run$run.sparse-weights" ) {
2012-05-02 00:07:14 +04:00
open my $ sparse_fh , '<' , "run$run.sparse-weights" or die "run$run.sparse-weights: $!" ;
while ( <$sparse_fh> ) {
2011-09-16 15:55:49 +04:00
chop ;
2012-05-02 00:07:14 +04:00
my ( $ feature , $ weight ) = split ;
2011-09-16 15:55:49 +04:00
$ historic_sparse_weights { $ feature } = $ weight ;
}
2012-05-02 00:07:14 +04:00
close $ sparse_fh ;
2011-09-16 15:55:49 +04:00
}
2012-05-04 10:40:50 +04:00
my $ prev = $ run - 1 ;
2011-09-16 15:55:49 +04:00
my @ historic_weights = split /\s+/ , `cat run$prev.$weights_out_file` ;
2012-05-04 10:40:50 +04:00
for ( my $ i = 0 ; $ i < scalar ( @ newweights ) ; $ i + + ) {
$ newweights [ $ i ] = $ ___HISTORIC_INTERPOLATION * $ newweights [ $ i ] + ( 1 - $ ___HISTORIC_INTERPOLATION ) * $ historic_weights [ $ i ] ;
2011-09-16 15:55:49 +04:00
}
2012-05-04 10:40:50 +04:00
print "interpolate with " . join ( "," , @ historic_weights ) . " to " . join ( "," , @ newweights ) ;
2011-09-16 15:55:49 +04:00
foreach ( keys % sparse_weights ) {
$ sparse_weights { $ _ } *= $ ___HISTORIC_INTERPOLATION ;
#print STDERR "sparse_weights{$_} *= $___HISTORIC_INTERPOLATION -> $sparse_weights{$_}\n";
}
foreach ( keys % historic_sparse_weights ) {
2012-05-04 10:40:50 +04:00
$ sparse_weights { $ _ } += ( 1 - $ ___HISTORIC_INTERPOLATION ) * $ historic_sparse_weights { $ _ } ;
2011-09-16 15:55:49 +04:00
#print STDERR "sparse_weights{$_} += (1-$___HISTORIC_INTERPOLATION) * $historic_sparse_weights{$_} -> $sparse_weights{$_}\n";
}
}
2012-05-02 00:07:14 +04:00
if ( $ ___HISTORIC_INTERPOLATION > 0 ) {
open my $ weights_fh , '>' , "run$run.$weights_out_file" or die "run$run.$weights_out_file: $!" ;
print $ weights_fh join ( " " , @ newweights ) ;
close $ weights_fh ;
2011-09-16 15:55:49 +04:00
}
2011-08-17 13:15:19 +04:00
$ featlist - > { "values" } = \ @ newweights ;
2008-06-10 13:07:20 +04:00
2011-09-07 20:37:33 +04:00
if ( scalar keys % sparse_weights ) {
2012-05-04 10:40:50 +04:00
$ sparse_weights_file = "run" . ( $ run + 1 ) . ".sparse-weights" ;
2012-05-02 00:07:14 +04:00
open my $ sparse_fh , '>' , $ sparse_weights_file or die "$sparse_weights_file: $!" ;
2011-09-07 20:37:33 +04:00
foreach my $ feature ( keys % sparse_weights ) {
2012-01-13 20:52:15 +04:00
my $ sparse_weight = $ sparse_weights { $ feature } ;
if ( $ ___PRO_STARTING_POINT ) {
$ sparse_weight *= $ merge_weight ;
}
2012-05-25 00:11:35 +04:00
print $ sparse_fh "$feature $sparse_weight\n" ;
2011-09-07 20:37:33 +04:00
}
2012-05-02 00:07:14 +04:00
close $ sparse_fh ;
2011-09-07 20:37:33 +04:00
}
2008-06-10 13:07:20 +04:00
## additional stopping criterion: weights have not changed
my $ shouldstop = 1 ;
2012-05-04 10:40:50 +04:00
for ( my $ i = 0 ; $ i < @ CURR ; $ i + + ) {
2008-06-10 13:07:20 +04:00
die "Lost weight! mert reported fewer weights (@newweights) than we gave it (@CURR)"
if ! defined $ newweights [ $ i ] ;
if ( abs ( $ CURR [ $ i ] - $ newweights [ $ i ] ) >= $ minimum_required_change_in_weights ) {
$ shouldstop = 0 ;
last ;
}
}
2012-05-02 00:25:00 +04:00
& save_finished_step ( $ finished_step_file , $ run ) ;
2008-06-10 13:07:20 +04:00
if ( $ shouldstop ) {
print STDERR "None of the weights changed more than $minimum_required_change_in_weights. Stopping.\n" ;
last ;
}
2008-12-30 20:33:16 +03:00
my $ firstrun ;
2012-05-04 10:40:50 +04:00
if ( $ prev_aggregate_nbl_size == - 1 ) {
$ firstrun = 1 ;
} else {
$ firstrun = $ run - $ prev_aggregate_nbl_size + 1 ;
$ firstrun = ( $ firstrun > 0 ) ? $ firstrun : 1 ;
2008-12-30 20:33:16 +03:00
}
2012-05-04 10:40:50 +04:00
2008-12-30 20:33:16 +03:00
print "loading data from $firstrun to $run (prev_aggregate_nbl_size=$prev_aggregate_nbl_size)\n" ;
$ prev_feature_file = undef ;
2012-05-04 10:40:50 +04:00
$ prev_score_file = undef ;
$ prev_init_file = undef ;
for ( my $ i = $ firstrun ; $ i <= $ run ; $ i + + ) {
if ( defined $ prev_feature_file ) {
2008-12-30 20:33:16 +03:00
$ prev_feature_file = "${prev_feature_file},run${i}.${base_feature_file}" ;
2012-05-04 10:40:50 +04:00
} else {
2008-12-30 20:33:16 +03:00
$ prev_feature_file = "run${i}.${base_feature_file}" ;
}
2012-05-04 10:40:50 +04:00
if ( defined $ prev_score_file ) {
2008-12-30 20:33:16 +03:00
$ prev_score_file = "${prev_score_file},run${i}.${base_score_file}" ;
2012-05-04 10:40:50 +04:00
} else {
2008-12-30 20:33:16 +03:00
$ prev_score_file = "run${i}.${base_score_file}" ;
}
2012-05-04 10:40:50 +04:00
if ( defined $ prev_init_file ) {
2011-07-23 04:24:45 +04:00
$ prev_init_file = "${prev_init_file},run${i}.${weights_in_file}" ;
2012-05-04 10:40:50 +04:00
} else {
2011-07-23 04:24:45 +04:00
$ prev_init_file = "run${i}.${weights_in_file}" ;
}
2008-12-30 20:33:16 +03:00
}
2009-10-07 20:35:57 +04:00
print "loading data from $prev_feature_file\n" if defined ( $ prev_feature_file ) ;
2012-05-04 10:40:50 +04:00
print "loading data from $prev_score_file\n" if defined ( $ prev_score_file ) ;
print "loading data from $prev_init_file\n" if defined ( $ prev_init_file ) ;
2008-06-10 13:07:20 +04:00
}
if (defined $allsorted) {
  safesystem("\\rm -f $allsorted") or die;
}

safesystem("\\cp -f $weights_in_file run$run.$weights_in_file") or die;
safesystem("\\cp -f $mert_logfile run$run.$mert_logfile") or die;

if ($___RETURN_BEST_DEV) {
  # Re-evaluate every iteration's n-best list and keep the best-scoring one.
  my $bestit = 1;
  my $bestbleu = 0;
  my $evalout = "eval.out";
  for (my $i = 1; $i < $run; $i++) {
    my $cmd = "$mert_eval_cmd --reference " . join(",", @references) . " $mert_extract_args --nbest run$i.best$___N_BEST_LIST_SIZE.out.gz";
    $cmd .= " -l $__REMOVE_SEGMENTATION" if defined($__PROMIX_TRAINING);
    # NOTE(review): evaluator failures are silently ignored here (stderr
    # discarded, return value unchecked) — preserved from the original.
    safesystem("$cmd 2> /dev/null 1> $evalout");
    open my $fh, '<', $evalout or die "Can't read $evalout : $!";
    my $bleu = <$fh>;
    chomp $bleu;
    if ($bleu > $bestbleu) {
      $bestbleu = $bleu;
      $bestit = $i;
    }
    close $fh;
  }
  print "copying weights from best iteration ($bestit, bleu=$bestbleu) to moses.ini\n";
  my $best_sparse_file = undef;
  if (defined $sparse_weights_file) {
    $best_sparse_file = "run$bestit.sparse-weights";
  }
  my $best_featlist = get_featlist_from_file("run$bestit.dense");
  $best_featlist->{"untuneables"} = $featlist->{"untuneables"};
  create_config($___CONFIG_ORIG, "./moses.ini", $best_featlist,
    $bestit, $bestbleu, $best_sparse_file);
} else {
  create_config($___CONFIG_ORIG, "./moses.ini", $featlist, $run, $devbleu, $sparse_weights_file);
}

# just to be sure that we have the really last finished step marked
save_finished_step($finished_step_file, $run);

#chdir back to the original directory # useless, just to remind we were not there
chdir($cwd);
print "Training finished at " . `date`;
} # end of local scope
2011-09-07 20:37:33 +04:00
# Parse the weights produced by one tuning iteration.
# Depending on the optimizer, the weights live either in the optimizer's
# output file (PRO / batch-mira / hypergraph-mira / promix) or in the mert
# log file ("Best point: ..." line).
# Args: output file, log file, expected dense weight count,
#       hashref to fill with sparse weights, arrayref to fill with mix weights.
# Returns ($bestpoint, $devbleu) — the space-joined normalized dense weights
# and the dev-set BLEU (the string "unknown" when the optimizer does not
# report one in its output file).
sub get_weights_from_mert {
  my ($outfile, $logfile, $weight_count, $sparse_weights, $mix_weights) = @_;
  my ($bestpoint, $devbleu);

  # NOTE(review): the /pro/ match on the log file name distinguishes the PRO
  # starting-point run from the regular mert run — confirm against callers.
  if ($___PAIRWISE_RANKED_OPTIMIZER || ($___PRO_STARTING_POINT && $logfile =~ /pro/)
      || $___BATCH_MIRA || $__PROMIX_TRAINING || $___HG_MIRA) {
    # These optimizers write one "label value" pair per line into $outfile.
    open my $weights_fh, '<', $outfile or die "Can't open $outfile: $!";
    @$mix_weights = ();
    my @WEIGHT = (0) x $weight_count;   # dense weights, indexed by feature number
    my $sum = 0.0;                      # L1 norm used for normalization below
    while (<$weights_fh>) {
      if (/^F(\d+) ([\-\.\de]+)/) {
        # Regular dense feature: "F<index> <value>".
        $WEIGHT[$1] = $2;
        $sum += abs($2);
      } elsif (/^M(\d+_\d+) ([\-\.\de]+)/) {
        # Mixture-model weight (promix): "M<i>_<j> <value>".
        push @$mix_weights, $2;
      } elsif (/^(.+_.+) ([\-\.\de]+)/) {
        # Sparse feature: "<name_with_underscore> <value>".
        $$sparse_weights{$1} = $2;
      }
    }
    close $weights_fh;
    die "It seems feature values are invalid or unable to read $outfile." if $sum < 1e-09;

    $devbleu = "unknown";
    # Normalize dense and sparse weights by the dense L1 norm.
    $_ /= $sum foreach @WEIGHT;
    $$sparse_weights{$_} /= $sum foreach keys %{$sparse_weights};
    $bestpoint = join(" ", @WEIGHT);

    # Mira variants report the dev BLEU in their log instead.
    if ($___BATCH_MIRA || $___HG_MIRA) {
      open my $log_fh, '<', $logfile or die "Can't open $logfile: $!";
      while (<$log_fh>) {
        $devbleu = $1 if /Best BLEU = ([\-\d\.]+)/;
      }
      close $log_fh;
    }
  } else {
    # Classic mert: both the point and its BLEU are on one log line.
    open my $log_fh, '<', $logfile or die "Can't open $logfile: $!";
    while (<$log_fh>) {
      if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
        $bestpoint = $1;
        $devbleu   = $2;
        last;
      }
    }
    close $log_fh;
  }
  return ($bestpoint, $devbleu);
}
2008-06-10 13:07:20 +04:00
# Run the decoder for one tuning iteration.
# Args: $featlist (names/values of dense features), $run (iteration number),
#       $need_to_normalize (L1-normalize weights before passing them on).
# Returns ($filename, $lsamp_filename, $hypergraph_dir): the n-best list file,
# the lattice-samples file (undef unless enabled), and the hypergraph dir name.
sub run_decoder {
  my ($featlist, $run, $need_to_normalize) = @_;

  my $filename_template = "run%d.best$___N_BEST_LIST_SIZE.out";
  my $filename = sprintf($filename_template, $run);
  my $hypergraph_dir = "hypergraph";

  # Optional lattice-samples output file for this run.
  my $lsamp_filename = undef;
  if ($___LATTICE_SAMPLES) {
    my $lsamp_filename_template = "run%d.lsamp$___LATTICE_SAMPLES.out";
    $lsamp_filename = sprintf($lsamp_filename_template, $run);
  }

  # User-supplied decoder parameters.
  print "params = $___DECODER_FLAGS\n";

  # Parameters overriding all model weights from moses.ini.
  my @vals = @{$featlist->{"values"}};
  if ($need_to_normalize) {
    print STDERR "Normalizing lambdas: @vals\n";
    my $totlambda = 0;
    $totlambda += abs($_) for @vals;
    $_ /= $totlambda for @vals;
  }

  # Moses wants "-tm X Y", not "-tm X -tm Y": group values per feature name.
  my %model_weights;
  foreach my $idx (0 .. scalar(@{$featlist->{"names"}}) - 1) {
    my $name = $featlist->{"names"}->[$idx];
    $model_weights{$name} = "$name=" if !defined $model_weights{$name};
    $model_weights{$name} .= sprintf " %.6f", $vals[$idx];
  }

  my $decoder_config = "";
  # On the first run we may keep the weights from the config file untouched.
  $decoder_config = "-weight-overwrite '" . join(" ", values %model_weights) . "'" unless $___USE_CONFIG_WEIGHTS_FIRST && $run == 1;
  $decoder_config .= " -weight-file run$run.sparse-weights" if -e "run$run.sparse-weights";
  $decoder_config .= " -report-segmentation" if $__PROMIX_TRAINING;
  print STDERR "DECODER_CFG = $decoder_config\n";
  print "decoder_config = $decoder_config\n";

  my $decoder_cmd;
  my $lsamp_cmd = "";
  if ($___LATTICE_SAMPLES) {
    $lsamp_cmd = " -lattice-samples $lsamp_filename $___LATTICE_SAMPLES ";
  }

  if (defined $___JOBS && $___JOBS > 0) {
    # Cluster mode: route the decoder through moses-parallel / qsub.
    die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA;
    $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG";
    $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
    $decoder_cmd .= " -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE distinct\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
  } else {
    # Local mode: invoke the decoder directly.
    my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE distinct";
    if ($___HG_MIRA) {
      # Hypergraph mira consumes search hypergraphs instead of n-best lists.
      safesystem("rm -rf $hypergraph_dir");
      $nbest_list_cmd = "-output-search-graph-hypergraph true gz";
    }
    $decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG";
    $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
    $decoder_cmd .= " $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F";
    if (defined $___DEV_SYMAL) {
      # Simulated post-editing: wrap the command in moses_sim_pe.py.
      # Always uses the single (first) reference — sim-PE is undefined for
      # multiple references.
      # NOTE(review): this already appends "> run$run.out"; the unconditional
      # append below adds a second redirection (the shell honors the last
      # one, so output still lands in run$run.out) — confirm intent upstream.
      $decoder_cmd = "$___MOSES_SIM_PE $decoder_cmd -ref $references[0] -symal $dev_symal_abs -tmp $working_dir_abs > run$run.out";
    }
    $decoder_cmd .= " > run$run.out";
  }

  print STDERR "Executing: $decoder_cmd \n";
  safesystem($decoder_cmd) or die "The decoder died. CONFIG WAS $decoder_config \n";

  # With hypergraph output there is no n-best list to sanity-check.
  if (!$___HG_MIRA) {
    sanity_check_order_of_lambdas($featlist, $filename);
  } else {
    print STDERR "WARN: No sanity check of order of features in hypergraph mira\n";
  }
  return ($filename, $lsamp_filename, $hypergraph_dir);
}
# Attach min/max bounds to every feature in $featlist.
# $ranges is an arrayref of --range option strings such as "tm:-3..3,lm:0..2";
# each comma-separated entry is "name:min..max" (the name sticks for following
# entries without one).  Features without a supplied range default to [0, 1].
# Multiple ranges for the same name are consumed in order, one per occurrence
# of that feature.  Dies on malformed min/max or a missing name.
# Returns the (mutated) $featlist.
sub insert_ranges_to_featlist {
  my $featlist = shift;
  my $ranges   = shift;
  $ranges = [] if !defined $ranges;

  # First collect the ranges from the options.
  my $niceranges;
  foreach my $range (@$ranges) {
    my $name = undef;
    foreach my $namedpair (split /,/, $range) {
      if ($namedpair =~ /^(.*?):/) {
        $name = $1;
        $namedpair =~ s/^.*?://;
      }
      my ($min, $max) = split /\.\./, $namedpair;
      die "Bad min '$min' in --range=$range" if $min !~ /^-?[0-9.]+$/;
      # BUGFIX: this used to re-validate $min, letting any malformed max through.
      die "Bad max '$max' in --range=$range" if $max !~ /^-?[0-9.]+$/;
      die "No name given in --range=$range" if !defined $name;
      push @{$niceranges->{$name}}, [$min, $max];
    }
  }

  # Now populate the featlist, consuming collected ranges in order.
  for (my $i = 0; $i < scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    my $min = 0.0;
    my $max = 1.0;
    if (defined $niceranges->{$name}) {
      my $minmax = shift @{$niceranges->{$name}};
      ($min, $max) = @$minmax if defined $minmax;
    }
    $featlist->{"mins"}->[$i] = $min;
    $featlist->{"maxs"}->[$i] = $max;
  }
  return $featlist;
}
# Verify that the decoder's n-best list reports feature scores in exactly the
# order we expect from $featlist; dies with both orders on a mismatch.
sub sanity_check_order_of_lambdas {
  my $featlist = shift;
  my $filename_or_stream = shift;

  my @expected_lambdas = @{$featlist->{"names"}};
  my @got = get_order_of_scores_from_nbestlist($filename_or_stream);
  return if "@got" eq "@expected_lambdas";
  die "Mismatched lambdas. Decoder returned @got, we expected @expected_lambdas";
}
2011-08-17 13:15:19 +04:00
# Run moses with the given config file and return the list of features and
# their initial values.  The decoder's "-show-weights" output is cached in
# ./features.list and reused on subsequent calls when non-empty.
sub get_featlist_from_moses {
  my $configfn = shift;
  my $featlistfn = "./features.list";

  if (-e $featlistfn && ! -z $featlistfn) {   # exists & not empty
    print STDERR "Using cached features list: $featlistfn\n";
  } else {
    print STDERR "Asking moses for feature names and values from $___CONFIG\n";
    my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn";
    $cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
    $cmd .= " -show-weights > $featlistfn";
    print STDERR "Executing: $cmd\n";
    safesystem($cmd) or die "Failed to run moses with the config $configfn";
  }
  return get_featlist_from_file($featlistfn);
}
2011-08-17 13:15:19 +04:00
2012-06-13 07:02:04 +04:00
# Read a features list file (the output of moses -show-weights) and return
# a hashref { names => [...], values => [...], untuneables => [...] }.
# Dense lines look like "Name= v1 v2 ..." (the name repeats once per value);
# "Name UNTUNEABLE" lines are collected separately.  Exits with status 1 if
# any initial value fails to parse as a number.
sub get_featlist_from_file {
  my $featlistfn = shift;

  my @names = ();
  my @startvalues = ();
  my @untuneables = ();
  my @errs = ();
  my $nr = 0;

  open my $fh, '<', $featlistfn or die "Can't read $featlistfn : $!";
  while (my $line = <$fh>) {
    $nr++;
    chomp $line;
    if ($line =~ /^(\S+)= (.+)$/) {
      # Feature function with dense weights.
      my ($longname, $valuesStr) = ($1, $2);
      next if (!defined($valuesStr));
      foreach my $value (split(/ /, $valuesStr)) {
        push @errs, "$featlistfn:$nr:Bad initial value of $longname: $value\n"
          if $value !~ /^[+-]?[0-9.\-e]+$/;
        push @names, $longname;        # name repeated once per value
        push @startvalues, $value;
      }
    } elsif ($line =~ /^(\S+) UNTUNEABLE$/) {
      push @untuneables, $1;
    }
  }
  close $fh;

  if (scalar @errs) {
    warn join("", @errs);
    exit 1;
  }
  return { "names" => \@names, "values" => \@startvalues, "untuneables" => \@untuneables };
}
2008-06-10 13:07:20 +04:00
# Peek at the first line of an n-best list and return the score labels in the
# order the decoder emits them.  The scores column looks like
# "label1= num num label2= num ..." between the second and third "|||".
# Sparse features (names containing '_' before '=') are skipped.
# $fname_or_source may be a filename or any open()-able source spec.
sub get_order_of_scores_from_nbestlist {
  my $fname_or_source = shift;

  # Two-arg open on purpose: callers may pass a stream/command source,
  # not only a plain filename.
  open my $fh, $fname_or_source or die "Failed to get order of scores from nbestlist '$fname_or_source': $!";
  my $line = <$fh>;
  close $fh;
  die "Line empty in nbestlist '$fname_or_source'" if !defined $line;

  my ($sent, $hypo, $scores, $total) = split /\|\|\|/, $line;
  $scores =~ s/^\s*|\s*$//g;
  die "No scores in line: $line" if $scores eq "";

  my @order = ();
  my $label = undef;
  my $sparse = 0;   # sparse features are ignored here
  foreach my $token (split /\s+/, $scores) {
    if ($token =~ /.+_.+=/) {
      # A sparse feature name: skip the value that follows it.
      $sparse = 1;
    } elsif ($token =~ /^([a-z][0-9a-z]*)=/i) {
      # A dense feature label.
      $label = $1;
    } elsif ($token =~ /^-?[-0-9.\-e]+$/) {
      if (!$sparse) {
        # A score: record the label it belongs to.
        die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
          if !defined $label;
        push @order, $label;
      }
      $sparse = 0;
    } else {
      die "Not a label, not a score '$token'. Failed to parse the scores string: '$scores' of nbestlist '$fname_or_source'";
    }
  }
  print STDERR "The decoder returns the scores in this order: @order\n";
  return @order;
}
# Write a new moses.ini by cloning $infn and overriding weights and any
# parameters given on the command line (--decoder-flags).
# TODO: too many positional arguments; a hash would be clearer.
# Args: source config, output path, featlist (lambdas to write), iteration
# number and BLEU (verbosity only), and the sparse-weights file (only defined
# when tuning sparse features).
sub create_config {
  my $infn = shift;                 # source config
  my $outfn = shift;                # where to save the config
  my $featlist = shift;             # the lambdas we should write
  my $iteration = shift;            # just for verbosity
  my $bleu_achieved = shift;        # just for verbosity
  my $sparse_weights_file = shift;  # only defined when optimizing sparse features

  my @keep_weights = ();   # untuneable weight lines copied verbatim from $infn

  # Log the weights we are about to write.
  for (my $i = 0; $i < scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    my $val  = $featlist->{"values"}->[$i];
    print STDERR "featlist: $name=$val \n";
  }

  my %P;   # the hash of all parameters we wish to override
  # Convert command-line decoder flags into %P (param => [values...]).
  {
    my $parameter = undef;
    print "Parsing --decoder-flags: |$___DECODER_FLAGS|\n";
    $___DECODER_FLAGS =~ s/^\s*|\s*$//;
    $___DECODER_FLAGS =~ s/\s+/ /;
    foreach (split(/ /, $___DECODER_FLAGS)) {
      if (/^\-([^\d].*)$/) {
        $parameter = $1;
      } else {
        my $value = $_;
        die "Found value with no -paramname before it: $value"
          if !defined $parameter;
        push @{$P{$parameter}}, $value;
      }
    }
  }

  if (defined($sparse_weights_file)) {
    push @{$P{"weight-file"}}, File::Spec->catfile($___WORKING_DIR, $sparse_weights_file);
  }

  # Clone the original ini, overriding sections as we go.
  open my $ini_fh, '<', $infn or die "Can't read $infn: $!";
  delete($P{"config"});   # never output
  print "Saving new config to: $outfn\n";
  open my $out, '>', $outfn or die "Can't write $outfn: $!";
  print $out "# MERT optimized configuration\n";
  print $out "# decoder $___DECODER\n";
  print $out "# BLEU $bleu_achieved on dev $___DEV_F\n";
  print $out "# We were before running iteration $iteration\n";
  print $out "# finished " . `date`;

  my $line = <$ini_fh>;
  while (1) {
    last unless $line;

    # Skip until we hit a [parameter] header; preserve comments/blank lines.
    if ($line !~ /^\[(.+)\]\s*$/) {
      $line = <$ini_fh>;
      print $out $line if $line =~ /^\#/ || $line =~ /^\s+$/;
      next;
    }
    my $parameter = $1;

    if ($parameter eq "weight") {
      # Weights are rewritten at the end; only untuneable ones are kept.
      while ($line = <$ini_fh>) {
        last if $line =~ /^\[/;
        if ($line =~ /^([^_=\s]+)/) {
          for (@{$featlist->{"untuneables"}}) {
            push @keep_weights, $line if $1 eq $_;
          }
        }
      }
    } elsif (defined($P{$parameter})) {
      # Overridden parameter (threads, verbose, ...): emitted later from %P.
      while ($line = <$ini_fh>) {
        last if $line =~ /^\[/;
      }
    } else {
      # Unchanged parameter: copy the section verbatim.
      print $out "[$parameter]\n";
      while ($line = <$ini_fh>) {
        last if $line =~ /^\[/;
        print $out $line;
      }
    }
  }

  # Append all overridden/additional parameters.
  foreach my $parameter (keys %P) {
    print $out "\n[$parameter]\n";
    foreach (@{$P{$parameter}}) {
      print $out $_ . "\n";
    }
  }

  # Write all tuned weights, grouping consecutive values of the same feature.
  print $out "[weight]\n";
  my $prevName = "";
  my $outStr = "";
  for (my $i = 0; $i < scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    my $val  = $featlist->{"values"}->[$i];
    if ($prevName eq $name) {
      $outStr .= " $val";
    } else {
      # NOTE(review): the very first iteration prints an empty line here
      # ($outStr starts empty) — preserved as in the original.
      print $out "$outStr\n";
      $outStr = "$name= $val";
      $prevName = $name;
    }
  }
  print $out "$outStr\n";

  # Copy the untuneable weight lines through unchanged.
  for (@keep_weights) {
    print $out $_;
  }

  close $ini_fh;
  close $out;
  print STDERR "Saved: $outfn\n";
}
2012-10-31 18:45:15 +04:00
# Create a new ini file, with the first ttable replaced by the given one
# and its type set to $ttable_type (default "0", i.e. text).
# The line after "[ttable-file]" is assumed to be whitespace-separated fields
# where field 0 is the table type and field 4 the table path.
sub substitute_ttable {
  my ($old_ini, $new_ini, $new_ttable, $ttable_type) = @_;
  $ttable_type = "0" unless defined($ttable_type);
  # FIX: three-argument open with lexical filehandles (the old two-argument,
  # bareword form is mode-injection-prone and leaks global handles).
  open my $new_fh, '>', $new_ini or die "Failed to create $new_ini";
  open my $old_fh, '<', $old_ini or die "Failed to open $old_ini";
  while (my $line = <$old_fh>) {
    if ($line =~ /\[ttable-file\]/) {
      print $new_fh "[ttable-file]\n";
      my $ttable_config = <$old_fh>;
      chomp $ttable_config;
      my @ttable_fields = split /\s+/, $ttable_config;
      $ttable_fields[0] = $ttable_type;   # table type
      $ttable_fields[4] = $new_ttable;    # table path
      print $new_fh join(" ", @ttable_fields) . "\n";
    } else {
      print $new_fh $line;
    }
  }
  close $new_fh;
  close $old_fh;
}
2008-06-10 13:07:20 +04:00
# Run an external command via system(), reporting failures.
# Exits the whole script if the command could not be executed or died on a
# signal; otherwise returns true iff the command exited with status 0.
sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  my $status = $?;   # capture immediately; later calls would clobber $?
  if ($status == -1) {
    # system() itself failed (e.g. command not found / fork failure).
    warn "Failed to execute: @_\n $!";
    exit(1);
  }
  if ($status & 127) {
    # The child died on a signal.
    printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
      ($status & 127), ($status & 128) ? 'with' : 'without';
    exit(1);
  }
  my $exitcode = $status >> 8;
  warn "Exit code: $exitcode\n" if $exitcode;
  return !$exitcode;
}
2012-05-02 00:07:14 +04:00
2008-06-10 13:07:20 +04:00
# Turn a path into an absolute one, stripping "/nfsmnt" prefixes and
# resolving "." and ".." components textually (no filesystem access).
# Absolute inputs are returned as-is (after the /nfsmnt strip).
sub ensure_full_path {
  my $PATH = shift;
  $PATH =~ s/\/nfsmnt//;
  return $PATH if $PATH =~ /^\//;   # already absolute

  # Anchor relative paths at the current working directory.
  my $cwd = Cwd::getcwd();
  $PATH = File::Spec->catfile($cwd, $PATH);
  $PATH =~ s/[\r\n]//g;     # defensive: strip stray line endings
  $PATH =~ s/\/\.\//\//g;   # "/./" -> "/"
  $PATH =~ s/\/+/\//g;      # collapse duplicate slashes

  # Repeatedly collapse "dir/.." pairs; the counter guards against loops.
  my $sanity = 0;
  while ($PATH =~ /\/\.\.\// && $sanity++ < 10) {
    $PATH =~ s/\/+/\//g;
    $PATH =~ s/\/[^\/]+\/\.\.\//\//g;
  }
  $PATH =~ s/\/[^\/]+\/\.\.$//;   # trailing "dir/.."
  $PATH =~ s/\/+$//;              # trailing slashes
  $PATH =~ s/\/nfsmnt//;
  return $PATH;
}
2011-09-16 15:55:49 +04:00
# Run $cmd with stdout/stderr redirected to the given files — either directly,
# or submitted through the qsub wrapper when cluster jobs are configured.
sub submit_or_exec {
  my ($cmd, $stdout, $stderr) = @_;
  print STDERR "exec: $cmd\n";
  if (defined $___JOBS && $___JOBS > 0) {
    # Cluster mode: hand the command to the qsub wrapper.
    safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=$stdout -stderr=$stderr")
      or die "ERROR: Failed to submit '$cmd' (via $qsubwrapper)";
    return;
  }
  # Local mode: run through the shell with redirection.
  safesystem("$cmd > $stdout 2> $stderr") or die "ERROR: Failed to run '$cmd'.";
}
2011-10-10 08:28:55 +04:00
2012-05-02 00:07:14 +04:00
# Write an executable extractor.sh in $outdir that cd's there and runs $cmd.
# Returns the path of the created script.
# FIX: dropped the erroneous empty prototype "()" — the sub takes two
# arguments, and an empty prototype would reject normal two-argument calls
# checked at compile time.
sub create_extractor_script {
  my ($cmd, $outdir) = @_;
  my $script_path = File::Spec->catfile($outdir, "extractor.sh");

  open my $out, '>', $script_path
    or die "Couldn't open $script_path for writing: $!\n";
  print $out "#!/bin/bash\n";
  print $out "cd $outdir\n";
  print $out "$cmd\n";
  close $out;

  `chmod +x $script_path`;
  return $script_path;
}
2012-05-02 00:25:00 +04:00
# Record the last finished iteration number in $filename (one line).
sub save_finished_step {
  my ($filename, $step) = @_;
  open my $fh, '>', $filename or die "$filename: $!";
  print {$fh} "$step\n";
  close $fh;
}
2012-05-03 01:59:34 +04:00
# Return the mert/extractor config fragment selecting the reference-length
# policy (shortest/average/closest), or "" when none was requested.
# Dies if more than one policy flag is set.
sub setup_reference_length_type {
  die "You can specify just ONE reference length strategy (closest or shortest or average) not both\n"
    if ($___CLOSEST + $___AVERAGE + $___SHORTEST) > 1;
  return " reflen:shortest" if $___SHORTEST;
  return " reflen:average"  if $___AVERAGE;
  return " reflen:closest"  if $___CLOSEST;
  return "";
}
# Return the mert/extractor config fragment for case sensitivity:
# " case:false" when --nocase was given, " case:true" otherwise.
sub setup_case_config {
  return $___NOCASE ? " case:false" : " case:true";
}
2012-05-03 23:04:54 +04:00
# Return 1 when running on Mac OS X (darwin), 0 otherwise.
sub is_mac_osx {
  return $^O eq "darwin" ? 1 : 0;
}