From dd9a59499f8430fb4782da2a8facdbf5cdb78119 Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Wed, 21 May 2014 11:16:40 -0400 Subject: [PATCH] progress on deleting steps and runs --- scripts/ems/experiment.meta | 63 ++++++++++++++++++-------- scripts/ems/experiment.perl | 54 ++++++++++++++++------ scripts/ems/support/lmplz-wrapper.perl | 12 +++++ 3 files changed, 97 insertions(+), 32 deletions(-) create mode 100755 scripts/ems/support/lmplz-wrapper.perl diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 83d597aa0..a7ed2df17 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -55,7 +55,6 @@ truecase template-if: input-truecaser IN.$input-extension OUT.$input-extension -model IN1.$input-extension template-if: output-truecaser IN.$output-extension OUT.$output-extension -model IN1.$output-extension parallelizable: yes - source-label in: truecased-stem out: source-labelled @@ -64,7 +63,6 @@ source-label template-if: source-labeller IN.$input-extension OUT.$input-extension template-if: cat IN.$output-extension OUT.$output-extension parallelizable: yes - lowercase in: source-labelled out: lowercased-stem @@ -118,7 +116,7 @@ consolidate default-name: truecaser/corpus template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN train - in: tokenized-stem + in: tokenized-stem out: truecase-model rerun-on-change: trainer default-name: truecaser/truecase-model @@ -219,6 +217,7 @@ binarize default-name: lm/binlm template: $lm-binarizer IN OUT error: set KENLM_MAX_ORDER to at least this value + final-model: yes [INTERPOLATED-LM] single tuning-from-sgm @@ -285,6 +284,7 @@ binarize rerun-on-change: lm default-name: lm/interpolated-binlm error: set kMaxOrder to at least this value + final-model: yes [MML] single tokenize-indomain-source in: raw-indomain-source @@ -395,6 +395,7 @@ build-domains default-name: model/domains ignore-unless: domain-features mml-filter-corpora template: $moses-script-dir/ems/support/build-domain-file-from-subcorpora.perl $input-extension IN > OUT + final-model: yes mml-score in: MML:model corpus domains out: mml-scores @@ -489,6 +490,7 @@ build-biconcor default-name: model/biconcor ignore-unless: biconcor error: usage + final-model: yes build-suffix-array in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus out: phrase-translation-table @@ -528,16 +530,17 @@ build-osm template: $moses-script-dir/OSM/OSM-Train.perl --corpus-f IN0.$input-extension --corpus-e IN0.$output-extension --alignment IN1.$alignment-symmetrization-method --order $operation-sequence-model-order --out-dir OUT --moses-src-dir $moses-src-dir --srilm-dir $srilm-dir $operation-sequence-model-settings default-name: model/OSM build-transliteration-model - in: corpus word-alignment - out: transliteration-model - ignore-unless: transliteration-module - rerun-on-change: transliteration-module training-options script giza-settings + in: corpus word-alignment + out: transliteration-model + ignore-unless: transliteration-module + rerun-on-change: transliteration-module training-options script giza-settings default-name: model/Transliteration + final-model: yes build-translit-table - in: transliteration-model - out: transliteration-table - ignore-unless: in-decoding-transliteration - rerun-on-change: in-decoding-transliteration transliteration-module + in: transliteration-model + out: transliteration-table + ignore-unless: in-decoding-transliteration + rerun-on-change: in-decoding-transliteration transliteration-module default-name: model/transliteration-phrase-table template: $moses-script-dir/Transliteration/in-decoding-transliteration.pl --moses-src-dir $moses-src-dir --external-bin-dir $external-bin-dir --transliteration-model-dir IN --input-extension $input-extension --output-extension $output-extension --transliteration-file $transliteration-file --out-file OUT extract-phrases @@ -553,12 +556,14 @@ build-reordering ignore-unless: lexicalized-reordering rerun-on-change: lexicalized-reordering reordering-factors default-name: model/reordering-table + final-model: yes build-ttable in: extracted-phrases lexical-translation-table corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains out: phrase-translation-table rerun-on-change: translation-factors hierarchical-rule-set score-settings training-options script EVALUATION:report-precision-by-coverage include-word-alignment-in-rules domain-features default-name: model/phrase-table ignore-if: suffix-array + final-model: yes sigtest-filter-suffix-array in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus out: sigtest-filter-suffix-array @@ -574,19 +579,22 @@ sigtest-filter-suffix-array mv IN.${output-extension}.sa_offset OUT.${output-extension}.sa_offset ; \ mv IN.${output-extension}.sa_suffix OUT.${output-extension}.sa_suffix ignore-unless: sigtest-filter + final-model: yes sigtest-filter-ttable in: phrase-translation-table sigtest-filter-suffix-array out: sigtest-filter-phrase-translation-table default-name: model/phrase-table-sigtest-filter pass-unless: sigtest-filter - ignore-if: TRAINING:config + ignore-if: TRAINING:config + final-model: yes sigtest-filter-reordering in: reordering-table sigtest-filter-suffix-array out: sigtest-filter-reordering-table default-name: model/reordering-table-sigtest-filter pass-unless: sigtest-filter - ignore-if: TRAINING:config + ignore-if: TRAINING:config ignore-unless: lexicalized-reordering + final-model: yes build-generation in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus out: generation-table @@ -594,12 +602,14 @@ build-generation ignore-unless: generation-factors ignore-if: generation-corpus default-name: model/generation-table + final-model: yes build-generation-custom in: generation-corpus out: generation-table rerun-on-change: generation-factors generation-type training-options script generation-corpus ignore-unless: AND generation-factors generation-corpus default-name: model/generation-table + final-model: yes build-sparse in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus out: sparse @@ -614,6 +624,7 @@ create-config rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini default-name: model/moses.ini error: Unknown option + final-model: yes binarize-config in: config out: bin-config @@ -621,6 +632,7 @@ binarize-config rerun-on-change: config default-name: model/moses.bin.ini template: $binarize-all IN OUT -Binarizer $ttable-binarizer + final-model: yes hiero-compile-source-suffix-array in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus out: hiero-source-suffix-array @@ -734,7 +746,6 @@ factorize-input-devtest ignore-unless: use-mira error: can't open error: incompatible number of words in factor - source-label-input in: factorized-input out: source-labelled-input @@ -742,7 +753,6 @@ source-label-input pass-unless: source-labeller template-if: source-labeller IN OUT parallelizable: yes - source-label-input-devtest in: factorized-input-devtest out: source-labelled-input-devtest @@ -750,7 +760,6 @@ source-label-input-devtest pass-unless: source-labeller template-if: source-labeller IN OUT parallelizable: yes - lowercase-input in: source-labelled-input out: truecased-input @@ -903,6 +912,8 @@ tune ignore-if: use-hiero qsub-script: yes default-name: tuning/moses.ini + tmp-name: tuning/tmp + final-model: yes rerun-on-change: decoder-settings tuning-settings nbest lambda async not-error: trans: No such file or directory apply-weights @@ -1018,6 +1029,7 @@ decode rerun-on-change: decoder decoder-settings nbest report-segmentation report-precision-by-coverage analyze-search-graph wade error: Translation was not performed correctly not-error: trans: No such file or directory + final-model: yes hiero-decode in: TUNING:hiero-config-with-reused-weights input out: system-output @@ -1058,6 +1070,7 @@ detokenize-output default-name: evaluation/detokenized pass-unless: detokenizer template: $detokenizer < IN > OUT + final-model: yes wrap in: detokenized-output out: wrapped-output @@ -1065,6 +1078,7 @@ wrap rerun-on-change: wrapping-frame use-hiero template: $wrapping-script $wrapping-frame < IN > OUT error: Use of uninitialized value in pattern match + final-model: yes reference-from-sgm in: reference-sgm input-sgm out: raw-reference @@ -1100,6 +1114,7 @@ nist-bleu rerun-on-change: nist-bleu error: Illegal division by zero template: $nist-bleu -s $input-sgm -r IN1 -t IN > OUT + final-model: yes nist-bleu-c in: wrapped-output reference-sgm out: nist-bleu-c-score @@ -1108,6 +1123,7 @@ nist-bleu-c rerun-on-change: nist-bleu-c error: Illegal division by zero template: $nist-bleu-c -c -s $input-sgm -r IN1 -t IN > OUT + final-model: yes ibm-bleu in: wrapped-output reference-sgm out: ibm-bleu-score @@ -1115,6 +1131,7 @@ ibm-bleu ignore-unless: ibm-bleu rerun-on-change: ibm-bleu template: $ibm-bleu -ci -s $input-sgm -r IN1 -t IN > OUT + final-model: yes ibm-bleu-c in: wrapped-output reference-sgm out: ibm-bleu-c-score @@ -1122,6 +1139,7 @@ ibm-bleu-c ignore-unless: ibm-bleu-c rerun-on-change: ibm-bleu-c template: $ibm-bleu-c -s $input-sgm -r IN1 -t IN > OUT + final-model: yes bolt-bleu in: detokenized-output out: bolt-bleu-score @@ -1129,6 +1147,7 @@ bolt-bleu ignore-unless: bolt-bleu rerun-on-change: bolt-bleu template: $bolt-bleu IN > OUT + final-model: yes bolt-bleu-c in: detokenized-output out: bolt-bleu-c-score @@ -1136,6 +1155,7 @@ bolt-bleu-c ignore-unless: bolt-bleu-c rerun-on-change: bolt-bleu-c template: $bolt-bleu-c IN > OUT + final-model: yes multi-bleu in: transliterated-output tokenized-reference out: multi-bleu-score @@ -1143,6 +1163,7 @@ multi-bleu ignore-unless: multi-bleu rerun-on-change: multi-bleu template: $multi-bleu IN1 < IN > OUT + final-model: yes multi-bleu-c in: recased-output tokenized-reference out: multi-bleu-c-score @@ -1150,12 +1171,14 @@ multi-bleu-c ignore-unless: multi-bleu-c rerun-on-change: multi-bleu-c template: $multi-bleu-c IN1 < IN > OUT + final-model: yes ter in: wrapped-output reference-sgm out: ter-score default-name: evaluation/detokenized.sgm.TER ignore-unless: ter rerun-on-change: ter + final-model: yes wer in: recased-output reference out: wer-score @@ -1163,6 +1186,7 @@ wer ignore-unless: wer rerun-on-change: wer template: $wer IN IN1 > OUT + final-model: yes meteor in: transliterated-output reference out: meteor-score @@ -1170,25 +1194,28 @@ meteor ignore-unless: meteor rerun-on-change: meteor template: $meteor IN IN1 $meteor-params > OUT + final-model: yes analysis in: recased-output reference input out: analysis default-name: evaluation/analysis ignore-if: report-precision-by-coverage ignore-unless: analysis - rerun-on-change: analyze-search-graph + rerun-on-change: analyze-search-graph analysis-coverage in: input TRAINING:corpus-mml-postfilter=OR=TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus TRAINING:sigtest-filter-phrase-translation-table out: analysis-coverage default-name: evaluation/analysis ignore-unless: AND analysis analyze-coverage rerun-on-change: score-settings + final-model: yes analysis-precision in: recased-output reference input TRAINING:corpus-mml-postfilter=OR=TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus TRAINING:sigtest-filter-phrase-translation-table analysis-coverage out: analysis default-name: evaluation/analysis ignore-unless: AND analysis analyze-coverage report-precision-by-coverage - rerun-on-change: precision-by-coverage-base + rerun-on-change: precision-by-coverage-base + final-model: yes [REPORTING] single report diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 8cc99d48e..b7db74c7c 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -64,7 +64,9 @@ my (@MODULE, %MODULE_STEP, %STEP_IN, %STEP_OUT, - %STEP_OUTNAME, + %STEP_OUTNAME, # output file name for step result + %STEP_TMPNAME, # tmp directory to be used by step + %STEP_FINAL, # output is part of the final model, not an intermediate step %STEP_PASS, # config parameters that have to be set, otherwise pass %STEP_PASS_IF, # config parameters that have to be not set, otherwise pass %STEP_IGNORE, # config parameters that have to be set, otherwise ignore @@ -97,7 +99,7 @@ $VERSION = $DELETE_VERSION if $DELETE_VERSION; &compute_version_number() if $EXECUTE && !$CONTINUE && !$DELETE_CRASHED && !$DELETE_VERSION; `mkdir -p steps/$VERSION`; -&log_config(); +&log_config() unless $DELETE_CRASHED || $DELETE_VERSION; print "running experimenal run number $VERSION\n"; print "\nESTABLISH WHICH STEPS NEED TO BE RUN\n"; @@ -241,6 +243,12 @@ sub read_meta { elsif ($1 eq "default-name") { $STEP_OUTNAME{"$module:$step"} = $2; } + elsif ($1 eq "tmp-name") { + $STEP_TMPNAME{"$module:$step"} = $2; + } + elsif ($1 eq "final-model") { + $STEP_FINAL{"$module:$step"} = $2; + } elsif ($1 eq "pass-unless") { @{$STEP_PASS{"$module:$step"}} = split(/\s+/,$2); push @{$RERUN_ON_CHANGE{"$module:$step"}}, split(/\s+/,$2); @@ -417,7 +425,7 @@ sub log_config { my $dir = &check_and_get("GENERAL:working-dir"); `mkdir -p $dir/steps`; my $config_file = &steps_file("config.$VERSION",$VERSION); - `cp $CONFIG_FILE $config_file` unless $CONTINUE || $DELETE_CRASHED || $DELETE_VERSION; + `cp $CONFIG_FILE $config_file` unless $CONTINUE; open(PARAMETER,">".&steps_file("parameter.$VERSION",$VERSION)) or die "Cannot open: $!"; foreach my $parameter (sort keys %CONFIG) { print PARAMETER "$parameter ="; @@ -745,7 +753,7 @@ sub delete_version { push @{$DELETABLE{$re_use_version}}, $step if $version == $DELETE_VERSION && defined($ALREADY_DELETED{$re_use_version}); # not deletable step used by not-deleted version - $NOT_DELETABLE{$re_use_version}{$step}++ if $version != $DELETE_VERSION && !defined(ALREADY_DELETED{$version}); + $NOT_DELETABLE{$re_use_version}{$step}++ if $version != $DELETE_VERSION && !defined($ALREADY_DELETED{$version}); } close(RE_USE); } @@ -787,6 +795,11 @@ sub delete_step { my $out_file = $STEP_OUTNAME{"$module:$step"}; $out_file =~ s/^(.+\/)([^\/]+)$/$1$set.$2/g if $set; &delete_output(&versionize(&long_file_name($out_file,$module,$set), $version)); + + if (defined($STEP_TMPNAME{"$module:$step"})) { + my $tmp_file = &get_tmp_file($module,$set,$step,$version); + &delete_output($tmp_file); + } } sub delete_output { @@ -801,7 +814,8 @@ sub delete_output { } else { my @FILES = `ls $file.* 2>/dev/null`; - foreach (my @FILES) { + foreach (@FILES) { + chop; print "\tdelete file $_\n"; `rm $_` if $EXECUTE; } @@ -1775,11 +1789,13 @@ sub define_tuning_tune { my $tuning_script = &check_and_get("TUNING:tuning-script"); my $use_mira = &backoff_and_get("TUNING:use-mira", 0); my $word_alignment = &backoff_and_get("TRAINING:include-word-alignment-in-rules"); + my $tmp_dir = &get_tmp_file("TUNING","","tune"); # the last 3 variables are only used for mira tuning my ($tuned_config,$config,$input,$reference,$config_devtest,$input_devtest,$reference_devtest, $filtered_config) = &get_output_and_input($step_id); $config = $filtered_config if $filtered_config; + my $cmd = ""; if ($use_mira) { my $addTags = &backoff_and_get("TUNING:add-tags"); @@ -1797,25 +1813,24 @@ sub define_tuning_tune { $input_devtest = $input_devtest_with_tags; } - my $experiment_dir = "$dir/tuning/tmp.$VERSION"; - system("mkdir -p $experiment_dir"); + system("mkdir -p $tmp_dir"); - my $mira_config = "$experiment_dir/mira-config.$VERSION."; + my $mira_config = "$tmp_dir/mira-config.$VERSION."; my $mira_config_log = $mira_config."log"; $mira_config .= "cfg"; - write_mira_config($mira_config,$experiment_dir,$config,$input,$reference,$config_devtest,$input_devtest,$reference_devtest); + write_mira_config($mira_config,$tmp_dir,$config,$input,$reference,$config_devtest,$input_devtest,$reference_devtest); #$cmd = "$tuning_script -config $mira_config -exec >& $mira_config_log"; # we want error messages in top-level log file $cmd = "$tuning_script -config $mira_config -exec "; # write script to select the best set of weights after training for the specified number of epochs --> # cp to tuning/tmp.?/moses.ini - my $script_filename = "$experiment_dir/selectBestWeights."; + my $script_filename = "$tmp_dir/selectBestWeights."; my $script_filename_log = $script_filename."log"; $script_filename .= "perl"; - my $weight_output_file = "$experiment_dir/moses.ini"; - write_selectBestMiraWeights($experiment_dir, $script_filename, $weight_output_file); + my $weight_output_file = "$tmp_dir/moses.ini"; + write_selectBestMiraWeights($tmp_dir, $script_filename, $weight_output_file); $cmd .= "\n$script_filename >& $script_filename_log"; } else { @@ -1835,7 +1850,7 @@ sub define_tuning_tune { my $tuning_settings = &backoff_and_get("TUNING:tuning-settings"); $tuning_settings = "" unless $tuning_settings; - $cmd = "$tuning_script $input $reference $decoder $config --nbest $nbest_size --working-dir $dir/tuning/tmp.$VERSION --decoder-flags \"$decoder_settings\" --rootdir $scripts $tuning_settings --no-filter-phrase-table"; + $cmd = "$tuning_script $input $reference $decoder $config --nbest $nbest_size --working-dir $tmp_dir --decoder-flags \"$decoder_settings\" --rootdir $scripts $tuning_settings --no-filter-phrase-table"; $cmd .= " --lambdas \"$lambda\"" if $lambda; $cmd .= " --continue" if $tune_continue; $cmd .= " --skip-decoder" if $skip_decoder; @@ -1849,7 +1864,7 @@ sub define_tuning_tune { $cmd .= "\nmkdir -p $tuning_dir"; } - $cmd .= "\ncp $dir/tuning/tmp.$VERSION/moses.ini $tuned_config"; + $cmd .= "\ncp $tmp_dir/moses.ini $tuned_config"; &create_step($step_id,$cmd); } @@ -3373,6 +3388,17 @@ sub get_specified_or_default_file { return &get_default_file($default_module, $default_set, $default_step); } +sub get_tmp_file { + my ($module,$set,$step,$version) = @_; + $version = $VERSION unless $version; + my $tmp_file = $STEP_TMPNAME{"$module:$step"}; + if ($set) { + $tmp_file =~ s/^(.+\/)([^\/]+)$/$1$set.$2/g; + } + $tmp_file = &versionize(&long_file_name($tmp_file,$module,$set), $version); + return $tmp_file; +} + sub get_default_file { my ($default_module, $default_set, $default_step) = @_; # print "\tget_default_file($default_module, $default_set, $default_step)\n"; diff --git a/scripts/ems/support/lmplz-wrapper.perl b/scripts/ems/support/lmplz-wrapper.perl new file mode 100755 index 000000000..c10263f75 --- /dev/null +++ b/scripts/ems/support/lmplz-wrapper.perl @@ -0,0 +1,12 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long "GetOptions"; + +my ($TEXT,$ORDER,$BIN,$LM) = @_; +&GetOptions('text=s' => \$TEXT, + 'lm=s' => \$LM, + 'bin=s' => \$BIN, + 'order=i' => \$ORDER); + +`$BIN --text $TEXT --order $ORDER --arpa $LM`;