progress on deleting steps and runs

This commit is contained in:
Philipp Koehn 2014-05-21 11:16:40 -04:00
parent 401c4940cf
commit dd9a59499f
3 changed files with 97 additions and 32 deletions

View File

@ -55,7 +55,6 @@ truecase
template-if: input-truecaser IN.$input-extension OUT.$input-extension -model IN1.$input-extension
template-if: output-truecaser IN.$output-extension OUT.$output-extension -model IN1.$output-extension
parallelizable: yes
source-label
in: truecased-stem
out: source-labelled
@ -64,7 +63,6 @@ source-label
template-if: source-labeller IN.$input-extension OUT.$input-extension
template-if: cat IN.$output-extension OUT.$output-extension
parallelizable: yes
lowercase
in: source-labelled
out: lowercased-stem
@ -118,7 +116,7 @@ consolidate
default-name: truecaser/corpus
template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN
train
in: tokenized-stem
in: tokenized-stem
out: truecase-model
rerun-on-change: trainer
default-name: truecaser/truecase-model
@ -219,6 +217,7 @@ binarize
default-name: lm/binlm
template: $lm-binarizer IN OUT
error: set KENLM_MAX_ORDER to at least this value
final-model: yes
[INTERPOLATED-LM] single
tuning-from-sgm
@ -285,6 +284,7 @@ binarize
rerun-on-change: lm
default-name: lm/interpolated-binlm
error: set kMaxOrder to at least this value
final-model: yes
[MML] single
tokenize-indomain-source
in: raw-indomain-source
@ -395,6 +395,7 @@ build-domains
default-name: model/domains
ignore-unless: domain-features mml-filter-corpora
template: $moses-script-dir/ems/support/build-domain-file-from-subcorpora.perl $input-extension IN > OUT
final-model: yes
mml-score
in: MML:model corpus domains
out: mml-scores
@ -489,6 +490,7 @@ build-biconcor
default-name: model/biconcor
ignore-unless: biconcor
error: usage
final-model: yes
build-suffix-array
in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: phrase-translation-table
@ -528,16 +530,17 @@ build-osm
template: $moses-script-dir/OSM/OSM-Train.perl --corpus-f IN0.$input-extension --corpus-e IN0.$output-extension --alignment IN1.$alignment-symmetrization-method --order $operation-sequence-model-order --out-dir OUT --moses-src-dir $moses-src-dir --srilm-dir $srilm-dir $operation-sequence-model-settings
default-name: model/OSM
build-transliteration-model
in: corpus word-alignment
out: transliteration-model
ignore-unless: transliteration-module
rerun-on-change: transliteration-module training-options script giza-settings
in: corpus word-alignment
out: transliteration-model
ignore-unless: transliteration-module
rerun-on-change: transliteration-module training-options script giza-settings
default-name: model/Transliteration
final-model: yes
build-translit-table
in: transliteration-model
out: transliteration-table
ignore-unless: in-decoding-transliteration
rerun-on-change: in-decoding-transliteration transliteration-module
in: transliteration-model
out: transliteration-table
ignore-unless: in-decoding-transliteration
rerun-on-change: in-decoding-transliteration transliteration-module
default-name: model/transliteration-phrase-table
template: $moses-script-dir/Transliteration/in-decoding-transliteration.pl --moses-src-dir $moses-src-dir --external-bin-dir $external-bin-dir --transliteration-model-dir IN --input-extension $input-extension --output-extension $output-extension --transliteration-file $transliteration-file --out-file OUT
extract-phrases
@ -553,12 +556,14 @@ build-reordering
ignore-unless: lexicalized-reordering
rerun-on-change: lexicalized-reordering reordering-factors
default-name: model/reordering-table
final-model: yes
build-ttable
in: extracted-phrases lexical-translation-table corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains
out: phrase-translation-table
rerun-on-change: translation-factors hierarchical-rule-set score-settings training-options script EVALUATION:report-precision-by-coverage include-word-alignment-in-rules domain-features
default-name: model/phrase-table
ignore-if: suffix-array
final-model: yes
sigtest-filter-suffix-array
in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: sigtest-filter-suffix-array
@ -574,19 +579,22 @@ sigtest-filter-suffix-array
mv IN.${output-extension}.sa_offset OUT.${output-extension}.sa_offset ; \
mv IN.${output-extension}.sa_suffix OUT.${output-extension}.sa_suffix
ignore-unless: sigtest-filter
final-model: yes
sigtest-filter-ttable
in: phrase-translation-table sigtest-filter-suffix-array
out: sigtest-filter-phrase-translation-table
default-name: model/phrase-table-sigtest-filter
pass-unless: sigtest-filter
ignore-if: TRAINING:config
ignore-if: TRAINING:config
final-model: yes
sigtest-filter-reordering
in: reordering-table sigtest-filter-suffix-array
out: sigtest-filter-reordering-table
default-name: model/reordering-table-sigtest-filter
pass-unless: sigtest-filter
ignore-if: TRAINING:config
ignore-if: TRAINING:config
ignore-unless: lexicalized-reordering
final-model: yes
build-generation
in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: generation-table
@ -594,12 +602,14 @@ build-generation
ignore-unless: generation-factors
ignore-if: generation-corpus
default-name: model/generation-table
final-model: yes
build-generation-custom
in: generation-corpus
out: generation-table
rerun-on-change: generation-factors generation-type training-options script generation-corpus
ignore-unless: AND generation-factors generation-corpus
default-name: model/generation-table
final-model: yes
build-sparse
in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: sparse
@ -614,6 +624,7 @@ create-config
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini
default-name: model/moses.ini
error: Unknown option
final-model: yes
binarize-config
in: config
out: bin-config
@ -621,6 +632,7 @@ binarize-config
rerun-on-change: config
default-name: model/moses.bin.ini
template: $binarize-all IN OUT -Binarizer $ttable-binarizer
final-model: yes
hiero-compile-source-suffix-array
in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: hiero-source-suffix-array
@ -734,7 +746,6 @@ factorize-input-devtest
ignore-unless: use-mira
error: can't open
error: incompatible number of words in factor
source-label-input
in: factorized-input
out: source-labelled-input
@ -742,7 +753,6 @@ source-label-input
pass-unless: source-labeller
template-if: source-labeller IN OUT
parallelizable: yes
source-label-input-devtest
in: factorized-input-devtest
out: source-labelled-input-devtest
@ -750,7 +760,6 @@ source-label-input-devtest
pass-unless: source-labeller
template-if: source-labeller IN OUT
parallelizable: yes
lowercase-input
in: source-labelled-input
out: truecased-input
@ -903,6 +912,8 @@ tune
ignore-if: use-hiero
qsub-script: yes
default-name: tuning/moses.ini
tmp-name: tuning/tmp
final-model: yes
rerun-on-change: decoder-settings tuning-settings nbest lambda async
not-error: trans: No such file or directory
apply-weights
@ -1018,6 +1029,7 @@ decode
rerun-on-change: decoder decoder-settings nbest report-segmentation report-precision-by-coverage analyze-search-graph wade
error: Translation was not performed correctly
not-error: trans: No such file or directory
final-model: yes
hiero-decode
in: TUNING:hiero-config-with-reused-weights input
out: system-output
@ -1058,6 +1070,7 @@ detokenize-output
default-name: evaluation/detokenized
pass-unless: detokenizer
template: $detokenizer < IN > OUT
final-model: yes
wrap
in: detokenized-output
out: wrapped-output
@ -1065,6 +1078,7 @@ wrap
rerun-on-change: wrapping-frame use-hiero
template: $wrapping-script $wrapping-frame < IN > OUT
error: Use of uninitialized value in pattern match
final-model: yes
reference-from-sgm
in: reference-sgm input-sgm
out: raw-reference
@ -1100,6 +1114,7 @@ nist-bleu
rerun-on-change: nist-bleu
error: Illegal division by zero
template: $nist-bleu -s $input-sgm -r IN1 -t IN > OUT
final-model: yes
nist-bleu-c
in: wrapped-output reference-sgm
out: nist-bleu-c-score
@ -1108,6 +1123,7 @@ nist-bleu-c
rerun-on-change: nist-bleu-c
error: Illegal division by zero
template: $nist-bleu-c -c -s $input-sgm -r IN1 -t IN > OUT
final-model: yes
ibm-bleu
in: wrapped-output reference-sgm
out: ibm-bleu-score
@ -1115,6 +1131,7 @@ ibm-bleu
ignore-unless: ibm-bleu
rerun-on-change: ibm-bleu
template: $ibm-bleu -ci -s $input-sgm -r IN1 -t IN > OUT
final-model: yes
ibm-bleu-c
in: wrapped-output reference-sgm
out: ibm-bleu-c-score
@ -1122,6 +1139,7 @@ ibm-bleu-c
ignore-unless: ibm-bleu-c
rerun-on-change: ibm-bleu-c
template: $ibm-bleu-c -s $input-sgm -r IN1 -t IN > OUT
final-model: yes
bolt-bleu
in: detokenized-output
out: bolt-bleu-score
@ -1129,6 +1147,7 @@ bolt-bleu
ignore-unless: bolt-bleu
rerun-on-change: bolt-bleu
template: $bolt-bleu IN > OUT
final-model: yes
bolt-bleu-c
in: detokenized-output
out: bolt-bleu-c-score
@ -1136,6 +1155,7 @@ bolt-bleu-c
ignore-unless: bolt-bleu-c
rerun-on-change: bolt-bleu-c
template: $bolt-bleu-c IN > OUT
final-model: yes
multi-bleu
in: transliterated-output tokenized-reference
out: multi-bleu-score
@ -1143,6 +1163,7 @@ multi-bleu
ignore-unless: multi-bleu
rerun-on-change: multi-bleu
template: $multi-bleu IN1 < IN > OUT
final-model: yes
multi-bleu-c
in: recased-output tokenized-reference
out: multi-bleu-c-score
@ -1150,12 +1171,14 @@ multi-bleu-c
ignore-unless: multi-bleu-c
rerun-on-change: multi-bleu-c
template: $multi-bleu-c IN1 < IN > OUT
final-model: yes
ter
in: wrapped-output reference-sgm
out: ter-score
default-name: evaluation/detokenized.sgm.TER
ignore-unless: ter
rerun-on-change: ter
final-model: yes
wer
in: recased-output reference
out: wer-score
@ -1163,6 +1186,7 @@ wer
ignore-unless: wer
rerun-on-change: wer
template: $wer IN IN1 > OUT
final-model: yes
meteor
in: transliterated-output reference
out: meteor-score
@ -1170,25 +1194,28 @@ meteor
ignore-unless: meteor
rerun-on-change: meteor
template: $meteor IN IN1 $meteor-params > OUT
final-model: yes
analysis
in: recased-output reference input
out: analysis
default-name: evaluation/analysis
ignore-if: report-precision-by-coverage
ignore-unless: analysis
rerun-on-change: analyze-search-graph
rerun-on-change: analyze-search-graph
analysis-coverage
in: input TRAINING:corpus-mml-postfilter=OR=TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus TRAINING:sigtest-filter-phrase-translation-table
out: analysis-coverage
default-name: evaluation/analysis
ignore-unless: AND analysis analyze-coverage
rerun-on-change: score-settings
final-model: yes
analysis-precision
in: recased-output reference input TRAINING:corpus-mml-postfilter=OR=TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus TRAINING:sigtest-filter-phrase-translation-table analysis-coverage
out: analysis
default-name: evaluation/analysis
ignore-unless: AND analysis analyze-coverage report-precision-by-coverage
rerun-on-change: precision-by-coverage-base
rerun-on-change: precision-by-coverage-base
final-model: yes
[REPORTING] single
report

View File

@ -64,7 +64,9 @@ my (@MODULE,
%MODULE_STEP,
%STEP_IN,
%STEP_OUT,
%STEP_OUTNAME,
%STEP_OUTNAME, # output file name for step result
%STEP_TMPNAME, # tmp directory to be used by step
%STEP_FINAL, # output is part of the final model, not an intermediate step
%STEP_PASS, # config parameters that have to be set, otherwise pass
%STEP_PASS_IF, # config parameters that have to be not set, otherwise pass
%STEP_IGNORE, # config parameters that have to be set, otherwise ignore
@ -97,7 +99,7 @@ $VERSION = $DELETE_VERSION if $DELETE_VERSION;
&compute_version_number() if $EXECUTE && !$CONTINUE && !$DELETE_CRASHED && !$DELETE_VERSION;
`mkdir -p steps/$VERSION`;
&log_config();
&log_config() unless $DELETE_CRASHED || $DELETE_VERSION;
print "running experimenal run number $VERSION\n";
print "\nESTABLISH WHICH STEPS NEED TO BE RUN\n";
@ -241,6 +243,12 @@ sub read_meta {
elsif ($1 eq "default-name") {
$STEP_OUTNAME{"$module:$step"} = $2;
}
elsif ($1 eq "tmp-name") {
$STEP_TMPNAME{"$module:$step"} = $2;
}
elsif ($1 eq "final-model") {
$STEP_FINAL{"$module:$step"} = $2;
}
elsif ($1 eq "pass-unless") {
@{$STEP_PASS{"$module:$step"}} = split(/\s+/,$2);
push @{$RERUN_ON_CHANGE{"$module:$step"}}, split(/\s+/,$2);
@ -417,7 +425,7 @@ sub log_config {
my $dir = &check_and_get("GENERAL:working-dir");
`mkdir -p $dir/steps`;
my $config_file = &steps_file("config.$VERSION",$VERSION);
`cp $CONFIG_FILE $config_file` unless $CONTINUE || $DELETE_CRASHED || $DELETE_VERSION;
`cp $CONFIG_FILE $config_file` unless $CONTINUE;
open(PARAMETER,">".&steps_file("parameter.$VERSION",$VERSION)) or die "Cannot open: $!";
foreach my $parameter (sort keys %CONFIG) {
print PARAMETER "$parameter =";
@ -745,7 +753,7 @@ sub delete_version {
push @{$DELETABLE{$re_use_version}}, $step if $version == $DELETE_VERSION && defined($ALREADY_DELETED{$re_use_version});
# not deletable step used by not-deleted version
$NOT_DELETABLE{$re_use_version}{$step}++ if $version != $DELETE_VERSION && !defined(ALREADY_DELETED{$version});
$NOT_DELETABLE{$re_use_version}{$step}++ if $version != $DELETE_VERSION && !defined($ALREADY_DELETED{$version});
}
close(RE_USE);
}
@ -787,6 +795,11 @@ sub delete_step {
my $out_file = $STEP_OUTNAME{"$module:$step"};
$out_file =~ s/^(.+\/)([^\/]+)$/$1$set.$2/g if $set;
&delete_output(&versionize(&long_file_name($out_file,$module,$set), $version));
if (defined($STEP_TMPNAME{"$module:$step"})) {
my $tmp_file = &get_tmp_file($module,$set,$step,$version);
&delete_output($tmp_file);
}
}
sub delete_output {
@ -801,7 +814,8 @@ sub delete_output {
}
else {
my @FILES = `ls $file.* 2>/dev/null`;
foreach (my @FILES) {
foreach (@FILES) {
chop;
print "\tdelete file $_\n";
`rm $_` if $EXECUTE;
}
@ -1775,11 +1789,13 @@ sub define_tuning_tune {
my $tuning_script = &check_and_get("TUNING:tuning-script");
my $use_mira = &backoff_and_get("TUNING:use-mira", 0);
my $word_alignment = &backoff_and_get("TRAINING:include-word-alignment-in-rules");
my $tmp_dir = &get_tmp_file("TUNING","","tune");
# the last 3 variables are only used for mira tuning
my ($tuned_config,$config,$input,$reference,$config_devtest,$input_devtest,$reference_devtest, $filtered_config) = &get_output_and_input($step_id);
$config = $filtered_config if $filtered_config;
my $cmd = "";
if ($use_mira) {
my $addTags = &backoff_and_get("TUNING:add-tags");
@ -1797,25 +1813,24 @@ sub define_tuning_tune {
$input_devtest = $input_devtest_with_tags;
}
my $experiment_dir = "$dir/tuning/tmp.$VERSION";
system("mkdir -p $experiment_dir");
system("mkdir -p $tmp_dir");
my $mira_config = "$experiment_dir/mira-config.$VERSION.";
my $mira_config = "$tmp_dir/mira-config.$VERSION.";
my $mira_config_log = $mira_config."log";
$mira_config .= "cfg";
write_mira_config($mira_config,$experiment_dir,$config,$input,$reference,$config_devtest,$input_devtest,$reference_devtest);
write_mira_config($mira_config,$tmp_dir,$config,$input,$reference,$config_devtest,$input_devtest,$reference_devtest);
#$cmd = "$tuning_script -config $mira_config -exec >& $mira_config_log";
# we want error messages in top-level log file
$cmd = "$tuning_script -config $mira_config -exec ";
# write script to select the best set of weights after training for the specified number of epochs -->
# cp to tuning/tmp.?/moses.ini
my $script_filename = "$experiment_dir/selectBestWeights.";
my $script_filename = "$tmp_dir/selectBestWeights.";
my $script_filename_log = $script_filename."log";
$script_filename .= "perl";
my $weight_output_file = "$experiment_dir/moses.ini";
write_selectBestMiraWeights($experiment_dir, $script_filename, $weight_output_file);
my $weight_output_file = "$tmp_dir/moses.ini";
write_selectBestMiraWeights($tmp_dir, $script_filename, $weight_output_file);
$cmd .= "\n$script_filename >& $script_filename_log";
}
else {
@ -1835,7 +1850,7 @@ sub define_tuning_tune {
my $tuning_settings = &backoff_and_get("TUNING:tuning-settings");
$tuning_settings = "" unless $tuning_settings;
$cmd = "$tuning_script $input $reference $decoder $config --nbest $nbest_size --working-dir $dir/tuning/tmp.$VERSION --decoder-flags \"$decoder_settings\" --rootdir $scripts $tuning_settings --no-filter-phrase-table";
$cmd = "$tuning_script $input $reference $decoder $config --nbest $nbest_size --working-dir $tmp_dir --decoder-flags \"$decoder_settings\" --rootdir $scripts $tuning_settings --no-filter-phrase-table";
$cmd .= " --lambdas \"$lambda\"" if $lambda;
$cmd .= " --continue" if $tune_continue;
$cmd .= " --skip-decoder" if $skip_decoder;
@ -1849,7 +1864,7 @@ sub define_tuning_tune {
$cmd .= "\nmkdir -p $tuning_dir";
}
$cmd .= "\ncp $dir/tuning/tmp.$VERSION/moses.ini $tuned_config";
$cmd .= "\ncp $tmp_dir/moses.ini $tuned_config";
&create_step($step_id,$cmd);
}
@ -3373,6 +3388,17 @@ sub get_specified_or_default_file {
return &get_default_file($default_module, $default_set, $default_step);
}
# Build the versioned name of the tmp location registered for a step.
#
# Arguments:
#   $module, $step - together form the "$module:$step" key into %STEP_TMPNAME
#                    (filled from "tmp-name" entries of the meta file)
#   $set           - optional set name; when true, "$set." is inserted in
#                    front of the last path component of the tmp name
#                    (names without a "/" are left untouched by the pattern)
#   $version       - optional run number; any false value means "use the
#                    current run", i.e. the global $VERSION
#
# Returns whatever versionize() yields for the long file name of the tmp
# name. NOTE(review): versionize() and long_file_name() are defined
# elsewhere in this file - presumably they expand the name relative to the
# working directory and append the run number; confirm there.
sub get_tmp_file {
    my ($module, $set, $step, $version) = @_;

    # default to the run that is currently being processed
    $version ||= $VERSION;

    my $name = $STEP_TMPNAME{"$module:$step"};

    # per-set steps get the set name prefixed to the final path component
    $name =~ s/^(.+\/)([^\/]+)$/$1$set.$2/g if $set;

    my $versioned_name = &versionize(&long_file_name($name,$module,$set), $version);
    return $versioned_name;
}
sub get_default_file {
my ($default_module, $default_set, $default_step) = @_;
# print "\tget_default_file($default_module, $default_set, $default_step)\n";

View File

@ -0,0 +1,12 @@
#!/usr/bin/perl -w

# Wrapper that maps the option names used by the caller
# (-text, -lm, -bin, -order) onto the command line
#
#   $BIN --text $TEXT --order $ORDER --arpa $LM
#
# Options:
#   -text FILE   training text handed to the binary via --text
#   -lm FILE     file name handed to the binary via --arpa
#   -bin CMD     the command to run
#   -order N     integer handed to the binary via --order
#
# The standard output of the command is captured and discarded; standard
# error is passed through. The script dies (non-zero exit) if a required
# option is missing or if the command fails.

use strict;
use Getopt::Long "GetOptions";

my ($TEXT,$ORDER,$BIN,$LM);

# The return value of GetOptions is deliberately not checked: unknown
# options only trigger a warning and are otherwise ignored, so callers
# that pass additional settings keep working.
&GetOptions('text=s' => \$TEXT,
            'lm=s' => \$LM,
            'bin=s' => \$BIN,
            'order=i' => \$ORDER);

# Every value is interpolated into the command line below; a missing one
# would yield a malformed command plus "uninitialized value" warnings.
my @missing;
push @missing, "-text"  unless defined($TEXT);
push @missing, "-lm"    unless defined($LM);
push @missing, "-bin"   unless defined($BIN);
push @missing, "-order" unless defined($ORDER);
die "ERROR: missing required option(s): ".join(" ",@missing)."\n".
    "usage: $0 -bin CMD -text FILE -order N -lm FILE\n" if @missing;

# The command goes through the shell as a single string, so -bin may carry
# extra arguments of its own. NOTE(review): values are not shell-quoted -
# file names with spaces or shell metacharacters will break the command.
my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM";
`$cmd`;

# Propagate failure of the wrapped command instead of always exiting 0.
if ($? == -1) {
    die "ERROR: failed to execute '$cmd': $!\n";
}
elsif ($? & 127) {
    die "ERROR: '$cmd' terminated by signal ".($? & 127)."\n";
}
elsif ($? >> 8) {
    die "ERROR: '$cmd' failed with exit code ".($? >> 8)."\n";
}