mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-28 14:32:38 +03:00
merge
This commit is contained in:
commit
c2489c7d8b
@ -598,7 +598,7 @@ binarize-config
|
||||
pass-unless: binarize-all
|
||||
rerun-on-change: config
|
||||
default-name: model/moses.bin.ini
|
||||
template: $binarize-all $ttable-binarizer $rtable-binarizer OUT IN
|
||||
template: $binarize-all IN OUT -Binarizer $ttable-binarizer
|
||||
hiero-compile-source-suffix-array
|
||||
in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
|
||||
out: hiero-source-suffix-array
|
||||
@ -835,33 +835,31 @@ filter
|
||||
out: filtered-dir
|
||||
default-name: tuning/filtered
|
||||
rerun-on-change: filter-settings
|
||||
pass-if: TRAINING:binarize-all
|
||||
ignore-if: use-hiero
|
||||
ignore-if: TRAINING:binarize-all
|
||||
error: already exists. Please delete
|
||||
filter-devtest
|
||||
in: input-devtest TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table
|
||||
out: filtered-dir-devtest
|
||||
default-name: tuning/filtered.devtest
|
||||
rerun-on-change: filter-settings
|
||||
pass-if: TRAINING:binarize-all
|
||||
ignore-if: TRAINING:binarize-all
|
||||
ignore-unless: use-mira
|
||||
error: already exists. Please delete
|
||||
apply-filter
|
||||
in: TRAINING:config filtered-dir
|
||||
in: TRAINING:bin-config filtered-dir
|
||||
out: filtered-config
|
||||
default-name: tuning/moses.filtered.ini
|
||||
pass-if: TRAINING:binarize-all
|
||||
ignore-if: use-hiero
|
||||
ignore-if: TRAINING:binarize-all
|
||||
template: $moses-script-dir/ems/support/substitute-filtered-tables.perl IN1/moses.ini < IN > OUT
|
||||
apply-filter-devtest
|
||||
in: TRAINING:config filtered-dir-devtest
|
||||
in: TRAINING:bin-config filtered-dir-devtest
|
||||
out: filtered-config-devtest
|
||||
default-name: tuning/moses.filtered.devtest.ini
|
||||
pass-if: TRAINING:binarize-all
|
||||
ignore-unless: use-mira
|
||||
template: $moses-script-dir/ems/support/substitute-filtered-tables.perl IN1/moses.ini < IN > OUT
|
||||
tune
|
||||
in: filtered-config input reference filtered-config-devtest input-devtest reference-devtest
|
||||
in: TRAINING:bin-config input reference filtered-config-devtest input-devtest reference-devtest filtered-config
|
||||
out: weight-config
|
||||
ignore-if: use-hiero
|
||||
qsub-script: yes
|
||||
@ -869,7 +867,7 @@ tune
|
||||
rerun-on-change: decoder-settings tuning-settings nbest lambda async
|
||||
not-error: trans: No such file or directory
|
||||
apply-weights
|
||||
in: TRAINING:config weight-config
|
||||
in: TRAINING:bin-config weight-config
|
||||
out: config-with-reused-weights
|
||||
ignore-if: use-hiero
|
||||
default-name: tuning/moses.tuned.ini
|
||||
@ -958,14 +956,13 @@ filter
|
||||
ignore-if: use-hiero
|
||||
error: already exists. Please delete
|
||||
apply-filter
|
||||
in: filtered-dir TRAINING:config TUNING:config-with-reused-weights
|
||||
in: TUNING:config-with-reused-weights filtered-dir
|
||||
out: filtered-config
|
||||
default-name: evaluation/filtered.ini
|
||||
pass-if: TRAINING:binarize-all
|
||||
ignore-if: use-hiero
|
||||
ignore-if: TRAINING:binarize-all
|
||||
template: $moses-script-dir/ems/support/substitute-filtered-tables-and-weights.perl IN/moses.ini IN1 IN2 OUT
|
||||
decode
|
||||
in: filtered-config input
|
||||
in: TUNING:config-with-reused-weights input filtered-config
|
||||
out: system-output
|
||||
default-name: evaluation/output
|
||||
qsub-script: yes
|
||||
|
@ -39,9 +39,11 @@ my $binarizer = undef;
|
||||
my $min_score = undef;
|
||||
my $opt_min_non_initial_rule_count = undef;
|
||||
my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)
|
||||
my $opt_filter = 1; # enables skipping of filtering - useful for conf net or lattice
|
||||
|
||||
GetOptions(
|
||||
"gzip!" => \$opt_gzip,
|
||||
"filter!" => \$opt_filter,
|
||||
"Hierarchical" => \$opt_hierarchical,
|
||||
"Binarizer=s" => \$binarizer,
|
||||
"MinScore=s" => \$min_score,
|
||||
@ -253,32 +255,34 @@ if ($opt_hierarchical) {
|
||||
} #if ($opt_hierarchical) {
|
||||
|
||||
my %PHRASE_USED;
|
||||
if (!$opt_hierarchical) {
|
||||
# get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
|
||||
open(INPUT,mk_open_string($input)) or die "Can't read $input";
|
||||
while(my $line = <INPUT>) {
|
||||
chomp($line);
|
||||
my @WORD = split(/ +/,$line);
|
||||
for(my $i=0;$i<=$#WORD;$i++) {
|
||||
for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
|
||||
foreach (keys %CONSIDER_FACTORS) {
|
||||
my @FACTOR = split(/,/);
|
||||
my $phrase = "";
|
||||
for(my $k=$i;$k<=$i+$j;$k++) {
|
||||
my @WORD_FACTOR = split(/\|/,$WORD[$k]);
|
||||
for(my $f=0;$f<=$#FACTOR;$f++) {
|
||||
$phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
|
||||
}
|
||||
chop($phrase);
|
||||
$phrase .= " ";
|
||||
}
|
||||
chop($phrase);
|
||||
$PHRASE_USED{$_}{$phrase}++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
close(INPUT);
|
||||
if ($opt_filter) {
|
||||
if (!$opt_hierarchical) {
|
||||
# get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
|
||||
open(INPUT,mk_open_string($input)) or die "Can't read $input";
|
||||
while(my $line = <INPUT>) {
|
||||
chomp($line);
|
||||
my @WORD = split(/ +/,$line);
|
||||
for(my $i=0;$i<=$#WORD;$i++) {
|
||||
for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
|
||||
foreach (keys %CONSIDER_FACTORS) {
|
||||
my @FACTOR = split(/,/);
|
||||
my $phrase = "";
|
||||
for(my $k=$i;$k<=$i+$j;$k++) {
|
||||
my @WORD_FACTOR = split(/\|/,$WORD[$k]);
|
||||
for(my $f=0;$f<=$#FACTOR;$f++) {
|
||||
$phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
|
||||
}
|
||||
chop($phrase);
|
||||
$phrase .= " ";
|
||||
}
|
||||
chop($phrase);
|
||||
$PHRASE_USED{$_}{$phrase}++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
close(INPUT);
|
||||
}
|
||||
}
|
||||
|
||||
# filter files
|
||||
@ -288,79 +292,89 @@ for(my $i=0;$i<=$#TABLE;$i++) {
|
||||
my $factors = $TABLE_FACTORS[$i];
|
||||
my $new_file = $TABLE_NEW_NAME[$i];
|
||||
print STDERR "filtering $file -> $new_file...\n";
|
||||
|
||||
my $openstring = mk_open_string($file);
|
||||
|
||||
my $new_openstring;
|
||||
if ($new_file =~ /\.gz$/) {
|
||||
$new_openstring = "| gzip -c > $new_file";
|
||||
my $mid_file = $new_file; # used when both filtering and binarizing
|
||||
if (!$opt_filter) {
|
||||
# check if original file was gzipped
|
||||
if ($file !~ /\.gz$/ && -e "$file.gz") {
|
||||
$file .= ".gz";
|
||||
}
|
||||
$mid_file .= ".gz" if $file =~ /\.gz$/;
|
||||
safesystem("ln -s $file $mid_file");
|
||||
} else {
|
||||
$new_openstring = ">$new_file";
|
||||
}
|
||||
|
||||
open(FILE_OUT,$new_openstring) or die "Can't write to $new_openstring";
|
||||
$mid_file .= ".gz"
|
||||
if $mid_file !~ /\.gz/
|
||||
&& $binarizer && $binarizer =~ /processPhraseTable/;
|
||||
|
||||
if ($opt_hierarchical) {
|
||||
my $tmp_input = $TMP_INPUT_FILENAME{$factors};
|
||||
my $options = "";
|
||||
$options .= "--min-non-initial-rule-count=$opt_min_non_initial_rule_count" if defined($opt_min_non_initial_rule_count);
|
||||
open(PIPE,"$openstring $SCRIPTS_ROOTDIR/training/filter-rule-table.py $options $tmp_input |");
|
||||
while (my $line = <PIPE>) {
|
||||
print FILE_OUT $line
|
||||
}
|
||||
close(FILEHANDLE);
|
||||
} else {
|
||||
open(FILE,$openstring) or die "Can't open '$openstring'";
|
||||
while(my $entry = <FILE>) {
|
||||
my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
|
||||
$foreign =~ s/ $//;
|
||||
if (defined($PHRASE_USED{$factors}{$foreign})) {
|
||||
# handle min_score thresholds
|
||||
if ($min_score) {
|
||||
my @ITEM = split(/ *\|\|\| */,$rest);
|
||||
if(scalar (@ITEM)>2) { # do not filter reordering table
|
||||
my @SCORE = split(/ /,$ITEM[1]);
|
||||
my $okay = 1;
|
||||
foreach my $id (keys %MIN_SCORE) {
|
||||
$okay = 0 if $SCORE[$id] < $MIN_SCORE{$id};
|
||||
my $openstring = mk_open_string($file);
|
||||
|
||||
my $mid_openstring;
|
||||
if ($mid_file =~ /\.gz$/) {
|
||||
$mid_openstring = "| gzip -c > $mid_file";
|
||||
} else {
|
||||
$mid_openstring = ">$mid_file";
|
||||
}
|
||||
|
||||
|
||||
open(FILE_OUT,$mid_openstring) or die "Can't write to $mid_openstring";
|
||||
|
||||
if ($opt_hierarchical) {
|
||||
my $tmp_input = $TMP_INPUT_FILENAME{$factors};
|
||||
my $options = "";
|
||||
$options .= "--min-non-initial-rule-count=$opt_min_non_initial_rule_count" if defined($opt_min_non_initial_rule_count);
|
||||
open(PIPE,"$openstring $SCRIPTS_ROOTDIR/training/filter-rule-table.py $options $tmp_input |");
|
||||
while (my $line = <PIPE>) {
|
||||
print FILE_OUT $line
|
||||
}
|
||||
close(FILEHANDLE);
|
||||
} else {
|
||||
open(FILE,$openstring) or die "Can't open '$openstring'";
|
||||
while(my $entry = <FILE>) {
|
||||
my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
|
||||
$foreign =~ s/ $//;
|
||||
if (defined($PHRASE_USED{$factors}{$foreign})) {
|
||||
# handle min_score thresholds
|
||||
if ($min_score) {
|
||||
my @ITEM = split(/ *\|\|\| */,$rest);
|
||||
if(scalar (@ITEM)>2) { # do not filter reordering table
|
||||
my @SCORE = split(/ /,$ITEM[1]);
|
||||
my $okay = 1;
|
||||
foreach my $id (keys %MIN_SCORE) {
|
||||
$okay = 0 if $SCORE[$id] < $MIN_SCORE{$id};
|
||||
}
|
||||
next unless $okay;
|
||||
}
|
||||
next unless $okay;
|
||||
}
|
||||
}
|
||||
print FILE_OUT $entry;
|
||||
$used++;
|
||||
}
|
||||
$total++;
|
||||
}
|
||||
close(FILE);
|
||||
die "No phrases found in $file!" if $total == 0;
|
||||
printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
|
||||
}
|
||||
print FILE_OUT $entry;
|
||||
$used++;
|
||||
}
|
||||
$total++;
|
||||
}
|
||||
close(FILE);
|
||||
die "No phrases found in $file!" if $total == 0;
|
||||
printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
|
||||
}
|
||||
}
|
||||
|
||||
my $catcmd = ($mid_file =~ /\.gz$/ ? "$ZCAT" : "cat");
|
||||
if(defined($binarizer)) {
|
||||
print STDERR "binarizing...";
|
||||
# translation model
|
||||
if ($KNOWN_TTABLE{$i}) {
|
||||
# ... hierarchical translation model
|
||||
if ($opt_hierarchical) {
|
||||
my $cmd = "$binarizer $new_file $new_file.bin";
|
||||
my $cmd = "$binarizer $mid_file $new_file.bin";
|
||||
print STDERR $cmd."\n";
|
||||
print STDERR `$cmd`;
|
||||
}
|
||||
# ... phrase translation model
|
||||
elsif ($binarizer =~ /processPhraseTableMin/) {
|
||||
#compact phrase table
|
||||
my $cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted; $binarizer -in $new_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i]; rm $new_file.sorted";
|
||||
my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $dir > $mid_file.sorted; $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i]; rm $mid_file.sorted";
|
||||
print STDERR $cmd."\n";
|
||||
print STDERR `$cmd`;
|
||||
}
|
||||
elsif ($binarizer =~ /CreateOnDiskPt/) {
|
||||
my $cmd = "$binarizer $new_file $new_file.bin";
|
||||
print STDERR $cmd."\n";
|
||||
print STDERR `$cmd`;
|
||||
}
|
||||
else {
|
||||
my $cmd = "cat $new_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
|
||||
} else {
|
||||
my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
|
||||
print STDERR $cmd."\n";
|
||||
print STDERR `$cmd`;
|
||||
}
|
||||
@ -371,10 +385,10 @@ for(my $i=0;$i<=$#TABLE;$i++) {
|
||||
$lexbin =~ s/PhraseTable/LexicalTable/;
|
||||
my $cmd;
|
||||
if ($lexbin =~ /processLexicalTableMin/) {
|
||||
$cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted; $lexbin -in $new_file.sorted -out $new_file; rm $new_file.sorted";
|
||||
$cmd = "$catcmd $mid_file | LC_ALL=C sort -T $dir > $mid_file.sorted; $lexbin -in $mid_file.sorted -out $new_file; rm $mid_file.sorted";
|
||||
} else {
|
||||
$lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
|
||||
$cmd = "$lexbin -in $new_file -out $new_file";
|
||||
$cmd = "$lexbin -in $mid_file -out $new_file";
|
||||
}
|
||||
print STDERR $cmd."\n";
|
||||
print STDERR `$cmd`;
|
||||
|
@ -2055,11 +2055,12 @@ sub create_ini {
|
||||
|
||||
# lattice feature
|
||||
if ($_NUM_LATTICE_FEATURES) {
|
||||
print INI "\n\n#lattice or confusion net weights\n[weight-i]\n";
|
||||
$feature_spec .= "InputFeature num-input-features=$_NUM_LATTICE_FEATURES\n";
|
||||
$weight_spec .= "InputFeature0=";
|
||||
for (1..$_NUM_LATTICE_FEATURES) {
|
||||
print INI "0.1\n";
|
||||
$weight_spec .= " 0.1";
|
||||
}
|
||||
print "\n";
|
||||
$weight_spec .= "\n";
|
||||
}
|
||||
|
||||
# get addititional content for config file from switch or file
|
||||
|
Loading…
Reference in New Issue
Block a user