minor improvements: binarizing of rule tables in the filter script, support for multiple reference translations in EMS analysis

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3284 1f5c12ca-751b-0410-a591-d2e778427230
phkoehn 2010-05-28 22:19:58 +00:00
parent 69d92bfec3
commit 4e0bc582f6
9 changed files with 181 additions and 59 deletions

View File

@@ -32,14 +32,17 @@ wmt10-data = $working-dir/data
# moses decoder
decoder = $moses-src-dir/moses-cmd/src/moses
# conversion of phrase table into binary format
# conversion of phrase table into binary on-disk format
ttable-binarizer = $moses-src-dir/misc/processPhraseTable
# tokenizers
# conversion of rule table into binary on-disk format
#ttable-binarizer = "$moses-src-dir/CreateOnDisk/src/CreateOnDiskPt 1 1 5 100 2"
# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"
# truecasers
# truecasers - comment out if you do not use the truecaser
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl
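The commented-out ttable-binarizer line above is the rule-table binarizer used with the chart decoder; in the chart-decoder example configs below it is enabled and processPhraseTable is commented out instead. A minimal sketch, with placeholder paths, of what running that binarizer on an already filtered rule table amounts to (the real command is assembled by filter-model-given-input.pl, which this commit also changes):

#!/usr/bin/perl -w
# Sketch only, not part of the commit; both paths are placeholders.
use strict;

my $binarizer  = "/path/to/mosesdecoder/CreateOnDisk/src/CreateOnDiskPt 1 1 5 100 2";
my $rule_table = "/path/to/filtered/rule-table";  # text rule table written by the filter script

# input table first, output name second, as in the filter script's hierarchical branch
my $cmd = "$binarizer $rule_table $rule_table.bin";
print STDERR "$cmd\n";
system($cmd) == 0 or die "rule table binarization failed";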

View File

@@ -32,14 +32,17 @@ wmt10-data = $working-dir/data
# moses decoder
decoder = $moses-src-dir/moses-cmd/src/moses
# conversion of phrase table into binary format
# conversion of phrase table into binary on-disk format
ttable-binarizer = $moses-src-dir/misc/processPhraseTable
# tokenizers
# conversion of rule table into binary on-disk format
#ttable-binarizer = "$moses-src-dir/CreateOnDisk/src/CreateOnDiskPt 1 1 5 100 2"
# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"
# truecasers
# truecasers - comment out if you do not use the truecaser
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl

View File

@@ -32,14 +32,17 @@ wmt10-data = $working-dir/data
# moses decoder
decoder = $moses-src-dir/moses-chart-cmd/src/moses_chart
# conversion of phrase table into binary format
# conversion of phrase table into binary on-disk format
#ttable-binarizer = $moses-src-dir/misc/processPhraseTable
# tokenizers
# conversion of rule table into binary on-disk format
ttable-binarizer = "$moses-src-dir/CreateOnDisk/src/CreateOnDiskPt 1 1 5 100 2"
# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"
# truecasers
# truecasers - comment out if you do not use the truecaser
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl

View File

@@ -32,14 +32,17 @@ wmt10-data = $working-dir/data
# moses decoder
decoder = $moses-src-dir/moses-chart-cmd/src/moses_chart
# conversion of phrase table into binary format
# conversion of phrase table into binary on-disk format
#ttable-binarizer = $moses-src-dir/misc/processPhraseTable
# tokenizers
# conversion of rule table into binary on-disk format
ttable-binarizer = "$moses-src-dir/CreateOnDisk/src/CreateOnDiskPt 1 1 5 100 2"
# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"
# truecasers
# truecasers - comment out if you do not use the truecaser
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl

View File

@@ -32,14 +32,17 @@ toy-data = $moses-script-dir/ems/example/data
# moses decoder
decoder = $moses-src-dir/moses-cmd/src/moses
# conversion of phrase table into binary format
# conversion of phrase table into binary on-disk format
ttable-binarizer = $moses-src-dir/misc/processPhraseTable
# tokenizers
# conversion of rule table into binary on-disk format
#ttable-binarizer = "$moses-src-dir/CreateOnDisk/src/CreateOnDiskPt 1 1 5 100 2"
# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"
# truecasers
# truecasers - comment out if you do not use the truecaser
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl

View File

@@ -1133,8 +1133,14 @@ sub check_info {
return 0;
}
print "\tcheck '$VALUE{$parameter}' eq '$INFO{$parameter}' -> " if $VERBOSE;
if (&match_info_strings($VALUE{$parameter},$INFO{$parameter})) { print "ok\n" if $VERBOSE; }
else { print "mismatch\n" if $VERBOSE; return 0; }
if (defined($INFO{$parameter})
&& &match_info_strings($VALUE{$parameter},$INFO{$parameter})) {
print "ok\n" if $VERBOSE;
}
else {
print "mismatch\n" if $VERBOSE;
return 0;
}
}
print "\tall parameters match\n" if $VERBOSE;
return 1;
@@ -1500,7 +1506,7 @@ sub define_tuning_tune {
$tuning_settings = "" unless $tuning_settings;
my $filter = "$scripts/training/filter-model-given-input.pl";
$filter .= " -Binarizer $binarizer" if $binarizer;
$filter .= " -Binarizer \"$binarizer\"" if $binarizer;
if (&get("TRAINING:hierarchical-rule-set")) {
$filter .= " --Hierarchical";
#$filter .= " --MaxSpan 9999" if &get("GENERAL:input-parser") || &get("GENERAL:output-parser");
@@ -1961,7 +1967,7 @@ sub define_evaluation_decode {
my $filter = "$scripts/training/filter-model-given-input.pl";
$filter .= " $dir/evaluation/filtered.$set.$VERSION $config $input_filter";
$filter .= " -Binarizer $binarizer" if $binarizer;
$filter .= " -Binarizer \"$binarizer\"" if $binarizer;
if (&get("TRAINING:hierarchical-rule-set")) {
$filter .= " --Hierarchical";

View File

@@ -26,15 +26,34 @@ my (%PRECISION_CORRECT,%PRECISION_TOTAL,
if (defined($system) || defined($reference)) {
die("you need to you specify both system and reference, not just either")
unless defined($system) && defined($reference);
die("can't open system file $system") if ! -e $system;
die("can't open system file $reference") if ! -e $reference;
@SYSTEM = `cat $system`;
@REFERENCE = `cat $reference`;
chop(@SYSTEM);
chop(@REFERENCE);
if (! -e $reference && -e $reference.".ref0") {
for(my $i=0;-e $reference.".ref".$i;$i++) {
my @REF = `cat $reference.ref$i`;
chop(@REF);
for(my $j=0;$j<scalar(@REF);$j++) {
push @{$REFERENCE[$j]}, $REF[$j];
}
}
}
else {
die("can't open system file $reference") if ! -e $reference;
@REFERENCE = `cat $reference`;
chop(@REFERENCE);
}
for(my $i=0;$i<scalar @SYSTEM;$i++) {
&add_match($SYSTEM[$i],$REFERENCE[$i],
\%PRECISION_CORRECT,\%PRECISION_TOTAL);
&add_match($REFERENCE[$i],$SYSTEM[$i],
\%RECALL_CORRECT,\%RECALL_TOTAL);
}
open(SUMMARY,">$dir/summary");
&create_n_gram_stats();
&best_matches(\%PRECISION_CORRECT,\%PRECISION_TOTAL,"$dir/n-gram-precision");
&best_matches(\%RECALL_CORRECT,\%RECALL_TOTAL,"$dir/n-gram-recall");
&bleu_annotation();
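# Illustrative layout, not part of the script (file names invented): when the
# plain reference file is absent but numbered ones exist,
#   <reference>         missing
#   <reference>.ref0    first set of reference translations
#   <reference>.ref1    second set, and so on,
# each $REFERENCE[$i] becomes an array reference holding one string per set,
#   $REFERENCE[$i] = [ "the house is small", "the home is tiny" ];
# while with a single reference file it stays a plain string, which is why the
# subroutines below test ref($REFERENCE[$i]) eq 'ARRAY'.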
@@ -59,15 +78,6 @@ if (defined($ttable) || defined($corpus)) {
&input_annotation();
}
sub create_n_gram_stats {
for(my $i=0;$i<scalar @SYSTEM;$i++) {
&add_match($SYSTEM[$i],$REFERENCE[$i],
\%PRECISION_CORRECT,\%PRECISION_TOTAL);
&add_match($REFERENCE[$i],$SYSTEM[$i],
\%RECALL_CORRECT,\%RECALL_TOTAL);
}
}
sub best_matches {
my ($CORRECT,$TOTAL,$out) = @_;
my $type = ($out =~ /precision/) ? "precision" : "recall";
@@ -91,6 +101,7 @@ sub best_matches {
sub input_phrases {
open(INPUT,$input) or die "Can't read input $input";
while(my $line = <INPUT>) {
$line =~ s/\|\S+//g;
&extract_n_grams($line,\%INPUT_PHRASE);
}
close(INPUT);
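# Illustrative only: the substitution above strips factor annotation from factored
# input, e.g. "das|ART Haus|NN" becomes "das Haus" before n-grams are collected;
# the same stripping is added to corpus_coverage and input_annotation below.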
@@ -105,7 +116,7 @@ sub bleu_annotation {
$system =~ s/ $//;
my (%SYS_NGRAM,%REF_NGRAM);
&extract_n_grams( $system, \%SYS_NGRAM );
&extract_n_grams( $REFERENCE[$i], \%REF_NGRAM );
&extract_n_grams_arrayopt( $REFERENCE[$i], \%REF_NGRAM, "max" );
my @WORD = split(/ /,$system);
my @MATCH;
@@ -133,9 +144,20 @@ sub bleu_annotation {
$bleu *= ($ngram_correct/(scalar(@WORD)-$length+2));
}
$bleu = $bleu ** (1/4);
my @RW = split(/ /,$REFERENCE[$i]);
my $ref_length = scalar(@RW);
if (scalar(@WORD) < $ref_length) {
my $ref_length = 9999;
if (ref($REFERENCE[$i]) eq 'ARRAY') {
foreach my $ref (@{$REFERENCE[$i]}) {
my @RW = split(/ /,$ref);
$ref_length = scalar(@RW) if scalar(@RW) < $ref_length;
}
}
else {
my @RW = split(/ /,$REFERENCE[$i]);
$ref_length = scalar(@RW);
}
if (scalar(@WORD) < $ref_length && scalar(@WORD)>0) {
$bleu *= exp(1-$ref_length/scalar(@WORD));
}
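# Restating the visible part of the score above (not from the diff): with c the
# candidate length and r now the length of the shortest reference, the sentence
# is annotated with a BLEU-style score
#   score = BP * (p_1 * p_2 * p_3 * p_4)^(1/4),   BP = exp(1 - r/c) if c < r, else 1,
# where p_n are the n-gram precisions accumulated in the loop above.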
@@ -144,7 +166,15 @@ sub bleu_annotation {
print OUT " " if $i;
print OUT "$WORD[$i]|$MATCH[$i]";
}
print OUT "\t".$REFERENCE[$i]."\n";
if (ref($REFERENCE[$i]) eq 'ARRAY') {
foreach my $ref (@{$REFERENCE[$i]}) {
print OUT "\t".$ref;
}
}
else {
print OUT "\t".$REFERENCE[$i]
}
print OUT "\n";
}
close(OUT);
}
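# Illustrative output line written above (tokens and counts invented): every
# system word is annotated as word|match, and each reference now follows as its
# own tab-separated field, e.g.
#   the|1 house|1 is|1 tiny|0 <tab> the house is small <tab> the home is tiny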
@@ -152,8 +182,8 @@ sub bleu_annotation {
sub add_match {
my ($system,$reference,$CORRECT,$TOTAL) = @_;
my (%SYS_NGRAM,%REF_NGRAM);
&extract_n_grams( $system, \%SYS_NGRAM );
&extract_n_grams( $reference, \%REF_NGRAM );
&extract_n_grams_arrayopt( $system, \%SYS_NGRAM, "min" );
&extract_n_grams_arrayopt( $reference, \%REF_NGRAM, "max" );
foreach my $length (keys %SYS_NGRAM) {
foreach my $ngram (keys %{$SYS_NGRAM{$length}}) {
my $sys_count = $SYS_NGRAM{$length}{$ngram};
@@ -176,7 +206,8 @@ sub ttable_coverage {
open(TTABLE,$ttable) or die "Can't read ttable $ttable";
}
open(REPORT,">$dir/ttable-coverage-by-phrase");
my ($last_in,$last_size,$entropy,$size) = ("",0,0);
my ($last_in,$last_size,$size) = ("",0);
my @DISTRIBUTION = ();
while(<TTABLE>) {
chop;
my ($in,$out,$scores) = split(/ \|\|\| /);
@@ -185,30 +216,44 @@ sub ttable_coverage {
next unless defined($INPUT_PHRASE{$size}{$in});
$TTABLE_COVERED{$size}{$in}++;
my @SCORE = split(/ /,$scores);
my $p = $SCORE[2]; # forward probability
if ($in ne $last_in) {
if ($last_in ne "") {
my $entropy = &compute_entropy(@DISTRIBUTION);
printf REPORT "%s\t%d\t%.5f\n",$last_in,$TTABLE_COVERED{$last_size}{$last_in},$entropy;
$TTABLE_ENTROPY{$last_size}{$last_in} = $entropy;
$entropy = 0;
@DISTRIBUTION = ();
}
$last_in = $in;
$last_size = $size;
}
# TODO: normalized entropy?
$entropy -= $p*log($p)/log(2);
push @DISTRIBUTION, $SCORE[2]; # forward probability
}
my $entropy = &compute_entropy(@DISTRIBUTION);
printf REPORT "%s\t%d\t%.5f\n",$last_in,$TTABLE_COVERED{$last_size}{$last_in},$entropy;
$TTABLE_ENTROPY{$last_size}{$last_in} = $entropy;
close(REPORT);
close(TTABLE);
&additional_coverage_reports("ttable",\%TTABLE_COVERED);
}
sub compute_entropy {
my $z = 0; # normalization
foreach my $p (@_) {
$z += $p;
}
my $entropy = 0;
foreach my $p (@_) {
$entropy -= ($p/$z)*log($p/$z)/log(2);
}
return $entropy;
}
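# In formula form (a restatement of the helper above): with p_i the forward
# probabilities gathered in @DISTRIBUTION and Z = \sum_i p_i, it returns
#   H = - \sum_i (p_i / Z) \log_2 (p_i / Z),
# the entropy of the normalized translation distribution of one source phrase.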
sub corpus_coverage {
# compute how often input phrases occur in the corpus
open(CORPUS,$corpus) or die "Can't read corpus $corpus";
while(<CORPUS>) {
s/\|\S+//g;
my @WORD = split;
my $sentence_length = scalar @WORD;
for(my $start=0;$start < $sentence_length;$start++) {
@@ -269,6 +314,7 @@ sub input_annotation {
open(INPUT,$input) or die "Can't read input $input";
while(<INPUT>) {
chop;
s/\|\S+//g;
print OUT $_."\t";
my @WORD = split;
my $sentence_length = scalar @WORD;
@@ -287,7 +333,7 @@ sub input_annotation {
$corpus_covered = 0 unless defined($corpus_covered);
if (defined($TTABLE_COVERED{$length}{$phrase})) {
printf OUT "%d-%d:%d:%d:%.5f ",$start,$start+$length-1,$corpus_covered,$ttable_covered,$ttable_entropy
printf OUT "%d-%d:%d:%d:%.5f ",$start,$start+$length-1,$corpus_covered,$ttable_covered,$ttable_entropy;
}
}
}
@@ -297,8 +343,49 @@ sub input_annotation {
close(OUT);
}
sub extract_n_grams_arrayopt {
my ($sentence,$NGRAM,$minmax) = @_;
if (ref($sentence) eq 'ARRAY') {
my %MINMAX_NGRAM;
&extract_n_grams($$sentence[0],\%MINMAX_NGRAM);
for(my $i=1;$i<scalar(@{$sentence});$i++) {
my %SET_NGRAM;
&extract_n_grams($$sentence[$i],\%SET_NGRAM);
for(my $length=1;$length<=$MAX_LENGTH;$length++) {
if ($minmax eq "min") {
foreach my $ngram (keys %{$MINMAX_NGRAM{$length}}) {
if (!defined($SET_NGRAM{$length}{$ngram})) {
delete( $MINMAX_NGRAM{$length}{$ngram} );
}
elsif($MINMAX_NGRAM{$length}{$ngram} > $SET_NGRAM{$length}{$ngram}) {
$MINMAX_NGRAM{$length}{$ngram} = $SET_NGRAM{$length}{$ngram};
}
}
}
else {
foreach my $ngram (keys %{$SET_NGRAM{$length}}) {
if (!defined($MINMAX_NGRAM{$length}{$ngram}) ||
$SET_NGRAM{$length}{$ngram} > $MINMAX_NGRAM{$length}{$ngram}) {
$MINMAX_NGRAM{$length}{$ngram} = $SET_NGRAM{$length}{$ngram};
}
}
}
}
}
for(my $length=1;$length<=$MAX_LENGTH;$length++) {
foreach my $ngram (keys %{$MINMAX_NGRAM{$length}}) {
$$NGRAM{$length}{$ngram} += $MINMAX_NGRAM{$length}{$ngram};
}
}
}
else {
&extract_n_grams($sentence,$NGRAM);
}
}
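# Usage sketch (sentences invented, not part of the script): with "max" the count
# kept for an n-gram is the highest count found in any one of the references,
# which is what reference-side clipping needs; with "min" only n-grams present in
# every set survive, each with its lowest count.
#   my %NGRAM;
#   &extract_n_grams_arrayopt([ "the the cat", "the cat" ], \%NGRAM, "max");
#   # $NGRAM{1}{"the"} == 2, $NGRAM{1}{"cat"} == 1, $NGRAM{2}{"the cat"} == 1, ...
# A plain string argument simply falls through to extract_n_grams.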
sub extract_n_grams {
my ($sentence,$NGRAM) = @_;
$sentence =~ s/\s+/ /g;
$sentence =~ s/^ //;
$sentence =~ s/ $//;

View File

@@ -94,7 +94,7 @@ while(<INI>) {
}
my ($phrase_table_impl,$source_factor,$t,$w,$file) = ($1,$2,$3,$4,$5);
if ($phrase_table_impl ne "0" && $phrase_table_impl ne "6") {
if (($phrase_table_impl ne "0" && $phrase_table_impl ne "6") || $file =~ /glue-grammar/) {
# Only Memory ("0") and NewFormat ("6") can be filtered.
print INI_OUT $table_spec;
next;
@@ -110,7 +110,12 @@ while(<INI>) {
$cnt ++ while (defined $new_name_used{"$new_name.$cnt"});
$new_name .= ".$cnt";
$new_name_used{$new_name} = 1;
print INI_OUT "$phrase_table_impl $source_factor $t $w $new_name\n";
if ($binarizer && $phrase_table_impl == 6) {
print INI_OUT "2 $source_factor $t $w $new_name.bin\n";
}
else {
print INI_OUT "$phrase_table_impl $source_factor $t $w $new_name\n";
}
push @TABLE_NEW_NAME,$new_name;
$CONSIDER_FACTORS{$source_factor} = 1;
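# Illustrative effect of the branch above (path and weight count invented): with a
# binarizer configured, a filtered NewFormat rule table line such as
#   6 0 0 5 /working-dir/filtered/rule-table.0-0
# is written to the filtered moses.ini as table type 2 pointing at the binarized file,
#   2 0 0 5 /working-dir/filtered/rule-table.0-0.bin
# matching the $new_file.bin created by the hierarchical binarizer call further below.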
@@ -240,14 +245,24 @@ for(my $i=0;$i<=$#TABLE;$i++) {
}
if(defined($binarizer)) {
print STDERR "binarizing...";
# translation model
if ($KNOWN_TTABLE{$i}) {
print STDERR "binarizing...";
my $cmd = "cat $new_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
print STDERR $cmd."\n";
print STDERR `$cmd`;
# ... hierarchical translation model
if ($opt_hierarchical) {
my $cmd = "$binarizer $new_file $new_file.bin";
print STDERR $cmd."\n";
print STDERR `$cmd`;
}
# ... phrase translation model
else {
my $cmd = "cat $new_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
print STDERR $cmd."\n";
print STDERR `$cmd`;
}
}
# reordering model
else {
print STDERR "binarizing...";
my $lexbin = $binarizer; $lexbin =~ s/PhraseTable/LexicalTable/;
my $cmd = "$lexbin -in $new_file -out $new_file";
print STDERR $cmd."\n";

View File

@@ -29,16 +29,14 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_HMM_ALIGN, $_CONFIG,
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_EXTRACT_OPTIONS,$_SCORE_OPTIONS,
$_PHRASE_WORD_ALIGNMENT,
$_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS);
my $debug = 0; # debug this script, do not delete any files in debug mode
# the following line is set at installation time by 'make release'. BEWARE!
my $BINDIR="";
my $force_factored_filenames = 0;
my $BINDIR="/home/pkoehn/statmt/bin";
$_HELP = 1
unless &GetOptions('root-dir=s' => \$_ROOT_DIR,
@@ -62,6 +60,7 @@ $_HELP = 1
'first-step=i' => \$_FIRST_STEP,
'last-step=i' => \$_LAST_STEP,
'giza-option=s' => \$_GIZA_OPTION,
'giza-extension=s' => \$_GIZA_EXTENSION,
'parallel' => \$_PARALLEL,
'lm=s' => \@_LM,
'help' => \$_HELP,
@@ -103,7 +102,7 @@ $_HELP = 1
'max-lexical-reordering' => \$_MAX_LEXICAL_REORDERING,
'do-steps=s' => \$_DO_STEPS,
'memscore:s' => \$_MEMSCORE,
'force-factored-filenames' => \$force_factored_filenames,
'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
);
if ($_HELP) {
@@ -436,7 +435,7 @@ for my $mtype ( keys %REORDERING_MODEL_TYPES) {
}
### Factored translation models
my $___NOT_FACTORED = !$force_factored_filenames;
my $___NOT_FACTORED = !$_FORCE_FACTORED_FILENAMES;
my $___ALIGNMENT_FACTORS = "0-0";
$___ALIGNMENT_FACTORS = $_ALIGNMENT_FACTORS if defined($_ALIGNMENT_FACTORS);
die("ERROR: format for alignment factors is \"0-0\" or \"0,1,2-0,1\", you provided $___ALIGNMENT_FACTORS\n") if $___ALIGNMENT_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*$/;
@@ -1602,8 +1601,8 @@ sub create_ini {
foreach my $model (@REORDERING_MODELS) {
$weight_d_count += $model->{"numfeatures"};
my $table_file = "$___MODEL_DIR/reordering-table";
$table_file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
$table_file .= ".$factor" unless $___NOT_FACTORED;
$table_file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
$table_file .= ".";
$table_file .= $model->{"filename"};
$table_file .= ".gz";