integrated sparse lexical features into experiment.perl, needs some more testing.

This commit is contained in:
phikoehn 2012-07-03 06:20:09 +01:00
parent ff79f9f054
commit c8b2e89573
5 changed files with 132 additions and 11 deletions

View File

@ -395,8 +395,14 @@ build-generation-custom
rerun-on-change: generation-factors generation-type training-options script generation-corpus
ignore-unless: AND generation-factors generation-corpus
default-name: model/generation-table
build-sparse-lexical
in: corpus
out: sparse-lexical
ignore-unless: sparse-lexical-features
default-name: model/most-frequent-words
template: $moses-script-dir/ems/support/build-sparse-lexical-features.perl IN $input-extension $output-extension OUT "$sparse-lexical-features"
create-config
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm LM:binlm
in: reordering-table phrase-translation-table generation-table sparse-lexical INTERPOLATED-LM:binlm LM:binlm
out: config
ignore-if: use-hiero
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini

View File

@ -1778,7 +1778,7 @@ sub define_training_build_custom_generation {
sub define_training_create_config {
my ($step_id) = @_;
my ($config,$reordering_table,$phrase_translation_table,$generation_table,@LM)
my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,@LM)
= &get_output_and_input($step_id);
my $cmd = &get_training_setting(9);
@ -1805,8 +1805,7 @@ sub define_training_create_config {
my $ptCmd = $phrase_translation_table;
$ptCmd .= ":$ptImpl" if $ptImpl>0;
$ptCmd .= ":$numFF" if defined($numFF);
$cmd .= "$ptCmd ";
$cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $ptCmd);
$cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table) if $reordering_table;
$cmd .= &get_table_name_settings("generation-factors","generation-table",$generation_table) if $generation_table;
$cmd .= "-config $config ";
@ -1904,6 +1903,9 @@ sub define_training_create_config {
my $additional_ini = &get("TRAINING:additional-ini");
$cmd .= "-additional-ini '$additional_ini' " if defined($additional_ini);
# sparse lexical features provide additional content for config file
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if defined($sparse_lexical_features);
&create_step($step_id,$cmd);
}
@ -2236,8 +2238,7 @@ sub define_tuningevaluation_filter {
my $ptCmd = $phrase_translation_table;
$ptCmd .= ":$ptImpl" if $ptImpl>0;
$ptCmd .= ":$numFF" if defined($numFF);
$cmd .= "$ptCmd ";
$cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $ptCmd);
$cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table)
if $reordering_table;
# additional settings for hierarchical models

View File

@ -0,0 +1,95 @@
#!/usr/bin/perl -w
use strict;
# Build necessary files for sparse lexical features
# * target word insertion
# * source word deletion
# * word translation
# Command line arguments:
#   corpus           - stem of the parallel corpus (files are "$corpus.$extension")
#   input-extension  - extension of the source side of the corpus
#   output-extension - extension of the target side of the corpus
#   outfile-prefix   - prefix of all generated files (word lists and "$outfile_prefix.ini")
#   specification    - comma-separated list of feature specifications:
#                        target-word-insertion top N
#                        source-word-deletion top N
#                        word-translation top N-source N-target
my ($corpus,$input_extension,$output_extension,$outfile_prefix,$specification) = @ARGV;
die("ERROR: syntax: $0 corpus input-extension output-extension outfile-prefix specification\n")
  unless defined($specification);

my $ini = "";     # feature sections that go into the generated config snippet
my $report = "";  # short feature names listed under [report-sparse-features]
my %ALREADY;      # word lists already built by create_top_words, keyed by "extension,count"

foreach my $feature_spec (split(/,\s*/,$specification)) {
  # split(' ') ignores leading whitespace, so " word-translation top 50 50" is accepted
  my ($type,@PARAM) = split(' ',$feature_spec);
  $type = "" unless defined($type); # empty specification: report as unknown type below

  if ($type eq 'target-word-insertion') {
    die("ERROR: Unknown parameter specification in '$feature_spec'\n")
      unless &has_top_counts(1,@PARAM);
    my $file = &create_top_words($output_extension, $PARAM[1]);
    $ini .= "[target-word-insertion-feature]\n0 $file\n\n";
    $report .= "twi\n";
  }
  elsif ($type eq 'source-word-deletion') {
    die("ERROR: Unknown parameter specification in '$feature_spec'\n")
      unless &has_top_counts(1,@PARAM);
    my $file = &create_top_words($input_extension, $PARAM[1]);
    $ini .= "[source-word-deletion-feature]\n0 $file\n\n";
    $report .= "swd\n";
  }
  elsif ($type eq 'word-translation') {
    die("ERROR: Unknown parameter specification in '$feature_spec'\n")
      unless &has_top_counts(2,@PARAM);
    my $file_in  = &create_top_words($input_extension,  $PARAM[1]);
    my $file_out = &create_top_words($output_extension, $PARAM[2]);
    $ini .= "[word-translation-feature]\n0 0 $file_in $file_out\n\n";
    $report .= "wt\n";
  }
  else {
    die("ERROR: Unknown feature type '$type' in specification '$feature_spec'\nfull spec: '$specification'\n");
  }
}

# write the config snippet; fail loudly instead of silently producing nothing
open(my $ini_fh,'>',"$outfile_prefix.ini")
  || die("ERROR: could not write config file '$outfile_prefix.ini': $!\n");
print $ini_fh $ini;
print $ini_fh "\n[report-sparse-features]\n$report\n";
print $ini_fh "\n[use-alignment-info]\ntrue\n\n";
close($ini_fh)
  || die("ERROR: could not close config file '$outfile_prefix.ini': $!\n");

# Check the parameters of a feature specification: the first parameter has to
# be the keyword 'top', followed by at least $needed non-negative integers.
# Returns 1 if the parameters are valid, 0 otherwise (including missing ones).
sub has_top_counts {
  my ($needed,@PARAM) = @_;
  return 0 unless defined($PARAM[0]) && $PARAM[0] eq 'top';
  foreach my $i (1..$needed) {
    return 0 unless defined($PARAM[$i]) && $PARAM[$i] =~ /^\d+$/;
  }
  return 1;
}
# Build the list of the most frequent words of one side of the corpus.
#
# Parameters:
#   $extension - extension of the corpus side to read ("$corpus.$extension")
#   $count     - maximum number of words to write
# Returns the name of the word list file, "$outfile_prefix.$extension.top$count",
# which contains one word per line, most frequent first.
# Dies if the corpus cannot be read or the word list cannot be written.
# Uses the file-level variables $corpus, $outfile_prefix and %ALREADY.
sub create_top_words {
  my ($extension, $count) = @_;
  my $file = "$outfile_prefix.$extension.top$count";
  # several features may ask for the same list: build it only once
  return $file if defined($ALREADY{"$extension,$count"});
  $ALREADY{"$extension,$count"}++;

  # get counts
  my %COUNT;
  open(my $corpus_fh,'<',"$corpus.$extension")
    || die("ERROR: could not open corpus file '$corpus.$extension': $!\n");
  while(my $line = <$corpus_fh>) {
    # chomp, not chop: a final line without newline must keep its last character
    chomp($line);
    foreach my $word (split(' ',$line)) {
      $word =~ s/\|.+//; # only surface factor at this point
      $COUNT{$word}++ unless $word eq "";
    }
  }
  close($corpus_fh);

  # sort by frequency (highest first); ties are broken by reverse string
  # order of the words, so that the output is deterministic
  my @SORTED =
    sort { $COUNT{$b} <=> $COUNT{$a} || $b cmp $a }
    grep { $COUNT{$_} > 3   # avoid large tail
           && !/:/ }        # avoid colon bug
    keys %COUNT;

  # write top n to file
  splice(@SORTED,$count) if scalar(@SORTED) > $count;
  open(my $top_fh,'>',$file)
    || die("ERROR: could not write word list '$file': $!\n");
  foreach my $word (@SORTED) {
    print $top_fh "$word\n";
  }
  close($top_fh)
    || die("ERROR: could not close word list '$file': $!\n");
  return $file;
}

View File

@ -10,17 +10,27 @@ my ($weight_file) = @ARGV;
my %WEIGHT;
my $current_weight = "";
my $weights_file_spec = "";
my $weights_file_flag = 0;
open(WEIGHT,$weight_file)
|| die("ERROR: could not open weight file: $weight_file");
while(<WEIGHT>) {
if (/^\[weight\-(\S+)\]/) {
if (/^\[weight-file\]/) {
$weights_file_spec = "\n".$_;
$weights_file_flag = 1;
}
elsif (/^\[weight\-(\S+)\]/) {
$current_weight = $1;
}
elsif ($current_weight && /^(([\-\d\.]+)([Ee][+-]?[\d]+)?)$/) {
push @{$WEIGHT{$current_weight}},$1;
}
elsif ($weights_file_flag && !/^\[/ && !/^\s*$/) {
$weights_file_spec .= $_;
}
elsif (/^\[/) {
$current_weight = "";
$current_weight = "";
$weights_file_flag = 0;
}
}
close(WEIGHT);
@ -67,3 +77,6 @@ foreach my $weight (keys %WEIGHT) {
}
}
}
print $weights_file_spec;

View File

@ -36,7 +36,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
$_ADDITIONAL_INI,
$_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
$_DICTIONARY, $_EPPEX, $IGNORE);
my $_CORES = 1;
@ -121,8 +121,9 @@ $_HELP = 1
'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
'dictionary=s' => \$_DICTIONARY,
'eppex:s' => \$_EPPEX,
'additional-ini=s' => \$_ADDITIONAL_INI,
'cores=i' => \$_CORES
'additional-ini=s' => \$_ADDITIONAL_INI,
'additional-ini-file=s' => \$_ADDITIONAL_INI_FILE,
'cores=i' => \$_CORES
);
if ($_HELP) {
@ -1998,10 +1999,15 @@ sub create_ini {
print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
}
# get additional content for config file from switch or file
if ($_ADDITIONAL_INI) {
print INI "\n# additional settings\n\n";
foreach (split(/<br>/i,$_ADDITIONAL_INI)) { print INI $_."\n"; }
}
if ($_ADDITIONAL_INI_FILE) {
print INI "\n# additional settings\n\n";
print INI `cat $_ADDITIONAL_INI_FILE`;
}
close(INI);
}