mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 07:42:21 +03:00
integrated sparse lexical features into experiment.perl, needs some more testing.
This commit is contained in:
parent
ff79f9f054
commit
c8b2e89573
@ -395,8 +395,14 @@ build-generation-custom
|
||||
rerun-on-change: generation-factors generation-type training-options script generation-corpus
|
||||
ignore-unless: AND generation-factors generation-corpus
|
||||
default-name: model/generation-table
|
||||
build-sparse-lexical
|
||||
in: corpus
|
||||
out: sparse-lexical
|
||||
ignore-unless: sparse-lexical-features
|
||||
default-name: model/most-frequent-words
|
||||
template: $moses-script-dir/ems/support/build-sparse-lexical-features.perl IN $input-extension $output-extension OUT "$sparse-lexical-features"
|
||||
create-config
|
||||
in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm LM:binlm
|
||||
in: reordering-table phrase-translation-table generation-table sparse-lexical INTERPOLATED-LM:binlm LM:binlm
|
||||
out: config
|
||||
ignore-if: use-hiero
|
||||
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini
|
||||
|
@ -1778,7 +1778,7 @@ sub define_training_build_custom_generation {
|
||||
sub define_training_create_config {
|
||||
my ($step_id) = @_;
|
||||
|
||||
my ($config,$reordering_table,$phrase_translation_table,$generation_table,@LM)
|
||||
my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,@LM)
|
||||
= &get_output_and_input($step_id);
|
||||
|
||||
my $cmd = &get_training_setting(9);
|
||||
@ -1805,8 +1805,7 @@ sub define_training_create_config {
|
||||
my $ptCmd = $phrase_translation_table;
|
||||
$ptCmd .= ":$ptImpl" if $ptImpl>0;
|
||||
$ptCmd .= ":$numFF" if defined($numFF);
|
||||
$cmd .= "$ptCmd ";
|
||||
|
||||
$cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $ptCmd);
|
||||
$cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table) if $reordering_table;
|
||||
$cmd .= &get_table_name_settings("generation-factors","generation-table",$generation_table) if $generation_table;
|
||||
$cmd .= "-config $config ";
|
||||
@ -1904,6 +1903,9 @@ sub define_training_create_config {
|
||||
my $additional_ini = &get("TRAINING:additional-ini");
|
||||
$cmd .= "-additional-ini '$additional_ini' " if defined($additional_ini);
|
||||
|
||||
# sparse lexical features provide additional content for config file
|
||||
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if defined($sparse_lexical_features);
|
||||
|
||||
&create_step($step_id,$cmd);
|
||||
}
|
||||
|
||||
@ -2236,8 +2238,7 @@ sub define_tuningevaluation_filter {
|
||||
my $ptCmd = $phrase_translation_table;
|
||||
$ptCmd .= ":$ptImpl" if $ptImpl>0;
|
||||
$ptCmd .= ":$numFF" if defined($numFF);
|
||||
$cmd .= "$ptCmd ";
|
||||
|
||||
$cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $ptCmd);
|
||||
$cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table)
|
||||
if $reordering_table;
|
||||
# additional settings for hierarchical models
|
||||
|
95
scripts/ems/support/build-sparse-lexical-features.perl
Executable file
95
scripts/ems/support/build-sparse-lexical-features.perl
Executable file
@ -0,0 +1,95 @@
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
|
||||
# Build necessary files for sparse lexical features
|
||||
# * target word insertion
|
||||
# * source word deletion
|
||||
# * word translation
|
||||
|
||||
# Command-line arguments, in this order:
#   corpus           - corpus file stem; create_top_words reads "$corpus.$extension"
#   input_extension  - extension of the source-language side of the corpus
#   output_extension - extension of the target-language side of the corpus
#   outfile_prefix   - prefix of every file written here
#                      ("$outfile_prefix.ini" plus the top-word lists)
#   specification    - comma-separated feature specifications, each one of
#                        target-word-insertion top N
#                        source-word-deletion top N
#                        word-translation top N-SOURCE N-TARGET
my ($corpus,$input_extension,$output_extension,$outfile_prefix,$specification) = @ARGV;
die("ERROR: syntax: build-sparse-lexical-features.perl CORPUS INPUT-EXTENSION OUTPUT-EXTENSION OUTFILE-PREFIX SPECIFICATION\n")
  unless defined($specification);

my $ini = "";     # feature sections collected for the .ini file
my $report = "";  # one short feature name per line for [report-sparse-features]
my %ALREADY;      # "extension,count" keys of word lists create_top_words has built

foreach my $feature_spec (split(/,\s*/,$specification)) {
  my @SPEC = split(/\s+/,$feature_spec);

  # Missing fields are mapped to "" so that a malformed specification reaches
  # the error messages below instead of raising "uninitialized" warnings.
  my ($type,$mode,$size1,$size2) = map { defined($_) ? $_ : "" } @SPEC[0..3];
  my $bad_parameters = "ERROR: Unknown parameter specification in '$feature_spec'\n";

  if ($type eq 'target-word-insertion') {
    die($bad_parameters) unless $mode eq 'top' && $size1 =~ /^\d+$/;
    my $file = create_top_words($output_extension, $size1);
    $ini .= "[target-word-insertion-feature]\n0 $file\n\n";
    $report .= "twi\n";
  }
  elsif ($type eq 'source-word-deletion') {
    die($bad_parameters) unless $mode eq 'top' && $size1 =~ /^\d+$/;
    my $file = create_top_words($input_extension, $size1);
    $ini .= "[source-word-deletion-feature]\n0 $file\n\n";
    $report .= "swd\n";
  }
  elsif ($type eq 'word-translation') {
    die($bad_parameters)
      unless $mode eq 'top' && $size1 =~ /^\d+$/ && $size2 =~ /^\d+$/;
    my $file_in  = create_top_words($input_extension, $size1);
    my $file_out = create_top_words($output_extension, $size2);
    $ini .= "[word-translation-feature]\n0 0 $file_in $file_out\n\n";
    $report .= "wt\n";
  }
  else {
    die("ERROR: Unknown feature type '$type' in specification '$feature_spec'\nfull spec: '$specification'\n");
  }
}

# Write the configuration fragment; fail loudly if it cannot be written,
# since later steps expect "$outfile_prefix.ini" to exist.
open(my $ini_fh, '>', "$outfile_prefix.ini")
  or die("ERROR: could not write to '$outfile_prefix.ini': $!\n");
print $ini_fh $ini;
print $ini_fh "\n[report-sparse-features]\n$report\n";
print $ini_fh "\n[use-alignment-info]\ntrue\n\n";
close($ini_fh)
  or die("ERROR: could not close '$outfile_prefix.ini': $!\n");
|
||||
|
||||
# Write the most frequent words of one side of the corpus to a file.
#
# Parameters:
#   $extension - corpus side to read; the input file is "$corpus.$extension"
#   $count     - maximum number of words to write
# Returns the name of the word list, "$outfile_prefix.$extension.top$count".
# Dies if the corpus cannot be read or the word list cannot be written.
#
# Uses the file-level variables $corpus, $outfile_prefix and %ALREADY.
# A list already built for the same extension and count is not rebuilt.
sub create_top_words {
  my ($extension, $count) = @_;
  my $file = "$outfile_prefix.$extension.top$count";
  return $file if defined($ALREADY{"$extension,$count"});
  $ALREADY{"$extension,$count"}++;

  # get counts
  my %COUNT;
  open(my $corpus_fh, '<', "$corpus.$extension")
    or die("ERROR: could not open corpus file '$corpus.$extension': $!\n");
  while (my $line = <$corpus_fh>) {
    # split(' ') discards leading whitespace and the line terminator, so no
    # chop/chomp is needed; chop would eat the final character of a last
    # line that lacks a newline.
    foreach my $word (split(' ', $line)) {
      $word =~ s/\|.+//; # only surface factor at this point
      $COUNT{$word}++ unless $word eq "";
    }
  }
  close($corpus_fh);

  # sort: most frequent first, ties in descending string order
  # (numeric comparison stays correct for counts of any size)
  my @SORTED =
    sort { $COUNT{$b} <=> $COUNT{$a} || $b cmp $a }
    grep { $COUNT{$_} > 3   # avoid large tail
           && !/:/ }        # avoid colon bug
    keys %COUNT;

  # write top n to file
  my $limit = $count < scalar(@SORTED) ? $count : scalar(@SORTED);
  open(my $top_fh, '>', $file)
    or die("ERROR: could not write to '$file': $!\n");
  foreach my $word (@SORTED[0 .. $limit-1]) {
    print $top_fh "$word\n";
  }
  close($top_fh)
    or die("ERROR: could not close '$file': $!\n");

  return $file;
}
|
@ -10,17 +10,27 @@ my ($weight_file) = @ARGV;
|
||||
|
||||
my %WEIGHT;
|
||||
my $current_weight = "";
|
||||
my $weights_file_spec = "";
|
||||
my $weights_file_flag = 0;
|
||||
open(WEIGHT,$weight_file)
|
||||
|| die("ERROR: could not open weight file: $weight_file");
|
||||
while(<WEIGHT>) {
|
||||
if (/^\[weight\-(\S+)\]/) {
|
||||
if (/^\[weight-file\]/) {
|
||||
$weights_file_spec = "\n".$_;
|
||||
$weights_file_flag = 1;
|
||||
}
|
||||
elsif (/^\[weight\-(\S+)\]/) {
|
||||
$current_weight = $1;
|
||||
}
|
||||
elsif ($current_weight && /^(([\-\d\.]+)([Ee][+-]?[\d]+)?)$/) {
|
||||
push @{$WEIGHT{$current_weight}},$1;
|
||||
}
|
||||
elsif ($weights_file_flag && !/^\[/ && !/^\s*$/) {
|
||||
$weights_file_spec .= $_;
|
||||
}
|
||||
elsif (/^\[/) {
|
||||
$current_weight = "";
|
||||
$current_weight = "";
|
||||
$weights_file_flag = 0;
|
||||
}
|
||||
}
|
||||
close(WEIGHT);
|
||||
@ -67,3 +77,6 @@ foreach my $weight (keys %WEIGHT) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
print $weights_file_spec;
|
||||
|
||||
|
@ -36,7 +36,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
|
||||
$_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
|
||||
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
|
||||
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
|
||||
$_ADDITIONAL_INI,
|
||||
$_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
|
||||
$_DICTIONARY, $_EPPEX, $IGNORE);
|
||||
my $_CORES = 1;
|
||||
|
||||
@ -121,8 +121,9 @@ $_HELP = 1
|
||||
'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
|
||||
'dictionary=s' => \$_DICTIONARY,
|
||||
'eppex:s' => \$_EPPEX,
|
||||
'additional-ini=s' => \$_ADDITIONAL_INI,
|
||||
'cores=i' => \$_CORES
|
||||
'additional-ini=s' => \$_ADDITIONAL_INI,
|
||||
'additional-ini-file=s' => \$_ADDITIONAL_INI_FILE,
|
||||
'cores=i' => \$_CORES
|
||||
);
|
||||
|
||||
if ($_HELP) {
|
||||
@ -1998,10 +1999,15 @@ sub create_ini {
|
||||
print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
|
||||
}
|
||||
|
||||
# get additional content for config file from switch or file
|
||||
if ($_ADDITIONAL_INI) {
|
||||
print INI "\n# additional settings\n\n";
|
||||
foreach (split(/<br>/i,$_ADDITIONAL_INI)) { print INI $_."\n"; }
|
||||
}
|
||||
if ($_ADDITIONAL_INI_FILE) {
|
||||
print INI "\n# additional settings\n\n";
|
||||
print INI `cat $_ADDITIONAL_INI_FILE`;
|
||||
}
|
||||
|
||||
close(INI);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user