mert-moses.pl now supports multiple input weights for lattices and confusion networks, using the --inputweights argument.

I'll leave it to someone who knows mert-moses-new.pl better to make the changes there.

"zcat" is now abstracted as a $ZCAT variable in these files and is set to "gzip -cd", which should work on more platforms (notably on the Mac, where zcat fails unless the archive name ends in ".Z").

 


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2082 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
jdschroeder 2009-02-05 17:39:36 +00:00
parent e53ab5da6d
commit cc95706045
4 changed files with 33 additions and 20 deletions

View File

@ -10,6 +10,10 @@
use strict;
my $MAX_LENGTH = 10;
# utilities
my $ZCAT = "gzip -cd";
# consider phrases in input up to this length
# in other words, all phrase-tables will be truncated at least to 10 words per
# phrase
@ -149,9 +153,9 @@ for(my $i=0;$i<=$#TABLE;$i++) {
my $openstring;
if ($file !~ /\.gz$/ && -e "$file.gz") {
$openstring = "zcat $file.gz |";
$openstring = "$ZCAT $file.gz |";
} elsif ($file =~ /\.gz$/) {
$openstring = "zcat $file |";
$openstring = "$ZCAT $file |";
} else {
$openstring = "< $file";
}

View File

@ -50,6 +50,9 @@ if (defined($ENV{"SCRIPTS_ROOTDIR"})) {
$ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR;
}
# utilities
my $ZCAT = "gzip -cd";
# for each _d_istortion, _l_anguage _m_odel, _t_ranslation _m_odel and _w_ord penalty, there is a list
# of [ default value, lower bound, upper bound ]-triples. In most cases, only one triple is used,
# but the translation model has currently 5 features
@ -135,6 +138,8 @@ my $___NONORM = 0;
# set 0 if input type is text, 1 if input type is confusion network, 2 if input type is lattice
my $___INPUTTYPE = 0;
# number of input weights for confusion networks (CNs) and lattices: these have no direct counter in the ini file, so the count is specified here
my $___INPUTWEIGHTS = 1;
# set 1 if using with async decoder
my $___ASYNC = 0;
@ -166,6 +171,7 @@ GetOptions(
"working-dir=s" => \$___WORKING_DIR,
"input=s" => \$___DEV_F,
"inputtype=i" => \$___INPUTTYPE,
"inputweights=i" => \$___INPUTWEIGHTS,
"refs=s" => \$___DEV_E,
"decoder=s" => \$___DECODER,
"config=s" => \$___CONFIG,
@ -250,6 +256,8 @@ Options:
--scorenbestcmd=STRING ... path to score-nbest.py
--old-sge ... passed to moses-parallel, assume Sun Grid Engine < 6.0
--inputtype=[0|1|2] ... Handle different input types (0 for text, 1 for confusion network, 2 for lattices, default is 0)
--inputweights=N ... For confusion networks and lattices, number of weights to optimize for weight-i
(must supply -link-param-count N to decoder-flags if N != 1 for decoder to deal with this correctly)
--no-filter-phrase-table ... disallow filtering of phrase tables
(useful if binary phrase tables are available)
--efficient_scorenbest_flag ... activate a time-efficient scoring of nbest lists
@ -261,26 +269,23 @@ Options:
exit 1;
}
# update variables if input is confusion network
if ($___INPUTTYPE == 1)
# update default variables if input is confusion network or lattice
if ($___INPUTTYPE == 1 || $___INPUTTYPE == 2)
{
$ABBR_FULL_MAP = "$ABBR_FULL_MAP I=weight-i";
%ABBR2FULL = map {split/=/,$_,2} split /\s+/, $ABBR_FULL_MAP;
%FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} split /\s+/, $ABBR_FULL_MAP;
push @{$default_triples -> {"I"}}, [ 1.0, 0.0, 2.0 ];
#$extra_lambdas_for_model -> {"I"} = 1; #Confusion network posterior
my @my_array;
for(my $i=0 ; $i < $___INPUTWEIGHTS ; $i++)
{
push @my_array, [ 1.0, 0.0, 2.0 ];
}
push @{$default_triples -> {"I"}}, @my_array;
}
# update variables if input is lattice - handle like conf. net for now
if ($___INPUTTYPE == 2)
{
$ABBR_FULL_MAP = "$ABBR_FULL_MAP I=weight-i";
%ABBR2FULL = map {split/=/,$_,2} split /\s+/, $ABBR_FULL_MAP;
%FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} split /\s+/, $ABBR_FULL_MAP;
push @{$default_triples -> {"I"}}, [ 1.0, 0.0, 2.0 ];
}
# Check validity of input parameters and set defaults if needed
@ -558,7 +563,7 @@ while(1) {
my $aggregate_nbl_size=0;
if (defined $obo_scorenbest) {
# Faster scoring method, never rescore previous iterations
my $cmd = "zcat run$run.best*.out.gz | $obo_scorenbest ".join(" ", @references);
my $cmd = "$ZCAT run$run.best*.out.gz | $obo_scorenbest ".join(" ", @references);
my $targetfile = "run$run.feats";
if (defined $___JOBS) {
safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=$targetfile -stderr=run$run.scorenbest.err")

View File

@ -11,6 +11,10 @@ use Getopt::Long;
use IO::File;
use File::Basename;
# utilities
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
@ -109,9 +113,9 @@ sub my_open {
my $ft = `file $f`;
# file might not recognize some files!
if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) {
$opn = "zcat $f |";
$opn = "$ZCAT $f |";
} elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) {
$opn = "bzcat $f |";
$opn = "$BZCAT $f |";
} else {
$opn = "$f";
}

View File

@ -121,7 +121,7 @@ my $GIZA2BAL = "$SCRIPTS_ROOTDIR/training/symal/giza2bal.pl";
my $PHRASE_SCORE = "$SCRIPTS_ROOTDIR/training/phrase-extract/score";
# utilities
my $ZCAT = "zcat";
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";
# do a sanity check to make sure we can find the necessary binaries since