lotsa minor changes: mostly bug fixes, tokenizer now escapes special Moses characters (|<>&)

This commit is contained in:
phikoehn 2012-03-20 04:57:37 +00:00
parent 82ae12249c
commit 048e0f8de4
8 changed files with 47 additions and 17 deletions

View File

@ -1034,14 +1034,13 @@ sub execute_steps {
}
elsif (! -e &versionize(&step_file($i)).".DONE") {
my $step = &versionize(&step_file($i));
print "\texecuting $step via ";
&define_step($i);
&write_info($i);
# cluster job submission
if ($CLUSTER && ! &is_qsub_script($i)) {
$DO{$i}++;
print "qsub\n";
print "\texecuting $step via qsub ($active active)\n";
my $qsub_args = &get_qsub_args($DO_STEP[$i]);
`qsub $qsub_args -e $step.STDERR -o $step.STDOUT $step`;
}
@ -1050,16 +1049,13 @@ sub execute_steps {
elsif ($CLUSTER || $active < $MAX_ACTIVE) {
$active++;
$DO{$i}++;
print "sh ($active active)\n";
print "\texecuting $step via sh ($active active)\n";
sleep(5);
if (!fork) {
`sh $step >$step.STDOUT 2> $step.STDERR`;
exit;
}
}
else {
print " --- on hold\n";
}
}
}
@ -1853,6 +1849,9 @@ sub define_training_create_config {
$cmd .= "-lm $factor:$order:$lm_file:$type ";
}
my $additional_ini = &get("TRAINING:additional-ini");
$cmd .= "-additional-ini '$additional_ini' " if defined($additional_ini);
&create_step($step_id,$cmd);
}
@ -2185,6 +2184,7 @@ sub define_evaluation_decode {
my $nbest = &backoff_and_get("EVALUATION:$set:nbest");
my $moses_parallel = &backoff_and_get("EVALUATION:$set:moses-parallel");
my $report_segmentation = &backoff_and_get("EVALUATION:$set:report-segmentation");
my $analyze_search_graph = &backoff_and_get("EVALUATION:$set:analyze-search-graph");
my $report_precision_by_coverage = &backoff_and_get("EVALUATION:$set:report-precision-by-coverage");
my $hierarchical = &get("TRAINING:hierarchical-rule-set");
@ -2193,6 +2193,9 @@ sub define_evaluation_decode {
$settings .= " -use-alignment-info -alignment-output-file $system_output.wa";
$report_segmentation = "yes";
}
if (defined($analyze_search_graph) && $analyze_search_graph eq "yes") {
$settings .= " -unpruned-search-graph -osg $system_output.graph";
}
if (defined($report_segmentation) && $report_segmentation eq "yes") {
if ($hierarchical) {
$settings .= " -T $system_output.trace";
@ -2237,12 +2240,17 @@ sub define_evaluation_analysis {
$output,$reference,$input) = &get_output_and_input($step_id);
my $script = &backoff_and_get("EVALUATION:$set:analysis");
my $report_segmentation = &backoff_and_get("EVALUATION:$set:report-segmentation");
my $analyze_search_graph = &backoff_and_get("EVALUATION:$set:analyze-search-graph");
my $cmd = "$script -system $output -reference $reference -input $input -dir $analysis";
if (defined($report_segmentation) && $report_segmentation eq "yes") {
my $segmentation_file = &get_default_file("EVALUATION",$set,"decode");
$cmd .= " -segmentation $segmentation_file";
}
if (defined($analyze_search_graph) && $analyze_search_graph eq "yes") {
my $search_graph_file = &get_default_file("EVALUATION",$set,"decode");
$cmd .= " -search-graph $search_graph_file.graph";
}
if (&get("TRAINING:hierarchical-rule-set")) {
$cmd .= " -hierarchical";
}

View File

@ -110,7 +110,7 @@ print STDERR "\n=== BUILDING FINAL LM ===\n\n";
sub interpolate {
my ($name,@LM) = @_;
die("cannot interpolate more than 10 language models at once.")
die("cannot interpolate more than 10 language models at once: ",join(",",@LM))
if scalar(@LM) > 10;
my $tmp = tempdir(DIR=>$TEMPDIR);

View File

@ -17,12 +17,17 @@ close(ORDER);
# get from sgm file which lines belong to which system
my %DOC;
my $system_from_refset = 0;
my ($doc,$system);
open(REF,$ref);
while(<REF>) {
if (/<refset/ && /refid="([^\"]+)"/i) {
$system = $1;
$system_from_refset = 1;
}
if (/<doc/i) {
die unless /sysid="([^\"]+)"/i;
$system = $1;
die unless /sysid="([^\"]+)"/i || $system_from_refset;
$system = $1 unless $system_from_refset;
die unless /docid="([^\"]+)"/i;
$doc = $1;
}

View File

@ -18,8 +18,9 @@ while(<SRC>) {
elsif (/^<\/srcset/) {
s/<\/srcset/<\/tstset/;
}
elsif (/^<DOC/i) {
s/<DOC/<DOC sysid="$system"/i;
elsif (/^<doc/i) {
s/ *sysid="[^\"]+"//;
s/<doc/<doc sysid="$system"/i;
}
elsif (/<seg/) {
my $line = shift(@OUT);

View File

@ -63,8 +63,13 @@ sub detokenize {
my($text) = @_;
chomp($text);
$text = " $text ";
$text =~ s/ \@\-\@ /-/g;
$text =~ s/ \@\-\@ /-/g;
# de-escape special chars
$text =~ s/\&bar;/\|/g;
$text =~ s/\&lt;/\</g;
$text =~ s/\&gt;/\>/g;
$text =~ s/\&amp;/\&/g;
my $word;
my $i;
my @words = split(/ /,$text);

View File

@ -18,6 +18,7 @@ my $language = "en";
my $QUIET = 0;
my $HELP = 0;
my $AGGRESSIVE = 0;
my $SKIP_XML = 0;
#my $start = [ Time::HiRes::gettimeofday( ) ];
@ -27,6 +28,7 @@ while (@ARGV) {
/^-l$/ && ($language = shift, next);
/^-q$/ && ($QUIET = 1, next);
/^-h$/ && ($HELP = 1, next);
/^-x$/ && ($SKIP_XML = 1, next);
/^-a$/ && ($AGGRESSIVE = 1, next);
}
@ -50,7 +52,7 @@ if (scalar(%NONBREAKING_PREFIX) eq 0){
}
while(<STDIN>) {
if (/^<.+>$/ || /^\s*$/) {
if (($SKIP_XML && /^<.+>$/) || /^\s*$/) {
#don't try to tokenize XML/HTML tag lines
print $_;
}
@ -141,7 +143,13 @@ sub tokenize {
$text =~ s/DOTDOTMULTI/DOTMULTI./g;
}
$text =~ s/DOTMULTI/./g;
#escape special chars
$text =~ s/\&/\&amp;/g;
$text =~ s/\|/\&bar;/g;
$text =~ s/\</\&lt;/g;
$text =~ s/\>/\&gt;/g;
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;

View File

@ -404,6 +404,9 @@ if (-e $ref_abs) {
else {
# if multiple file, get a full list of the files
my $part = 0;
if (! -e $ref_abs."0" && -e $ref_abs.".ref0") {
$ref_abs .= ".ref";
}
while (-e $ref_abs.$part) {
push @references, $ref_abs.$part;
$part++;

View File

@ -6,8 +6,8 @@ my ($size,$in,$out) = @ARGV;
open(IN,$in);
open(OUT,">$out");
binmode(IN, ":utf8");
binmode(OUT, ":utf8");
binmode(IN, ":UTF8");
binmode(OUT, ":UTF8");
while(<IN>) {
my $first = 1;