diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index a8f4fd458..c95db9623 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1034,14 +1034,13 @@ sub execute_steps { } elsif (! -e &versionize(&step_file($i)).".DONE") { my $step = &versionize(&step_file($i)); - print "\texecuting $step via "; &define_step($i); &write_info($i); # cluster job submission if ($CLUSTER && ! &is_qsub_script($i)) { $DO{$i}++; - print "qsub\n"; + print "\texecuting $step via qsub ($active active)\n"; my $qsub_args = &get_qsub_args($DO_STEP[$i]); `qsub $qsub_args -e $step.STDERR -o $step.STDOUT $step`; } @@ -1050,16 +1049,13 @@ sub execute_steps { elsif ($CLUSTER || $active < $MAX_ACTIVE) { $active++; $DO{$i}++; - print "sh ($active active)\n"; + print "\texecuting $step via sh ($active active)\n"; sleep(5); if (!fork) { `sh $step >$step.STDOUT 2> $step.STDERR`; exit; } } - else { - print " --- on hold\n"; - } } } @@ -1853,6 +1849,9 @@ sub define_training_create_config { $cmd .= "-lm $factor:$order:$lm_file:$type "; } + my $additional_ini = &get("TRAINING:additional-ini"); + $cmd .= "-additional-ini '$additional_ini' " if defined($additional_ini); + &create_step($step_id,$cmd); } @@ -2185,6 +2184,7 @@ sub define_evaluation_decode { my $nbest = &backoff_and_get("EVALUATION:$set:nbest"); my $moses_parallel = &backoff_and_get("EVALUATION:$set:moses-parallel"); my $report_segmentation = &backoff_and_get("EVALUATION:$set:report-segmentation"); + my $analyze_search_graph = &backoff_and_get("EVALUATION:$set:analyze-search-graph"); my $report_precision_by_coverage = &backoff_and_get("EVALUATION:$set:report-precision-by-coverage"); my $hierarchical = &get("TRAINING:hierarchical-rule-set"); @@ -2193,6 +2193,9 @@ sub define_evaluation_decode { $settings .= " -use-alignment-info -alignment-output-file $system_output.wa"; $report_segmentation = "yes"; } + if (defined($analyze_search_graph) && $analyze_search_graph eq "yes") { + $settings .= " -unpruned-search-graph -osg $system_output.graph"; + } if (defined($report_segmentation) && $report_segmentation eq "yes") { if ($hierarchical) { $settings .= " -T $system_output.trace"; @@ -2237,12 +2240,17 @@ sub define_evaluation_analysis { $output,$reference,$input) = &get_output_and_input($step_id); my $script = &backoff_and_get("EVALUATION:$set:analysis"); my $report_segmentation = &backoff_and_get("EVALUATION:$set:report-segmentation"); + my $analyze_search_graph = &backoff_and_get("EVALUATION:$set:analyze-search-graph"); my $cmd = "$script -system $output -reference $reference -input $input -dir $analysis"; if (defined($report_segmentation) && $report_segmentation eq "yes") { my $segmentation_file = &get_default_file("EVALUATION",$set,"decode"); $cmd .= " -segmentation $segmentation_file"; } + if (defined($analyze_search_graph) && $analyze_search_graph eq "yes") { + my $search_graph_file = &get_default_file("EVALUATION",$set,"decode"); + $cmd .= " -search-graph $search_graph_file.graph"; + } if (&get("TRAINING:hierarchical-rule-set")) { $cmd .= " -hierarchical"; } diff --git a/scripts/ems/support/interpolate-lm.perl b/scripts/ems/support/interpolate-lm.perl index 6b6840543..27f2d4975 100755 --- a/scripts/ems/support/interpolate-lm.perl +++ b/scripts/ems/support/interpolate-lm.perl @@ -110,7 +110,7 @@ print STDERR "\n=== BUILDING FINAL LM ===\n\n"; sub interpolate { my ($name,@LM) = @_; - die("cannot interpolate more than 10 language models at once.") + die("cannot interpolate more than 10 language models at once: ",join(",",@LM)) if scalar(@LM) > 10; my $tmp = tempdir(DIR=>$TEMPDIR); diff --git a/scripts/ems/support/reference-from-sgm.perl b/scripts/ems/support/reference-from-sgm.perl index 87987b264..eccb03cad 100755 --- a/scripts/ems/support/reference-from-sgm.perl +++ b/scripts/ems/support/reference-from-sgm.perl @@ -17,12 +17,17 @@ close(ORDER); # get from sgm file which lines belong to which system my %DOC; +my $system_from_refset = 0; my ($doc,$system); open(REF,$ref); while() { + if (/) { elsif (/^<\/srcset/) { s/<\/srcset/<\/tstset/; } - elsif (/^/g; + $text =~ s/\&/\&/g; + my $word; my $i; my @words = split(/ /,$text); diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl index 84fdc3462..7158c417a 100755 --- a/scripts/tokenizer/tokenizer.perl +++ b/scripts/tokenizer/tokenizer.perl @@ -18,6 +18,7 @@ my $language = "en"; my $QUIET = 0; my $HELP = 0; my $AGGRESSIVE = 0; +my $SKIP_XML = 0; #my $start = [ Time::HiRes::gettimeofday( ) ]; @@ -27,6 +28,7 @@ while (@ARGV) { /^-l$/ && ($language = shift, next); /^-q$/ && ($QUIET = 1, next); /^-h$/ && ($HELP = 1, next); + /^-x$/ && ($SKIP_XML = 1, next); /^-a$/ && ($AGGRESSIVE = 1, next); } @@ -50,7 +52,7 @@ if (scalar(%NONBREAKING_PREFIX) eq 0){ } while() { - if (/^<.+>$/ || /^\s*$/) { + if (($SKIP_XML && /^<.+>$/) || /^\s*$/) { #don't try to tokenize XML/HTML tag lines print $_; } @@ -141,7 +143,13 @@ sub tokenize { $text =~ s/DOTDOTMULTI/DOTMULTI./g; } $text =~ s/DOTMULTI/./g; - + + #escape special chars + $text =~ s/\&/\&/g; + $text =~ s/\|/\&bar;/g; + $text =~ s/\/\>/g; + #ensure final line break $text .= "\n" unless $text =~ /\n$/; diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index 280eb4543..ee5b231bb 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -404,6 +404,9 @@ if (-e $ref_abs) { else { # if multiple file, get a full list of the files my $part = 0; + if (! -e $ref_abs."0" && -e $ref_abs.".ref0") { + $ref_abs .= ".ref"; + } while (-e $ref_abs.$part) { push @references, $ref_abs.$part; $part++; diff --git a/scripts/training/wrappers/make-factor-stem.perl b/scripts/training/wrappers/make-factor-stem.perl index 892c1636c..3de60d39b 100755 --- a/scripts/training/wrappers/make-factor-stem.perl +++ b/scripts/training/wrappers/make-factor-stem.perl @@ -6,8 +6,8 @@ my ($size,$in,$out) = @ARGV; open(IN,$in); open(OUT,">$out"); -binmode(IN, ":utf8"); -binmode(OUT, ":utf8"); +binmode(IN, ":UTF8"); +binmode(OUT, ":UTF8"); while() { my $first = 1;