Merge branch 'master' of github.com:moses-smt/mosesdecoder

This commit is contained in:
Barry Haddow 2013-07-24 20:49:59 +01:00
commit f79746b3c2
2 changed files with 55 additions and 27 deletions

View File

@ -28,6 +28,8 @@ my $makeTTable = 1; # whether to build the ttable extract files
my $otherExtractArgs= "";
my $weights = "";
my $baselineExtract;
my $glueFile;
for (my $i = 8; $i < $#ARGV + 1; ++$i)
{
$makeTTable = 0 if $ARGV[$i] eq "--NoTTable";
@ -39,6 +41,11 @@ for (my $i = 8; $i < $#ARGV + 1; ++$i)
$weights = $ARGV[++$i];
next;
}
if ($ARGV[$i] eq '--GlueGrammar') {
$glueFile = $ARGV[++$i];
next;
}
$otherExtractArgs .= $ARGV[$i] ." ";
}
@ -117,7 +124,14 @@ for (my $i = 0; $i < $numParallel; ++$i)
if ($weights) {
$weightsCmd = "--InstanceWeights $TMPDIR/weights.$numStr";
}
my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $otherExtractArgs $weightsCmd --SentenceOffset ".($i*$linesPerSplit)." 2>> /dev/stderr \n";
my $glueArg = "";
if (defined($glueFile)) {
$glueArg = "--GlueGrammar $TMPDIR/glue.$numStr";
}
print "glueArg=$glueArg \n";
my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $glueArg $otherExtractArgs $weightsCmd --SentenceOffset ".($i*$linesPerSplit)." 2>> /dev/stderr \n";
print STDERR $cmd;
`$cmd`;
@ -135,8 +149,7 @@ foreach (@children) {
}
# merge
my $is_osx = ($^O eq "darwin");
my $catCmd = $is_osx?"gunzip -c ":"zcat ";
my $catCmd = "gunzip -c ";
my $catInvCmd = $catCmd;
my $catOCmd = $catCmd;
for (my $i = 0; $i < $numParallel; ++$i)
@ -184,6 +197,13 @@ foreach (@children) {
waitpid($_, 0);
}
# glue rules
if (defined($glueFile)) {
my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile";
print STDERR "Merging glue rules: $cmd \n";
print STDERR `$cmd`;
}
# delete temporary files
$cmd = "rm -rf $TMPDIR \n";
print STDERR $cmd;

View File

@ -24,49 +24,57 @@ sub run {
my $numberSymbol = $opts{m} || '@NUM@';
while(<>) {
chomp;
print recognize($_,$opts{c},$opts{l},$numberSymbol,$_),"\n";
print mark_numbers($_,$opts{c},$opts{l},$numberSymbol,$_),"\n";
}
}
sub recognize {
my $line = shift;
sub mark_numbers {
my $input = shift;
my $corpusMode = shift;
my $legacyMode = shift;
my $numberSymbol = shift || '@NUM@';
# [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?
# while(/\G(.*?)\s*([+-]?\p{Digit}+[+-\p{Digit}\.,eE])/) {
my $numref = recognize($input);
my $input_length = length($input);
my $output = "";
my $remainder = "";
while($line =~ /\G(.*?)(\s*)([+-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+-eE]*)/g) {
my $between = $1;
my $number = $3;
print STDERR "Between: x${between}x\n" if $debug;
print STDERR "Number: x${number}x\n" if $debug;
# If there are more numbers separated by whitespace, add these
my $numberContinuation = "";
while($line = /\G(\s+)([\p{Digit}\.,+-eE]*)/g) {
$numberContinuation .= $1.$2;
my $position = 0;
for(my $i = 0 ; $i < scalar(@{$numref}) ; $i++) {
my $numstart = $numref->[$i][0];
my $numend = $numref->[$i][1];
if($position < $numstart) {
$output .= substr($input,$position,$numstart-$position);
}
$number .= $numberContinuation;
$output .= $between;
my $number = substr($input,$numstart,$numend-$numstart);
if($corpusMode) {
$output .= $2.$numberSymbol;
$output .= $number;
}
else {
if($legacyMode) {
$output .= $2."<ne translation=\"$number\">$numberSymbol</ne>";
$output .= "<ne translation=\"$number\">$numberSymbol</ne>";
}
else {
$output .= $2."<ne translation=\"$numberSymbol\" entity=\"$number\">$numberSymbol</ne>";
$output .= "<ne translation=\"$numberSymbol\" entity=\"$number\">$numberSymbol</ne>";
}
}
$remainder = $';
$position = $numend;
}
print STDERR "Remainder: x".$remainder."x\n" if $debug;
print STDERR "\n" if $debug;
$output .= $remainder if $remainder;
$output .= substr($input,$position);
return $output;
}
# Locate every number-like span in a string.
#
# Takes one argument, the text to scan. Returns a reference to an array of
# [start, end] pairs in left-to-right order; each pair holds the offset where
# a span begins and the offset just past its last character, so that
# substr($text, $start, $end - $start) yields the matched text.
#
# A span opens with an optional sign, optional digits, an optional '.' or ','
# and at least one digit, then runs on through any further digits, '.', ',',
# '+', '-', 'e' or 'E'. Digit-led groups that follow after whitespace are
# folded into the same span.
sub recognize {
    my ($text) = @_;
    my @spans;

    # Scalar-context //g walks the string: each pass resumes at pos($text).
    while ($text =~ /\G(.*?)(\s*)([+\-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+\-eE]*)/g) {
        # Group 3 is the number itself; groups 1 and 2 are the skipped text.
        my @span = ($-[3], $+[3]);

        # /c keeps pos() in place when no continuation follows, so the outer
        # scan carries on from the end of this span instead of restarting.
        $span[1] = $+[2]
            while $text =~ /\G(\s+)(\p{Digit}+[\p{Digit}\.,+\-eE]*)/gc;

        push @spans, \@span;
    }

    return \@spans;
}
1;