mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 21:42:19 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
f79746b3c2
@ -28,6 +28,8 @@ my $makeTTable = 1; # whether to build the ttable extract files
|
||||
my $otherExtractArgs= "";
|
||||
my $weights = "";
|
||||
my $baselineExtract;
|
||||
my $glueFile;
|
||||
|
||||
for (my $i = 8; $i < $#ARGV + 1; ++$i)
|
||||
{
|
||||
$makeTTable = 0 if $ARGV[$i] eq "--NoTTable";
|
||||
@ -39,6 +41,11 @@ for (my $i = 8; $i < $#ARGV + 1; ++$i)
|
||||
$weights = $ARGV[++$i];
|
||||
next;
|
||||
}
|
||||
if ($ARGV[$i] eq '--GlueGrammar') {
|
||||
$glueFile = $ARGV[++$i];
|
||||
next;
|
||||
}
|
||||
|
||||
$otherExtractArgs .= $ARGV[$i] ." ";
|
||||
}
|
||||
|
||||
@ -117,7 +124,14 @@ for (my $i = 0; $i < $numParallel; ++$i)
|
||||
if ($weights) {
|
||||
$weightsCmd = "--InstanceWeights $TMPDIR/weights.$numStr";
|
||||
}
|
||||
my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $otherExtractArgs $weightsCmd --SentenceOffset ".($i*$linesPerSplit)." 2>> /dev/stderr \n";
|
||||
|
||||
my $glueArg = "";
|
||||
if (defined($glueFile)) {
|
||||
$glueArg = "--GlueGrammar $TMPDIR/glue.$numStr";
|
||||
}
|
||||
print "glueArg=$glueArg \n";
|
||||
|
||||
my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $glueArg $otherExtractArgs $weightsCmd --SentenceOffset ".($i*$linesPerSplit)." 2>> /dev/stderr \n";
|
||||
print STDERR $cmd;
|
||||
`$cmd`;
|
||||
|
||||
@ -135,8 +149,7 @@ foreach (@children) {
|
||||
}
|
||||
|
||||
# merge
|
||||
my $is_osx = ($^O eq "darwin");
|
||||
my $catCmd = $is_osx?"gunzip -c ":"zcat ";
|
||||
my $catCmd = "gunzip -c ";
|
||||
my $catInvCmd = $catCmd;
|
||||
my $catOCmd = $catCmd;
|
||||
for (my $i = 0; $i < $numParallel; ++$i)
|
||||
@ -184,6 +197,13 @@ foreach (@children) {
|
||||
waitpid($_, 0);
|
||||
}
|
||||
|
||||
# glue rules
|
||||
if (defined($glueFile)) {
|
||||
my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile";
|
||||
print STDERR "Merging glue rules: $cmd \n";
|
||||
print STDERR `$cmd`;
|
||||
}
|
||||
|
||||
# delete temporary files
|
||||
$cmd = "rm -rf $TMPDIR \n";
|
||||
print STDERR $cmd;
|
||||
|
@ -24,49 +24,57 @@ sub run {
|
||||
my $numberSymbol = $opts{m} || '@NUM@';
|
||||
while(<>) {
|
||||
chomp;
|
||||
print recognize($_,$opts{c},$opts{l},$numberSymbol,$_),"\n";
|
||||
print mark_numbers($_,$opts{c},$opts{l},$numberSymbol,$_),"\n";
|
||||
}
|
||||
}
|
||||
|
||||
sub recognize {
|
||||
my $line = shift;
|
||||
sub mark_numbers {
|
||||
my $input = shift;
|
||||
my $corpusMode = shift;
|
||||
my $legacyMode = shift;
|
||||
my $numberSymbol = shift || '@NUM@';
|
||||
|
||||
# [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?
|
||||
# while(/\G(.*?)\s*([+-]?\p{Digit}+[+-\p{Digit}\.,eE])/) {
|
||||
my $numref = recognize($input);
|
||||
my $input_length = length($input);
|
||||
my $output = "";
|
||||
my $remainder = "";
|
||||
while($line =~ /\G(.*?)(\s*)([+-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+-eE]*)/g) {
|
||||
my $between = $1;
|
||||
my $number = $3;
|
||||
print STDERR "Between: x${between}x\n" if $debug;
|
||||
print STDERR "Number: x${number}x\n" if $debug;
|
||||
# If there are more numbers separated by whitespace, add these
|
||||
my $numberContinuation = "";
|
||||
while($line = /\G(\s+)([\p{Digit}\.,+-eE]*)/g) {
|
||||
$numberContinuation .= $1.$2;
|
||||
my $position = 0;
|
||||
for(my $i = 0 ; $i < scalar(@{$numref}) ; $i++) {
|
||||
my $numstart = $numref->[$i][0];
|
||||
my $numend = $numref->[$i][1];
|
||||
if($position < $numstart) {
|
||||
$output .= substr($input,$position,$numstart-$position);
|
||||
}
|
||||
$number .= $numberContinuation;
|
||||
$output .= $between;
|
||||
my $number = substr($input,$numstart,$numend-$numstart);
|
||||
if($corpusMode) {
|
||||
$output .= $2.$numberSymbol;
|
||||
$output .= $number;
|
||||
}
|
||||
else {
|
||||
if($legacyMode) {
|
||||
$output .= $2."<ne translation=\"$number\">$numberSymbol</ne>";
|
||||
$output .= "<ne translation=\"$number\">$numberSymbol</ne>";
|
||||
}
|
||||
else {
|
||||
$output .= $2."<ne translation=\"$numberSymbol\" entity=\"$number\">$numberSymbol</ne>";
|
||||
$output .= "<ne translation=\"$numberSymbol\" entity=\"$number\">$numberSymbol</ne>";
|
||||
}
|
||||
}
|
||||
$remainder = $';
|
||||
$position = $numend;
|
||||
}
|
||||
print STDERR "Remainder: x".$remainder."x\n" if $debug;
|
||||
print STDERR "\n" if $debug;
|
||||
$output .= $remainder if $remainder;
|
||||
$output .= substr($input,$position);
|
||||
return $output;
|
||||
}
|
||||
|
||||
sub recognize {
|
||||
my $input = shift;
|
||||
|
||||
my @recognized = ();
|
||||
while($input =~ /\G(.*?)(\s*)([+\-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+\-eE]*)/g) {
|
||||
my $start = $-[3];
|
||||
my $end = $+[3];
|
||||
while($input =~ /\G(\s+)(\p{Digit}+[\p{Digit}\.,+\-eE]*)/gc) {
|
||||
$end = $+[2];
|
||||
}
|
||||
push @recognized,[$start,$end];
|
||||
}
|
||||
return \@recognized;
|
||||
}
|
||||
|
||||
1;
|
||||
|
Loading…
Reference in New Issue
Block a user