mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-11-09 16:04:41 +03:00
102 lines
2.7 KiB
Perl
Executable File
102 lines
2.7 KiB
Perl
Executable File
#!/usr/bin/perl -w
|
|
package ph_numbers;
|
|
|
|
# Script to recognize and replace numbers in Moses training corpora
|
|
# and decoder input
|
|
#
|
|
# (c) 2013 TAUS
|
|
|
|
use strict;
|
|
|
|
# Modulino guard: caller() returns nothing when this file is executed
# directly, so run() is invoked only then and not when the file is loaded
# by other code.
run() unless caller();
|
|
# "use" is processed at compile time, so getopts() is already imported by the
# time the run() call above executes, even though this line appears later.
use Getopt::Std;
|
|
|
|
# Value of the DEBUG environment variable, or 0 when it is unset or false.
# NOTE(review): not referenced anywhere in the code visible here, and when the
# file runs as a script this assignment happens only after run() has
# finished -- confirm whether it is still needed.
my $debug = $ENV{DEBUG} || 0;
|
|
|
|
# Command-line entry point.
#
# Parses the switches, then reads lines from the files named in @ARGV (or
# from STDIN), passes each chomped line through mark_numbers() and prints the
# result followed by a newline to STDOUT.
#
# Switches: -c selects corpus mode, -l selects the legacy markup, -m sets the
# placeholder symbol (default '@num@'), -h prints the usage text.  -s and -t
# are accepted for command-line compatibility but their values are not used
# by this routine.
#
# Exits with status 1 (usage on STDERR) when the switches cannot be parsed,
# and with status 0 (usage on STDOUT) when -h is given.
sub run {
    my %opts;
    my $usage = "Usage: perl $0 [-s source_locale][-t target_locale][-c][-h][-l][-m symbol] < in > out\n";

    # getopts() returns false on an unknown switch or a missing argument.
    # Report that on STDERR with a failing status, so the usage text is not
    # written into a redirected output file and mistaken for a good run.
    if (!getopts('s:t:cm:hl', \%opts)) {
        print STDERR $usage;
        exit 1;
    }

    # Help that was asked for explicitly is normal output and a success.
    if ($opts{h}) {
        print $usage;
        exit;
    }

    my $numberSymbol = $opts{m} || '@num@';

    # A named lexical keeps the global $_ untouched while lines are processed.
    while (my $line = <>) {
        chomp $line;
        print mark_numbers($line, $opts{c}, $opts{l}, $numberSymbol), "\n";
    }
    return;
}
|
|
|
|
# Return a copy of $input in which every span reported by recognize() has
# been replaced by placeholder markup; text outside the spans is copied
# through unchanged.
#
# Arguments (positional):
#   $input        - the line to process
#   $corpusMode   - true: replace each number by the bare placeholder symbol
#   $legacyMode   - (only when $corpusMode is false) true: emit
#                   <ne translation="NUMBER">SYMBOL</ne>; false: emit
#                   <ne translation="SYMBOL" entity="NUMBER">SYMBOL</ne>
#   $numberSymbol - placeholder symbol; '@num@' is used when it is false
#
# Returns the rewritten string.
sub mark_numbers {
    my ($input, $corpusMode, $legacyMode, $numberSymbol) = @_;
    $numberSymbol ||= '@num@';

    my @pieces;
    my $cursor = 0;    # offset of the first character not yet copied

    for my $span (@{ recognize($input) }) {
        my ($from, $to) = @{$span};

        # Plain text between the previous number and this one.
        push @pieces, substr($input, $cursor, $from - $cursor)
            if $cursor < $from;

        my $digits = substr($input, $from, $to - $from);

        my $markup;
        if ($corpusMode) {
            $markup = $numberSymbol;
        }
        elsif ($legacyMode) {
            $markup = "<ne translation=\"$digits\">$numberSymbol</ne>";
        }
        else {
            $markup = "<ne translation=\"$numberSymbol\" entity=\"$digits\">$numberSymbol</ne>";
        }
        push @pieces, $markup;

        $cursor = $to;
    }

    # Whatever follows the last number (or the whole line if there was none).
    push @pieces, substr($input, $cursor);

    return join '', @pieces;
}
|
|
|
|
# Locate the numbers in $input that stand as whole space-delimited words.
#
# A candidate is an optional sign, digits with an optional '.' or ','
# separator, followed by any run of digits and the characters . , + - e E.
# Digit groups that follow, separated only by whitespace, are merged into the
# same candidate (so "1 000" is one span).
#
# A candidate is kept only when
#   * it starts at offset 0 or right after a space, and
#   * it ends at the end of the string, right before a space, or right
#     before a single final newline (so unchomped lines behave like chomped
#     ones).
#
# Returns a reference to an array of [start, end] pairs of character offsets
# into $input; "end" is the offset just past the last character of the span.
sub recognize {
    my $input = shift;
    #print STDERR "input=$input\n";

    my @recognized = ();
    while ($input =~ /\G(.*?)(\s*)([+\-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+\-eE]*)/g) {
        my $start = $-[3];
        my $end   = $+[3];

        # Absorb further whitespace-separated digit groups.  /c keeps pos()
        # where it is when this inner match finally fails, so the outer scan
        # resumes right after the merged span.
        while ($input =~ /\G(\s+)(\p{Digit}+[\p{Digit}\.,+\-eE]*)/gc) {
            $end = $+[2];
        }

        # ALL characters in the word must belong to the number, i.e. the span
        # has to be delimited by spaces or by the edges of the string.

        # 1st word, or previous char is a space
        my $starts_word = $start == 0
            || substr($input, $start - 1, 1) eq " ";

        # last word, or next char is a space.  $end already points one past
        # the span, so "last word" means nothing (or only the line's final
        # newline) is left after it; comparing against length - 1 would miss
        # a number that ends a chomped line.
        my $rest = substr($input, $end);
        my $ends_word = $rest eq ""
            || $rest eq "\n"
            || substr($rest, 0, 1) eq " ";

        #print STDERR "start=$start end=$end len=" .length($input) ."\n";
        push @recognized, [$start, $end] if $starts_word && $ends_word;
    }
    return \@recognized;
}
|
|
|
|
1;
|