#!/usr/bin/perl -w

package ph_numbers;

# Script to recognize and replace numbers in Moses training corpora
# and decoder input
#
# (c) 2013 TAUS

use strict;
use warnings;
use Getopt::Std;

my $debug = $ENV{DEBUG} || 0;

# Act as a command-line filter only when this file is executed directly;
# when it is loaded by other code, caller() is true and only the subs below
# are defined.
run() unless caller();

# Command-line entry point.
#
# Reads lines from the files named in @ARGV (or STDIN), replaces every
# recognized number with the placeholder symbol and prints the result.
#
# Options:
#   -m symbol   placeholder to emit (default '@num@')
#   -c          corpus mode, forwarded to mark_numbers()
#   -l          legacy mode, forwarded to mark_numbers()
#   -s, -t      source/target locale; parsed but not used by the code here
#   -h          print usage and exit
sub run {
    my %opts;
    if (!getopts('s:t:cm:hl', \%opts) || $opts{h}) {
        print "Usage: perl $0 [-s source_locale][-t target_locale][-c][-h][-l][-m symbol] < in > out\n";
        exit;
    }

    my $numberSymbol = $opts{m} || '@num@';

    # A named lexical is used instead of $_ so the line cannot be clobbered
    # by anything called from inside the loop.
    while (my $line = <>) {
        chomp $line;
        print mark_numbers($line, $opts{c}, $opts{l}, $numberSymbol), "\n";
    }
    return;
}

# Replace every number found by recognize() with a placeholder symbol.
#
# Parameters:
#   $input        text to process
#   $corpusMode   accepted for interface compatibility (see note below)
#   $legacyMode   accepted for interface compatibility (see note below)
#   $numberSymbol placeholder to insert; defaults to '@num@' when false
#
# Returns the text with each recognized span replaced by $numberSymbol; all
# other characters are copied through unchanged.
#
# NOTE(review): in the code as received, corpus, legacy and default mode all
# appended exactly the same text (the bare symbol), and the matched number
# was extracted but never used. That behaviour is kept here; confirm whether
# the non-corpus modes were meant to emit something richer.
sub mark_numbers {
    my ($input, $corpusMode, $legacyMode, $numberSymbol) = @_;
    $input = "" unless defined $input;
    $numberSymbol ||= '@num@';

    my $output   = "";
    my $position = 0;    # offset of the first character not yet copied

    for my $span (@{ recognize($input) }) {
        my ($numstart, $numend) = @{$span};

        # Copy the untouched text between the previous number and this one.
        $output .= substr($input, $position, $numstart - $position)
            if $position < $numstart;

        $output .= $numberSymbol;
        $position = $numend;
    }

    # Copy whatever follows the last number (or the whole input if none).
    $output .= substr($input, $position);
    return $output;
}

# Locate the numbers in a string.
#
# A number is an optionally signed run of digits that may contain '.', ',',
# '+', '-', 'e' and 'E'; digit groups separated by whitespace ("1 000 000")
# are merged into one span. A span is only reported when it is a whole word:
# it must start at the beginning of the string or after a space, and it must
# be followed by a space, by the end of the string, or by a single trailing
# newline (input that was not chomped).
#
# Returns a reference to an array of [start, end] offset pairs, where end is
# exclusive, in order of appearance.
sub recognize {
    my $input = shift;
    $input = "" unless defined $input;
    my $length = length($input);

    my @recognized = ();
    while ($input =~ /\G(.*?)(\s*)([+\-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+\-eE]*)/g) {
        my $start = $-[3];
        my $end   = $+[3];

        # Absorb following whitespace-separated digit groups. /c keeps pos()
        # where it is when this match finally fails, so the outer loop
        # resumes right after the span.
        while ($input =~ /\G(\s+)(\p{Digit}+[\p{Digit}\.,+\-eE]*)/gc) {
            $end = $+[2];
        }

        # Left boundary: first word, or previous char is a space.
        my $startsWord = $start == 0
            || substr($input, $start - 1, 1) eq " ";

        # Right boundary: $end is an exclusive offset, so the number is the
        # last word when $end equals the string length (not length - 1).
        # A lone trailing newline is also accepted for un-chomped input.
        my $endsWord = $end == $length
            || substr($input, $end, 1) eq " "
            || ($end == $length - 1 && substr($input, $end, 1) eq "\n");

        push @recognized, [$start, $end] if $startsWord && $endsWord;
    }
    return \@recognized;
}

1;