Merge branch 'master' of github.com:moses-smt/mosesdecoder

This commit is contained in:
Hieu Hoang 2013-07-24 19:01:57 +01:00
commit 6fc21a32fc
9 changed files with 318 additions and 47 deletions

4
.gitmodules vendored
View File

@ -1,6 +1,6 @@
[submodule "contrib/arrow-pipelines/python/pcl"]
path = contrib/arrow-pipelines/python/pcl
url = git://github.com/ianj-als/pcl.git
url = https://github.com/ianj-als/pcl.git
[submodule "contrib/omtc/omtc"]
path = contrib/omtc/omtc
url = git://github.com/ianj-als/omtc.git
url = https://github.com/ianj-als/omtc.git

@ -1 +1 @@
Subproject commit b4334b8f276d401c38b1163c4c33ad6b840e28be
Subproject commit 408b85900ac1c84c3224f478da8f290c92ca328a

View File

@ -64,7 +64,7 @@ lib moses :
ThreadPool.cpp
SyntacticLanguageModel.cpp
*Test.cpp Mock*.cpp
LM/Factory.cpp
FF/Factory.cpp
]
headers FF_Factory.o LM//LM TranslationModel/CompactPT//CompactPT synlm ThreadPool rt
..//search ../util/double-conversion//double-conversion ..//z ../OnDiskPt//OnDiskPt ;

View File

@ -388,6 +388,12 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
if (vm.count("UnpairedExtractFormat")) {
options.unpairedExtractFormat = true;
}
// Workaround for extract-parallel issue.
if (options.sentenceOffset > 0) {
options.glueGrammarFile.clear();
options.unknownWordFile.clear();
}
}
void ExtractGHKM::Error(const std::string &msg) const

View File

@ -24,49 +24,57 @@ sub run {
my $numberSymbol = $opts{m} || '@NUM@';
while(<>) {
chomp;
print recognize($_,$opts{c},$opts{l},$numberSymbol,$_),"\n";
print mark_numbers($_,$opts{c},$opts{l},$numberSymbol,$_),"\n";
}
}
sub recognize {
my $line = shift;
sub mark_numbers {
my $input = shift;
my $corpusMode = shift;
my $legacyMode = shift;
my $numberSymbol = shift || '@NUM@';
# [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?
# while(/\G(.*?)\s*([+-]?\p{Digit}+[+-\p{Digit}\.,eE])/) {
my $numref = recognize($input);
my $input_length = length($input);
my $output = "";
my $remainder = "";
while($line =~ /\G(.*?)(\s*)([+-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+-eE]*)/g) {
my $between = $1;
my $number = $3;
print STDERR "Between: x${between}x\n" if $debug;
print STDERR "Number: x${number}x\n" if $debug;
# If there are more numbers separated by whitespace, add these
my $numberContinuation = "";
while($line = /\G(\s+)([\p{Digit}\.,+-eE]*)/g) {
$numberContinuation .= $1.$2;
my $position = 0;
for(my $i = 0 ; $i < scalar(@{$numref}) ; $i++) {
my $numstart = $numref->[$i][0];
my $numend = $numref->[$i][1];
if($position < $numstart) {
$output .= substr($input,$position,$numstart-$position);
}
$number .= $numberContinuation;
$output .= $between;
my $number = substr($input,$numstart,$numend-$numstart);
if($corpusMode) {
$output .= $2.$numberSymbol;
$output .= $number;
}
else {
if($legacyMode) {
$output .= $2."<ne translation=\"$number\">$numberSymbol</ne>";
$output .= "<ne translation=\"$number\">$numberSymbol</ne>";
}
else {
$output .= $2."<ne translation=\"$numberSymbol\" entity=\"$number\">$numberSymbol</ne>";
$output .= "<ne translation=\"$numberSymbol\" entity=\"$number\">$numberSymbol</ne>";
}
}
$remainder = $';
$position = $numend;
}
print STDERR "Remainder: x".$remainder."x\n" if $debug;
print STDERR "\n" if $debug;
$output .= $remainder if $remainder;
$output .= substr($input,$position);
return $output;
}
sub recognize {
    # Locate every number-like span in a line of text.
    #
    # Argument: the line to scan.
    # Returns:  a reference to an array of [start, end] pairs (character
    #           offsets into the line, end exclusive), in left-to-right order.
    #
    # A span begins at an optionally signed run of digits (with an optional
    # '.' or ',') and is extended over any following whitespace-separated
    # groups that start with a digit, so "1 000 000" is reported as one span.
    my ($text) = @_;
    my @spans;

    while ($text =~ /\G(.*?)(\s*)([+\-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+\-eE]*)/g) {
        # offsets of the third capture group, i.e. the number itself
        my @span = ($-[3], $+[3]);

        # /c keeps the scan position when no further group follows, so the
        # outer loop resumes right after the last piece that was absorbed
        $span[1] = $+[2]
            while $text =~ /\G(\s+)(\p{Digit}+[\p{Digit}\.,+\-eE]*)/gc;

        push @spans, [@span];
    }

    return \@spans;
}
1;

View File

@ -14,6 +14,7 @@ my $language = "en";
my $QUIET = 0;
my $HELP = 0;
my $UPPERCASE_SENT = 0;
my $PENN = 0;
while (@ARGV) {
$_ = shift;
@ -22,14 +23,16 @@ while (@ARGV) {
/^-q$/ && ($QUIET = 1, next);
/^-h$/ && ($HELP = 1, next);
/^-u$/ && ($UPPERCASE_SENT = 1, next);
/^-penn$/ && ($PENN = 1, next);
}
if ($HELP) {
print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n";
print "Options:\n";
print " -u ... uppercase the first char in the final sentence.\n";
print " -q ... don't report detokenizer revision.\n";
print " -b ... disable Perl buffering.\n";
print " -u ... uppercase the first char in the final sentence.\n";
print " -q ... don't report detokenizer revision.\n";
print " -b ... disable Perl buffering.\n";
print " -penn ... assume input is tokenized as per tokenizer.perl's -penn option.\n";
exit;
}
@ -37,6 +40,11 @@ if ($language !~ /^(cs|en|fr|it)$/) {
print STDERR "Warning: No built-in rules for language $language.\n"
}
# The -penn rules are only applied to English input; any other language is
# rejected here.  Exit with a non-zero status so that calling scripts and
# pipelines can tell the run failed.
if ($PENN && $language ne "en") {
    print STDERR "Error: -penn option only supported for English text.\n";
    exit 1;
}
if (!$QUIET) {
print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n";
print STDERR "Language: $language\n";
@ -46,8 +54,9 @@ while(<STDIN>) {
if (/^<.+>$/ || /^\s*$/) {
#don't try to detokenize XML/HTML tag lines
print $_;
}
else {
} elsif ($PENN) {
print &detokenize_penn($_);
} else {
print &detokenize($_);
}
}
@ -60,12 +69,9 @@ sub ucsecondarg {
return $arg1.uc($arg2);
}
sub detokenize {
my($text) = @_;
chomp($text);
$text = " $text ";
$text =~ s/ \@\-\@ /-/g;
sub deescape {
# de-escape special chars
my ($text) = @_;
$text =~ s/\&bar;/\|/g; # factor separator (legacy)
$text =~ s/\&#124;/\|/g; # factor separator
$text =~ s/\&lt;/\</g; # xml
@ -77,6 +83,15 @@ sub detokenize {
$text =~ s/\&#91;/\[/g; # syntax non-terminal
$text =~ s/\&#93;/\]/g; # syntax non-terminal
$text =~ s/\&amp;/\&/g; # escape escape
return $text;
}
sub detokenize {
my($text) = @_;
chomp($text);
$text = " $text ";
$text =~ s/ \@\-\@ /-/g;
$text = &deescape($text);
my $word;
my $i;
@ -182,6 +197,91 @@ sub detokenize {
return $text;
}
sub detokenize_penn {
    # Undo Penn-Treebank-style tokenization of a single line.
    #
    # Argument: one tokenized line (a trailing newline is allowed).
    # Returns:  the detokenized line, always terminated by a newline.
    #
    # Steps: rejoin @-@ and @/@ splits, de-escape special characters, merge
    # split contractions, turn -LRB- style markers back into brackets, then
    # re-attach punctuation and quotes to their neighbouring words.
    my ($line) = @_;
    chomp($line);

    # pad with spaces so the space-delimited patterns below also match the
    # first and last token
    $line = " $line ";
    $line =~ s/ \@\-\@ /-/g;
    $line =~ s/ \@\/\@ /\//g;
    $line = deescape($line);

    # merge de-contracted forms except where the second word begins with an
    # apostrophe (those are handled in the token loop below)
    $line =~ s/ n't /n't /g;
    $line =~ s/ N'T /N'T /g;
    $line =~ s/ ([Cc])an not / $1annot /g;
    $line =~ s/ ([Dd])' ye / $1'ye /g;
    $line =~ s/ ([Gg])im me / $1imme /g;
    $line =~ s/ ([Gg])on na / $1onna /g;
    $line =~ s/ ([Gg])ot ta / $1otta /g;
    $line =~ s/ ([Ll])em me / $1emme /g;
    $line =~ s/ '([Tt]) is / '$1is /g;
    $line =~ s/ '([Tt]) was / '$1was /g;
    $line =~ s/ ([Ww])an na / $1anna /g;

    # restore brackets
    $line =~ s/-LRB-/\(/g;
    $line =~ s/-RRB-/\)/g;
    $line =~ s/-LSB-/\[/g;
    $line =~ s/-RSB-/\]/g;
    $line =~ s/-LCB-/{/g;
    $line =~ s/-RCB-/}/g;

    my @tokens = split(/ /,$line);
    my $result = "";
    my $glue = " ";    # separator owed before the next token ("" = attach)

    foreach my $idx (0 .. $#tokens) {
        my $tok = $tokens[$idx];
        my $piece = $tok;     # text actually emitted for this token
        my $hug_left = 0;     # true: attach to the preceding output
        my $hug_right = 0;    # true: the following token attaches to this one

        if ($tok =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
            # currency and opening punctuation stick to what follows
            $hug_right = 1;
        }
        elsif ($tok =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/) {
            # closing punctuation sticks to what precedes
            $hug_left = 1;
        }
        elsif ($idx > 0 && $tok =~ /^[\'][\p{IsAlpha}]/ && $tokens[$idx-1] =~ /[\p{IsAlnum}]$/) {
            # contraction such as 's or 'll sticks to the word before it
            $hug_left = 1;
        }
        # From here on assume that punctuation has been normalized and is
        # one of `, ``, ', '' only
        elsif ($tok eq "`") {
            # opening single quote: convert to straight quote, stick right
            $piece = "\'";
            $hug_right = 1;
        }
        elsif ($tok eq "``") {
            # opening double quote: convert to straight quote, stick right
            $piece = "\"";
            $hug_right = 1;
        }
        elsif ($tok eq "\'") {
            # closing single quote: convert to straight quote, stick left
            $piece = "\'";
            $hug_left = 1;
        }
        elsif ($tok eq "\'\'") {
            # closing double quote: convert to straight quote, stick left
            $piece = "\"";
            $hug_left = 1;
        }

        $result .= $glue unless $hug_left;
        $result .= $piece;
        $glue = $hug_right ? "" : " ";
    }

    # clean up spaces at head and tail of each line as well as any
    # double-spacing
    $result =~ s/ +/ /g;
    $result =~ s/\n /\n/g;
    $result =~ s/ \n/\n/g;
    $result =~ s/^ //g;
    $result =~ s/ $//g;

    # add trailing break
    if ($result !~ /\n$/) {
        $result .= "\n";
    }

    # optionally uppercase the first letter, skipping leading punctuation
    if ($UPPERCASE_SENT) {
        $result =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e;
    }

    return $result;
}
sub startsWithCJKChar {
my ($str) = @_;
return 0 if length($str) == 0;

View File

@ -30,6 +30,7 @@ my $SKIP_XML = 0;
my $TIMING = 0;
my $NUM_THREADS = 1;
my $NUM_SENTENCES_PER_THREAD = 2000;
my $PENN = 0;
while (@ARGV)
{
@ -43,6 +44,7 @@ while (@ARGV)
/^-time$/ && ($TIMING = 1, next);
/^-threads$/ && ($NUM_THREADS = int(shift), next);
/^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
/^-penn$/ && ($PENN = 1, next);
}
# for time calculation
@ -61,6 +63,7 @@ if ($HELP)
print " -a ... aggressive hyphen splitting.\n";
print " -b ... disable Perl buffering.\n";
print " -time ... enable processing time calculation.\n";
print " -penn ... use Penn treebank-like tokenization.\n";
exit;
}
@ -197,6 +200,11 @@ sub tokenize_batch
sub tokenize
{
my($text) = @_;
if ($PENN) {
return tokenize_penn($text);
}
chomp($text);
$text = " $text ";
@ -309,6 +317,145 @@ sub tokenize
return $text;
}
sub tokenize_penn
{
    # Improved compatibility with Penn Treebank tokenization. Useful if
    # the text is to later be parsed with a PTB-trained parser.
    #
    # Adapted from Robert MacIntyre's sed script:
    # http://www.cis.upenn.edu/~treebank/tokenizer.sed
    #
    # Argument: one line of untokenized text (a trailing newline is allowed).
    # Returns:  the tokenized line with single spaces between tokens, special
    #           characters escaped as entities, terminated by a newline.
    #
    # Reads %NONBREAKING_PREFIX: a word mapped to 1 keeps its trailing period;
    # a word mapped to 2 keeps it only when the next token starts with a digit.

    my($text) = @_;
    chomp($text);

    # remove ASCII junk
    $text =~ s/\s+/ /g;
    $text =~ s/[\000-\037]//g;

    # attempt to get correct directional quotes
    $text =~ s/^``/`` /g;
    $text =~ s/^"/`` /g;
    $text =~ s/^`([^`])/` $1/g;
    $text =~ s/^'/` /g;
    $text =~ s/([ ([{<])"/$1 `` /g;
    $text =~ s/([ ([{<])``/$1 `` /g;
    $text =~ s/([ ([{<])`([^`])/$1 ` $2/g;
    $text =~ s/([ ([{<])'/$1 ` /g;
    # close quotes handled at end

    # protect ellipses from the period handling; restored further down
    $text =~ s=\.\.\.= _ELLIPSIS_ =g;

    # separate out "," except if within numbers (5,300)
    $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
    # separate , pre and post number
    $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
    $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;

    #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g;
    $text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g;

    # Separate out intra-token slashes. PTB tokenization doesn't do this, so
    # the tokens should be merged prior to parsing with a PTB-trained parser
    # (see syntax-hyphen-splitting.perl).
    $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;

    # Assume sentence tokenization has been done first, so split FINAL periods
    # only.
    $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g;
    # however, we may as well split ALL question marks and exclamation points,
    # since they shouldn't have the abbrev.-marker ambiguity problem
    $text =~ s=([?!])= $1 =g;

    # parentheses, brackets, etc.
    $text =~ s=([\]\[\(\){}<>])= $1 =g;
    $text =~ s/\(/-LRB-/g;
    $text =~ s/\)/-RRB-/g;
    $text =~ s/\[/-LSB-/g;
    $text =~ s/\]/-RSB-/g;
    # braces are escaped so they are always treated as literal characters
    $text =~ s/\{/-LCB-/g;
    $text =~ s/\}/-RCB-/g;

    $text =~ s=--= -- =g;

    # First off, add a space to the beginning and end of each line, to reduce
    # necessary number of regexps.
    $text =~ s=$= =;
    $text =~ s=^= =;

    $text =~ s="= '' =g;
    # possessive or close-single-quote
    $text =~ s=([^'])' =$1 ' =g;
    # as in it's, I'm, we'd
    $text =~ s='([sSmMdD]) = '$1 =g;
    $text =~ s='ll = 'll =g;
    $text =~ s='re = 're =g;
    $text =~ s='ve = 've =g;
    $text =~ s=n't = n't =g;
    $text =~ s='LL = 'LL =g;
    $text =~ s='RE = 'RE =g;
    $text =~ s='VE = 'VE =g;
    $text =~ s=N'T = N'T =g;

    $text =~ s= ([Cc])annot = $1an not =g;
    $text =~ s= ([Dd])'ye = $1' ye =g;
    $text =~ s= ([Gg])imme = $1im me =g;
    $text =~ s= ([Gg])onna = $1on na =g;
    $text =~ s= ([Gg])otta = $1ot ta =g;
    $text =~ s= ([Ll])emme = $1em me =g;
    $text =~ s= ([Mm])ore'n = $1ore 'n =g;
    $text =~ s= '([Tt])is = '$1 is =g;
    $text =~ s= '([Tt])was = '$1 was =g;
    $text =~ s= ([Ww])anna = $1an na =g;

    #word token method
    my @words = split(/\s/,$text);
    $text = "";
    for (my $i=0;$i<(scalar(@words));$i++)
    {
        my $word = $words[$i];
        if ( $word =~ /^(\S+)\.$/)
        {
            my $pre = $1;
            if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
            {
                #no change
            }
            elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
            {
                #no change
            }
            else
            {
                # the period ends the sentence: make it a token of its own
                $word = $pre." .";
            }
        }
        $text .= $word." ";
    }

    # restore ellipses
    $text =~ s=_ELLIPSIS_=\.\.\.=g;

    # clean out extra spaces
    # The pattern must require at least one space: a pattern that can match
    # the empty string would insert a space between every pair of characters.
    $text =~ s/ +/ /g;
    $text =~ s=^ *==g;
    $text =~ s= *$==g;

    #escape special chars
    $text =~ s/\&/\&amp;/g;   # escape escape
    $text =~ s/\|/\&#124;/g;  # factor separator
    $text =~ s/\</\&lt;/g;    # xml
    $text =~ s/\>/\&gt;/g;    # xml
    $text =~ s/\'/\&apos;/g;  # xml
    $text =~ s/\"/\&quot;/g;  # xml
    $text =~ s/\[/\&#91;/g;   # syntax non-terminal
    $text =~ s/\]/\&#93;/g;   # syntax non-terminal

    #ensure final line break
    $text .= "\n" unless $text =~ /\n$/;

    return $text;
}
sub load_prefixes
{
my ($language, $PREFIX_REF) = @_;

View File

@ -4,13 +4,14 @@ use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
my ($JAR,$GRAMMAR,$SPLIT_HYPHEN,$MARK_SPLIT,$BINARIZE);
my ($JAR,$GRAMMAR,$SPLIT_HYPHEN,$SPLIT_SLASH,$MARK_SPLIT,$BINARIZE);
die("ERROR: syntax is: parse-de-berkeley.perl [-split-hyphen] [-mark-split] [-binarize] -jar jar-file -gr grammar < in > out\n")
die("ERROR: syntax is: parse-de-berkeley.perl [-split-hyphen] [-split-slash] [-mark-split] [-binarize] -jar jar-file -gr grammar < in > out\n")
unless &GetOptions
('jar=s' => \$JAR,
'gr=s' => \$GRAMMAR,
'split-hyphen' => \$SPLIT_HYPHEN,
'split-slash' => \$SPLIT_SLASH,
'mark-split' => \$MARK_SPLIT,
'binarize' => \$BINARIZE)
&& defined($JAR) && defined($GRAMMAR);
@ -21,6 +22,8 @@ die("ERROR: could not find grammar file '$GRAMMAR'\n") unless -e $GRAMMAR;
$BINARIZE = $BINARIZE ? "-binarize" : "";
$SPLIT_HYPHEN = $SPLIT_HYPHEN ? "| $RealBin/syntax-hyphen-splitting.perl $BINARIZE" : "";
$SPLIT_HYPHEN .= " -mark-split" if $SPLIT_HYPHEN && $MARK_SPLIT;
$SPLIT_SLASH = $SPLIT_SLASH ? "| $RealBin/syntax-hyphen-splitting.perl -slash $BINARIZE" : "";
$SPLIT_SLASH .= " -mark-split" if $SPLIT_SLASH && $MARK_SPLIT;
my $tmp = "/tmp/parse-de-berkeley.$$";
@ -28,6 +31,8 @@ open(TMP,"| $RealBin/../../tokenizer/deescape-special-chars.perl > $tmp");
while(<STDIN>) {
# unsplit hyphens
s/ \@-\@ /-/g if $SPLIT_HYPHEN;
# unsplit slashes
s/ \@\/\@ /\//g if $SPLIT_SLASH;
# handle parentheses
s/\(/*LRB*/g;
@ -40,7 +45,7 @@ while(<STDIN>) {
}
close(TMP);
my $cmd = "cat $tmp | java -Xmx10000m -Xms10000m -Dfile.encoding=UTF8 -jar $JAR -gr $GRAMMAR -maxLength 1000 $BINARIZE | $RealBin/berkeleyparsed2mosesxml.perl $SPLIT_HYPHEN";
my $cmd = "cat $tmp | java -Xmx10000m -Xms10000m -Dfile.encoding=UTF8 -jar $JAR -gr $GRAMMAR -maxLength 1000 $BINARIZE | $RealBin/berkeleyparsed2mosesxml.perl $SPLIT_HYPHEN $SPLIT_SLASH";
print STDERR $cmd."\n";
open(PARSE,"$cmd|");

View File

@ -5,8 +5,11 @@ use Getopt::Long "GetOptions";
my $MARK_HYP = 0;
my $BINARIZE = 0;
my $SLASH = 0;
die unless &GetOptions('binarize' => \$BINARIZE,'mark-split' => \$MARK_HYP);
die unless &GetOptions('binarize' => \$BINARIZE,'mark-split' => \$MARK_HYP,'slash' => \$SLASH);
my $punc = $SLASH ? "/" : "-";
while(<STDIN>) {
chop;
@ -15,24 +18,26 @@ while(<STDIN>) {
if (/^</ || />$/) {
push @OUT, $_;
}
elsif(/([\p{IsAlnum}])\-([\p{IsAlnum}])/) {
s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
elsif(/([\p{IsAlnum}])$punc([\p{IsAlnum}])/) {
s/([\p{IsAlnum}])$punc([\p{IsAlnum}])/$1 \@$punc\@ $2/g;
my @WORD = split;
$OUT[$#OUT] =~ /label=\"([^\"]+)\"/;
my $pos = $1;
my $mark = $SLASH ? "SLASH-" : "HYP-";
my $punc_pos = $SLASH ? "SLASH" : "HYP";
if ($MARK_HYP) {
$OUT[$#OUT] =~ s/label=\"/label=\"HYP-/;
$OUT[$#OUT] =~ s/label=\"/label=\"$mark/;
}
if ($BINARIZE) {
for(my $i=0;$i<scalar(@WORD)-2;$i++) {
push @OUT,"<tree label=\"\@".($MARK_HYP ? "HYP-" : "")."$pos\">";
push @OUT,"<tree label=\"\@".($MARK_HYP ? $mark : "")."$pos\">";
}
}
for(my $i=0;$i<scalar(@WORD);$i++) {
if ($BINARIZE && $i>=2) {
push @OUT, "</tree>";
}
push @OUT,"<tree label=\"".(($WORD[$i] eq "\@-\@") ? "HYP" : $pos)."\"> $WORD[$i] </tree>";
push @OUT,"<tree label=\"".(($WORD[$i] eq "\@$punc\@") ? $punc_pos : $pos)."\"> $WORD[$i] </tree>";
}
}
else {