From b5584fdecfad33221aeae74750f24ecd1658ad7a Mon Sep 17 00:00:00 2001
From: Phil Williams
Date: Tue, 23 Jul 2013 14:55:16 +0100
Subject: [PATCH 1/5] extract-ghkm: workaround for extract-parallel issue

Don't write glue grammar or unknown word label files unless the sentence
offset is 0.  This prevents multiple instances of extract-ghkm writing to
the same two files when extract-parallel is used.

TODO Better solutions might be:

1. modify extract-parallel so that it only configures one instance of
   extract-ghkm to write the glue / unknown-lhs files (like the current
   workaround, this assumes file chunks are representative of the whole)

2. add multithreading support directly to extract-ghkm

3. write distinct output files for each extract-ghkm instance and combine
   them on completion
---
 phrase-extract/extract-ghkm/ExtractGHKM.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index 80568ccd5..b0102e8f2 100644
--- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -388,6 +388,12 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
   if (vm.count("UnpairedExtractFormat")) {
     options.unpairedExtractFormat = true;
   }
+
+  // Workaround for extract-parallel issue.
+  if (options.sentenceOffset > 0) {
+    options.glueGrammarFile.clear();
+    options.unknownWordFile.clear();
+  }
 }
 
 void ExtractGHKM::Error(const std::string &msg) const

From 08f64dea2803ec0424d586013758fe9afe64c9df Mon Sep 17 00:00:00 2001
From: Ian Johnson
Date: Wed, 24 Jul 2013 11:52:14 +0100
Subject: [PATCH 2/5] Arrow pipeline submodules now use https protocol.

---
 .gitmodules                        | 4 ++--
 contrib/arrow-pipelines/python/pcl | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index a0fb859db..51ab8750b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "contrib/arrow-pipelines/python/pcl"]
 	path = contrib/arrow-pipelines/python/pcl
-	url = git://github.com/ianj-als/pcl.git
+	url = https://github.com/ianj-als/pcl.git
 [submodule "contrib/omtc/omtc"]
 	path = contrib/omtc/omtc
-	url = git://github.com/ianj-als/omtc.git
+	url = https://github.com/ianj-als/omtc.git
diff --git a/contrib/arrow-pipelines/python/pcl b/contrib/arrow-pipelines/python/pcl
index b4334b8f2..408b85900 160000
--- a/contrib/arrow-pipelines/python/pcl
+++ b/contrib/arrow-pipelines/python/pcl
@@ -1 +1 @@
-Subproject commit b4334b8f276d401c38b1163c4c33ad6b840e28be
+Subproject commit 408b85900ac1c84c3224f478da8f290c92ca328a

From 71ae8c9d19c1e31df44a328a666fe0aa7de2d80b Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Wed, 24 Jul 2013 12:13:11 +0100
Subject: [PATCH 3/5] LM/Factory.cpp -> FF/Factory.cpp oops

---
 moses/Jamfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/moses/Jamfile b/moses/Jamfile
index 20ac3cabf..0572c70ac 100644
--- a/moses/Jamfile
+++ b/moses/Jamfile
@@ -64,7 +64,7 @@ lib moses :
 	ThreadPool.cpp
 	SyntacticLanguageModel.cpp
 	*Test.cpp Mock*.cpp
-	LM/Factory.cpp
+	FF/Factory.cpp
 	]
 	headers FF_Factory.o LM//LM TranslationModel/CompactPT//CompactPT synlm ThreadPool rt
 	..//search ../util/double-conversion//double-conversion ..//z ../OnDiskPt//OnDiskPt ;

From 1238041f98f30b23a6a61ca8f688f67b82d3c026 Mon Sep 17 00:00:00 2001
From: Phil Williams
Date: Wed, 24 Jul 2013 13:41:21 +0100
Subject: [PATCH 4/5] Add option to do Penn Treebank style tokenization

tokenizer.perl and detokenizer.perl now have an option called -penn which
does Penn Treebank-like tokenization (English only).  This is useful if
your pipeline involves processing the corpus with tools trained on
PTB-tokenized text.

Unlike PTB, the tokenizer splits on slashes (e.g. "Monday/Tuesday" becomes
"Monday", "@/@", "Tuesday").  If using parse-de-berkeley.perl, the option
-split-slash re-joins tokens that are separated by slashes for parsing then
splits them afterwards.
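To make the slash convention concrete, here is a tiny self-contained Perl
illustration of the round trip described above. The sample sentence is
invented; the two substitutions are the ones this patch adds to
tokenizer.perl and parse-de-berkeley.perl respectively.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # tokenizer.perl -penn separates intra-token slashes with @/@ ...
    my $sentence = "They meet Monday/Tuesday .";   # sample input (invented)
    (my $split = $sentence) =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;
    print "$split\n";    # They meet Monday @/@ Tuesday .

    # ... and parse-de-berkeley.perl -split-slash re-joins them before parsing.
    (my $joined = $split) =~ s/ \@\/\@ /\//g;
    print "$joined\n";   # They meet Monday/Tuesday .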
---
 scripts/tokenizer/detokenizer.perl            | 120 ++++++++++++--
 scripts/tokenizer/tokenizer.perl              | 147 ++++++++++++++++++
 .../training/wrappers/parse-de-berkeley.perl  |  11 +-
 .../wrappers/syntax-hyphen-splitting.perl     |  17 +-
 4 files changed, 276 insertions(+), 19 deletions(-)

diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index 488ff7b5a..a8de7e86e 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -14,6 +14,7 @@ my $language = "en";
 my $QUIET = 0;
 my $HELP = 0;
 my $UPPERCASE_SENT = 0;
+my $PENN = 0;
 
 while (@ARGV) {
   $_ = shift;
@@ -22,14 +23,16 @@ while (@ARGV) {
   /^-q$/ && ($QUIET = 1, next);
   /^-h$/ && ($HELP = 1, next);
   /^-u$/ && ($UPPERCASE_SENT = 1, next);
+  /^-penn$/ && ($PENN = 1, next);
 }
 
 if ($HELP) {
   print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n";
   print "Options:\n";
-  print "  -u  ... uppercase the first char in the final sentence.\n";
-  print "  -q  ... don't report detokenizer revision.\n";
-  print "  -b  ... disable Perl buffering.\n";
+  print "  -u     ... uppercase the first char in the final sentence.\n";
+  print "  -q     ... don't report detokenizer revision.\n";
+  print "  -b     ... disable Perl buffering.\n";
+  print "  -penn  ... assume input is tokenized as per tokenizer.perl's -penn option.\n";
   exit;
 }
 
@@ -37,6 +40,11 @@ if ($language !~ /^(cs|en|fr|it)$/) {
   print STDERR "Warning: No built-in rules for language $language.\n"
 }
 
+if ($PENN && $language ne "en") {
+  print STDERR "Error: -penn option only supported for English text.\n";
+  exit;
+}
+
 if (!$QUIET) {
   print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n";
   print STDERR "Language: $language\n";
@@ -46,8 +54,9 @@ while(<STDIN>) {
   if (/^<.+>$/ || /^\s*$/) {
     #don't try to detokenize XML/HTML tag lines
     print $_;
-  }
-  else {
+  } elsif ($PENN) {
+    print &detokenize_penn($_);
+  } else {
     print &detokenize($_);
   }
 }
@@ -60,12 +69,9 @@ sub ucsecondarg {
   return $arg1.uc($arg2);
 }
 
-sub detokenize {
-  my($text) = @_;
-  chomp($text);
-  $text = " $text ";
-  $text =~ s/ \@\-\@ /-/g;
+sub deescape { # de-escape special chars
+  my ($text) = @_;
   $text =~ s/\&bar;/\|/g;   # factor separator (legacy)
   $text =~ s/\&#124;/\|/g;  # factor separator
   $text =~ s/\&lt;/\</g;    # xml

+sub detokenize_penn {
+  my($text) = @_;
+  chomp($text);
+
+  my @words = split(/ /,$text);
+  $text = "";
+  my $prependSpace = " ";
+  for (my $i=0;$i<(scalar(@words));$i++) {
+    if (($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
+      # left-shift the contraction
+      $text=$text.$words[$i];
+      $prependSpace = " ";
+    } elsif ($words[$i] eq "`") { # Assume that punctuation has been normalized and is one of `, ``, ', '' only
+      # opening single quote: convert to straight quote and right-shift
+      $text = $text.$prependSpace."\'";
+      $prependSpace = "";
+    } elsif ($words[$i] eq "``") {
+      # opening double quote: convert to straight quote and right-shift
+      $text = $text.$prependSpace."\"";
+      $prependSpace = "";
+    } elsif ($words[$i] eq "\'") {
+      # closing single quote: convert to straight quote and left shift
+      $text = $text."\'";
+      $prependSpace = " ";
+    } elsif ($words[$i] eq "\'\'") {
+      # closing double quote: convert to straight quote and left shift
+      $text = $text."\"";
+      $prependSpace = " ";
+    } else {
+      $text = $text.$prependSpace.$words[$i];
+      $prependSpace = " ";
+    }
+  }
+
+  # clean up spaces at head and tail of each line as well as any double-spacing
+  $text =~ s/ +/ /g;
+  $text =~ s/\n /\n/g;
+  $text =~ s/ \n/\n/g;
+  $text =~ s/^ //g;
+  $text =~ s/ $//g;
+
+  # add trailing break
+  $text .= "\n" unless $text =~ /\n$/;
+
+  $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+
+  return $text;
+}
+
 sub startsWithCJKChar {
   my ($str) = @_;
   return 0 if length($str) == 0;
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index 986a2dfb5..1f68ecf63 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -30,6 +30,7 @@ my $SKIP_XML = 0;
 my $TIMING = 0;
 my $NUM_THREADS = 1;
 my $NUM_SENTENCES_PER_THREAD = 2000;
+my $PENN = 0;
 
 while (@ARGV)
 {
@@ -43,6 +44,7 @@ while (@ARGV)
   /^-time$/ && ($TIMING = 1, next);
   /^-threads$/ && ($NUM_THREADS = int(shift), next);
   /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
+  /^-penn$/ && ($PENN = 1, next);
 }
 
 # for time calculation
@@ -61,6 +63,7 @@ if ($HELP)
   print "  -a     ... aggressive hyphen splitting.\n";
   print "  -b     ... disable Perl buffering.\n";
   print "  -time  ... enable processing time calculation.\n";
+  print "  -penn  ... use Penn treebank-like tokenization.\n";
   exit;
 }
 
@@ -197,6 +200,11 @@ sub tokenize_batch
 sub tokenize
 {
   my($text) = @_;
+
+  if ($PENN) {
+    return tokenize_penn($text);
+  }
+
   chomp($text);
   $text = " $text ";
 
@@ -309,6 +317,145 @@ sub tokenize
   return $text;
 }
 
+sub tokenize_penn
+{
+  # Improved compatibility with Penn Treebank tokenization. Useful if
+  # the text is to later be parsed with a PTB-trained parser.
+  #
+  # Adapted from Robert MacIntyre's sed script:
+  #   http://www.cis.upenn.edu/~treebank/tokenizer.sed
+
+  my($text) = @_;
+  chomp($text);
+
+  # remove ASCII junk
+  $text =~ s/\s+/ /g;
+  $text =~ s/[\000-\037]//g;
+
+  # attempt to get correct directional quotes
+  $text =~ s/^``/`` /g;
+  $text =~ s/^"/`` /g;
+  $text =~ s/^`([^`])/` $1/g;
+  $text =~ s/^'/` /g;
+  $text =~ s/([ ([{<])"/$1 `` /g;
+  $text =~ s/([ ([{<])``/$1 `` /g;
+  $text =~ s/([ ([{<])`([^`])/$1 ` $2/g;
+  $text =~ s/([ ([{<])'/$1 ` /g;
+  # close quotes handled at end
+
+  $text =~ s=\.\.\.= _ELLIPSIS_ =g;
+
+  # separate out "," except if within numbers (5,300)
+  $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+  # separate , pre and post number
+  $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+  $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+  #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g;
+$text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g;
+
+  # Separate out intra-token slashes.  PTB tokenization doesn't do this, so
+  # the tokens should be merged prior to parsing with a PTB-trained parser
+  # (see syntax-hyphen-splitting.perl).
+  $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;
+
+  # Assume sentence tokenization has been done first, so split FINAL periods
+  # only.
+  $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g;
+  # however, we may as well split ALL question marks and exclamation points,
+  # since they shouldn't have the abbrev.-marker ambiguity problem
+  $text =~ s=([?!])= $1 =g;
+
+  # parentheses, brackets, etc.
+  $text =~ s=([\]\[\(\){}<>])= $1 =g;
+  $text =~ s/\(/-LRB-/g;
+  $text =~ s/\)/-RRB-/g;
+  $text =~ s/\[/-LSB-/g;
+  $text =~ s/\]/-RSB-/g;
+  $text =~ s/{/-LCB-/g;
+  $text =~ s/}/-RCB-/g;
+
+  $text =~ s=--= -- =g;
+
+  # First off, add a space to the beginning and end of each line, to reduce
+  # necessary number of regexps.
+  $text =~ s=$= =;
+  $text =~ s=^= =;
+
+  $text =~ s="= '' =g;
+  # possessive or close-single-quote
+  $text =~ s=([^'])' =$1 ' =g;
+  # as in it's, I'm, we'd
+  $text =~ s='([sSmMdD]) = '$1 =g;
+  $text =~ s='ll = 'll =g;
+  $text =~ s='re = 're =g;
+  $text =~ s='ve = 've =g;
+  $text =~ s=n't = n't =g;
+  $text =~ s='LL = 'LL =g;
+  $text =~ s='RE = 'RE =g;
+  $text =~ s='VE = 'VE =g;
+  $text =~ s=N'T = N'T =g;
+
+  $text =~ s= ([Cc])annot = $1an not =g;
+  $text =~ s= ([Dd])'ye = $1' ye =g;
+  $text =~ s= ([Gg])imme = $1im me =g;
+  $text =~ s= ([Gg])onna = $1on na =g;
+  $text =~ s= ([Gg])otta = $1ot ta =g;
+  $text =~ s= ([Ll])emme = $1em me =g;
+  $text =~ s= ([Mm])ore'n = $1ore 'n =g;
+  $text =~ s= '([Tt])is = '$1 is =g;
+  $text =~ s= '([Tt])was = '$1 was =g;
+  $text =~ s= ([Ww])anna = $1an na =g;
+
+  #word token method
+  my @words = split(/\s/,$text);
+  $text = "";
+  for (my $i=0;$i<(scalar(@words));$i++)
+  {
+    my $word = $words[$i];
+    if ( $word =~ /^(\S+)\.$/)
+    {
+      my $pre = $1;
+      if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+      {
+        # no change
+      }
+      elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+      {
+        # no change
+      }
+      else
+      {
+        $word = $pre." .";
+      }
+    }
+    $text .= $word." ";
+  }
+
+  # restore ellipses
+  $text =~ s=_ELLIPSIS_=\.\.\.=g;
+
+  # escape special chars
+  $text =~ s/\&/\&amp;/g;    # escape escape
+  $text =~ s/\|/\&#124;/g;   # factor separator
+  $text =~ s/\</\&lt;/g;     # xml
+  $text =~ s/\>/\&gt;/g;     # xml
+  $text =~ s/\'/\&apos;/g;   # xml
+  $text =~ s/\"/\&quot;/g;   # xml
+  $text =~ s/\[/\&#91;/g;    # syntax non-terminal
+  $text =~ s/\]/\&#93;/g;    # syntax non-terminal
+
+  #ensure final line break
+  $text .= "\n" unless $text =~ /\n$/;
+
+  return $text;
+}
+
 sub load_prefixes
 {
   my ($language, $PREFIX_REF) = @_;
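As a quick illustration of the bracket and ellipsis handling in the
tokenize_penn hunk above, the following small Perl sketch reproduces just a
few of its substitutions; the sample string is made up and only the steps
shown are taken from the patch.

    #!/usr/bin/perl
    use strict;
    use warnings;

    my $text = "He said (quietly) ... nothing";   # made-up sample
    $text =~ s=\.\.\.= _ELLIPSIS_ =g;       # protect ellipses
    $text =~ s=([\]\[\(\){}<>])= $1 =g;     # split brackets off
    $text =~ s/\(/-LRB-/g;                  # PTB bracket tokens
    $text =~ s/\)/-RRB-/g;
    $text =~ s=_ELLIPSIS_=\.\.\.=g;         # restore ellipses
    print "$text\n";   # prints (modulo extra spaces): He said -LRB- quietly -RRB- ... nothing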
"| $RealBin/syntax-hyphen-splitting.perl -slash $BINARIZE" : ""; +$SPLIT_SLASH .= " -mark-split" if $SPLIT_SLASH && $MARK_SPLIT; my $tmp = "/tmp/parse-de-berkeley.$$"; @@ -28,6 +31,8 @@ open(TMP,"| $RealBin/../../tokenizer/deescape-special-chars.perl > $tmp"); while() { # unsplit hyphens s/ \@-\@ /-/g if $SPLIT_HYPHEN; + # unsplit slashes + s/ \@\/\@ /\//g if $SPLIT_SLASH; # handle parentheses s/\(/*LRB*/g; @@ -40,7 +45,7 @@ while() { } close(TMP); -my $cmd = "cat $tmp | java -Xmx10000m -Xms10000m -Dfile.encoding=UTF8 -jar $JAR -gr $GRAMMAR -maxLength 1000 $BINARIZE | $RealBin/berkeleyparsed2mosesxml.perl $SPLIT_HYPHEN"; +my $cmd = "cat $tmp | java -Xmx10000m -Xms10000m -Dfile.encoding=UTF8 -jar $JAR -gr $GRAMMAR -maxLength 1000 $BINARIZE | $RealBin/berkeleyparsed2mosesxml.perl $SPLIT_HYPHEN $SPLIT_SLASH"; print STDERR $cmd."\n"; open(PARSE,"$cmd|"); diff --git a/scripts/training/wrappers/syntax-hyphen-splitting.perl b/scripts/training/wrappers/syntax-hyphen-splitting.perl index 69290e51d..d78106fe2 100755 --- a/scripts/training/wrappers/syntax-hyphen-splitting.perl +++ b/scripts/training/wrappers/syntax-hyphen-splitting.perl @@ -5,8 +5,11 @@ use Getopt::Long "GetOptions"; my $MARK_HYP = 0; my $BINARIZE = 0; +my $SLASH = 0; -die unless &GetOptions('binarize' => \$BINARIZE,'mark-split' => \$MARK_HYP); +die unless &GetOptions('binarize' => \$BINARIZE,'mark-split' => \$MARK_HYP,'slash' => \$SLASH); + +my $punc = $SLASH ? "/" : "-"; while() { chop; @@ -15,24 +18,26 @@ while() { if (/^$/) { push @OUT, $_; } - elsif(/([\p{IsAlnum}])\-([\p{IsAlnum}])/) { - s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g; + elsif(/([\p{IsAlnum}])$punc([\p{IsAlnum}])/) { + s/([\p{IsAlnum}])$punc([\p{IsAlnum}])/$1 \@$punc\@ $2/g; my @WORD = split; $OUT[$#OUT] =~ /label=\"([^\"]+)\"/; my $pos = $1; + my $mark = $SLASH ? "SLASH-" : "HYP-"; + my $punc_pos = $SLASH ? "SLASH" : "HYP"; if ($MARK_HYP) { - $OUT[$#OUT] =~ s/label=\"/label=\"HYP-/; + $OUT[$#OUT] =~ s/label=\"/label=\"$mark/; } if ($BINARIZE) { for(my $i=0;$i"; + push @OUT,""; } } for(my $i=0;$i=2) { push @OUT, ""; } - push @OUT," $WORD[$i] "; + push @OUT," $WORD[$i] "; } } else { From 1813f9784b9d92a4661b1160cb46b8c750bb7bcd Mon Sep 17 00:00:00 2001 From: Achim Ruopp Date: Wed, 24 Jul 2013 12:44:53 -0400 Subject: [PATCH 5/5] Additional factoring to allow more NE recognizers; bug fixes --- scripts/generic/ph_numbers.perl | 56 +++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/scripts/generic/ph_numbers.perl b/scripts/generic/ph_numbers.perl index 8f285bf5d..fcd732606 100755 --- a/scripts/generic/ph_numbers.perl +++ b/scripts/generic/ph_numbers.perl @@ -24,49 +24,57 @@ sub run { my $numberSymbol = $opts{m} || '@NUM@'; while(<>) { chomp; - print recognize($_,$opts{c},$opts{l},$numberSymbol,$_),"\n"; + print mark_numbers($_,$opts{c},$opts{l},$numberSymbol,$_),"\n"; } } -sub recognize { - my $line = shift; +sub mark_numbers { + my $input = shift; my $corpusMode = shift; my $legacyMode = shift; my $numberSymbol = shift || '@NUM@'; - # [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? 
From 1813f9784b9d92a4661b1160cb46b8c750bb7bcd Mon Sep 17 00:00:00 2001
From: Achim Ruopp
Date: Wed, 24 Jul 2013 12:44:53 -0400
Subject: [PATCH 5/5] Additional factoring to allow more NE recognizers; bug fixes

---
 scripts/generic/ph_numbers.perl | 56 +++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/scripts/generic/ph_numbers.perl b/scripts/generic/ph_numbers.perl
index 8f285bf5d..fcd732606 100755
--- a/scripts/generic/ph_numbers.perl
+++ b/scripts/generic/ph_numbers.perl
@@ -24,49 +24,57 @@ sub run {
   my $numberSymbol = $opts{m} || '@NUM@';
   while(<>) {
     chomp;
-    print recognize($_,$opts{c},$opts{l},$numberSymbol,$_),"\n";
+    print mark_numbers($_,$opts{c},$opts{l},$numberSymbol,$_),"\n";
   }
 }
 
-sub recognize {
-  my $line = shift;
+sub mark_numbers {
+  my $input = shift;
   my $corpusMode = shift;
   my $legacyMode = shift;
   my $numberSymbol = shift || '@NUM@';
-  # [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?
-  # while(/\G(.*?)\s*([+-]?\p{Digit}+[+-\p{Digit}\.,eE])/) {
+  my $numref = recognize($input);
+  my $input_length = length($input);
   my $output = "";
-  my $remainder = "";
-  while($line =~ /\G(.*?)(\s*)([+-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+-eE]*)/g) {
-    my $between = $1;
-    my $number = $3;
-    print STDERR "Between: x${between}x\n" if $debug;
-    print STDERR "Number: x${number}x\n" if $debug;
-    # If there are more numbers separated by whitespace, add these
-    my $numberContinuation = "";
-    while($line = /\G(\s+)([\p{Digit}\.,+-eE]*)/g) {
-      $numberContinuation .= $1.$2;
-    }
-    $number .= $numberContinuation;
-    $output .= $between;
+  my $position = 0;
+  for(my $i = 0 ; $i < scalar(@{$numref}) ; $i++) {
+    my $numstart = $numref->[$i][0];
+    my $numend = $numref->[$i][1];
+    if($position < $numstart) {
+      $output .= substr($input,$position,$numstart-$position);
+    }
+    my $number = substr($input,$numstart,$numend-$numstart);
     if($corpusMode) {
-      $output .= $2.$numberSymbol;
+      $output .= $number;
     } else {
       if($legacyMode) {
-        $output .= $2."$numberSymbol";
+        $output .= "$numberSymbol";
      } else {
-        $output .= $2."$numberSymbol";
+        $output .= "$numberSymbol";
      }
    }
-    $remainder = $';
+    $position = $numend;
  }
-  print STDERR "Remainder: x".$remainder."x\n" if $debug;
-  print STDERR "\n" if $debug;
-  $output .= $remainder if $remainder;
+  $output .= substr($input,$position);
  return $output;
 }
 
+sub recognize {
+  my $input = shift;
+
+  my @recognized = ();
+  while($input =~ /\G(.*?)(\s*)([+\-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+\-eE]*)/g) {
+    my $start = $-[3];
+    my $end = $+[3];
+    while($input =~ /\G(\s+)(\p{Digit}+[\p{Digit}\.,+\-eE]*)/gc) {
+      $end = $+[2];
+    }
+    push @recognized,[$start,$end];
+  }
+  return \@recognized;
+}
+
 1;
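To make the new two-phase design concrete (span detection separated from
replacement, so that other named-entity recognizers can plug in), here is a
small self-contained Perl sketch in the same spirit as recognize() and
mark_numbers(). The helper name find_number_spans and the sample sentence are
invented for illustration; only the number regex and the '@NUM@' default come
from the patch.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # Phase 1: find [start, end) spans of numbers, as sub recognize does.
    sub find_number_spans {
        my ($input) = @_;
        my @spans;
        while ($input =~ /([+\-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+\-eE]*)/g) {
            push @spans, [ $-[1], $+[1] ];
        }
        return \@spans;
    }

    # Phase 2: splice a placeholder over each span, as sub mark_numbers does.
    my $input  = "The price rose 12,5 percent in 2013 .";   # sample input (invented)
    my $symbol = '@NUM@';
    my $output = "";
    my $pos    = 0;
    for my $span (@{ find_number_spans($input) }) {
        my ($start, $end) = @$span;
        $output .= substr($input, $pos, $start - $pos);
        $output .= $symbol;
        $pos = $end;
    }
    $output .= substr($input, $pos);
    print "$output\n";   # The price rose @NUM@ percent in @NUM@ .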