From b5584fdecfad33221aeae74750f24ecd1658ad7a Mon Sep 17 00:00:00 2001
From: Phil Williams
Date: Tue, 23 Jul 2013 14:55:16 +0100
Subject: [PATCH 1/5] extract-ghkm: workaround for extract-parallel issue

Don't write glue grammar or unknown word label files unless the sentence
offset is 0.  This prevents multiple instances of extract-ghkm writing to
the same two files when extract-parallel is used.

TODO Better solutions might be:

1. modify extract-parallel so that it only configures one instance of
   extract-ghkm to write the glue / unknown-lhs files (like the current
   workaround, this assumes file chunks are representative of the whole)

2. add multithreading support directly to extract-ghkm

3. write distinct output files for each extract-ghkm instance and combine
   them on completion
---
 phrase-extract/extract-ghkm/ExtractGHKM.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index 80568ccd5..b0102e8f2 100644
--- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -388,6 +388,12 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
   if (vm.count("UnpairedExtractFormat")) {
     options.unpairedExtractFormat = true;
   }
+
+  // Workaround for extract-parallel issue.
+  if (options.sentenceOffset > 0) {
+    options.glueGrammarFile.clear();
+    options.unknownWordFile.clear();
+  }
 }
 
 void ExtractGHKM::Error(const std::string &msg) const

From 08f64dea2803ec0424d586013758fe9afe64c9df Mon Sep 17 00:00:00 2001
From: Ian Johnson
Date: Wed, 24 Jul 2013 11:52:14 +0100
Subject: [PATCH 2/5] Arrow pipeline submodules now use https protocol.

---
 .gitmodules                        | 4 ++--
 contrib/arrow-pipelines/python/pcl | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index a0fb859db..51ab8750b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "contrib/arrow-pipelines/python/pcl"]
 	path = contrib/arrow-pipelines/python/pcl
-	url = git://github.com/ianj-als/pcl.git
+	url = https://github.com/ianj-als/pcl.git
 [submodule "contrib/omtc/omtc"]
 	path = contrib/omtc/omtc
-	url = git://github.com/ianj-als/omtc.git
+	url = https://github.com/ianj-als/omtc.git
diff --git a/contrib/arrow-pipelines/python/pcl b/contrib/arrow-pipelines/python/pcl
index b4334b8f2..408b85900 160000
--- a/contrib/arrow-pipelines/python/pcl
+++ b/contrib/arrow-pipelines/python/pcl
@@ -1 +1 @@
-Subproject commit b4334b8f276d401c38b1163c4c33ad6b840e28be
+Subproject commit 408b85900ac1c84c3224f478da8f290c92ca328a

From 71ae8c9d19c1e31df44a328a666fe0aa7de2d80b Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Wed, 24 Jul 2013 12:13:11 +0100
Subject: [PATCH 3/5] LM/Factory.cpp -> FF/Factory.cpp oops

---
 moses/Jamfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/moses/Jamfile b/moses/Jamfile
index 20ac3cabf..0572c70ac 100644
--- a/moses/Jamfile
+++ b/moses/Jamfile
@@ -64,7 +64,7 @@ lib moses :
 	ThreadPool.cpp
 	SyntacticLanguageModel.cpp
 	*Test.cpp Mock*.cpp
-	LM/Factory.cpp
+	FF/Factory.cpp
 	]
 	headers FF_Factory.o LM//LM TranslationModel/CompactPT//CompactPT synlm ThreadPool rt
 	..//search ../util/double-conversion//double-conversion ..//z ../OnDiskPt//OnDiskPt ;

From 1238041f98f30b23a6a61ca8f688f67b82d3c026 Mon Sep 17 00:00:00 2001
From: Phil Williams
Date: Wed, 24 Jul 2013 13:41:21 +0100
Subject: [PATCH 4/5] Add option to do Penn Treebank style tokenization

tokenizer.perl and detokenizer.perl now have an option called -penn which
does Penn Treebank-like tokenization (English only).  This is useful if
your pipeline involves processing the corpus with tools trained on
PTB-tokenized text.

Unlike PTB, the tokenizer splits on slashes (e.g. "Monday/Tuesday" becomes
"Monday", "@/@", "Tuesday").  If using parse-de-berkeley.perl, the option
-split-slash re-joins tokens that are separated by slashes for parsing then
splits them afterwards.
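To make the slash convention concrete, here is a tiny self-contained Perl
illustration of the round trip described above. The sample sentence is
invented; the two substitutions are the ones this patch adds to
tokenizer.perl and parse-de-berkeley.perl respectively.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # tokenizer.perl -penn separates intra-token slashes with @/@ ...
    my $sentence = "They meet Monday/Tuesday .";   # sample input (invented)
    (my $split = $sentence) =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;
    print "$split\n";    # They meet Monday @/@ Tuesday .

    # ... and parse-de-berkeley.perl -split-slash re-joins them before parsing.
    (my $joined = $split) =~ s/ \@\/\@ /\//g;
    print "$joined\n";   # They meet Monday/Tuesday .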
---
 scripts/tokenizer/detokenizer.perl            | 120 ++++++++++++--
 scripts/tokenizer/tokenizer.perl              | 147 ++++++++++++++++++
 .../training/wrappers/parse-de-berkeley.perl  |  11 +-
 .../wrappers/syntax-hyphen-splitting.perl     |  17 +-
 4 files changed, 276 insertions(+), 19 deletions(-)

diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index 488ff7b5a..a8de7e86e 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -14,6 +14,7 @@ my $language = "en";
 my $QUIET = 0;
 my $HELP = 0;
 my $UPPERCASE_SENT = 0;
+my $PENN = 0;
 
 while (@ARGV) {
   $_ = shift;
@@ -22,14 +23,16 @@ while (@ARGV) {
   /^-q$/ && ($QUIET = 1, next);
   /^-h$/ && ($HELP = 1, next);
   /^-u$/ && ($UPPERCASE_SENT = 1, next);
+  /^-penn$/ && ($PENN = 1, next);
 }
 
 if ($HELP) {
   print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n";
   print "Options:\n";
-  print "  -u  ... uppercase the first char in the final sentence.\n";
-  print "  -q  ... don't report detokenizer revision.\n";
-  print "  -b  ... disable Perl buffering.\n";
+  print "  -u     ... uppercase the first char in the final sentence.\n";
+  print "  -q     ... don't report detokenizer revision.\n";
+  print "  -b     ... disable Perl buffering.\n";
+  print "  -penn  ... assume input is tokenized as per tokenizer.perl's -penn option.\n";
   exit;
 }
 
@@ -37,6 +40,11 @@ if ($language !~ /^(cs|en|fr|it)$/) {
   print STDERR "Warning: No built-in rules for language $language.\n"
 }
 
+if ($PENN && $language ne "en") {
+  print STDERR "Error: -penn option only supported for English text.\n";
+  exit;
+}
+
 if (!$QUIET) {
   print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n";
   print STDERR "Language: $language\n";
@@ -46,8 +54,9 @@ while(<STDIN>) {
   if (/^<.+>$/ || /^\s*$/) {
     #don't try to detokenize XML/HTML tag lines
     print $_;
-  }
-  else {
+  } elsif ($PENN) {
+    print &detokenize_penn($_);
+  } else {
     print &detokenize($_);
   }
 }
@@ -60,12 +69,9 @@ sub ucsecondarg {
   return $arg1.uc($arg2);
 }
 
-sub detokenize {
-  my($text) = @_;
-  chomp($text);
-  $text = " $text ";
-  $text =~ s/ \@\-\@ /-/g;
+sub deescape { # de-escape special chars
+  my ($text) = @_;
   $text =~ s/\&bar;/\|/g;   # factor separator (legacy)
   $text =~ s/\&#124;/\|/g;  # factor separator
   $text =~ s/\&lt;/\</g;    # xml

+sub detokenize_penn {
+  my($text) = @_;
+  chomp($text);
+
+  my @words = split(/ /,$text);
+  $text = "";
+  my $prependSpace = " ";
+  for (my $i=0;$i<(scalar(@words));$i++) {
+    if (($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
+      # left-shift the contraction
+      $text=$text.$words[$i];
+      $prependSpace = " ";
+    } elsif ($words[$i] eq "`") { # Assume that punctuation has been normalized and is one of `, ``, ', '' only
+      # opening single quote: convert to straight quote and right-shift
+      $text = $text.$prependSpace."\'";
+      $prependSpace = "";
+    } elsif ($words[$i] eq "``") {
+      # opening double quote: convert to straight quote and right-shift
+      $text = $text.$prependSpace."\"";
+      $prependSpace = "";
+    } elsif ($words[$i] eq "\'") {
+      # closing single quote: convert to straight quote and left shift
+      $text = $text."\'";
+      $prependSpace = " ";
+    } elsif ($words[$i] eq "\'\'") {
+      # closing double quote: convert to straight quote and left shift
+      $text = $text."\"";
+      $prependSpace = " ";
+    } else {
+      $text = $text.$prependSpace.$words[$i];
+      $prependSpace = " ";
+    }
+  }
+
+  # clean up spaces at head and tail of each line as well as any double-spacing
+  $text =~ s/ +/ /g;
+  $text =~ s/\n /\n/g;
+  $text =~ s/ \n/\n/g;
+  $text =~ s/^ //g;
+  $text =~ s/ $//g;
+
+  # add trailing break
+  $text .= "\n" unless $text =~ /\n$/;
+
+  $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+
+  return $text;
+}
+
 sub startsWithCJKChar {
   my ($str) = @_;
   return 0 if length($str) == 0;
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index 986a2dfb5..1f68ecf63 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -30,6 +30,7 @@ my $SKIP_XML = 0;
 my $TIMING = 0;
 my $NUM_THREADS = 1;
 my $NUM_SENTENCES_PER_THREAD = 2000;
+my $PENN = 0;
 
 while (@ARGV)
 {
@@ -43,6 +44,7 @@ while (@ARGV)
   /^-time$/ && ($TIMING = 1, next);
   /^-threads$/ && ($NUM_THREADS = int(shift), next);
   /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
+  /^-penn$/ && ($PENN = 1, next);
 }
 
 # for time calculation
@@ -61,6 +63,7 @@ if ($HELP)
   print "  -a     ... aggressive hyphen splitting.\n";
   print "  -b     ... disable Perl buffering.\n";
   print "  -time  ... enable processing time calculation.\n";
+  print "  -penn  ... use Penn treebank-like tokenization.\n";
   exit;
 }
 
@@ -197,6 +200,11 @@ sub tokenize_batch
 sub tokenize
 {
   my($text) = @_;
+
+  if ($PENN) {
+    return tokenize_penn($text);
+  }
+
   chomp($text);
   $text = " $text ";
 
@@ -309,6 +317,145 @@ sub tokenize
   return $text;
 }
 
+sub tokenize_penn
+{
+  # Improved compatibility with Penn Treebank tokenization. Useful if
+  # the text is to later be parsed with a PTB-trained parser.
+  #
+  # Adapted from Robert MacIntyre's sed script:
+  #   http://www.cis.upenn.edu/~treebank/tokenizer.sed
+
+  my($text) = @_;
+  chomp($text);
+
+  # remove ASCII junk
+  $text =~ s/\s+/ /g;
+  $text =~ s/[\000-\037]//g;
+
+  # attempt to get correct directional quotes
+  $text =~ s/^``/`` /g;
+  $text =~ s/^"/`` /g;
+  $text =~ s/^`([^`])/` $1/g;
+  $text =~ s/^'/` /g;
+  $text =~ s/([ ([{<])"/$1 `` /g;
+  $text =~ s/([ ([{<])``/$1 `` /g;
+  $text =~ s/([ ([{<])`([^`])/$1 ` $2/g;
+  $text =~ s/([ ([{<])'/$1 ` /g;
+  # close quotes handled at end
+
+  $text =~ s=\.\.\.= _ELLIPSIS_ =g;
+
+  # separate out "," except if within numbers (5,300)
+  $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+  # separate , pre and post number
+  $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+  $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+  #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g;
+$text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g;
+
+  # Separate out intra-token slashes.  PTB tokenization doesn't do this, so
+  # the tokens should be merged prior to parsing with a PTB-trained parser
+  # (see syntax-hyphen-splitting.perl).
+  $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;
+
+  # Assume sentence tokenization has been done first, so split FINAL periods
+  # only.
+  $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g;
+  # however, we may as well split ALL question marks and exclamation points,
+  # since they shouldn't have the abbrev.-marker ambiguity problem
+  $text =~ s=([?!])= $1 =g;
+
+  # parentheses, brackets, etc.
+  $text =~ s=([\]\[\(\){}<>])= $1 =g;
+  $text =~ s/\(/-LRB-/g;
+  $text =~ s/\)/-RRB-/g;
+  $text =~ s/\[/-LSB-/g;
+  $text =~ s/\]/-RSB-/g;
+  $text =~ s/{/-LCB-/g;
+  $text =~ s/}/-RCB-/g;
+
+  $text =~ s=--= -- =g;
+
+  # First off, add a space to the beginning and end of each line, to reduce
+  # necessary number of regexps.
+  $text =~ s=$= =;
+  $text =~ s=^= =;
+
+  $text =~ s="= '' =g;
+  # possessive or close-single-quote
+  $text =~ s=([^'])' =$1 ' =g;
+  # as in it's, I'm, we'd
+  $text =~ s='([sSmMdD]) = '$1 =g;
+  $text =~ s='ll = 'll =g;
+  $text =~ s='re = 're =g;
+  $text =~ s='ve = 've =g;
+  $text =~ s=n't = n't =g;
+  $text =~ s='LL = 'LL =g;
+  $text =~ s='RE = 'RE =g;
+  $text =~ s='VE = 'VE =g;
+  $text =~ s=N'T = N'T =g;
+
+  $text =~ s= ([Cc])annot = $1an not =g;
+  $text =~ s= ([Dd])'ye = $1' ye =g;
+  $text =~ s= ([Gg])imme = $1im me =g;
+  $text =~ s= ([Gg])onna = $1on na =g;
+  $text =~ s= ([Gg])otta = $1ot ta =g;
+  $text =~ s= ([Ll])emme = $1em me =g;
+  $text =~ s= ([Mm])ore'n = $1ore 'n =g;
+  $text =~ s= '([Tt])is = '$1 is =g;
+  $text =~ s= '([Tt])was = '$1 was =g;
+  $text =~ s= ([Ww])anna = $1an na =g;
+
+  #word token method
+  my @words = split(/\s/,$text);
+  $text = "";
+  for (my $i=0;$i<(scalar(@words));$i++)
+  {
+    my $word = $words[$i];
+    if ( $word =~ /^(\S+)\.$/)
+    {
+      my $pre = $1;
+      if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+      {
+        # no change
+      }
+      elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+      {
+        # no change
+      }
+      else
+      {
+        $word = $pre." .";
+      }
+    }
+    $text .= $word." ";
+  }
+
+  # restore ellipses
+  $text =~ s=_ELLIPSIS_=\.\.\.=g;
+
+  # escape special chars
+  $text =~ s/\&/\&amp;/g;    # escape escape
+  $text =~ s/\|/\&#124;/g;   # factor separator
+  $text =~ s/\</\&lt;/g;     # xml
+  $text =~ s/\>/\&gt;/g;     # xml
+  $text =~ s/\'/\&apos;/g;   # xml
+  $text =~ s/\"/\&quot;/g;   # xml
+  $text =~ s/\[/\&#91;/g;    # syntax non-terminal
+  $text =~ s/\]/\&#93;/g;    # syntax non-terminal
+
+  #ensure final line break
+  $text .= "\n" unless $text =~ /\n$/;
+
+  return $text;
+}
+
 sub load_prefixes
 {
   my ($language, $PREFIX_REF) = @_;
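As a quick illustration of the bracket and ellipsis handling in the
tokenize_penn hunk above, the following small Perl sketch reproduces just a
few of its substitutions; the sample string is made up and only the steps
shown are taken from the patch.

    #!/usr/bin/perl
    use strict;
    use warnings;

    my $text = "He said (quietly) ... nothing";   # made-up sample
    $text =~ s=\.\.\.= _ELLIPSIS_ =g;       # protect ellipses
    $text =~ s=([\]\[\(\){}<>])= $1 =g;     # split brackets off
    $text =~ s/\(/-LRB-/g;                  # PTB bracket tokens
    $text =~ s/\)/-RRB-/g;
    $text =~ s=_ELLIPSIS_=\.\.\.=g;         # restore ellipses
    print "$text\n";   # prints (modulo extra spaces): He said -LRB- quietly -RRB- ... nothing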
"| $RealBin/syntax-hyphen-splitting.perl -slash $BINARIZE" : ""; +$SPLIT_SLASH .= " -mark-split" if $SPLIT_SLASH && $MARK_SPLIT; my $tmp = "/tmp/parse-de-berkeley.$$"; @@ -28,6 +31,8 @@ open(TMP,"| $RealBin/../../tokenizer/deescape-special-chars.perl > $tmp"); while() { # unsplit hyphens s/ \@-\@ /-/g if $SPLIT_HYPHEN; + # unsplit slashes + s/ \@\/\@ /\//g if $SPLIT_SLASH; # handle parentheses s/\(/*LRB*/g; @@ -40,7 +45,7 @@ while() { } close(TMP); -my $cmd = "cat $tmp | java -Xmx10000m -Xms10000m -Dfile.encoding=UTF8 -jar $JAR -gr $GRAMMAR -maxLength 1000 $BINARIZE | $RealBin/berkeleyparsed2mosesxml.perl $SPLIT_HYPHEN"; +my $cmd = "cat $tmp | java -Xmx10000m -Xms10000m -Dfile.encoding=UTF8 -jar $JAR -gr $GRAMMAR -maxLength 1000 $BINARIZE | $RealBin/berkeleyparsed2mosesxml.perl $SPLIT_HYPHEN $SPLIT_SLASH"; print STDERR $cmd."\n"; open(PARSE,"$cmd|"); diff --git a/scripts/training/wrappers/syntax-hyphen-splitting.perl b/scripts/training/wrappers/syntax-hyphen-splitting.perl index 69290e51d..d78106fe2 100755 --- a/scripts/training/wrappers/syntax-hyphen-splitting.perl +++ b/scripts/training/wrappers/syntax-hyphen-splitting.perl @@ -5,8 +5,11 @@ use Getopt::Long "GetOptions"; my $MARK_HYP = 0; my $BINARIZE = 0; +my $SLASH = 0; -die unless &GetOptions('binarize' => \$BINARIZE,'mark-split' => \$MARK_HYP); +die unless &GetOptions('binarize' => \$BINARIZE,'mark-split' => \$MARK_HYP,'slash' => \$SLASH); + +my $punc = $SLASH ? "/" : "-"; while() { chop; @@ -15,24 +18,26 @@ while() { if (/^$/) { push @OUT, $_; } - elsif(/([\p{IsAlnum}])\-([\p{IsAlnum}])/) { - s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g; + elsif(/([\p{IsAlnum}])$punc([\p{IsAlnum}])/) { + s/([\p{IsAlnum}])$punc([\p{IsAlnum}])/$1 \@$punc\@ $2/g; my @WORD = split; $OUT[$#OUT] =~ /label=\"([^\"]+)\"/; my $pos = $1; + my $mark = $SLASH ? "SLASH-" : "HYP-"; + my $punc_pos = $SLASH ? "SLASH" : "HYP"; if ($MARK_HYP) { - $OUT[$#OUT] =~ s/label=\"/label=\"HYP-/; + $OUT[$#OUT] =~ s/label=\"/label=\"$mark/; } if ($BINARIZE) { for(my $i=0;$i"; + push @OUT,""; } } for(my $i=0;$i=2) { push @OUT, ""; } - push @OUT," $WORD[$i] "; + push @OUT," $WORD[$i] "; } } else { From 1813f9784b9d92a4661b1160cb46b8c750bb7bcd Mon Sep 17 00:00:00 2001 From: Achim Ruopp Date: Wed, 24 Jul 2013 12:44:53 -0400 Subject: [PATCH 5/5] Additional factoring to allow more NE recognizers; bug fixes --- scripts/generic/ph_numbers.perl | 56 +++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/scripts/generic/ph_numbers.perl b/scripts/generic/ph_numbers.perl index 8f285bf5d..fcd732606 100755 --- a/scripts/generic/ph_numbers.perl +++ b/scripts/generic/ph_numbers.perl @@ -24,49 +24,57 @@ sub run { my $numberSymbol = $opts{m} || '@NUM@'; while(<>) { chomp; - print recognize($_,$opts{c},$opts{l},$numberSymbol,$_),"\n"; + print mark_numbers($_,$opts{c},$opts{l},$numberSymbol,$_),"\n"; } } -sub recognize { - my $line = shift; +sub mark_numbers { + my $input = shift; my $corpusMode = shift; my $legacyMode = shift; my $numberSymbol = shift || '@NUM@'; - # [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? 
From 1813f9784b9d92a4661b1160cb46b8c750bb7bcd Mon Sep 17 00:00:00 2001
From: Achim Ruopp
Date: Wed, 24 Jul 2013 12:44:53 -0400
Subject: [PATCH 5/5] Additional factoring to allow more NE recognizers; bug fixes

---
 scripts/generic/ph_numbers.perl | 56 +++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/scripts/generic/ph_numbers.perl b/scripts/generic/ph_numbers.perl
index 8f285bf5d..fcd732606 100755
--- a/scripts/generic/ph_numbers.perl
+++ b/scripts/generic/ph_numbers.perl
@@ -24,49 +24,57 @@ sub run {
   my $numberSymbol = $opts{m} || '@NUM@';
   while(<>) {
     chomp;
-    print recognize($_,$opts{c},$opts{l},$numberSymbol,$_),"\n";
+    print mark_numbers($_,$opts{c},$opts{l},$numberSymbol,$_),"\n";
   }
 }
 
-sub recognize {
-  my $line = shift;
+sub mark_numbers {
+  my $input = shift;
   my $corpusMode = shift;
   my $legacyMode = shift;
   my $numberSymbol = shift || '@NUM@';
-  # [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?
-  # while(/\G(.*?)\s*([+-]?\p{Digit}+[+-\p{Digit}\.,eE])/) {
+  my $numref = recognize($input);
+  my $input_length = length($input);
   my $output = "";
-  my $remainder = "";
-  while($line =~ /\G(.*?)(\s*)([+-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+-eE]*)/g) {
-    my $between = $1;
-    my $number = $3;
-    print STDERR "Between: x${between}x\n" if $debug;
-    print STDERR "Number: x${number}x\n" if $debug;
-    # If there are more numbers separated by whitespace, add these
-    my $numberContinuation = "";
-    while($line = /\G(\s+)([\p{Digit}\.,+-eE]*)/g) {
-      $numberContinuation .= $1.$2;
-    }
-    $number .= $numberContinuation;
-    $output .= $between;
+  my $position = 0;
+  for(my $i = 0 ; $i < scalar(@{$numref}) ; $i++) {
+    my $numstart = $numref->[$i][0];
+    my $numend = $numref->[$i][1];
+    if($position < $numstart) {
+      $output .= substr($input,$position,$numstart-$position);
+    }
+    my $number = substr($input,$numstart,$numend-$numstart);
     if($corpusMode) {
-      $output .= $2.$numberSymbol;
+      $output .= $number;
     } else {
       if($legacyMode) {
-        $output .= $2."$numberSymbol";
+        $output .= "$numberSymbol";
      } else {
-        $output .= $2."$numberSymbol";
+        $output .= "$numberSymbol";
      }
    }
-    $remainder = $';
+    $position = $numend;
  }
-  print STDERR "Remainder: x".$remainder."x\n" if $debug;
-  print STDERR "\n" if $debug;
-  $output .= $remainder if $remainder;
+  $output .= substr($input,$position);
  return $output;
 }
 
+sub recognize {
+  my $input = shift;
+
+  my @recognized = ();
+  while($input =~ /\G(.*?)(\s*)([+\-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+\-eE]*)/g) {
+    my $start = $-[3];
+    my $end = $+[3];
+    while($input =~ /\G(\s+)(\p{Digit}+[\p{Digit}\.,+\-eE]*)/gc) {
+      $end = $+[2];
+    }
+    push @recognized,[$start,$end];
+  }
+  return \@recognized;
+}
+
 1;
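To make the new two-phase design concrete (span detection separated from
replacement, so that other named-entity recognizers can plug in), here is a
small self-contained Perl sketch in the same spirit as recognize() and
mark_numbers(). The helper name find_number_spans and the sample sentence are
invented for illustration; only the number regex and the '@NUM@' default come
from the patch.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # Phase 1: find [start, end) spans of numbers, as sub recognize does.
    sub find_number_spans {
        my ($input) = @_;
        my @spans;
        while ($input =~ /([+\-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+\-eE]*)/g) {
            push @spans, [ $-[1], $+[1] ];
        }
        return \@spans;
    }

    # Phase 2: splice a placeholder over each span, as sub mark_numbers does.
    my $input  = "The price rose 12,5 percent in 2013 .";   # sample input (invented)
    my $symbol = '@NUM@';
    my $output = "";
    my $pos    = 0;
    for my $span (@{ find_number_spans($input) }) {
        my ($start, $end) = @$span;
        $output .= substr($input, $pos, $start - $pos);
        $output .= $symbol;
        $pos = $end;
    }
    $output .= substr($input, $pos);
    print "$output\n";   # The price rose @NUM@ percent in @NUM@ .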