Preprocessor for tokenization that fixes known errors in the raw input.

2024-09-21 08:07:14 +03:00 · 2014-02-22 00:28:33 +00:00 · 2014-02-22 00:28:33 +00:00 · e70766a6de
commit e70766a6de
parent a8d66cd68d
1 changed files with 34 additions and 0 deletions
--- a/scripts/tokenizer/pre-tokenizer.perl
+++ b/scripts/tokenizer/pre-tokenizer.perl
@ -0,0 +1,34 @@
+#!/usr/bin/perl -W 
+# script for preprocessing language data prior to tokenization
+# Started by Ulrich Germann, after noticing systematic preprocessing errors
+# in some of the English Europarl data.
+
+use strict;
+use Getopt::Std;
+
+# Put UTF-8 layers on STDIN and STDOUT so that input is handled as characters
+# rather than bytes; the [[:alpha:]] class used in the English rule below then
+# applies to non-ASCII letters as well.
+# NOTE(review): only the STDIN/STDOUT handles are configured here. Files named
+# on the command line are read by <> through a different handle and are
+# presumably not decoded -- confirm before relying on file arguments.
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+# Print a short description of the script and its command-line syntax to
+# STDOUT.
+#
+# Takes no arguments.
+# Returns: 1 unconditionally, so that a caller written as
+# `usage() && exit(0)` still exits even if the print itself fails
+# (previously the return value was whatever the last print returned).
+sub usage
+{
+  print "Script for preprocessing of raw language data prior to tokenization\n";
+  # -h and -l are both optional: without -l the input is passed through as is.
+  print "Usage: $0 [-h] [-l <language tag>]\n";
+  return 1;
+}
+
+# Parse the command line: -l takes a language tag, -h is a plain flag.
+# Getopt::Std's getopts() is the function that understands a spec such as
+# "l:h". The earlier getopt('l=s h') call treated every character of that
+# string (including 'h') as a switch that consumes an argument, so a bare
+# -h stored undef and the help text was never shown.
+my %args;
+# getopts() has already warned on STDERR about any invalid option.
+getopts('l:h', \%args) or exit(1);
+
+if ($args{'h'})
+{
+    usage();
+    exit(0);
+}
+
+# Without -l no language-specific fix applies. Default to the empty tag so
+# the string comparison below does not warn about an undefined value.
+my $lang = defined $args{'l'} ? $args{'l'} : '';
+
+if ($lang eq "en")
+{
+    while (my $line = <>)
+    {
+        # Rejoin a possessive/contraction "s" that was split off after the
+        # apostrophe, e.g. "John' s" becomes "John's".
+        $line =~ s/([[:alpha:]]\') s\b/$1s/g;
+        print $line;
+    }
+}
+else
+{
+    # Any other (or no) language tag: copy the input through unchanged.
+    print while <>;
+}