#!/usr/bin/perl -w
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($Bin);
use File::Basename;
use File::Path qw(make_path);
use File::Temp qw/tempfile/;
# Configuration defaults; --bitpar, --basic and --raw override them below.
my $BITPAR = "/home/pkoehn/statmt/project/bitpar/GermanParser"; # root holding bin/bitpar and Tiger/
my $TMPDIR = "tmp";   # directory that receives the tokenised parser input
my $DEBUG = 0;        # set to 1 to trace the tree conversion on STDERR
my $BASIC = 0;        # --basic: cut labels down to their first upper-case run
my $RAW = "";         # --raw=FILE: additionally save the raw parser output to FILE

GetOptions(
    "basic"    => \$BASIC,
    "bitpar=s" => \$BITPAR,
    "raw=s"    => \$RAW
) or die("ERROR: unknown options");

# Create the scratch directory in-process instead of shelling out to
# `mkdir -p`, and stop early if it is still missing afterwards (the
# backtick call silently ignored failures).
make_path($TMPDIR);
die("ERROR: could not create temporary directory $TMPDIR") unless -d $TMPDIR;

# The temporary file is named after this script and is removed
# automatically when the program exits (UNLINK=>1).
my ($scriptname, $directories) = fileparse($0);
my ($TMP, $tmpfile) = tempfile("$scriptname-XXXXXXXXXX", DIR=>$TMPDIR, UNLINK=>1);
# Read the text on STDIN through iconv (UTF-8 -> ISO-8859-1, -c drops
# characters that cannot be converted) and write it to the temporary file
# with one token per line and an empty line after every input line.
# The list form of the pipe open runs iconv directly, without a shell.
open(my $INPUT, "-|", "iconv", "-c", "-f", "utf8", "-t", "iso-8859-1")
    or die("ERROR: could not run iconv: $!");
while (my $sentence = <$INPUT>) {
    foreach my $token (split(' ', $sentence)) {
        # Round brackets would clash with the bracketed parser output, so
        # they are replaced by placeholders here and restored on output.
        $token =~ s/\(/\*LRB\*/g;
        $token =~ s/\)/\*RRB\*/g;
        print $TMP $token."\n";
    }
    print $TMP "\n";
}
# The exit status of iconv is deliberately not treated as fatal.
close($INPUT);
# Buffered write errors only surface when the file is closed.
close($TMP) or die("ERROR: could not write $tmpfile: $!");
# Assemble the shell pipeline that produces the parses:
#   temporary file -> bitpar (Tiger grammar and lexicon) -> [tee to --raw file] -> iconv back to UTF-8
# NOTE(review): $BITPAR and $RAW come from the command line and are
# interpolated into a shell command; they must not contain untrusted input.
my @stages = (
    "cat $tmpfile",
    "$BITPAR/bin/bitpar -ts '()' -s TOP -v $BITPAR/Tiger/grammar $BITPAR/Tiger/lexicon -u $BITPAR/Tiger/open-class-tags -w $BITPAR/Tiger/wordclass.txt",
);
# Optionally keep a copy of the unconverted parser output.
push @stages, "tee \"$RAW\"" if $RAW;
push @stages, "iconv -c -t utf8 -f iso-8859-1";
my $pipeline = join(" | ", @stages);

# Explicit read-from-command mode, and fail loudly if the pipeline cannot
# be started instead of silently reading nothing.
open(PARSER, "-|", $pipeline)
    or die("ERROR: could not start parser pipeline: $!");
# Convert each line of parser output (one bracketed tree per line) into one
# output line in which every constituent is wrapped in
# <tree label="..."> ... </tree> markup.
while (my $line = <PARSER>) {
    # An unparseable sentence still yields one (empty) output line.
    if ($line =~ /^No parse for/) {
        print "\n";
        next;
    }
    print STDERR $line if $DEBUG;
    # chomp rather than chop: a last line without a newline must not lose
    # its final closing bracket.
    chomp($line);
    my @LABEL = ();   # stack of labels of the currently open brackets
    my @OUT = ();     # output tokens: tags and escaped words
    for (my $i = 0; $i < length($line); $i++) {
        if (substr($line,$i,1) eq "(") {
            # Opening bracket: the label runs up to the next "(" or space.
            my ($label) = split(/[\( ]/,substr($line,$i+1));
            print STDERR substr(" ",0,scalar @LABEL)."BEGIN of $label\n" if $DEBUG;
            $i += length($label);
            if (is_aux_label($label)) {
                # Auxiliary label: don't produce an XML <tree> node,
                # any (non-auxiliary) subtrees will instead be attached to the
                # parent (or closest non-auxiliary ancestor).
            }
            else {
                if ($label =~ /^\\\$(,|.|Par)$/) {
                    $label = "PUNC$1";
                }
                else {
                    $label =~ s/\$/PUNC/g; # no $!
                    $label =~ s/\|/:/g; # moses does not like bars
                    $label =~ s/^[^A-Z]*([A-Z]+).*/$1/g if $BASIC; # basic labels only
                }
                push @OUT,"<tree label=\"$label\">";
            }
            # Auxiliary labels are stacked too, so that the matching ")"
            # can be recognised and skipped without emitting a tag.
            push @LABEL,$label;
            # Skip up to two spaces that follow the label.
            $i++ if substr($line,$i+1,1) eq " ";
            $i++ if substr($line,$i+1,1) eq " ";
        }
        elsif (substr($line,$i,1) eq ")") {
            # Closing bracket: close the innermost open constituent.
            die("ERROR: NO LABEL ON STACK") unless @LABEL;
            my $label = pop @LABEL;
            print STDERR substr(" ",0,scalar @LABEL)."END of $label\n" if $DEBUG;
            if (!is_aux_label($label)) {
                push @OUT,"</tree>";
            }
            $i++ if substr($line,$i+1,1) eq " ";
        }
        elsif (substr($line,$i,3) eq "*T*") {
            # Trace: discard the enclosing label and the token pushed for
            # it, then jump to the closing bracket of the trace (the loop
            # increment steps over it).
            die("ERROR: NO LABEL FOR TRACE") unless @LABEL;
            pop @LABEL;
            pop @OUT;
            my ($trace) = split(/\)/,substr($line,$i+1));
            $i += length($trace)+1;
        }
        else {
            # Word: everything up to the next ")"; an escaped "\)" is a
            # two-character word of its own.
            my ($word) = split(/\)/,substr($line,$i));
            if (substr($line,$i,2) eq "\\)") {
                $word = substr($line,$i,2);
            }
            $i += length($word)-1;
            print STDERR substr(" ",0,scalar @LABEL)."WORD $word\n" if $DEBUG;
            push @OUT, escape($word);
        }
    }
    die("ERROR: STACK NOT EMPTY $#LABEL\n") if @LABEL;

    # Undo the parser-side escaping and the bracket placeholders, then
    # print the tokens separated by single spaces.
    foreach my $token (@OUT) {
        $token =~ s/\\//;
        $token =~ s/\*RRB\*/\)/g;
        $token =~ s/\*LRB\*/\(/g;
    }
    print join(" ", @OUT), "\n";
}
# Tell whether a node label is an auxiliary label, i.e. one that is
# wrapped in backslash-escaped angle brackets: \<...\>
# Returns the result of the pattern match (true for auxiliary labels).
sub is_aux_label {
    my $label = shift;
    return $label =~ m{^\\<.*\\>$};
}
# Escape the XML special characters in a word so that it can be placed
# between <tree> tags.
#   $text - the word to escape
# Returns the escaped copy; the argument itself is not modified.
sub escape {
    my ($text) = @_;
    # "&" must be handled first, otherwise the ampersands of the entities
    # inserted below would be escaped a second time.
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}