#!/usr/bin/perl -w

# Parse German text with BitPar and print every parse as a flat sequence of
# <tree label="..."> / </tree> tags and words, one parser output line per
# line of STDOUT.
#
# Input  (STDIN):  UTF-8 text, one whitespace-tokenized sentence per line.
# Output (STDOUT): the converted parses; an empty line is printed whenever
#                  the parser reports "No parse for ...".
#
# Options:
#   --basic        reduce labels to their first run of capital letters
#   --bitpar=DIR   directory containing bin/bitpar and the Tiger/ model files
#   --raw=FILE     additionally save the raw parser output to FILE (via tee)
#
# NOTE(review): this file was restored from a copy in which all line breaks
# and every "<...>" span had been lost.  The readline operators, the
# <tree ...>/</tree> strings and the entity names in escape() follow directly
# from the surrounding code.  The head of the scanning loop in parse_to_xml()
# (loop header and the start of the "(" branch) was rebuilt from the
# surviving branches and should be confirmed against the upstream script.

use strict;
use Getopt::Long "GetOptions";
use FindBin qw($Bin);
use File::Basename;
use File::Temp qw/tempfile/;

my $BITPAR = "/home/pkoehn/statmt/project/bitpar/GermanParser";
my $TMPDIR = "tmp";

my $DEBUG = 0;
my $BASIC = 0;
my $RAW = "";

# Run the pipeline only when executed as a script, so that the helper subs
# below can be loaded (e.g. by a test) without starting the parser.
main() unless caller;

# Entry point: tokenize STDIN into a temporary file, run BitPar on it and
# convert each line of parser output.
sub main {
    GetOptions(
        "basic"    => \$BASIC,
        "bitpar=s" => \$BITPAR,
        "raw=s"    => \$RAW
    ) or die("ERROR: unknown options");

    unless (-d $TMPDIR || mkdir($TMPDIR) || -d $TMPDIR) {
        die("ERROR: cannot create directory $TMPDIR: $!");
    }
    my ($scriptname) = fileparse($0);
    my ($tmp_fh, $tmpfile) =
        tempfile("$scriptname-XXXXXXXXXX", DIR => $TMPDIR, UNLINK => 1);

    # The parser is fed ISO-8859-1; iconv inherits our STDIN.
    open(my $input, "-|", "iconv", "-c", "-f", "utf8", "-t", "iso-8859-1")
        or die("ERROR: cannot run iconv: $!");
    while (my $sentence = <$input>) {
        # One token per line, blank line between sentences.
        foreach my $token (split(' ', $sentence)) {
            # Brackets are the parser's tree delimiters (-ts '()'), so they
            # are masked here and restored in parse_to_xml().
            $token =~ s/\(/*LRB*/g;
            $token =~ s/\)/*RRB*/g;
            print {$tmp_fh} $token."\n";
        }
        print {$tmp_fh} "\n";
    }
    # Exit status of iconv is deliberately not fatal: conversion with -c is
    # best-effort.
    close($input);
    close($tmp_fh) or die("ERROR: cannot write $tmpfile: $!");

    my @stages = (
        "cat $tmpfile",
        "$BITPAR/bin/bitpar -ts '()' -s TOP -v $BITPAR/Tiger/grammar $BITPAR/Tiger/lexicon -u $BITPAR/Tiger/open-class-tags -w $BITPAR/Tiger/wordclass.txt",
    );
    push @stages, "tee \"$RAW\"" if $RAW;
    push @stages, "iconv -c -t utf8 -f iso-8859-1";

    # A single command string containing "|" is run through the shell.
    open(my $parser, "-|", join(" | ", @stages))
        or die("ERROR: cannot run parser pipeline: $!");
    while (my $line = <$parser>) {
        if ($line =~ /^No parse for/) {
            print "\n";
            next;
        }
        print STDERR $line if $DEBUG;
        chomp($line);
        print parse_to_xml($line, $BASIC), "\n";
    }
    close($parser);
    return;
}

# Convert one bracketed parse line into the output line (without newline).
#   $line  - parser output such as "(TOP (NP (ART der)(NN Mann)))"
#   $basic - true to reduce labels to their first run of capital letters
# Returns the space-joined tags and words.
# Dies when brackets are unbalanced or a trace has no enclosing label.
sub parse_to_xml {
    my ($line, $basic) = @_;
    my @LABEL = ();    # stack of labels of the currently open brackets
    my @OUT = ();      # output tokens: opening tags, words, closing tags

    # $i always rests on the last character consumed; the loop increment
    # then moves on to the next unread character.
    for (my $i = 0; $i < length($line); $i++) {
        my $indent = " " x scalar(@LABEL);
        if (substr($line, $i, 1) eq "(") {
            # NOTE(review): reconstructed branch head -- the label runs up
            # to the next space or opening bracket.
            my ($label) = split(/[\( ]/, substr($line, $i + 1));
            $label = "" unless defined $label;
            print STDERR "${indent}BEGIN of $label\n" if $DEBUG;
            $i += length($label);
            if (is_aux_label($label)) {
                # Do nothing: don't output a <tree> node,
                # any (non-auxiliary) subtrees will instead be attached to the
                # parent (or closest non-auxiliary ancestor).
            }
            else {
                # "\$" followed by "," / any single character / "Par"
                # (the "." in the pattern is unescaped).
                if ($label =~ /^\\\$(,|.|Par)$/) {
                    $label = "PUNC$1";
                }
                else {
                    $label =~ s/\$/PUNC/g;    # no $!
                    $label =~ s/\|/:/g;       # moses does not like bars
                    $label =~ s/^[^A-Z]*([A-Z]+).*/$1/g if $basic; # basic labels only
                }
                push @OUT, "<tree label=\"$label\">";
            }
            push @LABEL, $label;
            # Skip up to two spaces after the label.
            $i++ if substr($line, $i + 1, 1) eq " ";
            $i++ if substr($line, $i + 1, 1) eq " ";
        }
        elsif (substr($line, $i, 1) eq ")") {
            die("ERROR: NO LABEL ON STACK") unless @LABEL;
            my $label = pop @LABEL;
            print STDERR (" " x scalar(@LABEL))."END of $label\n" if $DEBUG;
            if (!is_aux_label($label)) {
                push @OUT, "</tree>";
            }
            $i++ if substr($line, $i + 1, 1) eq " ";
        }
        elsif (substr($line, $i, 3) eq "*T*") {
            # Trace: drop the enclosing label and the token pushed last,
            # then jump onto the trace's closing bracket so that the loop
            # increment steps over it.
            die("ERROR: NO LABEL FOR TRACE") unless @LABEL;
            pop @LABEL;
            pop @OUT;
            my ($trace) = split(/\)/, substr($line, $i + 1));
            $i += length($trace) + 1;
        }
        else {
            # A word extends to the next closing bracket, unless the word
            # itself is an escaped closing bracket "\)".
            my ($word) = split(/\)/, substr($line, $i));
            if (substr($line, $i, 2) eq "\\)") {
                $word = substr($line, $i, 2);
            }
            $i += length($word) - 1;
            print STDERR "${indent}WORD $word\n" if $DEBUG;
            push @OUT, escape($word);
        }
    }
    die("ERROR: STACK NOT EMPTY $#LABEL\n") if @LABEL;

    foreach my $token (@OUT) {
        $token =~ s/\\//;             # drop the first backslash
        $token =~ s/\*RRB\*/\)/g;     # restore masked brackets
        $token =~ s/\*LRB\*/\(/g;
    }
    return join(" ", @OUT);
}

# True when $label is wrapped as \<...\>; such nodes get no <tree> element.
sub is_aux_label {
    my ($label) = @_;
    return ($label =~ /^\\<.*\\>$/);
}

# Escape the XML special characters &, < and > in a word.
sub escape {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

1;