mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
185 lines
5.4 KiB
Perl
Executable File
185 lines
5.4 KiB
Perl
Executable File
#!/usr/bin/perl -w
|
|
|
|
use strict;
|
|
use Getopt::Long "GetOptions";
|
|
use FindBin qw($RealBin);
|
|
use File::Basename;
|
|
use File::Temp qw/tempfile/;
|
|
|
|
my $BITPAR = "/exports/home/s0565741/work/bin/bitpar";
|
|
my $TMPDIR = "tmp";
|
|
|
|
my $SCRIPTS_ROOT_DIR = "$RealBin/../..";
|
|
my $DEESCAPE = "$SCRIPTS_ROOT_DIR/tokenizer/deescape-special-chars.perl";
|
|
|
|
my $DEBUG = 0;
|
|
my $BASIC = 0;
|
|
my $OLD_BITPAR = 0;
|
|
my $UNPARSEABLE = 0;
|
|
|
|
my $RAW = "";
|
|
|
|
GetOptions(
|
|
"basic" => \$BASIC,
|
|
"bitpar=s" => \$BITPAR,
|
|
"old-bitpar" => \$OLD_BITPAR,
|
|
"raw=s" => \$RAW,
|
|
"unparseable" => \$UNPARSEABLE
|
|
) or die("ERROR: unknown options");
|
|
|
|
`mkdir -p $TMPDIR`;
|
|
my ($scriptname, $directories) = fileparse($0);
|
|
my ($TMP, $tmpfile) = tempfile("$scriptname-XXXXXXXXXX", DIR=>$TMPDIR, UNLINK=>1);
|
|
if ($OLD_BITPAR)
|
|
{
|
|
open(INPUT,"$DEESCAPE | iconv -c -f UTF-8 -t iso-8859-1 |");
|
|
}
|
|
else
|
|
{
|
|
open (INPUT,"$DEESCAPE |");
|
|
}
|
|
while(<INPUT>)
|
|
{
|
|
my $hasWords = 0;
|
|
foreach (split)
|
|
{
|
|
if ($OLD_BITPAR)
|
|
{
|
|
s/\(/\*LRB\*/g;
|
|
s/\)/\*RRB\*/g;
|
|
}
|
|
print $TMP $_."\n";
|
|
|
|
$hasWords = 1;
|
|
}
|
|
|
|
if ($hasWords == 0) {
|
|
print $TMP " \n";
|
|
}
|
|
|
|
print $TMP "\n";
|
|
}
|
|
close($TMP);
|
|
|
|
my $pipeline = "cat $tmpfile | $BITPAR/BitPar/src/bitpar -ts '()' -s TOP -v $BITPAR/GermanParser/Tiger/grammar $BITPAR/GermanParser/Tiger/lexicon -u $BITPAR/GermanParser/Tiger/open-class-tags -w $BITPAR/GermanParser/Tiger/wordclass.txt |";
|
|
if ($RAW)
|
|
{
|
|
$pipeline .= "tee \"$RAW\" |";
|
|
}
|
|
if ($OLD_BITPAR)
|
|
{
|
|
$pipeline .= "iconv -c -t UTF-8 -f iso-8859-1 |";
|
|
}
|
|
open(PARSER,$pipeline);
|
|
while(my $line = <PARSER>) {
|
|
if ($line =~ /^No parse for/) {
|
|
if ($UNPARSEABLE) {
|
|
my $len = length($line);
|
|
$line = substr($line, 15, $len - 17);
|
|
$line = escape($line);
|
|
print $line;
|
|
}
|
|
print "\n";
|
|
next;
|
|
}
|
|
print STDERR $line if $DEBUG;
|
|
chop($line);
|
|
my @LABEL = ();
|
|
my @OUT = ();
|
|
for(my $i=0;$i<length($line);$i++) {
|
|
# print STDERR substr($line,$i)."\n";
|
|
if (substr($line,$i,4) eq "(*T*") {
|
|
my ($trace,$rest) = split(/\)/,substr($line,$i+1));
|
|
$i+=length($trace)+2;
|
|
$i++ if substr($line,$i+1,1) eq " ";
|
|
die("ERROR: NO LABEL FOR TRACE") unless @LABEL;
|
|
if (!$DEBUG) { pop @LABEL; pop @OUT; }
|
|
print STDERR substr(" ",0,scalar @LABEL-1)."REMOVING LABEL ".(pop @LABEL)." ".(pop @OUT)."\n" if $DEBUG;
|
|
}
|
|
elsif (substr($line,$i,1) eq "(") {
|
|
my ($label,$rest) = split(/[\( ]/,substr($line,$i+1));
|
|
print STDERR substr(" ",0,scalar @LABEL)."BEGINNING of $label\n" if $DEBUG;
|
|
$i+=length($label);
|
|
if (&is_aux_label($label)) {
|
|
# Discard auxiliary nodes. Since we don't create a <tree> node,
|
|
# any (non-auxiliary) subtrees will instead be attached to the
|
|
# parent (or closest non-auxiliary ancestor).
|
|
}
|
|
else {
|
|
if ($label =~ /^\\\$(,|.|Par)$/)
|
|
{
|
|
$label = "PUNC$1";
|
|
}
|
|
else
|
|
{
|
|
$label =~ s/\$/PUNC/g; # no $!
|
|
$label =~ s/\|/:/g; # moses does not like bars
|
|
$label =~ s/^[^A-Z]*([A-Z]+).*/$1/g if $BASIC; # basic labels only
|
|
}
|
|
push @OUT,"<tree label=\"$label\">";
|
|
}
|
|
push @LABEL,$label;
|
|
$i++ if substr($line,$i+1,1) eq " ";
|
|
$i++ if substr($line,$i+1,1) eq " ";
|
|
}
|
|
elsif (substr($line,$i,1) eq ")") {
|
|
die("ERROR: NO LABEL ON STACK") unless @LABEL;
|
|
my $label = pop @LABEL;
|
|
print STDERR substr(" ",0,scalar @LABEL)."END of $label\n" if $DEBUG;
|
|
if (!&is_aux_label($label)) {
|
|
push @OUT,"</tree>";
|
|
}
|
|
$i++ if substr($line,$i+1,1) eq " ";
|
|
}
|
|
elsif (substr($line,$i,3) eq "*T*") {
|
|
die("ERROR: NO LABEL FOR TRACE") unless @LABEL;
|
|
pop @LABEL;
|
|
pop @OUT;
|
|
#print "POPPING TRACE LABEL ", pop @OUT;
|
|
my ($trace,$rest) = split(/\)/,substr($line,$i+1));
|
|
$i+=length($trace)+1;
|
|
}
|
|
else {
|
|
my ($word,$rest) = split(/\)/,substr($line,$i));
|
|
if (substr($line,$i,2) eq "\\)") {
|
|
$word = substr($line,$i,2);
|
|
}
|
|
$i+=length($word)-1;
|
|
print STDERR substr(" ",0,scalar @LABEL)."WORD $word\n" if $DEBUG;
|
|
push @OUT,&escape($word);
|
|
}
|
|
}
|
|
die("ERROR: STACK NOT EMPTY $#LABEL\n") if @LABEL;
|
|
my $first=1;
|
|
foreach (@OUT) {
|
|
print " " unless $first;
|
|
s/\\//;
|
|
if ($OLD_BITPAR)
|
|
{
|
|
s/\*RRB\*/\)/g;
|
|
s/\*LRB\*/\(/g;
|
|
}
|
|
print $_;
|
|
$first = 0;
|
|
}
|
|
print "\n";
|
|
}
|
|
|
|
sub is_aux_label {
|
|
my ($label) = @_;
|
|
return ($label =~ /^\\<.*\\>$/);
|
|
}
|
|
|
|
sub escape {
|
|
my ($text) = @_;
|
|
$text =~ s/&/&/g;
|
|
$text =~ s/\|/|/g;
|
|
$text =~ s/</</g;
|
|
$text =~ s/>/>/g;
|
|
$text =~ s/'/'/g;
|
|
$text =~ s/"/"/g;
|
|
$text =~ s/\[/[/g;
|
|
$text =~ s/\]/]/g;
|
|
return $text;
|
|
}
|