a script to convert AT&T FSA to 'python lattice format' that Moses reads

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3866 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
bojar 2011-02-03 08:32:44 +00:00
parent 93b0a15a2d
commit 72945c543e
2 changed files with 188 additions and 0 deletions

View File

@ -0,0 +1,10 @@
0 1 Prague 0.5
1 2 Stock 1
2 6 Market 1
0 3 New 0.5
3 4 York 1
4 5 Stock 1
5 6 Exchange 1
6 7 falls 0.5
6 7 drops 0.5
7 8 . 1

178
scripts/generic/fsa2plf.pl Executable file
View File

@ -0,0 +1,178 @@
#!/usr/bin/perl
# Converts AT&T FSA format to 'python lattice format'.
# Note that the input FSA needs to be epsilon-free and topologically sorted.
# This script checks for topological sortedness.
# The start node has to have the index 0.
# All path ends are assumed to be final nodes, not just the explicitly stated
# final nodes.
# Note that the output format may not contain any spaces.
# Ondrej Bojar, bojar@ufal.mff.cuni.cz
use strict;
use Getopt::Long;
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
my $filelist;
my $ignore_final_state_cost = 0;
my $mangle_weights = undef;
GetOptions(
"ignore-final-state-cost" => \$ignore_final_state_cost,
# sometimes, final states have a cost (e.g. "45 0.05\n")
# instead of dying there, ignore the problem
"filelist|fl=s" => \$filelist,
"mangle-weights=s" => \$mangle_weights,
) or exit 1;
my @infiles;
if (defined $filelist) {
my $fh = my_open($filelist);
while (<$fh>) {
chomp;
push @infiles, $_;
}
close $fh;
}
push @infiles, @ARGV;
@ARGV = ();
if (0 == scalar(@infiles)) {
print STDERR "Reading input from stdin\n";
push @infiles, "-";
}
my $err = 0;
foreach my $inf (@infiles) {
my $nr = 0;
NEXTLATTICE:
my %usedids = (); # collect all used ids for densification
my %usedtgtids = (); # collect all used ids for densification
my @outnodes = ();
my $fh = my_open($inf);
my %is_final; # remember which nodes were final
while (<$fh>) {
chomp;
$nr++;
last if $_ eq ""; # assume a blank line delimits lattices
my ($src, $tgt, $label, $weight) = split /\s+/;
die "$inf:$nr:Bad src node index: $src" if $src !~ /^[0-9]+$/;
if (!defined $label && !defined $weight) {
# explicit final node, warn at the end if there are any intermed. final
# nodes
$is_final{$src};
# final nodes can have a cost
die "$inf:$nr:Final state $src has cost $tgt. Unsupported, use --ignore-final-state-cost"
if defined $tgt && !$ignore_final_state_cost;
next;
}
$weight = 0 if !defined $weight;
$usedids{$src} = 1;
$usedtgtids{$tgt} = 1;
# process the weight
# when reading RWTH FSA output, the weights are negated natural logarithms
# we need to negate them back
if (defined $mangle_weights) {
if ($mangle_weights eq "expneg") {
$weight = join(",", map {exp(-$_)} split /,/, $weight);
} else {
die "Bad weights mangling: $mangle_weights";
}
}
# remember the node
my $targetnode = $tgt-$src;
die "$inf:$nr:Not topologically sorted, got arc from $src to $tgt"
if $targetnode <= 0;
push @{$outnodes[$src]}, [ $label, $weight, $tgt ];
}
if (eof($fh)) {
close $fh;
$fh = undef;
}
# Assign our dense IDs: source node ids are assigned first
my %denseids = (); # maps node ids from the file to dense ids
my $nextid = 0;
foreach my $id (sort {$a<=>$b} keys %usedids) {
$denseids{$id} = $nextid;
$nextid++;
}
# All unseen target nodes then get the same next id, the final node id
foreach my $id (keys %usedtgtids) {
next if defined $denseids{$id};
$denseids{$id} = $nextid;
}
foreach my $f (keys %is_final) {
if (defined $outnodes[$f]) {
print STDERR "$inf:Node $f is final but it has outgoing edges!\n";
$err = 1;
}
}
# # Verbose: print original to dense IDs mapping
# foreach my $src (sort {$a<=>$b} keys %denseids) {
# print STDERR "$src ...> $denseids{$src}\n";
# }
print "(";
for(my $origsrc = 0; $origsrc < @outnodes; $origsrc++) {
my $src = $denseids{$origsrc};
next if !defined $src; # this original node ID is not used at all
next if $src == $nextid; # this is the ultimate merged final node
my $outnode = $outnodes[$origsrc];
print "(";
foreach my $arc (@$outnode) {
my $origtgt = $arc->[2];
my $tgt = $denseids{$origtgt};
if (!defined $tgt) {
# this was a final node only
$tgt = $denseids{$origtgt} = $nextid;
$nextid++;
}
my $step_to_target = $tgt - $src;
die "$inf:Bug, I damaged top-sortedness (orig $origsrc .. $origtgt; curr $src .. $tgt)." if $step_to_target <= 0;
print "('".apo($arc->[0])."',$arc->[1],$step_to_target),";
}
print "),";
}
print ")\n";
goto NEXTLATTICE if defined $fh && ! eof($fh);
}
die "There were errors." if $err;
sub apo {
my $s = shift;
# protects apostrophy and backslash
$s =~ s/\\/\\\\/g;
$s =~ s/(['])/\\$1/g;
return $s;
}
sub my_open {
my $f = shift;
if ($f eq "-") {
binmode(STDIN, ":utf8");
return *STDIN;
}
die "Not found: $f" if ! -e $f;
my $opn;
my $hdl;
my $ft = `file '$f'`;
# file might not recognize some files!
if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) {
$opn = "zcat '$f' |";
} elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) {
$opn = "bzcat '$f' |";
} else {
$opn = "$f";
}
open $hdl, $opn or die "Can't open '$opn': $!";
binmode $hdl, ":utf8";
return $hdl;
}