mosesdecoder/scripts/fuzzy-match/create_xml.perl
Jeroen Vermeulen ef028446f3 Add license notices to scripts.
This is not pleasant to read (and much, much less pleasant to write!) but
sort of necessary in an open project.  Right now it's quite hard to figure
out what is licensed how, which doesn't matter much to most people but can
suddenly become very important when people want to know what they're being
allowed to do.

I kept the notices as short as I could.  As far as I could see, everything
without a clear license notice is LGPL v2.1 or later.
2015-05-29 18:30:26 +07:00

329 lines
8.8 KiB
Perl
Executable File

#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
binmode( STDIN, ":utf8" );
binmode( STDOUT, ":utf8" );
use warnings;
use strict;
use FindBin qw($RealBin);
use File::Basename;
sub trim($);
print STDERR "HELLO ";
for ( my $i = 0 ; $i < scalar @ARGV ; ++$i ) {
print STDERR $ARGV[$i] . " ";
}
print STDERR "GOODBYE \n";
############################################
# START
my $inPath = $ARGV[0];
open( IN, "<" . $inPath );
open( RULE, ">$inPath.extract" );
open( RULE_INV, ">$inPath.extract.inv" );
my ( $sentenceInd, $score, $source, $input, $target, $align, $path, $count );
# MAIN LOOP
while ( $sentenceInd = <IN> ) {
$score = <IN>;
$source = <IN>;
$input = <IN>;
$target = <IN>;
$align = <IN>;
$path = <IN>;
$count = <IN>;
chomp($sentenceInd);
chomp($score);
chomp($source);
chomp($input);
chomp($target);
chomp($align);
chomp($path);
chomp($count);
$source = trim($sentenceInd);
$source = trim($score);
$source = trim($source);
$input = trim($input);
$target = trim($target);
$align = trim($align);
$path = trim($path);
$count = trim($count);
my ( $frame, $rule_s, $rule_t, $rule_alignment, $rule_alignment_inv ) =
&create_xml( $source, $input, $target, $align, $path );
#print STDOUT $frame."\n";
print RULE "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $count\n";
print RULE_INV
"$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $count\n";
#print STDOUT "$sentenceInd ||| $score ||| $count\n";
}
close(IN);
close(RULE);
close(RULE_INV);
`LC_ALL=C sort $inPath.extract | gzip -c > $inPath.extract.sorted.gz`;
`LC_ALL=C sort $inPath.extract.inv | gzip -c > $inPath.extract.inv.sorted.gz`;
my $lex_file = "-";
my $cmd;
$cmd =
"$RealBin/../../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $inPath.extract -lexical-file $lex_file -score-options \"--NoLex\" -phrase-translation-table $inPath.pt";
print STDERR "Executing: $cmd \n";
`$cmd`;
#######################################################
sub create_xml {
my ( $source, $input, $target, $alignment, $path ) = @_;
my @INPUT = split( / /, $input );
my @SOURCE = split( / /, $source );
my @TARGET = split( / /, $target );
my %ALIGN = &create_alignment($alignment);
my %FRAME_INPUT;
my ( @NT, @INPUT_BITMAP, @TARGET_BITMAP, %ALIGNMENT_I_TO_S );
foreach (@TARGET) { push @TARGET_BITMAP, 1 }
### STEP 1: FIND MISMATCHES
my ( $s, $i ) = ( 0, 0 );
my $currently_matching = 0;
my ( $start_s, $start_i ) = ( 0, 0 );
$path .= "X"; # indicate end
print STDERR "$input\n$source\n$target\n$path\n";
for ( my $p = 0 ; $p < length($path) ; $p++ ) {
my $action = substr( $path, $p, 1 );
# beginning of a mismatch
if ( $currently_matching && $action ne "M" && $action ne "X" ) {
$start_i = $i;
$start_s = $s;
$currently_matching = 0;
}
# end of a mismatch
elsif ( !$currently_matching
&& ( $action eq "M" || $action eq "X" ) )
{
# remove use of affected target words
for ( my $ss = $start_s ; $ss < $s ; $ss++ ) {
foreach my $tt ( keys %{ ${ $ALIGN{'s'} }[$ss] } ) {
$TARGET_BITMAP[$tt] = 0;
}
# also remove enclosed unaligned words?
}
# are there input words that need to be inserted ?
print STDERR "($start_i<$i)?\n";
if ( $start_i < $i ) {
# take note of input words to be inserted
my $insertion = "";
for ( my $ii = $start_i ; $ii < $i ; $ii++ ) {
$insertion .= $INPUT[$ii] . " ";
}
# find position for inserted input words
# find first removed target word
my $start_t = 1000;
for ( my $ss = $start_s ; $ss < $s ; $ss++ ) {
foreach my $tt ( keys %{ ${ $ALIGN{'s'} }[$ss] } ) {
$start_t = $tt if $tt < $start_t;
}
}
# end of sentence? add to end
if ( $start_t == 1000 && $i > $#INPUT ) {
$start_t = $#TARGET;
}
# backtrack to previous words if unaligned
if ( $start_t == 1000 ) {
$start_t = -1;
for ( my $ss = $s - 1 ; $start_t == -1 && $ss >= 0 ; $ss-- ) {
foreach my $tt ( keys %{ ${ $ALIGN{'s'} }[$ss] } ) {
$start_t = $tt if $tt > $start_t;
}
}
}
$FRAME_INPUT{$start_t} .= $insertion;
my %NT = (
"start_t" => $start_t,
"start_i" => $start_i
);
push @NT, \%NT;
}
$currently_matching = 1;
}
print STDERR "$action $s $i ($start_s $start_i) $currently_matching";
if ( $action ne "I" ) {
print STDERR " ->";
foreach my $tt ( keys %{ ${ $ALIGN{'s'} }[$s] } ) {
print STDERR " " . $tt;
}
}
print STDERR "\n";
$s++ unless $action eq "I";
$i++ unless $action eq "D";
$ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D";
push @INPUT_BITMAP, 1 if $action eq "M";
push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S";
}
print STDERR $target . "\n";
foreach (@TARGET_BITMAP) { print STDERR $_; }
print STDERR "\n";
foreach ( sort keys %FRAME_INPUT ) {
print STDERR "$_: $FRAME_INPUT{$_}\n";
}
### STEP 2: BUILD RULE AND FRAME
# hierarchical rule
my $rule_s = "";
my $rule_pos_s = 0;
my %RULE_ALIGNMENT_S;
for ( my $i = 0 ; $i < scalar(@INPUT_BITMAP) ; $i++ ) {
if ( $INPUT_BITMAP[$i] ) {
$rule_s .= $INPUT[$i] . " ";
$RULE_ALIGNMENT_S{ $ALIGNMENT_I_TO_S{$i} } = $rule_pos_s++;
}
foreach my $NT (@NT) {
if ( $i == $$NT{"start_i"} ) {
$rule_s .= "[X][X] ";
$$NT{"rule_pos_s"} = $rule_pos_s++;
}
}
}
my $rule_t = "";
my $rule_pos_t = 0;
my %RULE_ALIGNMENT_T;
for ( my $t = -1 ; $t < scalar(@TARGET_BITMAP) ; $t++ ) {
if ( $t >= 0 && $TARGET_BITMAP[$t] ) {
$rule_t .= $TARGET[$t] . " ";
$RULE_ALIGNMENT_T{$t} = $rule_pos_t++;
}
foreach my $NT (@NT) {
if ( $t == $$NT{"start_t"} ) {
$rule_t .= "[X][X] ";
$$NT{"rule_pos_t"} = $rule_pos_t++;
}
}
}
my $rule_alignment = "";
foreach my $s ( sort { $a <=> $b } keys %RULE_ALIGNMENT_S ) {
foreach my $t ( keys %{ $ALIGN{"s"}[$s] } ) {
next unless defined( $RULE_ALIGNMENT_T{$t} );
$rule_alignment .=
$RULE_ALIGNMENT_S{$s} . "-" . $RULE_ALIGNMENT_T{$t} . " ";
}
}
foreach my $NT (@NT) {
$rule_alignment .= $$NT{"rule_pos_s"} . "-" . $$NT{"rule_pos_t"} . " ";
}
chop($rule_s);
chop($rule_t);
chop($rule_alignment);
my $rule_alignment_inv = "";
foreach ( split( / /, $rule_alignment ) ) {
/^(\d+)\-(\d+)$/;
$rule_alignment_inv .= "$2-$1 ";
}
chop($rule_alignment_inv);
# frame
my $frame = "";
$frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1};
my $currently_included = 0;
my $start_t = -1;
push @TARGET_BITMAP, 0; # indicate end
for ( my $t = 0 ; $t <= scalar(@TARGET) ; $t++ ) {
# beginning of tm target inclusion
if ( !$currently_included && $TARGET_BITMAP[$t] ) {
$start_t = $t;
$currently_included = 1;
}
# end of tm target inclusion (not included word or inserted input)
elsif ( $currently_included
&& ( !$TARGET_BITMAP[$t] || defined( $FRAME_INPUT{$t} ) ) )
{
# add xml (unless change is at the beginning of the sentence
if ( $start_t >= 0 ) {
my $target = "";
print STDERR "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
for ( my $tt = $start_t ; $tt < $t + $TARGET_BITMAP[$t] ; $tt++ ) {
$target .= $TARGET[$tt] . " ";
}
chop($target);
$frame .= "<xml translation=\"$target\"> x </xml> ";
}
$currently_included = 0;
}
$frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t};
print STDERR "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n";
}
print STDERR $frame . "\n-------------------------------------\n";
return ( $frame, $rule_s, $rule_t, $rule_alignment, $rule_alignment_inv );
}
sub create_alignment {
my ($line) = @_;
my ( @ALIGNED_TO_S, @ALIGNED_TO_T );
foreach my $point ( split( / /, $line ) ) {
my ( $s, $t ) = split( /\-/, $point );
$ALIGNED_TO_S[$s]{$t}++;
$ALIGNED_TO_T[$t]{$s}++;
}
my %ALIGNMENT = ( 's' => \@ALIGNED_TO_S, 't' => \@ALIGNED_TO_T );
return %ALIGNMENT;
}
# Perl trim function to remove whitespace from the start and end of the string
sub trim($) {
my $string = shift;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
# Left trim function to remove leading whitespace
sub ltrim($) {
my $string = shift;
$string =~ s/^\s+//;
return $string;
}
# Right trim function to remove trailing whitespace
sub rtrim($) {
my $string = shift;
$string =~ s/\s+$//;
return $string;
}