diff --git a/scripts/fuzzy-match/create_xml.perl b/scripts/fuzzy-match/create_xml.perl index de3d61f71..36715cc35 100755 --- a/scripts/fuzzy-match/create_xml.perl +++ b/scripts/fuzzy-match/create_xml.perl @@ -1,7 +1,7 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl -w -d -binmode(STDIN, ":utf8"); -binmode(STDOUT, ":utf8"); +binmode( STDIN, ":utf8" ); +binmode( STDOUT, ":utf8" ); use strict; use FindBin qw($RealBin); @@ -9,49 +9,58 @@ use File::Basename; sub trim($); +print STDERR "HELLO "; +for ( my $i = 0 ; $i < scalar @ARGV ; ++$i ) { + print STDERR $ARGV[$i] . " "; +} +print STDERR "GOODBYE \n"; + ############################################ -# START +# START my $inPath = $ARGV[0]; -open(IN,"<".$inPath); +open( IN, "<" . $inPath ); -open(RULE,">$inPath.extract"); -open(RULE_INV,">$inPath.extract.inv"); +open( RULE, ">$inPath.extract" ); +open( RULE_INV, ">$inPath.extract.inv" ); -my ($sentenceInd, $score, $source, $input, $target, $align, $path, $count); +my ( $sentenceInd, $score, $source, $input, $target, $align, $path, $count ); # MAIN LOOP -while ($sentenceInd = ) { - $score = ; - $source = ; - $input = ; - $target = ; - $align = ; - $path = ; - $count = ; - chomp($sentenceInd); - chomp($score); - chomp($source); - chomp($input); - chomp($target); - chomp($align); - chomp($path); - chomp($count); - $source = trim($sentenceInd); - $source = trim($score); - $source = trim($source); - $input = trim($input); - $target = trim($target); - $align = trim($align); - $path = trim($path); - $count = trim($count); - - my ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv) = &create_xml($source, $input, $target, $align, $path); +while ( $sentenceInd = ) { + $score = ; + $source = ; + $input = ; + $target = ; + $align = ; + $path = ; + $count = ; + chomp($sentenceInd); + chomp($score); + chomp($source); + chomp($input); + chomp($target); + chomp($align); + chomp($path); + chomp($count); + $source = trim($sentenceInd); + $source = trim($score); + $source = trim($source); + $input = trim($input); + $target = trim($target); + $align = trim($align); + $path = trim($path); + $count = trim($count); - #print STDOUT $frame."\n"; - print RULE "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $count\n"; - print RULE_INV "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $count\n"; - #print STDOUT "$sentenceInd ||| $score ||| $count\n"; + my ( $frame, $rule_s, $rule_t, $rule_alignment, $rule_alignment_inv ) = + &create_xml( $source, $input, $target, $align, $path ); + + #print STDOUT $frame."\n"; + print RULE "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $count\n"; + print RULE_INV + "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $count\n"; + + #print STDOUT "$sentenceInd ||| $score ||| $count\n"; } @@ -65,245 +74,251 @@ close(RULE_INV); my $lex_file = "-"; my $cmd; -$cmd = "$RealBin/../../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $inPath.extract -lexical-file $lex_file -score-options \"--NoLex\" -phrase-translation-table $inPath.pt"; +$cmd = +"$RealBin/../../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $inPath.extract -lexical-file $lex_file -score-options \"--NoLex\" -phrase-translation-table $inPath.pt"; print STDERR "Executing: $cmd \n"; `$cmd`; ####################################################### sub create_xml { - my ($source,$input,$target,$alignment,$path) = @_; - - my @INPUT = split(/ /,$input); - my @SOURCE = split(/ /,$source); - my @TARGET = split(/ /,$target); - my %ALIGN = &create_alignment($alignment); - - my %FRAME_INPUT; - my (@NT,@INPUT_BITMAP,@TARGET_BITMAP,%ALIGNMENT_I_TO_S); - foreach (@TARGET) { push @TARGET_BITMAP,1 } - - ### STEP 1: FIND MISMATCHES + my ( $source, $input, $target, $alignment, $path ) = @_; - my ($s,$i) = (0,0); - my $currently_matching = 0; - my ($start_s,$start_i) = (0,0); + my @INPUT = split( / /, $input ); + my @SOURCE = split( / /, $source ); + my @TARGET = split( / /, $target ); + my %ALIGN = &create_alignment($alignment); - $path .= "X"; # indicate end - print STDERR "$input\n$source\n$target\n$path\n"; - for(my $p=0;$p $#INPUT) { - $start_t = $#TARGET; - } - - # backtrack to previous words if unaligned - if ($start_t == 1000) { - $start_t = -1; - for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) { - foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) { - $start_t = $tt if $tt > $start_t; - } - } - } - $FRAME_INPUT{$start_t} .= $insertion; - my %NT = ("start_t" => $start_t, - "start_i" => $start_i ); - push @NT,\%NT; - } - $currently_matching = 1; - } - - print STDERR "$action $s $i ($start_s $start_i) $currently_matching"; - if ($action ne "I") { - print STDERR " ->"; - foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) { - print STDERR " ".$tt; - } - } - print STDERR "\n"; - $s++ unless $action eq "I"; - $i++ unless $action eq "D"; - $ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D"; - push @INPUT_BITMAP, 1 if $action eq "M"; - push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S"; - } - + my ( $s, $i ) = ( 0, 0 ); + my $currently_matching = 0; + my ( $start_s, $start_i ) = ( 0, 0 ); - print STDERR $target."\n"; - foreach (@TARGET_BITMAP) { print STDERR $_; } print STDERR "\n"; - foreach (sort keys %FRAME_INPUT) { - print STDERR "$_: $FRAME_INPUT{$_}\n"; + $path .= "X"; # indicate end + print STDERR "$input\n$source\n$target\n$path\n"; + for ( my $p = 0 ; $p < length($path) ; $p++ ) { + my $action = substr( $path, $p, 1 ); + + # beginning of a mismatch + if ( $currently_matching && $action ne "M" && $action ne "X" ) { + $start_i = $i; + $start_s = $s; + $currently_matching = 0; } - ### STEP 2: BUILD RULE AND FRAME - - # hierarchical rule - my $rule_s = ""; - my $rule_pos_s = 0; - my %RULE_ALIGNMENT_S; - for(my $i=0;$i $#INPUT ) { + $start_t = $#TARGET; + } + + # backtrack to previous words if unaligned + if ( $start_t == 1000 ) { + $start_t = -1; + for ( my $ss = $s - 1 ; $start_t == -1 && $ss >= 0 ; $ss-- ) { + foreach my $tt ( keys %{ ${ $ALIGN{'s'} }[$ss] } ) { + $start_t = $tt if $tt > $start_t; + } + } + } + $FRAME_INPUT{$start_t} .= $insertion; + my %NT = ( + "start_t" => $start_t, + "start_i" => $start_i + ); + push @NT, \%NT; + } + $currently_matching = 1; } - my $rule_t = ""; - my $rule_pos_t = 0; - my %RULE_ALIGNMENT_T; - for(my $t=-1;$t=0 && $TARGET_BITMAP[$t]) { - $rule_t .= $TARGET[$t]." "; - $RULE_ALIGNMENT_T{$t} = $rule_pos_t++; - } - foreach my $NT (@NT) { - if ($t == $$NT{"start_t"}) { - $rule_t .= "[X][X] "; - $$NT{"rule_pos_t"} = $rule_pos_t++; - } - } + print STDERR "$action $s $i ($start_s $start_i) $currently_matching"; + if ( $action ne "I" ) { + print STDERR " ->"; + foreach my $tt ( keys %{ ${ $ALIGN{'s'} }[$s] } ) { + print STDERR " " . $tt; + } } + print STDERR "\n"; + $s++ unless $action eq "I"; + $i++ unless $action eq "D"; + $ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D"; + push @INPUT_BITMAP, 1 if $action eq "M"; + push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S"; + } - my $rule_alignment = ""; - foreach my $s (sort { $a <=> $b} keys %RULE_ALIGNMENT_S) { - foreach my $t (keys %{$ALIGN{"s"}[$s]}) { - next unless defined($RULE_ALIGNMENT_T{$t}); - $rule_alignment .= $RULE_ALIGNMENT_S{$s}."-".$RULE_ALIGNMENT_T{$t}." "; - } + print STDERR $target . "\n"; + foreach (@TARGET_BITMAP) { print STDERR $_; } + print STDERR "\n"; + foreach ( sort keys %FRAME_INPUT ) { + print STDERR "$_: $FRAME_INPUT{$_}\n"; + } + + ### STEP 2: BUILD RULE AND FRAME + + # hierarchical rule + my $rule_s = ""; + my $rule_pos_s = 0; + my %RULE_ALIGNMENT_S; + for ( my $i = 0 ; $i < scalar(@INPUT_BITMAP) ; $i++ ) { + if ( $INPUT_BITMAP[$i] ) { + $rule_s .= $INPUT[$i] . " "; + $RULE_ALIGNMENT_S{ $ALIGNMENT_I_TO_S{$i} } = $rule_pos_s++; } foreach my $NT (@NT) { - $rule_alignment .= $$NT{"rule_pos_s"}."-".$$NT{"rule_pos_t"}." "; + if ( $i == $$NT{"start_i"} ) { + $rule_s .= "[X][X] "; + $$NT{"rule_pos_s"} = $rule_pos_s++; + } } - - chop($rule_s); - chop($rule_t); - chop($rule_alignment); + } - my $rule_alignment_inv = ""; - foreach (split(/ /,$rule_alignment)) { - /^(\d+)\-(\d+)$/; - $rule_alignment_inv .= "$2-$1 "; + my $rule_t = ""; + my $rule_pos_t = 0; + my %RULE_ALIGNMENT_T; + for ( my $t = -1 ; $t < scalar(@TARGET_BITMAP) ; $t++ ) { + if ( $t >= 0 && $TARGET_BITMAP[$t] ) { + $rule_t .= $TARGET[$t] . " "; + $RULE_ALIGNMENT_T{$t} = $rule_pos_t++; } - chop($rule_alignment_inv); + foreach my $NT (@NT) { + if ( $t == $$NT{"start_t"} ) { + $rule_t .= "[X][X] "; + $$NT{"rule_pos_t"} = $rule_pos_t++; + } + } + } - # frame - my $frame = ""; - $frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1}; + my $rule_alignment = ""; + foreach my $s ( sort { $a <=> $b } keys %RULE_ALIGNMENT_S ) { + foreach my $t ( keys %{ $ALIGN{"s"}[$s] } ) { + next unless defined( $RULE_ALIGNMENT_T{$t} ); + $rule_alignment .= + $RULE_ALIGNMENT_S{$s} . "-" . $RULE_ALIGNMENT_T{$t} . " "; + } + } + foreach my $NT (@NT) { + $rule_alignment .= $$NT{"rule_pos_s"} . "-" . $$NT{"rule_pos_t"} . " "; + } - my $currently_included = 0; - my $start_t = -1; - push @TARGET_BITMAP,0; # indicate end + chop($rule_s); + chop($rule_t); + chop($rule_alignment); - for(my $t=0;$t<=scalar(@TARGET);$t++) { - # beginning of tm target inclusion - if (!$currently_included && $TARGET_BITMAP[$t]) { - $start_t = $t; - $currently_included = 1; - } - - # end of tm target inclusion (not included word or inserted input) - elsif ($currently_included && - (!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) { - # add xml (unless change is at the beginning of the sentence - if ($start_t >= 0) { - my $target = ""; - print STDERR "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n"; - for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) { - $target .= $TARGET[$tt] . " "; - } - chop($target); - $frame .= " x "; - } - $currently_included = 0; - } - - $frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t}; - print STDERR "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n"; + my $rule_alignment_inv = ""; + foreach ( split( / /, $rule_alignment ) ) { + /^(\d+)\-(\d+)$/; + $rule_alignment_inv .= "$2-$1 "; + } + chop($rule_alignment_inv); + + # frame + my $frame = ""; + $frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1}; + + my $currently_included = 0; + my $start_t = -1; + push @TARGET_BITMAP, 0; # indicate end + + for ( my $t = 0 ; $t <= scalar(@TARGET) ; $t++ ) { + + # beginning of tm target inclusion + if ( !$currently_included && $TARGET_BITMAP[$t] ) { + $start_t = $t; + $currently_included = 1; } - print STDERR $frame."\n-------------------------------------\n"; - return ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv); + # end of tm target inclusion (not included word or inserted input) + elsif ( $currently_included + && ( !$TARGET_BITMAP[$t] || defined( $FRAME_INPUT{$t} ) ) ) + { + # add xml (unless change is at the beginning of the sentence + if ( $start_t >= 0 ) { + my $target = ""; + print STDERR "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n"; + for ( my $tt = $start_t ; $tt < $t + $TARGET_BITMAP[$t] ; $tt++ ) { + $target .= $TARGET[$tt] . " "; + } + chop($target); + $frame .= " x "; + } + $currently_included = 0; + } + + $frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t}; + print STDERR "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n"; + } + + print STDERR $frame . "\n-------------------------------------\n"; + return ( $frame, $rule_s, $rule_t, $rule_alignment, $rule_alignment_inv ); } sub create_alignment { - my ($line) = @_; - my (@ALIGNED_TO_S,@ALIGNED_TO_T); - foreach my $point (split(/ /,$line)) { - my ($s,$t) = split(/\-/,$point); - $ALIGNED_TO_S[$s]{$t}++; - $ALIGNED_TO_T[$t]{$s}++; - } - my %ALIGNMENT = ( 's' => \@ALIGNED_TO_S, 't' => \@ALIGNED_TO_T ); - return %ALIGNMENT; + my ($line) = @_; + my ( @ALIGNED_TO_S, @ALIGNED_TO_T ); + foreach my $point ( split( / /, $line ) ) { + my ( $s, $t ) = split( /\-/, $point ); + $ALIGNED_TO_S[$s]{$t}++; + $ALIGNED_TO_T[$t]{$s}++; + } + my %ALIGNMENT = ( 's' => \@ALIGNED_TO_S, 't' => \@ALIGNED_TO_T ); + return %ALIGNMENT; } # Perl trim function to remove whitespace from the start and end of the string -sub trim($) -{ - my $string = shift; - $string =~ s/^\s+//; - $string =~ s/\s+$//; - return $string; +sub trim($) { + my $string = shift; + $string =~ s/^\s+//; + $string =~ s/\s+$//; + return $string; } + # Left trim function to remove leading whitespace -sub ltrim($) -{ - my $string = shift; - $string =~ s/^\s+//; - return $string; +sub ltrim($) { + my $string = shift; + $string =~ s/^\s+//; + return $string; } + # Right trim function to remove trailing whitespace -sub rtrim($) -{ - my $string = shift; - $string =~ s/\s+$//; - return $string; -} \ No newline at end of file +sub rtrim($) { + my $string = shift; + $string =~ s/\s+$//; + return $string; +}