From 07498356d234d55376d39c9f65b2fd15298d48b4 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 17 Jul 2012 18:20:53 +0100 Subject: [PATCH] tm-mt integration. Runs all the way through to creating rule table without crashing --- .../tm-mt-integration/make-pt-from-tm.perl | 296 ++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100755 contrib/tm-mt-integration/make-pt-from-tm.perl diff --git a/contrib/tm-mt-integration/make-pt-from-tm.perl b/contrib/tm-mt-integration/make-pt-from-tm.perl new file mode 100755 index 000000000..eb62107c7 --- /dev/null +++ b/contrib/tm-mt-integration/make-pt-from-tm.perl @@ -0,0 +1,296 @@ +#!/usr/bin/perl -w + +use strict; +use FindBin qw($RealBin); +use File::Basename; + +my $DEBUG = 1; +my $OUTPUT_RULES = 1; + +#my $data_root = "/Users/hieuhoang/workspace/experiment/data/tm-mt-integration/"; +my $in_file = $ARGV[0]; #"$data_root/in/ac-test.input.tc.4"; +my $source_file = $ARGV[1]; #"$data_root/in/acquis.truecased.4.en.uniq"; +my $target_file = $ARGV[2]; #"$data_root/in/acquis.truecased.4.fr.uniq"; +my $alignment_file = $ARGV[3]; #"$data_root/in/acquis.truecased.4.align.uniq"; +my $lex_file = $ARGV[4]; #$data_root/in/lex.4; +my $pt_file = $ARGV[5]; #"$data_root/out/pt"; + +my $cmd; + +print "pt_file=$pt_file \n"; +my $TMPDIR=dirname($pt_file) ."/tmp.$$"; +$cmd = "mkdir -p $TMPDIR"; +`$cmd`; + +my $match_file = "$TMPDIR/match"; + +# suffix array creation and extraction +$cmd = "$RealBin/fuzzy-match --multiple $in_file $source_file > $match_file"; +`$cmd`; + +# make into xml and pt +my $out_file = "$TMPDIR/ac-test.input.xml.4.uniq.multi.tuning"; + +my @INPUT = `cat $in_file`; chop(@INPUT); +my @ALL_SOURCE = `cat $source_file`; chop(@ALL_SOURCE); +my @ALL_TARGET = `cat $target_file`; chop(@ALL_TARGET); +my @ALL_ALIGNMENT = `cat $alignment_file`; chop(@ALL_ALIGNMENT); + +open(MATCH,$match_file); +open(FRAME,">$out_file"); +open(RULE,">$out_file.extract") if $OUTPUT_RULES; +open(RULE_INV,">$out_file.extract.inv") if 
$OUTPUT_RULES; +open(INFO,">$out_file.info"); +while( my $match = ) { + chop($match); + my ($score,$sentence,$path) = split(/ \|\|\| /,$match); + + $score =~ /^(\d+) (.+)/ || die; + my ($i,$match_score) = ($1,$2); + + # construct frame + if ($sentence < 1e9 && $sentence >= 0) { + my $SOURCE = $ALL_SOURCE[$sentence]; + my @ALIGNMENT = split(/ \|\|\| /,$ALL_ALIGNMENT[$sentence]); + my @TARGET = split(/ \|\|\| /,$ALL_TARGET[$sentence]); + + for(my $j=0;$j $out_file.extract.sorted.gz`; +`LC_ALL=C sort $out_file.extract.inv | gzip -c > $out_file.extract.inv.sorted.gz`; + +`$RealBin/../../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $out_file.extract -lexical-file $lex_file -phrase-translation-table $pt_file` if $OUTPUT_RULES; + +sub create_xml { + my ($source,$input,$target,$alignment,$path) = @_; + + my @INPUT = split(/ /,$input); + my @SOURCE = split(/ /,$source); + my @TARGET = split(/ /,$target); + my %ALIGN = &create_alignment($alignment); + + my %FRAME_INPUT; + my (@NT,@INPUT_BITMAP,@TARGET_BITMAP,%ALIGNMENT_I_TO_S); + foreach (@TARGET) { push @TARGET_BITMAP,1 } + + ### STEP 1: FIND MISMATCHES + + my ($s,$i) = (0,0); + my $currently_matching = 0; + my ($start_s,$start_i) = (0,0); + + $path .= "X"; # indicate end + print "$input\n$source\n$target\n$path\n"; + for(my $p=0;$p $#INPUT) { + $start_t = $#TARGET; + } + + # backtrack to previous words if unaligned + if ($start_t == 1000) { + $start_t = -1; + for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) { + foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) { + $start_t = $tt if $tt > $start_t; + } + } + } + $FRAME_INPUT{$start_t} .= $insertion; + my %NT = ("start_t" => $start_t, + "start_i" => $start_i ); + push @NT,\%NT; + } + $currently_matching = 1; + } + + print "$action $s $i ($start_s $start_i) $currently_matching"; + if ($action ne "I") { + print " ->"; + foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) { + print " ".$tt; + } + } + print "\n"; + $s++ 
unless $action eq "I"; + $i++ unless $action eq "D"; + $ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D"; + push @INPUT_BITMAP, 1 if $action eq "M"; + push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S"; + } + + + print $target."\n"; + foreach (@TARGET_BITMAP) { print $_; } print "\n"; + foreach (sort keys %FRAME_INPUT) { + print "$_: $FRAME_INPUT{$_}\n"; + } + + ### STEP 2: BUILD RULE AND FRAME + + # hierarchical rule + my $rule_s = ""; + my $rule_pos_s = 0; + my %RULE_ALIGNMENT_S; + for(my $i=0;$i=0 && $TARGET_BITMAP[$t]) { + $rule_t .= $TARGET[$t]." "; + $RULE_ALIGNMENT_T{$t} = $rule_pos_t++; + } + foreach my $NT (@NT) { + if ($t == $$NT{"start_t"}) { + $rule_t .= "[X][X] "; + $$NT{"rule_pos_t"} = $rule_pos_t++; + } + } + } + + my $rule_alignment = ""; + foreach my $s (sort { $a <=> $b} keys %RULE_ALIGNMENT_S) { + foreach my $t (keys %{$ALIGN{"s"}[$s]}) { + next unless defined($RULE_ALIGNMENT_T{$t}); + $rule_alignment .= $RULE_ALIGNMENT_S{$s}."-".$RULE_ALIGNMENT_T{$t}." 
"; + } + + chop($rule_s); + chop($rule_t); + chop($rule_alignment); + + my $rule_alignment_inv = ""; + foreach (split(/ /,$rule_alignment)) { + /^(\d+)\-(\d+)$/; + $rule_alignment_inv .= "$2-$1 "; + } + chop($rule_alignment_inv); + + # frame + my $frame = ""; + $frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1}; + + my $currently_included = 0; + my $start_t = -1; + push @TARGET_BITMAP,0; # indicate end + + for(my $t=0;$t<=scalar(@TARGET);$t++) { + # beginning of tm target inclusion + if (!$currently_included && $TARGET_BITMAP[$t]) { + $start_t = $t; + $currently_included = 1; + } + + # end of tm target inclusion (not included word or inserted input) + elsif ($currently_included && + (!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) { + # add xml (unless change is at the beginning of the sentence + if ($start_t >= 0) { + my $target = ""; + print "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n"; + for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) { + $target .= $TARGET[$tt] . " "; + } + chop($target); + $frame .= " x "; + } + $currently_included = 0; + } + + $frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t}; + print "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n"; + } + + print $frame."\n-------------------------------------\n"; + return ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv); +} +
# Parse one word-alignment line into two lookup tables.
#
# Parameters:
#   $line - whitespace-separated alignment points of the form "S-T", where S
#           and T are non-negative integers (S indexes the 's' table, T the
#           't' table). An undefined or empty line yields two empty tables.
#
# Returns a flat key/value list (assign it to a hash, as before):
#   's' => array ref indexed by S; each element is a hash ref whose keys are
#          the T values paired with that S (values are occurrence counts)
#   't' => array ref indexed by T; each element is a hash ref whose keys are
#          the S values paired with that T
#
# Points that are not of the form "<int>-<int>" are reported with warn() and
# skipped, so they can no longer be recorded under index 0 / key "".
sub create_alignment {
    my ($line) = @_;
    my (@ALIGNED_TO_S, @ALIGNED_TO_T);

    # split ' ' (awk mode) drops leading, trailing and repeated whitespace,
    # so no empty "points" are produced the way split(/ /) produced them.
    foreach my $point (split ' ', defined($line) ? $line : '') {
        unless ($point =~ /\A([0-9]+)-([0-9]+)\z/) {
            warn "create_alignment: ignoring malformed alignment point '$point'\n";
            next;
        }
        my ($s, $t) = ($1, $2);
        $ALIGNED_TO_S[$s]{$t}++;
        $ALIGNED_TO_T[$t]{$s}++;
    }

    my %ALIGNMENT = ( 's' => \@ALIGNED_TO_S, 't' => \@ALIGNED_TO_T );
    return %ALIGNMENT;
}