more debugging of tm-mt scripts

2024-09-20 15:48:05 +03:00 · 2012-07-25 17:53:49 +01:00 · 2012-07-25 17:53:49 +01:00 · 41e990a814
commit 41e990a814
parent 13a83ae0c7
8 changed files with 4169 additions and 3633 deletions
--- a/contrib/other-builds/fuzzy-match.xcodeproj/project.pbxproj
+++ b/contrib/other-builds/fuzzy-match.xcodeproj/project.pbxproj
@ -77,10 +77,10 @@
 				1E42EFD715C00D6300E937EB /* Match.h */,
 				1E42EFD315C00C0A00E937EB /* SentenceAlignment.h */,
 				1E42EFD215C00BAE00E937EB /* Util.h */,
-				1E42EFD115C00AC100E937EB /* fuzzy-match2.h */,
 				1E806DCF15BED3D4001914A2 /* SuffixArray.cpp */,
 				1E806DD015BED3D4001914A2 /* SuffixArray.h */,
 				1E806DCD15BED3BC001914A2 /* fuzzy-match.cpp */,
+				1E42EFD115C00AC100E937EB /* fuzzy-match2.h */,
 				1E42EFA515BEFABD00E937EB /* fuzzy-match2.cpp */,
 				1E806DCA15BED3AC001914A2 /* Vocabulary.cpp */,
 				1E806DCB15BED3AC001914A2 /* Vocabulary.h */,
--- a/contrib/other-builds/fuzzy-match.xcodeproj/project.xcworkspace/xcuserdata/hieuhoang.xcuserdatad/UserInterfaceState.xcuserstate
+++ b/contrib/other-builds/fuzzy-match.xcodeproj/project.xcworkspace/xcuserdata/hieuhoang.xcuserdatad/UserInterfaceState.xcuserstate
--- a/contrib/other-builds/fuzzy-match.xcodeproj/xcuserdata/hieuhoang.xcuserdatad/xcdebugger/Breakpoints.xcbkptlist
+++ b/contrib/other-builds/fuzzy-match.xcodeproj/xcuserdata/hieuhoang.xcuserdatad/xcdebugger/Breakpoints.xcbkptlist
@ -3,74 +3,18 @@
   type = "1"
   version = "1.0">
   <FileBreakpoints>
-      <FileBreakpoint
-         shouldBeEnabled = "Yes"
-         ignoreCount = "0"
-         continueAfterRunningActions = "No"
-         isPathRelative = "0"
-         filePath = "/Users/hieuhoang/unison/workspace/github/hieuhoang/contrib/tm-mt-integration/fuzzy-match.cpp"
-         timestampString = "364836494.083835"
-         startingColumnNumber = "9223372036854775807"
-         endingColumnNumber = "9223372036854775807"
-         startingLineNumber = "825"
-         endingLineNumber = "825"
-         landmarkName = "main(int argc, char* argv[])"
-         landmarkType = "7">
-      </FileBreakpoint>
-      <FileBreakpoint
-         shouldBeEnabled = "Yes"
-         ignoreCount = "0"
-         continueAfterRunningActions = "No"
-         isPathRelative = "0"
-         filePath = "/Users/hieuhoang/unison/workspace/github/hieuhoang/contrib/tm-mt-integration/fuzzy-match.cpp"
-         timestampString = "364836573.089496"
-         startingColumnNumber = "9223372036854775807"
-         endingColumnNumber = "9223372036854775807"
-         startingLineNumber = "206"
-         endingLineNumber = "206"
-         landmarkName = "sed( const vector&lt; WORD_ID &gt; &amp;a, const vector&lt; WORD_ID &gt; &amp;b, string &amp;best_path, bool use_letter_sed )"
-         landmarkType = "7">
-      </FileBreakpoint>
      <FileBreakpoint
         shouldBeEnabled = "Yes"
         ignoreCount = "0"
         continueAfterRunningActions = "No"
         isPathRelative = "0"
         filePath = "/Users/hieuhoang/unison/workspace/github/hieuhoang/contrib/tm-mt-integration/fuzzy-match2.cpp"
-         timestampString = "364843192.030752"
+         timestampString = "364924840.150553"
         startingColumnNumber = "9223372036854775807"
         endingColumnNumber = "9223372036854775807"
-         startingLineNumber = "851"
-         endingLineNumber = "851"
-         landmarkName = "main(int argc, char* argv[])"
-         landmarkType = "7">
-      </FileBreakpoint>
-      <FileBreakpoint
-         shouldBeEnabled = "Yes"
-         ignoreCount = "0"
-         continueAfterRunningActions = "No"
-         isPathRelative = "0"
-         filePath = "/Users/hieuhoang/unison/workspace/github/hieuhoang/contrib/tm-mt-integration/fuzzy-match2.cpp"
-         timestampString = "364843261.346081"
-         startingColumnNumber = "9223372036854775807"
-         endingColumnNumber = "9223372036854775807"
-         startingLineNumber = "938"
-         endingLineNumber = "938"
-         landmarkName = "main(int argc, char* argv[])"
-         landmarkType = "7">
-      </FileBreakpoint>
-      <FileBreakpoint
-         shouldBeEnabled = "Yes"
-         ignoreCount = "0"
-         continueAfterRunningActions = "No"
-         isPathRelative = "0"
-         filePath = "/Users/hieuhoang/unison/workspace/github/hieuhoang/contrib/tm-mt-integration/fuzzy-match2.cpp"
-         timestampString = "364843304.325754"
-         startingColumnNumber = "9223372036854775807"
-         endingColumnNumber = "9223372036854775807"
-         startingLineNumber = "1035"
-         endingLineNumber = "1035"
-         landmarkName = "main(int argc, char* argv[])"
+         startingLineNumber = "454"
+         endingLineNumber = "454"
+         landmarkName = "create_extract(const vector&lt; WORD_ID &gt; &amp;sourceSentence, const vector&lt;SentenceAlignment&gt; &amp;targets, const string &amp;inputStr, const string &amp;path)"
         landmarkType = "7">
      </FileBreakpoint>
   </FileBreakpoints>
--- a/contrib/other-builds/fuzzy-match.xcodeproj/xcuserdata/hieuhoang.xcuserdatad/xcschemes/fuzzy-match.xcscheme
+++ b/contrib/other-builds/fuzzy-match.xcodeproj/xcuserdata/hieuhoang.xcuserdatad/xcschemes/fuzzy-match.xcscheme
@ -44,6 +44,12 @@
            ReferencedContainer = "container:fuzzy-match.xcodeproj">
         </BuildableReference>
      </BuildableProductRunnable>
+      <CommandLineArguments>
+         <CommandLineArgument
+            argument = "--multiple /Users/hieuhoang/workspace/experiment/data/tm-mt-integration//in/ac-test.input.tc.4 /Users/hieuhoang/workspace/experiment/data/tm-mt-integration//in/acquis.truecased.4.en.uniq"
+            isEnabled = "YES">
+         </CommandLineArgument>
+      </CommandLineArguments>
      <AdditionalOptions>
      </AdditionalOptions>
   </LaunchAction>
--- a/contrib/tm-mt-integration/create_xml.perl
+++ b/contrib/tm-mt-integration/create_xml.perl
@ -0,0 +1,276 @@
+#!/usr/bin/perl -w 
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+use strict;
+use FindBin qw($RealBin);
+use File::Basename;
+
+sub trim($);
+
+# create_xml() echoes the decoded sentences on STDERR, so that handle needs
+# the same UTF-8 layer as STDIN/STDOUT to avoid "Wide character" warnings.
+binmode(STDERR, ":utf8");
+
+# Every record on STDIN consists of five consecutive lines:
+#   source sentence, input sentence, target sentence, alignment, edit path
+my ($source, $input, $target, $align, $path);
+
+while (defined($source = <STDIN>)) {
+	$input  = <STDIN>;
+	$target = <STDIN>;
+	$align  = <STDIN>;
+	$path   = <STDIN>;
+
+	# A truncated last record would otherwise push undef through chomp/trim
+	# and into create_xml.
+	die "create_xml.perl: incomplete record at end of input\n"
+	    unless defined($input) && defined($target) && defined($align) && defined($path);
+
+	chomp($source, $input, $target, $align, $path);
+	$source = trim($source);
+	$input  = trim($input);
+	$target = trim($target);
+	$align  = trim($align);
+	$path   = trim($path);
+
+	my ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv) = create_xml($source, $input, $target, $align, $path);
+
+	print STDOUT $frame."\n";
+
+	# The five-line record carries no target count, sentence index or match
+	# score.  The variables that used to be interpolated here were never
+	# declared, which made the script fail to compile under "use strict",
+	# so only the fields that can actually be computed are written.
+	print STDOUT "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment\n";
+	print STDOUT "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv\n";
+
+}
+
+#######################################################
+# Turn one fuzzy-match record into an XML frame and a hierarchical rule.
+#
+# Arguments:
+#   $source    - source sentence of the match, tokens separated by single spaces
+#   $input     - input sentence, same tokenisation
+#   $target    - target sentence, same tokenisation
+#   $alignment - alignment points, handed to create_alignment()
+#   $path      - edit path, one action character per step.  "M" and "S"
+#                advance both the source and the input position, "I" only
+#                the input position, "D" only the source position
+#                (presumably match / substitute / insert / delete).
+#
+# Returns the list ($frame, $rule_s, $rule_t, $rule_alignment, $rule_alignment_inv).
+# A step-by-step trace of the computation is written to STDERR.
+sub create_xml {
+    my ($source,$input,$target,$alignment,$path) = @_;
+    
+    my @INPUT = split(/ /,$input);
+    my @SOURCE = split(/ /,$source);
+    my @TARGET = split(/ /,$target);
+    # $ALIGN{'s'}[$s] is a hash whose keys are the target positions aligned
+    # to source position $s
+    my %ALIGN = &create_alignment($alignment);
+    
+    # %FRAME_INPUT: target position => input words to be placed at that position
+    my %FRAME_INPUT;
+    # @NT: one hash per nonterminal gap; the bitmaps flag which input / target
+    # words stay in the rule (1) and which are dropped (0)
+    my (@NT,@INPUT_BITMAP,@TARGET_BITMAP,%ALIGNMENT_I_TO_S);
+    # every target word starts out as kept
+    foreach (@TARGET) { push @TARGET_BITMAP,1 }
+    
+    ### STEP 1: FIND MISMATCHES
+
+    # $s / $i: current position in the source / input sentence
+    my ($s,$i) = (0,0);
+    my $currently_matching = 0;
+    # positions at which the current mismatch region began
+    my ($start_s,$start_i) = (0,0);
+
+    $path .= "X"; # indicate end
+    print STDERR "$input\n$source\n$target\n$path\n";
+    for(my $p=0;$p<length($path);$p++) {
+	my $action = substr($path,$p,1);
+
+	# beginning of a mismatch
+	if ($currently_matching && $action ne "M" && $action ne "X") {
+	    $start_i = $i;
+	    $start_s = $s;
+	    $currently_matching = 0;
+	}
+	
+	# end of a mismatch
+	# (also taken on the very first step when the path starts with "M",
+	# because $currently_matching starts at 0; the region is empty then)
+	elsif (!$currently_matching && 
+	       ($action eq "M" || $action eq "X")) {
+	    
+	    # remove use of affected target words
+	    for(my $ss = $start_s; $ss<$s; $ss++) {
+		foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
+		    $TARGET_BITMAP[$tt] = 0;
+		}
+		
+		# also remove enclosed unaligned words?
+	    }
+	    
+	    # are there input words that need to be inserted ?
+	    print STDERR "($start_i<$i)?\n";
+	    if ($start_i<$i) {
+		
+		# take note of input words to be inserted
+		my $insertion = "";
+		for(my $ii = $start_i; $ii<$i; $ii++) {
+		    $insertion .= $INPUT[$ii]." ";
+		}
+		
+		# find position for inserted input words
+		
+		# find first removed target word
+		# 1000 acts as the "nothing found" sentinel
+		# NOTE(review): this assumes target positions stay below 1000 - confirm
+		my $start_t = 1000;
+		for(my $ss = $start_s; $ss<$s; $ss++) {
+		    foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
+			$start_t = $tt if $tt < $start_t;
+		    }
+		}
+
+		# end of sentence? add to end
+		if ($start_t == 1000 && $i > $#INPUT) {
+		    $start_t = $#TARGET;
+		}
+		
+		# backtrack to previous words if unaligned
+		# (walks left from $s-1 until a source word with aligned target
+		# words is found and takes the largest of its target positions;
+		# stays -1 if none exists)
+		if ($start_t == 1000) {
+		    $start_t = -1;
+		    for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) {
+			foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
+			    $start_t = $tt if $tt > $start_t;
+			}
+		    }
+		}
+		$FRAME_INPUT{$start_t} .= $insertion;
+		# remember where this gap sits on the target and the input side
+		my %NT = ("start_t" => $start_t,
+			  "start_i" => $start_i );
+		push @NT,\%NT;		
+	    }	    
+	    $currently_matching = 1;
+	}
+	
+	print STDERR "$action $s $i ($start_s $start_i) $currently_matching";
+	if ($action ne "I") {
+	    print STDERR " ->";
+	    foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) {
+		print STDERR " ".$tt;
+	    }
+	}
+	print STDERR "\n";
+	$s++ unless $action eq "I";
+	$i++ unless $action eq "D";
+	# NOTE(review): $i and $s have already been incremented here, whereas
+	# the rule loop in STEP 2 looks this map up with 0-based input
+	# positions - confirm the offset is intended
+	$ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D";
+	# one bitmap entry per consumed input word; "D" and "X" add none
+	push @INPUT_BITMAP, 1 if $action eq "M";
+	push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S";
+    }
+    
+
+    print STDERR $target."\n";
+    foreach (@TARGET_BITMAP) { print STDERR $_; } print STDERR "\n";
+    foreach (sort keys %FRAME_INPUT) { 
+	print STDERR "$_: $FRAME_INPUT{$_}\n";
+    }
+
+    ### STEP 2: BUILD RULE AND FRAME
+        
+    # hierarchical rule
+    # source side: kept input words, with "[X][X]" where a gap starts
+    my $rule_s = "";
+    my $rule_pos_s = 0;
+    my %RULE_ALIGNMENT_S;
+    for(my $i=0;$i<scalar(@INPUT_BITMAP);$i++) {
+		if ($INPUT_BITMAP[$i]) {
+			$rule_s .= $INPUT[$i]." ";
+			$RULE_ALIGNMENT_S{$ALIGNMENT_I_TO_S{$i}} = $rule_pos_s++;
+		}
+		foreach my $NT (@NT) {
+			if ($i == $$NT{"start_i"}) {
+				$rule_s .= "[X][X] ";
+				$$NT{"rule_pos_s"} = $rule_pos_s++;
+			}
+		}
+    }
+
+    # target side: kept target words, with "[X][X]" at each gap's target
+    # position; $t starts at -1 so that gaps recorded with start_t == -1
+    # are emitted before the first target word
+    my $rule_t = "";
+    my $rule_pos_t = 0;
+    my %RULE_ALIGNMENT_T;
+    for(my $t=-1;$t<scalar(@TARGET_BITMAP);$t++) {
+	if ($t>=0 && $TARGET_BITMAP[$t]) {
+	    $rule_t .= $TARGET[$t]." ";
+	    $RULE_ALIGNMENT_T{$t} = $rule_pos_t++;
+	}
+	foreach my $NT (@NT) {
+	    if ($t == $$NT{"start_t"}) {
+		$rule_t .= "[X][X] ";
+		$$NT{"rule_pos_t"} = $rule_pos_t++;
+	    }
+	}
+    }
+
+    # word alignment in rule positions: first the kept words, then one
+    # point per gap
+    my $rule_alignment = "";
+    foreach my $s (sort { $a <=> $b} keys %RULE_ALIGNMENT_S) {
+	foreach my $t (keys %{$ALIGN{"s"}[$s]}) {
+	    next unless defined($RULE_ALIGNMENT_T{$t});
+	    $rule_alignment .= $RULE_ALIGNMENT_S{$s}."-".$RULE_ALIGNMENT_T{$t}." ";
+	}
+    }
+    foreach my $NT (@NT) {
+	$rule_alignment .= $$NT{"rule_pos_s"}."-".$$NT{"rule_pos_t"}." ";
+    }
+    
+    # drop the trailing space of each string
+    chop($rule_s);
+    chop($rule_t);
+    chop($rule_alignment);
+
+    # same alignment with source and target positions swapped
+    my $rule_alignment_inv = "";
+    foreach (split(/ /,$rule_alignment)) {
+	# NOTE(review): $1/$2 are used without checking that the match succeeded
+	/^(\d+)\-(\d+)$/;
+	$rule_alignment_inv .= "$2-$1 ";
+    }
+    chop($rule_alignment_inv);
+
+    # frame
+    # input words recorded for position -1 open the frame
+    my $frame = "";
+    $frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1};
+
+    my $currently_included = 0;
+    my $start_t = -1;
+    push @TARGET_BITMAP,0; # indicate end
+
+    for(my $t=0;$t<=scalar(@TARGET);$t++) {	    
+	# beginning of tm target inclusion
+	if (!$currently_included && $TARGET_BITMAP[$t]) {
+	    $start_t = $t;
+	    $currently_included = 1;
+	}
+	
+	# end of tm target inclusion (not included word or inserted input)
+	elsif ($currently_included && 
+	       (!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) {
+	    # add xml (unless change is at the beginning of the sentence)
+	    if ($start_t >= 0) {
+		my $target = "";
+		print STDERR "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
+		# $TARGET_BITMAP[$t] is 1 when the span was cut by an insertion
+		# at a still-kept word, so word $t itself joins the span
+		for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) {
+		    $target .= $TARGET[$tt] . " ";
+		}
+		chop($target);
+		$frame .= "<xml translation=\"$target\"> x </xml> ";
+	    }
+	    $currently_included = 0;
+	}
+	
+	# input words recorded for this target position follow the span
+	$frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t};
+	print STDERR "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n";
+    }
+
+    print STDERR $frame."\n-------------------------------------\n";
+    return ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv);
+}
+
+# Parse a word-alignment line into lookups in both directions.
+#
+# $line holds whitespace-separated "s-t" index pairs, e.g. "0-0 1-2 1-3".
+# Returns a hash (as a list) with two keys:
+#   's' => array ref indexed by source position; each element is a hash ref
+#          whose keys are the target positions aligned to that source word
+#   't' => the same structure indexed by target position
+# Tokens that are not of the form "digits-digits" are skipped, so doubled or
+# stray whitespace no longer creates a bogus point at position 0.
+sub create_alignment {
+	my ($line) = @_;
+	my (@ALIGNED_TO_S,@ALIGNED_TO_T);
+	$line = "" unless defined $line;
+	# split(' ') collapses runs of whitespace and never yields empty tokens
+	foreach my $point (split(' ',$line)) {
+		next unless $point =~ /^(\d+)-(\d+)$/;
+		my ($s,$t) = ($1,$2);
+		$ALIGNED_TO_S[$s]{$t}++;
+		$ALIGNED_TO_T[$t]{$s}++;
+	}
+	my %ALIGNMENT = ( 's' => \@ALIGNED_TO_S, 't' => \@ALIGNED_TO_T );
+	return %ALIGNMENT;
+}
+
+# Return a copy of the argument with leading and trailing whitespace removed.
+sub trim($)
+{
+	my ($text) = @_;
+	for ($text) {
+		s/^\s+//;
+		s/\s+$//;
+	}
+	return $text;
+}
+# Return a copy of the argument with leading whitespace removed.
+sub ltrim($)
+{
+	(my $text = shift) =~ s/^\s+//;
+	return $text;
+}
+# Return a copy of the argument with trailing whitespace removed.
+sub rtrim($)
+{
+	(my $text = shift) =~ s/\s+$//;
+	return $text;
+}
--- a/contrib/tm-mt-integration/fuzzy-match2.cpp
+++ b/contrib/tm-mt-integration/fuzzy-match2.cpp
@ -7,6 +7,7 @@
 #include <fstream>
 #include <cstring>
 #include <time.h>
+#include <fstream>

 #include "fuzzy-match2.h"

@ -348,7 +349,7 @@ int main(int argc, char* argv[])
    // create xml and extract files
    string inputStr, sourceStr;
    for (size_t pos = 0; pos < input_length; ++pos) {
-      inputStr += input[i][pos] + " ";
+      inputStr += vocabulary.GetWord(input[i][pos]) + " ";
    }
    
 		// do not try to find the best ... report multiple matches
@ -363,6 +364,11 @@ int main(int argc, char* argv[])
 				cout << letter_cost << "/" << input_letter_length << " ";
 				cout << "(" << best_cost <<"/" << input_length <<") ";
 				cout << "||| " << s << " ||| " << path << endl;
+        
+        vector<WORD_ID> &sourceSentence = source[s];
+        vector<SentenceAlignment> &targets = targetAndAlignment[s];
+        create_extract(sourceSentence, targets, inputStr, path);
+
 			}
 		} // if (multiple_flag)
    else {
@ -410,23 +416,8 @@ int main(int argc, char* argv[])

     // create xml & extracts
      vector<WORD_ID> &sourceSentence = source[best_match];
-      for (size_t pos = 0; pos < sourceSentence.size(); ++pos) {
-        WORD_ID wordId = sourceSentence[pos];
-        sourceStr += vocabulary.GetWord(wordId) + " ";
-      }
-      
      vector<SentenceAlignment> &targets = targetAndAlignment[best_match];
-      for (size_t targetInd = 0; targetInd < targets.size(); ++targetInd) {
-        const SentenceAlignment &sentenceAlignment = targets[targetInd]; 
-        string targetStr = sentenceAlignment.getTargetString();
-        string alignStr = sentenceAlignment.getAlignmentString();
-        
-        cerr << "create_xml " << endl 
-        << sourceStr << endl 
-        << inputStr << endl
-        << targetStr << endl
-        << alignStr << endl;
-      }
+      create_extract(sourceSentence, targets, inputStr, best_path);

    } // else if (multiple_flag)
    
@ -435,3 +426,33 @@ int main(int argc, char* argv[])
 	cerr << "total: " << (1000 * (clock()-start_main_clock) / CLOCKS_PER_SEC) << endl;
 	
 }
+
+// Write one fuzzy-match hit to a temporary file: for every entry in
+// `targets` five lines are emitted (source sentence, input sentence, target
+// string, alignment string, edit path).  The perl command line naming that
+// file is then printed on stderr; it is not executed here.
+void create_extract(const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string  &path)
+{
+  // spell the source sentence out as text, one trailing space per word
+  string sourceStr;
+  for (vector< WORD_ID >::const_iterator word = sourceSentence.begin(); word != sourceSentence.end(); ++word) {
+    WORD_ID wordId = *word;
+    sourceStr += vocabulary.GetWord(wordId) + " ";
+  }
+
+  // NOTE(review): tmpnam() only hands back a name and the file is opened in a
+  // separate step - consider a safer temporary-file mechanism
+  char *inputFileName = tmpnam(NULL);
+  ofstream inputFile(inputFileName);
+
+  // one record per target / alignment pair
+  for (vector<SentenceAlignment>::const_iterator target = targets.begin(); target != targets.end(); ++target) {
+    string targetStr = target->getTargetString();
+    string alignStr = target->getAlignmentString();
+
+    inputFile << sourceStr << endl;
+    inputFile << inputStr << endl;
+    inputFile << targetStr << endl;
+    inputFile << alignStr << endl;
+    inputFile << path << endl;
+  }
+
+  string cmd = string("perl create_xml.perl ") + inputFileName;
+  cerr << cmd << endl;
+  inputFile.close();
+}
--- a/contrib/tm-mt-integration/fuzzy-match2.h
+++ b/contrib/tm-mt-integration/fuzzy-match2.h
@ -32,7 +32,10 @@ int multiple_flag = false;
 int multiple_slack = 0;
 int multiple_max = 100;
 map< WORD_ID,vector< int > > single_word_index;
+// global cache for word pairs
+map< pair< WORD_ID, WORD_ID >, unsigned int > lsed;

+void create_extract(const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string  &path);

 void load_corpus( const char* fileName, vector< vector< WORD_ID > > &corpus )
 { // source 
@ -159,9 +162,6 @@ void load_alignment( const char* fileName, vector< vector< SentenceAlignment > >

 /* Letter string edit distance, e.g. sub 'their' to 'there' costs 2 */

-// global cache for word pairs
-map< pair< WORD_ID, WORD_ID >, unsigned int > lsed;
-
 unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx )
 {
 	// check if already computed -> lookup in cache
--- a/contrib/tm-mt-integration/make-pt-from-tm.perl
+++ b/contrib/tm-mt-integration/make-pt-from-tm.perl
@ -1,4 +1,4 @@
-#!/usr/bin/perl -w -d 
+#!/usr/bin/perl -w 

 use strict;
 use FindBin qw($RealBin);
@ -20,12 +20,12 @@ my $cmd;
 my $TMPDIR=dirname($pt_file)  ."/tmp.$$";
 $cmd = "mkdir -p $TMPDIR";
 `$cmd`;
-$TMPDIR = "/Users/hieuhoang/workspace/experiment/data/tm-mt-integration/out/tmp.3196";

 my $match_file  = "$TMPDIR/match";

 # suffix array creation and extraction
 $cmd = "$RealBin/fuzzy-match --multiple $in_file  $source_file > $match_file";
+print STDERR "$cmd \n";
 `$cmd`;

 # make into xml and pt
@ -47,7 +47,8 @@ while( my $match = <MATCH> ) {

    $score =~ /^(\d+) (.+)/ || die;
    my ($i,$match_score) = ($1,$2);
-
+	print STDERR "i=$i\n";
+	
    # construct frame
    if ($sentence < 1e9 && $sentence >= 0) {
 		my $SOURCE = $ALL_SOURCE[$sentence];
@ -92,6 +93,8 @@ if ($OUTPUT_RULES)
 sub create_xml {
    my ($source,$input,$target,$alignment,$path) = @_;
    
+	print STDERR " HIEU \n $source \n $input \n $target \n $alignment \n $path \n";
+
    my @INPUT = split(/ /,$input);
    my @SOURCE = split(/ /,$source);
    my @TARGET = split(/ /,$target);