initial version of phrase-extract and phrase-score used by training script

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@567 1f5c12ca-751b-0410-a591-d2e778427230
2024-09-20 23:58:15 +03:00 · 2006-08-08 18:54:28 +00:00 · 2006-08-08 18:54:28 +00:00 · b83fc72dd2
commit b83fc72dd2
parent 7f8914c6d5
5 changed files with 839 additions and 0 deletions
--- a/scripts/training/phrase-extract/Makefile
+++ b/scripts/training/phrase-extract/Makefile
@ -0,0 +1,50 @@
+DECODER_FLAG	= -D_DECODER
+
+# ================================================================
+# Linux toolchain: compiler paths and the flags passed to every compile and link step
+CC  = /usr/bin/gcc# -Wno-deprecated
+CXX = /usr/bin/g++# -Wno-deprecated
+CCFLAGS = $(BYTESWAP_FLAG) $(DECODER_FLAG) -O3
+BYTESWAP_FLAG   = -DSLM_SWAP_BYTES
+OSNAME = linux
+
+# ================================================================
+# Directories used by the build
+# TopDir is the directory make is invoked from; everything else hangs off it
+TopDir := $(shell pwd)
+BinDir = $(TopDir)/bin
+SrcDir = $(TopDir) $(TopDir)/extract
+INCLUDES = -I$(TopDir) -I$(TopDir)/extract
+ObjDir = $(TopDir)/extractobj.$(OSNAME)
+
+# =========================================================
+# Object files linked into $(BinDir)/phrase-extract
+
+ExtractObjFiles = extract.o
+
+ExtractObjFilesWithPath = $(addprefix $(ObjDir)/, $(ExtractObjFiles))
+
+# =========================================================
+
+extract: OutputPaths $(ExtractObjFiles)
+	$(CXX) $(CCFLAGS) $(INCLUDES) -o $(BinDir)/phrase-extract $(ExtractObjFilesWithPath)
+
+OutputPaths: MakeObjDir MakeBinDir
+
+MakeObjDir: 
+	@mkdir -p $(ObjDir)
+MakeBinDir:
+	@mkdir -p $(BinDir)
+
+VPATH = $(ObjDir) $(SrcDir)
+
+%.o : %.cpp
+	@echo @$(CXX) -c $(CCFLAGS) $(INCLUDES) $< -o $(ObjDir)/$(@F)
+	@echo Compiling $(@F)
+	@$(CXX) -c $(CCFLAGS) $(INCLUDES) $< -o $(ObjDir)/$(@F)
+	@echo
+
+%.o : %.c
+	@echo Compiling $(@F)
+	@$(CC) -c $(CCFLAGS) $(INCLUDES) $< -o $(ObjDir)/$(@F)
+	@echo
--- a/scripts/training/phrase-extract/extract.cpp
+++ b/scripts/training/phrase-extract/extract.cpp
@ -0,0 +1,308 @@
+using namespace std;
+
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <stdlib.h>
+#include <assert.h>
+
+#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) {_IS.getline(_LINE, _SIZE, _DELIM); if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear();}
+#define LINE_MAX_LENGTH 10000
+
+class SentenceAlignment { // one sentence pair plus its word alignment, filled in by create()
+ public:
+  vector<string> english; // tokens of the English sentence
+  vector<string> foreign; // tokens of the foreign sentence
+  vector<int> alignedCountF; // per foreign position: number of alignment points that touch it
+  vector< vector<int> > alignedToE; // per English position: the foreign positions aligned to it
+
+  int create( char[], char[], char[], int ); // parses english, foreign, alignment line; returns 1 if usable, 0 otherwise
+  //  void clear() { delete(alignment); };
+};
+
+void extract( SentenceAlignment & );
+void addPhrase( SentenceAlignment &, int, int, int, int );
+vector<string> tokenize( char [] );
+bool isAligned ( SentenceAlignment &, int, int );
+
+ofstream extractFile; // current output part, lines "foreign ||| english ||| alignment"
+ofstream extractFileInv; // current inverse output part, lines "english ||| foreign ||| alignment"
+ofstream extractFileOrientation; // current orientation output part; only opened when orientationFlag is set
+int maxPhraseLength; // phrase length limit in tokens, set from argv[5]
+int phraseCount = 0; // phrase pairs written so far; addPhrase() starts a new output part every 10000000
+char* fileNameExtract; // prefix of all output file names, set from argv[4]
+bool orientationFlag; // true when a 7th command-line argument was given
+
+// Entry point of phrase-extract.
+//
+// Command line: phrase-extract en de align extract max-length [orientation]
+//   en, de, align  parallel input files, one sentence / alignment per line
+//   extract        prefix of the output files written by addPhrase()
+//   max-length     maximum phrase length, parsed with atoi
+//   orientation    any 7th argument switches on orientation output
+//
+// Each line triple is turned into a SentenceAlignment; extract() runs on
+// every triple that create() accepts. Reading stops at the end of the
+// English file. Exits with status 1 if an input file cannot be opened.
+int main(int argc, char* argv[]) 
+{
+  cerr << "PhraseExtract v1.3.0, written by Philipp Koehn\n"
+       << "phrase extraction from an aligned parallel corpus\n";
+
+  if (argc != 6 && argc != 7) {
+    cerr << "syntax: phrase-extract en de align extract max-length [orientation]\n";
+    exit(0);
+  }
+  char* &fileNameE = argv[1];
+  char* &fileNameF = argv[2];
+  char* &fileNameA = argv[3];
+  fileNameExtract = argv[4];
+  maxPhraseLength = atoi(argv[5]);
+  orientationFlag = (argc == 7);
+  if (orientationFlag) cerr << "(also extracting orientation)\n";
+
+  ifstream eFile;
+  ifstream fFile;
+  ifstream aFile;
+  eFile.open(fileNameE);
+  fFile.open(fileNameF);
+  aFile.open(fileNameA);
+
+  // An ifstream that failed to open never reaches eof(), and SAFE_GETLINE
+  // clears the fail bit again, so a missing English file would keep the
+  // loop below running forever. Stop right away instead.
+  if (eFile.fail()) {
+    cerr << "ERROR: could not open file " << fileNameE << endl;
+    exit(1);
+  }
+  if (fFile.fail()) {
+    cerr << "ERROR: could not open file " << fileNameF << endl;
+    exit(1);
+  }
+  if (aFile.fail()) {
+    cerr << "ERROR: could not open file " << fileNameA << endl;
+    exit(1);
+  }
+
+  int i=0;
+  while(true) {
+    i++;
+    if (i%10000 == 0) cerr << "." << flush;
+    char englishString[LINE_MAX_LENGTH];
+    char foreignString[LINE_MAX_LENGTH];
+    char alignmentString[LINE_MAX_LENGTH];
+    SAFE_GETLINE(eFile, englishString, LINE_MAX_LENGTH, '\n');
+    if (eFile.eof()) break; // only the English file decides when to stop
+    SAFE_GETLINE(fFile, foreignString, LINE_MAX_LENGTH, '\n');
+    SAFE_GETLINE(aFile, alignmentString, LINE_MAX_LENGTH, '\n');
+    SentenceAlignment sentence;
+    // i doubles as the 1-based sentence number used in warnings
+    if (sentence.create( englishString, foreignString, alignmentString, i ))
+      extract(sentence);
+  }
+
+  eFile.close();
+  fFile.close();
+  aFile.close();
+  extractFile.close();
+  extractFileInv.close();
+  if (orientationFlag) extractFileOrientation.close();
+}
+ 
+void extract( SentenceAlignment &sentence ) { // calls addPhrase() for every phrase pair of this sentence that is consistent with the word alignment
+  int countE = sentence.english.size();
+  int countF = sentence.foreign.size();
+
+  // check alignments for english phrase startE...endE
+  for(int startE=0;startE<countE;startE++) {
+    for(int endE=startE;
+	(endE<countE && endE<startE+maxPhraseLength);
+	endE++) {
+      
+      int minF = 9999; // sentinel; lowered to the smallest foreign position aligned to startE..endE
+      int maxF = -1; // stays -1 when no word in startE..endE is aligned
+      vector< int > usedF = sentence.alignedCountF; // working copy; one count is removed per alignment point inside the english span
+      for(int ei=startE;ei<=endE;ei++) {
+	for(int i=0;i<sentence.alignedToE[ei].size();i++) {
+	  int fi = sentence.alignedToE[ei][i];
+	  // cout << "point (" << fi << ", " << ei << ")\n";
+	  if (fi<minF) { minF = fi; }
+	  if (fi>maxF) { maxF = fi; }
+	  usedF[ fi ]--;
+	}
+      }
+      
+      // cout << "f projected ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n"; 
+
+      if (maxF >= 0 && // aligned to any foreign words at all
+	  maxF-minF < maxPhraseLength) { // foreign phrase within limits
+	
+	// check if foreign words are aligned to out of bound english words
+	bool out_of_bounds = false;
+	for(int fi=minF;fi<=maxF && !out_of_bounds;fi++)
+	  if (usedF[fi]>0) { // a count is left over, so fi is also aligned outside startE..endE
+	    // cout << "ouf of bounds: " << fi << "\n";
+	    out_of_bounds = true;
+	  }
+	
+	// cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n"; 
+	if (!out_of_bounds)
+	  // start point of foreign phrase may retreat over unaligned
+	  for(int startF=minF;
+	      (startF>=0 &&
+	       startF>maxF-maxPhraseLength && // within length limit
+	       (startF==minF || sentence.alignedCountF[startF]==0)); // unaligned
+	      startF--)
+	    // end point of foreign phrase may advance over unaligned
+	    for(int endF=maxF;
+		(endF<countF && 
+		 endF<startF+maxPhraseLength && // within length limit
+		 (endF==maxF || sentence.alignedCountF[endF]==0)); // unaligned
+		endF++) 
+	      addPhrase(sentence,startE,endE,startF,endF);
+      }
+    }
+  }
+}
+
+void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF ) { // writes the phrase pair english[startE..endE] / foreign[startF..endF] to the output files
+  // foreign
+  // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n"; 
+  if (phraseCount % 10000000 == 0) { // first phrase overall, or the current part is full: switch to the next part files
+    if (phraseCount>0) {
+      extractFile.close();
+      extractFileInv.close();
+      if (orientationFlag) extractFileOrientation.close();
+    }
+    char part[10]; // holds ".partNNNN" (9 characters) plus the terminating NUL
+    sprintf(part,".part%04d",phraseCount/10000000);
+    string fileNameExtractPart = string(fileNameExtract) + part;
+    string fileNameExtractInvPart = string(fileNameExtract) + ".inv" + part;
+    string fileNameExtractOrientationPart = string(fileNameExtract) + ".o" + part;
+    extractFile.open(fileNameExtractPart.c_str());
+    extractFileInv.open(fileNameExtractInvPart.c_str());
+    if (orientationFlag) extractFileOrientation.open(fileNameExtractOrientationPart.c_str());
+  }
+  phraseCount++;
+
+  for(int fi=startF;fi<=endF;fi++) {
+    extractFile << sentence.foreign[fi] << " ";
+    if (orientationFlag) extractFileOrientation << sentence.foreign[fi] << " ";
+  }
+  extractFile << "||| ";
+  if (orientationFlag) extractFileOrientation << "||| ";
+
+  // english
+  for(int ei=startE;ei<=endE;ei++) {
+    extractFile << sentence.english[ei] << " ";
+    extractFileInv << sentence.english[ei] << " ";
+    if (orientationFlag) extractFileOrientation << sentence.english[ei] << " ";
+  }
+  extractFile << "|||";
+  extractFileInv << "||| ";
+  if (orientationFlag) extractFileOrientation << "||| ";
+
+  // foreign (for inverse)
+  for(int fi=startF;fi<=endF;fi++)
+    extractFileInv << sentence.foreign[fi] << " ";
+  extractFileInv << "|||";
+
+  // alignment
+  for(int ei=startE;ei<=endE;ei++) 
+    for(int i=0;i<sentence.alignedToE[ei].size();i++) {
+      int fi = sentence.alignedToE[ei][i];
+      extractFile << " " << fi-startF << "-" << ei-startE; // positions are written relative to the phrase start, foreign first
+      extractFileInv << " " << ei-startE << "-" << fi-startF; // same point with english first
+    }
+
+  if (orientationFlag) {
+
+    // orientation to previous E
+    bool connectedLeftTop  = isAligned( sentence, startF-1, startE-1 );
+    bool connectedRightTop = isAligned( sentence, endF+1,   startE-1 );
+    if      ( connectedLeftTop && !connectedRightTop) 
+      extractFileOrientation << "mono";
+    else if (!connectedLeftTop &&  connectedRightTop) 
+      extractFileOrientation << "swap";
+    else 
+      extractFileOrientation << "other";
+  
+    // orientation to following E
+    bool connectedLeftBottom  = isAligned( sentence, startF-1, endE+1 );
+    bool connectedRightBottom = isAligned( sentence, endF+1,   endE+1 );
+    if      ( connectedLeftBottom && !connectedRightBottom) 
+      extractFileOrientation << " swap";
+    else if (!connectedLeftBottom &&  connectedRightBottom) 
+      extractFileOrientation << " mono";
+    else 
+      extractFileOrientation << " other";
+  }
+
+  extractFile << "\n";
+  extractFileInv << "\n";
+  if (orientationFlag) extractFileOrientation << "\n";
+}
+  
+// Tells whether foreign position fi and English position ei are linked by an
+// alignment point. The position just before both sentences (-1, -1) and the
+// position just past both sentences count as aligned to each other; any other
+// position outside the sentence is never aligned.
+bool isAligned ( SentenceAlignment &sentence, int fi, int ei ) {
+  const int lengthE = sentence.english.size();
+  const int lengthF = sentence.foreign.size();
+
+  // virtual point in front of the sentence pair
+  if (ei == -1 && fi == -1) return true;
+  if (ei < 0 || fi < 0) return false;
+
+  // virtual point behind the sentence pair
+  if (ei == lengthE && fi == lengthF) return true;
+  if (ei >= lengthE || fi >= lengthF) return false;
+
+  // regular case: look for fi among the points recorded for ei
+  const vector<int> &links = sentence.alignedToE[ei];
+  for(size_t k=0; k<links.size(); k++) {
+    if (links[k] == fi) return true;
+  }
+  return false;
+}
+
+// as in beamdecoder/tables.cpp
+// Splits a NUL-terminated line into the words separated by runs of spaces
+// and tabs. Leading and trailing separators produce no empty tokens.
+vector<string> tokenize( char input[] ) {
+  vector< string > token;
+  int pos = 0;
+  while (input[pos] != '\0') {
+    // skip the separators in front of the next word
+    while (input[pos] == ' ' || input[pos] == '\t') pos++;
+    if (input[pos] == '\0') break;
+
+    // consume the word itself
+    const int wordStart = pos;
+    while (input[pos] != '\0' && input[pos] != ' ' && input[pos] != '\t') pos++;
+    token.push_back( string( input+wordStart, pos-wordStart ) );
+  }
+  return token;
+}
+
+// Fills this object from one line of each input file.
+//
+// englishString, foreignString  the two sentences, words separated by blanks
+// alignmentString               alignment points written as "f-e" pairs
+// sentenceID                    sentence number, used only in messages
+//
+// Returns 1 when the sentence pair can be used. Returns 0, after writing a
+// message to cerr, when a sentence is empty, an alignment point cannot be
+// parsed as two integers, or a point lies outside the sentences.
+int SentenceAlignment::create( char englishString[], char foreignString[], char alignmentString[], int sentenceID ) {
+  english = tokenize( englishString );
+  foreign = tokenize( foreignString );
+  
+  if (english.size() == 0 || foreign.size() == 0) {
+    cerr << "no english (" << english.size() << ") or foreign (" << foreign.size() << ") words << end insentence " << sentenceID << endl;
+    cerr << "E: " << englishString << endl << "F: " << foreignString << endl;
+    return 0;
+  }
+
+  // one counter per foreign word, one (initially empty) list per english word
+  for(int i=0; i<foreign.size(); i++) {
+    alignedCountF.push_back( 0 );
+  }
+  for(int i=0; i<english.size(); i++) {
+    vector< int > dummy;
+    alignedToE.push_back( dummy );
+  }
+
+  vector<string> alignmentSequence = tokenize( alignmentString );
+  for(int i=0; i<alignmentSequence.size(); i++) {
+    int e,f;
+    // sscanf returns the number of values it converted (or EOF); anything
+    // other than 2 leaves e and/or f unset, so they must not be used
+    if (sscanf(alignmentSequence[i].c_str(), "%d-%d", &f, &e) != 2) {
+      cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentnce " << sentenceID << endl; 
+      cerr << "E: " << englishString << endl << "F: " << foreignString << endl;
+      return 0;
+    }
+    if (e < 0 || f < 0 || e >= english.size() || f >= foreign.size()) { 
+      cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << f << ", " << e << ") out of bounds (" << foreign.size() << ", " << english.size() << ")\n";
+      cerr << "E: " << englishString << endl << "F: " << foreignString << endl;
+      return 0;
+    }
+    alignedToE[e].push_back( f );
+    alignedCountF[f]++;
+  }
+  return 1;
+}
+
--- a/scripts/training/phrase-extract/score.cpp
+++ b/scripts/training/phrase-extract/score.cpp
@ -0,0 +1,323 @@
+using namespace std;
+
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "tables-core.h"
+
+#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) {_IS.getline(_LINE, _SIZE, _DELIM); if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear();}
+#define LINE_MAX_LENGTH 10000
+
+class PhraseAlignment { // one extracted phrase pair with its word alignment, parsed from a line of the extract file
+public:
+  int english, foreign; // phrase ids handed out by phraseTableE / phraseTableF
+  vector< vector<int> > alignedToE; // per English position: aligned foreign positions
+  vector< vector<int> > alignedToF; // per foreign position: aligned English positions
+  
+  void create( char*, int ); // parses one line; the int is a line number used in warnings
+  void clear(); // empties both alignment vectors
+  bool equals( PhraseAlignment ); // true when phrase ids and alignment are identical
+};
+
+class LexicalTable { // word translation probabilities read from a file
+public:
+  map< WORD_ID, map< WORD_ID, double > > ltable; // ltable[ foreign word id ][ english word id ] = probability
+  void load( char[] ); // fills ltable from the named file
+};
+
+vector<string> tokenize( char [] );
+
+void processPhrasePairs( vector< PhraseAlignment > & );
+
+ofstream phraseTableFile; // output phrase table, opened in main from argv[3]
+
+Vocabulary vcbE; // English words seen in the lexical table and the extract file
+Vocabulary vcbF; // foreign words seen in the lexical table and the extract file
+LexicalTable lexTable; // loaded in main from argv[2]
+PhraseTable phraseTableE; // English phrases of the group being collected; cleared in main after each group
+PhraseTable phraseTableF; // foreign phrases of the group being collected; cleared in main after each group
+bool inverseFlag; // true when a 4th command-line argument is given; changes the column order of the output
+
+int main(int argc, char* argv[]) // reads the sorted extract file and writes one phrase-table entry per distinct phrase pair
+{
+  cerr << "PhraseScore v1.2.1, written by Philipp Koehn\n"
+       << "phrase scoring methods for extracted phrases\n";
+  time_t starttime = time(NULL); // not used anywhere below
+
+  if (argc != 4 && argc != 5) {
+    cerr << "syntax: phrase-score extract lex phrase-table [inverse]\n";
+    exit(0);
+  }
+  char* &fileNameExtract = argv[1];
+  char* &fileNameLex = argv[2];
+  char* &fileNamePhraseTable = argv[3];
+  inverseFlag = false;
+  if (argc > 4) {
+    inverseFlag = true;
+    cerr << "using inverse mode\n";
+  }
+  //  char[] fileNameExtract& = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-extract.sorted";
+  //  string fileNameLex = "/data/nlp/koehn/europarl-v2/models/de-en/model/lex.f2n";
+  //  string fileNamePhraseTable = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-phrase-table-half.f2n";
+
+  // lexical translation table
+  lexTable.load( fileNameLex );
+  
+  // sorted phrase extraction file
+  ifstream extractFile;
+
+  extractFile.open(fileNameExtract);
+  if (extractFile.fail()) {
+    cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
+    exit(0);
+  }
+  istream *extractFileP = &extractFile;
+
+  // output file: phrase translation table
+  phraseTableFile.open(fileNamePhraseTable);
+  if (phraseTableFile.fail()) {
+    cerr << "ERROR: could not open file phrase table file " 
+	 << fileNamePhraseTable << endl;
+    exit(0);
+  }
+  
+  // loop through all extracted phrase translations
+  int lastForeign = -1; // foreign phrase id of the previous line; -1 before the first line
+  vector< PhraseAlignment > phrasePairsWithSameF; // lines collected so far for the current foreign phrase
+  int i=0;
+  int fileCount = 0; // only mentioned in the commented-out line below
+  while(true) {
+    if (extractFileP->eof()) break;
+    if (++i % 100000 == 0) cerr << "." << flush;
+    char line[LINE_MAX_LENGTH];    
+    SAFE_GETLINE((*extractFileP), line, LINE_MAX_LENGTH, '\n');
+    //    if (fileCount>0)
+    if (extractFileP->eof()) break;
+    PhraseAlignment phrasePair;
+    phrasePair.create( line, i );
+    if (lastForeign >= 0 && lastForeign != phrasePair.foreign) { // foreign phrase changed: the collected group is complete
+      processPhrasePairs( phrasePairsWithSameF );
+      for(int j=0;j<phrasePairsWithSameF.size();j++)
+	phrasePairsWithSameF[j].clear();
+      phrasePairsWithSameF.clear();
+      phraseTableE.clear(); // phrase ids start again from 0 for the next group
+      phraseTableF.clear();
+      phrasePair.clear(); // process line again, since phrase tables flushed
+      phrasePair.create( line, i ); 
+    }
+    lastForeign = phrasePair.foreign;
+    phrasePairsWithSameF.push_back( phrasePair );
+  }
+  processPhrasePairs( phrasePairsWithSameF ); // the last group has no following line to trigger it
+  phraseTableFile.close();
+}
+
+// Writes the phrase-table entries for one group of phrase pairs that all
+// share the foreign phrase phrasePair[0].foreign.
+//
+// For every distinct English phrase in the group one line goes to
+// phraseTableFile: the two phrases (their order depends on inverseFlag), the
+// relative frequency count(english) / group size, and a lexical score taken
+// from lexTable. The lexical score uses the alignment of the longest run of
+// consecutive, identically aligned entries for that English phrase.
+//
+// An empty group produces no output.
+void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) {
+  // Nothing collected (for instance an empty extract file): the code below
+  // reads phrasePair[0] and phrasePair[old], which would be out of range.
+  if (phrasePair.empty()) return;
+
+  map<int, int> countE;      // english phrase id -> number of entries in the group
+  map<int, int> alignmentE;  // english phrase id -> index of the entry whose alignment is used
+  int totalCount = 0;
+  int currentCount = 0;      // length of the current run of identical alignments
+  int maxSameCount = 0;      // longest run seen for the current english phrase
+  int maxSame = -1;          // index of the last entry of that longest run
+  int old = -1;              // index of the previous entry
+  for(int i=0;i<phrasePair.size();i++) {
+    if (i>0) {
+      if (phrasePair[old].english == phrasePair[i].english) {
+        if (! phrasePair[i].equals( phrasePair[old] )) {
+          // same english phrase, different alignment: the run ends here
+          if (currentCount > maxSameCount) {
+            maxSameCount = currentCount;
+            maxSame = i-1;
+          }
+          currentCount = 0;
+        }
+      }
+      else {
+        // wrap up old E
+        if (currentCount > maxSameCount) {
+          maxSameCount = currentCount;
+          maxSame = i-1;
+        }
+
+        alignmentE[ phrasePair[old].english ] = maxSame;
+
+        // get ready for new E
+        totalCount = 0;
+        currentCount = 0;
+        maxSameCount = 0;
+        maxSame = -1;
+      }
+    }
+    countE[ phrasePair[i].english ]++;
+    old = i;
+    currentCount++;
+    totalCount++;
+  }
+  
+  // wrap up old E
+  if (currentCount > maxSameCount) {
+    maxSameCount = currentCount;
+    maxSame = phrasePair.size()-1;
+  }
+  alignmentE[ phrasePair[old].english ] = maxSame;
+
+  // output table
+  typedef map< int, int >::iterator II;
+  PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign );
+  for(II i = countE.begin(); i != countE.end(); i++) {
+
+    // foreign phrase (unless inverse)
+    if (! inverseFlag) {
+      for(int j=0;j<phraseF.size();j++)
+        phraseTableFile << vcbF.getWord( phraseF[j] ) << " ";
+      phraseTableFile << "||| ";
+    }
+
+    // english phrase
+    PHRASE phraseE = phraseTableE.getPhrase( i->first );
+    for(int j=0;j<phraseE.size();j++)
+      phraseTableFile << vcbE.getWord( phraseE[j] ) << " ";
+    phraseTableFile << "||| ";
+
+    // foreign phrase (if inverse)
+    if (inverseFlag) {
+      for(int j=0;j<phraseF.size();j++)
+        phraseTableFile << vcbF.getWord( phraseF[j] ) << " ";
+      phraseTableFile << "||| ";
+    }
+ 
+    // phrase translation probability
+    phraseTableFile << ((double) i->second / (double) phrasePair.size());
+
+    // lexical translation probability
+    double lexScore = 1;
+    int null = vcbF.getWordID("NULL");
+    PhraseAlignment &current = phrasePair[ alignmentE[ i->first ] ];
+    for(int ei=0;ei<phraseE.size();ei++) { // all english words have to be explained
+      if (current.alignedToE[ ei ].size() == 0)
+        lexScore *= lexTable.ltable[ null ][ phraseE[ ei ] ]; // by NULL if neccessary
+      else {
+        // average over all foreign words this english word is aligned to
+        double thisWordScore = 0;
+        for(int j=0;j<current.alignedToE[ ei ].size();j++) {
+          thisWordScore += lexTable.ltable[ phraseF[current.alignedToE[ ei ][ j ] ] ][ phraseE[ ei ] ];
+        }
+        lexScore *= thisWordScore / (double)current.alignedToE[ ei ].size();
+      }
+    }
+    phraseTableFile << " " << lexScore;
+
+    // model 1 score
+
+    // zens&ney lexical score
+
+    phraseTableFile << endl;
+  }
+}
+
+// Parses one line of the extract file into this phrase pair.
+//
+// line    "f1 f2 ... ||| e1 e2 ... ||| f-e f-e ...", tokens separated by blanks
+// lineID  line number, used only in warnings
+//
+// Words are registered in vcbF / vcbE and the two phrases in phraseTableF /
+// phraseTableE; the resulting ids are stored in `foreign` and `english`.
+// Alignment points that cannot be parsed or that lie outside the phrases are
+// reported on cerr and skipped.
+void PhraseAlignment::create( char line[], int lineID ) {
+  vector< string > token = tokenize( line );
+  int item = 1;
+  PHRASE phraseF, phraseE;
+  vector< string > alignmentPoint;
+  for (int j=0; j<token.size(); j++) {
+    if (token[j] == "|||") item++;
+    else if (item == 1)
+      phraseF.push_back( vcbF.storeIfNew( token[j] ) );
+    else if (item == 2)
+      phraseE.push_back( vcbE.storeIfNew( token[j] ) );
+    else if (item == 3)
+      alignmentPoint.push_back( token[j] );
+  }
+
+  // Register the phrases and size the alignment vectors even when the line
+  // has no usable alignment point; otherwise `english`, `foreign` and the
+  // per-position lists would be left unset for such a line.
+  alignedToE.resize( phraseE.size() );
+  alignedToF.resize( phraseF.size() );
+  foreign = phraseTableF.storeIfNew( phraseF );
+  english = phraseTableE.storeIfNew( phraseE );
+
+  for (int j=0; j<alignmentPoint.size(); j++) {
+    int e,f;
+    // sscanf reports how many values it converted; with fewer than 2, e
+    // and/or f hold no meaningful value
+    if (sscanf(alignmentPoint[j].c_str(), "%d-%d", &f, &e) != 2) {
+      cerr << "WARNING: sentence " << lineID << " has unreadable alignment point " << alignmentPoint[j] << "\n";
+      continue;
+    }
+    if (e < 0 || f < 0 || e >= phraseE.size() || f >= phraseF.size()) { 
+      cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n";
+      continue;
+    }
+    alignedToE[e].push_back( f );
+    alignedToF[f].push_back( e );
+  }
+}
+
+// Drops all alignment information; `english` and `foreign` are left as they are.
+void PhraseAlignment::clear() {
+  // clearing an outer vector destroys the per-position lists it holds
+  alignedToF.clear();
+  alignedToE.clear();
+}
+
+// True when `other` has the same phrase ids and, position by position, the
+// same alignment lists in both directions.
+bool PhraseAlignment::equals( PhraseAlignment other ) {
+  if (english != other.english || foreign != other.foreign) return false;
+
+  const int sizeE = phraseTableE.getPhrase( english ).size();
+  const int sizeF = phraseTableF.getPhrase( foreign ).size();
+
+  // vector comparison checks the length first, then every element in order
+  for(int e=0; e<sizeE; e++)
+    if (alignedToE[e] != other.alignedToE[e]) return false;
+  for(int f=0; f<sizeF; f++)
+    if (alignedToF[f] != other.alignedToF[f]) return false;
+  return true;
+}
+
+// Reads the lexical translation table from fileName into ltable.
+//
+// Every line is expected to hold three blank-separated tokens: an English
+// word, a foreign word and a probability. Both words are registered in vcbE /
+// vcbF, and the probability is stored as ltable[ foreign ][ english ]. Lines
+// with any other number of tokens are reported on cerr and skipped.
+// Exits the program if the file cannot be opened.
+void LexicalTable::load( char *fileName ) {
+  cerr << "Loading lexical translation table from " << fileName;
+  ifstream inFile;
+  inFile.open(fileName);
+  if (inFile.fail()) {
+    cerr << " - ERROR: could not open file\n";
+    exit(0);
+  }
+
+  char line[LINE_MAX_LENGTH];
+
+  int i=0;
+  while(true) {
+    i++;
+    if (i%100000 == 0) cerr << "." << flush;
+    SAFE_GETLINE(inFile, line, LINE_MAX_LENGTH, '\n');
+    if (inFile.eof()) break;
+
+    vector<string> token = tokenize( line );
+    if (token.size() != 3) {
+      // a blank line has no token at all, so token[0] may not exist
+      const string firstToken = token.empty() ? string() : token[0];
+      cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" <<
+        token.size() << " " << firstToken << " " << line << endl;
+      continue;
+    }
+    
+    double prob = atof( token[2].c_str() );
+    WORD_ID wordE = vcbE.storeIfNew( token[0] );
+    WORD_ID wordF = vcbF.storeIfNew( token[1] );
+    ltable[ wordF ][ wordE ] = prob;
+  }
+  cerr << endl;
+}
--- a/scripts/training/phrase-extract/tables-core.cpp
+++ b/scripts/training/phrase-extract/tables-core.cpp
@ -0,0 +1,101 @@
+//#include "beammain.h"
+#include "tables-core.h"
+
+#define TABLE_LINE_MAX_LENGTH 1000
+#define UNKNOWNSTR	"UNK"
+
+#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) {_IS.getline(_LINE, _SIZE, _DELIM); if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear();}
+
+// Splits a NUL-terminated line into the words separated by runs of spaces
+// and tabs. Leading and trailing separators produce no empty tokens.
+vector<string> tokenize( char input[] ) {
+  vector< string > token;
+  bool betweenWords = true;
+  int start = 0;
+  int i=0;
+  for(; input[i] != '\0'; i++) {
+    bool isSpace = (input[i] == ' ' || input[i] == '\t');
+
+    if (!isSpace && betweenWords) {
+      start = i;
+      betweenWords = false;
+    }
+    else if (isSpace && !betweenWords) {
+      token.push_back( string( input+start, i-start ) );
+      betweenWords = true;
+    }
+  }
+  // Here i is the index of the terminating NUL, so the last word is exactly
+  // i-start characters long; one more would copy the NUL into the token.
+  if (!betweenWords)
+    token.push_back( string( input+start, i-start ) );
+  return token;
+}
+
+// Returns the id of `word`, assigning the next free id (the current size of
+// vocab) when the word has not been stored before.
+WORD_ID Vocabulary::storeIfNew( WORD word ) {
+  map<WORD, WORD_ID>::iterator known = lookup.find( word );
+  if( known != lookup.end() )
+    return known->second;
+
+  const WORD_ID freshId = vocab.size();
+  lookup[ word ] = freshId;
+  vocab.push_back( word );
+  return freshId;
+}
+
+// Returns the id stored for `word`, or 0 when the word is unknown. Nothing is
+// added to the vocabulary.
+WORD_ID Vocabulary::getWordID( WORD word ) {
+  map<WORD, WORD_ID>::iterator entry = lookup.find( word );
+  return (entry == lookup.end()) ? 0 : entry->second;
+}
+
+// Returns the id of `phrase`, assigning the next free id (the current size of
+// phraseTable) when the phrase has not been stored before.
+PHRASE_ID PhraseTable::storeIfNew( PHRASE phrase ) {
+  map< PHRASE, PHRASE_ID >::iterator known = lookup.find( phrase );
+  if( known != lookup.end() )
+    return known->second;
+
+  const PHRASE_ID freshId = phraseTable.size();
+  lookup[ phrase ] = freshId;
+  phraseTable.push_back( phrase );
+  return freshId;
+}
+
+// Returns the id stored for `phrase`, or 0 when the phrase is unknown.
+// Nothing is added to the table.
+PHRASE_ID PhraseTable::getPhraseID( PHRASE phrase ) {
+  map< PHRASE, PHRASE_ID >::iterator entry = lookup.find( phrase );
+  return (entry == lookup.end()) ? 0 : entry->second;
+}
+
+// Forgets every stored phrase; ids handed out afterwards start again at 0.
+void PhraseTable::clear() {
+  phraseTable.clear();
+  lookup.clear();
+}
+
+// Sets dtable[d] = -|d| for the distortions -10 .. 9. The range is not
+// symmetric: -10 gets an entry, +10 does not.
+void DTable::init() {
+  for(int distance = 0; distance < 10; distance++) {
+    dtable[  distance ] = -distance;
+    dtable[ -distance ] = -distance;
+  }
+  dtable[ -10 ] = -10;
+}
+
+// Reads distortion probabilities from fileName into dtable.
+//
+// Every line is expected to start with a distortion (integer) followed by a
+// probability; the natural logarithm of the probability is stored. Lines with
+// fewer than two tokens are reported on cerr and skipped. If the file cannot
+// be opened, an error is written to cerr and dtable is left unchanged.
+void DTable::load( string fileName ) {
+  ifstream inFile;
+  inFile.open(fileName.c_str());
+  // A stream that failed to open never reaches eof(), and SAFE_GETLINE clears
+  // the fail bit again, so the loop below would never terminate.
+  if (inFile.fail()) {
+    cerr << "ERROR: could not open file " << fileName << endl;
+    return;
+  }
+
+  char line[TABLE_LINE_MAX_LENGTH];
+  int i=0;
+  while(true) {
+    i++;
+    SAFE_GETLINE(inFile, line, TABLE_LINE_MAX_LENGTH, '\n');
+    if (inFile.eof()) break;
+
+    vector<string> token = tokenize( line );
+    if (token.size() < 2) {
+      cerr << "line " << i << " in " << fileName << " too short, skipping\n";
+      continue;
+    }
+
+    int d = atoi( token[0].c_str() );
+    double prob = log( atof( token[1].c_str() ) );
+    dtable[ d ] = prob;
+  }  
+}
+
+// Returns the stored value for `distortion`, or log(0.00001) when the table
+// has no entry for it.
+double DTable::get( int distortion ) {
+  map< int, double >::iterator entry = dtable.find( distortion );
+  if (entry != dtable.end())
+    return entry->second;
+  return log( 0.00001 );
+}
--- a/scripts/training/phrase-extract/tables-core.h
+++ b/scripts/training/phrase-extract/tables-core.h
@ -0,0 +1,57 @@
+#ifndef _TABLES_H
+#define _TABLES_H
+
+using namespace std;
+
+#include <iostream>
+#include <fstream>
+#include <assert.h>
+#include <stdlib.h>
+#include <string>
+#include <queue>
+#include <map>
+
+vector<string> tokenize( char[] );
+
+typedef string WORD;
+typedef unsigned int WORD_ID;
+
+class Vocabulary { // two-way mapping between words and numeric ids
+ public:
+  map<WORD, WORD_ID>  lookup; // word -> id
+  vector< WORD > vocab; // id -> word
+  WORD_ID storeIfNew( WORD ); // id of the word; the word is added first if it is unknown
+  WORD_ID getWordID( WORD ); // id of the word, or 0 if it is unknown
+  inline WORD &getWord( WORD_ID id ) { return vocab[ id ]; } // no range check on id
+};
+
+typedef vector< WORD_ID > PHRASE;
+typedef unsigned int PHRASE_ID;
+
+class PhraseTable { // two-way mapping between phrases (sequences of word ids) and numeric ids
+ public:
+  map< PHRASE, PHRASE_ID > lookup; // phrase -> id
+  vector< PHRASE > phraseTable; // id -> phrase
+  PHRASE_ID storeIfNew( PHRASE ); // id of the phrase; the phrase is added first if it is unknown
+  PHRASE_ID getPhraseID( PHRASE ); // id of the phrase, or 0 if it is unknown
+  void clear(); // empties both containers
+  inline PHRASE &getPhrase( const PHRASE_ID id ) { return phraseTable[ id ]; } // no range check on id
+};
+
+typedef vector< pair< PHRASE_ID, double > > PHRASEPROBVEC;
+
+class TTable { // plain container; no member functions are declared here
+ public:
+  map< PHRASE_ID, vector< pair< PHRASE_ID, double > > > ttable; // phrase id -> list of (phrase id, value) pairs
+  map< PHRASE_ID, vector< pair< PHRASE_ID, vector< double > > > > ttableMulti; // phrase id -> list of (phrase id, several values) pairs
+};
+
+class DTable { // table of values indexed by distortion
+ public:
+  map< int, double > dtable; // distortion -> stored value
+  void init(); // fills the entries -10 .. 9 with -|distortion|
+  void load( string ); // reads "distortion probability" lines and stores log(probability)
+  double get( int ); // stored value, or log(0.00001) when there is no entry
+};
+
+#endif