faster scorer

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4119 1f5c12ca-751b-0410-a591-d2e778427230
2024-09-11 11:25:40 +03:00 · 2011-08-05 10:27:15 +00:00 · 2011-08-05 10:27:15 +00:00 · 30ca534b86
commit 30ca534b86
parent b4c79f721e
9 changed files with 101 additions and 612 deletions
--- a/regression-testing/MosesRegressionTesting.pm
+++ b/regression-testing/MosesRegressionTesting.pm
@ -5,7 +5,7 @@ use strict;
 # if your tests need a new version of the test data, increment this
 # and make sure that a moses-regression-tests-vX.Y is available for
 # download from statmt.org (redpony AT umd dot edu for more info)
-use constant TESTING_DATA_VERSION => '6';
+use constant TESTING_DATA_VERSION => '7';

 # find the data directory in a few likely locations and make sure
 # that it is the correct version
--- a/regression-testing/compare-results.perl
+++ b/regression-testing/compare-results.perl
--- a/regression-testing/run-single-test.perl
+++ b/regression-testing/run-single-test.perl
@ -105,7 +105,7 @@ if($NBEST > 0){
  run_command("gzip $results/run.nbest");
 }

-($o, $ec, $sig) = run_command("$BIN_TEST/compare-results.pl $results $truth");
+($o, $ec, $sig) = run_command("$BIN_TEST/compare-results.perl $results $truth");
 print $o;
 if ($ec) {
  print STDERR "FAILURE, for debugging, local moses.ini=$local_moses_ini\n";
--- a/scripts/Makefile
+++ b/scripts/Makefile
@ -8,8 +8,8 @@ DS?=$(shell date '+%Y%m%d')
 # Set TARGETDIR to directory where you want the compiled scripts to be copied
 # to.
 # Set BINDIR to the directory where GIZA++ and other tools are installed.
-TARGETDIR=/mnt/odin3/bhaddow/moses
-BINDIR=/mnt/odin3/bhaddow/moses/bin
+TARGETDIR=/opt/AO/sw/edinburgh-code/
+BINDIR=/opt/AO/sw/edinburgh-code/

 MAIN_SCRIPTS_TARGET_DIR=$(TARGETDIR)
 # MAIN_SCRIPTS_TARGET_DIR=$(shell echo `pwd`/temp)
--- a/scripts/training/phrase-extract/Makefile
+++ b/scripts/training/phrase-extract/Makefile
@ -1,5 +1,5 @@
 all: consolidate consolidate-direct consolidate-reverse extract extract-rules relax-parse \
-     score score2 statistics extract-lex
+     score statistics extract-lex

 clean: 
 	rm -f *.o
@ -19,9 +19,6 @@ extract-lex: extract-lex.o
 score: tables-core.o AlignmentPhrase.o score.o PhraseAlignment.o InputFileStream.o
 	$(CXX) $^ -lz -o score

-score2: tables-core.o AlignmentPhrase.o score2.o PhraseAlignment.o InputFileStream.o
-	$(CXX) $^ -lz -o score2
-
 consolidate: consolidate.o tables-core.o InputFileStream.o
 	$(CXX) $^ -lz -o consolidate

--- a/scripts/training/phrase-extract/score.cpp
+++ b/scripts/training/phrase-extract/score.cpp
@ -58,8 +58,8 @@ vector<string> tokenize( const char [] );

 void computeCountOfCounts( char* fileNameExtract, int maxLines );
 void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile);
-PhraseAlignment* findBestAlignment( vector< PhraseAlignment* > & );
-void outputPhrasePair( vector< PhraseAlignment * > &, float, ostream &phraseTableFile );
+PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection & );
+void outputPhrasePair(const PhraseAlignmentCollection &, float, ostream &phraseTableFile );
 double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );

 LexicalTable lexTable;
@ -168,7 +168,11 @@ int main(int argc, char* argv[])
  PhraseAlignment *lastPhrasePair = NULL;
  while(true) {
    if (extractFileP.eof()) break;
-    if (++i % 100000 == 0) cerr << "." << flush;
+    if (++i % 100000 == 0)
+    {
+      cerr << i << " " << flush;
+    }
+
    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
    if (extractFileP.eof())	break;

@ -193,6 +197,7 @@ int main(int argc, char* argv[])
    // if new source phrase, process last batch
    if (lastPhrasePair != NULL &&
        lastPhrasePair->GetSource() != phrasePair.GetSource()) {
+      
      processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
      phrasePairsWithSameF.clear();
      lastPhrasePair = NULL;
@ -291,39 +296,44 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT

  // group phrase pairs based on alignments that matter
  // (i.e. that re-arrange non-terminals)
-  vector< vector< PhraseAlignment * > > phrasePairGroup;
+  PhrasePairGroup phrasePairGroup;
  float totalSource = 0;

+  //cerr << "phrasePair.size() = " << phrasePair.size() << endl;
+
  // loop through phrase pairs
  for(size_t i=0; i<phrasePair.size(); i++) {
    // add to total count
+    PhraseAlignment &currPhrasePair = phrasePair[i];
+    
    totalSource += phrasePair[i].count;

-    bool matched = false;
    // check for matches
-    for(size_t g=0; g<phrasePairGroup.size(); g++) {
-      vector< PhraseAlignment* > &group = phrasePairGroup[g];
-      // matched? place into same group
-      if ( group[0]->match( phrasePair[i] )) {
-        group.push_back( &phrasePair[i] );
-        matched = true;
-      }
-    }
-    // not matched? create new group
-    if (! matched) {
-      vector< PhraseAlignment* > newGroup;
-      newGroup.push_back( &phrasePair[i] );
-      phrasePairGroup.push_back( newGroup );
+    //cerr << "phrasePairGroup.size() = " << phrasePairGroup.size() << endl;
+
+    PhraseAlignmentCollection phraseAlignColl;
+    phraseAlignColl.push_back(&currPhrasePair);
+    pair<PhrasePairGroup::iterator, bool> retInsert;
+    retInsert = phrasePairGroup.insert(phraseAlignColl);
+    if (!retInsert.second)
+    { // already exist. Add to that collection instead
+      PhraseAlignmentCollection &existingColl = const_cast<PhraseAlignmentCollection&>(*retInsert.first);
+      existingColl.push_back(&currPhrasePair);
    }
+
  }

-  for(size_t g=0; g<phrasePairGroup.size(); g++) {
-    vector< PhraseAlignment* > &group = phrasePairGroup[g];
+  const PhrasePairGroup::SortedColl &sortedColl = phrasePairGroup.GetSortedColl();
+  PhrasePairGroup::SortedColl::const_iterator iter;
+  
+  for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter) 
+  {
+    const PhraseAlignmentCollection &group = **iter;
    outputPhrasePair( group, totalSource, phraseTableFile );
  }
 }

-PhraseAlignment* findBestAlignment( vector< PhraseAlignment* > &phrasePair )
+PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
 {
  float bestAlignmentCount = -1;
  PhraseAlignment* bestAlignment;
@ -338,7 +348,7 @@ PhraseAlignment* findBestAlignment( vector< PhraseAlignment* > &phrasePair )
  return bestAlignment;
 }

-void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount, ostream &phraseTableFile )
+void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, ostream &phraseTableFile )
 {
  if (phrasePair.size() == 0) return;

@ -488,3 +498,18 @@ void LexicalTable::load( char *fileName )
  cerr << endl;
 }

+
+// Inserts a copy of `obj` into the underlying set (m_coll).
+// Returns the pair produced by the set's insert(): an iterator to the stored
+// element, and a flag that is true only when `obj` was newly inserted (false
+// when an equivalent collection was already present; nothing is changed then).
+std::pair<PhrasePairGroup::Coll::iterator,bool> PhrasePairGroup::insert ( const PhraseAlignmentCollection& obj )
+{
+  std::pair<iterator,bool> ret = m_coll.insert(obj);
+
+  if (ret.second)
+  { // obj inserted. Also add to sorted vector
+    // Record the address of the copy held by the set (not of `obj` itself), so
+    // m_sortedColl lists the groups in the order they were first inserted.
+    const PhraseAlignmentCollection &insertedObj = *ret.first;
+    m_sortedColl.push_back(&insertedObj);
+  }
+
+  return ret;
+}
+
+
--- a/scripts/training/phrase-extract/score.h
+++ b/scripts/training/phrase-extract/score.h
@ -8,11 +8,60 @@
 *
 */
 #include <string>
+#include <vector>
+
+class PhraseAlignment;
+
+typedef std::vector<PhraseAlignment*>          PhraseAlignmentCollection;
+//typedef std::vector<PhraseAlignmentCollection> PhrasePairGroup;
+
+// Comparison functor used as the ordering predicate of the std::set inside
+// PhrasePairGroup. Two collections are compared solely through their first
+// elements, via PhraseAlignment's operator<; all other elements are ignored.
+// Both collections must be non-empty (enforced with assert).
+// NOTE(review): this header only forward-declares PhraseAlignment and includes
+// <string>/<vector>; operator() needs the full PhraseAlignment definition and
+// <cassert> -- confirm every including file provides them before this header.
+class PhraseAlignmentCollectionOrderer
+{
+public:
+	bool operator()(const PhraseAlignmentCollection &collA, const PhraseAlignmentCollection &collB) const
+	{
+    assert(collA.size() > 0);
+    assert(collB.size() > 0);
+
+    // only the first alignment of each collection takes part in the ordering
+    const PhraseAlignment &objA = *collA[0];
+    const PhraseAlignment &objB = *collB[0];
+    bool ret = objA < objB;
+
+    return ret;
+	}
+};


+//typedef std::set<PhraseAlignmentCollection, PhraseAlignmentCollectionOrderer> PhrasePairGroup;
+
+// Container of phrase-alignment groups.
+// The groups themselves live in a std::set ordered by
+// PhraseAlignmentCollectionOrderer (m_coll); in addition, a vector of pointers
+// to the stored groups (m_sortedColl) is filled by insert() each time a group
+// is newly added, so it preserves first-insertion order.
+// NOTE(review): std::set is used here but <set> is not among the includes
+// visible in this header -- confirm it is provided by the including file.
+class PhrasePairGroup
+{
+private:
+  typedef std::set<PhraseAlignmentCollection, PhraseAlignmentCollectionOrderer> Coll;
+  Coll m_coll; // holds the groups; at most one group per equivalence class of the orderer
+
+
+public:
+  typedef Coll::iterator iterator;
+  typedef Coll::const_iterator const_iterator;
+  typedef std::vector<const PhraseAlignmentCollection *> SortedColl;
+
+  // Adds `obj` to the set; see the definition for the meaning of the returned pair.
+  std::pair<Coll::iterator,bool> insert ( const PhraseAlignmentCollection& obj );
+
+  // Read-only access to the pointers recorded by insert().
+  const SortedColl &GetSortedColl() const
+  { return m_sortedColl; }
+
+private:
+  SortedColl m_sortedColl; // pointers into m_coll, appended on each successful insert
+
+};
+
+// other functions *********************************************
 // Returns true when `word` has the form of a non-terminal label: it is at
 // least three characters long, starts with '[' and ends with ']' (e.g. "[X]").
 // The word is taken by const reference because it is never modified; this also
 // lets callers pass const strings and temporaries. The first and last
 // characters are compared directly rather than through substr() temporaries.
 inline bool isNonTerminal( const std::string &word )
 {
   const std::string::size_type len = word.length();
   // the length check comes first so the index accesses below are always in range
   return len >= 3 && word[0] == '[' && word[len - 1] == ']';
 }
+
+
--- a/scripts/training/phrase-extract/score2.cpp
+++ b/scripts/training/phrase-extract/score2.cpp
@ -1,515 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2009 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#include <sstream>
-#include <cstdio>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <stdlib.h>
-#include <assert.h>
-#include <cstring>
-#include <set>
-
-#include "SafeGetline.h"
-#include "tables-core.h"
-#include "PhraseAlignment.h"
-#include "score2.h"
-#include "InputFileStream.h"
-
-using namespace std;
-
-#define LINE_MAX_LENGTH 100000
-
-Vocabulary vcbT;
-Vocabulary vcbS;
-
-class LexicalTable
-{
-public:
-  map< WORD_ID, map< WORD_ID, double > > ltable;
-  void load( char[] );
-  double permissiveLookup( WORD_ID wordS, WORD_ID wordT ) {
-    // cout << endl << vcbS.getWord( wordS ) << "-" << vcbT.getWord( wordT ) << ":";
-    if (ltable.find( wordS ) == ltable.end()) return 1.0;
-    if (ltable[ wordS ].find( wordT ) == ltable[ wordS ].end()) return 1.0;
-    // cout << ltable[ wordS ][ wordT ];
-    return ltable[ wordS ][ wordT ];
-  }
-};
-
-vector<string> tokenize( const char [] );
-
-void computeCountOfCounts( char* fileNameExtract, int maxLines );
-void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile);
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection & );
-void outputPhrasePair(const PhraseAlignmentCollection &, float, ostream &phraseTableFile );
-double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );
-
-LexicalTable lexTable;
-bool inverseFlag = false;
-bool hierarchicalFlag = false;
-bool wordAlignmentFlag = false;
-bool goodTuringFlag = false;
-#define GT_MAX 10
-bool logProbFlag = false;
-int negLogProb = 1;
-bool lexFlag = true;
-int countOfCounts[GT_MAX+1];
-float discountFactor[GT_MAX+1];
-int maxLinesGTDiscount = -1;
-bool phrasePairCountFlag = false;
-
-int main(int argc, char* argv[])
-{
-  cerr << "Score v2.0 written by Philipp Koehn\n"
-       << "scoring methods for extracted rules\n";
-
-  if (argc < 4) {
-    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment] [--MaxLinesGTDiscount num] [--PhrasePairCount]\n";
-    exit(1);
-  }
-  char* fileNameExtract = argv[1];
-  char* fileNameLex = argv[2];
-  char* fileNamePhraseTable = argv[3];
-
-  for(int i=4; i<argc; i++) {
-    if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
-      inverseFlag = true;
-      cerr << "using inverse mode\n";
-    } else if (strcmp(argv[i],"--Hierarchical") == 0) {
-      hierarchicalFlag = true;
-      cerr << "processing hierarchical rules\n";
-    } else if (strcmp(argv[i],"--WordAlignment") == 0) {
-      wordAlignmentFlag = true;
-      cerr << "outputing word alignment" << endl;
-    } else if (strcmp(argv[i],"--NoLex") == 0) {
-      lexFlag = false;
-      cerr << "not computing lexical translation score\n";
-    } else if (strcmp(argv[i],"--GoodTuring") == 0) {
-      goodTuringFlag = true;
-      cerr << "using Good Turing discounting\n";
-    } else if (strcmp(argv[i],"--LogProb") == 0) {
-      logProbFlag = true;
-      cerr << "using log-probabilities\n";
-    } else if (strcmp(argv[i],"--NegLogProb") == 0) {
-      logProbFlag = true;
-      negLogProb = -1;
-      cerr << "using negative log-probabilities\n";
-    } else if (strcmp(argv[i],"--MaxLinesGTDiscount") == 0) {
-      ++i;
-      maxLinesGTDiscount = atoi(argv[i]);
-      cerr << "maxLinesGTDiscount=" << maxLinesGTDiscount << endl;
-    } else if (strcmp(argv[i],"--PhrasePairCount") == 0) {
-      phrasePairCountFlag = true;
-      cerr << "outputting phrase pair counts" << endl;
-    } else {
-      cerr << "ERROR: unknown option " << argv[i] << endl;
-      exit(1);
-    }
-  }
-
-  // lexical translation table
-  if (lexFlag)
-    lexTable.load( fileNameLex );
-
-  // compute count of counts for Good Turing discounting
-  if (goodTuringFlag)
-    computeCountOfCounts( fileNameExtract, maxLinesGTDiscount );
-
-  // sorted phrase extraction file
-  Moses::InputFileStream extractFile(fileNameExtract);
-
-  if (extractFile.fail()) {
-    cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
-    exit(1);
-  }
-  istream &extractFileP = extractFile;
-
-  // output file: phrase translation table
-	ostream *phraseTableFile;
-
-	if (strcmp(fileNamePhraseTable, "-") == 0) {
-		phraseTableFile = &cout;
-	}
-	else {
-		ofstream *outputFile = new ofstream();
-		outputFile->open(fileNamePhraseTable);
-		if (outputFile->fail()) {
-			cerr << "ERROR: could not open file phrase table file "
-					 << fileNamePhraseTable << endl;
-			exit(1);
-		}
-		phraseTableFile = outputFile;
-	}
-	
-  // loop through all extracted phrase translations
-  float lastCount = 0.0f;
-  vector< PhraseAlignment > phrasePairsWithSameF;
-  int i=0;
-  char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
-  lastLine[0] = '\0';
-  PhraseAlignment *lastPhrasePair = NULL;
-  while(true) {
-    if (extractFileP.eof()) break;
-    if (++i % 100000 == 0)
-    {
-      cerr << i << " " << flush;
-    }
-
-    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (extractFileP.eof())	break;
-
-    // identical to last line? just add count
-    if (strcmp(line,lastLine) == 0) {
-      lastPhrasePair->count += lastCount;
-      continue;
-    }
-    strcpy( lastLine, line );
-
-    // create new phrase pair
-    PhraseAlignment phrasePair;
-    phrasePair.create( line, i );
-    lastCount = phrasePair.count;
-
-    // only differs in count? just add count
-    if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) {
-      lastPhrasePair->count += phrasePair.count;
-      continue;
-    }
-
-    // if new source phrase, process last batch
-    if (lastPhrasePair != NULL &&
-        lastPhrasePair->GetSource() != phrasePair.GetSource()) {
-      
-      processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
-      phrasePairsWithSameF.clear();
-      lastPhrasePair = NULL;
-    }
-
-    // add phrase pairs to list, it's now the last one
-    phrasePairsWithSameF.push_back( phrasePair );
-    lastPhrasePair = &phrasePairsWithSameF.back();
-  }
-  processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
-	
-	phraseTableFile->flush();
-	if (phraseTableFile != &cout) {
-		(dynamic_cast<ofstream*>(phraseTableFile))->close();
-		delete phraseTableFile;
-	}
-}
-
-void computeCountOfCounts( char* fileNameExtract, int maxLines )
-{
-  cerr << "computing counts of counts";
-  for(int i=1; i<=GT_MAX; i++) countOfCounts[i] = 0;
-
-  Moses::InputFileStream extractFile(fileNameExtract);
-  if (extractFile.fail()) {
-    cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
-    exit(1);
-  }
-  istream &extractFileP = extractFile;
-
-  // loop through all extracted phrase translations
-  int lineNum = 0;
-  char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
-  lastLine[0] = '\0';
-  float lastCount = 0.0f;
-  PhraseAlignment *lastPhrasePair = NULL;
-  while(true) {
-    if (extractFileP.eof()) break;
-    if (maxLines > 0 && lineNum >= maxLines) break;
-    if (++lineNum % 100000 == 0) cerr << "." << flush;
-    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (extractFileP.eof())	break;
-
-    // identical to last line? just add count
-    if (strcmp(line,lastLine) == 0) {
-      lastPhrasePair->count += lastCount;
-      continue;
-    }
-    strcpy( lastLine, line );
-
-    // create new phrase pair
-    PhraseAlignment *phrasePair = new PhraseAlignment();
-    phrasePair->create( line, lineNum );
-    lastCount = phrasePair->count;
-
-    if (lineNum == 1) {
-      lastPhrasePair = phrasePair;
-      continue;
-    }
-
-    // only differs in count? just add count
-    if (lastPhrasePair->match( *phrasePair )) {
-      lastPhrasePair->count += phrasePair->count;
-      phrasePair->clear();
-      delete(phrasePair);
-      continue;
-    }
-
-    int count = lastPhrasePair->count + 0.99999;
-    if(count <= GT_MAX)
-      countOfCounts[ count ]++;
-    lastPhrasePair->clear();
-    delete( lastPhrasePair );
-    lastPhrasePair = phrasePair;
-  }
-
-  delete lastPhrasePair;
-
-  discountFactor[0] = 0.01; // floor
-  cerr << "\n";
-  for(int i=1; i<GT_MAX; i++) {
-    discountFactor[i] = ((float)i+1)/(float)i*(((float)countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1));
-    cerr << "count " << i << ": " << countOfCounts[ i ] << ", discount factor: " << discountFactor[i];
-    // some smoothing...
-    if (discountFactor[i]>1)
-      discountFactor[i] = 1;
-    if (discountFactor[i]<discountFactor[i-1])
-      discountFactor[i] = discountFactor[i-1];
-    cerr << " -> " << discountFactor[i]*i << endl;
-  }
-}
-
-void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile )
-{
-  if (phrasePair.size() == 0) return;
-
-  // group phrase pairs based on alignments that matter
-  // (i.e. that re-arrange non-terminals)
-  PhrasePairGroup phrasePairGroup;
-  float totalSource = 0;
-
-  //cerr << "phrasePair.size() = " << phrasePair.size() << endl;
-
-  // loop through phrase pairs
-  for(size_t i=0; i<phrasePair.size(); i++) {
-    // add to total count
-    PhraseAlignment &currPhrasePair = phrasePair[i];
-    
-    totalSource += phrasePair[i].count;
-
-    // check for matches
-    //cerr << "phrasePairGroup.size() = " << phrasePairGroup.size() << endl;
-
-    PhraseAlignmentCollection phraseAlignColl;
-    phraseAlignColl.push_back(&currPhrasePair);
-    pair<PhrasePairGroup::iterator, bool> retInsert;
-    retInsert = phrasePairGroup.insert(phraseAlignColl);
-    if (!retInsert.second)
-    { // already exist. Add to that collection instead
-      PhraseAlignmentCollection &existingColl = const_cast<PhraseAlignmentCollection&>(*retInsert.first);
-      existingColl.push_back(&currPhrasePair);
-    }
-
-  }
-
-  const PhrasePairGroup::SortedColl &sortedColl = phrasePairGroup.GetSortedColl();
-  PhrasePairGroup::SortedColl::const_iterator iter;
-  
-  for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter) 
-  {
-    const PhraseAlignmentCollection &group = **iter;
-    outputPhrasePair( group, totalSource, phraseTableFile );
-  }
-}
-
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
-{
-  float bestAlignmentCount = -1;
-  PhraseAlignment* bestAlignment;
-
-  for(int i=0; i<phrasePair.size(); i++) {
-    if (phrasePair[i]->count > bestAlignmentCount) {
-      bestAlignmentCount = phrasePair[i]->count;
-      bestAlignment = phrasePair[i];
-    }
-  }
-
-  return bestAlignment;
-}
-
-void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, ostream &phraseTableFile )
-{
-  if (phrasePair.size() == 0) return;
-
-  PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );
-
-  // compute count
-  float count = 0;
-  for(size_t i=0; i<phrasePair.size(); i++) {
-    count += phrasePair[i]->count;
-  }
-  const float originalCount = count;
-
-  const PHRASE &phraseS = phrasePair[0]->GetSource();
-  const PHRASE &phraseT = phrasePair[0]->GetTarget();
-
-  // labels (if hierarchical)
-
-  // source phrase (unless inverse)
-  if (! inverseFlag) {
-    for(int j=0; j<phraseS.size(); j++) {
-      phraseTableFile << vcbS.getWord( phraseS[j] );
-      phraseTableFile << " ";
-    }
-    phraseTableFile << "||| ";
-  }
-
-  // target phrase
-  for(int j=0; j<phraseT.size(); j++) {
-    phraseTableFile << vcbT.getWord( phraseT[j] );
-    phraseTableFile << " ";
-  }
-  phraseTableFile << "||| ";
-
-  // source phrase (if inverse)
-  if (inverseFlag) {
-    for(int j=0; j<phraseS.size(); j++) {
-      phraseTableFile << vcbS.getWord( phraseS[j] );
-      phraseTableFile << " ";
-    }
-    phraseTableFile << "||| ";
-  }
-
-  // phrase translation probability
-  if (goodTuringFlag && count<GT_MAX)
-    count *= discountFactor[(int)(count+0.99999)];
-  double condScore = count / totalCount;
-  phraseTableFile << ( logProbFlag ? negLogProb*log(condScore) : condScore );
-
-  // lexical translation probability
-  if (lexFlag) {
-    double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
-    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(lexScore) : lexScore );
-  }
-
-  phraseTableFile << " ||| ";
-
-  // alignment info for non-terminals
-  if (! inverseFlag) {
-    if (hierarchicalFlag) {
-      // always output alignment if hiero style, but only for non-terms
-      assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
-      for(int j = 0; j < phraseT.size() - 1; j++) {
-        if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
-          if (bestAlignment->alignedToT[ j ].size() != 1) {
-            cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
-            phraseTableFile.flush();
-            assert(bestAlignment->alignedToT[ j ].size() == 1);
-          }
-          int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
-          phraseTableFile << sourcePos << "-" << j << " ";
-        }
-      }
-    } else if (wordAlignmentFlag) {
-      // alignment info in pb model
-      for(int j=0; j<bestAlignment->alignedToT.size(); j++) {
-        const set< size_t > &aligned = bestAlignment->alignedToT[j];
-        for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
-          phraseTableFile << *p << "-" << j << " ";
-        }
-      }
-    }
-  }
-
-  phraseTableFile << " ||| " << totalCount;
-  if (phrasePairCountFlag) {
-    phraseTableFile << " " << originalCount;
-  }
-  phraseTableFile << endl;
-}
-
-double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
-{
-  // lexical translation probability
-  double lexScore = 1.0;
-  int null = vcbS.getWordID("NULL");
-  // all target words have to be explained
-  for(int ti=0; ti<alignment->alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
-    if (srcIndices.empty()) {
-      // explain unaligned word by NULL
-      lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] );
-    } else {
-      // go through all the aligned words to compute average
-      double thisWordScore = 0;
-      for (set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) {
-        thisWordScore += lexTable.permissiveLookup( phraseS[ *p ], phraseT[ ti ] );
-      }
-      lexScore *= thisWordScore / (double)srcIndices.size();
-    }
-  }
-  return lexScore;
-}
-
-void LexicalTable::load( char *fileName )
-{
-  cerr << "Loading lexical translation table from " << fileName;
-  ifstream inFile;
-  inFile.open(fileName);
-  if (inFile.fail()) {
-    cerr << " - ERROR: could not open file\n";
-    exit(1);
-  }
-  istream *inFileP = &inFile;
-
-  char line[LINE_MAX_LENGTH];
-
-  int i=0;
-  while(true) {
-    i++;
-    if (i%100000 == 0) cerr << "." << flush;
-    SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
-    if (inFileP->eof()) break;
-
-    vector<string> token = tokenize( line );
-    if (token.size() != 3) {
-      cerr << "line " << i << " in " << fileName
-           << " has wrong number of tokens, skipping:\n"
-           << token.size() << " " << token[0] << " " << line << endl;
-      continue;
-    }
-
-    double prob = atof( token[2].c_str() );
-    WORD_ID wordT = vcbT.storeIfNew( token[0] );
-    WORD_ID wordS = vcbS.storeIfNew( token[1] );
-    ltable[ wordS ][ wordT ] = prob;
-  }
-  cerr << endl;
-}
-
-
-std::pair<PhrasePairGroup::Coll::iterator,bool> PhrasePairGroup::insert ( const PhraseAlignmentCollection& obj )
-{
-  std::pair<iterator,bool> ret = m_coll.insert(obj);
-
-  if (ret.second)
-  { // obj inserted. Also add to sorted vector
-    const PhraseAlignmentCollection &insertedObj = *ret.first;
-    m_sortedColl.push_back(&insertedObj);
-  }
-
-  return ret;
-}
-
-
--- a/scripts/training/phrase-extract/score2.h
+++ b/scripts/training/phrase-extract/score2.h
@ -1,67 +0,0 @@
-#pragma once
-/*
- *  score.h
- *  extract
- *
- *  Created by Hieu Hoang on 28/07/2010.
- *  Copyright 2010 __MyCompanyName__. All rights reserved.
- *
- */
-#include <string>
-#include <vector>
-
-class PhraseAlignment;
-
-typedef std::vector<PhraseAlignment*>          PhraseAlignmentCollection;
-//typedef std::vector<PhraseAlignmentCollection> PhrasePairGroup;
-
-class PhraseAlignmentCollectionOrderer
-{
-public:
-	bool operator()(const PhraseAlignmentCollection &collA, const PhraseAlignmentCollection &collB) const
-	{
-    assert(collA.size() > 0);
-    assert(collB.size() > 0);
-
-    const PhraseAlignment &objA = *collA[0];
-    const PhraseAlignment &objB = *collB[0];
-    bool ret = objA < objB;
-
-    return ret;
-	}
-};
-
-
-//typedef std::set<PhraseAlignmentCollection, PhraseAlignmentCollectionOrderer> PhrasePairGroup;
-
-class PhrasePairGroup
-{
-private:
-  typedef std::set<PhraseAlignmentCollection, PhraseAlignmentCollectionOrderer> Coll;
-  Coll m_coll;
-
-
-public:
-  typedef Coll::iterator iterator;
-  typedef Coll::const_iterator const_iterator;
-  typedef std::vector<const PhraseAlignmentCollection *> SortedColl;
-
-  std::pair<Coll::iterator,bool> insert ( const PhraseAlignmentCollection& obj );
-
-  const SortedColl &GetSortedColl() const
-  { return m_sortedColl; }
-
-private:
-  SortedColl m_sortedColl;
-
-};
-
-// other functions *********************************************
-inline bool isNonTerminal( std::string &word )
-{
-  return (word.length()>=3 &&
-          word.substr(0,1).compare("[") == 0 &&
-          word.substr(word.length()-1,1).compare("]") == 0);
-}
-
-