program to compute counts for phrase pairs

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2647 1f5c12ca-751b-0410-a591-d2e778427230
2024-12-26 21:42:19 +03:00 · 2010-01-08 17:16:37 +00:00 · 2010-01-08 17:16:37 +00:00 · 3ad833d136
commit 3ad833d136
parent 34d9feccc8
1 changed files with 347 additions and 0 deletions
--- a/scripts/training/phrase-extract/statistics.cpp
+++ b/scripts/training/phrase-extract/statistics.cpp
@ -0,0 +1,347 @@
+// $Id$
+// vim:tabstop=2
+
+#include <sstream>
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <stdlib.h>
+#include <assert.h>
+#include <time.h>
+#include "AlignmentPhrase.h"
+#include "tables-core.h"
+
+using namespace std;
+
+// SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM)
+// Reads one _DELIM-terminated record from stream _IS into the char buffer
+// _LINE (capacity _SIZE) using istream::getline.
+//  - If the read set failbit without badbit or eofbit, the stream state is
+//    cleared so that the caller can keep reading.
+//  - If gcount() equals _SIZE-1 the record is treated as too long: an error
+//    is written to cerr and the program terminates with exit(1).
+// The macro expands to a braced statement block and evaluates _IS several
+// times, so _IS should be a plain stream expression without side effects.
+#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
+                _IS.getline(_LINE, _SIZE, _DELIM); \
+                if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
+                if (_IS.gcount() == _SIZE-1) { \
+                  cerr << "Line too long! Buffer overflow. Delete lines >=" \
+                    << _SIZE << " chars or raise LINE_MAX_LENGTH in phrase-extract/statistics.cpp" \
+                    << endl; \
+                    exit(1); \
+                } \
+              }
+// Capacity (in chars) of the line buffers that are filled via SAFE_GETLINE.
+#define LINE_MAX_LENGTH 10000
+
+// One line of the extract file: a foreign/english phrase pair together with
+// its word alignment.
+class PhraseAlignment {
+public:
+  // Phrase id returned by phraseTableE.storeIfNew(); assigned in create().
+  int english;
+  // Phrase id returned by phraseTableF.storeIfNew(); assigned in create().
+  int foreign;
+  // alignedToE[e] lists the foreign positions aligned to english position e.
+  vector< vector<size_t> > alignedToE;
+  // alignedToF[f] lists the english positions aligned to foreign position f.
+  vector< vector<size_t> > alignedToF;
+
+  // Fills this object from one input line; lineID is used in warnings only.
+  // Returns true when the line is a real phrase pair (see definition).
+  bool create( char line[], int lineID );
+  // Drops all stored alignment points.
+  void clear();
+  // True when other has the same phrase ids and the same alignment points.
+  bool equals( const PhraseAlignment& other );
+};
+
+// Lexical translation table read from a text file.
+class LexicalTable {
+public:
+  // ltable[ foreign word id ][ english word id ] = value read from the file.
+  map< WORD_ID, map< WORD_ID, double > > ltable;
+
+  // Reads the table from the file called fileName (see definition).
+  void load( char *fileName );
+};
+
+// Splits a NUL-terminated line into tokens; not defined in this file.
+vector<string> tokenize( char [] );
+
+// Writes one output row per distinct english phrase of the given group of
+// phrase pairs (defined below).
+void processPhrasePairs( vector< PhraseAlignment > & );
+
+// Output stream; opened and closed in main(), written by processPhrasePairs().
+ofstream phraseTableFile;
+
+// Word vocabularies, filled through storeIfNew() while reading the lexical
+// table and the extract file.
+Vocabulary vcbE;
+Vocabulary vcbF;
+// Lexical translation table, loaded once at the start of main().
+LexicalTable lexTable;
+// Phrase id tables; main() clears both whenever the foreign phrase changes.
+PhraseTable phraseTableE;
+PhraseTable phraseTableF;
+// Set in main() when a fourth command line argument is given; switches the
+// column order of the output rows.
+bool inverseFlag;
+// Counts input lines that create() did not classify as phrase pairs since the
+// last change of foreign phrase; it is maintained in main() but not read
+// anywhere in this file.
+int phrasePairBase = 0; // only used for "proper" conditioning
+
+// Entry point.
+//
+// usage: statistics extract lex phrase-table [inverse]
+//   extract      - phrase extraction file; consecutive lines with the same
+//                  foreign phrase are processed as one group
+//   lex          - lexical translation table, read by LexicalTable::load()
+//   phrase-table - output file
+//   inverse      - any fourth argument switches on inverse mode
+//
+// Returns 0 on success; exits with status 1 on usage or file errors.
+int main(int argc, char* argv[])
+{
+  cerr << "PhraseStatistics v1.1 written by Nicola Bertoldi\n"
+       << "modifying PhraseScore v1.4 written by Philipp Koehn\n"
+       << "It computes statistics for extracted phrase pairs\n"
+       << "if (direct):\n"
+       << "src_phrase ||| trg_phrase ||| freq(src_phrase, trg_phrase) freq(src_phrase) length(src_phrase) length(trg_phrase)\n"
+       << "if (inverse)\n"
+       << "src_phrase ||| trg_phrase ||| freq(src_phrase, trg_phrase) freq(trg_phrase) length(src_phrase) length(trg_phrase)\n";
+
+  if (argc != 4 && argc != 5) {
+    cerr << "syntax: statistics extract lex phrase-table [inverse]\n";
+    exit(1);
+  }
+  char* fileNameExtract = argv[1];
+  char* fileNameLex = argv[2];
+  char* fileNamePhraseTable = argv[3];
+  inverseFlag = false;
+  if (argc > 4) {
+    inverseFlag = true;
+    cerr << "using inverse mode\n";
+  }
+
+  // lexical translation table
+  lexTable.load( fileNameLex );
+
+  // sorted phrase extraction file
+  ifstream extractFile;
+  extractFile.open(fileNameExtract);
+  if (extractFile.fail()) {
+    cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
+    exit(1);
+  }
+
+  // output file: phrase translation table
+  phraseTableFile.open(fileNamePhraseTable);
+  if (phraseTableFile.fail()) {
+    cerr << "ERROR: could not open file phrase table file "
+         << fileNamePhraseTable << endl;
+    exit(1);
+  }
+
+  // loop through all extracted phrase translations
+  int lastForeign = -1;
+  vector< PhraseAlignment > phrasePairsWithSameF;
+  int i=0;
+  while(true) {
+    if (extractFile.eof()) break;
+    if (++i % 100000 == 0) cerr << "." << flush;
+    char line[LINE_MAX_LENGTH];
+    SAFE_GETLINE((extractFile), line, LINE_MAX_LENGTH, '\n');
+    // Stop only when nothing could be read. Testing eof() here would drop a
+    // final line that lacks a trailing newline (getline sets eofbit but not
+    // failbit in that case) and would never terminate on a broken stream.
+    if (extractFile.fail()) break;
+
+    PhraseAlignment phrasePair;
+    bool isPhrasePair = phrasePair.create( line, i );
+    if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
+      // the foreign phrase changed: emit the finished group and start over
+      processPhrasePairs( phrasePairsWithSameF );
+      phrasePairsWithSameF.clear();
+      phraseTableE.clear();
+      phraseTableF.clear();
+      phrasePair.clear(); // process line again, since phrase tables flushed
+      phrasePair.create( line, i );
+      phrasePairBase = 0;
+    }
+    lastForeign = phrasePair.foreign;
+    if (isPhrasePair)
+      phrasePairsWithSameF.push_back( phrasePair );
+    else
+      phrasePairBase++;
+  }
+  processPhrasePairs( phrasePairsWithSameF );
+
+  phraseTableFile.close();
+  if (phraseTableFile.fail()) {
+    cerr << "ERROR: could not write phrase table file "
+         << fileNamePhraseTable << endl;
+    return 1;
+  }
+  return 0;
+}
+
+// Writes the words of phrase to phraseTableFile, each followed by a blank,
+// and terminates the field with "||| ".
+static void writePhrase( Vocabulary &vcb, const PHRASE &phrase ) {
+  for(size_t j=0;j<phrase.size();j++)
+    phraseTableFile << vcb.getWord( phrase[j] ) << " ";
+  phraseTableFile << "||| ";
+}
+
+// Writes the statistics rows for one group of phrase pairs.
+//
+// phrasePair - the group to report; the foreign phrase of the whole group is
+//              taken from its first element. An empty group writes nothing.
+//
+// One row is written to phraseTableFile per distinct english phrase id, in
+// ascending id order:
+//   direct mode:  foreign ||| english ||| count total lenF lenE
+//   inverse mode: english ||| foreign ||| count total lenF lenE
+// where count is the number of group members with that english phrase,
+// total is the size of the group, and lenF / lenE are the phrase lengths in
+// words.
+void processPhrasePairs( vector< PhraseAlignment > &phrasePair ) {
+  if (phrasePair.empty()) return;
+
+  // number of occurrences of each english phrase within the group
+  map<int, int> countE;
+  for(size_t i=0;i<phrasePair.size();i++)
+    countE[ phrasePair[i].english ]++;
+
+  // output table
+  typedef map< int, int >::const_iterator II;
+  const PHRASE &phraseF = phraseTableF.getPhrase( phrasePair[0].foreign );
+  for(II i = countE.begin(); i != countE.end(); ++i) {
+    const PHRASE &phraseE = phraseTableE.getPhrase( i->first );
+
+    // foreign phrase comes first unless inverse, otherwise second
+    if (! inverseFlag)
+      writePhrase( vcbF, phraseF );
+    writePhrase( vcbE, phraseE );
+    if (inverseFlag)
+      writePhrase( vcbF, phraseF );
+
+    // phrase pair frequency, group size, source length, target length;
+    // "\n" instead of endl so the file is not flushed after every row
+    phraseTableFile << i->second
+                    << " " << phrasePair.size()
+                    << " " << phraseF.size()
+                    << " " << phraseE.size()
+                    << "\n";
+  }
+}
+
+// Fills this object from one line of the extract file.
+//
+// The tokens of the line are split into fields by "|||":
+//   field 1 - foreign words, added to vcbF
+//   field 2 - english words, added to vcbE
+//   field 3 - alignment points written as "f-e" (foreign position, english
+//             position, both counted from 0)
+// Tokens of any further field are ignored.
+//
+// The phrase ids and the alignment vectors are always (re)initialised, also
+// for lines without a usable alignment point, so that foreign / english never
+// hold indeterminate values and alignedToE / alignedToF always have one entry
+// per word. Malformed or out-of-range alignment points are reported on cerr
+// and skipped.
+//
+// line   - NUL-terminated input line, handed to tokenize()
+// lineID - line number, used in warnings only
+// returns true if the line contains at least two "|||" separators, i.e. it is
+//         a real phrase pair and not just a foreign phrase
+bool PhraseAlignment::create( char line[], int lineID ) {
+  vector< string > token = tokenize( line );
+  int item = 1;
+  PHRASE phraseF, phraseE;
+  // validated alignment points in input order (parallel vectors)
+  vector< size_t > pointF, pointE;
+
+  for (size_t j=0; j<token.size(); j++) {
+    if (token[j] == "|||") {
+      item++;
+      continue;
+    }
+    if (item == 1)
+      phraseF.push_back( vcbF.storeIfNew( token[j] ) );
+    else if (item == 2)
+      phraseE.push_back( vcbE.storeIfNew( token[j] ) );
+    else if (item == 3) {
+      int e = -1, f = -1;
+      if (sscanf(token[j].c_str(), "%d-%d", &f, &e) != 2) {
+        cerr << "WARNING: sentence " << lineID << " has malformed alignment point \"" << token[j] << "\"\n";
+      }
+      else if (e < 0 || f < 0 ||
+               static_cast<size_t>(e) >= phraseE.size() ||
+               static_cast<size_t>(f) >= phraseF.size()) {
+        cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n";
+      }
+      else {
+        pointF.push_back( static_cast<size_t>(f) );
+        pointE.push_back( static_cast<size_t>(e) );
+      }
+    }
+  }
+
+  // register the phrases and size the alignment vectors unconditionally
+  alignedToE.assign( phraseE.size(), vector< size_t >() );
+  alignedToF.assign( phraseF.size(), vector< size_t >() );
+  foreign = phraseTableF.storeIfNew( phraseF );
+  english = phraseTableE.storeIfNew( phraseE );
+
+  for (size_t k=0; k<pointF.size(); k++) {
+    alignedToE[ pointE[k] ].push_back( pointF[k] );
+    alignedToF[ pointF[k] ].push_back( pointE[k] );
+  }
+  return (item>2); // real phrase pair, not just foreign phrase
+}
+
+// Removes all alignment information; english and foreign are left untouched.
+void PhraseAlignment::clear() {
+  // clearing an outer vector destroys its inner vectors as well
+  alignedToF.clear();
+  alignedToE.clear();
+}
+
+// Compares two phrase pairs.
+//
+// other - the phrase pair to compare with
+// returns true if both objects carry the same english and foreign phrase ids
+//         and identical alignment points in both directions (same positions,
+//         same order)
+//
+// The alignment vectors are compared as whole containers, so a pair whose
+// vectors are shorter than its phrases is compared safely instead of being
+// indexed out of range.
+bool PhraseAlignment::equals( const PhraseAlignment& other ) {
+  if (this == &other) return true;
+  if (other.english != english) return false;
+  if (other.foreign != foreign) return false;
+  return alignedToE == other.alignedToE
+      && alignedToF == other.alignedToF;
+}
+
+// Reads the lexical translation table from a text file.
+//
+// Every line must consist of exactly three tokens: english word, foreign
+// word, value. The words are added to vcbE / vcbF and the value (converted
+// with atof) is stored as ltable[ foreign id ][ english id ]. Lines with a
+// different number of tokens are reported on cerr and skipped.
+//
+// fileName - path of the file; the program exits with status 1 if it cannot
+//            be opened
+void LexicalTable::load( char *fileName ) {
+  cerr << "Loading lexical translation table from " << fileName;
+  ifstream inFile;
+  inFile.open(fileName);
+  if (inFile.fail()) {
+    cerr << " - ERROR: could not open file\n";
+    exit(1);
+  }
+
+  char line[LINE_MAX_LENGTH];
+
+  int i=0;
+  while(true) {
+    i++;
+    if (i%100000 == 0) cerr << "." << flush;
+    SAFE_GETLINE((inFile), line, LINE_MAX_LENGTH, '\n');
+    // Stop only when nothing could be read: a last line without trailing
+    // newline sets eofbit but is still valid data, and a broken stream sets
+    // failbit without ever reaching eof.
+    if (inFile.fail()) break;
+
+    vector<string> token = tokenize( line );
+    if (token.size() != 3) {
+      // a blank line yields no tokens at all, so token[0] may not exist
+      cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" <<
+        token.size() << " " << (token.empty() ? string() : token[0]) << " " << line << endl;
+      continue;
+    }
+
+    double prob = atof( token[2].c_str() );
+    WORD_ID wordE = vcbE.storeIfNew( token[0] );
+    WORD_ID wordF = vcbF.storeIfNew( token[1] );
+    ltable[ wordF ][ wordE ] = prob;
+  }
+  cerr << endl;
+}