Merge branch 'master' of git://github.com/moses-smt/mosesdecoder

phikoehn 2012-12-01 13:45:00 +00:00
commit 269883fedd
9 changed files with 168 additions and 183 deletions

View File

@ -5,7 +5,7 @@
#include <boost/thread/thread.hpp>
#endif
#include "moses/CompactPT/LexicalReorderingTableCreator.h"
#include "moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h"
using namespace Moses;

View File

@ -5,7 +5,7 @@
#endif
#include "moses/TypeDef.h"
#include "moses/CompactPT/PhraseTableCreator.h"
#include "moses/TranslationModel/CompactPT/PhraseTableCreator.h"
using namespace Moses;

View File

@ -6,7 +6,7 @@
#include <string>
#include <vector>
#include "moses/CompactPT/PhraseDictionaryCompact.h"
#include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
#include "moses/Util.h"
#include "moses/Phrase.h"

View File

@ -52,14 +52,10 @@ void auxAppend(IPhrase& head, const IPhrase& tail)
LexicalReorderingTable* LexicalReorderingTable::LoadAvailable(const std::string& filePath, const FactorList& f_factors, const FactorList& e_factors, const FactorList& c_factors)
{
// decide whether to use the Compact, Tree, or Memory table
#ifdef HAVE_CMPH
if(FileExists(filePath + ".minlexr")) {
LexicalReorderingTable *compactLexr =
LexicalReorderingTableCompact::CheckAndLoad(filePath + ".minlexr", f_factors, e_factors, c_factors);
if(compactLexr)
return compactLexr;
}
#endif
LexicalReorderingTable *compactLexr =
LexicalReorderingTableCompact::CheckAndLoad(filePath + ".minlexr", f_factors, e_factors, c_factors);
if(compactLexr)
return compactLexr;
if(FileExists(filePath+".binlexr.idx")) {
//there exists a binary version use that
return new LexicalReorderingTableTree(filePath, f_factors, e_factors, c_factors);
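
The change above drops the HAVE_CMPH guard and the explicit FileExists check from LoadAvailable; deciding whether a usable compact table exists is now left to LexicalReorderingTableCompact::CheckAndLoad (see its own hunk further down). The probe order itself stays the same: compact first, then the tree (binary) table, and, judging from the surrounding code not shown in this hunk, a plain in-memory table as the last resort. A small standalone sketch of that probe order, with the loaders stubbed out and FileExists reimplemented locally (only the file suffixes are taken from the diff):

#include <fstream>
#include <iostream>
#include <string>

// Sketch of the probe order in LoadAvailable: compact (".minlexr"), then
// tree (".binlexr.idx"), then a plain table. The real loaders are stubbed
// out; in Moses the compact probe is CheckAndLoad itself.
static bool FileExists(const std::string &path) {
  std::ifstream in(path.c_str());
  return in.good();
}

int main(int argc, char **argv) {
  std::string filePath = (argc > 1) ? argv[1] : "reordering-table";
  if (FileExists(filePath + ".minlexr"))
    std::cout << "compact table" << std::endl;     // LexicalReorderingTableCompact
  else if (FileExists(filePath + ".binlexr.idx"))
    std::cout << "tree table" << std::endl;        // LexicalReorderingTableTree
  else
    std::cout << "in-memory table" << std::endl;   // plain-text fallback (assumed)
  return 0;
}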

View File

@ -1663,7 +1663,7 @@ bool StaticData::LoadPhraseBoundaryFeature()
{
const vector<float> &weight = Scan<float>(m_parameter->GetParam("weight-pb"));
if (weight.size() > 1) {
std::cerr << "only one sparse producer weight allowed for the phrase boundary feature" << std::endl;
std::cerr << "Only one sparse producer weight allowed for the phrase boundary feature" << std::endl;
return false;
}

View File

@ -141,7 +141,7 @@ LexicalReorderingTable* LexicalReorderingTableCompact::CheckAndLoad(
}
// file name is specified with suffix
if(filePath.substr(filePath.length() - minlexr.length(), minlexr.length()) == minlexr
|| FileExists(filePath))
&& FileExists(filePath))
{
//there exists a compact binary version use that
VERBOSE(2,"Using compact lexical reordering table" << std::endl);
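
The one-character change above turns the guard from an OR into an AND: previously any existing file passed the check even if it did not carry the ".minlexr" suffix, so a non-compact table could be routed to the compact loader. A tiny standalone restatement of the corrected check (function and variable names here are mine; only the suffix handling mirrors the diff):

#include <fstream>
#include <iostream>
#include <string>

static bool FileExists(const std::string &path) {
  std::ifstream in(path.c_str());
  return in.good();
}

// The compact loader should be used only when the name ends in ".minlexr"
// AND that file is actually present (the old code used ||, which also
// matched any existing file without the suffix).
static bool UseCompact(const std::string &filePath) {
  const std::string minlexr = ".minlexr";
  bool hasSuffix = filePath.length() >= minlexr.length()
      && filePath.compare(filePath.length() - minlexr.length(), minlexr.length(), minlexr) == 0;
  return hasSuffix && FileExists(filePath);
}

int main() {
  std::cout << UseCompact("model/reordering-table.minlexr") << std::endl;
  return 0;
}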

View File

@ -49,35 +49,13 @@ namespace tmmt
cerr << "loading completed" << endl;
}
FuzzyMatchWrapper::WordIndex &FuzzyMatchWrapper::GetWordIndex(long translationId)
{
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
std::map<long, WordIndex>::iterator iter = m_wordIndex.find(translationId);
assert(iter != m_wordIndex.end());
return iter->second;
}
void FuzzyMatchWrapper::AddWordIndex(long translationId)
{
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
WordIndex &ret = m_wordIndex[translationId];
}
void FuzzyMatchWrapper::DeleteWordIndex(long translationId)
{
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
size_t ret = m_wordIndex.erase(translationId);
CHECK(ret == 1);
}
string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr)
{
const Moses::StaticData &staticData = Moses::StaticData::Instance();
AddWordIndex(translationId);
string fuzzyMatchFile = ExtractTM(translationId, dirNameStr);
WordIndex wordIndex;
string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr);
// create extract files
create_xml(fuzzyMatchFile);
@ -104,12 +82,11 @@ namespace tmmt
+ " -phrase-translation-table " + fuzzyMatchFile + ".pt";
system(cmd.c_str());
DeleteWordIndex(translationId);
return fuzzyMatchFile + ".pt.gz";
}
string FuzzyMatchWrapper::ExtractTM(long translationId, const string &dirNameStr)
string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr)
{
const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus();
@ -277,7 +254,7 @@ namespace tmmt
int pruned_match_count = 0;
if (short_match_max_length( input_length ))
{
init_short_matches(translationId, input[sentenceInd] );
init_short_matches(wordIndex, translationId, input[sentenceInd] );
}
vector< int > best_tm;
typedef map< int, vector< Match > >::iterator I;
@ -289,7 +266,7 @@ namespace tmmt
int tmID = tm->first;
int tm_length = suffixArray->GetSentenceLength(tmID);
vector< Match > &match = tm->second;
add_short_matches( translationId, match, source[tmID], input_length, best_cost );
add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost );
//cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl;
@ -573,16 +550,34 @@ namespace tmmt
}
}
bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
{
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
if (lookup != m_lsed.end()) {
value = lookup->second;
return true;
}
return false;
}
void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
{
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
m_lsed[ key ] = value;
}
/* Letter string edit distance, e.g. sub 'their' to 'there' costs 2 */
unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
{
// check if already computed -> lookup in cache
pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = lsed.find( pIdx );
if (lookup != lsed.end())
{
return (lookup->second);
unsigned int value;
bool ret = GetLSEDCache(pIdx, value);
if (ret) {
return value;
}
// get surface strings for word indices
@ -623,129 +618,129 @@ unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
free( cost );
// cache and return result
lsed[ pIdx ] = final;
SetLSEDCache(pIdx, final);
return final;
}
/* string edit distance implementation */
unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) {
// initialize cost and path matrices
unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
char **path = (char**) calloc( sizeof( char* ), a.size()+1 );
for( unsigned int i=0; i<=a.size(); i++ ) {
cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
path[i] = (char*) calloc( sizeof(char), b.size()+1 );
if (i>0)
{
cost[i][0] = cost[i-1][0];
if (use_letter_sed)
{
cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
}
else
{
cost[i][0]++;
}
}
else
{
cost[i][0] = 0;
}
path[i][0] = 'I';
}
for( unsigned int j=0; j<=b.size(); j++ ) {
if (j>0)
{
cost[0][j] = cost[0][j-1];
if (use_letter_sed)
{
cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size();
}
else
{
cost[0][j]++;
}
}
else
{
cost[0][j] = 0;
}
path[0][j] = 'D';
}
// core string edit distance algorithm
for( unsigned int i=1; i<=a.size(); i++ ) {
for( unsigned int j=1; j<=b.size(); j++ ) {
unsigned int ins = cost[i-1][j];
unsigned int del = cost[i][j-1];
unsigned int match;
if (use_letter_sed)
{
ins += GetVocabulary().GetWord( a[i-1] ).size();
del += GetVocabulary().GetWord( b[j-1] ).size();
match = letter_sed( a[i-1], b[j-1] );
}
else
{
ins++;
del++;
match = ( a[i-1] == b[j-1] ) ? 0 : 1;
}
unsigned int diag = cost[i-1][j-1] + match;
char action = (ins < del) ? 'I' : 'D';
unsigned int min = (ins < del) ? ins : del;
if (diag < min)
{
action = (match>0) ? 'S' : 'M';
min = diag;
}
cost[i][j] = min;
path[i][j] = action;
}
}
// construct string for best path
unsigned int i = a.size();
unsigned int j = b.size();
best_path = "";
while( i>0 || j>0 )
{
best_path = path[i][j] + best_path;
if (path[i][j] == 'I')
{
i--;
}
else if (path[i][j] == 'D')
{
j--;
}
else
{
i--;
j--;
}
}
// clear out memory
unsigned int final = cost[a.size()][b.size()];
for( unsigned int i=0; i<=a.size(); i++ ) {
free( cost[i] );
free( path[i] );
}
free( cost );
free( path );
// return result
return final;
}
/* utility function: compute length of sentence in characters
(spaces do not count) */
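
sed() above records its backtrace as a string over the per-cell actions 'I' and 'D' (consume a word from only one of the two sentences), 'S' (substitution) and 'M' (match). A short standalone word-level reimplementation of the same scheme (simplified: std::vector storage instead of calloc, no letter-level costs, and all names below are illustrative rather than taken from the repository) shows the encoding for a one-word substitution:

#include <iostream>
#include <string>
#include <vector>

// Standalone word-level edit distance with the same I/D/S/M path encoding.
unsigned int word_sed(const std::vector<std::string> &a, const std::vector<std::string> &b, std::string &best_path) {
  std::vector< std::vector<unsigned int> > cost(a.size()+1, std::vector<unsigned int>(b.size()+1, 0));
  std::vector< std::vector<char> > path(a.size()+1, std::vector<char>(b.size()+1, 'M'));
  for (unsigned int i = 0; i <= a.size(); i++) { cost[i][0] = i; path[i][0] = 'I'; }
  for (unsigned int j = 0; j <= b.size(); j++) { cost[0][j] = j; path[0][j] = 'D'; }
  for (unsigned int i = 1; i <= a.size(); i++) {
    for (unsigned int j = 1; j <= b.size(); j++) {
      unsigned int ins = cost[i-1][j] + 1;
      unsigned int del = cost[i][j-1] + 1;
      unsigned int match = (a[i-1] == b[j-1]) ? 0 : 1;
      unsigned int diag = cost[i-1][j-1] + match;
      char action = (ins < del) ? 'I' : 'D';
      unsigned int best = (ins < del) ? ins : del;
      if (diag < best) { action = (match > 0) ? 'S' : 'M'; best = diag; }
      cost[i][j] = best;
      path[i][j] = action;
    }
  }
  // walk back from the bottom-right corner to build the action string
  best_path = "";
  unsigned int i = a.size(), j = b.size();
  while (i > 0 || j > 0) {
    best_path = path[i][j] + best_path;
    if (path[i][j] == 'I') i--;
    else if (path[i][j] == 'D') j--;
    else { i--; j--; }
  }
  return cost[a.size()][b.size()];
}

int main() {
  std::vector<std::string> a, b;
  a.push_back("the"); a.push_back("cat"); a.push_back("sat");
  b.push_back("the"); b.push_back("dog"); b.push_back("sat");
  std::string best_path;
  unsigned int d = word_sed(a, b, best_path);
  std::cout << d << " " << best_path << std::endl;   // prints: 1 MSM
  return 0;
}
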
@ -838,13 +833,12 @@ int FuzzyMatchWrapper::short_match_max_length( int input_length )
(to be used by the next function)
(done here, because this has to be done only once for an input sentence) */
void FuzzyMatchWrapper::init_short_matches(long translationId, const vector< WORD_ID > &input )
void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translationId, const vector< WORD_ID > &input )
{
int max_length = short_match_max_length( input.size() );
if (max_length == 0)
return;
WordIndex &wordIndex = GetWordIndex(translationId);
wordIndex.clear();
// store input words and their positions in hash map
@ -861,14 +855,12 @@ void FuzzyMatchWrapper::init_short_matches(long translationId, const vector< WORD_ID > &input )
/* add all short matches to list of matches for a sentence */
void FuzzyMatchWrapper::add_short_matches(long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
{
int max_length = short_match_max_length( input_length );
if (max_length == 0)
return;
WordIndex &wordIndex = GetWordIndex(translationId);
int tm_length = tm.size();
map< WORD_ID,vector< int > >::iterator input_word_hit;
for(int t_pos=0; t_pos<tm.size(); t_pos++)
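
Taken together, the changes in this file remove the per-translation WordIndex bookkeeping (the m_wordIndex map plus Add/Get/DeleteWordIndex and their locking): each call to Extract now builds a local WordIndex and passes it by reference through ExtractTM, init_short_matches and add_short_matches. The only shared state left, the letter-edit-distance cache, is reached through GetLSEDCache/SetLSEDCache under the boost::shared_mutex m_accessLock, with a shared lock for readers and a unique lock for the single writer. A minimal self-contained sketch of that reader-writer cache pattern (the class and member names below are illustrative, not the Moses ones):

#include <map>
#include <utility>
#include <boost/thread/shared_mutex.hpp>
#include <boost/thread/locks.hpp>

// Minimal reader-writer cache in the style of GetLSEDCache/SetLSEDCache.
class PairCache {
  std::map< std::pair<unsigned int, unsigned int>, unsigned int > m_map;
  mutable boost::shared_mutex m_lock;
public:
  bool Get(const std::pair<unsigned int, unsigned int> &key, unsigned int &value) const {
    boost::shared_lock<boost::shared_mutex> read_lock(m_lock);   // many readers may hold this at once
    std::map< std::pair<unsigned int, unsigned int>, unsigned int >::const_iterator it = m_map.find(key);
    if (it == m_map.end()) return false;
    value = it->second;
    return true;
  }
  void Set(const std::pair<unsigned int, unsigned int> &key, unsigned int value) {
    boost::unique_lock<boost::shared_mutex> write_lock(m_lock);  // exclusive while inserting
    m_map[key] = value;
  }
};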

View File

@ -12,6 +12,7 @@
#ifdef WITH_THREADS
#include <boost/thread/shared_mutex.hpp>
#endif
#include <fstream>
#include <string>
#include "SuffixArray.h"
@ -46,16 +47,14 @@ protected:
int multiple_max;
typedef std::map< WORD_ID,std::vector< int > > WordIndex;
std::map<long, WordIndex> m_wordIndex;
//WordIndex m_wordIndex;
// global cache for word pairs
std::map< std::pair< WORD_ID, WORD_ID >, unsigned int > m_lsed;
#ifdef WITH_THREADS
//reader-writer lock
mutable boost::shared_mutex m_accessLock;
#endif
// global cache for word pairs
std::map< std::pair< WORD_ID, WORD_ID >, unsigned int > lsed;
void load_corpus( const std::string &fileName, std::vector< std::vector< tmmt::WORD_ID > > &corpus );
void load_target( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus);
void load_alignment( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus );
@ -69,21 +68,21 @@ protected:
unsigned int compute_length( const std::vector< tmmt::WORD_ID > &sentence );
unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx );
unsigned int sed( const std::vector< WORD_ID > &a, const std::vector< WORD_ID > &b, std::string &best_path, bool use_letter_sed );
void init_short_matches(long translationId, const std::vector< WORD_ID > &input );
void init_short_matches(WordIndex &wordIndex, long translationId, const std::vector< WORD_ID > &input );
int short_match_max_length( int input_length );
void add_short_matches(long translationId, std::vector< Match > &match, const std::vector< WORD_ID > &tm, int input_length, int best_cost );
void add_short_matches(WordIndex &wordIndex, long translationId, std::vector< Match > &match, const std::vector< WORD_ID > &tm, int input_length, int best_cost );
std::vector< Match > prune_matches( const std::vector< Match > &match, int best_cost );
int parse_matches( std::vector< Match > &match, int input_length, int tm_length, int &best_cost );
void create_extract(int sentenceInd, int cost, const std::vector< WORD_ID > &sourceSentence, const std::vector<SentenceAlignment> &targets, const std::string &inputStr, const std::string &path, std::ofstream &outputFile);
std::string ExtractTM(long translationId, const std::string &inputPath);
std::string ExtractTM(WordIndex &wordIndex, long translationId, const std::string &inputPath);
Vocabulary &GetVocabulary()
{ return suffixArray->GetVocabulary(); }
WordIndex &GetWordIndex(long translationId);
void AddWordIndex(long translationId);
void DeleteWordIndex(long translationId);
bool GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const;
void SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value);
};
}

View File

@ -11,8 +11,6 @@ import optparse
import random
import sys
import numpy
from defaultconfig import Config
logging.basicConfig(format = "%(asctime)-15s %(message)s")