diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
index 232c5e5ce..c680d7245 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
@@ -550,16 +550,40 @@ namespace tmmt
     }
   }
 
+  /* Thread-safe cache lookup: on a hit, stores the cached distance in 'value' and returns true */
+  bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
+  {
+#ifdef WITH_THREADS
+    boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
+#endif
+    map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
+    if (lookup != m_lsed.end()) {
+      value = lookup->second;
+      return true;
+    }
+
+    return false;
+  }
+
+  /* Thread-safe cache update: records 'value' as the letter edit distance for word pair 'key' */
+  void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
+  {
+#ifdef WITH_THREADS
+    boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
+#endif
+    m_lsed[ key ] = value;
+  }
+
 /* Letter string edit distance, e.g. sub 'their' to 'there' costs 2 */
 
 unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
 {
   // check if already computed -> lookup in cache
   pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
-  map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = lsed.find( pIdx );
-  if (lookup != lsed.end())
-  {
-    return (lookup->second);
+  unsigned int value;
+  bool ret = GetLSEDCache(pIdx, value);
+  if (ret) {
+    return value;
   }
 
   // get surface strings for word indices
@@ -600,129 +624,129 @@ unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
   free( cost );
 
   // cache and return result
-  lsed[ pIdx ] = final;
+  SetLSEDCache(pIdx, final);
   return final;
 }
+
+  /* string edit distance implementation */
+
+  unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) {
 
-/* string edit distance implementation */
+    // initialize cost and path matrices
+    unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
+    char **path = (char**) calloc( sizeof( char* ), a.size()+1 );
 
-unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) {
-
-  // initialize cost and path matrices
-  unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
-  char **path = (char**) calloc( sizeof( char* ), a.size()+1 );
-
-  for( unsigned int i=0; i<=a.size(); i++ ) {
-    cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
-    path[i] = (char*) calloc( sizeof(char), b.size()+1 );
-    if (i>0)
-    {
-      cost[i][0] = cost[i-1][0];
-      if (use_letter_sed)
-      {
-        cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
-      }
-      else
-      {
-        cost[i][0]++;
-      }
-    }
-    else
-    {
-      cost[i][0] = 0;
-    }
-    path[i][0] = 'I';
-  }
-
-  for( unsigned int j=0; j<=b.size(); j++ ) {
-    if (j>0)
-    {
-      cost[0][j] = cost[0][j-1];
-      if (use_letter_sed)
-      {
-        cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size();
-      }
-      else
-      {
-        cost[0][j]++;
-      }
-    }
-    else
-    {
-      cost[0][j] = 0;
-    }
-    path[0][j] = 'D';
-  }
-
-  // core string edit distance algorithm
-  for( unsigned int i=1; i<=a.size(); i++ ) {
-    for( unsigned int j=1; j<=b.size(); j++ ) {
-      unsigned int ins = cost[i-1][j];
-      unsigned int del = cost[i][j-1];
-      unsigned int match;
-      if (use_letter_sed)
-      {
-        ins += GetVocabulary().GetWord( a[i-1] ).size();
-        del += GetVocabulary().GetWord( b[j-1] ).size();
-        match = letter_sed( a[i-1], b[j-1] );
-      }
-      else
-      {
-        ins++;
-        del++;
-        match = ( a[i-1] == b[j-1] ) ? 0 : 1;
-      }
-      unsigned int diag = cost[i-1][j-1] + match;
-
-      char action = (ins < del) ? 'I' : 'D';
-      unsigned int min = (ins < del) ? ins : del;
-      if (diag < min)
-      {
-        action = (match>0) ? 'S' : 'M';
-        min = diag;
-      }
-
-      cost[i][j] = min;
-      path[i][j] = action;
-    }
-  }
-
-  // construct string for best path
-  unsigned int i = a.size();
-  unsigned int j = b.size();
-  best_path = "";
-  while( i>0 || j>0 )
-  {
-    best_path = path[i][j] + best_path;
-    if (path[i][j] == 'I')
-    {
-      i--;
-    }
-    else if (path[i][j] == 'D')
-    {
-      j--;
-    }
-    else
-    {
-      i--;
-      j--;
-    }
-  }
-
-
-  // clear out memory
-  unsigned int final = cost[a.size()][b.size()];
-
-  for( unsigned int i=0; i<=a.size(); i++ ) {
-    free( cost[i] );
-    free( path[i] );
-  }
-  free( cost );
-  free( path );
-
-  // return result
-  return final;
-}
+    for( unsigned int i=0; i<=a.size(); i++ ) {
+      cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
+      path[i] = (char*) calloc( sizeof(char), b.size()+1 );
+      if (i>0)
+      {
+        cost[i][0] = cost[i-1][0];
+        if (use_letter_sed)
+        {
+          cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
+        }
+        else
+        {
+          cost[i][0]++;
+        }
+      }
+      else
+      {
+        cost[i][0] = 0;
+      }
+      path[i][0] = 'I';
+    }
+
+    for( unsigned int j=0; j<=b.size(); j++ ) {
+      if (j>0)
+      {
+        cost[0][j] = cost[0][j-1];
+        if (use_letter_sed)
+        {
+          cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size();
+        }
+        else
+        {
+          cost[0][j]++;
+        }
+      }
+      else
+      {
+        cost[0][j] = 0;
+      }
+      path[0][j] = 'D';
+    }
+
+    // core string edit distance algorithm
+    for( unsigned int i=1; i<=a.size(); i++ ) {
+      for( unsigned int j=1; j<=b.size(); j++ ) {
+        unsigned int ins = cost[i-1][j];
+        unsigned int del = cost[i][j-1];
+        unsigned int match;
+        if (use_letter_sed)
+        {
+          ins += GetVocabulary().GetWord( a[i-1] ).size();
+          del += GetVocabulary().GetWord( b[j-1] ).size();
+          match = letter_sed( a[i-1], b[j-1] );
+        }
+        else
+        {
+          ins++;
+          del++;
+          match = ( a[i-1] == b[j-1] ) ? 0 : 1;
+        }
+        unsigned int diag = cost[i-1][j-1] + match;
+
+        char action = (ins < del) ? 'I' : 'D';
+        unsigned int min = (ins < del) ? ins : del;
+        if (diag < min)
+        {
+          action = (match>0) ? 'S' : 'M';
+          min = diag;
+        }
+
+        cost[i][j] = min;
+        path[i][j] = action;
+      }
+    }
+
+    // construct string for best path
+    unsigned int i = a.size();
+    unsigned int j = b.size();
+    best_path = "";
+    while( i>0 || j>0 )
+    {
+      best_path = path[i][j] + best_path;
+      if (path[i][j] == 'I')
+      {
+        i--;
+      }
+      else if (path[i][j] == 'D')
+      {
+        j--;
+      }
+      else
+      {
+        i--;
+        j--;
+      }
+    }
+
+
+    // clear out memory
+    unsigned int final = cost[a.size()][b.size()];
+
+    for( unsigned int i=0; i<=a.size(); i++ ) {
+      free( cost[i] );
+      free( path[i] );
+    }
+    free( cost );
+    free( path );
+
+    // return result
+    return final;
+  }
 
 /* utlility function: compute length of sentence in characters (spaces do not count) */
 
diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
index 923723bbf..a6f772fb9 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
@@ -9,6 +9,10 @@
 #ifndef moses_FuzzyMatchWrapper_h
 #define moses_FuzzyMatchWrapper_h
 
+#ifdef WITH_THREADS
+#include <boost/thread/shared_mutex.hpp>
+#endif
+
 #include
 #include
 #include "SuffixArray.h"
@@ -45,7 +49,11 @@ protected:
   typedef std::map< WORD_ID,std::vector< int > > WordIndex;
 
   // global cache for word pairs
-  std::map< std::pair< WORD_ID, WORD_ID >, unsigned int > lsed;
+  std::map< std::pair< WORD_ID, WORD_ID >, unsigned int > m_lsed;
+#ifdef WITH_THREADS
+  // reader-writer lock guarding m_lsed (only present in threaded builds)
+  mutable boost::shared_mutex m_accessLock;
+#endif
 
   void load_corpus( const std::string &fileName, std::vector< std::vector< tmmt::WORD_ID > > &corpus );
   void load_target( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus);
@@ -72,6 +80,10 @@ protected:
   Vocabulary &GetVocabulary()
   { return suffixArray->GetVocabulary(); }
 
+  // accessors for the m_lsed cache; they take m_accessLock when WITH_THREADS is defined
+  bool GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const;
+  void SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value);
+
 };
 
 }