mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.cpp

#include "SuffixArray.h"
#include <stdlib.h>
#include <cstring>
#include <stdexcept>
#include <string>

using namespace std;

namespace tmmt
{

/**
 * Loads a tokenized corpus from `fileName` and builds the suffix array over it.
 *
 * The file is read twice: the first pass counts tokens and sentences so the
 * arrays can be sized exactly, the second pass fills them. Every sentence is
 * followed by the `<s>` boundary token, which is why each line contributes
 * "number of words + 1" entries.
 *
 * @param fileName path of the corpus file, one sentence per line.
 * @throws std::runtime_error if the file cannot be opened, if the arrays
 *         cannot be allocated, or if the second pass yields more data than
 *         the first pass counted.
 *
 * NOTE(review): a final line without a trailing newline appears to be dropped
 * by the eof() check that follows SAFE_GETLINE in both passes -- confirm
 * against the macro's definition.
 * NOTE(review): m_wordInSentence and m_sentenceLength are `char` arrays, so
 * positions/lengths that do not fit into a `char` are truncated on assignment.
 */
SuffixArray::SuffixArray( string fileName )
{
  // Start from a well-defined empty state so that the error path below can
  // release whatever has been allocated so far (the destructor does not run
  // when a constructor throws).
  m_array = NULL;
  m_index = NULL;
  m_buffer = NULL;
  m_wordInSentence = NULL;
  m_sentence = NULL;
  m_sentenceLength = NULL;
  m_size = 0;

  m_vcb.StoreIfNew( "<uNk>" );
  m_endOfSentence = m_vcb.StoreIfNew( "<s>" );

  try {
    ifstream extractFile;
    char line[LINE_MAX_LENGTH];

    // count the number of words first;
    extractFile.open(fileName.c_str());
    if (!extractFile.is_open()) {
      throw std::runtime_error("SuffixArray: cannot open corpus file '" + fileName + "'");
    }
    istream *fileP = &extractFile;
    size_t sentenceCount = 0;
    while(!fileP->eof()) {
      SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
      if (fileP->eof()) break;
      vector< WORD_ID > words = m_vcb.Tokenize( line );
      m_size += words.size() + 1; // +1 for the sentence boundary token
      sentenceCount++;
    }
    extractFile.close();
    cerr << m_size << " words (incl. sentence boundaries)" << endl;

    // allocate memory (calloc zero-fills, so the entries that the fill loop
    // does not write -- e.g. m_sentence at boundary positions -- stay 0)
    m_array = static_cast<WORD_ID*>( calloc( m_size, sizeof( WORD_ID ) ) );
    m_index = static_cast<INDEX*>( calloc( m_size, sizeof( INDEX ) ) );
    m_wordInSentence = static_cast<char*>( calloc( m_size, sizeof( char ) ) );
    m_sentence = static_cast<size_t*>( calloc( m_size, sizeof( size_t ) ) );
    m_sentenceLength = static_cast<char*>( calloc( sentenceCount, sizeof( char ) ) );
    // calloc may legitimately return NULL for a zero-sized request, so only
    // treat NULL as a failure when something was actually requested
    if (m_size > 0 &&
        (m_array == NULL || m_index == NULL || m_wordInSentence == NULL ||
         m_sentence == NULL || m_sentenceLength == NULL)) {
      throw std::runtime_error("SuffixArray: out of memory while loading '" + fileName + "'");
    }

    // fill the array
    INDEX wordIndex = 0;
    size_t sentenceId = 0;
    extractFile.clear(); // drop the eof state left over from the first pass
    extractFile.open(fileName.c_str());
    if (!extractFile.is_open()) {
      throw std::runtime_error("SuffixArray: cannot re-open corpus file '" + fileName + "'");
    }
    fileP = &extractFile;
    while(!fileP->eof()) {
      SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
      if (fileP->eof()) break;
      vector< WORD_ID > words = m_vcb.Tokenize( line );

      // the arrays were sized by the first pass; refuse to write past them
      // if the file grew in between
      if (sentenceId >= sentenceCount || wordIndex + words.size() + 1 > m_size) {
        throw std::runtime_error("SuffixArray: corpus file changed while loading '" + fileName + "'");
      }

      // add to corpus vector
      corpus.push_back(words);

      // create SA: initially row k simply points at corpus position k
      vector< WORD_ID >::const_iterator i;
      for( i=words.begin(); i!=words.end(); ++i) {
        m_index[ wordIndex ] = wordIndex;
        m_sentence[ wordIndex ] = sentenceId;
        m_wordInSentence[ wordIndex ] = i-words.begin();
        m_array[ wordIndex++ ] = *i;
      }
      m_index[ wordIndex ] = wordIndex;
      m_array[ wordIndex++ ] = m_endOfSentence;
      m_sentenceLength[ sentenceId++ ] = words.size();
    }
    extractFile.close();
    cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
    // List(0,9);

    // sort; with fewer than two entries there is nothing to order, and
    // m_size-1 must not be computed for an empty corpus
    if (m_size > 1) {
      m_buffer = static_cast<INDEX*>( calloc( m_size, sizeof( INDEX ) ) );
      if (m_buffer == NULL) {
        throw std::runtime_error("SuffixArray: out of memory while sorting '" + fileName + "'");
      }
      Sort( 0, m_size-1 );
      free( m_buffer );
      m_buffer = NULL; // the scratch buffer only lives during sorting
    }
    cerr << "done sorting" << endl;
  } catch (...) {
    free( m_buffer );
    free( m_sentenceLength );
    free( m_sentence );
    free( m_wordInSentence );
    free( m_index );
    free( m_array );
    throw;
  }
}

/**
 * Sorts rows [start, end] (both inclusive) of m_index so that the suffixes
 * they point to are in the order defined by CompareIndex().
 *
 * This is a top-down merge sort (the historical comment called it quick
 * sort): both halves are sorted recursively and then merged through
 * m_buffer, which must hold at least end-start+1 entries.
 *
 * @param start first row of the range to sort.
 * @param end   last row of the range to sort; a range with start >= end is
 *              already sorted and is left untouched.
 */
void SuffixArray::Sort(INDEX start, INDEX end)
{
  // zero or one element: nothing to do (also stops the recursion if the
  // bounds are ever passed in inverted)
  if (start >= end) return;

  // upper midpoint, written so that start+end cannot overflow INDEX
  const INDEX mid = start + (end - start + 1) / 2;
  Sort( start, mid-1 );
  Sort( mid, end );

  // merge the sorted halves [start, mid) and [mid, end] into m_buffer
  size_t left = start;
  size_t right = mid;
  size_t out = 0;
  const size_t length = static_cast<size_t>( end - start ) + 1;
  while( out<length ) {
    // take from the left half while it has entries and either the right
    // half is exhausted or the left suffix sorts strictly first
    const bool takeLeft =
      left < mid &&
      ( right > end || CompareIndex( m_index[left], m_index[right] ) < 0 );
    if (takeLeft) {
      m_buffer[ out++ ] = m_index[ left++ ];
    } else {
      m_buffer[ out++ ] = m_index[ right++ ];
    }
  }

  // copy the merged run back into place
  memcpy( m_index + start, m_buffer, sizeof( INDEX ) * length );
}

/**
 * Releases every array the constructor allocated with calloc().
 *
 * m_buffer is intentionally not freed here: the constructor already frees
 * that scratch buffer as soon as sorting has finished.
 */
SuffixArray::~SuffixArray()
{
  free(m_index);
  free(m_array);
  // these three were allocated alongside m_index/m_array but never released
  free(m_wordInSentence);
  free(m_sentence);
  free(m_sentenceLength);
}

/**
 * Orders the two corpus suffixes that begin at positions `a` and `b`.
 *
 * The suffixes are walked in lock-step until their word ids differ or one of
 * them reaches the end of the corpus.
 *
 * @return -1 if the suffix at `a` hits the corpus end first, 1 if the suffix
 *         at `b` does, otherwise the CompareWord() result for the first pair
 *         of differing words.
 */
int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
{
  INDEX posA = a;
  INDEX posB = b;

  // advance both cursors together over the shared prefix
  for( ; posA < m_size && posB < m_size; ++posA, ++posB ) {
    if( m_array[ posA ] != m_array[ posB ] ) break;
  }

  // a suffix that ran out of corpus sorts before the longer one
  if( posA == m_size ) {
    return -1;
  }
  if( posB == m_size ) {
    return 1;
  }
  return CompareWord( m_array[ posA ], m_array[ posB ] );
}

/**
 * Compares two vocabulary entries by their surface strings (not by id).
 *
 * @return the result of compare() on the words m_vcb stores for `a` and `b`:
 *         negative, zero or positive as `a`'s word sorts before, equal to or
 *         after `b`'s word.
 */
inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
{
  const int order = m_vcb.GetWord( a ).compare( m_vcb.GetWord( b ) );
  return order;
}

int SuffixArray::Count( const vector< WORD > &phrase )
{
  INDEX dummy;
  return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 );
}

bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min )
{
  INDEX dummy;
  return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min;
}

bool SuffixArray::Exists( const vector< WORD > &phrase )
{
  INDEX dummy;
  return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1;
}

/**
 * Locates the rows of the suffix array that match `phrase`, looking only at
 * rows between `search_start` and `search_end`.
 *
 * @param firstMatch   receives the first matching row (see LimitedCount()).
 * @param lastMatch    receives the last matching row (see LimitedCount()).
 * @param search_start first row to consider.
 * @param search_end   last row to consider.
 * @return the number of matches reported by LimitedCount().
 */
int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
  // m_size is passed as `min` so that the full range is always determined
  const int matches =
    LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end );
  return matches;
}

int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
  // cerr << "FindFirst\n";
  INDEX start = search_start;
  INDEX end = (search_end == -1) ? (m_size-1) : search_end;
  INDEX mid = FindFirst( phrase, start, end );
  // cerr << "done\n";
  if (mid == m_size) return 0; // no matches
  if (min == 1) return 1;      // only existance check

  int matchCount = 1;

  //cerr << "before...\n";
  firstMatch = FindLast( phrase, mid, start, -1 );
  matchCount += mid - firstMatch;

  //cerr << "after...\n";
  lastMatch = FindLast( phrase, mid, end, 1 );
  matchCount += lastMatch - mid;

  return matchCount;
}

/**
 * Finds the outermost row of the run of matches that contains `start`.
 *
 * Relies on the suffix array being sorted, so that all rows matching
 * `phrase` are adjacent.
 *
 * @param phrase    words to look up.
 * @param start     a row already known to match `phrase`.
 * @param end       the furthest row (inclusive) that may still be part of
 *                  the run, in the search direction.
 * @param direction > 0 searches towards higher rows and returns the last
 *                  matching row; any other value searches towards lower rows
 *                  and returns the first matching row.
 * @return the boundary row; `start` itself if the range is empty. Only rows
 *         between `start` and `end` are ever probed, so the search never
 *         looks at row -1 or row m_size.
 */
SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction )
{
  if (direction > 0) {
    // invariant: row `lo` matches, every row above `hi` does not
    INDEX lo = start;
    INDEX hi = end;
    if (m_size > 0 && hi >= m_size) hi = m_size-1; // stay inside the array
    while (lo < hi) {
      const INDEX mid = lo + (hi - lo + 1) / 2; // upper midpoint, so mid > lo
      if (Match( phrase, mid ) == 0)
        lo = mid;
      else
        hi = mid-1;
    }
    return lo;
  }

  // invariant: row `hi` matches, every row below `lo` does not
  INDEX lo = end;
  INDEX hi = start;
  while (lo < hi) {
    const INDEX mid = lo + (hi - lo) / 2; // lower midpoint, so mid < hi
    if (Match( phrase, mid ) == 0)
      hi = mid;
    else
      lo = mid+1;
  }
  return hi;
}

/**
 * Binary-searches rows [start, end] for any row that matches `phrase`.
 *
 * @param phrase words to look up.
 * @param start  in: first row to consider; out: narrowed lower bound (rows
 *               below it were ruled out by the search).
 * @param end    in: last row to consider; out: narrowed upper bound.
 * @return a matching row (not necessarily the first one), or m_size if no
 *         row in the range matches. Rows outside [start, end] are never
 *         probed.
 */
SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end )
{
  while (start <= end) {
    // upper midpoint, written so that start+end cannot overflow INDEX
    const INDEX mid = start + (end - start + 1) / 2;
    //cerr << "FindFirst(" << start << ";" << mid << ";" << end << ")\n";
    const int match = Match( phrase, mid );

    if (match == 0) return mid;
    if (start == end) return m_size; // single candidate left and it failed

    // start < end here, hence mid > start >= 0 and mid-1 cannot wrap
    if (match > 0)
      start = mid+1;
    else
      end = mid-1;
  }
  // the range became (or was passed in) empty
  return m_size;
}

int SuffixArray::Match( const vector< WORD > &phrase, INDEX index )
{
  INDEX pos = m_index[ index ];
  for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++) {
    int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] );
    // cerr << "{" << index << "+" << i << "," << pos+i << ":" << match << "}" << endl;
    if (match != 0)
      return match;
  }
  return 0;
}

/**
 * Debug helper that walks rows [start, end] of the suffix array and, for
 * each, up to five corpus positions of the suffix it points to.
 *
 * All output statements are currently commented out, so the function only
 * iterates and prints nothing.
 */
void SuffixArray::List(INDEX start, INDEX end)
{
  const int previewLength = 5; // words that would be shown per suffix
  for(INDEX row=start; row<=end; row++) {
    const INDEX suffixStart = m_index[ row ];
    // cerr << row << ":" << suffixStart << "\t";
    for(int shown=0; shown<previewLength && shown+suffixStart<m_size; shown++) {
      //cout << " " << m_vcb.GetWord( m_array[ suffixStart+shown ] );
    }
    // cerr << "\n";
  }
}

}
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`#include "SuffixArray.h"`
			`#include <string>`
			`#include <stdlib.h>`
			`#include <cstring>`

			`using namespace std;`

			`namespace tmmt`
			`{`

beautify 2013-05-29 21:16:15 +04:00			`SuffixArray::SuffixArray( string fileName )`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`{`
beautify 2013-05-29 21:16:15 +04:00			`m_vcb.StoreIfNew( "<uNk>" );`
			`m_endOfSentence = m_vcb.StoreIfNew( "<s>" );`

			`ifstream extractFile;`
			`char line[LINE_MAX_LENGTH];`

			`// count the number of words first;`
			`extractFile.open(fileName.c_str());`
			`istream *fileP = &extractFile;`
			`m_size = 0;`
			`size_t sentenceCount = 0;`
			`while(!fileP->eof()) {`
			`SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');`
			`if (fileP->eof()) break;`
			`vector< WORD_ID > words = m_vcb.Tokenize( line );`
			`m_size += words.size() + 1;`
			`sentenceCount++;`
			`}`
			`extractFile.close();`
			`cerr << m_size << " words (incl. sentence boundaries)" << endl;`

			`// allocate memory`
			`m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );`
			`m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );`
			`m_wordInSentence = (char*) calloc( sizeof( char ), m_size );`
			`m_sentence = (size_t*) calloc( sizeof( size_t ), m_size );`
			`m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount );`

			`// fill the array`
			`int wordIndex = 0;`
			`int sentenceId = 0;`
			`extractFile.open(fileName.c_str());`
			`fileP = &extractFile;`
			`while(!fileP->eof()) {`
			`SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');`
			`if (fileP->eof()) break;`
			`vector< WORD_ID > words = m_vcb.Tokenize( line );`

			`// add to corpus vector`
			`corpus.push_back(words);`

			`// create SA`

			`vector< WORD_ID >::const_iterator i;`
			`for( i=words.begin(); i!=words.end(); i++) {`
			`m_index[ wordIndex ] = wordIndex;`
			`m_sentence[ wordIndex ] = sentenceId;`
			`m_wordInSentence[ wordIndex ] = i-words.begin();`
			`m_array[ wordIndex++ ] = *i;`
			`}`
			`m_index[ wordIndex ] = wordIndex;`
			`m_array[ wordIndex++ ] = m_endOfSentence;`
			`m_sentenceLength[ sentenceId++ ] = words.size();`
			`}`
			`extractFile.close();`
			`cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;`
			`// List(0,9);`

			`// sort`
			`m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );`
			`Sort( 0, m_size-1 );`
			`free( m_buffer );`
			`cerr << "done sorting" << endl;`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`}`

			`// good ol' quick sort`
beautify 2013-05-29 21:16:15 +04:00			`void SuffixArray::Sort(INDEX start, INDEX end)`
			`{`
			`if (start == end) return;`
			`INDEX mid = (start+end+1)/2;`
			`Sort( start, mid-1 );`
			`Sort( mid, end );`

			`// merge`
Changed int to size_t to avoid compiler signed/unsigned comparison warnings. 2013-09-26 01:34:55 +04:00			`size_t i = start;`
			`size_t j = mid;`
			`size_t k = 0;`
			`size_t length = end-start+1;`
beautify 2013-05-29 21:16:15 +04:00			`while( k<length ) {`
			`if (i == mid ) {`
			`m_buffer[ k++ ] = m_index[ j++ ];`
			`} else if (j > end ) {`
			`m_buffer[ k++ ] = m_index[ i++ ];`
			`} else {`
			`if (CompareIndex( m_index[i], m_index[j] ) < 0) {`
			`m_buffer[ k++ ] = m_index[ i++ ];`
			`} else {`
			`m_buffer[ k++ ] = m_index[ j++ ];`
			`}`
			`}`
			`}`

			`memcpy( ((char*)m_index) + sizeof( INDEX ) * start,`
			`((char*)m_buffer), sizeof( INDEX ) * (end-start+1) );`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`}`

			`SuffixArray::~SuffixArray()`
beautify 2013-05-29 21:16:15 +04:00			`{`
			`free(m_index);`
			`free(m_array);`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`}`

			`int SuffixArray::CompareIndex( INDEX a, INDEX b ) const`
			`{`
beautify 2013-05-29 21:16:15 +04:00			`// skip over identical words`
			`INDEX offset = 0;`
			`while( a+offset < m_size &&`
			`b+offset < m_size &&`
			`m_array[ a+offset ] == m_array[ b+offset ] ) {`
			`offset++;`
			`}`

			`if( a+offset == m_size ) return -1;`
			`if( b+offset == m_size ) return 1;`
			`return CompareWord( m_array[ a+offset ], m_array[ b+offset ] );`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`}`

			`inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const`
			`{`
beautify 2013-05-29 21:16:15 +04:00			`// cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl;`
			`return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`}`

			`int SuffixArray::Count( const vector< WORD > &phrase )`
			`{`
beautify 2013-05-29 21:16:15 +04:00			`INDEX dummy;`
			`return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 );`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`}`

			`bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min )`
			`{`
beautify 2013-05-29 21:16:15 +04:00			`INDEX dummy;`
			`return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min;`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`}`

			`bool SuffixArray::Exists( const vector< WORD > &phrase )`
			`{`
beautify 2013-05-29 21:16:15 +04:00			`INDEX dummy;`
			`return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1;`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`}`

			`int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )`
			`{`
beautify 2013-05-29 21:16:15 +04:00			`return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end );`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`}`

			`int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )`
			`{`
beautify 2013-05-29 21:16:15 +04:00			`// cerr << "FindFirst\n";`
			`INDEX start = search_start;`
			`INDEX end = (search_end == -1) ? (m_size-1) : search_end;`
			`INDEX mid = FindFirst( phrase, start, end );`
			`// cerr << "done\n";`
			`if (mid == m_size) return 0; // no matches`
			`if (min == 1) return 1; // only existance check`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00
beautify 2013-05-29 21:16:15 +04:00			`int matchCount = 1;`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00
beautify 2013-05-29 21:16:15 +04:00			`//cerr << "before...\n";`
			`firstMatch = FindLast( phrase, mid, start, -1 );`
			`matchCount += mid - firstMatch;`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00
beautify 2013-05-29 21:16:15 +04:00			`//cerr << "after...\n";`
			`lastMatch = FindLast( phrase, mid, end, 1 );`
			`matchCount += lastMatch - mid;`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00
beautify 2013-05-29 21:16:15 +04:00			`return matchCount;`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`}`

			`SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction )`
			`{`
beautify 2013-05-29 21:16:15 +04:00			`end += direction;`
			`while(true) {`
			`INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2;`

			`int match = Match( phrase, mid );`
			`int matchNext = Match( phrase, mid+direction );`
			`//cerr << "\t" << start << ";" << mid << ";" << end << " -> " << match << "," << matchNext << endl;`

			`if (match == 0 && matchNext != 0) return mid;`

			`if (match == 0) // mid point is a match`
			`start = mid;`
			`else`
			`end = mid;`
			`}`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`}`

			`SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end )`
			`{`
beautify 2013-05-29 21:16:15 +04:00			`while(true) {`
			`INDEX mid = ( start + end + 1 )/2;`
			`//cerr << "FindFirst(" << start << ";" << mid << ";" << end << ")\n";`
			`int match = Match( phrase, mid );`

			`if (match == 0) return mid;`
			`if (start >= end && match != 0 ) return m_size;`

			`if (match > 0)`
			`start = mid+1;`
			`else`
			`end = mid-1;`
			`}`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`}`

			`int SuffixArray::Match( const vector< WORD > &phrase, INDEX index )`
			`{`
beautify 2013-05-29 21:16:15 +04:00			`INDEX pos = m_index[ index ];`
			`for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++) {`
			`int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] );`
			`// cerr << "{" << index << "+" << i << "," << pos+i << ":" << match << "}" << endl;`
			`if (match != 0)`
			`return match;`
			`}`
			`return 0;`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`}`

			`void SuffixArray::List(INDEX start, INDEX end)`
			`{`
beautify 2013-05-29 21:16:15 +04:00			`for(INDEX i=start; i<=end; i++) {`
			`INDEX pos = m_index[ i ];`
			`// cerr << i << ":" << pos << "\t";`
			`for(int j=0; j<5 && j+pos<m_size; j++) {`
			`//cout << " " << m_vcb.GetWord( m_array[ pos+j ] );`
			`}`
			`// cerr << "\n";`
			`}`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00			`}`

			`}`