2010-10-21 13:49:27 +04:00
|
|
|
#pragma once
|
|
|
|
|
2012-05-07 17:59:37 +04:00
|
|
|
#include "Vocabulary.h"
|
2010-10-21 13:49:27 +04:00
|
|
|
|
2011-02-24 16:57:11 +03:00
|
|
|
class SuffixArray
|
2010-10-21 13:49:27 +04:00
|
|
|
{
|
|
|
|
public:
|
2011-02-24 16:57:11 +03:00
|
|
|
typedef unsigned int INDEX;
|
2010-10-21 13:49:27 +04:00
|
|
|
|
|
|
|
private:
|
2011-02-24 16:57:11 +03:00
|
|
|
WORD_ID *m_array;
|
|
|
|
INDEX *m_index;
|
|
|
|
INDEX *m_buffer;
|
|
|
|
char *m_wordInSentence;
|
|
|
|
INDEX *m_sentence;
|
|
|
|
char *m_sentenceLength;
|
|
|
|
WORD_ID m_endOfSentence;
|
|
|
|
Vocabulary m_vcb;
|
|
|
|
INDEX m_size;
|
|
|
|
INDEX m_sentenceCount;
|
2010-10-21 13:49:27 +04:00
|
|
|
|
2012-05-07 20:13:31 +04:00
|
|
|
// No copying allowed.
|
|
|
|
SuffixArray(const SuffixArray&);
|
|
|
|
void operator=(const SuffixArray&);
|
|
|
|
|
2010-10-21 13:49:27 +04:00
|
|
|
public:
|
2012-05-07 20:01:09 +04:00
|
|
|
SuffixArray();
|
2011-02-24 16:57:11 +03:00
|
|
|
~SuffixArray();
|
2010-10-21 13:49:27 +04:00
|
|
|
|
2012-05-07 18:26:32 +04:00
|
|
|
void Create(const std::string& fileName );
|
2011-02-24 16:57:11 +03:00
|
|
|
void Sort(INDEX start, INDEX end);
|
|
|
|
int CompareIndex( INDEX a, INDEX b ) const;
|
|
|
|
inline int CompareWord( WORD_ID a, WORD_ID b ) const;
|
2012-05-07 18:26:32 +04:00
|
|
|
int Count( const std::vector< WORD > &phrase );
|
|
|
|
bool MinCount( const std::vector< WORD > &phrase, INDEX min );
|
|
|
|
bool Exists( const std::vector< WORD > &phrase );
|
|
|
|
int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
|
|
|
|
int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 );
|
|
|
|
INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end );
|
|
|
|
INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
|
|
|
|
int Match( const std::vector< WORD > &phrase, INDEX index );
|
2011-02-24 16:57:11 +03:00
|
|
|
void List( INDEX start, INDEX end );
|
2012-05-07 19:58:44 +04:00
|
|
|
inline INDEX GetPosition( INDEX index ) const {
|
2011-02-24 16:57:11 +03:00
|
|
|
return m_index[ index ];
|
|
|
|
}
|
2012-05-07 19:58:44 +04:00
|
|
|
inline INDEX GetSentence( INDEX position ) const {
|
2011-02-24 16:57:11 +03:00
|
|
|
return m_sentence[position];
|
|
|
|
}
|
2012-05-07 19:58:44 +04:00
|
|
|
inline char GetWordInSentence( INDEX position ) const {
|
2011-02-24 16:57:11 +03:00
|
|
|
return m_wordInSentence[position];
|
|
|
|
}
|
2012-05-07 19:58:44 +04:00
|
|
|
inline char GetSentenceLength( INDEX sentenceId ) const {
|
2011-02-24 16:57:11 +03:00
|
|
|
return m_sentenceLength[sentenceId];
|
|
|
|
}
|
2012-05-07 19:58:44 +04:00
|
|
|
inline INDEX GetSize() const {
|
2011-02-24 16:57:11 +03:00
|
|
|
return m_size;
|
|
|
|
}
|
2012-05-07 19:58:44 +04:00
|
|
|
inline WORD GetWord( INDEX position ) const {
|
2011-02-24 16:57:11 +03:00
|
|
|
return m_vcb.GetWord( m_array[position] );
|
|
|
|
}
|
2012-05-07 19:58:44 +04:00
|
|
|
void Save(const std::string& fileName ) const;
|
2012-05-07 18:26:32 +04:00
|
|
|
void Load(const std::string& fileName );
|
2010-10-21 13:49:27 +04:00
|
|
|
};
|