mosesdecoder/biconcor/SuffixArray.h

83 lines
2.7 KiB
C++

#pragma once
#include "Vocabulary.h"
class SuffixArray
{
public:
typedef unsigned int INDEX;
private:
WORD_ID *m_array;
INDEX *m_index;
INDEX *m_buffer;
char *m_wordInSentence;
INDEX *m_sentence;
char *m_sentenceLength;
WORD_ID m_endOfSentence;
INDEX *m_document;
INDEX *m_documentName;
char *m_documentNameBuffer;
size_t m_documentNameLength;
size_t m_documentCount;
bool m_useDocument;
Vocabulary m_vcb;
INDEX m_size;
INDEX m_sentenceCount;
// No copying allowed.
SuffixArray(const SuffixArray&);
void operator=(const SuffixArray&);
public:
SuffixArray();
~SuffixArray();
void Create(const std::string& fileName );
bool ProcessDocumentLine( const char* const, const size_t );
void Sort(INDEX start, INDEX end);
int CompareIndex( INDEX a, INDEX b ) const;
inline int CompareWord( WORD_ID a, WORD_ID b ) const;
int Count( const std::vector< WORD > &phrase );
bool MinCount( const std::vector< WORD > &phrase, INDEX min );
bool Exists( const std::vector< WORD > &phrase );
int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 );
INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end );
INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
int Match( const std::vector< WORD > &phrase, INDEX index );
void List( INDEX start, INDEX end );
void PrintSentenceMatches( const std::vector< WORD > &phrase );
inline INDEX GetPosition( INDEX index ) const {
return m_index[ index ];
}
inline INDEX GetSentence( INDEX position ) const {
return m_sentence[position];
}
inline char GetWordInSentence( INDEX position ) const {
return m_wordInSentence[position];
}
inline char GetSentenceLength( INDEX sentenceId ) const {
return m_sentenceLength[sentenceId];
}
inline INDEX GetSize() const {
return m_size;
}
inline WORD GetWord( INDEX position ) const {
return m_vcb.GetWord( m_array[position] );
}
void UseDocument() {
m_useDocument = true;
}
INDEX GetDocument( INDEX sentence ) const;
void PrintDocumentName( INDEX document ) {
for(INDEX i=m_documentName[ document ]; m_documentNameBuffer[i] != 0; i++) {
std::cout << m_documentNameBuffer[ i ];
}
}
void Save(const std::string& fileName ) const;
void Load(const std::string& fileName );
void CheckAllocation(bool, const char *dataStructure) const;
bool Error( const char* message, const std::string& fileName) const;
};