// mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.h

#include "Vocabulary.h"

#pragma once

// Not referenced anywhere in this header; presumably the maximum length (in
// characters) of a single input line handled by the implementation file.
// TODO confirm against the .cpp that uses it.
#define LINE_MAX_LENGTH 10000

namespace tmmt
{

/**
 * Suffix-array index built from the file named in the constructor.
 *
 * Only the small accessors are defined in this header; the constructor,
 * destructor, sorting and search routines are implemented out of line, so
 * their exact semantics are not visible here.
 *
 * NOTE(review): the class holds raw pointers and declares a destructor but no
 * copy constructor / copy assignment. If the destructor releases these
 * buffers, copying an instance would be unsafe -- confirm against the
 * implementation and consider disabling copying.
 */
class SuffixArray
{
public:
  /// Unsigned index type used for array slots and positions.
  typedef unsigned int INDEX;

private:
  /// Sentences as sequences of word ids; exposed read-only via GetCorpus().
  std::vector< std::vector< WORD_ID > > corpus;

  // NOTE(review): the roles of the raw buffers below are inferred from their
  // names and from the accessors that read them; allocation and ownership
  // happen in the implementation file -- confirm there.
  WORD_ID *m_array;
  INDEX *m_index;            ///< Read by GetPosition(): array slot -> position.
  INDEX *m_buffer;
  char *m_wordInSentence;    ///< Read by GetWordInSentence(); one char per position.
  size_t *m_sentence;        ///< Read by GetSentence(); one sentence id per position.
  char *m_sentenceLength;    ///< Read by GetSentenceLength(); one char per sentence id.
  WORD_ID m_endOfSentence;
  Vocabulary m_vcb;          ///< Exposed via GetVocabulary().
  INDEX m_size;              ///< Returned by GetSize().

public:
  /// Constructs the object from @p fileName (implemented out of line).
  SuffixArray( std::string fileName );
  ~SuffixArray();

  // --- Sorting / comparison helpers (implemented out of line) -------------
  void Sort(INDEX start, INDEX end);
  int CompareIndex( INDEX a, INDEX b ) const;
  inline int CompareWord( WORD_ID a, WORD_ID b ) const;

  // --- Phrase lookup (implemented out of line) -----------------------------
  int Count( const std::vector< WORD > &phrase );
  bool MinCount( const std::vector< WORD > &phrase, INDEX min );
  bool Exists( const std::vector< WORD > &phrase );

  // INDEX is unsigned, so a default of -1 converts to the largest INDEX value.
  int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );

  // NOTE(review): the defaults here (search_start = -1, search_end = 0) are the
  // reverse of FindMatches (0, -1). Left unchanged because the implementation
  // is not visible from this header -- verify that this is intended.
  int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 );

  INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end );
  INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
  int Match( const std::vector< WORD > &phrase, INDEX index );
  void List( INDEX start, INDEX end );

  // --- Read-only accessors --------------------------------------------------
  // These only read members, so they are const-qualified and can be used
  // through a const SuffixArray reference. None of them performs bounds
  // checking on its argument.

  /// Returns the entry of m_index at slot @p index.
  INDEX GetPosition( INDEX index ) const
  {
    return m_index[ index ];
  }

  /// Returns the entry of m_sentence at @p position.
  size_t GetSentence( INDEX position ) const
  {
    return m_sentence[ position ];
  }

  /// Returns the entry of m_wordInSentence at @p position (stored as char).
  char GetWordInSentence( INDEX position ) const
  {
    return m_wordInSentence[ position ];
  }

  /// Returns the entry of m_sentenceLength at @p sentenceId (stored as char).
  char GetSentenceLength( size_t sentenceId ) const
  {
    return m_sentenceLength[ sentenceId ];
  }

  /// Returns m_size.
  INDEX GetSize() const
  {
    return m_size;
  }

  /// Mutable access to the vocabulary member.
  Vocabulary &GetVocabulary()
  {
    return m_vcb;
  }

  /// Read-only access to the vocabulary member, for const instances.
  const Vocabulary &GetVocabulary() const
  {
    return m_vcb;
  }

  /// Read-only access to the stored corpus.
  const std::vector< std::vector< WORD_ID > > &GetCorpus() const
  {
    return corpus;
  }
};

}