open-source-search-engine/Speller.h
2014-11-10 14:45:11 -08:00

178 lines
4.7 KiB
C++

// Matt Wells, copyright Sep 2003
// Speller is a class for doing spell checking on user queries.
// . TODO: we might be able to use this as a related searches feature too, but
// we might have to use a different distance metric (getSimilarity())
// that is more word based and less letter based.
#ifndef _SPELLER_H_
#define _SPELLER_H_
// max int32_t returned by getPhrasePopularity() function
#define MAX_PHRASE_POP 16800
#include "StopWords.h"
#include "Language.h"
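// . the height and width of m_stable[][] that takes a letter pair as an index
// . valid chars are returned by isValidChar() routine
// . we use A-Z, 0-9, space, hyphen, apostrophe and \0... that's it
// . NOTE: neither m_stable[][] nor isValidChar() is declared in this header,
//   so the three lines above do not describe the #define that follows
// . MAX_UNIQUE_HOSTS_PER_SPLIT (below) is needed to distribute the pop words
//   dictionary; it is the size of StateFrag::m_mcast[]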
#define MAX_UNIQUE_HOSTS_PER_SPLIT 16
// State carried for a single "frag" while a recommendation is being worked
// out. Every member here is concerned with that one frag.
// NOTE(review): no constructor is declared, so nothing in this header
// initializes these members -- confirm the owner sets them before use.
class StateFrag {
public:
    // opaque back-pointer; the original annotation says it is a StateSpeller
    void    *m_state;
    int32_t  m_errno;
    Query   *m_q;
    int32_t  m_startQword;
    int32_t  m_endQword;
    bool     m_recommended;

    // The frag broken into word pointers; the list must be NULL terminated.
    // All four arrays are bounded by MAX_FRAG_SIZE (a trailing note in the
    // original mentions MAX_QUERY_WORDS as the other candidate bound).
    char    *m_wp     [MAX_FRAG_SIZE];
    int32_t  m_wplen  [MAX_FRAG_SIZE];
    bool     m_isstop [MAX_FRAG_SIZE];
    bool     m_isfound[MAX_FRAG_SIZE];

    // total number of words that have had recommendations
    int32_t  m_numFound;

    char     m_dst[MAX_FRAG_SIZE];

    // NOTE(review): presumably one Multicast per host contacted, with the
    // two counters tracking outstanding traffic -- confirm in Speller.cpp
    Multicast m_mcast[MAX_UNIQUE_HOSTS_PER_SPLIT];
    int32_t   m_numRequests;
    int32_t   m_numReplies;

    int32_t  m_pLen;
    int32_t  m_pPosn;
    char    *m_a;
    int32_t  m_alen;
    char    *m_b;
    char     m_c;

    // narrow-phrase results: m_numNarrowPhrases counts the rows used in
    // m_narrowPhrases (presumably -- TODO confirm)
    bool     m_narrowPhrase;
    int32_t  m_numNarrowPhrases;
    char     m_narrowPhrases[MAX_NARROW_SEARCHES][MAX_FRAG_SIZE];
};
// State for one whole recommendation request; it owns pointers to the
// per-frag StateFrag objects.
// NOTE(review): several members mirror the arguments of the callback-taking
// Speller::getRecommendation() overload (q, spellcheck, dst, narrowSearch,
// numNarrows, state, callback); presumably they are copied from that call --
// confirm in Speller.cpp.
class StateSpeller {
public:
    // caller's opaque state and the function to invoke with it
    void     *m_state;
    void    (*m_callback)(void *state);

    Query    *m_q;
    bool      m_spellcheck;

    // NOTE(review): looks like the start/end of the destination buffer
    char     *m_dst;
    char     *m_dend;

    bool      m_narrowSearch;
    // NOTE(review): looks like the start/end of the narrow-search buffer
    char     *m_nrw;
    char     *m_nend;
    int32_t  *m_numNarrow;

    uint64_t  m_start;

    // frag bookkeeping; m_stFrag holds up to MAX_FRAG_SIZE pointers
    int32_t   m_numFrags;
    int32_t   m_numFragsReceived;
    StateFrag *m_stFrag[MAX_FRAG_SIZE];
};
class Speller {
public:
Speller();
~Speller();
bool registerHandler();
void reset();
bool init();
void test (char *ff);
//uint8_t getUniqueLang ( int64_t *wid );
int64_t getLangBits64 ( int64_t *wid ) ;
int32_t getPhrasePopularity ( char *s, uint64_t h,
bool checkTitleRecDict,
unsigned char langId = langEnglish );
bool canSplitWords ( char *s, int32_t slen, bool *isPorn,
char *splitWords,
unsigned char langId, int32_t encodeType);
bool findNext( char *s, char *send, char **nextWord, bool *isPorn,
unsigned char langId, int32_t encodeType );
int32_t checkDict ( char *s, int32_t slen, char encodeType,
unsigned char lang = langEnglish ){
return m_language[lang].checkDict(s,slen,encodeType);
}
// should be same hash algo to make wordId
bool isInDict ( uint64_t wordId ) {
return m_unifiedDict.isInTable(&wordId); };
// . dump out the first "numWordsToDump" words and phrases
// encountered will scanning the records in Titledb
// . use these words/phrases to make the dictionaries
bool generateDicts ( int32_t numWordsToDump , char *coll );
bool getPhonetic( char *word, char *target );
bool getRecommendation ( Query *q, bool spellcheck,
char *dst, int32_t dstLen,
bool narrowSearch,
char *narrow, int32_t narrowLen,
int32_t *numNarrows, void *state,
void (*callback)(void *state));
bool getRecommendation ( StateFrag *st );
bool launchReco( StateFrag *st );
bool gotSpellerReply( StateFrag *st );
void gotFrags( void *state );
bool getRecommendation ( char *frag , char *dst , int32_t dstLen );
int32_t getWords ( const char *s ,
char *wp [MAX_FRAG_SIZE] ,
int32_t wplen [MAX_FRAG_SIZE] ,
bool *isstop );
Language m_language[MAX_LANGUAGES];
char *getRandomWord() ;
bool loadUnifiedDict();
bool createUnifiedDict ();
void dictLookupTest ( char *ff );
char *getPhraseRecord(char *phrase, int len);
int64_t *getPhraseLanguages(char *phrase, int len);
bool getPhraseLanguages(char *phrase, int len, int64_t *array);
bool getPhraseLanguages2 (char *phraseRec , int64_t *array) ;
char getPhraseLanguage(char *phrase, int len );
bool getSynsInEnglish ( char *w ,
int32_t wlen ,
char nativeLang ,
char wikiLang ) ;
void CheckWordRecs(const char *filename);
//private:
bool populateHashTable( char *ff, HashTableX *htable,
unsigned char langId );
//private:
//HashTableT <uint64_t, char* > m_unifiedDict;
HashTableX m_unifiedDict;
// can this queryword start a phrase ?
bool canStart( QueryWord *qw );
//char *m_unifiedBuf;
//int32_t m_unifiedBufSize;
SafeBuf m_unifiedBuf;
int32_t m_hostsPerSplit;
};
// The global Speller object. This line only declares it (extern); the
// object itself has to be defined in a source file.
extern class Speller g_speller;
#endif