open-source-search-engine/Speller.h

178 lines
4.6 KiB
C
Raw Normal View History

2013-08-03 00:12:24 +04:00
// Matt Wells, copyright Sep 2003
// Speller is a class for doing spell checking on user queries.
// . TODO: we might be able to use this as a related searches feature too, but
// we might have to use a different distance metric (getSimilarity())
// that is more word based and less letter based.
#ifndef _SPELLER_H_
#define _SPELLER_H_
// max long returned by getPhrasePopularity() function
#define MAX_PHRASE_POP 16800
#include "StopWords.h"
#include "Language.h"
// . the height and width of m_stable[][] that takes a letter pair as an index
// . valid chars are returned by isValidChar() routine
// . we use A-Z, 0-9, space, hyphen, apostrophe and \0... that's it
// need this to distribute pop words dictionary.
#define MAX_UNIQUE_HOSTS_PER_SPLIT 16
class StateFrag{
public:
// ALL THESE ARE CONCERNED WITH THE FRAG
void *m_state;//StateSpeller
long m_errno;
Query *m_q;
long m_startQword;
long m_endQword;
bool m_recommended;
// break the frag into word ptrs, it must be NULL terminated
char *m_wp [MAX_FRAG_SIZE];//[ MAX_QUERY_WORDS ];
long m_wplen [MAX_FRAG_SIZE];//[ MAX_QUERY_WORDS ];
bool m_isstop [MAX_FRAG_SIZE];//[ MAX_QUERY_WORDS ];
bool m_isfound[MAX_FRAG_SIZE];//[ MAX_QUERY_WORDS ];
// total number of words that have had recommendations
long m_numFound;
char m_dst[MAX_FRAG_SIZE];
Multicast m_mcast[MAX_UNIQUE_HOSTS_PER_SPLIT];
long m_numRequests;
long m_numReplies;
long m_pLen;
long m_pPosn;
char *m_a;
long m_alen;
char *m_b;
char m_c;
bool m_narrowPhrase;
long m_numNarrowPhrases;
char m_narrowPhrases[MAX_NARROW_SEARCHES][MAX_FRAG_SIZE];
};
class StateSpeller{
public:
void *m_state;
void (*m_callback)(void *state);
Query *m_q;
bool m_spellcheck;
char *m_dst;
char *m_dend;
bool m_narrowSearch;
char *m_nrw;
char *m_nend;
long *m_numNarrow;
unsigned long long m_start;
long m_numFrags;
long m_numFragsReceived;
StateFrag *m_stFrag[MAX_FRAG_SIZE];
};
class Speller {
public:
Speller();
~Speller();
bool registerHandler();
void reset();
bool init();
void test (char *ff);
//uint8_t getUniqueLang ( long long *wid );
long long getLangBits64 ( long long *wid ) ;
long getPhrasePopularity ( char *s, unsigned long long h,
bool checkTitleRecDict,
unsigned char langId = langEnglish );
bool canSplitWords ( char *s, long slen, bool *isPorn,
char *splitWords,
unsigned char langId, long encodeType);
bool findNext( char *s, char *send, char **nextWord, bool *isPorn,
unsigned char langId, long encodeType );
long checkDict ( char *s, long slen, char encodeType,
unsigned char lang = langEnglish ){
return m_language[lang].checkDict(s,slen,encodeType);
}
// should be same hash algo to make wordId
bool isInDict ( uint64_t wordId ) {
return m_unifiedDict.isInTable(&wordId); };
// . dump out the first "numWordsToDump" words and phrases
// encountered will scanning the records in Titledb
// . use these words/phrases to make the dictionaries
bool generateDicts ( long numWordsToDump , char *coll );
bool getPhonetic( char *word, char *target );
bool getRecommendation ( Query *q, bool spellcheck,
char *dst, long dstLen,
bool narrowSearch,
char *narrow, long narrowLen,
long *numNarrows, void *state,
void (*callback)(void *state));
bool getRecommendation ( StateFrag *st );
bool launchReco( StateFrag *st );
bool gotSpellerReply( StateFrag *st );
void gotFrags( void *state );
bool getRecommendation ( char *frag , char *dst , long dstLen );
long getWords ( const char *s ,
char *wp [MAX_FRAG_SIZE] ,
long wplen [MAX_FRAG_SIZE] ,
bool *isstop );
Language m_language[MAX_LANGUAGES];
char *getRandomWord() ;
bool loadUnifiedDict();
bool createUnifiedDict ();
void dictLookupTest ( char *ff );
char *getPhraseRecord(char *phrase, int len);
long long *getPhraseLanguages(char *phrase, int len);
bool getPhraseLanguages(char *phrase, int len, long long *array);
bool getPhraseLanguages2 (char *phraseRec , long long *array) ;
char getPhraseLanguage(char *phrase, int len );
bool getSynsInEnglish ( char *w ,
long wlen ,
char nativeLang ,
char wikiLang ) ;
void CheckWordRecs(const char *filename);
//private:
bool populateHashTable( char *ff, HashTableX *htable,
unsigned char langId );
//private:
//HashTableT <unsigned long long, char* > m_unifiedDict;
HashTableX m_unifiedDict;
// can this queryword start a phrase ?
bool canStart( QueryWord *qw );
//char *m_unifiedBuf;
//long m_unifiedBufSize;
SafeBuf m_unifiedBuf;
long m_hostsPerSplit;
};
extern class Speller g_speller;
#endif