mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-05 12:47:37 +03:00
301 lines
7.4 KiB
C++
301 lines
7.4 KiB
C++
|
|
#ifndef _LANGUAGE_H_
|
|
#define _LANGUAGE_H_
|
|
//#include <wchar.h>
|
|
#include "gb-include.h"
|
|
//#include "UnicodeProperties.h" //UChar32
|
|
#include "File.h"
|
|
#include "HashTableT.h"
|
|
#include "Query.h"
|
|
#include "Lang.h"
|
|
#include "Multicast.h"
|
|
#include "Threads.h"
|
|
#include "Titledb.h"
|
|
#include "Iso8859.h"
|
|
#include "IndexList.h"
|
|
//#include "Msg3a.h"
|
|
|
|
#include "Msg20.h"
|
|
#include "Msg37.h"
|
|
|
|
// max chars in any language
|
|
#define MAX_WORDS_PER_PHRASE 5
|
|
#define MAX_CHARS 256
|
|
#define TOP_POP_PHRASES 40 * 1024
|
|
#define NUM_CHARS 40
|
|
#define MAX_FRAG_SIZE 1024
|
|
// max chars that start the rule
|
|
|
|
#define MAX_PHRASE_LEN 80
|
|
#define MAX_RECOMMENDATIONS 10
|
|
#define LARGE_SCORE 0xfffff
|
|
#define MAX_NARROW_SEARCHES 19
|
|
|
|
/*
|
|
// used only while generating titles from wikipedia pages, makeWikiFiles()
|
|
class StateWik {
|
|
public:
|
|
bool getIndexList( );
|
|
bool getSummary ( );
|
|
bool gotSummary ( );
|
|
|
|
int m_fdw;
|
|
Msg0 m_msg0;
|
|
IndexList m_list;
|
|
Query m_q;
|
|
key_t m_startKey;
|
|
key_t m_endKey;
|
|
char *m_coll;
|
|
long m_collLen;
|
|
long long m_termId;
|
|
long m_minRecSize;
|
|
Msg20 m_msg20s[MAX_FRAG_SIZE];
|
|
long m_numMsg20sOutstanding;
|
|
long m_numMsg20sLaunched;
|
|
long m_numMsg20sReceived;
|
|
};
|
|
|
|
class StateDict{
|
|
public:
|
|
char *m_dictBuf;
|
|
long m_dictBufSize;
|
|
char *m_buf;
|
|
long m_bufSize;
|
|
char **m_wordsPtr;
|
|
long long *m_termIds;
|
|
long long *m_termFreqs;
|
|
long m_numTuples;
|
|
Msg37 m_msg37;
|
|
};
|
|
*/
|
|
|
|
/*class StateAff{
|
|
public:
|
|
bool openAffinityFile ( );
|
|
bool launchAffinity ( );
|
|
bool gotAffinityFreqs1 ( );
|
|
bool gotAffinityFreqs2 ( );
|
|
bool doneAffinities ( );
|
|
|
|
FILE *m_fdr;
|
|
int m_fdw;
|
|
long m_fileNum;
|
|
char m_buf[1026];
|
|
Msg3a m_msg3a;
|
|
Query m_q;
|
|
long long m_numerator;
|
|
long long m_denominator;
|
|
};*/
|
|
|
|
typedef struct Reco{
|
|
char reco[MAX_PHRASE_LEN];
|
|
long score;
|
|
}Reco;
|
|
|
|
class Language {
|
|
|
|
public:
|
|
|
|
Language();
|
|
~Language();
|
|
|
|
void reset();
|
|
|
|
bool init( char *unifiedBuf, long unifiedBufSize, long lang,
|
|
long hostsPerSplit, unsigned long myHash );
|
|
|
|
void setLang( long lang ) { m_lang = lang; };
|
|
|
|
//bool makeAffinities();
|
|
|
|
//long getPhrasePopularity ( char *s, unsigned long long h,
|
|
// bool checkTitleRecDict );
|
|
|
|
bool checkDict(char *s, long slen, char encodeType);
|
|
|
|
bool getRecommendation( char *origWord, long origWordLen,
|
|
char *recommendation, long recommendationLen,
|
|
bool *found, long *score, long *popularity,
|
|
bool forceReco = false );
|
|
|
|
//long narrowPhrase ( char *request, char *phrases, long *pops,
|
|
// long maxPhrases );
|
|
|
|
//bool generateDicts ( long numWordsToDump , char *coll );
|
|
|
|
//bool convertLatin1DictToUTF8 ( char *infile );
|
|
|
|
// needed for makeDict
|
|
//bool gotTermFreqs( StateDict *st );
|
|
//StateDict *m_stateDict;
|
|
|
|
// hash table of the dictionary
|
|
HashTableT <unsigned long long, long>m_dict;
|
|
|
|
private:
|
|
long spellcheckDict();
|
|
|
|
// always accepts only ascii chars. makeClean() converts unicode into
|
|
// ascii
|
|
bool getPhonetic( char *origWord, long origWordLen,
|
|
char *target, long targetLen );
|
|
|
|
bool loadRules();
|
|
|
|
bool loadSpellerDict( char *spellerBuf, long spellerbufSize,
|
|
long hostsPerSplit, unsigned long myHash );
|
|
|
|
//bool loadTitleRecDicts( );
|
|
|
|
//bool loadNarrow( char *spellerBuf, long spellerBufSize,
|
|
// long hostsPerSplit, unsigned long myHash );
|
|
|
|
bool loadDictHashTable( );
|
|
|
|
//bool genTopPopFile ( char *infile );
|
|
|
|
bool genDistributedPopFile ( char *infile, unsigned long myHash );
|
|
|
|
//bool cleanDictFile ( );
|
|
|
|
bool makeClean( char *inBuf, long inBufSize,
|
|
char *outBuf, long outBufSize );//, bool isUTF16 );
|
|
|
|
//bool makePhonet( char *infile);
|
|
|
|
//bool makeDict();
|
|
|
|
//bool makeQueryFiles ( );
|
|
|
|
//bool makeWikiFiles ( );
|
|
|
|
bool loadWikipediaWords();
|
|
|
|
bool loadMispelledWords();
|
|
|
|
bool hasMispelling(char *phrase, long phraseLen);
|
|
|
|
long tryPhonet( char *phonetTmp, char *origPhonet,
|
|
char *origClean, long tryForScore,
|
|
Reco *recos, long numRecos, long *lowestScore );
|
|
|
|
long editDistance( char *a, char *b, long level, // starting level
|
|
long limit ); // maximum level
|
|
|
|
long weightedAverage(long soundslikeScore, long wordScore);
|
|
|
|
long limitEditDistance( char *a, char *b, long limit );
|
|
|
|
long limit1EditDistance( char *a, char *b );
|
|
|
|
long limit2EditDistance( char *a, char *b );
|
|
|
|
long checkRest( char *a, char *b, long w, char *amax, long min );
|
|
|
|
long check2( char *a, char *b, long w, char *amax, long min );
|
|
|
|
short editDistance( char *a0, char *b0 );
|
|
|
|
short reduceScore ( char *a, char *b );
|
|
|
|
//bool makeWordFiles ( long numWordsToDump , long numWordsPerPhrase ,
|
|
// char *coll );
|
|
|
|
//bool makePopFiles ( long numWordsToDump , long numWordsPerPhrase ,
|
|
// char *coll);
|
|
|
|
//bool makeScoreFiles ( long maxWordsPerFile );
|
|
|
|
// this map maps a char to a "dict char"
|
|
//unsigned char m_map [ 256 ];
|
|
|
|
// . when comparing letter pairs, we only allow them to consist of
|
|
// certain chars: 0-9, A-Z, apostrophe and space and \0 otherwise
|
|
// m_table gets too big. This implies a NUM_CHARS of
|
|
// . this compressed the value, too
|
|
// . \0, space, 0-9, A-Z, \' is the ordering
|
|
//unsigned char to_dict_char ( unsigned char c ) { return m_map[c]; };
|
|
|
|
// Temporary unicode workaround for latin-1 compatibility
|
|
//unsigned char uc_to_dict_char ( UChar c ) {
|
|
// if (c>255)c=0;
|
|
// return m_map[c];
|
|
//};
|
|
|
|
// what language loaded
|
|
long m_lang;
|
|
|
|
// what charset does this language use
|
|
unsigned char m_charset;
|
|
|
|
// buffer to store the phonetic rules
|
|
char *m_rulesBuf;
|
|
long m_rulesBufSize;
|
|
char **m_rulesPtr;
|
|
long m_rulesPtrSize;
|
|
long m_numRules;
|
|
// points to the index of each rule that starts with a new character
|
|
long m_ruleStarts[MAX_CHARS];
|
|
// the chars that are in a phonet
|
|
bool m_ruleChars[MAX_CHARS];
|
|
|
|
// buffers to store the dictionaries
|
|
char *m_distributedBuf;
|
|
long m_distributedBufSize;
|
|
char **m_tuplePtr;
|
|
long m_tuplePtrSize;
|
|
long m_numTuples;
|
|
|
|
// total number of phonets
|
|
long m_numPhonets;
|
|
|
|
// narrow phrase
|
|
char *m_narrowBuf;
|
|
long m_narrowBufSize;
|
|
long m_numNarrowPtrs;
|
|
char **m_frntPtrs;
|
|
char **m_bckPtrs;
|
|
long *m_frntCharPtrs;//[NUM_CHARS][NUM_CHARS][NUM_CHARS];
|
|
long *m_bckCharPtrs;//[NUM_CHARS][NUM_CHARS][NUM_CHARS];
|
|
|
|
// m_phonetics stores the hash of the phonetic as the key.
|
|
// the value is a composite of index in m_tuplePtrs where the list
|
|
// starts as the high 32 bits of the value and the number of
|
|
// words having the same phonetic as the low 32 bits of the value
|
|
HashTableT <unsigned long long, unsigned long long > m_phonetics;
|
|
|
|
// hash table of the distributed pop words dictionary
|
|
// HashTableT <unsigned long, long> m_titlerecDict;
|
|
|
|
// hash table of the distributed pop words dictionary
|
|
HashTableT <unsigned long long, long>m_distributedPopPhrases;
|
|
|
|
// hash table of the top popular words in the dictionary
|
|
// HashTableT <unsigned long, char *> m_topPopPhrases;
|
|
|
|
// hash table of mispelled words
|
|
HashTableT <unsigned long, bool>m_misp;
|
|
|
|
// hash table of wikipedia words
|
|
HashTableT <unsigned long, bool>m_wiki;
|
|
|
|
// PARMS, which can be adjusted. Currently all languages have the
|
|
// same adjustments, so using the same parms.
|
|
long m_editDistanceWeightsDel1;
|
|
long m_editDistanceWeightsDel2;
|
|
long m_editDistanceWeightsSwap;
|
|
long m_editDistanceWeightsSub;
|
|
long m_editDistanceWeightsSimilar;
|
|
long m_editDistanceWeightsMin;
|
|
long m_editDistanceWeightsMax;
|
|
long m_soundslikeWeight;
|
|
long m_wordWeight;
|
|
long m_span;
|
|
|
|
bool m_followup;
|
|
bool m_collapseResult;
|
|
bool m_removeAccents;
|
|
};
|
|
|
|
#endif
|