#ifndef _LANGUAGE_H_
#define _LANGUAGE_H_
//#include <wchar.h>
#include "gb-include.h"
//#include "UnicodeProperties.h" //UChar32
#include "File.h"
#include "HashTableT.h"
#include "Query.h"
#include "Lang.h"
#include "Multicast.h"
#include "Threads.h"
#include "Titledb.h"
#include "Iso8859.h"
#include "IndexList.h"
//#include "Msg3a.h"
#include "Msg20.h"
#include "Msg37.h"
// Sizing constants shared by the speller state classes declared below.
// max chars in any language; sizes Language::m_ruleStarts and
// Language::m_ruleChars
#define MAX_CHARS 256
// Parenthesized so the macro expands as one value inside a larger
// expression: without the parentheses "x / TOP_POP_PHRASES" would expand to
// "x / 40 * 1024", and "x % TOP_POP_PHRASES" to "x % 40 * 1024".
#define TOP_POP_PHRASES (40 * 1024)
// presumably the size of the restricted "dict char" alphabet described in the
// letter-pair comments of class Language -- confirm
#define NUM_CHARS 40
// number of Msg20 slots in StateWik::m_msg20s
#define MAX_FRAG_SIZE 1024
// max chars that start the rule
// (also the capacity in bytes of Reco::reco)
#define MAX_PHRASE_LEN 80
// 0xfffff is 1048575 decimal; not referenced elsewhere in this header
#define LARGE_SCORE 0xfffff
// used only while generating titles from wikipedia pages, makeWikiFiles()
// NOTE(review): the makeWikiFiles() declaration in class Language is commented
// out in this header, so this state class may no longer be used -- confirm.
class StateWik {
// processing steps; only declared here, the definitions are not in this header
bool getIndexList( );
bool getSummary ( );
bool gotSummary ( );
// integer file descriptor -- presumably the file being written; confirm
int m_fdw;
// request object, the index list it works on, and the query
// NOTE(review): presumably m_msg0 fills m_list -- confirm in the definitions
Msg0 m_msg0;
IndexList m_list;
Query m_q;
// key range bounds
key_t m_startKey;
key_t m_endKey;
// collection name and its length
char *m_coll;
long m_collLen;
long long m_termId;
long m_minRecSize;
// fixed pool of MAX_FRAG_SIZE Msg20 objects, with counters for how many are
// outstanding, launched and received
Msg20 m_msg20s[MAX_FRAG_SIZE];
long m_numMsg20sOutstanding;
long m_numMsg20sLaunched;
long m_numMsg20sReceived;
// State for dictionary generation. The only references to it in this header
// are the commented-out gotTermFreqs( StateDict *st ) and m_stateDict
// declarations in class Language ("needed for makeDict").
class StateDict{
// dictionary buffer and its size
char *m_dictBuf;
long m_dictBufSize;
// second buffer and its size -- purpose not visible in this header
char *m_buf;
long m_bufSize;
// arrays of word pointers, term ids and term frequencies
// NOTE(review): presumably parallel arrays of m_numTuples entries -- confirm
char **m_wordsPtr;
long long *m_termIds;
long long *m_termFreqs;
long m_numTuples;
// NOTE(review): presumably the request that fetches the term frequencies --
// confirm against Msg37.h
Msg37 m_msg37;
/*class StateAff{
bool openAffinityFile ( );
bool launchAffinity ( );
bool gotAffinityFreqs1 ( );
bool gotAffinityFreqs2 ( );
bool doneAffinities ( );
FILE *m_fdr;
int m_fdw;
long m_fileNum;
char m_buf[1026];
Msg3a m_msg3a;
Query m_q;
long long m_numerator;
long long m_denominator;
// One recommendation candidate; Language::tryPhonet() takes an array of these
// (Reco *recos, long numRecos).
typedef struct Reco{
// candidate text, at most MAX_PHRASE_LEN bytes
// NOTE(review): whether that includes a terminating NUL is not visible here
char reco[MAX_PHRASE_LEN];
// score of this candidate
long score;
// Speller and phonetic-recommendation state for one language: the dictionary
// hash table, the phonetic rules, the distributed popularity data and the
// edit-distance tuning parms. This header only declares the interface; the
// method definitions are elsewhere.
class Language {
// NOTE(review): presumably clears the tables and buffers declared below --
// confirm against the definition.
void reset();
// Takes a caller-supplied buffer (unifiedBuf, unifiedBufSize), the language
// id, and the same hostsPerSplit / myHash pair that loadSpellerDict() takes.
bool init( char *unifiedBuf, long unifiedBufSize, long lang,
long hostsPerSplit, unsigned long myHash );
// stores the language id in m_lang
void setLang( long lang ) { m_lang = lang; };
//bool makeAffinities();
//long getPhrasePopularity ( char *s, unsigned long long h,
// bool checkTitleRecDict );
// NOTE(review): presumably looks s (slen bytes) up in m_dict; the meaning of
// encodeType is not visible in this header -- confirm.
bool checkDict(char *s, long slen, char encodeType);
// Requests a recommendation for origWord. The caller supplies the
// recommendation buffer and its length. found, score and popularity are
// pointer parameters (presumably outputs -- confirm). forceReco defaults to
// false.
bool getRecommendation( char *origWord, long origWordLen,
char *recommendation, long recommendationLen,
bool *found, long *score, long *popularity,
bool forceReco = false );
//long narrowPhrase ( char *request, char *phrases, long *pops,
// long maxPhrases );
//bool generateDicts ( long numWordsToDump , char *coll );
//bool convertLatin1DictToUTF8 ( char *infile );
// needed for makeDict
//bool gotTermFreqs( StateDict *st );
//StateDict *m_stateDict;
// hash table of the dictionary
// (unsigned long long key, long value)
HashTableT <unsigned long long, long>m_dict;
// returns a long; what the value means is not visible in this header
long spellcheckDict();
// always accepts only ascii chars. makeClean() converts unicode into
// ascii
// target / targetLen is a caller-supplied buffer (presumably receives the
// phonetic form -- confirm)
bool getPhonetic( char *origWord, long origWordLen,
char *target, long targetLen );
// loaders for the phonetic rules and the speller dictionary
bool loadRules();
bool loadSpellerDict( char *spellerBuf, long spellerbufSize,
long hostsPerSplit, unsigned long myHash );
//bool loadTitleRecDicts( );
//bool loadNarrow( char *spellerBuf, long spellerBufSize,
// long hostsPerSplit, unsigned long myHash );
bool loadDictHashTable( );
//bool genTopPopFile ( char *infile );
bool genDistributedPopFile ( char *infile, unsigned long myHash );
//bool cleanDictFile ( );
// reads inBuf (inBufSize) and writes into the caller-supplied outBuf
// (outBufSize); see the getPhonetic() comment for what it converts
bool makeClean( char *inBuf, long inBufSize,
char *outBuf, long outBufSize );//, bool isUTF16 );
//bool makePhonet( char *infile);
//bool makeDict();
//bool makeQueryFiles ( );
//bool makeWikiFiles ( );
// NOTE(review): presumably these populate m_wiki and m_misp respectively, and
// hasMispelling() consults m_misp -- confirm against the definitions
bool loadWikipediaWords();
bool loadMispelledWords();
bool hasMispelling(char *phrase, long phraseLen);
// takes an array of numRecos Reco entries; lowestScore is a pointer parameter
// (presumably updated by the call -- confirm)
long tryPhonet( char *phonetTmp, char *origPhonet,
char *origClean, long tryForScore,
Reco *recos, long numRecos, long *lowestScore );
// four-argument overload; a two-argument editDistance() returning short is
// declared further down
long editDistance( char *a, char *b, long level, // starting level
long limit ); // maximum level
long weightedAverage(long soundslikeScore, long wordScore);
// edit-distance variants that take, or are named for, a limit
long limitEditDistance( char *a, char *b, long limit );
long limit1EditDistance( char *a, char *b );
long limit2EditDistance( char *a, char *b );
// helpers sharing the signature (a, b, w, amax, min); their roles are not
// visible in this header
long checkRest( char *a, char *b, long w, char *amax, long min );
long check2( char *a, char *b, long w, char *amax, long min );
// two-argument overload of editDistance(); returns short, not long
short editDistance( char *a0, char *b0 );
short reduceScore ( char *a, char *b );
//bool makeWordFiles ( long numWordsToDump , long numWordsPerPhrase ,
// char *coll );
//bool makePopFiles ( long numWordsToDump , long numWordsPerPhrase ,
// char *coll);
//bool makeScoreFiles ( long maxWordsPerFile );
// this map maps a char to a "dict char"
//unsigned char m_map [ 256 ];
// . when comparing letter pairs, we only allow them to consist of
// certain chars: 0-9, A-Z, apostrophe and space and \0 otherwise
// m_table gets too big. This implies a small NUM_CHARS (#defined as 40
// in this header).
// NOTE(review): the alphabet listed here has 39 symbols (1+1+10+26+1)
// while NUM_CHARS is 40 -- confirm which is intended.
// . this compressed the value, too
// . \0, space, 0-9, A-Z, \' is the ordering
//unsigned char to_dict_char ( unsigned char c ) { return m_map[c]; };
// Temporary unicode workaround for latin-1 compatibility
//unsigned char uc_to_dict_char ( UChar c ) {
// if (c>255)c=0;
// return m_map[c];
// what language loaded
long m_lang;
// what charset does this language use
unsigned char m_charset;
// buffer to store the phonetic rules
char *m_rulesBuf;
long m_rulesBufSize;
// array of rule pointers, its size, and the number of rules
// NOTE(review): presumably the pointers point into m_rulesBuf -- confirm
char **m_rulesPtr;
long m_rulesPtrSize;
long m_numRules;
// points to the index of each rule that starts with a new character
// (one slot per possible char value, MAX_CHARS in total)
long m_ruleStarts[MAX_CHARS];
// the chars that are in a phonet
bool m_ruleChars[MAX_CHARS];
// buffers to store the dictionaries
char *m_distributedBuf;
long m_distributedBufSize;
// array of tuple pointers, its size, and the number of tuples
char **m_tuplePtr;
long m_tuplePtrSize;
long m_numTuples;
// total number of phonets
long m_numPhonets;
// narrow phrase
// NOTE(review): the loadNarrow() and narrowPhrase() declarations are
// commented out in this header, so these members may be unused -- confirm
char *m_narrowBuf;
long m_narrowBufSize;
long m_numNarrowPtrs;
char **m_frntPtrs;
char **m_bckPtrs;
// the trailing comments suggest these are indexed as a
// NUM_CHARS x NUM_CHARS x NUM_CHARS table -- confirm
long *m_frntCharPtrs;//[NUM_CHARS][NUM_CHARS][NUM_CHARS];
long *m_bckCharPtrs;//[NUM_CHARS][NUM_CHARS][NUM_CHARS];
// m_phonetics stores the hash of the phonetic as the key.
// the value is a composite: the high 32 bits are the index in m_tuplePtr
// where the list starts, and the low 32 bits are the number of words
// having the same phonetic
HashTableT <unsigned long long, unsigned long long > m_phonetics;
// hash table of the distributed pop words dictionary
// HashTableT <unsigned long, long> m_titlerecDict;
// hash table of the distributed pop words dictionary
HashTableT <unsigned long long, long>m_distributedPopPhrases;
// hash table of the top popular words in the dictionary
// HashTableT <unsigned long, char *> m_topPopPhrases;
// hash table of misspelled words
// (keyed by unsigned long, unlike m_dict which is keyed by
// unsigned long long)
HashTableT <unsigned long, bool>m_misp;
// hash table of wikipedia words
HashTableT <unsigned long, bool>m_wiki;
// PARMS, which can be adjusted. Currently all languages have the
// same adjustments, so using the same parms.
// edit-distance weights, one per edit kind, plus min and max
long m_editDistanceWeightsDel1;
long m_editDistanceWeightsDel2;
long m_editDistanceWeightsSwap;
long m_editDistanceWeightsSub;
long m_editDistanceWeightsSimilar;
long m_editDistanceWeightsMin;
long m_editDistanceWeightsMax;
// NOTE(review): presumably the two weights combined by weightedAverage(),
// which takes a soundslikeScore and a wordScore -- confirm
long m_soundslikeWeight;
long m_wordWeight;
long m_span;
// boolean switches; their effect is not visible in this header
bool m_followup;
bool m_collapseResult;
bool m_removeAccents;