// open-source-search-engine/Language.cpp
#include "Language.h"
#include "sort.h"
#include "Speller.h"
#include "Sections.h"
// word/phrase must be in at least this many docs to be included in our dict
#define MIN_DOCS 3
// ROUTINES NEEDED FOR GBSORT
// The dict is stored as a tuple of ( original word, phonetic, (lang, score)..)
int cmpPhonet (const void *v1, const void *v2) {
char *word1 = *(char **)v1;
// phrase
char *p1 = word1;
// phonetic
p1 += gbstrlen(p1) + 1;
char *word2 = *(char **)v2;
// phrase
char *p2 = word2;
// phonetic
p2 += gbstrlen(p2) + 1;
return strcmp(p1,p2);
}
int cmpScores (const void *v1, const void *v2) {
Reco r1 = *(Reco *) v1;
Reco r2 = *(Reco *) v2;
// three-way compare so gbsort() orders ascending by score;
// returning only 0/1 here would give an inconsistent ordering
if ( r1.score < r2.score ) return -1;
if ( r1.score > r2.score ) return  1;
return 0;
}
int cmpFrnt (const void *v1, const void *v2) {
// compare phrase
char *p1 = *(char **) v1;
char *p2 = *(char **) v2;
return strcmp ( p1,p2 );
}
int cmpBck (const void *v1, const void *v2) {
char *p1 = *(char **) v1;
char *p2 = *(char **) v2;
// string compare for reverse
// go to the end
p1 += gbstrlen(p1) - 1;
p2 += gbstrlen(p2) - 1;
while ( *p1 != '\0' && *p2 != '\0' ) {
if ( *p1 > *p2 )
return 1;
else if ( *p1 < *p2 )
return -1;
p1--;
p2--;
}
if ( *p1 == '\0' )
return -1;
if ( *p2 == '\0' )
return 1;
return 0;
}
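// NOTE on cmpBck: the backwards walk above has no explicit check for the
// start of either phrase; it appears to rely on the packed buffer layout
// used below, where a phrase is normally preceded by the '\0' terminator
// of the previous entry, so walking p1/p2 backwards eventually hits a '\0'
// byte. Treat it as tied to those buffers (it is currently only referenced
// by the commented-out loadNarrow() code) rather than as a general-purpose
// reverse strcmp.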
static char s_keyMap[] = { 10, 24, 22, 12, 2, 13, 14, 15, 7, 16,
17, 18, 26, 25, 8, 9 , 0 , 3 , 11, 4,
6 , 23, 1 , 21, 5, 20 };
static char s_keyboard[] = {'q' ,'w','e','r','t','y','u','i','o' ,'p' ,
'a' ,'s','d','f','g','h','j','k','l' ,'\0',
'z','x','c','v','b','n','m','\0','\0','\0'};
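// s_keyMap maps a lowercase letter ('a'-'z', indexed by letter - 'a') to
// its slot in s_keyboard, which lays a qwerty keyboard out as three rows
// of ten slots (unused slots hold '\0'). reduceScore() below uses the two
// tables to ask whether a substituted letter sits next to the intended one
// on the keyboard. Worked example:
//
//   char c      = 'k';                      // the typo
//   char bplace = s_keyMap['l' - 'a'];      // 18, the slot of the intended 'l'
//   // s_keyboard[bplace - 1] == 'k', so 'k' is the left neighbor of 'l'
//
// (illustration only; see reduceScore() for the full neighbor checks)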
//static void gotSummaryWrapper ( void *state );
//static void gotIndexListWrapper( void *state , RdbList *list );
//static void gotTermFreqsWrapper( void *state );
/*static void gotAffinityFreqs1Wrapper(void *state);
static void gotAffinityFreqs2Wrapper(void *state);*/
Language::Language(){
m_rulesBuf = NULL;
m_rulesBufSize = 0;
m_rulesPtr = NULL;
m_rulesPtrSize = 0;
m_distributedBuf = NULL;
m_distributedBufSize = 0;
m_tuplePtr = NULL;
m_tuplePtrSize = 0;
m_narrowBuf = NULL;
m_narrowBufSize = 0;
m_numNarrowPtrs = 0;
// Set to the default aspell parms
m_editDistanceWeightsDel1 = 95;
m_editDistanceWeightsDel2 = 95;
m_editDistanceWeightsSwap = 90;
m_editDistanceWeightsSub = 100;
m_editDistanceWeightsSimilar = 10;
m_editDistanceWeightsMin = 95;
m_editDistanceWeightsMax = 100;
m_soundslikeWeight = 15;
m_wordWeight = 85;
m_span = 50;
// . set m_map
// . this maps an ascii char to a char in dict space
// . used in loadNarrow
/*
for ( int32_t i = 0 ; i < 256 ; i++ ) {
unsigned char d = to_upper_ascii(i);
if ( is_alpha(d) ) {
// some like char 254 aren't really ascii!!
// so make them into Z's, a rare letter, which
// probably isn't in the same alphabet as 222 and 254
if ( d == 222 ) m_map[i] = 'Z' - 'A' + 12;
else if ( d == 254 ) m_map[i] = 'Z' - 'A' + 12;
else if ( d < 'A' ) m_map[i] = 38; // use apostrophes
else if ( d > 'Z' ) m_map[i] = 38; // use apostrophes
else m_map[i] = d - 'A' + 12;
continue;
}
if ( is_digit(d) ) m_map[i] = d - '0' + 2;
else if ( d == 0 ) m_map[i] = 0;
else if ( d == '\'' ) m_map[i] = 38;
else if ( d == '-' ) m_map[i] = 39;
else if ( d == '\n' ) m_map[i] = 0;
else m_map[i] = 1; // a space
}
*/
reset();
}
/*
bool Language::convertLatin1DictToUTF8( char *infile ){
// open the file for reading
FILE *fdr = fopen ( infile , "r" );
if ( ! fdr )
return log( "lang: Failed to open %s for reading: "
"%s.",infile, strerror(errno) );
char ff[1024];
// open for writing
sprintf ( ff , "%s.utf8", infile );
// delete it first
unlink ( ff );
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
return log("lang: Could not open for %s "
"writing: %s.",ff, strerror(errno));
}
char buf[1024];
char out[4*1024];
// this loop goes through all the words and only adds those
// words into the phonetic dict that have phonets.
while ( fgets ( buf , 1024 , fdr ) ) {
int32_t wlen = gbstrlen(buf);
if ( wlen <= 0 || wlen > MAX_PHRASE_LEN )
continue;
// remove the newline \n
buf [wlen - 1] = '\0';
int32_t outLen = latin1ToUtf8(out, 4*1024, buf, gbstrlen(buf));
// write out the trailing \n as well
out[outLen] = '\n';
outLen++;
int32_t wn = write ( fdw , out , outLen ) ;
if ( wn != outLen )
return log("lang: write: %s",
strerror(errno));
}
fclose(fdr);
close(fdw);
return true;
}
*/
Language::~Language(){
reset();
}
void Language::reset(){
if ( m_rulesBuf && m_rulesBufSize > 0 ){
mfree( m_rulesBuf, m_rulesBufSize, "LanguageBuf" );
m_rulesBuf = NULL;
m_rulesBufSize = 0;
}
if ( m_rulesPtr && m_rulesPtrSize > 0 ){
mfree( m_rulesPtr, m_rulesPtrSize, "LanguagePtrBuf" );
m_rulesPtr = NULL;
m_rulesPtrSize = 0;
}
if ( m_distributedBuf && m_distributedBufSize > 0 ){
mfree( m_distributedBuf, m_distributedBufSize,
"DistributedPtrBuf" );
m_distributedBuf = NULL;
m_distributedBufSize = 0;
}
if ( m_tuplePtr && m_tuplePtrSize >0 ){
mfree(m_tuplePtr, m_tuplePtrSize, "LanguageTuplePtr");
m_tuplePtr = NULL;
m_tuplePtrSize = 0;
}
if ( m_narrowBuf && m_narrowBufSize > 0 ){
mfree(m_narrowBuf, m_narrowBufSize, "LanguageNarrowBuf");
m_narrowBuf = NULL;
m_narrowBufSize = 0;
}
m_numRules = 0;
m_numTuples = 0;
m_followup = true;
m_collapseResult = false;
m_removeAccents = true;
}
bool Language::init( char *unifiedBuf, int32_t unifiedBufSize, int32_t lang,
int32_t hostsPerSplit, uint32_t myHash ){
reset();
if ( ! m_phonetics.set(256) ) return false;
if ( ! m_dict.set(256) ) return false;
if ( ! m_distributedPopPhrases.set(256) ) return false;
m_lang = lang;
m_charset = getLanguageCharset(m_lang);
// load the hashtable for getPhrasePopularity
//if ( !loadDict() )
// load the rules dictionary
if ( !loadRules( ) ||
!loadSpellerDict( unifiedBuf, unifiedBufSize, hostsPerSplit,
myHash ) ){
log ( LOG_INIT,"lang: Error initializing for "
"language %s", getLanguageAbbr(m_lang) );
return false;
}
//if ( g_conf.m_doNarrowSearch &&
// !loadNarrow( unifiedBuf, unifiedBufSize, hostsPerSplit, myHash) ){
// log ( LOG_INIT,"lang: Error initializing narrow search for "
// "language %s", getLanguageAbbr(m_lang) );
// // don't return since this isn't critical
// //return false
//}
return true;
}
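// Illustrative only: init() appears to be driven from the Speller code,
// which owns the unified dictionary buffer. A call is expected to look
// roughly like this sketch (the caller-side variable names here are
// hypothetical):
//
//   Language lang;
//   if ( ! lang.init ( unifiedBuf ,     // the whole unified dict buffer
//                      unifiedBufSize ,
//                      langEnglish ,    // language id
//                      hostsPerSplit ,  // hosts sharing the phonet space
//                      myHash ) )       // this host's slot, compared to
//                                       // phonetKey % hostsPerSplit
//       log ( "lang: init failed for %s" , getLanguageAbbr(langEnglish) );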
///////////////////////////////////////////////////////
// DICTIONARY LOADING ROUTINES BELOW HERE
//
// These load their files from g_hostdb.m_dir/dict/
///////////////////////////////////////////////////////
bool Language::loadRules ( ) {
char ff[1024];
File f;
sprintf ( ff , "%sdict/%s/%s_phonet.dat", g_hostdb.m_dir,
getLanguageAbbr(m_lang), getLanguageAbbr(m_lang));
f.set ( ff );
// open file
if ( ! f.open ( O_RDONLY ) ) {
log("lang: open: %s",mstrerror(g_errno));
return false;
}
// get file size
int32_t fileSize = f.getFileSize() ;
// store a \0 at the end
m_rulesBufSize = fileSize + 1;
// make buffer to hold all
m_rulesBuf = (char *) mmalloc( m_rulesBufSize, "LanguageBuf" );
if ( !m_rulesBuf ) {
g_errno = ENOMEM;
log("lang: mmalloc: %s",mstrerror(errno));
return false;
}
// read em all in
if ( ! f.read ( m_rulesBuf , fileSize , 0 ) ) {
log("lang: read: %s", mstrerror(g_errno));
return false;
}
m_rulesBuf[fileSize] = '\0';
// change \n to \0
for ( int32_t i = 0 ; i < m_rulesBufSize ; i++ ) {
if ( m_rulesBuf[i] != '\n' )
continue;
m_rulesBuf[i] = '\0';
}
f.close();
m_numRules = 0;
char *p = m_rulesBuf;
// This loop checks how many rules we have
while ( p < ( m_rulesBuf + m_rulesBufSize ) ){
// if it is a comment, skip
// if no line, skip
if ( *p == '#' || gbstrlen(p) == 0 || *p == ' ' ){
p += gbstrlen(p) + 1;
continue;
}
// we have a tuple
if ( strstr(p, "followup") == p ){
while ( *p != ' ' )
p++;
while ( *p == ' ' )
p++;
if ( *p != '1' )
m_followup = false;
}
else if ( strstr(p, "collapse_result") == p ){
while ( *p != ' ' )
p++;
while ( *p == ' ' )
p++;
if ( *p == '1' )
m_collapseResult = true;
}
else if ( strstr(p, "version") == p ){
while ( *p != ' ' )
p++;
while ( *p == ' ' )
p++;
if ( *p != '1' )
m_removeAccents = false;
}
// else this line is a rule (search string + replacement)
else
m_numRules += 2;
p += gbstrlen(p) + 1;
}
// allocate memory for the rule ptrs
// NOTE: this over-allocates (m_numRules * m_numRules pointers) even
// though only m_numRules slots get filled below
m_rulesPtrSize = m_numRules * sizeof ( char* ) * m_numRules;
m_rulesPtr = (char **) mmalloc(m_rulesPtrSize,"LanguagePtrBuf");
if ( !m_rulesPtr ){
g_errno = ENOMEM;
log("lang: mmalloc: %s",mstrerror(errno));
return false;
}
// zero the unused slots so getPhonetic() hits a NULL sentinel when it
// scans one entry past the last rule for a character
memset ( m_rulesPtr , 0 , m_rulesPtrSize );
// init
for ( int32_t i = 0; i < MAX_CHARS; i++) {
m_ruleStarts[i] = -1;
m_ruleChars[i] = false;
}
// do the loop again and assign the pointers
p = m_rulesBuf;
int32_t numRules = 0;
while ( p < ( m_rulesBuf + m_rulesBufSize ) ){
char *start = p;
// if it is a comment, skip
// if no line, skip
if ( *p == '#' || gbstrlen(p) == 0 || *p == ' ' ){
p += gbstrlen(p) + 1;
continue;
}
// we have a tuple
while ( *p != ' ' )
p++;
while ( *p == ' ' ){
*p = '\0';
p++;
}
// if the rule converts a letter into a '_' (blank)
if ( *p == '_' )
*p = '\0';
if ( strstr(start, "followup") == start ){
if ( *p != '1' )
m_followup = false;
}
else if ( strstr(start, "collapse_result") == start ){
if ( *p == '1' )
m_collapseResult = true;
}
else if ( strstr(start, "version") == start ){
if ( *p != '1' )
m_removeAccents = false;
}
// else this line is a rule (search string + replacement)
else{
m_rulesPtr[numRules++] = start;
m_rulesPtr[numRules++] = p;
// mark the chars that occur in the rule's replacement;
// let's just mark the first char. It seems to suffice.
if ( *p )
m_ruleChars[(int32_t)*p] = true;
}
p += gbstrlen(p) + 1;
}
// m_ruleStarts[i] holds the index into m_rulesPtr where the rules
// for character i start
for ( int32_t i = 0; i < numRules; i += 2) {
int32_t k = (UChar8) m_rulesPtr[i][0];
if ( m_ruleStarts[k] < 0 )
m_ruleStarts[k] = i;
}
// if ( m_lang == 2 || m_lang == 3 ) makeDict();
return true;
}
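// A sketch of what a *_phonet.dat rules file looks like, inferred from the
// parsing above and from getPhonetic() below (the concrete rules shipped
// with each language will differ; the rule lines here are made up):
//
//   # comment lines start with '#'
//   version 1
//   followup 1
//   collapse_result 0
//   PH F           <- replace "PH" with "F"
//   GH(EI) _       <- "GH" followed by E or I maps to nothing ('_' = blank)
//   C^ K           <- '^' : rule only matches at the beginning of a word
//   E$ _           <- '$' : rule only matches at the end of a word
//
// Each non-directive line is stored as two pointers in m_rulesPtr: the
// search string and its replacement.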
bool Language::loadSpellerDict( char *spellerBuf, int32_t spellerBufSize,
int32_t hostsPerSplit, uint32_t myHash ){
File distributedPopFile;
char ff[1024];
// load the distributed pop file
sprintf ( ff , "%sdict/%s/%s.query.phonet.%"INT32"", g_hostdb.m_dir,
getLanguageAbbr(m_lang), getLanguageAbbr(m_lang), myHash);
distributedPopFile.set ( ff );
if ( ! distributedPopFile.open ( O_RDONLY ) ) {
log("lang: open: %s. Generating from common pop file",
mstrerror(g_errno));
sprintf ( ff , "%sdict/%s/%s.query.phonet", g_hostdb.m_dir,
getLanguageAbbr(m_lang), getLanguageAbbr(m_lang));
// If we don't have the distributed pop file, open the
// common pop file and generate the distributed one
if ( !genDistributedPopFile( ff, myHash ))
return false;
// try opening the file now
if ( ! distributedPopFile.open ( O_RDONLY ) ) {
log("lang: open: %s",mstrerror(g_errno));
return false;
}
}
// get file sizes
int32_t distributedPopFileSize = distributedPopFile.getFileSize();
// store a \0 at the end
m_distributedBufSize = distributedPopFileSize + 1;
// make buffer to hold all
m_distributedBuf = (char *) mmalloc(m_distributedBufSize,
"DistributedPtrBuf");
if ( !m_distributedBuf) {
log("lang: mmalloc: %s",mstrerror(errno));return false;
}
char *p = m_distributedBuf;
// read em all in
if ( ! distributedPopFile.read ( p , distributedPopFileSize , 0 ) ){
log("lang: read: %s", mstrerror(g_errno));
return false;
}
m_distributedBuf[distributedPopFileSize] = '\0';
distributedPopFile.close();
// count the tuples that belong to this language that come from
// the wordlist and query file (i.e. that are not negative )
p = spellerBuf;
while ( p < spellerBuf + spellerBufSize - 1){
// first is the phrase
char *phrase = p;
// if line is a comment skip it
if ( *p == '#' ){
p += gbstrlen(p) + 1;
continue;
}
// skip phrase and move to phonet
p += gbstrlen(p) + 1 ;
char *phonet = p;
if ( p >= spellerBuf + spellerBufSize-1 ) break;
// skip phonet and move to (lang,score) tuples
p += gbstrlen(p) + 1;
if ( p >= spellerBuf + spellerBufSize-1 ) break;
// skip (lang, score) tuple
p += gbstrlen(p) + 1;
// check if phonet is present
if ( *phonet == '\0' )
continue;
uint64_t phonetKey = hash64Lower_utf8(phonet);
// check if this phonet belongs to this host
if ( phonetKey % hostsPerSplit != myHash )
continue;
uint64_t h = hash64d(phrase, gbstrlen(phrase));
// check if this phrase belongs to this language by calling
// the speller's getPhrasePopularity()
if ( g_speller.getPhrasePopularity( phrase, h, false,
m_lang ) <= 0 )
continue;
m_numTuples++;
}
// also count the distributed-dict tuples and change their \t and
// \n separators to \0
p = m_distributedBuf;
while ( p < m_distributedBuf + m_distributedBufSize ){
m_numTuples++;
while ( *p != '\n' &&
p < m_distributedBuf + m_distributedBufSize - 1) {
if ( *p == '\t' )
*p = '\0';
p++;
}
*p = '\0';
p++;
}
// tuples have already been counted
m_tuplePtrSize = m_numTuples * sizeof(char *);
m_tuplePtr = (char **) mmalloc ( m_tuplePtrSize, "LanguageTuplePtr" );
if ( !m_tuplePtr ) {
log("lang: mmalloc: %s",mstrerror(errno));return false;}
int32_t numTuples = 0;
// now go through the unified dict again and assign the pointers
p = spellerBuf;
while ( p < spellerBuf + spellerBufSize - 1){
// first is the phrase
char *phrase = p;
// if line is a comment skip it
if ( *p == '#' ){
p += gbstrlen(p) + 1;
continue;
}
// skip phrase and move to phonet
p += gbstrlen(p) + 1;
char *phonet = p;
if ( p >= spellerBuf + spellerBufSize - 1 ) break;
// skip phonet and move to (lang,score) tuples
p += gbstrlen(p) + 1;
if ( p >= spellerBuf + spellerBufSize - 1 ) break;
// skip (lang, score) tuple
p += gbstrlen(p) + 1;
if ( *phonet == '\0' )
continue;
uint64_t phonetKey = hash64Lower_utf8(phonet);
// check if this phonet belongs to this host
if ( phonetKey % hostsPerSplit != myHash )
continue;
uint64_t h = hash64d(phrase, gbstrlen(phrase));
// check if this phrase belongs to this language by calling
// the speller's getPhrasePopularity()
if ( g_speller.getPhrasePopularity( phrase, h, false,
m_lang ) <= 0 )
continue;
m_tuplePtr[numTuples] = phrase;
numTuples++;
}
// go through the distributed dict and assign the pointers
p = m_distributedBuf;
while ( p < m_distributedBuf + m_distributedBufSize ){
m_tuplePtr[numTuples++] = p;
// skip phrase
p += gbstrlen(p) + 1;
if ( p >= m_distributedBuf + m_distributedBufSize ) break;
// skip phonet
p += gbstrlen(p) + 1;
if ( p >= m_distributedBuf + m_distributedBufSize ) break;
// skip popularity
p += gbstrlen(p) + 1;
}
// sanity
for ( int32_t j = 0 ; j< numTuples ; j++ )
gbstrlen(m_tuplePtr[j]) ;
// sanity check
if ( numTuples != m_numTuples ){
char *xx = NULL; *xx = 0;
}
// kill the last one; it seems problematic with #define EFENCE in Mem.cpp
numTuples--;
m_numTuples--;
// sort the tuple ptrs according to their phonetics
gbsort( m_tuplePtr, m_numTuples, sizeof(char*), cmpPhonet );
char *tuple;
m_numPhonets = 0;
int32_t startIndex = 0;
int32_t index = 0;
while ( index < m_numTuples ) {
// The distributed dict is stored as a tuple of
// ( original phrase, phonetic, lang, score )
// first to come is the phrase
tuple = m_tuplePtr[index];
// move to the phonet
tuple += gbstrlen(tuple) + 1;
uint64_t phonetKey = hash64Lower_utf8 ( tuple );
if ( phonetKey % hostsPerSplit != myHash ){
index++;
continue;
}
int32_t numWordsInPhonet = 0;
startIndex = index;
while ( index < m_numTuples ){
// first to come is the phrase
tuple = m_tuplePtr[index];
char *phrase = m_tuplePtr[index];
// move to the phonet
tuple += gbstrlen(tuple) + 1;
uint64_t pKey = hash64Lower_utf8(tuple);
if ( pKey != phonetKey )
break;
// move to the popularity
tuple += gbstrlen(tuple) + 1;
// only add the distributed pop words if they come
// out of the distributed pop words dict
if (phrase > m_distributedBuf &&
phrase < m_distributedBuf + m_distributedBufSize){
// add the distributed pop words
uint64_t h = hash64d( phrase,
gbstrlen(phrase));
int32_t slot = m_distributedPopPhrases.
getSlot(h);
int32_t pop = atoi(tuple);
if ( slot == -1 )
m_distributedPopPhrases.addKey(h, pop);
}
numWordsInPhonet++;
index++;
}
int32_t slot = m_phonetics.getSlot ( phonetKey );
if ( slot != -1 ){
log(LOG_LOGIC, "speller: %"INT32" != -1, %16"XINT64", %s",
slot, phonetKey, tuple);
char *xx = NULL; *xx = 0;
}
// make the composite value
uint64_t value = startIndex;
// make it the higher 32 bits
value <<= 32;
value += numWordsInPhonet;
m_phonetics.addKey( phonetKey, value );
m_numPhonets++;
}
log(LOG_INIT,"lang: Read %"INT32" words and %"INT32" phonets into memory",
m_numTuples, m_numPhonets );
return true;
}
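// Layout notes for the structures built above (descriptive, based on the
// parsing code):
//
//   unified buf entry   :  "phrase\0" "phonet\0" "(lang,score)..\0"
//   distributed entry   :  "phrase\0" "phonet\0" "pop\0"   (the tabs and
//                          newlines were rewritten to '\0' above)
//
// m_phonetics maps hash64Lower_utf8(phonet) to a packed 64-bit value:
//
//   uint64_t value = ((uint64_t)startIndex << 32) | numWordsInPhonet;
//   int32_t  index = value >> 32;          // first slot in m_tuplePtr
//   int32_t  count = value & 0xffffffff;   // tuples sharing this phonet
//
// tryPhonet() below unpacks it exactly this way.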
/*
bool Language::loadNarrow( char *spellerBuf, int32_t spellerBufSize,
int32_t hostsPerSplit, uint32_t myHash ){
// don't load for any other language except english
if ( m_lang != langEnglish )
return true;
// first find out how many phrases have more than 1 word
// count the tuples that belong to this language that come from
// the wordlist and query file (i.e. that are not negative )
char *p = spellerBuf;
while ( p < spellerBuf + spellerBufSize - 1){
// first is the phrase
char *phrase = p;
// if line is a comment skip it
if ( *p == '#' ){
p += gbstrlen(p) + 1;
continue;
}
// skip phrase and move to phonet
p += gbstrlen(p) + 1;
char *phonet = p;
// skip phonet and move to (lang,score) tuples
p += gbstrlen(p) + 1;
// skip (lang, score) tuple
p += gbstrlen(p) + 1;
uint64_t h = hash64d(phrase, gbstrlen(phrase));
// check if this phrase belongs to this language
// can do that by calling spellers getphrasepopularity
if ( g_speller.
getPhrasePopularity( phrase, h, false, m_lang ) <= 0 ){
continue;
}
// check if phonet it present
if ( *phonet == '\0' ){
continue;
}
uint64_t phonetKey = hash64Lower_utf8(phonet);
// check if this phonet belongs to this host
if ( phonetKey % hostsPerSplit != myHash ){
continue;
}
// make sure the phrase has 3 or more letters
if ( gbstrlen(phrase) < 3 )
continue;
// check if the phrase has more than 1 word
bool isPhrase = false;
char *q = phrase;
while ( *q != '\0' ){
if ( *q == ' ' )
isPhrase = true;
q++;
}
if ( !isPhrase )
continue;
m_numNarrowPtrs++;
}
p = m_distributedBuf;
while ( p < m_distributedBuf + m_distributedBufSize ){
// first is the phrase
char *phrase = p;
// if line is a comment skip it
if ( *p == '#' ){
p += gbstrlen(p) + 1;
continue;
}
// skip phrase and move to phonet
p += gbstrlen(p) + 1;
// skip phonet
p += gbstrlen(p) + 1;
// skip popularity
p += gbstrlen(p) + 1;
// make sure the phrase has 3 or more letters
if ( gbstrlen(phrase) < 3 )
continue;
// check if the phrase has more than 1 word
bool isPhrase = false;
char *q = phrase;
while ( *q != '\0' ){
if ( *q == ' ' )
isPhrase = true;
q++;
}
if ( !isPhrase )
continue;
m_numNarrowPtrs++;
}
// allocate memory for that
// also allocate memory for the m_frntCharPtrs and m_bckCharPtrs
m_narrowBufSize = 2 * sizeof (char *) * m_numNarrowPtrs +
( NUM_CHARS * NUM_CHARS * NUM_CHARS * 4 * 2 );
m_narrowBuf = (char *) mmalloc( m_narrowBufSize, "LanguageNarrowBuf" );
if ( !m_narrowBuf ){
log("lang: Could not allocate %"INT32" bytes for narrow buf",
m_narrowBufSize);
g_errno = ENOMEM;
return false;
}
p = m_narrowBuf;
m_frntPtrs = (char **) p;
p += sizeof(char **) * m_numNarrowPtrs;
m_bckPtrs = (char **) p;
p += sizeof(char *) * m_numNarrowPtrs;
m_frntCharPtrs = (int32_t *) p;
p += NUM_CHARS * NUM_CHARS * NUM_CHARS * 4;
m_bckCharPtrs = (int32_t *)p;
p += NUM_CHARS * NUM_CHARS * NUM_CHARS * 4;
int32_t numNarrowPtrs = 0;
// go through the loop again and set the positions
p = spellerBuf;
while ( p < spellerBuf + spellerBufSize - 1){
// first is the phrase
char *phrase = p;
// if line is a comment skip it
if ( *p == '#' ){
p += gbstrlen(p) + 1;
continue;
}
// skip phrase and move to phonet
p += gbstrlen(p) + 1;
char *phonet = p;
// skip phonet and move to (lang,score) tuples
p += gbstrlen(p) + 1;
// skip (lang, score) tuple
p += gbstrlen(p) + 1;
uint64_t h = hash64d(phrase, gbstrlen(phrase));
// check if this phrase belongs to this language
// can do that by calling spellers getphrasepopularity
if ( g_speller.
getPhrasePopularity( phrase, h, false, m_lang ) <= 0 ){
continue;
}
// check if phonet it present
if ( *phonet == '\0' ){
continue;
}
uint64_t phonetKey = hash64Lower_utf8(phonet);
// check if this phonet belongs to this host
if ( phonetKey % hostsPerSplit != myHash ){
continue;
}
// make sure the phrase has 3 or more letters
if ( gbstrlen(phrase) < 3 )
continue;
// check if the phrase has more than 1 word
bool isPhrase = false;
char *q = phrase;
while ( *q != '\0' ){
if ( *q == ' ' )
isPhrase = true;
q++;
}
if ( !isPhrase )
continue;
m_frntPtrs[numNarrowPtrs] = phrase;
m_bckPtrs[numNarrowPtrs] = phrase;
numNarrowPtrs++;
}
p = m_distributedBuf;
while ( p < m_distributedBuf + m_distributedBufSize ){
// skip phrase
char *phrase = p;
// if line is a comment skip it
if ( *p == '#' ){
p += gbstrlen(p) + 1;
continue;
}
p += gbstrlen(p) + 1;
// skip phonet
p += gbstrlen(p) + 1;
// skip popularity
p += gbstrlen(p) + 1;
// make sure the phrase has 3 or more letters
if ( gbstrlen(phrase) < 3 )
continue;
// check if the phrase has more than 1 word
bool isPhrase = false;
char *q = phrase;
while ( *q != '\0' ){
if ( *q == ' ' )
isPhrase = true;
q++;
}
if ( !isPhrase )
continue;
m_frntPtrs[numNarrowPtrs] = phrase;
m_bckPtrs[numNarrowPtrs] = phrase;
numNarrowPtrs++;
}
// sanity check
if ( numNarrowPtrs != m_numNarrowPtrs ){
log(LOG_LOGIC, "speller: %"INT32" != %"INT32" numNarrowPtrs",
numNarrowPtrs, m_numNarrowPtrs);
char *xx=NULL; *xx=0;
}
// sort the front pointers and back pointers
gbsort ( m_frntPtrs, m_numNarrowPtrs, sizeof(char*), cmpFrnt );
gbsort ( m_bckPtrs, m_numNarrowPtrs, sizeof(char*), cmpBck );
// printing them out
//for ( int32_t i = 0; i < m_numNarrowPtrs; i++ )
// log ( "lang: frnt=%s\t\t bck=%s",
// m_frntPtrs[i] + gbstrlen(m_frntPtrs[i]) + 1,
// m_bckPtrs[i] + gbstrlen(m_bckPtrs[i]) + 1);
// now set the m_frntCharPtrs and m_bckCharPtrs
for ( int32_t i = 0; i < NUM_CHARS * NUM_CHARS * NUM_CHARS; i++ ){
m_frntCharPtrs[i] = -1;
m_bckCharPtrs[i] = -1;
}
for ( int32_t i = 0; i < m_numNarrowPtrs; i++ ){
// align to the phrase
char *frnt = m_frntPtrs[i];
char *bck = m_bckPtrs[i];
bck += gbstrlen(bck) - 1;
char f0 = to_dict_char(frnt[0]);
char f1 = to_dict_char(frnt[1]);
char f2 = to_dict_char(frnt[2]);
char b0 = to_dict_char(bck[0]);
char b1 = to_dict_char(bck[-1]);
char b2 = to_dict_char(bck[-2]);
int32_t fx = f0 * NUM_CHARS * NUM_CHARS + f1 * NUM_CHARS + f2;
int32_t bx = b0 * NUM_CHARS * NUM_CHARS + b1 * NUM_CHARS + b2;
if ( m_frntCharPtrs[fx] == -1 )
m_frntCharPtrs[fx]= i;
if ( m_bckCharPtrs[bx] == -1 )
m_bckCharPtrs[bx] = i;
}
return true;
}
*/
bool Language::loadDictHashTable( ){
char ff[MAX_FRAG_SIZE];
// first load the language dict
// open the input file
FILE *fdr;
sprintf ( ff , "%sdict/%s/%s.wl.phonet", g_hostdb.m_dir,
getLanguageAbbr(m_lang), getLanguageAbbr(m_lang) );
// then open
fdr = fopen ( ff, "r" );
if ( !fdr )
return log("lang: Could not open %s for reading: "
"%s.", ff, strerror(errno));
char buf[1024];
// this loop goes through all the words
while ( fgets ( buf , 1024 , fdr ) ) {
int32_t wlen = gbstrlen(buf);
if ( wlen <= 0 || wlen > MAX_PHRASE_LEN )
continue;
// remove the newline \n
buf [wlen - 1] = '\0';
char *p = buf;
int32_t pop = atoi(p);
// move to the phrase
while ( *p != '\t' )
p++;
p++;
char *phrase = p;
// move to the next tab before the phonetic
while ( *p != '\t' )
p++;
uint64_t key = hash64d( phrase, p - phrase);
int32_t slot = m_dict.getSlot(key);
int32_t value = 0;
if ( slot != -1 ){
value = m_dict.getValueFromSlot(slot);
if ( pop < value )
continue;
}
m_dict.addKey( key, pop );
}
fclose(fdr);
// now for the top pop words from the query log
sprintf ( ff , "%sdict/%s/%s.query.phonet.top", g_hostdb.m_dir,
getLanguageAbbr(m_lang), getLanguageAbbr(m_lang) );
// then open
fdr = fopen ( ff, "r" );
if ( !fdr )
return log("lang: Could not open %s for reading: "
"%s.", ff, strerror(errno));
// this loop goes through all the words
while ( fgets ( buf , 1024 , fdr ) ) {
int32_t wlen = gbstrlen(buf);
if ( wlen <= 0 || wlen > MAX_PHRASE_LEN )
continue;
// remove the newline \n
buf [wlen - 1] = '\0';
char *p = buf;
int32_t pop = atoi(p);
// move to the phrase
while ( *p != '\t' )
p++;
p++;
char *phrase = p;
// move to the next tab before the phonetic
while ( *p != '\t' )
p++;
// hash the phrase itself, not p (which points at the tab)
uint64_t key = hash64d( phrase, p - phrase);
int32_t slot = m_dict.getSlot(key);
int32_t value = 0;
if ( slot != -1 ){
value = m_dict.getValueFromSlot(slot);
if ( pop < value )
continue;
}
m_dict.addKey( key, pop );
}
fclose(fdr);
// now for the title rec dicts. If the phrase is only present in the
// titlerec dict then store it as a negative value
for ( int32_t i = 0; i < NUM_CHARS; i++ ){
// open the input file
FILE *fdr;
sprintf ( ff , "%sdict/%s/%s.dict.%"INT32"", g_hostdb.m_dir,
getLanguageAbbr(m_lang), getLanguageAbbr(m_lang), i);
// then open
fdr = fopen ( ff, "r" );
if ( !fdr )
return log("lang: Could not open %s for reading: "
"%s.", ff, strerror(errno));
// this loop goes through all the words and only adds those
// words into the phonetic dict that have phonets.
while ( fgets ( buf , 1024 , fdr ) ) {
int32_t wlen = gbstrlen(buf);
if ( wlen <= 0 || wlen > MAX_PHRASE_LEN )
continue;
// remove the newline \n
buf [wlen - 1] = '\0';
char *p = buf;
int32_t pop = ( atoi(p) * 32000 )/ 10000;
// move to the phrase
while ( *p != '\t' )
p++;
p++;
uint64_t key = hash64d( p, gbstrlen(p) );
// add only if it is not found in english dict and
// query dict
int32_t slot = m_dict.getSlot(key);
int32_t value = 0;
if ( slot != -1 ){
value = m_dict.getValueFromSlot(slot);
if ( pop < value )
continue;
}
// if phrase is only present in the title rec, store
// as a negative value
else
pop *= -1;
m_dict.addKey( key, pop );
}
fclose(fdr);
}
return true;
}
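// The dict files read above are expected to contain one entry per line,
// in the form the parsing implies:
//
//   <popularity>\t<phrase>\t<phonet>\n
//
// For the *.dict.N title-rec files the popularity is rescaled by
// 32000/10000 and stored negated when the phrase appears only in the
// title recs.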
bool Language::loadWikipediaWords(){
// open the wikipedia file
char ff[1024];
sprintf ( ff , "%sdict/%s/%s.wiki", g_hostdb.m_dir,
getLanguageAbbr(m_lang), getLanguageAbbr(m_lang));
FILE *fdr = fopen ( ff, "r" );
if ( ! fdr ) {
return log("lang: Could not open for mispelled words"
"reading: %s.",strerror(errno));
}
m_wiki.set(1024);
char buf[1024];
// go through the words in dict/words
while ( fgets ( buf , 1024 , fdr ) ) {
// length of word(s), including the terminating \n
int32_t wlen = gbstrlen(buf) ;
// skip if empty
if ( wlen <= 0 ) continue;
buf[wlen-1]='\0';
uint32_t key = hash32d(buf, gbstrlen(buf));
int32_t slot = m_wiki.getSlot ( key );
// skip duplicates
if ( slot != -1 )
continue;
m_wiki.addKey(key,1);
}
fclose(fdr);
return true;
}
bool Language::loadMispelledWords(){
char ff [1024];
// also open the commonly misspelled words file
sprintf ( ff , "%sdict/%s/%s.misp", g_hostdb.m_dir,
getLanguageAbbr(m_lang), getLanguageAbbr(m_lang));
FILE *fdr = fopen ( ff, "r" );
if ( ! fdr ) {
return log("lang: Could not open for mispelled words"
"reading: %s.",strerror(errno));
}
m_misp.set(1024);
char buf[1024];
// go through the words in dict/words
while ( fgets ( buf , 1024 , fdr ) ) {
// length of word(s), including the terminating \n
int32_t wlen = gbstrlen(buf) ;
// skip if empty
if ( wlen <= 0 ) continue;
buf[wlen-1]='\0';
uint32_t key = hash32d(buf, gbstrlen(buf));
int32_t slot = m_misp.getSlot ( key );
if ( slot != -1 ){
char *xx=NULL; *xx=0;
}
m_misp.addKey(key,1);
}
fclose(fdr);
return true;
}
///////////////////////////////////////////////////////
// LANGUAGE RECOMMENDATION ROUTINES BELOW HERE
//
///////////////////////////////////////////////////////
/*
int32_t Language::narrowPhrase ( char *request, char *phrases, int32_t *pops,
int32_t maxPhrases ){
// if we haven't been loaded, just return
if ( m_numNarrowPtrs == 0 )
return 0;
int32_t numPhrases = 0;
int32_t requestLen = gbstrlen(request);
// don't check for narrow phrase if the original phrase is more than
// MAX_PHRASE_LEN - 3 OR less than 3 chars.
// Why MAX_PHRASE_LEN - 3 ? Because then only can we find a narrow
// phrase
if ( requestLen > MAX_PHRASE_LEN - 3 || requestLen < 3 )
return numPhrases;
// get the start and end two chars and convert them to dict_char
char f0 = to_dict_char(request[0]);
char f1 = to_dict_char(request[1]);
char f2 = to_dict_char(request[2]);
char *bck = request + requestLen - 1;
char b0 = to_dict_char(bck[0]);
char b1 = to_dict_char(bck[-1]);
char b2 = to_dict_char(bck[-2]);
uint64_t start = gettimeofdayInMilliseconds();
int32_t minPop = 0;
char req[MAX_PHRASE_LEN];
// first get all the ones in the front
strcpy(req, request);
// add a space so that we match the exact phrase
req[requestLen] = ' ';
req[requestLen + 1] = '\0';
int32_t fx = f0 * NUM_CHARS * NUM_CHARS + f1 * NUM_CHARS + f2;
int32_t index = m_frntCharPtrs[fx];
if ( index == -1 )
goto skipFrnt;
while ( index < m_numNarrowPtrs ){
char *tuple = m_frntPtrs[index++];
char *phrase = tuple;
//check if we have gone over the phrase (if present) or not
int32_t cmp = strncasecmp (phrase, req, gbstrlen(req));
if ( cmp > 0 )
break;
if ( cmp < 0 )
continue;
// found it. get the popularity
int32_t pop = 0;
// if its from the distributed dict, get it directly
if ( tuple > m_distributedBuf &&
tuple < m_distributedBuf + m_distributedBufSize ){
// skip the phrase
tuple += gbstrlen(tuple) + 1;
// skip the phonet
tuple += gbstrlen(tuple) + 1;
pop = atoi(tuple);
}
// else get it by getphrasePopularity
else {
uint64_t h = hash64d(phrase, gbstrlen(phrase));
pop = g_speller.getPhrasePopularity(phrase, h, false,
m_lang);
}
int32_t indx = numPhrases;
// if not full
if ( numPhrases < maxPhrases )
numPhrases++;
// if full
else{
if ( minPop >= pop )
continue;
int32_t minIndx = 0;
minPop = pops[0];
for ( int32_t j = 1; j < maxPhrases; j++ ){
if ( minPop < pops[j] )
continue;
minPop = pops[j];
minIndx = j;
}
if ( minPop >= pop )
continue;
indx = minIndx;
minPop = pop;
}
// store the pop
pops[indx] = pop;
strcpy ( &phrases[MAX_FRAG_SIZE * indx],phrase );
log (LOG_DEBUG,"speller: Narrow phrase=%s, pop=%"INT32"",
&phrases[MAX_FRAG_SIZE * indx], pops[indx]);
}
skipFrnt:
// now get the back
req[0] = ' ';
strcpy(&req[1],request);
int32_t bx = b0 * NUM_CHARS * NUM_CHARS + b1 * NUM_CHARS + b2;
index = m_bckCharPtrs[bx];
if ( index == -1 )
return numPhrases;
while ( index < m_numNarrowPtrs ){
char *tuple = m_bckPtrs[index++];
char *phrase = tuple;
//check if we have gone over the phrase (if present) or not
// cannot use strcasecmp because we compare from the back
char *p1 = phrase + gbstrlen(phrase) - 1;
char *p2 = req + gbstrlen(req) - 1;
while ( p1 >= phrase && p2 >= req ) {
if ( *p1 != *p2 )
break;
p1--;
p2--;
}
if ( p2 >= req || p1 < phrase ){
if ( *p1 > *p2 )
break;
continue;
}
// found it
int32_t pop = 0;
// if its from the distributed dict, get it directly
if ( tuple > m_distributedBuf &&
tuple < m_distributedBuf + m_distributedBufSize ){
// skip the phrase
tuple += gbstrlen(tuple) + 1;
// skip the phonet
tuple += gbstrlen(tuple) + 1;
pop = atoi(tuple);
}
// else get it by getphrasePopularity
else {
uint64_t h = hash64d(phrase, gbstrlen(phrase));
pop = g_speller.getPhrasePopularity(phrase, h, false,
m_lang);
}
int32_t indx = numPhrases;
// if not full
if ( numPhrases < maxPhrases )
numPhrases++;
// if full
else{
if ( minPop >= pop )
continue;
int32_t minIndx = 0;
minPop = pops[0];
for ( int32_t j = 1; j < maxPhrases; j++ ){
if ( minPop < pops[j] )
continue;
minPop = pops[j];
minIndx = j;
}
if ( minPop >= pop )
continue;
indx = minIndx;
minPop = pop;
}
// store the pop
pops[indx] = pop;
strcpy ( &phrases[MAX_FRAG_SIZE * indx],phrase );
log (LOG_DEBUG,"speller: Narrow phrase=%s, pop=%"INT32"",
&phrases[MAX_FRAG_SIZE * indx], pops[indx]);
}
uint64_t took = gettimeofdayInMilliseconds() - start;
if ( took > 5)
log ( LOG_WARN,"lang: Finding narrow phrases took %"INT64" ms",
took );
return numPhrases;
}
*/
// . return the clean buffer that can be spellchecked
// . in utf8 always now
bool Language::makeClean( char *src, int32_t srcSize,
char *dst, int32_t dstSize ) {
//char *pin = inBuf;
//char *pout = outBuf;
char *srcEnd = src + srcSize;
char *dstEnd = dst + dstSize;
char cs;
//while ( pout - outBuf < outBufSize && *pin != '\0' ){
for ( ; src < srcEnd ; src += cs ) {
cs = getUtf8CharSize ( src );
//UChar32 c = 0;
//if ( isUTF16 )
// c = utf16Decode( (UChar *)pin, &(UChar *)pin );
//else
// c = utf8Decode ( pin, &pin );
// Since we're english cannot check anything but ASCII
//if ( c > 0x7f )
// return false;
//if (!ucIsAlnum(c) && !ucIsWhiteSpace(c) && c != (int32_t)'\'' &&
// c != (int32_t)' ' && c != (int32_t)'-' )
// return false;
// skip more advanced forms of punct
if ( ! is_alnum_utf8 ( src ) &&
! is_wspace_utf8 ( src ) &&
*src != '\'' &&
*src != ' ' &&
*src != '-' )
return false;
// return false to avoid overflow
if ( dst + 5 >= dstEnd ) return false;
if ( cs == 1 ) *dst++ = to_upper_a (*src);
else dst += to_upper_utf8 ( dst , src );
// write the char as upper case
//dst += getClean ( dst , src );
}
// null end it
*dst = '\0';
return true;
}
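// makeClean() behavior sketch (illustrative inputs):
//
//   makeClean ( "brittany" , 8 , dst , dstSize )  -> true,  dst = "BRITTANY"
//   makeClean ( "don't go" , 8 , dst , dstSize )  -> true,  dst = "DON'T GO"
//   makeClean ( "foo@bar"  , 7 , dst , dstSize )  -> false  ('@' is rejected)
//
// i.e. it upper-cases the utf8 input and refuses anything that is not
// alnum, whitespace, apostrophe or hyphen.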
// Returns true if a recommendation was found and copied into
// 'recommendation'. First finds candidates by the soundslike (phonetic)
// score, then tries to split the word and scores the pieces by word score.
// Keeps the top MAX_RECOMMENDATIONS candidates and picks the
// highest-popularity one among them.
bool Language::getRecommendation( char *origWord, int32_t origWordLen,
char *recommendation, int32_t recommendationLen,
bool *found, int32_t *score, int32_t *popularity,
bool forceReco ){
// if rules and words are not loaded, return
if ( m_numRules == 0 || m_numTuples == 0 )
return true;
// don't check for recommendation if the original phrase is more than
// MAX_PHRASE_LEN - 1
if ( origWordLen > MAX_PHRASE_LEN - 1 )
return false;
char origPhonet[MAX_PHRASE_LEN];
char origClean[MAX_PHRASE_LEN];
char possiblePhonet[ MAX_PHRASE_LEN ];
Reco recos[MAX_RECOMMENDATIONS];
// also keep the lowest score that we've found.
int32_t lowestScore = LARGE_SCORE;
/*char recos[MAX_RECOMMENDATIONS][MAX_PHRASE_LEN];
int32_t recoScores[MAX_RECOMMENDATIONS];*/
int32_t numRecos = 0;
// null end recommendation in case we don't find anything.
*recommendation = '\0';
*found = false;
*score = LARGE_SCORE;
*popularity = 0;
// no recommendations for 1 letter words
if ( origWordLen < 2 )
return false;
// no recommendation if the word is found in the dictionary
if ( !forceReco ){
// if we are spell checking a query then we start with the
// phrases and then move on to individual words. This should
// eliminate bugs like saying "brittany spears" is correct
// because the phrase shall be checked before individual words
uint64_t h = hash64d( origWord, gbstrlen(origWord));
if ( g_speller.getPhrasePopularity( origWord,
h, false ) != 0 ){
*found = true;
return false;
}
// check if it is present in the distributed dictionary
if ( m_distributedPopPhrases.getSlot ( h ) != -1 ){
*found = true;
return false;
}
}
//int32_t minRecoScore = LARGE_SCORE;
// clean the word, i.e. convert word to uppercase and
// remove possible accents
if ( !makeClean ( origWord, origWordLen, origClean, MAX_PHRASE_LEN) )
return false;
// memset ( phonet, '\0', MAX_PHRASE_LEN );
// get the phonetic
getPhonetic ( origClean, gbstrlen(origClean), origPhonet,
MAX_PHRASE_LEN );
log ( LOG_DEBUG,"speller: original - %s %s %s",origWord,
origClean, origPhonet );
// this is the max score that we are trying to get
// this is the radius around the misspelled word that we are checking
int32_t tryForScore = 3 * ( m_wordWeight * m_editDistanceWeightsMax )/100;
// decrease score by 50pc if the length of the phonet is less than 5
// decrease score by 20pc if the length of the phonet is less than 7
if ( gbstrlen(origPhonet) < 5 ) tryForScore -= tryForScore / 2;
else if ( gbstrlen(origPhonet) < 7 ) tryForScore -= tryForScore / 5;
// first try the same phonetic as the original word
int32_t origLen = gbstrlen(origPhonet);
// first add the original
strcpy ( possiblePhonet, origPhonet );
// get recos from this phonet
numRecos = tryPhonet( possiblePhonet, origPhonet,
origClean, tryForScore,
recos, numRecos, &lowestScore );
// generate different phonets using addition, deletion, substitution
// and swapping.
// ADDITION
for ( int32_t i = 0; i < origLen + 1; i++ ){
for ( int32_t j = 0; j < MAX_CHARS; j++ ){
if ( !m_ruleChars[j] ) continue;
char *p = possiblePhonet;
// first put in all the chars that are before the char
// to be added
gbmemcpy ( p, origPhonet, i ); p += i;
// the index of m_ruleChars[] is the char to be added
*p++ = j;
gbmemcpy ( p, origPhonet + i, origLen - i );
p += origLen - i;
*p++ = '\0';
numRecos = tryPhonet( possiblePhonet, origPhonet,
origClean, tryForScore,
recos, numRecos, &lowestScore );
}
}
// DELETION
for ( int32_t i = 0; i < origLen; i++ ){
char *p = possiblePhonet;
// put the chars that come before the deleted char
gbmemcpy ( p, origPhonet, i ); p += i;
// put the chars that come after the deleted char
gbmemcpy ( p, origPhonet + i + 1, origLen - i - 1 );
p += origLen - i - 1;
*p++ = '\0';
numRecos = tryPhonet( possiblePhonet, origPhonet,
origClean, tryForScore,
recos, numRecos, &lowestScore );
}
// SUBSTITUTION
for ( int32_t i = 0; i < origLen; i++ ){
for ( int32_t j = 0; j < MAX_CHARS; j++ ){
if ( !m_ruleChars[j] ) continue;
char *p = possiblePhonet;
// cannot substitute if both chars are the same
if ( j == *( origPhonet + i ) ) continue;
// put the chars that come before the substituted char
gbmemcpy ( p, origPhonet, i ); p += i;
// substitute the char
*p++ = j;
// put the chars that come after the substituted char
gbmemcpy ( p, origPhonet + i + 1, origLen - i - 1);
p += origLen - i - 1;
*p++ = '\0';
numRecos = tryPhonet( possiblePhonet, origPhonet,
origClean, tryForScore,
recos, numRecos, &lowestScore );
}
}
// SWAPPING
for ( int32_t i = 0; i < origLen - 1; i++ ){
char *p = possiblePhonet;
// cannot swap if both chars are the same
if ( *( origPhonet + i ) == *( origPhonet + i + 1 ) ) continue;
// put the chars that come before the swapped char
gbmemcpy ( p, origPhonet, i ); p += i;
//swap the chars
*p++ = *( origPhonet + i + 1);
*p++ = *( origPhonet + i );
// put the chars that come after the swapped chars
gbmemcpy ( p, origPhonet + i + 2, origLen - i - 2);
p += origLen - i - 2;
*p++ = '\0';
numRecos = tryPhonet( possiblePhonet, origPhonet,
origClean, tryForScore,
recos, numRecos, &lowestScore );
}
// check if splitting the word gives us any good recommendations
// this works like the try_split() function of aspell in suggest.cpp
// don't split the word if it's fewer than 4 chars
if ( gbstrlen(origWord) < 4 )
goto skipSplit;
// copy it over to another string
char splitWord[MAX_PHRASE_LEN];
strcpy ( splitWord, origWord );
splitWord[ gbstrlen(splitWord) + 1 ] = '\0';
splitWord[ gbstrlen(splitWord) ] = splitWord[ gbstrlen(splitWord) - 1 ];
for ( int32_t i = gbstrlen( origWord ) - 2; i >= 2; --i) {
splitWord[i+1] = splitWord[i];
splitWord[i] = '\0';
uint64_t h = hash64d ( splitWord, gbstrlen(splitWord));
// check if the split words exist in the dictionary
int32_t pop = g_speller.getPhrasePopularity(splitWord,h,false);
if ( pop == 0 ){
// check the distributed dict also
int32_t slot = m_distributedPopPhrases.getSlot(h);
if ( slot != -1 )
pop = m_distributedPopPhrases.
getValueFromSlot(slot);
if ( pop == 0 )
continue;
}
h = hash64d ( splitWord + i + 1, gbstrlen(splitWord + i + 1));
pop = g_speller.getPhrasePopularity( splitWord + i + 1, h,
false );
if ( pop == 0 ){
// check the distributed dict also
int32_t slot = m_distributedPopPhrases.getSlot(h);
if ( slot != -1 )
pop = m_distributedPopPhrases.
getValueFromSlot(slot);
if ( pop == 0 )
continue;
}
// replace the '\0' in between the split with a ' '
splitWord[i] = ' ';
int32_t wordScore = m_editDistanceWeightsDel2 * 3 / 2;
char phonetReco[MAX_PHRASE_LEN];
// get phonetic
getPhonetic ( splitWord, gbstrlen(splitWord), phonetReco,
MAX_PHRASE_LEN );
int32_t soundslikeScore = editDistance ( origPhonet,
phonetReco );
// the final score taking into consideration the
// phonetic score as well as the word score
int32_t score = weightedAverage ( soundslikeScore, wordScore );
if ( score > tryForScore + m_span )
continue;
// also continue if the score is greater than 2*lowestScore,
// because then this reco doesn't have a chance
if ( score > lowestScore * 2 )
continue;
// change the lowest score if needed
if ( score < lowestScore )
lowestScore = score;
// try to add this to the recommendations
/*log ( LOG_WARN, "lang: reco=%s wordScore=%"INT32" "
"phonetScore=%"INT32" score=%"INT32"",
splitWord, wordScore, soundslikeScore, score );*/
if ( numRecos < MAX_RECOMMENDATIONS ){
strcpy ( recos[numRecos].reco, splitWord );
recos[numRecos].score = score;
numRecos++;
continue;
}
int32_t maxScore = 0;
int32_t maxIndex = 0;
// find the largest score
for ( int32_t k = 0; k < numRecos; k++ ){
if ( recos[k].score > maxScore ){
maxScore = recos[k].score;
maxIndex = k;
}
}
// skip this reco unless its score beats (is no larger than)
// the worst score currently stored
if ( score > maxScore )
continue;
strcpy ( recos[maxIndex].reco, splitWord );
recos[maxIndex].score = score;
}
skipSplit:
// if no recos return
if ( numRecos == 0 )
return false;
// sort the recos according to their scores
gbsort ( recos, numRecos, sizeof(Reco), cmpScores );
log ( LOG_DEBUG, "speller: --------Top Recos--------" );
// select the best recommendation among them by score
int32_t bestRecoIndex = 0;
int32_t bestRecoPop = -1;
for ( int32_t i = 0; i < numRecos; i++ ){
uint64_t h = hash64d ( recos[i].reco,
gbstrlen(recos[i].reco));
int32_t pop = g_speller.getPhrasePopularity(recos[i].reco, h,
false);
if ( pop == 0 ){
// check the distributed dict also
int32_t slot = m_distributedPopPhrases.getSlot(h);
if ( slot != -1 )
pop = m_distributedPopPhrases.
getValueFromSlot(slot);
}
if ( ( recos[i].score < ( recos[bestRecoIndex].score * 2 ) &&
pop > ( bestRecoPop * 4 ) ) ||
( recos[i].score == recos[bestRecoIndex].score &&
pop > bestRecoPop ) ){
bestRecoPop = pop;
bestRecoIndex = i;
}
log ( LOG_DEBUG,"speller: %"INT32") reco=%s score=%"INT32" pop=%"INT32"",
i, recos[i].reco, recos[i].score, pop );
}
log ( LOG_DEBUG, "speller: the best reco found is %s for word %s",
recos[bestRecoIndex].reco, origWord );
// put the best reco into the recommendation
strcpy ( recommendation, recos[bestRecoIndex].reco );
*score = recos[bestRecoIndex].score;
*popularity = bestRecoPop;
return true;
}
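// Hedged usage sketch for getRecommendation() (illustrative; the real
// spell-check call sites may differ). Assuming a Language object "lang"
// that has been init()'ed:
//
//   char    reco[MAX_PHRASE_LEN];
//   bool    found;
//   int32_t score , pop;
//   if ( lang.getRecommendation ( "brittany spears" , 15 ,
//                                 reco , MAX_PHRASE_LEN ,
//                                 &found , &score , &pop ,
//                                 false /*forceReco*/ ) )
//       // reco now holds the best-scoring, most popular suggestion
//
// If the phrase is already in the dictionaries, *found is set to true and
// false is returned without filling in reco.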
int32_t Language::tryPhonet( char *phonetTmp, char *origPhonet,
char *origClean, int32_t tryForScore,
Reco *recos, int32_t numRecos, int32_t *lowestScore ){
// go through all the phonetics and select those that have score <= 100
uint64_t key = hash64Lower_utf8(phonetTmp);
int32_t slot = m_phonetics.getSlot ( key );
if ( slot == -1 )
return numRecos;
// the value is a combination of the index and the number of
// words having the same phonet
uint64_t value = m_phonetics.getValueFromSlot(slot);
int32_t index = value >> 32;
int32_t numWordsInPhonet = value & 0xffffffff;
log ( LOG_DEBUG,"speller: next phonet is %s, index=%"INT32", numWords=%"INT32"",
phonetTmp, index, numWordsInPhonet );
//if ( strcmp(phonetTmp,"WST") == 0 )
//log(LOG_WARN,"BRTNSPS");
// check the score to see if this phonet is any good.
// the phonet score is 100 for phonets that do not contain all
// the letters of the word's phonet (e.g. word phonet = "PLKN",
// candidate = "PLKS"), 95 for phonets that contain all the
// letters, and 0 where the phonets are the same.
int32_t phonetScore = limit1EditDistance( phonetTmp, origPhonet );
if ( phonetScore >= LARGE_SCORE )
return numRecos;
//log ( LOG_WARN,"lang: checking phonet %s, "
//"numWords=%"INT32"",phonetTmp, numWordsInPhonet);
// this phonet works, for all the words under this phonet,
// get their score.
for ( int32_t j = 0; j < numWordsInPhonet; j++ ){
// The dict is stored as a tuple of
// ( original phrase, phonetic, (lang, score)... )
char *wordReco = m_tuplePtr[j + index];
// make the clean Reco
char cleanReco[MAX_PHRASE_LEN];
// sanity check, this is in the dict, so we should be able to
// make the word into clean
if ( !makeClean( wordReco, gbstrlen(wordReco), cleanReco,
MAX_PHRASE_LEN ) ){
char *xx = NULL; *xx = 0;
}
// now the phonetic
char *phonetReco = wordReco + gbstrlen(wordReco) + 1;
// sanity check
if ( !cleanReco[0] || !phonetReco ){
char *xx = NULL; *xx = 0;
}
// we want the min Score, so this is init'ed to max
int32_t wordScore = LARGE_SCORE;
// init this to phonetScore
int32_t soundslikeScore = phonetScore;
//log (LOG_WARN,"lang: %s\t%s\t%s %"INT32" %"INT32"",
// wordReco, cleanReco, phonetReco,
// wordScore, soundslikeScore);
if ( wordScore >= LARGE_SCORE ){
int32_t slScore = soundslikeScore;
if ( slScore >= LARGE_SCORE )
slScore = 0;
int32_t level = ( 100 * tryForScore -
m_soundslikeWeight * slScore )/
(m_wordWeight *
m_editDistanceWeightsMin);
if ( level < 0 )
level = 0;
if ( level >= int32_t(slScore/
m_editDistanceWeightsMin))
wordScore = editDistance ( origClean,
cleanReco,
level,
level );
}
if ( wordScore >= LARGE_SCORE )
continue;
// this is needed for split words, that are taken
// care of after this loop
/*if ( soundslikeScore >= LARGE_SCORE ){
if ( weightedAverage( 0, wordScore ) >
tryForScore )
continue;
soundslikeScore = editDistance ( origPhonet,
phonetReco );
}*/
// the final score taking into consideration the
// phonetic score as well as the word score
int32_t score = weightedAverage ( soundslikeScore,
wordScore );
if ( score > tryForScore + m_span || score == 0)
continue;
// also continue if the score is greater than 2*lowestScore,
// because then this reco doesn't have a chance
if ( score > *lowestScore * 2 )
continue;
// change the lowest score if needed
if ( score < *lowestScore )
*lowestScore = score;
/*int32_t reduceScore=reduceScore(origClean,cleanReco);
if ( reduceScore > 0 )
log ( LOG_DEBUG,"lang: reducing score request=%s, "
"reco=%s, score=%"INT32", reduce=%"INT32"", origClean,
cleanReco, score, reduceScore );
score -= reduceScore;*/
//log ( LOG_WARN, "lang: reco=%s phonet=%s "
//"wordScore=%"INT32" phonetScore=%"INT32" score=%"INT32"",
//wordReco, phonetReco, wordScore,
//soundslikeScore, score );
/*if ( minRecoScore < score )
continue;
// this is our best recommendation yet
minRecoScore = score;
strcpy ( recommendation, wordReco );*/
if ( numRecos < MAX_RECOMMENDATIONS ){
strcpy ( recos[numRecos].reco, wordReco );
recos[numRecos].score = score;
numRecos++;
continue;
}
int32_t maxScore = 0;
int32_t maxIndex = 0;
// find the largest score
for ( int32_t k = 0; k < numRecos; k++ ){
if ( recos[k].score > maxScore ){
maxScore = recos[k].score;
maxIndex = k;
}
}
// skip this reco unless its score beats (is no larger than)
// the worst score currently stored
if ( score > maxScore )
continue;
strcpy ( recos[maxIndex].reco, wordReco );
recos[maxIndex].score = score;
}
return numRecos;
}
int32_t Language::editDistance( char *a, char *b, int32_t level, // starting level
int32_t limit ) { // maximum level
// sanity check
if ( level <= 0 || limit < level){
char *xx = NULL; *xx = 0;
}
int32_t score = LARGE_SCORE;
while (score >= LARGE_SCORE && level <= limit) {
if (level == 2)
score = limit2EditDistance( a, b );
else if (level < 5)
score = limitEditDistance( a, b, level );
else {
char *xx = NULL; *xx = 0;
//score = editDistance(a,b,w);
}
++level;
}
return score;
}
int32_t Language::weightedAverage(int32_t soundslikeScore, int32_t wordScore) {
return ( m_wordWeight * wordScore +
m_soundslikeWeight * soundslikeScore) / 100;
}
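// Worked example with the default weights set in the constructor
// (m_wordWeight = 85, m_soundslikeWeight = 15):
//
//   weightedAverage ( /*soundslike*/ 95 , /*word*/ 100 )
//     = ( 85 * 100 + 15 * 95 ) / 100
//     = ( 8500 + 1425 ) / 100
//     = 99   (integer division)
//
// so the word-level edit distance dominates the combined score.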
int32_t Language::limitEditDistance( char * a, char * b,
int32_t limit ) {
limit = limit * m_editDistanceWeightsMax;
static const int size = 10;
struct Edit {
char * a;
char * b;
int score;
};
Edit begin[size];
Edit * i = begin;
// const char * a0;
// const char * b0;
int32_t score = 0;
int32_t min = LARGE_SCORE;
while (true) {
while (*a == *b) {
if (*a == '\0') {
if (score < min) min = score;
goto FINISH;
}
++a;
++b;
}
if (*a == '\0') {
do {
score += m_editDistanceWeightsDel2;
if (score >= min) goto FINISH;
++b;
} while (*b != '\0');
min = score;
}
else if (*b == '\0') {
do {
score += m_editDistanceWeightsDel1;
if (score >= min)
goto FINISH;
++a;
} while (*a != '\0');
min = score;
}
// if floor(score/max)=limit/max-1 then this edit is only good
// if it makes the rest of the string match. So check if
// the rest of the string matches to avoid the overhead of
// pushing it on then off the stack
else if ( score + m_editDistanceWeightsMax <= limit ) {
if ( limit * m_editDistanceWeightsMin <=
m_editDistanceWeightsMax *
( m_editDistanceWeightsMin + score ) ) {
// delete a character from a
min = checkRest( a+1, b,
score +
m_editDistanceWeightsDel1,
NULL, min );
// delete a character from b
min = checkRest( a, b+1,
score +
m_editDistanceWeightsDel2,
NULL, min );
if (*a == *(b+1) && *b == *(a+1)) {
// swap two characters
min=checkRest(a+2, b+2,
score +
m_editDistanceWeightsSwap,
NULL, min );
}
// substitute one character for another which
// is the same thing as deleting a character
// from both a & b
else {
min=checkRest(a+1, b+1,
score +
m_editDistanceWeightsSub,
NULL, min );
}
}
else {
// delete a character from a
i->a = a + 1;
i->b = b;
i->score = score + m_editDistanceWeightsDel1;
++i;
// delete a character from b
i->a = a;
i->b = b + 1;
i->score = score + m_editDistanceWeightsDel2;
++i;
// If two characters can be swapped and make
// a match then the substitution is pointless.
// Also, there is no need to push this on
// the stack as it is going to be imminently
// removed.
if (*a == *(b+1) && *b == *(a+1)) {
// swap two characters
a = a + 2;
b = b + 2;
score += m_editDistanceWeightsSwap;
continue;
}
// substitute one character for another
// which is the same thing as deleting a
// character from both a & b
else {
a = a + 1;
b = b + 1;
score += m_editDistanceWeightsSub;
continue;
}
}
}
FINISH:
if (i == begin) return min;
--i;
a = i->a;
b = i->b;
score = i->score;
}
}
int32_t Language::limit1EditDistance( char *a, char *b ){
int32_t min = LARGE_SCORE;
char * amax = a;
while(*a == *b) {
if (*a == '\0')
return 0; //EditDist(0, a);
++a; ++b;
}
if (*a == '\0') {
++b;
if (*b == '\0')
return m_editDistanceWeightsDel2;
//EditDist(ws.del2, a);
return LARGE_SCORE;
// EditDist(LARGE_SCORE, a);
}
else if (*b == '\0') {
++a;
if (*a == '\0')
return m_editDistanceWeightsDel1;
//EditDist(ws.del1, a);
return LARGE_SCORE;
//EditDist(LARGE_SCORE, a);
}
else {
// delete a character from a
min = checkRest( a+1, b, m_editDistanceWeightsDel1,
amax, min );
// delete a character from b
min = checkRest( a, b+1, m_editDistanceWeightsDel2,
amax, min );
if (*a == *(b+1) && *b == *(a+1)) {
// swap two characters
min = checkRest( a+2, b+2, m_editDistanceWeightsSwap,
amax, min );
}
else {
// substitute one character for another which is the
// same thing as deleting a character from both a & b
min = checkRest( a+1, b+1, m_editDistanceWeightsSub,
amax, min );
}
}
return min;
//EditDist(min, amax);
}
int32_t Language::limit2EditDistance( char *a, char *b ) {
int min = LARGE_SCORE;
char * amax = a;
while(*a == *b) {
if (*a == '\0')
return 0;
//return EditDist(0, a);
++a; ++b;
}
if (*a == '\0') {
++b;
if (*b == '\0')
return m_editDistanceWeightsDel2;
//return EditDist(ws.del2,a);
++b;
if (*b == '\0')
return 2 * m_editDistanceWeightsDel2;
//return EditDist(2*ws.del2, a);
return LARGE_SCORE;//EditDist(LARGE_SCORE, a);
}
else if (*b == '\0') {
++a;
if (*a == '\0')
return m_editDistanceWeightsDel1;
//return EditDist(ws.del1, a);
++a;
if (*a == '\0')
return 2 * m_editDistanceWeightsDel1;
//return EditDist(2*ws.del1, a);
return LARGE_SCORE;
//return EditDist(LARGE_SCORE, a);
}
else {
// delete a character from a
min = check2( a+1, b, m_editDistanceWeightsDel1, amax, min );
// delete a character from b
min = check2( a, b+1, m_editDistanceWeightsDel2, amax, min );
if (*a == *(b+1) && *b == *(a+1)) {
// swap two characters
min = check2( a+2, b+2, m_editDistanceWeightsSwap,
amax, min );
}
else {
// substitute one character for another which is the
// same thing as deleting a character from both a & b
min = check2( a+1, b+1, m_editDistanceWeightsSub,
amax, min );
}
}
return min;
//return EditDist(min, amax);
}
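// checkRest(): if the remainders of a and b match exactly, the accumulated
// weight w becomes a candidate minimum. These helpers look adapted from
// aspell's edit-distance code (see the "default aspell parms" above); note
// that amax is passed by value here, so the "amax = a0" update below never
// reaches the caller.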
int32_t Language::checkRest( char *a, char *b,
int32_t w, char *amax, int32_t min ){
char *a0 = a;
char *b0 = b;
while(*a0 == *b0) {
if (*a0 == '\0') {
if (w < min) min = w;
break;
}
++a0;
++b0;
}
if ( amax && amax < a0) amax = a0;
return min;
}
int32_t Language::check2( char *a, char *b, int32_t w, char *amax, int32_t min ){
char *aa = a;
char *bb = b;
while(*aa == *bb) {
if (*aa == '\0') {
if (amax < aa) amax = aa;
if (w < min) min = w;
break;
}
++aa;
++bb;
}
if (*aa == '\0') {
if (amax < aa) amax = aa;
if (*bb == '\0') {}
else if (*(bb+1) == '\0' &&
w + m_editDistanceWeightsDel2 < min)
min = w + m_editDistanceWeightsDel2;
}
else if (*bb == '\0') {
++aa;
if (amax < aa) amax = aa;
if (*aa == '\0' &&
w + m_editDistanceWeightsDel1 < min)
min = w + m_editDistanceWeightsDel1;
}
else {
min = checkRest( aa+1, bb,
w + m_editDistanceWeightsDel1, amax, min );
min = checkRest( aa, bb+1,
w + m_editDistanceWeightsDel2, amax, min );
if (*aa == *(bb+1) && *bb == *(aa+1))
min = checkRest( aa+2, bb+2,
w + m_editDistanceWeightsSwap,
amax, min);
else
min = checkRest( aa+1, bb+1,
w + m_editDistanceWeightsSub,
amax, min );
}
return min;
}
int16_t Language::editDistance( char *a0, char *b0 ){
int32_t aSize = gbstrlen(a0) + 1;
int32_t bSize = gbstrlen(b0) + 1;
// VARARRAY(int16_t, e_d, a_size * b_size);
int16_t e[aSize * bSize];
// ShortMatrix e(a_size,b_size,e_d);
e[0] = 0;// e(0, 0) = 0;
for ( int32_t j = 1; j != bSize; ++j )
e[0 + j * aSize] = e[(j-1) * aSize] +
m_editDistanceWeightsDel1;
const char * a = a0 - 1;
const char * b = b0 - 1;
int16_t te;
for (int32_t i = 1; i != aSize; ++i) {
e[i] = e[i-1] + m_editDistanceWeightsDel2;
for (int32_t j = 1; j != bSize; ++j) {
if (a[i] == b[j]) {
e[i + j * aSize] = e[(i-1) + (j-1) * aSize];
}
else {
e[i + j * aSize] = m_editDistanceWeightsSub +
e[(i-1) + (j-1) * aSize];
if (i != 1 && j != 1 &&
a[i] == b[j-1] && a[i-1] == b[j]) {
te = m_editDistanceWeightsSwap +
e[(i-2) + (j-2) * aSize];
if (te < e[i + j * aSize])
e[i + j * aSize] = te;
}
te = m_editDistanceWeightsDel1 +
e[i-1 + j * aSize];
if (te < e[i + j * aSize])
e[i + j * aSize] = te;
te = m_editDistanceWeightsDel2 +
e[i + (j-1) * aSize];
if (te < e[i + j * aSize])
e[i + j * aSize] = te;
}
}
}
return e[(aSize - 1) + (bSize - 1) * aSize];
}
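// Worked example of the full (Damerau-Levenshtein style) editDistance()
// above, using the default weights (del1 = del2 = 95, swap = 90, sub = 100):
//
//   editDistance ( "HALT" , "HAKT" )  ->  100   (one substitution, L -> K)
//   editDistance ( "HALT" , "HATL" )  ->   90   (one adjacent swap)
//   editDistance ( "HALT" , "HAL"  )  ->   95   (one deletion)
//
// Values are illustrative but follow directly from the weight table above.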
// reduces the score for substitutions that are close on the keyboard
// eg. we want "hakt" --> "halt", but it used to give "hakt" --> "hat"
// string 'a' is the misspelling, string 'b' is the recommendation
int16_t Language::reduceScore ( char *a, char *b ){
// reduce score only for substitutions and for 1 edit hop away
// so essentially both strings should be of the same length
if ( gbstrlen(a) != gbstrlen(b) )
return 0;
int16_t reduceScore = 0;
while ( *a && *b ){
if ( *a == *b ){
a++;
b++;
continue;
}
char c = to_lower_a(*a);
char bplace = s_keyMap[to_lower_a(*b) - 'a'];
// check for all chars around it. For eg. for the letter
// 'j'(16); check 'u'(6),'i'(7),'h'(15),'k'(17),'n'(25),'m'(26)
if ( bplace - 10 >= 0 ) {
if ( ( s_keyboard[bplace - 10] == c ) ||
( s_keyboard[bplace - 9 ] == c ) )
reduceScore += 45;
}
// right neighbor on the same row
if ( bplace % 10 < 9 ) {
if ( s_keyboard[bplace + 1] == c )
reduceScore += 45;
}
if ( bplace % 10 > 0 ) {
if ( s_keyboard[bplace - 1] == c )
reduceScore += 45;
}
// row below: make sure bplace+10 stays inside s_keyboard[30]
if ( bplace + 10 < 30 ) {
if ( ( s_keyboard[bplace + 10] == c ) ||
( s_keyboard[bplace + 9 ] == c ) )
reduceScore += 45;
}
a++;
b++;
}
if ( reduceScore == 45 )
return 45;
return 0;
}
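// Worked example for the "hakt" -> "halt" case mentioned above:
//
//   a = "hakt" (the misspelling), b = "halt" (the recommendation);
//   the only mismatch is 'k' vs 'l'. bplace = s_keyMap['l' - 'a'] = 18 and
//   s_keyboard[bplace - 1] == 'k', so the left-neighbor check adds 45 and
//   reduceScore() returns 45.
//
// Note: the call site visible in tryPhonet() is currently commented out,
// so this reduction is not applied there at the moment.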
bool Language::getPhonetic( char *origWord, int32_t origWordLen,
char *target, int32_t targetLen ){
*target = '\0';
char word[MAX_PHRASE_LEN];
if ( !makeClean(origWord, origWordLen, word, targetLen ) )
return false;
int32_t wordLen = gbstrlen(word);
int32_t i = 0;
int32_t j = 0;
int32_t k = 0; // number of letters found
int32_t n = 0; // index of m_rulesPtr where the rules for the char starts
int32_t p = 0; // priority of the rule
int32_t z = 0;
int32_t k0 = -333;
int32_t n0 = -333;
int32_t p0 = -333;
int32_t z0 = 0;
char c,c0;
const char *s;
while ( word[i] ){
c = word[i];
//log ( LOG_WARN,"lang: Checking Position %"INT32", word=%s "
// "\ttarget=%s", j, word, target );
z0 = 0;
n = m_ruleStarts[(UChar8) c];
// while the rule exists
if ( n >= 0 ){
// check all rules that start with the same letter
while ( m_rulesPtr[n] && m_rulesPtr[n][0] == (UChar8) c ){
//log( LOG_WARN, "lang: Checking rule "
// "No.%"INT32", \"%s\"\t--> \"%\"s", n,
// m_rulesPtr[n], m_rulesPtr[n+1]);
/** check whole string **/
k = 1; /** number of found letters **/
p = 5; /** default priority **/
s = m_rulesPtr[n];
s++; /** important for (see below) "*(s-1)" **/
// while we are not at the end of the rule and
// the next character of the word matches *s and
// *s is not a digit (priority) and
// *s is not one of the special chars "(-<^$",
// we are on the right track, so keep checking the next chars
while (*s != '\0' && word[i+k] == *s &&
!isdigit (*s) &&
strchr ("(-<^$", *s) == NULL) {
k++;
s++;
}
// letters in brackets means only one of these
// chars must fit (OR)
// eg. rule OH(AEIOUY) means A OR E OR I....
if (*s == '(') {
/** check letters in "(..)" **/
// isalpha makes sure that we check
// only letters, and letters are only
// inside the brackets
if ( isalpha(word[i+k] ) &&
strchr(s+1, word[i+k]) != NULL ) {
k++;
while (*s != ')')
s++;
s++;
}
}
p0 = (int) *s;
k0 = k;
// The number of dashes determines how many
// characters from the end will not be replaced
while (*s == '-' && k > 1) {
k--;
s++;
}
// if a `<' is appended to the search string,
// the search for replacement rules will
// continue with the replacement string
// and not with the next character of the word.
if (*s == '<')
s++;
// the priority is the digit
if (isdigit (*s)) {
p = *s - '0';
s++;
}
// The control character `^' says that the
// search string only matches at the beginning
// of words
if (*s == '^' && *(s+1) == '^')
s++;
/* FOR FOLLOWUP RULES
if not at the end of the rule OR
( not on rule that applies only to beginning
of word AND
( i is 0 OR word[i-1] is not alphabet ) AND
( not on rule that applies only to end of
word AND i > 0 AND word[i-1] is not alphabet
AND word[i+k0] is not alphabet ) */
if (*s == '\0' ||
( *s == '^' &&
( i == 0 || !isalpha(word[i-1])) &&
(*(s+1) != '$' ||
(!isalpha(word[i+k0]) ))) ||
(*s == '$' && i > 0 &&
isalpha(word[i-1]) &&
(!isalpha(word[i+k0]) ))) {
/** search for followup rules, if: **/
/** parms.followup and k > 1 and NO '-' in searchstring **/
c0 = word[i+k-1];
n0 = m_ruleStarts[(UChar8)c0];
// followup gives better results.
if ( //parms.followup &&
k > 1 && n0 >= 0 &&
p0 != (int) '-' &&
word[i+k] != '\0' ) {
/** test follow-up rule for "word[i+k]" **/
while (m_rulesPtr[n0][0]==c0) {
/*log (LOG_WARN,
"lang: "
"follow-up rule "
"No.%"INT32"....%s\t -->
%s",n0,
m_rulesPtr[n0],
m_rulesPtr[n0+1] );*/
/** check whole string **/
k0 = k;
p0 = 5;
s = m_rulesPtr[n0];
s++;
while (*s != '\0' &&
word[i+k0] == *s &&
!isdigit(*s) &&
strchr("(-<^$",*s) == NULL) {
k0++;
s++;
}
if (*s == '(') {
/** check letters **/
if ( isalpha(word[i+k0]) &&
strchr (s+1, word[i+k0] ) != NULL) {
k0++;
while (*s != ')' && *s != '\0')
s++;
if (*s == ')')
s++;
}
}
while (*s == '-') {
/** "k0" gets NOT reduced **/
/** because "if (k0 == k)" **/
s++;
}
if (*s == '<')
s++;
if (isdigit (*s)) {
p0 = *s - '0';
s++;
}
if (*s == '\0' ||
/** *s == '^' cuts **/
(*s == '$' && !isalpha(word[i+k0]))) {
if (k0 == k) {
/** this is just a piece of the string **/
					//log(LOG_WARN,"lang: discarded (too short)");
n0 += 2;
continue;
}
if (p0 < p) {
/** priority too low **/
//log(LOG_WARN,"lang: discarded (priority)");
n0 += 2;
continue;
}
/** rule fits; stop search **/
break;
}
// log(LOG_WARN,"lang: discarded");
n0 += 2;
} /** End of "while (parms.rules[n0][0] == c0)" **/
if (p0 >= p && m_rulesPtr[n0][0] == c0) {
/*log(LOG_WARN,"lang: Rule No.%"INT32", %s",n, m_rulesPtr[n]);
log(LOG_WARN,"lang: not used because of follow-up Rule No.%"INT32", %s",
n0,m_rulesPtr[n0]);*/
n += 2;
continue;
}
} /** end of follow-up stuff **/
/** replace string **/
/*log(LOG_WARN,"lang: Using rule "
"No.%"INT32", %s\t --> %s", n,
m_rulesPtr[n],m_rulesPtr[n+1]);*/
s = m_rulesPtr[n+1];
p0 = ( m_rulesPtr[n][0] != '\0' &&
strchr ( m_rulesPtr[n]+1,'<') != NULL) ? 1:0;
if (p0 == 1 && z == 0) {
/** rule with '<' is used **/
if (j > 0 && *s != '\0' &&
(target[j-1] == c ||
target[j-1] == *s)) {
j--;
}
z0 = 1;
z = 1;
k0 = 0;
while (*s != '\0' && word[i+k0] != '\0') {
word[i+k0] = *s;
k0++;
s++;
}
if (k > k0){
//strmove (&word[0]+i+k0, &word[0]+i+k);
char *to = &word[0]+i+k0;
char *from = &word[0]+i+k;
while (( *to++ = *from++ ) != 0 )
;
}
/** new "actual letter" **/
c = word[i];
}
else { /** no '<' rule used **/
i += k - 1;
z = 0;
while (*s != '\0'
&& *(s+1) != '\0' && j < wordLen) {
if (j == 0 || target[j-1] != *s) {
target[j] = *s;
j++;
}
s++;
}
/** new "actual letter" **/
c = *s;
if (m_rulesPtr[n][0] != '\0'
&& strstr (m_rulesPtr[n]+1, "^^") != NULL) {
if (c != '\0') {
target[j] = c;
j++;
}
//strmove (&word[0], &word[0]+i+1);
char *to = &word[0];
char *from = &word[0]+i+1;
while (( *to++ = *from++ ) != 0 )
;
i = 0;
z0 = 1;
}
}
break;
} /** end of follow-up stuff **/
n += 2;
} /** end of while (parms.rules[n][0] == c) **/
} /** end of if (n >= 0) **/
if (z0 == 0) {
// collapse_result is false for english
if (k && p0 != -333 && !p0 &&
//(assert(p0!=-333),!p0) &&
j < wordLen && c != '\0' ) { //&&
//(!parms.collapse_result ||
// j == 0 || target[j-1] != c))
/** condense only double letters **/
target[j] = c;
///printf("\n setting \n");
j++;
}
/*else if (p0 || !k)
log( LOG_WARN,"lang: no rule found; "
"character \"%c\" skipped",word[i] );*/
		// go to the next character of the word
i++;
z = 0;
k=0;
}
} /** end of while ((c = word[i]) != '\0') **/
target[j] = '\0';
return true;
}
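// Illustrative call of getPhonetic() (sketch only, not built; assumes
// loadRules() already populated the replacement rules and that the
// method is reachable from the caller):
#if 0
static void phoneticExample ( Language *lang ) {
	char  phonet [ MAX_PHRASE_LEN ];
	char *word = (char *)"recommendation";
	if ( lang->getPhonetic ( word , gbstrlen(word) ,
				 phonet , MAX_PHRASE_LEN ) )
		// phonet now holds the rule-rewritten key used to group
		// similar-sounding words
		log ( LOG_DEBUG, "lang: phonet of %s is %s", word, phonet );
}
#endif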
bool Language::hasMispelling(char *phrase, int32_t phraseLen){
char *p = phrase;
char *pend = p;
while ( pend < phrase + phraseLen ){
		// check the bound before dereferencing pend
		while ( pend < phrase + phraseLen && *pend != ' ' )
pend++;
char word[1024];
gbmemcpy(word, p, pend - p);
word[pend - p] = '\0';
uint32_t key = hash32d(p, pend - p);
int32_t slot = m_misp.getSlot(key);
if ( slot != -1 ){
log(LOG_WARN,"lang: found mispelling in %s", word);
return true;
}
pend++;
p = pend;
}
return false;
}
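// Illustrative call of hasMispelling() (sketch only, not built; assumes
// m_misp was filled beforehand, e.g. by the loadMispelledWords() step
// referenced in the commented-out code below):
#if 0
static void mispellingExample ( Language *lang ) {
	char *phrase = (char *)"recieve the package";
	if ( lang->hasMispelling ( phrase , gbstrlen(phrase) ) )
		// one of the space-separated words is in the common
		// misspellings table, so dictionary builders drop the phrase
		log ( LOG_DEBUG, "lang: dropping phrase with a misspelling" );
}
#endif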
///////////////////////////////////////////////////////
// DICTIONARY GENERATION ROUTINES BELOW HERE
//
///////////////////////////////////////////////////////
/*
// . return false and set g_errno on error, true on success
bool Language::generateDicts ( int32_t numWordsToDump , char *coll ) {
log(LOG_INIT,
"lang: Reading first %"INT32" words from titledb records in "
"collection '%s'.",
numWordsToDump,coll);
// ensure we got a dict dir in our working dir
char dd[1024];
if ( gbstrlen ( g_hostdb.m_dir ) > 1000 ) {
g_errno = EBADENGINEER;
log("lang: Working directory %s is too long.",
g_hostdb.m_dir);
return false;
}
sprintf ( dd , "mkdir %sdict.new/" , g_hostdb.m_dir );
log(LOG_INIT,"lang: %s",dd);
if ( gbsystem ( dd ) == -1 ) return false;
sprintf ( dd , "mkdir %stmp/" , g_hostdb.m_dir );
log(LOG_INIT,"lang: %s",dd);
if ( gbsystem ( dd ) == -1 ) return false;
// . loop through all titleRecs
// . put all words/phrases that begin with letter X in file
// words.Y, where Y is the numeric value of to_dict_char(X)
// . don't dump out more than "100,000" words/phrases
// . only dump out one title rec per IP
// . do not dump out a word/phrase more than once for the same titleRec
// . stores files in /tmp/ dir
if (!ucInit(g_hostdb.m_dir))
return log("Unicode initialization failed!");
g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_titledb.init ();
g_collectiondb.init(true);
g_titledb.addColl ( coll );
// load the mispellings file first
//if ( !loadMispelledWords() )
// log (LOG_WARN,"lang: mispelled file could not be loaded");
//log(LOG_DEBUG, "lang: making query files");
//if( !makeQueryFiles ( ) )
// return log("lang: had error: %s.",
// mstrerror(g_errno));
log(LOG_DEBUG, "lang: making word files");
if( ! makeWordFiles ( numWordsToDump , MAX_WORDS_PER_PHRASE , coll ) )
return log("lang: had error: %s.",
mstrerror(g_errno));
log(LOG_DEBUG, "lang: making pop files");
if ( ! makePopFiles ( numWordsToDump , MAX_WORDS_PER_PHRASE , coll ) )
return log("lang: had error: %s.",
mstrerror(g_errno));
// add words from /usr/dict/words to the word files
//if ( ! addDictWords ( ) ) return false;
// sort each file
for ( int32_t i = 0 ; i < NUM_CHARS ; i++ ) {
char tmp[1024];
// . sort should treat all lower chars as upper
// . sort in reverse order so longer fragments are on top
	// of their shorter sub fragments so if they have the
// same score in the end, we'll keep the longer fragment
sprintf(tmp,"sort -f -r %stmp/%s/%s.words.%"INT32" > "
"%stmp/%s/%s.words.%"INT32".sorted",
g_hostdb.m_dir, getLanguageAbbr(m_lang),
getLanguageAbbr(m_lang), i, g_hostdb.m_dir,
getLanguageAbbr(m_lang), getLanguageAbbr(m_lang), i);
log(LOG_INIT,"lang: %s",tmp);
gbsystem ( tmp );
}
// . now convert each sorted file into a unique list of word/phrases
// with scores
// . score is number of times that word/phrase was found in the file
// . truncate each file to the top "1000000" words/phrases
if ( ! makeScoreFiles ( 180000 ))//numWordsToDump, max # words per file
return log(
"lang: had error: %s.",mstrerror(g_errno));
loadRules();
// success
return true;
}
// . TODO: remove bad words
// . loop through all titleRecs
// . put all words/phrases that begin with letter X in file
// words.Y, where Y = to_dict_char(X) [that compress the char value]
// . don't dump out more than "100,000" words/phrases
// . only dump out one title rec per IP
// . do not dump out a word/phrase more than once for the same titleRec
// . stores files in /tmp/ dir
// . return false and set g_errno on error, true on success
bool Language::makeWordFiles ( int32_t numWordsToDump , int32_t numWordsPerPhrase ,
char *coll ) {
int32_t numDumped = 0;
// message
log(LOG_INIT,"lang: Dumping first %"INT32" words/phrases.",
numWordsToDump );
// . only allow 1 vote per ip domain
// . assume each titlerec has about 50 words in it
uint32_t maxNumIps = numWordsToDump / 50 ;
if ( maxNumIps < 100000 ) maxNumIps = 100000;
int32_t iptableSize = maxNumIps * 4;
log(LOG_INIT,"lang: Allocating %"INT32" bytes.", iptableSize );
int32_t *iptable = (int32_t *) mmalloc ( iptableSize , "Language" );
if ( ! iptable ) {
return log(
"lang: Could not allocate %"INT32" bytes: %s",
iptableSize,mstrerror(g_errno));
}
memset ( iptable , 0 , iptableSize );
// get the default siteRec
//SiteRec sr;
//Url dummy;
//dummy.set ( "www.jinx.com" , gbstrlen("www.jinx.com") );
//sr.set ( &dummy , coll , gbstrlen(coll) , 7 ); // filenum
// read in 12 byte key, 4 byte size then data of that size
uint32_t ip;
int32_t totalVoters = 0;
uint32_t h;
// buffer used for storing de-tagged doc content
// JAB: warning abatement
// int32_t xbufSize ;
// declare up here so we can jump to done: label
int32_t nw;
//XmlDoc doc;
Words w;
Xml xml;
Url *u;
TitleRec tr;
// JAB: warning abatement
//char xbuf [ 1024*512 ] ; //1024 ];
//int32_t jx = numWordsPerPhrase * 2;
// the word vote table to ensure one vote per word per doc
int32_t vnumEntries ;
int32_t vtableSize = 0 ;
int32_t *vtable = NULL;
// display titlerec # we are scanning
int32_t count = 0;
// open all files for appending
int fds [ NUM_CHARS ];
for ( int32_t i = 0 ; i < NUM_CHARS ; i++ ) {
char ff[1024];
sprintf ( ff , "%stmp/%s/%s.words.%"INT32"", g_hostdb.m_dir,
getLanguageAbbr(m_lang),getLanguageAbbr(m_lang), i );
// delete it first
unlink ( ff );
// then open a new one for appending
fds[i] = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
				getFileCreationFlags() );
				// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fds[i] < 0 )
return log("lang: Could not open %s for writing: "
"%s.",ff, strerror(errno));
}
// message
//log(LOG_INIT,"lang: Scanning title recs for words and phrases in "
// "%s",colldir);
//
// THE TITLE SCAN LOOP
//
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
//g_titledb.init ();
//g_collectiondb.init(true);
//g_titledb.addColl ( coll );
key_t startKey ;
key_t endKey ;
startKey.setMin();
endKey.setMax();
startKey = g_titledb.makeFirstTitleRecKey ( 0 ); // docid );
// turn off threads
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 1024*1024;
Msg5 msg5;
Msg5 msg5b;
RdbList list;
key_t k ;
char *rec ;
int32_t recSize ;
int32_t sameip = 0;
int32_t y;
char quality;
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
//"main" , // coll ,
coll ,
&list ,
startKey ,
endKey ,
minRecSizes ,
false , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&msg5b )){
log(LOG_LOGIC,"lang: getList did not block.");
return false;
}
// all done if empty
log(LOG_INIT, "lang: got list: %"INT32" recs", list.getNumRecs());
if ( list.isEmpty() ) goto done;
k = list.getCurrentKey();
rec = list.getCurrentRec();
recSize = list.getCurrentRecSize();
startKey = *(key_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey < *(key_t *)list.getLastKey() ) goto done;
//
// END SCAN LOOP
//
// parse out and decompress the TitleRec
tr.set ( rec , recSize , false ) ; // owndata?
// if quality is low, skip this doc
quality = tr.getDocQuality();
if ( quality < 60 )
goto loop;
// only do your language
if ( tr.m_language != m_lang )
goto loop;
// extract the url
u = tr.getUrl();
// get ip
ip = u->getIp();
// look up in ip table
h = ip % maxNumIps;
y = 0;
ipchain:
if ( iptable[h] ) {
// skip if already voted
if ( iptable[h] == (int32_t)ip ) { sameip++; goto loop; }
// chain to next bucket
if ( ++h >= maxNumIps ) h = 0;
if ( ++y > (int32_t)maxNumIps ) {
log(LOG_LOGIC,"spell: IP table is too small. "
"Exiting.");
char *xx = NULL; *xx = 0;
}
goto ipchain;
}
// store in bucket so no doc from this ip votes again
iptable[h] = ip;
// count the voters
totalVoters++;
// parse all the tags out
//doc.set ( &tr , &sr );
// store in this xbuf w/o tags
xml.set ( tr.getCharset(),tr.getContent() , tr.getContentLen() ,
false , 0, false ,
tr.getVersion() );
//xml = doc.getXml();
// xbufSize = xml.getText ( xbuf ,
// 1024*512 ,
// 0 ,
// 999999 ,
// false ,
// true ,
// true );
// convert non-tag content into words
w.set(&xml, true, true);
// hash each phrase
nw = w.getNumWords();
// TODO: make the above a getWords(&w) routine!!
// so it can take from titleRecs or query logs
// . don't hash a word from this doc more than once
// . wvtable = word vote table
vnumEntries = (nw * numWordsPerPhrase * 130) / 100;
vtableSize = vnumEntries * 4;
//log("mallocing2b %"INT32" bytes", vtableSize );
if ( (count % 100) == 0 )
log(LOG_INIT,"lang: Scanning document %"INT32" "
"(%"INT32" dup ips, %"INT32" words dumped).",
count,sameip,numDumped);
count++;
vtable = (int32_t *) mmalloc ( vtableSize , "Language" );
if ( ! vtable ) {
mfree ( iptable , iptableSize , "Language" );
return log("lang: Failed to allocate %"INT32" "
"bytes: %s.",iptableSize,mstrerror(g_errno));
}
memset ( vtable , 0 , vtableSize );
	// every other token is punctuation; isPunct() skips those just below
for ( int32_t i = 0 ; i < nw ; i ++ ) {
// skip punct. wordId is 0.
if ( w.isPunct(i) ) continue;
// is the ith word a stop word?
// tmp buffer to hold word/phrase
char tmp[1024];
char *tmpp = tmp;
char *tmpend = tmp + 1024 - 3;
char *ww = w.getWord(i);
int32_t wwlen = w.getWordLen(i);
if ( wwlen < 2 )
continue;
bool isStop = ::isStopWord ( ww, wwlen, w.getWordId (i));
// BUT ok if Capitalized or number
if ( isStop ) {
if ( is_digit (ww[0]) ) isStop = false;
if ( is_cap (ww,wwlen) ) isStop = false;
// e-mail, c file, c. s. lewis
if ( wwlen == 1 && ww[0] != 'a' ) isStop = false;
}
// loop over # of words per phrase
for ( int32_t k = 1 ; k < numWordsPerPhrase ; k++ ) {
tmpp = tmp;
// stop words cannot start dictionary phrases
if ( k > 1 && isStop ) break;
int32_t lastj = -1;
// do not end on stop word either
for ( int32_t j = i ; j < i + k * 2 ; j++ ) {
// skip if overflow
if ( j >= nw ) continue;
// skip punct
if ( w.isPunct(j) ) continue;
// point to word
char *ww = w.getWord(j);
int32_t wwlen = w.getWordLen(j);
// if no room to store word, skip it
if ( tmpp + wwlen >= tmpend ) {
tmpp = tmp; break; }
// write word into buf
// convert to lower case so our sort works
			// the way it should
char tx[1024];
// n is how many bytes we wrote into "tx"
int32_t n = to_lower_utf8(tmpp,tmpend,ww,wwlen);
// advance it
tmpp += n;
// no longer convert to utf8, cuz title rec
// is now already in utf8 by default!!
//tmpp += latin1ToUtf8( tmpp,
// tmpend - tmpp,
// tx, wwlen );
// remember last word # we added
lastj = j;
// followed by space, apostrophe or hyphen
if ( ww[wwlen] == '-' ) *tmpp = '-';
else if ( ww[wwlen] == '\'' ) *tmpp = '\'';
else *tmpp = ' ';
tmpp++;
}
// bail if nothing to add
if ( tmpp <= tmp )
continue;
// don't add dict phrase if last word is a stop word
if ( k > 1 && lastj >= 0 ) {
char *ww = w.getWord ( lastj );
int32_t wwlen = w.getWordLen ( lastj );
int64_t wid = w.getWordId ( lastj );
bool isStop = ::isStopWord(ww,wwlen,wid);
// BUT ok if Capitalized or number
if ( isStop ) {
if (is_digit (ww[0]) ) isStop=false;
if (is_cap (ww,wwlen)) isStop=false;
}
if ( isStop ) continue;
}
// point to last space
tmpp--;
// overwrite it, terminate with a \n
*tmpp = '\n';
// how long is it? does not include terminating \n
int32_t tmplen = tmpp - tmp;
// skip if nothing
if ( tmplen <= 0 )
continue;
// skip word if it has binary chars in it
if ( has_binary ( tmp , tmplen ) )
continue;
// debug
//if ( strncasecmp ( tmp , "a zero" , 6 ) == 0 )
// log("shit");
// get hash of word/phrase
			// we need to preserve the distinction between proper
			// and improper accent marks, so don't hash just ascii
			// via wh = w.getWordId(j)
uint64_t hh = hash64Lower_utf8 (tmp,tmplen );
// don't allow more than one vote per doc for a word
int32_t ii = hh % vnumEntries;
vchain:
if ( vtable[ii] && vtable[ii] != (int32_t)hh ) {
if ( ++ii >= vnumEntries ) ii = 0 ;
goto vchain;
}
if ( vtable[ii] ) continue;
// store it
vtable[ii] = (int32_t)hh;
// a new word for this doc
// append the word out to file
int32_t fn = to_dict_char(tmp[0]);
// write the hash before the word
//char tt[32];
//sprintf ( tt , "%016"XINT64" ", hh );
//if ( write ( fds[fn], tt , 17 ) != 17 )
// return log("spell: makeWordFiles: write: %s",
// strerror(errno));
char tmpx[2080];
tmpp++;
*tmpp = '\0';
sprintf(tmpx,"%s", tmp);
int32_t tmpxlen = gbstrlen(tmpx);
// write out the trailing \n as well
int32_t wn = write ( fds[fn] , tmpx , tmpxlen ) ;
if ( wn != tmpxlen )
return log("spell: makeWordFiles: write: %s",
strerror(errno));
numDumped++;
if ( numDumped >= numWordsToDump ) goto done;
}
}
// breakout:
// don't need the word voting table anymore
if ( vtable ) mfree ( vtable , vtableSize , "Language");
vtable = NULL;
// get more titlerecs so we can hash more words/phrases
goto loop;
done:
// don't need the word voting table anymore
if ( vtable ) mfree ( vtable , vtableSize , "Language");
vtable = NULL;
// close all files
for ( int32_t i = 0 ; i < NUM_CHARS ; i++ )
close ( fds[i] );
return true;
}
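// The "ipchain" and "vchain" labels above are a hand-rolled open-addressing
// insert: probe forward from the hash slot until the same value is found
// (a duplicate, so skip it) or an empty slot turns up (insert there). The
// same probe loop as a standalone sketch (illustrative only; the helper
// name and the int32_t value type are assumptions, a value of 0 cannot be
// stored since 0 marks an empty slot, and the "table full" abort used
// above is omitted):
#if 0
// returns true if "val" was newly inserted, false if it was already there
static bool probeInsert ( int32_t *table , int32_t numSlots , int32_t val ) {
	int32_t h = ((uint32_t)val) % numSlots;
	while ( table[h] ) {
		if ( table[h] == val ) return false; // already voted
		if ( ++h >= numSlots ) h = 0;        // wrap and keep probing
	}
	table[h] = val; // claim the empty slot
	return true;
}
#endif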
#define NUM_UNIFILES MAX_LANGUAGES
bool Language::makePopFiles ( int32_t numWordsToDump , int32_t numWordsPerPhrase ,
char *coll) {
int32_t numDumped = 0;
int32_t docCount = 0;
// message
log(LOG_INIT,"lang: Dumping first %"INT32" words/phrases.",
numWordsToDump );
// . only allow 1 vote per ip domain
// . assume each titlerec has about 50 words in it
uint32_t maxNumIps = numWordsToDump / 50 ;
if ( maxNumIps < 100000 ) maxNumIps = 100000;
int32_t iptableSize = maxNumIps * 4;
log(LOG_INIT,"lang: Allocating %"INT32" bytes.", iptableSize );
int32_t *iptable = (int32_t *) mmalloc ( iptableSize , "Language" );
if ( ! iptable ) {
return log(
"lang: Could not allocate %"INT32" bytes: %s",
iptableSize,mstrerror(g_errno));
}
memset ( iptable , 0 , iptableSize );
// get the default siteRec
//SiteRec sr;
//Url dummy;
//dummy.set ( "www.jinx.com" , gbstrlen("www.jinx.com") );
//sr.set ( &dummy , coll , gbstrlen(coll) , 7 ); // filenum
// read in 12 byte key, 4 byte size then data of that size
uint32_t ip;
int32_t totalVoters = 0;
uint32_t h;
// buffer used for storing de-tagged doc content
int32_t xbufSize ;
// declare up here so we can jump to done: label
int32_t nw;
//XmlDoc doc;
Words w;
Xml xml;
//Scores s;
Url *u;
TitleRec tr;
char xbuf [ 1024*512 ] ; //1024 ];
//int32_t jx = numWordsPerPhrase * 2;
// the word vote table to ensure one vote per word per doc
int32_t vnumEntries ;
int32_t vtableSize = 0 ;
int32_t *vtable = NULL;
// display titlerec # we are scanning
int32_t count = 0;
// open all files for appending
int fds [ NUM_UNIFILES ];
for ( int32_t i = 0 ; i < NUM_UNIFILES ; i++ ) {
char ff[1024];
sprintf ( ff , "%stmp/%s/%s.popwords.%"INT32"", g_hostdb.m_dir ,
getLanguageAbbr(m_lang),getLanguageAbbr(m_lang), i );
// delete it first
unlink ( ff );
// then open a new one for appending
fds[i] = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
				getFileCreationFlags() );
				// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fds[i] < 0 )
return log("lang: Could not open %s for writing: "
"%s.",ff, strerror(errno));
}
// message
//log(LOG_INIT,"lang: Scanning title recs for words and phrases in "
// "%s",colldir);
//
// THE TITLE SCAN LOOP
//
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
//g_titledb.init ();
//g_collectiondb.init(true);
//g_titledb.addColl ( coll );
key_t startKey ;
key_t endKey ;
startKey.setMin();
endKey.setMax();
startKey = g_titledb.makeFirstTitleRecKey ( 0 ); // docid );
// turn off threads
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 1024*1024;
Msg5 msg5;
Msg5 msg5b;
RdbList list;
key_t k ;
char *rec ;
int32_t recSize ;
int32_t sameip = 0;
int32_t y;
char quality;
int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
Sections ss;
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
//"main" , // coll ,
coll ,
&list ,
startKey ,
endKey ,
minRecSizes ,
false , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&msg5b )){
log(LOG_LOGIC,"lang: getList did not block.");
return false;
}
// all done if empty
log(LOG_INIT, "lang: got list: %"INT32" recs", list.getNumRecs());
if ( list.isEmpty() ) goto done;
list.resetListPtr();
docloop:
k = list.getCurrentKey();
rec = list.getCurrentRec();
recSize = list.getCurrentRecSize();
//
// END SCAN LOOP
//
docCount++;
// parse out and decompress the TitleRec
tr.set ( rec , recSize , false ) ; // owndata?
// if quality is low, skip this doc
quality = tr.getDocQuality();
if ( quality < 60 )
goto docdone;
if ( tr.m_language != m_lang )
goto docdone;
// extract the url
u = tr.getUrl();
// get ip
ip = u->getIp();
// look up in ip table
h = ip % maxNumIps;
y = 0;
ipchain:
if ( iptable[h] ) {
// skip if already voted
if ( iptable[h] == (int32_t)ip ) { sameip++; goto docdone; }
// chain to next bucket
if ( ++h >= maxNumIps ) h = 0;
if ( ++y > (int32_t)maxNumIps ) {
log(LOG_LOGIC,"spell: IP table is too small. "
"Exiting.");
char *xx = NULL; *xx = 0;
}
goto ipchain;
}
// store in bucket so no doc from this ip votes again
iptable[h] = ip;
// count the voters
totalVoters++;
// parse all the tags out
//doc.set ( &tr , &sr );
// store in this xbuf w/o tags
xml.set ( tr.getCharset(),tr.getContent() , tr.getContentLen() ,
false , 0, false ,
tr.getVersion() );
//xml = doc.getXml();
xbufSize = xml.getText ( xbuf ,
1024*512 ,
0 ,
999999 ,
false ,
true ,
true );
// convert non-tag content into words
//w.set ( true, (char*)xbuf , xbufSize );
w.set ( &xml, true, true);
//s.set ( &w, &xml , TITLEREC_CURRENT_VERSION );
//s.set ( &w, TITLEREC_CURRENT_VERSION , false );
ss.set ( &w,NULL,0,NULL,0,NULL,NULL,&tr,NULL,0);
// hash each phrase
nw = w.getNumWords();
// TODO: make the above a getWords(&w) routine!!
// so it can take from titleRecs or query logs
// . don't hash a word from this doc more than once
// . wvtable = word vote table
vnumEntries = (nw * numWordsPerPhrase * 130) / 100;
vtableSize = vnumEntries * 4;
//log("mallocing2b %"INT32" bytes", vtableSize );
if ( (count % 100) == 0 )
log(LOG_INIT,"lang: Scanning document %"INT32" "
"(%"INT32" dup ips, %"INT32" words dumped).",
count,sameip,numDumped);
count++;
vtable = (int32_t *) mmalloc ( vtableSize , "Language" );
if ( ! vtable ) {
mfree ( iptable , iptableSize , "Language" );
return log("lang: Failed to allocate %"INT32" "
"bytes: %s.",iptableSize,mstrerror(g_errno));
}
memset ( vtable , 0 , vtableSize );
	// walk every token; unwanted sections are skipped just below
//log("Adding %d words", nw);
for ( int32_t i = 0 ; i < nw ; i ++ ) {
// skip punct
//if ( w.isPunct(i) ) continue;
//if ( !s.getScore(i) ) continue;
if ( ss.m_sectionPtrs[i]->m_flags & badFlags ) continue;
// is the ith word a stop word?
// tmp buffer to hold word/phrase
char tmp[2048];
char *tmpp = tmp;
char *tmpend = tmp + 2048 - 3;
char *ww = w.getWord(i);
int32_t wwlen = w.getWordLen(i);
bool isStop = ::isStopWord ( ww, wwlen, w.getWordId (i));
// BUT ok if Capitalized or number
if ( isStop ) {
if ( w.isNum(i) ) isStop = false;
if ( w.isUpper(i)) isStop = false;
// e-mail, c file, c. s. lewis
if ( wwlen == 1 && ww[0] != 'a' )
isStop = false;
}
// loop over # of words per phrase
for ( int32_t k = 1 ; k < numWordsPerPhrase ; k++ ) {
tmpp = tmp;
// stop words cannot start dictionary phrases
if ( k > 1 && isStop ) break;
int32_t lastj = -1;
// do not end on stop word either
for ( int32_t j = i ; j < i + k * 2 ; j++ ) {
// skip if overflow
if ( j >= nw ) continue;
// skip punct
//if ( w.isPunct(i+j) ) continue;
//if ( !s.getScore(i+j) ) continue;
if ( ss.m_sectionPtrs[j]->m_flags &badFlags )
continue;
// point to word
char *ww = w.getWord(j);
int32_t wwlen = w.getWordLen(j);
// if no room to store word, skip it
if ( tmpp + wwlen >= tmpend ) {
tmpp = tmp; break; }
// write word into buf
// convert to lower case so our sort works
			// the way it should
// n is how many bytes we wrote into "tx"
int32_t n = to_lower_utf8(tmpp,tmpend,ww,wwlen);
// advance it
tmpp += n;
// remember last word # we added
lastj = j;
// followed by space, apostrophe or hyphen
if ( ww[wwlen] == '-' ) *tmpp = '-';
else if ( ww[wwlen] == '\'' ) *tmpp = '\'';
else *tmpp = ' ';
tmpp++;
}
// bail if nothing to add
if ( tmpp <= tmp ) continue;
// don't add dict phrase if last word is a stop word
if ( k > 1 && lastj >= 0 ) {
char *ww = w.getWord ( lastj );
int32_t wwlen = w.getWordLen ( lastj );
int64_t wid = w.getWordId ( lastj );
isStop =::isStopWord(ww,wwlen,wid);
// BUT ok if Capitalized or number
if ( isStop ) {
if ( w.isNum(lastj) ) isStop=false;
if ( w.isUpper( lastj ) ) isStop=false;
}
if ( isStop ) continue;
}
// point to last space
//tmpp--;
// overwrite it, terminate with a \n
*tmpp = '\n';
// how long is it? does not include terminating \n
int32_t tmplen = tmpp - tmp;
// skip if nothing
if ( tmplen <= 0 ) continue;
// skip word if it has binary chars in it
if ( has_binary ( tmp , tmplen ) ) continue;
// debug
//if ( strncasecmp ( tmp , "a zero" , 6 ) == 0 )
// log("shit");
// get hash of word/phrase
			// we need to preserve the distinction between proper
			// and improper accent marks, so don't hash just ascii
			// via wh = w.getWordId(i+j)
uint64_t hh = hash64Lower_utf8 (tmp,tmplen );
// don't allow more than one vote per doc for a word
int32_t ii = hh % vnumEntries;
vchain:
if ( vtable[ii] && vtable[ii] != (int32_t)hh ) {
if ( ++ii >= vnumEntries ) ii = 0 ;
goto vchain;
}
if ( vtable[ii] ) continue;
// store it
vtable[ii] = (int32_t)hh;
// a new word for this doc
// append the word out to file
//int32_t fn = to_dict_char(tmp[0]);
int32_t fn = tr.getLanguage();
// write the hash before the word
//char tt[32];
//sprintf ( tt , "%016"XINT64" ", hh );
//if ( write ( fds[fn], tt , 17 ) != 17 )
// return log("spell: makeWordFiles: write: %s",
// strerror(errno));
// write out the trailing \n as well
int32_t wn = write ( fds[fn] , tmp , tmplen + 1) ;
if ( wn != tmplen + 1 )
return log("spell: makePopFiles: "
"write: %s",
strerror(errno));
numDumped++;
if ( numDumped >= numWordsToDump )
goto done;
}
}
//log(LOG_INIT, "lang: got %"INT32" docs, %"INT32" words",
//docCount, numDumped);
// breakout:
// don't need the word voting table anymore
if ( vtable ) mfree ( vtable , vtableSize , "Language");
vtable = NULL;
docdone:
// get more titlerecs so we can hash more words/phrases
list.skipCurrentRecord();
if (!list.isExhausted())
goto docloop;
startKey = *(key_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey < *(key_t *)list.getLastKey() ) goto done;
goto loop;
done:
// don't need the word voting table anymore
log(LOG_INIT, "lang: got %"INT32" docs total", docCount);
if ( vtable ) mfree ( vtable , vtableSize , "Language");
vtable = NULL;
// close all files
for ( int32_t i = 0 ; i < NUM_UNIFILES ; i++ ) close ( fds[i] );
return true;
}
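// makePopFiles() differs from makeWordFiles() mainly in two ways: output
// files are keyed by the doc's language rather than the first letter, and
// words inside script/style/select sections are dropped via the badFlags
// test. That per-word section test as a standalone sketch (illustrative
// only; the helper name is made up, the flags are the ones used above):
#if 0
static bool inBadSection ( Sections *ss , int32_t wordNum ) {
	int32_t badFlags = SEC_SCRIPT | SEC_STYLE | SEC_SELECT;
	// every word has a Section ptr; reject words whose section carries
	// any of the unwanted flags
	return ( ss->m_sectionPtrs[wordNum]->m_flags & badFlags ) != 0;
}
#endif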
// . now convert each sorted file into a unique list of word/phrases
// with scores
// . score is number of times that word/phrase was found in the file
// . truncate each file to the top "maxWordsPerFile" words/phrases
bool Language::makeScoreFiles ( int32_t maxWordsPerFile ) {
// convert each file
for ( int32_t i = 0 ; i < NUM_CHARS ; i++ ) {
// open the file for reading
char ff[1024];
sprintf ( ff , "%stmp/%s/%s.words.%"INT32".sorted", g_hostdb.m_dir,
getLanguageAbbr(m_lang),getLanguageAbbr(m_lang), i );
FILE *fdr = fopen ( ff , "r" );
if ( ! fdr )
return log(
"lang: Failed to open %s for reading: "
"%s.",ff, strerror(errno));
// and one for writing out score/word pairs
sprintf ( ff, "%stmp/%s/%s.words.%"INT32".prescored",g_hostdb.m_dir,
getLanguageAbbr(m_lang),getLanguageAbbr(m_lang), i );
FILE *fdw = fopen ( ff , "w" );
if ( ! fdw )
return log(
"lang: Failed to open %s for writing: "
"%s.",ff, strerror(errno));
log(LOG_INIT,"lang: Making %s.", ff );
// ongoing score count
int32_t score = 0;
int32_t oldscore = 0;
// store last word/phrase in here
char lastw [ 1029];
lastw[0] = '\0';
// and its hash in here
uint64_t lasthh = 0;
char pbuf[1024];
//int32_t bonus = 0;
//bool gotit = false; // do we start w/ '*'? means in dict.
// read in each line
while ( fgets ( pbuf , 1024 , fdr ) ) {
char *p = pbuf;
// skip '*'
//if ( *p == '*' ) { gotit = true ; p++; }
//else gotit = false;
// skip lines beginning with "the " TOO COMMON
if ( (p[0] == 't' || p[0] == 'T') &&
strncasecmp ( p , "the ", 4 ) == 0 )
continue;
// also, "and "
if ( (p[0] == 'a' || p[0] == 'A') &&
strncasecmp ( p , "and ", 4 ) == 0 )
continue;
// and, "a "
if ( (p[0] == 'a' || p[0] == 'A') && p[1] == ' ')
continue;
// don't include terminating \n in the length
int32_t plen = gbstrlen(p) - 1;
if ( plen <= 0 ) continue;
// skip if too big and might have been truncated
if ( plen >= 1000 ) continue;
// NULL terminate it to take off ending * and/or \n
p [plen] = '\0';
// get the hash of this word/phrase
uint64_t hh = hash64Lower_utf8 ( p , plen );
//sscanf ( buf , "%"XINT64"" , &hh );
// was it same as last? if so, tally and continue
if ( hh == lasthh ) {
score++;
//if ( gotit ) bonus = IN_DICT_BONUS;
continue;
}
// add bonus to score to get final score
//score += bonus;
// . otherwise, we're starting a new word
// . print out the word before us
if ( score >= MIN_DOCS ) {
//if ( gotit ) // bonus )
// fprintf(fdw,"%05"INT32" *%s\n",score,lastw);
//else
fprintf(fdw,"%05"INT32" %s\n" ,score,lastw);
}
// we are now the new word
lasthh = hh;
strncpy ( lastw , p , 1010 );
//if ( gotit ) bonus = IN_DICT_BONUS;
//else bonus = 0;
// give us score 1
score = 1;
}
// write out the last
// skip if too big and might have been truncated
//score += bonus;
if ( score >= MIN_DOCS && gbstrlen(lastw) < 1000) {
//if (gotit) fprintf (fdw,"%05"INT32" *%s\n",score,lastw );
// else fprintf (fdw,"%05"INT32" %s\n" ,score,lastw );
fprintf (fdw,"%05"INT32" %s\n" ,score,lastw );
}
fclose ( fdr );
fclose ( fdw );
//
// now remove small phrases in there just because the
// big phrase containing them is the popular one
//
// open the file for reading
sprintf ( ff, "%stmp/%s/%s.words.%"INT32".prescored",g_hostdb.m_dir,
getLanguageAbbr(m_lang),getLanguageAbbr(m_lang), i );
fdr = fopen ( ff , "r" );
if ( ! fdr )
return log(
"lang: Failed to open %s for reading: "
"%s.",ff, strerror(errno));
// and one for writing out score/word pairs
sprintf ( ff , "%stmp/%s/%s.words.%"INT32".scored", g_hostdb.m_dir,
getLanguageAbbr(m_lang),getLanguageAbbr(m_lang), i );
fdw = fopen ( ff , "w" );
if ( ! fdw )
return log(
"lang: Failed to open %s for writing: "
"%s.",ff, strerror(errno));
lastw[0] = '\0';
// read in each line
while ( fgets ( pbuf , 1024 , fdr ) ) {
char *p = pbuf;
// don't include terminating \n in the length
int32_t plen = gbstrlen(p) - 1;
// NULL terminate it to take off ending * and/or \n
p [plen] = '\0';
// get score
int32_t score = atoi(p);
// advance p over score and separating space
while ( isdigit(*p) ) p++;
p++;
// skip '*'
//if ( *p == '*' ) { gotit = true ; p++; }
//else gotit = false;
// debug point
//if ( strcmp ( p , "a wide variety of topics" )==0)
// log("got it");
// does the new chunk match the last one?
int32_t n;
for ( n = 0 ; p[n] &&
to_lower_a(p[n]) ==
to_lower_a(lastw[n]); n++ );
			// cancel the match unless all of p was consumed and
			// lastw breaks on a word boundary there
if ( p[n] ) n = 0;
if ( is_alnum(lastw[n]) ) n = 0;
			// if it is a sub-phrase match, subtract the longer
			// phrase's score so we don't leech points from it
if ( n > 0 ) score -= oldscore;
// if our score is now too low, don't add ourselves
if ( score < MIN_DOCS ) continue;
// . save it to disk
// . this puts the asterisk back at the end of the
// word for easier reading
//if ( gotit) fprintf(fdw,"%05"INT32" %s*\n",score,p);
//else fprintf(fdw,"%05"INT32" %s\n" ,score,p);
fprintf(fdw,"%05"INT32"\t%s\n" ,score,p);
// store as last
oldscore = score;
strncpy ( lastw , p , 1010 );
}
fclose ( fdr );
fclose ( fdw );
// sort the score file and output to dict.%"INT32"
char bb[1024];
sprintf( bb,
"sort -f -r %stmp/%s/%s.words.%"INT32".scored | "
"head -%"INT32" > %sdict.new/%s/%s.dict.%"INT32"",
g_hostdb.m_dir, getLanguageAbbr(m_lang),
getLanguageAbbr(m_lang), i, maxWordsPerFile,
g_hostdb.m_dir, getLanguageAbbr(m_lang),
getLanguageAbbr(m_lang), i );
log(LOG_INIT,"lang: %s",bb);
gbsystem ( bb );
// make the phonets for it too
//sprintf(bb,"%sdict.new/dict.%"INT32"",g_hostdb.m_dir,i);
//makePhonet ( bb );
}
return true;
}
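// The second pass above keeps a long phrase from inflating its own
// prefixes: the input is reverse-sorted, so a hypothetical "00120 new york
// city" line is seen before "00125 new york", and the shorter phrase is
// charged for the longer one, leaving it only 5 points of its own (and it
// is dropped if that falls under MIN_DOCS). The prefix-on-a-word-boundary
// test as a standalone sketch (illustrative only; the names and scores
// above are made up):
#if 0
static bool isWordBoundaryPrefix ( char *shorter , char *longer ) {
	int32_t n = 0;
	while ( shorter[n] &&
		to_lower_a(shorter[n]) == to_lower_a(longer[n]) ) n++;
	// all of "shorter" must be consumed and "longer" must break on a
	// word boundary right there
	return ( shorter[n] == '\0' && ! is_alnum(longer[n]) );
}
#endif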
// Get the queries from the http query requests and use them as phrases
bool Language::makeQueryFiles ( ) {
char buf [1024*10];
for ( int32_t i = 1; i < 2; i++ ){
//fdr = fopen ( "dict/queries.mamma","r" );
char fx[1024];
sprintf( fx,"%sdict/queries.mamma%"INT32"",g_hostdb.m_dir, i );
FILE *fdr = fopen ( fx,"r" );
if ( ! fdr ) {
return log("lang: Could not open query file for "
"reading: %s.",strerror(errno));
}
// open for writing
char ff[1024];
sprintf ( ff , "%stmp/dict.queries.%"INT32"", g_hostdb.m_dir, i );
// delete it first
unlink ( ff );
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
				 getFileCreationFlags() );
				 // S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
			return log("lang: Could not open %s for "
"writing: %s.",ff, strerror(errno));
}
Url u;
Query q;
while ( fgets ( buf , 1024 * 10, fdr ) ) {
buf[1024 * 10 - 1] = '\0';
// length of word(s), including the terminating \n
int32_t wlen = gbstrlen(buf) ;
// skip if empty
if ( wlen <= 0 ) continue;
buf[wlen-1]='\0';
u.set(buf,gbstrlen(buf));
HttpRequest r1,r2;
bool status = r1.set ( &u ) ;
if ( !status )
continue;
r2.set( r1.getRequest(), r1.getRequestLen(), NULL );
char frag[1024];
int32_t flen;
char *query = r2.getString( "uip",&flen );
gbmemcpy ( frag, query, flen );
frag[flen++] = '\t';
int32_t queryLen;
query = r2.getString( "q",&queryLen );
q.set(query, queryLen, NULL, 0, true);
// don't use truncated queries
if ( q.m_truncated )
continue;
if ( q.m_isBoolean )
continue;
int32_t nqw = q.m_numWords;
for ( int32_t i = 0 ; i < nqw ; i++ ) {
int32_t fragLen = flen;
// get a word in the Query to start a fragment
// with
QueryWord *qw = &q.m_qwords[i];
// can he start the phrase?
bool canStart = true;
if (!qw->isAlphaWord())
canStart = false;
// MDW: wtf is this?
//UCScript script = qw->wordScript();
//if ((script != ucScriptCommon) &&
// (script != ucScriptLatin))
// canStart = false;
if ( qw->m_ignoreWord &&
qw->m_ignoreWord != IGNORE_CONNECTED &&
qw->m_ignoreWord != IGNORE_QUOTED )
canStart = false;
// if he can't start our fragment,
// just copy over to "dst"
if ( ! canStart ) {
continue;
}
bool inQuotes = qw->m_inQuotes;
char fieldCode = qw->m_fieldCode;
// . get longest continual fragment that
// . starts with word #i. get the following
// words that can be in a fragment
// that starts with word #i start of the frag
char *p = qw->m_word;
int32_t plen = 0;
int32_t lastLen = 0;
for ( ; i < nqw ; i++ ) {
// . skip if we should
// . keep punct, however
QueryWord *qw = &q.m_qwords[i];
if ( qw->m_opcode )
break;
if ( qw->m_inQuotes != inQuotes )
break;
if ( qw->m_fieldCode != fieldCode )
break;
// are we punct?
lastLen = 0;
if ( is_alnum_utf8 ( qw->m_word ) )
lastLen=plen;
// inc the ptr
plen += qw->m_wordLen;
}
// revisit this i in big loop since we did not
// include it
i--;
// if last thing we added was punct, roll back
// over it
if ( lastLen ) { plen = lastLen; i--; }
bool lastPunct = false;
char *pend = p + plen;
for ( ; p < pend ; p += getUtf8CharSize(p) ) {
//skip anything but latin-1
//if (c > 255) continue;
if ( getUtf8CharSize(p) != 1) continue;
// only works on a single character
if ( ! to_dict_char ( *p ) )
continue;
// skip back to back punct/spaces
if ( ! is_alnum_utf8(p) && lastPunct )
continue;
if ( ! is_alnum_utf8(p) )
lastPunct = true;
else
lastPunct=false;
				// check for a breach of the frag buffer
if ( fragLen+4>=1023) {
break;
g_errno = EBUFTOOSMALL;
return false; }
// language phrases are looking
// for latin-1
char cs = getUtf8CharSize(p);
if ( cs == 1 ) {
frag[fragLen++] = *p;
continue;
}
// otherwise, more than 1 byte char
gbmemcpy(frag+fragLen,p,cs);
fragLen += cs;
}
// if any part of the phrase has a mispelling,
// discard the query
if ( hasMispelling( &frag[flen],
fragLen - flen) ){
break;
}
frag[fragLen++] = '\n';
frag[fragLen] = '\0';
// write out the trailing \n as well
int32_t wn = write ( fdw, frag, fragLen ) ;
if ( wn != fragLen )
return log("spell: makeWordFiles: "
"write: %s",
strerror(errno));
// break here so that we only print one phrase
// per query
break;
}
}
fclose (fdr);
close (fdw);
// each ip can only vote once for a particular query.
// Each ip vote counts as one popular vote
//char cmd[2048];
	// sort, then uniquify so that each ip can have only 1 occurrence
	// of each phrase. Then awk to get just the phrase.
	// Then sort again and uniquify with a count and remove single
	// occurrence phrases. Then sort on the count to get the most
	// common phrases on top.
//sprintf( cmd, "sort -f %s | uniq -i | "
//"awk -F \'\\t\' \'{print $2}\' "
//"| sort -f | uniq -i -c -d | sort -g -r -k 1,1 "
//"> %s.uniq.sorted", ff, ff );
//log ( LOG_INIT,"lang: %s", cmd );
//gbsystem(cmd);
}
return true;
}
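// makeQueryFiles() rebuilds each logged request line into a Url, re-parses
// it with HttpRequest and pulls the "uip" and "q" cgi parameters back out
// before fragmenting the query. That round trip as a sketch (illustrative
// only; the sample URL is made up and error handling is omitted):
#if 0
static void queryLineExample ( ) {
	char *line = (char *)"http://host/search?uip=1.2.3.4&q=santa+fe+hotels";
	Url u;
	u.set ( line , gbstrlen(line) );
	HttpRequest r1 , r2;
	if ( ! r1.set ( &u ) ) return;
	r2.set ( r1.getRequest() , r1.getRequestLen() , NULL );
	int32_t qlen;
	char *qstr = r2.getString ( "q"   , &qlen ); // the query text
	int32_t ulen;
	char *uip  = r2.getString ( "uip" , &ulen ); // the client ip field
}
#endif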
// Make a list of the wikipedia titles of docs found by the query
// "site:xx.wikipedia.org", where xx is the abbr of the language.
// Store in xx.wiki
bool Language::makeWikiFiles( ) {
// open for writing
char ff[1024];
sprintf ( ff , "%sdict/%s/%s.wiki", g_hostdb.m_dir,
getLanguageAbbr(m_lang), getLanguageAbbr(m_lang) );
// delete it first
unlink ( ff );
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
			 getFileCreationFlags() );
			 // S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
		log("lang: Could not open %s for "
"writing: %s.",ff, strerror(errno));
return true;
}
// make a state
StateWik *st ;
try { st = new (StateWik); }
catch ( ... ) {
g_errno = ENOMEM;
log("Lang: new(%i): %s", sizeof(StateWik),
mstrerror(g_errno));
return false;
}
mnew ( st , sizeof(StateWik) , "LanguageWik" );
st->m_fdw = fdw;
char query [MAX_QUERY_LEN];
sprintf(query,"site:%s.wikipedia.org",getLanguageAbbr(m_lang));
st->m_coll = g_conf.m_defaultColl;
st->m_collLen = gbstrlen(st->m_coll);
// . a boolFlag of 0 means query is not boolean
st->m_q.set ( query, gbstrlen(query), st->m_coll, st->m_collLen,
0 ); // boolFlag
st->m_termId = st->m_q.getTermId(0);
st->m_startKey = g_indexdb.makeStartKey ( st->m_termId );
st->m_endKey = g_indexdb.makeEndKey ( st->m_termId );
st->m_minRecSize = 500 * 1024;
if ( !st->getIndexList( ) )
return false;
return st->getSummary();
}
bool StateWik::getIndexList( ) {
// get the rdb ptr to titledb's rdb
//Rdb *rdb = g_indexdb.getRdb();
// -1 means read from all files in Indexdb
// get the title rec at or after this docId
if ( ! m_msg0.getList ( -1 ,
0 ,
0 ,
0 , // max cache age
false , // add to cache?
RDB_INDEXDB , // rdbId of 2 = indexdb
m_coll ,
&m_list ,
m_startKey ,
m_endKey ,
m_minRecSize, // recSizes
//st->m_useTree , // include tree?
//st->m_useCache , // include cache?
//false , // add to cache?
//0 , // startFileNum
//numFiles , // numFiles
this , // state
gotIndexListWrapper ,
0 ) ) // niceness
return false;
return getSummary( );
}
void gotIndexListWrapper( void *state , RdbList *list ){
StateWik *st = (StateWik *) state;
list->resetListPtr();
st->getSummary();
return;
}
bool StateWik::getSummary( ){
m_numMsg20sOutstanding = 0;
m_numMsg20sReceived = 0;
int32_t numLaunched = 0;
// launch MAX_FRAG_SIZE msg20's at a time, wait for all of them
while ( numLaunched < MAX_FRAG_SIZE && !m_list.isExhausted() ){
int64_t docId = m_list.getCurrentDocId () ;
// set the summary request then get it!
Msg20Request req;
Query *q = &m_q;
//int32_t nt = q->m_numTerms;
req.ptr_qbuf = q->getQuery();
req.size_qbuf = q->getQueryLen()+1;
req.ptr_coll = m_coll;
req.size_coll = m_collLen+1;
req.m_docId = docId;
req.m_numSummaryLines = 3;
req.m_maxCacheAge = g_conf.m_indexdbMaxIndexListAge;
req.m_wcache = true; // addToCache
req.m_state = this;
req.m_callback = gotSummaryWrapper;
req.m_niceness = 0;
req.m_expected = true;
req.m_boolFlag = q->m_isBoolean; // 2 means auto?
req.m_allowPunctInPhrase = true;
req.m_showBanned = false;
if ( ! m_msg20s[numLaunched].getSummary ( &req ) )
m_numMsg20sOutstanding++;
#ifdef _OLDMSG20_
if ( !m_msg20s[numLaunched].
getSummary(&m_q,
NULL,
NULL,
docId,
-1, //clusterLevel
3,//numLinesInSummary,
g_conf.m_indexdbMaxIndexListAge,
1 , //addToCache
m_coll ,
m_collLen ,
this ,
gotSummaryWrapper ,
0 ,// niceness
//m_sequentialTitledbLookup,
false ,// titledb restrict?
NULL,//m_si->m_displayMetas ,
0,//m_si->m_displayMetasLen ,
0,//bigSampleRadius ,
0,//bigSampleMaxLen ,
true,//m_si->m_isMasterAdmin ,
true , //requireallterms
false , //count links
0,
NULL, //url
false, //just get link info
false,//considerTitlesFromBody
true,// usenewsummaries
0,
NULL, //link info
NULL, //hostdb
true,//expect 2b there?
NULL,
0,
0,
true,//getvectorrec
false,//deduping
true,// allowPunctinPhrase
false,//showbanned
false,//excludeLinkText,
false,//hackFixWords,
false,//hackFixPhrases,
0,//includeCachedCopy
false))// justgetlinkquality
m_numMsg20sOutstanding++;
#endif
m_list.skipCurrentRecord();
numLaunched++;
}
m_numMsg20sLaunched = numLaunched;
if ( m_numMsg20sOutstanding > 0 )
return false;
gotSummaryWrapper( this );
return false;
}
void gotSummaryWrapper ( void *state ){
StateWik *st = (StateWik *) state;
st->m_numMsg20sReceived++;
if ( !st->m_list.isExhausted() &&
st->m_numMsg20sLaunched < MAX_FRAG_SIZE )
return;
if ( st->m_numMsg20sReceived < st->m_numMsg20sOutstanding )
return;
if ( !st->gotSummary( ) )
return;
return;
}
bool StateWik::gotSummary ( ){
for ( int32_t i = 0; i < m_numMsg20sLaunched; i++ ){
if ( m_msg20s[i].m_errno )
continue;
char frag[MAX_FRAG_SIZE];
int32_t flen = 0;
strcpy(frag, m_msg20s[i].getTitle());
flen = gbstrlen(frag);
//log ( LOG_WARN,"lang: Got url %s with title %s",
// m_msg20s[i].getUrl(),
// m_msg20s[i].getTitle() );
// check for two or more consecutive puncts
bool lastPunct = false;
bool skip = false;
char *p = frag;
char *pend = frag + flen;
for ( ; p < pend ; p += getUtf8CharSize(p) ) {
if ( lastPunct && !is_alnum_utf8(p) ){
skip = true;
break;
}
if ( !is_alnum_utf8 ( p ) )
lastPunct = true;
}
if ( skip )
continue;
		// count the characters that are not alphabetic so titles
		// with no alphabetic characters at all can be skipped below
		int32_t numAlphas = 0;
		// another loop over the title
p = frag;
for ( ; p < pend ; p += getUtf8CharSize(p) ) {
if ( !is_alpha_utf8 ( p ) )
numAlphas++;
}
if ( numAlphas >= flen )
continue;
frag[flen++] = '\n';
frag[flen] = '\0';
//log ( LOG_WARN,"lang: Got url %s with title %s",
// m_msg20s[i].getUrl(),frag );
// write out the trailing \n as well
int32_t wn = write ( m_fdw, frag, flen ) ;
if ( wn != flen )
continue;
}
// see if u can launch more
if ( !m_list.isExhausted() )
return getSummary();
// see if the termlist is over
if ( m_list.getListSize() >= m_minRecSize ){
// see if u can get some more of the list.
m_startKey = *(key_t *)m_list.getLastKey();
m_startKey += (uint32_t) 1;
// watch out for wrap around
if ( m_startKey >= *(key_t *)m_list.getLastKey() )
return getIndexList();
}
// close the file
close(m_fdw);
return true;
}
// Generates the phonetics of the words of the dictionary.
// Finds each term's frequency and stores it as the popularity after
// normalizing it (see gotTermFreqs below).
bool Language::makeDict(){
StateDict *st ;
try { st = new (StateDict); }
catch ( ... ) {
g_errno = ENOMEM;
log("Lang: new(%i): %s", sizeof(StateDict),
mstrerror(g_errno));
return true;
}
mnew ( st , sizeof(StateDict) , "StateDict" );
m_stateDict = st;
char ff[1024];
sprintf(ff,"%sdict/%s/%s.wl", g_hostdb.m_dir,
getLanguageAbbr(m_lang), getLanguageAbbr(m_lang));
File f;
f.set (ff);
// open file
if ( ! f.open ( O_RDONLY ) ) {
log("lang: open: %s",mstrerror(g_errno));
return true;
}
// TODO : CHANGE THIS TO USE fgets
// get file size
int32_t fileSize = f.getFileSize() ;
// store a \0 at the end
st->m_dictBufSize = fileSize + 1;
// make buffer to hold all
st->m_dictBuf = (char *) mmalloc ( st->m_dictBufSize ,
"LanguageWordsBuf" );
if ( ! st->m_dictBuf) {
log("lang: mmalloc: %s",mstrerror(errno));return false;
}
// read em all in
if ( ! f.read ( st->m_dictBuf , fileSize , 0 ) ) {
log("lang: read: %s", mstrerror(g_errno));
return true;
}
// change \n to \0
st->m_numTuples = 0;
for ( int32_t i = 0 ; i < st->m_dictBufSize ; i++ ) {
if ( st->m_dictBuf[i] != '\n' ) continue;
st->m_dictBuf[i] = '\0';
st->m_numTuples++;
}
f.close();
// log a msg
log(LOG_INIT,"lang: read %"INT32" words into memory", st->m_numTuples );
// alloc space to make them into termids
st->m_bufSize = st->m_numTuples * ( sizeof (char*) +
2 * sizeof (int64_t) );
st->m_buf = (char *) mmalloc ( st->m_bufSize, "LanguagePtrs" );
if ( !st->m_buf ) {
log ( LOG_WARN,"lang: could not alloc %"INT32" bytes",
st->m_bufSize );
g_errno = ENOMEM;
return true;
}
char *p = st->m_buf;
st->m_wordsPtr = (char **) p;
p += st->m_numTuples * sizeof(char *);
st->m_termIds = (int64_t *)p;
p += st->m_numTuples * sizeof(int64_t);
st->m_termFreqs = (int64_t *)p;
p += st->m_numTuples * sizeof(int64_t);
char *coll = g_conf.m_defaultColl;
int32_t collLen = gbstrlen(coll);
p = st->m_dictBuf;
for ( int32_t i = 0; i < st->m_numTuples; i++ ){
st->m_wordsPtr[i] = p;
p += gbstrlen(p) + 1;
int32_t wordLen = gbstrlen(st->m_wordsPtr[i]);
// . set query class
// . a boolFlag of 0 means query is not boolean
Query q;
q.set ( st->m_wordsPtr[i], wordLen , coll , collLen , 0 );
st->m_termIds[i] = q.getTermId(0);
st->m_termFreqs[i] = 0;
}
if ( !st->m_msg37.getTermFreqs ( coll ,
0 , // maxAge
st->m_termIds ,
st->m_numTuples ,
st->m_termFreqs ,
this ,
gotTermFreqsWrapper,
0 , // niceness
false ))// exact count?
return false;
gotTermFreqsWrapper(this);
return true;
}
void gotTermFreqsWrapper(void *state){
Language *lang = (Language *) state;
lang->gotTermFreqs(lang->m_stateDict);
}
bool Language::gotTermFreqs( StateDict *st ){
int fd;
char ff[1024];
sprintf ( ff , "%sdict/%s/%s.wl.phonet",g_hostdb.m_dir,
getLanguageAbbr(m_lang), getLanguageAbbr(m_lang));
// delete it first
unlink ( ff );
// then open a new one for appending
fd = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
		    getFileCreationFlags() );
		    // S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fd < 0 ){
log("lang: Could not open %s for writing: "
"%s.",ff, strerror(errno));
st->m_numTuples = 0;
}
int64_t max = 0LL;
for ( int32_t i = 0; i < st->m_numTuples; i++ ){
if ( st->m_termFreqs[i] > max )
max = st->m_termFreqs[i];
}
char cleanWord[MAX_PHRASE_LEN];
char phonetic[MAX_PHRASE_LEN];
int32_t wordLen = 0;
char tmp[1024];
for ( int32_t i = 0; i < st->m_numTuples; i++ ){
wordLen = gbstrlen(st->m_wordsPtr[i]);
// clean the word, i.e. convert word to uppercase and
// remove possible accents
makeClean( st->m_wordsPtr[i], wordLen,
cleanWord, MAX_PHRASE_LEN );
getPhonetic ( cleanWord, gbstrlen(cleanWord),
phonetic, MAX_PHRASE_LEN );
int64_t freq = ( st->m_termFreqs[i] * 32000 ) / max ;
sprintf(tmp,"%"INT64"\t%s\t%s\n", freq,
st->m_wordsPtr[i], phonetic);
uint32_t wn = write ( fd , tmp , gbstrlen(tmp) ) ;
if ( wn != gbstrlen(tmp) ){
log("lang: makeWordFiles: write: %s",
strerror(errno));
break;
}
}
close(fd);
mfree ( st->m_dictBuf, st->m_dictBufSize,"LanguageDictBuf" );
mfree ( st->m_buf, st->m_bufSize,"LanguageBuf");
mdelete(st,sizeof(StateDict),"StateDict");
delete(st);
return true;
}
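// The popularity written above is just the term frequency rescaled so that
// the most frequent dictionary entry lands on 32000. The scaling as a
// sketch with made-up numbers: a term seen 50,000 times when the top term
// is seen 2,000,000 times gets a popularity of 50000*32000/2000000 = 800.
#if 0
static int64_t normalizePop ( int64_t termFreq , int64_t maxFreq ) {
	if ( maxFreq <= 0 ) return 0;
	return ( termFreq * 32000 ) / maxFreq;
}
#endif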
#if 0
bool Language::makeAffinities(){
// make a state
StateAff *st ;
try { st = new (StateAff); }
catch ( ... ) {
g_errno = ENOMEM;
log("Lang: new(%i): %s", sizeof(StateAff),
mstrerror(g_errno));
return false;
}
mnew ( st , sizeof(StateAff) , "LanguageAffinity" );
st->m_fileNum = 12;
// blocked
if ( !openAffinityFile(st) )
return false;
return st->doneAffinities(st);
}
bool StateAff::openAffinityFile( ){
if ( m_fileNum >= NUM_CHARS )
return true;
// open for reading
char ff[1024];
sprintf ( ff , "%sdict/dict.%"INT32"", g_hostdb.m_dir, m_fileNum );
m_fdr = fopen ( ff, "r" );
if ( !m_fdr ) {
log("lang: test: Could not open %s for "
"reading: %s.", ff,strerror(errno));
return true;
}
// open for writing
sprintf ( ff , "%sdict.new/dict.%"INT32".aff", g_hostdb.m_dir,
m_fileNum );
// delete it first
unlink ( ff );
// then open a new one for appending
m_fdw = open ( ff , O_CREAT | O_RDWR | O_APPEND ,
		       getFileCreationFlags() );
		       // S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( m_fdw < 0 ){
		log("lang: Could not open %s for "
"writing: %s.",ff, strerror(errno));
return true;
}
if ( !launchAffinity(st) ){
return false;
}
m_fileNum++;
return openAffinityFile(st);
}
bool Language::launchAffinity(StateAff *st){
//char dst[1026];
// go through the words in dict/words
while ( fgets ( m_buf , MAX_FRAG_SIZE , m_fdr ) ){
// length of word(s), including the terminating \n
int32_t wlen = gbstrlen(m_buf) ;
// skip if empty
if ( wlen <= 0 )
return launchAffinity(st);
m_buf[wlen-1]='\0';
		// skip to the phrase. titlerec dict files use a space as the separator
char *p = m_buf;
while ( *p != ' ' )
p++;
p++;
char *coll = g_conf.m_defaultColl;
int32_t collLen = gbstrlen(coll);
// . set query class
// . a boolFlag of 0 means query is not boolean
int32_t numTerms = 0;
Query *q = &m_q;
if ( q->set ( p, gbstrlen(p), coll, collLen, 0 ) )
numTerms = q->getNumTerms();
// no use doing affinities on 1 word phrases
if ( numTerms <= 1 ){
char dst[1096];
sprintf( dst, "00000\t%s\n", m_buf );
log("%s",dst);
uint32_t wn = write(m_fdw, dst, gbstrlen(dst));
if ( wn != gbstrlen(dst) )
log("lang: genTopPopFile: write: %s",
strerror(errno));
continue;
}
m_msg3a.reset();
if ( !m_msg3a.
getDocIds( q ,
coll ,
collLen ,
100.0 ,
g_conf.m_indexdbMaxIndexListAge,
true ,
0 ,//stage0
30,
0 ,
this,
gotAffinityFreqs1Wrapper ) )
return false;
return gotAffinityFreqs1(st);
}
fclose(m_fdr);
close(m_fdw);
return true;
}
void gotAffinityFreqs1Wrapper(void *state){
StateAff *st = (StateAff *) state;
st->gotAffinityFreqs1(st);
return;
}
bool StateAff::gotAffinityFreqs1( ){
m_denominator = m_msg3a.getNumTotalHits();
// now get the phrase hits
char *p = m_buf;
while ( *p != ' ' )
p++;
// change the space to a quote
*p = '\"';
//go to the end
while ( *p != '\0' )
p++;
//change that to quote
*p = '\"';
p++;
// null end
*p = '\0';
p = m_buf;
while ( *p != '\"')
p++;
char *coll = g_conf.m_defaultColl;
int32_t collLen = gbstrlen(coll);
// . set query class
// . a boolFlag of 0 means query is not boolean
Query *q = &m_q;
q->set ( p, gbstrlen(p), coll, collLen, 0 );
m_msg3a.reset();
if ( !m_msg3a.
getDocIds( q ,
coll ,
collLen ,
100.0 ,
g_conf.m_indexdbMaxIndexListAge,
true ,
0 ,//stage0
30,
0 ,
this ,
gotAffinityFreqs2Wrapper ) )
return false;
return gotAffinityFreqs2(st);
}
void gotAffinityFreqs2Wrapper(void *state){
StateAff *st = (StateAff *) state;
st->gotAffinityFreqs2(st);
return;
}
bool StateAff::gotAffinityFreqs2(StateAff *st){
m_numerator = m_msg3a.getNumTotalHits();
double affinity = 0;
if ( m_denominator > 0 )
affinity = (double)m_numerator / (double)m_denominator;
affinity *= 10000;
char dst[1096];
sprintf( dst, "%05.0f\t%s\n", affinity, m_buf );
log("num=%"INT64", denom=%"INT64", %s",m_numerator,m_denominator,dst);
uint32_t wn = write ( m_fdw , dst , gbstrlen(dst) ) ;
if ( wn != gbstrlen(dst) )
log("lang: genTopPopFile: write: %s",strerror(errno));
//blocked
if ( !launchAffinity(st) )
return false;
// didn't block means the file ended
m_fileNum++;
if ( !openAffinityFile(st) )
return false;
return doneAffinities(st);
}
bool StateAff::doneAffinities(StateAff *st){
mdelete(st,sizeof(StateAff), "StateAff");
delete(st);
return true;
}
#endif
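// The affinity figure computed in the block above is the share of a
// phrase's unquoted-query hits that also match the exact quoted phrase,
// scaled by 10000 for the %05.0f print. A sketch with made-up counts:
// 1,200 quoted hits out of 20,000 unquoted hits gives 0.06, printed as
// 00600.
#if 0
static double affinitySketch ( int64_t phraseHits , int64_t looseHits ) {
	if ( looseHits <= 0 ) return 0.0;
	return 10000.0 * (double)phraseHits / (double)looseHits;
}
#endif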
///////////////////////////////////////////////////////
// DICTIONARY MANIPULATION ROUTINES BELOW HERE
//
///////////////////////////////////////////////////////
// Clean the query dict file of misspellings.
// NOTE: This function only checks each word of the phrase against the most
// commonly misspelled words list, which lives in the file mispelled_words.
// For real spellchecking, use spellcheckDict().
// NOTE: Whenever you use these functions, please check that the infile,
// outfile and the text format are correct
bool Language::cleanDictFile ( ) {
char buf [1024*10];
char fx[1024];
sprintf( fx,"%sdict/%s/%s.query.phonet",g_hostdb.m_dir,
getLanguageAbbr(m_lang),getLanguageAbbr(m_lang) );
FILE *fdr = fopen ( fx,"r" );
if ( ! fdr ) {
return log("lang: Could not open query file for "
"reading: %s.",strerror(errno));
}
// open for writing
char ff[1024];
sprintf ( ff , "%stmp/query.phonet.clean", g_hostdb.m_dir );
// delete it first
unlink ( ff );
// then open a new one for appending
int fdw = open ( ff ,
O_CREAT | O_RDWR | O_APPEND ,
			 getFileCreationFlags() );
			 // S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 ){
		return log("lang: Could not open %s for "
"writing: %s.",ff, strerror(errno));
}
while ( fgets ( buf , 1024 * 10, fdr ) ) {
buf[1024 * 10 - 1] = '\0';
// length of word(s), including the terminating \n
int32_t wlen = gbstrlen(buf) ;
// skip if empty
if ( wlen <= 0 ) continue;
//buf[wlen-1]='\0';
char *p = buf;
while ( *p != '\t' )
p++;
p++;
char *str = p;
while ( *p != '\t' )
p++;
if ( hasMispelling(str, p - str) )
continue;
// write out the trailing \n as well
int32_t wn = write ( fdw, buf, wlen ) ;
if ( wn != wlen )
return log("spell: makeWordFiles: "
"write: %s",
strerror(errno));
// break here so that we only print one phrase
// per query
}
return true;
}
// opens each file and creates the (score, word, phonet) tuple and stores
// in phonet file. Normalizes scores to a high score of 32000. Also removes
// tuples for which there are no phonets and tuples that are adult.
// The incoming file is supposed to be a tuple of (score, word)
bool Language::makePhonet( char *infile){
loadRules();
// create the output file
int fdw;
char outfile[1024];
sprintf ( outfile , "%s.phonet", infile);
// delete it first
unlink ( outfile );
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
		     getFileCreationFlags() );
		     // S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));
char buf[1024];
int32_t max = 0;
// open the input file
FILE *fdr;
// then open
fdr = fopen ( infile, "r" );
if ( !fdr )
		return log("lang: Could not open %s for reading: "
			   "%s.", infile, strerror(errno));
// this loop goes through all the tuples and finds max score
while ( fgets ( buf , 1024 , fdr ) ) {
int32_t wlen = gbstrlen(buf);
if ( wlen <= 0 || wlen > MAX_PHRASE_LEN )
continue;
// remove the newline \n
buf [wlen - 1] = '\0';
char *p = buf;
while ( *p == ' ' )
p++;
// first is the popularity score
if ( atoi (p) > max )
max = atoi(p);
}
// close
fclose(fdr);
// then open
fdr = fopen ( infile, "r" );
if ( !fdr )
		return log("lang: Could not open %s for reading: "
			   "%s.", infile, strerror(errno));
char *scorePtr;
char *wordPtr;
char cleanWord[MAX_PHRASE_LEN];
char phonetic[MAX_PHRASE_LEN];
int32_t wordLen = 0;
char tmp[1024];
// this loop goes through all the tuples and only adds those
// tuples into the phonetic dict that have phonets. Normalizes scores.
while ( fgets ( buf , 1024 , fdr ) ) {
int32_t wlen = gbstrlen(buf);
if ( wlen <= 0 || wlen > MAX_PHRASE_LEN )
continue;
// remove the newline \n
buf [wlen - 1] = '\0';
char *p = buf;
while ( *p == ' ' )
p++;
// first is the popularity score
scorePtr = p;
int64_t score = (int64_t ) atoi(scorePtr);
// normalize score
score = ( score * 32000 )/ max;
// skip it
while ( *p != '\t' )
p++;
// null end it
*p = '\0';
p++;
wordPtr = p;
wordLen = gbstrlen( wordPtr );
		// make all the letters lower case
to_lower1(p);
// clean the word, i.e. convert word to uppercase and
// remove possible accents
if (!makeClean(wordPtr, wordLen, cleanWord, MAX_PHRASE_LEN)){
log ( "removed unclean phrase %s", p );
continue;
}
if ( !getPhonetic ( cleanWord, gbstrlen(cleanWord), phonetic,
MAX_PHRASE_LEN ) ){
log ( "could not get phonetic of phrase %s", p );
continue;
}
if ( gbstrlen(phonetic) == 0 ){
log ( "got 0 len phonetic of phrase %s", p );
continue;
}
sprintf(tmp,"%"INT64"\t%s\t%s\n",score, wordPtr, phonetic);
uint32_t wn = write ( fdw , tmp , gbstrlen(tmp) ) ;
if ( wn != gbstrlen(tmp) )
return log("lang: makePopPhonet: write: "
"%s",strerror(errno));
}
close(fdw);
fclose(fdr);
// all done
return true;
}
bool Language::genTopPopFile ( char *infile ){
// open the input file
FILE *fdr;
// then open
fdr = fopen ( infile, "r" );
if ( !fdr )
return log("lang: Could not open %s for reading: "
"%s.", infile, strerror(errno));
// create the output file
int fdw;
char outfile[1024];
sprintf ( outfile , "%s.top", infile );
// delete it first
unlink ( outfile );
// then open a new one for appending
	fdw = open ( outfile ,
		     O_CREAT | O_RDWR | O_APPEND ,
		     getFileCreationFlags() );
	// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));
char buf[1024];
int32_t count = 0;
	// this loop copies only the first TOP_POP_PHRASES lines of the
	// input file into the top pop file
while ( fgets ( buf , 1024 , fdr ) ) {
// put the first TOP_POP_PHRASES words
if ( count++ >= TOP_POP_PHRASES )
break;
int32_t wlen = gbstrlen(buf);
if ( wlen <= 0 || wlen > MAX_PHRASE_LEN )
continue;
uint32_t wn = write ( fdw , buf , gbstrlen(buf) ) ;
if ( wn != gbstrlen(buf) )
return log("lang: genTopPopFile: write: "
"%s",strerror(errno));
}
close(fdw);
fclose(fdr);
return true;
}
*/
// the distributed pop file is stored as a tuple of (phrase, phonet, lang, pop)
// to comply with the unified dict; see the partitioning sketch after this
// function
bool Language::genDistributedPopFile ( char *infile, uint32_t myHash ){
// open the input file
FILE *fdr;
// then open
fdr = fopen ( infile, "r" );
if ( !fdr )
return log("lang: Could not open %s for writing: "
"%s.", infile, strerror(errno));
// create the output file
int fdw;
char outfile[1024];
	sprintf ( outfile , "%s.%"UINT32"", infile, myHash );
// delete it first
unlink ( outfile );
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));
char buf[1024];
int32_t hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits;
hostsPerSplit /= g_hostdb.m_numHostsPerShard;
int32_t count = 0;
// this loop goes through all the words and only adds those
// tuples into the distributed file that belong to this host.
while ( fgets ( buf , 1024 , fdr ) ) {
		// skip the first TOP_POP_PHRASES phrases because they go
		// into the top pop file instead
if ( count++ < TOP_POP_PHRASES )
continue;
int32_t wlen = gbstrlen(buf);
if ( wlen <= 0 || wlen > MAX_PHRASE_LEN )
continue;
// remove the newline \n
buf [wlen - 1] = '\0';
char *p = buf;
char *pend = p + wlen - 1;
// first is the popularity score
char *score = p;
while ( *p != '\t' && p < pend )
p++;
// null end the score
*p = '\0';
p++;
// next is the phrase
char *phrase = p;
while ( *p != '\t' && p < pend )
p++;
p++;
// check if we're at the phonet
if ( p >= pend )
continue;
char *phonet = p;
uint64_t phonetKey = hash64Lower_utf8(phonet);
if ( phonetKey % hostsPerSplit != myHash )
continue;
char tmp[1024];
sprintf(tmp,"%s\t%s\n", phrase, score);
// put the \n in place of \0
//buf [wlen-1] = '\n';
uint32_t wn = write ( fdw , tmp , gbstrlen(tmp) ) ;
if ( (int32_t)wn != gbstrlen(tmp) )
return log("lang: genDistributedPop: write: "
"%s",strerror(errno));
}
close(fdw);
fclose(fdr);
return true;
}
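// A minimal sketch, never called, of the partitioning rule that
// genDistributedPopFile() applies above: an input (score, phrase, phonet)
// line stays on this host only when the hash of its phonet, taken modulo
// the number of hosts per index split, equals the slot id passed in as
// myHash. The function and parameter names here are local to this sketch.
static bool belongsToThisHost ( char *phonet ,
				int32_t hostsPerSplit ,
				uint32_t myHash ) {
	// a degenerate split size keeps everything local
	if ( hostsPerSplit <= 0 ) return true;
	// same hash the loop above computes on the phonet field
	uint64_t phonetKey = hash64Lower_utf8 ( phonet );
	// keep the tuple only if it lands in our slot
	return ( phonetKey % (uint64_t)hostsPerSplit ) == (uint64_t)myHash;
}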
// Heuristic code to spellcheck the dictionary: spellcheck each phrase in the
// pop words dictionary with forceReco on so that we always get a
// recommendation. A phrase is kicked out (logged and counted) when its
// recommendation is at least 4x as popular, unless the phrase is in
// wikipedia or the recommendation's score exceeds 99; everything else is
// written to the .spellcheck file. See the keep/kick sketch after this
// function.
int32_t Language::spellcheckDict(){
if ( !loadWikipediaWords() )
return 0;
char ff[1024];
sprintf ( ff , "%sdict/%s/%s.query.phonet", g_hostdb.m_dir,
getLanguageAbbr(m_lang), getLanguageAbbr(m_lang));
FILE *fd = fopen ( ff, "r" );
if ( ! fd ) {
log("lang: test: Could not open %s for "
"reading: %s.", "query.phonet",strerror(errno));
return 0;
}
// create the output file
int fdw;
char outfile[1024];
sprintf ( outfile , "%s.spellcheck", ff );
// delete it first
unlink ( outfile );
// then open a new one for appending
fdw = open ( outfile ,
O_CREAT | O_RDWR | O_APPEND ,
getFileCreationFlags() );
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
if ( fdw < 0 )
return log("lang: Could not open %s for writing: "
"%s.", outfile, strerror(errno));
HashTableT <int32_t,int32_t> kickedOutPhrases;
kickedOutPhrases.set(256);
int32_t notFound = 0;
char buf[1026];
//char dst[1026];
// go through the words in dict/words
while ( fgets ( buf , MAX_FRAG_SIZE , fd ) ) {
// length of word(s), including the terminating \n
int32_t wlen = gbstrlen(buf) ;
// skip if empty
if ( wlen <= 0 ) continue;
buf[wlen-1]='\0';
for ( int32_t j = 0; j < wlen; j++ )
if ( buf[j] == '\t')
buf[j] = '\0';
char *tuple = buf;
//skip score and go to phrase
tuple += gbstrlen(tuple) + 1;
char *word = tuple;
		// . make all the letters lower case
// . TODO: fix for utf8 words?
to_lower1_a(word);
// check for adult words
/*if ( isAdult (word) ){
log(LOG_WARN,"lang: kicking out adult phrase=%s",
word);
continue;
}*/
uint64_t h = hash64d ( word, gbstrlen(word));
bool isInWiki = false;
		// if the phrase is in wikipedia, it's safe
int32_t slot = m_wiki.getSlot(h);
if ( slot != -1 )
isInWiki = true;
int32_t wordPop = g_speller.getPhrasePopularity( word, h, false );
if ( wordPop == 0 ) {
slot = m_distributedPopPhrases.getSlot(h);
if ( slot != -1 ){
wordPop = m_distributedPopPhrases.
getValueFromSlot(slot);
}
}
bool isPhrase = false;
while ( *tuple != '\0' ){
if ( *tuple == ' ' )
isPhrase = true;
tuple++;
}
// point back to the phrase
tuple = word;
char recommendation[MAX_PHRASE_LEN];
bool found;
int32_t score;
int32_t pop;
/*
if ( !isPhrase && !isInWiki ){
			// just get the best narrow phrase we can find
int32_t numNarrow = 0;
char narrow[MAX_PHRASE_LEN];
int32_t narrowPop;
numNarrow = narrowPhrase ( word, narrow,
&narrowPop, 1 );
if ( numNarrow == 0 ){
log (LOG_WARN,"lang: no Narrow Searches "
"for %s",word);
continue;
}
word = narrow;
wordPop = narrowPop;
}
*/
bool reco = getRecommendation( word, gbstrlen(word),
recommendation, MAX_PHRASE_LEN,
&found, &score, &pop,
true );// forceReco
// if a kicked out phrase is the recommendation, then DON'T
// kick out this one too, because it probably means that the
// kicked out phrase was good. BUT should we put the kicked
// out phrase back ??
if ( reco && !isInWiki ){
int32_t h1 = hash32d ( recommendation,
gbstrlen(recommendation) );
slot = m_wiki.getSlot(h1);
// if the recommendation is in wiki, then double the
// pop of the recommendation
if ( slot != -1 && !isInWiki ){
log (LOG_WARN,"lang: recommendation=%s "
"is in the wiki. kicks out phrase %s",
recommendation, buf+gbstrlen(buf)+1);
pop *= 2;
}
slot = kickedOutPhrases.getSlot(h1);
if ( slot != -1 ){
log (LOG_WARN,"lang: recommendation has "
"already been kicked out, word=%s, "
"reco=%s",buf+gbstrlen(buf)+1,
recommendation );
reco = false;
}
}
		// keep the phrase if it is found in wikipedia OR
		// if no reco is found (even though it is a phrase) OR
		// if 4x the phrase popularity beats the reco popularity OR
		// if the reco score is greater than 99
if ( isInWiki || !reco || wordPop * 4 > pop || score > 99 ){
char tmp[MAX_FRAG_SIZE];
sprintf(tmp,"%s\t%s\t%s\n",buf, tuple,
tuple + gbstrlen(tuple) + 1);
uint32_t wn = write ( fdw , tmp , gbstrlen(tmp) );
if ( (int32_t)wn != gbstrlen(tmp) )
return log("spell: spellCheckDict: write: "
"%s",strerror(errno));
continue;
}
kickedOutPhrases.addKey(h,1);
log ( LOG_WARN,"lang: not found=%s, reco=%s, "
"score=%"INT32", wordPop=%"INT32", recoPop=%"INT32"",
buf + gbstrlen(buf) + 1, recommendation, score,
wordPop, pop );
notFound++;
}
close (fdw);
fclose(fd);
return notFound;
}
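// A minimal sketch, never called, of the keep/kick rule spellcheckDict()
// applies above. A phrase survives the spellcheck pass if it appears in
// wikipedia, if no recommendation came back, if 4x its own popularity still
// beats the recommendation's popularity, or if the recommendation's score
// exceeds 99; otherwise it gets kicked out of the dict. The function and
// parameter names here are local to this sketch.
static bool keepDictPhrase ( bool isInWiki ,
			     bool gotReco ,
			     int32_t wordPop ,
			     int32_t recoPop ,
			     int32_t recoScore ) {
	if ( isInWiki ) return true;
	if ( ! gotReco ) return true;
	// the reco must be at least 4x as popular to kick the phrase out
	if ( wordPop * 4 > recoPop ) return true;
	if ( recoScore > 99 ) return true;
	return false;
}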