#include "Language.h"
#include "sort.h"
#include "Speller.h"
#include "Sections.h"

// word/phrase must be in at least this many docs to be included in our dict
#define MIN_DOCS 3

// ROUTINES NEEDED FOR GBSORT
// The dict is stored as a tuple of ( original word, phonetic, (lang, score)..)

// Compare two dict tuples by their phonetic field, i.e. the \0-terminated
// string that immediately follows the phrase in the tuple buffer.
int cmpPhonet (const void *v1, const void *v2) {
	char *word1 = *(char **)v1;
	// phrase
	char *p1 = word1;
	// phonetic follows the phrase's terminating \0
	p1 += gbstrlen(p1) + 1;
	char *word2 = *(char **)v2;
	// phrase
	char *p2 = word2;
	// phonetic
	p2 += gbstrlen(p2) + 1;
	return strcmp(p1,p2);
}

// Compare two Recos by score, ascending.
// FIX: the old body was "return ( r1.score > r2.score );" which only ever
// returned 0 or 1. That is not a valid three-way comparator -- cmp(a,b) and
// cmp(b,a) could both be 0 for unequal scores -- so gbsort/qsort ordering
// was undefined. Sort direction (higher scores later) is preserved.
int cmpScores (const void *v1, const void *v2) {
	Reco r1 = *(Reco *) v1;
	Reco r2 = *(Reco *) v2;
	if ( r1.score > r2.score ) return  1;
	if ( r1.score < r2.score ) return -1;
	return 0;
}

// Compare two phrases front-to-back (plain strcmp on the phrase field).
int cmpFrnt (const void *v1, const void *v2) {
	// compare phrase
	char *p1 = *(char **) v1;
	char *p2 = *(char **) v2;
	return strcmp ( p1,p2 );
}

// Compare two phrases back-to-front (suffix order, used by the "narrow"
// phrase lookup).
// FIX: the old version decremented p1/p2 until it hit a '\0' BEFORE the
// string -- an out-of-bounds read whenever a string sits at the start of
// its buffer -- and returned -1 even for identical strings (asymmetric
// comparator). This version bounds the walk by the string starts and
// returns 0 on a tie; a string that is a strict back-suffix of the other
// still sorts first, as before.
int cmpBck (const void *v1, const void *v2) {
	char *s1 = *(char **) v1;
	char *s2 = *(char **) v2;
	// go to the end
	char *p1 = s1 + gbstrlen(s1) - 1;
	char *p2 = s2 + gbstrlen(s2) - 1;
	// string compare in reverse, never stepping before either string
	while ( p1 >= s1 && p2 >= s2 ) {
		if ( *p1 > *p2 ) return  1;
		if ( *p1 < *p2 ) return -1;
		p1--;
		p2--;
	}
	// both exhausted: identical back-to-front
	if ( p1 < s1 && p2 < s2 ) return 0;
	// the shorter string (a back-suffix of the other) sorts first
	if ( p1 < s1 ) return -1;
	return 1;
}

// maps a letter (a-z, by keyboard position) to its dict-space key code
static char s_keyMap[] = { 10, 24, 22, 12, 2, 13, 14, 15, 7, 16, 17, 18,
			   26, 25, 8, 9 , 0 , 3 , 11, 4, 6 , 23, 1 , 21,
			   5, 20 };
// qwerty layout, 3 rows of 10, \0-padded
static char s_keyboard[] = {'q' ,'w','e','r','t','y','u','i','o' ,'p' ,
			    'a' ,'s','d','f','g','h','j','k','l' ,'\0',
			    'z','x','c','v','b','n','m','\0','\0','\0'};

//static void gotSummaryWrapper ( void *state );
//static void gotIndexListWrapper( void *state , RdbList *list );
//static void gotTermFreqsWrapper( void *state );
/*static void gotAffinityFreqs1Wrapper(void *state);
static void gotAffinityFreqs2Wrapper(void *state);*/

// Constructor. Zeroes all owned buffers, installs the default aspell
// tuning parameters and calls reset() to initialize the derived state.
Language::Language(){
	m_rulesBuf = NULL;
	m_rulesBufSize = 0;
	m_rulesPtr = NULL;
	m_rulesPtrSize = 0;
	m_distributedBuf = NULL;
	m_distributedBufSize = 0;
	m_tuplePtr = NULL;
	m_tuplePtrSize = 0;
	m_narrowBuf = NULL;
	m_narrowBufSize = 0;
	m_numNarrowPtrs = 0;
	// Set to the default aspell parms
	m_editDistanceWeightsDel1 = 95;
	m_editDistanceWeightsDel2 = 95;
	m_editDistanceWeightsSwap = 90;
	m_editDistanceWeightsSub = 100;
	m_editDistanceWeightsSimilar = 10;
	m_editDistanceWeightsMin = 95;
	m_editDistanceWeightsMax = 100;
	m_soundslikeWeight = 15;
	m_wordWeight = 85;
	m_span = 50;
	// . set m_map
	// . this maps an ascii char to a char in dict space
	// . used in loadNarrow
	/*
	for ( long i = 0 ; i < 256 ; i++ ) {
		unsigned char d = to_upper_ascii(i);
		if ( is_alpha(d) ) {
			// some like char 254 aren't really ascii!!
			// so make them into Z's, a rare letter, which
			// probably isn't in the same alphabet as 222 and 254
			if ( d == 222 ) m_map[i] = 'Z' - 'A' + 12;
			else if ( d == 254 ) m_map[i] = 'Z' - 'A' + 12;
			else if ( d < 'A' ) m_map[i] = 38; // use apostrophes
			else if ( d > 'Z' ) m_map[i] = 38; // use apostrophes
			else m_map[i] = d - 'A' + 12;
			continue;
		}
		if ( is_digit(d) ) m_map[i] = d - '0' + 2;
		else if ( d == 0    ) m_map[i] = 0;
		else if ( d == '\'' ) m_map[i] = 38;
		else if ( d == '-'  ) m_map[i] = 39;
		else if ( d == '\n' ) m_map[i] = 0;
		else m_map[i] = 1; // a space
	}
	*/
	reset();
}

/*
bool Language::convertLatin1DictToUTF8( char *infile ){
	// open the file for reading
	FILE *fdr = fopen ( infile , "r" );
	if ( ! fdr ) return log( "lang: Failed to open %s for reading: "
				 "%s.",infile, strerror(errno) );
	char ff[1024];
	// open for writing
	sprintf ( ff , "%s.utf8", infile );
	// delete it first
	unlink ( ff );
	// then open a new one for appending
	int fdw = open ( ff , O_CREAT | O_RDWR | O_APPEND ,
			 S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
	if ( fdw < 0 ){
		return log("lang: Could not open for %s "
			   "writing: %s.",ff, strerror(errno));
	}
	char buf[1024];
	char out[4*1024];
	// this loop goes through all the words and only adds those
	// words into the phonetic dict that have phonets.
while ( fgets ( buf , 1024 , fdr ) ) { long wlen = gbstrlen(buf); if ( wlen <= 0 || wlen > MAX_PHRASE_LEN ) continue; // remove the newline \n buf [wlen - 1] = '\0'; long outLen = latin1ToUtf8(out, 4*1024, buf, gbstrlen(buf)); // write out the trailing \n as well out[outLen] = '\n'; outLen++; long wn = write ( fdw , out , outLen ) ; if ( wn != outLen ) return log("lang: write: %s", strerror(errno)); } fclose(fdr); close(fdw); return true; } */ Language::~Language(){ reset(); } void Language::reset(){ if ( m_rulesBuf && m_rulesBufSize > 0 ){ mfree( m_rulesBuf, m_rulesBufSize, "LanguageBuf" ); m_rulesBuf = NULL; m_rulesBufSize = 0; } if ( m_rulesPtr && m_rulesPtrSize > 0 ){ mfree( m_rulesPtr, m_rulesPtrSize, "LanguagePtrBuf" ); m_rulesPtr = NULL; m_rulesPtrSize = 0; } if ( m_distributedBuf && m_distributedBufSize > 0 ){ mfree( m_distributedBuf, m_distributedBufSize, "DistributedPtrBuf" ); m_distributedBuf = NULL; m_distributedBufSize = 0; } if ( m_tuplePtr && m_tuplePtrSize >0 ){ mfree(m_tuplePtr, m_tuplePtrSize, "LanguageWordsPtr"); m_tuplePtr = NULL; m_tuplePtrSize = 0; } if ( m_narrowBuf && m_narrowBufSize > 0 ){ mfree(m_narrowBuf, m_narrowBufSize, "LanguageNarrowBuf"); m_narrowBuf = NULL; m_narrowBufSize = 0; } m_numRules = 0; m_numTuples = 0; m_followup = true; m_collapseResult = false; m_removeAccents = true; } bool Language::init( char *unifiedBuf, long unifiedBufSize, long lang, long hostsPerSplit, unsigned long myHash ){ reset(); if ( ! m_phonetics.set(256) ) return false; if ( ! m_dict.set(256) ) return false; if ( ! 
m_distributedPopPhrases.set(256) ) return false; m_lang = lang; m_charset = getLanguageCharset(m_lang); // load the hashtable for getPhrasePopularity //if ( !loadDict() ) // load the rules dictionary if ( !loadRules( ) || !loadSpellerDict( unifiedBuf, unifiedBufSize, hostsPerSplit, myHash ) ){ log ( LOG_INIT,"lang: Error initializing for " "language %s", getLanguageAbbr(m_lang) ); return false; } //if ( g_conf.m_doNarrowSearch && // !loadNarrow( unifiedBuf, unifiedBufSize, hostsPerSplit, myHash) ){ // log ( LOG_INIT,"lang: Error initializing narrow search for " // "language %s", getLanguageAbbr(m_lang) ); // // don't return since this isn't critical // //return false //} return true; } /////////////////////////////////////////////////////// // DICTIONARY LOADING ROUTINES BELOW HERE // // These will load g_hostdb.m_dir/dict/ files from /////////////////////////////////////////////////////// bool Language::loadRules ( ) { char ff[1024]; File f; sprintf ( ff , "%sdict/%s/%s_phonet.dat", g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang)); f.set ( ff ); // open file if ( ! f.open ( O_RDONLY ) ) { log("lang: open: %s",mstrerror(g_errno)); return false; } // get file size long fileSize = f.getFileSize() ; // store a \0 at the end m_rulesBufSize = fileSize + 1; // make buffer to hold all m_rulesBuf = (char *) mmalloc( m_rulesBufSize, "LanguageBuf" ); if ( !m_rulesBuf ) { g_errno = ENOMEM; log("lang: mmalloc: %s",mstrerror(errno)); return false; } // read em all in if ( ! 
f.read ( m_rulesBuf , fileSize , 0 ) ) { log("lang: read: %s", mstrerror(g_errno)); return false; } m_rulesBuf[fileSize] = '\0'; // change \n to \0 for ( long i = 0 ; i < m_rulesBufSize ; i++ ) { if ( m_rulesBuf[i] != '\n' ) continue; m_rulesBuf[i] = '\0'; } f.close(); m_numRules = 0; char *p = m_rulesBuf; // This loop checks how many rules we have while ( p < ( m_rulesBuf + m_rulesBufSize ) ){ // if it is a comment, skip // if no line, skip if ( *p == '#' || gbstrlen(p) == 0 || *p == ' ' ){ p += gbstrlen(p) + 1; continue; } // we have a tuple if ( strstr(p, "followup") == p ){ while ( *p != ' ' ) p++; while ( *p == ' ' ) p++; if ( *p != '1' ) m_followup = false; } else if ( strstr(p, "collapse_result") == p ){ while ( *p != ' ' ) p++; while ( *p == ' ' ) p++; if ( *p == '1' ) m_collapseResult = true; } else if ( strstr(p, "version") == p ){ while ( *p != ' ' ) p++; while ( *p == ' ' ) p++; if ( *p != '1' ) m_removeAccents = false; } // else the rules start or end here else m_numRules += 2; p += gbstrlen(p) + 1; } // allocate memory for the ruleptrs m_rulesPtrSize = m_numRules * sizeof ( char* ) * m_numRules; m_rulesPtr = (char **) mmalloc(m_rulesPtrSize,"LanguagePtrBuf"); if ( !m_rulesPtr ){ g_errno = ENOMEM; log("lang: mmalloc: %s",mstrerror(errno)); return false; } // init for ( long i = 0; i < MAX_CHARS; i++) { m_ruleStarts[i] = -1; m_ruleChars[i] = false; } // do the loop again and assign the pointers p = m_rulesBuf; long numRules = 0; while ( p < ( m_rulesBuf + m_rulesBufSize ) ){ char *start = p; // if it is a comment, skip // if no line, skip if ( *p == '#' || gbstrlen(p) == 0 || *p == ' ' ){ p += gbstrlen(p) + 1; continue; } // we have a tuple while ( *p != ' ' ) p++; while ( *p == ' ' ){ *p = '\0'; p++; } // if the rule converts a letter into a '_' (blank) if ( *p == '_' ) *p = '\0'; if ( strstr(start, "followup") == start ){ if ( *p != '1' ) m_followup = false; } else if ( strstr(start, "collapse_result") == start ){ if ( *p == '1' ) m_collapseResult = 
true; } else if ( strstr(start, "version") == start ){ if ( *p != '1' ) m_removeAccents = false; } // else the rules start or end here else{ m_rulesPtr[numRules++] = start; m_rulesPtr[numRules++] = p; // mark the chars that occur in the rule // lets just mark the first char. It seems to suffice if ( *p ) m_ruleChars[(long)*p] = true; } p += gbstrlen(p) + 1; } // m_ruleStarts[i] points to the index of the m_rulesPtr where the // rule of character i starts for ( long i = 0; i < numRules; i += 2) { long k = (UChar8) m_rulesPtr[i][0]; if ( m_ruleStarts[k] < 0 ) m_ruleStarts[k] = i; } // if ( m_lang == 2 || m_lang == 3 ) makeDict(); return true; } bool Language::loadSpellerDict( char *spellerBuf, long spellerBufSize, long hostsPerSplit, unsigned long myHash ){ File distributedPopFile; char ff[1024]; // load the distributed pop file sprintf ( ff , "%sdict/%s/%s.query.phonet.%li", g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang), myHash); distributedPopFile.set ( ff ); if ( ! distributedPopFile.open ( O_RDONLY ) ) { log("lang: open: %s. Generating from common pop file", mstrerror(g_errno)); sprintf ( ff , "%sdict/%s/%s.query.phonet", g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang)); // If we don't have the distributed pop file, open the // common pop file and generate the distributed one if ( !genDistributedPopFile( ff, myHash )) return false; // try opening the file now if ( ! distributedPopFile.open ( O_RDONLY ) ) { log("lang: open: %s",mstrerror(g_errno)); return false; } } // get file sizes long distributedPopFileSize = distributedPopFile.getFileSize(); // store a \0 at the end m_distributedBufSize = distributedPopFileSize + 1; // make buffer to hold all m_distributedBuf = (char *) mmalloc(m_distributedBufSize, "DistributedPtrBuf"); if ( !m_distributedBuf) { log("lang: mmalloc: %s",mstrerror(errno));return false; } char *p = m_distributedBuf; // read em all in if ( ! 
distributedPopFile.read ( p , distributedPopFileSize , 0 ) ){ log("lang: read: %s", mstrerror(g_errno)); return false; } m_distributedBuf[distributedPopFileSize] = '\0'; distributedPopFile.close(); // count the tuples that belong to this language that come from // the wordlist and query file (i.e. that are not negative ) p = spellerBuf; while ( p < spellerBuf + spellerBufSize - 1){ // first is the phrase char *phrase = p; // if line is a comment skip it if ( *p == '#' ){ p += gbstrlen(p) + 1; continue; } // skip phrase and move to phonet p += gbstrlen(p) + 1 ; char *phonet = p; if ( p >= spellerBuf + spellerBufSize-1 ) break; // skip phonet and move to (lang,score) tuples p += gbstrlen(p) + 1; if ( p >= spellerBuf + spellerBufSize-1 ) break; // skip (lang, score) tuple p += gbstrlen(p) + 1; // check if phonet it present if ( *phonet == '\0' ) continue; unsigned long long phonetKey = hash64Lower_utf8(phonet); // check if this phonet belongs to this host if ( phonetKey % hostsPerSplit != myHash ) continue; unsigned long long h = hash64d(phrase, gbstrlen(phrase)); // check if this phrase belongs to this language // can do that by calling spellers getphrasepopularity if ( g_speller.getPhrasePopularity( phrase, h, false, m_lang ) <= 0 ) continue; m_numTuples++; } // also change the \t to \0 p = m_distributedBuf; while ( p < m_distributedBuf + m_distributedBufSize ){ m_numTuples++; while ( *p != '\n' && p < m_distributedBuf + m_distributedBufSize - 1) { if ( *p == '\t' ) *p = '\0'; p++; } *p = '\0'; p++; } // tuples have already been counted m_tuplePtrSize = m_numTuples * sizeof(char *); m_tuplePtr = (char **) mmalloc ( m_tuplePtrSize, "LanguageTuplePtr" ); if ( !m_tuplePtr ) { log("lang: mmalloc: %s",mstrerror(errno));return false;} long numTuples = 0; // now go through the unified dict again and assign the pointers p = spellerBuf; while ( p < spellerBuf + spellerBufSize - 1){ // first is the phrase char *phrase = p; // if line is a comment skip it if ( *p == '#' ){ p 
+= gbstrlen(p) + 1; continue; } // skip phrase and move to phonet p += gbstrlen(p) + 1; char *phonet = p; if ( p >= spellerBuf + spellerBufSize - 1 ) break; // skip phonet and move to (lang,score) tuples p += gbstrlen(p) + 1; if ( p >= spellerBuf + spellerBufSize - 1 ) break; // skip (lang, score) tuple p += gbstrlen(p) + 1; if ( *phonet == '\0' ) continue; unsigned long long phonetKey = hash64Lower_utf8(phonet); // check if this phonet belongs to this host if ( phonetKey % hostsPerSplit != myHash ) continue; unsigned long long h = hash64d(phrase, gbstrlen(phrase)); // check if this phrase belongs to this language // can do that by calling spellers getphrasepopularity if ( g_speller.getPhrasePopularity( phrase, h, false, m_lang ) <= 0 ) continue; m_tuplePtr[numTuples] = phrase; numTuples++; } // go through the distributed dict and assign the pointers p = m_distributedBuf; while ( p < m_distributedBuf + m_distributedBufSize ){ m_tuplePtr[numTuples++] = p; // skip phrase p += gbstrlen(p) + 1; if ( p >= m_distributedBuf + m_distributedBufSize ) break; // skip phonet p += gbstrlen(p) + 1; if ( p >= m_distributedBuf + m_distributedBufSize ) break; // skip popularity p += gbstrlen(p) + 1; } // sanity for ( long j = 0 ; j< numTuples ; j++ ) gbstrlen(m_tuplePtr[j]) ; // sanity check if ( numTuples != m_numTuples ){ char *xx = NULL; *xx = 0; } // kill last one seems problemtic with #define EFENCE in Mem.cpp numTuples--; m_numTuples--; // sort the wordsPtrs accoding to their phonetics gbsort( m_tuplePtr, m_numTuples, sizeof(char*), cmpPhonet ); char *tuple; m_numPhonets = 0; long startIndex = 0; long index = 0; while ( index < m_numTuples ) { // The distributed dict is stored as a tuple of // ( original phrase, phonetic, lang, score ) // first to come is the phrase tuple = m_tuplePtr[index]; // move to the phonet tuple += gbstrlen(tuple) + 1; unsigned long long phonetKey = hash64Lower_utf8 ( tuple ); if ( phonetKey % hostsPerSplit != myHash ){ index++; continue; } long 
numWordsInPhonet = 0; startIndex = index; while ( index < m_numTuples ){ // first to come is the phrase tuple = m_tuplePtr[index]; char *phrase = m_tuplePtr[index]; // move to the phonet tuple += gbstrlen(tuple) + 1; unsigned long long pKey = hash64Lower_utf8(tuple); if ( pKey != phonetKey ) break; // move to the popularity tuple += gbstrlen(tuple) + 1; // only add the distributed pop words if they come // out of the distributed pop words dict if (phrase > m_distributedBuf && phrase < m_distributedBuf + m_distributedBufSize){ // add the distributed pop words unsigned long long h = hash64d( phrase, gbstrlen(phrase)); long slot = m_distributedPopPhrases. getSlot(h); long pop = atoi(tuple); if ( slot == -1 ) m_distributedPopPhrases.addKey(h, pop); } numWordsInPhonet++; index++; } long slot = m_phonetics.getSlot ( phonetKey ); if ( slot != -1 ){ log(LOG_LOGIC, "speller: %ld != -1, %16llx, %s", slot, phonetKey, tuple); char *xx = NULL; *xx = 0; } // make the composite value unsigned long long value = startIndex; // make it the higher 32 bits value <<= 32; value += numWordsInPhonet; m_phonetics.addKey( phonetKey, value ); m_numPhonets++; } log(LOG_INIT,"lang: Read %li words and %li phonets into memory", m_numTuples, m_numPhonets ); return true; } /* bool Language::loadNarrow( char *spellerBuf, long spellerBufSize, long hostsPerSplit, unsigned long myHash ){ // don't load for any other language except english if ( m_lang != langEnglish ) return true; // first find out how many phrases have more than 1 word // count the tuples that belong to this language that come from // the wordlist and query file (i.e. 
that are not negative ) char *p = spellerBuf; while ( p < spellerBuf + spellerBufSize - 1){ // first is the phrase char *phrase = p; // if line is a comment skip it if ( *p == '#' ){ p += gbstrlen(p) + 1; continue; } // skip phrase and move to phonet p += gbstrlen(p) + 1; char *phonet = p; // skip phonet and move to (lang,score) tuples p += gbstrlen(p) + 1; // skip (lang, score) tuple p += gbstrlen(p) + 1; unsigned long long h = hash64d(phrase, gbstrlen(phrase)); // check if this phrase belongs to this language // can do that by calling spellers getphrasepopularity if ( g_speller. getPhrasePopularity( phrase, h, false, m_lang ) <= 0 ){ continue; } // check if phonet it present if ( *phonet == '\0' ){ continue; } unsigned long long phonetKey = hash64Lower_utf8(phonet); // check if this phonet belongs to this host if ( phonetKey % hostsPerSplit != myHash ){ continue; } // make sure the phrase has 3 or more letters if ( gbstrlen(phrase) < 3 ) continue; // check if the phrase has more than 1 word bool isPhrase = false; char *q = phrase; while ( *q != '\0' ){ if ( *q == ' ' ) isPhrase = true; q++; } if ( !isPhrase ) continue; m_numNarrowPtrs++; } p = m_distributedBuf; while ( p < m_distributedBuf + m_distributedBufSize ){ // first is the phrase char *phrase = p; // if line is a comment skip it if ( *p == '#' ){ p += gbstrlen(p) + 1; continue; } // skip phrase and move to phonet p += gbstrlen(p) + 1; // skip phonet p += gbstrlen(p) + 1; // skip popularity p += gbstrlen(p) + 1; // make sure the phrase has 3 or more letters if ( gbstrlen(phrase) < 3 ) continue; // check if the phrase has more than 1 word bool isPhrase = false; char *q = phrase; while ( *q != '\0' ){ if ( *q == ' ' ) isPhrase = true; q++; } if ( !isPhrase ) continue; m_numNarrowPtrs++; } // allocate memory for that // also allocate memory for the m_frntCharPtrs and m_bckCharPtrs m_narrowBufSize = 2 * sizeof (char *) * m_numNarrowPtrs + ( NUM_CHARS * NUM_CHARS * NUM_CHARS * 4 * 2 ); m_narrowBuf = (char *) 
mmalloc( m_narrowBufSize, "LanguageNarrowBuf" ); if ( !m_narrowBuf ){ log("lang: Could not allocate %li bytes for narrow buf", m_narrowBufSize); g_errno = ENOMEM; return false; } p = m_narrowBuf; m_frntPtrs = (char **) p; p += sizeof(char **) * m_numNarrowPtrs; m_bckPtrs = (char **) p; p += sizeof(char *) * m_numNarrowPtrs; m_frntCharPtrs = (long *) p; p += NUM_CHARS * NUM_CHARS * NUM_CHARS * 4; m_bckCharPtrs = (long *)p; p += NUM_CHARS * NUM_CHARS * NUM_CHARS * 4; long numNarrowPtrs = 0; // go through the loop again and set the positions p = spellerBuf; while ( p < spellerBuf + spellerBufSize - 1){ // first is the phrase char *phrase = p; // if line is a comment skip it if ( *p == '#' ){ p += gbstrlen(p) + 1; continue; } // skip phrase and move to phonet p += gbstrlen(p) + 1; char *phonet = p; // skip phonet and move to (lang,score) tuples p += gbstrlen(p) + 1; // skip (lang, score) tuple p += gbstrlen(p) + 1; unsigned long long h = hash64d(phrase, gbstrlen(phrase)); // check if this phrase belongs to this language // can do that by calling spellers getphrasepopularity if ( g_speller. 
getPhrasePopularity( phrase, h, false, m_lang ) <= 0 ){ continue; } // check if phonet it present if ( *phonet == '\0' ){ continue; } unsigned long long phonetKey = hash64Lower_utf8(phonet); // check if this phonet belongs to this host if ( phonetKey % hostsPerSplit != myHash ){ continue; } // make sure the phrase has 3 or more letters if ( gbstrlen(phrase) < 3 ) continue; // check if the phrase has more than 1 word bool isPhrase = false; char *q = phrase; while ( *q != '\0' ){ if ( *q == ' ' ) isPhrase = true; q++; } if ( !isPhrase ) continue; m_frntPtrs[numNarrowPtrs] = phrase; m_bckPtrs[numNarrowPtrs] = phrase; numNarrowPtrs++; } p = m_distributedBuf; while ( p < m_distributedBuf + m_distributedBufSize ){ // skip phrase char *phrase = p; // if line is a comment skip it if ( *p == '#' ){ p += gbstrlen(p) + 1; continue; } p += gbstrlen(p) + 1; // skip phonet p += gbstrlen(p) + 1; // skip popularity p += gbstrlen(p) + 1; // make sure the phrase has 3 or more letters if ( gbstrlen(phrase) < 3 ) continue; // check if the phrase has more than 1 word bool isPhrase = false; char *q = phrase; while ( *q != '\0' ){ if ( *q == ' ' ) isPhrase = true; q++; } if ( !isPhrase ) continue; m_frntPtrs[numNarrowPtrs] = phrase; m_bckPtrs[numNarrowPtrs] = phrase; numNarrowPtrs++; } // sanity check if ( numNarrowPtrs != m_numNarrowPtrs ){ log(LOG_LOGIC, "speller: %ld != %ld numNarrowPtrs", numNarrowPtrs, m_numNarrowPtrs); char *xx=NULL; *xx=0; } // sort the front pointers and back pointers gbsort ( m_frntPtrs, m_numNarrowPtrs, sizeof(char*), cmpFrnt ); gbsort ( m_bckPtrs, m_numNarrowPtrs, sizeof(char*), cmpBck ); // printing them out //for ( long i = 0; i < m_numNarrowPtrs; i++ ) // log ( "lang: frnt=%s\t\t bck=%s", // m_frntPtrs[i] + gbstrlen(m_frntPtrs[i]) + 1, // m_bckPtrs[i] + gbstrlen(m_bckPtrs[i]) + 1); // now set the m_frntCharPtrs and m_bckCharPtrs for ( long i = 0; i < NUM_CHARS * NUM_CHARS * NUM_CHARS; i++ ){ m_frntCharPtrs[i] = -1; m_bckCharPtrs[i] = -1; } for ( long i = 0; 
i < m_numNarrowPtrs; i++ ){ // align to the phrase char *frnt = m_frntPtrs[i]; char *bck = m_bckPtrs[i]; bck += gbstrlen(bck) - 1; char f0 = to_dict_char(frnt[0]); char f1 = to_dict_char(frnt[1]); char f2 = to_dict_char(frnt[2]); char b0 = to_dict_char(bck[0]); char b1 = to_dict_char(bck[-1]); char b2 = to_dict_char(bck[-2]); long fx = f0 * NUM_CHARS * NUM_CHARS + f1 * NUM_CHARS + f2; long bx = b0 * NUM_CHARS * NUM_CHARS + b1 * NUM_CHARS + b2; if ( m_frntCharPtrs[fx] == -1 ) m_frntCharPtrs[fx]= i; if ( m_bckCharPtrs[bx] == -1 ) m_bckCharPtrs[bx] = i; } return true; } */ bool Language::loadDictHashTable( ){ char ff[MAX_FRAG_SIZE]; // first load the language dict // open the input file FILE *fdr; sprintf ( ff , "%sdict/%s/%s.wl.phonet", g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang) ); // then open fdr = fopen ( ff, "r" ); if ( !fdr ) return log("lang: Could not open %s for reading: " "%s.", ff, strerror(errno)); char buf[1024]; // this loop goes through all the words while ( fgets ( buf , 1024 , fdr ) ) { long wlen = gbstrlen(buf); if ( wlen <= 0 || wlen > MAX_PHRASE_LEN ) continue; // remove the newline \n buf [wlen - 1] = '\0'; char *p = buf; long pop = atoi(p); // move to the phrase while ( *p != '\t' ) p++; p++; char *phrase = p; // move to the next tab before the phonetic while ( *p != '\t' ) p++; unsigned long long key = hash64d( phrase, p - phrase); long slot = m_dict.getSlot(key); long value = 0; if ( slot != -1 ){ value = m_dict.getValueFromSlot(slot); if ( pop < value ) continue; } m_dict.addKey( key, pop ); } fclose(fdr); // now for the top pop words from the query log sprintf ( ff , "%sdict/%s/%s.query.phonet.top", g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang) ); // then open fdr = fopen ( ff, "r" ); if ( !fdr ) return log("lang: Could not open %s for reading: " "%s.", ff, strerror(errno)); // this loop goes through all the words while ( fgets ( buf , 1024 , fdr ) ) { long wlen = gbstrlen(buf); if ( wlen <= 0 || wlen 
> MAX_PHRASE_LEN ) continue; // remove the newline \n buf [wlen - 1] = '\0'; char *p = buf; long pop = atoi(p); // move to the phrase while ( *p != '\t' ) p++; p++; char *phrase = p; // move to the next tab before the phonetic while ( *p != '\t' ) p++; unsigned long long key = hash64d( p, p - phrase); long slot = m_dict.getSlot(key); long value = 0; if ( slot != -1 ){ value = m_dict.getValueFromSlot(slot); if ( pop < value ) continue; } m_dict.addKey( key, pop ); } fclose(fdr); // now for the title rec dicts. If the phrase is only present in the // titlerec dict then store it as a negative value for ( long i = 0; i < NUM_CHARS; i++ ){ // open the input file FILE *fdr; sprintf ( ff , "%sdict/%s/%s.dict.%li", g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang), i); // then open fdr = fopen ( ff, "r" ); if ( !fdr ) return log("lang: Could not open %s for reading: " "%s.", ff, strerror(errno)); // this loop goes through all the words and only adds those // words into the phonetic dict that have phonets. while ( fgets ( buf , 1024 , fdr ) ) { long wlen = gbstrlen(buf); if ( wlen <= 0 || wlen > MAX_PHRASE_LEN ) continue; // remove the newline \n buf [wlen - 1] = '\0'; char *p = buf; long pop = ( atoi(p) * 32000 )/ 10000; // move to the phrase while ( *p != '\t' ) p++; p++; unsigned long long key = hash64d( p, gbstrlen(p) ); // add only if it is not found in english dict and // query dict long slot = m_dict.getSlot(key); long value = 0; if ( slot != -1 ){ value = m_dict.getValueFromSlot(slot); if ( pop < value ) continue; } // if phrase is only present in the title rec, store // as a negative value else pop *= -1; m_dict.addKey( key, pop ); } fclose(fdr); } return true; } bool Language::loadWikipediaWords(){ // open the wikipedia file char ff[1024]; sprintf ( ff , "%sdict/%s/%s.wiki", g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang)); FILE *fdr = fopen ( ff, "r" ); if ( ! 
fdr ) { return log("lang: Could not open for mispelled words" "reading: %s.",strerror(errno)); } m_wiki.set(1024); char buf[1024]; // go through the words in dict/words while ( fgets ( buf , 1024 , fdr ) ) { // length of word(s), including the terminating \n long wlen = gbstrlen(buf) ; // skip if empty if ( wlen <= 0 ) continue; buf[wlen-1]='\0'; unsigned long key = hash32d(buf, gbstrlen(buf)); long slot = m_wiki.getSlot ( key ); if ( slot != -1 ){ continue; char *xx=NULL; *xx=0; } m_wiki.addKey(key,1); } fclose(fdr); return true; } bool Language::loadMispelledWords(){ char ff [1024]; // also open the commonly misspelled words file sprintf ( ff , "%sdict/%s/%s.misp", g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang)); FILE *fdr = fopen ( ff, "r" ); if ( ! fdr ) { return log("lang: Could not open for mispelled words" "reading: %s.",strerror(errno)); } m_misp.set(1024); char buf[1024]; // go through the words in dict/words while ( fgets ( buf , 1024 , fdr ) ) { // length of word(s), including the terminating \n long wlen = gbstrlen(buf) ; // skip if empty if ( wlen <= 0 ) continue; buf[wlen-1]='\0'; unsigned long key = hash32d(buf, gbstrlen(buf)); long slot = m_misp.getSlot ( key ); if ( slot != -1 ){ char *xx=NULL; *xx=0; } m_misp.addKey(key,1); } fclose(fdr); return true; } /////////////////////////////////////////////////////// // LANGUAGE RECOMMENDATION ROUTINES BELOW HERE // /////////////////////////////////////////////////////// /* long Language::narrowPhrase ( char *request, char *phrases, long *pops, long maxPhrases ){ // if we haven't been loaded, just return if ( m_numNarrowPtrs == 0 ) return 0; long numPhrases = 0; long requestLen = gbstrlen(request); // don't check for narrow phrase if the original phrase is more than // MAX_PHRASE_LEN - 3 OR less than 3 chars. // Why MAX_PHRASE_LEN - 3 ? 
Because then only can we find a narrow // phrase if ( requestLen > MAX_PHRASE_LEN - 3 || requestLen < 3 ) return numPhrases; // get the start and end two chars and convert them to dict_char char f0 = to_dict_char(request[0]); char f1 = to_dict_char(request[1]); char f2 = to_dict_char(request[2]); char *bck = request + requestLen - 1; char b0 = to_dict_char(bck[0]); char b1 = to_dict_char(bck[-1]); char b2 = to_dict_char(bck[-2]); unsigned long long start = gettimeofdayInMilliseconds(); long minPop = 0; char req[MAX_PHRASE_LEN]; // first get all the ones in the front strcpy(req, request); // add a space so that we match the exact phrase req[requestLen] = ' '; req[requestLen + 1] = '\0'; long fx = f0 * NUM_CHARS * NUM_CHARS + f1 * NUM_CHARS + f2; long index = m_frntCharPtrs[fx]; if ( index == -1 ) goto skipFrnt; while ( index < m_numNarrowPtrs ){ char *tuple = m_frntPtrs[index++]; char *phrase = tuple; //check if we have gone over the phrase (if present) or not long cmp = strncasecmp (phrase, req, gbstrlen(req)); if ( cmp > 0 ) break; if ( cmp < 0 ) continue; // found it. 
get the popularity long pop = 0; // if its from the distributed dict, get it directly if ( tuple > m_distributedBuf && tuple < m_distributedBuf + m_distributedBufSize ){ // skip the phrase tuple += gbstrlen(tuple) + 1; // skip the phonet tuple += gbstrlen(tuple) + 1; pop = atoi(tuple); } // else get it by getphrasePopularity else { unsigned long long h = hash64d(phrase, gbstrlen(phrase)); pop = g_speller.getPhrasePopularity(phrase, h, false, m_lang); } long indx = numPhrases; // if not full if ( numPhrases < maxPhrases ) numPhrases++; // if full else{ if ( minPop >= pop ) continue; long minIndx = 0; minPop = pops[0]; for ( long j = 1; j < maxPhrases; j++ ){ if ( minPop < pops[j] ) continue; minPop = pops[j]; minIndx = j; } if ( minPop >= pop ) continue; indx = minIndx; minPop = pop; } // store the pop pops[indx] = pop; strcpy ( &phrases[MAX_FRAG_SIZE * indx],phrase ); log (LOG_DEBUG,"speller: Narrow phrase=%s, pop=%li", &phrases[MAX_FRAG_SIZE * indx], pops[indx]); } skipFrnt: // now get the back req[0] = ' '; strcpy(&req[1],request); long bx = b0 * NUM_CHARS * NUM_CHARS + b1 * NUM_CHARS + b2; index = m_bckCharPtrs[bx]; if ( index == -1 ) return numPhrases; while ( index < m_numNarrowPtrs ){ char *tuple = m_bckPtrs[index++]; char *phrase = tuple; //check if we have gone over the phrase (if present) or not // cannot use strcasecmp because we compare from the back char *p1 = phrase + gbstrlen(phrase) - 1; char *p2 = req + gbstrlen(req) - 1; while ( p1 >= phrase && p2 >= req ) { if ( *p1 != *p2 ) break; p1--; p2--; } if ( p2 >= req || p1 < phrase ){ if ( *p1 > *p2 ) break; continue; } // found it long pop = 0; // if its from the distributed dict, get it directly if ( tuple > m_distributedBuf && tuple < m_distributedBuf + m_distributedBufSize ){ // skip the phrase tuple += gbstrlen(tuple) + 1; // skip the phonet tuple += gbstrlen(tuple) + 1; pop = atoi(tuple); } // else get it by getphrasePopularity else { unsigned long long h = hash64d(phrase, gbstrlen(phrase)); pop = 
g_speller.getPhrasePopularity(phrase, h, false, m_lang); } long indx = numPhrases; // if not full if ( numPhrases < maxPhrases ) numPhrases++; // if full else{ if ( minPop >= pop ) continue; long minIndx = 0; minPop = pops[0]; for ( long j = 1; j < maxPhrases; j++ ){ if ( minPop < pops[j] ) continue; minPop = pops[j]; minIndx = j; } if ( minPop >= pop ) continue; indx = minIndx; minPop = pop; } // store the pop pops[indx] = pop; strcpy ( &phrases[MAX_FRAG_SIZE * indx],phrase ); log (LOG_DEBUG,"speller: Narrow phrase=%s, pop=%li", &phrases[MAX_FRAG_SIZE * indx], pops[indx]); } unsigned long long took = gettimeofdayInMilliseconds() - start; if ( took > 5) log ( LOG_WARN,"lang: Finding narrow phrases took %lli ms", took ); return numPhrases; } */ // . return the clean buffer that can be spellchecked // . in utf8 always now bool Language::makeClean( char *src, long srcSize, char *dst, long dstSize ) { //char *pin = inBuf; //char *pout = outBuf; char *srcEnd = src + srcSize; char *dstEnd = dst + dstSize; char cs; //while ( pout - outBuf < outBufSize && *pin != '\0' ){ for ( ; src < srcEnd ; src += cs ) { cs = getUtf8CharSize ( src ); //UChar32 c = 0; //if ( isUTF16 ) // c = utf16Decode( (UChar *)pin, &(UChar *)pin ); //else // c = utf8Decode ( pin, &pin ); // Since we're english cannot check anything but ASCII //if ( c > 0x7f ) // return false; //if (!ucIsAlnum(c) && !ucIsWhiteSpace(c) && c != (long)'\'' && // c != (long)' ' && c != (long)'-' ) // return false; // skip more advanced forms of punct if ( ! is_alnum_utf8 ( src ) && ! 
is_wspace_utf8 ( src ) && *src != '\'' && *src != ' ' && *src != '-' ) return false; // return false to avoid overflow if ( dst + 5 >= dstEnd ) return false; if ( cs == 1 ) *dst++ = to_upper_a (*src); else dst += to_upper_utf8 ( dst , src ); // write the char as upper case //dst += getClean ( dst , src ); } // null end it *dst = '\0'; return true; } // returns the number of recommendations that were found // First finds recommendations by the soundslike (phonetic) score // Then tries to split the word and finds recommendations by the word score // Stores the top MAX_RECOMMENDATIONS in the array, and then returns the // highest popularity recommendation out of them bool Language::getRecommendation( char *origWord, long origWordLen, char *recommendation, long recommendationLen, bool *found, long *score, long *popularity, bool forceReco ){ // if rules and words are not loaded, return if ( m_numRules == 0 || m_numTuples == 0 ) return true; // don't check for recommendation if the original phrase is more than // MAX_PHRASE_LEN - 1 if ( origWordLen > MAX_PHRASE_LEN - 1 ) return false; char origPhonet[MAX_PHRASE_LEN]; char origClean[MAX_PHRASE_LEN]; char possiblePhonet[ MAX_PHRASE_LEN ]; Reco recos[MAX_RECOMMENDATIONS]; // also keep the lowest score that we've found. long lowestScore = LARGE_SCORE; /*char recos[MAX_RECOMMENDATIONS][MAX_PHRASE_LEN]; long recoScores[MAX_RECOMMENDATIONS];*/ long numRecos = 0; // null end recommendation in case we don't find anything. *recommendation = '\0'; *found = false; *score = LARGE_SCORE; *popularity = 0; // no recommendations for 1 letter words if ( origWordLen < 2 ) return false; // no recommendation if the word is found in the dictionary if ( !forceReco ){ // if we are spell checking a query then we start with the // phrases and then move on to individual words. 
This should // eliminate bugs like saying "brittany spears" is correct // because the phrase shall be checked before individual words unsigned long long h = hash64d( origWord, gbstrlen(origWord)); if ( g_speller.getPhrasePopularity( origWord, h, false ) != 0 ){ *found = true; return false; } // check if it is present in the distributed dictionary if ( m_distributedPopPhrases.getSlot ( h ) != -1 ){ *found = true; return false; } } //long minRecoScore = LARGE_SCORE; // clean the word, i.e. convert word to uppercase and // remove possible accents if ( !makeClean ( origWord, origWordLen, origClean, MAX_PHRASE_LEN) ) return false; // memset ( phonet, '\0', MAX_PHRASE_LEN ); // get the phonetic getPhonetic ( origClean, gbstrlen(origClean), origPhonet, MAX_PHRASE_LEN ); log ( LOG_DEBUG,"speller: original - %s %s %s",origWord, origClean, origPhonet ); // this is the max score that we are trying to get // this is the radius around the misspelled word that we are checking long tryForScore = 3 * ( m_wordWeight * m_editDistanceWeightsMax )/100; // decrease score by 50pc if the length of the phonet is less than 5 // decrease score by 20pc if the length of the phonet is less than 7 if ( gbstrlen(origPhonet) < 5 ) tryForScore -= tryForScore / 2; else if ( gbstrlen(origPhonet) < 7 ) tryForScore -= tryForScore / 5; // first try the same phonetic as the original word long origLen = gbstrlen(origPhonet); // first add the original strcpy ( possiblePhonet, origPhonet ); // get recos from this phonet numRecos = tryPhonet( possiblePhonet, origPhonet, origClean, tryForScore, recos, numRecos, &lowestScore ); // generate different phonets using addition, deletion, substitution // and swapping. 
    // Try every phonet one edit away from the original phonet
    // (aspell-style candidate generation).
    // ADDITION
    for ( long i = 0; i < origLen + 1; i++ ){
        for ( long j = 0; j < MAX_CHARS; j++ ){
            if ( !m_ruleChars[j] ) continue;
            char *p = possiblePhonet;
            // first put in all the chars the are before the char
            // to be added
            memcpy ( p, origPhonet, i );
            p += i;
            // the index of m_ruleChars[] is the char to be added
            *p++ = j;
            memcpy ( p, origPhonet + i, origLen - i );
            p += origLen - i;
            *p++ = '\0';
            numRecos = tryPhonet( possiblePhonet, origPhonet,
                                  origClean, tryForScore, recos,
                                  numRecos, &lowestScore );
        }
    }
    // DELETION
    for ( long i = 0; i < origLen; i++ ){
        char *p = possiblePhonet;
        // put the chars that come before the deleted char
        memcpy ( p, origPhonet, i );
        p += i;
        // put the chars that come after the deleted char
        memcpy ( p, origPhonet + i + 1, origLen - i - 1 );
        p += origLen - i - 1;
        *p++ = '\0';
        numRecos = tryPhonet( possiblePhonet, origPhonet, origClean,
                              tryForScore, recos, numRecos,
                              &lowestScore );
    }
    // SUBSTITUTION
    for ( long i = 0; i < origLen; i++ ){
        for ( long j = 0; j < MAX_CHARS; j++ ){
            if ( !m_ruleChars[j] ) continue;
            char *p = possiblePhonet;
            // cannot substitue if both chars are the same
            if ( j == *( origPhonet + i ) ) continue;
            // put the chars that come before the substituted char
            memcpy ( p, origPhonet, i );
            p += i;
            // substitute the char
            *p++ = j;
            // put the chars that come after the deleted char
            memcpy ( p, origPhonet + i + 1, origLen - i - 1);
            p += origLen - i - 1;
            *p++ = '\0';
            numRecos = tryPhonet( possiblePhonet, origPhonet,
                                  origClean, tryForScore, recos,
                                  numRecos, &lowestScore );
        }
    }
    // SWAPPING
    for ( long i = 0; i < origLen - 1; i++ ){
        char *p = possiblePhonet;
        // cannot swap if both chars are the same
        if ( *( origPhonet + i ) == *( origPhonet + i + 1 ) ) continue;
        // put the chars that come before the swapped char
        memcpy ( p, origPhonet, i );
        p += i;
        //swap the chars
        *p++ = *( origPhonet + i + 1);
        *p++ = *( origPhonet + i );
        // put the chars that come after the deleted char
        memcpy ( p, origPhonet + i + 2, origLen - i - 2);
        p += origLen - i - 2;
        *p++ = '\0';
        numRecos = tryPhonet( possiblePhonet, origPhonet, origClean,
                              tryForScore, recos, numRecos,
                              &lowestScore );
    }
    // check if splitting the word gives us any good recommendations
    // this works like the try_split() function of aspell in suggest.cpp
    // dont split the word if its less than 4 chars
    if ( gbstrlen(origWord) < 4 ) goto skipSplit;
    // copy it over to another string
    char splitWord[MAX_PHRASE_LEN];
    strcpy ( splitWord, origWord );
    // extend the copy by duplicating the last char so the split loop
    // below can shift chars right by one without losing the last one
    splitWord[ gbstrlen(splitWord) + 1 ] = '\0';
    splitWord[ gbstrlen(splitWord) ] = splitWord[ gbstrlen(splitWord) - 1 ];
    for ( long i = gbstrlen( origWord ) - 2; i >= 2; --i) {
        splitWord[i+1] = splitWord[i];
        splitWord[i] = '\0';
        unsigned long long h = hash64d ( splitWord,
                                         gbstrlen(splitWord));
        // check if the split words exist in the dictionary
        long pop = g_speller.getPhrasePopularity(splitWord,h,false);
        if ( pop == 0 ){
            // check the distributed dict also
            long slot = m_distributedPopPhrases.getSlot(h);
            if ( slot != -1 )
                pop = m_distributedPopPhrases.
                    getValueFromSlot(slot);
            if ( pop == 0 ) continue;
        }
        h = hash64d ( splitWord + i + 1, gbstrlen(splitWord + i + 1));
        pop = g_speller.getPhrasePopularity( splitWord + i + 1,
                                             h, false );
        if ( pop == 0 ){
            // check the distributed dict also
            long slot = m_distributedPopPhrases.
getSlot(h);
            if ( slot != -1 )
                pop = m_distributedPopPhrases.
                    getValueFromSlot(slot);
            if ( pop == 0 ) continue;
        }
        // replace the '\0' in between the split with a ' '
        splitWord[i] = ' ';
        long wordScore = m_editDistanceWeightsDel2 * 3 / 2;
        char phonetReco[MAX_PHRASE_LEN];
        // get phonetic
        getPhonetic ( splitWord, gbstrlen(splitWord), phonetReco,
                      MAX_PHRASE_LEN );
        long soundslikeScore = editDistance ( origPhonet, phonetReco );
        // the final score taking into consideration the
        // phonetic score as well as the word score
        long score = weightedAverage ( soundslikeScore, wordScore );
        if ( score > tryForScore + m_span ) continue;
        // also continue if the score is greater than 2*lowestScore,
        // because then this reco doesn't have a chance
        if ( score > lowestScore * 2 ) continue;
        // change the lowest score if needed
        if ( score < lowestScore ) lowestScore = score;
        // try to add this to the recommendations
        /*log ( LOG_WARN, "lang: reco=%s wordScore=%li "
          "phonetScore=%li score=%li", splitWord, wordScore,
          soundslikeScore, score );*/
        if ( numRecos < MAX_RECOMMENDATIONS ){
            strcpy ( recos[numRecos].reco, splitWord );
            recos[numRecos].score = score;
            numRecos++;
            continue;
        }
        long maxScore = 0;
        long maxIndex = 0;
        // find the largest score
        for ( long k = 0; k < numRecos; k++ ){
            if ( recos[k].score > maxScore ){
                maxScore = recos[k].score;
                maxIndex = k;
            }
        }
        // boot out the largest score if it is more than this
        // score
        if ( score > maxScore ) continue;
        strcpy ( recos[maxIndex].reco, splitWord );
        recos[maxIndex].score = score;
    }
 skipSplit:
    // if no recos return
    if ( numRecos == 0 ) return false;
    // sort the recos according to their scores
    gbsort ( recos, numRecos, sizeof(Reco), cmpScores );
    log ( LOG_DEBUG, "speller: --------Top Recos--------" );
    // select the best recommendation among them by score
    long bestRecoIndex = 0;
    long bestRecoPop = -1;
    for ( long i = 0; i < numRecos; i++ ){
        unsigned long long h = hash64d ( recos[i].reco,
                                         gbstrlen(recos[i].reco));
        long pop = g_speller.getPhrasePopularity(recos[i].reco,
                                                 h, false);
        if ( pop == 0 ){
            // check the distributed dict also
            long slot = m_distributedPopPhrases.getSlot(h);
            if ( slot != -1 )
                pop = m_distributedPopPhrases.
                    getValueFromSlot(slot);
        }
        // prefer a much more popular reco whose score is not too much
        // worse, or a more popular reco with the same score
        if ( ( recos[i].score < ( recos[bestRecoIndex].score * 2 ) &&
               pop > ( bestRecoPop * 4 ) ) ||
             ( recos[i].score == recos[bestRecoIndex].score &&
               pop > bestRecoPop ) ){
            bestRecoPop = pop;
            bestRecoIndex = i;
        }
        log ( LOG_DEBUG,"speller: %li) reco=%s score=%li pop=%li",
              i, recos[i].reco, recos[i].score, pop );
    }
    log ( LOG_DEBUG, "speller: the best reco found is %s for word %s",
          recos[bestRecoIndex].reco, origWord );
    // put the best reco into the recommendation
    strcpy ( recommendation, recos[bestRecoIndex].reco );
    *score = recos[bestRecoIndex].score;
    *popularity = bestRecoPop;
    return true;
}

// . scores every dictionary word filed under the candidate phonet
//   "phonetTmp" and merges the good ones into recos[]
// . returns the updated number of recos
long Language::tryPhonet( char *phonetTmp, char *origPhonet,
                          char *origClean, long tryForScore, Reco *recos,
                          long numRecos, long *lowestScore ){
    // go through all the phonetics and select those that have score <= 100
    unsigned long long key = hash64Lower_utf8(phonetTmp);
    long slot = m_phonetics.getSlot ( key );
    if ( slot == -1 ) return numRecos;
    // the value is a combination of the index and the number of
    // words having the same phonet
    unsigned long long value = m_phonetics.getValueFromSlot(slot);
    long index = value >> 32;
    long numWordsInPhonet = value & 0xffffffff;
    log ( LOG_DEBUG,"speller: next phonet is %s, index=%li, numWords=%li",
          phonetTmp, index, numWordsInPhonet );
    //if ( strcmp(phonetTmp,"WST") == 0 )
    //log(LOG_WARN,"BRTNSPS");
    // check the score to see if this phonet is any good.
    // phonet score is 100 for phonets that do not contain all
    // the letters of the word phonet. e.g. word Phonet = "PLKN",
    // phonet = "PLKS" phonet score is 95 for phonets that contain
    // all letters, and 0 where the phonets are same.
long phonetScore = limit1EditDistance( phonetTmp, origPhonet );
    if ( phonetScore >= LARGE_SCORE ) return numRecos;
    //log ( LOG_WARN,"lang: checking phonet %s, "
    //"numWords=%li",phonetTmp, numWordsInPhonet);
    // this phonet works, for all the words under this phonet,
    // get their score.
    for ( long j = 0; j < numWordsInPhonet; j++ ){
        // The dict is stored as a tuple of
        // ( original phrase, phonetic, (lang, score)... )
        char *wordReco = m_tuplePtr[j + index];
        // make the clean Reco
        char cleanReco[MAX_PHRASE_LEN];
        // sanity check, this is in the dict, so we should be able to
        // make the word into clean
        if ( !makeClean( wordReco, gbstrlen(wordReco), cleanReco,
                         MAX_PHRASE_LEN ) ){
            char *xx = NULL; *xx = 0;
        }
        // now the phonetic
        char *phonetReco = wordReco + gbstrlen(wordReco) + 1;
        // sanity check
        if ( !cleanReco[0] || !phonetReco ){
            char *xx = NULL; *xx = 0;
        }
        // we want the min Score, so this is init'ed to max
        long wordScore = LARGE_SCORE;
        // init this to phonetScore
        long soundslikeScore = phonetScore;
        //log (LOG_WARN,"lang: %s\t%s\t%s %li %li",
        // wordReco, cleanReco, phonetReco,
        // wordScore, soundslikeScore);
        if ( wordScore >= LARGE_SCORE ){
            long slScore = soundslikeScore;
            if ( slScore >= LARGE_SCORE ) slScore = 0;
            // compute the deepest edit-distance level worth trying
            // given how much of the score budget the phonet used up
            long level = ( 100 * tryForScore -
                           m_soundslikeWeight * slScore )/
                (m_wordWeight * m_editDistanceWeightsMin);
            if ( level < 0 ) level = 0;
            if ( level >= long(slScore/ m_editDistanceWeightsMin))
                wordScore = editDistance ( origClean, cleanReco,
                                           level, level );
        }
        if ( wordScore >= LARGE_SCORE ) continue;
        // this is needed for split words, that are taken
        // care of after this loop
        /*if ( soundslikeScore >= LARGE_SCORE ){
          if ( weightedAverage( 0, wordScore ) > tryForScore )
          continue;
          soundslikeScore = editDistance ( origPhonet, phonetReco );
          }*/
        // the final score taking into consideration the
        // phonetic score as well as the word score
        long score = weightedAverage ( soundslikeScore, wordScore );
        if ( score > tryForScore + m_span || score == 0) continue;
        // also continue if the score is greater than 2*lowestScore,
        // because then this reco doesn't have a chance
        if ( score > *lowestScore * 2 ) continue;
        // change the lowest score if needed
        if ( score < *lowestScore ) *lowestScore = score;
        /*long reduceScore=reduceScore(origClean,cleanReco);
          if ( reduceScore > 0 )
          log ( LOG_DEBUG,"lang: reducing score request=%s, "
          "reco=%s, score=%li, reduce=%li", origClean,
          cleanReco, score, reduceScore );
          score -= reduceScore;*/
        //log ( LOG_WARN, "lang: reco=%s phonet=%s "
        //"wordScore=%li phonetScore=%li score=%li",
        //wordReco, phonetReco, wordScore,
        //soundslikeScore, score );
        /*if ( minRecoScore < score ) continue;
          // this is our best recommendation yet
          minRecoScore = score;
          strcpy ( recommendation, wordReco );*/
        if ( numRecos < MAX_RECOMMENDATIONS ){
            strcpy ( recos[numRecos].reco, wordReco );
            recos[numRecos].score = score;
            numRecos++;
            continue;
        }
        long maxScore = 0;
        long maxIndex = 0;
        // find the largest score
        for ( long k = 0; k < numRecos; k++ ){
            if ( recos[k].score > maxScore ){
                maxScore = recos[k].score;
                maxIndex = k;
            }
        }
        // boot out the largest score if it is more than this
        // score
        if ( score > maxScore ) continue;
        strcpy ( recos[maxIndex].reco, wordReco );
        recos[maxIndex].score = score;
    }
    return numRecos;
}

// . dispatch to the right bounded edit-distance routine, starting at
//   "level" and deepening up to "limit" until a score under LARGE_SCORE
//   is found
// . levels >= 5 are unsupported and crash deliberately (gigablast style)
long Language::editDistance( char *a, char *b,
                             long level, // starting level
                             long limit ) { // maximum level
    // sanity check
    if ( level <= 0 || limit < level){ char *xx = NULL; *xx = 0; }
    long score = LARGE_SCORE;
    while (score >= LARGE_SCORE && level <= limit) {
        if (level == 2) score = limit2EditDistance( a, b );
        else if (level < 5) score = limitEditDistance( a, b, level );
        else { char *xx = NULL; *xx = 0;
            //score = editDistance(a,b,w);
        }
        ++level;
    }
    return score;
}

// blend the phonetic score and the word score using the configured
// weights (m_wordWeight + m_soundslikeWeight = 100)
long Language::weightedAverage(long soundslikeScore, long wordScore) {
    return ( m_wordWeight * wordScore +
             m_soundslikeWeight * soundslikeScore) / 100;
}

// . bounded edit distance allowing up to "limit" edits (aspell-derived)
long Language::limitEditDistance( char * a, char * b, long limit ) {
    limit = limit * m_editDistanceWeightsMax;
static const int size = 10;
    // explicit work stack of pending (a,b,score) branch points
    struct Edit { char * a; char * b; int score; };
    Edit begin[size];
    Edit * i = begin;
    // const char * a0;
    // const char * b0;
    long score = 0;
    long min = LARGE_SCORE;
    while (true) {
        // skip the common prefix
        while (*a == *b) {
            if (*a == '\0') {
                if (score < min) min = score;
                goto FINISH;
            }
            ++a; ++b;
        }
        if (*a == '\0') {
            do {
                score += m_editDistanceWeightsDel2;
                if (score >= min) goto FINISH;
                ++b;
            } while (*b != '\0');
            min = score;
        }
        else if (*b == '\0') {
            do {
                score += m_editDistanceWeightsDel1;
                if (score >= min) goto FINISH;
                ++a;
            } while (*a != '\0');
            min = score;
        }
        // if floor(score/max)=limit/max-1 then this edit is only good
        // if it makes the rest of the string match. So check if
        // the rest of the string matches to avoid the overhead of
        // pushing it on then off the stack
        else if ( score + m_editDistanceWeightsMax <= limit ) {
            if ( limit * m_editDistanceWeightsMin <=
                 m_editDistanceWeightsMax *
                 ( m_editDistanceWeightsMin + score ) ) {
                // delete a character from a
                min = checkRest( a+1, b,
                                 score + m_editDistanceWeightsDel1,
                                 NULL, min );
                // delete a character from b
                min = checkRest( a, b+1,
                                 score + m_editDistanceWeightsDel2,
                                 NULL, min );
                if (*a == *(b+1) && *b == *(a+1)) {
                    // swap two characters
                    min=checkRest(a+2, b+2,
                                  score + m_editDistanceWeightsSwap,
                                  NULL, min );
                }
                // substitute one character for another which
                // is the same thing as deleting a character
                // from both a & b
                else {
                    min=checkRest(a+1, b+1,
                                  score + m_editDistanceWeightsSub,
                                  NULL, min );
                }
            } else {
                // delete a character from a
                i->a = a + 1;
                i->b = b;
                i->score = score + m_editDistanceWeightsDel1;
                ++i;
                // delete a character from b
                i->a = a;
                i->b = b + 1;
                i->score = score + m_editDistanceWeightsDel2;
                ++i;
                // If two characters can be swapped and make
                // a match then the substitution is pointless.
                // Also, there is no need to push this on
                // the stack as it is going to be imminently
                // removed.
                if (*a == *(b+1) && *b == *(a+1)) {
                    // swap two characters
                    a = a + 2;
                    b = b + 2;
                    score += m_editDistanceWeightsSwap;
                    continue;
                }
                // substitute one character for another
                // which is the same thing as deleting a
                // character from both a & b
                else {
                    a = a + 1;
                    b = b + 1;
                    score += m_editDistanceWeightsSub;
                    continue;
                }
            }
        }
    FINISH:
        // pop the next pending branch, or return when exhausted
        if (i == begin) return min;
        --i;
        a = i->a;
        b = i->b;
        score = i->score;
    }
}

// . edit distance bounded to a single edit; returns LARGE_SCORE when the
//   strings are more than one edit apart
long Language::limit1EditDistance( char *a, char *b ){
    long min = LARGE_SCORE;
    char * amax = a;
    while(*a == *b) {
        if (*a == '\0') return 0; //EditDist(0, a);
        ++a; ++b;
    }
    if (*a == '\0') {
        ++b;
        if (*b == '\0')
            return m_editDistanceWeightsDel2; //EditDist(ws.del2, a);
        return LARGE_SCORE; // EditDist(LARGE_SCORE, a);
    }
    else if (*b == '\0') {
        ++a;
        if (*a == '\0')
            return m_editDistanceWeightsDel1; //EditDist(ws.del1, a);
        return LARGE_SCORE; //EditDist(LARGE_SCORE, a);
    }
    else {
        // delete a character from a
        min = checkRest( a+1, b, m_editDistanceWeightsDel1, amax, min );
        // delete a character from b
        min = checkRest( a, b+1, m_editDistanceWeightsDel2, amax, min );
        if (*a == *(b+1) && *b == *(a+1)) {
            // swap two characters
            min = checkRest( a+2, b+2, m_editDistanceWeightsSwap,
                             amax, min );
        }
        else {
            // substitute one character for another which is the
            // same thing as deleting a character from both a & b
            min = checkRest( a+1, b+1, m_editDistanceWeightsSub,
                             amax, min );
        }
    }
    return min; //EditDist(min, amax);
}

// . edit distance bounded to two edits; returns LARGE_SCORE when the
//   strings are more than two edits apart
long Language::limit2EditDistance( char *a, char *b ) {
    int min = LARGE_SCORE;
    char * amax = a;
    while(*a == *b) {
        if (*a == '\0') return 0; //return EditDist(0, a);
        ++a; ++b;
    }
    if (*a == '\0') {
        ++b;
        if (*b == '\0')
            return m_editDistanceWeightsDel2; //return EditDist(ws.del2,a);
        ++b;
        if (*b == '\0')
            return 2 * m_editDistanceWeightsDel2;
            //return EditDist(2*ws.del2, a);
        return LARGE_SCORE;//EditDist(LARGE_SCORE, a);
    }
    else if (*b == '\0') {
        ++a;
        if (*a == '\0')
            return m_editDistanceWeightsDel1; //return EditDist(ws.del1, a);
        ++a;
        if (*a == '\0') return 2 *
m_editDistanceWeightsDel1; //return EditDist(2*ws.del1, a); return LARGE_SCORE; //return EditDist(LARGE_SCORE, a); } else { // delete a character from a min = check2( a+1, b, m_editDistanceWeightsDel1, amax, min ); // delete a character from b min = check2( a, b+1, m_editDistanceWeightsDel2, amax, min ); if (*a == *(b+1) && *b == *(a+1)) { // swap two characters min = check2( a+2, b+2, m_editDistanceWeightsSwap, amax, min ); } else { // substitute one character for another which is the // same thing as deleting a character from both a & b min = check2( a+1, b+1, m_editDistanceWeightsSub, amax, min ); } } return min; //return EditDist(min, amax); } long Language::checkRest( char *a, char *b, long w, char *amax, long min ){ char *a0 = a; char *b0 = b; while(*a0 == *b0) { if (*a0 == '\0') { if (w < min) min = w; break; } ++a0; ++b0; } if ( amax && amax < a0) amax = a0; return min; } long Language::check2( char *a, char *b, long w, char *amax, long min ){ char *aa = a; char *bb = b; while(*aa == *bb) { if (*aa == '\0') { if (amax < aa) amax = aa; if (w < min) min = w; break; } ++aa; ++bb; } if (*aa == '\0') { if (amax < aa) amax = aa; if (*bb == '\0') {} else if (*(bb+1) == '\0' && w + m_editDistanceWeightsDel2 < min) min = w + m_editDistanceWeightsDel2; } else if (*bb == '\0') { ++aa; if (amax < aa) amax = aa; if (*aa == '\0' && w + m_editDistanceWeightsDel1 < min) min = w + m_editDistanceWeightsDel1; } else { min = checkRest( aa+1, bb, w + m_editDistanceWeightsDel1, amax, min ); min = checkRest( aa, bb+1, w + m_editDistanceWeightsDel2, amax, min ); if (*aa == *(bb+1) && *bb == *(aa+1)) min = checkRest( aa+2, bb+2, w + m_editDistanceWeightsSwap, amax, min); else min = checkRest( aa+1, bb+1, w + m_editDistanceWeightsSub, amax, min ); } return min; } short Language::editDistance( char *a0, char *b0 ){ long aSize = gbstrlen(a0) + 1; long bSize = gbstrlen(b0) + 1; // VARARRAY(short, e_d, a_size * b_size); short e[aSize * bSize]; // ShortMatrix e(a_size,b_size,e_d); e[0] = 
0;// e(0, 0) = 0; for ( long j = 1; j != bSize; ++j ) e[0 + j * aSize] = e[(j-1) * aSize] + m_editDistanceWeightsDel1; const char * a = a0 - 1; const char * b = b0 - 1; short te; for (long i = 1; i != aSize; ++i) { e[i] = e[i-1] + m_editDistanceWeightsDel2; for (long j = 1; j != bSize; ++j) { if (a[i] == b[j]) { e[i + j * aSize] = e[(i-1) + (j-1) * aSize]; } else { e[i + j * aSize] = m_editDistanceWeightsSub + e[(i-1) + (j-1) * aSize]; if (i != 1 && j != 1 && a[i] == b[j-1] && a[i-1] == b[j]) { te = m_editDistanceWeightsSwap + e[(i-2) + (j-2) * aSize]; if (te < e[i + j * aSize]) e[i + j * aSize] = te; } te = m_editDistanceWeightsDel1 + e[i-1 + j * aSize]; if (te < e[i + j * aSize]) e[i + j * aSize] = te; te = m_editDistanceWeightsDel2 + e[i + (j-1) * aSize]; if (te < e[i + j * aSize]) e[i + j * aSize] = te; } } } return e[(aSize - 1) + (bSize - 1) * aSize]; } // reduces score for substitutions that are close on the key board // eg. we want "hakt" --> "halt", but it used to give "hakt"->"hat" // string 'a' is the mispelling, string 'b' is the recommendation short Language::reduceScore ( char *a, char *b ){ // reduce score only for substitutions and for 1 edit hop away // so essentially both strings should be of the same length if ( gbstrlen(a) != gbstrlen(b) ) return 0; short reduceScore = 0; while ( *a && *b ){ if ( *a == *b ){ a++; b++; continue; } char c = to_lower_a(*a); char bplace = s_keyMap[to_lower_a(*b) - 'a']; // check for all chars around it. For eg. 
for the letter // 'j'(16); check 'u'(6),'i'(7),'h'(15),'k'(17),'n'(25),'m'(26) if ( bplace - 10 >= 0 ) { if ( ( s_keyboard[bplace - 10] == c ) || ( s_keyboard[bplace - 9 ] == c ) ) reduceScore += 45; } if ( bplace < 10 ) { if ( s_keyboard[bplace + 1] == c ) reduceScore += 45; } if ( bplace % 10 > 0 ) { if ( s_keyboard[bplace - 1] == c ) reduceScore += 45; } if ( bplace - 10 < 28 ) { if ( ( s_keyboard[bplace + 10] == c ) || ( s_keyboard[bplace + 9 ] == c ) ) reduceScore += 45; } a++; b++; } if ( reduceScore == 45 ) return 45; return 0; } bool Language::getPhonetic( char *origWord, long origWordLen, char *target, long targetLen ){ *target = '\0'; char word[MAX_PHRASE_LEN]; if ( !makeClean(origWord, origWordLen, word, targetLen ) ) return false; long wordLen = gbstrlen(word); long i = 0; long j = 0; long k = 0; // number of letters found long n = 0; // index of m_rulesPtr where the rules for the char starts long p = 0; // priority of the rule long z = 0; long k0 = -333; long n0 = -333; long p0 = -333; long z0 = 0; char c,c0; const char *s; while ( word[i] ){ c = word[i]; //log ( LOG_WARN,"lang: Checking Position %li, word=%s " // "\ttarget=%s", j, word, target ); z0 = 0; n = m_ruleStarts[(UChar8) c]; // while the rule exists if ( n >= 0 ){ // check all rules that start with the same letter while ( m_rulesPtr[n] && m_rulesPtr[n][0] == (UChar8) c ){ //log( LOG_WARN, "lang: Checking rule " // "No.%li, \"%s\"\t--> \"%\"s", n, // m_rulesPtr[n], m_rulesPtr[n+1]); /** check whole string **/ k = 1; /** number of found letters **/ p = 5; /** default priority **/ s = m_rulesPtr[n]; s++; /** important for (see below) "*(s-1)" **/ // while we are not at the end of the rule and // the next character of the word is s and // s is not a digit (priority) and // s is not (-<^$, we are on the right track // so keep on checking the next char's. 
while (*s != '\0' && word[i+k] == *s &&
                       !isdigit (*s) && strchr ("(-<^$", *s) == NULL) {
                    k++;
                    s++;
                }
                // letters in brackets means only one of these
                // chars must fit (OR)
                // eg. rule OH(AEIOUY) means A OR E OR I....
                if (*s == '(') {
                    /** check letters in "(..)" **/
                    // isalpha makes sure that we check
                    // only letters, and letters are only
                    // inside the brackets
                    if ( isalpha(word[i+k] ) &&
                         strchr(s+1, word[i+k]) != NULL ) {
                        k++;
                        while (*s != ')') s++;
                        s++;
                    }
                }
                p0 = (int) *s;
                k0 = k;
                // The number of dashes determines how many
                // characters from the end will not be replaced
                while (*s == '-' && k > 1) {
                    k--;
                    s++;
                }
                // if a `<' is appended to the search string,
                // the search for replacement rules will
                // continue with the replacement string
                // and not with the next character of the word.
                if (*s == '<') s++;
                // the priority is the digit
                if (isdigit (*s)) {
                    p = *s - '0';
                    s++;
                }
                // The control character `^' says that the
                // search string only matches at the beginning
                // of words
                if (*s == '^' && *(s+1) == '^') s++;
                /* FOR FOLLOWUP RULES
                   if not at the end of the rule OR
                   ( not on rule that applies only to beginning of word
                   AND ( i is 0 OR word[i-1] is not alphabet )
                   AND ( not on rule that applies only to end of word
                   AND i > 0 AND word[i-1] is not alphabet
                   AND word[i+k0] is not alphabet ) */
                if (*s == '\0' ||
                    ( *s == '^' &&
                      ( i == 0 || !isalpha(word[i-1])) &&
                      (*(s+1) != '$' || (!isalpha(word[i+k0]) ))) ||
                    (*s == '$' && i > 0 &&
                     isalpha(word[i-1]) &&
                     (!isalpha(word[i+k0]) ))) {
                    /** search for followup rules, if: **/
                    /** parms.followup and k > 1 and NO '-'
                        in searchstring **/
                    c0 = word[i+k-1];
                    n0 = m_ruleStarts[(UChar8)c0];
                    // followup gives better results.
                    if ( //parms.followup &&
                         k > 1 && n0 >= 0 &&
                         p0 != (int) '-' && word[i+k] != '\0' ) {
                        /** test follow-up rule for "word[i+k]" **/
                        while (m_rulesPtr[n0][0]==c0) {
                            /*log (LOG_WARN, "lang: "
                              "follow-up rule "
                              "No.%li....%s\t --> %s",n0,
                              m_rulesPtr[n0],
                              m_rulesPtr[n0+1] );*/
                            /** check whole string **/
                            k0 = k;
                            p0 = 5;
                            s = m_rulesPtr[n0];
                            s++;
                            while (*s != '\0' &&
                                   word[i+k0] == *s &&
                                   !isdigit(*s) &&
                                   strchr("(-<^$",*s) == NULL) {
                                k0++;
                                s++;
                            }
                            if (*s == '(') {
                                /** check letters **/
                                if ( isalpha(word[i+k0]) &&
                                     strchr (s+1, word[i+k0] ) != NULL) {
                                    k0++;
                                    while (*s != ')' && *s != '\0')
                                        s++;
                                    if (*s == ')') s++;
                                }
                            }
                            while (*s == '-') {
                                /** "k0" gets NOT reduced **/
                                /** because "if (k0 == k)" **/
                                s++;
                            }
                            if (*s == '<') s++;
                            if (isdigit (*s)) {
                                p0 = *s - '0';
                                s++;
                            }
                            if (*s == '\0' ||
                                /** *s == '^' cuts **/
                                (*s == '$' && !isalpha(word[i+k0]))) {
                                if (k0 == k) {
                                    /** this is just a piece
                                        of the string **/
                                    //log(LOG_WARN,"lang: discarded (too short)");
                                    n0 += 2;
                                    continue;
                                }
                                if (p0 < p) {
                                    /** priority too low **/
                                    //log(LOG_WARN,"lang: discarded (priority)");
                                    n0 += 2;
                                    continue;
                                }
                                /** rule fits; stop search **/
                                break;
                            }
                            // log(LOG_WARN,"lang: discarded");
                            n0 += 2;
                        } /** End of "while (parms.rules[n0][0] == c0)" **/
                        if (p0 >= p && m_rulesPtr[n0][0] == c0) {
                            /*log(LOG_WARN,"lang: Rule No.%li, %s",n,
                              m_rulesPtr[n]);
                              log(LOG_WARN,"lang: not used because of "
                              "follow-up Rule No.%li, %s",
                              n0,m_rulesPtr[n0]);*/
                            n += 2;
                            continue;
                        }
                    } /** end of follow-up stuff **/
                    /** replace string **/
                    /*log(LOG_WARN,"lang: Using rule "
                      "No.%li, %s\t --> %s", n,
                      m_rulesPtr[n],m_rulesPtr[n+1]);*/
                    s = m_rulesPtr[n+1];
                    p0 = ( m_rulesPtr[n][0] != '\0' &&
                           strchr ( m_rulesPtr[n]+1,'<') != NULL) ?
1:0; if (p0 == 1 && z == 0) { /** rule with '<' is used **/ if (j > 0 && *s != '\0' && (target[j-1] == c || target[j-1] == *s)) { j--; } z0 = 1; z = 1; k0 = 0; while (*s != '\0' && word[i+k0] != '\0') { word[i+k0] = *s; k0++; s++; } if (k > k0){ //strmove (&word[0]+i+k0, &word[0]+i+k); char *to = &word[0]+i+k0; char *from = &word[0]+i+k; while (( *to++ = *from++ ) != 0 ) ; } /** new "actual letter" **/ c = word[i]; } else { /** no '<' rule used **/ i += k - 1; z = 0; while (*s != '\0' && *(s+1) != '\0' && j < wordLen) { if (j == 0 || target[j-1] != *s) { target[j] = *s; j++; } s++; } /** new "actual letter" **/ c = *s; if (m_rulesPtr[n][0] != '\0' && strstr (m_rulesPtr[n]+1, "^^") != NULL) { if (c != '\0') { target[j] = c; j++; } //strmove (&word[0], &word[0]+i+1); char *to = &word[0]; char *from = &word[0]+i+1; while (( *to++ = *from++ ) != 0 ) ; i = 0; z0 = 1; } } break; } /** end of follow-up stuff **/ n += 2; } /** end of while (parms.rules[n][0] == c) **/ } /** end of if (n >= 0) **/ if (z0 == 0) { // collapse_result is false for english if (k && p0 != -333 && !p0 && //(assert(p0!=-333),!p0) && j < wordLen && c != '\0' ) { //&& //(!parms.collapse_result || // j == 0 || target[j-1] != c)) /** condense only double letters **/ target[j] = c; ///printf("\n setting \n"); j++; } /*else if (p0 || !k) log( LOG_WARN,"lang: no rule found; " "character \"%c\" skipped",word[i] );*/ // goto the next character of the word i++; z = 0; k=0; } } /** end of while ((c = word[i]) != '\0') **/ target[j] = '\0'; return true; } bool Language::hasMispelling(char *phrase, long phraseLen){ char *p = phrase; char *pend = p; while ( pend < phrase + phraseLen ){ while ( *pend != ' ' && pend < phrase + phraseLen ) pend++; char word[1024]; memcpy(word, p, pend - p); word[pend - p] = '\0'; unsigned long key = hash32d(p, pend - p); long slot = m_misp.getSlot(key); if ( slot != -1 ){ log(LOG_WARN,"lang: found mispelling in %s", word); return true; } pend++; p = pend; } return false; } 
/////////////////////////////////////////////////////// // DICTIONARY GENERATION ROUTINES BELOW HERE // /////////////////////////////////////////////////////// /* // . return false and set g_errno on error, true on success bool Language::generateDicts ( long numWordsToDump , char *coll ) { log(LOG_INIT, "lang: Reading first %li words from titledb records in " "collection '%s'.", numWordsToDump,coll); // ensure we got a dict dir in our working dir char dd[1024]; if ( gbstrlen ( g_hostdb.m_dir ) > 1000 ) { g_errno = EBADENGINEER; log("lang: Working directory %s is too long.", g_hostdb.m_dir); return false; } sprintf ( dd , "mkdir %sdict.new/" , g_hostdb.m_dir ); log(LOG_INIT,"lang: %s",dd); if ( system ( dd ) == -1 ) return false; sprintf ( dd , "mkdir %stmp/" , g_hostdb.m_dir ); log(LOG_INIT,"lang: %s",dd); if ( system ( dd ) == -1 ) return false; // . loop through all titleRecs // . put all words/phrases that begin with letter X in file // words.Y, where Y is the numeric value of to_dict_char(X) // . don't dump out more than "100,000" words/phrases // . only dump out one title rec per IP // . do not dump out a word/phrase more than once for the same titleRec // . stores files in /tmp/ dir if (!ucInit(g_hostdb.m_dir)) return log("Unicode initialization failed!"); g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; g_titledb.init (); g_collectiondb.init(true); g_titledb.addColl ( coll ); // load the mispellings file first //if ( !loadMispelledWords() ) // log (LOG_WARN,"lang: mispelled file could not be loaded"); //log(LOG_DEBUG, "lang: making query files"); //if( !makeQueryFiles ( ) ) // return log("lang: had error: %s.", // mstrerror(g_errno)); log(LOG_DEBUG, "lang: making word files"); if( ! makeWordFiles ( numWordsToDump , MAX_WORDS_PER_PHRASE , coll ) ) return log("lang: had error: %s.", mstrerror(g_errno)); log(LOG_DEBUG, "lang: making pop files"); if ( ! 
makePopFiles ( numWordsToDump , MAX_WORDS_PER_PHRASE , coll ) ) return log("lang: had error: %s.", mstrerror(g_errno)); // add words from /usr/dict/words to the word files //if ( ! addDictWords ( ) ) return false; // sort each file for ( long i = 0 ; i < NUM_CHARS ; i++ ) { char tmp[1024]; // . sort should treat all lower chars as upper // . sort in reverse order so longer fragments are on top // of their shorter sub fragments so if they have the // same score in the end, we'll keep the longer fragment sprintf(tmp,"sort -f -r %stmp/%s/%s.words.%li > " "%stmp/%s/%s.words.%li.sorted", g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang), i, g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang), i); log(LOG_INIT,"lang: %s",tmp); system ( tmp ); } // . now convert each sorted file into a unique list of word/phrases // with scores // . score is number of times that word/phrase was found in the file // . truncate each file to the top "1000000" words/phrases if ( ! makeScoreFiles ( 180000 ))//numWordsToDump, max # words per file return log( "lang: had error: %s.",mstrerror(g_errno)); loadRules(); // success return true; } // . TODO: remove bad words // . loop through all titleRecs // . put all words/phrases that begin with letter X in file // words.Y, where Y = to_dict_char(X) [that compress the char value] // . don't dump out more than "100,000" words/phrases // . only dump out one title rec per IP // . do not dump out a word/phrase more than once for the same titleRec // . stores files in /tmp/ dir // . return false and set g_errno on error, true on success bool Language::makeWordFiles ( long numWordsToDump , long numWordsPerPhrase , char *coll ) { long numDumped = 0; // message log(LOG_INIT,"lang: Dumping first %li words/phrases.", numWordsToDump ); // . only allow 1 vote per ip domain // . 
assume each titlerec has about 50 words in it unsigned long maxNumIps = numWordsToDump / 50 ; if ( maxNumIps < 100000 ) maxNumIps = 100000; long iptableSize = maxNumIps * 4; log(LOG_INIT,"lang: Allocating %li bytes.", iptableSize ); long *iptable = (long *) mmalloc ( iptableSize , "Language" ); if ( ! iptable ) { return log( "lang: Could not allocate %li bytes: %s", iptableSize,mstrerror(g_errno)); } memset ( iptable , 0 , iptableSize ); // get the default siteRec //SiteRec sr; //Url dummy; //dummy.set ( "www.jinx.com" , gbstrlen("www.jinx.com") ); //sr.set ( &dummy , coll , gbstrlen(coll) , 7 ); // filenum // read in 12 byte key, 4 byte size then data of that size unsigned long ip; long totalVoters = 0; unsigned long h; // buffer used for storing de-tagged doc content // JAB: warning abatement // long xbufSize ; // declare up here so we can jump to done: label long nw; //XmlDoc doc; Words w; Xml xml; Url *u; TitleRec tr; // JAB: warning abatement //char xbuf [ 1024*512 ] ; //1024 ]; //long jx = numWordsPerPhrase * 2; // the word vote table to ensure one vote per word per doc long vnumEntries ; long vtableSize = 0 ; long *vtable = NULL; // display titlerec # we are scanning long count = 0; // open all files for appending int fds [ NUM_CHARS ]; for ( long i = 0 ; i < NUM_CHARS ; i++ ) { char ff[1024]; sprintf ( ff , "%stmp/%s/%s.words.%li", g_hostdb.m_dir, getLanguageAbbr(m_lang),getLanguageAbbr(m_lang), i ); // delete it first unlink ( ff ); // then open a new one for appending fds[i] = open ( ff , O_CREAT | O_RDWR | O_APPEND , S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH); if ( fds[i] < 0 ) return log("lang: Could not open %s for writing: " "%s.",ff, strerror(errno)); } // message //log(LOG_INIT,"lang: Scanning title recs for words and phrases in " // "%s",colldir); // // THE TITLE SCAN LOOP // //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; //g_titledb.init (); //g_collectiondb.init(true); //g_titledb.addColl ( coll ); key_t startKey ; key_t endKey ; 
startKey.setMin(); endKey.setMax(); startKey = g_titledb.makeFirstTitleRecKey ( 0 ); // docid ); // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; Msg5 msg5; Msg5 msg5b; RdbList list; key_t k ; char *rec ; long recSize ; long sameip = 0; long y; char quality; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_TITLEDB , //"main" , // coll , coll , &list , startKey , endKey , minRecSizes , false , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , 1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , // cache key ptr 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b )){ log(LOG_LOGIC,"lang: getList did not block."); return false; } // all done if empty log(LOG_INIT, "lang: got list: %ld recs", list.getNumRecs()); if ( list.isEmpty() ) goto done; k = list.getCurrentKey(); rec = list.getCurrentRec(); recSize = list.getCurrentRecSize(); startKey = *(key_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey < *(key_t *)list.getLastKey() ) goto done; // // END SCAN LOOP // // parse out and decompress the TitleRec tr.set ( rec , recSize , false ) ; // owndata? // if quality is low, skip this doc quality = tr.getDocQuality(); if ( quality < 60 ) goto loop; // only do your language if ( tr.m_language != m_lang ) goto loop; // extract the url u = tr.getUrl(); // get ip ip = u->getIp(); // look up in ip table h = ip % maxNumIps; y = 0; ipchain: if ( iptable[h] ) { // skip if already voted if ( iptable[h] == (long)ip ) { sameip++; goto loop; } // chain to next bucket if ( ++h >= maxNumIps ) h = 0; if ( ++y > (long)maxNumIps ) { log(LOG_LOGIC,"spell: IP table is too small. 
" "Exiting."); char *xx = NULL; *xx = 0; } goto ipchain; } // store in bucket so no doc from this ip votes again iptable[h] = ip; // count the voters totalVoters++; // parse all the tags out //doc.set ( &tr , &sr ); // store in this xbuf w/o tags xml.set ( tr.getCharset(),tr.getContent() , tr.getContentLen() , false , 0, false , tr.getVersion() ); //xml = doc.getXml(); // xbufSize = xml.getText ( xbuf , // 1024*512 , // 0 , // 999999 , // false , // true , // true ); // convert non-tag content into words w.set(&xml, true, true); // hash each phrase nw = w.getNumWords(); // TODO: make the above a getWords(&w) routine!! // so it can take from titleRecs or query logs // . don't hash a word from this doc more than once // . wvtable = word vote table vnumEntries = (nw * numWordsPerPhrase * 130) / 100; vtableSize = vnumEntries * 4; //log("mallocing2b %li bytes", vtableSize ); if ( (count % 100) == 0 ) log(LOG_INIT,"lang: Scanning document %li " "(%li dup ips, %li words dumped).", count,sameip,numDumped); count++; vtable = (long *) mmalloc ( vtableSize , "Language" ); if ( ! vtable ) { mfree ( iptable , iptableSize , "Language" ); return log("lang: Failed to allocate %li " "bytes: %s.",iptableSize,mstrerror(g_errno)); } memset ( vtable , 0 , vtableSize ); // every other word is punctuation, so step by 2 for ( long i = 0 ; i < nw ; i ++ ) { // skip punct. wordId is 0. if ( w.isPunct(i) ) continue; // is the ith word a stop word? // tmp buffer to hold word/phrase char tmp[1024]; char *tmpp = tmp; char *tmpend = tmp + 1024 - 3; char *ww = w.getWord(i); long wwlen = w.getWordLen(i); if ( wwlen < 2 ) continue; bool isStop = ::isStopWord ( ww, wwlen, w.getWordId (i)); // BUT ok if Capitalized or number if ( isStop ) { if ( is_digit (ww[0]) ) isStop = false; if ( is_cap (ww,wwlen) ) isStop = false; // e-mail, c file, c. s. 
lewis if ( wwlen == 1 && ww[0] != 'a' ) isStop = false; } // loop over # of words per phrase for ( long k = 1 ; k < numWordsPerPhrase ; k++ ) { tmpp = tmp; // stop words cannot start dictionary phrases if ( k > 1 && isStop ) break; long lastj = -1; // do not end on stop word either for ( long j = i ; j < i + k * 2 ; j++ ) { // skip if overflow if ( j >= nw ) continue; // skip punct if ( w.isPunct(j) ) continue; // point to word char *ww = w.getWord(j); long wwlen = w.getWordLen(j); // if no room to store word, skip it if ( tmpp + wwlen >= tmpend ) { tmpp = tmp; break; } // write word into buf // convert to lower case so our sort works // they way it should char tx[1024]; // n is how many bytes we wrote into "tx" long n = to_lower_utf8(tmpp,tmpend,ww,wwlen); // advance it tmpp += n; // no longer convert to utf8, cuz title rec // is now already in utf8 by default!! //tmpp += latin1ToUtf8( tmpp, // tmpend - tmpp, // tx, wwlen ); // remember last word # we added lastj = j; // followed by space, apostrophe or hyphen if ( ww[wwlen] == '-' ) *tmpp = '-'; else if ( ww[wwlen] == '\'' ) *tmpp = '\''; else *tmpp = ' '; tmpp++; } // bail if nothing to add if ( tmpp <= tmp ) continue; // don't add dict phrase if last word is a stop word if ( k > 1 && lastj >= 0 ) { char *ww = w.getWord ( lastj ); long wwlen = w.getWordLen ( lastj ); long long wid = w.getWordId ( lastj ); bool isStop = ::isStopWord(ww,wwlen,wid); // BUT ok if Capitalized or number if ( isStop ) { if (is_digit (ww[0]) ) isStop=false; if (is_cap (ww,wwlen)) isStop=false; } if ( isStop ) continue; } // point to last space tmpp--; // overwrite it, terminate with a \n *tmpp = '\n'; // how long is it? 
does not include terminating \n long tmplen = tmpp - tmp; // skip if nothing if ( tmplen <= 0 ) continue; // skip word if it has binary chars in it if ( has_binary ( tmp , tmplen ) ) continue; // debug //if ( strncasecmp ( tmp , "a zero" , 6 ) == 0 ) // log("shit"); // get hash of word/phrase // we need to preserve distinguish between proper // and improper accent marks, so don't do just ascii // by using wh = w.getWordId(j) unsigned long long hh = hash64Lower_utf8 (tmp,tmplen ); // don't allow more than one vote per doc for a word long ii = hh % vnumEntries; vchain: if ( vtable[ii] && vtable[ii] != (long)hh ) { if ( ++ii >= vnumEntries ) ii = 0 ; goto vchain; } if ( vtable[ii] ) continue; // store it vtable[ii] = (long)hh; // a new word for this doc // append the word out to file long fn = to_dict_char(tmp[0]); // write the hash before the word //char tt[32]; //sprintf ( tt , "%016llx ", hh ); //if ( write ( fds[fn], tt , 17 ) != 17 ) // return log("spell: makeWordFiles: write: %s", // strerror(errno)); char tmpx[2080]; tmpp++; *tmpp = '\0'; sprintf(tmpx,"%s", tmp); long tmpxlen = gbstrlen(tmpx); // write out the trailing \n as well long wn = write ( fds[fn] , tmpx , tmpxlen ) ; if ( wn != tmpxlen ) return log("spell: makeWordFiles: write: %s", strerror(errno)); numDumped++; if ( numDumped >= numWordsToDump ) goto done; } } // breakout: // don't need the word voting table anymore if ( vtable ) mfree ( vtable , vtableSize , "Language"); vtable = NULL; // get more titlerecs so we can hash more words/phrases goto loop; done: // don't need the word voting table anymore if ( vtable ) mfree ( vtable , vtableSize , "Language"); vtable = NULL; // close all files for ( long i = 0 ; i < NUM_CHARS ; i++ ) close ( fds[i] ); return true; } #define NUM_UNIFILES MAX_LANGUAGES bool Language::makePopFiles ( long numWordsToDump , long numWordsPerPhrase , char *coll) { long numDumped = 0; long docCount = 0; // message log(LOG_INIT,"lang: Dumping first %li words/phrases.", 
numWordsToDump ); // . only allow 1 vote per ip domain // . assume each titlerec has about 50 words in it unsigned long maxNumIps = numWordsToDump / 50 ; if ( maxNumIps < 100000 ) maxNumIps = 100000; long iptableSize = maxNumIps * 4; log(LOG_INIT,"lang: Allocating %li bytes.", iptableSize ); long *iptable = (long *) mmalloc ( iptableSize , "Language" ); if ( ! iptable ) { return log( "lang: Could not allocate %li bytes: %s", iptableSize,mstrerror(g_errno)); } memset ( iptable , 0 , iptableSize ); // get the default siteRec //SiteRec sr; //Url dummy; //dummy.set ( "www.jinx.com" , gbstrlen("www.jinx.com") ); //sr.set ( &dummy , coll , gbstrlen(coll) , 7 ); // filenum // read in 12 byte key, 4 byte size then data of that size unsigned long ip; long totalVoters = 0; unsigned long h; // buffer used for storing de-tagged doc content long xbufSize ; // declare up here so we can jump to done: label long nw; //XmlDoc doc; Words w; Xml xml; //Scores s; Url *u; TitleRec tr; char xbuf [ 1024*512 ] ; //1024 ]; //long jx = numWordsPerPhrase * 2; // the word vote table to ensure one vote per word per doc long vnumEntries ; long vtableSize = 0 ; long *vtable = NULL; // display titlerec # we are scanning long count = 0; // open all files for appending int fds [ NUM_UNIFILES ]; for ( long i = 0 ; i < NUM_UNIFILES ; i++ ) { char ff[1024]; sprintf ( ff , "%stmp/%s/%s.popwords.%li", g_hostdb.m_dir , getLanguageAbbr(m_lang),getLanguageAbbr(m_lang), i ); // delete it first unlink ( ff ); // then open a new one for appending fds[i] = open ( ff , O_CREAT | O_RDWR | O_APPEND , S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH); if ( fds[i] < 0 ) return log("lang: Could not open %s for writing: " "%s.",ff, strerror(errno)); } // message //log(LOG_INIT,"lang: Scanning title recs for words and phrases in " // "%s",colldir); // // THE TITLE SCAN LOOP // //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; //g_titledb.init (); //g_collectiondb.init(true); //g_titledb.addColl ( coll ); key_t startKey ; key_t 
endKey ; startKey.setMin(); endKey.setMax(); startKey = g_titledb.makeFirstTitleRecKey ( 0 ); // docid ); // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; Msg5 msg5; Msg5 msg5b; RdbList list; key_t k ; char *rec ; long recSize ; long sameip = 0; long y; char quality; long badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT; Sections ss; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_TITLEDB , //"main" , // coll , coll , &list , startKey , endKey , minRecSizes , false , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , // cache key ptr 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b )){ log(LOG_LOGIC,"lang: getList did not block."); return false; } // all done if empty log(LOG_INIT, "lang: got list: %ld recs", list.getNumRecs()); if ( list.isEmpty() ) goto done; list.resetListPtr(); docloop: k = list.getCurrentKey(); rec = list.getCurrentRec(); recSize = list.getCurrentRecSize(); // // END SCAN LOOP // docCount++; // parse out and decompress the TitleRec tr.set ( rec , recSize , false ) ; // owndata? // if quality is low, skip this doc quality = tr.getDocQuality(); if ( quality < 60 ) goto docdone; if ( tr.m_language != m_lang ) goto docdone; // extract the url u = tr.getUrl(); // get ip ip = u->getIp(); // look up in ip table h = ip % maxNumIps; y = 0; ipchain: if ( iptable[h] ) { // skip if already voted if ( iptable[h] == (long)ip ) { sameip++; goto docdone; } // chain to next bucket if ( ++h >= maxNumIps ) h = 0; if ( ++y > (long)maxNumIps ) { log(LOG_LOGIC,"spell: IP table is too small. 
" "Exiting."); char *xx = NULL; *xx = 0; } goto ipchain; } // store in bucket so no doc from this ip votes again iptable[h] = ip; // count the voters totalVoters++; // parse all the tags out //doc.set ( &tr , &sr ); // store in this xbuf w/o tags xml.set ( tr.getCharset(),tr.getContent() , tr.getContentLen() , false , 0, false , tr.getVersion() ); //xml = doc.getXml(); xbufSize = xml.getText ( xbuf , 1024*512 , 0 , 999999 , false , true , true ); // convert non-tag content into words //w.set ( true, (char*)xbuf , xbufSize ); w.set ( &xml, true, true); //s.set ( &w, &xml , TITLEREC_CURRENT_VERSION ); //s.set ( &w, TITLEREC_CURRENT_VERSION , false ); ss.set ( &w,NULL,0,NULL,0,NULL,NULL,&tr,NULL,0); // hash each phrase nw = w.getNumWords(); // TODO: make the above a getWords(&w) routine!! // so it can take from titleRecs or query logs // . don't hash a word from this doc more than once // . wvtable = word vote table vnumEntries = (nw * numWordsPerPhrase * 130) / 100; vtableSize = vnumEntries * 4; //log("mallocing2b %li bytes", vtableSize ); if ( (count % 100) == 0 ) log(LOG_INIT,"lang: Scanning document %li " "(%li dup ips, %li words dumped).", count,sameip,numDumped); count++; vtable = (long *) mmalloc ( vtableSize , "Language" ); if ( ! vtable ) { mfree ( iptable , iptableSize , "Language" ); return log("lang: Failed to allocate %li " "bytes: %s.",iptableSize,mstrerror(g_errno)); } memset ( vtable , 0 , vtableSize ); // every other word is punctuation, so step by 2 //log("Adding %d words", nw); for ( long i = 0 ; i < nw ; i ++ ) { // skip punct //if ( w.isPunct(i) ) continue; //if ( !s.getScore(i) ) continue; if ( ss.m_sectionPtrs[i]->m_flags & badFlags ) continue; // is the ith word a stop word? 
// tmp buffer to hold word/phrase char tmp[2048]; char *tmpp = tmp; char *tmpend = tmp + 2048 - 3; char *ww = w.getWord(i); long wwlen = w.getWordLen(i); bool isStop = ::isStopWord ( ww, wwlen, w.getWordId (i)); // BUT ok if Capitalized or number if ( isStop ) { if ( w.isNum(i) ) isStop = false; if ( w.isUpper(i)) isStop = false; // e-mail, c file, c. s. lewis if ( wwlen == 1 && ww[0] != 'a' ) isStop = false; } // loop over # of words per phrase for ( long k = 1 ; k < numWordsPerPhrase ; k++ ) { tmpp = tmp; // stop words cannot start dictionary phrases if ( k > 1 && isStop ) break; long lastj = -1; // do not end on stop word either for ( long j = i ; j < i + k * 2 ; j++ ) { // skip if overflow if ( j >= nw ) continue; // skip punct //if ( w.isPunct(i+j) ) continue; //if ( !s.getScore(i+j) ) continue; if ( ss.m_sectionPtrs[j]->m_flags &badFlags ) continue; // point to word char *ww = w.getWord(j); long wwlen = w.getWordLen(j); // if no room to store word, skip it if ( tmpp + wwlen >= tmpend ) { tmpp = tmp; break; } // write word into buf // convert to lower case so our sort works // they way it should // n is how many bytes we wrote into "tx" long n = to_lower_utf8(tmpp,tmpend,ww,wwlen); // advance it tmpp += n; // remember last word # we added lastj = j; // followed by space, apostrophe or hyphen if ( ww[wwlen] == '-' ) *tmpp = '-'; else if ( ww[wwlen] == '\'' ) *tmpp = '\''; else *tmpp = ' '; tmpp++; } // bail if nothing to add if ( tmpp <= tmp ) continue; // don't add dict phrase if last word is a stop word if ( k > 1 && lastj >= 0 ) { char *ww = w.getWord ( lastj ); long wwlen = w.getWordLen ( lastj ); long long wid = w.getWordId ( lastj ); isStop =::isStopWord(ww,wwlen,wid); // BUT ok if Capitalized or number if ( isStop ) { if ( w.isNum(lastj) ) isStop=false; if ( w.isUpper( lastj ) ) isStop=false; } if ( isStop ) continue; } // point to last space //tmpp--; // overwrite it, terminate with a \n *tmpp = '\n'; // how long is it? 
does not include terminating \n long tmplen = tmpp - tmp; // skip if nothing if ( tmplen <= 0 ) continue; // skip word if it has binary chars in it if ( has_binary ( tmp , tmplen ) ) continue; // debug //if ( strncasecmp ( tmp , "a zero" , 6 ) == 0 ) // log("shit"); // get hash of word/phrase // we need to preserve distinguish between proper // and improper accent marks, so don't do just ascii // by using wh = w.getWordId(i+j) unsigned long long hh = hash64Lower_utf8 (tmp,tmplen ); // don't allow more than one vote per doc for a word long ii = hh % vnumEntries; vchain: if ( vtable[ii] && vtable[ii] != (long)hh ) { if ( ++ii >= vnumEntries ) ii = 0 ; goto vchain; } if ( vtable[ii] ) continue; // store it vtable[ii] = (long)hh; // a new word for this doc // append the word out to file //long fn = to_dict_char(tmp[0]); long fn = tr.getLanguage(); // write the hash before the word //char tt[32]; //sprintf ( tt , "%016llx ", hh ); //if ( write ( fds[fn], tt , 17 ) != 17 ) // return log("spell: makeWordFiles: write: %s", // strerror(errno)); // write out the trailing \n as well long wn = write ( fds[fn] , tmp , tmplen + 1) ; if ( wn != tmplen + 1 ) return log("spell: makePopFiles: " "write: %s", strerror(errno)); numDumped++; if ( numDumped >= numWordsToDump ) goto done; } } //log(LOG_INIT, "lang: got %ld docs, %ld words", //docCount, numDumped); // breakout: // don't need the word voting table anymore if ( vtable ) mfree ( vtable , vtableSize , "Language"); vtable = NULL; docdone: // get more titlerecs so we can hash more words/phrases list.skipCurrentRecord(); if (!list.isExhausted()) goto docloop; startKey = *(key_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey < *(key_t *)list.getLastKey() ) goto done; goto loop; done: // don't need the word voting table anymore log(LOG_INIT, "lang: got %ld docs total", docCount); if ( vtable ) mfree ( vtable , vtableSize , "Language"); vtable = NULL; // close all files for ( long i = 
0 ; i < NUM_UNIFILES ; i++ ) close ( fds[i] ); return true; } // . now convert each sorted file into a unique list of word/phrases // with scores // . score is number of times that word/phrase was found in the file // . truncate each file to the top "maxWordsPerFile" words/phrases bool Language::makeScoreFiles ( long maxWordsPerFile ) { // convert each file for ( long i = 0 ; i < NUM_CHARS ; i++ ) { // open the file for reading char ff[1024]; sprintf ( ff , "%stmp/%s/%s.words.%li.sorted", g_hostdb.m_dir, getLanguageAbbr(m_lang),getLanguageAbbr(m_lang), i ); FILE *fdr = fopen ( ff , "r" ); if ( ! fdr ) return log( "lang: Failed to open %s for reading: " "%s.",ff, strerror(errno)); // and one for writing out score/word pairs sprintf ( ff, "%stmp/%s/%s.words.%li.prescored",g_hostdb.m_dir, getLanguageAbbr(m_lang),getLanguageAbbr(m_lang), i ); FILE *fdw = fopen ( ff , "w" ); if ( ! fdw ) return log( "lang: Failed to open %s for writing: " "%s.",ff, strerror(errno)); log(LOG_INIT,"lang: Making %s.", ff ); // ongoing score count long score = 0; long oldscore = 0; // store last word/phrase in here char lastw [ 1029]; lastw[0] = '\0'; // and its hash in here unsigned long long lasthh = 0; char pbuf[1024]; //long bonus = 0; //bool gotit = false; // do we start w/ '*'? means in dict. 
// read in each line while ( fgets ( pbuf , 1024 , fdr ) ) { char *p = pbuf; // skip '*' //if ( *p == '*' ) { gotit = true ; p++; } //else gotit = false; // skip lines beginning with "the " TOO COMMON if ( (p[0] == 't' || p[0] == 'T') && strncasecmp ( p , "the ", 4 ) == 0 ) continue; // also, "and " if ( (p[0] == 'a' || p[0] == 'A') && strncasecmp ( p , "and ", 4 ) == 0 ) continue; // and, "a " if ( (p[0] == 'a' || p[0] == 'A') && p[1] == ' ') continue; // don't include terminating \n in the length long plen = gbstrlen(p) - 1; if ( plen <= 0 ) continue; // skip if too big and might have been truncated if ( plen >= 1000 ) continue; // NULL terminate it to take off ending * and/or \n p [plen] = '\0'; // get the hash of this word/phrase unsigned long long hh = hash64Lower_utf8 ( p , plen ); //sscanf ( buf , "%llx" , &hh ); // was it same as last? if so, tally and continue if ( hh == lasthh ) { score++; //if ( gotit ) bonus = IN_DICT_BONUS; continue; } // add bonus to score to get final score //score += bonus; // . otherwise, we're starting a new word // . 
print out the word before us if ( score >= MIN_DOCS ) { //if ( gotit ) // bonus ) // fprintf(fdw,"%05li *%s\n",score,lastw); //else fprintf(fdw,"%05li %s\n" ,score,lastw); } // we are now the new word lasthh = hh; strncpy ( lastw , p , 1010 ); //if ( gotit ) bonus = IN_DICT_BONUS; //else bonus = 0; // give us score 1 score = 1; } // write out the last // skip if too big and might have been truncated //score += bonus; if ( score >= MIN_DOCS && gbstrlen(lastw) < 1000) { //if (gotit) fprintf (fdw,"%05li *%s\n",score,lastw ); // else fprintf (fdw,"%05li %s\n" ,score,lastw ); fprintf (fdw,"%05li %s\n" ,score,lastw ); } fclose ( fdr ); fclose ( fdw ); // // now remove small phrases in there just because the // big phrase containing them is the popular one // // open the file for reading sprintf ( ff, "%stmp/%s/%s.words.%li.prescored",g_hostdb.m_dir, getLanguageAbbr(m_lang),getLanguageAbbr(m_lang), i ); fdr = fopen ( ff , "r" ); if ( ! fdr ) return log( "lang: Failed to open %s for reading: " "%s.",ff, strerror(errno)); // and one for writing out score/word pairs sprintf ( ff , "%stmp/%s/%s.words.%li.scored", g_hostdb.m_dir, getLanguageAbbr(m_lang),getLanguageAbbr(m_lang), i ); fdw = fopen ( ff , "w" ); if ( ! fdw ) return log( "lang: Failed to open %s for writing: " "%s.",ff, strerror(errno)); lastw[0] = '\0'; // read in each line while ( fgets ( pbuf , 1024 , fdr ) ) { char *p = pbuf; // don't include terminating \n in the length long plen = gbstrlen(p) - 1; // NULL terminate it to take off ending * and/or \n p [plen] = '\0'; // get score long score = atoi(p); // advance p over score and separating space while ( isdigit(*p) ) p++; p++; // skip '*' //if ( *p == '*' ) { gotit = true ; p++; } //else gotit = false; // debug point //if ( strcmp ( p , "a wide variety of topics" )==0) // log("got it"); // does the new chunk match the last one? 
long n; for ( n = 0 ; p[n] && to_lower_a(p[n]) == to_lower_a(lastw[n]); n++ ); // cancel match if doesn't fail on a word boundary if ( p[n] ) n = 0; if ( is_alnum(lastw[n]) ) n = 0; // if match subtract score so we don't leech our // points from him if ( n > 0 ) score -= oldscore; // if our score is now too low, don't add ourselves if ( score < MIN_DOCS ) continue; // . save it to disk // . this puts the asterisk back at the end of the // word for easier reading //if ( gotit) fprintf(fdw,"%05li %s*\n",score,p); //else fprintf(fdw,"%05li %s\n" ,score,p); fprintf(fdw,"%05li\t%s\n" ,score,p); // store as last oldscore = score; strncpy ( lastw , p , 1010 ); } fclose ( fdr ); fclose ( fdw ); // sort the score file and output to dict.%li char bb[1024]; sprintf( bb, "sort -f -r %stmp/%s/%s.words.%li.scored | " "head -%li > %sdict.new/%s/%s.dict.%li", g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang), i, maxWordsPerFile, g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang), i ); log(LOG_INIT,"lang: %s",bb); system ( bb ); // make the phonets for it too //sprintf(bb,"%sdict.new/dict.%li",g_hostdb.m_dir,i); //makePhonet ( bb ); } return true; } // Get the queries from the http query requests and use them as phrases bool Language::makeQueryFiles ( ) { char buf [1024*10]; for ( long i = 1; i < 2; i++ ){ //fdr = fopen ( "dict/queries.mamma","r" ); char fx[1024]; sprintf( fx,"%sdict/queries.mamma%li",g_hostdb.m_dir, i ); FILE *fdr = fopen ( fx,"r" ); if ( ! 
fdr ) { return log("lang: Could not open query file for " "reading: %s.",strerror(errno)); } // open for writing char ff[1024]; sprintf ( ff , "%stmp/dict.queries.%li", g_hostdb.m_dir, i ); // delete it first unlink ( ff ); // then open a new one for appending int fdw = open ( ff , O_CREAT | O_RDWR | O_APPEND , S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH); if ( fdw < 0 ){ return log("lang: Could not open for %s " "writing: %s.",ff, strerror(errno)); } Url u; Query q; while ( fgets ( buf , 1024 * 10, fdr ) ) { buf[1024 * 10 - 1] = '\0'; // length of word(s), including the terminating \n long wlen = gbstrlen(buf) ; // skip if empty if ( wlen <= 0 ) continue; buf[wlen-1]='\0'; u.set(buf,gbstrlen(buf)); HttpRequest r1,r2; bool status = r1.set ( &u ) ; if ( !status ) continue; r2.set( r1.getRequest(), r1.getRequestLen(), NULL ); char frag[1024]; long flen; char *query = r2.getString( "uip",&flen ); memcpy ( frag, query, flen ); frag[flen++] = '\t'; long queryLen; query = r2.getString( "q",&queryLen ); q.set(query, queryLen, NULL, 0, true); // don't use truncated queries if ( q.m_truncated ) continue; if ( q.m_isBoolean ) continue; long nqw = q.m_numWords; for ( long i = 0 ; i < nqw ; i++ ) { long fragLen = flen; // get a word in the Query to start a fragment // with QueryWord *qw = &q.m_qwords[i]; // can he start the phrase? bool canStart = true; if (!qw->isAlphaWord()) canStart = false; // MDW: wtf is this? //UCScript script = qw->wordScript(); //if ((script != ucScriptCommon) && // (script != ucScriptLatin)) // canStart = false; if ( qw->m_ignoreWord && qw->m_ignoreWord != IGNORE_CONNECTED && qw->m_ignoreWord != IGNORE_QUOTED ) canStart = false; // if he can't start our fragment, // just copy over to "dst" if ( ! canStart ) { continue; } bool inQuotes = qw->m_inQuotes; char fieldCode = qw->m_fieldCode; // . get longest continual fragment that // . starts with word #i. 
get the following // words that can be in a fragment // that starts with word #i start of the frag char *p = qw->m_word; long plen = 0; long lastLen = 0; for ( ; i < nqw ; i++ ) { // . skip if we should // . keep punct, however QueryWord *qw = &q.m_qwords[i]; if ( qw->m_opcode ) break; if ( qw->m_inQuotes != inQuotes ) break; if ( qw->m_fieldCode != fieldCode ) break; // are we punct? lastLen = 0; if ( is_alnum_utf8 ( qw->m_word ) ) lastLen=plen; // inc the ptr plen += qw->m_wordLen; } // revisit this i in big loop since we did not // include it i--; // if last thing we added was punct, roll back // over it if ( lastLen ) { plen = lastLen; i--; } bool lastPunct = false; char *pend = p + plen; for ( ; p < pend ; p += getUtf8CharSize(p) ) { //skip anything but latin-1 //if (c > 255) continue; if ( getUtf8CharSize(p) != 1) continue; // only works on a single character if ( ! to_dict_char ( *p ) ) continue; // skip back to back punct/spaces if ( ! is_alnum_utf8(p) && lastPunct ) continue; if ( ! is_alnum_utf8(p) ) lastPunct = true; else lastPunct=false; // check for a breech if ( fragLen+4>=1023) { break; g_errno = EBUFTOOSMALL; return false; } // language phrases are looking // for latin-1 char cs = getUtf8CharSize(p); if ( cs == 1 ) { frag[fragLen++] = *p; continue; } // otherwise, more than 1 byte char memcpy(frag+fragLen,p,cs); fragLen += cs; } // if any part of the phrase has a mispelling, // discard the query if ( hasMispelling( &frag[flen], fragLen - flen) ){ break; } frag[fragLen++] = '\n'; frag[fragLen] = '\0'; // write out the trailing \n as well long wn = write ( fdw, frag, fragLen ) ; if ( wn != fragLen ) return log("spell: makeWordFiles: " "write: %s", strerror(errno)); // break here so that we only print one phrase // per query break; } } fclose (fdr); close (fdw); // each ip can only vote once for a particular query. 
// Each ip vote counts as one popular vote //char cmd[2048]; // sort, the uniquify so that each ip can have only 1 occurance // of each phrase. Then awk to get just the phrase. // Then sort again and uniquify with count and remove single // occurance phrases. Then sort on the count to get the most // common phrases on top. //sprintf( cmd, "sort -f %s | uniq -i | " //"awk -F \'\\t\' \'{print $2}\' " //"| sort -f | uniq -i -c -d | sort -g -r -k 1,1 " //"> %s.uniq.sorted", ff, ff ); //log ( LOG_INIT,"lang: %s", cmd ); //system(cmd); } return true; } // Make a list of the wikipedia titles of docs found by the query // "site:xx.wikipedia.org", where xx is the abbr of the language. // Store in xx.wiki bool Language::makeWikiFiles( ) { // open for writing char ff[1024]; sprintf ( ff , "%sdict/%s/%s.wiki", g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang) ); // delete it first unlink ( ff ); // then open a new one for appending int fdw = open ( ff , O_CREAT | O_RDWR | O_APPEND , S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH); if ( fdw < 0 ){ log("lang: Could not open for %s " "writing: %s.",ff, strerror(errno)); return true; } // make a state StateWik *st ; try { st = new (StateWik); } catch ( ... ) { g_errno = ENOMEM; log("Lang: new(%i): %s", sizeof(StateWik), mstrerror(g_errno)); return false; } mnew ( st , sizeof(StateWik) , "LanguageWik" ); st->m_fdw = fdw; char query [MAX_QUERY_LEN]; sprintf(query,"site:%s.wikipedia.org",getLanguageAbbr(m_lang)); st->m_coll = g_conf.m_defaultColl; st->m_collLen = gbstrlen(st->m_coll); // . 
a boolFlag of 0 means query is not boolean st->m_q.set ( query, gbstrlen(query), st->m_coll, st->m_collLen, 0 ); // boolFlag st->m_termId = st->m_q.getTermId(0); st->m_startKey = g_indexdb.makeStartKey ( st->m_termId ); st->m_endKey = g_indexdb.makeEndKey ( st->m_termId ); st->m_minRecSize = 500 * 1024; if ( !st->getIndexList( ) ) return false; return st->getSummary(); } bool StateWik::getIndexList( ) { // get the rdb ptr to titledb's rdb //Rdb *rdb = g_indexdb.getRdb(); // -1 means read from all files in Indexdb // get the title rec at or after this docId if ( ! m_msg0.getList ( -1 , 0 , 0 , 0 , // max cache age false , // add to cache? RDB_INDEXDB , // rdbId of 2 = indexdb m_coll , &m_list , m_startKey , m_endKey , m_minRecSize, // recSizes //st->m_useTree , // include tree? //st->m_useCache , // include cache? //false , // add to cache? //0 , // startFileNum //numFiles , // numFiles this , // state gotIndexListWrapper , 0 ) ) // niceness return false; return getSummary( ); } void gotIndexListWrapper( void *state , RdbList *list ){ StateWik *st = (StateWik *) state; list->resetListPtr(); st->getSummary(); return; } bool StateWik::getSummary( ){ m_numMsg20sOutstanding = 0; m_numMsg20sReceived = 0; long numLaunched = 0; // launch MAX_FRAG_SIZE msg20's at a time, wait for all of them while ( numLaunched < MAX_FRAG_SIZE && !m_list.isExhausted() ){ long long docId = m_list.getCurrentDocId () ; // set the summary request then get it! Msg20Request req; Query *q = &m_q; //long nt = q->m_numTerms; req.ptr_qbuf = q->getQuery(); req.size_qbuf = q->getQueryLen()+1; req.ptr_coll = m_coll; req.size_coll = m_collLen+1; req.m_docId = docId; req.m_numSummaryLines = 3; req.m_maxCacheAge = g_conf.m_indexdbMaxIndexListAge; req.m_wcache = true; // addToCache req.m_state = this; req.m_callback = gotSummaryWrapper; req.m_niceness = 0; req.m_expected = true; req.m_boolFlag = q->m_isBoolean; // 2 means auto? req.m_allowPunctInPhrase = true; req.m_showBanned = false; if ( ! 
m_msg20s[numLaunched].getSummary ( &req ) ) m_numMsg20sOutstanding++; #ifdef _OLDMSG20_ if ( !m_msg20s[numLaunched]. getSummary(&m_q, NULL, NULL, docId, -1, //clusterLevel 3,//numLinesInSummary, g_conf.m_indexdbMaxIndexListAge, 1 , //addToCache m_coll , m_collLen , this , gotSummaryWrapper , 0 ,// niceness //m_sequentialTitledbLookup, false ,// titledb restrict? NULL,//m_si->m_displayMetas , 0,//m_si->m_displayMetasLen , 0,//bigSampleRadius , 0,//bigSampleMaxLen , true,//m_si->m_isAdmin , true , //requireallterms false , //count links 0, NULL, //url false, //just get link info false,//considerTitlesFromBody true,// usenewsummaries 0, NULL, //link info NULL, //hostdb true,//expect 2b there? NULL, 0, 0, true,//getvectorrec false,//deduping true,// allowPunctinPhrase false,//showbanned false,//excludeLinkText, false,//hackFixWords, false,//hackFixPhrases, 0,//includeCachedCopy false))// justgetlinkquality m_numMsg20sOutstanding++; #endif m_list.skipCurrentRecord(); numLaunched++; } m_numMsg20sLaunched = numLaunched; if ( m_numMsg20sOutstanding > 0 ) return false; gotSummaryWrapper( this ); return false; } void gotSummaryWrapper ( void *state ){ StateWik *st = (StateWik *) state; st->m_numMsg20sReceived++; if ( !st->m_list.isExhausted() && st->m_numMsg20sLaunched < MAX_FRAG_SIZE ) return; if ( st->m_numMsg20sReceived < st->m_numMsg20sOutstanding ) return; if ( !st->gotSummary( ) ) return; return; } bool StateWik::gotSummary ( ){ for ( long i = 0; i < m_numMsg20sLaunched; i++ ){ if ( m_msg20s[i].m_errno ) continue; char frag[MAX_FRAG_SIZE]; long flen = 0; strcpy(frag, m_msg20s[i].getTitle()); flen = gbstrlen(frag); //log ( LOG_WARN,"lang: Got url %s with title %s", // m_msg20s[i].getUrl(), // m_msg20s[i].getTitle() ); // check for two or more consecutive puncts bool lastPunct = false; bool skip = false; char *p = frag; char *pend = frag + flen; for ( ; p < pend ; p += getUtf8CharSize(p) ) { if ( lastPunct && !is_alnum_utf8(p) ){ skip = true; break; } if ( !is_alnum_utf8 
( p ) ) lastPunct = true; } if ( skip ) continue; // check if all the letters are not alphabets long numAlphas = 0; // anoterh loop p = frag; for ( ; p < pend ; p += getUtf8CharSize(p) ) { if ( !is_alpha_utf8 ( p ) ) numAlphas++; } if ( numAlphas >= flen ) continue; frag[flen++] = '\n'; frag[flen] = '\0'; //log ( LOG_WARN,"lang: Got url %s with title %s", // m_msg20s[i].getUrl(),frag ); // write out the trailing \n as well long wn = write ( m_fdw, frag, flen ) ; if ( wn != flen ) continue; } // see if u can launch more if ( !m_list.isExhausted() ) return getSummary(); // see if the termlist is over if ( m_list.getListSize() >= m_minRecSize ){ // see if u can get some more of the list. m_startKey = *(key_t *)m_list.getLastKey(); m_startKey += (unsigned long) 1; // watch out for wrap around if ( m_startKey >= *(key_t *)m_list.getLastKey() ) return getIndexList(); } // close the file close(m_fdw); return true; } // Generates the phonetics of the words of the dictionary. // Finds the term frequency and then put it as the popularity after adjusting bool Language::makeDict(){ StateDict *st ; try { st = new (StateDict); } catch ( ... ) { g_errno = ENOMEM; log("Lang: new(%i): %s", sizeof(StateDict), mstrerror(g_errno)); return true; } mnew ( st , sizeof(StateDict) , "StateDict" ); m_stateDict = st; char ff[1024]; sprintf(ff,"%sdict/%s/%s.wl", g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang)); File f; f.set (ff); // open file if ( ! f.open ( O_RDONLY ) ) { log("lang: open: %s",mstrerror(g_errno)); return true; } // TODO : CHANGE THIS TO USE fgets // get file size long fileSize = f.getFileSize() ; // store a \0 at the end st->m_dictBufSize = fileSize + 1; // make buffer to hold all st->m_dictBuf = (char *) mmalloc ( st->m_dictBufSize , "LanguageWordsBuf" ); if ( ! st->m_dictBuf) { log("lang: mmalloc: %s",mstrerror(errno));return false; } // read em all in if ( ! 
f.read ( st->m_dictBuf , fileSize , 0 ) ) { log("lang: read: %s", mstrerror(g_errno)); return true; } // change \n to \0 st->m_numTuples = 0; for ( long i = 0 ; i < st->m_dictBufSize ; i++ ) { if ( st->m_dictBuf[i] != '\n' ) continue; st->m_dictBuf[i] = '\0'; st->m_numTuples++; } f.close(); // log a msg log(LOG_INIT,"lang: read %li words into memory", st->m_numTuples ); // alloc space to make them into termids st->m_bufSize = st->m_numTuples * ( sizeof (char*) + 2 * sizeof (long long) ); st->m_buf = (char *) mmalloc ( st->m_bufSize, "LanguagePtrs" ); if ( !st->m_buf ) { log ( LOG_WARN,"lang: could not alloc %li bytes", st->m_bufSize ); g_errno = ENOMEM; return true; } char *p = st->m_buf; st->m_wordsPtr = (char **) p; p += st->m_numTuples * sizeof(char *); st->m_termIds = (long long *)p; p += st->m_numTuples * sizeof(long long); st->m_termFreqs = (long long *)p; p += st->m_numTuples * sizeof(long long); char *coll = g_conf.m_defaultColl; long collLen = gbstrlen(coll); p = st->m_dictBuf; for ( long i = 0; i < st->m_numTuples; i++ ){ st->m_wordsPtr[i] = p; p += gbstrlen(p) + 1; long wordLen = gbstrlen(st->m_wordsPtr[i]); // . set query class // . a boolFlag of 0 means query is not boolean Query q; q.set ( st->m_wordsPtr[i], wordLen , coll , collLen , 0 ); st->m_termIds[i] = q.getTermId(0); st->m_termFreqs[i] = 0; } if ( !st->m_msg37.getTermFreqs ( coll , 0 , // maxAge st->m_termIds , st->m_numTuples , st->m_termFreqs , this , gotTermFreqsWrapper, 0 , // niceness false ))// exact count? 
return false; gotTermFreqsWrapper(this); return true; } void gotTermFreqsWrapper(void *state){ Language *lang = (Language *) state; lang->gotTermFreqs(lang->m_stateDict); } bool Language::gotTermFreqs( StateDict *st ){ int fd; char ff[1024]; sprintf ( ff , "%sdict/%s/%s.wl.phonet",g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang)); // delete it first unlink ( ff ); // then open a new one for appending fd = open ( ff , O_CREAT | O_RDWR | O_APPEND , S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH); if ( fd < 0 ){ log("lang: Could not open %s for writing: " "%s.",ff, strerror(errno)); st->m_numTuples = 0; } long long max = 0LL; for ( long i = 0; i < st->m_numTuples; i++ ){ if ( st->m_termFreqs[i] > max ) max = st->m_termFreqs[i]; } char cleanWord[MAX_PHRASE_LEN]; char phonetic[MAX_PHRASE_LEN]; long wordLen = 0; char tmp[1024]; for ( long i = 0; i < st->m_numTuples; i++ ){ wordLen = gbstrlen(st->m_wordsPtr[i]); // clean the word, i.e. convert word to uppercase and // remove possible accents makeClean( st->m_wordsPtr[i], wordLen, cleanWord, MAX_PHRASE_LEN ); getPhonetic ( cleanWord, gbstrlen(cleanWord), phonetic, MAX_PHRASE_LEN ); long long freq = ( st->m_termFreqs[i] * 32000 ) / max ; sprintf(tmp,"%lli\t%s\t%s\n", freq, st->m_wordsPtr[i], phonetic); unsigned long wn = write ( fd , tmp , gbstrlen(tmp) ) ; if ( wn != gbstrlen(tmp) ){ log("lang: makeWordFiles: write: %s", strerror(errno)); break; } } close(fd); mfree ( st->m_dictBuf, st->m_dictBufSize,"LanguageDictBuf" ); mfree ( st->m_buf, st->m_bufSize,"LanguageBuf"); mdelete(st,sizeof(StateDict),"StateDict"); delete(st); return true; } #if 0 bool Language::makeAffinities(){ // make a state StateAff *st ; try { st = new (StateAff); } catch ( ... 
) { g_errno = ENOMEM; log("Lang: new(%i): %s", sizeof(StateAff), mstrerror(g_errno)); return false; } mnew ( st , sizeof(StateAff) , "LanguageAffinity" ); st->m_fileNum = 12; // blocked if ( !openAffinityFile(st) ) return false; return st->doneAffinities(st); } bool StateAff::openAffinityFile( ){ if ( m_fileNum >= NUM_CHARS ) return true; // open for reading char ff[1024]; sprintf ( ff , "%sdict/dict.%li", g_hostdb.m_dir, m_fileNum ); m_fdr = fopen ( ff, "r" ); if ( !m_fdr ) { log("lang: test: Could not open %s for " "reading: %s.", ff,strerror(errno)); return true; } // open for writing sprintf ( ff , "%sdict.new/dict.%li.aff", g_hostdb.m_dir, m_fileNum ); // delete it first unlink ( ff ); // then open a new one for appending m_fdw = open ( ff , O_CREAT | O_RDWR | O_APPEND , S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH); if ( m_fdw < 0 ){ log("lang: Could not open for %s " "writing: %s.",ff, strerror(errno)); return true; } if ( !launchAffinity(st) ){ return false; } m_fileNum++; return openAffinityFile(st); } bool Language::launchAffinity(StateAff *st){ //char dst[1026]; // go through the words in dict/words while ( fgets ( m_buf , MAX_FRAG_SIZE , m_fdr ) ){ // length of word(s), including the terminating \n long wlen = gbstrlen(m_buf) ; // skip if empty if ( wlen <= 0 ) return launchAffinity(st); m_buf[wlen-1]='\0'; // skip to the phrase. titlerec dict have space as a seperator char *p = m_buf; while ( *p != ' ' ) p++; p++; char *coll = g_conf.m_defaultColl; long collLen = gbstrlen(coll); // . set query class // . 
a boolFlag of 0 means query is not boolean long numTerms = 0; Query *q = &m_q; if ( q->set ( p, gbstrlen(p), coll, collLen, 0 ) ) numTerms = q->getNumTerms(); // no use doing affinities on 1 word phrases if ( numTerms <= 1 ){ char dst[1096]; sprintf( dst, "00000\t%s\n", m_buf ); log("%s",dst); unsigned long wn = write(m_fdw, dst, gbstrlen(dst)); if ( wn != gbstrlen(dst) ) log("lang: genTopPopFile: write: %s", strerror(errno)); continue; } m_msg3a.reset(); if ( !m_msg3a. getDocIds( q , coll , collLen , 100.0 , g_conf.m_indexdbMaxIndexListAge, true , 0 ,//stage0 30, 0 , this, gotAffinityFreqs1Wrapper ) ) return false; return gotAffinityFreqs1(st); } fclose(m_fdr); close(m_fdw); return true; } void gotAffinityFreqs1Wrapper(void *state){ StateAff *st = (StateAff *) state; st->gotAffinityFreqs1(st); return; } bool StateAff::gotAffinityFreqs1( ){ m_denominator = m_msg3a.getNumTotalHits(); // now get the phrase hits char *p = m_buf; while ( *p != ' ' ) p++; // change the space to a quote *p = '\"'; //go to the end while ( *p != '\0' ) p++; //change that to quote *p = '\"'; p++; // null end *p = '\0'; p = m_buf; while ( *p != '\"') p++; char *coll = g_conf.m_defaultColl; long collLen = gbstrlen(coll); // . set query class // . a boolFlag of 0 means query is not boolean Query *q = &m_q; q->set ( p, gbstrlen(p), coll, collLen, 0 ); m_msg3a.reset(); if ( !m_msg3a. 
getDocIds( q , coll , collLen , 100.0 , g_conf.m_indexdbMaxIndexListAge, true , 0 ,//stage0 30, 0 , this , gotAffinityFreqs2Wrapper ) ) return false; return gotAffinityFreqs2(st); } void gotAffinityFreqs2Wrapper(void *state){ StateAff *st = (StateAff *) state; st->gotAffinityFreqs2(st); return; } bool StateAff::gotAffinityFreqs2(StateAff *st){ m_numerator = m_msg3a.getNumTotalHits(); double affinity = 0; if ( m_denominator > 0 ) affinity = (double)m_numerator / (double)m_denominator; affinity *= 10000; char dst[1096]; sprintf( dst, "%05.0f\t%s\n", affinity, m_buf ); log("num=%lli, denom=%lli, %s",m_numerator,m_denominator,dst); unsigned long wn = write ( m_fdw , dst , gbstrlen(dst) ) ; if ( wn != gbstrlen(dst) ) log("lang: genTopPopFile: write: %s",strerror(errno)); //blocked if ( !launchAffinity(st) ) return false; // didn't block means the file ended m_fileNum++; if ( !openAffinityFile(st) ) return false; return doneAffinities(st); } bool StateAff::doneAffinities(StateAff *st){ mdelete(st,sizeof(StateAff), "StateAff"); delete(st); return true; } #endif /////////////////////////////////////////////////////// // DICTIONARY MANIPULATION ROUTINES BELOW HERE // /////////////////////////////////////////////////////// // Clean query dict file of mispelleings // NOTE: This function shall only compare each word to see if the phrase // is present in the most commonly mispelled words list, that is present // in the file mispelled_words. For spellchecking, use spellcheckDict() // NOTE: Whenever you use these functions, please check the infile, outfile // and the text format is correct bool Language::cleanDictFile ( ) { char buf [1024*10]; char fx[1024]; sprintf( fx,"%sdict/%s/%s.query.phonet",g_hostdb.m_dir, getLanguageAbbr(m_lang),getLanguageAbbr(m_lang) ); FILE *fdr = fopen ( fx,"r" ); if ( ! 
fdr ) { return log("lang: Could not open query file for " "reading: %s.",strerror(errno)); } // open for writing char ff[1024]; sprintf ( ff , "%stmp/query.phonet.clean", g_hostdb.m_dir ); // delete it first unlink ( ff ); // then open a new one for appending int fdw = open ( ff , O_CREAT | O_RDWR | O_APPEND , S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH); if ( fdw < 0 ){ return log("lang: Could not open for %s " "writing: %s.",ff, strerror(errno)); } while ( fgets ( buf , 1024 * 10, fdr ) ) { buf[1024 * 10 - 1] = '\0'; // length of word(s), including the terminating \n long wlen = gbstrlen(buf) ; // skip if empty if ( wlen <= 0 ) continue; //buf[wlen-1]='\0'; char *p = buf; while ( *p != '\t' ) p++; p++; char *str = p; while ( *p != '\t' ) p++; if ( hasMispelling(str, p - str) ) continue; // write out the trailing \n as well long wn = write ( fdw, buf, wlen ) ; if ( wn != wlen ) return log("spell: makeWordFiles: " "write: %s", strerror(errno)); // break here so that we only print one phrase // per query } return true; } // opens each file and creates the (score, word, phonet) tuple and stores // in phonet file. Normalizes scores to a high score of 32000. Also removes // tuples for which there are no phonets and tuples that are adult. 
// The incoming file is supposed to be a tuple of (score, word) bool Language::makePhonet( char *infile){ loadRules(); // create the output file int fdw; char outfile[1024]; sprintf ( outfile , "%s.phonet", infile); // delete it first unlink ( outfile ); // then open a new one for appending fdw = open ( outfile , O_CREAT | O_RDWR | O_APPEND , S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH); if ( fdw < 0 ) return log("lang: Could not open %s for writing: " "%s.", outfile, strerror(errno)); char buf[1024]; long max = 0; // open the input file FILE *fdr; // then open fdr = fopen ( infile, "r" ); if ( !fdr ) return log("lang: Could not open %s for writing: " "%s.", outfile, strerror(errno)); // this loop goes through all the tuples and finds max score while ( fgets ( buf , 1024 , fdr ) ) { long wlen = gbstrlen(buf); if ( wlen <= 0 || wlen > MAX_PHRASE_LEN ) continue; // remove the newline \n buf [wlen - 1] = '\0'; char *p = buf; while ( *p == ' ' ) p++; // first is the popularity score if ( atoi (p) > max ) max = atoi(p); } // close fclose(fdr); // then open fdr = fopen ( infile, "r" ); if ( !fdr ) return log("lang: Could not open %s for writing: " "%s.", outfile, strerror(errno)); char *scorePtr; char *wordPtr; char cleanWord[MAX_PHRASE_LEN]; char phonetic[MAX_PHRASE_LEN]; long wordLen = 0; char tmp[1024]; // this loop goes through all the tuples and only adds those // tuples into the phonetic dict that have phonets. Normalizes scores. while ( fgets ( buf , 1024 , fdr ) ) { long wlen = gbstrlen(buf); if ( wlen <= 0 || wlen > MAX_PHRASE_LEN ) continue; // remove the newline \n buf [wlen - 1] = '\0'; char *p = buf; while ( *p == ' ' ) p++; // first is the popularity score scorePtr = p; long long score = (long long ) atoi(scorePtr); // normalize score score = ( score * 32000 )/ max; // skip it while ( *p != '\t' ) p++; // null end it *p = '\0'; p++; wordPtr = p; wordLen = gbstrlen( wordPtr ); // make the all letters in lower case to_lower1(p); // clean the word, i.e. 
convert word to uppercase and // remove possible accents if (!makeClean(wordPtr, wordLen, cleanWord, MAX_PHRASE_LEN)){ log ( "removed unclean phrase %s", p ); continue; } if ( !getPhonetic ( cleanWord, gbstrlen(cleanWord), phonetic, MAX_PHRASE_LEN ) ){ log ( "could not get phonetic of phrase %s", p ); continue; } if ( gbstrlen(phonetic) == 0 ){ log ( "got 0 len phonetic of phrase %s", p ); continue; } sprintf(tmp,"%lli\t%s\t%s\n",score, wordPtr, phonetic); unsigned long wn = write ( fdw , tmp , gbstrlen(tmp) ) ; if ( wn != gbstrlen(tmp) ) return log("lang: makePopPhonet: write: " "%s",strerror(errno)); } close(fdw); fclose(fdr); // all done return true; } bool Language::genTopPopFile ( char *infile ){ // open the input file FILE *fdr; // then open fdr = fopen ( infile, "r" ); if ( !fdr ) return log("lang: Could not open %s for reading: " "%s.", infile, strerror(errno)); // create the output file int fdw; char outfile[1024]; sprintf ( outfile , "%s.top", infile ); // delete it first unlink ( outfile ); // then open a new one for appending fdw = open ( outfile , O_CREAT | O_RDWR | O_APPEND , S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH); if ( fdw < 0 ) return log("lang: Could not open %s for writing: " "%s.", outfile, strerror(errno)); char buf[1024]; long count = 0; // this loop goes through all the words and only adds those // tuples into the distributed file that belong to this host. 
while ( fgets ( buf , 1024 , fdr ) ) { // put the first TOP_POP_PHRASES words if ( count++ >= TOP_POP_PHRASES ) break; long wlen = gbstrlen(buf); if ( wlen <= 0 || wlen > MAX_PHRASE_LEN ) continue; unsigned long wn = write ( fdw , buf , gbstrlen(buf) ) ; if ( wn != gbstrlen(buf) ) return log("lang: genTopPopFile: write: " "%s",strerror(errno)); } close(fdw); fclose(fdr); return true; } */ // the distributed pop file is stored as a tuple of (phrase, phonet, lang, pop) // to comply with the unified dict bool Language::genDistributedPopFile ( char *infile, unsigned long myHash ){ // open the input file FILE *fdr; // then open fdr = fopen ( infile, "r" ); if ( !fdr ) return log("lang: Could not open %s for writing: " "%s.", infile, strerror(errno)); // create the output file int fdw; char outfile[1024]; sprintf ( outfile , "%s.%li", infile, myHash ); // delete it first unlink ( outfile ); // then open a new one for appending fdw = open ( outfile , O_CREAT | O_RDWR | O_APPEND , S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH); if ( fdw < 0 ) return log("lang: Could not open %s for writing: " "%s.", outfile, strerror(errno)); char buf[1024]; long hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits; hostsPerSplit /= g_hostdb.m_numHostsPerGroup; long count = 0; // this loop goes through all the words and only adds those // tuples into the distributed file that belong to this host. 
while ( fgets ( buf , 1024 , fdr ) ) { // skip the first TOP_POP_PHRASES words because they shall be // put in the top pop file if ( count++ < TOP_POP_PHRASES ) continue; long wlen = gbstrlen(buf); if ( wlen <= 0 || wlen > MAX_PHRASE_LEN ) continue; // remove the newline \n buf [wlen - 1] = '\0'; char *p = buf; char *pend = p + wlen - 1; // first is the popularity score char *score = p; while ( *p != '\t' && p < pend ) p++; // null end the score *p = '\0'; p++; // next is the phrase char *phrase = p; while ( *p != '\t' && p < pend ) p++; p++; // check if we're at the phonet if ( p >= pend ) continue; char *phonet = p; unsigned long long phonetKey = hash64Lower_utf8(phonet); if ( phonetKey % hostsPerSplit != myHash ) continue; char tmp[1024]; sprintf(tmp,"%s\t%s\n", phrase, score); // put the \n in place of \0 //buf [wlen-1] = '\n'; unsigned long wn = write ( fdw , tmp , gbstrlen(tmp) ) ; if ( (long)wn != gbstrlen(tmp) ) return log("lang: genDistributedPop: write: " "%s",strerror(errno)); } close(fdw); fclose(fdr); return true; } // heuristic code to spellcheck the dictionary // spellcheck each word in the pop words dictionary with forceReco on so that // we get a recommendation. Output words that have a recommendation that has // 4 times the popularity of the word long Language::spellcheckDict(){ if ( !loadWikipediaWords() ) return 0; char ff[1024]; sprintf ( ff , "%sdict/%s/%s.query.phonet", g_hostdb.m_dir, getLanguageAbbr(m_lang), getLanguageAbbr(m_lang)); FILE *fd = fopen ( ff, "r" ); if ( ! 
fd ) {
		log("lang: test: Could not open %s for "
		    "reading: %s.", "query.phonet",strerror(errno));
		return 0;
	}
	// create the output file
	int fdw;
	char outfile[1024];
	sprintf ( outfile , "%s.spellcheck", ff );
	// delete it first
	unlink ( outfile );
	// then open a new one for appending
	fdw = open ( outfile , O_CREAT | O_RDWR | O_APPEND ,
		     S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
	if ( fdw < 0 )
		return log("lang: Could not open %s for writing: "
			   "%s.", outfile, strerror(errno));
	// phrases we decided to kick out of the dict, keyed by a hash of
	// the phrase text (see addKey() at the bottom of the loop)
	HashTableT kickedOutPhrases;
	kickedOutPhrases.set(256);
	// counts the phrases kicked out; this is the return value
	long notFound = 0;
	// NOTE(review): buf is 1026 bytes but fgets() below is given
	// MAX_FRAG_SIZE -- if MAX_FRAG_SIZE > 1026 this can overflow the
	// buffer; confirm MAX_FRAG_SIZE <= 1026 or size buf to match
	char buf[1026];
	//char dst[1026];
	// go through the words in dict/words
	while ( fgets ( buf , MAX_FRAG_SIZE , fd ) ) {
		// length of word(s), including the terminating \n
		long wlen = gbstrlen(buf) ;
		// skip if empty
		if ( wlen <= 0 ) continue;
		buf[wlen-1]='\0';
		// split the score\tphrase\tphonet tuple in place by turning
		// each tab into a NUL
		for ( long j = 0; j < wlen; j++ )
			if ( buf[j] == '\t') buf[j] = '\0';
		char *tuple = buf;
		//skip score and go to phrase
		tuple += gbstrlen(tuple) + 1;
		char *word = tuple;
		// . make the all letters in lower case
		// . TODO: fix for utf8 words?
		to_lower1_a(word);
		// check for adult words
		/*if ( isAdult (word) ){
			log(LOG_WARN,"lang: kicking out adult phrase=%s",
			    word);
			continue;
		}*/
		unsigned long long h = hash64d ( word, gbstrlen(word));
		bool isInWiki = false;
		// if the phrase is in wikipedia, its safe
		long slot = m_wiki.getSlot(h);
		if ( slot != -1 ) isInWiki = true;
		long wordPop = g_speller.getPhrasePopularity( word, h, false );
		// fall back to the distributed pop table when the speller
		// reports no popularity for this phrase
		if ( wordPop == 0 ) {
			slot = m_distributedPopPhrases.getSlot(h);
			if ( slot != -1 ){
				wordPop = m_distributedPopPhrases.
					getValueFromSlot(slot);
			}
		}
		// multi-word phrases contain a space
		bool isPhrase = false;
		while ( *tuple != '\0' ){
			if ( *tuple == ' ' ) isPhrase = true;
			tuple++;
		}
		// point back to the phrase
		tuple = word;
		char recommendation[MAX_PHRASE_LEN];
		bool found;
		long score;
		long pop;
		/* if ( !isPhrase && !isInWiki ){
			// just the best narrow phrase we can find
			long numNarrow = 0;
			char narrow[MAX_PHRASE_LEN];
			long narrowPop;
			numNarrow = narrowPhrase ( word, narrow,
						   &narrowPop, 1 );
			if ( numNarrow == 0 ){
				log (LOG_WARN,"lang: no Narrow Searches "
				     "for %s",word);
				continue;
			}
			word = narrow;
			wordPop = narrowPop;
		} */
		bool reco = getRecommendation( word, gbstrlen(word),
					       recommendation,
					       MAX_PHRASE_LEN,
					       &found, &score, &pop,
					       true );// forceReco
		// if a kicked out phrase is the recommendation, then DON'T
		// kick out this one too, because it probably means that the
		// kicked out phrase was good. BUT should we put the kicked
		// out phrase back ??
		if ( reco && !isInWiki ){
			long h1 = hash32d ( recommendation,
					    gbstrlen(recommendation) );
			slot = m_wiki.getSlot(h1);
			// if the recommendation is in wiki, then double the
			// pop of the recommendation
			// NOTE(review): the "&& !isInWiki" below is
			// redundant -- this branch already requires it
			if ( slot != -1 && !isInWiki ){
				log (LOG_WARN,"lang: recommendation=%s "
				     "is in the wiki. kicks out phrase %s",
				     recommendation, buf+gbstrlen(buf)+1);
				pop *= 2;
			}
			// NOTE(review): kickedOutPhrases is probed here with
			// the 32-bit hash32d() of the recommendation, but it
			// is populated below with the 64-bit hash64d() of
			// the word -- the two key spaces can seemingly never
			// match, so this lookup looks like it never hits;
			// confirm which hash was intended
			slot = kickedOutPhrases.getSlot(h1);
			if ( slot != -1 ){
				log (LOG_WARN,"lang: recommendation has "
				     "already been kicked out, word=%s, "
				     "reco=%s",buf+gbstrlen(buf)+1,
				     recommendation );
				reco = false;
			}
		}
		// keep the tuple (write it back out) when:
		// it is found in wikipedia OR
		// no reco is found (even though it is a phrase) OR
		// phrase popularity is more than 1/4 the reco popularity OR
		// the reco score is greater than 99
		if ( isInWiki || !reco || wordPop * 4 > pop || score > 99 ){
			char tmp[MAX_FRAG_SIZE];
			sprintf(tmp,"%s\t%s\t%s\n",buf, tuple,
				tuple + gbstrlen(tuple) + 1);
			unsigned long wn = write ( fdw , tmp ,
						   gbstrlen(tmp) );
			if ( (long)wn != gbstrlen(tmp) )
				return log("spell: spellCheckDict: write: "
					   "%s",strerror(errno));
			continue;
		}
		// otherwise kick the phrase out and remember it by its
		// 64-bit hash
		kickedOutPhrases.addKey(h,1);
		log ( LOG_WARN,"lang: not found=%s, reco=%s, "
		      "score=%li, wordPop=%li, recoPop=%li",
		      buf + gbstrlen(buf) + 1, recommendation, score,
		      wordPop, pop );
		notFound++;
	}
	close (fdw);
	fclose(fd);
	return notFound;
}