#include "gb-include.h" #include "Mem.h" #include "Conf.h" #include "Dns.h" #include "HttpServer.h" #include "Loop.h" #include // setrlimit #include "Speller.h" #include #include /* static void handleRequestSpeller ( UdpSlot *slot , int32_t netnice ); static void gotSpellerReplyWrapper (void *state, void *state2); bool Speller::registerHandler ( ) { // . register ourselves with the udp server // . it calls our callback when it receives a msg of type 0x39 if ( ! g_udpServer.registerHandler ( 0x3d, handleRequestSpeller )) return false; return true; } // . handle a request to get a linkInfo for a given docId/url/collection // . returns false if slot should be nuked and no reply sent // . sometimes sets g_errno on error void handleRequestSpeller ( UdpSlot *slot , int32_t netnice ) { // The request is the string to be spellchecked, null ended char *request = slot->m_readBuf; // first tells us if we should narrow the search stuff bool narrowP = *(bool *) request; request += sizeof(bool); // is it found in dict or pop words bool found; int32_t score; char reco[MAX_PHRASE_LEN]; int32_t pop; int64_t start = gettimeofdayInMilliseconds(); bool recommendation = g_speller.m_language[langEnglish]. getRecommendation( request, gbstrlen(request), reco, MAX_PHRASE_LEN, &found, &score, &pop ); log ( LOG_DEBUG,"speller: %s --> %s", request, reco ); int32_t numNarrow = 0; char narrow[MAX_NARROW_SEARCHES * MAX_PHRASE_LEN]; int32_t narrowPops[MAX_NARROW_SEARCHES]; //if ( narrowP ) // numNarrow = g_speller.m_language[langEnglish]. // narrowPhrase ( request, narrow, narrowPops, // MAX_NARROW_SEARCHES ); // calculate total reply size // int32_t replySize = found + recommendation + score + pop + reco int32_t replySize = sizeof(bool) + sizeof(bool) + 4 + 4 + gbstrlen(reco) + 1; if ( narrowP ){ replySize += 4; // numPhrases for ( int32_t i = 0; i < numNarrow; i++ ) replySize += 4 + gbstrlen(&narrow[i*MAX_FRAG_SIZE]) + 1; } char *reply = (char*) mmalloc(replySize, "SpellerReplyBuf"); if ( !reply ) { g_errno = ENOMEM; //g_udpServer.sendReply_ass( NULL, 0, NULL, 0, slot ); g_udpServer.sendErrorReply( slot , g_errno ); return; } char *p = reply; *(bool *)p = found; p += sizeof(bool); *(bool *)p = recommendation; p += sizeof(bool); // store the score and pop *(int32_t *) p = score; p += 4; *(int32_t *) p = pop; p += 4; // store the recommendation strcpy( p, reco ); p += gbstrlen(reco) + 1; if ( narrowP ){ // store the number of narrow phrases found *(int32_t *) p = numNarrow; p += 4; for ( int32_t i = 0; i < numNarrow; i++ ){ *(int32_t *)p = narrowPops[i]; p += 4; strcpy(p, &narrow[i * MAX_FRAG_SIZE]); p += gbstrlen(&narrow[i * MAX_FRAG_SIZE]) + 1; } } //sanity check if ( p - reply != replySize ){ char *xx = NULL; *xx = 0; } int64_t end = gettimeofdayInMilliseconds(); if ( end - start > 1 ) log (LOG_INFO,"speller: took %"INT64" ms to spellcheck " "fragment %s", end- start, request); g_udpServer.sendReply_ass ( reply , replySize, reply , replySize, slot ); } */ Speller g_speller; Speller::Speller(){ //m_unifiedBuf = NULL; //mm_unifiedBufSize = 0; } Speller::~Speller(){ reset(); } char *g_str=NULL; bool Speller::init(){ static bool s_init = false; if ( s_init ) return true; s_init = true; /* m_hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits; m_hostsPerSplit /= g_hostdb.m_numHostsPerShard; if ( m_hostsPerSplit <= 0 ) return log("db: the in gb.conf is probably not " "too big. Are you using the wrong hosts.conf?"); // check if we've got enough multicasts avaiable if ( m_hostsPerSplit > MAX_UNIQUE_HOSTS_PER_SPLIT ){ log( LOG_WARN,"speller: not enough multicasts available for " "this host configuration. Increase multicasts" ); return false; } */ if ( !loadUnifiedDict() ) return log("spell: Could not load unified dict from " "unifiedDict-buf.txt and unifiedDict-map.dat"); // this seems to slow our startup way down!!! log("speller: turning off spell checking for now"); return true; /* int32_t myHash = g_hostdb.m_hostId % ( m_hostsPerSplit * g_hostdb.m_indexSplits ); myHash /= g_hostdb.m_indexSplits; //for ( int32_t i = 0; i < MAX_LANGUAGES; i++ ) m_language[langEnglish].init ( m_unifiedBuf.getBufStart(), m_unifiedBuf.length(), langEnglish, m_hostsPerSplit, myHash ); return true; */ } void Speller::reset(){ //if ( m_unifiedBuf && m_unifiedBufSize > 0 ) // mfree ( m_unifiedBuf, m_unifiedBufSize, "SpellerBuf" ); m_unifiedBuf.purge(); m_unifiedDict.reset(); /* for(int32_t i = 0; i < MAX_LANGUAGES; i++) m_language[i].reset(); */ //m_unifiedBuf = NULL; //m_unifiedBufSize = 0; } // test it. void Speller::test ( char *ff ) { //char *ff = "/tmp/sctest"; FILE *fd = fopen ( ff, "r" ); if ( ! fd ) { log("speller: test: Could not open %s for " "reading: %s.", ff,strerror(errno)); return; } char buf[1026]; //char dst[1026]; // go through the words in dict/words while ( fgets ( buf , MAX_FRAG_SIZE , fd ) ) { // length of word(s), including the terminating \n int32_t wlen = gbstrlen(buf) ; // skip if empty if ( wlen <= 0 ) continue; buf[wlen-1]='\0'; Query q; q.set2 ( buf , langUnknown , false ); //if ( getRecommendation ( &q, dst , 1024 ) ) // log(LOG_INIT,"speller: %s-->%s",buf,dst); // else // log(LOG_INIT,"speller: %s",buf); } fclose(fd); } /* /////////////////////////////////////////////////////// // RECOMMENDATION ROUTINES BELOW HERE // // These will spellcheck and give recommendations /////////////////////////////////////////////////////// bool Speller::canStart( QueryWord *qw ) { // can only start with a alpha character, no numeric if ( ! is_alnum_utf8 ( qw->m_word+0 ) ) return false; if ( qw->m_ignoreWord && qw->m_ignoreWord != IGNORE_CONNECTED && qw->m_ignoreWord != IGNORE_QUOTED ) return false; // don't check 'rom' in phrase "cd-rom", or 't' in "ain't" if ( qw->m_leftConnected ) return false; // don't start with a stop word if ( qw->m_isStopWord ) return false; // a lot of field terms should not be spell checked if ( qw->m_fieldCode ) { if ( qw->m_fieldCode != FIELD_TITLE && qw->m_fieldCode != FIELD_CITY && qw->m_fieldCode != FIELD_AUTHOR && qw->m_fieldCode != FIELD_COUNTRY ) return false; } return true; } // . returns false if blocked // recommended something different than original query, "q" // and false otherwise // . also returns false and sets g_errno on error // . stores recommended query in "dst" and NULL terminates it // . if dst is too small it will bitch and return true with g_errno set bool Speller::getRecommendation ( Query *q, bool spellcheck, char *dst, // recommendation destination int32_t dstLen, // recommendation max len bool narrowSearch, char *narrow, // narrow search int32_t narrowLen, // narrow search len int32_t *numNarrows, // num narrows found void *state, void (*callback)(void *state) ){ *dst = '\0'; *narrow = '\0'; // no narrowing search if spellchecking is off if ( !spellcheck ) return true; // don't spellcheck queries that are more than MAX_FRAG_SIZE int32_t. if ( q->getQueryLen() >= MAX_FRAG_SIZE ) return true; StateSpeller *st ; try { st = new (StateSpeller); } catch ( ... ) { g_errno = ENOMEM; log("Speller: new(%i): %s", sizeof(StateSpeller), mstrerror(g_errno)); return true; } mnew ( st , sizeof(StateSpeller) , "State00" ); st->m_state = state; st->m_callback = callback; st->m_q = q; st->m_spellcheck = spellcheck; st->m_dst = dst; st->m_dend = dst + dstLen; st->m_narrowSearch = narrowSearch; st->m_nrw = narrow; st->m_nend = narrow + narrowLen; st->m_numNarrow = numNarrows; *st->m_numNarrow = 0; st->m_start = gettimeofdayInMilliseconds(); st->m_numFrags = 0; st->m_numFragsReceived = 0; // . break query down into fragments // . each fragment is a string of words // . quotes and field names will separate fragments // . TODO: make field data in its own fragment int32_t nqw = q->m_numWords; for ( int32_t i = 0 ; i < nqw ; i++ ) { // get a word in the Query to start a fragment with QueryWord *qw = &q->m_qwords[i]; // can he start the phrase? if ( ! canStart( qw ) ) continue; bool inQuotes = qw->m_inQuotes; char fieldCode = qw->m_fieldCode; // . get longest continual fragment that starts with word #i // . get the following words that can be in a fragment // that starts with word #i // . start of the frag int32_t endQword = i; int32_t startQword = i; for ( ; i < nqw ; i++ ) { // . skip if we should // . keep punct, however QueryWord *qw1 = &q->m_qwords[i]; if ( qw1->m_opcode ) break; if ( qw1->m_inQuotes != inQuotes ) break; if ( qw1->m_fieldCode != fieldCode ) break; if ( qw1->m_ignoreWord == IGNORE_FIELDNAME ) break; if ( qw1->m_phraseSign && !qw1->m_rightConnected ) break; // are we punct? if ( ! is_alnum_utf8(qw1->m_word) ) endQword = i - 1; else endQword = i; } // revisit this i in big loop since we did not include it i = endQword; //create a new stateFrag StateFrag *stFrag; try { stFrag = new (StateFrag); } catch ( ... ) { mdelete ( st, sizeof(StateSpeller), "StateSpeller" ); delete (st); g_errno = ENOMEM; log("Speller: new(%i): %s", sizeof(StateFrag), mstrerror(g_errno)); //continue; return true; } mnew ( stFrag, sizeof(StateFrag), "StateFrag" ); stFrag->m_state = (void*) st; stFrag->m_narrowPhrase = st->m_narrowSearch; stFrag->m_q = q; stFrag->m_startQword = startQword; stFrag->m_endQword = endQword; stFrag->m_errno = 0; st->m_stFrag[st->m_numFrags] = stFrag; st->m_numFrags++; // blocked if ( !getRecommendation( stFrag ) ){ continue; } st->m_numFragsReceived++; } // if outstanding frags if ( st->m_numFragsReceived < st->m_numFrags ) return false; gotFrags(st); // delete state mdelete ( st, sizeof(StateSpeller), "StateSpeller" ); delete (st); return true; } bool Speller::getRecommendation ( StateFrag *st ){ st->m_recommended = false; st->m_numFound = 0; st->m_numNarrowPhrases = 0; char *dst = st->m_dst; // normalize this fragment and store in "dst" bool wasAlnum = true; for ( int32_t i = st->m_startQword; i <= st->m_endQword; i++ ){ // start of each word st->m_wp[i] = dst; char *p = st->m_q->m_qwords[i].m_word; int32_t plen = st->m_q->m_qwords[i].m_wordLen; for ( int32_t j = 0; dst-st->m_dst 0 && !is_alnum_utf8(p+j) &&!wasAlnum) continue; *dst = p[j]; dst++; wasAlnum = is_alnum_utf8 ( p+j ); } st->m_wplen[i] = dst - st->m_wp[i]; st->m_isfound[i] = false; } *dst = '\0'; // debug msg log(LOG_DEBUG,"speller: Getting recommendation for frag=%s", st->m_dst); // give each word in the phrase a chance to start the subphrase int32_t maxPhrase = st->m_endQword - st->m_startQword; if ( maxPhrase > MAX_WORDS_PER_PHRASE ) maxPhrase = MAX_WORDS_PER_PHRASE; // store the phraseLen and posn st->m_pLen = maxPhrase; st->m_pPosn = st->m_startQword; return launchReco(st); } bool Speller::launchReco(StateFrag *st){ // if we checked all the phrases or found all the words if ( st->m_numFound == st->m_endQword - st->m_startQword + 1 || st->m_pLen < 0 ){ return true; } bool launchPhrase = false; for ( ; st->m_pLen >= 0; st->m_pLen-- ){ for ( ; st->m_pPosn + st->m_pLen <= st->m_endQword; st->m_pPosn++ ) { // find a word that can start the phrase QueryWord *qw = &st->m_q->m_qwords[st->m_pPosn]; if ( !canStart (qw) ) continue; // don't do this phrase if we have found even one // word in the phrase bool found = false; for ( int32_t k = st->m_pPosn; k <= st->m_pPosn + st->m_pLen; k++ ) { if ( st->m_isfound[k] ){ found = true; break; } } if ( found ) continue; // cannot end on a stop word, punct, right-connected // word QueryWord *qwEnd = &st->m_q->m_qwords[st->m_pPosn + st->m_pLen]; if ( qwEnd->m_isStopWord || qwEnd->m_isPunct || qwEnd->m_rightConnected ) continue; // found someone to start the phrase with // what is the new phrase parms? st->m_a = st->m_wp[st->m_pPosn]; st->m_b = st->m_wp[st->m_pLen + st->m_pPosn]+ st->m_wplen[st->m_pLen + st->m_pPosn]; // also store the tmp char that we are changing st->m_c = *(st->m_b); *(st->m_b) = '\0'; // if it is just a number, don't get recommendation // lest we emabarrass ourselves if ( st->m_pPosn == 0 && is_digit(st->m_a[0]) ) { char *k = st->m_a+1; while ( is_digit(*k) ) k++; if ( ! *k ) { *st->m_b = st->m_c ; continue; } } // if it is an adult phrase, don't get a recommendation // check if isAdult really finds a word. char *adultLoc = NULL; if ( isAdult(st->m_a, gbstrlen(st->m_a), &adultLoc) && ( adultLoc == st->m_a || *(adultLoc-1) == ' ' ) ){ // mark as found for ( int32_t k = st->m_pPosn; k <= st->m_pPosn + st->m_pLen; k++ ) st->m_isfound[k] = true; *(st->m_b) = st->m_c; continue; } // if the phrase is in dict or in the top pop words, // phrase is found. Don't check if we are narrowing // the phrase because we need to multicast anyways uint64_t h ; h = hash64d(st->m_a, gbstrlen(st->m_a) ); if ( !st->m_narrowPhrase && getPhrasePopularity( st->m_a, h, false ) > 0 ){ // mark as found for ( int32_t k = st->m_pPosn; k <= st->m_pPosn + st->m_pLen; k++ ) st->m_isfound[k] = true; *(st->m_b) = st->m_c; continue; } launchPhrase = true; break; } if ( launchPhrase ) break; st->m_pPosn = st->m_startQword; } if ( st->m_pLen < 0 ){ return true; } // debug msg log(LOG_DEBUG,"speller: ----------"); log(LOG_DEBUG,"speller: Checking phrase=%s", st->m_a); // launch for all the splits st->m_numRequests = 0; st->m_numReplies = 0; int32_t hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits; // don't send to twins... hostsPerSplit /= g_hostdb.m_numHostsPerShard; int32_t mySplit = g_hostdb.m_hostId % g_hostdb.m_indexSplits; int32_t key = st->m_q->getQueryHash();//0; int32_t timeout = 30; int32_t niceness = 0; char request[MAX_FRAG_SIZE + 4]; char *p = request; *(bool *)p = st->m_narrowPhrase; p += sizeof(bool); strcpy ( p, st->m_a ); // send the null end too p += gbstrlen(st->m_a)+1; int32_t plen = p - request; for ( int32_t i = 0; i < hostsPerSplit; i++ ){ // get the hostId of the host we're sending to uint32_t hostId = mySplit + ( i * g_hostdb.m_indexSplits ); Host *h = g_hostdb.getHost(hostId); st->m_mcast[i].reset(); bool status = st->m_mcast[i]. send(request , plen , // request size 0x3d , // msgType 0x3d false , // multicast owns m_request? h->m_groupId, // group to send to (groupKey) false , // send to whole group? key , st , // state data NULL , // state data gotSpellerReplyWrapper , timeout , // in seconds niceness , false , // realtime? -1 , // m_q->m_bestHandlingHostId , NULL , // m_replyBuf , 0 , // MSG39REPLYSIZE, // this is true if multicast should free // the // reply, otherwise caller is responsible // for freeing it after calling // getBestReply). // actually, this should always be false, // there // is a bug in Multicast.cpp. false ); if (!status){ st->m_numReplies++; log("speller: Multicast had error: %s", mstrerror(g_errno)); st->m_errno = g_errno; continue; } // blocked else st->m_numRequests++; } if ( st->m_numReplies == st->m_numRequests ) return true; return false; } void gotSpellerReplyWrapper( void *state, void *state2 ){ StateFrag *stFrag = (StateFrag *) state; stFrag->m_numReplies++; if ( stFrag->m_numReplies < stFrag->m_numRequests ) return; // blocked if ( !g_speller.gotSpellerReply(stFrag) ) return; StateSpeller *st = (StateSpeller *)stFrag->m_state; // One more frag received st->m_numFragsReceived++; if ( st->m_numFragsReceived < st->m_numFrags ) return; g_speller.gotFrags(st); // callback st->m_callback( st->m_state ); // delete state mdelete ( st, sizeof(StateSpeller), "StateSpeller" ); delete (st); } bool Speller::gotSpellerReply( StateFrag *st ){ int32_t minScore = LARGE_SCORE; int32_t maxPop = -1; char *bestReco = NULL; char *reply[MAX_UNIQUE_HOSTS_PER_SPLIT]; int32_t replySize[MAX_UNIQUE_HOSTS_PER_SPLIT]; int32_t replyMaxSize[MAX_UNIQUE_HOSTS_PER_SPLIT]; bool freeit; bool found = false; //phrase was found in dict or pop words int32_t hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits; // don't send to twins... hostsPerSplit /= g_hostdb.m_numHostsPerShard; int32_t numNarrowPhrases[MAX_UNIQUE_HOSTS_PER_SPLIT]; char *narrowPtrs[MAX_UNIQUE_HOSTS_PER_SPLIT]; // init narrowSearch arrays for ( int32_t i = 0; i < MAX_UNIQUE_HOSTS_PER_SPLIT; i++ ){ numNarrowPhrases[i] = 0; narrowPtrs[i] = NULL; } for ( int32_t i = 0; i < hostsPerSplit; i++ ){ reply[i] = st->m_mcast[i].getBestReply( &replySize[i] , &replyMaxSize[i] , &freeit ); // multicast may have an empty reply buffer if there was an // OOM error or something. m_errno should have been set, but // we have to loop through all the multicasts to free the // reply buffers. char *p = reply[i]; if ( g_errno || st->m_errno || !p){ continue; } // was is found in dict bool foundInDict = *(bool *)p; p += sizeof(bool); if ( foundInDict ) found = true; // first is if there is a recommendation or not bool recommendation = *(bool *) p; p += sizeof (bool); if ( !recommendation && !st->m_narrowPhrase ) continue; int32_t score = *(int32_t *)p; p += 4; int32_t pop = *(int32_t *)p; p += 4; if ( recommendation ){ log ( LOG_DEBUG,"speller: Received reco %s, " "score=%"INT32", pop=%"INT32"", p, score, pop ); // we have a recommendation with score and pop // choose the one with the lowest score, and if the // score is same then the max pop // HACK: we are getting bad recommendations for smaller // popularities. So don't consider them if ( pop > 8 && ( score < minScore || ( score == minScore && pop > maxPop ) ) ){ bestReco = p; minScore = score; maxPop = pop; } } p += gbstrlen(p) + 1; if ( st->m_narrowPhrase ){ numNarrowPhrases[i] = *(int32_t *)p; p += 4; narrowPtrs[i] = p; } } // merge all the narrow results if ( st->m_narrowPhrase ){ int32_t currPhrase[MAX_UNIQUE_HOSTS_PER_SPLIT]; for ( int32_t i = 0; i < MAX_UNIQUE_HOSTS_PER_SPLIT; i++ ) currPhrase[i] = 0; for ( int32_t i = 0; i < MAX_NARROW_SEARCHES; i++ ){ int32_t maxHost = -1; int32_t maxPop = 0; for ( int32_t j = 0; j < hostsPerSplit; j++ ){ if ( numNarrowPhrases[j] <= currPhrase[j] ) continue; int32_t pop = *(int32_t *)narrowPtrs[j]; if ( pop <= maxPop ) continue; maxPop = pop; maxHost = j; } if ( maxHost < 0 ) break; // narrowPtrs[maxHost] += 4; strcpy( st->m_narrowPhrases[i], narrowPtrs[maxHost] ); narrowPtrs[maxHost] +=gbstrlen(narrowPtrs[maxHost]) + 1; currPhrase[maxHost]++; st->m_numNarrowPhrases++; } } // make narrowPhrase false here, so that its not launched a second time // for the same frag; st->m_narrowPhrase = false; // revert *(st->m_b) = st->m_c; // if we found a recommendation,or if the phrase was found in the // dictionary or pop words then mark all the // words that fall under the phrase as found if ( found || bestReco ){ for ( int32_t k = st->m_pPosn; k <= st->m_pLen + st->m_pPosn; k++ ) st->m_isfound[k] = true; st->m_numFound += st->m_pLen + 1; } // if not found in the dictionary or a recommendation, copy the phrase if ( !found && bestReco){ // this fragment is going to be recommended st->m_recommended = true; // insert our recommendation into the phrase to get a new one char *s1 = st->m_wp[st->m_startQword]; int32_t slen1 = st->m_a - st->m_wp[st->m_startQword]; char *s2 = bestReco; int32_t slen2 = gbstrlen(bestReco); char *s3 = st->m_b ; // store the difference in length between the reco and the // original string int32_t diff = slen2 - ( st->m_b - st->m_a ); int32_t slen3 = st->m_wp[st->m_endQword] + st->m_wplen[st->m_endQword] - st->m_b; if ( slen3 < 0 ) slen3 = 0; int32_t tlen = slen1 + slen2 + slen3 ; if ( tlen > MAX_FRAG_SIZE ){ log(LOG_LOGIC,"speller: buf too small. Fix me 3."); // blocked if ( !launchReco(st) ) return false; return true; } // make substitution and store in "dst" char buf2 [ MAX_FRAG_SIZE]; char *nf = buf2; memcpy ( nf , s1 , slen1 ) ; nf += slen1; memcpy ( nf , s2 , slen2 ) ; nf += slen2; memcpy ( nf , s3 , slen3 ) ; nf += slen3; // don't forget to NULL terminate *nf = '\0'; // debug msg log( LOG_DEBUG,"speller: Trying substitution \"%s\"", buf2 ); strcpy ( st->m_dst , buf2 ); // the pointers might have to be changed if the // recommendation was not of the same length as the words if ( diff != 0 ){ for ( int32_t k = st->m_pLen+st->m_pPosn+1; k <= st->m_endQword; k++ ) st->m_wp[k] += diff; } } // don't forget to free the replies for ( int32_t i = 0; i < hostsPerSplit; i++ ) if ( reply[i] && replyMaxSize[i] > 0 ) mfree( reply[i], replyMaxSize[i], "SpellerReplyBuf" ); // go to the next position in the phrase. if we have reached the end // of the phrase position, decrement the phrase length and start again if ( st->m_pPosn + st->m_pLen >= st->m_endQword - 1 ){ st->m_pLen--; st->m_pPosn = st->m_startQword; } else st->m_pPosn++; if ( !launchReco(st) ) return false; return true; } */ // . break a NULL-terminated string down into a list of ptrs to the words // . return the number of words stored into "wp" /* int32_t Speller::getWords ( const char *s , char *wp [MAX_FRAG_SIZE] , int32_t wplen [MAX_FRAG_SIZE] , bool *isstop ) { int32_t nwp = 0; loop: // skip initial punct while ( *s && ! is_alnum ( *s ) ) s++; // bail if done if ( ! *s ) return nwp; // point to word wp [ nwp ] = (char *)s; // convenience ptr char *ww = (char *)s; // count over it while ( is_alnum ( *s ) ) s++; // how long is the word? int32_t slen = s - wp [ nwp ]; // set length wplen [ nwp ] = slen ; // is it a stop word? if ( isstop ) { // TODO: make the stop words utf8!!! int64_t h = hash64Lower_utf8 ( ww , slen ) ; bool stop = ::isStopWord ( ww , slen , h ) ; // BUT ok if Capitalized or number if ( stop ) { if ( is_digit (ww[0]) ) stop = false; if ( is_cap (ww,slen ) ) stop = false; // e-mail, c file, c. s. lewis if ( slen == 1 && ww[0] != 'a' ) stop = false; } isstop[nwp] = stop; } nwp++; goto loop; } */ /* void Speller::gotFrags( void *state ){ StateSpeller *st = (StateSpeller *) state; char *dptr = st->m_dst; char *nptr = st->m_nrw; bool recommendation = false; Query *q = st->m_q; // . break query down into fragments // . each fragment is a string of words // . quotes and field names will separate fragments // . TODO: make field data in its own fragment int32_t nqw = q->m_numWords; int32_t currFrag = 0; for ( int32_t i = 0 ; i < nqw ; i++ ) { // get a word in the Query to start a fragment with QueryWord *qw = &q->m_qwords[i]; // if he has a phraseSign, put it right away //if ( qw->m_phraseSign ) { // *dptr = qw->m_phraseSign; // dptr++; // } // can he start the phrase? // if he can't start our fragment, just copy over to "dst" if ( !canStart( qw )) { // copy to rp and get next word char *w = qw->m_word; int32_t wlen = qw->m_wordLen; if ( dptr + wlen >= st->m_dend ) { g_errno = EBUFTOOSMALL; continue; } // watch out for LeFtP and RiGhP if ( qw->m_opcode == OP_LEFTPAREN ) *dptr++ = '('; else if ( qw->m_opcode == OP_RIGHTPAREN) *dptr++ = ')'; else if ( qw->m_opcode == OP_PIPE ) *dptr++ = '|'; else { memcpy ( dptr , w , wlen ); dptr += wlen; } *dptr = '\0'; continue; } bool inQuotes = qw->m_inQuotes; char fieldCode = qw->m_fieldCode; // . get longest continual fragment that starts with word #i // . get the following words that can be in a fragment // that starts with word #i // . start of the frag int32_t endQword = i; for ( ; i < nqw ; i++ ) { // . skip if we should // . keep punct, however QueryWord *qw1 = &q->m_qwords[i]; if ( qw1->m_opcode ) break; if ( qw1->m_inQuotes != inQuotes ) break; if ( qw1->m_fieldCode != fieldCode ) break; if ( qw1->m_ignoreWord== IGNORE_FIELDNAME ) break; if ( qw1->m_phraseSign && !qw1->m_rightConnected ) break; // are we punct? if ( ! is_alnum_utf8 (qw1->m_word) ) endQword = i - 1; else endQword = i; } // revisit this i in big loop since we did not include it i = endQword; // OOM errors might cause us not to launch frags if ( currFrag >= st->m_numFrags ) continue; StateFrag *stFrag = st->m_stFrag[currFrag]; // don't breech if ( dptr + gbstrlen(stFrag->m_dst) >= st->m_dend ) { g_errno = EBUFTOOSMALL; } else { // store it strcpy ( dptr, stFrag->m_dst ); dptr += gbstrlen ( dptr ); // add a space between fragments // *dptr = ' '; //dptr++; *dptr = '\0'; // set the flag if ( stFrag->m_recommended ) recommendation = true; } // copy over all the narrow searches that can fit for ( int32_t j = 0; j < stFrag->m_numNarrowPhrases; j++ ){ // don't breech if ( nptr +gbstrlen(stFrag->m_narrowPhrases[j]) > st->m_nend ) break; strcpy(nptr, stFrag->m_narrowPhrases[j]); nptr += gbstrlen(stFrag->m_narrowPhrases[j]) + 1; (*st->m_numNarrow)++; } mdelete(stFrag, sizeof(StateFrag), "StateFrag"); delete (stFrag); // now we get the next frag currFrag++; } if ( !recommendation ) *st->m_dst = '\0'; int64_t now = gettimeofdayInMilliseconds(); if ( now - st->m_start > 50 ) log(LOG_INFO,"speller: Took %"INT64" ms to spell check %s", now - st->m_start, st->m_q->getQuery() ); return; } */ bool Speller::generateDicts ( int32_t numWordsToDump , char *coll ){ m_language[2].setLang(2); //m_language[2].generateDicts ( numWordsToDump, coll ); return false; } char *Speller::getRandomWord() { int32_t offset = rand() % m_unifiedBuf.length();//Size; // find nearest \0 char *p = m_unifiedBuf.getBufStart() + offset; // backup until we hit \0 for ( ; p > m_unifiedBuf.getBufStart() && *p ; p-- ); // now advance! if ( p > m_unifiedBuf.getBufStart() ) p++; // that is the word return p; } // The unified dict is the combination of the word list, title rec and the top // query dict of all languages. It has to be created by loading each languages // dict into memory using Language.loadWordList(), loadTitleRecDict(), etc bool Speller::loadUnifiedDict() { bool building = false; reload: bool needRebuild = false; m_unifiedBuf.purge(); m_unifiedBuf.setLabel("unibuf"); // this MUST be there if ( m_unifiedBuf.fillFromFile(g_hostdb.m_dir, "unifiedDict-buf.txt" ) == 0 ) needRebuild = true; // . give it a million slots // . unified dict currently has 1340223 entries m_unifiedDict.set ( 8,4, 2*1024*1024,NULL,0,false,0,"udictht"); // try to load in the hashtable and the buffer directly if ( ! m_unifiedDict.load(g_hostdb.m_dir,"unifiedDict-map.dat")) needRebuild = true; if ( ! needRebuild ) { // convert unifiedBuf \n's to \0's char *start = m_unifiedBuf.getBufStart(); char *end = start + m_unifiedBuf.length(); for ( char *p = start ; p < end ; p++ ) if ( *p == '\n' ) *p = '\0'; log(LOG_DEBUG,"speller: done loading successfully"); // a quick little checksum if ( ! g_conf.m_isLive ) return true; // the size int64_t h1 = m_unifiedDict.getNumSlotsUsed(); int64_t h2 = m_unifiedBuf .length(); int64_t h = hash64 ( h1 , h2 ); char *tail1 = (char *)m_unifiedDict.m_keys; char *tail2 = m_unifiedBuf.getBufStart()+h2-1000; h = hash64 ( tail1 , 1000 , h ); h = hash64 ( tail2 , 1000 , h ); //int64_t n = 8346765853685546681LL; int64_t n = -14450509118443930LL; if ( h != n ) { log("gb: unifiedDict-buf.txt or " "unifiedDict-map.dat " "checksum is not approved for " "live service (%"INT64" != %"INT64")" ,h,n); //return false; } return true; } if ( building ) { log("gb: rebuild failed. exiting."); exit(0); } building = true; log("gb: REBUILDING unifiedDict-buf.txt and unifiedDict-map.dat"); // just in case that was there and the buf wasn't m_unifiedDict.clear(); // or vice versa m_unifiedBuf.purge(); // load the .txt file. this is REQUIRED for rebuild SafeBuf ub; if ( ub.fillFromFile (g_hostdb.m_dir,"unifiedDict.txt") <= 0 ) return false; // // change \n to \0 // TODO: filter out the first word from each line? // char *start = ub.getBufStart(); char *end = start + ub.length(); for ( char *p = start ; p < end ; p++ ) if ( *p == '\n' ) *p = '\0'; // now scan wikitionary file wiktionary-lang.txt to get even // more words! this file is generated from Wiktionary.cpp when // it scans the wiktionary xml dump to generate the other // wiktionary-syns.dat and wiktionary-buf.txt files. it also // cranks this file out because we can use it since we do not // have czech in the unifiedDict.txt file. SafeBuf wkfBuf; if ( wkfBuf.fillFromFile ( g_hostdb.m_dir,"wiktionary-lang.txt") <= 0 ) return false; // scan each line char *p = wkfBuf.getBufStart(); char *pend = p + wkfBuf.length(); HashTableX wkfMap; // true = allow dups. because same word can appear in multiple langs if ( ! wkfMap.set ( 8,1,1000000,NULL,0,true,0,"wkfmap") ) return false; // "fr|livre" is how it's formatted for ( ; p && p < pend ; p = wkfBuf.getNextLine(p) ) { char *start = p; // skip til | for ( ; *p && *p != '|' ; p++ ); // sanity check if ( *p != '|' ) { char *xx=NULL;*xx=0; } // tmp NULL that *p = '\0'; char langId = getLangIdFromAbbr(start); // revert *p = '|'; if ( langId == langUnknown ) continue; if ( langId == langTranslingual ) continue; // skip | p++; // that's the word char *word = p; // find end char *end = p; for ( ; *end && *end != '\n' ; end++ ) ; // so hash it up int64_t wid = hash64d ( word , end - word ); // debug point //if ( wid == 5000864073612302341LL ) // log("download"); // add it to map if ( ! wkfMap.addKey ( &wid , &langId ) ) return false; } // // scan unifiedDict.txt file // int32_t totalCollisions = 0; uint64_t atline = 0; p = start; while ( p < end ) { atline++; char *phrase = p; // if line is a comment skip it if ( *p == '#' ){ p += gbstrlen(p) + 1; continue; } // skip phrase while ( *p != '\t' ) p++; // Null end the phrase *p = '\0'; // skip empty phrases if(gbstrlen(phrase) < 1) { log(LOG_WARN, "spell: Got zero length entry in unifiedDict " "at line %"UINT64", skipping\n", atline); p += gbstrlen(p) + 1; continue; } // skip single byte words that are not alphabetic // Anything over 'Z' is likely unicode, so don't bother if(gbstrlen(phrase) == 1 && (phrase[0] < 'a')) { log(LOG_WARN, "spell: Got questionable entry in " "unifiedDict at line %"UINT64", skipping: %s\n", atline,p); p += gbstrlen(p) + 1; continue; } // . i need to move everything over to utf8!!! // . this is the same hash function used by Words.cpp so that p++; // phonet char *phonet = p; // next is the phonet while ( *p != '\t' ) p++; // Null end the phonet *p = '\0'; p++; uint64_t key = hash64d(phrase,gbstrlen(phrase)); // make sure we haven't added this word/phrase yet if ( m_unifiedDict.isInTable ( &key ) ) { totalCollisions++; p += gbstrlen(p) + 1; continue; } // reset lang vector int64_t pops[MAX_LANGUAGES]; memset ( pops , 0 , MAX_LANGUAGES * 8 ); // see how many langs this key is in in unifiedDict.txt file char *phraseRec = p; getPhraseLanguages2 ( phraseRec , pops ); // make all pops positive if it has > 1 lang already //int32_t count = 0; //for ( int32_t i = 0 ; i < MAX_LANGUAGES ; i++ ) // if ( pops[i] ) count++; int32_t imax = MAX_LANGUAGES; //if ( count <= 1 ) imax = 0; // assume none are in official dict // seems like nanny messed things up, so undo that // and set it negative if in wiktionary in loop below for ( int32_t i = 0 ; i < imax ; i++ ) // HOWEVER, if it is -1 leave it be, i think it // was probably correct in that case for some reason. // Wiktionary fails to get a TON of forms for // many foreign languages in the english dict. // so nanny got these from some dict, so try to // keep them. // like 'abelhudo' // http://pt.wiktionary.org/wiki/abelhudo // and is not in en.wiktionary.org // . NO! because it has "ein" as english with // a -1 popularity as well as "ist"! reconsider if ( pops[i] < -1 ) pops[i] *= -1; //if ( pops[i] < 0 ) pops[i] *= -1; // debug //if ( strcmp(phrase,"download") == 0 ) // log("hey"); // now add in from wiktionary int32_t slot = wkfMap.getSlot ( &key ); for ( ; slot >= 0 ; slot = wkfMap.getNextSlot(slot,&key) ) { uint8_t langId = *(char *)wkfMap.getDataFromSlot(slot); if ( langId == langUnknown ) continue; if ( langId == langTranslingual ) continue; // if it marked as already in that dictionary, cont if ( pops[langId] < 0 ) continue; // if it is positive, make it negative to mark // it as being in the official dictionary // -1 means pop unknown but in dictionary if ( pops[langId] == 0 ) pops[langId] = -1; else pops[langId] *= -1; } // save the offset int32_t offset = m_unifiedBuf.length(); // print the word/phrase and its phonet, if any m_unifiedBuf.safePrintf("%s\t%s\t",phrase,phonet); int32_t count = 0; // print the languages and their popularity scores for ( int32_t i = 0 ; i < MAX_LANGUAGES ; i++ ) { if ( pops[i] == 0 ) continue; // skip "unknown" what does that really mean? if ( i == 0 ) continue; m_unifiedBuf.safePrintf("%"INT32"\t%"INT32"\t", i,(int32_t)pops[i]); count++; } // if none, revert if ( count == 0 ) { m_unifiedBuf.setLength(offset); // skip "p" to next line in unifiedBuf.txt p += gbstrlen(p) + 1; continue; } // trim final tab i guess m_unifiedBuf.incrementLength(-1); // end line m_unifiedBuf.pushChar('\n'); // directly point to the (lang, score) tuples m_unifiedDict.addKey(&key, &offset); // skip "p" to next line in unifiedBuf.txt p += gbstrlen(p) + 1; } log (LOG_WARN,"spell: got %"INT32" TOTAL collisions in unified dict", totalCollisions); HashTableX dedup; dedup.set(8,0,1000000,NULL,0,false,0,"dmdm"); // . now add entries from wkfBuf that were not also in "ub" // . format is "|\n" p = wkfBuf.getBufStart(); end = p + wkfBuf.length(); for ( ; p ; p = wkfBuf.getNextLine(p) ) { //char *langAbbr = p; for ( ; *p && *p !='\n' && *p !='|' ; p++ ); if ( *p != '|' ) { log("speller: bad format in wiktionary-lang.txt"); char *xx=NULL;*xx=0; } //*p = '\0'; //uint8_t langId = getLangIdFromAbbr ( langAbbr ); //*p = '|'; // get word char *word = p + 1; // get end of it for ( ; *p && *p !='\n' ; p++ ); if ( *p != '\n' ) { log("speller: bad format in wiktionary-lang.txt"); char *xx=NULL;*xx=0; } int32_t wordLen = p - word; // wiktinary has like prefixes ending in minus. skip! if ( word[wordLen-1] == '-' ) continue; // suffix in wiktionary? skip if ( word[0] == '-' ) continue; // .zr .dd if ( word[0] == '.' ) continue; // hash the word int64_t key = hash64d ( word , wordLen ); // skip if we did it in the above loop if ( m_unifiedDict.isInTable ( &key ) ) continue; // skip if already did it in this loop if ( dedup.isInTable ( &key ) ) continue; if ( ! dedup.addKey ( &key ) ) return false; // reset lang vector int64_t pops[MAX_LANGUAGES]; memset ( pops , 0 , MAX_LANGUAGES * 8 ); // now add in from wiktionary map int32_t slot = wkfMap.getSlot ( &key ); for ( ; slot >= 0 ; slot = wkfMap.getNextSlot(slot,&key) ) { uint8_t langId = *(char *)wkfMap.getDataFromSlot(slot); if ( langId == langUnknown ) continue; if ( langId == langTranslingual ) continue; if ( pops[langId] ) continue; // -1 means pop unknown but in dictionary pops[langId] = -1; } // save the offset int32_t offset = m_unifiedBuf.length(); // . print the word/phrase and its phonet, if any // . phonet is unknown here... //char *phonet = ""; m_unifiedBuf.safeMemcpy ( word, wordLen ); m_unifiedBuf.safePrintf("\t\t");//word,phonet); int32_t count = 0; // print the languages and their popularity scores for ( int32_t i = 0 ; i < MAX_LANGUAGES ; i++ ) { if ( pops[i] == 0 ) continue; // skip "unknown" what does that really mean? if ( i == 0 ) continue; m_unifiedBuf.safePrintf("%"INT32"\t%"INT32"\t", i,(int32_t)pops[i]); count++; } // if none, revert if ( count == 0 ) { m_unifiedBuf.setLength(offset); continue; } // trim final tab i guess m_unifiedBuf.incrementLength(-1); // end line m_unifiedBuf.pushChar('\n'); // directly point to the (lang, score) tuples m_unifiedDict.addKey(&key, &offset); } // save the text too! a merge of unifiedDict.txt and // wiktionary-lang.txt!!! if ( m_unifiedBuf.saveToFile(g_hostdb.m_dir,"unifiedDict-buf.txt") <=0) return false; // save it if ( m_unifiedDict.save(g_hostdb.m_dir,"unifiedDict-map.dat")<=0 ) return false; // start over and load what we created goto reload; // hmmm... seems like we need to re-run for some reason log("spell: PLEASE RERUN gb"); log("spell: PLEASE RERUN gb"); log("spell: PLEASE RERUN gb"); exit(0); return true; } // in case the language is unknown, just give the pop of the // first found language int32_t Speller::getPhrasePopularity ( char *str, uint64_t h, bool checkTitleRecDict, unsigned char langId ){ //char *xx=NULL;*xx=0; // hack fixes. // common word like "and"? if ( isCommonWord(h) ) return MAX_PHRASE_POP; // another common word check if ( isQueryStopWord(NULL,0,h) ) return MAX_PHRASE_POP; // single letter? if ( str && str[0] && str[1] == '\0' ) return MAX_PHRASE_POP; // 0-99 only if ( str && is_digit(*str) ) { if ( !str[1]) return MAX_PHRASE_POP; if ( is_digit(str[1])&& !str[2]) return MAX_PHRASE_POP; } // what up with this? //if ( !s ) return 0; int32_t slot = m_unifiedDict.getSlot(&h); // if not in dictionary assume 0 popularity if ( slot == -1 ) return 0; //char *p = *(char **)m_unifiedDict.getValueFromSlot(slot); int32_t offset = *(int32_t *)m_unifiedDict.getValueFromSlot(slot); char *p = m_unifiedBuf.getBufStart() + offset; char *pend = p + gbstrlen(p); // skip word itself while ( *p != '\t' ) p++; p++; // skip phonet, if any while ( *p != '\t' ) p++; p++; int32_t max = 0; // the tuples are in ascending order of the langid // get to the right language while ( p < pend ){ int32_t currLang = atoi(p); // the the pops are sorted by langId, return 0 if the lang // was not found if ( langId != langUnknown && currLang > langId ) return 0; // skip language while ( *p != '\t' ) p++; p++; int32_t score = atoi(p); // i think negative scores mean it is only from titlerec and // not in any of the dictionaries. if ( score < 0 ) score *= -1; if ( currLang == langId && langId != langUnknown ) return score; // if lang is unknown get max if ( score > max ) max = score; // skip that score and go to the next tuple while ( *p != '\t' && *p != '\0' ) p++; p++; } return max; } // splits words and checks if they form a porn word or not. montanalinux.org // is showing up as porn because it has 'anal' in the hostname. So try to // find a combination of words such that they are NOT porn. // try this only after isAdult() succeeds. // Always tries to find longer words first. so 'montanalinux' is split as // 'montana' and 'linux' and not as 'mont', 'analinux' // if it finds a seq of words leading upto a porn word, then it returns true // eg. shall split montanalinux into 'mont', 'anal', and return true without // checking if 'inux' is a word. Need to do this because isAdult() cannot // define where an adult word has ended. // TODO: chatswingers.com NOT identified as porn because it is split as // 'chats' and 'wingers'. bool Speller::canSplitWords( char *s, int32_t slen, bool *isPorn, char *splitWords, unsigned char langId, int32_t encodeType ){ //char *xx=NULL;*xx=0; *isPorn = false; char *index[1024]; if ( slen == 0 ) return true; *splitWords = '\0'; // this is the current word we're on int32_t curr = 0; index[curr++] = s; index[curr] = s + slen; while ( curr > 0 ){ char *nextWord = NULL; while ( findNext( index[curr-1], index[curr], &nextWord, isPorn, langId, encodeType ) ){ // next word in chain index[curr++] = nextWord; index[curr] = s + slen; // found a porn word OR // finished making a sequence of words if ( *isPorn || nextWord == s + slen ){ char *p = splitWords; for ( int32_t k = 1; k < curr; k++ ){ memcpy (p, index[k - 1], index[k] - index[k - 1]); p += index[k] - index[k - 1]; *p = ' '; p++; } *p = '\0'; return true; } } // did not find any word. reduce the current position while ( --curr > 0 ){ if ( curr > 0 && index[curr] > index[curr-1] ){ index[curr]--; break; } } } return false; } bool Speller::findNext( char *s, char *send, char **nextWord, bool *isPorn, unsigned char langId, int32_t encodeType ){ //char *xx=NULL;*xx=0; char *loc = NULL; int32_t slen = send - s; // check if there is an adult word in there // NOTE: The word 'adult' gives a lot of false positives, so even // though it is in the isAdult() list, skip it. // s/slen constitues an individual word. if ( isAdult ( s, slen, &loc ) && strncmp ( s, "adult", 5 ) != 0 ){ // if this string starts with the adult word, don't check // further if ( loc == s ){ *isPorn = true; *nextWord = send; return true; } } for ( char *a = send; a > s; a-- ){ // a hack, if the word is only one letter long, check if it // is 'a' or 'i'. If not then continue if ( a - s == 1 && *s != 'a' && *s != 'i') continue; // another hack, the end word of the string cannot be 2 letters // or less. freesex was being split as 'frees ex' if ( a == send && a - s <= 2 ) continue; // do not allow "ult" to be a word because it always will // split "adult" into "ad+ult" if ( a - s == 3 && s[0]=='u' && s[1]=='l' && s[2]=='t' ) continue; // adultsiteratings = "ad ul ts it era tings" if ( a - s == 2 && s[0]=='u' && s[1]=='l' ) continue; // lashaxxxnothing = "lash ax xx nothing" if ( a - s == 2 && s[0]=='u' && s[1]=='l' ) continue; // livesexasian = "lives ex asian" if ( a - s == 2 && s[0]=='e' && s[1]=='x' ) continue; // fuckedtits = "fu ck edt its" if ( a - s == 2 && s[0]=='c' && s[1]=='k' ) continue; // blogsexe = "blogs exe" ... many others // any 3 letter fucking word starting with "ex" if ( a - s == 3 && s[0]=='e' && s[1]=='x' ) continue; // shemales = "*s hem ales" if ( a - s == 4 && s[0]=='a' &&s[1]=='l'&&s[2]=='e'&&s[3]=='s') continue; // grooverotica = "groove rot ica" if ( a - s == 3 && s[0]=='i' && s[1]=='c' && s[2]=='a' ) continue; // dinerotik = dinero tik if ( a - s == 3 && s[0]=='t' && s[1]=='i' && s[2]=='k' ) continue; // nudeslutpics = "nud esl ut pics" if ( a - s == 3 && s[0]=='n' && s[1]=='u' && s[2]=='d' ) continue; // seepornos = "seep or nos" if ( a - s == 3 && s[0]=='n' && s[1]=='o' && s[2]=='s' ) continue; // bookslut = "books lut" if ( a - s == 3 && s[0]=='l' && s[1]=='u' && s[2]=='t' ) continue; // lesexegratuit = "lese xe gratuit" if ( a - s == 2 && s[0]=='x' && s[1]=='e' ) continue; // mooiemensensexdating = "mens ense xd a ting" if ( a - s == 2 && s[0]=='x' && s[1]=='d' ) continue; // mpornlinks = mpo rn links if ( a - s == 2 && s[0]=='r' && s[1]=='n' ) continue; // ukpornbases = ukp or nba bes if ( a - s == 2 && s[0]=='o' && s[1]=='r' ) continue; // slut if ( a - s == 2 && s[0]=='l' && s[1]=='u' ) continue; // independentstockholmescorts = "tock holme sco rts" if ( a - s == 3 && s[0]=='s' && s[1]=='c' && s[2]=='o' ) continue; // relatosexcitantes = relat ose xci tan tes if ( a - s == 3 && s[0]=='x' && s[1]=='c' && s[2]=='i' ) continue; // babe = * bes if ( a - s == 3 && s[0]=='b' && s[1]=='e' && s[2]=='s' ) continue; // xpornreviews "xp orn reviews " if ( a - s == 3 && s[0]=='o' && s[1]=='r' && s[2]=='n' ) continue; // shemal fix if ( a - s == 3 && s[0]=='h' && s[1]=='e' && s[2]=='m' ) continue; // adultswim = adults wim if ( a - s == 3 && s[0]=='w' && s[1]=='i' && s[2]=='m' ) continue; // bdsm if ( a - s == 3 && s[0]=='d' && s[1]=='s' && s[2]=='m' ) continue; // anal if ( a - s == 3 && s[0]=='n' && s[1]=='a' && s[2]=='l' ) continue; // vibrator = bra if ( a - s == 3 && s[0]=='b' && s[1]=='r' && s[2]=='a' ) continue; // sitiospornox = sitio spor nox if ( a - s == 4 && s[0]=='s' && s[1]=='p' && s[2]=='o' && s[3] == 'r' ) continue; // orn* if ( a - s == 4 && s[0]=='o' && s[1]=='r' && s[2]=='n' ) continue; // hotescorts = hote scor if ( a - s == 4 && s[0]=='s' && s[1]=='c' && s[2]=='o' && s[3] == 'r' ) continue; // uniformsluts = uniformts lutz if ( a - s == 4 && s[0]=='l' && s[1]=='u' && s[2]=='t' && s[3] == 'z' ) continue; // free porn login = freep ornl if ( a - s == 5 && s[0]=='f' && s[1]=='r' && s[2]=='e' && s[3] == 'e' && s[4] == 'p' ) continue; // shemal fix if ( a - s == 5 && s[0]=='h' && s[1]=='e' && s[2]=='m' && s[3] == 'a' && s[4] == 'l' ) continue; // inbondage = inbond age if ( a - s == 6 && s[0]=='i' && s[1]=='n' && s[2]=='b' && s[3]=='o' && s[4]=='n' && s[5]=='d' ) continue; // swingers = wingers if ( a - s == 7 && s[0]=='w' && s[1]=='i' && s[2]=='n' && s[3]=='g' && s[4]=='e' && s[5]=='r' && s[6]=='s' ) continue; // free sex contents = freese xc ont ents if ( a - s == 2 && s[0]=='x' && s[1]=='c' ) continue; // mosexstore = mose xs tore if ( a - s == 2 && s[0]=='x' && s[1]=='s' ) continue; // phonesexfootsies if ( a - s == 8 && s[0]=='p' && s[1]=='h' && s[2]=='o' && s[3]=='n' && s[4]=='e' && s[5]=='s' && s[6]=='e' && s[7]=='x' ) continue; // cybersex if ( a - s == 8 && s[0]=='c' && s[1]=='y' && s[2]=='b' && s[3]=='e' && s[4]=='r' && s[5]=='s' && s[6]=='e' && s[7]=='x' ) continue; // hotescorts // check if the word has popularity. if it is in the // unifiedDict, then it is considered to be a word uint64_t h = hash64d(s, a-s);//a - s, encodeType); int32_t pop = getPhrasePopularity(s, h, false, langId); // continue if did not find it if ( pop <= 0 ) continue; // this is our next word *nextWord = a; return true; } return false; } //similar to one above but using recursion /*bool Speller::canSplitWords( char *s, int32_t slen, bool *isPorn, char *splitWords, unsigned char langId, int32_t encodeType ){ if ( slen == 0 ) return true; char *loc = NULL; // check if there is an adult word in there if ( isAdult ( s, slen, &loc ) ){ // if this string starts with the adult word if ( loc == s ){ memcpy ( splitWords, s, slen ); splitWords[slen] = ' '; splitWords[slen + 1] = '\0'; *isPorn = true; return true; } } char *b = s + slen; // split the phrase into two or more phrases. for ( char *a = b; a > s; a-- ){ // while ( a > s ){ // a hack, if the word is only one letter long, check if it // is 'a' or 'i'. If not then continue if ( a - s == 1 && *s != 'a' && *s != 'i') continue; // check if the word has popularity. if it is in the // unifiedDict, then it is considered to be a word uint64_t h = hash64d(s, a - s, encodeType); int32_t pop = getPhrasePopularity(s, h, false, langId); // continue if did not find it if ( pop <= 0 ) continue; memcpy ( splitWords, s, a - s ); splitWords[a - s] = ' '; splitWords[a - s + 1] = '\0'; // see if we can split the rest if ( canSplitWords ( a, b - a, isPorn, splitWords + (a - s + 1), langId, encodeType ) ) return true; } // did not find any sequence of words that can make this string return false; }*/ bool Speller::createUnifiedDict (){ // first get all the tuples from wordlist and query file //HashTableT ht[MAX_LANGUAGES]; HashTableX ht[MAX_LANGUAGES]; char ff[1024]; for ( int32_t i = 0; i < MAX_LANGUAGES; i++ ){ ht[i].set ( 8,4,0,NULL,0,false,0,"cud"); sprintf ( ff , "%sdict/%s/%s.wl.phonet", g_hostdb.m_dir, getLanguageAbbr(i), getLanguageAbbr(i) ); populateHashTable(ff, &ht[i], i); sprintf ( ff , "%sdict/%s/%s.query.phonet.top", g_hostdb.m_dir, getLanguageAbbr(i), getLanguageAbbr(i) ); populateHashTable(ff, &ht[i], i); for ( int32_t j = 0; j < NUM_CHARS; j++ ){ sprintf ( ff , "%sdict/%s/%s.dict.%"INT32"", g_hostdb.m_dir, getLanguageAbbr(i), getLanguageAbbr(i), j ); populateHashTable(ff, &ht[i], i); } } //sprintf ( ff, "%sdict/unifiedDict",g_hostdb.m_dir ); sprintf ( ff, "%sunifiedDict.txt",g_hostdb.m_dir ); // delete it first unlink ( ff ); // then open a new one for appending int fdw = open ( ff , O_CREAT | O_RDWR | O_APPEND , S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH); if ( fdw < 0 ){ return log("lang: Could not open for %s " "writing: %s.",ff, strerror(errno)); } log(LOG_INIT,"spell: Making %s.", ff ); //HashTableT phrases; HashTableX phrases; phrases.set(8,4,0,NULL,0,false,0,"phud"); char buf[1024]; for ( int32_t i = 0; i < MAX_LANGUAGES; i++ ){ // get each slot for ( int32_t j = 0; j < ht[i].getNumSlots(); j++ ){ uint64_t key = *(uint64_t *)ht[i].getKey(j); if ( key == 0 ) continue; // if key is already found int32_t slot = phrases.getSlot(&key); if ( slot != -1 ) continue; char *tuple = *(char **)ht[i].getValueFromSlot(j); // here we print the phrase and the phonet if present // skip the score while ( *tuple != '\t' ) tuple++; tuple++; sprintf( buf, "%s", tuple ); char *p = buf; p += gbstrlen(buf); // if there wasn't a phonet, its from the titleRec. // add another tab bool fromTitleRec = false; if ( strstr (tuple,"\t") == NULL ){ *p = '\t'; p++; fromTitleRec = true; } for ( int32_t k = 0; k < MAX_LANGUAGES; k++ ){ slot = ht[k].getSlot(&key); if ( slot == -1 ) continue; char *val = *(char **)ht[k].getValueFromSlot(slot); int32_t pop = atoi(val); if ( fromTitleRec ) pop *= -1; sprintf(p,"\t%"INT32"\t%"INT32"",k,pop); p += gbstrlen(p); } // write out the trailing \n as well *p = '\n'; p++; *p = '\0'; p++; int32_t bufLen = gbstrlen(buf); int32_t wn = write ( fdw , buf , bufLen ) ; if ( wn != bufLen ) return log("lang: write: %s",strerror(errno)); int32_t val = 1; phrases.addKey(&key, &val); } } return true; } bool Speller::populateHashTable( char *ff, HashTableX *htable, unsigned char langId ){ File f; f.set(ff); // open file if ( ! f.open ( O_RDONLY ) ) { log("spell: open: %s",mstrerror(g_errno)); return false; } // get file size int32_t fileSize = f.getFileSize() ; int32_t bufSize = fileSize + 1; char *buf = (char *) mmalloc(bufSize, "SpellerTmpBuf"); if (!buf) return false; if ( !f.read(buf, fileSize,0) ){ log("spell: read: %s", mstrerror(g_errno)); return false; } for ( int32_t i = 0; i < bufSize; i++ ){ if ( buf[i] == '\n' ) buf[i] = '\0'; } char *p = buf; while ( p < buf + fileSize ){ char *tuple = p; int32_t score = atoi(p); // many scores in dict have a pop of 0. ignore them if ( score <= 0 ){ p += gbstrlen(p) + 1; continue; } while ( *p != '\t' ) p++; p++; // at the phrase char *phrase = p; while ( *p != '\t' && *p != '\0' ) p++; uint64_t key = hash64d(phrase, p-phrase ); int32_t slot = htable->getSlot(&key); if ( slot == -1 ) htable->addKey(&key,&tuple); p += gbstrlen(p) + 1; } return true; } // This isn't really much use except for the spider // language detection to keep from making 32 sequential // calls for the same phrase to isolate the language. char *Speller::getPhraseRecord(char *phrase, int len ) { //char *xx=NULL;*xx=0; if ( !phrase ) return NULL; //char *rv = NULL; int64_t h = hash64d(phrase, len); int32_t slot = m_unifiedDict.getSlot(&h); //log("speller: h=%"UINT64" len=%i slot=%"INT32"",h,len,slot); if ( slot < 0 ) return NULL; //rv = *(char **)m_unifiedDict.getValueFromSlot(slot); int32_t offset = *(int32_t *)m_unifiedDict.getValueFromSlot(slot); char *p = m_unifiedBuf.getBufStart() + offset; return p; } /* uint8_t Speller::getUniqueLang ( int64_t *wid ) { int32_t slot = m_unifiedDict.getSlot(wid); if (slot < 0) return langUnknown; //char *p = *(char **)m_unifiedDict.getValueFromSlot(slot); int32_t offset = *(int32_t *)m_unifiedDict.getValueFromSlot(slot); char *p = m_unifiedBuf.getBufStart() + offset; int32_t langId = langUnknown; char langCount = 0; // skip over word for ( ; *p && *p != '\t' ; ) p++; // nothing after? if ( !*p ) return langUnknown; // skip tab p++; // skip over phonet for ( ; *p && *p != '\t' ; ) p++; // nothing after? if ( !*p ) return langUnknown; // skip tab p++; // loop over langid/pop pairs while ( *p ) { // get langid langId = atoi(p); // skip to next delimiter for ( ; *p && *p != '\t' ; p++ ); // error? if ( ! *p ) break; // skip tab p++; // error? if ( ! *p ) break; // . if pop is zero ignore it // . we now set pops to zero when generating // unifiedDict-buf.txt if they are not in the wiktionary // map for that language. seems like to many bad entries // were put in there by john nanny. //char pop = 1; //if ( *p == '0' ) pop = 0; // require it be in the official dictionary here bool official; if ( *p == '-' ) official = true; else official = false; // skip pop for ( ; *p && *p != '\t' ; p++ ); // multi lang count if ( langId != langUnknown && official ) langCount++; // no unique lang //if ( langCount >= 2 ) return langTranslingual; if ( langCount >= 2 ) return langUnknown; // done? if ( ! *p ) break; // skip tab p++; } // unique lang! return langId; } */ int64_t Speller::getLangBits64 ( int64_t *wid ) { int32_t slot = m_unifiedDict.getSlot(wid); if (slot < 0) return 0LL; int32_t offset = *(int32_t *)m_unifiedDict.getValueFromSlot(slot); char *p = m_unifiedBuf.getBufStart() + offset; // skip over word for ( ; *p && *p != '\t' ; ) p++; // nothing after? if ( !*p ) return 0LL; // skip tab p++; // skip over phonet for ( ; *p && *p != '\t' ; ) p++; // nothing after? if ( !*p ) return 0LL; // skip tab p++; // init int64_t bits = 0LL; // loop over langid/pop pairs while ( *p ) { // get langid uint8_t langId = atoi(p); // skip to next delimiter for ( ; *p && *p != '\t' ; p++ ); // error? if ( ! *p ) break; // skip tab p++; // error? if ( ! *p ) break; // . if pop is zero ignore it // . we now set pops to zero when generating // unifiedDict-buf.txt if they are not in the wiktionary // map for that language. seems like to many bad entries // were put in there by john nanny. //char pop = 1; // if not official, cancel it? if ( *p != '-' ) langId = langUnknown; // skip pop for ( ; *p && *p != '\t' ; p++ ); // multi lang count //if ( langId != langUnknown ) langCount++; // no unique lang //if ( langCount >= 2 ) return langTranslingual; if ( langId != langTranslingual && langId != langUnknown ) // make english "1" bits |= 1LL << (langId-1); // done? if ( ! *p ) break; // skip tab p++; } return bits; } /* int64_t *Speller::getPhraseLanguages(char *phrase, int len ) { //char *xx=NULL;*xx=0; char *phraseRec = getPhraseRecord(phrase, len ); if(!phraseRec) return(NULL); int64_t *rv = (int64_t *)mmalloc(sizeof(int64_t) * MAX_LANGUAGES, "PhraseRec"); if(!rv) return(NULL); if(!getPhraseLanguages(phrase, len, rv)) { mfree(rv, sizeof(int64_t) * MAX_LANGUAGES, "PhraseRec"); return(NULL); } return(rv); } */ bool Speller::getPhraseLanguages(char *phrase, int len, int64_t *array) { //char *xx=NULL;*xx=0; char *phraseRec = getPhraseRecord(phrase, len); if(!phraseRec || !array) return false; return getPhraseLanguages2 ( phraseRec,array ); } bool Speller::getPhraseLanguages2 (char *phraseRec , int64_t *array) { int64_t l = 0; memset(array, 0, sizeof(int64_t)*MAX_LANGUAGES); while(*phraseRec) { l = 0; // skip leading whitespace while(*phraseRec && (*phraseRec == ' ' || *phraseRec == '\t')) phraseRec++; if(!*phraseRec) break; int64_t l = atoi(phraseRec); // l = abs(l); // not using score method anymore, so this is moot. // skip to next delimiter // while(*phraseRec && *phraseRec != '\t') phraseRec++; if(!(phraseRec = strchr(phraseRec, '\t'))) break; // skip tab phraseRec++; if(!*phraseRec) break; // wtf? if ( *phraseRec == '\t' ) return true; // Save score array[l] = atoi(phraseRec); // skip to next delimiter // while(*phraseRec && *phraseRec != '\t') phraseRec++; if(!(phraseRec = strchr(phraseRec, '\t'))) break; // skip over tab if(*phraseRec == '\t') phraseRec++; } return(true); } bool Speller::getSynsInEnglish ( char *w , int32_t wlen , char nativeLang , char wikiLang ) { // no digits please! if ( is_digit(w[0]) ) return false; char *p = getPhraseRecord(w,wlen); if ( ! p ) return false; bool inEnglish = false; // skip word for ( ; *p != '\t' ; p++ ); // skip tab p++; // skip phonet for ( ; *p != '\t' ; p++ ); // skip tab p++; for ( ; *p ; ) { // end of line? if ( !*p ) return inEnglish; // get language id int32_t l = atoi(p); // english? //if ( l == langEnglish ) inEnglish = true; //if ( l > langEnglish && ! inEnglish ) return false; //if ( l == nativeLang ) return false; // skip langid for ( ; *p && *p != '\t' ; p++ ); // end of line? if ( !*p ) return inEnglish; // skip tab p++; // . get popularity. if not negative undo inEnglish. // . it has to be negative because that means it is in the // OFFICIAL wiktionary dictionary for that language if ( l == langEnglish && p[0] == '-' ) inEnglish = true; // if this word is in the doc's primary/native language // then do not try to get english synonyms of it if ( l == nativeLang && p[0] == '-' ) return false; // no chance? it MUST be in english, and these are // sorted by langid... if ( l > langEnglish && ! inEnglish ) return false; // skip popularity for ( ; *p && *p != '\t' ; p++ ); // no more? if ( ! *p ) return inEnglish; // skip tab p++; } return inEnglish; } /* static inline int s_findMaxVal(int64_t *vals, int numVals) { int64_t max, oldmax, val; if(!vals) return(0); max = oldmax = INT_MIN; val = 0; for(int x = 0; x < numVals; x++) { if(vals[x] >= max) { oldmax = max; max = vals[x]; val = x; } } if(oldmax == max) return(0); return(val); } char Speller::getPhraseLanguage(char *phrase, int len) { //char *xx=NULL;*xx=0; char lang; int64_t *langs = getPhraseLanguages(phrase, len); if(!langs) return(0); lang = s_findMaxVal(langs, MAX_LANGUAGES); if ( lang < 0 ) { char *xx=NULL;*xx=0; } if(langs[(uint8_t)lang] == 0) lang = 0; mfree(langs, sizeof(int) * MAX_LANGUAGES, "PhraseRec"); return(lang); } */ void Speller::dictLookupTest ( char *ff ){ //char *ff = "/tmp/sctest"; FILE *fd = fopen ( ff, "r" ); if ( ! fd ) { log("speller: test: Could not open %s for " "reading: %s.", ff,strerror(errno)); return; } int64_t start = gettimeofdayInMilliseconds(); char buf[1026]; int32_t count = 0; // go through the words while ( fgets ( buf , MAX_FRAG_SIZE , fd ) ) { // length of word(s), including the terminating \n int32_t wlen = gbstrlen(buf) ; // skip if empty if ( wlen <= 0 ) continue; buf[wlen-1]='\0'; uint64_t h = hash64d ( buf, gbstrlen(buf)); int32_t pop = g_speller.getPhrasePopularity(buf, h, true); if ( pop < 0 ){ char *xx = NULL; *xx = 0; } count++; } log ( LOG_WARN,"speller: dictLookupTest took %"INT64" ms to do " "%"INT32" words. Compare against 46-66ms taken for dict/words file.", gettimeofdayInMilliseconds() - start, count ); fclose(fd); }