#include "gb-include.h" #include "PostQueryRerank.h" #include "Msg40.h" #include "LanguageIdentifier.h" #include "sort.h" //#include "Thesaurus.h" //#include "AppendingWordsWindow.h" //#include "Places.h" #include "Profiler.h" #include "CountryCode.h" #include "Phrases.h" #include "Linkdb.h" #define TOTAL_RERANKING_TIME_STR "PostQueryRerank Total Reranking Time" //#define DEBUGGING_LANGUAGE // Type for post query reranking weighted sort list struct M20List { Msg20 *m_m20; //long m_score; rscore_t m_score; //int m_tier; long long m_docId; char m_clusterLevel; //long m_bitScore; long m_numCommonInlinks; uint32_t m_host; }; static int32_t s_firstSortFunction( const M20List * a, const M20List * b ); static int32_t s_reSortFunction ( const M20List * a, const M20List * b ); #ifdef DEBUGGING_LANGUAGE static void DoDump(char *loc, Msg20 **m20, long num, score_t *scores, char *tiers); #endif bool PostQueryRerank::init ( ) { return true; } PostQueryRerank::PostQueryRerank ( ) { //log( LOG_DEBUG, "query:in PQR::PQR() AWL" ); m_enabled = false; m_maxResultsToRerank = 0; m_numToSort = 0; m_m20List = NULL; m_positionList = NULL; m_msg40 = NULL; //m_querysLoc = 0; m_maxUrlLen = 0; m_pageUrl = NULL; m_now = time(NULL); } PostQueryRerank::~PostQueryRerank ( ) { //log( LOG_DEBUG, "query:in PQR::~PQR() AWL" ); if ( m_m20List ) { mfree( m_m20List, sizeof(M20List) * m_maxResultsToRerank, "PostQueryRerank" ); m_m20List = NULL; } if ( m_positionList ) { mfree( m_positionList, sizeof(long) * m_maxResultsToRerank, "PQRPosList" ); m_positionList = NULL; } if ( m_cvtUrl ) mfree( m_cvtUrl, m_maxUrlLen, "pqrcvtUrl") ; if ( m_pageUrl ) mfree( m_pageUrl, sizeof(Url)*m_maxResultsToRerank, "pqrpageUrls" ); } // returns false on error bool PostQueryRerank::set1 ( Msg40 *msg40, SearchInput *si ) { //log(LOG_DEBUG, "query:in PQR::set1(%p) AWL", msg40); m_msg40 = msg40; m_si = si; if ( ! m_msg40 ) return false; if ( ! m_si ) return false; if ( ! m_si->m_cr ) return false; m_enabled = (m_si->m_docsToScanForReranking > 1); //log( LOG_DEBUG, "query: m_isEnabled:%ld; " // "P_docsToScanForReranking:%ld P_pqr_docsToSan:%ld; AWL", // (long)m_enabled, // m_si->m_docsToScanForReranking, // m_si->m_cr->m_pqr_docsToScan ); return m_enabled; } // must be called sometime after we know numDocIds and before preRerank // returns false if we shouldn't rerank bool PostQueryRerank::set2 ( long resultsWanted ) { //log(LOG_DEBUG, "query:in PQR::set2() AWL"); //log( LOG_DEBUG, "query: firstResultNum:%ld; numResults:%ld; " // "wanted:%ld numMsg20s:%ld AWL", // m_msg40->getFirstResultNum(), m_msg40->getNumResults(), // resultsWanted, m_msg40->m_numMsg20s ); // we only want to check the lessor of docsToScan and numDocIds m_maxResultsToRerank = m_si->m_docsToScanForReranking; if ( m_maxResultsToRerank > m_msg40->getNumDocIds() ) { m_maxResultsToRerank = m_msg40->getNumDocIds(); log( LOG_DEBUG, "pqr: request to rerank more results " "than the number of docids, capping number to rerank " "at %ld", m_maxResultsToRerank ); } // If we don't have less results from clustering / deduping or // we have less results in docids then ... if ( m_msg40->getNumResults() < m_msg40->getNumDocIds() && m_msg40->getNumResults() < resultsWanted ) return false; // are we passed pqr's range? 
	if ( m_msg40->getFirstResultNum() > m_maxResultsToRerank )
		return false;

	// Safety check, make sure there are fewer results to rerank
	// than the number of Msg20s
	if ( m_msg40->m_numMsg20s < m_maxResultsToRerank )
		m_maxResultsToRerank = m_msg40->m_numMsg20s;

	//log( LOG_DEBUG, "query: m_maxResultsToRerank:%ld AWL",
	//     m_maxResultsToRerank );

	if ( m_maxResultsToRerank < 2 ) {
		//log( LOG_INFO, "pqr: too few results to rerank" );
		return false;
	}
	if ( m_maxResultsToRerank > 250 ) {
		log( LOG_INFO, "pqr: too many results to rerank, "
		     "capping at 250" );
		m_maxResultsToRerank = 250;
	}

	// see if we are done
	if ( m_msg40->getFirstResultNum() >= m_maxResultsToRerank ) {
		log( LOG_INFO, "pqr: first result is higher than max "
		     "results to rerank" );
		return false;
	}

	// get space for host count table
	m_hostCntTable.set( m_maxResultsToRerank );

	// get some space for dmoz table
	m_dmozTable.set( m_maxResultsToRerank << 1 );

	// alloc urls for pqrqttiu, pqrfsh and clustering
	m_pageUrl = (Url *)mcalloc( sizeof(Url)*m_maxResultsToRerank,
				    "pqrpageUrls" );
	if ( ! m_pageUrl ) {
		log("pqr: had out of memory error");
		return false;
	}

	return true;
}

// sets up PostQueryRerank for each page in m_maxResultsToRerank
// returns false on error
bool PostQueryRerank::preRerank ( ) {
	//if ( g_conf.m_profilingEnabled ) {
	//	g_profiler
	//		.startTimer((long)(this->*(&PostQueryRerank::rerank)),
	//			    TOTAL_RERANKING_TIME_STR );
	//}
	//log( LOG_DEBUG, "query:in PQR::preRerank() AWL" );

#ifdef DEBUGGING_LANGUAGE
	DoDump( "Presort", m_msg40->m_msg20, m_maxResultsToRerank,
		m_msg40->m_msg3a.m_scores,
		NULL);//m_msg40->m_msg3a.m_tiers );
#endif

	if( m_si->m_enableLanguageSorting && !m_si->m_langHint )
		log( LOG_INFO, "pqr: no language set for sort. "
		     "language will not be reranked" );

	SANITYCHECK( ! m_m20List );
	m_m20List = (M20List*)mcalloc( sizeof(M20List) * m_maxResultsToRerank,
				       "PostQueryRerank" );
	if( ! m_m20List ) {
		log( LOG_INFO, "pqr: Could not allocate PostQueryRerank "
		     "sort memory.\n" );
		g_errno = ENOMEM;
		return(false);
	}

	SANITYCHECK( ! m_positionList );
	m_positionList = (long *)mcalloc( sizeof(long) * m_maxResultsToRerank,
					  "PQRPosList" );
	if( ! m_positionList ) {
		log( LOG_INFO, "pqr: Could not allocate PostQueryRerank "
		     "position list memory.\n" );
		g_errno = ENOMEM;
		return(false);
	}

	//log(LOG_DEBUG, "pqr: the query is '%s' AWL", m_si->m_q->m_orig);

	// setup for rerankNonLocationSpecificQueries if enabled
	//if ( ! preRerankNonLocationSpecificQueries() )
	//	return false;

	// . make a temp hash table for iptop
	// . each slot is a long key and a long value
	HashTable ipTable;
	// how many slots
	long numSlots = 5000 / ((4+4)*4);
	char tmp[5000];
	// this should NEVER need to allocate, UNLESS for some reason we got
	// a ton of inlinking ips
	if ( ! ipTable.set ( numSlots , tmp , 5000 ) ) return false;

	// this table maps a docid to the number of search results it links to
	HashTableT <long long, long> inlinkTable;
	char tmp2[5000];
	long numSlots2 = 5000 / ((8+4)*4);
	if ( ! inlinkTable.set ( numSlots2 , tmp2 , 5000 ) ) return false;

	// Fill sort array
	long y = 0;
	for( long x = 0;
	     x < m_msg40->m_numMsg20s && y < m_maxResultsToRerank;
	     x++ ) {
		// skip clustered out results
		char clusterLevel = m_msg40->getClusterLevel( x );
		if ( clusterLevel != CR_OK ) {
			//log( LOG_DEBUG, "pqr: skipping result "
			//     "%ld since cluster level(%ld) != "
			//     "CR_OK(%ld) AWL",
			//     x, (long)clusterLevel, (long)CR_OK );
			continue;
		}

		// skip results that don't match all query terms
		//long bitScore = m_msg40->getBitScore( x );
		//if ( bitScore == 0x00 ) continue;

		// . save position of this result so we can fill it in later
		//   with (possibly) a higher ranking result
		m_positionList[y] = x;

		M20List *sortArrItem = &m_m20List [ y ];
		sortArrItem->m_clusterLevel = clusterLevel ;
		sortArrItem->m_m20          = m_msg40->m_msg20 [ x ];
		sortArrItem->m_score        = (rscore_t)m_msg40->getScore(x);
		//sortArrItem->m_tier       = m_msg40->getTier ( x );
		sortArrItem->m_docId        = m_msg40->getDocId ( x );
		//sortArrItem->m_bitScore   = bitScore ;
		sortArrItem->m_host         = 0; // to be filled in later

		Msg20 *msg20 = sortArrItem->m_m20;
		SANITYCHECK( msg20 && ! msg20->m_errno );
		Msg20Reply *mr = msg20->m_r;

		// set the urls for each page
		// used by pqrqttiu, pqrfsh and clustering
		m_pageUrl[y].set( mr->ptr_ubuf , false );

		// now fill in host without the 'www.' if present
		char *host    = m_pageUrl[y].getHost();
		long  hostLen = m_pageUrl[y].getHostLen();
		if (hostLen > 4 && host[3] == '.' &&
		    host[0] == 'w' && host[1] == 'w' && host[2] == 'w')
			sortArrItem->m_host = hash32(host+4, hostLen-4);
		else
			sortArrItem->m_host = hash32(host, hostLen);

		// add its inlinking docids into the hash table, inlinkTable
		LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks;
		//long       n         = msg20->getNumInlinks   ();
		//long long *docIds    = msg20->getInlinkDocIds ();
		//char      *flags     = msg20->getInlinkFlags  ();
		//long      *ips       = msg20->getInlinkIps    ();
		//char      *qualities = msg20->getInlinkQualities ();

		// skip adding the inlinking docids if search result has bad ip
		long ip = mr->m_ip;//msg20->getIp();
		bool good = true;
		if ( ip ==  0 ) good = false;
		if ( ip == -1 ) good = false;

		// . skip the inlinker add if we already did this "ip top"
		// . "ip top" is the most significant 3 bytes of the ip
		// . get the ip top of the docid:
		long top = iptop ( ip );
		// if we already encountered a higher-scoring search result
		// with the same iptop, do not count its inlinkers!
		// so that if an inlinker links to two docids in the search
		// results, where those two docids are from the same
		// "ip top" then the docid is only "counted" once here.
		if ( ipTable.getSlot ( top ) >= 0 ) good = false;
		// not allowed to be 0
		if ( top == 0 ) top = 1;
		// now add to table so we do not add the inlinkers from
		// any other search results from the same "ip top"
		if ( ! ipTable.addKey ( top , 1 ) ) return false;

		// now hash all the inlinking docids into inlinkTable
		for ( Inlink *k=NULL; good && (k=info->getNextInlink(k) ) ; ) {
			// skip it if it is link spam though
			if ( k->m_isLinkSpam ) continue;
			// must be quality of 35 or higher to "vote"
			//if ( k->m_docQuality < 35 ) continue;
			if ( k->m_siteNumInlinks < 20 ) continue;
			// skip if bad ip for inlinker
			if ( k->m_ip == 0 || k->m_ip == -1 ) continue;
			// skip if inlinker has same top ip as search result
			if ( iptop(k->m_ip) == top ) continue;
			// get the current slot in table from docid of inlinker
			long slot = inlinkTable.getSlot ( k->m_docId );
			// get the score
			if ( slot >= 0 ) {
				long count=inlinkTable.getValueFromSlot(slot);
				inlinkTable.setValue ( slot , count + 1 );
				continue;
			}
			// add it fresh if not already in there
			if (!inlinkTable.addKey(k->m_docId,1)) return false;
		}

		//log( LOG_DEBUG, "pqr: pre: setting up sort array - "
		//     "mapping x:%ld to y:%ld; "
		//     "url:'%s' (%ld); tier:%d; score:%ld; "
		//     "docId:%lld; clusterLevel:%d; AWL",
		//     x, y,
		//     msg20->getUrl(), msg20->getUrlLen(),
		//     sortArrItem->tier, sortArrItem->score,
		//     sortArrItem->docId, sortArrItem->clusterLevel );

		// setup reranking for pages from the same host (pqrfsd)
		if ( !
preRerankOtherPagesFromSameHost( &m_pageUrl[y] )) return false; // setup reranking for pages with common topics in dmoz (pqrctid) if ( ! preRerankCommonTopicsInDmoz( mr ) ) return false; // . calculate maximum url length in pages for reranking // by query terms or topics in a url long urlLen = mr->size_ubuf - 1;//msg20->getUrlLen(); if ( urlLen > m_maxUrlLen ) m_maxUrlLen = urlLen; // update num to rerank and sort m_numToSort++; y++; } // get the max m_maxCommonInlinks = 0; // how many of OUR inlinkers are shared by other results? for ( long i = 0; i < m_numToSort; i++ ) { // get the item M20List *sortArrItem = &m_m20List [ i ]; Msg20 *msg20 = sortArrItem->m_m20; // reset sortArrItem->m_numCommonInlinks = 0; // lookup its inlinking docids in the hash table //long n = msg20->getNumInlinks (); //long long *docIds = msg20->getInlinkDocIds (); LinkInfo *info = (LinkInfo *)msg20->m_r->ptr_linkInfo; for ( Inlink *k=NULL;info&&(k=info->getNextInlink(k)) ; ) { // how many search results does this inlinker link to? long*v=(long *)inlinkTable.getValuePointer(k->m_docId); if ( ! v ) continue; // if only 1 result had this as an inlinker, skip it if ( *v <= 1 ) continue; // ok, give us a point sortArrItem->m_numCommonInlinks++; } // get the max if ( sortArrItem->m_numCommonInlinks > m_maxCommonInlinks ) m_maxCommonInlinks = sortArrItem->m_numCommonInlinks; } // . setup reranking for query terms or topics in url (pqrqttiu) // . add space to max url length for terminating NULL and allocate // room for max length m_maxUrlLen++; m_cvtUrl = (char *)mmalloc( m_maxUrlLen, "pqrcvtUrl" ); if ( ! m_cvtUrl ) { log( LOG_INFO, "pqr: Could not allocate %ld bytes " "for m_cvtUrl.", m_maxUrlLen ); g_errno = ENOMEM; return false; } // Safety valve, trim sort results if ( m_numToSort > m_maxResultsToRerank ) m_numToSort = m_maxResultsToRerank; //log( LOG_DEBUG, "pqr::m_numToSort:%ld AWL", m_numToSort ); return true; } // perform actual reranking of m_numToSort pages // returns false on error bool PostQueryRerank::rerank ( ) { //log(LOG_DEBUG,"query:in PQR::rerank() AWL"); if(m_si->m_debug||g_conf.m_logDebugPQR ) logf( LOG_DEBUG, "pqr: reranking %ld results", m_numToSort ); /* float maxDiversity = 0; if(m_si->m_pqr_demFactSubPhrase > 0) { for ( long x = 0; x < m_numToSort; x++ ) { M20List *sortArrItem = &m_m20List [ x ]; Msg20 *msg20 = sortArrItem->m_m20; if ( ! msg20 || msg20->m_errno ) continue; float d = msg20->m_r->m_diversity; if(d > maxDiversity) maxDiversity = d; } } float maxProximityScore = 0; float minProximityScore = -1.0; //float maxInSectionScore = 0; if(m_si->m_pqr_demFactProximity > 0 || m_si->m_pqr_demFactInSection > 0) { //grab the max score so that we know what the max to //demote is. for ( long x = 0; x < m_numToSort; x++ ) { M20List *sortArrItem = &m_m20List [ x ]; Msg20 *msg20 = sortArrItem->m_m20; if ( ! msg20 || msg20->m_errno ) continue; //float d = msg20->m_r->m_inSectionScore; //if(d > maxInSectionScore) // maxInSectionScore = d; // handle proximity float d = msg20->m_r->m_proximityScore; // i think this means it does not have all the query // terms! for 'sylvain segal' we got // www.regalosdirectos.tv/asp2/comparar.asp?cat=36 // in results if ( d == 0.0 ) continue; // . -1 is a bogus proximity // . it means we were not able to find all the terms // because they were in anomalous link text or // meta tags or select tags or whatever... 
so for // now such results will not be demoted to be on the // safe side if ( d == -1.0 ) continue; if ( d > maxProximityScore ) maxProximityScore = d; if ( d < minProximityScore || minProximityScore==-1.0 ) minProximityScore = d; } } */ // rerank weighted sort list for ( register long x = 0; x < m_numToSort; x++ ) { M20List *sortArrItem = &m_m20List [ x ]; Msg20 *msg20 = sortArrItem->m_m20; char *url = NULL; rscore_t score = sortArrItem->m_score; rscore_t startScore = score; // mwells: what is this? if(m_si->m_pqr_demFactOrigScore < 1) { //turn off the indexed score and just use a uniform start score //because I can't get the proximity pqr to overwhelm the //preexisting score. score = 1000000 + (m_numToSort - x) + (long)(score * m_si->m_pqr_demFactOrigScore); startScore = score; } // if don't have a good msg20, skip reranking for this result if ( ! msg20 || msg20->m_errno ) continue; url = msg20->m_r->ptr_ubuf;//getUrl(); if ( ! url ) url = "(none)"; if(m_si->m_debug||g_conf.m_logDebugPQR ) logf(LOG_DEBUG, "pqr: result #%ld:'%s' has initial " "score of %.02f", x, url, (float)startScore ); // resets msg20->m_pqr_old_score = score; msg20->m_pqr_factor_quality = 1.0; msg20->m_pqr_factor_diversity = 1.0; msg20->m_pqr_factor_inlinkers = 1.0; msg20->m_pqr_factor_proximity = 1.0; msg20->m_pqr_factor_ctype = 1.0; msg20->m_pqr_factor_lang = 1.0; // includes country Msg20Reply *mr = msg20->m_r; // demote for language and country score = rerankLanguageAndCountry( score, mr->m_language , mr->m_summaryLanguage, mr->m_country, // id msg20 ); // demote for content-type float htmlFactor = m_si->m_cr->m_pqr_demFactNonHtml; float xmlFactor = m_si->m_cr->m_pqr_demFactXml; long contentType= mr->m_contentType; if ( contentType == CT_XML && xmlFactor > 0 ) { score = score * xmlFactor; msg20->m_pqr_factor_ctype = xmlFactor; } else if ( contentType != CT_HTML && htmlFactor > 0 ) { score = score * htmlFactor; msg20->m_pqr_factor_ctype = htmlFactor; } //if ( score == 1 ) goto finishloop; // demote for fewer query terms or gigabits in url //score = rerankQueryTermsOrGigabitsInUrl( score, // &m_pageUrl[x] ); // . demote for not high quality // . multiply by "qf" for every quality point below 100 // . now we basically do this if we have a wiki title // . float qf = m_si->m_cr->m_pqr_demFactQual; /* if ( m_msg40->m_msg3a.m_oneTitle ) { //long q = msg20->getQuality(); long sni = mr->m_siteNumInlinks; if ( sni <= 0 ) sni = 1; float weight = 1.0; for ( ; sni < 100000 ; sni *= 2 ) weight = weight * 0.95; // apply the weight to the score score = score * weight; // store that for print in PageResults.cpp msg20->m_pqr_factor_quality = weight; } */ // demote for more paths in url score = rerankPathsInUrl( score, msg20->m_r->ptr_ubuf,//getUrl(), msg20->m_r->size_ubuf-1 ); // demote for smallest cat id has a lot of super topics score = rerankSmallestCatIdHasSuperTopics( score, msg20 ); // demote for larger page sizes score = rerankPageSize( score, msg20->m_r->m_contentLen ); // . demote for non location specific queries that have an // an obvious location in gigabits or url //score = rerankNonLocationSpecificQueries( score, // msg20 ); //if ( score == 1 ) goto finishloop; // demote for no cat id score = rerankNoCatId( score, msg20->m_r->size_catIds/4, msg20->m_r->size_indCatIds/4); // demote for no other pages from same host score = rerankOtherPagesFromSameHost( score, &m_pageUrl[x] ); // demote for fewer common topics in dmoz score = rerankCommonTopicsInDmoz( score, msg20 ); // . 
demote for pages with dmoz category names do not // contain a query term //score = rerankDmozCategoryNamesDontHaveQT( score, // msg20 ); // . demote for pages with dmoz category names do not // contain a query term //score = rerankDmozCategoryNamesDontHaveGigabits( score, // msg20 ); // . demote pages for older datedb dates score = rerankDatedbDate( score, msg20->m_r->m_datedbDate ); /* // . demote pages by proximity // . a -1 prox implies did not have any query terms // . see Summary.cpp proximity algo float ps = msg20->m_r->m_proximityScore;//getProximityScore(); if ( ps > 0.0 && m_si->m_pqr_demFactProximity > 0 && minProximityScore != -1.0 ) { // what percent were we of the max? float factor = minProximityScore / ps ; // this can be weighted //factor *= m_si->m_pqr_demFactProximity; // apply the factor to the score score *= factor; // this is the factor msg20->m_pqr_factor_proximity = factor; } // . demote pages by the average of the scores of the // . terms based upon what section of the doc they are in // . mdw: proximity algo should obsolete this //if(maxInSectionScore > 0) // score = rerankInSection( score, // msg20->getInSectionScore(), // maxInSectionScore); // . demote pages which only have the query as a part of a // . larger phrase if ( maxDiversity != 0 ) { float diversity = msg20->m_r->m_diversity; float df = (1 - (diversity/maxDiversity)) * m_si->m_pqr_demFactSubPhrase; score = (rscore_t)(score * (1.0 - df)); if ( score <= 0.0 ) score = 0.001; msg20->m_pqr_factor_diversity = 1.0 - df; } */ // . COMMON INLINKER RERANK // . no need to create a superfluous function call here // . demote pages that do not share many inlinking docids // with other pages in the search results if ( m_maxCommonInlinks>0 && m_si->m_pqr_demFactCommonInlinks){ long nc = sortArrItem->m_numCommonInlinks ; float penalty; // the more inlinkers, the less the penalty penalty = 1.0 -(((float)nc)/(float)m_maxCommonInlinks); // . reduce the penalty for higher quality pages // . they are the most likely to have their inlinkers // truncated //char quality = msg20->getQuality(); float sni = (float)msg20->m_r->m_siteNumInlinks; // decrease penalty for really high quality docs //while ( quality-- > 60 ) penalty *= .95; for ( ; sni > 1000 ; sni *= .80 ) penalty *= .95; // if this parm is 0, penalty will become 0 penalty *= m_si->m_pqr_demFactCommonInlinks; // save old score score = score * (1.0 - penalty); // do not decrease all the way to 0! if ( score <= 0.0 ) score = 0.001; // store it! msg20->m_pqr_factor_inlinkers = 1.0 - penalty; } // finishloop: if(m_si->m_debug || g_conf.m_logDebugPQR ) logf( LOG_DEBUG, "pqr: result #%ld's final " "score is %.02f (-%3.3f%%) ", x, (float)score,100-100*(float)score/startScore ); sortArrItem->m_score = score; } return(true); } // perform post reranking tasks // returns false on error bool PostQueryRerank::postRerank ( ) { //log( LOG_DEBUG, "query:in PQR::postRerank() AWL" ); // Hopefully never happen... //log( LOG_DEBUG, "query: just before sort: " // "m_maxResultsToRerank:%ld m_numToSort:%ld AWL", // m_maxResultsToRerank, m_numToSort); if ( m_numToSort < 0 ) return false; // Sort the array gbmergesort( (void *) m_m20List, (size_t) m_numToSort, (size_t) sizeof(M20List), (int (*)(const void *, const void *))s_firstSortFunction); // move 2nd result from a particular domain to just below the first // result from that domain if it is within 10 results of the first //XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX put this back in after debugging summary rerank! 
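	// Illustrative sketch only: s_firstSortFunction is only declared
	// near the top of this file, so it is presumably defined further
	// down. Any comparator handed to the gbmergesort() call above just
	// has to impose a total order on M20List entries; assuming higher
	// reranked scores should sort first, with docId as a tie-breaker,
	// a compatible comparator would look like:
	//
	//   static int32_t s_firstSortFunction ( const M20List *a,
	//                                        const M20List *b ) {
	//           // bigger reranked score sorts earlier
	//           if ( a->m_score != b->m_score )
	//                   return ( a->m_score > b->m_score ) ? -1 : 1;
	//           // tie-break on docId so the order is deterministic
	//           if ( a->m_docId != b->m_docId )
	//                   return ( a->m_docId < b->m_docId ) ? -1 : 1;
	//           return 0;
	//   }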
//XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX //if (!attemptToCluster()) return false; // Fill result arrays with our reranked results for( long y = 0; y < m_numToSort; y++ ) { M20List *a = &m_m20List [ y ]; long x = m_positionList [ y ]; m_msg40->m_msg20 [ x ] = a->m_m20; //m_msg40->m_msg3a.m_tiers [ x ] = a->m_tier; m_msg40->m_msg3a.m_scores [ x ] = a->m_score; m_msg40->m_msg3a.m_docIds [ x ] = a->m_docId; m_msg40->m_msg3a.m_clusterLevels [ x ] = a->m_clusterLevel; //log( LOG_DEBUG, "pqr: post: mapped y:%ld " // "to x:%ld AWL", // y, x ); } #ifdef DEBUGGING_LANGUAGE DoDump( "Postsort", m_msg40->m_msg20, m_numToSort, m_msg40->m_msg3a.m_scores, NULL );//m_msg40->m_msg3a.m_tiers ); #endif //if ( ! g_conf.m_profilingEnabled ) return true; //if ( ! g_profiler.endTimer( (long)(this->*(&PostQueryRerank::rerank)), // TOTAL_RERANKING_TIME_STR) ) // log( LOG_WARN,"admin: Couldn't add the fn %li", // (long)(this->*(&PostQueryRerank::rerank)) ); return true; } // called if we weren't able to rerank for some reason void PostQueryRerank::rerankFailed ( ) { //if ( g_conf.m_profilingEnabled ) { // if( ! g_profiler // .endTimer( (long)(this->*(&PostQueryRerank::rerank)), // TOTAL_RERANKING_TIME_STR) ) // log(LOG_WARN,"admin: Couldn't add the fn %li", // (long)(this->*(&PostQueryRerank::rerank))); //} } // lsort (pqrlang, pqrlangunk, pqrcntry) // rerank for language, then country rscore_t PostQueryRerank::rerankLanguageAndCountry ( rscore_t score, uint8_t lang, uint8_t summaryLang, uint16_t country , Msg20 *msg20 ) { //log( LOG_DEBUG, "query:in PQR::rerankLanguageAndCountry(" // "score:%ld, lang:%ld, summLang:%ld, country:%ld)" // "[langSortingIsOn:%ld; langUnkWeight:%3.3f; langWeight:%3.3f; " // "&qlang=%ld; &lang=%ld; " // "&qcountry=%ld; &gbcountry=%ld; " // "queryLangs:%lld; pageLangs:%lld] AWL", // score, (long)lang, (long)summaryLang, (long)country, // (long)m_si->m_enableLanguageSorting, // m_si->m_languageUnknownWeight, // m_si->m_languageWeightFactor, // (long)m_si->m_langHint, // (long)m_si->m_language, // (long)m_si->m_countryHint, // (long)m_si->m_country, // g_countryCode.getLanguagesWritten( m_si->m_countryHint ), // g_countryCode.getLanguagesWritten( country ) ); // if lsort is off, skip if ( ! m_si->m_enableLanguageSorting ) return score; // . use query lanaguage (si->m_langHint) or restricted search // language (si->m_language) // . if both are 0, don't rerank by language uint8_t langWanted = m_si->m_langHint; if ( langWanted == langUnknown ) langWanted = m_si->m_queryLang;//language; if ( langWanted == langUnknown ) return score; // . apply score factors for unknown languages, iff reranking unknown // languages if ( lang == langUnknown && m_si->m_languageUnknownWeight > 0 ) { msg20->m_pqr_factor_lang =m_si->m_languageUnknownWeight; return rerankAssignPenalty(score, m_si->m_languageUnknownWeight, "pqrlangunk", "it's language is unknown" ); } // . if computed lanaguage is unknown, don't penalize // . no, what if from a different country? if ( summaryLang == langUnknown ) return score; // . 
first, apply score factors for non-preferred summary languages // that don't match the page language if ( summaryLang != langUnknown && summaryLang != langWanted ) { msg20->m_pqr_factor_lang = m_si->m_languageWeightFactor; return rerankAssignPenalty( score, m_si->m_languageWeightFactor, "pqrlang", "it's summary/title " "language is foreign" ); } // second, apply score factors for non-preferred page languages //if ( lang != langWanted ) // return rerankAssignPenalty( score, // m_si->m_languageWeightFactor, // "pqrlang", // "it's page language is foreign" ); // . if we got here languages of query and page match and are not // unknown, so rerank based on country // . don't demote if countries match or either the search country // or page country is unknown (0) // . default country wanted to gbcountry parm if not specified uint8_t countryWanted = m_si->m_countryHint; // SearchInput sets m_country based on the IP address of the incoming // query, which is often wrong, especially for internal 10.x.y.z ips. // so just fallback to countryHint for now bcause that uses teh default // country... right now set to "us" in search controls page. if ( countryWanted == 0 ) countryWanted = m_si->m_country; if ( country == 0 || countryWanted == 0 || country == countryWanted ) return score; // . now, languages match and are not unknown and countries don't // match and neither is unknown // . so, demote if country of query speaks the same language as // country of page, ie US query and UK or AUS page (since all 3 // places speak english), but not US query and IT page uint64_t qLangs = g_countryCode.getLanguagesWritten( countryWanted ); uint64_t pLangs = g_countryCode.getLanguagesWritten( country ); // . if no language written by query country is written by page // country, don't penalize if ( (uint64_t)(qLangs & pLangs) == (uint64_t)0LL ) return score; msg20->m_pqr_factor_lang = m_si->m_cr->m_pqr_demFactCountry; // countries do share at least one language - demote! return rerankAssignPenalty( score, m_si->m_cr->m_pqr_demFactCountry, "pqrcntry", "it's language is the same as that of " "of the query, but it is from a country " "foreign to that of the query which " "writes in at least one of the same " "languages" ); } // pqrqttiu // . look for query terms and gigabits in the url, demote more the fewer // are matched. /* rscore_t PostQueryRerank::rerankQueryTermsOrGigabitsInUrl( rscore_t score, Url *pageUrl ) { //log( LOG_DEBUG, "query:in PQR::rerankQueryTermsOrGigabitsInUrl(" // "score:%ld, url:'%s', urlLen:%ld)" // "[factor:%3.3f; max:%ld] AWL", // score, pageUrl->getUrl(), pageUrl->getUrlLen(), // m_si->m_cr->m_pqr_demFactQTTopicsInUrl, // m_si->m_cr->m_pqr_maxValQTTopicsInUrl ); if ( pageUrl->getUrlLen() == 0 ) return score; float factor = m_si->m_cr->m_pqr_demFactQTTopicsInUrl; if ( factor <= 0 ) return score; // disables long maxQTInUrl = m_si->m_q->getNumTerms(); long maxGigabitsInUrl = m_msg40->getNumTopics(); long maxVal = m_si->m_cr->m_pqr_maxValQTTopicsInUrl; if ( maxVal < 0 ) maxVal = maxQTInUrl+maxGigabitsInUrl; // from original url: // . remove scheme // . remove 'www' from host // . remove tld // . remove ext // . convert symbols to spaces // . 
remove extra space //log( LOG_DEBUG, "query: origurl:'%s' AWL", pageUrl->getUrl() ); //log( LOG_DEBUG, "query: url: whole:'%s' host:'%s' (%ld); " // "domain:'%s' (%ld); tld:'%s' (%ld); midDom:'%s' (%ld); " // "path:'%s' (%ld); fn:'%s'; ext:'%s'; query:'%s' (%ld); " // "ipStr:'%s' {%ld}; anch:'%s' (%ld) " // "site:'%s' (%ld) AWL", // pageUrl->getUrl(), // pageUrl->getHost(), pageUrl->getHostLen(), // pageUrl->getDomain(), pageUrl->getDomainLen(), // pageUrl->getTLD(), pageUrl->getTLDLen(), // pageUrl->getMidDomain(), pageUrl->getMidDomainLen(), // pageUrl->getPath(), pageUrl->getPathLen(), // pageUrl->getFilename(), pageUrl->getExtension(), // pageUrl->getQuery(), pageUrl->getQueryLen(), // pageUrl->getIpString(), pageUrl->getIp(), // pageUrl->getAnchor(), pageUrl->getAnchorLen(), // pageUrl->getSite(), pageUrl->getSiteLen() ); m_cvtUrl[0] = '\0'; long cvtUrlLen = 0; char *host = pageUrl->getHost(); // first, add hostname - "www." iff it is not an ip addr if ( pageUrl->getIp() == 0 ) { if ( host[0] == 'w' && host[1] == 'w' && host[2] == 'w' && host[3] == '.' ) { // if starts with 'www.', don't add the 'www.' if(pageUrl->getHostLen()-pageUrl->getDomainLen() == 4){ // add domain - 'www.' - tld strncpy( m_cvtUrl, pageUrl->getDomain(), pageUrl->getDomainLen() - pageUrl->getTLDLen() ); cvtUrlLen += pageUrl->getDomainLen() - pageUrl->getTLDLen(); m_cvtUrl[cvtUrlLen] = '\0'; } else { // add host + domain - 'www.' - tld strncpy( m_cvtUrl, pageUrl->getHost()+4, pageUrl->getHostLen() - pageUrl->getTLDLen() - 4 ); cvtUrlLen += pageUrl->getHostLen() - pageUrl->getTLDLen() - 4; m_cvtUrl[cvtUrlLen] = '\0'; } } else { // add host + domain - tld strncpy( m_cvtUrl, pageUrl->getHost(), pageUrl->getHostLen() - pageUrl->getTLDLen() - 1 ); cvtUrlLen += pageUrl->getHostLen() - pageUrl->getTLDLen() - 1; m_cvtUrl[cvtUrlLen] = '\0'; } } // next, add path if ( pageUrl->getPathLen() > 0 ) { strncat( m_cvtUrl, pageUrl->getPath(), pageUrl->getPathLen()-pageUrl->getExtensionLen() ); cvtUrlLen += pageUrl->getPathLen()-pageUrl->getExtensionLen(); m_cvtUrl[cvtUrlLen] = '\0'; } // next, add query if ( pageUrl->getQueryLen() > 0 ) { strncat( m_cvtUrl, pageUrl->getQuery(), pageUrl->getQueryLen() ); cvtUrlLen += pageUrl->getQueryLen(); m_cvtUrl[cvtUrlLen] = '\0'; } // remove all non-alpha-numeric chars char *t = m_cvtUrl; for ( char *s = m_cvtUrl; *s; s++ ) { if ( is_alnum_a(*s) ) *t++ = *s; else if ( t>m_cvtUrl && *(t-1) != ' ' ) *t++ = ' '; } *t = '\0'; cvtUrlLen = (t-m_cvtUrl); //log( LOG_DEBUG, "query: m_cvtUrl:'%s' (%ld) AWL", // m_cvtUrl, cvtUrlLen ); // find number of query terms in url long numQTInUrl = 0; long numQTs = m_si->m_q->getNumTerms(); for ( long i = 0; i < numQTs; i++ ) { char *qtStr = m_si->m_q->getTerm(i); long qtLen = m_si->m_q->getTermLen(i); if ( strncasestr(m_cvtUrl, qtStr, cvtUrlLen, qtLen) != NULL ) { numQTInUrl++; //log( LOG_DEBUG, "query: qt is in url AWL"); } } // find number of gigabits in url long numGigabitsInUrl = 0; long numTopics = m_msg40->getNumTopics(); for ( long i = 0; i < numTopics; i++ ) { char *topicStr = m_msg40->getTopicPtr(i); long topicLen = m_msg40->getTopicLen(i); if ( strncasestr(m_cvtUrl, topicStr, cvtUrlLen, topicLen) ) { numGigabitsInUrl++; //log( LOG_DEBUG, "query: topic is in url AWL"); } } //log( LOG_DEBUG, "query: qts:%ld, gigabits:%ld; " // "maxQTInUrl:%ld, maxGbInUrl:%ld AWL", // numQTInUrl, numGigabitsInUrl, // maxQTInUrl, maxGigabitsInUrl ); return rerankLowerDemotesMore( score, numQTInUrl+numGigabitsInUrl, maxVal, factor, "pqrqttiu", "query terms or topics in its 
url" ); } */ // pqrqual // demote pages that are not high quality /* rscore_t PostQueryRerank::rerankQuality ( rscore_t score, unsigned char quality ) { //log( LOG_DEBUG, "query:in PQR::rerankQuality(" // "score:%ld, quality:%d)" // "[P_factor:%3.3f; P_max:%ld] AWL", // score, (int)quality, // m_si->m_cr->m_pqr_demFactQual, // m_si->m_cr->m_pqr_maxValQual ); float factor = m_si->m_cr->m_pqr_demFactQual; if ( factor <= 0 ) return score; long maxVal = m_si->m_cr->m_pqr_maxValQual; if ( maxVal < 0 ) maxVal = 100; return rerankLowerDemotesMore( score, quality, maxVal, factor, "pqrqual", "quality" ); } */ // pqrpaths // demote pages that are not root or have many paths in the url rscore_t PostQueryRerank::rerankPathsInUrl ( rscore_t score, char *url, long urlLen ) { //log( LOG_DEBUG, "query:in PQR::rerankPathsInUrl(" // "score:%ld, url:%s)" // "[P_factor:%3.3f; P_max:%ld] AWL", // score, url, // m_si->m_cr->m_pqr_demFactPaths, // m_si->m_cr->m_pqr_maxValPaths ); if ( urlLen == 0 ) return score; float factor = m_si->m_cr->m_pqr_demFactPaths; if ( factor <= 0 ) return score; // disables long maxVal = m_si->m_cr->m_pqr_maxValPaths; // bypass scheme and "://" url = strstr( url, "://" ); if ( ! url ) return score; url += 3; // count '/'s to get number of paths long numPaths = -1; // don't count first path for ( url = strchr(url, '/') ; url ; url = strchr(url, '/') ) { numPaths++; url++; } return rerankHigherDemotesMore( score, numPaths, maxVal, factor, "pqrpaths", "paths in its url" ); } // pqrcatid // demote page if does not have a catid rscore_t PostQueryRerank::rerankNoCatId ( rscore_t score, long numCatIds, long numIndCatIds ) { //log( LOG_DEBUG, "AWL:in PQR::rerankNoCatId(" // "score:%ld, numCatIds:%ld, numIndCatIds:%ld)" // "[P_factor:%3.3f]", // score, numCatIds, numIndCatIds, // m_si->m_cr->m_pqr_demFactNoCatId ); float factor = m_si->m_cr->m_pqr_demFactNoCatId; if ( factor <= 0 ) return score; // disables if ( numCatIds + numIndCatIds > 0 ) return score; return rerankAssignPenalty( score, factor, "pqrcatid", "it has no category id" ); } // pqrsuper // demote page if smallest catid has a lot of super topics rscore_t PostQueryRerank::rerankSmallestCatIdHasSuperTopics ( rscore_t score, Msg20 *msg20 ) { //log( LOG_DEBUG, "query:in PQR::rerankSmallestCatIdHasSuperTopics(" // "score:%ld)" // "[P_factor:%3.3f; P_max:%ld] AWL", // score, // m_si->m_cr->m_pqr_demFactCatidHasSupers, // m_si->m_cr->m_pqr_maxValCatidHasSupers ); float factor = m_si->m_cr->m_pqr_demFactCatidHasSupers; if ( factor <= 0 ) return score; // disables long maxVal = m_si->m_cr->m_pqr_maxValCatidHasSupers; // If page doesn't have a catid, we should demote it as if it has // max catids, otherwise pages with a catid will be penalized more if ( msg20->m_r->size_catIds == 0 ) { return rerankAssignPenalty( score, factor, "pqrsuper", "it has no category id" ); } // find smallest catid long minCatid = LONG_MAX; long numCatids = msg20->m_r->size_catIds / 4; for ( long i = 0; i < numCatids; i++ ) { if ( msg20->m_r->ptr_catIds[i] < minCatid ) { minCatid = msg20->m_r->ptr_catIds[i]; } } //log( LOG_DEBUG, "query: minCatid:%ld AWL", minCatid ); // count super topics by walking up catids long numSupers = -1; long currCatId = minCatid; long currParentId = minCatid; while ( currCatId > 1 ) { // next cat currCatId = currParentId; // get the index for this cat long currCatIndex = g_categories->getIndexFromId(currCatId); if ( currCatIndex <= 0 ) break; // get the parent for this cat currParentId = g_categories->m_cats[currCatIndex].m_parentid; 
numSupers++; } return rerankHigherDemotesMore( score, numSupers, maxVal, factor, "pqrsuper", "category ids" ); } // pqrpgsz // . demote page based on size. (number of words) The bigger, the // more it should be demoted. rscore_t PostQueryRerank::rerankPageSize ( rscore_t score, long docLen ) { //log( LOG_DEBUG, "query:in PQR::rerankPageSize(" // "score:%ld, docLen:%ld)" // "[P_factor:%3.3f; P_max:%ld] AWL", // score, docLen, // m_si->m_cr->m_pqr_demFactPageSize, // m_si->m_cr->m_pqr_maxValPageSize ); float factor = m_si->m_cr->m_pqr_demFactPageSize; if ( factor <= 0 ) return score; long maxVal = m_si->m_cr->m_pqr_maxValPageSize; // safety check if ( docLen <= 0 ) docLen = maxVal; return rerankHigherDemotesMore( score, docLen, maxVal, factor, "pqrpgsz", "page size" ); } /* // pqrloc const long MIN_PLACEPOP = 50000; // . returns true if buf contains a location // . locBuf is the location name // . locLen is it's length // . locPop is it's population bool PostQueryRerank::getLocation( char *locBuf, long locBufLen, long *locLen, long *locPop, char *buf, long bufLen ) { //log( LOG_DEBUG, "query:in getLocation(buf:%c%c%c%c, len:%ld, " // "uc:%d) AWL", // buf[0], buf[2], buf[4], buf[6], bufLen, Words words; if ( ! words.set( buf, bufLen, TITLEREC_CURRENT_VERSION, false, // computeIds false // hasHtmlEntities ) ) return false; AppendingWordsWindow ww; if ( ! ww.set( &words, 1, // minWindowSize 5, // maxWindowSize locBufLen, locBuf ) ) return false; // find all phrases between length of 1 and 5 for ( ww.processFirstWindow(); !ww.isDone(); ww.processNextWindow() ){ ww.act(); char *phrasePtr = ww.getPhrasePtr(); long phraseLen = ww.getPhraseLen(); long numPhraseWords = ww.getNumWords(); if ( numPhraseWords == 0 ) continue; //log( LOG_DEBUG, "query: p:%s (%ld) AWL", // phrasePtr, phraseLen ); // see if buf phrase is a place long encodeType = csUTF8;//csISOLatin1; long placePop = getPlacePop( phrasePtr, phraseLen, encodeType ); if ( placePop > MIN_PLACEPOP ) { //log( LOG_DEBUG, "query: p:%s (%ld) is " // "loc spec AWL", // phrasePtr, phraseLen ); *locLen = phraseLen; *locPop = placePop; return true; } // check to see if buf phrase's abbreviation is loc spec //log( LOG_DEBUG, "query: utf8 p:%s (%ld) AWL", // phrasePtr, phraseLen ); SynonymInfo synInfo; if ( ! 
g_thesaurus.getSynonymInfo( phrasePtr, &synInfo, phraseLen ) ) continue; long numSyns = synInfo.m_numSyns; for ( long j = 0; j < numSyns; j++ ) { char *syn = synInfo.m_syn[j]; long synLen = gbstrlen(syn); placePop = getPlacePop( syn, synLen, csISOLatin1 ); if ( placePop > MIN_PLACEPOP ) { //log( LOG_DEBUG, "query: s:%s (%ld) is " // "loc spec AWL", // syn, synLen ); *locLen = phraseLen; *locPop = placePop; return true; } } } *locLen = 0; *locPop = 0; return false; } // pqrloc bool PostQueryRerank::preRerankNonLocationSpecificQueries ( ) { //log( LOG_DEBUG, "query:in PQR::preRerankNonLocSpecQueries() AWL" ); if ( m_si->m_pqr_demFactLocTitle <= 0 && m_si->m_pqr_demFactLocSummary <= 0 && m_si->m_pqr_demFactLocDmoz <= 0 ) return true; //log( LOG_DEBUG, "query: q:%s (%ld) AWL", // m_si->m_q->m_orig, // m_si->m_q->m_origLen ); // See if query is location specific by building a buffer of // query terms without punct then checking all phrases of that // buffer long numQWords = m_si->m_q->m_numWords; char locBuf[1024]; long locLen = 0; long locPop = 0; char buf[MAX_QUERY_LEN]; char *p = buf; Query *q = m_si->m_q; for ( long i = 0; i < numQWords; i++ ) { QueryWord *qw = &q->m_qwords[i]; //log( LOG_DEBUG, "query: qw:%c%c%c%c (%ld) " // "inQuotes:%d; inQuoted:%d; quoteStrt:%ld " // "op:%d; opcode:%d; isPunct:%d level:%d; " // "wsign:%d; psign:%d id:%lld " // "ignore:%d AWL", // qw->m_word[0], qw->m_word[2], // qw->m_word[4], qw->m_word[6], // qw->m_wordLen, // qw->m_inQuotes, qw->m_inQuotedPhrase, qw->m_quoteStart, // qw->m_queryOp, qw->m_opcode, qw->m_isPunct, qw->m_level, // qw->m_wordSign, qw->m_phraseSign, qw->m_wordId, // qw->m_ignoreWord ); // reset buf if word is punct (except all space) or an opcode bool isPunct = qw->m_isPunct; bool isAllSpace = false; if ( isPunct ) { char *s = qw->m_word; for ( ; (int)(s-qw->m_word) < qw->m_wordLen; s++ ) { if ( ! is_space(*s) ) break; } isAllSpace = ( s-qw->m_word == qw->m_wordLen ); } if ( (isPunct && ! 
isAllSpace) || qw->m_opcode != 0 ) { // before we reset, see if buffer contains a location if ( getLocation( locBuf, 1024, &locLen, &locPop, buf, p-buf ) ) { long encodeType = csUTF8;//csISOLatin1; m_querysLoc = hash64d( locBuf, locLen); break; } p = buf; //log( LOG_DEBUG, "query: encountered symbol:%d|%d AWL", // qw->m_isPunct, qw->m_opcode ); continue; } // but if word is all space, dont append if ( isAllSpace ) continue; // skip if word is subtracted out if ( qw->m_wordSign == '-' ) continue; // skip if word or phrase is under NOT ||| AWL not working right now if ( qw->m_queryWordTerm && qw->m_queryWordTerm->m_underNOT ) continue; if ( qw->m_queryPhraseTerm && qw->m_queryPhraseTerm->m_underNOT ) continue; // else, append word + space to buf memcpy( p, qw->m_word, qw->m_wordLen ); p += qw->m_wordLen; *p++ = ' '; } // now see if there's a location in buf if ( m_querysLoc == 0 && getLocation( locBuf, 1024, &locLen, &locPop, buf, p-buf ) ) { m_querysLoc = hash64d( locBuf, locLen ); } //log( LOG_DEBUG, "query: q loc:%lld AWL", // m_querysLoc ); // check the gigabits for locations //log( LOG_DEBUG, "query: places lookup gigabits numTopics:%ld AWL", // m_msg40->getNumTopics() ); m_ignoreLocs.set( 28 ); // if searching the us, these should not be demoted, so // put them into the gigabit table if (m_si->m_country == 226) { m_ignoreLocs.addKey(hash64d("u.s.",4),true); m_ignoreLocs.addKey(hash64d("us",2),true); m_ignoreLocs.addKey(hash64d("united states",14),true); m_ignoreLocs.addKey(hash64d("u.s.a.",6),true); m_ignoreLocs.addKey(hash64d("usa",3),true); m_ignoreLocs.addKey(hash64d("america",7),true); m_ignoreLocs.addKey(hash64d("american",8),true); m_ignoreLocs.addKey(hash64d("americans",9),true); m_ignoreLocs.addKey(hash64d("canada",6),true); m_ignoreLocs.addKey(hash64d("kanada",6),true); m_ignoreLocs.addKey(hash64d("canucks",7),true); m_ignoreLocs.addKey(hash64d("canadians",9),true); m_ignoreLocs.addKey(hash64d("canadian",8),true); m_ignoreLocs.addKey(hash64d("north america",13),true); m_ignoreLocs.addKey(hash64d("uk",2),true); m_ignoreLocs.addKey(hash64d("united kingdom",14),true); m_ignoreLocs.addKey(hash64d("british",7),true); m_ignoreLocs.addKey(hash64d("britain",7),true); m_ignoreLocs.addKey(hash64d("britons",7),true); m_ignoreLocs.addKey(hash64d("great britain",13),true); } // now add the locations from the gigabits long numTopics = m_msg40->getNumTopics(); for ( long i = 0; !m_si->m_pqr_demInTopics && i < numTopics; i++ ) { char *topicStr = m_msg40->getTopicPtr(i); long topicLen = m_msg40->getTopicLen(i); Words words; if ( ! words.set( topicStr, topicLen, TITLEREC_CURRENT_VERSION, false, // computeIds false // hasHtmlEntities ) ) continue; AppendingWordsWindow ww; if ( ! ww.set( &words, 1, // minWindowSize 5, // maxWindowSize AWW_INIT_BUF_SIZE, NULL ) ) continue; // find all phrases between length of 1 and 5 for ( ww.processFirstWindow(); ! 
ww.isDone(); ww.processNextWindow() ) { ww.act(); char *phrasePtr = ww.getPhrasePtr(); long phraseLen = ww.getPhraseLen(); long numPhraseWords = ww.getNumWords(); if ( numPhraseWords == 0 ) continue; // see if topic phrase is a place long placePop = getPlacePop( phrasePtr, phraseLen, encodeType ); if ( placePop > MIN_PLACEPOP ) { // It's a place, mark it so if a page has // this place name in it's title we won't // rerank it uint64_t h = hash64d( phrasePtr, phraseLen); m_ignoreLocs.addKey( h, true ); //log( LOG_DEBUG, "query: pre gigabit has " // "location '%s' (%ld) [h:%lld] AWL", // phrasePtr, phraseLen, h ); continue; } // Check if a gigabit's abbreviation is location // specific SynonymInfo synInfo; if ( ! g_thesaurus.getSynonymInfo( phrasePtr, &synInfo, phraseLen ) ) continue; long numSyns = synInfo.m_numSyns; for ( long j = 0; j < numSyns; j++ ) { char *syn = synInfo.m_syn[j]; long synLen = gbstrlen(syn); placePop = getPlacePop( syn, synLen, csISOLatin1 ); if ( placePop > MIN_PLACEPOP ) { // It's a place, so mark syn uint64_t h = hash64d( syn, synLen); m_ignoreLocs.addKey( h, true ); //log( LOG_DEBUG, "query: pre gigabit" // " has location synonym '%s'" // " h:%lld AWL", // syn, h ); continue; } } } } if (m_querysLoc != 0) log(LOG_DEBUG, "pqr: query contains a location, " "will not demote location specific results"); else log(LOG_DEBUG, "pqr: query DOES NOT contain a location, " "will demote location specific results"); return true; } // pqrloc // . if query is not location specific, and a page has a geographic location // in its title then demote that page UNLESS the geographic location is // contained in the list of gigabits for the search query. like "Shoes (UK)" // or "retail stores in New York" when you are UK and New York are not in // your query. We will need a file of locations. BUT if the location is // contained in the gigabits, do NOT demote such pages, query might have // something like "the big apple" in it... Note: if query ops out of a // location, it should not be considered location specific (like "expo // -montreal"). demote by popularity weight of the place name as returned // from getPlacesPeoplePop(). // . demote results containing geographic locations // unless THAT location is in gigabits or in query. fixes // 'car insurance'? demote a little bit if in summary... // or a little bit if in a single dmoz catregory and it is // dmoz regional category. do not demote 'united states' 'us' // 'america' or 'usa' if searching default is the us. do // not dmoz dmoz north america:US region if searching in us. // but if 'albuquerque' in query, do not demote if 'new mexico' // in search results. 
rscore_t PostQueryRerank::rerankNonLocationSpecificQueries ( rscore_t score, Msg20 *msg20 ) { float titleFactor = m_si->m_pqr_demFactLocTitle; float summFactor = m_si->m_pqr_demFactLocSummary; float dmozFactor = m_si->m_pqr_demFactLocDmoz; if ( titleFactor <= 0 && summFactor <= 0 && dmozFactor <= 0 ) return score; long maxVal = m_si->m_cr->m_pqr_maxValLoc; // if we found a location in the query, don't rerank for locs if (m_querysLoc != 0) return score; //log( LOG_DEBUG, "query:in PQR::rerankNonLocSpecQueries(" // "score:%ld)" // "[P_factorTitle:%3.3f; P_factorSummary:%3.3f; P_factorDmoz:%3.3f; " // " P_max:%ld; " // "m_querysLoc:%lld; #m_ignoreLocs:%ld; #summaryLocs:%ld] AWL", // score, // titleFactor, summFactor, dmozFactor, // maxVal, // m_querysLoc, // m_ignoreLocs.getNumSlotsUsed(), // msg20->getNumSummaryLocs() ); // check if categories are regional and contain a location long numCatids = msg20->m_r->size_catIds / 4; long *catids = msg20->m_r->ptr_catIds; long catLocMaxPop = 0; uint8_t searchingUS = (m_si->m_country == 226); //log(LOG_DEBUG, "pqr: checking %ld categories for locs AWL", // numCatids); for ( unsigned char i = 0; dmozFactor > 0 && i < numCatids; i++ ) { SafeBuf sb; long catid = catids[i]; g_categories->printPathFromId(&sb, catid, true); // copy first part of category so we can work with it const long MAX_PQRCAT = 512; char cat[MAX_PQRCAT]; long len = sb.length(); if (len > MAX_PQRCAT) len = MAX_PQRCAT; strncpy(cat, sb.getBufStart(), len); cat[len] = '\0'; //log(LOG_DEBUG, "pqr: catid:%ld category:'%s' AWL", // catid, cat); // see if we have a regional category char *p = cat; char region[64]; char *q = region; while (*p && q-region < 64 && *p != '/') *q++ = *p++; *q = '\0'; bool catIsRegional = (0 == strncmp(region, "Regional", 8)); //log(LOG_DEBUG, "pqr: cat has region:%ld AWL", // (long)catIsRegional); // we only care about regional categories if (!catIsRegional) continue; long placePop = 0; // scan category for region while (*p) { p++; q = region; while (*p && q-region < 64 && *p != '/') { if (*p == '_') *q++ = ' '; else *q++ = *p; p++; } *q = '\0'; bool regionIsUS = (searchingUS && (0 == strcasecmp(region, "us") || 0 == strcasecmp(region, "united states") || 0 == strcasecmp(region, "usa") || 0 == strcasecmp(region, "america"))); //log(LOG_DEBUG, "pqr: region:%s (isUS:%ld) AWL", // region, (long)regionIsUS); // if region is us, skip category demotion if (!regionIsUS) { // see if region is a place placePop = getPlacePop(region, q-region, csISOLatin1); if (placePop > MIN_PLACEPOP) break; } } // if we didn't find a place, go to next cat if (placePop <= MIN_PLACEPOP) continue; uint64_t h = hash64d( region, q-region ); if (h == 0) continue; // is it the location of the query? if (h == m_querysLoc) { //log(LOG_DEBUG, "pqr: cat " // "has query's loc " // "[pop:%ld; h:%lld] AWL", // placePop, h); return score; } // is it in the gigabits? 
if (m_ignoreLocs.getSlot( h ) != -1) { //log(LOG_DEBUG, "pqr: cat has " // "gigabit's loc [pop:%ld; h:%llu] AWL", // placePop, h); return score; } // use only the max pop for all places in category if (placePop > catLocMaxPop) { //log(LOG_DEBUG, "pqr: cat has a non-query, " // "non-gigabit loc:'%s' %llu pop:%ld AWL", // region, h, placePop); catLocMaxPop = placePop; continue; } } //log(LOG_DEBUG, "pqr: categories' max population:%ld AWL", // catLocMaxPop); if (dmozFactor > 0 && catLocMaxPop > MIN_PLACEPOP) score = rerankHigherDemotesMore(score, catLocMaxPop, maxVal, dmozFactor, "pqrlocd", "population of a place in a " "category and the place was " "not in the query or gigabits"); // check if summary contains a location // check if summary's location is in gigabits long numSummaryLocs = msg20->m_r->size_summLocs/8; uint64_t *summaryLocs = msg20->m_r->ptr_summLocs; long *summaryLocsPops = msg20->m_r->ptr_summLocsPop; long summaryLocMaxPop = 0; for (long i = 0; summFactor > 0 && i < numSummaryLocs; i++) { uint64_t h = summaryLocs[i]; long placePop = summaryLocsPops[i]; if (h == 0) continue; if (placePop <= MIN_PLACEPOP) continue; // is it the location of the query? if ( h == m_querysLoc ) { //log( LOG_DEBUG, "pqr: summary " // "has query's loc " // "[pop:%ld; h:%lld] AWL", // placePop, h ); return score; } // is it in the gigabits? if (m_ignoreLocs.getSlot( h ) != -1 ) { //log( LOG_DEBUG, "pqr: summary has " // "gigabit's loc [pop:%ld; h:%llu] AWL", // placePop, h ); return score; } // use only the max pop for all places in title if ( placePop > summaryLocMaxPop ) { //log( LOG_DEBUG, "pqr: summary has a non-query, " // "non-gigabit loc:%llu pop:%ld AWL", // h, placePop ); summaryLocMaxPop = placePop; continue; } } //log( LOG_DEBUG, "pqr: summary's max population:%ld AWL", // summaryLocMaxPop ); if (summFactor > 0 && summaryLocMaxPop > MIN_PLACEPOP) score = rerankHigherDemotesMore(score, summaryLocMaxPop, maxVal, summFactor, "pqrlocs", "population of a place in its " "summary and the place was " "not in the query or gigabits"); // check if title contains a location if (titleFactor <= 0) return score; char *pageTitle = msg20->getTitle(); long pageTitleLen = msg20->getTitleLen(); Words words; if ( ! words.set( pageTitle, pageTitleLen, TITLEREC_CURRENT_VERSION, false, // computeIds false // hasHtmlEntities ) ) return score; AppendingWordsWindow ww; if ( ! ww.set( &words, 1, // minWindowSize 5, // maxWindowSize AWW_INIT_BUF_SIZE, NULL ) ) return score; // find all phrases between length of 1 and 5 long titleLocMaxPop = 0; for ( ww.processFirstWindow(); ! ww.isDone(); ww.processNextWindow()) { ww.act(); char *phrasePtr = ww.getPhrasePtr(); long phraseLen = ww.getPhraseLen(); long numPhraseWords = ww.getNumWords(); if ( numPhraseWords == 0 ) continue; // Get the place's population // If it's a place, check gigabits for the place name long encodeType = csUTF8; //ISOLatin1; long placePop = getPlacePop( phrasePtr, phraseLen, encodeType ); if ( placePop > MIN_PLACEPOP ) { // Check if place is same as query // Check if gigabits has this location or // an abbreviation of the location, if so don't // rerank this page uint64_t h = hash64d(phrasePtr, phraseLen); if ( h == 0 ) continue; // is it the query's location? if ( h == m_querysLoc ) { //log( LOG_DEBUG, "query: title has " // "query's loc [pop:%ld; h:%llu] AWL", // placePop, h ); return score; } // is it in the gigabits? 
if ( m_ignoreLocs.getSlot( h ) != -1 ) { //log( LOG_DEBUG, "pqr: title has " // "gigabit's loc [pop:%ld; h:%llu] AWL", // placePop, h ); return score; } } // use only the max pop for all places in title if ( placePop > titleLocMaxPop ) { //log( LOG_DEBUG, "pqr: title has a non-query, " // "non-gigabit loc:'%s' (%ld) pop:%ld AWL", // phrasePtr, phraseLen, placePop ); titleLocMaxPop = placePop; continue; } // If we haven't found a place name yet, check for // abbreviations of a place name //log( LOG_DEBUG, "pqr: phrase:'%s' (%ld) words:%ld " // "pop:%ld AWL", // phrasePtr, phraseLen, numPhraseWords, // placePop ); SynonymInfo synInfo; if ( ! g_thesaurus.getSynonymInfo( phrasePtr, &synInfo, phraseLen ) ) { continue; } long numSyns = synInfo.m_numSyns; for ( long j = 0; j < numSyns; j++ ) { char *syn = synInfo.m_syn[j]; long synLen = gbstrlen(synInfo.m_syn[j]); placePop = getPlacePop( syn, synLen, csISOLatin1 ); if ( placePop > MIN_PLACEPOP ) { // Check if gigabits has an abbreviation // of the location, if so don't rerank // this page uint64_t h = hash64d(syn, synLen); if ( h == 0 ) continue; // is syn the query's loc? if ( h == m_querysLoc ) { //log( LOG_DEBUG, "pqr: title " // "has query's loc syn " // "[pop:%ld; h:%lld] AWL", // placePop, h ); return score; } // is syn in gigabits? if ( m_ignoreLocs.getSlot( h ) != -1 ) { //log(LOG_DEBUG, "pqr: syn title " // " has gigabits's loc '%s' " // "[pop:%ld; h:%lld] AWL", // syn, // placePop, h ); return score; } // only use max pop in calculations if ( placePop > titleLocMaxPop ) { //log( LOG_DEBUG, "pqr: title " // "has a non-query, " // "non-gigabit loc syn AWL" ); titleLocMaxPop = placePop; } } } } //log( LOG_DEBUG, "pqr: title's max population:%ld AWL", // titleLocMaxPop ); return rerankHigherDemotesMore( score, titleLocMaxPop, maxVal, titleFactor, "pqrloct", "population of a place in its title " "and the place was not in the query " "or gigabits" ); } */ // pqrhtml, pqrxml // demote if content type is not html (or is xml) /* rscore_t PostQueryRerank::rerankContentType ( rscore_t score, char contentType ) { float htmlFactor = m_si->m_cr->m_pqr_demFactNonHtml; float xmlFactor = m_si->m_cr->m_pqr_demFactXml; //log( LOG_DEBUG, "query:in PQR::rerankContentType(" // "score:%ld, content-type:%ld)" // "[P_factorHtml:%3.3f; P_factorXml:%3.3f] AWL", // score, (long)contentType, // htmlFactor, xmlFactor ); // if completely disabled or page is html, don't do anything if ( xmlFactor <= 0 && htmlFactor <= 0 || contentType == CT_HTML ) return score; // if demoting for xml, then do that if ( xmlFactor > 0 && contentType == CT_XML ) return rerankAssignPenalty( score, xmlFactor, "pqrxml", "it is xml" ); // we are demoting for non-html and the page is not html return rerankAssignPenalty( score, htmlFactor, "pqrhtml", "it is not html" ); } */ // pqrfsd // setup bool PostQueryRerank::preRerankOtherPagesFromSameHost( Url *pageUrl ) { // don't do anything if this method is disabled if ( m_si->m_cr->m_pqr_demFactOthFromHost <= 0 ) return true; // don't add if no url if ( pageUrl->getUrlLen() == 0 ) return true; //log( LOG_DEBUG, "query:in PQR::preRerankOtherPagesFromSameHost() AWL"); //log( LOG_DEBUG, "query: u:'%s' host:'%s' (%ld); " // "domain:'%s' (%ld) AWL", // pageUrl->m_url, // pageUrl->getHost(), pageUrl->getHostLen(), // pageUrl->getDomain(), pageUrl->getDomainLen() ); char *host = pageUrl->getDomain(); long hostLen = pageUrl->getDomainLen(); uint64_t key = hash64Lower_a( host, hostLen ); if ( key == 0 ) key = 1; long slot = m_hostCntTable.getSlot( key ); 
	if ( slot == -1 ) {
		m_hostCntTable.addKey( key, 0 ); // first page doesn't count
	}
	else {
		long *cnt = m_hostCntTable.getValuePointerFromSlot( slot );
		(*cnt)++;
	}
	return true;
}

// pqrfsd
// . if page does not have any other pages from its same hostname in the
//   search results (clustered or not) then demote it. demote based on
//   how many pages occur in the results from the same hostname. (tends
//   to promote pages from hostnames that occur a lot in the unclustered
//   results, they tend to be authorities) If it has pages from the same
//   hostname, they must have the query terms in different contexts, so
//   we must get the summaries for 5 of the results, and just cluster the rest.
rscore_t PostQueryRerank::rerankOtherPagesFromSameHost ( rscore_t score,
							 Url *pageUrl ) {
	//log( LOG_DEBUG, "query:in PQR::rerankOtherPagesFromSameHost("
	//     "score:%ld, url:'%s', urlLen:%ld)"
	//     "[P_factor:%3.3f; P_max:%ld] AWL",
	//     score, pageUrl->getUrl(), pageUrl->getUrlLen(),
	//     m_si->m_cr->m_pqr_demFactOthFromHost,
	//     m_si->m_cr->m_pqr_maxValOthFromHost );

	if ( pageUrl->getUrlLen() == 0 ) return score;

	float factor = m_si->m_cr->m_pqr_demFactOthFromHost;
	if ( factor <= 0 ) return score; // disables

	long maxVal = m_si->m_cr->m_pqr_maxValOthFromHost;
	if ( maxVal < 0 ) maxVal = m_numToSort-1; // all but this one

	// . lookup host for this page in hash table to get number of other
	//   pages from the same host
	char *host    = pageUrl->getDomain();
	long  hostLen = pageUrl->getDomainLen();
	uint64_t key  = hash64Lower_a( host, hostLen );
	long slot     = m_hostCntTable.getSlot( key );
	long numFromSameHost = m_hostCntTable.getValueFromSlot( slot );

	//log( LOG_DEBUG, "query: numFromSameHost:%ld AWL", numFromSameHost );

	return rerankLowerDemotesMore( score, numFromSameHost, maxVal, factor,
				       "pqrfsd",
				       "other pages from the same host" );
}

// pqrctid
// . if page is from a topic in dmoz that is in common with a lot of other
//   results, then do not demote it as much as if it is not. ("birds of
//   a feather") Reduce demotion penalty as you demote each result in
//   order to avoid "clumping".
// setup
bool PostQueryRerank::preRerankCommonTopicsInDmoz( Msg20Reply *mr ) {
	if ( m_si->m_cr->m_pqr_demFactComTopicInDmoz <= 0 ) return true;

	//SANITYCHECK( msg20 );
	if ( ! mr ) { char *xx=NULL;*xx=0; }

	//log( LOG_DEBUG, "query:in PQR::preRerankCommonTopicsInDmoz() "
	//     "AWL" );
	//log(LOG_DEBUG, "query: qdmoz pre cnt:%d AWL",
	//    (int)msg20->m_numCatids);

	long numCatids = mr->size_catIds/4;//msg20->getNumCatids();
	for ( unsigned char i = 0; i < numCatids; i++ ) {
		long key = mr->ptr_catIds[i];//msg20->getDmozCatids()[i];
		if ( key == 0 ) key = 1;
		long slot = m_dmozTable.getSlot( key );
		//log( LOG_DEBUG, "query: qdmoz pre %ld/%ld; "
		//     "catId:%ld; slot:%ld AWL",
		//     (long)i+1, (long)msg20->m_numCatids,
		//     key, slot );
		if ( slot == -1 ) {
			// first occurrence
			// cnt is 0, no other common topics
			// demotion factor is the parm
			ComTopInDmozRec rec;
			rec.cnt     = 0;
			rec.demFact = m_si->m_cr->m_pqr_demFactComTopicInDmoz;
			m_dmozTable.addKey( key, rec );
			//log(LOG_DEBUG, "query: qdmoz pre occurrence 1 AWL");
		}
		else {
			// nth occurrence
			ComTopInDmozRec *rec =
				m_dmozTable.getValuePointerFromSlot( slot );
			rec->cnt++;
			//log( LOG_DEBUG, "query: qdmoz pre key:%ld "
			//     "occurrence %ld AWL",
			//     key, rec->cnt );
		}
	}
	return true;
}

// pqrctid
// . if page is from a topic in dmoz that is in common with a lot of other
//   results, then do not demote it as much as if it is not. ("birds of
//   a feather") Reduce demotion penalty as you demote each result in
//   order to avoid "clumping".
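// . a worked example of the decay, using hypothetical parm values rather
//   than anything taken from this code: if m_pqr_demFactComTopicInDmoz were
//   0.20 and m_pqr_decFactComTopicInDmoz were 0.50, the first result demoted
//   under a given shared catid would use a 0.20 demotion factor; the function
//   below then decays that catid's stored factor by (1.0 - 0.50), so the next
//   result sharing the catid is demoted with 0.10, the one after that with
//   0.05, and so on, keeping results from one DMOZ topic from all being
//   pushed down together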
rscore_t PostQueryRerank::rerankCommonTopicsInDmoz ( rscore_t score, Msg20 *msg20 ) { //log( LOG_DEBUG, "query:in PQR::rerankCommonTopicsInDmoz(" // "score:%ld)" // "[P_max:%ld P_decFact:%3.3f] AWL", // score, // m_si->m_cr->m_pqr_maxValComTopicInDmoz, // m_si->m_cr->m_pqr_decFactComTopicInDmoz ); //log( LOG_DEBUG, "query: qdmoz cnt:%ld AWL", // (long)msg20->m_numCatids ); if ( m_si->m_cr->m_pqr_demFactComTopicInDmoz <= 0 ) return score; long maxVal = m_si->m_cr->m_pqr_maxValComTopicInDmoz; if ( maxVal < 0 ) maxVal = m_numToSort; // . see if page is from a topic in dmoz that is in common with a // lot of other results // . if no catid, result will be not be demoted float chosenDemFact = 0.0; long numComTopicsInDmoz = 0; long maxComTopicsInDmoz = 0; long numCatids = msg20->m_r->size_catIds/4;//getNumCatids(); for ( unsigned char i = 0; i < numCatids; i++ ) { long key = msg20->m_r->ptr_catIds[i];//getDmozCatids()[i]; long slot = m_dmozTable.getSlot( key ); ComTopInDmozRec *rec = m_dmozTable.getValuePointerFromSlot( slot ); //log( LOG_DEBUG, "query: slot:%ld key:%ld cnt:%ld; " // "demFact:%3.3f AWL", // slot, key, // rec->cnt, rec->demFact ); // add # of other pages with same topic as this numComTopicsInDmoz += rec->cnt; // . find the slot with the max common topics in dmoz so // it can be decayed if ( rec->cnt > maxComTopicsInDmoz ) { chosenDemFact = rec->demFact; maxComTopicsInDmoz = rec->cnt; } } score = rerankHigherDemotesMore( score, numComTopicsInDmoz, maxVal, chosenDemFact, "pqrctid", "common topics in dmoz " "as other results" ); // now decay the factors float decFactor = m_si->m_cr->m_pqr_decFactComTopicInDmoz; if ( decFactor < 0 ) return score; for ( unsigned char i = 0; i < numCatids; i++ ) { long key = msg20->m_r->ptr_catIds[i]; long slot = m_dmozTable.getSlot( key ); ComTopInDmozRec *rec = m_dmozTable.getValuePointerFromSlot( slot ); rec->demFact *= (1.0 - decFactor); //log( LOG_DEBUG, "query: decay slot:%ld key:%ld " // "cnt:%ld; decFact:%3.3f; new demFact:%3.3f AWL", // slot, key, // rec->cnt, decFactor, rec->demFact ); } return score; } // pqrdcndcqt // . if the dmoz category names contain a query term (or its synonyms or // gigabits), "boost" the result based on the query term weight (look at // query phrase term weights, too) (actually, demote others that do not // have them...) /* rscore_t PostQueryRerank::rerankDmozCategoryNamesDontHaveQT ( rscore_t score, Msg20 *msg20 ) { //log( LOG_DEBUG, "query:in PQR::rerankDmozCategoryNamesDontHaveQT(" // "score:%ld)" // "[P_factor:%3.3f; P_max:%ld] AWL", // score, // m_si->m_cr->m_pqr_demFactDmozCatNmNoQT, // m_si->m_cr->m_pqr_maxValDmozCatNmNoQT ); float factor = m_si->m_cr->m_pqr_demFactDmozCatNmNoQT; if ( factor <= 0 ) return score; // disables long maxVal = m_si->m_cr->m_pqr_maxValDmozCatNmNoQT; long numQTsInDmoz = 0; char *pd = msg20->m_r->ptr_dmozTitles; long numCatids = msg20->m_r->size_catIds/4; long numQTs = m_si->m_q->m_numTerms; HashTableT matchedIds; matchedIds.set( numQTs*2 ); for ( long j = 0; j < numCatids; j++ ) { char *currTitle = pd; long currTitleLen = gbstrlen(pd); if ( currTitleLen == 0 ) continue; //log( LOG_DEBUG, "query: currTitle:%s (%ld) AWL", // currTitle, currTitleLen ); Words w; Bits b; Phrases p; long long *wids; long nw; long long *pids; if ( ! w.set( currTitle , currTitleLen , TITLEREC_CURRENT_VERSION, true , // computeIds false ) ) goto next; if ( ! b.set( &w, TITLEREC_CURRENT_VERSION ,0) ) goto next; if ( ! 
p.set( &w , &b , true , // useStopWords false , // useStems TITLEREC_CURRENT_VERSION, 0 ) ) // niceness goto next; wids = w.getWordIds (); nw = w.getNumWords (); pids = p.getPhraseIds2 (); // go through all words in cat name for ( long i = 0; i < nw; i++ ) { // go through all query terms for ( long k = 0; k < numQTs; k++ ) { QueryTerm *qt = &m_si->m_q->m_qterms[k]; long long rawTermId = qt->m_rawTermId; // ignore 0 termIds if ( rawTermId == 0 ) continue; // see if we already matched this id long n = matchedIds.getSlot( rawTermId ); if ( n != -1 ) continue; // compare this query term to cat word if ( rawTermId == wids[i] ) { matchedIds.addKey( rawTermId, 0 ); numQTsInDmoz++; //log( LOG_DEBUG, "query: qt-dmozw " // "match '%s' (%ld) AWL", // qt->m_term, // qt->m_termLen ); continue; } // compare this query term to cat phrase if ( qt->m_isPhrase && rawTermId == pids[i] ) { matchedIds.addKey( rawTermId, 0 ); numQTsInDmoz++; //log( LOG_DEBUG, "query: qt-dmozp " // "match '%s' (%ld) AWL", // qt->m_term, // qt->m_termLen ); continue; } // if we haven't matched yet, check syns SynonymInfo synInfo; if ( ! g_thesaurus.getSynonymInfo( rawTermId, &synInfo )) continue; long numSyns = synInfo.m_numSyns; for ( long k = 0; k < numSyns; k++ ) { //log( LOG_DEBUG, "query: syn:'%s' AWL", // synInfo.m_syn[j]); uint64_t h ; h =hash64Lower_utf8(synInfo.m_syn[j], gbstrlen(synInfo.m_syn[j])); // see if we already matched this id long n = matchedIds.getSlot( h ); if ( n != -1 ) continue; // Compare this query term syn to // cat word if ( (long long)h == wids[i] ) { matchedIds.addKey( h, 0 ); numQTsInDmoz++; //log( LOG_DEBUG, "query: " // "synmatch:'%s' " // "in dmozw:'%s' AWL", // synInfo.m_syn[j], // currTitle ); continue; } // Compare this query term syn to // cat phrase if ( qt->m_isPhrase && (long long)h == pids[i] ) { matchedIds.addKey( h, 0 ); numQTsInDmoz++; //log( LOG_DEBUG, "query: " // "synmatch:'%s' " // "in dmozp:'%s' AWL", // synInfo.m_syn[j], // currTitle ); continue; } } } } next: pd += currTitleLen; } //log( LOG_DEBUG, "query: qts or syns in dmoz cat name:%ld AWL", // numQTsInDmoz ); return rerankLowerDemotesMore( score, numQTsInDmoz, maxVal, factor, "pqrdcndcqt", "query terms in its dmoz category names"); } */ // pqrdcndcgb // . if the dmoz category names contain a query term (or its synonyms or // gigabits), "boost" the result based on the query term weight (look at // query phrase term weights, too) (actually, demote others that do not // have them...) /* rscore_t PostQueryRerank::rerankDmozCategoryNamesDontHaveGigabits ( rscore_t score, Msg20 *msg20 ) { //log( LOG_DEBUG, "query:in PQR::rerankDmozCategoryNamesDontHaveGigabits(" // "score:%ld)" // "[P_factor:%3.3f; P_max:%ld] AWL", // score, // m_si->m_cr->m_pqr_demFactDmozCatNmNoGigabits, // m_si->m_cr->m_pqr_maxValDmozCatNmNoGigabits ); float factor = m_si->m_cr->m_pqr_demFactDmozCatNmNoGigabits; if ( factor <= 0 ) return score; // disables long maxVal = m_si->m_cr->m_pqr_maxValDmozCatNmNoGigabits; if ( maxVal < 0 ) maxVal = m_si->m_docsToScanForTopics; // find number of gigabits in dmoz category name long numGigabitsInDmoz = 0; // go through gigabits each possible phrase in gigabits //log( LOG_DEBUG, "query: numGigabits:%ld AWL", // m_msg40->getNumTopics() ); long numTopics = m_msg40->getNumTopics(); HashTableT matchedIds; matchedIds.set( numTopics*4 ); for ( long i = 0; i < numTopics; i++ ) { Words words; if ( ! 
words.set( m_msg40->getTopicPtr(i), m_msg40->getTopicLen(i), TITLEREC_CURRENT_VERSION, false, // computeIds false // hasHtmlEntities ) ) continue; AppendingWordsWindow ww; if ( ! ww.set( &words, 1, // minWindowSize 4, // maxWindowSize AWW_INIT_BUF_SIZE, NULL ) ) continue; // find all phrases between length of 1 and 4 for ( ww.processFirstWindow(); ! ww.isDone(); ww.processNextWindow() ) { ww.act(); char *phrasePtr = ww.getPhrasePtr(); long phraseLen = ww.getPhraseLen(); long numPhraseWords = ww.getNumWords(); if ( numPhraseWords == 0 ) continue; //log( LOG_DEBUG, "query: gb phrase:%s (%ld) AWL", // phrasePtr, phraseLen ); // see if we already matched this phrase uint64_t h = hash64Lower_utf8( phrasePtr, phraseLen ); if ( h == 0 ) h = 1; if ( matchedIds.getSlot( h ) != -1 ) continue; // ignore phrases that are just common words if ( isCommonWord( h ) ) continue; matchedIds.addKey( h, 0 ); // go through dmoz category names char *p = msg20->m_r->ptr_dmozTitles; long numCatids = msg20->m_r->size_catIds/4; for ( long j = 0; j < numCatids; j++ ) { char *currTitle = p; long currTitleLen = gbstrlen(p); if ( currTitleLen == 0 ) continue; //log( LOG_DEBUG, "query: dmoz:%s (%ld) AWL", // currTitle, currTitleLen ); // check if gigabit is in dmoz category name if (strncasestr(currTitle, phrasePtr, currTitleLen, phraseLen)){ //log( LOG_DEBUG, "query: gb is in " // "dmoz AWL"); numGigabitsInDmoz++; } p += currTitleLen; } } } //log( LOG_DEBUG, "query: numGigabitsInDmoz:%ld AWL", // numGigabitsInDmoz ); return rerankLowerDemotesMore( score, numGigabitsInDmoz, maxVal, factor, "pqrdcndcgb", "gigabits in its dmoz category names" ); } */ // pqrdate // . demote pages by datedb date rscore_t PostQueryRerank::rerankDatedbDate( rscore_t score, time_t datedbDate ) { float factor = m_si->m_cr->m_pqr_demFactDatedbDate; if ( factor <= 0 ) return score; long minVal = m_si->m_cr->m_pqr_minValDatedbDate; if ( minVal <= 0 ) minVal = 0; minVal *= 1000; long maxVal = m_si->m_cr->m_pqr_maxValDatedbDate; if ( maxVal <= 0 ) maxVal = 0; maxVal = m_now - maxVal*1000; //log( LOG_DEBUG, "query:in PQR::rerankDatedbDate(" // "score:%ld, datedbDate:%ld)" // "[P_factor:%3.3f; maxVal:%ld] AWL", // score, datedbDate, // factor, maxVal ); // don't penalize results whose publish date is unknown if ( datedbDate == -1 ) return score; if ( datedbDate <= minVal ) return rerankAssignPenalty( score, factor, "pqrdate", "publish date is older than " "minimum value" ); return rerankLowerDemotesMore( score, datedbDate-minVal, maxVal-minVal, factor, "pqrdate", "publish date" ); } // pqrprox // . demote pages by the average distance of query terms from // . one another in the document. Lower score is better. /* rscore_t PostQueryRerank::rerankProximity( rscore_t score, float proximityScore, float maxScore) { // . a -1 implies did not have any query terms // . see Summary.cpp proximity algo if ( proximityScore == -1 ) return 0; if(m_si->m_pqr_demFactProximity <= 0) return score; float factor = (// 1 - (proximityScore/maxScore)) * m_si->m_pqr_demFactProximity; if ( factor <= 0 ) return score; //return rerankAssignPenalty(score, // factor, // "pqrprox", // "proximity rerank"); // just divide the score by the proximityScore now // ...new stuff...
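	// worked example of the division below (illustrative numbers only,
	// added comment): with score = 1000 and proximityScore = 2.5 (the
	// average distance of query terms from one another, per the comment
	// above), the result becomes 1000 / 2.5 + 0.5 = 400.5, truncated to
	// 400 by the cast back to rscore_t; the +0.5 just rounds to nearest.
	// a smaller average distance divides by less, so results whose query
	// terms sit close together are penalized less.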
if ( proximityScore == 0.0 ) return score; float score2 = (float)score; score2 /= proximityScore; score2 += 0.5; rscore_t newScore = (rscore_t)score2; if(m_si->m_debug || g_conf.m_logDebugPQR ) logf( LOG_DEBUG, "query: pqr: result demoted " "from %.02f to %.02f because of proximity rerank", (float)score,(float)newScore); return newScore; } */ // pqrinsec // . demote pages by the average score of the sections // . in which the query terms appear. Higher score is better. rscore_t PostQueryRerank::rerankInSection( rscore_t score, long summaryScore, float maxScore) { if(m_si->m_pqr_demFactInSection <= 0) return score; float factor = ( 1 - (summaryScore/maxScore)) * m_si->m_pqr_demFactInSection; if ( factor <= 0 ) return score; return rerankAssignPenalty(score, factor, "pqrsection", "section rerank"); } /* rscore_t PostQueryRerank::rerankSubPhrase( rscore_t score, float diversity, float maxDiversity) { if(maxDiversity == 0) return score; float factor = (1 - (diversity/maxDiversity)) * m_si->m_pqr_demFactSubPhrase; if ( factor <= 0 ) return score; return rerankAssignPenalty(score, factor, "pqrspd", "subphrase demotion"); } */ bool PostQueryRerank::attemptToCluster ( ) { // find results that should be clustered bool needResort = false; HashTableT hostPosTable; hostPosTable.set(m_numToSort); for (long i = 0; i < m_numToSort; i++) { // look up this hostname to see if it's been clustered uint32_t key = m_m20List[i].m_host; if ( key == 0 ) key = 1; long slot = hostPosTable.getSlot(key); if (slot != -1) { // see if we are within 10 results of the first result // from the same host (but not immediately adjacent to it) long firstPos = hostPosTable.getValueFromSlot(slot); if (i - firstPos > 1 && i - firstPos < 10) { // this result can be clustered rscore_t maxNewScore; maxNewScore = m_m20List[firstPos].m_score; if (maxNewScore <= m_m20List[i].m_score) continue; needResort = true; if(m_si->m_debug||g_conf.m_logDebugPQR ) logf(LOG_DEBUG, "pqr: re-ranking result " "%ld (%s) from score %.02f to " "score %.02f " "in order to cluster it with " "result " "%ld (%s)", i, m_m20List[i].m_m20->m_r->ptr_ubuf, (float)m_m20List[i].m_score, (float)maxNewScore, firstPos, m_m20List[firstPos].m_m20->m_r->ptr_ubuf); // bump up the score to cluster this result m_m20List[i].m_score = maxNewScore; } else { hostPosTable.setValue(slot, i); } } else { // add the hostname of this result to the table if (!hostPosTable.addKey(key, i)) { g_errno = ENOMEM; return false; } } } // re-sort the array if necessary if (needResort) { log(LOG_DEBUG, "pqr: re-sorting results for clustering"); gbmergesort( (void *) m_m20List, (size_t) m_numToSort, (size_t) sizeof(M20List), (int (*)(const void *, const void *))s_reSortFunction); } return true; } // Sort function for post query reranking's M20List static int32_t s_firstSortFunction(const M20List * a, const M20List * b) { // Sort by tier first, then score // When sorting by tier, an explicit match (0x40) in a higher tier // gets precedence over an implicit match (0x20) from a lower tier // Note: don't sort by tier, don't consider bitscores //if ( a->tier < b->tier && // (a->bitScore & 0x40 || !b->bitScore & 0x40) ) // return -1; //if ( a->tier > b->tier && // (b->bitScore & 0x40 || !a->bitScore & 0x40) ) // return 1; // Absolute match proximity //if ( a->m20->m_proximityScore > b->m20->proximityScore ) // return -1; //else if ( a->m20->m_proximityScore < b->m20->proximityScore ) // return 1; // same tier, same proximity, sort by score if ( a->m_score > b->m_score ) return -1; if ( a->m_score < b->m_score ) return 1; // same tier,
same proximity, same score, sort by docid //if ( a->docId < b->docId ) // return -1; //if ( a->docId > b->docId ) // return 1; // same score, sort by host if ( a->m_host > b->m_host ) return -1; if ( a->m_host < b->m_host ) return 1; return 0; } // Sort function for post query reranking's M20List static int32_t s_reSortFunction(const M20List * a, const M20List * b) { // Sort by tier first, then score // When sorting by tier, an explicit match (0x40) in a higher tier // gets precedence over an implicit match (0x20) from a lower tier // Note: don't sort by tier, don't consider bitscores //if ( a->tier < b->tier && // (a->bitScore & 0x40 || !b->bitScore & 0x40) ) // return -1; //if ( a->tier > b->tier && // (b->bitScore & 0x40 || !a->bitScore & 0x40) ) // return 1; // Absolute match proximity //if ( a->m20->m_proximityScore > b->m20->proximityScore ) // return -1; //else if ( a->m20->m_proximityScore < b->m20->proximityScore ) // return 1; // same tier, same proximity, sort by score if ( a->m_score > b->m_score ) return -1; if ( a->m_score < b->m_score ) return 1; // same tier, same proximity, same score, sort by docid //if ( a->docId < b->docId ) // return -1; //if ( a->docId > b->docId ) // return 1; // same score, sort by host if ( a->m_host > b->m_host ) return -1; if ( a->m_host < b->m_host ) return 1; return 0; } #ifdef DEBUGGING_LANGUAGE // Debug stuff, remove before flight static void DoDump(char *loc, Msg20 **m20, long num, score_t *scores, char *tiers) { int x; char *url; //log(LOG_DEBUG, "query: DoDump(): checkpoint %s AWL DEBUG", loc); for(x = 0; x < num; x++) { url = m20[x]->getUrl(); if(!url) url = "None"; //log( LOG_DEBUG, "query: DoDump(%d): " // "tier:%d score:%ld [url:'%s'] msg20:%p\n AWL DEBUG", // x, tiers[x], scores[x], url, m20[x] ); } } #endif // DEBUGGING_LANGUAGE
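
// how attemptToCluster() and s_reSortFunction() interact, with made-up
// numbers for illustration: suppose results 3 and 7 come from the same
// host, result 3 scored 905 and result 7 scored 812. attemptToCluster()
// raises result 7's score to 905 (the score of the earlier result from
// that host), then gbmergesort() re-sorts with s_reSortFunction(). the
// two results now tie on score, and the comparator's host-hash tie-break
// groups equal-score results by host, so the pair ends up adjacent in
// the reranked list, which is the clustering effect the function name
// refers to.
//
// the reranking calls above also lean on helpers such as
// rerankAssignPenalty(), rerankLowerDemotesMore() and
// rerankHigherDemotesMore() that are defined elsewhere (not shown in
// this part of the file). the block below is NOT the real
// implementation, only a minimal sketch of the kind of factor-scaled
// penalty those call sites appear to assume; the real helpers may
// scale, clamp and log differently.
/*
static rscore_t sketchLowerDemotesMore ( rscore_t score  ,
					 long     val    ,
					 long     maxVal ,
					 float    factor ) {
	// assumption: factor is in (0,1]; a val of maxVal (or more) gets
	// no penalty, a val of 0 gets the full penalty, linear in between
	if ( maxVal <= 0  ) return score;
	if ( val    <  0  ) val = 0;
	if ( val > maxVal ) val = maxVal;
	float frac    = 1.0 - (float)val / (float)maxVal;
	float penalty = factor * frac;
	return (rscore_t)( (float)score * ( 1.0 - penalty ) );
}
*/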