// open-source-search-engine/PostQueryRerank.cpp
#include "gb-include.h"
#include "PostQueryRerank.h"
#include "Msg40.h"
#include "LanguageIdentifier.h"
#include "sort.h"
//#include "Thesaurus.h"
//#include "AppendingWordsWindow.h"
//#include "Places.h"
#include "Profiler.h"
#include "CountryCode.h"
#include "Phrases.h"
#include "Linkdb.h"
#define TOTAL_RERANKING_TIME_STR "PostQueryRerank Total Reranking Time"
//#define DEBUGGING_LANGUAGE
// Type for post query reranking weighted sort list
struct M20List {
Msg20 *m_m20;
//long m_score;
rscore_t m_score;
//int m_tier;
long long m_docId;
char m_clusterLevel;
//long m_bitScore;
long m_numCommonInlinks;
uint32_t m_host;
};
static int32_t s_firstSortFunction( const M20List * a, const M20List * b );
static int32_t s_reSortFunction ( const M20List * a, const M20List * b );
#ifdef DEBUGGING_LANGUAGE
static void DoDump(char *loc, Msg20 **m20, long num,
score_t *scores, char *tiers);
#endif
bool PostQueryRerank::init ( ) {
return true;
}
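// . rough call sequence, as inferred from this file (the caller is
// . presumably Msg40):
// .   set1 ( msg40, si )     - decide whether pqr is enabled at all
// .   set2 ( resultsWanted ) - after numDocIds is known; caps, sanity
// .                            checks and the per-result Url array
// .   preRerank ( )          - build the M20List sort array, host and
// .                            dmoz tables and common-inlinker counts
// .   rerank ( )             - apply the per-result demotion factors
// .   postRerank ( )         - sort and write results back into Msg40
// .   rerankFailed ( )       - called instead if reranking was aborted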
PostQueryRerank::PostQueryRerank ( ) {
//log( LOG_DEBUG, "query:in PQR::PQR() AWL" );
m_enabled = false;
m_maxResultsToRerank = 0;
m_numToSort = 0;
m_m20List = NULL;
m_positionList = NULL;
m_msg40 = NULL;
//m_querysLoc = 0;
m_maxUrlLen = 0;
m_pageUrl = NULL;
// initialize so the destructor never frees an uninitialized pointer
// when preRerank() is never reached
m_cvtUrl = NULL;
m_now = time(NULL);
}
PostQueryRerank::~PostQueryRerank ( ) {
//log( LOG_DEBUG, "query:in PQR::~PQR() AWL" );
if ( m_m20List ) {
mfree( m_m20List, sizeof(M20List) * m_maxResultsToRerank,
"PostQueryRerank" );
m_m20List = NULL;
}
if ( m_positionList ) {
mfree( m_positionList, sizeof(long) * m_maxResultsToRerank,
"PQRPosList" );
m_positionList = NULL;
}
if ( m_cvtUrl ) mfree( m_cvtUrl, m_maxUrlLen, "pqrcvtUrl") ;
if ( m_pageUrl ) mfree( m_pageUrl, sizeof(Url)*m_maxResultsToRerank,
"pqrpageUrls" );
}
// returns false on error
bool PostQueryRerank::set1 ( Msg40 *msg40, SearchInput *si ) {
//log(LOG_DEBUG, "query:in PQR::set1(%p) AWL", msg40);
m_msg40 = msg40;
m_si = si;
if ( ! m_msg40 ) return false;
if ( ! m_si ) return false;
if ( ! m_si->m_cr ) return false;
m_enabled = (m_si->m_docsToScanForReranking > 1);
//log( LOG_DEBUG, "query: m_isEnabled:%ld; "
// "P_docsToScanForReranking:%ld P_pqr_docsToSan:%ld; AWL",
// (long)m_enabled,
// m_si->m_docsToScanForReranking,
// m_si->m_cr->m_pqr_docsToScan );
return m_enabled;
}
// must be called sometime after we know numDocIds and before preRerank
// returns false if we shouldn't rerank
bool PostQueryRerank::set2 ( long resultsWanted ) {
//log(LOG_DEBUG, "query:in PQR::set2() AWL");
//log( LOG_DEBUG, "query: firstResultNum:%ld; numResults:%ld; "
// "wanted:%ld numMsg20s:%ld AWL",
// m_msg40->getFirstResultNum(), m_msg40->getNumResults(),
// resultsWanted, m_msg40->m_numMsg20s );
// we only want to check the lesser of docsToScan and numDocIds
m_maxResultsToRerank = m_si->m_docsToScanForReranking;
if ( m_maxResultsToRerank > m_msg40->getNumDocIds() ) {
m_maxResultsToRerank = m_msg40->getNumDocIds();
log( LOG_DEBUG, "pqr: request to rerank more results "
"than the number of docids, capping number to rerank "
"at %ld", m_maxResultsToRerank );
}
// if clustering/deduping left us with fewer results than docids
// and fewer than the number of results wanted then ...
if ( m_msg40->getNumResults() < m_msg40->getNumDocIds() &&
m_msg40->getNumResults() < resultsWanted )
return false;
// are we past pqr's range?
if ( m_msg40->getFirstResultNum() > m_maxResultsToRerank )
return false;
// Safety check, make sure we never rerank more results
// than we have Msg20s
if ( m_msg40->m_numMsg20s < m_maxResultsToRerank )
m_maxResultsToRerank = m_msg40->m_numMsg20s;
//log( LOG_DEBUG, "query: m_maxResultsToRerank:%ld AWL",
// m_maxResultsToRerank );
if ( m_maxResultsToRerank < 2 ) {
//log( LOG_INFO, "pqr: too few results to rerank" );
return false;
}
if ( m_maxResultsToRerank > 250 ) {
log( LOG_INFO, "pqr: too many results to rerank, "
"capping at 250" );
m_maxResultsToRerank = 250;
}
// see if we are done
if ( m_msg40->getFirstResultNum() >= m_maxResultsToRerank ) {
log( LOG_INFO, "pqr: first result is higher than max "
"results to rerank" );
return false;
}
// get space for host count table
m_hostCntTable.set( m_maxResultsToRerank );
// get some space for dmoz table
m_dmozTable.set( m_maxResultsToRerank << 1 );
// alloc urls for pqrqttiu, pqrfsh and clustering
m_pageUrl = (Url *)mcalloc( sizeof(Url)*m_maxResultsToRerank,
"pqrpageUrls" );
if ( ! m_pageUrl ) {
log("pqr: had out of memory error");
return false;
}
return true;
}
// sets up PostQueryRerank for each page in m_maxResultsToRerank
// returns false on error
bool PostQueryRerank::preRerank ( ) {
//if ( g_conf.m_profilingEnabled ) {
// g_profiler
// .startTimer((long)(this->*(&PostQueryRerank::rerank)),
// TOTAL_RERANKING_TIME_STR );
//}
//log( LOG_DEBUG, "query:in PQR::preRerank() AWL" );
#ifdef DEBUGGING_LANGUAGE
DoDump( "Presort", m_msg40->m_msg20, m_maxResultsToRerank,
m_msg40->m_msg3a.m_scores, NULL);//m_msg40->m_msg3a.m_tiers );
#endif
if( m_si->m_enableLanguageSorting
&& !m_si->m_langHint )
log( LOG_INFO, "pqr: no language set for sort. "
"language will not be reranked" );
SANITYCHECK( ! m_m20List );
m_m20List = (M20List*)mcalloc( sizeof(M20List) * m_maxResultsToRerank,
"PostQueryRerank" );
if( ! m_m20List ) {
log( LOG_INFO, "pqr: Could not allocate PostQueryRerank "
"sort memory.\n" );
g_errno = ENOMEM;
return(false);
}
SANITYCHECK( ! m_positionList );
m_positionList = (long *)mcalloc( sizeof(long) * m_maxResultsToRerank,
"PQRPosList" );
if( ! m_positionList ) {
log( LOG_INFO, "pqr: Could not allocate PostQueryRerank "
"postion list memory.\n" );
g_errno = ENOMEM;
return(false);
}
//log(LOG_DEBUG, "pqr: the query is '%s' AWL", m_si->m_q->m_orig);
// setup for rerankNonLocationSpecificQueries if enabled
//if ( ! preRerankNonLocationSpecificQueries() )
// return false;
// . make a temp hash table for iptop
// . each slot is a long key and a long value
HashTable ipTable;
// how many slots
long numSlots = 5000 / ((4+4)*4);
char tmp[5000];
// this should NEVER need to allocate, UNLESS for some reason we got
// a ton of inlinking ips
if ( ! ipTable.set ( numSlots , tmp , 5000 ) ) return false;
// this table maps a docid to the number of search results it links to
HashTableT <long long, long> inlinkTable;
char tmp2[5000];
long numSlots2 = 5000 / ((8+4)*4);
if ( ! inlinkTable.set ( numSlots2 , tmp2 , 5000 ) ) return false;
// Fill sort array
long y = 0;
for( long x = 0;
x < m_msg40->m_numMsg20s && y < m_maxResultsToRerank;
x++ ) {
// skip clustered out results
char clusterLevel = m_msg40->getClusterLevel( x );
if ( clusterLevel != CR_OK ) {
//log( LOG_DEBUG, "pqr: skipping result "
// "%ld since cluster level(%ld) != "
// "CR_OK(%ld) AWL",
// x, (long)clusterLevel, (long)CR_OK );
continue;
}
// skip results that don't match all query terms
//long bitScore = m_msg40->getBitScore( x );
//if ( bitScore == 0x00 ) continue;
// . save position of this result so we can fill it in later
// with (possibly) a higher ranking result
m_positionList[y] = x;
M20List *sortArrItem = &m_m20List [ y ];
sortArrItem->m_clusterLevel = clusterLevel ;
sortArrItem->m_m20 = m_msg40->m_msg20 [ x ];
sortArrItem->m_score = (rscore_t)m_msg40->getScore(x);
//sortArrItem->m_tier = m_msg40->getTier ( x );
sortArrItem->m_docId = m_msg40->getDocId ( x );
//sortArrItem->m_bitScore = bitScore ;
sortArrItem->m_host = 0; // to be filled in later
Msg20 *msg20 = sortArrItem->m_m20;
SANITYCHECK( msg20 && ! msg20->m_errno );
Msg20Reply *mr = msg20->m_r;
// set the urls for each page
// used by pqrqttiu, pqrfsh and clustering
m_pageUrl[y].set( mr->ptr_ubuf , false );
// now fill in host without the 'www.' if present
char *host = m_pageUrl[y].getHost();
long hostLen = m_pageUrl[y].getHostLen();
if (hostLen > 4 &&
host[3] == '.' &&
host[0] == 'w' && host[1] == 'w' && host[2] == 'w')
sortArrItem->m_host = hash32(host+4, hostLen-4);
else
sortArrItem->m_host = hash32(host, hostLen);
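// e.g. "www.ibm.com" and "ibm.com" both yield hash32("ibm.com"), so
// the two spellings of the same host get the same m_host value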
// add its inlinking docids into the hash table, inlinkTable
LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks;
//long n = msg20->getNumInlinks ();
//long long *docIds = msg20->getInlinkDocIds ();
//char *flags = msg20->getInlinkFlags ();
//long *ips = msg20->getInlinkIps ();
//char *qualities = msg20->getInlinkQualities ();
// skip adding the inlinking docids if search result has bad ip
long ip = mr->m_ip;//msg20->getIp();
bool good = true;
if ( ip == 0 ) good = false;
if ( ip == -1 ) good = false;
// . skip adding inlinkers if we already did this "ip top"
// . "ip top" is the most significant 3 bytes of the ip
// . get the ip of the docid:
long top = iptop ( ip );
// if we already encountered a higher-scoring search result
// with the same iptop, do not count its inlinkers!
// so that if an inlinker links to two docids in the search
// results, where those two docids are from the same
// "ip top" then the docid is only "counted" once here.
if ( ipTable.getSlot ( top ) >= 0 ) good = false;
// not allowed to be 0
if ( top == 0 ) top = 1;
// now add to table so we do not add the inlinkers from
// any other search results from the same "ip top"
if ( ! ipTable.addKey ( top , 1 ) ) return false;
// now hash all the inlinking docids into inlinkTable
for ( Inlink *k=NULL; good && (k=info->getNextInlink(k) ) ; ) {
// skip it if it is link spam
if ( k->m_isLinkSpam ) continue;
// must be quality of 35 or higher to "vote"
//if ( k->m_docQuality < 35 ) continue;
if ( k->m_siteNumInlinks < 20 ) continue;
// skip if bad ip for inlinker
if ( k->m_ip == 0 || k->m_ip == -1 ) continue;
// skip if inlinker has same top ip as search result
if ( iptop(k->m_ip) == top ) continue;
// get the current slot in table from docid of inlinker
long slot = inlinkTable.getSlot ( k->m_docId );
// get the score
if ( slot >= 0 ) {
long count=inlinkTable.getValueFromSlot(slot);
inlinkTable.setValue ( slot , count + 1 );
continue;
}
// add it fresh if not already in there
if (!inlinkTable.addKey(k->m_docId,1)) return false;
}
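// once the outer loop has visited every result, inlinkTable maps each
// qualifying inlinker docid to the number of search results it links
// to, counting at most one result per "ip top"; m_numCommonInlinks
// below is derived from these counts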
//log( LOG_DEBUG, "pqr: pre: setting up sort array - "
// "mapping x:%ld to y:%ld; "
// "url:'%s' (%ld); tier:%d; score:%ld; "
// "docId:%lld; clusterLevel:%d; AWL",
// x, y,
// msg20->getUrl(), msg20->getUrlLen(),
// sortArrItem->tier, sortArrItem->score,
// sortArrItem->docId, sortArrItem->clusterLevel );
// setup reranking for pages from the same host (pqrfsd)
if ( ! preRerankOtherPagesFromSameHost( &m_pageUrl[y] ))
return false;
// setup reranking for pages with common topics in dmoz (pqrctid)
if ( ! preRerankCommonTopicsInDmoz( mr ) )
return false;
// . calculate maximum url length in pages for reranking
// by query terms or topics in a url
long urlLen = mr->size_ubuf - 1;//msg20->getUrlLen();
if ( urlLen > m_maxUrlLen )
m_maxUrlLen = urlLen;
// update num to rerank and sort
m_numToSort++;
y++;
}
// get the max
m_maxCommonInlinks = 0;
// how many of OUR inlinkers are shared by other results?
for ( long i = 0; i < m_numToSort; i++ ) {
// get the item
M20List *sortArrItem = &m_m20List [ i ];
Msg20 *msg20 = sortArrItem->m_m20;
// reset
sortArrItem->m_numCommonInlinks = 0;
// lookup its inlinking docids in the hash table
//long n = msg20->getNumInlinks ();
//long long *docIds = msg20->getInlinkDocIds ();
LinkInfo *info = (LinkInfo *)msg20->m_r->ptr_linkInfo;
for ( Inlink *k=NULL;info&&(k=info->getNextInlink(k)) ; ) {
// how many search results does this inlinker link to?
long*v=(long *)inlinkTable.getValuePointer(k->m_docId);
if ( ! v ) continue;
// if only 1 result had this as an inlinker, skip it
if ( *v <= 1 ) continue;
// ok, give us a point
sortArrItem->m_numCommonInlinks++;
}
// get the max
if ( sortArrItem->m_numCommonInlinks > m_maxCommonInlinks )
m_maxCommonInlinks = sortArrItem->m_numCommonInlinks;
}
// . setup reranking for query terms or topics in url (pqrqttiu)
// . add space to max url length for terminating NULL and allocate
// room for max length
m_maxUrlLen++;
m_cvtUrl = (char *)mmalloc( m_maxUrlLen, "pqrcvtUrl" );
if ( ! m_cvtUrl ) {
log( LOG_INFO, "pqr: Could not allocate %ld bytes "
"for m_cvtUrl.",
m_maxUrlLen );
g_errno = ENOMEM;
return false;
}
// Safety valve, trim sort results
if ( m_numToSort > m_maxResultsToRerank )
m_numToSort = m_maxResultsToRerank;
//log( LOG_DEBUG, "pqr::m_numToSort:%ld AWL", m_numToSort );
return true;
}
// perform actual reranking of m_numToSort pages
// returns false on error
bool PostQueryRerank::rerank ( ) {
//log(LOG_DEBUG,"query:in PQR::rerank() AWL");
if(m_si->m_debug||g_conf.m_logDebugPQR )
logf( LOG_DEBUG, "pqr: reranking %ld results",
m_numToSort );
/*
float maxDiversity = 0;
if(m_si->m_pqr_demFactSubPhrase > 0) {
for ( long x = 0; x < m_numToSort; x++ ) {
M20List *sortArrItem = &m_m20List [ x ];
Msg20 *msg20 = sortArrItem->m_m20;
if ( ! msg20 || msg20->m_errno ) continue;
float d = msg20->m_r->m_diversity;
if(d > maxDiversity) maxDiversity = d;
}
}
float maxProximityScore = 0;
float minProximityScore = -1.0;
//float maxInSectionScore = 0;
if(m_si->m_pqr_demFactProximity > 0 ||
m_si->m_pqr_demFactInSection > 0) {
//grab the max score so that we know what the max to
//demote is.
for ( long x = 0; x < m_numToSort; x++ ) {
M20List *sortArrItem = &m_m20List [ x ];
Msg20 *msg20 = sortArrItem->m_m20;
if ( ! msg20 || msg20->m_errno ) continue;
//float d = msg20->m_r->m_inSectionScore;
//if(d > maxInSectionScore)
// maxInSectionScore = d;
// handle proximity
float d = msg20->m_r->m_proximityScore;
// i think this means it does not have all the query
// terms! for 'sylvain segal' we got
// www.regalosdirectos.tv/asp2/comparar.asp?cat=36
// in results
if ( d == 0.0 ) continue;
// . -1 is a bogus proximity
// . it means we were not able to find all the terms
// because they were in anomalous link text or
// meta tags or select tags or whatever... so for
// now such results will not be demoted to be on the
// safe side
if ( d == -1.0 ) continue;
if ( d > maxProximityScore )
maxProximityScore = d;
if ( d < minProximityScore || minProximityScore==-1.0 )
minProximityScore = d;
}
}
*/
// rerank weighted sort list
for ( register long x = 0; x < m_numToSort; x++ ) {
M20List *sortArrItem = &m_m20List [ x ];
Msg20 *msg20 = sortArrItem->m_m20;
char *url = NULL;
rscore_t score = sortArrItem->m_score;
rscore_t startScore = score;
// mwells: what is this?
if(m_si->m_pqr_demFactOrigScore < 1) {
//turn off the indexed score and just use a uniform start score
//because I can't get the proximity pqr to overwhelm the
//preexisting score.
score = 1000000 + (m_numToSort - x) +
(long)(score * m_si->m_pqr_demFactOrigScore);
startScore = score;
}
// if we don't have a good msg20, skip reranking for this result
if ( ! msg20 || msg20->m_errno )
continue;
url = msg20->m_r->ptr_ubuf;//getUrl();
if ( ! url ) url = "(none)";
if(m_si->m_debug||g_conf.m_logDebugPQR )
logf(LOG_DEBUG, "pqr: result #%ld:'%s' has initial "
"score of %.02f",
x, url, (float)startScore );
// resets
msg20->m_pqr_old_score = score;
msg20->m_pqr_factor_quality = 1.0;
msg20->m_pqr_factor_diversity = 1.0;
msg20->m_pqr_factor_inlinkers = 1.0;
msg20->m_pqr_factor_proximity = 1.0;
msg20->m_pqr_factor_ctype = 1.0;
msg20->m_pqr_factor_lang = 1.0; // includes country
Msg20Reply *mr = msg20->m_r;
// demote for language and country
score = rerankLanguageAndCountry( score,
mr->m_language ,
mr->m_summaryLanguage,
mr->m_country, // id
msg20 );
// demote for content-type
float htmlFactor = m_si->m_cr->m_pqr_demFactNonHtml;
float xmlFactor = m_si->m_cr->m_pqr_demFactXml;
long contentType= mr->m_contentType;
if ( contentType == CT_XML && xmlFactor > 0 ) {
score = score * xmlFactor;
msg20->m_pqr_factor_ctype = xmlFactor;
}
else if ( contentType != CT_HTML && htmlFactor > 0 ) {
score = score * htmlFactor;
msg20->m_pqr_factor_ctype = htmlFactor;
}
//if ( score == 1 ) goto finishloop;
// demote for fewer query terms or gigabits in url
//score = rerankQueryTermsOrGigabitsInUrl( score,
// &m_pageUrl[x] );
// . demote for not high quality
// . multiply by "qf" for every quality point below 100
// . now we basically do this if we have a wiki title
// . float qf = m_si->m_cr->m_pqr_demFactQual;
/*
if ( m_msg40->m_msg3a.m_oneTitle ) {
//long q = msg20->getQuality();
long sni = mr->m_siteNumInlinks;
if ( sni <= 0 ) sni = 1;
float weight = 1.0;
for ( ; sni < 100000 ; sni *= 2 )
weight = weight * 0.95;
// apply the weight to the score
score = score * weight;
// store that for print in PageResults.cpp
msg20->m_pqr_factor_quality = weight;
}
*/
// demote for more paths in url
score = rerankPathsInUrl( score,
msg20->m_r->ptr_ubuf,//getUrl(),
msg20->m_r->size_ubuf-1 );
// demote if the smallest cat id has a lot of super topics
score = rerankSmallestCatIdHasSuperTopics( score,
msg20 );
// demote for larger page sizes
score = rerankPageSize( score,
msg20->m_r->m_contentLen );
// . demote for non location specific queries that have
// an obvious location in gigabits or url
//score = rerankNonLocationSpecificQueries( score,
// msg20 );
//if ( score == 1 ) goto finishloop;
// demote for no cat id
score = rerankNoCatId( score,
msg20->m_r->size_catIds/4,
msg20->m_r->size_indCatIds/4);
// demote for no other pages from same host
score = rerankOtherPagesFromSameHost( score,
&m_pageUrl[x] );
// demote for fewer common topics in dmoz
score = rerankCommonTopicsInDmoz( score,
msg20 );
// . demote for pages whose dmoz category names do not
// contain a query term
//score = rerankDmozCategoryNamesDontHaveQT( score,
// msg20 );
// . demote for pages whose dmoz category names do not
// contain a gigabit
//score = rerankDmozCategoryNamesDontHaveGigabits( score,
// msg20 );
// . demote pages for older datedb dates
score = rerankDatedbDate( score,
msg20->m_r->m_datedbDate );
/*
// . demote pages by proximity
// . a -1 prox implies did not have any query terms
// . see Summary.cpp proximity algo
float ps = msg20->m_r->m_proximityScore;//getProximityScore();
if ( ps > 0.0 &&
m_si->m_pqr_demFactProximity > 0 &&
minProximityScore != -1.0 ) {
// what percent were we of the max?
float factor = minProximityScore / ps ;
// this can be weighted
//factor *= m_si->m_pqr_demFactProximity;
// apply the factor to the score
score *= factor;
// this is the factor
msg20->m_pqr_factor_proximity = factor;
}
// . demote pages by the average of the scores of the
// . terms based upon what section of the doc they are in
// . mdw: proximity algo should obsolete this
//if(maxInSectionScore > 0)
// score = rerankInSection( score,
// msg20->getInSectionScore(),
// maxInSectionScore);
// . demote pages which only have the query as a part of a
// . larger phrase
if ( maxDiversity != 0 ) {
float diversity = msg20->m_r->m_diversity;
float df = (1 - (diversity/maxDiversity)) *
m_si->m_pqr_demFactSubPhrase;
score = (rscore_t)(score * (1.0 - df));
if ( score <= 0.0 ) score = 0.001;
msg20->m_pqr_factor_diversity = 1.0 - df;
}
*/
// . COMMON INLINKER RERANK
// . no need to create a superfluous function call here
// . demote pages that do not share many inlinking docids
// with other pages in the search results
if ( m_maxCommonInlinks>0 && m_si->m_pqr_demFactCommonInlinks){
long nc = sortArrItem->m_numCommonInlinks ;
float penalty;
// the more inlinkers, the less the penalty
penalty = 1.0 -(((float)nc)/(float)m_maxCommonInlinks);
// . reduce the penalty for higher quality pages
// . they are the most likely to have their inlinkers
// truncated
//char quality = msg20->getQuality();
float sni = (float)msg20->m_r->m_siteNumInlinks;
// decrease penalty for really high quality docs
//while ( quality-- > 60 ) penalty *= .95;
for ( ; sni > 1000 ; sni *= .80 ) penalty *= .95;
// if this parm is 0, penalty will become 0
penalty *= m_si->m_pqr_demFactCommonInlinks;
// apply the penalty to the score
score = score * (1.0 - penalty);
// do not decrease all the way to 0!
if ( score <= 0.0 ) score = 0.001;
// store it!
msg20->m_pqr_factor_inlinkers = 1.0 - penalty;
}
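// example of the arithmetic above, using a hypothetical parm value:
// with m_maxCommonInlinks = 4, nc = 1 shared inlinker and
// m_pqr_demFactCommonInlinks = 0.5, penalty = (1 - 1/4) * 0.5 = 0.375
// and the score is multiplied by 0.625 (ignoring the sni reduction,
// which only applies above 1000 site inlinks)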
// finishloop:
if(m_si->m_debug || g_conf.m_logDebugPQR )
logf( LOG_DEBUG, "pqr: result #%ld's final "
"score is %.02f (-%3.3f%%) ",
x, (float)score,100-100*(float)score/startScore );
sortArrItem->m_score = score;
}
return(true);
}
// perform post reranking tasks
// returns false on error
bool PostQueryRerank::postRerank ( ) {
//log( LOG_DEBUG, "query:in PQR::postRerank() AWL" );
// Hopefully never happen...
//log( LOG_DEBUG, "query: just before sort: "
// "m_maxResultsToRerank:%ld m_numToSort:%ld AWL",
// m_maxResultsToRerank, m_numToSort);
if ( m_numToSort < 0 ) return false;
// Sort the array
gbmergesort( (void *) m_m20List, (size_t) m_numToSort,
(size_t) sizeof(M20List),
(int (*)(const void *, const void *))s_firstSortFunction);
// move 2nd result from a particular domain to just below the first
// result from that domain if it is within 10 results of the first
//XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX put this back in after debugging summary rerank!
//XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
//if (!attemptToCluster()) return false;
// Fill result arrays with our reranked results
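// m_positionList[y] remembers which original slot x the y-th sort
// item came from (see preRerank), so reranked results are written
// back into exactly the slots the non-clustered results occupied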
for( long y = 0; y < m_numToSort; y++ ) {
M20List *a = &m_m20List [ y ];
long x = m_positionList [ y ];
m_msg40->m_msg20 [ x ] = a->m_m20;
//m_msg40->m_msg3a.m_tiers [ x ] = a->m_tier;
m_msg40->m_msg3a.m_scores [ x ] = a->m_score;
m_msg40->m_msg3a.m_docIds [ x ] = a->m_docId;
m_msg40->m_msg3a.m_clusterLevels [ x ] = a->m_clusterLevel;
//log( LOG_DEBUG, "pqr: post: mapped y:%ld "
// "to x:%ld AWL",
// y, x );
}
#ifdef DEBUGGING_LANGUAGE
DoDump( "Postsort", m_msg40->m_msg20, m_numToSort,
m_msg40->m_msg3a.m_scores, NULL );//m_msg40->m_msg3a.m_tiers );
#endif
//if ( ! g_conf.m_profilingEnabled ) return true;
//if ( ! g_profiler.endTimer( (long)(this->*(&PostQueryRerank::rerank)),
// TOTAL_RERANKING_TIME_STR) )
// log( LOG_WARN,"admin: Couldn't add the fn %li",
// (long)(this->*(&PostQueryRerank::rerank)) );
return true;
}
// called if we weren't able to rerank for some reason
void PostQueryRerank::rerankFailed ( ) {
//if ( g_conf.m_profilingEnabled ) {
// if( ! g_profiler
// .endTimer( (long)(this->*(&PostQueryRerank::rerank)),
// TOTAL_RERANKING_TIME_STR) )
// log(LOG_WARN,"admin: Couldn't add the fn %li",
// (long)(this->*(&PostQueryRerank::rerank)));
//}
}
// lsort (pqrlang, pqrlangunk, pqrcntry)
// rerank for language, then country
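// . decision ladder, in code order:
// .   lsort off, or no wanted language known    -> no change
// .   page language unknown (and the unknown-
// .   language weight parm is > 0)              -> pqrlangunk penalty
// .   summary language unknown                  -> no change
// .   summary language differs from wanted one  -> pqrlang penalty
// .   languages match but the page country and
// .   query country differ and share at least
// .   one written language                      -> pqrcntry penalty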
rscore_t PostQueryRerank::rerankLanguageAndCountry ( rscore_t score,
uint8_t lang,
uint8_t summaryLang,
uint16_t country ,
Msg20 *msg20 ) {
//log( LOG_DEBUG, "query:in PQR::rerankLanguageAndCountry("
// "score:%ld, lang:%ld, summLang:%ld, country:%ld)"
// "[langSortingIsOn:%ld; langUnkWeight:%3.3f; langWeight:%3.3f; "
// "&qlang=%ld; &lang=%ld; "
// "&qcountry=%ld; &gbcountry=%ld; "
// "queryLangs:%lld; pageLangs:%lld] AWL",
// score, (long)lang, (long)summaryLang, (long)country,
// (long)m_si->m_enableLanguageSorting,
// m_si->m_languageUnknownWeight,
// m_si->m_languageWeightFactor,
// (long)m_si->m_langHint,
// (long)m_si->m_language,
// (long)m_si->m_countryHint,
// (long)m_si->m_country,
// g_countryCode.getLanguagesWritten( m_si->m_countryHint ),
// g_countryCode.getLanguagesWritten( country ) );
// if lsort is off, skip
if ( ! m_si->m_enableLanguageSorting ) return score;
// . use query language (si->m_langHint) or restricted search
// language (si->m_language)
// . if both are 0, don't rerank by language
uint8_t langWanted = m_si->m_langHint;
if ( langWanted == langUnknown ) langWanted = m_si->m_queryLang;//language;
if ( langWanted == langUnknown ) return score;
// . apply score factors for unknown languages, iff reranking unknown
// languages
if ( lang == langUnknown &&
m_si->m_languageUnknownWeight > 0 ) {
msg20->m_pqr_factor_lang =m_si->m_languageUnknownWeight;
return rerankAssignPenalty(score,
m_si->m_languageUnknownWeight,
"pqrlangunk",
"it's language is unknown" );
}
// . if computed language is unknown, don't penalize
// . no, what if from a different country?
if ( summaryLang == langUnknown ) return score;
// . first, apply score factors for non-preferred summary languages
// that don't match the page language
if ( summaryLang != langUnknown && summaryLang != langWanted ) {
msg20->m_pqr_factor_lang = m_si->m_languageWeightFactor;
return rerankAssignPenalty( score,
m_si->m_languageWeightFactor,
"pqrlang",
"it's summary/title "
"language is foreign" );
}
// second, apply score factors for non-preferred page languages
//if ( lang != langWanted )
// return rerankAssignPenalty( score,
// m_si->m_languageWeightFactor,
// "pqrlang",
// "it's page language is foreign" );
// . if we got here languages of query and page match and are not
// unknown, so rerank based on country
// . don't demote if countries match or either the search country
// or page country is unknown (0)
// . default country wanted to gbcountry parm if not specified
uint8_t countryWanted = m_si->m_countryHint;
// SearchInput sets m_country based on the IP address of the incoming
// query, which is often wrong, especially for internal 10.x.y.z ips.
// so just fall back to countryHint for now because that uses the
// default country... right now set to "us" in search controls page.
if ( countryWanted == 0 ) countryWanted = m_si->m_country;
if ( country == 0 || countryWanted == 0 ||
country == countryWanted )
return score;
// . now, languages match and are not unknown and countries don't
// match and neither is unknown
// . so, demote if country of query speaks the same language as
// country of page, ie US query and UK or AUS page (since all 3
// places speak english), but not US query and IT page
uint64_t qLangs = g_countryCode.getLanguagesWritten( countryWanted );
uint64_t pLangs = g_countryCode.getLanguagesWritten( country );
// . if no language written by query country is written by page
// country, don't penalize
if ( (uint64_t)(qLangs & pLangs) == (uint64_t)0LL ) return score;
msg20->m_pqr_factor_lang = m_si->m_cr->m_pqr_demFactCountry;
// countries do share at least one language - demote!
return rerankAssignPenalty( score,
m_si->m_cr->m_pqr_demFactCountry,
"pqrcntry",
"it's language is the same as that of "
"of the query, but it is from a country "
"foreign to that of the query which "
"writes in at least one of the same "
"languages" );
}
// pqrqttiu
// . look for query terms and gigabits in the url, demote more the fewer
// are matched.
/*
rscore_t PostQueryRerank::rerankQueryTermsOrGigabitsInUrl( rscore_t score,
Url *pageUrl ) {
//log( LOG_DEBUG, "query:in PQR::rerankQueryTermsOrGigabitsInUrl("
// "score:%ld, url:'%s', urlLen:%ld)"
// "[factor:%3.3f; max:%ld] AWL",
// score, pageUrl->getUrl(), pageUrl->getUrlLen(),
// m_si->m_cr->m_pqr_demFactQTTopicsInUrl,
// m_si->m_cr->m_pqr_maxValQTTopicsInUrl );
if ( pageUrl->getUrlLen() == 0 ) return score;
float factor = m_si->m_cr->m_pqr_demFactQTTopicsInUrl;
if ( factor <= 0 ) return score; // disables
long maxQTInUrl = m_si->m_q->getNumTerms();
long maxGigabitsInUrl = m_msg40->getNumTopics();
long maxVal = m_si->m_cr->m_pqr_maxValQTTopicsInUrl;
if ( maxVal < 0 ) maxVal = maxQTInUrl+maxGigabitsInUrl;
// from original url:
// . remove scheme
// . remove 'www' from host
// . remove tld
// . remove ext
// . convert symbols to spaces
// . remove extra space
//log( LOG_DEBUG, "query: origurl:'%s' AWL", pageUrl->getUrl() );
//log( LOG_DEBUG, "query: url: whole:'%s' host:'%s' (%ld); "
// "domain:'%s' (%ld); tld:'%s' (%ld); midDom:'%s' (%ld); "
// "path:'%s' (%ld); fn:'%s'; ext:'%s'; query:'%s' (%ld); "
// "ipStr:'%s' {%ld}; anch:'%s' (%ld) "
// "site:'%s' (%ld) AWL",
// pageUrl->getUrl(),
// pageUrl->getHost(), pageUrl->getHostLen(),
// pageUrl->getDomain(), pageUrl->getDomainLen(),
// pageUrl->getTLD(), pageUrl->getTLDLen(),
// pageUrl->getMidDomain(), pageUrl->getMidDomainLen(),
// pageUrl->getPath(), pageUrl->getPathLen(),
// pageUrl->getFilename(), pageUrl->getExtension(),
// pageUrl->getQuery(), pageUrl->getQueryLen(),
// pageUrl->getIpString(), pageUrl->getIp(),
// pageUrl->getAnchor(), pageUrl->getAnchorLen(),
// pageUrl->getSite(), pageUrl->getSiteLen() );
m_cvtUrl[0] = '\0';
long cvtUrlLen = 0;
char *host = pageUrl->getHost();
// first, add hostname - "www." iff it is not an ip addr
if ( pageUrl->getIp() == 0 ) {
if ( host[0] == 'w' && host[1] == 'w' && host[2] == 'w' &&
host[3] == '.' ) {
// if starts with 'www.', don't add the 'www.'
if(pageUrl->getHostLen()-pageUrl->getDomainLen() == 4){
// add domain - 'www.' - tld
strncpy( m_cvtUrl, pageUrl->getDomain(),
pageUrl->getDomainLen() -
pageUrl->getTLDLen() );
cvtUrlLen += pageUrl->getDomainLen() -
pageUrl->getTLDLen();
m_cvtUrl[cvtUrlLen] = '\0';
}
else {
// add host + domain - 'www.' - tld
strncpy( m_cvtUrl, pageUrl->getHost()+4,
pageUrl->getHostLen() -
pageUrl->getTLDLen() - 4 );
cvtUrlLen += pageUrl->getHostLen() -
pageUrl->getTLDLen() - 4;
m_cvtUrl[cvtUrlLen] = '\0';
}
}
else {
// add host + domain - tld
strncpy( m_cvtUrl, pageUrl->getHost(),
pageUrl->getHostLen() -
pageUrl->getTLDLen() - 1 );
cvtUrlLen += pageUrl->getHostLen() -
pageUrl->getTLDLen() - 1;
m_cvtUrl[cvtUrlLen] = '\0';
}
}
// next, add path
if ( pageUrl->getPathLen() > 0 ) {
strncat( m_cvtUrl, pageUrl->getPath(),
pageUrl->getPathLen()-pageUrl->getExtensionLen() );
cvtUrlLen += pageUrl->getPathLen()-pageUrl->getExtensionLen();
m_cvtUrl[cvtUrlLen] = '\0';
}
// next, add query
if ( pageUrl->getQueryLen() > 0 ) {
strncat( m_cvtUrl, pageUrl->getQuery(), pageUrl->getQueryLen() );
cvtUrlLen += pageUrl->getQueryLen();
m_cvtUrl[cvtUrlLen] = '\0';
}
// remove all non-alpha-numeric chars
char *t = m_cvtUrl;
for ( char *s = m_cvtUrl; *s; s++ ) {
if ( is_alnum_a(*s) ) *t++ = *s;
else if ( t>m_cvtUrl && *(t-1) != ' ' ) *t++ = ' ';
}
*t = '\0';
cvtUrlLen = (t-m_cvtUrl);
//log( LOG_DEBUG, "query: m_cvtUrl:'%s' (%ld) AWL",
// m_cvtUrl, cvtUrlLen );
// find number of query terms in url
long numQTInUrl = 0;
long numQTs = m_si->m_q->getNumTerms();
for ( long i = 0; i < numQTs; i++ ) {
char *qtStr = m_si->m_q->getTerm(i);
long qtLen = m_si->m_q->getTermLen(i);
if ( strncasestr(m_cvtUrl, qtStr, cvtUrlLen, qtLen) != NULL ) {
numQTInUrl++;
//log( LOG_DEBUG, "query: qt is in url AWL");
}
}
// find number of gigabits in url
long numGigabitsInUrl = 0;
long numTopics = m_msg40->getNumTopics();
for ( long i = 0; i < numTopics; i++ ) {
char *topicStr = m_msg40->getTopicPtr(i);
long topicLen = m_msg40->getTopicLen(i);
if ( strncasestr(m_cvtUrl, topicStr, cvtUrlLen, topicLen) ) {
numGigabitsInUrl++;
//log( LOG_DEBUG, "query: topic is in url AWL");
}
}
//log( LOG_DEBUG, "query: qts:%ld, gigabits:%ld; "
// "maxQTInUrl:%ld, maxGbInUrl:%ld AWL",
// numQTInUrl, numGigabitsInUrl,
// maxQTInUrl, maxGigabitsInUrl );
return rerankLowerDemotesMore( score,
numQTInUrl+numGigabitsInUrl,
maxVal,
factor,
"pqrqttiu",
"query terms or topics in its url" );
}
*/
// pqrqual
// demote pages that are not high quality
/*
rscore_t PostQueryRerank::rerankQuality ( rscore_t score,
unsigned char quality ) {
//log( LOG_DEBUG, "query:in PQR::rerankQuality("
// "score:%ld, quality:%d)"
// "[P_factor:%3.3f; P_max:%ld] AWL",
// score, (int)quality,
// m_si->m_cr->m_pqr_demFactQual,
// m_si->m_cr->m_pqr_maxValQual );
float factor = m_si->m_cr->m_pqr_demFactQual;
if ( factor <= 0 ) return score;
long maxVal = m_si->m_cr->m_pqr_maxValQual;
if ( maxVal < 0 ) maxVal = 100;
return rerankLowerDemotesMore( score, quality, maxVal, factor,
"pqrqual", "quality" );
}
*/
// pqrpaths
// demote pages that are not root or have many paths in the url
rscore_t PostQueryRerank::rerankPathsInUrl ( rscore_t score,
char *url,
long urlLen ) {
//log( LOG_DEBUG, "query:in PQR::rerankPathsInUrl("
// "score:%ld, url:%s)"
// "[P_factor:%3.3f; P_max:%ld] AWL",
// score, url,
// m_si->m_cr->m_pqr_demFactPaths,
// m_si->m_cr->m_pqr_maxValPaths );
if ( urlLen == 0 ) return score;
float factor = m_si->m_cr->m_pqr_demFactPaths;
if ( factor <= 0 ) return score; // disables
long maxVal = m_si->m_cr->m_pqr_maxValPaths;
// bypass scheme and "://"
url = strstr( url, "://" );
if ( ! url ) return score;
url += 3;
// count '/'s to get number of paths
long numPaths = -1; // don't count first path
for ( url = strchr(url, '/') ; url ; url = strchr(url, '/') ) {
numPaths++;
url++;
}
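// e.g. "http://host.com/a/b/c.html" has three '/'s after the scheme,
// so numPaths ends up as 2; a root page like "http://host.com/"
// yields 0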
return rerankHigherDemotesMore( score, numPaths, maxVal, factor,
"pqrpaths", "paths in its url" );
}
// pqrcatid
// demote page if it does not have a catid
rscore_t PostQueryRerank::rerankNoCatId ( rscore_t score,
long numCatIds,
long numIndCatIds ) {
//log( LOG_DEBUG, "AWL:in PQR::rerankNoCatId("
// "score:%ld, numCatIds:%ld, numIndCatIds:%ld)"
// "[P_factor:%3.3f]",
// score, numCatIds, numIndCatIds,
// m_si->m_cr->m_pqr_demFactNoCatId );
float factor = m_si->m_cr->m_pqr_demFactNoCatId;
if ( factor <= 0 ) return score; // disables
if ( numCatIds + numIndCatIds > 0 ) return score;
return rerankAssignPenalty( score, factor,
"pqrcatid", "it has no category id" );
}
// pqrsuper
// demote page if smallest catid has a lot of super topics
rscore_t PostQueryRerank::rerankSmallestCatIdHasSuperTopics ( rscore_t score,
Msg20 *msg20 ) {
//log( LOG_DEBUG, "query:in PQR::rerankSmallestCatIdHasSuperTopics("
// "score:%ld)"
// "[P_factor:%3.3f; P_max:%ld] AWL",
// score,
// m_si->m_cr->m_pqr_demFactCatidHasSupers,
// m_si->m_cr->m_pqr_maxValCatidHasSupers );
float factor = m_si->m_cr->m_pqr_demFactCatidHasSupers;
if ( factor <= 0 ) return score; // disables
long maxVal = m_si->m_cr->m_pqr_maxValCatidHasSupers;
// If page doesn't have a catid, we should demote it as if it has
// max catids, otherwise pages with a catid will be penalized more
if ( msg20->m_r->size_catIds == 0 ) {
return rerankAssignPenalty( score,
factor,
"pqrsuper",
"it has no category id" );
}
// find smallest catid
long minCatid = LONG_MAX;
long numCatids = msg20->m_r->size_catIds / 4;
for ( long i = 0; i < numCatids; i++ ) {
if ( msg20->m_r->ptr_catIds[i] < minCatid ) {
minCatid = msg20->m_r->ptr_catIds[i];
}
}
//log( LOG_DEBUG, "query: minCatid:%ld AWL", minCatid );
// count super topics by walking up catids
long numSupers = -1;
long currCatId = minCatid;
long currParentId = minCatid;
while ( currCatId > 1 ) {
// next cat
currCatId = currParentId;
// get the index for this cat
long currCatIndex = g_categories->getIndexFromId(currCatId);
if ( currCatIndex <= 0 ) break;
// get the parent for this cat
currParentId = g_categories->m_cats[currCatIndex].m_parentid;
numSupers++;
}
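// numSupers is roughly the depth of the smallest catid in dmoz: the
// number of parent hops climbed before reaching the Top category
// (id 1) or a catid we could not look up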
return rerankHigherDemotesMore( score, numSupers, maxVal, factor,
"pqrsuper",
"category ids" );
}
// pqrpgsz
// . demote page based on its size (content length): the bigger, the
// more it should be demoted.
rscore_t PostQueryRerank::rerankPageSize ( rscore_t score,
long docLen ) {
//log( LOG_DEBUG, "query:in PQR::rerankPageSize("
// "score:%ld, docLen:%ld)"
// "[P_factor:%3.3f; P_max:%ld] AWL",
// score, docLen,
// m_si->m_cr->m_pqr_demFactPageSize,
// m_si->m_cr->m_pqr_maxValPageSize );
float factor = m_si->m_cr->m_pqr_demFactPageSize;
if ( factor <= 0 ) return score;
long maxVal = m_si->m_cr->m_pqr_maxValPageSize;
// safety check
if ( docLen <= 0 ) docLen = maxVal;
return rerankHigherDemotesMore( score, docLen, maxVal, factor,
"pqrpgsz", "page size" );
}
/*
// pqrloc
const long MIN_PLACEPOP = 50000;
// . returns true if buf contains a location
// . locBuf is the location name
// . locLen is its length
// . locPop is its population
bool PostQueryRerank::getLocation( char *locBuf, long locBufLen,
long *locLen, long *locPop,
char *buf, long bufLen ) {
//log( LOG_DEBUG, "query:in getLocation(buf:%c%c%c%c, len:%ld, "
// "uc:%d) AWL",
// buf[0], buf[2], buf[4], buf[6], bufLen,
Words words;
if ( ! words.set( buf, bufLen, TITLEREC_CURRENT_VERSION,
false, // computeIds
false // hasHtmlEntities
) )
return false;
AppendingWordsWindow ww;
if ( ! ww.set( &words,
1, // minWindowSize
5, // maxWindowSize
locBufLen,
locBuf
) )
return false;
// find all phrases between length of 1 and 5
for ( ww.processFirstWindow(); !ww.isDone(); ww.processNextWindow() ){
ww.act();
char *phrasePtr = ww.getPhrasePtr();
long phraseLen = ww.getPhraseLen();
long numPhraseWords = ww.getNumWords();
if ( numPhraseWords == 0 ) continue;
//log( LOG_DEBUG, "query: p:%s (%ld) AWL",
// phrasePtr, phraseLen );
// see if buf phrase is a place
long encodeType = csUTF8;//csISOLatin1;
long placePop = getPlacePop( phrasePtr, phraseLen,
encodeType );
if ( placePop > MIN_PLACEPOP ) {
//log( LOG_DEBUG, "query: p:%s (%ld) is "
// "loc spec AWL",
// phrasePtr, phraseLen );
*locLen = phraseLen;
*locPop = placePop;
return true;
}
// check to see if buf phrase's abbreviation is loc spec
//log( LOG_DEBUG, "query: utf8 p:%s (%ld) AWL",
// phrasePtr, phraseLen );
SynonymInfo synInfo;
if ( ! g_thesaurus.getSynonymInfo( phrasePtr,
&synInfo,
phraseLen ) )
continue;
long numSyns = synInfo.m_numSyns;
for ( long j = 0; j < numSyns; j++ ) {
char *syn = synInfo.m_syn[j];
long synLen = gbstrlen(syn);
placePop = getPlacePop( syn, synLen,
csISOLatin1 );
if ( placePop > MIN_PLACEPOP ) {
//log( LOG_DEBUG, "query: s:%s (%ld) is "
// "loc spec AWL",
// syn, synLen );
*locLen = phraseLen;
*locPop = placePop;
return true;
}
}
}
*locLen = 0;
*locPop = 0;
return false;
}
// pqrloc
bool PostQueryRerank::preRerankNonLocationSpecificQueries ( ) {
//log( LOG_DEBUG, "query:in PQR::preRerankNonLocSpecQueries() AWL" );
if ( m_si->m_pqr_demFactLocTitle <= 0 &&
m_si->m_pqr_demFactLocSummary <= 0 &&
m_si->m_pqr_demFactLocDmoz <= 0 )
return true;
//log( LOG_DEBUG, "query: q:%s (%ld) AWL",
// m_si->m_q->m_orig,
// m_si->m_q->m_origLen );
// See if query is location specific by building a buffer of
// query terms without punct then checking all phrases of that
// buffer
long numQWords = m_si->m_q->m_numWords;
char locBuf[1024];
long locLen = 0;
long locPop = 0;
char buf[MAX_QUERY_LEN];
char *p = buf;
Query *q = m_si->m_q;
for ( long i = 0; i < numQWords; i++ ) {
QueryWord *qw = &q->m_qwords[i];
//log( LOG_DEBUG, "query: qw:%c%c%c%c (%ld) "
// "inQuotes:%d; inQuoted:%d; quoteStrt:%ld "
// "op:%d; opcode:%d; isPunct:%d level:%d; "
// "wsign:%d; psign:%d id:%lld "
// "ignore:%d AWL",
// qw->m_word[0], qw->m_word[2],
// qw->m_word[4], qw->m_word[6],
// qw->m_wordLen,
// qw->m_inQuotes, qw->m_inQuotedPhrase, qw->m_quoteStart,
// qw->m_queryOp, qw->m_opcode, qw->m_isPunct, qw->m_level,
// qw->m_wordSign, qw->m_phraseSign, qw->m_wordId,
// qw->m_ignoreWord );
// reset buf if word is punct (except all space) or an opcode
bool isPunct = qw->m_isPunct;
bool isAllSpace = false;
if ( isPunct ) {
char *s = qw->m_word;
for ( ; (int)(s-qw->m_word) < qw->m_wordLen; s++ ) {
if ( ! is_space(*s) ) break;
}
isAllSpace = ( s-qw->m_word == qw->m_wordLen );
}
if ( (isPunct && ! isAllSpace) || qw->m_opcode != 0 ) {
// before we reset, see if buffer contains a location
if ( getLocation( locBuf, 1024,
&locLen, &locPop,
buf, p-buf ) ) {
long encodeType = csUTF8;//csISOLatin1;
m_querysLoc = hash64d( locBuf, locLen);
break;
}
p = buf;
//log( LOG_DEBUG, "query: encountered symbol:%d|%d AWL",
// qw->m_isPunct, qw->m_opcode );
continue;
}
// but if word is all space, dont append
if ( isAllSpace ) continue;
// skip if word is subtracted out
if ( qw->m_wordSign == '-' ) continue;
// skip if word or phrase is under NOT ||| AWL not working right now
if ( qw->m_queryWordTerm &&
qw->m_queryWordTerm->m_underNOT ) continue;
if ( qw->m_queryPhraseTerm &&
qw->m_queryPhraseTerm->m_underNOT ) continue;
// else, append word + space to buf
memcpy( p, qw->m_word, qw->m_wordLen );
p += qw->m_wordLen;
*p++ = ' ';
}
// now see if there's a location in buf
if ( m_querysLoc == 0 &&
getLocation( locBuf, 1024, &locLen, &locPop,
buf, p-buf ) ) {
m_querysLoc = hash64d( locBuf, locLen );
}
//log( LOG_DEBUG, "query: q loc:%lld AWL",
// m_querysLoc );
// check the gigabits for locations
//log( LOG_DEBUG, "query: places lookup gigabits numTopics:%ld AWL",
// m_msg40->getNumTopics() );
m_ignoreLocs.set( 28 );
// if searching the us, these should not be demoted, so
// put them into the gigabit table
if (m_si->m_country == 226) {
m_ignoreLocs.addKey(hash64d("u.s.",4),true);
m_ignoreLocs.addKey(hash64d("us",2),true);
m_ignoreLocs.addKey(hash64d("united states",14),true);
m_ignoreLocs.addKey(hash64d("u.s.a.",6),true);
m_ignoreLocs.addKey(hash64d("usa",3),true);
m_ignoreLocs.addKey(hash64d("america",7),true);
m_ignoreLocs.addKey(hash64d("american",8),true);
m_ignoreLocs.addKey(hash64d("americans",9),true);
m_ignoreLocs.addKey(hash64d("canada",6),true);
m_ignoreLocs.addKey(hash64d("kanada",6),true);
m_ignoreLocs.addKey(hash64d("canucks",7),true);
m_ignoreLocs.addKey(hash64d("canadians",9),true);
m_ignoreLocs.addKey(hash64d("canadian",8),true);
m_ignoreLocs.addKey(hash64d("north america",13),true);
m_ignoreLocs.addKey(hash64d("uk",2),true);
m_ignoreLocs.addKey(hash64d("united kingdom",14),true);
m_ignoreLocs.addKey(hash64d("british",7),true);
m_ignoreLocs.addKey(hash64d("britain",7),true);
m_ignoreLocs.addKey(hash64d("britons",7),true);
m_ignoreLocs.addKey(hash64d("great britain",13),true);
}
// now add the locations from the gigabits
long numTopics = m_msg40->getNumTopics();
for ( long i = 0; !m_si->m_pqr_demInTopics && i < numTopics; i++ ) {
char *topicStr = m_msg40->getTopicPtr(i);
long topicLen = m_msg40->getTopicLen(i);
Words words;
if ( ! words.set( topicStr, topicLen, TITLEREC_CURRENT_VERSION,
false, // computeIds
false // hasHtmlEntities
) )
continue;
AppendingWordsWindow ww;
if ( ! ww.set( &words,
1, // minWindowSize
5, // maxWindowSize
AWW_INIT_BUF_SIZE,
NULL
) )
continue;
// find all phrases between length of 1 and 5
for ( ww.processFirstWindow();
! ww.isDone();
ww.processNextWindow() ) {
ww.act();
char *phrasePtr = ww.getPhrasePtr();
long phraseLen = ww.getPhraseLen();
long numPhraseWords = ww.getNumWords();
if ( numPhraseWords == 0 ) continue;
// see if topic phrase is a place
long placePop = getPlacePop( phrasePtr, phraseLen,
encodeType );
if ( placePop > MIN_PLACEPOP ) {
// It's a place, mark it so if a page has
// this place name in its title we won't
// rerank it
uint64_t h = hash64d( phrasePtr, phraseLen);
m_ignoreLocs.addKey( h, true );
//log( LOG_DEBUG, "query: pre gigabit has "
// "location '%s' (%ld) [h:%lld] AWL",
// phrasePtr, phraseLen, h );
continue;
}
// Check if a gigabit's abbreviation is location
// specific
SynonymInfo synInfo;
if ( ! g_thesaurus.getSynonymInfo( phrasePtr,
&synInfo,
phraseLen ) )
continue;
long numSyns = synInfo.m_numSyns;
for ( long j = 0; j < numSyns; j++ ) {
char *syn = synInfo.m_syn[j];
long synLen = gbstrlen(syn);
placePop = getPlacePop( syn, synLen,
csISOLatin1 );
if ( placePop > MIN_PLACEPOP ) {
// It's a place, so mark syn
uint64_t h = hash64d( syn, synLen);
m_ignoreLocs.addKey( h, true );
//log( LOG_DEBUG, "query: pre gigabit"
// " has location synonym '%s'"
// " h:%lld AWL",
// syn, h );
continue;
}
}
}
}
if (m_querysLoc != 0)
log(LOG_DEBUG, "pqr: query contains a location, "
"will not demote location specific results");
else
log(LOG_DEBUG, "pqr: query DOES NOT contain a location, "
"will demote location specific results");
return true;
}
// pqrloc
// . if query is not location specific, and a page has a geographic location
// in its title then demote that page UNLESS the geographic location is
// contained in the list of gigabits for the search query. like "Shoes (UK)"
// or "retail stores in New York" when you are UK and New York are not in
// your query. We will need a file of locations. BUT if the location is
// contained in the gigabits, do NOT demote such pages, query might have
// something like "the big apple" in it... Note: if query ops out of a
// location, it should not be considered location specific (like "expo
// -montreal"). demote by popularity weight of the place name as returned
// from getPlacesPeoplePop().
// . demote results containing geographic locations
// unless THAT location is in gigabits or in query. fixes
// 'car insurance'? demote a little bit if in summary...
// or a little bit if in a single dmoz category and it is a
// dmoz regional category. do not demote 'united states' 'us'
// 'america' or 'usa' if the search default is the us. do
// not demote the dmoz north america:US region if searching in us.
// but if 'albuquerque' in query, do not demote if 'new mexico'
// in search results.
rscore_t PostQueryRerank::rerankNonLocationSpecificQueries ( rscore_t score,
Msg20 *msg20 ) {
float titleFactor = m_si->m_pqr_demFactLocTitle;
float summFactor = m_si->m_pqr_demFactLocSummary;
float dmozFactor = m_si->m_pqr_demFactLocDmoz;
if ( titleFactor <= 0 &&
summFactor <= 0 &&
dmozFactor <= 0 )
return score;
long maxVal = m_si->m_cr->m_pqr_maxValLoc;
// if we found a location in the query, don't rerank for locs
if (m_querysLoc != 0) return score;
//log( LOG_DEBUG, "query:in PQR::rerankNonLocSpecQueries("
// "score:%ld)"
// "[P_factorTitle:%3.3f; P_factorSummary:%3.3f; P_factorDmoz:%3.3f; "
// " P_max:%ld; "
// "m_querysLoc:%lld; #m_ignoreLocs:%ld; #summaryLocs:%ld] AWL",
// score,
// titleFactor, summFactor, dmozFactor,
// maxVal,
// m_querysLoc,
// m_ignoreLocs.getNumSlotsUsed(),
// msg20->getNumSummaryLocs() );
// check if categories are regional and contain a location
long numCatids = msg20->m_r->size_catIds / 4;
long *catids = msg20->m_r->ptr_catIds;
long catLocMaxPop = 0;
uint8_t searchingUS = (m_si->m_country == 226);
//log(LOG_DEBUG, "pqr: checking %ld categories for locs AWL",
// numCatids);
for ( unsigned char i = 0; dmozFactor > 0 && i < numCatids; i++ ) {
SafeBuf sb;
long catid = catids[i];
g_categories->printPathFromId(&sb, catid, true);
// copy first part of category so we can work with it
const long MAX_PQRCAT = 512;
char cat[MAX_PQRCAT];
long len = sb.length();
if (len > MAX_PQRCAT) len = MAX_PQRCAT;
strncpy(cat, sb.getBufStart(), len);
cat[len] = '\0';
//log(LOG_DEBUG, "pqr: catid:%ld category:'%s' AWL",
// catid, cat);
// see if we have a regional category
char *p = cat;
char region[64];
char *q = region;
while (*p && q-region < 64 && *p != '/') *q++ = *p++;
*q = '\0';
bool catIsRegional = (0 == strncmp(region, "Regional", 8));
//log(LOG_DEBUG, "pqr: cat has region:%ld AWL",
// (long)catIsRegional);
// we only care about regional categories
if (!catIsRegional) continue;
long placePop = 0;
// scan category for region
while (*p) {
p++;
q = region;
while (*p && q-region < 64 && *p != '/') {
if (*p == '_') *q++ = ' ';
else *q++ = *p;
p++;
}
*q = '\0';
bool regionIsUS = (searchingUS &&
(0 == strcasecmp(region, "us") ||
0 == strcasecmp(region, "united states") ||
0 == strcasecmp(region, "usa") ||
0 == strcasecmp(region, "america")));
//log(LOG_DEBUG, "pqr: region:%s (isUS:%ld) AWL",
// region, (long)regionIsUS);
// if region is us, skip category demotion
if (!regionIsUS) {
// see if region is a place
placePop = getPlacePop(region, q-region,
csISOLatin1);
if (placePop > MIN_PLACEPOP) break;
}
}
// if we didn't find a place, go to next cat
if (placePop <= MIN_PLACEPOP) continue;
uint64_t h = hash64d( region, q-region );
if (h == 0) continue;
// is it the location of the query?
if (h == m_querysLoc) {
//log(LOG_DEBUG, "pqr: cat "
// "has query's loc "
// "[pop:%ld; h:%lld] AWL",
// placePop, h);
return score;
}
// is it in the gigabits?
if (m_ignoreLocs.getSlot( h ) != -1) {
//log(LOG_DEBUG, "pqr: cat has "
// "gigabit's loc [pop:%ld; h:%llu] AWL",
// placePop, h);
return score;
}
// use only the max pop for all places in category
if (placePop > catLocMaxPop) {
//log(LOG_DEBUG, "pqr: cat has a non-query, "
// "non-gigabit loc:'%s' %llu pop:%ld AWL",
// region, h, placePop);
catLocMaxPop = placePop;
continue;
}
}
//log(LOG_DEBUG, "pqr: categories' max population:%ld AWL",
// catLocMaxPop);
if (dmozFactor > 0 && catLocMaxPop > MIN_PLACEPOP)
score = rerankHigherDemotesMore(score,
catLocMaxPop, maxVal,
dmozFactor,
"pqrlocd",
"population of a place in a "
"category and the place was "
"not in the query or gigabits");
// check if summary contains a location
// check if summary's location is in gigabits
long numSummaryLocs = msg20->m_r->size_summLocs/8;
uint64_t *summaryLocs = msg20->m_r->ptr_summLocs;
long *summaryLocsPops = msg20->m_r->ptr_summLocsPop;
long summaryLocMaxPop = 0;
for (long i = 0; summFactor > 0 && i < numSummaryLocs; i++) {
uint64_t h = summaryLocs[i];
long placePop = summaryLocsPops[i];
if (h == 0) continue;
if (placePop <= MIN_PLACEPOP) continue;
// is it the location of the query?
if ( h == m_querysLoc ) {
//log( LOG_DEBUG, "pqr: summary "
// "has query's loc "
// "[pop:%ld; h:%lld] AWL",
// placePop, h );
return score;
}
// is it in the gigabits?
if (m_ignoreLocs.getSlot( h ) != -1 ) {
//log( LOG_DEBUG, "pqr: summary has "
// "gigabit's loc [pop:%ld; h:%llu] AWL",
// placePop, h );
return score;
}
// use only the max pop for all places in title
if ( placePop > summaryLocMaxPop ) {
//log( LOG_DEBUG, "pqr: summary has a non-query, "
// "non-gigabit loc:%llu pop:%ld AWL",
// h, placePop );
summaryLocMaxPop = placePop;
continue;
}
}
//log( LOG_DEBUG, "pqr: summary's max population:%ld AWL",
// summaryLocMaxPop );
if (summFactor > 0 && summaryLocMaxPop > MIN_PLACEPOP)
score = rerankHigherDemotesMore(score,
summaryLocMaxPop, maxVal,
summFactor,
"pqrlocs",
"population of a place in its "
"summary and the place was "
"not in the query or gigabits");
// check if title contains a location
if (titleFactor <= 0) return score;
char *pageTitle = msg20->getTitle();
long pageTitleLen = msg20->getTitleLen();
Words words;
if ( ! words.set( pageTitle, pageTitleLen, TITLEREC_CURRENT_VERSION,
false, // computeIds
false // hasHtmlEntities
) )
return score;
AppendingWordsWindow ww;
if ( ! ww.set( &words,
1, // minWindowSize
5, // maxWindowSize
AWW_INIT_BUF_SIZE,
NULL
) )
return score;
// find all phrases between length of 1 and 5
long titleLocMaxPop = 0;
for ( ww.processFirstWindow(); ! ww.isDone(); ww.processNextWindow()) {
ww.act();
char *phrasePtr = ww.getPhrasePtr();
long phraseLen = ww.getPhraseLen();
long numPhraseWords = ww.getNumWords();
if ( numPhraseWords == 0 ) continue;
// Get the place's population
// If it's a place, check gigabits for the place name
long encodeType = csUTF8; //ISOLatin1;
long placePop = getPlacePop( phrasePtr, phraseLen,
encodeType );
if ( placePop > MIN_PLACEPOP ) {
// Check if place is same as query
// Check if gigabits has this location or
// an abbreviation of the location, if so don't
// rerank this page
uint64_t h = hash64d(phrasePtr, phraseLen);
if ( h == 0 ) continue;
// is it the query's location?
if ( h == m_querysLoc ) {
//log( LOG_DEBUG, "query: title has "
// "query's loc [pop:%ld; h:%llu] AWL",
// placePop, h );
return score;
}
// is it in the gigabits?
if ( m_ignoreLocs.getSlot( h ) != -1 ) {
//log( LOG_DEBUG, "pqr: title has "
// "gigabit's loc [pop:%ld; h:%llu] AWL",
// placePop, h );
return score;
}
}
// use only the max pop for all places in title
if ( placePop > titleLocMaxPop ) {
//log( LOG_DEBUG, "pqr: title has a non-query, "
// "non-gigabit loc:'%s' (%ld) pop:%ld AWL",
// phrasePtr, phraseLen, placePop );
titleLocMaxPop = placePop;
continue;
}
// If we haven't found a place name yet, check for
// abbreviations of a place name
//log( LOG_DEBUG, "pqr: phrase:'%s' (%ld) words:%ld "
// "pop:%ld AWL",
// phrasePtr, phraseLen, numPhraseWords,
// placePop );
SynonymInfo synInfo;
if ( ! g_thesaurus.getSynonymInfo( phrasePtr, &synInfo,
phraseLen ) ) {
continue;
}
long numSyns = synInfo.m_numSyns;
for ( long j = 0; j < numSyns; j++ ) {
char *syn = synInfo.m_syn[j];
long synLen = gbstrlen(synInfo.m_syn[j]);
placePop = getPlacePop( syn, synLen,
csISOLatin1 );
if ( placePop > MIN_PLACEPOP ) {
// Check if gigabits has an abbreviation
// of the location, if so don't rerank
// this page
uint64_t h = hash64d(syn, synLen);
if ( h == 0 ) continue;
// is syn the query's loc?
if ( h == m_querysLoc ) {
//log( LOG_DEBUG, "pqr: title "
// "has query's loc syn "
// "[pop:%ld; h:%lld] AWL",
// placePop, h );
return score;
}
// is syn in gigabits?
if ( m_ignoreLocs.getSlot( h ) != -1 ) {
//log(LOG_DEBUG, "pqr: syn title "
// " has gigabits's loc '%s' "
// "[pop:%ld; h:%lld] AWL",
// syn,
// placePop, h );
return score;
}
// only use max pop in calculations
if ( placePop > titleLocMaxPop ) {
//log( LOG_DEBUG, "pqr: title "
// "has a non-query, "
// "non-gigabit loc syn AWL" );
titleLocMaxPop = placePop;
}
}
}
}
//log( LOG_DEBUG, "pqr: title's max population:%ld AWL",
// titleLocMaxPop );
return rerankHigherDemotesMore( score, titleLocMaxPop, maxVal,
titleFactor,
"pqrloct",
"population of a place in its title "
"and the place was not in the query "
"or gigabits" );
}
*/
// pqrhtml, pqrxml
// demote if content type is not html (or is xml)
/*
rscore_t PostQueryRerank::rerankContentType ( rscore_t score,
char contentType ) {
float htmlFactor = m_si->m_cr->m_pqr_demFactNonHtml;
float xmlFactor = m_si->m_cr->m_pqr_demFactXml;
//log( LOG_DEBUG, "query:in PQR::rerankContentType("
// "score:%ld, content-type:%ld)"
// "[P_factorHtml:%3.3f; P_factorXml:%3.3f] AWL",
// score, (long)contentType,
// htmlFactor, xmlFactor );
// if completely disabled or page is html, don't do anything
if ( xmlFactor <= 0 && htmlFactor <= 0 || contentType == CT_HTML )
return score;
// if demoting for xml, then do that
if ( xmlFactor > 0 && contentType == CT_XML )
return rerankAssignPenalty( score, xmlFactor,
"pqrxml", "it is xml" );
// we are demoting for non-html and the page is not html
return rerankAssignPenalty( score, htmlFactor,
"pqrhtml", "it is not html" );
}
*/
// pqrfsd
// setup
bool PostQueryRerank::preRerankOtherPagesFromSameHost( Url *pageUrl ) {
// don't do anything if this method is disabled
if ( m_si->m_cr->m_pqr_demFactOthFromHost <= 0 ) return true;
// don't add if no url
if ( pageUrl->getUrlLen() == 0 ) return true;
//log( LOG_DEBUG, "query:in PQR::preRerankOtherPagesFromSameHost() AWL");
//log( LOG_DEBUG, "query: u:'%s' host:'%s' (%ld); "
// "domain:'%s' (%ld) AWL",
// pageUrl->m_url,
// pageUrl->getHost(), pageUrl->getHostLen(),
// pageUrl->getDomain(), pageUrl->getDomainLen() );
char *host = pageUrl->getDomain();
long hostLen = pageUrl->getDomainLen();
uint64_t key = hash64Lower_a( host, hostLen );
if ( key == 0 ) key = 1;
long slot = m_hostCntTable.getSlot( key );
if ( slot == -1 ) {
m_hostCntTable.addKey( key, 0 ); // first page doesn't cnt
}
else {
long *cnt = m_hostCntTable.getValuePointerFromSlot( slot );
(*cnt)++;
}
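// m_hostCntTable maps the hashed domain (getDomain(), despite the
// "host" variable name) to the number of OTHER results sharing that
// domain; the first result from a domain stores a 0 so it does not
// count itself. rerankOtherPagesFromSameHost() demotes low counts.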
return true;
}
// pqrfsd
// . if page does not have any other pages from its same hostname in the
// search results (clustered or not) then demote it. demote based on
// how many pages occur in the results from the same hostname. (tends
// to promote pages from hostnames that occur a lot in the unclustered
// results, they tend to be authorities) If it has pages from the same
// hostname, they must have the query terms in different contexts, so
// we must get the summaries for 5 of the results, and just cluster the rest.
rscore_t PostQueryRerank::rerankOtherPagesFromSameHost ( rscore_t score,
Url *pageUrl ) {
//log( LOG_DEBUG, "query:in PQR::rerankOtherPagesFromSameHost("
// "score:%ld, url:'%s', urlLen:%ld)"
// "[P_factor:%3.3f; P_max:%ld] AWL",
// score, pageUrl->getUrl(), pageUrl->getUrlLen(),
// m_si->m_cr->m_pqr_demFactOthFromHost,
// m_si->m_cr->m_pqr_maxValOthFromHost );
if ( pageUrl->getUrlLen() == 0 ) return score;
float factor = m_si->m_cr->m_pqr_demFactOthFromHost;
if ( factor <= 0 ) return score; // disables
long maxVal = m_si->m_cr->m_pqr_maxValOthFromHost;
if ( maxVal < 0 ) maxVal = m_numToSort-1; // all but this one
// . lookup host for this page in hash table to get number of other
// pages from the same host
char *host = pageUrl->getDomain();
long hostLen = pageUrl->getDomainLen();
uint64_t key = hash64Lower_a( host, hostLen );
long slot = m_hostCntTable.getSlot( key );
long numFromSameHost = m_hostCntTable.getValueFromSlot( slot );
//log( LOG_DEBUG, "query: numFromSameHost:%ld AWL", numFromSameHost );
return rerankLowerDemotesMore( score,
numFromSameHost, maxVal,
factor,
"pqrfsd",
"other pages from the same host" );
}
// pqrctid
// . if page is from a topic in dmoz that is in common with a lot of other
// results, then do not demote it as much as if it is not. ("birds of
// a feather") Reduce demotion penalty as you demote each result in
// order to avoid "clumping".
// setup
bool PostQueryRerank::preRerankCommonTopicsInDmoz( Msg20Reply *mr ) {
if ( m_si->m_cr->m_pqr_demFactComTopicInDmoz <= 0 ) return true;
//SANITYCHECK( msg20 );
if ( ! mr ) { char *xx=NULL;*xx=0; }
//log( LOG_DEBUG, "query:in PQR::preRerankCommonTopicsInDmoz() "
// "AWL" );
//log(LOG_DEBUG, "query: qdmoz pre cnt:%d AWL",
// (int)msg20->m_numCatids);
long numCatids = mr->size_catIds/4;//msg20->getNumCatids();
for ( unsigned char i = 0; i < numCatids; i++ ) {
long key = mr->ptr_catIds[i];//msg20->getDmozCatids()[i];
if ( key == 0 ) key = 1;
long slot = m_dmozTable.getSlot( key );
//log( LOG_DEBUG, "query: qdmoz pre %ld/%ld; "
// "catId:%ld; slot:%ld AWL",
// (long)i+1, (long)msg20->m_numCatids,
// key, slot );
if ( slot == -1 ) {
// first occurrence
// cnt is 0, no other common topics
// demotion factor is the parm
ComTopInDmozRec rec;
rec.cnt = 0;
rec.demFact =
m_si->m_cr->m_pqr_demFactComTopicInDmoz;
m_dmozTable.addKey( key, rec );
//log(LOG_DEBUG, "query: qdmoz pre occurance 1 AWL");
}
else {
			// nth occurrence
ComTopInDmozRec *rec =
m_dmozTable.getValuePointerFromSlot( slot );
rec->cnt++;
//log( LOG_DEBUG, "query: qdmoz pre key:%ld "
// "occurance %ld AWL",
// key, rec->cnt );
}
}
return true;
}
// pqrctid
// . if a page is from a dmoz topic that it shares with a lot of other
//   results, do not demote it as much as one that is not ("birds of a
//   feather"). the demotion penalty is reduced as each result is demoted,
//   in order to avoid "clumping".
rscore_t PostQueryRerank::rerankCommonTopicsInDmoz ( rscore_t score,
Msg20 *msg20 ) {
//log( LOG_DEBUG, "query:in PQR::rerankCommonTopicsInDmoz("
// "score:%ld)"
// "[P_max:%ld P_decFact:%3.3f] AWL",
// score,
// m_si->m_cr->m_pqr_maxValComTopicInDmoz,
// m_si->m_cr->m_pqr_decFactComTopicInDmoz );
//log( LOG_DEBUG, "query: qdmoz cnt:%ld AWL",
// (long)msg20->m_numCatids );
if ( m_si->m_cr->m_pqr_demFactComTopicInDmoz <= 0 )
return score;
long maxVal = m_si->m_cr->m_pqr_maxValComTopicInDmoz;
if ( maxVal < 0 ) maxVal = m_numToSort;
// . see if page is from a topic in dmoz that is in common with a
// lot of other results
	//   . if there is no catid, the result will not be demoted
float chosenDemFact = 0.0;
long numComTopicsInDmoz = 0;
long maxComTopicsInDmoz = 0;
long numCatids = msg20->m_r->size_catIds/4;//getNumCatids();
	for ( long i = 0; i < numCatids; i++ ) {
		long key = msg20->m_r->ptr_catIds[i];//getDmozCatids()[i];
		if ( key == 0 ) key = 1; // match the remap in the pre pass
		long slot = m_dmozTable.getSlot( key );
		if ( slot < 0 ) continue; // defensive: should always be found
		ComTopInDmozRec *rec =
			m_dmozTable.getValuePointerFromSlot( slot );
//log( LOG_DEBUG, "query: slot:%ld key:%ld cnt:%ld; "
// "demFact:%3.3f AWL",
// slot, key,
// rec->cnt, rec->demFact );
// add # of other pages with same topic as this
numComTopicsInDmoz += rec->cnt;
// . find the slot with the max common topics in dmoz so
// it can be decayed
if ( rec->cnt > maxComTopicsInDmoz ) {
chosenDemFact = rec->demFact;
maxComTopicsInDmoz = rec->cnt;
}
}
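	// chosenDemFact is the (possibly already decayed) demotion factor of
	// the catid this page shares with the most other results; if the page
	// has no catids, or shares none, it stays 0.0 and effectively no
	// demotion is applied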
score = rerankHigherDemotesMore( score,
numComTopicsInDmoz, maxVal,
chosenDemFact,
"pqrctid",
"common topics in dmoz "
"as other results" );
// now decay the factors
float decFactor =
m_si->m_cr->m_pqr_decFactComTopicInDmoz;
if ( decFactor < 0 ) return score;
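	// shrink the demotion factor of each of this result's topics by the
	// fraction decFactor so that subsequent results sharing those topics
	// are demoted less; this is the anti-"clumping" decay described above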
	for ( long i = 0; i < numCatids; i++ ) {
		long key = msg20->m_r->ptr_catIds[i];
		if ( key == 0 ) key = 1;
		long slot = m_dmozTable.getSlot( key );
		if ( slot < 0 ) continue;
		ComTopInDmozRec *rec =
			m_dmozTable.getValuePointerFromSlot( slot );
rec->demFact *= (1.0 - decFactor);
//log( LOG_DEBUG, "query: decay slot:%ld key:%ld "
// "cnt:%ld; decFact:%3.3f; new demFact:%3.3f AWL",
// slot, key,
// rec->cnt, decFactor, rec->demFact );
}
return score;
}
// pqrdcndcqt
// . if the dmoz category names contain a query term (or one of its
//   synonyms), "boost" the result based on the query term weight (query
//   phrase term weights count, too); in practice this demotes the other
//   results that do not have them. gigabits are handled separately by
//   pqrdcndcgb below.
/*
rscore_t PostQueryRerank::rerankDmozCategoryNamesDontHaveQT ( rscore_t score,
Msg20 *msg20 ) {
//log( LOG_DEBUG, "query:in PQR::rerankDmozCategoryNamesDontHaveQT("
// "score:%ld)"
// "[P_factor:%3.3f; P_max:%ld] AWL",
// score,
// m_si->m_cr->m_pqr_demFactDmozCatNmNoQT,
// m_si->m_cr->m_pqr_maxValDmozCatNmNoQT );
float factor = m_si->m_cr->m_pqr_demFactDmozCatNmNoQT;
if ( factor <= 0 ) return score; // disables
long maxVal = m_si->m_cr->m_pqr_maxValDmozCatNmNoQT;
long numQTsInDmoz = 0;
char *pd = msg20->m_r->ptr_dmozTitles;
long numCatids = msg20->m_r->size_catIds/4;
long numQTs = m_si->m_q->m_numTerms;
HashTableT<uint64_t, long> matchedIds;
matchedIds.set( numQTs*2 );
for ( long j = 0; j < numCatids; j++ ) {
char *currTitle = pd;
long currTitleLen = gbstrlen(pd);
if ( currTitleLen == 0 ) continue;
//log( LOG_DEBUG, "query: currTitle:%s (%ld) AWL",
// currTitle, currTitleLen );
Words w;
Bits b;
Phrases p;
long long *wids;
long nw;
long long *pids;
if ( ! w.set( currTitle ,
currTitleLen ,
TITLEREC_CURRENT_VERSION,
true , // computeIds
false ) )
goto next;
if ( ! b.set( &w, TITLEREC_CURRENT_VERSION ,0) )
goto next;
if ( ! p.set( &w ,
&b ,
true , // useStopWords
false , // useStems
TITLEREC_CURRENT_VERSION,
0 ) ) // niceness
goto next;
wids = w.getWordIds ();
nw = w.getNumWords ();
pids = p.getPhraseIds2 ();
// go through all words in cat name
for ( long i = 0; i < nw; i++ ) {
// go through all query terms
for ( long k = 0; k < numQTs; k++ ) {
QueryTerm *qt =
&m_si->m_q->m_qterms[k];
long long rawTermId = qt->m_rawTermId;
// ignore 0 termIds
if ( rawTermId == 0 ) continue;
// see if we already matched this id
long n = matchedIds.getSlot( rawTermId );
if ( n != -1 ) continue;
// compare this query term to cat word
if ( rawTermId == wids[i] ) {
matchedIds.addKey( rawTermId, 0 );
numQTsInDmoz++;
//log( LOG_DEBUG, "query: qt-dmozw "
// "match '%s' (%ld) AWL",
// qt->m_term,
// qt->m_termLen );
continue;
}
// compare this query term to cat phrase
if ( qt->m_isPhrase && rawTermId == pids[i] ) {
matchedIds.addKey( rawTermId, 0 );
numQTsInDmoz++;
//log( LOG_DEBUG, "query: qt-dmozp "
// "match '%s' (%ld) AWL",
// qt->m_term,
// qt->m_termLen );
continue;
}
// if we haven't matched yet, check syns
SynonymInfo synInfo;
if ( ! g_thesaurus.getSynonymInfo( rawTermId,
&synInfo ))
continue;
long numSyns = synInfo.m_numSyns;
				// use a distinct index here: 'j' is the
				// title index and 'k' the query-term index
				for ( long s = 0; s < numSyns; s++ ) {
					//log( LOG_DEBUG, "query: syn:'%s' AWL",
					//     synInfo.m_syn[s]);
					uint64_t h ;
					h =hash64Lower_utf8(synInfo.m_syn[s],
						gbstrlen(synInfo.m_syn[s]));
// see if we already matched this id
long n = matchedIds.getSlot( h );
if ( n != -1 ) continue;
// Compare this query term syn to
// cat word
if ( (long long)h == wids[i] ) {
matchedIds.addKey( h, 0 );
numQTsInDmoz++;
//log( LOG_DEBUG, "query: "
// "synmatch:'%s' "
// "in dmozw:'%s' AWL",
// synInfo.m_syn[j],
// currTitle );
continue;
}
// Compare this query term syn to
// cat phrase
if ( qt->m_isPhrase &&
(long long)h == pids[i] ) {
matchedIds.addKey( h, 0 );
numQTsInDmoz++;
//log( LOG_DEBUG, "query: "
// "synmatch:'%s' "
// "in dmozp:'%s' AWL",
// synInfo.m_syn[j],
// currTitle );
continue;
}
}
}
}
next:
pd += currTitleLen;
}
//log( LOG_DEBUG, "query: qts or syns in dmoz cat name:%ld AWL",
// numQTsInDmoz );
return rerankLowerDemotesMore( score,
numQTsInDmoz, maxVal,
factor,
"pqrdcndcqt",
"query terms in its dmoz category names");
}
*/
// pqrdcndcgb
// . if the dmoz category names contain one of the gigabits (related topics)
//   computed for this search, "boost" the result based on how many gigabits
//   match; in practice this demotes the other results whose category names
//   do not contain them.
/*
rscore_t PostQueryRerank::rerankDmozCategoryNamesDontHaveGigabits ( rscore_t score,
Msg20 *msg20 ) {
//log( LOG_DEBUG, "query:in PQR::rerankDmozCategoryNamesDontHaveGigabits("
// "score:%ld)"
// "[P_factor:%3.3f; P_max:%ld] AWL",
// score,
// m_si->m_cr->m_pqr_demFactDmozCatNmNoGigabits,
// m_si->m_cr->m_pqr_maxValDmozCatNmNoGigabits );
float factor = m_si->m_cr->m_pqr_demFactDmozCatNmNoGigabits;
if ( factor <= 0 ) return score; // disables
long maxVal = m_si->m_cr->m_pqr_maxValDmozCatNmNoGigabits;
if ( maxVal < 0 ) maxVal = m_si->m_docsToScanForTopics;
// find number of gigabits in dmoz category name
long numGigabitsInDmoz = 0;
// go through gigabits each possible phrase in gigabits
//log( LOG_DEBUG, "query: numGigabits:%ld AWL",
// m_msg40->getNumTopics() );
long numTopics = m_msg40->getNumTopics();
HashTableT<long long, long> matchedIds;
matchedIds.set( numTopics*4 );
for ( long i = 0; i < numTopics; i++ ) {
Words words;
if ( ! words.set( m_msg40->getTopicPtr(i),
m_msg40->getTopicLen(i),
TITLEREC_CURRENT_VERSION,
false, // computeIds
false // hasHtmlEntities
) )
continue;
AppendingWordsWindow ww;
if ( ! ww.set( &words,
1, // minWindowSize
4, // maxWindowSize
AWW_INIT_BUF_SIZE,
NULL
) )
continue;
// find all phrases between length of 1 and 4
for ( ww.processFirstWindow();
! ww.isDone();
ww.processNextWindow() ) {
ww.act();
char *phrasePtr = ww.getPhrasePtr();
long phraseLen = ww.getPhraseLen();
long numPhraseWords = ww.getNumWords();
if ( numPhraseWords == 0 ) continue;
//log( LOG_DEBUG, "query: gb phrase:%s (%ld) AWL",
// phrasePtr, phraseLen );
// see if we already matched this phrase
uint64_t h = hash64Lower_utf8( phrasePtr, phraseLen );
if ( h == 0 ) h = 1;
if ( matchedIds.getSlot( h ) != -1 )
continue;
// ignore phrases that are just common words
if ( isCommonWord( h ) )
continue;
matchedIds.addKey( h, 0 );
// go through dmoz category names
char *p = msg20->m_r->ptr_dmozTitles;
long numCatids = msg20->m_r->size_catIds/4;
for ( long j = 0; j < numCatids; j++ ) {
char *currTitle = p;
long currTitleLen = gbstrlen(p);
if ( currTitleLen == 0 ) continue;
//log( LOG_DEBUG, "query: dmoz:%s (%ld) AWL",
// currTitle, currTitleLen );
// check if gigabit is in dmoz category name
if (strncasestr(currTitle, phrasePtr,
currTitleLen, phraseLen)){
//log( LOG_DEBUG, "query: gb is in "
// "dmoz AWL");
numGigabitsInDmoz++;
}
p += currTitleLen;
}
}
}
//log( LOG_DEBUG, "query: numGigabitsInDmoz:%ld AWL",
// numGigabitsInDmoz );
return rerankLowerDemotesMore( score,
numGigabitsInDmoz, maxVal,
factor,
"pqrdcndcgb",
"gigabits in its dmoz category names" );
}
*/
// pqrdate
// . demote pages by datedb date
rscore_t PostQueryRerank::rerankDatedbDate( rscore_t score,
time_t datedbDate ) {
float factor = m_si->m_cr->m_pqr_demFactDatedbDate;
if ( factor <= 0 ) return score;
long minVal = m_si->m_cr->m_pqr_minValDatedbDate;
if ( minVal <= 0 ) minVal = 0;
minVal *= 1000;
long maxVal = m_si->m_cr->m_pqr_maxValDatedbDate;
if ( maxVal <= 0 ) maxVal = 0;
maxVal = m_now - maxVal*1000;
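	// the minVal/maxVal parms are scaled by 1000 here, so they appear to
	// be expressed in units of 1000 seconds: minVal becomes an absolute
	// cutoff timestamp near the epoch and maxVal a cutoff that far before
	// m_now (a unix time in seconds); dates at or below minVal take the
	// full penalty below, and older dates are presumably demoted more by
	// rerankLowerDemotesMore()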
//log( LOG_DEBUG, "query:in PQR::rerankDatedbDate("
// "score:%ld, datedbDate:%ld)"
// "[P_factor:%3.3f; maxVal:%ld] AWL",
// score, datedbDate,
// factor, maxVal );
// don't penalize results whose publish date is unknown
if ( datedbDate == -1 ) return score;
if ( datedbDate <= minVal )
		return rerankAssignPenalty( score,
					    factor,
					    "pqrdate",
					    "publish date is older than "
					    "minimum value" );
return rerankLowerDemotesMore( score,
datedbDate-minVal, maxVal-minVal,
factor,
"pqrdate",
"publish date" );
}
// pqrprox
// . demote pages by the average distance of query terms from one another
//   in the document. Lower score is better.
/*
rscore_t PostQueryRerank::rerankProximity( rscore_t score,
float proximityScore,
float maxScore) {
// . a -1 implies did not have any query terms
// . see Summary.cpp proximity algo
if ( proximityScore == -1 ) return 0;
if(m_si->m_pqr_demFactProximity <= 0) return score;
float factor = (// 1 -
(proximityScore/maxScore)) *
m_si->m_pqr_demFactProximity;
if ( factor <= 0 ) return score;
//return rerankAssignPenalty(score,
// factor,
// "pqrprox",
// "proximity rerank");
// just divide the score by the proximityScore now
// ...new stuff...
if ( proximityScore == 0.0 ) return score;
float score2 = (float)score;
score2 /= proximityScore;
score2 += 0.5;
rscore_t newScore = (rscore_t)score2;
if(m_si->m_debug || g_conf.m_logDebugPQR )
		logf( LOG_DEBUG, "query: pqr: result demoted "
		      "from %.02f to %.02f because of proximity rerank",
		      (float)score,(float)newScore);
return newScore;
}
*/
// pqrinsec
// . demote pages by the average score of the sections in which the query
//   terms appear. Higher score is better.
rscore_t PostQueryRerank::rerankInSection( rscore_t score,
long summaryScore,
float maxScore) {
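	// factor scales linearly with how far summaryScore falls below
	// maxScore. illustrative example (numbers not from the source): with
	// m_pqr_demFactInSection == 0.2, summaryScore == maxScore yields
	// factor 0 (no penalty, score returned unchanged) while
	// summaryScore == 0 yields the full 0.2, which rerankAssignPenalty()
	// presumably applies as the demotion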
	if(m_si->m_pqr_demFactInSection <= 0) return score;
	// guard against division by zero
	if ( maxScore <= 0 ) return score;
	float factor = ( 1 -
			 (summaryScore/maxScore)) *
		m_si->m_pqr_demFactInSection;
if ( factor <= 0 ) return score;
return rerankAssignPenalty(score,
factor,
"pqrsection",
"section rerank");
}
/*
rscore_t PostQueryRerank::rerankSubPhrase( rscore_t score,
float diversity,
float maxDiversity) {
if(maxDiversity == 0) return score;
float factor = (1 - (diversity/maxDiversity)) *
m_si->m_pqr_demFactSubPhrase;
if ( factor <= 0 ) return score;
return rerankAssignPenalty(score,
factor,
"pqrspd",
"subphrase demotion");
}
*/
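// . attemptToCluster(): walk the already-sorted results and, when a result
//   shares its host hash with an earlier result and sits more than 1 but
//   fewer than 10 positions below it, raise its score to match that earlier
//   result; the re-sort (score descending, host hash as tie-breaker) then
//   places the two next to each other, i.e. "clusters" them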
bool PostQueryRerank::attemptToCluster ( ) {
// find results that should be clustered
bool needResort = false;
HashTableT<uint32_t, long> hostPosTable;
hostPosTable.set(m_numToSort);
for (long i = 0; i < m_numToSort; i++) {
// look up this hostname to see if it's been clustered
uint32_t key = m_m20List[i].m_host;
if ( key == 0 ) key = 1;
long slot = hostPosTable.getSlot(key);
if (slot != -1) {
			// cluster only if this result is more than 1 but
			// fewer than 10 positions below the first result
			// seen from this host
long firstPos = hostPosTable.getValueFromSlot(slot);
if (i - firstPos > 1 && i - firstPos < 10) {
// this result can be clustered
rscore_t maxNewScore;
maxNewScore = m_m20List[firstPos].m_score;
if (maxNewScore <= m_m20List[i].m_score)
continue;
needResort = true;
if(m_si->m_debug||g_conf.m_logDebugPQR )
logf(LOG_DEBUG, "pqr: re-ranking result "
"%ld (%s) from score %.02f to "
"score %.02f "
"in order to cluster it with "
"result "
"%ld (%s)",
i,
m_m20List[i].m_m20->m_r->ptr_ubuf,
(float)m_m20List[i].m_score,
(float)maxNewScore,
firstPos,
m_m20List[firstPos].m_m20->m_r->ptr_ubuf);
// bump up the score to cluster this result
m_m20List[i].m_score = maxNewScore;
}
else {
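				// adjacent or too far away to cluster;
				// make this result the new reference
				// position for its host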
hostPosTable.setValue(slot, i);
}
}
else {
// add the hostname of this result to the table
if (!hostPosTable.addKey(key, i)) {
g_errno = ENOMEM;
return false;
}
}
}
// re-sort the array if necessary
if (needResort) {
log(LOG_DEBUG, "pqr: re-sorting results for clustering");
gbmergesort( (void *) m_m20List, (size_t) m_numToSort,
(size_t) sizeof(M20List),
(int (*)(const void *, const void *))s_reSortFunction);
}
return true;
}
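// NOTE: s_firstSortFunction() and s_reSortFunction() are currently
// identical (score descending, host hash as tie-breaker); they are
// presumably kept separate so the initial sort and the post-clustering
// re-sort can diverge again if tier/bitscore ordering is ever re-enabled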
// Sort function for post query reranking's M20List
static int32_t s_firstSortFunction(const M20List * a, const M20List * b)
{
	// Tier/bitscore ordering is disabled; the commented-out checks below
	// are kept for reference (an explicit match (0x40) in a higher tier
	// used to take precedence over an implicit match (0x20) from a lower
	// tier). Sort by score descending, breaking ties on host hash.
//if ( a->tier < b->tier &&
// (a->bitScore & 0x40 || !b->bitScore & 0x40) )
// return -1;
//if ( a->tier > b->tier &&
// (b->bitScore & 0x40 || !a->bitScore & 0x40) )
// return 1;
// Absolute match proximity
//if ( a->m20->m_proximityScore > b->m20->proximityScore )
// return -1;
//else if ( a->m20->m_proximityScore < b->m20->proximityScore )
// return 1;
// same tier, same proximity, sort by score
if ( a->m_score > b->m_score )
return -1;
if ( a->m_score < b->m_score )
return 1;
// same tier, same proximity, same score, sort by docid
//if ( a->docId < b->docId )
// return -1;
//if ( a->docId > b->docId )
// return 1;
// same score, sort by host
if ( a->m_host > b->m_host )
return -1;
if ( a->m_host < b->m_host )
return 1;
return 0;
}
// Sort function for post query reranking's M20List
static int32_t s_reSortFunction(const M20List * a, const M20List * b)
{
	// Tier/bitscore ordering is disabled; the commented-out checks below
	// are kept for reference (an explicit match (0x40) in a higher tier
	// used to take precedence over an implicit match (0x20) from a lower
	// tier). Sort by score descending, breaking ties on host hash.
//if ( a->tier < b->tier &&
// (a->bitScore & 0x40 || !b->bitScore & 0x40) )
// return -1;
//if ( a->tier > b->tier &&
// (b->bitScore & 0x40 || !a->bitScore & 0x40) )
// return 1;
// Absolute match proximity
//if ( a->m20->m_proximityScore > b->m20->proximityScore )
// return -1;
//else if ( a->m20->m_proximityScore < b->m20->proximityScore )
// return 1;
// same tier, same proximity, sort by score
if ( a->m_score > b->m_score )
return -1;
if ( a->m_score < b->m_score )
return 1;
// same tier, same proximity, same score, sort by docid
//if ( a->docId < b->docId )
// return -1;
//if ( a->docId > b->docId )
// return 1;
// same score, sort by host
if ( a->m_host > b->m_host )
return -1;
if ( a->m_host < b->m_host )
return 1;
return 0;
}
#ifdef DEBUGGING_LANGUAGE
// Debug stuff, remove before flight
static void DoDump(char *loc, Msg20 **m20, long num,
score_t *scores, char *tiers) {
int x;
char *url;
//log(LOG_DEBUG, "query: DoDump(): checkpoint %s AWL DEBUG", loc);
for(x = 0; x < num; x++) {
url = m20[x]->getUrl();
if(!url) url = "None";
//log( LOG_DEBUG, "query: DoDump(%d): "
// "tier:%d score:%ld [url:'%s'] msg20:%p\n AWL DEBUG",
// x, tiers[x], scores[x], url, m20[x] );
}
}
#endif // DEBUGGING_LANGUAGE