// open-source-search-engine/PostQueryRerank.cpp
#include "gb-include.h"
#include "PostQueryRerank.h"
#include "Msg40.h"
#include "LanguageIdentifier.h"
#include "sort.h"
//#include "Thesaurus.h"
//#include "AppendingWordsWindow.h"
//#include "Places.h"
#include "Profiler.h"
#include "CountryCode.h"
#include "Phrases.h"
#include "Linkdb.h"
#define TOTAL_RERANKING_TIME_STR "PostQueryRerank Total Reranking Time"
//#define DEBUGGING_LANGUAGE
// Type for post query reranking weighted sort list
struct M20List {
Msg20 *m_m20;
//long m_score;
rscore_t m_score;
//int m_tier;
long long m_docId;
char m_clusterLevel;
//long m_bitScore;
long m_numCommonInlinks;
uint32_t m_host;
};
static int32_t s_firstSortFunction( const M20List * a, const M20List * b );
static int32_t s_reSortFunction ( const M20List * a, const M20List * b );
#ifdef DEBUGGING_LANGUAGE
static void DoDump(char *loc, Msg20 **m20, long num,
score_t *scores, char *tiers);
#endif
bool PostQueryRerank::init ( ) {
return true;
}
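// . rough call sequence, as inferred from this file (the caller is
// . presumably Msg40):
// .   set1 ( msg40, si )     - decide whether pqr is enabled at all
// .   set2 ( resultsWanted ) - after numDocIds is known; caps, sanity
// .                            checks and the per-result Url array
// .   preRerank ( )          - build the M20List sort array, host and
// .                            dmoz tables and common-inlinker counts
// .   rerank ( )             - apply the per-result demotion factors
// .   postRerank ( )         - sort and write results back into Msg40
// .   rerankFailed ( )       - called instead if reranking was aborted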
PostQueryRerank::PostQueryRerank ( ) {
//log( LOG_DEBUG, "query:in PQR::PQR() AWL" );
m_enabled = false;
m_maxResultsToRerank = 0;
m_numToSort = 0;
m_m20List = NULL;
m_positionList = NULL;
m_msg40 = NULL;
//m_querysLoc = 0;
m_maxUrlLen = 0;
m_pageUrl = NULL;
// initialize so the destructor never frees an uninitialized pointer
// when preRerank() is never reached
m_cvtUrl = NULL;
m_now = time(NULL);
}
PostQueryRerank::~PostQueryRerank ( ) {
//log( LOG_DEBUG, "query:in PQR::~PQR() AWL" );
if ( m_m20List ) {
mfree( m_m20List, sizeof(M20List) * m_maxResultsToRerank,
"PostQueryRerank" );
m_m20List = NULL;
}
if ( m_positionList ) {
mfree( m_positionList, sizeof(long) * m_maxResultsToRerank,
"PQRPosList" );
m_positionList = NULL;
}
if ( m_cvtUrl ) mfree( m_cvtUrl, m_maxUrlLen, "pqrcvtUrl") ;
if ( m_pageUrl ) mfree( m_pageUrl, sizeof(Url)*m_maxResultsToRerank,
"pqrpageUrls" );
}
// returns false on error
bool PostQueryRerank::set1 ( Msg40 *msg40, SearchInput *si ) {
//log(LOG_DEBUG, "query:in PQR::set1(%p) AWL", msg40);
m_msg40 = msg40;
m_si = si;
if ( ! m_msg40 ) return false;
if ( ! m_si ) return false;
if ( ! m_si->m_cr ) return false;
m_enabled = (m_si->m_docsToScanForReranking > 1);
//log( LOG_DEBUG, "query: m_isEnabled:%ld; "
// "P_docsToScanForReranking:%ld P_pqr_docsToSan:%ld; AWL",
// (long)m_enabled,
// m_si->m_docsToScanForReranking,
// m_si->m_cr->m_pqr_docsToScan );
return m_enabled;
}
// must be called sometime after we know numDocIds and before preRerank
// returns false if we shouldn't rerank
bool PostQueryRerank::set2 ( long resultsWanted ) {
//log(LOG_DEBUG, "query:in PQR::set2() AWL");
//log( LOG_DEBUG, "query: firstResultNum:%ld; numResults:%ld; "
// "wanted:%ld numMsg20s:%ld AWL",
// m_msg40->getFirstResultNum(), m_msg40->getNumResults(),
// resultsWanted, m_msg40->m_numMsg20s );
// we only want to check the lesser of docsToScan and numDocIds
m_maxResultsToRerank = m_si->m_docsToScanForReranking;
if ( m_maxResultsToRerank > m_msg40->getNumDocIds() ) {
m_maxResultsToRerank = m_msg40->getNumDocIds();
log( LOG_DEBUG, "pqr: request to rerank more results "
"than the number of docids, capping number to rerank "
"at %ld", m_maxResultsToRerank );
}
// if clustering/deduping left us with fewer results than docids
// and fewer than the number of results wanted then ...
if ( m_msg40->getNumResults() < m_msg40->getNumDocIds() &&
m_msg40->getNumResults() < resultsWanted )
return false;
// are we past pqr's range?
if ( m_msg40->getFirstResultNum() > m_maxResultsToRerank )
return false;
// Safety check, make sure we never rerank more results
// than we have Msg20s
if ( m_msg40->m_numMsg20s < m_maxResultsToRerank )
m_maxResultsToRerank = m_msg40->m_numMsg20s;
//log( LOG_DEBUG, "query: m_maxResultsToRerank:%ld AWL",
// m_maxResultsToRerank );
if ( m_maxResultsToRerank < 2 ) {
//log( LOG_INFO, "pqr: too few results to rerank" );
return false;
}
if ( m_maxResultsToRerank > 250 ) {
log( LOG_INFO, "pqr: too many results to rerank, "
"capping at 250" );
m_maxResultsToRerank = 250;
}
// see if we are done
if ( m_msg40->getFirstResultNum() >= m_maxResultsToRerank ) {
log( LOG_INFO, "pqr: first result is higher than max "
"results to rerank" );
return false;
}
// get space for host count table
m_hostCntTable.set( m_maxResultsToRerank );
// get some space for dmoz table
m_dmozTable.set( m_maxResultsToRerank << 1 );
// alloc urls for pqrqttiu, pqrfsh and clustering
m_pageUrl = (Url *)mcalloc( sizeof(Url)*m_maxResultsToRerank,
"pqrpageUrls" );
if ( ! m_pageUrl ) {
log("pqr: had out of memory error");
return false;
}
return true;
}
// sets up PostQueryRerank for each page in m_maxResultsToRerank
// returns false on error
bool PostQueryRerank::preRerank ( ) {
//if ( g_conf.m_profilingEnabled ) {
// g_profiler
// .startTimer((long)(this->*(&PostQueryRerank::rerank)),
// TOTAL_RERANKING_TIME_STR );
//}
//log( LOG_DEBUG, "query:in PQR::preRerank() AWL" );
#ifdef DEBUGGING_LANGUAGE
DoDump( "Presort", m_msg40->m_msg20, m_maxResultsToRerank,
m_msg40->m_msg3a.m_scores, NULL);//m_msg40->m_msg3a.m_tiers );
#endif
if( m_si->m_enableLanguageSorting
&& !m_si->m_langHint )
log( LOG_INFO, "pqr: no language set for sort. "
"language will not be reranked" );
SANITYCHECK( ! m_m20List );
m_m20List = (M20List*)mcalloc( sizeof(M20List) * m_maxResultsToRerank,
"PostQueryRerank" );
if( ! m_m20List ) {
log( LOG_INFO, "pqr: Could not allocate PostQueryRerank "
"sort memory.\n" );
g_errno = ENOMEM;
return(false);
}
SANITYCHECK( ! m_positionList );
m_positionList = (long *)mcalloc( sizeof(long) * m_maxResultsToRerank,
"PQRPosList" );
if( ! m_positionList ) {
log( LOG_INFO, "pqr: Could not allocate PostQueryRerank "
"postion list memory.\n" );
g_errno = ENOMEM;
return(false);
}
//log(LOG_DEBUG, "pqr: the query is '%s' AWL", m_si->m_q->m_orig);
// setup for rerankNonLocationSpecificQueries if enabled
//if ( ! preRerankNonLocationSpecificQueries() )
// return false;
// . make a temp hash table for iptop
// . each slot is a long key and a long value
HashTable ipTable;
// how many slots
long numSlots = 5000 / ((4+4)*4);
char tmp[5000];
// this should NEVER need to allocate, UNLESS for some reason we got
// a ton of inlinking ips
if ( ! ipTable.set ( numSlots , tmp , 5000 ) ) return false;
// this table maps a docid to the number of search results it links to
HashTableT <long long, long> inlinkTable;
char tmp2[5000];
long numSlots2 = 5000 / ((8+4)*4);
if ( ! inlinkTable.set ( numSlots2 , tmp2 , 5000 ) ) return false;
// Fill sort array
long y = 0;
for( long x = 0;
x < m_msg40->m_numMsg20s && y < m_maxResultsToRerank;
x++ ) {
// skip clustered out results
char clusterLevel = m_msg40->getClusterLevel( x );
if ( clusterLevel != CR_OK ) {
//log( LOG_DEBUG, "pqr: skipping result "
// "%ld since cluster level(%ld) != "
// "CR_OK(%ld) AWL",
// x, (long)clusterLevel, (long)CR_OK );
continue;
}
// skip results that don't match all query terms
//long bitScore = m_msg40->getBitScore( x );
//if ( bitScore == 0x00 ) continue;
// . save position of this result so we can fill it in later
// with (possibly) a higher ranking result
m_positionList[y] = x;
M20List *sortArrItem = &m_m20List [ y ];
sortArrItem->m_clusterLevel = clusterLevel ;
sortArrItem->m_m20 = m_msg40->m_msg20 [ x ];
sortArrItem->m_score = (rscore_t)m_msg40->getScore(x);
//sortArrItem->m_tier = m_msg40->getTier ( x );
sortArrItem->m_docId = m_msg40->getDocId ( x );
//sortArrItem->m_bitScore = bitScore ;
sortArrItem->m_host = 0; // to be filled in later
Msg20 *msg20 = sortArrItem->m_m20;
SANITYCHECK( msg20 && ! msg20->m_errno );
Msg20Reply *mr = msg20->m_r;
// set the urls for each page
// used by pqrqttiu, pqrfsh and clustering
m_pageUrl[y].set( mr->ptr_ubuf , false );
// now fill in host without the 'www.' if present
char *host = m_pageUrl[y].getHost();
long hostLen = m_pageUrl[y].getHostLen();
if (hostLen > 4 &&
host[3] == '.' &&
host[0] == 'w' && host[1] == 'w' && host[2] == 'w')
sortArrItem->m_host = hash32(host+4, hostLen-4);
else
sortArrItem->m_host = hash32(host, hostLen);
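// e.g. "www.ibm.com" and "ibm.com" both yield hash32("ibm.com"), so
// the two spellings of the same host get the same m_host value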
// add its inlinking docids into the hash table, inlinkTable
LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks;
//long n = msg20->getNumInlinks ();
//long long *docIds = msg20->getInlinkDocIds ();
//char *flags = msg20->getInlinkFlags ();
//long *ips = msg20->getInlinkIps ();
//char *qualities = msg20->getInlinkQualities ();
// skip adding the inlinking docids if search result has bad ip
long ip = mr->m_ip;//msg20->getIp();
bool good = true;
if ( ip == 0 ) good = false;
if ( ip == -1 ) good = false;
// . skip adding inlinkers if we already did this "ip top"
// . "ip top" is the most significant 3 bytes of the ip
// . get the ip of the docid:
long top = iptop ( ip );
// if we already encountered a higher-scoring search result
// with the same iptop, do not count its inlinkers!
// so that if an inlinker links to two docids in the search
// results, where those two docids are from the same
// "ip top" then the docid is only "counted" once here.
if ( ipTable.getSlot ( top ) >= 0 ) good = false;
// not allowed to be 0
if ( top == 0 ) top = 1;
// now add to table so we do not add the inlinkers from
// any other search results from the same "ip top"
if ( ! ipTable.addKey ( top , 1 ) ) return false;
// now hash all the inlinking docids into inlinkTable
for ( Inlink *k=NULL; good && (k=info->getNextInlink(k) ) ; ) {
// skip it if it is link spam
if ( k->m_isLinkSpam ) continue;
// must be quality of 35 or higher to "vote"
//if ( k->m_docQuality < 35 ) continue;
if ( k->m_siteNumInlinks < 20 ) continue;
// skip if bad ip for inlinker
if ( k->m_ip == 0 || k->m_ip == -1 ) continue;
// skip if inlinker has same top ip as search result
if ( iptop(k->m_ip) == top ) continue;
// get the current slot in table from docid of inlinker
long slot = inlinkTable.getSlot ( k->m_docId );
// get the score
if ( slot >= 0 ) {
long count=inlinkTable.getValueFromSlot(slot);
inlinkTable.setValue ( slot , count + 1 );
continue;
}
// add it fresh if not already in there
if (!inlinkTable.addKey(k->m_docId,1)) return false;
}
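// once the outer loop has visited every result, inlinkTable maps each
// qualifying inlinker docid to the number of search results it links
// to, counting at most one result per "ip top"; m_numCommonInlinks
// below is derived from these counts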
//log( LOG_DEBUG, "pqr: pre: setting up sort array - "
// "mapping x:%ld to y:%ld; "
// "url:'%s' (%ld); tier:%d; score:%ld; "
// "docId:%lld; clusterLevel:%d; AWL",
// x, y,
// msg20->getUrl(), msg20->getUrlLen(),
// sortArrItem->tier, sortArrItem->score,
// sortArrItem->docId, sortArrItem->clusterLevel );
// setup reranking for pages from the same host (pqrfsd)
if ( ! preRerankOtherPagesFromSameHost( &m_pageUrl[y] ))
return false;
// setup reranking for pages with common topics in dmoz (pqrctid)
if ( ! preRerankCommonTopicsInDmoz( mr ) )
return false;
// . calculate maximum url length in pages for reranking
// by query terms or topics in a url
long urlLen = mr->size_ubuf - 1;//msg20->getUrlLen();
if ( urlLen > m_maxUrlLen )
m_maxUrlLen = urlLen;
// update num to rerank and sort
m_numToSort++;
y++;
}
// get the max
m_maxCommonInlinks = 0;
// how many of OUR inlinkers are shared by other results?
for ( long i = 0; i < m_numToSort; i++ ) {
// get the item
M20List *sortArrItem = &m_m20List [ i ];
Msg20 *msg20 = sortArrItem->m_m20;
// reset
sortArrItem->m_numCommonInlinks = 0;
// lookup its inlinking docids in the hash table
//long n = msg20->getNumInlinks ();
//long long *docIds = msg20->getInlinkDocIds ();
LinkInfo *info = (LinkInfo *)msg20->m_r->ptr_linkInfo;
for ( Inlink *k=NULL;info&&(k=info->getNextInlink(k)) ; ) {
// how many search results does this inlinker link to?
long*v=(long *)inlinkTable.getValuePointer(k->m_docId);
if ( ! v ) continue;
// if only 1 result had this as an inlinker, skip it
if ( *v <= 1 ) continue;
// ok, give us a point
sortArrItem->m_numCommonInlinks++;
}
// get the max
if ( sortArrItem->m_numCommonInlinks > m_maxCommonInlinks )
m_maxCommonInlinks = sortArrItem->m_numCommonInlinks;
}
// . setup reranking for query terms or topics in url (pqrqttiu)
// . add space to max url length for terminating NULL and allocate
// room for max length
m_maxUrlLen++;
m_cvtUrl = (char *)mmalloc( m_maxUrlLen, "pqrcvtUrl" );
if ( ! m_cvtUrl ) {
log( LOG_INFO, "pqr: Could not allocate %ld bytes "
"for m_cvtUrl.",
m_maxUrlLen );
g_errno = ENOMEM;
return false;
}
// Safety valve, trim sort results
if ( m_numToSort > m_maxResultsToRerank )
m_numToSort = m_maxResultsToRerank;
//log( LOG_DEBUG, "pqr::m_numToSort:%ld AWL", m_numToSort );
return true;
}
// perform actual reranking of m_numToSort pages
// returns false on error
bool PostQueryRerank::rerank ( ) {
//log(LOG_DEBUG,"query:in PQR::rerank() AWL");
if(m_si->m_debug||g_conf.m_logDebugPQR )
logf( LOG_DEBUG, "pqr: reranking %ld results",
m_numToSort );
/*
float maxDiversity = 0;
if(m_si->m_pqr_demFactSubPhrase > 0) {
for ( long x = 0; x < m_numToSort; x++ ) {
M20List *sortArrItem = &m_m20List [ x ];
Msg20 *msg20 = sortArrItem->m_m20;
if ( ! msg20 || msg20->m_errno ) continue;
float d = msg20->m_r->m_diversity;
if(d > maxDiversity) maxDiversity = d;
}
}
float maxProximityScore = 0;
float minProximityScore = -1.0;
//float maxInSectionScore = 0;
if(m_si->m_pqr_demFactProximity > 0 ||
m_si->m_pqr_demFactInSection > 0) {
//grab the max score so that we know what the max to
//demote is.
for ( long x = 0; x < m_numToSort; x++ ) {
M20List *sortArrItem = &m_m20List [ x ];
Msg20 *msg20 = sortArrItem->m_m20;
if ( ! msg20 || msg20->m_errno ) continue;
//float d = msg20->m_r->m_inSectionScore;
//if(d > maxInSectionScore)
// maxInSectionScore = d;
// handle proximity
float d = msg20->m_r->m_proximityScore;
// i think this means it does not have all the query
// terms! for 'sylvain segal' we got
// www.regalosdirectos.tv/asp2/comparar.asp?cat=36
// in results
if ( d == 0.0 ) continue;
// . -1 is a bogus proximity
// . it means we were not able to find all the terms
// because they were in anomalous link text or
// meta tags or select tags or whatever... so for
// now such results will not be demoted to be on the
// safe side
if ( d == -1.0 ) continue;
if ( d > maxProximityScore )
maxProximityScore = d;
if ( d < minProximityScore || minProximityScore==-1.0 )
minProximityScore = d;
}
}
*/
// rerank weighted sort list
for ( register long x = 0; x < m_numToSort; x++ ) {
M20List *sortArrItem = &m_m20List [ x ];
Msg20 *msg20 = sortArrItem->m_m20;
char *url = NULL;
rscore_t score = sortArrItem->m_score;
rscore_t startScore = score;
// mwells: what is this?
if(m_si->m_pqr_demFactOrigScore < 1) {
//turn off the indexed score and just use a uniform start score
//because I can't get the proximity pqr to overwhelm the
//preexisting score.
score = 1000000 + (m_numToSort - x) +
(long)(score * m_si->m_pqr_demFactOrigScore);
startScore = score;
}
// if we don't have a good msg20, skip reranking for this result
if ( ! msg20 || msg20->m_errno )
continue;
url = msg20->m_r->ptr_ubuf;//getUrl();
if ( ! url ) url = "(none)";
if(m_si->m_debug||g_conf.m_logDebugPQR )
logf(LOG_DEBUG, "pqr: result #%ld:'%s' has initial "
"score of %.02f",
x, url, (float)startScore );
// resets
msg20->m_pqr_old_score = score;
msg20->m_pqr_factor_quality = 1.0;
msg20->m_pqr_factor_diversity = 1.0;
msg20->m_pqr_factor_inlinkers = 1.0;
msg20->m_pqr_factor_proximity = 1.0;
msg20->m_pqr_factor_ctype = 1.0;
msg20->m_pqr_factor_lang = 1.0; // includes country
Msg20Reply *mr = msg20->m_r;
// demote for language and country
score = rerankLanguageAndCountry( score,
mr->m_language ,
mr->m_summaryLanguage,
mr->m_country, // id
msg20 );
// demote for content-type
float htmlFactor = m_si->m_cr->m_pqr_demFactNonHtml;
float xmlFactor = m_si->m_cr->m_pqr_demFactXml;
long contentType= mr->m_contentType;
if ( contentType == CT_XML && xmlFactor > 0 ) {
score = score * xmlFactor;
msg20->m_pqr_factor_ctype = xmlFactor;
}
else if ( contentType != CT_HTML && htmlFactor > 0 ) {
score = score * htmlFactor;
msg20->m_pqr_factor_ctype = htmlFactor;
}
//if ( score == 1 ) goto finishloop;
// demote for fewer query terms or gigabits in url
//score = rerankQueryTermsOrGigabitsInUrl( score,
// &m_pageUrl[x] );
// . demote for not high quality
// . multiply by "qf" for every quality point below 100
// . now we basically do this if we have a wiki title
// . float qf = m_si->m_cr->m_pqr_demFactQual;
/*
if ( m_msg40->m_msg3a.m_oneTitle ) {
//long q = msg20->getQuality();
long sni = mr->m_siteNumInlinks;
if ( sni <= 0 ) sni = 1;
float weight = 1.0;
for ( ; sni < 100000 ; sni *= 2 )
weight = weight * 0.95;
// apply the weight to the score
score = score * weight;
// store that for print in PageResults.cpp
msg20->m_pqr_factor_quality = weight;
}
*/
// demote for more paths in url
score = rerankPathsInUrl( score,
msg20->m_r->ptr_ubuf,//getUrl(),
msg20->m_r->size_ubuf-1 );
// demote if the smallest cat id has a lot of super topics
score = rerankSmallestCatIdHasSuperTopics( score,
msg20 );
// demote for larger page sizes
score = rerankPageSize( score,
msg20->m_r->m_contentLen );
// . demote for non location specific queries that have
// an obvious location in gigabits or url
//score = rerankNonLocationSpecificQueries( score,
// msg20 );
//if ( score == 1 ) goto finishloop;
// demote for no cat id
score = rerankNoCatId( score,
msg20->m_r->size_catIds/4,
msg20->m_r->size_indCatIds/4);
// demote for no other pages from same host
score = rerankOtherPagesFromSameHost( score,
&m_pageUrl[x] );
// demote for fewer common topics in dmoz
score = rerankCommonTopicsInDmoz( score,
msg20 );
// . demote for pages whose dmoz category names do not
// contain a query term
//score = rerankDmozCategoryNamesDontHaveQT( score,
// msg20 );
// . demote for pages whose dmoz category names do not
// contain a gigabit
//score = rerankDmozCategoryNamesDontHaveGigabits( score,
// msg20 );
// . demote pages for older datedb dates
score = rerankDatedbDate( score,
msg20->m_r->m_datedbDate );
/*
// . demote pages by proximity
// . a -1 prox implies did not have any query terms
// . see Summary.cpp proximity algo
float ps = msg20->m_r->m_proximityScore;//getProximityScore();
if ( ps > 0.0 &&
m_si->m_pqr_demFactProximity > 0 &&
minProximityScore != -1.0 ) {
// what percent were we of the max?
float factor = minProximityScore / ps ;
// this can be weighted
//factor *= m_si->m_pqr_demFactProximity;
// apply the factor to the score
score *= factor;
// this is the factor
msg20->m_pqr_factor_proximity = factor;
}
// . demote pages by the average of the scores of the
// . terms based upon what section of the doc they are in
// . mdw: proximity algo should obsolete this
//if(maxInSectionScore > 0)
// score = rerankInSection( score,
// msg20->getInSectionScore(),
// maxInSectionScore);
// . demote pages which only have the query as a part of a
// . larger phrase
if ( maxDiversity != 0 ) {
float diversity = msg20->m_r->m_diversity;
float df = (1 - (diversity/maxDiversity)) *
m_si->m_pqr_demFactSubPhrase;
score = (rscore_t)(score * (1.0 - df));
if ( score <= 0.0 ) score = 0.001;
msg20->m_pqr_factor_diversity = 1.0 - df;
}
*/
// . COMMON INLINKER RERANK
// . no need to create a superfluous function call here
// . demote pages that do not share many inlinking docids
// with other pages in the search results
if ( m_maxCommonInlinks>0 && m_si->m_pqr_demFactCommonInlinks){
long nc = sortArrItem->m_numCommonInlinks ;
float penalty;
// the more inlinkers, the less the penalty
penalty = 1.0 -(((float)nc)/(float)m_maxCommonInlinks);
// . reduce the penalty for higher quality pages
// . they are the most likely to have their inlinkers
// truncated
//char quality = msg20->getQuality();
float sni = (float)msg20->m_r->m_siteNumInlinks;
// decrease penalty for really high quality docs
//while ( quality-- > 60 ) penalty *= .95;
for ( ; sni > 1000 ; sni *= .80 ) penalty *= .95;
// if this parm is 0, penalty will become 0
penalty *= m_si->m_pqr_demFactCommonInlinks;
// apply the penalty to the score
score = score * (1.0 - penalty);
// do not decrease all the way to 0!
if ( score <= 0.0 ) score = 0.001;
// store it!
msg20->m_pqr_factor_inlinkers = 1.0 - penalty;
}
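// example of the arithmetic above, using a hypothetical parm value:
// with m_maxCommonInlinks = 4, nc = 1 shared inlinker and
// m_pqr_demFactCommonInlinks = 0.5, penalty = (1 - 1/4) * 0.5 = 0.375
// and the score is multiplied by 0.625 (ignoring the sni reduction,
// which only applies above 1000 site inlinks)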
// finishloop:
if(m_si->m_debug || g_conf.m_logDebugPQR )
logf( LOG_DEBUG, "pqr: result #%ld's final "
"score is %.02f (-%3.3f%%) ",
x, (float)score,100-100*(float)score/startScore );
sortArrItem->m_score = score;
}
return(true);
}
// perform post reranking tasks
// returns false on error
bool PostQueryRerank::postRerank ( ) {
//log( LOG_DEBUG, "query:in PQR::postRerank() AWL" );
// Hopefully never happen...
//log( LOG_DEBUG, "query: just before sort: "
// "m_maxResultsToRerank:%ld m_numToSort:%ld AWL",
// m_maxResultsToRerank, m_numToSort);
if ( m_numToSort < 0 ) return false;
// Sort the array
gbmergesort( (void *) m_m20List, (size_t) m_numToSort,
(size_t) sizeof(M20List),
(int (*)(const void *, const void *))s_firstSortFunction);
// move 2nd result from a particular domain to just below the first
// result from that domain if it is within 10 results of the first
//XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX put this back in after debugging summary rerank!
//XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
//if (!attemptToCluster()) return false;
// Fill result arrays with our reranked results
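// m_positionList[y] remembers which original slot x the y-th sort
// item came from (see preRerank), so reranked results are written
// back into exactly the slots the non-clustered results occupied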
for( long y = 0; y < m_numToSort; y++ ) {
M20List *a = &m_m20List [ y ];
long x = m_positionList [ y ];
m_msg40->m_msg20 [ x ] = a->m_m20;
//m_msg40->m_msg3a.m_tiers [ x ] = a->m_tier;
m_msg40->m_msg3a.m_scores [ x ] = a->m_score;
m_msg40->m_msg3a.m_docIds [ x ] = a->m_docId;
m_msg40->m_msg3a.m_clusterLevels [ x ] = a->m_clusterLevel;
//log( LOG_DEBUG, "pqr: post: mapped y:%ld "
// "to x:%ld AWL",
// y, x );
}
#ifdef DEBUGGING_LANGUAGE
DoDump( "Postsort", m_msg40->m_msg20, m_numToSort,
m_msg40->m_msg3a.m_scores, NULL );//m_msg40->m_msg3a.m_tiers );
#endif
//if ( ! g_conf.m_profilingEnabled ) return true;
//if ( ! g_profiler.endTimer( (long)(this->*(&PostQueryRerank::rerank)),
// TOTAL_RERANKING_TIME_STR) )
// log( LOG_WARN,"admin: Couldn't add the fn %li",
// (long)(this->*(&PostQueryRerank::rerank)) );
return true;
}
// called if we weren't able to rerank for some reason
void PostQueryRerank::rerankFailed ( ) {
//if ( g_conf.m_profilingEnabled ) {
// if( ! g_profiler
// .endTimer( (long)(this->*(&PostQueryRerank::rerank)),
// TOTAL_RERANKING_TIME_STR) )
// log(LOG_WARN,"admin: Couldn't add the fn %li",
// (long)(this->*(&PostQueryRerank::rerank)));
//}
}
// lsort (pqrlang, pqrlangunk, pqrcntry)
// rerank for language, then country
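// . decision ladder, in code order:
// .   lsort off, or no wanted language known    -> no change
// .   page language unknown (and the unknown-
// .   language weight parm is > 0)              -> pqrlangunk penalty
// .   summary language unknown                  -> no change
// .   summary language differs from wanted one  -> pqrlang penalty
// .   languages match but the page country and
// .   query country differ and share at least
// .   one written language                      -> pqrcntry penalty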
rscore_t PostQueryRerank::rerankLanguageAndCountry ( rscore_t score,
uint8_t lang,
uint8_t summaryLang,
uint16_t country ,
Msg20 *msg20 ) {
//log( LOG_DEBUG, "query:in PQR::rerankLanguageAndCountry("
// "score:%ld, lang:%ld, summLang:%ld, country:%ld)"
// "[langSortingIsOn:%ld; langUnkWeight:%3.3f; langWeight:%3.3f; "
// "&qlang=%ld; &lang=%ld; "
// "&qcountry=%ld; &gbcountry=%ld; "
// "queryLangs:%lld; pageLangs:%lld] AWL",
// score, (long)lang, (long)summaryLang, (long)country,
// (long)m_si->m_enableLanguageSorting,
// m_si->m_languageUnknownWeight,
// m_si->m_languageWeightFactor,
// (long)m_si->m_langHint,
// (long)m_si->m_language,
// (long)m_si->m_countryHint,
// (long)m_si->m_country,
// g_countryCode.getLanguagesWritten( m_si->m_countryHint ),
// g_countryCode.getLanguagesWritten( country ) );
// if lsort is off, skip
if ( ! m_si->m_enableLanguageSorting ) return score;
// . use query language (si->m_langHint) or restricted search
// language (si->m_language)
// . if both are 0, don't rerank by language
uint8_t langWanted = m_si->m_langHint;
if ( langWanted == langUnknown ) langWanted = m_si->m_queryLang;//language;
if ( langWanted == langUnknown ) return score;
// . apply score factors for unknown languages, iff reranking unknown
// languages
if ( lang == langUnknown &&
m_si->m_languageUnknownWeight > 0 ) {
msg20->m_pqr_factor_lang =m_si->m_languageUnknownWeight;
return rerankAssignPenalty(score,
m_si->m_languageUnknownWeight,
"pqrlangunk",
"it's language is unknown" );
}
// . if computed language is unknown, don't penalize
// . no, what if from a different country?
if ( summaryLang == langUnknown ) return score;
// . first, apply score factors for non-preferred summary languages
// that don't match the page language
if ( summaryLang != langUnknown && summaryLang != langWanted ) {
msg20->m_pqr_factor_lang = m_si->m_languageWeightFactor;
return rerankAssignPenalty( score,
m_si->m_languageWeightFactor,
"pqrlang",
"it's summary/title "
"language is foreign" );
}
// second, apply score factors for non-preferred page languages
//if ( lang != langWanted )
// return rerankAssignPenalty( score,
// m_si->m_languageWeightFactor,
// "pqrlang",
// "it's page language is foreign" );
// . if we got here languages of query and page match and are not
// unknown, so rerank based on country
// . don't demote if countries match or either the search country
// or page country is unknown (0)
// . default country wanted to gbcountry parm if not specified
uint8_t countryWanted = m_si->m_countryHint;
// SearchInput sets m_country based on the IP address of the incoming
// query, which is often wrong, especially for internal 10.x.y.z ips.
// so just fall back to countryHint for now because that uses the
// default country... right now set to "us" in search controls page.
if ( countryWanted == 0 ) countryWanted = m_si->m_country;
if ( country == 0 || countryWanted == 0 ||
country == countryWanted )
return score;
// . now, languages match and are not unknown and countries don't
// match and neither is unknown
// . so, demote if country of query speaks the same language as
// country of page, ie US query and UK or AUS page (since all 3
// places speak english), but not US query and IT page
uint64_t qLangs = g_countryCode.getLanguagesWritten( countryWanted );
uint64_t pLangs = g_countryCode.getLanguagesWritten( country );
// . if no language written by query country is written by page
// country, don't penalize
if ( (uint64_t)(qLangs & pLangs) == (uint64_t)0LL ) return score;
msg20->m_pqr_factor_lang = m_si->m_cr->m_pqr_demFactCountry;
// countries do share at least one language - demote!
return rerankAssignPenalty( score,
m_si->m_cr->m_pqr_demFactCountry,
"pqrcntry",
"it's language is the same as that of "
"of the query, but it is from a country "
"foreign to that of the query which "
"writes in at least one of the same "
"languages" );
}
// pqrqttiu
// . look for query terms and gigabits in the url, demote more the fewer
// are matched.
/*
rscore_t PostQueryRerank::rerankQueryTermsOrGigabitsInUrl( rscore_t score,
Url *pageUrl ) {
//log( LOG_DEBUG, "query:in PQR::rerankQueryTermsOrGigabitsInUrl("
// "score:%ld, url:'%s', urlLen:%ld)"
// "[factor:%3.3f; max:%ld] AWL",
// score, pageUrl->getUrl(), pageUrl->getUrlLen(),
// m_si->m_cr->m_pqr_demFactQTTopicsInUrl,
// m_si->m_cr->m_pqr_maxValQTTopicsInUrl );
if ( pageUrl->getUrlLen() == 0 ) return score;
float factor = m_si->m_cr->m_pqr_demFactQTTopicsInUrl;
if ( factor <= 0 ) return score; // disables
long maxQTInUrl = m_si->m_q->getNumTerms();
long maxGigabitsInUrl = m_msg40->getNumTopics();
long maxVal = m_si->m_cr->m_pqr_maxValQTTopicsInUrl;
if ( maxVal < 0 ) maxVal = maxQTInUrl+maxGigabitsInUrl;
// from original url:
// . remove scheme
// . remove 'www' from host
// . remove tld
// . remove ext
// . convert symbols to spaces
// . remove extra space
//log( LOG_DEBUG, "query: origurl:'%s' AWL", pageUrl->getUrl() );
//log( LOG_DEBUG, "query: url: whole:'%s' host:'%s' (%ld); "
// "domain:'%s' (%ld); tld:'%s' (%ld); midDom:'%s' (%ld); "
// "path:'%s' (%ld); fn:'%s'; ext:'%s'; query:'%s' (%ld); "
// "ipStr:'%s' {%ld}; anch:'%s' (%ld) "
// "site:'%s' (%ld) AWL",
// pageUrl->getUrl(),
// pageUrl->getHost(), pageUrl->getHostLen(),
// pageUrl->getDomain(), pageUrl->getDomainLen(),
// pageUrl->getTLD(), pageUrl->getTLDLen(),
// pageUrl->getMidDomain(), pageUrl->getMidDomainLen(),
// pageUrl->getPath(), pageUrl->getPathLen(),
// pageUrl->getFilename(), pageUrl->getExtension(),
// pageUrl->getQuery(), pageUrl->getQueryLen(),
// pageUrl->getIpString(), pageUrl->getIp(),
// pageUrl->getAnchor(), pageUrl->getAnchorLen(),
// pageUrl->getSite(), pageUrl->getSiteLen() );
m_cvtUrl[0] = '\0';
long cvtUrlLen = 0;
char *host = pageUrl->getHost();
// first, add hostname - "www." iff it is not an ip addr
if ( pageUrl->getIp() == 0 ) {
if ( host[0] == 'w' && host[1] == 'w' && host[2] == 'w' &&
host[3] == '.' ) {
// if starts with 'www.', don't add the 'www.'
if(pageUrl->getHostLen()-pageUrl->getDomainLen() == 4){
// add domain - 'www.' - tld
strncpy( m_cvtUrl, pageUrl->getDomain(),
pageUrl->getDomainLen() -
pageUrl->getTLDLen() );
cvtUrlLen += pageUrl->getDomainLen() -
pageUrl->getTLDLen();
m_cvtUrl[cvtUrlLen] = '\0';
}
else {
// add host + domain - 'www.' - tld
strncpy( m_cvtUrl, pageUrl->getHost()+4,
pageUrl->getHostLen() -
pageUrl->getTLDLen() - 4 );
cvtUrlLen += pageUrl->getHostLen() -
pageUrl->getTLDLen() - 4;
m_cvtUrl[cvtUrlLen] = '\0';
}
}
else {
// add host + domain - tld
strncpy( m_cvtUrl, pageUrl->getHost(),
pageUrl->getHostLen() -
pageUrl->getTLDLen() - 1 );
cvtUrlLen += pageUrl->getHostLen() -
pageUrl->getTLDLen() - 1;
m_cvtUrl[cvtUrlLen] = '\0';
}
}
// next, add path
if ( pageUrl->getPathLen() > 0 ) {
strncat( m_cvtUrl, pageUrl->getPath(),
pageUrl->getPathLen()-pageUrl->getExtensionLen() );
cvtUrlLen += pageUrl->getPathLen()-pageUrl->getExtensionLen();
m_cvtUrl[cvtUrlLen] = '\0';
}
// next, add query
if ( pageUrl->getQueryLen() > 0 ) {
strncat( m_cvtUrl, pageUrl->getQuery(), pageUrl->getQueryLen() );
cvtUrlLen += pageUrl->getQueryLen();
m_cvtUrl[cvtUrlLen] = '\0';
}
// remove all non-alpha-numeric chars
char *t = m_cvtUrl;
for ( char *s = m_cvtUrl; *s; s++ ) {
if ( is_alnum_a(*s) ) *t++ = *s;
else if ( t>m_cvtUrl && *(t-1) != ' ' ) *t++ = ' ';
}
*t = '\0';
cvtUrlLen = (t-m_cvtUrl);
//log( LOG_DEBUG, "query: m_cvtUrl:'%s' (%ld) AWL",
// m_cvtUrl, cvtUrlLen );
// find number of query terms in url
long numQTInUrl = 0;
long numQTs = m_si->m_q->getNumTerms();
for ( long i = 0; i < numQTs; i++ ) {
char *qtStr = m_si->m_q->getTerm(i);
long qtLen = m_si->m_q->getTermLen(i);
if ( strncasestr(m_cvtUrl, qtStr, cvtUrlLen, qtLen) != NULL ) {
numQTInUrl++;
//log( LOG_DEBUG, "query: qt is in url AWL");
}
}
// find number of gigabits in url
long numGigabitsInUrl = 0;
long numTopics = m_msg40->getNumTopics();
for ( long i = 0; i < numTopics; i++ ) {
char *topicStr = m_msg40->getTopicPtr(i);
long topicLen = m_msg40->getTopicLen(i);
if ( strncasestr(m_cvtUrl, topicStr, cvtUrlLen, topicLen) ) {
numGigabitsInUrl++;
//log( LOG_DEBUG, "query: topic is in url AWL");
}
}
//log( LOG_DEBUG, "query: qts:%ld, gigabits:%ld; "
// "maxQTInUrl:%ld, maxGbInUrl:%ld AWL",
// numQTInUrl, numGigabitsInUrl,
// maxQTInUrl, maxGigabitsInUrl );
return rerankLowerDemotesMore( score,
numQTInUrl+numGigabitsInUrl,
maxVal,
factor,
"pqrqttiu",
"query terms or topics in its url" );
}
*/
// pqrqual
// demote pages that are not high quality
/*
rscore_t PostQueryRerank::rerankQuality ( rscore_t score,
unsigned char quality ) {
//log( LOG_DEBUG, "query:in PQR::rerankQuality("
// "score:%ld, quality:%d)"
// "[P_factor:%3.3f; P_max:%ld] AWL",
// score, (int)quality,
// m_si->m_cr->m_pqr_demFactQual,
// m_si->m_cr->m_pqr_maxValQual );
float factor = m_si->m_cr->m_pqr_demFactQual;
if ( factor <= 0 ) return score;
long maxVal = m_si->m_cr->m_pqr_maxValQual;
if ( maxVal < 0 ) maxVal = 100;
return rerankLowerDemotesMore( score, quality, maxVal, factor,
"pqrqual", "quality" );
}
*/
// pqrpaths
// demote pages that are not root or have many paths in the url
rscore_t PostQueryRerank::rerankPathsInUrl ( rscore_t score,
char *url,
long urlLen ) {
//log( LOG_DEBUG, "query:in PQR::rerankPathsInUrl("
// "score:%ld, url:%s)"
// "[P_factor:%3.3f; P_max:%ld] AWL",
// score, url,
// m_si->m_cr->m_pqr_demFactPaths,
// m_si->m_cr->m_pqr_maxValPaths );
if ( urlLen == 0 ) return score;
float factor = m_si->m_cr->m_pqr_demFactPaths;
if ( factor <= 0 ) return score; // disables
long maxVal = m_si->m_cr->m_pqr_maxValPaths;
// bypass scheme and "://"
url = strstr( url, "://" );
if ( ! url ) return score;
url += 3;
// count '/'s to get number of paths
long numPaths = -1; // don't count first path
for ( url = strchr(url, '/') ; url ; url = strchr(url, '/') ) {
numPaths++;
url++;
}
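// e.g. "http://host.com/a/b/c.html" has three '/'s after the scheme,
// so numPaths ends up as 2; a root page like "http://host.com/"
// yields 0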
return rerankHigherDemotesMore( score, numPaths, maxVal, factor,
"pqrpaths", "paths in its url" );
}
// pqrcatid
// demote page if it does not have a catid
rscore_t PostQueryRerank::rerankNoCatId ( rscore_t score,
long numCatIds,
long numIndCatIds ) {
//log( LOG_DEBUG, "AWL:in PQR::rerankNoCatId("
// "score:%ld, numCatIds:%ld, numIndCatIds:%ld)"
// "[P_factor:%3.3f]",
// score, numCatIds, numIndCatIds,
// m_si->m_cr->m_pqr_demFactNoCatId );
float factor = m_si->m_cr->m_pqr_demFactNoCatId;
if ( factor <= 0 ) return score; // disables
if ( numCatIds + numIndCatIds > 0 ) return score;
return rerankAssignPenalty( score, factor,
"pqrcatid", "it has no category id" );
}
// pqrsuper
// demote page if smallest catid has a lot of super topics
rscore_t PostQueryRerank::rerankSmallestCatIdHasSuperTopics ( rscore_t score,
Msg20 *msg20 ) {
//log( LOG_DEBUG, "query:in PQR::rerankSmallestCatIdHasSuperTopics("
// "score:%ld)"
// "[P_factor:%3.3f; P_max:%ld] AWL",
// score,
// m_si->m_cr->m_pqr_demFactCatidHasSupers,
// m_si->m_cr->m_pqr_maxValCatidHasSupers );
float factor = m_si->m_cr->m_pqr_demFactCatidHasSupers;
if ( factor <= 0 ) return score; // disables
long maxVal = m_si->m_cr->m_pqr_maxValCatidHasSupers;
// If page doesn't have a catid, we should demote it as if it has
// max catids, otherwise pages with a catid will be penalized more
if ( msg20->m_r->size_catIds == 0 ) {
return rerankAssignPenalty( score,
factor,
"pqrsuper",
"it has no category id" );
}
// find smallest catid
long minCatid = LONG_MAX;
long numCatids = msg20->m_r->size_catIds / 4;
for ( long i = 0; i < numCatids; i++ ) {
if ( msg20->m_r->ptr_catIds[i] < minCatid ) {
minCatid = msg20->m_r->ptr_catIds[i];
}
}
//log( LOG_DEBUG, "query: minCatid:%ld AWL", minCatid );
// count super topics by walking up catids
long numSupers = -1;
long currCatId = minCatid;
long currParentId = minCatid;
while ( currCatId > 1 ) {
// next cat
currCatId = currParentId;
// get the index for this cat
long currCatIndex = g_categories->getIndexFromId(currCatId);
if ( currCatIndex <= 0 ) break;
// get the parent for this cat
currParentId = g_categories->m_cats[currCatIndex].m_parentid;
numSupers++;
}
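// numSupers is roughly the depth of the smallest catid in dmoz: the
// number of parent hops climbed before reaching the Top category
// (id 1) or a catid we could not look up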
return rerankHigherDemotesMore( score, numSupers, maxVal, factor,
"pqrsuper",
"category ids" );
}
// pqrpgsz
// . demote page based on its size (content length): the bigger, the
// more it should be demoted.
rscore_t PostQueryRerank::rerankPageSize ( rscore_t score,
long docLen ) {
//log( LOG_DEBUG, "query:in PQR::rerankPageSize("
// "score:%ld, docLen:%ld)"
// "[P_factor:%3.3f; P_max:%ld] AWL",
// score, docLen,
// m_si->m_cr->m_pqr_demFactPageSize,
// m_si->m_cr->m_pqr_maxValPageSize );
float factor = m_si->m_cr->m_pqr_demFactPageSize;
if ( factor <= 0 ) return score;
long maxVal = m_si->m_cr->m_pqr_maxValPageSize;
// safety check
if ( docLen <= 0 ) docLen = maxVal;
return rerankHigherDemotesMore( score, docLen, maxVal, factor,
"pqrpgsz", "page size" );
}
/*
// pqrloc
const long MIN_PLACEPOP = 50000;
// . returns true if buf contains a location
// . locBuf is the location name
// . locLen is its length
// . locPop is its population
bool PostQueryRerank::getLocation( char *locBuf, long locBufLen,
long *locLen, long *locPop,
char *buf, long bufLen ) {
//log( LOG_DEBUG, "query:in getLocation(buf:%c%c%c%c, len:%ld, "
// "uc:%d) AWL",
// buf[0], buf[2], buf[4], buf[6], bufLen,
Words words;
if ( ! words.set( buf, bufLen, TITLEREC_CURRENT_VERSION,
false, // computeIds
false // hasHtmlEntities
) )
return false;
AppendingWordsWindow ww;
if ( ! ww.set( &words,
1, // minWindowSize
5, // maxWindowSize
locBufLen,
locBuf
) )
return false;
// find all phrases between length of 1 and 5
for ( ww.processFirstWindow(); !ww.isDone(); ww.processNextWindow() ){
ww.act();
char *phrasePtr = ww.getPhrasePtr();
long phraseLen = ww.getPhraseLen();
long numPhraseWords = ww.getNumWords();
if ( numPhraseWords == 0 ) continue;
//log( LOG_DEBUG, "query: p:%s (%ld) AWL",
// phrasePtr, phraseLen );
// see if buf phrase is a place
long encodeType = csUTF8;//csISOLatin1;
long placePop = getPlacePop( phrasePtr, phraseLen,
encodeType );
if ( placePop > MIN_PLACEPOP ) {
//log( LOG_DEBUG, "query: p:%s (%ld) is "
// "loc spec AWL",
// phrasePtr, phraseLen );
*locLen = phraseLen;
*locPop = placePop;
return true;
}
// check to see if buf phrase's abbreviation is loc spec
//log( LOG_DEBUG, "query: utf8 p:%s (%ld) AWL",
// phrasePtr, phraseLen );
SynonymInfo synInfo;
if ( ! g_thesaurus.getSynonymInfo( phrasePtr,
&synInfo,
phraseLen ) )
continue;
long numSyns = synInfo.m_numSyns;
for ( long j = 0; j < numSyns; j++ ) {
char *syn = synInfo.m_syn[j];
long synLen = gbstrlen(syn);
placePop = getPlacePop( syn, synLen,
csISOLatin1 );
if ( placePop > MIN_PLACEPOP ) {
//log( LOG_DEBUG, "query: s:%s (%ld) is "
// "loc spec AWL",
// syn, synLen );
*locLen = phraseLen;
*locPop = placePop;
return true;
}
}
}
*locLen = 0;
*locPop = 0;
return false;
}
// pqrloc
bool PostQueryRerank::preRerankNonLocationSpecificQueries ( ) {
//log( LOG_DEBUG, "query:in PQR::preRerankNonLocSpecQueries() AWL" );
if ( m_si->m_pqr_demFactLocTitle <= 0 &&
m_si->m_pqr_demFactLocSummary <= 0 &&
m_si->m_pqr_demFactLocDmoz <= 0 )
return true;
//log( LOG_DEBUG, "query: q:%s (%ld) AWL",
// m_si->m_q->m_orig,
// m_si->m_q->m_origLen );
// See if query is location specific by building a buffer of
// query terms without punct then checking all phrases of that
// buffer
long numQWords = m_si->m_q->m_numWords;
char locBuf[1024];
long locLen = 0;
long locPop = 0;
char buf[MAX_QUERY_LEN];
char *p = buf;
Query *q = m_si->m_q;
for ( long i = 0; i < numQWords; i++ ) {
QueryWord *qw = &q->m_qwords[i];
//log( LOG_DEBUG, "query: qw:%c%c%c%c (%ld) "
// "inQuotes:%d; inQuoted:%d; quoteStrt:%ld "
// "op:%d; opcode:%d; isPunct:%d level:%d; "
// "wsign:%d; psign:%d id:%lld "
// "ignore:%d AWL",
// qw->m_word[0], qw->m_word[2],
// qw->m_word[4], qw->m_word[6],
// qw->m_wordLen,
// qw->m_inQuotes, qw->m_inQuotedPhrase, qw->m_quoteStart,
// qw->m_queryOp, qw->m_opcode, qw->m_isPunct, qw->m_level,
// qw->m_wordSign, qw->m_phraseSign, qw->m_wordId,
// qw->m_ignoreWord );
// reset buf if word is punct (except all space) or an opcode
bool isPunct = qw->m_isPunct;
bool isAllSpace = false;
if ( isPunct ) {
char *s = qw->m_word;
for ( ; (int)(s-qw->m_word) < qw->m_wordLen; s++ ) {
if ( ! is_space(*s) ) break;
}
isAllSpace = ( s-qw->m_word == qw->m_wordLen );
}
if ( (isPunct && ! isAllSpace) || qw->m_opcode != 0 ) {
// before we reset, see if buffer contains a location
if ( getLocation( locBuf, 1024,
&locLen, &locPop,
buf, p-buf ) ) {
long encodeType = csUTF8;//csISOLatin1;
m_querysLoc = hash64d( locBuf, locLen);
break;
}
p = buf;
//log( LOG_DEBUG, "query: encountered symbol:%d|%d AWL",
// qw->m_isPunct, qw->m_opcode );
continue;
}
// but if word is all space, dont append
if ( isAllSpace ) continue;
// skip if word is subtracted out
if ( qw->m_wordSign == '-' ) continue;
// skip if word or phrase is under NOT ||| AWL not working right now
if ( qw->m_queryWordTerm &&
qw->m_queryWordTerm->m_underNOT ) continue;
if ( qw->m_queryPhraseTerm &&
qw->m_queryPhraseTerm->m_underNOT ) continue;
// else, append word + space to buf
memcpy( p, qw->m_word, qw->m_wordLen );
p += qw->m_wordLen;
*p++ = ' ';
}
// now see if there's a location in buf
if ( m_querysLoc == 0 &&
getLocation( locBuf, 1024, &locLen, &locPop,
buf, p-buf ) ) {
m_querysLoc = hash64d( locBuf, locLen );
}
//log( LOG_DEBUG, "query: q loc:%lld AWL",
// m_querysLoc );
// check the gigabits for locations
//log( LOG_DEBUG, "query: places lookup gigabits numTopics:%ld AWL",
// m_msg40->getNumTopics() );
m_ignoreLocs.set( 28 );
// if searching the us, these should not be demoted, so
// put them into the gigabit table
if (m_si->m_country == 226) {
m_ignoreLocs.addKey(hash64d("u.s.",4),true);
m_ignoreLocs.addKey(hash64d("us",2),true);
m_ignoreLocs.addKey(hash64d("united states",14),true);
m_ignoreLocs.addKey(hash64d("u.s.a.",6),true);
m_ignoreLocs.addKey(hash64d("usa",3),true);
m_ignoreLocs.addKey(hash64d("america",7),true);
m_ignoreLocs.addKey(hash64d("american",8),true);
m_ignoreLocs.addKey(hash64d("americans",9),true);
m_ignoreLocs.addKey(hash64d("canada",6),true);
m_ignoreLocs.addKey(hash64d("kanada",6),true);
m_ignoreLocs.addKey(hash64d("canucks",7),true);
m_ignoreLocs.addKey(hash64d("canadians",9),true);
m_ignoreLocs.addKey(hash64d("canadian",8),true);
m_ignoreLocs.addKey(hash64d("north america",13),true);
m_ignoreLocs.addKey(hash64d("uk",2),true);
m_ignoreLocs.addKey(hash64d("united kingdom",14),true);
m_ignoreLocs.addKey(hash64d("british",7),true);
m_ignoreLocs.addKey(hash64d("britain",7),true);
m_ignoreLocs.addKey(hash64d("britons",7),true);
m_ignoreLocs.addKey(hash64d("great britain",13),true);
}
// now add the locations from the gigabits
long numTopics = m_msg40->getNumTopics();
for ( long i = 0; !m_si->m_pqr_demInTopics && i < numTopics; i++ ) {
char *topicStr = m_msg40->getTopicPtr(i);
long topicLen = m_msg40->getTopicLen(i);
Words words;
if ( ! words.set( topicStr, topicLen, TITLEREC_CURRENT_VERSION,
false, // computeIds
false // hasHtmlEntities
) )
continue;
AppendingWordsWindow ww;
if ( ! ww.set( &words,
1, // minWindowSize
5, // maxWindowSize
AWW_INIT_BUF_SIZE,
NULL
) )
continue;
// find all phrases between length of 1 and 5
for ( ww.processFirstWindow();
! ww.isDone();
ww.processNextWindow() ) {
ww.act();
char *phrasePtr = ww.getPhrasePtr();
long phraseLen = ww.getPhraseLen();
long numPhraseWords = ww.getNumWords();
if ( numPhraseWords == 0 ) continue;
// see if topic phrase is a place
long placePop = getPlacePop( phrasePtr, phraseLen,
encodeType );
if ( placePop > MIN_PLACEPOP ) {
// It's a place, mark it so if a page has
// this place name in its title we won't
// rerank it
uint64_t h = hash64d( phrasePtr, phraseLen);
m_ignoreLocs.addKey( h, true );
//log( LOG_DEBUG, "query: pre gigabit has "
// "location '%s' (%ld) [h:%lld] AWL",
// phrasePtr, phraseLen, h );
continue;
}
// Check if a gigabit's abbreviation is location
// specific
SynonymInfo synInfo;
if ( ! g_thesaurus.getSynonymInfo( phrasePtr,
&synInfo,
phraseLen ) )
continue;
long numSyns = synInfo.m_numSyns;
for ( long j = 0; j < numSyns; j++ ) {
char *syn = synInfo.m_syn[j];
long synLen = gbstrlen(syn);
placePop = getPlacePop( syn, synLen,
csISOLatin1 );
if ( placePop > MIN_PLACEPOP ) {
// It's a place, so mark syn
uint64_t h = hash64d( syn, synLen);
m_ignoreLocs.addKey( h, true );
//log( LOG_DEBUG, "query: pre gigabit"
// " has location synonym '%s'"
// " h:%lld AWL",
// syn, h );
continue;
}
}
}
}
if (m_querysLoc != 0)
log(LOG_DEBUG, "pqr: query contains a location, "
"will not demote location specific results");
else
log(LOG_DEBUG, "pqr: query DOES NOT contain a location, "
"will demote location specific results");
return true;
}
// pqrloc
// . if query is not location specific, and a page has a geographic location
// in its title then demote that page UNLESS the geographic location is
// contained in the list of gigabits for the search query. like "Shoes (UK)"
// or "retail stores in New York" when you are UK and New York are not in
// your query. We will need a file of locations. BUT if the location is
// contained in the gigabits, do NOT demote such pages, query might have
// something like "the big apple" in it... Note: if query ops out of a
// location, it should not be considered location specific (like "expo
// -montreal"). demote by popularity weight of the place name as returned
// from getPlacesPeoplePop().
// . demote results containing geographic locations
// unless THAT location is in gigabits or in query. fixes
// 'car insurance'? demote a little bit if in summary...
// or a little bit if in a single dmoz category and it is a
// dmoz regional category. do not demote 'united states' 'us'
// 'america' or 'usa' if the search default is the us. do
// not demote the dmoz north america:US region if searching in us.
// but if 'albuquerque' in query, do not demote if 'new mexico'
// in search results.
rscore_t PostQueryRerank::rerankNonLocationSpecificQueries ( rscore_t score,
Msg20 *msg20 ) {
float titleFactor = m_si->m_pqr_demFactLocTitle;
float summFactor = m_si->m_pqr_demFactLocSummary;
float dmozFactor = m_si->m_pqr_demFactLocDmoz;
if ( titleFactor <= 0 &&
summFactor <= 0 &&
dmozFactor <= 0 )
return score;
long maxVal = m_si->m_cr->m_pqr_maxValLoc;
// if we found a location in the query, don't rerank for locs
if (m_querysLoc != 0) return score;
//log( LOG_DEBUG, "query:in PQR::rerankNonLocSpecQueries("
// "score:%ld)"
// "[P_factorTitle:%3.3f; P_factorSummary:%3.3f; P_factorDmoz:%3.3f; "
// " P_max:%ld; "
// "m_querysLoc:%lld; #m_ignoreLocs:%ld; #summaryLocs:%ld] AWL",
// score,
// titleFactor, summFactor, dmozFactor,
// maxVal,
// m_querysLoc,
// m_ignoreLocs.getNumSlotsUsed(),
// msg20->getNumSummaryLocs() );
// check if categories are regional and contain a location
long numCatids = msg20->m_r->size_catIds / 4;
long *catids = msg20->m_r->ptr_catIds;
long catLocMaxPop = 0;
uint8_t searchingUS = (m_si->m_country == 226);
//log(LOG_DEBUG, "pqr: checking %ld categories for locs AWL",
// numCatids);
for ( unsigned char i = 0; dmozFactor > 0 && i < numCatids; i++ ) {
SafeBuf sb;
long catid = catids[i];
g_categories->printPathFromId(&sb, catid, true);
// copy first part of category so we can work with it
const long MAX_PQRCAT = 512;
char cat[MAX_PQRCAT];
long len = sb.length();
if (len > MAX_PQRCAT) len = MAX_PQRCAT;
strncpy(cat, sb.getBufStart(), len);
cat[len] = '\0';
//log(LOG_DEBUG, "pqr: catid:%ld category:'%s' AWL",
// catid, cat);
// see if we have a regional category
char *p = cat;
char region[64];
char *q = region;
while (*p && q-region < 64 && *p != '/') *q++ = *p++;
*q = '\0';
bool catIsRegional = (0 == strncmp(region, "Regional", 8));
//log(LOG_DEBUG, "pqr: cat has region:%ld AWL",
// (long)catIsRegional);
// we only care about regional categories
if (!catIsRegional) continue;
long placePop = 0;
// scan category for region
while (*p) {
p++;
q = region;
while (*p && q-region < 64 && *p != '/') {
if (*p == '_') *q++ = ' ';
else *q++ = *p;
p++;
}
*q = '\0';
bool regionIsUS = (searchingUS &&
(0 == strcasecmp(region, "us") ||
0 == strcasecmp(region, "united states") ||
0 == strcasecmp(region, "usa") ||
0 == strcasecmp(region, "america")));
//log(LOG_DEBUG, "pqr: region:%s (isUS:%ld) AWL",
// region, (long)regionIsUS);
// if region is us, skip category demotion
if (!regionIsUS) {
// see if region is a place
placePop = getPlacePop(region, q-region,
csISOLatin1);
if (placePop > MIN_PLACEPOP) break;
}
}
// if we didn't find a place, go to next cat
if (placePop <= MIN_PLACEPOP) continue;
uint64_t h = hash64d( region, q-region );
if (h == 0) continue;
// is it the location of the query?
if (h == m_querysLoc) {
//log(LOG_DEBUG, "pqr: cat "
// "has query's loc "
// "[pop:%ld; h:%lld] AWL",
// placePop, h);
return score;
}
// is it in the gigabits?
if (m_ignoreLocs.getSlot( h ) != -1) {
//log(LOG_DEBUG, "pqr: cat has "
// "gigabit's loc [pop:%ld; h:%llu] AWL",
// placePop, h);
return score;
}
// use only the max pop for all places in category
if (placePop > catLocMaxPop) {
//log(LOG_DEBUG, "pqr: cat has a non-query, "
// "non-gigabit loc:'%s' %llu pop:%ld AWL",
// region, h, placePop);
catLocMaxPop = placePop;
continue;
}
}
//log(LOG_DEBUG, "pqr: categories' max population:%ld AWL",
// catLocMaxPop);
if (dmozFactor > 0 && catLocMaxPop > MIN_PLACEPOP)
score = rerankHigherDemotesMore(score,
catLocMaxPop, maxVal,
dmozFactor,
"pqrlocd",
"population of a place in a "
"category and the place was "
"not in the query or gigabits");
// check if summary contains a location
// check if summary's location is in gigabits
long numSummaryLocs = msg20->m_r->size_summLocs/8;
uint64_t *summaryLocs = msg20->m_r->ptr_summLocs;
long *summaryLocsPops = msg20->m_r->ptr_summLocsPop;
long summaryLocMaxPop = 0;
for (long i = 0; summFactor > 0 && i < numSummaryLocs; i++) {
uint64_t h = summaryLocs[i];
long placePop = summaryLocsPops[i];
if (h == 0) continue;
if (placePop <= MIN_PLACEPOP) continue;
// is it the location of the query?
if ( h == m_querysLoc ) {
//log( LOG_DEBUG, "pqr: summary "
// "has query's loc "
// "[pop:%ld; h:%lld] AWL",
// placePop, h );
return score;
}
// is it in the gigabits?
if (m_ignoreLocs.getSlot( h ) != -1 ) {
//log( LOG_DEBUG, "pqr: summary has "
// "gigabit's loc [pop:%ld; h:%llu] AWL",
// placePop, h );
return score;
}
// use only the max pop for all places in title
if ( placePop > summaryLocMaxPop ) {
//log( LOG_DEBUG, "pqr: summary has a non-query, "
// "non-gigabit loc:%llu pop:%ld AWL",
// h, placePop );
summaryLocMaxPop = placePop;
continue;
}
}
//log( LOG_DEBUG, "pqr: summary's max population:%ld AWL",
// summaryLocMaxPop );
if (summFactor > 0 && summaryLocMaxPop > MIN_PLACEPOP)
score = rerankHigherDemotesMore(score,
summaryLocMaxPop, maxVal,
summFactor,
"pqrlocs",
"population of a place in its "
"summary and the place was "
"not in the query or gigabits");
// check if title contains a location
if (titleFactor <= 0) return score;
char *pageTitle = msg20->getTitle();
long pageTitleLen = msg20->getTitleLen();
Words words;
if ( ! words.set( pageTitle, pageTitleLen, TITLEREC_CURRENT_VERSION,
false, // computeIds
false // hasHtmlEntities
) )
return score;
AppendingWordsWindow ww;
if ( ! ww.set( &words,
1, // minWindowSize
5, // maxWindowSize
AWW_INIT_BUF_SIZE,
NULL
) )
return score;
// find all phrases between length of 1 and 5
long titleLocMaxPop = 0;
for ( ww.processFirstWindow(); ! ww.isDone(); ww.processNextWindow()) {
ww.act();
char *phrasePtr = ww.getPhrasePtr();
long phraseLen = ww.getPhraseLen();
long numPhraseWords = ww.getNumWords();
if ( numPhraseWords == 0 ) continue;
// Get the place's population
// If it's a place, check gigabits for the place name
long encodeType = csUTF8; //ISOLatin1;
long placePop = getPlacePop( phrasePtr, phraseLen,
encodeType );
if ( placePop > MIN_PLACEPOP ) {
// Check if place is same as query
// Check if gigabits has this location or
// an abbreviation of the location, if so don't
// rerank this page
uint64_t h = hash64d(phrasePtr, phraseLen);
if ( h == 0 ) continue;
// is it the query's location?
if ( h == m_querysLoc ) {
//log( LOG_DEBUG, "query: title has "
// "query's loc [pop:%ld; h:%llu] AWL",
// placePop, h );
return score;
}
// is it in the gigabits?
if ( m_ignoreLocs.getSlot( h ) != -1 ) {
//log( LOG_DEBUG, "pqr: title has "
// "gigabit's loc [pop:%ld; h:%llu] AWL",
// placePop, h );
return score;
}
}
// use only the max pop for all places in title
if ( placePop > titleLocMaxPop ) {
//log( LOG_DEBUG, "pqr: title has a non-query, "
// "non-gigabit loc:'%s' (%ld) pop:%ld AWL",
// phrasePtr, phraseLen, placePop );
titleLocMaxPop = placePop;
continue;
}
// If we haven't found a place name yet, check for
// abbreviations of a place name
//log( LOG_DEBUG, "pqr: phrase:'%s' (%ld) words:%ld "
// "pop:%ld AWL",
// phrasePtr, phraseLen, numPhraseWords,
// placePop );
SynonymInfo synInfo;
if ( ! g_thesaurus.getSynonymInfo( phrasePtr, &synInfo,
phraseLen ) ) {
continue;
}
long numSyns = synInfo.m_numSyns;
for ( long j = 0; j < numSyns; j++ ) {
char *syn = synInfo.m_syn[j];
long synLen = gbstrlen(synInfo.m_syn[j]);
placePop = getPlacePop( syn, synLen,
csISOLatin1 );
if ( placePop > MIN_PLACEPOP ) {
// Check if gigabits has an abbreviation
// of the location, if so don't rerank
// this page
uint64_t h = hash64d(syn, synLen);
if ( h == 0 ) continue;
// is syn the query's loc?
if ( h == m_querysLoc ) {
//log( LOG_DEBUG, "pqr: title "
// "has query's loc syn "
// "[pop:%ld; h:%lld] AWL",
// placePop, h );
return score;
}
// is syn in gigabits?
if ( m_ignoreLocs.getSlot( h ) != -1 ) {
//log(LOG_DEBUG, "pqr: syn title "
// " has gigabits's loc '%s' "
// "[pop:%ld; h:%lld] AWL",
// syn,
// placePop, h );
return score;
}
// only use max pop in calculations
if ( placePop > titleLocMaxPop ) {
//log( LOG_DEBUG, "pqr: title "
// "has a non-query, "
// "non-gigabit loc syn AWL" );
titleLocMaxPop = placePop;
}
}
}
}
//log( LOG_DEBUG, "pqr: title's max population:%ld AWL",
// titleLocMaxPop );
return rerankHigherDemotesMore( score, titleLocMaxPop, maxVal,
titleFactor,
"pqrloct",
"population of a place in its title "
"and the place was not in the query "
"or gigabits" );
}
*/
// pqrhtml, pqrxml
// demote if content type is not html (or is xml)
/*
rscore_t PostQueryRerank::rerankContentType ( rscore_t score,
char contentType ) {
float htmlFactor = m_si->m_cr->m_pqr_demFactNonHtml;
float xmlFactor = m_si->m_cr->m_pqr_demFactXml;
//log( LOG_DEBUG, "query:in PQR::rerankContentType("
// "score:%ld, content-type:%ld)"
// "[P_factorHtml:%3.3f; P_factorXml:%3.3f] AWL",
// score, (long)contentType,
// htmlFactor, xmlFactor );
// if completely disabled or page is html, don't do anything
if ( xmlFactor <= 0 && htmlFactor <= 0 || contentType == CT_HTML )
return score;
// if demoting for xml, then do that
if ( xmlFactor > 0 && contentType == CT_XML )
return rerankAssignPenalty( score, xmlFactor,
"pqrxml", "it is xml" );
// we are demoting for non-html and the page is not html
return rerankAssignPenalty( score, htmlFactor,
"pqrhtml", "it is not html" );
}
*/
// pqrfsd
// setup
bool PostQueryRerank::preRerankOtherPagesFromSameHost( Url *pageUrl ) {
// don't do anything if this method is disabled
if ( m_si->m_cr->m_pqr_demFactOthFromHost <= 0 ) return true;
// don't add if no url
if ( pageUrl->getUrlLen() == 0 ) return true;
//log( LOG_DEBUG, "query:in PQR::preRerankOtherPagesFromSameHost() AWL");
//log( LOG_DEBUG, "query: u:'%s' host:'%s' (%ld); "
// "domain:'%s' (%ld) AWL",
// pageUrl->m_url,
// pageUrl->getHost(), pageUrl->getHostLen(),
// pageUrl->getDomain(), pageUrl->getDomainLen() );
char *host = pageUrl->getDomain();
long hostLen = pageUrl->getDomainLen();
uint64_t key = hash64Lower_a( host, hostLen );
if ( key == 0 ) key = 1;
long slot = m_hostCntTable.getSlot( key );
if ( slot == -1 ) {
m_hostCntTable.addKey( key, 0 ); // first page doesn't cnt
}
else {
long *cnt = m_hostCntTable.getValuePointerFromSlot( slot );
(*cnt)++;
}
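// m_hostCntTable maps the hashed domain (getDomain(), despite the
// "host" variable name) to the number of OTHER results sharing that
// domain; the first result from a domain stores a 0 so it does not
// count itself. rerankOtherPagesFromSameHost() demotes low counts.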
return true;
}
// pqrfsd
// . if page does not have any other pages from its same hostname in the
// search results (clustered or not) then demote it. demote based on
// how many pages occur in the results from the same hostname. (tends
// to promote pages from hostnames that occur a lot in the unclustered
// results, they tend to be authorities) If it has pages from the same
// hostname, they must have the query terms in different contexts, so
// we must get the summaries for 5 of the results, and just cluster the rest.
rscore_t PostQueryRerank::rerankOtherPagesFromSameHost ( rscore_t score,
Url *pageUrl ) {
//log( LOG_DEBUG, "query:in PQR::rerankOtherPagesFromSameHost("
// "score:%ld, url:'%s', urlLen:%ld)"
// "[P_factor:%3.3f; P_max:%ld] AWL",
// score, pageUrl->getUrl(), pageUrl->getUrlLen(),
// m_si->m_cr->m_pqr_demFactOthFromHost,
// m_si->m_cr->m_pqr_maxValOthFromHost );
if ( pageUrl->getUrlLen() == 0 ) return score;
float factor = m_si->m_cr->m_pqr_demFactOthFromHost;
if ( factor <= 0 ) return score; // disables
long maxVal = m_si->m_cr->m_pqr_maxValOthFromHost;
if ( maxVal < 0 ) maxVal = m_numToSort-1; // all but this one
// . lookup host for this page in hash table to get number of other
// pages from the same host
char *host = pageUrl->getDomain();
long hostLen = pageUrl->getDomainLen();
uint64_t key = hash64Lower_a( host, hostLen );
long slot = m_hostCntTable.getSlot( key );
long numFromSameHost = m_hostCntTable.getValueFromSlot( slot );
//log( LOG_DEBUG, "query: numFromSameHost:%ld AWL", numFromSameHost );
return rerankLowerDemotesMore( score,
numFromSameHost, maxVal,
factor,
"pqrfsd",
"other pages from the same host" );
}
// pqrctid
// . if page is from a topic in dmoz that is in common with a lot of other
// results, then do not demote it as much as if it is not. ("birds of
// a feather") Reduce demotion penalty as you demote each result in
// order to avoid "clumping".
// setup
bool PostQueryRerank::preRerankCommonTopicsInDmoz( Msg20Reply *mr ) {
if ( m_si->m_cr->m_pqr_demFactComTopicInDmoz <= 0 ) return true;
//SANITYCHECK( msg20 );
if ( ! mr ) { char *xx=NULL;*xx=0; }
//log( LOG_DEBUG, "query:in PQR::preRerankCommonTopicsInDmoz() "
// "AWL" );
//log(LOG_DEBUG, "query: qdmoz pre cnt:%d AWL",
// (int)msg20->m_numCatids);
long numCatids = mr->size_catIds/4;//msg20->getNumCatids();
for ( unsigned char i = 0; i < numCatids; i++ ) {
long key = mr->ptr_catIds[i];//msg20->getDmozCatids()[i];
if ( key == 0 ) key = 1;
long slot = m_dmozTable.getSlot( key );
//log( LOG_DEBUG, "query: qdmoz pre %ld/%ld; "
// "catId:%ld; slot:%ld AWL",
// (long)i+1, (long)msg20->m_numCatids,
// key, slot );
if ( slot == -1 ) {
// first occurrence
// cnt is 0, no other common topics
// demotion factor is the parm
ComTopInDmozRec rec;
rec.cnt = 0;
rec.demFact =
m_si->m_cr->m_pqr_demFactComTopicInDmoz;
m_dmozTable.addKey( key, rec );
//log(LOG_DEBUG, "query: qdmoz pre occurance 1 AWL");
}
else {
			// nth occurrence
ComTopInDmozRec *rec =
m_dmozTable.getValuePointerFromSlot( slot );
rec->cnt++;
//log( LOG_DEBUG, "query: qdmoz pre key:%ld "
// "occurance %ld AWL",
// key, rec->cnt );
}
}
return true;
}
// pqrctid
// . if a page is from a dmoz topic that it shares with a lot of other
//   results, do not demote it as much as one that is not ("birds of a
//   feather"). the demotion penalty is reduced as each result is demoted,
//   in order to avoid "clumping".
rscore_t PostQueryRerank::rerankCommonTopicsInDmoz ( rscore_t score,
Msg20 *msg20 ) {
//log( LOG_DEBUG, "query:in PQR::rerankCommonTopicsInDmoz("
// "score:%ld)"
// "[P_max:%ld P_decFact:%3.3f] AWL",
// score,
// m_si->m_cr->m_pqr_maxValComTopicInDmoz,
// m_si->m_cr->m_pqr_decFactComTopicInDmoz );
//log( LOG_DEBUG, "query: qdmoz cnt:%ld AWL",
// (long)msg20->m_numCatids );
if ( m_si->m_cr->m_pqr_demFactComTopicInDmoz <= 0 )
return score;
long maxVal = m_si->m_cr->m_pqr_maxValComTopicInDmoz;
if ( maxVal < 0 ) maxVal = m_numToSort;
// . see if page is from a topic in dmoz that is in common with a
// lot of other results
	//   . if there is no catid, the result will not be demoted
float chosenDemFact = 0.0;
long numComTopicsInDmoz = 0;
long maxComTopicsInDmoz = 0;
long numCatids = msg20->m_r->size_catIds/4;//getNumCatids();
	for ( long i = 0; i < numCatids; i++ ) {
		long key = msg20->m_r->ptr_catIds[i];//getDmozCatids()[i];
		if ( key == 0 ) key = 1; // match the remap in the pre pass
		long slot = m_dmozTable.getSlot( key );
		if ( slot < 0 ) continue; // defensive: should always be found
		ComTopInDmozRec *rec =
			m_dmozTable.getValuePointerFromSlot( slot );
//log( LOG_DEBUG, "query: slot:%ld key:%ld cnt:%ld; "
// "demFact:%3.3f AWL",
// slot, key,
// rec->cnt, rec->demFact );
// add # of other pages with same topic as this
numComTopicsInDmoz += rec->cnt;
// . find the slot with the max common topics in dmoz so
// it can be decayed
if ( rec->cnt > maxComTopicsInDmoz ) {
chosenDemFact = rec->demFact;
maxComTopicsInDmoz = rec->cnt;
}
}
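	// chosenDemFact is the (possibly already decayed) demotion factor of
	// the catid this page shares with the most other results; if the page
	// has no catids, or shares none, it stays 0.0 and effectively no
	// demotion is applied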
score = rerankHigherDemotesMore( score,
numComTopicsInDmoz, maxVal,
chosenDemFact,
"pqrctid",
"common topics in dmoz "
"as other results" );
// now decay the factors
float decFactor =
m_si->m_cr->m_pqr_decFactComTopicInDmoz;
if ( decFactor < 0 ) return score;
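	// shrink the demotion factor of each of this result's topics by the
	// fraction decFactor so that subsequent results sharing those topics
	// are demoted less; this is the anti-"clumping" decay described above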
	for ( long i = 0; i < numCatids; i++ ) {
		long key = msg20->m_r->ptr_catIds[i];
		if ( key == 0 ) key = 1;
		long slot = m_dmozTable.getSlot( key );
		if ( slot < 0 ) continue;
		ComTopInDmozRec *rec =
			m_dmozTable.getValuePointerFromSlot( slot );
rec->demFact *= (1.0 - decFactor);
//log( LOG_DEBUG, "query: decay slot:%ld key:%ld "
// "cnt:%ld; decFact:%3.3f; new demFact:%3.3f AWL",
// slot, key,
// rec->cnt, decFactor, rec->demFact );
}
return score;
}
// pqrdcndcqt
// . if the dmoz category names contain a query term (or one of its
//   synonyms), "boost" the result based on the query term weight (query
//   phrase term weights count, too); in practice this demotes the other
//   results that do not have them. gigabits are handled separately by
//   pqrdcndcgb below.
/*
rscore_t PostQueryRerank::rerankDmozCategoryNamesDontHaveQT ( rscore_t score,
Msg20 *msg20 ) {
//log( LOG_DEBUG, "query:in PQR::rerankDmozCategoryNamesDontHaveQT("
// "score:%ld)"
// "[P_factor:%3.3f; P_max:%ld] AWL",
// score,
// m_si->m_cr->m_pqr_demFactDmozCatNmNoQT,
// m_si->m_cr->m_pqr_maxValDmozCatNmNoQT );
float factor = m_si->m_cr->m_pqr_demFactDmozCatNmNoQT;
if ( factor <= 0 ) return score; // disables
long maxVal = m_si->m_cr->m_pqr_maxValDmozCatNmNoQT;
long numQTsInDmoz = 0;
char *pd = msg20->m_r->ptr_dmozTitles;
long numCatids = msg20->m_r->size_catIds/4;
long numQTs = m_si->m_q->m_numTerms;
HashTableT<uint64_t, long> matchedIds;
matchedIds.set( numQTs*2 );
for ( long j = 0; j < numCatids; j++ ) {
char *currTitle = pd;
long currTitleLen = gbstrlen(pd);
if ( currTitleLen == 0 ) continue;
//log( LOG_DEBUG, "query: currTitle:%s (%ld) AWL",
// currTitle, currTitleLen );
Words w;
Bits b;
Phrases p;
long long *wids;
long nw;
long long *pids;
if ( ! w.set( currTitle ,
currTitleLen ,
TITLEREC_CURRENT_VERSION,
true , // computeIds
false ) )
goto next;
if ( ! b.set( &w, TITLEREC_CURRENT_VERSION ,0) )
goto next;
if ( ! p.set( &w ,
&b ,
true , // useStopWords
false , // useStems
TITLEREC_CURRENT_VERSION,
0 ) ) // niceness
goto next;
wids = w.getWordIds ();
nw = w.getNumWords ();
pids = p.getPhraseIds2 ();
// go through all words in cat name
for ( long i = 0; i < nw; i++ ) {
// go through all query terms
for ( long k = 0; k < numQTs; k++ ) {
QueryTerm *qt =
&m_si->m_q->m_qterms[k];
long long rawTermId = qt->m_rawTermId;
// ignore 0 termIds
if ( rawTermId == 0 ) continue;
// see if we already matched this id
long n = matchedIds.getSlot( rawTermId );
if ( n != -1 ) continue;
// compare this query term to cat word
if ( rawTermId == wids[i] ) {
matchedIds.addKey( rawTermId, 0 );
numQTsInDmoz++;
//log( LOG_DEBUG, "query: qt-dmozw "
// "match '%s' (%ld) AWL",
// qt->m_term,
// qt->m_termLen );
continue;
}
// compare this query term to cat phrase
if ( qt->m_isPhrase && rawTermId == pids[i] ) {
matchedIds.addKey( rawTermId, 0 );
numQTsInDmoz++;
//log( LOG_DEBUG, "query: qt-dmozp "
// "match '%s' (%ld) AWL",
// qt->m_term,
// qt->m_termLen );
continue;
}
// if we haven't matched yet, check syns
SynonymInfo synInfo;
if ( ! g_thesaurus.getSynonymInfo( rawTermId,
&synInfo ))
continue;
long numSyns = synInfo.m_numSyns;
				// use a distinct index here: 'j' is the
				// title index and 'k' the query-term index
				for ( long s = 0; s < numSyns; s++ ) {
					//log( LOG_DEBUG, "query: syn:'%s' AWL",
					//     synInfo.m_syn[s]);
					uint64_t h ;
					h =hash64Lower_utf8(synInfo.m_syn[s],
						gbstrlen(synInfo.m_syn[s]));
// see if we already matched this id
long n = matchedIds.getSlot( h );
if ( n != -1 ) continue;
// Compare this query term syn to
// cat word
if ( (long long)h == wids[i] ) {
matchedIds.addKey( h, 0 );
numQTsInDmoz++;
//log( LOG_DEBUG, "query: "
// "synmatch:'%s' "
// "in dmozw:'%s' AWL",
// synInfo.m_syn[j],
// currTitle );
continue;
}
// Compare this query term syn to
// cat phrase
if ( qt->m_isPhrase &&
(long long)h == pids[i] ) {
matchedIds.addKey( h, 0 );
numQTsInDmoz++;
//log( LOG_DEBUG, "query: "
// "synmatch:'%s' "
// "in dmozp:'%s' AWL",
// synInfo.m_syn[j],
// currTitle );
continue;
}
}
}
}
next:
pd += currTitleLen;
}
//log( LOG_DEBUG, "query: qts or syns in dmoz cat name:%ld AWL",
// numQTsInDmoz );
return rerankLowerDemotesMore( score,
numQTsInDmoz, maxVal,
factor,
"pqrdcndcqt",
"query terms in its dmoz category names");
}
*/
// pqrdcndcgb
// . if the dmoz category names contain one of the gigabits (related topics)
//   computed for this search, "boost" the result based on how many gigabits
//   match; in practice this demotes the other results whose category names
//   do not contain them.
/*
rscore_t PostQueryRerank::rerankDmozCategoryNamesDontHaveGigabits ( rscore_t score,
Msg20 *msg20 ) {
//log( LOG_DEBUG, "query:in PQR::rerankDmozCategoryNamesDontHaveGigabits("
// "score:%ld)"
// "[P_factor:%3.3f; P_max:%ld] AWL",
// score,
// m_si->m_cr->m_pqr_demFactDmozCatNmNoGigabits,
// m_si->m_cr->m_pqr_maxValDmozCatNmNoGigabits );
float factor = m_si->m_cr->m_pqr_demFactDmozCatNmNoGigabits;
if ( factor <= 0 ) return score; // disables
long maxVal = m_si->m_cr->m_pqr_maxValDmozCatNmNoGigabits;
if ( maxVal < 0 ) maxVal = m_si->m_docsToScanForTopics;
// find number of gigabits in dmoz category name
long numGigabitsInDmoz = 0;
// go through gigabits each possible phrase in gigabits
//log( LOG_DEBUG, "query: numGigabits:%ld AWL",
// m_msg40->getNumTopics() );
long numTopics = m_msg40->getNumTopics();
HashTableT<long long, long> matchedIds;
matchedIds.set( numTopics*4 );
for ( long i = 0; i < numTopics; i++ ) {
Words words;
if ( ! words.set( m_msg40->getTopicPtr(i),
m_msg40->getTopicLen(i),
TITLEREC_CURRENT_VERSION,
false, // computeIds
false // hasHtmlEntities
) )
continue;
AppendingWordsWindow ww;
if ( ! ww.set( &words,
1, // minWindowSize
4, // maxWindowSize
AWW_INIT_BUF_SIZE,
NULL
) )
continue;
// find all phrases between length of 1 and 4
for ( ww.processFirstWindow();
! ww.isDone();
ww.processNextWindow() ) {
ww.act();
char *phrasePtr = ww.getPhrasePtr();
long phraseLen = ww.getPhraseLen();
long numPhraseWords = ww.getNumWords();
if ( numPhraseWords == 0 ) continue;
//log( LOG_DEBUG, "query: gb phrase:%s (%ld) AWL",
// phrasePtr, phraseLen );
// see if we already matched this phrase
uint64_t h = hash64Lower_utf8( phrasePtr, phraseLen );
if ( h == 0 ) h = 1;
if ( matchedIds.getSlot( h ) != -1 )
continue;
// ignore phrases that are just common words
if ( isCommonWord( h ) )
continue;
matchedIds.addKey( h, 0 );
// go through dmoz category names
char *p = msg20->m_r->ptr_dmozTitles;
long numCatids = msg20->m_r->size_catIds/4;
for ( long j = 0; j < numCatids; j++ ) {
char *currTitle = p;
long currTitleLen = gbstrlen(p);
if ( currTitleLen == 0 ) continue;
//log( LOG_DEBUG, "query: dmoz:%s (%ld) AWL",
// currTitle, currTitleLen );
// check if gigabit is in dmoz category name
if (strncasestr(currTitle, phrasePtr,
currTitleLen, phraseLen)){
//log( LOG_DEBUG, "query: gb is in "
// "dmoz AWL");
numGigabitsInDmoz++;
}
p += currTitleLen;
}
}
}
//log( LOG_DEBUG, "query: numGigabitsInDmoz:%ld AWL",
// numGigabitsInDmoz );
return rerankLowerDemotesMore( score,
numGigabitsInDmoz, maxVal,
factor,
"pqrdcndcgb",
"gigabits in its dmoz category names" );
}
*/
// pqrdate
// . demote pages by datedb date
rscore_t PostQueryRerank::rerankDatedbDate( rscore_t score,
time_t datedbDate ) {
float factor = m_si->m_cr->m_pqr_demFactDatedbDate;
if ( factor <= 0 ) return score;
long minVal = m_si->m_cr->m_pqr_minValDatedbDate;
if ( minVal <= 0 ) minVal = 0;
minVal *= 1000;
long maxVal = m_si->m_cr->m_pqr_maxValDatedbDate;
if ( maxVal <= 0 ) maxVal = 0;
maxVal = m_now - maxVal*1000;
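	// the minVal/maxVal parms are scaled by 1000 here, so they appear to
	// be expressed in units of 1000 seconds: minVal becomes an absolute
	// cutoff timestamp near the epoch and maxVal a cutoff that far before
	// m_now (a unix time in seconds); dates at or below minVal take the
	// full penalty below, and older dates are presumably demoted more by
	// rerankLowerDemotesMore()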
//log( LOG_DEBUG, "query:in PQR::rerankDatedbDate("
// "score:%ld, datedbDate:%ld)"
// "[P_factor:%3.3f; maxVal:%ld] AWL",
// score, datedbDate,
// factor, maxVal );
// don't penalize results whose publish date is unknown
if ( datedbDate == -1 ) return score;
if ( datedbDate <= minVal )
		return rerankAssignPenalty( score,
					    factor,
					    "pqrdate",
					    "publish date is older than "
					    "minimum value" );
return rerankLowerDemotesMore( score,
datedbDate-minVal, maxVal-minVal,
factor,
"pqrdate",
"publish date" );
}
// pqrprox
// . demote pages by the average distance of query terms from one another
//   in the document. Lower score is better.
/*
rscore_t PostQueryRerank::rerankProximity( rscore_t score,
float proximityScore,
float maxScore) {
// . a -1 implies did not have any query terms
// . see Summary.cpp proximity algo
if ( proximityScore == -1 ) return 0;
if(m_si->m_pqr_demFactProximity <= 0) return score;
float factor = (// 1 -
(proximityScore/maxScore)) *
m_si->m_pqr_demFactProximity;
if ( factor <= 0 ) return score;
//return rerankAssignPenalty(score,
// factor,
// "pqrprox",
// "proximity rerank");
// just divide the score by the proximityScore now
// ...new stuff...
if ( proximityScore == 0.0 ) return score;
float score2 = (float)score;
score2 /= proximityScore;
score2 += 0.5;
rscore_t newScore = (rscore_t)score2;
if(m_si->m_debug || g_conf.m_logDebugPQR )
		logf( LOG_DEBUG, "query: pqr: result demoted "
		      "from %.02f to %.02f because of proximity rerank",
		      (float)score,(float)newScore);
return newScore;
}
*/
// pqrinsec
// . demote pages by the average score of the sections in which the query
//   terms appear. Higher score is better.
rscore_t PostQueryRerank::rerankInSection( rscore_t score,
long summaryScore,
float maxScore) {
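	// factor scales linearly with how far summaryScore falls below
	// maxScore. illustrative example (numbers not from the source): with
	// m_pqr_demFactInSection == 0.2, summaryScore == maxScore yields
	// factor 0 (no penalty, score returned unchanged) while
	// summaryScore == 0 yields the full 0.2, which rerankAssignPenalty()
	// presumably applies as the demotion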
	if(m_si->m_pqr_demFactInSection <= 0) return score;
	// guard against division by zero
	if ( maxScore <= 0 ) return score;
	float factor = ( 1 -
			 (summaryScore/maxScore)) *
		m_si->m_pqr_demFactInSection;
if ( factor <= 0 ) return score;
return rerankAssignPenalty(score,
factor,
"pqrsection",
"section rerank");
}
/*
rscore_t PostQueryRerank::rerankSubPhrase( rscore_t score,
float diversity,
float maxDiversity) {
if(maxDiversity == 0) return score;
float factor = (1 - (diversity/maxDiversity)) *
m_si->m_pqr_demFactSubPhrase;
if ( factor <= 0 ) return score;
return rerankAssignPenalty(score,
factor,
"pqrspd",
"subphrase demotion");
}
*/
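// . attemptToCluster(): walk the already-sorted results and, when a result
//   shares its host hash with an earlier result and sits more than 1 but
//   fewer than 10 positions below it, raise its score to match that earlier
//   result; the re-sort (score descending, host hash as tie-breaker) then
//   places the two next to each other, i.e. "clusters" them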
bool PostQueryRerank::attemptToCluster ( ) {
// find results that should be clustered
bool needResort = false;
HashTableT<uint32_t, long> hostPosTable;
hostPosTable.set(m_numToSort);
for (long i = 0; i < m_numToSort; i++) {
// look up this hostname to see if it's been clustered
uint32_t key = m_m20List[i].m_host;
if ( key == 0 ) key = 1;
long slot = hostPosTable.getSlot(key);
if (slot != -1) {
			// cluster only if this result is more than 1 but
			// fewer than 10 positions below the first result
			// seen from this host
long firstPos = hostPosTable.getValueFromSlot(slot);
if (i - firstPos > 1 && i - firstPos < 10) {
// this result can be clustered
rscore_t maxNewScore;
maxNewScore = m_m20List[firstPos].m_score;
if (maxNewScore <= m_m20List[i].m_score)
continue;
needResort = true;
if(m_si->m_debug||g_conf.m_logDebugPQR )
logf(LOG_DEBUG, "pqr: re-ranking result "
"%ld (%s) from score %.02f to "
"score %.02f "
"in order to cluster it with "
"result "
"%ld (%s)",
i,
m_m20List[i].m_m20->m_r->ptr_ubuf,
(float)m_m20List[i].m_score,
(float)maxNewScore,
firstPos,
m_m20List[firstPos].m_m20->m_r->ptr_ubuf);
// bump up the score to cluster this result
m_m20List[i].m_score = maxNewScore;
}
else {
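				// adjacent or too far away to cluster;
				// make this result the new reference
				// position for its host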
hostPosTable.setValue(slot, i);
}
}
else {
// add the hostname of this result to the table
if (!hostPosTable.addKey(key, i)) {
g_errno = ENOMEM;
return false;
}
}
}
// re-sort the array if necessary
if (needResort) {
log(LOG_DEBUG, "pqr: re-sorting results for clustering");
gbmergesort( (void *) m_m20List, (size_t) m_numToSort,
(size_t) sizeof(M20List),
(int (*)(const void *, const void *))s_reSortFunction);
}
return true;
}
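// NOTE: s_firstSortFunction() and s_reSortFunction() are currently
// identical (score descending, host hash as tie-breaker); they are
// presumably kept separate so the initial sort and the post-clustering
// re-sort can diverge again if tier/bitscore ordering is ever re-enabled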
// Sort function for post query reranking's M20List
static int32_t s_firstSortFunction(const M20List * a, const M20List * b)
{
	// Tier/bitscore ordering is disabled; the commented-out checks below
	// are kept for reference (an explicit match (0x40) in a higher tier
	// used to take precedence over an implicit match (0x20) from a lower
	// tier). Sort by score descending, breaking ties on host hash.
//if ( a->tier < b->tier &&
// (a->bitScore & 0x40 || !b->bitScore & 0x40) )
// return -1;
//if ( a->tier > b->tier &&
// (b->bitScore & 0x40 || !a->bitScore & 0x40) )
// return 1;
// Absolute match proximity
//if ( a->m20->m_proximityScore > b->m20->proximityScore )
// return -1;
//else if ( a->m20->m_proximityScore < b->m20->proximityScore )
// return 1;
// same tier, same proximity, sort by score
if ( a->m_score > b->m_score )
return -1;
if ( a->m_score < b->m_score )
return 1;
// same tier, same proximity, same score, sort by docid
//if ( a->docId < b->docId )
// return -1;
//if ( a->docId > b->docId )
// return 1;
// same score, sort by host
if ( a->m_host > b->m_host )
return -1;
if ( a->m_host < b->m_host )
return 1;
return 0;
}
// Sort function for post query reranking's M20List
static int32_t s_reSortFunction(const M20List * a, const M20List * b)
{
	// Tier/bitscore ordering is disabled; the commented-out checks below
	// are kept for reference (an explicit match (0x40) in a higher tier
	// used to take precedence over an implicit match (0x20) from a lower
	// tier). Sort by score descending, breaking ties on host hash.
//if ( a->tier < b->tier &&
// (a->bitScore & 0x40 || !b->bitScore & 0x40) )
// return -1;
//if ( a->tier > b->tier &&
// (b->bitScore & 0x40 || !a->bitScore & 0x40) )
// return 1;
// Absolute match proximity
//if ( a->m20->m_proximityScore > b->m20->proximityScore )
// return -1;
//else if ( a->m20->m_proximityScore < b->m20->proximityScore )
// return 1;
// same tier, same proximity, sort by score
if ( a->m_score > b->m_score )
return -1;
if ( a->m_score < b->m_score )
return 1;
// same tier, same proximity, same score, sort by docid
//if ( a->docId < b->docId )
// return -1;
//if ( a->docId > b->docId )
// return 1;
// same score, sort by host
if ( a->m_host > b->m_host )
return -1;
if ( a->m_host < b->m_host )
return 1;
return 0;
}
#ifdef DEBUGGING_LANGUAGE
// Debug stuff, remove before flight
static void DoDump(char *loc, Msg20 **m20, long num,
score_t *scores, char *tiers) {
int x;
char *url;
//log(LOG_DEBUG, "query: DoDump(): checkpoint %s AWL DEBUG", loc);
for(x = 0; x < num; x++) {
url = m20[x]->getUrl();
if(!url) url = "None";
//log( LOG_DEBUG, "query: DoDump(%d): "
// "tier:%d score:%ld [url:'%s'] msg20:%p\n AWL DEBUG",
// x, tiers[x], scores[x], url, m20[x] );
}
}
#endif // DEBUGGING_LANGUAGE