open-source-search-engine/IndexTable2.h
mwells 87285ba3cd use gbmemcpy not memcpy so we can get profiler working again
since memcpy can't be interrupted and backtrace() called.
2015-01-13 12:25:42 -07:00

807 lines
27 KiB
C++

// Matt Wells, copyright Aug 2001
// . calls for intersecting a bunch of IndexLists to generate docIds
// . IndexLists are data-less lists (keys only)
// . each key in an IndexList is a termId/score/adultBit/docId tuple
// . we try to use as small a sublist of each IndexList as possible to avoid
// wasting network bandwidth
// TODO: implement site clustering??????? in getNumResults()
// TODO: if we have in cache we can hash right into the table, but
// we must do that before blocking on something in case it disappears
// from the cache
// TODO: the search "cell phone cable hp jornada 680", w/o quotes, should
// be quote forced anyway. Some pages will match all but "cable hp"
// so we should break that up into it's 2 terms, cable and hp
// 6912 (30%) of queries are 1 word queries 138
// 7825 (34%) of queries are 2 word queries 156
// 4512 (20%) of queries are 3 word queries 90
// 1771 ( 8%) of queries are 4 word queries 32
// 869 ( 4%) of queries are 5 word queries 17
// 391 ( 2%) of queries are 6 word queries 8
// 290 ( 1%) of queries are 7 word queries 6
// 183 ( 1%) of queries are 8 word queries (4 per second)
#ifndef _INDEXTABLE2_H_
#define _INDEXTABLE2_H_
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
#include "Indexdb.h" // makeStartKey(), getTruncationLimit()
#include "IndexList.h" // for m_lists[]
#include "HashTableT.h"
// the final score of a docid
typedef uint32_t score_t;
//typedef float score_t;
// . get the docid from the ptr
// . works for both docid ptrs from m_topTree and m_topDocIdPtrs[]
inline int64_t getDocIdFromPtr ( char *docIdPtr ) {
int64_t d;
gbmemcpy ( &d , docIdPtr , 6 );
d >>= 2;
d &= DOCID_MASK;
return d;
};
// max # search results that can be viewed without using TopTree
#define MAX_RESULTS 1000
class IndexTable2 {
public:
// . returns false on error and sets errno
// . we now support multiple plus signs before the query term
// . start/endTermNums apply to phrase termIds only
// . allows us to set multiple bits when a phrase termId is matched
// in case the singleton was truncated, but doc has the phrase
// . if you want Default AND behaviour set requireAllTerms to true
// it is much faster, too
// . "termFreqs" are 1-1 with q->m_qterms[]
// . sets m_q to point to q
void init (Query *q ,
bool isDebug ,
void *logstate ,
bool requireAllTerms ,
class TopTree *topTree ,
char *coll ,
IndexList *lists ,
int32_t numLists ,
HashTableX *sortByDateTablePtr ,
int32_t docsWanted ,
int64_t *termFreqs ,
bool useDateLists ,
bool sortByDate ,
char sortBy , //1=time,2=dist,3=rel,4=pop
bool showExpiredEvents ,
float userLat ,
float userLon ,
bool doInnerLoopSiteClustering ,
bool doSiteClustering ,
bool getWeights ,
class Msg39Request *r );
// pre-allocate memory since intersection runs in a thread
bool alloc ( );
bool allocTopTree ( );
bool makeHashTables ( ) ;
// . returns false on error and sets errno
// . we assume there are "m_numTerms" lists passed in (see set() above)
void addLists_r ( int32_t *totalListSizes ,
float sortByDateWeight );
// some generic stuff
IndexTable2();
~IndexTable2();
void reset();
// Msg39 needs to call these
void freeMem ( ) ;
// sets m_affWeights and m_freqWeights from the provided weights
// in the Msg39Request.
void setAffWeights ( Msg39Request *r ) ;
bool recompute ( class Msg39Request *r ) ;
bool cacheIntersectionForRecompute ( class Msg39Request *r ) ;
void freeCacheRec ( int32_t i ) ;
// sets m_freqWeights[] based on termFreqs (IDF) in IMAP space
void setFreqWeights ( Query *q , bool phrase );
// computes a single final score from a score vector
score_t getWeightedScore ( unsigned char *scoresVec ,
int32_t nqt ,
float *freqWeights ,
float *affWeights ,
bool requireAllTerms );
void addLists2_r ( int32_t numLists ,
int32_t *imap ,
bool lastRound ,
int32_t numBaseLists ,
float sortByDateWeight ,
int32_t *minHardCountPtr );
bool eventHashLoop ( int32_t *listIndexes ,
char *listSigns ,
qvec_t *listExplicitBits ,
char **listEnds ,
char *listHash ,
char *listHardCount ,
int32_t *listPoints ,
char **ptrs ,
char **ptrEnds ,
char **oldPtrs ,
int32_t numPtrs ,
int64_t maxDocId ,
int32_t numListsToDo ,
int32_t numSlots ,
char **docIdPtrs ,
qvec_t *explicitBits ,
int16_t *hardCounts ,
uint8_t *eventIds ,
uint8_t *scoresVec ,
uint32_t *latVec,
uint32_t *lonVec,
//uint32_t *timeVec,
//uint32_t *endVec,
int32_t nqt ) ;
void hashTmpDocIds2 ( uint32_t *maxDocId ,
char **docIdPtrs ,
unsigned char *scoresVec ,
uint8_t *eventIds ,
uint32_t *latVec ,
uint32_t *lonVec ,
//uint32_t *timeVec ,
//uint32_t *endVec ,
qvec_t *explicitBits ,
int16_t *hardCounts ,
uint32_t mask ,
int32_t numSlots ,
int32_t nqt ) ;
// . used for getting which topDocId to kick out of the top list
int32_t getWeakestTopDocId ( char **topp ,
score_t *tops ,
unsigned char *topb ,
int32_t numTop ,
unsigned char *minBitScore2 ,
score_t *score ,
char **docIdPtr ) ;
void zeroOutVectorComponents ( unsigned char *scoresVec ,
qvec_t *ebits ,
int16_t *hardCounts ,
int32_t numDocIds ,
char rat ) ;
// . compute phrase and word weights based on phrase affinities
// and set final scores based on those weights
void computeAffWeights ( bool rat ,
int32_t numDocIds ,
char **docIdPtrs ,
uint8_t *scoresVec ,
qvec_t *explicitBits ,
int16_t *hardCounts ,
float *affWeights ,
float *affinities );
//void computeWeightedScores ( int32_t numDocIds ,
// int32_t *finalScores ,
// unsigned char *scoresVec ,
// qvec_t *explicitBits ,
// float *affWeights ,
// char **docIdPtrs );
// fill top docids with the best from top docids 2
int32_t fillTopDocIds ( //char **topp ,
//score_t *tops ,
//unsigned char *topb ,
int32_t numTop ,
char **tmpp2 ,
uint8_t *tmpsv2 ,
uint32_t *tmpdv2 ,
uint32_t *tmplatv2 ,
uint32_t *tmplonv2 ,
//uint32_t *tmptimev2 ,
//uint32_t *tmpendv2 ,
qvec_t *tmpev2 ,
int16_t *tmphc2 ,
uint8_t *tmpeids2,
int32_t numTmp2 );
// comparison between a result and the min top result
bool isBetterThanWeakest ( unsigned char bscore ,
score_t score ,
char *docIdPtr ,
unsigned char minTopBitScore ,
score_t minTopScore ,
char *minTopDocIdPtr );
// has init already been called?
bool isInitialized ( ) { return m_initialized; };
// . these are set from calling addLists() above
// . we log all matching topDocIds if isDebug is true
char **getTopDocIdPtrs ( ) { return m_topDocIdPtrs; };
uint8_t *getTopBitScores ( ) { return m_topBitScores; };
score_t *getTopScores ( ) { return m_topScores; };
int32_t getNumTopDocIds ( ) { return m_numTopDocIds; };
// . get how many results we have in the topDocIds list
// . if "thatIncludeAllTerms" is true, results must have all terms
// from all indexLists that we haven't read ALL of yet
// . these are just used for stats keeping purposes in Msg3a for now
// . these are all 0 for top tree right now.. until i fix it
int32_t getNumExactExplicitMatches (){return m_numExactExplicitMatches;};
int32_t getNumExactImplicitMatches (){return m_numExactImplicitMatches;};
float *getFreqWeights ( ) { return m_freqWeights; };
// includes both synonym and phrase affinties combined
float *getAffWeights ( ) { return m_affWeights; };
float *getAffinities ( ) { return m_affinities; };
float *getFreqWeightsQS ( ) { return m_freqWeightsQS; };
float *getAffWeightsQS ( ) { return m_affWeightsQS; };
void setStuffFromImap();
// how long to add the last batch of lists
int64_t m_addListsTime;
int64_t m_t1 ;
int64_t m_t2 ;
uint32_t m_totalDocIds;
int32_t m_numPanics;
int32_t m_numCollisions;
int32_t m_numPtrs; // in the beginning at least
int32_t m_numLoops;
int64_t m_estimatedTotalHits;
int32_t m_errno;
int32_t m_numSlots;
char *m_coll;
collnum_t m_collnum;
// this is true if no use to read more termlist data from disk, we
// will not get any more docids...
bool m_isDiskExhausted;
// point to array of term freqs, 1-1 with qterms
int64_t *m_termFreqs;
// how many docs in the collection?
int64_t m_docsInColl;
// . the imap stuff
// . m_imap is set in Query.cpp, but we contain it here
// . m_imap really only needed when doing rat=1 queries, but we use it
// for rat=0 because we may filter out UOR'ed QueryTerms or
// irrelevant synoynms
// . allows us to speed up the intersection process by intersecting
// smaller termlists first and thereby keeping the ongoing result
// set as small as possible
// . imap maps the query term's INTERSECTION ORDER to the query term
// number in Query::m_qterms[].
// . so if it is the the ith termlist to intersect, then its
// corresponding QueryTerm would be Query::m_qterms[m_imap[i]]
// . sometimes query terms are ignored as far as the intersection goes
// and we also intersect the term's termlists in a different order
// than the term's number in the Query::m_qterms[] array has it
// . m_blocksize[0] + m_blocksize[1] is how many QueryTerms termlists
// to intersect for the first call to IndexTable2::addLists2_r() for
// rat=1 operations.
// . we choose to intersect smaller termlists first to minimize the
// result set and maximize the speed of future intersections, since
// less results are involved
// . m_blocksize[0]=X and m_blocksize[1]=Y means to intersect
// m_lists[m_imap[0]], m_lists[m_imap[ 1]],... m_lists[m_imap[X-1]]
// (block #0)
// with
// m_lists[m_imap[X]], m_lists[m_imap[X+1]],... m_lists[m_imap[Y-1]]
// (block #1)
// . m_blocksize[i] can be > 1 because we need to intersect phrase
// termlists when intersecting their constituent word termlists in
// order to salvage docids that may not have the word terms
// explicitly, but do have the phrase terms explicitly, and thereby
// have the word terms implicitly.
// . m_nb is how many total blocks we have
// . m_sizes[i] is how many docids are in block #i total... *i think*..
// . after we intersect block #0 with block #1, further intersections
// are performed by calling addLists2_r() with one block at a time.
// . addLists2_r() will preserve the ongoing intersection in
// m_topDocIds2[], etc.
// . when it is called with a single block it hashes the docids in
// m_topDocIds2[] with the provided block of termlists to get the
// new result set which is again stored in m_topDocIds2[], ...
int32_t m_imap [ MAX_QUERY_TERMS ];
int32_t m_ni;
int32_t m_sizes [ MAX_QUERY_TERMS ];
int32_t m_blocksize [ MAX_QUERY_TERMS ];
int32_t m_nb;
// maps from query term space to imap space
int32_t m_revImap [ MAX_QUERY_TERMS ];
bool m_imapIsValid;
// for speed we must map Query::m_qterms[].m_leftPhraseTermNum into
// "imap space"
int32_t m_imapLeftPhraseTermNum [ MAX_QUERY_TERMS ];
int32_t m_imapRightPhraseTermNum [ MAX_QUERY_TERMS ];
// . these describe the lists associated with each m_termId
// . each query term has a corresponding
// term frequency weight (m_freqWeights[])
// . each query term has a corresponding
// phrase affinity weight (m_affWeights[])
// . each query term has a corresponding
// phrase affinity (m_phraseAffinities[][])
// . these are NOT exactly 1-1 with Query::m_qterms[]
// . m_freqWeights[i] is the freq weight for m_qterms[m_imap[i]],
// NOT m_qterms[i]
// . m_imap essentially REORDERS the QueryTerms for better intersection
// and may also
// remove some QueryTerms (like UOR'ed query terms and synonyms)
// . "nqt" is used throughout IndexTable2.cpp to indicate number of
// QueryTerms in
// "imap" which is always <= Query::m_numTerms = number of
// Query::m_qterms[]
float m_freqWeights [ MAX_QUERY_TERMS ];
float m_affWeights [ MAX_QUERY_TERMS ];
float m_affinities [ MAX_QUERY_TERMS ];
bool m_computedAffWeights;
// these two correlate with the Query::m_qterms[] instead of being
// in imap space. we pass these back in the Msg39Reply
float m_freqWeightsQS [ MAX_QUERY_TERMS ];
float m_affWeightsQS [ MAX_QUERY_TERMS ];
// we have a list of the top docids
char *m_topDocIdPtrs [ MAX_RESULTS ];
score_t m_topScores [ MAX_RESULTS ];
// 0x80: we have all hard-required terms
// 0x40: we have all other terms explicitly
// 0x20: we have all other terms implicitly
// & 0x1f: count of all terms we have implicitly (includes hard counts)
unsigned char m_topBitScores [ MAX_RESULTS ];
int32_t m_numTopDocIds ;
// if getting more than MAX_RESULTS results, use this top tree to hold
// them rather than the m_top*[] arrays above
class TopTree *m_topTree;
// NSD: Search for 'gigablast' on gk0 gave just 30 results. But when
// a freshness of 30 days was given it gave over 200k results. That
// is because we limit the maxDocIdsToCompute, and in the normal case
// all the results were from gigablast.com and were being clustered out
// So add a check to limit the number of results from the same
// 8 bit dom hash having the same score to X (say 100)
//bool m_doInnerLoopSiteClustering;
bool m_doSiteClustering;
// justed used by Msg3a for stats tracking
int32_t m_numExactExplicitMatches ;
int32_t m_numExactImplicitMatches ;
// a reference to the query
Query *m_q;
bool m_useDateLists;
bool m_sortByDate;
// for events
char m_sortBy;
bool m_showExpiredEvents;
bool m_showInProgress;
// pointLat/pointLon multiplied by 10M into an int
uint32_t m_userLatIntComp;
uint32_t m_userLonIntComp;
int32_t m_latTermOff;
int32_t m_lonTermOff;
int32_t m_timeTermOff;
int32_t m_endTermOff;
// these are NOT in imap space, but in query term space, 1-1 with
// Query::m_qterms[]
IndexList *m_lists;
int32_t m_numLists;
int32_t m_docsWanted;
// has init() been called?
bool m_initialized;
// are we in debug mode?
bool m_isDebug;
// for debug msgs
int32_t m_logstate;
bool m_doRecalc;
bool m_requireAllTerms;
int64_t m_numDocsInColl;
// the current "intersection" is stored in this table
char **m_tmpDocIdPtrs2;
uint32_t *m_tmpDateVec2;
uint32_t *m_tmpLatVec2;
uint32_t *m_tmpLonVec2;
//uint32_t *m_tmpTimeVec2;
//uint32_t *m_tmpEndVec2;
uint8_t *m_tmpScoresVec2;
uint8_t *m_tmpEventIds2;
qvec_t *m_tmpEbitVec2;
int16_t *m_tmpHardCounts2;
int32_t m_maxTmpDocIds2;
int32_t m_numTmpDocIds2;
int32_t m_nexti;
int32_t m_oldnexti;
time_t m_nowUTCMod;
// allocated memory
char *m_buf;
int32_t m_bufSize;
char *m_bufMiddle;
// for large hashtable for sortByDate
//char *m_bigBuf;
//int32_t m_bigBufSize;
// little buffer for the intersection
char *m_localBuf;
int32_t m_localBufSize;
// used by our call to Query::setBitScoresBoolean()
//char *m_bitScoresBuf;
//int32_t m_bitScoresBufSize;
// these are for removing component lists replaced by their compounds
int32_t *m_componentCodes;
bool m_getWeights;
bool m_searchingEvents;
HashTableX *m_sortByDateTablePtr;
class Msg39Request *m_r;
// . a hash table of docids
// . the &sq=docid1+docid2+...+docidN cgi parm can restrict the
// search results to this list of docids
bool m_useYesDocIdTable;
HashTableT<int64_t,char> m_dt;
// likewise, exclude any docid in this table
bool m_useNoDocIdTable;
HashTableT<int64_t,char> m_et;
// cache boolean results
bool m_useBoolTable;
HashTableT<qvec_t,char> m_bt;
// . encapsulate bool cache lookup for bitscores
// . FUCK! addLists2_r() calls this and tries to grow the table
// while in a thread!
inline uint8_t getBitScore(qvec_t ebits){
if (!m_useBoolTable)
return m_q->getBitScore(ebits);
// if table is mostly full do not add anything to it! we are
// like in a thread
if ( 100 * (m_bt.m_numSlotsUsed+1) >= m_bt.m_numSlots * 75 )
return m_q->getBitScore(ebits);
int32_t slot = -1;
uint8_t bscore;
if ((slot = m_bt.getSlot(ebits)) >=0){
//if (m_isDebug)
// logf(LOG_DEBUG,
// "query: getBitScoreCacheHit "
// "bits=0x%016"XINT64"",
// (int64_t) ebits);
bscore = m_bt.getValueFromSlot(slot);
}
else {
//if (m_isDebug)
// logf(LOG_DEBUG,
// "query: getBitScoreCacheMiss "
// "bits=0x%016"XINT64"",
// (int64_t) ebits);
bscore = m_q->getBitScore(ebits);
}
// store new bool value
if (slot < 0){
//if (m_isDebug)
// logf(LOG_DEBUG,
// "query: getBitScoreCacheAdd "
// "bits=0x%016"XINT64"",
// (int64_t) ebits);
m_bt.addKey(ebits,bscore,NULL);
}
return bscore;
};
};
// . get the LOWEST scoring docId from our list of top docIds
// . set "minBitScore22" and "score" for that lowest docId
// . inline this for speed
// . BUT lower docIds are considered higher scoring than higher docIds
inline int32_t IndexTable2::getWeakestTopDocId ( char **topp ,
score_t *tops ,
unsigned char *topb ,
int32_t numTop ,
unsigned char *minBitScore2 ,
score_t *score ,
char **docIdPtr ) {
int64_t tmp = 0LL;
score_t minScore = 0x7fffffff;
unsigned char minBitScore = 0xff;
char *minDocIdPtr = (char *)&tmp;
int32_t mini = -1;
for ( int32_t i = 0 ; i < numTop ; i++ ) {
if ( topb [i] > minBitScore ) continue;
if ( topb [i] < minBitScore ) goto gotIt;
if ( tops [i] > minScore ) continue;
if ( tops [i] < minScore ) goto gotIt;
if ( *(uint32_t *)(topp[i]+1 ) <
*(uint32_t *)(minDocIdPtr+1) ) continue;
if ( *(uint32_t *)(topp[i]+1 ) >
*(uint32_t *)(minDocIdPtr+1) ) goto gotIt;
if ( (*(unsigned char *)(topp[i] ) & 0xfc) <
(*(unsigned char *)(minDocIdPtr) & 0xfc) ) continue;
// ties should not be happening for docid, unless
// it tied with initial setting of minDocIdPtr, in that
// case we should add it!
gotIt:
minScore = tops [i];
minBitScore = topb [i];
minDocIdPtr = topp [i];
mini = i;
}
// set the callers ptrs
*minBitScore2 = minBitScore;
*score = minScore;
*docIdPtr = minDocIdPtr;
// return the lowest scoring docId's position
return mini;
}
// . checks a result against the weakest to see which is better
inline bool IndexTable2::isBetterThanWeakest ( unsigned char bscore ,
score_t score ,
char *docIdPtr ,
unsigned char minTopBitScore ,
score_t minTopScore ,
char *minTopDocIdPtr ) {
// . branch on the bit score
// . ignore 0x80 | 0x40 bits
if ( (bscore&~0xc0) < (minTopBitScore&~0xc0) ) return false;
// automatically add to top if our bscore is the highest so far
if ( (bscore&~0xc0) > (minTopBitScore&~0xc0) ) return true;
// docId must have a better score if it tied matching terms
if ( score < minTopScore ) {
// clear the slot
//explicitBits[i] = 0;
//docIdPtrs[i] = NULL;
return false;
}
if ( score > minTopScore ) return true;
// continue if docId is too big
if ( *(uint32_t *)(docIdPtr+1) >
*(uint32_t *)(minTopDocIdPtr+1) ) {
// clear the slot
//explicitBits[i] = 0;
//docIdPtrs[i] = NULL;
return false;
}
// if top is equal, compare lower 6 bits
if ( (*(uint32_t *)(docIdPtr +1) ==
*(uint32_t *)(minTopDocIdPtr+1)) &&
(*(unsigned char *)(docIdPtr ) & 0xfc) >=
(*(unsigned char *)(minTopDocIdPtr) & 0xfc) ) {
// clear the slot
//explicitBits[i] = 0;
//docIdPtrs[i] = NULL;
return false;
}
return true;
}
inline score_t IndexTable2::getWeightedScore ( unsigned char *scoresVec ,
int32_t nqt ,
float *freqWeights ,
float *affWeights ,
bool requireAllTerms){
// get the min of the scores
float min = 9999999.0;
float phraseMin = 9999999.0;
float pscore;
float pre;
int32_t j;
// reset score
float score = 0.0;
// loop vars
//unsigned char iscore;
float s;
qvec_t ebit;
// tends to neutralize the affects of outliers
for ( int32_t t = 0; t < nqt; t++ ) {
// . if this is zero, ignore it completely
// . we do this for gbxlatitude2: etc.
// query terms whose scores are really
// eventids
if ( freqWeights[t] == 0.0 ) continue;
// invert score
pre = 255 - scoresVec[t];
// apply affinity weights
if ( affWeights[t] > 0.0 )
pre = pre * affWeights[t];
// get score
s = freqWeights[t] * 100.0 * pre;
// add it up
score += s;
// look at this term and its corresponding word or phrse
// terms and get the max of all. must apply the affWeights!
int32_t it = m_imap[t];
// which is the explicit bit for term #t?
ebit = m_q->m_qterms[it].m_explicitBit;
// . skip if term #t is not "hard" required
// . it could be a phrase term or a synonym term and not
// necessary
if ( (ebit & (m_q->m_requiredBits)) == 0 ) {
// demote if phrase
//pre /= 30.0;
// if we are a phrase term, keep the phrase min
if ( pre < phraseMin ) phraseMin = pre;
// get next term
continue;
}
// . get the max between us and our phrase terms
// . do not consider ourselves if phrase term has
// higher affinity than us!!
if ( (j = m_imapLeftPhraseTermNum[t]) >= 0 ) {
pscore = 255-scoresVec[j] ;
// if phrase is not really strong, demote it
//if ( affWeights[j] < affWeights[t] ) pscore /= 30.0;
if ( affWeights[j] > 0 ) pscore *= affWeights[j];
if ( pscore > pre ) pre = pscore;
// we should also weight it by our affinity weight, but
// punish by .5 since it is a phrase
pscore = 255-scoresVec[j] ;
if ( affWeights[t] > 0 ) pscore *= affWeights[t];
if ( pscore > pre ) pre = pscore;
}
if ( (j = m_imapRightPhraseTermNum[t]) >= 0 ) {
pscore = 255-scoresVec[j] ;
// if phrase is not really strong, demote it
//if ( affWeights[j] < affWeights[t] ) pscore /= 30.0;
if ( affWeights[j] > 0 ) pscore *= affWeights[j];
if ( pscore > pre ) pre = pscore;
// we should also weight it by our affinity weight, but
// punish by .5 since it is a phrase
pscore = 255-scoresVec[j] ;
if ( affWeights[t] > 0 ) pscore *= affWeights[t];
if ( pscore > pre ) pre = pscore;
}
// we must also consider all of our synonyms!!
if ( m_q->m_hasSynonyms ) {
for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
// skip if term #k is not our synonym
//if ( m_q->m_qterms[k].m_synonymOf !=
// & m_q->m_qterms[it] ) continue;
// if term #k IMPLIES us, use his score if bigr
if ( (m_q->m_qterms[k].m_implicitBits &
m_q->m_qterms[it].m_explicitBit ) == 0 )
continue;
// convert him to imap space
j = m_imap[k];
// get his score
pscore = 255-scoresVec[j] ;
// . get the scores
// . 'bib' for the query 'michael bibby fact'
// has a syn weight of 0... so allow 0, it
// should punish us!
//if ( affWeights[j] > 0)pscore*=affWeights[j];
if ( affWeights[j] >= 0) pscore*=affWeights[j];
//if ( pscore > pre ) pre = pscore;
// we should also weight it by our affinity
// weight,but punish by .5 since it is a phrase
//pscore = 255-scoresVec[j] ;
if ( affWeights[t] >= 0) pscore*=affWeights[t];
if ( pscore > pre ) pre = pscore;
}
}
// do we have a new min score?
if ( pre >= min ) continue;
// rat=0 and boolean queries might not have this term!
// do not allow min to be 0!
if ( pre <= 0.0 && ! requireAllTerms ) continue;
// ok, we got a winner
if ( pre < min ) min = pre;
}
// use phrase min if we had no terms explicitly
if ( min == 9999999.0 ) min = phraseMin;
// multiply by the min, this rewards non-outliers
//if ( min > 1 && min != 9999999.0 ) score *= min * min;
if ( min != 9999999.0 ) score *= min * min;
//if ( min > 1 && min != 9999999.0 ) score *= min ;
//if ( min > 1 ) score *= min;
// scale down
score /= 100;
// sanity check
if ( ((float)((int32_t)(score+1.0))) < score ) {
logf(LOG_DEBUG,"query: got score breach, score=%f",score);
score = (float)0x7ffffff0;
//char *xx=NULL; *xx = 0;
}
// make sure never 0
return (score_t)(score + 1.0);
}
/*
// . computes a final score from a score vector and term weights
inline int32_t IndexTable2::getWeightedScore ( unsigned char *scoresVec ,
int32_t nqt ,
float *affWeights ) {
// tends to neutralize the affects of outliers
float score = 0.0;
for ( int32_t t = 0; t < nqt; t++ ) {
float s = m_freqWeights[t] * 100.0 *(255-scoresVec[t]);
// let's take this out until we figure out a way
// to deal with small samples without having
// to make Msg3a go to the next tier, when the
// combined samples from each split would be
// enough to compute an affinity. also, we do not
// want splits using different affinities!
if ( affWeights[t] > 0.0 )
s = s * affWeights[t];
// square root it
s = sqrt((double)s);
// do it again
s = sqrt((double)s);
// add up
score += s;
}
// average
float avg = score / nqt;
// square
//avg *= avg;
// scale up for round off
avg *= 100;
// never 0
avg += 1;
// square
return (int32_t)avg;
}
*/
#endif