open-source-search-engine/TopTree.h

157 lines
4.3 KiB
C
Raw Normal View History

2013-08-03 00:12:24 +04:00
// Matt Wells, copyright Jul 2004
// . class used to hold the top scoring search results
// . filled by IndexTable.cpp
// . used by Msg38 to get cluster info for each TopNode
// . used by Msg39 to serialize into a reply
#ifndef _TOPTREE_H_
#define _TOPTREE_H_
#include "Clusterdb.h" // SAMPLE_VECTOR_SIZE, 48 bytes for now
2014-03-14 00:09:33 +04:00
//#include "IndexTable2.h" // score_t definition
2013-08-03 00:12:24 +04:00
#include "RdbTree.h"
// . one candidate search result held by a TopTree
// . nodes live in the TopTree::m_nodes array and refer to each other by
//   array index (m_parent/m_left/m_right), not by pointer
// . NOTE(review): the file header says Msg39 serializes the top results
//   into a reply; treat member order and types as a layout contract until
//   the serialization code has been checked
// . NOTE(review): the bare date lines inside this class look like
//   blame-view residue from extraction, not source text
class TopNode {
public:
// retired member, kept for history:
//unsigned char m_bscore ; // bit #6(0x40) is on if has all explicitly
// do not allow a higher-tiered node to outrank a lower that has
// bit #6 set, under any circumstance

// NOTE(review): presumably the depth of the subtree rooted at this node,
// maintained via TopTree::setDepths()/computeDepth() -- confirm in
// TopTree.cpp. stored in a plain char, so the representable range is small
char m_depth ;
// Msg39 now looks up the cluster recs so we can do clustering
// really quick on each machine, assuming we have a full split and the
// entire clusterdb is in our local disk page cache.
char m_clusterLevel;
key_t m_clusterRec;
2014-11-18 05:13:36 +03:00
// retired member: no longer needed, Msg3a does not need it
// (the original comment is cut off mid-sentence)
2013-08-03 00:12:24 +04:00
//unsigned char m_tier ;
// floating-point score; this is the field TopTree::getMinScore() reads
float m_score ;
2014-10-30 22:36:39 +03:00
// id of the document this result refers to
int64_t m_docId;
// option for using int scores
// NOTE(review): presumably used in place of m_score when
// TopTree::m_useIntScores is set -- confirm in TopTree.cpp
2014-11-11 01:45:11 +03:00
int32_t m_intScore;
2013-08-03 00:12:24 +04:00
// clustering info (all retired, kept for history)
2014-11-11 01:45:11 +03:00
//int32_t m_kid ; // result from our same site below us
//uint32_t m_siteHash ;
//uint32_t m_contentHash ;
//int32_t m_rank ;
2013-08-03 00:12:24 +04:00
// retired: the lower 64 bits of the cluster rec, used by Msg51, the new
// class for doing site clustering. superseded by the key_t m_clusterRec
// member declared above.
//uint64_t m_clusterRec;
// . retired: for getting similarity between titleRecs
// . this is so big only include if we need it
2014-11-11 01:45:11 +03:00
//int32_t m_vector [ VECTOR_SIZE ];
2013-08-03 00:12:24 +04:00
// tree info, indexes into m_nodes array
// NOTE(review): TopTree treats a negative index as "no node" for
// m_lowNode (see getMinScore); presumably the same convention applies
// to these links -- confirm
2014-11-11 01:45:11 +03:00
int32_t m_parent;
int32_t m_left; // index of the left child
int32_t m_right; // index of the right child
2013-08-03 00:12:24 +04:00
// retired: so we can quickly remove its scoring info from the scoreinfo
// buf and replace with new docid's scoring info
//int64_t m_scoreInfoBufOffset;
2014-10-30 22:36:39 +03:00
// retired accessor; callers read m_docId directly
//int64_t getDocId ( );
2013-08-03 00:12:24 +04:00
2014-10-30 22:36:39 +03:00
// retired accessor
//int64_t getDocIdForMsg3a ( );
2013-08-03 00:12:24 +04:00
};
// . container for the top scoring search results
// . results are TopNode records kept in one block (m_nodes) and linked
//   into a binary tree by array index
// . only the trivial accessors are defined inline here; everything else
//   is declared only and defined elsewhere (presumably TopTree.cpp)
// . NOTE(review): the bare date lines inside this class look like
//   blame-view residue from extraction, not source text
class TopTree {
public:
TopTree();
~TopTree();
// free mem
void reset();
// . pre-allocate memory for the nodes
// . "docsWanted" is the number of results the caller wants and
//   "doSiteClustering" selects site clustering; see m_docsWanted and
//   m_doSiteClustering below
// . NOTE(review): presumably returns false on allocation failure -- confirm
2014-11-11 01:45:11 +03:00
bool setNumNodes ( int32_t docsWanted , bool doSiteClustering );
2013-08-03 00:12:24 +04:00
// . add a node
// . get an empty first, fill it in and call addNode(t)
// . returns m_emptyNode, an index into m_nodes[]
2014-11-11 01:45:11 +03:00
int32_t getEmptyNode ( ) { return m_emptyNode; };
2013-08-03 00:12:24 +04:00
// . you can add a new node
// . it will NOT overwrite a node with same bscore/score/docid
// . it will NOT add if bscore/score/docid < m_tail node
// otherwise it will remove m_tail node if
// m_numNodes == m_numUsedNodes
// . NOTE(review): "m_tail" and "bscore" do not exist in this header
//   anymore; "m_tail" presumably means the node at m_lowNode -- confirm
// . NOTE(review): "tnn" is presumably the index of "t" in m_nodes[], as
//   returned by getEmptyNode() -- confirm
2014-11-11 01:45:11 +03:00
bool addNode ( TopNode *t , int32_t tnn );
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
// index into m_nodes[] of the lowest-scoring node (m_lowNode)
int32_t getLowNode ( ) { return m_lowNode ; };
2013-08-03 00:12:24 +04:00
// . this is computed and stored on demand
// . WARNING: only call after all nodes have been added!
2014-11-11 01:45:11 +03:00
int32_t getHighNode ( ) ;
2013-08-03 00:12:24 +04:00
// . score of the low node, or -1.0 when m_lowNode is negative
// . always reads the float m_score, never m_intScore
float getMinScore ( ) {
if ( m_lowNode < 0 ) return -1.0;
return m_nodes[m_lowNode].m_score;
}
2014-11-11 01:45:11 +03:00
// . step from node index "i" to a neighboring node index
// . NOTE(review): presumably in-order predecessor/successor by score,
//   with a negative return at the ends -- confirm in TopTree.cpp
int32_t getPrev ( int32_t i );
int32_t getNext ( int32_t i );
2013-08-03 00:12:24 +04:00
// . sanity check of the tree structure
// . NOTE(review): presumably returns false if corruption is found and
//   logs details when "printMsgs" is true -- confirm
bool checkTree ( bool printMsgs ) ;
2014-11-11 01:45:11 +03:00
// NOTE(review): presumably computes the depth of the subtree rooted at
// node "i" -- confirm
int32_t computeDepth ( int32_t i ) ;
2013-08-03 00:12:24 +04:00
// NOTE(review): presumably empties the tree without freeing m_nodes,
// unlike reset() which is documented as freeing memory -- confirm
void deleteNodes ( );
2014-10-30 22:36:39 +03:00
// NOTE(review): presumably true if some node has m_docId == d -- confirm
bool hasDocId ( int64_t d );
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
// . address of node "i" inside m_nodes[]
// . no bounds check is done here; "i" must be a valid index
TopNode *getNode ( int32_t i ) { return &m_nodes[i]; }
2013-08-03 00:12:24 +04:00
// ptr to the mem block
TopNode *m_nodes;
2014-11-11 01:45:11 +03:00
// NOTE(review): presumably the size in bytes of the m_nodes block
int32_t m_allocSize;
2013-08-03 00:12:24 +04:00
// retired: optional dedup vectors... very big, VECTOR_REC_SIZE-12 bytes
// each (512) so we make this an option
2014-11-11 01:45:11 +03:00
//int32_t *m_sampleVectors;
2013-08-03 00:12:24 +04:00
//bool m_useSampleVectors;
// . number of nodes currently in use; compared against m_numNodes
//   (see the addNode comment above)
// . original note: "which is next to be used, after m_nextPtr"
//   (m_nextPtr is not declared in this header)
2014-11-11 01:45:11 +03:00
int32_t m_numUsedNodes;
2013-08-03 00:12:24 +04:00
// total count
2014-11-11 01:45:11 +03:00
int32_t m_numNodes;
2013-08-03 00:12:24 +04:00
// the top of the tree
2014-11-11 01:45:11 +03:00
int32_t m_headNode;
2013-08-03 00:12:24 +04:00
// . always keep track of the high and low nodes
// . IndexTable.cpp likes to replace the low-scoring tail often
// . Msg39.cpp likes to print out starting at the high-scorer
// . these are indices into m_nodes[] array
// . a negative m_lowNode means there is no low node (see getMinScore)
2014-11-11 01:45:11 +03:00
int32_t m_lowNode;
int32_t m_highNode;
2013-08-03 00:12:24 +04:00
// use this to set "t" in call to addNode(t)
2014-11-11 01:45:11 +03:00
int32_t m_emptyNode;
2013-08-03 00:12:24 +04:00
// NOTE(review): m_pickRight, m_vcount, m_cap and m_partial are not
// documented in this header; see their uses in TopTree.cpp
bool m_pickRight;
float m_vcount ;
2014-11-11 01:45:11 +03:00
int32_t m_cap ;
2013-08-03 00:12:24 +04:00
float m_partial ;
// NOTE(review): presumably a copy of the setNumNodes() argument
bool m_doSiteClustering;
// NOTE(review): presumably selects TopNode::m_intScore over
// TopNode::m_score when comparing nodes -- confirm
bool m_useIntScores;
2014-11-11 01:45:11 +03:00
// NOTE(review): presumably a copy of the setNumNodes() argument
int32_t m_docsWanted;
// usual per-domHash key limit for m_t2 (see the m_t2 comment below)
int64_t m_ridiculousMax;
2013-08-03 00:12:24 +04:00
// NOTE(review): declared as char; presumably a flag that is set once any
// docid has been kicked out of the tree -- confirm
char m_kickedOutDocIds;
2014-10-30 22:36:39 +03:00
//int64_t m_lastKickedOutDocId;
2014-11-11 01:45:11 +03:00
// NOTE(review): 256 slots matches the uint8_t domHash taken by
// deleteNode(); presumably a per-domHash node count -- confirm
int32_t m_domCount[256];
2013-08-03 00:12:24 +04:00
// the node with the minimum "score" for that domHash
2014-11-11 01:45:11 +03:00
int32_t m_domMinNode[256];
2013-08-03 00:12:24 +04:00
// an embedded RdbTree for limiting the storing of keys to X
// keys per domHash, where X is usually "m_ridiculousMax"
RdbTree m_t2;
private:
2014-11-11 01:45:11 +03:00
// remove node "i"; "domHash" is the 8-bit domain hash associated with it
void deleteNode ( int32_t i , uint8_t domHash ) ;
// NOTE(review): presumably recomputes TopNode::m_depth starting at node
// "i" -- confirm
void setDepths ( int32_t i ) ;
// NOTE(review): presumably tree rotations about node "i" that return the
// index of the new subtree root -- confirm
int32_t rotateLeft ( int32_t i ) ;
int32_t rotateRight ( int32_t i ) ;
2013-08-03 00:12:24 +04:00
};
#endif