open-source-search-engine/TopTree.h
Matt Wells 2d4af1aefe index numbers as integers too, not just floats
so we can sort by spider date without losing
128 seconds of resolution.
2014-02-06 20:57:54 -08:00

153 lines
4.0 KiB
C++

// Matt Wells, copyright Jul 2004
// . class used to hold the top scoring search results
// . filled by IndexTable.cpp
// . used by Msg38 to get cluster info for each TopNode
// . used by Msg39 to serialize into a reply
#ifndef _TOPTREE_H_
#define _TOPTREE_H_
#include "Clusterdb.h" // SAMPLE_VECTOR_SIZE, 48 bytes for now
#include "IndexTable2.h" // score_t definition
#include "RdbTree.h"
// . one scored search result (a docid plus its score) held by a TopTree
// . nodes live in the TopTree::m_nodes array and link to each other by
//   array index (m_parent/m_left/m_right) rather than by pointer
// . NOTE(review): the order and types of these fields fix the in-memory
//   layout of a node; the file header says Msg39 serializes results from
//   this tree, so confirm nothing depends on the layout before reordering
class TopNode {
public:
//unsigned char m_bscore ; // bit #6(0x40) is on if has all explicitly
// do not allow a higher-tiered node to outrank a lower that has
// bit #6 set, under any circumstance
// (the two comment lines above describe the retired m_bscore field)

// NOTE(review): presumably this node's depth within the tree, as
// maintained by TopTree::setDepths()/computeDepth() -- confirm in
// TopTree.cpp
char m_depth ;
// Msg39 now looks up the cluster recs so we can do clustering
// really quick on each machine, assuming we have a full split and the
// entire clusterdb is in our local disk page cache.
// NOTE(review): the meaning of the individual m_clusterLevel values is
// not defined in this header
char m_clusterLevel;
key_t m_clusterRec;
// no longer needed, Msg3a does not need, it has already
//unsigned char m_tier ;
// floating point score of this result
float m_score ;
// docid of the document this node represents
long long m_docId;
// . option for using int scores
// . NOTE(review): presumably used in place of m_score when
//   TopTree::m_useIntScores is true -- confirm in TopTree.cpp
long m_intScore;
// clustering info
//long m_kid ; // result from our same site below us
//unsigned long m_siteHash ;
//unsigned long m_contentHash ;
//long m_rank ;
// the lower 64 bits of the cluster rec, used by Msg51, the new
// class for doing site clustering
//uint64_t m_clusterRec;
// . for getting similarity between titleRecs
// . this is so big only include if we need it
//long m_vector [ VECTOR_SIZE ];
// tree info, indexes into m_nodes array
long m_parent;
long m_left; // index of the left kid
long m_right; // index of the right kid
//long long getDocId ( );
//long long getDocIdForMsg3a ( );
};
// . binary tree of TopNodes used to hold the top scoring search results
// . all nodes are stored in one array, m_nodes, and refer to each other by
//   index into that array
// . only getEmptyNode(), getLowNode(), getMinScore() and getNode() are
//   defined here; everything else is implemented outside this header
class TopTree {
public:
TopTree();
~TopTree();
// free mem
void reset();
// . pre-allocate memory
// . "docsWanted" is the number of results the caller wants and
//   "doSiteClustering" says whether site clustering is on
// . NOTE(review): the bool return presumably reports allocation
//   success -- confirm in TopTree.cpp
bool setNumNodes ( long docsWanted , bool doSiteClustering );
// . add a node
// . get an empty first, fill it in and call addNode(t)
// . returns m_emptyNode, an index into m_nodes[]
long getEmptyNode ( ) { return m_emptyNode; };
// . you can add a new node
// . it will NOT overwrite a node with same bscore/score/docid
// . it will NOT add if bscore/score/docid < m_tail node
// otherwise it will remove m_tail node if
// m_numNodes == m_numUsedNodes
// . NOTE(review): "tnn" is presumably the index of "t" within m_nodes[],
//   i.e. the value getEmptyNode() returned -- confirm in TopTree.cpp
bool addNode ( TopNode *t , long tnn );
// index into m_nodes[] of the lowest node (m_lowNode)
long getLowNode ( ) { return m_lowNode ; };
// . this is computed and stored on demand
// . WARNING: only call after all nodes have been added!
long getHighNode ( ) ;
// . returns the float score of the low node, or -1.0 when m_lowNode is
//   negative
// . reads only TopNode::m_score, never m_intScore
// . NOTE(review): check whether callers rely on this when m_useIntScores
//   is set
float getMinScore ( ) {
if ( m_lowNode < 0 ) return -1.0;
return m_nodes[m_lowNode].m_score;
}
// . step from node #i to its neighbor
// . NOTE(review): presumably in score order, returning a negative index
//   at either end -- confirm in TopTree.cpp
long getPrev ( long i );
long getNext ( long i );
// . debugging aid
// . NOTE(review): presumably validates the tree structure and returns
//   false on corruption, logging details if "printMsgs" is true -- confirm
bool checkTree ( bool printMsgs ) ;
long computeDepth ( long i ) ;
void deleteNodes ( );
// NOTE(review): presumably true if docid "d" is already in the tree
bool hasDocId ( long long d );
// . pointer to node #i in m_nodes[]
// . no bounds checking is done on "i"
TopNode *getNode ( long i ) { return &m_nodes[i]; }
// ptr to the mem block
TopNode *m_nodes;
// NOTE(review): presumably the size in bytes of the m_nodes block
long m_allocSize;
// optional dedup vectors... very big, VECTOR_REC_SIZE-12 bytes each
// (512) so we make this an option
//long *m_sampleVectors;
//bool m_useSampleVectors;
// . NOTE(review): the original comment here ("which is next to be used,
//   after m_nextPtr") refers to m_nextPtr, which this class does not declare
// . per the addNode() notes above, the tree is full when this equals
//   m_numNodes
long m_numUsedNodes;
// total count
long m_numNodes;
// the top of the tree
long m_headNode;
// . always keep track of the high and low nodes
// . IndexTable.cpp likes to replace the low-scoring tail often
// . Msg39.cpp likes to print out starting at the high-scorer
// . these are indices into m_nodes[] array
long m_lowNode;
long m_highNode;
// use this to set "t" in call to addNode(t)
long m_emptyNode;
// NOTE(review): m_pickRight, m_vcount, m_cap and m_partial are not
// documented in this header; see TopTree.cpp for their meaning
bool m_pickRight;
float m_vcount ;
long m_cap ;
float m_partial ;
// copy of the setNumNodes() argument of the same name (presumably)
bool m_doSiteClustering;
// . NOTE(review): presumably selects TopNode::m_intScore over
//   TopNode::m_score when ordering nodes -- confirm in TopTree.cpp
bool m_useIntScores;
long m_docsWanted;
long m_ridiculousMax;
// NOTE(review): presumably a flag recording that docids were evicted
// from the tree -- confirm in TopTree.cpp
char m_kickedOutDocIds;
//long long m_lastKickedOutDocId;
// . NOTE(review): m_domCount and m_domMinNode are presumably indexed by
//   the 8-bit domHash (see deleteNode() below), hence 256 slots
long m_domCount[256];
// the node with the minimum "score" for that domHash
long m_domMinNode[256];
// an embedded RdbTree for limiting the storing of keys to X
// keys per domHash, where X is usually "m_ridiculousMax"
RdbTree m_t2;
private:
// remove node #i; "domHash" is the 8-bit domain hash of that node
void deleteNode ( long i , uint8_t domHash ) ;
// helpers for keeping the tree balanced (see TopNode::m_depth)
void setDepths ( long i ) ;
long rotateLeft ( long i ) ;
long rotateRight ( long i ) ;
};
#endif