open-source-search-engine/Indexdb.h
2014-05-01 17:07:31 -07:00

206 lines
6.1 KiB
C++

// Matt Wells, Copyright Apr 2001
// . format of a 12-byte indexdb key
// . tttttttt tttttttt tttttttt tttttttt t = termId (48bits)
// . tttttttt tttttttt ssssssss dddddddd s = ~score
// . dddddddd dddddddd dddddddd dddddd0Z d = docId (38 bits)
// . format of a 6-byte indexdb key
// . ssssssss dddddddd dddddddd dddddddd d = docId, s = ~score
// . dddddddd dddddd1Z
#ifndef _INDEXDB_H_
#define _INDEXDB_H_
#include "Rdb.h"
#include "Conf.h"
#include "DiskPageCache.h"
// we define these here, NUMDOCIDBITS is in ../titledb/Titledb.h
// number of termId bits stored in an indexdb key (see the key format above)
#define NUMTERMIDBITS 48
// mask the lower 48 bits
// (used by Indexdb::getTermId() to cut a 64-bit hash down to a termId)
#define TERMID_MASK (0x0000ffffffffffffLL)
#include "Titledb.h" // DOCID_MASK
// Msg5.cpp and Indexdb.cpp use this
//#define MIN_TRUNC (GB_PAGE_SIZE/6 * 4 + 6)
// keep it at LEAST 12 million to avoid disasters
#define MIN_TRUNC 12000000
//#define SPLIT_INDEXDB
//#define INDEXDB_SPLIT 2
//#define INDEXDB_SPLIT 8
//#define DOCID_OFFSET_MASK (INDEXDB_SPLIT-1)
// the split count is read from g_conf at run time rather than from the
// compile-time INDEXDB_SPLIT constant that is commented out above
// NOTE(review): a mask of (split-1) presumably assumes the split count is a
// power of two -- confirm
#define DOCID_OFFSET_MASK (g_conf.m_indexdbSplit-1)
// NOTE(review): not referenced anywhere in this header; presumably an upper
// bound on the number of shards -- confirm against its users
#define MAX_SHARDS 128
// . Indexdb owns the Rdb (m_rdb) and the DiskPageCache (m_pc) declared below
//   and provides inline helpers to build and take apart indexdb keys
// . the inline key accessors copy raw key bytes with memcpy() and therefore
//   only yield the documented fields on a little-endian host
class Indexdb {

 public:

    // resets rdb
    void reset();

    // sets up our m_rdb from g_conf (global conf class)
    bool init ( );

    // init the rebuild/secondary rdb, used by PageRepair.cpp
    bool init2 ( long treeMem );

    // implemented out of line
    // NOTE(review): presumably fills m_groupIdTable below -- confirm
    bool setGroupIdTable();

    // implemented out of line; "coll" names the collection to operate on
    bool verify     ( char *coll );
    void deepVerify ( char *coll );

    bool addIndexList ( class IndexList *list ) ;

    bool addColl ( char *coll, bool doVerify = true );

    // . get start/end keys for IndexList from a termId
    // . keys correspond to the start/end of the IndexList for this termId
    // . NOTE: score is complemented when stored in the key
    key_t makeStartKey ( long long termId );
    key_t makeEndKey   ( long long termId );

    // . make a 12-byte key from all these components
    // . since it is 12 bytes, the big bit will be set
    // . NOTE(review): the key-format diagram at the top of this file shows
    //   a 0 in the second-lowest bit of a 12-byte key; confirm which bit
    //   "big bit" refers to
    key_t makeKey ( long long          termId   ,
                    unsigned char      score    ,
                    unsigned long long docId    ,
                    bool               isDelKey );

    // . first key of a termId's range: score 255 (stored complemented),
    //   docId 0, built as a delete key
    key_t makeFirstKey ( long long termId ) {
        return makeKey ( termId , 255 , 0LL , true );
    }

    // . last key of a termId's range: score 0 (stored complemented),
    //   the largest docId (DOCID_MASK), not a delete key
    key_t makeLastKey ( long long termId ) {
        return makeKey ( termId , 0 , DOCID_MASK , false );
    }

    // get a termId from a prefixHash and termHash
    long long getTermId ( long long prefixHash , long long termHash ) {
        return hash64 ( prefixHash , termHash ) & TERMID_MASK;
    }

    // . extract the termId from a key
    // . copies bytes 6..11 of the key into the first 6 bytes of a zeroed
    //   long long, so the two remaining bytes stay 0
    long long getTermId ( key_t *k ) {
        long long termId = 0LL;
        memcpy ( &termId , ((char *)k) + 6 , 6 );
        return termId ;
    }
    long long getTermId ( key_t k ) { return getTermId ( &k ); }

    // . extract the docId from a key
    // . loads the first 8 bytes of the key, drops the two low bits and
    //   masks the result with DOCID_MASK
    // . uses memcpy() (like getTermId() above) instead of dereferencing a
    //   casted "unsigned long long *", so the load relies neither on the
    //   key being 8-byte aligned nor on pointer type punning
    long long getDocId ( key_t *k ) {
        unsigned long long lower = 0ULL;
        memcpy ( &lower , (char *)k , 8 );
        return (lower >> 2) & DOCID_MASK;
    }
    long long getDocId ( key_t k ) { return getDocId ( &k ); }

    // . extract the score from a key
    // . byte 5 of the key holds the complemented score, so flip it back
    unsigned char getScore ( char *k ) { return ~k[5]; }
    unsigned char getScore ( key_t k ) { return getScore ( (char *)&k ); }

    // legacy group-id lookup code, disabled; kept for reference
    /*
    unsigned long getGroupId ( long long termId, long long docId ) {
        if ( g_conf.m_fullSplit )
            return g_titledb.getGroupId ( docId );
        //#ifdef SPLIT_INDEXDB
        if ( g_conf.m_indexdbSplit > 1 ) {
            unsigned long groupId = (unsigned long)(termId >> 16);
            groupId >>= m_groupIdShift;
            unsigned long offset = docId & DOCID_OFFSET_MASK;
            return m_groupIdTable[groupId+(offset*m_numGroups)];
        }
        //#else
        else
            return (unsigned long)(termId >> 16) &
                g_hostdb.m_groupMask;
        //#endif
    }
    unsigned long getGroupIdFromKey ( key_t *k ) {
        if ( g_conf.m_fullSplit )
            return g_titledb.getGroupId ( getDocId( k) );
        //#ifdef SPLIT_INDEXDB
        if ( g_conf.m_indexdbSplit > 1 ) {
            unsigned long groupId = k->n1 & g_hostdb.m_groupMask;
            groupId >>= m_groupIdShift;
            unsigned long offset = (k->n0 >> 2) & DOCID_OFFSET_MASK;
            return m_groupIdTable[groupId+(offset*m_numGroups)];
        }
        //#else
        else
            return k->n1 & g_hostdb.m_groupMask;
        //#endif
    }
    //#ifdef SPLIT_INDEXDB
    // for terms like gbdom:xyz.com that only reside in one group and
    // are not split by docid into multiple groups. reduces disk seeks
    // while spidering, cuz we use such terms for deduping and for
    // doing quotas.
    unsigned long getNoSplitGroupId ( key_t *k ) {
        // keep it simple now
        return k->n1 & g_hostdb.m_groupMask;
        //unsigned long bgid = getBaseGroupId(k);
        //return getSplitGroupId(bgid,0);
    }
    */
    /*
    unsigned long getBaseGroupId ( key_t *k ) {
        return k->n1 & g_hostdb.m_groupMask;
    }
    unsigned long getSplitGroupId ( unsigned long baseGroupId,
                                    unsigned long offset ) {
        if ( g_hostdb.m_numShards <= 1 ) return 0;
        baseGroupId >>= m_groupIdShift;
        return m_groupIdTable[baseGroupId+(offset*m_numGroups)];
    }
    */
    //#endif

    // . accesses RdbMap to estimate size of the indexList for this termId
    // . returns a pretty tight upper bound if indexList not truncated
    // . if truncated, it does linear interpolation (use exponential!)
    long long getTermFreq ( collnum_t collnum , long long termId ) ;

    //long getTruncationLimit ( ){return g_conf.m_indexdbTruncationLimit;};

    //RdbCache *getCache ( ) { return &m_rdb.m_cache; };

    // accessor for, and storage of, the underlying Rdb
    Rdb *getRdb ( ) { return &m_rdb; }
    Rdb  m_rdb;

    // accessor for, and storage of, the disk page cache
    DiskPageCache *getDiskPageCache ( ) { return &m_pc; }
    DiskPageCache  m_pc;

    //#ifdef SPLIT_INDEXDB
    // . groupId Table, for getting the correct group id based
    //   on type bits of termId and lower bits of docId
    // . only referenced by the disabled code above within this header
    unsigned long *m_groupIdTable;
    long           m_groupIdTableSize;
    long           m_groupIdShift;
    long           m_numGroups;
    //#endif
};
// the global Indexdb instance; declared here, defined elsewhere
extern class Indexdb g_indexdb;
// NOTE(review): presumably the rebuild/secondary instance that init2() sets
// up for PageRepair.cpp -- confirm
extern class Indexdb g_indexdb2;
#endif
// . the search-within operator "|"
// - termlists are sorted by score so that when merging 2 termlists
// we can stop once we have the first 10 docIds that contain both terms,
// and we are certain that they are the 10 highest scoring
// - but search-within says to disregard the scores of the first list,
// so we can still be sure we got the top 10, i guess
// - sort by date: like search-within, but every document has a date so the
// date termlist is huge! we can pass a sub-date termlist, say today's
// date, and merge that one. if we get no hits then try the termlist for
// the last 3 days. we can't have one huge date termlist anyway because we
// need truncation to make the networking work.