// open-source-search-engine/Indexdb.h

// Matt Wells, Copyright Apr 2001

// . format of a 12-byte indexdb key
// . tttttttt tttttttt tttttttt tttttttt  t = termId (48bits)
// . tttttttt tttttttt ssssssss dddddddd  s = ~score
// . dddddddd dddddddd dddddddd dddddd0Z  d = docId (38 bits)

// . format of a 6-byte indexdb key
// . ssssssss dddddddd dddddddd dddddddd  d = docId, s = ~score
// . dddddddd dddddd1Z 

#ifndef _INDEXDB_H_
#define _INDEXDB_H_

#include "Rdb.h"
#include "Conf.h"
#include "DiskPageCache.h"

// we define these here, NUMDOCIDBITS is in ../titledb/Titledb.h
// width of a termId in bits (the 48 't' bits in the 12-byte key layout
// described at the top of this file)
#define NUMTERMIDBITS 48
// mask the lower 48 bits
// (Indexdb::getTermId(prefixHash,termHash) ANDs the hash64() result with
// this to produce a termId)
#define TERMID_MASK (0x0000ffffffffffffLL)

#include "Titledb.h" // DOCID_MASK

// Msg5.cpp and Indexdb.cpp use this
//#define MIN_TRUNC (GB_PAGE_SIZE/6 * 4 + 6)
// keep it at LEAST 12 million to avoid disasters
// NOTE(review): presumably a floor on the termlist truncation limit --
// confirm against Msg5.cpp / Indexdb.cpp, which are not visible from here
#define MIN_TRUNC 12000000

//#define SPLIT_INDEXDB
//#define INDEXDB_SPLIT 2
//#define INDEXDB_SPLIT 8
//#define DOCID_OFFSET_MASK (INDEXDB_SPLIT-1)
// . expands to a read of g_conf.m_indexdbSplit at every use, so this is a
//   runtime value and not a compile-time constant
// . (x-1) is only a contiguous low-bit mask when x is a power of two, so
//   this presumably relies on m_indexdbSplit being one -- TODO confirm
// . no live code in this header uses it
#define DOCID_OFFSET_MASK (g_conf.m_indexdbSplit-1)
// NOTE(review): presumably the maximum number of shards supported; it is
// not referenced anywhere in this header -- confirm against its users
#define MAX_SHARDS 1024

// . owns the Rdb (m_rdb) and DiskPageCache (m_pc) used for indexdb
// . also provides helpers that build indexdb keys and pull the termId,
//   docId and score back out of a key
// . the key layout is described at the top of this file
class Indexdb {

 public:

	// resets rdb
	void reset();

	// sets up our m_rdb from g_conf (global conf class)
	bool init ( );

	// init the rebuild/secondary rdb, used by PageRepair.cpp
	bool init2 ( int32_t treeMem );

	bool setGroupIdTable();

	bool verify ( char *coll );
	void deepVerify ( char *coll );

	bool addIndexList ( class IndexList *list ) ;

	bool addColl ( char *coll, bool doVerify = true );

	// . start/end keys of the IndexList belonging to a termId
	// . NOTE: score is complemented when stored in the key
	key_t makeStartKey ( int64_t termId );
	key_t makeEndKey   ( int64_t termId );

	// . make a 12-byte key from all these components
	// . since it is 12 bytes, the big bit will be set
	key_t makeKey ( int64_t          termId   ,
			unsigned char      score    ,
			uint64_t docId    ,
			bool               isDelKey );

	// makeKey() with score 255, docId 0 and isDelKey set
	key_t makeFirstKey ( int64_t termId ) {
		return makeKey ( termId , 255 , 0LL , true );
	}

	// makeKey() with score 0, docId DOCID_MASK and isDelKey clear
	key_t makeLastKey ( int64_t termId ) {
		return makeKey ( termId , 0 , DOCID_MASK , false );
	}

	// . termId for a prefixHash/termHash pair
	// . hash64() of the two, cut down to the bits in TERMID_MASK
	int64_t getTermId ( int64_t prefixHash , int64_t termHash ) {
		return hash64 ( prefixHash , termHash ) & TERMID_MASK;
	}

	// . termId stored in a key
	// . the 6 bytes starting at byte offset 6 of the key are copied into
	//   the first 6 bytes of a zeroed int64_t (its low 48 bits on a
	//   little-endian host)
	int64_t getTermId ( key_t *k ) {
		int64_t tid = 0LL;
		char *src = ((char *)k) + 6;
		gbmemcpy ( &tid , src , 6 );
		return tid;
	}

	// by-value flavor, same extraction as the pointer flavor
	int64_t getTermId ( key_t k ) {
		return getTermId ( &k );
	}

	// . docId stored in a key
	// . reads the first 8 bytes of the key as a uint64_t, drops the two
	//   lowest bits and keeps the bits in DOCID_MASK
	int64_t getDocId ( key_t *k ) {
		return ((*(uint64_t *)(k)) >> 2) & DOCID_MASK;
	}

	// by-value flavor, same extraction as the pointer flavor
	int64_t getDocId ( key_t k ) {
		return getDocId ( &k );
	}

	// . score stored in a key given as raw bytes
	// . byte 5 holds the complemented score, so complement it again
	unsigned char getScore ( char *k ) {
		return ~k[5];
	}

	// by-value flavor, looks at the same byte of the key
	unsigned char getScore ( key_t k ) {
		return getScore ( (char *)&k );
	}

	// NOTE: the legacy group-id lookup helpers getGroupId(),
	// getGroupIdFromKey(), getNoSplitGroupId(), getBaseGroupId() and
	// getSplitGroupId() used to be here. they were already disabled
	// (commented out) and have been dropped from this header.

	// . accesses RdbMap to estimate size of the indexList for this termId
	// . returns a pretty tight upper bound if indexList not truncated
	// . if truncated, it does linear interpolation (use exponential!)
	int64_t getTermFreq ( collnum_t collnum , int64_t termId ) ;

	// the rdb backing indexdb
	Rdb *getRdb ( ) {
		return &m_rdb;
	}

	Rdb m_rdb;

	// the disk page cache kept alongside m_rdb
	DiskPageCache *getDiskPageCache ( ) {
		return &m_pc;
	}

	DiskPageCache m_pc;

	// . groupId Table, for getting the correct group id based
	//   on type bits of termId and lower bits of docId
	uint32_t *m_groupIdTable;
	int32_t m_groupIdTableSize;
	int32_t m_groupIdShift;
	int32_t m_numGroups;
};

// global Indexdb instances; only declared here, defined outside this header
// NOTE(review): g_indexdb2 is presumably the rebuild/secondary instance
// that init2() sets up -- confirm against PageRepair.cpp
extern class Indexdb g_indexdb;
extern class Indexdb g_indexdb2;

#endif

// . the search-within operator "|"
//   - termlists are sorted by score so that when merging 2 termlists
//     we can stop once we have the first 10 docIds that contain both terms
//     and be certain that they are the 10 highest scoring
//   - but search-within says to disregard the scores of the first list,
//     so we can still be sure we got the top 10, i guess
//   - sort by date: like search-within, but every document has a date so
//     the date termlist is huge. we can pass a sub-date termlist, say
//     today's date, and merge that one. if we get no hits then try the
//     date termlist for the last 3 days.
//   - we can't have one huge date termlist anyway, because we need
//     truncation to make the networking work.
Initial file population. 2013-08-03 00:12:24 +04:00			`// Matt Wells, Copyright Apr 2001`

			`// . format of a 12-byte indexdb key`
			`// . tttttttt tttttttt tttttttt tttttttt t = termId (48bits)`
			`// . tttttttt tttttttt ssssssss dddddddd s = ~score`
			`// . dddddddd dddddddd dddddddd dddddd0Z d = docId (38 bits)`

			`// . format of a 6-byte indexdb key`
			`// . ssssssss dddddddd dddddddd dddddddd d = docId, s = ~score`
			`// . dddddddd dddddd1Z`

			`#ifndef _INDEXDB_H_`
			`#define _INDEXDB_H_`

			`#include "Rdb.h"`
			`#include "Conf.h"`
			`#include "DiskPageCache.h"`

			`// we define these here, NUMDOCIDBITS is in ../titledb/Titledb.h`
			`#define NUMTERMIDBITS 48`
			`// mask the lower 48 bits`
			`#define TERMID_MASK (0x0000ffffffffffffLL)`

			`#include "Titledb.h" // DOCID_MASK`

			`// Msg5.cpp and Indexdb.cpp use this`
			`//#define MIN_TRUNC (GB_PAGE_SIZE/6 * 4 + 6)`
			`// keep it at LEAST 12 million to avoid disasters`
			`#define MIN_TRUNC 12000000`

			`//#define SPLIT_INDEXDB`
			`//#define INDEXDB_SPLIT 2`
			`//#define INDEXDB_SPLIT 8`
			`//#define DOCID_OFFSET_MASK (INDEXDB_SPLIT-1)`
			`#define DOCID_OFFSET_MASK (g_conf.m_indexdbSplit-1)`
try to fix core on #0 and #16 from empty query. if empty query or n<=0 and &stream=1 then fix bug that was not sending back the reply properly. basically, disable streaming if msg40 would not block. upped MAX_SHARDS from 128 to 1024. should not take up any more mem really or slow things down. 2014-12-10 21:44:01 +03:00			`#define MAX_SHARDS 1024`
Initial file population. 2013-08-03 00:12:24 +04:00
			`class Indexdb {`

			`public:`

			`// resets rdb`
			`void reset();`

			`// sets up our m_rdb from g_conf (global conf class)`
			`bool init ( );`

			`// init the rebuild/secondary rdb, used by PageRepair.cpp`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`bool init2 ( int32_t treeMem );`
Initial file population. 2013-08-03 00:12:24 +04:00
			`bool setGroupIdTable();`

			`bool verify ( char *coll );`
			`void deepVerify ( char *coll );`

			`bool addIndexList ( class IndexList *list ) ;`

			`bool addColl ( char *coll, bool doVerify = true );`

			`// . get start/end keys for IndexList from a termId`
			`// . keys correspond to the start/end of the IndexList for this termId`
			`// . NOTE: score is complemented when stored in the key`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`key_t makeStartKey ( int64_t termId );`
			`key_t makeEndKey ( int64_t termId );`
Initial file population. 2013-08-03 00:12:24 +04:00
			`// . make a 12-byte key from all these components`
			`// . since it is 12 bytes, the big bit will be set`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`key_t makeKey ( int64_t termId ,`
Initial file population. 2013-08-03 00:12:24 +04:00			`unsigned char score ,`
replaced unsigned long long with uint64_t 2014-10-30 22:30:39 +03:00			`uint64_t docId ,`
Initial file population. 2013-08-03 00:12:24 +04:00			`bool isDelKey );`

replace long long with int64_t 2014-10-30 22:36:39 +03:00			`key_t makeFirstKey ( int64_t termId ) {`
Initial file population. 2013-08-03 00:12:24 +04:00			`return makeKey ( termId , 255 , 0LL , true ); };`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`key_t makeLastKey ( int64_t termId ) {`
Initial file population. 2013-08-03 00:12:24 +04:00			`return makeKey ( termId , 0 , DOCID_MASK , false ); };`

			`// get a termId from a prefixHash and termHash`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`int64_t getTermId ( int64_t prefixHash , int64_t termHash ) {`
Initial file population. 2013-08-03 00:12:24 +04:00			`return hash64 ( prefixHash , termHash ) & TERMID_MASK;};`

			`// extract the termId from a key`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`int64_t getTermId ( key_t *k ) {`
			`int64_t termId = 0LL;`
use gbmemcpy not memcpy so we can get profiler working again since memcpy can't be interrupted and backtrace() called. 2015-01-13 22:25:42 +03:00			`gbmemcpy ( &termId , ((char *)k) + 6 , 6 );`
Initial file population. 2013-08-03 00:12:24 +04:00			`return termId ;`
			`};`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`int64_t getTermId ( key_t k ) { return getTermId ( &k ); };`
Initial file population. 2013-08-03 00:12:24 +04:00
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`int64_t getDocId ( key_t k ) {`
Initial file population. 2013-08-03 00:12:24 +04:00			`char rec = (char )&k;`
replaced unsigned long long with uint64_t 2014-10-30 22:30:39 +03:00			`return (((uint64_t )(rec)) >> 2) & DOCID_MASK; };`
Initial file population. 2013-08-03 00:12:24 +04:00
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`int64_t getDocId ( key_t *k ) {`
replaced unsigned long long with uint64_t 2014-10-30 22:30:39 +03:00			`return (((uint64_t )(k)) >> 2) & DOCID_MASK; };`
Initial file population. 2013-08-03 00:12:24 +04:00
			`unsigned char getScore ( key_t k ) {`
			`char rec = (char )&k;`
			`return ~rec[5]; };`

			`unsigned char getScore ( char *k ) {return ~k[5]; };`

			`/*`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`uint32_t getGroupId ( int64_t termId, int64_t docId ) {`
Initial file population. 2013-08-03 00:12:24 +04:00			`if ( g_conf.m_fullSplit )`
			`return g_titledb.getGroupId ( docId );`
			`//#ifdef SPLIT_INDEXDB`
			`if ( g_conf.m_indexdbSplit > 1 ) {`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`uint32_t groupId = (uint32_t)(termId >> 16);`
Initial file population. 2013-08-03 00:12:24 +04:00			`groupId >>= m_groupIdShift;`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`uint32_t offset = docId & DOCID_OFFSET_MASK;`
Initial file population. 2013-08-03 00:12:24 +04:00			`return m_groupIdTable[groupId+(offset*m_numGroups)];`
			`}`
			`//#else`
			`else`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`return (uint32_t)(termId >> 16) &`
Initial file population. 2013-08-03 00:12:24 +04:00			`g_hostdb.m_groupMask;`
			`//#endif`
			`}`

now it compiles with -m32 2014-11-11 01:45:11 +03:00			`uint32_t getGroupIdFromKey ( key_t *k ) {`
Initial file population. 2013-08-03 00:12:24 +04:00			`if ( g_conf.m_fullSplit )`
			`return g_titledb.getGroupId ( getDocId( k) );`
			`//#ifdef SPLIT_INDEXDB`
			`if ( g_conf.m_indexdbSplit > 1 ) {`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`uint32_t groupId = k->n1 & g_hostdb.m_groupMask;`
Initial file population. 2013-08-03 00:12:24 +04:00			`groupId >>= m_groupIdShift;`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`uint32_t offset = (k->n0 >> 2) & DOCID_OFFSET_MASK;`
Initial file population. 2013-08-03 00:12:24 +04:00			`return m_groupIdTable[groupId+(offset*m_numGroups)];`
			`}`
			`//#else`
			`else`
			`return k->n1 & g_hostdb.m_groupMask;`
			`//#endif`
			`}`
			`//#ifdef SPLIT_INDEXDB`

			`// for terms like gbdom:xyz.com that only reside in one group and`
			`// are not split by docid into multiple groups. reduces disk seeks`
			`// while spidering, cuz we use such terms for deduping and for`
			`// doing quotas.`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`uint32_t getNoSplitGroupId ( key_t *k ) {`
Initial file population. 2013-08-03 00:12:24 +04:00			`// keep it simple now`
			`return k->n1 & g_hostdb.m_groupMask;`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`//uint32_t bgid = getBaseGroupId(k);`
Initial file population. 2013-08-03 00:12:24 +04:00			`//return getSplitGroupId(bgid,0);`
			`}`
			`*/`

			`/*`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`uint32_t getBaseGroupId ( key_t *k ) {`
Initial file population. 2013-08-03 00:12:24 +04:00			`return k->n1 & g_hostdb.m_groupMask;`
			`}`

now it compiles with -m32 2014-11-11 01:45:11 +03:00			`uint32_t getSplitGroupId ( uint32_t baseGroupId,`
			`uint32_t offset ) {`
move from groups to shards. got rid of annoying groupid bit mask thing. 2013-10-05 03:18:56 +04:00			`if ( g_hostdb.m_numShards <= 1 ) return 0;`
Initial file population. 2013-08-03 00:12:24 +04:00			`baseGroupId >>= m_groupIdShift;`
			`return m_groupIdTable[baseGroupId+(offset*m_numGroups)];`
			`}`
			`*/`
			`//#endif`

			`// . accesses RdbMap to estimate size of the indexList for this termId`
			`// . returns a pretty tight upper bound if indexList not truncated`
			`// . if truncated, it's does linear interpolation (use exponential!)`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`int64_t getTermFreq ( collnum_t collnum , int64_t termId ) ;`
Initial file population. 2013-08-03 00:12:24 +04:00
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`//int32_t getTruncationLimit ( ){return g_conf.m_indexdbTruncationLimit;};`
Initial file population. 2013-08-03 00:12:24 +04:00
			`//RdbCache *getCache ( ) { return &m_rdb.m_cache; };`
			`Rdb *getRdb ( ) { return &m_rdb; };`

			`Rdb m_rdb;`

			`DiskPageCache *getDiskPageCache ( ) { return &m_pc; };`

			`DiskPageCache m_pc;`

			`//#ifdef SPLIT_INDEXDB`
			`// . groupId Table, for getting the correct group id based`
			`// on type bits of termId and lower bits of docId`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`uint32_t *m_groupIdTable;`
			`int32_t m_groupIdTableSize;`
			`int32_t m_groupIdShift;`
			`int32_t m_numGroups;`
Initial file population. 2013-08-03 00:12:24 +04:00			`//#endif`
			`};`

			`extern class Indexdb g_indexdb;`
			`extern class Indexdb g_indexdb2;`

			`#endif`

			`// . the search-within operator "\|"`
			`// - termlists are sorted by score so that when merging 2 termlists`
			`// we can stop when we get the first 10 docIds that have both terms and`
			`// we are certain that they are the top 10 highest scoring`
			`// - but search within says to disregard the scores of the first list,`
			`// so we can still be sure we got the top 10, i guess`
			`// - sort by date: like search-within but everybody has a date so the`
			`// termlist is huge!!! we can pass a sub-date termlist, say today's`
			`// date and merge that one. if we get no hits then try the last 3 days`
			`// date termlist. Shit, can't have one huge date termlist anyway cuz we`
			`// need truncation to make the network thang work.`