// open-source-search-engine/Clusterdb.h

// Copyright Matt Wells, Jul 2002

// . a clusterRec now no longer exists, per se
// . it is the same thing as the key of the titleRec in titledb
// . titleRecs now contain the site and content hashes in the low bits
//   of their key.
// . this allows us to store much cluster info in Titledb's RdbMap
// . so to get cluster info, just read in the titleRec, you do not even
//   need to uncompress it, just get the info from its key
// . we still use the cache here, however, to cache the keys (clusterRecs)
// . later, i may have to do some fancy footwork if we want to store all
//   clusterRecs (titleKeys) in memory.
// . TODO: what if we stored file offsets in tfndb, too? then the titledb
//   RdbMap would not be necessary
//
// . clusterdb will now serve to help do fast site clustering by retaining
//   docids and site hashes in memory
//
//   00000000 00000000 0000000d dddddddd  d = docid
//   dddddddd dddddddd dddddddd dddddfll  f = family filter bit
//   llllssss ssssssss ssssssss sssssshz  q = year quarter bits (no q bits
//                                            appear in this layout)
//                                        l = language bits
//                                        s = site hash
//                                        h = half bit
//                                        z = del bit

#ifndef _CLUSTERDB_H_
#define _CLUSTERDB_H_

//#include "TitleRec.h"  // SAMPLE_VECTOR_SIZE
#include "Rdb.h"
#include "Url.h"
#include "Conf.h"
#include "Titledb.h"
//#include "DiskPageCache.h"

// these are now just TitleRec keys
// . size in bytes of one cluster rec: exactly one key_t, no data payload
#define CLUSTER_REC_SIZE (sizeof(key_t))
// this now includes the gigabit vector
// . size in bytes of a key_t followed by a sample vector and a gigabit vector
// . NOTE(review): SAMPLE_VECTOR_SIZE and GIGABIT_VECTOR_SIZE are not defined
//   in this header, and the "TitleRec.h" include noted as the source of
//   SAMPLE_VECTOR_SIZE is commented out -- they must come from another
//   header before this macro is expanded; confirm it is still used
#define VECTOR_REC_SIZE (sizeof(key_t)+SAMPLE_VECTOR_SIZE+GIGABIT_VECTOR_SIZE)

class Clusterdb {

  public:

	// reset rdb
	void reset();

	// set up our private rdb
	bool init ( );

	// init the rebuild/secondary rdb, used by PageRepair.cpp
	bool init2 ( int32_t treeMem );

	bool verify ( char *coll );

	bool addColl ( char *coll, bool doVerify = true );

	Rdb *getRdb  ( ) { return &m_rdb; };

	// make the cluster rec
	void makeRecFromTitleRec ( char     *rec,
				   class TitleRec *titleRec,
				   bool      isDelKey );

	// make the cluster rec
	void makeRecFromTitleRecKey ( char *rec,
				      char *key,
				      bool  isDelKey );

	// make the cluster rec key
	key_t makeClusterRecKey ( int64_t     docId,
				  bool          familyFilter,
				  uint8_t       languageBits,
				  int32_t          siteHash,
				  bool          isDelKey,
				  bool          isHalfKey = false );

	key_t makeFirstClusterRecKey ( int64_t docId ) {
		return makeClusterRecKey ( docId, false, 0, 0, true ); };
	key_t makeLastClusterRecKey  ( int64_t docId ) {
		return makeClusterRecKey ( docId, true, 0xff, 0xffffffff,
					   false, true ); };

	// convert a titlerec key into a clusterec key
	key_t convertTitleRecKey ( key_t titleKey );

	/*
	uint32_t getGroupId ( int64_t docId ) {
		return g_titledb.getGroupId ( docId ); };

	// cluster rec should be stored on same host as titleRec with the
	// same docId that this key contains
	uint32_t getGroupIdFromKey ( key_t *key ) {
		return g_titledb.getGroupId ( getDocId ( *key ) ); };
	*/

	// NOTE: THESE NOW USE THE REAL CLUSTERDB REC
	// // docId occupies the most significant bytes of the key
	// now docId occupies the bits after the first 23
	int64_t getDocId ( void *k ) {
		//int64_t docId = (k.n0) >> (32+24);
		//docId |= ( ((uint64_t)(k.n1)) << 8 );
		int64_t docId = (((key_t *)k)->n0) >> 35;
		docId |= ( ((uint64_t)(((key_t *)k)->n1)) << 29 );
		return docId;
	};

	//int64_t getDocId ( char *r ) {
	//	return getDocId(*(key_t*)r);
	//}

	uint32_t getSiteHash26 ( char *r ) {
		//return g_titledb.getSiteHash ( (key_t *)r ); };
		return ((uint32_t)(((key_t*)r)->n0 >> 2) & 0x03FFFFFF);
	};

	uint32_t hasAdultContent ( char *r ) {
		//return g_titledb.hasAdultContent ( *(key_t *)r ); };
		return ((uint32_t)(((key_t*)r)->n0 >> 34) & 0x00000001);
	};

	unsigned char getLanguage ( char *r ) {
		return ((unsigned char)(((key_t*)r)->n0 >> 28) & 0x0000003F);
	}

	// NOTE: THESE USE THE OLD "CLUSTERDB" REC GENERATED BY MSG22 (VECTOR)
	//uint32_t getContentHash ( char *r ) {
	//	return g_titledb.getContentHash ( *(key_t *)r ); };

	char getFamilyFilter ( char *r ) {
		if ( (*(int64_t *)r) & 0x0000000400000000LL ) return 1;
		return 0;
	};


	//uint32_t hasAdultWords   ( char *r ) {
	//	return g_titledb.hasAdultWords ( *(key_t *)r ); };

	//uint32_t hasAdultCategory ( char *r ) {
	//	return g_titledb.hasAdultCategory ( *(key_t *)r ); };

	//unsigned char getLanguageFromVector ( char *r ) {
	//	return 0;
	//}

	// the random sample vector
	/*
	void getSampleVector     ( char *vec  ,
				   class Doc *doc,
				   char *coll ,
				   int32_t  collLen ,
				   int32_t niceness = 0 );
	*/
	//void getSampleVector     ( char *vec  , class TermTable *table );
	char getSampleSimilarity ( char *vec0 , char *vec1 , int32_t size );
	// get the content vector from a cluster rec (used by Msg38.cpp)
	//char *getSampleVector  ( char *rec ) { return rec + sizeof(key_t); };

	//char *getGigabitVector    ( char *rec ) {
	//	return rec + sizeof(key_t) + SAMPLE_VECTOR_SIZE ; };
	//char getGigabitSimilarity ( char *vec0 , char *vec1 ,
	//			    int32_t *qtable , int32_t numSlots ) ;

	//DiskPageCache *getDiskPageCache() { return &m_pc; };

  private:

	// this rdb holds urls waiting to be spidered or being spidered
	Rdb m_rdb;

	//DiskPageCache m_pc;
};

// the global clusterdb instance; only declared here, so it must be defined
// in exactly one translation unit
extern class Clusterdb g_clusterdb;
// . a second global instance
// . presumably the rebuild/secondary clusterdb that Clusterdb::init2() sets
//   up for PageRepair.cpp -- confirm against the definition
extern class Clusterdb g_clusterdb2;

#endif