// Matt Wells, Copyright May 2005

// . format of a 16-byte datedb key
// . tttttttt tttttttt tttttttt tttttttt  t = termId (48bits)
// . tttttttt tttttttt DDDDDDDD DDDDDDDD  D = ~date
//   DDDDDDDD DDDDDDDD ssssssss dddddddd  s = ~score
// . dddddddd dddddddd dddddddd dddddd0Z  d = docId (38 bits)

// . format of a 10-byte indexdb key
// . DDDDDDDD DDDDDDDD DDDDDDDD DDDDDDDD  D = ~date
// . ssssssss dddddddd dddddddd dddddddd
// . dddddddd dddddd0Z                    s = ~score d = docId (38 bits)

//
// SPECIAL EVENTDB KEYS. for indexing events.
//

// . format of a 16-byte "eventdb" key with termId of 0
// . for sorting/constraining events with multiple start dates
// . each start date has a "termId 0" key. "D" date is when
//   the event starts. score is the eventId. this key is
//   added by the Events::hashIntervals(eventId) function.
//
// . 00000000 00000000 00000000 00000000  t = termId (48bits)
// . 00000000 00000000 DDDDDDDD DDDDDDDD  D = ~date (in secs after epoch)
//   DDDDDDDD DDDDDDDD IIIIIIII dddddddd  I = eventId
// . dddddddd dddddddd dddddddd dddddd0Z  d = docId (38 bits)

// . format of a 16-byte "eventdb" key from words/phrases
// . each word/phrase of each event has one and only one key of this format.
// . this key is added by the Events::hash() function.
//
// . tttttttt tttttttt tttttttt tttttttt  t = termId (48bits)
// . tttttttt tttttttt 00000000 00000000
//   iiiiiiii IIIIIIII ssssssss dddddddd  s = ~score, [I-i] = eventId RANGE
dddddddd dddddddd dddddddd dddddd0Z d = docId (38 bits) #ifndef _DATEDB_H_ #define _DATEDB_H_ #include "Rdb.h" #include "Conf.h" #include "Indexdb.h" // we define these here, NUMDOCIDBITS is in ../titledb/Titledb.h #define NUMTERMIDBITS 48 // mask the lower 48 bits #define TERMID_MASK (0x0000ffffffffffffLL) #include "Titledb.h" // DOCID_MASK // Msg5.cpp and Datedb.cpp use this //#define MIN_TRUNC (PAGE_SIZE/6 * 4 + 6) // keep it at LEAST 12 million to avoid disasters #define MIN_TRUNC 12000000 class Datedb { public: // resets rdb void reset(); // sets up our m_rdb from g_conf (global conf class) bool init ( ); // init the rebuild/secondary rdb, used by PageRepair.cpp bool init2 ( int32_t treeMem ); bool verify ( char *coll ); bool addColl ( char *coll, bool doVerify = true ); bool addIndexList ( class IndexList *list ) ; // . make a 16-byte key from all these components // . since it is 16 bytes, the big bit will be set key128_t makeKey ( int64_t termId , uint32_t date , unsigned char score , uint64_t docId , bool isDelKey ); key128_t makeStartKey ( int64_t termId , uint32_t date1 ) { return makeKey ( termId , date1, 255 , 0LL , true ); }; key128_t makeEndKey ( int64_t termId , uint32_t date2 ) { return makeKey ( termId , date2, 0 , DOCID_MASK , false ); }; // works on 16 byte full key or 10 byte half key int64_t getDocId ( void *key ) { return ((*(uint64_t *)(key)) >> 2) & DOCID_MASK; }; unsigned char getScore ( void *key ) { return ~(((unsigned char *)key)[5]); }; // use the very top int32_t only /* uint32_t getGroupIdFromKey ( key128_t *key ) { if ( g_conf.m_fullSplit ) return g_titledb.getGroupId ( getDocId((char *)key) ); //#ifdef SPLIT_INDEXDB if ( g_conf.m_indexdbSplit > 1 ) { uint32_t groupId = (((uint32_t*)key)[3]) & g_hostdb.m_groupMask; groupId >>= g_indexdb.m_groupIdShift; uint32_t offset = (key->n0 >> 2) & DOCID_OFFSET_MASK; return g_indexdb.m_groupIdTable [ groupId+ (offset*g_indexdb.m_numGroups) ]; } //#else else return (((uint32_t *)key)[3]) & 
g_hostdb.m_groupMask; //#endif }; */ //#ifdef SPLIT_INDEXDB // for terms like gbdom:xyz.com that only reside in one group and // are not split by docid into multiple groups. reduces disk seeks // while spidering, cuz we use such terms for deduping and for // doing quotas. // ---> IS THIS RIGHT???? MDW uint32_t getNoSplitGroupId ( key128_t *k ) { char *xx=NULL;*xx=0; return 0; // wtf is this? still being used? //return (((uint32_t *)k)[3]) & g_hostdb.m_groupMask; //uint32_t bgid = getBaseGroupId(k); //return g_indexdb.getSplitGroupId(bgid,0); //return bgid; } //uint32_t getBaseGroupId ( key128_t *k ) { // return (((uint32_t *)k)[3]) & g_hostdb.m_groupMask; //} //#endif // extract the termId from a key int64_t getTermId ( key128_t *k ) { int64_t termId = 0LL; gbmemcpy ( &termId , ((char *)k) + 10 , 6 ); return termId ; }; int32_t getDate ( key128_t *k ) { uint32_t date = 0; date = (uint32_t)(k->n1 & 0x000000000000ffffULL); date <<= 16; date |= (uint32_t)((k->n0 & 0xffff000000000000ULL) >> 48); return ~date; } int32_t getEventIdStart ( void *k ) { uint32_t d = getDate ( (key128_t *)k ); return ((uint8_t *)(&d))[1]; }; int32_t getEventIdEnd ( void *k ) { uint32_t d = getDate ( (key128_t *)k ); return ((uint8_t *)(&d))[0]; }; //RdbCache *getCache ( ) { return &m_rdb.m_cache; }; Rdb *getRdb ( ) { return &m_rdb; }; Rdb m_rdb; //DiskPageCache *getDiskPageCache ( ) { return &m_pc; }; //DiskPageCache m_pc; }; extern class Datedb g_datedb; //extern class Datedb g_datedb2; #endif // . the search-within operator "|" // - termlists are sorted by score so that when merging 2 termlists // we can stop when we get the first 10 docIds that have both terms and // we are certain that they are the top 10 highest scoring // - but search within says to disregard the scores of the first list, // so we can still be sure we got the top 10, i guess // - sort by date: like search-within but everybody has a date so the // termlist is huge!!! 
//     we can pass a sub-date termlist, say today's date, and merge that
//     one. if we get no hits then try the last 3 days' date termlist.
//     Unfortunately, we can't have one huge date termlist anyway because
//     we need truncation to make the network part work.