// Matt Wells, copyright Jun 2001 // . time intervals of events are now stored in timedb // . basically a linked list of the event time intervals // . allows us to cache the next start time for a docid/eventid // and do very efficient updating by just sampling the current time interval // of timedb to update next start times for just those docid/eventids that // need it // . we use the special cache g_sortByDateTable to cache one record for // each unique docid/eventid we have in timedb. // . if user sets their own "clockset" though this cache will not help so // we have to read the entire timedb for addLists_r() in that case #ifndef _TIMEDB_H_ #define _TIMEDB_H_ #include "Rdb.h" #include "Url.h" #include "Conf.h" #include "Xml.h" #include "Titledb.h" #include "Collectiondb.h" #include "CollectionRec.h" bool initAllSortByDateTables ( ) ; bool initSortByDateTable ( char *coll ) ; bool addTimedbKey ( key128_t *kp , unsigned long nowGlobal , class HashTableX *ht ) ; bool addTmpTimeList(RdbList *list,HashTableX *ht,time_t nowGlobal, long niceness ) ; bool compareTimeTables ( HashTableX *ht1 , HashTableX *ht2 , unsigned long now ); // mdw subtract 7hrs to get into utc, since all our date intervals are in utc #define START2009 (1230793200-7*3600) #define START2030 (1893481200-7*3600) class Timedb { public: // reset rdb void reset(); bool verify ( char *coll ); bool addColl ( char *coll, bool doVerify = true ); // init m_rdb bool init (); // init secondary/rebuild timedb bool init2 ( long treeMem ) ; key128_t makeKey ( time_t startTime , long long docId , uint16_t eventId , time_t endTime , time_t nextStartTime , bool isDelete ) ; key128_t makeStartKey ( time_t startTime ) { return makeKey ( startTime , 0LL, 0, 0, 0, true); }; key128_t makeEndKey ( time_t startTime ) { return makeKey ( startTime , MAX_DOCID,255,0x7fffffff, 0x7fffffff, false); }; // the time in the key is in minutes since jan 1, 2010 time_t getStartTime32 ( key128_t *k ) { return ((k->n1 >> 38) * 60 + START2009); } // need to mask out 26 bits time_t getEndTime32 ( key128_t *k ) { return (((k->n0 >> 27)&0x03ffffff) * 60 + START2009); } // need to mask out 26 bits time_t getNextStartTime32 ( key128_t *k ) { return (((k->n0 >> 1)&0x03ffffff) * 60 + START2009); } // a simple mask on this one long long getDocId ( key128_t *k ) { return (k->n1 & DOCID_MASK); }; // need to mask out 11 bits uint16_t getEventId ( key128_t *k ) { return ((k->n0 >> 53)&0x000007ff); } Rdb *getRdb() { return &m_rdb; }; long getNumTotalEvents() { long total = 0; // loop over all coll's sortbydate tables for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { CollectionRec *cr = g_collectiondb.m_recs[i]; if ( ! cr ) continue; total += cr->m_sortByDateTable.getNumSlotsUsed(); } return total; } // holds binary format time entries Rdb m_rdb; }; extern class Timedb g_timedb; extern class Timedb g_timedb2; class TimeSlot { public: unsigned long m_startTime; unsigned long m_endTime; unsigned long m_nextStartTime; }; // . if event already in progress, list those that close first first // . used by IndexTable2.cpp to get final scores of search results inline uint32_t getTimeScore ( //collnum_t collnum , uint64_t docId , uint16_t eventId , unsigned long nowGlobal , HashTableX *ht , bool showInProgress ) { // select the right table. each collection has its own... //HashTableX *ht = &g_collectiondb.m_recs[collnum]->m_sortByDateTable; // 32 bit keys had too many collisions, now use 64bit uint64_t key64 = eventId; key64 <<= NUMDOCIDBITS; // 38 key64 |= docId; // make a 32 bit hash of docid and eventid //uint32_t key32 = (uint32_t)(docId&0xffffffff) ; // just in case eventid was > 255 //if ( eventId > 255 ) { // key32 ^= (uint32_t)g_hashtab[eventId&0xff][0]; // key32 ^= (uint32_t)g_hashtab[eventId>>8 ][1]; //} //else { // key32 ^= (uint32_t)g_hashtab[eventId][0]; //} // lookup in hashtable to see if we got one already TimeSlot *old = (TimeSlot *)ht->getValue(&key64); // debug test // i think collisions here might be causing us to lose search // results because we get the wrong time for them! and their // intervals ptr ends up being empty. and we print out a debug msg // for that when computing ExpandedResults in Msg40.cpp! /* long slot = ht->getSlot(&key32); if ( slot >= 0 && slot+1 < ht->m_numSlots && ht->m_flags[slot+1] && *(uint32_t *)ht->getKeyFromSlot(slot+1) == key32 ) { log("time collision!!!!!"); TimeSlot *old2 = (TimeSlot *)ht->getValueFromSlot(slot+1); if ( old2->m_startTime < old->m_startTime ) old = old2; } */ // bail if not there if ( ! old ) { // this happens when doing a clockset and we match the // event's query terms but its been too long and timedb // does not have it anymore since we only like store // a year out or so since spider time //log("timedb: docid/eid not found d=%llu eid=%li.", // docId,(long)eventId); return 0; } // over? return 1 then, next smallest score if ( old->m_endTime < nowGlobal ) return 1; // in progress? then base on when closed if ( old->m_startTime < nowGlobal ) { // bad? if ( ! showInProgress ) return 0; // TODO: make sure these are always on top of the // guys that haven't started yet // . well since most times are like // (gdb) p ~(unsigned long)1319584941 // $2 = 2975382354 // this should work. but once we cross the 2B midpoint // it will cause these scores to be below those event's // scores that are starting in the future. return ~old->m_endTime; } // . if not yet started, complement score // . divide by 60 to make smaller than score above, but yet // maintain to the minute accuracy // . divide by 32 for speed! return ~(old->m_startTime) / 32; } #endif