open-source-search-engine/Timedb.h

192 lines
5.9 KiB
C
Raw Normal View History

2013-08-03 00:12:24 +04:00
// Matt Wells, copyright Jun 2001
// . time intervals of events are now stored in timedb
// . basically a linked list of the event time intervals
// . allows us to cache the next start time for a docid/eventid
// and do very efficient updating by just sampling the current time interval
// of timedb to update next start times for just those docid/eventids that
// need it
// . we use the special cache g_sortByDateTable to cache one record for
// each unique docid/eventid we have in timedb.
// . if user sets their own "clockset" though this cache will not help so
// we have to read the entire timedb for addLists_r() in that case
#ifndef _TIMEDB_H_
#define _TIMEDB_H_
#include "Rdb.h"
#include "Url.h"
#include "Conf.h"
#include "Xml.h"
#include "Titledb.h"
#include "Collectiondb.h"
#include "CollectionRec.h"
// Free functions that work on the "sort by date" hash tables. Only the
// declarations live in this header; the definitions are elsewhere.
// NOTE(review): the per-function notes below are inferred from the names and
// signatures only -- confirm against the definitions.

// presumably initializes the sort-by-date table of every collection
bool initAllSortByDateTables ( ) ;
// presumably initializes the sort-by-date table of the collection named "coll"
bool initSortByDateTable ( char *coll ) ;
// takes one timedb key, the current global time and the table to update.
// "class HashTableX" also serves as the forward declaration of that type.
bool addTimedbKey ( key128_t *kp , unsigned long nowGlobal ,
class HashTableX *ht ) ;
// takes a list of timedb records plus the table to update. note that
// nowGlobal is a time_t here but an unsigned long in addTimedbKey().
bool addTmpTimeList(RdbList *list,HashTableX *ht,time_t nowGlobal,
long niceness ) ;
// presumably checks two tables against each other as of time "now"
bool compareTimeTables ( HashTableX *ht1 , HashTableX *ht2 ,
unsigned long now );
// . epoch anchors in seconds, utc, since all our date intervals are in utc
// . mdw's original literals (1230793200 and 1893481200) were 7 hours past
//   midnight utc; the 7*3600 he subtracted is folded into the values below
// . the Timedb key getters add START2009 back onto the minute counts they
//   unpack from a key
#define START2009 1230768000 // 2009-01-01 00:00:00 utc
#define START2030 1893456000 // 2030-01-01 00:00:00 utc
class Timedb {
public:
// reset rdb
void reset();
bool verify ( char *coll );
bool addColl ( char *coll, bool doVerify = true );
// init m_rdb
bool init ();
// init secondary/rebuild timedb
bool init2 ( long treeMem ) ;
key128_t makeKey ( time_t startTime ,
long long docId ,
uint16_t eventId ,
time_t endTime ,
time_t nextStartTime ,
bool isDelete ) ;
key128_t makeStartKey ( time_t startTime ) {
return makeKey ( startTime , 0LL, 0, 0, 0, true); };
key128_t makeEndKey ( time_t startTime ) {
return makeKey ( startTime , MAX_DOCID,255,0x7fffffff,
0x7fffffff, false); };
// the time in the key is in minutes since jan 1, 2010
time_t getStartTime32 ( key128_t *k ) {
return ((k->n1 >> 38) * 60 + START2009); }
// need to mask out 26 bits
time_t getEndTime32 ( key128_t *k ) {
return (((k->n0 >> 27)&0x03ffffff) * 60 + START2009); }
// need to mask out 26 bits
time_t getNextStartTime32 ( key128_t *k ) {
return (((k->n0 >> 1)&0x03ffffff) * 60 + START2009); }
// a simple mask on this one
long long getDocId ( key128_t *k ) {
return (k->n1 & DOCID_MASK); };
// need to mask out 11 bits
uint16_t getEventId ( key128_t *k ) {
return ((k->n0 >> 53)&0x000007ff); }
Rdb *getRdb() { return &m_rdb; };
long getNumTotalEvents() {
long total = 0;
// loop over all coll's sortbydate tables
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
total += cr->m_sortByDateTable.getNumSlotsUsed();
}
return total;
}
// holds binary format time entries
Rdb m_rdb;
};
// the two global Timedb instances; they are only declared here and defined
// elsewhere.
// NOTE(review): g_timedb2 is presumably the secondary/rebuild instance set up
// through Timedb::init2() -- confirm against the definitions.
extern class Timedb g_timedb;
extern class Timedb g_timedb2;
// . the value record getTimeScore() gets back from its HashTableX lookup
// . presumably one of these is cached per unique docid/eventid in each
//   collection's sort-by-date table -- confirm where the tables are filled
// . NOTE(review): the fields look like unix timestamps in seconds;
//   getTimeScore() compares m_startTime and m_endTime directly against
//   nowGlobal
class TimeSlot {
public:
// start of the interval
unsigned long m_startTime;
// end of the interval
unsigned long m_endTime;
// not read anywhere in this header
unsigned long m_nextStartTime;
};
// . if event already in progress, list those that close first first
// . used by IndexTable2.cpp to get final scores of search results
inline uint32_t getTimeScore ( //collnum_t collnum ,
uint64_t docId ,
uint16_t eventId ,
unsigned long nowGlobal ,
HashTableX *ht ,
bool showInProgress ) {
// select the right table. each collection has its own...
//HashTableX *ht = &g_collectiondb.m_recs[collnum]->m_sortByDateTable;
// 32 bit keys had too many collisions, now use 64bit
uint64_t key64 = eventId;
key64 <<= NUMDOCIDBITS; // 38
key64 |= docId;
// make a 32 bit hash of docid and eventid
//uint32_t key32 = (uint32_t)(docId&0xffffffff) ;
// just in case eventid was > 255
//if ( eventId > 255 ) {
// key32 ^= (uint32_t)g_hashtab[eventId&0xff][0];
// key32 ^= (uint32_t)g_hashtab[eventId>>8 ][1];
//}
//else {
// key32 ^= (uint32_t)g_hashtab[eventId][0];
//}
// lookup in hashtable to see if we got one already
TimeSlot *old = (TimeSlot *)ht->getValue(&key64);
// debug test
// i think collisions here might be causing us to lose search
// results because we get the wrong time for them! and their
// intervals ptr ends up being empty. and we print out a debug msg
// for that when computing ExpandedResults in Msg40.cpp!
/*
long slot = ht->getSlot(&key32);
if ( slot >= 0 &&
slot+1 < ht->m_numSlots &&
ht->m_flags[slot+1] &&
*(uint32_t *)ht->getKeyFromSlot(slot+1) == key32 ) {
log("time collision!!!!!");
TimeSlot *old2 = (TimeSlot *)ht->getValueFromSlot(slot+1);
if ( old2->m_startTime < old->m_startTime )
old = old2;
}
*/
// bail if not there
if ( ! old ) {
// this happens when doing a clockset and we match the
// event's query terms but its been too long and timedb
// does not have it anymore since we only like store
// a year out or so since spider time
//log("timedb: docid/eid not found d=%llu eid=%li.",
// docId,(long)eventId);
return 0;
}
// over? return 1 then, next smallest score
if ( old->m_endTime < nowGlobal ) return 1;
// in progress? then base on when closed
if ( old->m_startTime < nowGlobal ) {
// bad?
if ( ! showInProgress ) return 0;
// TODO: make sure these are always on top of the
// guys that haven't started yet
// . well since most times are like
// (gdb) p ~(unsigned long)1319584941
// $2 = 2975382354
// this should work. but once we cross the 2B midpoint
// it will cause these scores to be below those event's
// scores that are starting in the future.
return ~old->m_endTime;
}
// . if not yet started, complement score
// . divide by 60 to make smaller than score above, but yet
// maintain to the minute accuracy
// . divide by 32 for speed!
return ~(old->m_startTime) / 32;
}
#endif