open-source-search-engine/IndexList.h
2014-11-10 14:45:11 -08:00

127 lines
4.1 KiB
C++

// Matt Wells, copyright Jun 2001
// . IndexList is a list of keys
// . some keys are 12 bytes, some are 6 bytes (compressed)
// . see Indexdb.h for format of the keys
// . you can set from a TitleRec/SiteRec pair
// . you can set from a TermTable
// . we use this class to generate the final indexList for a parsed document
// . we have to do some #includes in the .cpp because the TitleRec contains an
// IndexList for holding indexed link text
// . the TermTable has an addIndexList() function
// . we map 32-bit scores from a TermTable to 8-bit scores by taking the log
// of the score if it's >= 128*256; otherwise we keep it as is
// override all funcs in RdbList where m_useShortKeys is true...
// skipCurrentRec() etc need to use m_useHalfKeys in RdbList cuz
// that is needed by many generic routines, like merge_r, RdbMap, Msg1, Msg22..
// We would need to make it a virtual function which would slow things down...
// or make those classes have specialized functions for IndexLists... in
// addition to the RdbLists they already support
#ifndef _INDEXLIST_H_
#define _INDEXLIST_H_
#include "RdbList.h"
//#include "SiteRec.h"
//#include "TermTable.h" // for setting from TitleRec/SiteRec
#include "Indexdb.h" // g_indexdb.makeKey()
// . an RdbList whose records are index keys: full 12-byte keys mixed with
//   compressed 6-byte "half" keys
// . the accessors below decode fields straight out of the raw key bytes using
//   host-byte-order integer loads, so they presumably rely on a little-endian
//   host that tolerates unaligned loads -- TODO confirm for any new platform
class IndexList : public RdbList {
 public:
	// . point this list at a buffer of serialized keys
	// . "list" is handed to RdbList::set() as both the data pointer and the
	//   allocation pointer, and "listSize" as both the data and alloc size
	// . records carry no data payload (fixed data size is 0), half keys are
	//   enabled and the full key size is sizeof(key_t)
	// . why do i have to repeat this for LinkInfo::set() calling our set()??
	void set ( char *list , int32_t listSize , bool ownData ) {
		RdbList::set ( list     ,
			       listSize ,
			       list     , // alloc
			       listSize , // alloc size
			       0        , // fixed data size
			       ownData  ,
			       true     , // use half keys?
			       sizeof(key_t) ); // 12 bytes per key
	}

	// . set from a termtable and old IndexList (can be NULL)
	// . oldList is subtracted from this list
	/*
	bool set ( class TermTable  *table        ,
		   int64_t           docId        ,
		   class IndexList  *oldList      ,
		   class IndexList  *newDateList  ,
		   int32_t           newDate      ,
		   class IndexList  *oldDateList  ,
		   class Sections   *newSections  ,
		   class Sections   *oldSections  ,
		   uint64_t         *chksum1Ptr   , // = NULL,
		   int32_t           niceness     ); // = 2);
	bool subtract ( TermTable *ourTable , class IndexList *oldList1 );
	*/

	// clear the low bits on the keys so terms are DELETED
	void clearDelBits ( );

	// print the list (defined in the .cpp)
	void print();

	//unsigned char score32to8 ( uint32_t score ) ;
	//static uint32_t score8to32(unsigned char score8);

	// . these are made for special IndexLists, too
	// . getTermId12() assumes a 12 byte key, getTermId16() a 16 byte key
	// . both load the last 8 bytes of the key and drop the low 16 bits

	// termId of the 12-byte key at the current list position
	int64_t getCurrentTermId12 ( ) {
		return getTermId12 ( m_listPtr );
	}

	// termId from bytes 4..11 of a 12-byte key
	int64_t getTermId12 ( char *rec ) {
		return (*(uint64_t *)(&rec[4])) >> 16 ;
	}

	// termId from bytes 8..15 of a 16-byte key
	int64_t getTermId16 ( char *rec ) {
		return (*(uint64_t *)(&rec[8])) >> 16 ;
	}

	//int64_t getTermId12 ( char *rec ) {
	//	return ((int64_t)(*(uint32_t *)(m_listPtrHi+2))<<14) |
	//		((*(uint16_t *)(m_listPtrHi))>>2) ;
	//};

	// . these 2 work with either 12 or 6 byte keys
	// . isHalfBitOn() on the record picks the matching decoder

	// docId of the key at the current list position
	int64_t getCurrentDocId () {
		if ( isHalfBitOn ( m_listPtr ) ) return getDocId6 (m_listPtr);
		else                             return getDocId12(m_listPtr);
	}

	// docId of the key at "rec"
	int64_t getDocId ( char *rec ) {
		if ( isHalfBitOn ( rec ) ) return getDocId6 (rec);
		else                       return getDocId12(rec);
	}

	// the next 3 assume 12, 12 and 6 byte keys respectively

	// docId of the 12-byte key at the current list position
	int64_t getCurrentDocId12 ( ) {
		return getDocId12 ( m_listPtr );
	}

	// . docId from a 12-byte key
	// . loads the first 8 bytes of the key, drops the low 2 bits and masks
	//   the result with DOCID_MASK
	int64_t getDocId12 ( char *rec ) {
		return ((*(uint64_t *)(rec)) >> 2) & DOCID_MASK;
	}

	//int64_t getDocId12 ( char *rec ) {
	//	((*(uint32_t *)rec)>>10) |
	//	(((int64_t)(*(uint16_t *)(rec+4)))<<22);
	//};

	// . docId from a 6-byte (half) key
	// . only the first 5 bytes of the key are read, so we never touch
	//   memory past the end of a short record
	int64_t getDocId6 ( char *rec ) {
		// must start zeroed: only 5 of the 8 bytes get overwritten below,
		// and shifting a value with indeterminate (possibly negative)
		// upper bytes is not well defined
		int64_t docid = 0;
		*(int32_t *)(&docid) = *(int32_t *)rec;
		((char *)&docid)[4] = rec[4];
		docid >>= 2;
		return docid & DOCID_MASK;
	}

	// . this works with either 12 or 6 byte keys
	// . the score byte at rec[5] is stored complemented; getScore() undoes
	//   the complement

	// score of the key at the current list position
	unsigned char getCurrentScore ( ) {
		return getScore(m_listPtr);
	}

	// score of the key at "rec"
	unsigned char getScore ( char *rec ) { return ~rec[5]; }

	// . uncomplemented...
	// . stores "score" into rec[5] as given; the caller supplies the byte
	//   exactly as it should appear in the key
	void setScore ( char *rec , char score ) { rec[5] = score; }

	// . for date lists only...
	// . complement of the 32-bit value at byte offset 6 of the current key
	int32_t getCurrentDate ( ) { return ~*(int32_t *)(m_listPtr+6); }
};
#endif