open-source-search-engine/IndexList.h

127 lines
4.1 KiB
C
Raw Normal View History

2013-08-03 00:12:24 +04:00
// Matt Wells, copyright Jun 2001
// . IndexList is a list of keys
// . some keys are 12 bytes, some are 6 bytes (compressed)
// . see Indexdb.h for format of the keys
// . you can set from a TitleRec/SiteRec pair
// . you can set from a TermTable
// . we use this class to generate the final indexList for a parsed document
// . we have to do some #include's in the .cpp cuz the TitleRec contains an
// IndexList for holding indexed link Text
// . the TermTable has an addIndexList() function
// . we map 32bit scores from a TermTable to 8bits scores by taking the log
// of the score if it's >= 128*256, otherwise, we keep it as is
// override all funcs in RdbList where m_useShortKeys is true...
// skipCurrentRec() etc need to use m_useHalfKeys in RdbList cuz
// that is needed by many generic routines, like merge_r, RdbMap, Msg1, Msg22..
// We would need to make it a virtual function which would slow things down...
// or make those classes have specialized functions for IndexLists... in
// addition to the RdbLists they already support
#ifndef _INDEXLIST_H_
#define _INDEXLIST_H_
#include "RdbList.h"
//#include "SiteRec.h"
//#include "TermTable.h" // for setting from TitleRec/SiteRec
#include "Indexdb.h" // g_indexdb.makeKey()
class IndexList : public RdbList {
public:
// why do i have to repeat this for LinkInfo::set() calling our set()??
void set ( char *list , long listSize , bool ownData ) {
RdbList::set ( list ,
listSize ,
list , // alloc
listSize , // alloc size
0 , // fixed data size
ownData ,
true , // use half keys?
sizeof(key_t));// 12 bytes per key
};
// . set from a termtable and old IndexList (can be NULL)
// . oldList is subtracted from this list
/*
bool set ( class TermTable *table ,
long long docId ,
class IndexList *oldList ,
class IndexList *newDateList ,
long newDate ,
class IndexList *oldDateList ,
class Sections *newSections ,
class Sections *oldSections ,
unsigned long long *chksum1Ptr , // = NULL,
long niceness ); // = 2);
bool subtract ( TermTable *ourTable , class IndexList *oldList1 );
*/
// clear the low bits on the keys so terms are DELETED
void clearDelBits ( );
void print();
//unsigned char score32to8 ( unsigned long score ) ;
//static unsigned long score8to32(unsigned char score8);
// . these are made for special IndexLists, too
// . getTermId() assumes as 12 byte key
long long getCurrentTermId12 ( ) {
return getTermId12 ( m_listPtr ); };
long long getTermId12 ( char *rec ) {
return (*(unsigned long long *)(&rec[4])) >> 16 ;
};
long long getTermId16 ( char *rec ) {
return (*(unsigned long long *)(&rec[8])) >> 16 ;
};
//long long getTermId12 ( char *rec ) {
// return ((long long)(*(unsigned long *)(m_listPtrHi+2))<<14) |
// ((*(unsigned short *)(m_listPtrHi))>>2) ;
//};
// these 2 assume 12 and 6 byte keys respectively
long long getCurrentDocId () {
if ( isHalfBitOn ( m_listPtr ) ) return getDocId6 (m_listPtr);
else return getDocId12(m_listPtr);
};
long long getDocId ( char *rec ) {
if ( isHalfBitOn ( rec ) ) return getDocId6 (rec);
else return getDocId12(rec);
};
long long getCurrentDocId12 ( ) {
return getDocId12 ( m_listPtr ); };
long long getDocId12 ( char *rec ) {
return ((*(unsigned long long *)(rec)) >> 2) & DOCID_MASK; };
//long long getDocId12 ( char *rec ) {
// ((*(unsigned long *)rec)>>10) |
// (((long long)(*(unsigned short *)(rec+4)))<<22);
//};
long long getDocId6 ( char *rec ) {
long long docid;
*(long *)(&docid) = *(long *)rec;
((char *)&docid)[4] = rec[4];
docid >>= 2;
return docid & DOCID_MASK;
};
// this works with either 12 or 6 byte keys
unsigned char getCurrentScore ( ) {
return getScore(m_listPtr); };
unsigned char getScore ( char *rec ) { return ~rec[5]; };
// uncomplemented...
void setScore ( char *rec , char score ) { rec[5] = score; };
// for date lists only...
long getCurrentDate ( ) { return ~*(long *)(m_listPtr+6); };
};
#endif