mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 20:27:43 +03:00
103 lines
2.6 KiB
C++
103 lines
2.6 KiB
C++
// Matt Wells, copyright Feb 2001
|
|
|
|
// . catdb record format:
|
|
// . number of catids (1 byte)
|
|
// . list of catids (4 bytes each)
|
|
// . tagdb file # (3 bytes)
|
|
// . tagdb version # (1 byte)
|
|
// . siteUrl (remaining bytes)
|
|
|
|
// . record key:
|
|
// . dddddddd dddddddd dddddddd dddddddd d = domain hash (w/o collection)
|
|
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu u = special url hash
|
|
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu
|
|
|
|
#ifndef _CATDB_H_
|
|
#define _CATDB_H_
|
|
|
|
#define CATREC_CURRENT_VERSION 6
|
|
|
|
#include "Conf.h" // for setting rdb from Conf file
|
|
#include "Rdb.h"
|
|
#include "Url.h"
|
|
#include "Loop.h"
|
|
#include "DiskPageCache.h"
|
|
//#include "CollectionRec.h"
|
|
|
|
class Catdb {
|
|
|
|
public:
|
|
// reset rdb
|
|
void reset();
|
|
|
|
// . TODO: specialized cache because to store pre-parsed tagdb recs
|
|
// . TODO: have m_useSeals parameter???
|
|
bool init ( );
|
|
bool init2 ( int32_t treeMem );
|
|
|
|
bool verify ( char *coll );
|
|
|
|
bool addColl ( char *coll, bool doVerify = true );
|
|
|
|
// . used by ../rdb/Msg0 and ../rdb/Msg1
|
|
Rdb *getRdb ( ) { return &m_rdb; };
|
|
|
|
// calls getKeys and gets the top key
|
|
key_t makeKey ( Url *site , bool isDelete );
|
|
|
|
// binary search on the given list for the given key
|
|
void listSearch ( RdbList *list,
|
|
key_t exactKey,
|
|
char **data,
|
|
int32_t *dataSize );
|
|
|
|
|
|
// . get the serialized SiteRec from an RdbList of SiteRecs
|
|
// that is the best match for "url"
|
|
char *getRec ( RdbList *list , Url *url , int32_t *recSize ,char* coll,
|
|
int32_t collLen ) ;
|
|
|
|
// . find the indirect matches in the list which match a sub path
|
|
// of the url
|
|
int32_t getIndirectMatches ( RdbList *list ,
|
|
Url *url ,
|
|
char **matchRecs ,
|
|
int32_t *matchRecSizes ,
|
|
int32_t maxMatches,
|
|
char *coll,
|
|
int32_t collLen );
|
|
|
|
// . get the keys of all the possible site records for this url
|
|
// . see below for the search order of the sub-urls
|
|
// . if "useIp" is true we use the ip of "url" to form the key range,
|
|
// not the cannoncial domain name
|
|
void getKeyRange ( bool useIp , Url *url ,
|
|
key_t *startKey , key_t *endKey );
|
|
|
|
DiskPageCache *getDiskPageCache() { return &m_pc; };
|
|
|
|
// normalize a url, no www.
|
|
void normalizeUrl ( Url *srcUrl, Url *dstUrl );
|
|
|
|
//int32_t getGroupId ( key_t *key ) {
|
|
// return key->n1 & g_hostdb.m_groupMask;
|
|
//}
|
|
|
|
private:
|
|
// for doing binary search on the list
|
|
char *moveToCorrectKey ( char *listPtr,
|
|
RdbList *list,
|
|
uint32_t domainHash );
|
|
|
|
// . we use the cache in here also for caching tagdb records
|
|
// and "not-founds" stored remotely (net cache)
|
|
Rdb m_rdb;
|
|
|
|
DiskPageCache m_pc;
|
|
|
|
};
|
|
|
|
extern class Catdb g_catdb;
|
|
|
|
#endif
|