open-source-search-engine/Tagdb.h
Matt 09de59f026 do not store cblock, etc. tags into tagdb to save
disk space. added tagdb file cache for better performance,
less disk accesses. will help reduce disk load.
put file cache sizes in master controls and if they change
then update the cache size dynamically.
2015-09-10 12:46:00 -06:00

591 lines
17 KiB
C++

// Matt Wells, copyright Jul 2008
#ifndef _TAGDB_H_
#define _TAGDB_H_
#include "Conf.h" // for setting rdb from Conf file
#include "Rdb.h"
#include "Xml.h"
#include "Url.h"
#include "Loop.h"
//#include "DiskPageCache.h"
//#include "CollectionRec.h"
#include "SafeBuf.h"
#include "Msg0.h"
// . now we can store multiple ratings by multiple users or algorithms
// . we can use accountability, we can merge sources, etc.
// . we can use time-based merging
// . predicates on a tag type; "tt" is a Tag::m_type value
// . NOTE(review): defined outside this header. judging by the TT_DUP
//   comment below, "unique" presumably means only one tag of that type
//   is allowed -- confirm against the definition
bool isTagTypeUnique ( int32_t tt ) ;
// . Tag::isIndexable() forwards its m_type to this
bool isTagTypeIndexable ( int32_t tt ) ;
//bool isTagTypeString ( int32_t tt ) ;
// . NOTE(review): presumably decodes the hex text in [src,srcEnd) into
//   "dst" and returns the number of bytes written; the meaning of
//   "decrement" is not visible from this header -- confirm
int32_t hexToBinary ( char *src , char *srcEnd , char *dst , bool decrement );
// . Tag::m_type is set to this if the tag is a dup in the TagRec
// . so if www.xyz.com has one tag and xyz.com has another, then
//   the xyz.com tag should have its m_type set to TT_DUP, but only if
//   that tag type is limited to one tag
#define TT_DUP 123456
// the key type used for tagdb records
#define TAGDB_KEY key128_t
// . one tag: a 128-bit key, a data size, then the fixed fields and m_buf
// . a TagRec can contain multiple Tags, even of the same Tag::m_type
class Tag {
 public:
	// bytes taken by this tag: the key, the 4-byte m_recDataSize field,
	// and the m_recDataSize bytes that follow it
	int32_t getSize ( ) {
		return sizeof(key128_t) + 4 + m_recDataSize;
	}
	// same value as getSize()
	int32_t getRecSize ( ) {
		return getSize();
	}

	void set ( char *site ,
		   char *tagname ,
		   int32_t timestamp ,
		   char *user ,
		   int32_t ip ,
		   char *data ,
		   int32_t dataSize );

	// printing helpers
	int32_t print ( ) ;
	bool printToBuf ( SafeBuf *sb );
	bool printToBufAsAddRequest ( SafeBuf *sb );
	bool printToBufAsXml ( SafeBuf *sb );
	bool printToBufAsXml2 ( SafeBuf *sb );
	bool printToBufAsHtml ( SafeBuf *sb , char *prefix );
	bool printToBufAsTagVector ( SafeBuf *sb );
	// just print the m_data...
	bool printDataToBuf ( SafeBuf *sb );

	bool isType ( char *t );

	// defers to the free function isTagTypeIndexable() on our m_type
	//( m_dataSize == 1 || isType("meta") ); };
	bool isIndexable ( ) {
		return isTagTypeIndexable ( m_type );
	}

	// for parsing output of printToBuf()
	int32_t setFromBuf ( char *p , char *pend ) ;
	int32_t setDataFromBuf ( char *p , char *pend ) ;

	// . layout of m_buf: one size byte, the user name (the size byte
	//   counts the name's terminating \0), then the tag data
	// . NOTE(review): the size byte is read as a plain char, so a
	//   stored size above 127 would read back negative where char is
	//   signed -- confirm set() keeps the user name short enough

	// skip over the size byte and the user name
	char *getTagData ( ) {
		char *afterSizeByte = m_buf + 1;
		return afterSizeByte + *m_buf;
	}
	// what is left of m_buf after the size byte and the user name
	int32_t getTagDataSize ( ) {
		int32_t userSize = *m_buf;
		return m_bufSize - userSize - 1;
	}

	// what user added this tag? the name starts right after the size byte
	char *getUser ( ) {
		return &m_buf[1];
	}

	// the size byte includes the terminating \0, so take that off
	int32_t getUserLen ( ) {
		int32_t userSize = *m_buf;
		return userSize - 1;
	}

	// used to determine if one Tag should overwrite the other! if they
	// have the same dedup hash... then yes...
	int32_t getDedupHash ( );

	// tagdb uses 128 bit keys now
	key128_t m_key;
	// counted by getSize(); see there
	int32_t m_recDataSize;

	// when tag was added/updated
	int32_t m_timestamp;

	// . ip address of user adding tag
	// . prevent multiple turk voters from same ip!
	int32_t m_ip;

	// each tag in a TagRec now has a unique id for ez deletion
	//int32_t m_tagId;

	// the "type" of tag. see the TagDesc array in Tagdb.cpp for a list
	// of all the tag types. m_type is a hash of the type name.
	int32_t m_type;

	// . m_user[] IS ACTUALLY a 6-byte KEY for another TagRec
	// . this is also a user's name like "mwells"
	// . each user has a TagRec whose FULL key is this userHash
	// . m_user[7] is always \0
	//char m_user[8];

	// number of bytes in m_buf (see getTagDataSize())
	int32_t m_bufSize;
	// variable-length tail: size byte, user name, tag data
	char m_buf[0];
};
// . convert "domain_squatter" to ST_DOMAIN_SQUATTER
// . used by CollectionRec::getRegExpNum()
// . NOTE(review): the default tagnameLen of -1 presumably means "measure
//   tagTypeName up to its \0" -- confirm against the definition
int32_t getTagTypeFromStr( char *tagTypeName , int32_t tagnameLen = -1 );
// . convert ST_DOMAIN_SQUATTER to "domain_squatter"
char *getTagStrFromType ( int32_t tagType ) ;
// . (disabled) old fixed cap on what one site or url can have
// . even AFTER the "inheritance loop"
// . includes the 4 bytes used for size and # of tags
//#define MAX_TAGREC_SIZE 1024
// . max "outstanding" msg0 requests for one tagdb lookup
// . sizes TagRec::m_lists[], TagRec::m_listPtrs[] and Msg8a::m_msg0s[]
#define MAX_TAGDB_REQUESTS 3
// . the latest version of the TagRec
//#define TAGREC_CURRENT_VERSION 0
class TagRec {
public:
TagRec();
~TagRec();
void reset();
void constructor ();
// . an rdb record is a key, dataSize, then the data
// . the "data" is al the stuff after "m_dataSize"
//key_t m_key;
//int32_t m_dataSize;
//uint16_t m_numTags;
//char m_version;
//char m_buf [ MAX_TAGREC_SIZE - 12 - 4 - 2 ];
//char *getKey () { return (char *)&m_key; };
//char *getData () { return (char *)this + 12 + 4; };
//int32_t getDataSize () { return m_dataSize; };
//void copy (class TagRec *tp ) {
// gbmemcpy ( this , (void *)tp , tp->getSize() ); };
// includes the 4 byte "header" which consists of the first 2 bytes
// being the size of the actual tag buffer and the second two bytes
// being the number of tags in the actual tag buffer
//int32_t getSize ( ) { return 12+4+1+2+m_dataSize; };
//int32_t getSize ( ) { return 12+4+m_dataSize;};
int32_t getNumTags ( );
int32_t getSize ( ) { return sizeof(TagRec); };
class Tag *getFirstTag ( ) {
if ( m_numListPtrs == 0 ) return NULL;
return (Tag *)m_listPtrs[0]->m_list;
}
bool isEmpty ( ) { return (getFirstTag()==NULL); };
// lists should be in order of precedence i guess
class Tag *getNextTag ( class Tag *tag ) {
// watch out
if ( ! tag ) return NULL;
// get rec size
int32_t recSize = tag->getRecSize();
// point to current tag
char *current = (char *)tag;
// find what list we are in
int32_t i;
for ( i = 0 ; i < m_numListPtrs ; i++ ) {
if ( current < m_listPtrs[i]->m_list ) continue;
if ( current >= m_listPtrs[i]->m_listEnd ) continue;
break;
}
// sanity
if ( i >= m_numListPtrs ) { char *xx=NULL;*xx=0; }
// advance
current += recSize;
// sanity check
if ( recSize > 500000 || recSize < 12 ) {
log("tagdb: corrupt tag recsize %i",(int)recSize);
return NULL;
char *xx=NULL;*xx=0;}
// breach list?
if ( current < m_listPtrs[i]->m_listEnd) return (Tag *)current;
// advance list
i++;
// breach of lists?
if ( i >= m_numListPtrs ) return NULL;
// return that list record then
return (Tag *)(m_listPtrs[i]->m_list);
};
// return the number the tags having particular tag types
int32_t getNumTagTypes ( char *tagTypeStr );
// get a tag from the tagType
class Tag *getTag ( char *tagTypeStr );
class Tag *getTag2 ( int32_t tagType );
//char *getRecEnd ( ) { return (char *)this + getSize(); };
//char *getMaxEnd ( ) { return (char *)this + (int32_t)MAX_TAGREC_SIZE; };
// . for showing under the summary of a search result in PageResults
// . also for Msg6a
int32_t print ( ) ;
bool printToBuf ( SafeBuf *sb );
bool printToBufAsAddRequest ( SafeBuf *sb );
bool printToBufAsXml ( SafeBuf *sb );
bool printToBufAsHtml ( SafeBuf *sb , char *prefix );
bool printToBufAsTagVector ( SafeBuf *sb );
// . make sure not a dup of a pre-existing tag
// . used by the clock code to not at a clock if already in there
// in Msg14.cpp
Tag *getTag ( char *tagTypeStr , char *dataPtr , int32_t dataSize );
int32_t getTimestamp ( char *tagTypeStr , int32_t defalt );
// . functions to act on a site "tag buf", like that in Msg16::m_tagRec
// . first 2 bytes is size, 2nd to bytes is # of tags, then the tags
int32_t getLong ( char *tagTypeStr ,
int32_t defalt ,
Tag **bookmark = NULL ,
int32_t *timeStamp = NULL ,
char **user = NULL );
int32_t getLong ( int32_t tagId ,
int32_t defalt ,
Tag **bookmark = NULL ,
int32_t *timeStamp = NULL ,
char **user = NULL );
int64_t getLongLong ( char *tagTypeStr,
int64_t defalt ,
Tag **bookmark = NULL ,
int32_t *timeStamp = NULL ,
char **user = NULL );
char *getString ( char *tagTypeStr ,
char *defalt = NULL ,
int32_t *size = NULL ,
Tag **bookmark = NULL ,
int32_t *timestamp = NULL ,
char **user = NULL );
// we only store the first 6 chars of "user" into this TagRec, m_buf
/*
bool addTag ( char *tagTypeStr,
int32_t timestamp ,
char *user ,
int32_t ip ,
char *data ,
int32_t dataSize );
// we convert the int32_t to a string for you here...
bool addTag ( char *tagTypeStr ,
int32_t timestamp ,
char *user ,
int32_t ip ,
int32_t dataAsLong ) {
char buf[16]; sprintf(buf,"%"INT32"",dataAsLong);
return addTag(tagTypeStr,timestamp,user,ip,buf,
gbstrlen(buf)); };
// same as above
bool addTag ( Tag *tag );
// add "negative" tags, so when these tags are added to the
// TagRec in tagdb, they will delete them
bool addDelTag ( char *tagTypeStr );
// now you can specify a unique tag id in the case of multiple tags
// that have the same tagType and user
bool removeTags ( char *tagTypeStr , char *user , int32_t tagId = 0 ) ;
bool removeTags ( int32_t tagType , char *user , int32_t tagId = 0 ) ;
bool removeTag ( class Tag *rmTag ) ;
// add/remove all the tags from "tagRec" to our list of tags
bool addTags ( TagRec *tagRec ) ;
bool removeTags ( TagRec *tagRec ) ;
// return false and set g_errno on error
bool addTags ( Tag *tags , char *tagEnd , char *bufEnd );
bool removeTags ( Tag *tags , char *tagEnd , char *bufEnd );
bool replaceTags ( Tag *tags , char *tagEnd , char *bufEnd );
*/
bool setFromBuf ( char *buf , int32_t bufSize );
bool serialize ( SafeBuf &dst );
bool setFromHttpRequest ( HttpRequest *r , TcpSocket *s );
// use this for setFromBuf()
SafeBuf m_sbuf;
// some specified input
//char *m_coll;
Url *m_url;
collnum_t m_collnum;
void (*m_callback ) ( void *state );
void *m_state;
// hold possible tagdb records
RdbList m_lists[MAX_TAGDB_REQUESTS];
// ptrs to lists in the m_lists[] array
RdbList *m_listPtrs[MAX_TAGDB_REQUESTS];
int32_t m_numListPtrs;
};
// . owns the Rdb behind tagdb and declares the helpers that build tagdb
//   key ranges, pick records out of an RdbList and dump the db
// . also declares the "min site inlinks" buffers and their loaders
class Tagdb {
public:
// reset rdb
void reset();
// . TODO: specialized cache to store pre-parsed tagdb recs
// . TODO: have m_useSeals parameter???
bool init ( );
// NOTE(review): presumably a second-stage init given a tree memory budget
// in bytes -- confirm against the definition
bool init2 ( int32_t treeMem );
bool verify ( char *coll );
//bool convert ( char *coll ) ;
// "doVerify" defaults to true
bool addColl ( char *coll, bool doVerify = true );
// used by ../rdb/Msg0 and ../rdb/Msg1
Rdb *getRdb ( ) { return &m_rdb; };
//key128_t makeKey ( Url *u , bool isDelete ) ;
// start/end keys for a site string (these used to take a Url)
key128_t makeStartKey ( char *site );//Url *u ) ;
key128_t makeEndKey ( char *site );//Url *u ) ;
// start/end keys built from a Url
key128_t makeDomainStartKey ( Url *u ) ;
key128_t makeDomainEndKey ( Url *u ) ;
// . get the serialized TagRec from an RdbList of TagRecs from tagdb
// that is the best match for "url"
// . NOTE(review): "recSize" looks like an out-param for the size of the
//   returned record -- confirm against the definition
char *getRec ( RdbList *list , Url *url , int32_t *recSize ,char* coll,
int32_t collLen, RdbList *retList) ;
//DiskPageCache *getDiskPageCache() { return &m_pc; };
//int32_t getGroupId (key_t *key) {return key->n1 & g_hostdb.m_groupMask;}
// . dump tagdb to stdout
// . dump as URL requests so we can re-add with blaster on each host
// . this replaces dumpTagdb() in main.cpp
// . sendPageTagdb() will process such URL requests
void dumpTagdb ( );
// private:
// . returns 0 if url is not a suburl of "site"
int32_t getMatchPoints ( Url *url , Url *site ) ;
bool setHashTable ( ) ;
// . we use the cache in here also for caching tagdb records
// and "not-founds" stored remotely (net cache)
Rdb m_rdb;
//DiskPageCache m_pc;
// . loaders for the site buffers below, and a lookup by 32-bit host hash
// . NOTE(review): which loader fills which buffer, and the buffer format,
//   are not visible from this header
bool loadMinSiteInlinksBuffer ( );
bool loadMinSiteInlinksBuffer2 ( );
int32_t getMinSiteInlinks ( uint32_t hostHash32 ) ;
SafeBuf m_siteBuf1;
SafeBuf m_siteBuf2;
};
// . modeled on Tagdb, but note it does NOT inherit from it: it is a
//   stand-alone class with its own Rdb
class Turkdb {
public:
void reset();
bool init ( );
// "doVerify" defaults to true
bool addColl ( char *coll, bool doVerify = true );
// accessor for the underlying Rdb
Rdb *getRdb ( ) { return &m_rdb; };
Rdb m_rdb;
//DiskPageCache m_pc;
};
// . global instances; only declared here, defined elsewhere
// . NOTE(review): what the second Tagdb instance is for is not visible
//   from this header
extern class Tagdb g_tagdb;
extern class Tagdb g_tagdb2;
extern class Turkdb g_turkdb;
//extern class Tagdb g_sitedb;
// . http page handler for tagdb
// . it also processes the URL requests that Tagdb::dumpTagdb() prints
bool sendPageTagdb ( TcpSocket *s , HttpRequest *req ) ;
///////////////////////////////////////////////
//
// Msg8a gets TagRecs from Tagdb
//
///////////////////////////////////////////////
// msg8a needs to keep a ptr to the tags it
// generates in the "inheritance loop". so don't
// store more than this! log if we do.
//#define MAX_TAGS 128
//#define MSG8A_MAX_REQUEST_SIZE 2048
// . this msg class is for getting AND adding to tagdb
// . it has one Msg0 per possible request (MAX_TAGDB_REQUESTS of them) and
//   a pointer to the caller's TagRec
class Msg8a {
public:
Msg8a ();
~Msg8a ();
void reset();
// . get records from multiple subdomains of url
// . calls g_udpServer.sendRequest() on each subdomain of url
// . all matching records are merged into a final record
// i.e. site tags are also propagated accordingly
// . closest matching "site" is used as the "site" (the site url)
// . stores the tagRec in your "tagRec"
// . "doInheritance" defaults to true and "rdbId" to RDB_TAGDB
// . NOTE(review): the meaning of the bool return is not visible here;
//   presumably false means "blocked, callback will be called" as
//   elsewhere in this codebase -- confirm against the definition
bool getTagRec ( Url *url ,
char *site , // set to NULL to auto set
//char *coll ,
collnum_t collnum,
//bool useCanonicalName ,
bool skipDomainLookup ,
int32_t niceness ,
void *state ,
void (* callback)(void *state ),
TagRec *tagRec ,
bool doInheritance = true ,
char rdbId = RDB_TAGDB);
bool launchGetRequests();
void gotAllReplies ( ) ;
// some specified input
//char *m_coll;
//int32_t m_collLen;
Url *m_url;
//bool m_doFullUrl;
char m_rdbId;
collnum_t m_collnum;
void (*m_callback ) ( void *state );
void *m_state;
// one Msg0 per possible outstanding request
Msg0 m_msg0s[MAX_TAGDB_REQUESTS];
key128_t m_siteStartKey ;
key128_t m_siteEndKey ;
// hold possible tagdb records
//RdbList m_lists[MAX_TAGDB_REQUESTS];
int32_t m_niceness;
// NOTE(review): m_dom, m_hostEnd and m_p look like cursors into the url's
// hostname used while launching requests -- confirm in the .cpp
char *m_dom;
char *m_hostEnd;
char *m_p;
// NOTE(review): presumably counts of requests launched and replies
// received so far -- confirm in the .cpp
int32_t m_requests;
int32_t m_replies;
char m_doneLaunching;
int32_t m_errno;
// we set this for the caller
TagRec *m_tagRec;
// hacks for msg6b
void *m_parent;
int32_t m_slotNum;
// hack for MsgE
void *m_state2;
void *m_state3;
bool m_doInheritance;
};
/*
void handleRequest9a ( UdpSlot *slot , int32_t niceness ) ;
class Msg9a {
public:
Msg9a ();
~Msg9a();
void reset() ;
bool launchAddRequests ( ) ;
static bool registerHandler ( ) {
return g_udpServer.registerHandler ( 0x9a, handleRequest9a );};
// . returns false if blocked, true otherwise
// . sets errno on error
// . "sites" is a NULL-terminated list of space-separated urls
// . if "deleteTags" is false, then the tags will be added to
// the TagRecs specified by the sites in "sites". if a TagRec
// does not exist for a given "site" then it will be added just
// so we can add the Tags to it. If it does exist, we will
// just append the given Tags to it. Tags with the same tagType
// and "user" will be replaced by these Tags in the tagRecPtrs[].
// . if "deleteTags" is true, then the Tags matching the Tags
// given will be removed from each TagRec. If a nonzero timestamp
// or a username that does not start with \0 or a non-zero data
// is specified in the Tag, then that must match the Tag
// being removed as well. In this way we can remove specific
// Tags from a list of Tags that share the same m_tagType.
// . if "deleteTagRecs" is false then the entire TagRec will be removed
// from Tagdb. tagRecPtrs must be NULL in this case.
// . if provided, the ip vector is 1-1 with the sites in "sitesPtrs[]"
// and we set the Tag::m_ip with the corresponding
// ip given by that. this allows XmlDoc.cpp to "ip stamp" a tag.
// and the tag will be rendered invalid by XmlDoc.cpp if the ip of
// the site changes from that! in this way we can invalidate tags
// when a site changes ownership. assuming it also changes ip!
// . you can now pass in either "sites" or sitePtrs/numSitePtrs,
// whichever is easiest for you...
bool addTags ( char *sites ,
char **sitePtrs ,
int32_t numSitePtrs ,
char *coll ,
void *state ,
void (*callback)(void *state) ,
int32_t niceness ,
TagRec *tagRec ,
bool nukeTagRecs ,
int32_t *ipVector );//= NULL );
// like above, but we are adding the output of a
// './gb dump S main 0 -1 1' cmd
bool addTags ( char *dumpFile ,
char *coll ,
void *state ,
void (*callback)(void *state) ,
int32_t niceness );
void (*m_callback ) ( void *state );
void *m_state;
int32_t m_errno;
int32_t m_requests;
int32_t m_replies;
char *m_requestBuf;
int32_t m_requestBufSize;
char *m_p;
char *m_pend;
int32_t m_niceness;
};
*/
// . NOTE(review): defined elsewhere; presumably maps X to a Y value using
//   the n-entry tables x[] and y[] -- confirm against the definition
int32_t getY ( int64_t X , int64_t *x , int64_t *y , int32_t n ) ;
#endif
// Lookup order for the url hostname.domainname.com/mydir/mypage.html
// (aka 1.2.3.4/mydir/mypage.html):
// . hostname.domainname.com/mydir/mypage.html
// . hostname.domainname.com/mydir/
// . hostname.domainname.com
// . domainname.com/mydir/mypage.html
// . domainname.com/mydir/
// . domainname.com
// . 1.2.3.4 /mydir/mypage.html
// . 1.2.3.4 /mydir/
// . 1.2.3.4
// . 1.2.3