open-source-search-engine/Tagdb.h
2015-01-31 15:18:06 -07:00

588 lines
17 KiB
C++

// Matt Wells, copyright Jul 2008
#ifndef _TAGDB_H_
#define _TAGDB_H_
#include "Conf.h" // for setting rdb from Conf file
#include "Rdb.h"
#include "Xml.h"
#include "Url.h"
#include "Loop.h"
#include "DiskPageCache.h"
//#include "CollectionRec.h"
#include "SafeBuf.h"
#include "Msg0.h"
// . now we can store multiple rating by multiple users or algorithms
// . we can use accountability, we can merge sources, etc.
// . we can use time-based merging
bool isTagTypeUnique ( int32_t tt ) ;
bool isTagTypeIndexable ( int32_t tt ) ;
//bool isTagTypeString ( int32_t tt ) ;
int32_t hexToBinary ( char *src , char *srcEnd , char *dst , bool decrement );
// . Tag::m_type is this if its a dup in the TagRec
// . so if www.xyz.com has one tag and xyz.com has another, then
// the xyz.com tag should have its m_type set to TT_DUP, but only if we can only have
// one of those tag types...
#define TT_DUP 123456
#define TAGDB_KEY key128_t
// a TagRec can contain multiple Tags, even of the same Tag::m_type
class Tag {
public:
int32_t getSize ( ) { return sizeof(key128_t) + 4 + m_recDataSize; };
int32_t getRecSize ( ) { return sizeof(key128_t) + 4 + m_recDataSize; };
void set ( char *site ,
char *tagname ,
int32_t timestamp ,
char *user ,
int32_t ip ,
char *data ,
int32_t dataSize );
int32_t print ( ) ;
bool printToBuf ( SafeBuf *sb );
bool printToBufAsAddRequest ( SafeBuf *sb );
bool printToBufAsXml ( SafeBuf *sb );
bool printToBufAsXml2 ( SafeBuf *sb );
bool printToBufAsHtml ( SafeBuf *sb , char *prefix );
bool printToBufAsTagVector ( SafeBuf *sb );
// just print the m_data...
bool printDataToBuf ( SafeBuf *sb );
bool isType ( char *t );
bool isIndexable ( ) {
return isTagTypeIndexable ( m_type ); }
//( m_dataSize == 1 || isType("meta") ); };
// for parsing output of printToBuf()
int32_t setFromBuf ( char *p , char *pend ) ;
int32_t setDataFromBuf ( char *p , char *pend ) ;
// skip of the username, whose size (including \0) is encoded
// as the first byte in the m_recData buffer
char *getTagData ( ) {return m_buf + *m_buf + 1;};
int32_t getTagDataSize ( ) {return m_bufSize - *m_buf - 1; };
// what user added this tag?
char *getUser ( ) { return m_buf + 1;};
// remove the terminating \0 which is included as part of the size
int32_t getUserLen ( ) { return *m_buf - 1; };
// used to determine if one Tag should overwrite the other! if they
// have the same dedup hash... then yes...
int32_t getDedupHash ( );
// tagdb uses 128 bit keys now
key128_t m_key;
int32_t m_recDataSize;
// when tag was added/updated
int32_t m_timestamp;
// . ip address of user adding tag
// . prevent multiple turk voters from same ip!
int32_t m_ip;
// each tag in a TagRec now has a unique id for ez deletion
//int32_t m_tagId;
// the "type" of tag. see the TagDesc array in Tagdb.cpp for a list
// of all the tag types. m_type is a hash of the type name.
int32_t m_type;
// . m_user[] IS ACTUALLY a 6-byte KEY for another TagRec
// . this is also a user's name like "mwells"
// . each user has a TagRec whose FULL key is this userHash
// . m_user[7] is always \0
//char m_user[8];
int32_t m_bufSize;
char m_buf[0];
};
// . convert "domain_squatter" to ST_DOMAIN_SQUATTER
// . used by CollectionRec::getRegExpNum()
int32_t getTagTypeFromStr( char *tagTypeName , int32_t tagnameLen = -1 );
// . convert ST_DOMAIN_SQUATTER to "domain_squatter"
char *getTagStrFromType ( int32_t tagType ) ;
// . max # of tags any one site or url can have
// . even AFTER the "inheritance loop"
// . includes the 4 bytes used for size and # of tags
//#define MAX_TAGREC_SIZE 1024
// max "oustanding" msg0 requests sent by TagRec::lookup()
#define MAX_TAGDB_REQUESTS 3
// . the latest version of the TagRec
//#define TAGREC_CURRENT_VERSION 0
class TagRec {
public:
TagRec();
~TagRec();
void reset();
void constructor ();
// . an rdb record is a key, dataSize, then the data
// . the "data" is al the stuff after "m_dataSize"
//key_t m_key;
//int32_t m_dataSize;
//uint16_t m_numTags;
//char m_version;
//char m_buf [ MAX_TAGREC_SIZE - 12 - 4 - 2 ];
//char *getKey () { return (char *)&m_key; };
//char *getData () { return (char *)this + 12 + 4; };
//int32_t getDataSize () { return m_dataSize; };
//void copy (class TagRec *tp ) {
// gbmemcpy ( this , (void *)tp , tp->getSize() ); };
// includes the 4 byte "header" which consists of the first 2 bytes
// being the size of the actual tag buffer and the second two bytes
// being the number of tags in the actual tag buffer
//int32_t getSize ( ) { return 12+4+1+2+m_dataSize; };
//int32_t getSize ( ) { return 12+4+m_dataSize;};
int32_t getNumTags ( );
int32_t getSize ( ) { return sizeof(TagRec); };
class Tag *getFirstTag ( ) {
if ( m_numListPtrs == 0 ) return NULL;
return (Tag *)m_listPtrs[0]->m_list;
}
bool isEmpty ( ) { return (getFirstTag()==NULL); };
// lists should be in order of precedence i guess
class Tag *getNextTag ( class Tag *tag ) {
// watch out
if ( ! tag ) return NULL;
// get rec size
int32_t recSize = tag->getRecSize();
// point to current tag
char *current = (char *)tag;
// find what list we are in
int32_t i;
for ( i = 0 ; i < m_numListPtrs ; i++ ) {
if ( current < m_listPtrs[i]->m_list ) continue;
if ( current >= m_listPtrs[i]->m_listEnd ) continue;
break;
}
// sanity
if ( i >= m_numListPtrs ) { char *xx=NULL;*xx=0; }
// advance
current += recSize;
// sanity check
if ( recSize > 500000 ) { char *xx=NULL;*xx=0;}
// breach list?
if ( current < m_listPtrs[i]->m_listEnd) return (Tag *)current;
// advance list
i++;
// breach of lists?
if ( i >= m_numListPtrs ) return NULL;
// return that list record then
return (Tag *)(m_listPtrs[i]->m_list);
};
// return the number the tags having particular tag types
int32_t getNumTagTypes ( char *tagTypeStr );
// get a tag from the tagType
class Tag *getTag ( char *tagTypeStr );
class Tag *getTag2 ( int32_t tagType );
//char *getRecEnd ( ) { return (char *)this + getSize(); };
//char *getMaxEnd ( ) { return (char *)this + (int32_t)MAX_TAGREC_SIZE; };
// . for showing under the summary of a search result in PageResults
// . also for Msg6a
int32_t print ( ) ;
bool printToBuf ( SafeBuf *sb );
bool printToBufAsAddRequest ( SafeBuf *sb );
bool printToBufAsXml ( SafeBuf *sb );
bool printToBufAsHtml ( SafeBuf *sb , char *prefix );
bool printToBufAsTagVector ( SafeBuf *sb );
// . make sure not a dup of a pre-existing tag
// . used by the clock code to not at a clock if already in there
// in Msg14.cpp
Tag *getTag ( char *tagTypeStr , char *dataPtr , int32_t dataSize );
int32_t getTimestamp ( char *tagTypeStr , int32_t defalt );
// . functions to act on a site "tag buf", like that in Msg16::m_tagRec
// . first 2 bytes is size, 2nd to bytes is # of tags, then the tags
int32_t getLong ( char *tagTypeStr ,
int32_t defalt ,
Tag **bookmark = NULL ,
int32_t *timeStamp = NULL ,
char **user = NULL );
int32_t getLong ( int32_t tagId ,
int32_t defalt ,
Tag **bookmark = NULL ,
int32_t *timeStamp = NULL ,
char **user = NULL );
int64_t getLongLong ( char *tagTypeStr,
int64_t defalt ,
Tag **bookmark = NULL ,
int32_t *timeStamp = NULL ,
char **user = NULL );
char *getString ( char *tagTypeStr ,
char *defalt = NULL ,
int32_t *size = NULL ,
Tag **bookmark = NULL ,
int32_t *timestamp = NULL ,
char **user = NULL );
// we only store the first 6 chars of "user" into this TagRec, m_buf
/*
bool addTag ( char *tagTypeStr,
int32_t timestamp ,
char *user ,
int32_t ip ,
char *data ,
int32_t dataSize );
// we convert the int32_t to a string for you here...
bool addTag ( char *tagTypeStr ,
int32_t timestamp ,
char *user ,
int32_t ip ,
int32_t dataAsLong ) {
char buf[16]; sprintf(buf,"%"INT32"",dataAsLong);
return addTag(tagTypeStr,timestamp,user,ip,buf,
gbstrlen(buf)); };
// same as above
bool addTag ( Tag *tag );
// add "negative" tags, so when these tags are added to the
// TagRec in tagdb, they will delete them
bool addDelTag ( char *tagTypeStr );
// now you can specify a unique tag id in the case of multiple tags
// that have the same tagType and user
bool removeTags ( char *tagTypeStr , char *user , int32_t tagId = 0 ) ;
bool removeTags ( int32_t tagType , char *user , int32_t tagId = 0 ) ;
bool removeTag ( class Tag *rmTag ) ;
// add/remove all the tags from "tagRec" to our list of tags
bool addTags ( TagRec *tagRec ) ;
bool removeTags ( TagRec *tagRec ) ;
// return false and set g_errno on error
bool addTags ( Tag *tags , char *tagEnd , char *bufEnd );
bool removeTags ( Tag *tags , char *tagEnd , char *bufEnd );
bool replaceTags ( Tag *tags , char *tagEnd , char *bufEnd );
*/
bool setFromBuf ( char *buf , int32_t bufSize );
bool serialize ( SafeBuf &dst );
bool setFromHttpRequest ( HttpRequest *r , TcpSocket *s );
// use this for setFromBuf()
SafeBuf m_sbuf;
// some specified input
//char *m_coll;
Url *m_url;
collnum_t m_collnum;
void (*m_callback ) ( void *state );
void *m_state;
// hold possible tagdb records
RdbList m_lists[MAX_TAGDB_REQUESTS];
// ptrs to lists in the m_lists[] array
RdbList *m_listPtrs[MAX_TAGDB_REQUESTS];
int32_t m_numListPtrs;
};
class Tagdb {
public:
// reset rdb
void reset();
// . TODO: specialized cache because to store pre-parsed tagdb recs
// . TODO: have m_useSeals parameter???
bool init ( );
bool init2 ( int32_t treeMem );
bool verify ( char *coll );
//bool convert ( char *coll ) ;
bool addColl ( char *coll, bool doVerify = true );
// used by ../rdb/Msg0 and ../rdb/Msg1
Rdb *getRdb ( ) { return &m_rdb; };
//key128_t makeKey ( Url *u , bool isDelete ) ;
key128_t makeStartKey ( char *site );//Url *u ) ;
key128_t makeEndKey ( char *site );//Url *u ) ;
key128_t makeDomainStartKey ( Url *u ) ;
key128_t makeDomainEndKey ( Url *u ) ;
// . get the serialized TagRec from an RdbList of TagRecs from tagdb
// that is the best match for "url"
char *getRec ( RdbList *list , Url *url , int32_t *recSize ,char* coll,
int32_t collLen, RdbList *retList) ;
DiskPageCache *getDiskPageCache() { return &m_pc; };
//int32_t getGroupId (key_t *key) {return key->n1 & g_hostdb.m_groupMask;}
// . dump tagdb to stdout
// . dump as URL requests so we can re-add with blaster on each host
// . this replaces dumpTagdb() in main.cpp
// . sendPageTagdb() will process such URL requests
void dumpTagdb ( );
// private:
// . returns 0 if url is not a suburl of "site"
int32_t getMatchPoints ( Url *url , Url *site ) ;
bool setHashTable ( ) ;
// . we use the cache in here also for caching tagdb records
// and "not-founds" stored remotely (net cache)
Rdb m_rdb;
DiskPageCache m_pc;
bool loadMinSiteInlinksBuffer ( );
bool loadMinSiteInlinksBuffer2 ( );
int32_t getMinSiteInlinks ( uint32_t hostHash32 ) ;
SafeBuf m_siteBuf1;
SafeBuf m_siteBuf2;
};
// derive this from tagdb
class Turkdb {
public:
void reset();
bool init ( );
bool addColl ( char *coll, bool doVerify = true );
Rdb *getRdb ( ) { return &m_rdb; };
Rdb m_rdb;
DiskPageCache m_pc;
};
extern class Tagdb g_tagdb;
extern class Tagdb g_tagdb2;
extern class Turkdb g_turkdb;
//extern class Tagdb g_sitedb;
bool sendPageTagdb ( TcpSocket *s , HttpRequest *req ) ;
///////////////////////////////////////////////
//
// Msg8a gets TagRecs from Tagdb
//
///////////////////////////////////////////////
// msg8a needs to keep a ptr to the tags it
// generates in the "inheritance loop". so don't
// store more than this! log if we do.
//#define MAX_TAGS 128
//#define MSG8A_MAX_REQUEST_SIZE 2048
// this msg class is for getting AND adding to tagdb
class Msg8a {
public:
Msg8a ();
~Msg8a ();
void reset();
// . get records from multiple subdomains of url
// . calls g_udpServer.sendRequest() on each subdomain of url
// . all matching records are merge into a final record
// i.e. site tags are also propagated accordingly
// . closest matching "site" is used as the "site" (the site url)
// . stores the tagRec in your "tagRec"
bool getTagRec ( Url *url ,
char *site , // set to NULL to auto set
//char *coll ,
collnum_t collnum,
//bool useCanonicalName ,
bool skipDomainLookup ,
int32_t niceness ,
void *state ,
void (* callback)(void *state ),
TagRec *tagRec ,
bool doInheritance = true ,
char rdbId = RDB_TAGDB);
bool launchGetRequests();
void gotAllReplies ( ) ;
// some specified input
//char *m_coll;
//int32_t m_collLen;
Url *m_url;
//bool m_doFullUrl;
char m_rdbId;
collnum_t m_collnum;
void (*m_callback ) ( void *state );
void *m_state;
Msg0 m_msg0s[MAX_TAGDB_REQUESTS];
key128_t m_siteStartKey ;
key128_t m_siteEndKey ;
// hold possible tagdb records
//RdbList m_lists[MAX_TAGDB_REQUESTS];
int32_t m_niceness;
char *m_dom;
char *m_hostEnd;
char *m_p;
int32_t m_requests;
int32_t m_replies;
char m_doneLaunching;
int32_t m_errno;
// we set this for the caller
TagRec *m_tagRec;
// hacks for msg6b
void *m_parent;
int32_t m_slotNum;
// hack for MsgE
void *m_state2;
void *m_state3;
bool m_doInheritance;
};
/*
void handleRequest9a ( UdpSlot *slot , int32_t niceness ) ;
class Msg9a {
public:
Msg9a ();
~Msg9a();
void reset() ;
bool launchAddRequests ( ) ;
static bool registerHandler ( ) {
return g_udpServer.registerHandler ( 0x9a, handleRequest9a );};
// . returns false if blocked, true otherwise
// . sets errno on error
// . "sites" is a NULL-terminated list of space-separated urls
// . if "deleteTags" is false, then the tags will be added to the
/// the TagRecs specified by the sites in "sites". if a TagRec
// does not exist for a given "site" then it will be added just
// so we can add the Tags to it. If it does exist, we will
// just append the given Tags to it. Tags with the same tagType and
// and "user" will be replaced by these Tags in the tagRecPtrs[].
// . if "deleteTags" is true, then the Tags matching the Tags
// given will be removed from each TagRec. If a nonzero timestamp
// or a username that does not start with \0 or a non-zero data
// is specified in the Tag, then that must match the Tag
// being removed as well. In this way we can remove specific
// Tags from a list of Tags that share the same m_tagType.
// . if "deleteTagRecs" is false then the entire TagRec will be removed
// from Tagdb. tagRecPtrs must be NULL in this case.
// . if provided, the ip vector is 1-1 with the sites in "sitesPtrs[]"
// and we set the Tag::m_ip with the corresponding
// ip given by that. this allows XmlDoc.cpp to "ip stamp" a tag.
// and the tag will be rendered invalid by XmlDoc.cpp if the ip of
// the site changes from that! in this way we can invalidate tags
// when a site changes ownership. assuming it also changes ip!
// . you can now pass in either "sites" or sitePtrs/numSitePtrs,
// whichever is easiest for you...
bool addTags ( char *sites ,
char **sitePtrs ,
int32_t numSitePtrs ,
char *coll ,
void *state ,
void (*callback)(void *state) ,
int32_t niceness ,
TagRec *tagRec ,
bool nukeTagRecs ,
int32_t *ipVector );//= NULL );
// like above, but we are adding the output of a
// './gb dump S main 0 -1 1' cmd
bool addTags ( char *dumpFile ,
char *coll ,
void *state ,
void (*callback)(void *state) ,
int32_t niceness );
void (*m_callback ) ( void *state );
void *m_state;
int32_t m_errno;
int32_t m_requests;
int32_t m_replies;
char *m_requestBuf;
int32_t m_requestBufSize;
char *m_p;
char *m_pend;
int32_t m_niceness;
};
*/
int32_t getY ( int64_t X , int64_t *x , int64_t *y , int32_t n ) ;
#endif
// Lookup order for the url hostname.domainname.com/mydir/mypage.html
// (aka 1.2.3.4/mydir/mypage.html):
// . hostname.domainname.com/mydir/mypage.html
// . hostname.domainname.com/mydir/
// . hostname.domainname.com
// . domainname.com/mydir/mypage.html
// . domainname.com/mydir/
// . domainname.com
// . 1.2.3.4 /mydir/mypage.html
// . 1.2.3.4 /mydir/
// . 1.2.3.4
// . 1.2.3