#include "gb-include.h"
#include <sys/stat.h>
#include "Titledb.h"
#include "Tagdb.h"
#include "Categories.h"
#include "Unicode.h"
#include "Threads.h"
#include "Msg1.h"
#include "HttpServer.h"
#include "Pages.h"
#include "SiteGetter.h"
#include "HashTableX.h"
#include "Users.h"
#include "Process.h"
#include "Rebalance.h"
// Forward declaration of a file-local callback that takes an opaque state
// pointer. Its definition is not in this part of the file.
static void gotMsg0ReplyWrapper ( void *state );
//static void gotReplyWrapper9a ( void *state , UdpSlot *slot ) ;
//static void gotList ( void *state , RdbList *xxx , Msg5 *yyy ) ;
//static void sendReply9a ( void *state ) ;
// File-local hash table and a flag that starts out false.
// NOTE(review): presumably s_initialized records one-time setup of s_ht --
// confirm against the code that uses them.
static HashTable s_ht;
static bool s_initialized = false;
// to stdout
long Tag::print ( ) {
SafeBuf sb;
printToBuf ( &sb );
// dump that
return fprintf(stderr,"%s\n",sb.getBufStart());
// . appends an ASCII rendering of this tag to "sb"
// . Tag::setFromBuf() is documented as parsing this output
// . returns false if printDataToBuf() fails, true otherwise
// . NOTE(review): some safePrintf() argument lines and the statements that
//   used "timeStruct" and "tmp" are not visible in this copy -- confirm
//   against the full source before relying on the exact output format
bool Tag::printToBuf ( SafeBuf *sb ) {
sb->safePrintf("k.hsthash=%016llx k.duphash=%08lx k.sitehash=%08lx ",
// print the tagname
sb->safePrintf ( "TAG=%s,\"%s\",",
getUser() );
// data size
//sb->safePrintf( "%li,", (long)getTagDataSize());
// print the date when this tag was added
// localtime() takes a time_t pointer, so go through a time_t local
time_t ts = m_timestamp;
struct tm *timeStruct = localtime ( &ts );
char tmp[100];
// print the time as a long, seconds since epoch
// print the ip added from
// print the tag id
// key.n1 is hash of the subdomain i think
// append the tag data itself; give up if that fails
if ( ! printDataToBuf ( sb ) ) return false;
// final quote
return true;
// . "site" can also be a specific url, but it must be normalized
// . i.e. of the form
// . fills in m_type, m_timestamp, m_ip, m_buf/m_bufSize, m_key and
//   m_recDataSize from the arguments
// . m_buf layout: one byte holding (user length + 1), the user name, a \0,
//   then the tag data; a \0 is appended when the data did not end in one
// . "user" is truncated to 126 bytes
// . NOTE(review): "user" and "data" are handed to memcpy() even when NULL,
//   and nothing visible here checks the writes against the size of m_buf
void Tag::set ( char *site ,
char *tagname ,
long timestamp ,
char *user ,
long ip ,
char *data ,
long dataSize ) {
// get type from name
m_type = getTagTypeFromStr ( tagname , strlen(tagname) );
// sanity
//isTagTypeIndexable ( m_type );
m_timestamp = timestamp;
m_ip = ip;
long userLen = 0;
if ( user ) userLen = gbstrlen(user);
// truncate the user name to 126 bytes (127 with its \0)
if ( userLen > 126 ) userLen = 126;
// first byte is size of user, then user plus \0 then data
//m_bufSize = 1 + userLen + 1 + dataSize;
// "site" must skip http://
//long slen = gbstrlen(site);
//if ( slen > 8 && strncasecmp(site,"http://",7)==0 )
// site += 7;
//else if ( slen > 8 && strncasecmp(site,"https://",8)==0 )
// site += 8;
// normalize
// NOTE(review): "norm" is set here but every visible use of it below is
// commented out; the key is built from the raw "site" string
Url norm;
norm.set ( site );
// store user into special buffer
//long ulen = 0;
//if ( user ) {
// ulen = gbstrlen(user);
// if ( ulen > 7 ) ulen = 7;
//memset ( m_user , 0 , 8 );
//memcpy ( m_user , user , ulen );
char *p = m_buf;
// store size (includes \0)
*p++ = userLen + 1;
// then user name
memcpy ( p , user , userLen );
p += userLen;
// then \0
*p++ = '\0';
// store data now too
memcpy ( p , data , dataSize );
p += dataSize;
// NULL terminate if they did not! now all tag are strings and must
// be NULL terminated.
// p[-1] is the last byte written so far (the last data byte, or the user's
// \0 when dataSize is 0)
if ( data && p[-1] ) { // data && m_data[dataSize-1] ) {
//m_data[dataSize] = '\0';
*p++ = '\0';
// set it
m_bufSize = p - m_buf;
// top X bits should be hash of the domain only so all recs are on the
// same host near each other
//m_key.n1 = hash32 ( norm.getDomain() , norm.getDomainLen());
// too many tags were being read when k.n1 was the domain hash for
// sites like that had hundreds of subdomains. so go based on
// host instead.
// CRAP: using 32 bit hash we get collisions for crap like
// and
// so let's move to 64bit keys
//m_key.n1 = hash64 ( norm.getHost() , norm.getHostLen());
// i had to make this the hash of the site, not host,
// because
// was making the rdblist a few megabytes big!!
m_key.n1 = hash64n ( site );
// assume we are unique tag, that many of this type can exist
uint32_t upper32 = getDedupHash(); // m_type;
// if we are NOT unique... then hash username and data. thus we only
// replace a key if its the same tagtype, username and data. that
// way it will just update the timestamp and/or ip.
if ( ! isTagTypeUnique ( m_type ) ) {
// start hashing here
char *startHashing = (char *)&m_type;
// end here. include username (and tag data!)
char *endHashing = m_buf + m_bufSize;
// hash this many bytes
// NOTE(review): this assumes m_type sits before m_buf in the object with
// only fields that should be hashed in between -- confirm in Tagdb.h
long hashSize = endHashing - startHashing;
// . set key
upper32 = hash32 ( startHashing , hashSize );
// put in upper 32
m_key.n0 = upper32;
// shift it up
m_key.n0 <<= 32;
// . then or in url hash
// . for the site "" this included the port!
// but for the most part if the site is just a hostname then
// this is basically just a hostname, too, but the hash will
// include the http:// and the ending /
// . was added as a site. so it has the
// same m_key.n1 as, but this part is different
// here. this is the full site hash really. so during the lookup
// i'd say filter out such tags if they don't match the site you
// are looking up.
//m_key.n0 |= (uint32_t) hash32 ( norm.getUrl() , norm.getUrlLen() );
// set positive bit so its not a delete record
m_key.n0 |= 0x01;
// the size of this class as an Rdb record
m_recDataSize = m_bufSize + sizeof(Tag) - sizeof(key128_t) - 4;
// . return # of ascii chars scanned in "p"
// . return 0 on error
// . parses output of printToBuf() above
// . k.n1=0x695b3 k.n0=0xa4118684fa4edf93 version=0 TAG=ruleset,"mwells",Jan-02-2009-18:26:04,<timestamp>,,3735437892,36 TAG=blog,"mwells",Jan-02-2009-18:26:04,,2207516434,1 TAG=site,"tagdb",Jan-02-2009-18:26:04,,833534375,
// . fills in m_type, the user part of m_buf, m_timestamp and m_ip, then
//   hands the rest to setDataFromBuf()
// . cores (deliberate NULL write) on an unknown tag type or if the stored
//   buffer does not end in \0
// . NOTE(review): several "skip comma"/"skip quote" comments below have no
//   statement after them in this copy; the pointer advances they describe
//   appear to be missing -- confirm against the full source
long Tag::setFromBuf ( char *p , char *pend ) {
// save our place
char *start = p;
// tags always start with " TAG="
if ( strncmp(p," TAG=",5) ) {
log("tagdb: error processing tag in setFromBuf().");
return 0;
// skip that
p += 5;
// get the type
char *type = p;
// get type length
while ( p < pend && *p != ',' ) p++;
// error?
if ( p == pend ) return 0;
// that is the length
long typeLen = p - type;
// convert to number
m_type = getTagTypeFromStr ( type , typeLen );
// panic?
if ( m_type == -1 ) { char *xx=NULL;*xx=0;}
// now the user, skip comma and quote
// data buffer
char *dst = m_buf;
// point to it
char *user = p;
// get end of it
while ( p < pend && *p != '\"' ) p++;
// error?
if ( p == pend ) return 0;
// set length
long userLen = p - user;
// clamp the user name to 126 bytes, the same limit Tag::set() applies
if ( userLen > 126 ) userLen = 126;
// copy it over into us
//memcpy ( m_user , user , userLen );
// NULL terminate
//m_user[userLen] = '\0';
// first byte is username size
*dst++ = userLen+1;
// then the username
memcpy ( dst , user , userLen );
dst += userLen;
// and finall null termination
*dst++ = '\0';
// skip quote and comma
// now the datasize
//long m_dataSize = atoi(p);
// skip till comma
//while ( p < pend && *p != ',' ) p++;
// error?
//if ( p == pend ) return 0;
// skip comma
// that is the time stamp in canonical form
// skip till comma
while ( p < pend && *p != ',' ) p++;
// error?
if ( p == pend ) return 0;
// skip comma
// save start
char *ts = p;
// skip until comma again
while ( p < pend && *p != ',' ) p++;
// error?
if ( p == pend ) return 0;
// this is the timestamp in seconds since epoch
m_timestamp = atoi(ts);
// skip comma
// ip address as text
char *ips = p;
// skip until comma again
while ( p < pend && *p != ',' ) p++;
// error?
if ( p == pend ) return 0;
// convert it to binary
m_ip = atoip ( ips , p - ips );
// skip comma
// get the tag identifier
//m_tagId = atol(p);
//sscanf ( p , "%lu,",&m_tagId);
//long long big = atoll(p);
//m_tagId = (long)big;
// skip until comma again
//while ( p < pend && *p != ',' ) p++;
// error?
//if ( p == pend ) return 0;
// skip comma
// as a hack for now, override this, because before we were not 100%
// strings as tags, we had single byte values being printed out as
// strings of 3 bytes
//char *e = p;
//while ( e < pend && ! is_wspace_a(*e) ) e++;
//if ( e > pend ) return 0;
//m_dataSize = e - p;
// add in a \0
// . now is the data
// . return # of chars scanned in "p"
p += setDataFromBuf ( p , pend );
// . sanity check
// . all tags must be NULL terminated now
if ( m_buf[m_bufSize-1] != '\0' ) {char *xx=NULL; *xx=0; }
// we reset this since we now require that all tags are NULL terminated
// strings
//m_tagId = hash32 ( (char *)this,(long)sizeof(Tag)+m_dataSize , 0 );
// 0 is not valid
//if ( m_tagId == 0 ) m_tagId = 1;
// return how many bytes we read
return p - start;
// . return # of chars scanned in "p"
// . return 0 on error
// . expects the user part of m_buf to be filled in already: the write
//   position is found from the size byte at m_buf[0]
// . sets m_bufSize
// . NOTE(review): the statements that copy the data into "dst" are not
//   visible in this copy, and everything after the first "return" looks like
//   legacy binary-tag code that can no longer run -- confirm against the
//   full source
long Tag::setDataFromBuf ( char *p , char *pend ) {
// string are special
//if ( isTagTypeString ( m_type ) ) {
// skip over username in the buffer to point to where to put tag data
// (*m_buf is the stored user size including its \0, +1 for the size byte)
char *dst = m_buf + *m_buf + 1;
// stop at space of
// advance
dst += (pend-p);
// update
m_bufSize = dst - m_buf;
// should be end delimter
char c = m_buf[m_bufSize-1];
// sanity check
if ( c && ! isspace(c) ) { char *xx=NULL;*xx=0; }
// strings are always NULL terminated, the datasize should
// include the NULL termination
// we basically insert the \0, and *p should point to the space
// right after the string...! so return m_dataSize - 1
return m_bufSize - 1;
// save it to count
char *start = p;
// print as decimal if just 1 byte
if ( m_dataSize == 1 ) {
long v = atoi(p);
if ( v > 256 ) { char *xx=NULL;*xx=0; }
m_data[0] = v;
// skip till whitespace or end
while ( p < pend && isdigit(*p) ) p++;
return p - start;
// skip 0x
if ( *p!='0' || *(p+1)!='x' ) { char *xx=NULL;*xx=0; }
p += 2;
// convert hexadecimal string into binary
long bytesStored = hexToBinary ( p , pend , m_data , false );
// sanity check
if ( bytesStored != m_dataSize ) { char*xx=NULL;*xx=0;}
// advance p, each byte is two characters
p += bytesStored * 2;
// return # of bytes in "p" we scanned
return p - start;
// . converts the hex digits in [src,srcEnd) into bytes stored at "dst"
// . two hex digits (upper or lower case) make one byte, high nibble first
// . stops at the first character that is not a hex digit
// . if "decrement" is true each byte is stored one address lower than the
//   previous one, otherwise one address higher
// . returns (final dst - initial dst): the number of bytes stored when
//   "decrement" is false, and the negative of that when it is true
// . cores (deliberate NULL write) if the input ends between the two digits
//   of a byte
long hexToBinary ( char *src , char *srcEnd , char *dst , bool decrement ) {
	// keep tabs on how many bytes we store into "dst"
	char *start = dst;
	// read in hex values
	while ( src < srcEnd ) {
		// get FIRST hex digit
		unsigned char v;
		v = *(unsigned char *)src;
		if      ( v >= 'a' && v <= 'f' ) v = v - 'a' + 10;
		else if ( v >= 'A' && v <= 'F' ) v = v - 'A' + 10;
		else if ( v >= '0' && v <= '9' ) v = v - '0';
		else break;
		// sanity check
		if ( v >= 16 ) { char *xx=NULL;*xx=0;}
		// next character. without this advance the same digit would
		// be read again as the low nibble and the loop never ends.
		src++;
		// store it in the destination
		*dst = v;
		// sanity check, need one more char FOR SURE!
		if ( src >= srcEnd ) { char*xx=NULL;*xx=0;}
		// get the SECOND hex digit of this byte
		v = *(unsigned char *)src;
		if      ( v >= 'a' && v <= 'f' ) v = v - 'a' + 10;
		else if ( v >= 'A' && v <= 'F' ) v = v - 'A' + 10;
		else if ( v >= '0' && v <= '9' ) v = v - '0';
		else break;
		// sanity check
		if ( v >= 16 ) { char *xx=NULL;*xx=0;}
		// next character
		src++;
		// shift last guy up 4 bits
		*dst = *dst << 4;
		// or in the new guy
		*dst |= v;
		// point to next byte now
		if ( decrement ) dst--;
		else             dst++;
	}
	return dst - start;
}
// . appends this tag's data to "sb" as text, one character at a time
// . stops at the first \0 or after getTagDataSize() bytes, whichever comes
//   first
// . always returns true
// . the old hex/decimal printing that used to follow the return could never
//   run and has been dropped
bool Tag::printDataToBuf ( SafeBuf *sb ) {
	char *data     = getTagData();
	long  dataSize = getTagDataSize();
	// because of a bug of not appending the \0 and incrementing
	// Tag::m_dataSize when we should have, we must deal with this!
	// test the bound BEFORE reading data[i] so data that lacks its \0 is
	// never read one byte past its recorded size.
	for ( long i = 0 ; i < dataSize && data[i] ; i++ )
		sb->safePrintf ( "%c" , data[i] );
	return true;
}
// . appends this tag to "sb" as "<tagname>.user=...&<tagname>.time=..."
//   style parameters followed by the tag data
// . returns false if printDataToBuf() fails, true otherwise
// . NOTE(review): the comments about the ip, tag id and score have no
//   statement after them in this copy -- confirm against the full source
bool Tag::printToBufAsAddRequest ( SafeBuf *sb ) {
// print the tagname
char *str = getTagStrFromType ( m_type );
// print the user that added this tag
sb->safePrintf ( "%s.user=%s" , str , getUser() );
// print the date when this tag was added
sb->safePrintf ("&%s.time=%li", str, m_timestamp );
// print the ip added from
// print the tag id
// the "score"
// print the m_data
if ( ! printDataToBuf ( sb ) ) return false;
return true;
// . appends this tag to "sb" as a <tag> XML element
// . returns false if printDataToBuf() fails, true otherwise
// . NOTE(review): the arguments and closing of the first safePrintf() and
//   the date/ip/id output are not visible in this copy -- confirm against
//   the full source
bool Tag::printToBufAsXml ( SafeBuf *sb ) {
// print the tagname
char *str = getTagStrFromType ( m_type );
// print the user that added this tag
sb->safePrintf ("\t\t<tag>\n\t\t\t<name>%s</name>\n\t\t\t<user>%s",
// print the date when this tag was added
// print the ip added from
// print the tag id
// the "score"
// print the m_data
if ( ! printDataToBuf ( sb ) ) return false;
return true;
//if ( ! sb->safePrintf("\t\t<eventTagFromTagdb>"
// "<![CDATA[") )
// . appends this tag to "sb" as an <eventTagdbTag> XML element
// . returns false if printDataToBuf() fails, true otherwise
// . NOTE(review): most of the safePrintf() format and arguments are not
//   visible in this copy -- confirm against the full source
bool Tag::printToBufAsXml2 ( SafeBuf *sb ) {
// print the tagname
char *str = getTagStrFromType ( m_type );
// print the user that added this tag
sb->safePrintf ("\t\t<eventTagdbTag>\n"
// who added the tag:
// when tag was added:
// ip added from
// name of the tag:
// the tag data
// print the m_data
if ( ! printDataToBuf ( sb ) ) return false;
return true;
// . appends this tag to "sb" as the start of an HTML table row: "prefix" in
//   the first cell, then the tag name, value, user, time and ip
// . returns false if printDataToBuf() fails, true otherwise
// . NOTE(review): the statements that format "timeStruct" into "tmp" and
//   print it are not visible in this copy -- confirm against the full source
bool Tag::printToBufAsHtml ( SafeBuf *sb , char *prefix ) {
// print the tagname
char *str = getTagStrFromType ( m_type );
// print the user that added this tag
sb->safePrintf ("<tr><td>%s</td><td><b>%s</b>", prefix, str);
// the "score"
sb->safePrintf(" value=<b>");
// print the m_data
if ( ! printDataToBuf ( sb ) ) return false;
// print the date when this tag was added
sb->safePrintf("</b> user=%s time=",getUser());
// localtime() takes a time_t pointer, so go through a time_t local
time_t ts = m_timestamp;
struct tm *timeStruct = localtime ( &ts );
char tmp[100];
// print the ip added from
sb->safePrintf(" ip=%s",iptoa(m_ip));
//sb->safePrintf(" id=%lu",(long)m_tagId);
return true;
// . appends this tag's data followed by a space to "sb"
// . returns false if printDataToBuf() fails, true otherwise
// . NOTE(review): "str" is not used before the first "return true" in this
//   copy, and the statements after that return cannot run as written --
//   confirm against the full source whether a "<name>:" prefix is expected
bool Tag::printToBufAsTagVector ( SafeBuf *sb ) {
// print the tagname
char *str = getTagStrFromType ( m_type );
// print strings data types special
//if ( isTagTypeString ( m_type ) ) {
//sb->safePrintf("%s:%s ",str,m_data);
// print the m_data
if ( ! printDataToBuf ( sb ) ) return false;
sb->safePrintf(" ");
return true;
// print the user that added this tag
sb->safePrintf ("%s:", str );
if ( ! printDataToBuf ( sb ) ) return false;
sb->safePrintf(" ");
return true;
// . returns true if this tag's m_type equals hash32n() of the tag name "t"
bool Tag::isType ( char *t ) {
	long wanted = hash32n ( t );
	if ( m_type == wanted ) return true;
	return false;
}
// A freshly constructed TagRec has no list pointers.
TagRec::TagRec ( ) {
m_numListPtrs = 0;
// . explicit initializer: clears the list pointer count and calls
//   constructor() on each of the MAX_TAGDB_REQUESTS entries of m_lists
void TagRec::constructor ( ) {
	m_numListPtrs = 0;
	// run a constructor on the lists
	long n = 0;
	while ( n < MAX_TAGDB_REQUESTS ) {
		m_lists[n].constructor();
		n++;
	}
}
// Destructor. NOTE(review): its body is not visible in this copy -- confirm
// against the full source what cleanup it performs.
TagRec::~TagRec ( ) {
// . clears the list pointer count and walks all MAX_TAGDB_REQUESTS lists
// . NOTE(review): the statement executed for each list is not visible in
//   this copy -- confirm against the full source
void TagRec::reset ( ) {
m_numListPtrs = 0;
for ( long i = 0 ; i < MAX_TAGDB_REQUESTS ; i++ )
// . looks a tag up by its type name
// . converts "tagTypeStr" with getTagTypeFromStr() and defers to getTag2()
Tag *TagRec::getTag ( char *tagTypeStr ) {
	return getTag2 ( getTagTypeFromStr ( tagTypeStr ) );
}
// . returns the first tag whose m_type equals "tagType", or NULL if none
// . a tag whose type is TT_DUP is never returned
Tag *TagRec::getTag2 ( long tagType ) {
	Tag *cur = getFirstTag();
	while ( cur ) {
		// must match the wanted type and must not be a dup marker
		if ( cur->m_type == tagType && cur->m_type != TT_DUP )
			return cur;
		cur = getNextTag ( cur );
	}
	// nothing matched
	return NULL;
}
// . functions to act on a site "tag buf", like that in Msg16::m_tagRec
// . first 2 bytes is size, 2nd to bytes is # of tags, then the tags
// . this overload takes the tag type as a name: it converts "tagTypeStr"
//   with getTagTypeFromStr() and forwards every other argument unchanged to
//   the overload that takes a numeric tag type
long TagRec::getLong ( char *tagTypeStr,
		       long defalt ,
		       Tag **bookmark ,
		       long *timestamp ,
		       char **user ) {
	long typeId = getTagTypeFromStr ( tagTypeStr );
	return getLong ( typeId , defalt , bookmark , timestamp , user );
}
// . returns the value of the first tag of type "tagType", parsed as a
//   signed decimal with atol2(), or "defalt" if there is no such tag
// . if "bookmark" is NULL the scan starts at the first tag, otherwise at
//   the tag after *bookmark (getNextTag() treats a NULL *bookmark as "start
//   from the first tag")
// . on a match, *bookmark, *timestamp and *user are filled in when the
//   corresponding pointer is non-NULL
// . tags of type TT_DUP are skipped
long TagRec::getLong ( long tagType ,
		       long defalt ,
		       Tag **bookmark ,
		       long *timestamp ,
		       char **user ) {
	// start here
	Tag *tag ;
	if ( ! bookmark ) tag = getFirstTag();
	else              tag = getNextTag ( *bookmark );
	// loop over all tags in the buf
	for ( ; tag ; tag = getNextTag ( tag ) ) {
		// skip if not a match
		if ( tag->m_type != tagType ) continue;
		// skip dups
		if ( tag->m_type == TT_DUP ) continue;
		// the data and its size
		char *data     = tag->getTagData();
		long  dataSize = tag->getTagDataSize();
		// if ends in NULL trunc it. only look at the last byte when
		// there is one, so an empty tag never reads data[-1].
		if ( dataSize > 0 && data[dataSize-1] == '\0' ) dataSize--;
		// convert string to value, MUST be signed!!!
		long score = atol2(data,dataSize);
		// bookmark, et al
		if ( bookmark  ) *bookmark  = tag;
		if ( timestamp ) *timestamp = tag->m_timestamp;
		if ( user      ) *user      = tag->getUser();
		return score;
	}
	// not found
	return defalt;
}
// . returns the value of the first tag named "tagTypeStr", parsed as a
//   decimal with atoll2(), or "defalt" if there is no such tag
// . if "bookmark" is NULL the scan starts at the first tag, otherwise at
//   the tag after *bookmark
// . on a match, *bookmark, *timestamp and *user are filled in when the
//   corresponding pointer is non-NULL
// . tags of type TT_DUP are skipped
long long TagRec::getLongLong ( char *tagTypeStr,
				long long defalt ,
				Tag **bookmark ,
				long *timestamp ,
				char **user ) {
	long tagType = getTagTypeFromStr ( tagTypeStr );
	// start here
	Tag *tag ;
	if ( ! bookmark ) tag = getFirstTag();
	else              tag = getNextTag ( *bookmark );
	// loop over all tags in the buf
	for ( ; tag ; tag = getNextTag ( tag ) ) {
		// skip if not a match
		if ( tag->m_type != tagType ) continue;
		// skip dups
		if ( tag->m_type == TT_DUP ) continue;
		// the data and its size
		char *data     = tag->getTagData();
		long  dataSize = tag->getTagDataSize();
		// if ends in NULL trunc it. only look at the last byte when
		// there is one, so an empty tag never reads data[-1].
		if ( dataSize > 0 && data[dataSize-1] == '\0' ) dataSize--;
		// now everything is a string
		long long score = atoll2(data,dataSize);
		// bookmark, et al
		if ( bookmark  ) *bookmark  = tag;
		if ( timestamp ) *timestamp = tag->m_timestamp;
		if ( user      ) *user      = tag->getUser();
		return score;
	}
	// not found
	return defalt;
}
char *TagRec::getString ( char *tagTypeStr,
char *defalt ,
long *size ,
Tag **bookmark ,
long *timestamp ,
char **user ) {
long tagType = getTagTypeFromStr ( tagTypeStr );
// start here
Tag *tag ;
if ( ! bookmark ) tag = getFirstTag();
else tag = getNextTag ( *bookmark );
// loop over all tags in the buf
for ( ; tag ; tag = getNextTag ( tag ) ) {
// skip if not a match
if ( tag->m_type != tagType ) continue;
// skip dups
if ( tag->m_type == TT_DUP ) continue;
// want size? includes \0 probably
if ( size ) *size = tag->getTagDataSize();//m_dataSize;
// bookmark, et al
if ( bookmark ) *bookmark = tag;
if ( timestamp ) *timestamp = tag->m_timestamp;
if ( user ) *user = tag->getUser();
// return it
return tag->getTagData();//m_data;
// not found
return defalt;
// . adds a "negative" tag: one with no data, no user, no ip and a zero
//   timestamp
// . such a tag tells Msg9a to delete all tags of this tag type before adding
//   any other tags of this type that we might have
// . returns whatever addTag() returns
bool TagRec::addDelTag ( char *tagTypeStr ) {
	// timestamp, user, ip, data, dataSize are all empty
	return addTag ( tagTypeStr , 0 , NULL , 0 , NULL , 0 );
}
// . returns false and sets g_errno on error
// . builds a Tag in a stack buffer from the arguments and passes it to
//   addTag(Tag*)
// . this code uses the m_user/m_data/m_dataSize/m_tagId fields rather than
//   the m_buf layout that Tag::set() writes
// . NOTE(review): closing braces are missing in this copy. Also worth
//   checking against the full source:
//   - the hash below covers sizeof(tag), the size of a POINTER, plus
//     dataSize bytes; sizeof(Tag) looks like what was meant, but changing it
//     would change every generated m_tagId
//   - the "is >= 65536" log text no longer matches its "dataSize < 0" check
//   - m_user is only zeroed when "user" is non-NULL
//   - m_data[dataSize-1] is read even when dataSize is 0
bool TagRec::addTag ( char *tagTypeStr,
long timestamp ,
char *user ,
long ip ,
char *data ,
long dataSize ) {
// get the tagType
long tagType = getTagTypeFromStr ( tagTypeStr );
// breach check
if ( dataSize + sizeof(Tag) > MAX_TAGREC_SIZE ) {
return log("tagdb: no room to add tag");
// the Tag::m_dataSize is only 2 bytes... NOT ANYMORE, MDW
if ( dataSize < 0 ) { // >= 65536 ) {
return log("tagdb: tag dataSize of %li is >= 65536. "
"Bad value.", dataSize);
// sanity check -- no binary chars allowed, must all be strings!
// BUT they can have an empty string (i.e. just \0)
if ( dataSize == 1 && data[0] < 9 && data[0] >= 0 && data[0] ) {
char *xx=NULL;*xx=0; }
// make a tag
char buf[MAX_TAGREC_SIZE];
Tag *tag = (Tag *)buf;
// fill it in
tag->m_type = tagType;
tag->m_timestamp = timestamp;
tag->m_ip = ip;
tag->m_dataSize = dataSize;
// dummy value for now
tag->m_tagId = 0;
// careful!
if ( sizeof(Tag) + dataSize + 10 > MAX_TAGREC_SIZE ) {
return log("tagdb: no room to add tag data");
// store user into special buffer
long ulen = 0;
if ( user ) {
ulen = gbstrlen(user);
// only the first 7 bytes of the user name are kept
if ( ulen > 7 ) ulen = 7;
memset ( tag->m_user , 0 , 8 );
memcpy ( tag->m_user , user , ulen );
// store data now too
memcpy ( tag->m_data , data , dataSize );
// NULL terminate if they did not! now all tag are strings and must
// be NULL terminated.
if ( data && tag->m_data[dataSize-1] ) {
tag->m_data[dataSize] = '\0';
// the id is the hash for now (MDW)
tag->m_tagId = hash32 ( (char *)tag,(long)sizeof(tag)+dataSize , 0 );
// 0 is not valid
if ( tag->m_tagId == 0 ) tag->m_tagId = 1;
// now add that tag
return addTag ( tag );
// . returns false and sets g_errno on error
// . removes any existing tag that TAG should replace (same type and user,
//   and for non-unique tag types the same data as well), then copies TAG to
//   the end of the record
// . if TAG is a "site" tag, m_key is set from that site
// . cores (deliberate NULL write) on an empty user for a tag with data, on
//   a zero m_tagId, and on a "site" value that is unterminated or starts
//   with http://
// . NOTE(review): in this copy the "loop" label targeted by "goto loop", the
//   statement controlled by the memcmp() test on the data, and several
//   closing braces are missing -- confirm against the full source
bool TagRec::addTag ( Tag *TAG ) {
// . do not allow empty user
// . but "del tags" i.e. "negative tags" can have no user
if ( TAG->m_dataSize>0 && (!TAG->m_user || TAG->m_user[0] == '\0') ) {
char *xx=NULL;*xx=0;}
// sanity check
if ( TAG->m_tagId == 0 ) { char *xx=NULL;*xx=0;}
// come back up here if we did a remove operation
// start at the first tag
Tag *tag = getFirstTag();
// loop over all tags in the buf, see if we got a dup
for ( ; tag ; tag = getNextTag ( tag ) ) {
// skip if not matching id
if ( tag->m_type != TAG->m_type ) continue;
// skip if does not match user
if ( memcmp(tag->m_user,TAG->m_user,7) ) continue;
// data now has to match too, so we will allow tags of the
// same type from the same user to be added if they have
// different data now. i would only do this for strings,
// but for longs and chars i would skip this check...
// so only replace "unique" tags of the same type.
// mostly strings and embedded tag recs will be non-unquie
if ( ! isTagTypeUnique ( tag->m_type ) ) {
if ( tag->m_dataSize != TAG->m_dataSize ) continue;
if ( memcmp(tag->m_data,TAG->m_data,tag->m_dataSize))
// Msg8a allows multiple ST_SITE tags in order to indicate
// what sites the other tags came from (i.e. used by the
// inheritance loop below)
// MDW: This is now covered by isTagTypeUnique() above.
//if ( tag->m_type == ST_SITE ) continue;
// it does match, so replace it!
//removeTags ( tag->m_type , tag->m_user );
removeTag ( tag );
// start from the top
goto loop;
// . ok, we "deduped" the tag
// . point to the end of the buf
char *p = getRecEnd();
// get the max end
char *pend = getMaxEnd();
// how much do we need?
long need = TAG->getSize();
// breach?
if ( p + need > pend ) {
char *site = getString("site","unknown");
log("tagdb: no room to add tag to buf. tagtype=%s "
"tagsize=%li site=%s",
getTagStrFromType ( TAG->m_type ) , need , site );
//char *xx=NULL;*xx=0;
return false;
// store it
memcpy ( p , TAG , need );
// update our counters
m_dataSize += need;
// SPECIAL: if it was ST_SITE, set our m_key, we are an Rdb record
//if ( TAG->m_type != ST_SITE ) return true;
if ( ! TAG->isType ("site") ) return true;
// set the key
Url u;
// convenience
char *site = TAG->m_data;
long size = TAG->m_dataSize;
// sanity check
if ( site[size-1] != '\0' ) { char *xx=NULL;*xx=0; }
// do not start with http:// ! wastes space!!
if (size>=8 && strncmp(site,"http://",7)==0 ) {
log("tagdb: don't sotre http:// in tags!");
char *xx=NULL;*xx=0;
// do not include the NULL
u.set ( site , size - 1 );
// set our key, the endKey is our "startKey"
m_key = g_tagdb.makeKey ( &u , false ); // isDelete?
// success, return true
return true;
// . name-based front end: converts "tagTypeStr" with getTagTypeFromStr() and
//   forwards to the overload that takes a numeric tag type
bool TagRec::removeTags ( char *tagTypeStr , char *user , long tagId ) {
	long typeId = getTagTypeFromStr ( tagTypeStr );
	return removeTags ( typeId , user , tagId );
}
// . removes matching tags from this record
// . if "tagId" is non-zero, a tag matches when its m_tagId equals it
// . if "tagId" is zero, a tag matches when its type equals "tagType" and,
//   when "user" is non-NULL, its first 7 user bytes equal those at "user"
// . after each removal the scan is meant to restart from the first tag
// . NOTE(review): the "loop" label targeted by "goto loop" and the closing
//   braces are missing in this copy; also memcmp() reads 7 bytes from
//   "user" even if that string is shorter -- confirm against the full source
bool TagRec::removeTags ( long tagType , char *user , long tagId ) {
// start at the first tag
Tag *tag = getFirstTag();
// loop over all tags in the rec, see if we got a dup
for ( ; tag ; tag = getNextTag ( tag ) ) {
// id if matches, that is good enough
if ( tagId && tag->m_tagId != tagId ) continue;
// skip if not matching id
if ( tagId == 0 && tag->m_type != tagType ) continue;
// skip if does not match user
if ( tagId == 0 && user && memcmp(tag->m_user,user,7))continue;
// remove that tag
removeTag ( tag );
// re do loop
goto loop;
// success
return true;
// . removes the tag at address "rmTag" from this record by sliding the tags
//   after it down over it and shrinking m_dataSize
// . cores (deliberate NULL write) if m_numTags is not exactly one less than
//   before
// . NOTE(review): the statement that decrements m_numTags and the closing
//   braces are missing in this copy. Also, "dst" and "src" are in the same
//   buffer and the copied range can overlap, which memcpy() does not
//   support; memmove() looks like the right call -- confirm against the full
//   source
bool TagRec::removeTag ( Tag *rmTag ) {
// save this
long oldn = m_numTags;
// start at the first tag
Tag *tag = getFirstTag();
// loop over all tags in the rec, see if we got a dup
for ( ; tag ; tag = getNextTag ( tag ) ) {
// must be it
if ( tag != rmTag ) continue;
// copy to here
char *dst = (char *)tag;
// size of tag we are removing
long size = tag->getSize();
// from here
char *src = dst + size;
// end of tag buffer
char *pend = getRecEnd();
// byte to move
long move = pend - src;
// slide the remaining tags down over the removed one
memcpy ( dst , src , move );
// decrement counts
m_dataSize -= size;
// sanity check
if ( m_numTags != oldn - 1 ) { char *xx=NULL;*xx=0; }
// success, return true
return true;
// . add all the tags from "tagRec" to our list of tags
// . pass 1: every tag in "tagRec" without data is a "negative" tag (see
//   TagRec::addDelTag()); for each one, remove all of OUR tags of that type
// . pass 2: add every tag in "tagRec" that has data, except that a "site"
//   tag is not copied over if we already have one
// . returns false as soon as an addTag() fails, true otherwise
bool TagRec::addTags ( TagRec *tagRec ) {
	Tag *src;
	// pass 1: honor the delete signals
	for ( src = tagRec->getFirstTag() ; src ; src = tagRec->getNextTag ( src ) ) {
		bool hasData = ( src->m_data && src->m_dataSize > 0 );
		// no data means: nuke all tags of this type
		if ( ! hasData ) removeTags ( src->m_type , NULL );
	}
	// pass 2: copy over the real tags
	for ( src = tagRec->getFirstTag() ; src ; src = tagRec->getNextTag ( src ) ) {
		// skip if it was a delete tag
		if ( src->m_dataSize <= 0 ) continue;
		// do not transfer over "site" tags if we already got one
		if ( src->isType("site") && getTag("site") ) continue;
		// add it, return false on error, g_errno should be set
		if ( ! addTag ( src ) ) return false;
	}
	return true;
}
// . for every tag in "tagRec" that is not a "site" tag, remove OUR tags of
//   the same type and user
// . returns false as soon as a removeTags() call fails, true otherwise
bool TagRec::removeTags ( TagRec *tagRec ) {
	Tag *cur = tagRec->getFirstTag();
	while ( cur ) {
		// do not remove "site" tags
		if ( ! cur->isType("site") ) {
			if ( ! removeTags ( cur->m_type , cur->m_user ) )
				return false;
		}
		cur = tagRec->getNextTag ( cur );
	}
	return true;
}
// . returns the tag that follows "tag" in this record, or NULL when "tag"
//   was the last one or the record holds no tags
// . a NULL "tag" yields the first tag, i.e. the start of m_buf
Tag *TagRec::getNextTag ( Tag *tag ) {
	// nothing stored at all
	if ( m_numTags == 0 ) return NULL;
	// NULL means "start from the beginning"
	if ( ! tag ) return (Tag *)m_buf;
	char *limit = getRecEnd();
	// the next tag begins right after this one
	char *next = (char *)tag + tag->getSize();
	// at or past the end of the record?
	if ( next < limit ) return (Tag *)next;
	return NULL;
}
// return the number of tags having the particular TagType
long TagRec::getNumTagTypes ( char *tagTypeStr ) {
long tagType = getTagTypeFromStr ( tagTypeStr );
long numTagType = 0;
// start at the first tag
Tag *tag = getFirstTag();
// loop over all tags in the buf, see if we got a dup
for ( ; tag ; tag = getNextTag ( tag ) ) {
// skip dups
if ( tag->m_type == TT_DUP ) continue;
// if there is tagType match then increment the count
if ( tag->m_type == tagType ) numTagType++;
return numTagType;
long TagRec::getNumTags ( ) {
long numTags = 0;
// start at the first tag
Tag *tag = getFirstTag();
// loop over all tags in the buf, see if we got a dup
for ( ; tag ; tag = getNextTag ( tag ) )
// skip dups
if ( tag->m_type != TT_DUP ) numTags++;
return numTags;
// . &tagtype%li=<tagtype>
// . &tagdata%li=<data>
// . &deltag%li=1 (to delete it)
// . set &user=mwells, etc. in cookie of HttpReqest, "r" for user
// . "this" TagRec's user, ip and timestamp will be carried over to "newtr"
// . returns false and sets g_errno on error
// . the target urls come from the "u" parameter or, if "u" is empty, from
//   the file named by "ufu"; with neither, this returns true without doing
//   anything
// . for each numbered tag in the request and each url, a tag is appended to
//   m_sbuf; a negative (delete) key is appended when the tag's key changed
//   or "deltag" was set
// . NOTE(review): this copy is missing closing braces and several statement
//   lines (e.g. part of the m_sbuf.addTag() argument list and the tail of
//   the safeMemcpy() call). It also uses "isNum", whose only visible
//   declaration is commented out, so that block may be disabled code in the
//   full source -- confirm before editing
bool TagRec::setFromHttpRequest ( HttpRequest *r, TcpSocket *s ) {
// clear it
// get the username from the cookie
//char *user = r->getStringFromCookie ( "username" , NULL );
//char *user = g_users.getUsername ( r );
// try from form
//if ( ! user ) user = r->getString ("username",NULL);
// if no user, don't bother!
//if ( ! user ) {
// g_errno = EBADENGINEER;
// return log("tagdb: no username supplied for modifying tagdb.");
// get the user ip address
long ip = 0;
if ( s ) ip = s->m_ip;
// get the time stamp
long now = getTimeGlobal();
// . loop over all urls/sites in text area
// . no! just use single url for now
// put all urls in this buffer
SafeBuf fou;
// try from textarea if the ST_SITE was not in the tag section
long uslen;
char *us = r->getString("u",&uslen);
if ( uslen <= 0 ) us = NULL;
if ( us ) fou.safeMemcpy ( us , uslen );
// read in file, file of urls
long ufuLen;
char *ufu = r->getString("ufu",&ufuLen);
if ( ufuLen <= 0 ) ufu = NULL;
if ( us ) ufu = NULL; // exclusive
if ( ufu ) fou.fillFromFile ( ufu );
// if st->m_urls has multiple urls, this "u" is not given in the
// http request! but a filename is... and Msg9::addTags() should add
// the ST_SITE field anyway...
if ( ! ufu && ! us ) return true;
// make it null terminated since we no longer do this automatically
// normalize it
//Url u; u.set ( us , uslen );
// point to it
//char *site = u.getUrl();
// skip http + ://
//site += u.getSchemeLen() + 3;
// include the \0
//long psize = gbstrlen(p) + 1;
// loop over all tags in the TagRec to mod them
// (the loop ends at the first index with no "tagtype<i>" parameter)
for ( long i = 0 ; ; i++ ) {
char buf[32];
sprintf ( buf , "tagtype%li",i );
char *tagTypeStr = r->getString(buf,NULL,NULL);
// if not there we are done
if ( ! tagTypeStr ) break;
// should we delete it?
sprintf ( buf , "deltag%li",i);
char *deltag = r->getString(buf,NULL,NULL);
//if ( deltag && deltag[0] ) continue;
sprintf ( buf , "taguser%li",i);
char *tagUser = r->getString( buf,NULL,"admin");//user);
//if ( tagUser && tagUser[0]==0 ) tagUser = user;
sprintf ( buf , "tagtime%li",i);
long tagTime = r->getLong(buf,now);
sprintf ( buf , "tagip%li",i);
long tagIp = r->getLong(buf,ip);
// get the value of this tag
sprintf ( buf , "tagdata%li" , i );
char *dataPtr = r->getString ( buf , NULL );
// get the tag original key
key128_t key;
sprintf ( buf , "tagn1key%li" , i );
key.n1 = r->getLongLong ( buf, 0 );
sprintf ( buf , "tagn0key%li" , i );
key.n0 = r->getLongLong ( buf, 0LL );
// if empty skip it
if ( ! dataPtr ) continue;
if ( ! dataPtr[0] ) continue;
// is it numeric? i think only ST_COMMENT is not
//char isNum = true;
// get the numeric
//long tagType = getTagTypeFromStr ( tagTypeStr );
// set "isNum" to false if not numeric
//if ( tagType == ST_COMMENT ) isNum = false;
//if ( tagType == ST_SITE ) isNum = false;
//if ( tagType == ST_META ) isNum = false;
//if ( isTagTypeString ( tagType ) ) isNum = false;
//long dataSize = 0;
// . if it is a string, like ST_COMMENT
// . include the \0
//if ( ! isNum ) dataSize = gbstrlen(dataPtr) + 1;
// everything is now a string
long dataSize = gbstrlen(dataPtr) + 1;
// if numeric store in tag buf
long long data;
if ( isNum ) {
data = atoll ( dataPtr );//r->getLongLong(val,-1);
dataSize = 1;
if ( data >= 0xffLL ) dataSize = 2;
if ( data >= 0xffffLL ) dataSize = 3;
if ( data >= 0xffffffLL ) dataSize = 4;
if ( data >= 0xffffffffLL ) dataSize = 5;
if ( data >= 0xffffffffffLL ) dataSize = 6;
if ( data >= 0xffffffffffffLL ) dataSize = 7;
dataPtr = (char *)&data;
// add to tag buf
//addTag ( tagTypeStr ,
// tagTime ,
// tagUser ,
// tagIp ,
// dataPtr ,
// dataSize );
// loop over all urls in the url file if provided
char *up = fou.getBufStart();
for ( ; ; ) {
// set url
char *urlPtr = up;
// stop if EOF or processed the one url
if ( ! urlPtr ) break;
// advance it or NULL it out
up = fou.getNextLine ( up );
// null term the url ptr
if ( up ) up[-1] = '\0';
// save buffer spot in case we have to rewind
long saved = m_sbuf.length();
// . add to tag rdb recs in safebuf
// . this pushes the rdbid as first byte
// . mdwmdwmdw
Tag *tag = m_sbuf.addTag ( urlPtr, // us, // site ,
tagTypeStr ,
tagTime ,
tagUser ,
tagIp ,
dataSize ,
// do not push rdbid into safebuf
false ) ;
// error?
if ( ! tag )
return false;
bool deleteOldKey = false;
// if tag has different key, delete the old one
if ( key.n1 && tag->m_key != key ) deleteOldKey = true;
// if del was marked, delete old one and do not add new one
if ( deltag && deltag[0] ) {
// rewind over the tag we were about to add
m_sbuf.setLength ( saved );
// and add as a delete
deleteOldKey = true;
if ( deleteOldKey ) {
// make it negative
// (clearing the low bit of n0; Tag::set() sets that bit to mark a key as
// not being a delete record)
key128_t delKey = key;
delKey.n0 &= 0xfffffffffffffffeLL;
if (! m_sbuf.safeMemcpy((char *)&delKey,
return false;
// all done
//if ( getTag ( ST_SITE ) ) return ;
//if ( getTag("site") ) return;
// add the special ST_SITE tag
//addTag ( "site" , // ST_SITE ,
// now ,
// user ,
// ip ,
// p ,
// psize );
return true;
// to stdout
long TagRec::print ( ) {
SafeBuf sb;
printToBuf ( &sb );
// dump that
return fprintf(stderr,"%s\n",sb.getBufStart());
// . appends the Tag::printToBuf() rendering of every tag in this record to
//   "sb", skipping tags of type TT_DUP
// . always returns true
bool TagRec::printToBuf ( SafeBuf *sb ) {
	Tag *cur = getFirstTag();
	while ( cur ) {
		// dups are not printed
		if ( cur->m_type != TT_DUP ) cur->printToBuf ( sb );
		cur = getNextTag ( cur );
	}
	return true;
}
// . return size of characters scanned from "p"
// . returns 0 on error
// . parses one tag at a time with Tag::setFromBuf() into a 5000-byte
//   scratch buffer and appends each parsed tag to m_sbuf
// . NOTE(review): in this copy the only statement that advances "p"
//   ("p += asciiBytesRead") is commented out, so the while loop as shown
//   would parse the same text repeatedly; closing braces are also missing.
//   Nothing visible checks that a parsed tag fits in "tbuf". Confirm against
//   the full source
long TagRec::setFromBuf ( char *p , char *pend ) {
// remember the start
char *start = p;
// scan in the key
//if ( strncmp(p,"k.n1=0x",7) != 0 ) return 0;
// skip key stuff
//p += 7;
// clear our key
// read in the key
//key_t k;
//sscanf(p,"k.n1=0x%08lx k.n0=0x%016llx ",&k.n1,&k.n0);
// now do it the fast way and compare the results!
//p += 7 ;
//hexToBinary ( p , pend , ((char *)&m_key.n1)+3 , true );
//p += 8 + 8;
//hexToBinary ( p , pend , ((char *)&m_key.n0)+7 , true );
// test it
//if ( m_key.n1 != k.n1 || m_key.n0 != k.n0 ) { char *xx=NULL; *xx=0; }
//p = strstr ( p , " version=");
// error?
//if ( ! p ) return 0;
// skip " version="
//p += 9;
// get version
//m_version = atoi(p);
// skip p until space
//while ( p < pend && *p != ' ' ) p++;
// error?
//if ( p >= pend ) return 0;
// skip the space -- NO! tag parser wants the space
// point to the where we should serialize the tags into
//char *tagPtr = m_buf;
// scratch space that each tag is parsed into before being copied to m_sbuf
char tbuf[5000];
while ( p < pend ) {
// now we should be pointing to the tag
Tag *tag = (Tag *)tbuf;
// serialize the tag from the buf
long asciiBytesRead = tag->setFromBuf ( p , pend );
// if bad this is 0
if ( asciiBytesRead == 0 ) return 0;
// store tag into our safebuf. return 0 with g_errno set on err
// . mdwmdwmdw
if ( ! m_sbuf.addTag ( tag ) ) return 0;
// point to next tag to read into our binary buffer
//p += asciiBytesRead;
// inc our ptr to point to next tag if it exists
//tagPtr += tag->getSize();
// inc our count in the TagRec
// adjust our tag buffer size, TagRec::m_dataSize
//m_dataSize = tagPtr - m_buf;
// hey, it includes the other crap too!
// it includes m_numTags + m_version, see Tagdb.h
//m_dataSize += 2 + 1;
// clear all lists
// now make list point to that
//m_lists[0].m_list = m_sbuf.getBufStart();
//m_lists[0].m_listSize = m_sbuf.length();
//m_lists[0].m_listAllocSize = 0; // do not free it!
//m_numLists = 0;
//return getSize();
return p - start;
// . make this TagRec reference the already-serialized buffer [p,p+bufSize)
// . the buffer is referenced, not copied, and list 0 is flagged as not
//   owning it (m_ownData = false)
// . always returns true
bool TagRec::setFromBuf ( char *p , long bufSize ) {
	// where the data lives
	m_lists[0].m_list           = p;
	m_lists[0].m_listEnd        = p + bufSize;
	m_lists[0].m_listSize       = bufSize;
	// the caller keeps ownership of the buffer
	m_lists[0].m_ownData        = false;
	// record format: 128-bit keys, no half keys, fixed data size of -1
	m_lists[0].m_ks             = sizeof(key128_t);
	m_lists[0].m_useHalfKeys    = false;
	m_lists[0].m_fixedDataSize  = -1;
	m_lists[0].m_lastKeyIsValid = false;
	// list 0 is the one and only list we expose
	m_listPtrs[0] = &m_lists[0];
	m_numListPtrs = 1;
	return true;
}
// . copy every non-TT_DUP tag of this rec into "dst"
// . returns false as soon as dst.addTag() fails, true otherwise
bool TagRec::serialize ( SafeBuf &dst ) {
	Tag *t = getFirstTag();
	while ( t ) {
		if ( t->m_type != TT_DUP && ! dst.addTag ( t ) ) return false;
		t = getNextTag ( t );
	}
	return true;
}
// . have each non-TT_DUP tag print itself into "sb" as an add request
// . always returns true; per-tag results are not checked
bool TagRec::printToBufAsAddRequest ( SafeBuf *sb ) {
	Tag *t = getFirstTag();
	while ( t ) {
		if ( t->m_type != TT_DUP ) t->printToBufAsAddRequest ( sb );
		t = getNextTag ( t );
	}
	return true;
}
// . have each non-TT_DUP tag print itself into "sb" as xml
// . always returns true; per-tag results are not checked
bool TagRec::printToBufAsXml ( SafeBuf *sb ) {
	Tag *t = getFirstTag();
	while ( t ) {
		if ( t->m_type != TT_DUP ) t->printToBufAsXml ( sb );
		t = getNextTag ( t );
	}
	return true;
}
// . have each non-TT_DUP tag print itself into "sb" as html
// . "prefix" is handed through to each tag unchanged
// . always returns true; per-tag results are not checked
bool TagRec::printToBufAsHtml ( SafeBuf *sb , char *prefix ) {
	Tag *t = getFirstTag();
	while ( t ) {
		if ( t->m_type != TT_DUP ) t->printToBufAsHtml ( sb , prefix );
		t = getNextTag ( t );
	}
	return true;
}
// . have each non-TT_DUP tag print itself into "sb" as a tag vector entry
// . always returns true; per-tag results are not checked
bool TagRec::printToBufAsTagVector ( SafeBuf *sb ) {
	Tag *t = getFirstTag();
	while ( t ) {
		if ( t->m_type != TT_DUP ) t->printToBufAsTagVector ( sb );
		t = getNextTag ( t );
	}
	return true;
}
// . find the first tag named "tagTypeStr" whose data is byte-for-byte
//   equal to [dataPtr,dataPtr+dataSize)
// . TT_DUP tags never match
// . returns NULL if there is no such tag
Tag *TagRec::getTag ( char *tagTypeStr , char *dataPtr , long dataSize ) {
	// names are compared through their numeric tag type
	long wanted = getTagTypeFromStr ( tagTypeStr );
	for ( Tag *t = getFirstTag() ; t ; t = getNextTag ( t ) ) {
		// checked in order: type, not a dup, data size, data bytes
		if ( t->m_type == wanted &&
		     t->m_type != TT_DUP &&
		     t->getTagDataSize() == dataSize &&
		     memcmp ( t->getTagData() , dataPtr , dataSize ) == 0 )
			return t;
	}
	return NULL;
}
// flags for a TagDescriptor
// is the tag a string type?
#define TDF_STRING 0x01
// can we have multiple tags of this type from the same user in the
// same TagRec?
#define TDF_ARRAY 0x02
// . should we index it?
// . index gbtagjapanese:<score>
// . also index "gbtagjapanese" if score != 0
// . TODO: actually use this
#define TDF_NOINDEX 0x04
// . describes one supported tag name
// . the s_tagDesc[] table below holds one of these per tag name
class TagDesc {
// the tag's name, e.g. "rootlang"
char *m_name;
// bitwise OR of the TDF_* flags defined above
char m_flags;
// . we compute the m_type of each TD on init
// . Tagdb::setHashTable() sets this to the hash of m_name
long m_type;
// . map the tags to names
// . each entry is { name , TDF_* flags , type }
// . the type is given as 0 here; Tagdb::setHashTable() fills it in with
//   the hash of the name and rejects a name whose hash is already present
static TagDesc s_tagDesc[] = {
// data for the "lang" tag is 2 char language id followed by
// a comma then a score from 1 to 100 to indicate percentage.
// Allow multiple "lang" tags in one tagrec.
{"rootlang" ,TDF_STRING,0},
// title tag and incoming link text of the root page is stored here
// for determining default venue addresses
{"roottitles" ,TDF_STRING|TDF_NOINDEX,0},
//{"rootlangid" ,TDF_STRING|TDF_NOINDEX,0},
// for addresses of the website, can be multiple
{"langunknown" ,0x00,0},
{"english" ,0x00,0},
{"french" ,0x00,0},
{"spanish" ,0x00,0},
{"russian" ,0x00,0},
{"turkish" ,0x00,0},
{"japanese" ,0x00,0},
{"chinesetraditional" ,0x00,0},
{"chinesesimplified" ,0x00,0},
{"korean" ,0x00,0},
{"german" ,0x00,0},
{"dutch" ,0x00,0},
{"italian" ,0x00,0},
{"finnish" ,0x00,0},
{"swedish" ,0x00,0},
{"norwegian" ,0x00,0},
{"portuguese" ,0x00,0},
{"vietnamese" ,0x00,0},
{"arabic" ,0x00,0},
{"hebrew" ,0x00,0},
{"indonesian" ,0x00,0},
{"greek" ,0x00,0},
{"thai" ,0x00,0},
{"hindi" ,0x00,0},
{"bengala" ,0x00,0},
{"polish" ,0x00,0},
{"tagalog" ,0x00,0},
{"spam" ,0x00,0},
{"retail" ,0x00,0},
{"business" ,0x00,0},
{"adult" ,0x00,0},
{"forum" ,0x00,0},
{"blog" ,0x00,0},
{"news" ,0x00,0},
{"reference" ,0x00,0},
{"directory" ,0x00,0},
{"searchengine" ,0x00,0},
{"domainsquatter" ,0x00,0},
{"platform" ,0x00,0},
{"travel" ,0x00,0},
{"audio" ,0x00,0},
{"video" ,0x00,0},
{"socialnetworking" ,0x00,0},
{"manualban" ,0x00,0},
{"manualfilter" ,0x00,0},
// clock hashes are now stored in indexdb
//{"clock" ,0x00,0},
{"dateformat" ,0x00,0}, // 1 = american, 2 = european
{"ruleset" ,0x00,0},
//{"filtered" ,0x00,0},
//{"compromised" ,0x00,0},
//{"good" ,0x00,0},
{"deep" ,0x00,0},
//{"quality" ,0x00,0},
//{"dmozcatid" ,TDF_NOINDEX,0},
{"comment" ,TDF_STRING|TDF_NOINDEX,0},
// we now index this. really we need it for storing into title rec.
//{"meta" ,TDF_STRING,0},
// . website contact info
// . used by ContactInfo.cpp
// . TDF_ARRAY means not to "overwrite" even if username is the same
// . a website can have multiple street addresses, etc.
// . the "lines" of a single street address are separated by ';'
// instead of \n to maintain tagdb dump output readability
//{"streetaddress" ,TDF_ARRAY,0},
//{"phonenumber" ,TDF_ARRAY,0},
//{"faxnumber" ,TDF_ARRAY,0},
//{"emailaddress" ,TDF_ARRAY,0},
// . this tag can contain multiple zipcodes, separated by ' '
// . we do index these for local search
//{"zipcodes" ,0x00,0},
// . similar to zip codes, separated by ' '
// . TODO: we need to fix Places.cpp to label the places for these tags
// but for now we can do gbtagstreetaddress:munich and hope for
// the best, although we will get websites on "munich st.!", but
// maybe you can combine that with gbtagstreetaddress:germany
//{"countries", ,0x00,0},
//{"cities", ,0x00,0},
// this is "0" or "1". if it is "0" then the date lets XmlDoc.cpp know
// when we last tried to get the contact info for the site
{"hascontactinfo" ,0x00,0},
// street address using ; as delimiter
{"contactaddress" ,TDF_ARRAY|TDF_NOINDEX,0},
{"contactemails" ,TDF_ARRAY|TDF_NOINDEX,0},
//{"emailaddressonsite" ,TDF_ARRAY|TDF_NOINDEX,0},
//{"emailaddressoffsite" ,TDF_ARRAY|TDF_NOINDEX,0},
{"hascontactform" ,0x00,0},
// subscribe to google's blacklist and mark the sites as this
//{"malware" ,0x00,0},
// . this is used to define INDEPENDENT subsites
// . such INDEPENDENT subsites should never inherit from this tag rec
// . it is used to handle "homesteading" sites like
// and the like, and is automatically set by SiteGetter.cpp
// . if this is 1 then is considered a subsite
// . if this is 2 then is considered a subsite
// . if this is -1 then no subsite is found
// . this should never be 0 either
{"sitepathdepth" ,0x00,0},
// . used by XmlDoc::updateTagdb() and also used to determine
// if we should index a site in XmlDoc.cpp. to be indexed a site
// must be in google, or must have this tag type in its tag rec,
// or have some other, soon to be invented, tag
// . really this is all controlled by url filters table
// . allow multiple tags of this type from same "user"
{"authorityinlink" ,TDF_STRING|TDF_ARRAY,0},
{"pagerank" ,0x00,0},
{"ingoogle" ,0x00,0},
{"ingoogleblogs" ,0x00,0},
{"ingooglenews" ,0x00,0},
// geo location from this news site directory
// we now store site pop, etc. in tagdb
{"sitenuminlinks" ,0x00,0},
{"sitenuminlinksuniqueip" ,0x00,0},
{"sitenuminlinksuniquecblock" ,0x00,0},
{"sitenuminlinkstotal" ,0x00,0},
// keep these although no longer used
{"sitepop" ,0x00,0},
{"sitenuminlinksfresh" ,0x00,0},
// . the first ip we lookup for this domain
// . this is permanent and should never change
// . it is used by Spider.cpp to assign a host for throttling
// all urls/SpiderRequests from that ip
// . so if we did change it then that would result in two hosts
// doing the throttling, really messing things up
// NOTE(review): this entry has no trailing comma although more entries
// follow it -- confirm against the original file
{"firstip" ,0x00,0}
// NOTE(review): this entry and two more below have an empty name. since
// setHashTable() rejects a name whose hash is already in the table,
// repeated "" names would collide -- confirm the intended names
{"" ,0x00,0},
{"user.xml" ,TDF_STRING,0},
{"user.login" ,TDF_STRING,0},
{"user.password" ,TDF_STRING,0},
{"user.securityanswer" ,TDF_STRING,0},
{"" ,TDF_STRING,0},
{"user.firstname" ,TDF_STRING,0},
{"user.lastname" ,TDF_STRING,0},
{"user.cookie" ,TDF_STRING,0},
{"user.zipcode" ,TDF_STRING,0},
{"" ,TDF_STRING,0},
{"user.state" ,TDF_STRING,0},
{"user.imageurl" ,TDF_STRING,0},
{"user.dob" ,TDF_STRING,0},
{"user.language" ,TDF_STRING,0},
{"user.creditcardname" ,TDF_STRING,0},
{"user.creditcardnum" ,TDF_STRING,0},
{"user.creditcardexp" ,TDF_STRING,0},
{"user.creditcardcode" ,TDF_STRING,0},
{"user.lastlogin" ,0x00,0},
{"user.numlogins" ,0x00,0},
{"user.openlinksnewwin" ,0x00,0},
{"user.usehttps" ,0x00,0},
{"user.maxreadhist" ,0x00,0},
{"user.maxsearchhist" ,0x00,0},
{"user.format" ,0x00,0},
{"user.acctbalance" ,0x00,0},
{"user.acctlimit" ,0x00,0},
{"user.acctsuspended" ,0x00,0},
{"user.acctbillemails" ,TDF_STRING,0},
{"user.adstopicid" ,0x00,0},
{"user.adsdailybudget" ,0x00,0},
{"user.adsdisabled" ,0x00,0},
{"user.feednumqueries" ,0x00,0},
{"user.feedcpq" ,0x00,0},
{"user.feeddailybudget" ,0x00,0},
{"user.feeddisabled" ,0x00,0},
{"user.feedpassword" ,TDF_STRING,0},
{"user.feeddailycount" ,TDF_ARRAY,0},
{"user.usertransrec" ,TDF_ARRAY,0},
{"user.userhistoryrec" ,TDF_ARRAY,0},
{"user.userpanelrec" ,TDF_ARRAY,0},
{"trans.amount" ,0x00,0},
{"trans.desc" ,TDF_STRING,0},
{"hist.wasread" ,0x00,0},
{"hist.url" ,TDF_STRING,0},
{"hist.gigabits" ,TDF_STRING,0},
{"hist.timespent" ,0x00,0},
{"panel.topcid" ,0x00,0},
{"panel.showmainstream" ,0x00,0},
{"panel.showblogs" ,0x00,0},
{"panel.showforum" ,0x00,0},
{"panel.showweb" ,0x00,0},
{"panel.showsearchbox" ,0x00,0},
{"panel.showimages" ,0x00,0},
{"panel.showvideo" ,0x00,0},
{"panel.showchatbox" ,0x00,0},
{"panel.showchatpics" ,0x00,0},
{"panel.popsliderval" ,0x00,0},
{"panel.agesliderval" ,0x00,0},
{"panel.windowxpos" ,0x00,0},
{"panel.windowypos" ,0x00,0},
{"panel.numstories" ,0x00,0},
{"panel.storylang" ,TDF_STRING,0},
{"panel.translatelang" ,TDF_STRING,0},
{"panel.displaylang" ,TDF_STRING,0},
{"panel.filterquery" ,TDF_STRING,0},
{"chat.comment" ,TDF_STRING,0},
{"ad.topicid" ,0x00,0},
{"ad.userid" ,0x00,0},
{"ad.adid" ,0x00,0},
{"ad.title" ,TDF_STRING,0},
{"ad.text" ,TDF_STRING,0},
{"ad.url" ,TDF_STRING,0},
{"ad.keywordstring" ,TDF_STRING,0},
{"ad.dailypledge" ,0x00,0},
{"ad.disabled" ,0x00,0},
{"ad.dailyimpresscount" ,TDF_ARRAY,0},
{"ad.dailyclickcount" ,TDF_ARRAY,0}
// . map a tag name such as "domainsquatter" to its numeric tag type
// . the tag type is the 32-bit hash of the name
// . pass tagnameLen as -1 if "tagname" is NULL terminated and its length
//   is unknown
// . an unsupported name is logged and then deliberately crashes the
//   process; the "return -1" only matters if that crash does not happen
// . used by CollectionRec::getRegExpNum()
long getTagTypeFromStr( char *tagname , long tagnameLen ) {
	long type;
	if ( tagnameLen != -1 ) type = hash32  ( tagname , tagnameLen );
	else                    type = hash32n ( tagname );
	// build the table of supported tags on first use
	if ( ! s_initialized ) g_tagdb.setHashTable();
	// a supported tag has an entry in the table
	if ( s_ht.getValue ( type ) ) return type;
	// otherwise complain and core
	log("tagdb: unsupported tagname \"%s\"",tagname);
	char *xx=NULL;*xx=0;
	return -1;
}
// . convert ST_DOMAIN_SQUATTER to "domain_squatter"
char *getTagStrFromType ( long tagType ) {
// make sure table is valid
if ( ! s_initialized ) g_tagdb.setHashTable();
TagDesc *td = (TagDesc *)s_ht.getValue ( tagType );
// sanity check
if ( ! td ) { char *xx=NULL;*xx=0; }
// return it
return td->m_name;
// a global class extern'd in .h file
Tagdb g_tagdb;
Tagdb g_tagdb2;
// a fake site for Tagdb::convert()
//Tagdb g_sitedb;
//static HashTableT<long long,long> s_lockTable;
//static HashTableX s_lockTable2;
// . reset rdb and Xmls
// . NOTE(review): no "Xmls" appear in the nearby Tagdb code; confirm this
//   description still matches what the function resets
void Tagdb::reset() {
// . stock s_ht with one entry per s_tagDesc[] element, keyed by the hash of
//   the tag name and holding a pointer to that TagDesc
// . also sets each TagDesc::m_type to that hash
// . does nothing after the first call (guarded by s_initialized)
// . returns false if the table cannot be sized or two names hash alike
bool Tagdb::setHashTable ( ) {
if ( s_initialized ) return true;
// NOTE(review): the flag is set before the table is built, so a failure
// below leaves s_initialized true with a partially filled table -- confirm
// that is intended
s_initialized = true;
// the hashtable of TagDescriptors
if ( ! s_ht.set ( 1024 ) )
return log("tagdb: Tagdb hash init failed.");
// stock it
long n = (long)sizeof(s_tagDesc)/(long)sizeof(TagDesc);
for ( long i = 0 ; i < n ; i++ ) {
TagDesc *td = &s_tagDesc[i];
char *s = td->m_name;
long slen = gbstrlen(s);
// use the same algo that Words.cpp computeWordIds does
// NOTE(review): getTagTypeFromStr() looks names up with hash32n()/hash32()
// while this stores hash64Lower_a() narrowed to a long -- confirm both
// yield the same value for every tag name
long h = hash64Lower_a ( s , slen );
// call it a bad name if already in there
TagDesc *etd = (TagDesc *)s_ht.getValue ( h );
if ( etd )
return log("tagdb: Tag %s collides with old tag %s",
// set the type
td->m_type = h;
// add it
// NOTE(review): stores the descriptor pointer as a long; presumably assumes
// a long is wide enough to hold a pointer -- confirm on the target platform
s_ht.addKey ( h , (long)td );
return true;
// . set up the disk page cache and then the underlying Rdb named "tagdb"
// . returns false if the page cache cannot be initialized, otherwise
//   returns whatever m_rdb.init() returns
bool Tagdb::init ( ) {
	// . max # of tree nodes
	// . assumes the avg tagdb rec (siteUrl) is about 82 bytes, 32 of
	//   which are overhead
	long numTreeNodes = g_conf.m_tagdbMaxTreeMem / 82;
	// . bytes given to the disk page cache
	// . was 160MB; turned down to 10MB for rebuilding posdb
	// . TODO: make it a biased disk page cache!
	long long pageCacheMem = 10000000;
	// we use a page cache rather than a record cache
	if ( ! m_pc.init ( "tagdb" ,
			   RDB_TAGDB ,
			   pageCacheMem ,
			   GB_TFNDB_PAGE_SIZE ) )
		return log("tagdb: Tagdb init failed.");
	// . initialize our own internal rdb
	// . no record cache is used so changes to tagdb are instant
	return m_rdb.init ( g_hostdb.m_dir           , // working dir
			    "tagdb"                  , // rdb name
			    true                     , // dedup same keys?
			    -1                       , // fixed record size
			    2                        , // min files to merge
			    g_conf.m_tagdbMaxTreeMem , // max tree mem
			    numTreeNodes             , // max tree nodes
			    true                     , // balance tree?
			    0                        , // max cache mem
			    0                        , // max cache nodes
			    false                    , // half keys?
			    false                    , // save cache?
			    &m_pc                    , // disk page cache
			    false                    , // is titledb?
			    true                     , // preload page cache?
			    sizeof(key128_t)         , // key size
			    true                     );// bias page cache?
}
// . set up the underlying Rdb under the name "tagdbRebuild" with "treeMem"
//   bytes of tree memory and no disk page cache
// . returns whatever m_rdb.init() returns
bool Tagdb::init2 ( long treeMem ) {
	// . max # of tree nodes
	// . assumes the avg tagdb rec (siteUrl) is about 82 bytes, 32 of
	//   which are overhead
	long numTreeNodes = treeMem / 82;
	// . initialize our own internal rdb
	// . no record cache is used so changes to tagdb are instant
	return m_rdb.init ( g_hostdb.m_dir   , // working dir
			    "tagdbRebuild"   , // rdb name
			    true             , // dedup same keys?
			    -1               , // fixed record size
			    50               , // min files to merge
			    treeMem          , // max tree mem
			    numTreeNodes     , // max tree nodes
			    true             , // balance tree?
			    0                , // max cache mem
			    0                , // max cache nodes
			    false            , // half keys?
			    false            , // save cache?
			    NULL             , // disk page cache
			    false            , // is titledb?
			    false            , // preload page cache?
			    sizeof(key128_t) , // key size
			    false            );// bias page cache?
}
// . register collection "coll" with the underlying rdb
// . if "doVerify" is true the result of verify(coll) decides success
// . returns false if the rdb refuses the collection or verification fails
bool Tagdb::addColl ( char *coll, bool doVerify ) {
	if ( ! m_rdb.addColl ( coll ) ) return false;
	// caller asked us to skip verification
	if ( ! doVerify ) return true;
	// a failed verify is a failed add
	return verify ( coll );
}
// . sanity check the tagdb records of collection "coll"
// . reads a list of up to 64000 bytes of records and counts how many of
//   the positive keys map to this host's shard
// . on a mismatch the shortfall is added to g_rebalance.m_numForeignRecs
//   and g_conf.m_bypassValidation is returned
// . returns true when every counted record belongs here
bool Tagdb::verify ( char *coll ) {
char *rdbName = NULL;
rdbName = "Tagdb";
log ( LOG_DEBUG, "db: Verifying %s for coll %s...", rdbName, coll );
Msg5 msg5;
Msg5 msg5b;
RdbList list;
// NOTE(review): no assignment to startKey/endKey is visible before they
// are passed to getList() -- confirm they are set to the full key range
key128_t startKey;
key128_t endKey;
// NOTE(review): "cr" is dereferenced below without a NULL check -- confirm
// getRec() cannot return NULL for "coll" here
CollectionRec *cr = g_collectiondb.getRec(coll);
// no state/callback is supplied, so this read is expected not to block
if ( ! msg5.getList ( RDB_TAGDB ,
cr->m_collnum ,
&list ,
(char *)&startKey ,
(char *)&endKey ,
64000 , // minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
0 ,
-1 ,
true ,
-1LL ,
&msg5b ,
true )) {
return log("tagdb: HEY! it did not block");
// "count" is the number of records examined, "got" the number that
// belong to our shard
// NOTE(review): no increment of "count" is visible in the loop below --
// confirm against the original file
long count = 0;
long got = 0;
//long numOld = 0;
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
//key128_t k = list.getCurrentKey();
key128_t k;
list.getCurrentKey ( &k );
// skip negative keys
if ( (k.n0 & 0x01) == 0x00 ) continue;
// see if it is the "old" school tagdb rec
//char *data = list.getCurrentData();
//long dataSize = list.getCurrentDataSize();
// this is the file number in the old school tagdb recs
// and it is the version number in the new school style recs.
// just make sure the new school version number stays below 30!
//char version = *data;
// lower 3 bytes are the file number. >= 30 on gk
//if ( version >= 30 ) numOld++;
//unsigned long groupId = g_tagdb.getGroupId ( &k );
unsigned long shardNum = getShardNum ( RDB_TAGDB , &k );
if ( shardNum == getMyShardNum() ) got++;
if ( got != count ) {
// tally it up
g_rebalance.m_numForeignRecs += count - got;
log ("tagdb: Out of first %li records in %s, only %li belong "
"to our group.",count,rdbName,got);
// exit if NONE, we probably got the wrong data
if ( got == 0 ) log("tagdb: Are you sure you have the "
"right "
"data in the right directory? "
log ( "tagdb: Exiting due to %s inconsistency.", rdbName );
return g_conf.m_bypassValidation;
log ( LOG_DEBUG, "db: %s passed verification successfully for %li "
"recs.",rdbName, count );
// turn threads back on
// if no recs in tagdb, but sitedb exists, convert it
if ( count > 0 ) return true;
// . convert them
// . returns false and sets g_errno on error
//if ( ! convert ( coll ) ) return false;
return true;
// . past blast -- for Tagdb::convert()
// . one (type,score) pair as stored in an old sitedb record
// . the score is 4 bytes for the CLOCK1-4_PREHASH types and 1 byte for
//   every other type
struct SiteType {
SiteType() : m_score(0) {}
SiteType& operator=(SiteType& o)
{m_type=o.m_type;m_score=o.m_score; return *this;}
// get this type's size
// (size of the type byte plus the size of its stored score)
long getStoredSize() {
if (isType4Bytes(m_type)) return sizeof(m_type)+4;
else return sizeof(m_type)+1;
// NOTE(review): FIRST_TYPE is not defined in the visible code -- confirm
// where it comes from
enum {
SPAM = FIRST_TYPE, //probablitity that it is spam
RETAIL, //selling something
BUSINESS, //a corporate storefront eg
ADULT, //not safe for kids, higher score = more hardcore
FORUM, //message board
BLOG, //or personal home page
NEWS, //articles, opinions magazines
REFERENCE, //all special interest sites
DIRECTORY, //links organized categorically
SEARCH_ENGINE, //indexed info
PLATFORM, //political candidate, or org
TRAVEL, //Travel sites
AUDIO, //podcast, streaming radio
VIDEO, //flash video
SOCIAL_NETWORKING,//dating, myspace, facebook
MANUAL_BAN, //a human hates this site
PAGE_RANK, //google's page rank
CLOCK1_PREHASH, //hash of unique preceeding 1st clock
CLOCK1_PREHASH_CNT, // count of tags to make 1st clock hash
DATE_FORMAT, //format of dates on page
CLOCK2_PREHASH, //hash of unique tags preceeding 2nd clock
CLOCK2_PREHASH_CNT, // count of tags to make 2nd clock hash
CLOCK3_PREHASH, //hash of unique tags preceeding 3rd clock
CLOCK3_PREHASH_CNT, // count of tags to make 3rd clock hash
CLOCK4_PREHASH, //hash of unique tags preceeding 4th clock
CLOCK4_PREHASH_CNT, // count of tags to make 4th clock hash
// ....ADD ALL NEW TYPES HERE... corruption upon ye if not
// . types can be 1 byte or 4 bytes. if they are 4 bytes, they must be
// added to this function
static bool isType4Bytes(int type) {
if ( type == CLOCK1_PREHASH ) return true;
if ( type == CLOCK2_PREHASH ) return true;
if ( type == CLOCK3_PREHASH ) return true;
if ( type == CLOCK4_PREHASH ) return true;
return false;
// size in bytes of the score stored for "type"; must agree with
// isType4Bytes() above
static long getScoreSize(uint8_t type) {
if ( type == CLOCK1_PREHASH ) return 4;
if ( type == CLOCK2_PREHASH ) return 4;
if ( type == CLOCK3_PREHASH ) return 4;
if ( type == CLOCK4_PREHASH ) return 4;
return 1;
// true for every type up to and including PAGE_RANK
bool isNormScore() {return m_type <= PAGE_RANK;}
uint8_t m_type;
uint32_t m_score;
// . convert the old Tagdb format into the new format
// . reads old "sitedb" records for collection "coll", rebuilds each one as
//   a TagRec and adds it to g_tagdb's rdb
// . old record layout as parsed below: 4 bytes whose low 3 bytes are the
//   old file number, NULL-terminated site, 4 byte time, NULL-terminated
//   comment, NULL-terminated username, 1 byte site flags, 1 byte quality,
//   1 byte type count followed by the (type,score) pairs, then 1 byte
//   language count followed by (langId,score) byte pairs
// . NOTE(review): this uses g_sitedb, whose definition is commented out
//   earlier in this file, and ends with "goto loop" although no "loop:"
//   label is visible -- this looks like legacy code; confirm it is still
//   compiled before relying on it
bool Tagdb::convert ( char *coll ) {
log("db: Trying to convert sitedb for coll %s into tagdb",coll);
collnum_t collnum = g_collectiondb.getCollnum ( coll );
// open up old sitedb files
long mem = 100000000;
long maxTreeNodes = mem / 82;
//Rdb sitedb;
g_sitedb.m_rdb.init ( g_hostdb.m_dir ,
"sitedb" ,
true , // dedup same keys?
-1 , // fixed record size
9999 , // MinFilesToMerge
100000000 , // g_conf.m_tagdbMaxTreeMem
maxTreeNodes ,
true , // balance tree?
0 , // g_conf.m_tagdbMaxCacheMem
0 , // maxCacheNodes
false , // half keys?
false , // m_tagdbSaveCache
NULL , // DiskPageCache *, &m_pc
false , // is titledb
false , // preload disk page cache
12 , // key size
false );// bias disk page cache?
g_sitedb.addColl ( coll, false );
Msg5 msg5;
Msg5 msg5b;
RdbList list;
key_t startKey;
key_t endKey;
key_t k;
// remember the thread state so it can be restored on the way out
bool threadsWereEnabled = !g_threads.areThreadsDisabled();
// loop over all tagdb recs in tagdb
if ( ! msg5.getList ( RDB_SITEDB ,
coll ,
&list ,
startKey ,
endKey ,
64000 , // minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
0 ,
-1 ,
true ,
-1LL ,
&msg5b ,
true )) {
if(threadsWereEnabled) g_threads.enableThreads();
return log("db: HEY! it did not block");
long count = 0;
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
k = list.getCurrentKey();
char *data = list.getCurrentData();
//long dataSize = list.getCurrentDataSize();
// point to end of it
//char *pend = data + dataSize;
// parse the old site rec
char *p = data;
// low 3 bytes of the first long are the old site file number
long old_sfn = (*(long *)p) & 0x00ffffff;
//char old_version = p[3];
p += 4;
char *old_site = p;
long old_siteLen = gbstrlen(p);
p += old_siteLen + 1;
long old_time = *(long *)p;
p += 4;
char *old_comment = p;
p += gbstrlen(p) + 1;
//char *old_username = p;
p += gbstrlen(p) + 1;
//unsigned char siteFlags = *p;
p += 1;
//char siteQuality = *p;
p += 1;
//char incHere = *(long *)p;
uint8_t numTypes = *(uint8_t *)p;
p += 1;
// do not start with http:// ! wastes space!!
if (old_siteLen>=8 && strncmp(old_site,"http://",7)==0 ) {
old_site += 7;
old_siteLen -= 7;
// sanity check
//Url s; s.set ( old_site, old_siteLen );
//key_t newk = g_tagdb.makeKey ( &s , false );
//if ( k != newk ) { char *xx=NULL;*xx=0; }
// . without any tags, what is our dataSize?
// . version(1 byte)+site(X bytes)+NULLTerm(1 byte)+
// #Tags(2 bytes)
//long dataSize2 = 1 + old_siteLen + 1 + 2;
// set the new rec with this stuff
TagRec newgr;
//newgr.set ( k ,
// dataSize2 ,
// old_site );
long now = getTimeGlobal();
// add the "site" name as a tag (include NULL)
newgr.addTag ( ST_SITE , old_time , "conv" , 0,
old_site, gbstrlen(old_site)+1);
// the banned tag
if ( old_sfn == 30 ) {
char data = 1;
newgr.addTag ( ST_MANUAL_BAN ,now, "conv", 0,&data,1);
if ( old_sfn == 50 ) {
char data = 1;
newgr.addTag ( ST_DEEP,now, "conv", 0,&data,1);
// just for historical reasons, keep this too
newgr.addTag ( ST_RULESET , now , "conv",0,(char *)&old_sfn,1);
// . add in comment tag
// . this will increase newgr::m_dataEnd/m_dataSize
// . include NULL
if ( old_comment[0] )
newgr.addTag ( ST_COMMENT ,now, "conv", 0,
old_comment , gbstrlen(old_comment)+1);
// reset these
bool gotPrehash1 = false;
bool gotPrehash2 = false;
bool gotPrehash3 = false;
bool gotPrehash4 = false;
bool gotPrehashCount1 = false;
bool gotPrehashCount2 = false;
bool gotPrehashCount3 = false;
bool gotPrehashCount4 = false;
long prehash1;
long prehash2;
long prehash3;
long prehash4;
char prehashCount1;
char prehashCount2;
char prehashCount3;
char prehashCount4;
// now for the old SiteTypes
for ( long i = 0 ; i < numTypes ; i++ ) {
//while ( p < pend ) {
//SiteType *ost = (SiteType *)p;
// get the type
char siteType = *p; p++;
// and the score
char *siteTypeScore = p;
long siteTypeScoreSize =
p += siteTypeScoreSize;
// a 0 score in the old sitedb meant to ignore
if ( *siteTypeScore == 0 && siteTypeScoreSize == 1 )
// map the siteType 1-1 for the most part
long tagType = siteType + ST_SPAM;
// if the type is SiteType::CLOCK2-4_ re-map it
// (each clock hash and its count are held until both have been seen,
// then emitted together as one ST_CLOCK tag further down)
if ( siteType == SiteType::CLOCK1_PREHASH ) {
gotPrehash1 = true;
prehash1 = *(long *)siteTypeScore;
if ( siteType == SiteType::CLOCK1_PREHASH_CNT ) {
gotPrehashCount1 = true;
prehashCount1 = *(char *)siteTypeScore;
if ( siteType == SiteType::CLOCK2_PREHASH ) {
gotPrehash2 = true;
prehash2 = *(long *)siteTypeScore;
if ( siteType == SiteType::CLOCK2_PREHASH_CNT ) {
gotPrehashCount2 = true;
prehashCount2 = *(char *)siteTypeScore;
if ( siteType == SiteType::CLOCK3_PREHASH ) {
gotPrehash3 = true;
prehash3 = *(long *)siteTypeScore;
if ( siteType == SiteType::CLOCK3_PREHASH_CNT ) {
gotPrehashCount3 = true;
prehashCount3 = *(char *)siteTypeScore;
if ( siteType == SiteType::CLOCK4_PREHASH ) {
gotPrehash4 = true;
prehash4 = *(long *)siteTypeScore;
if ( siteType == SiteType::CLOCK4_PREHASH_CNT ) {
gotPrehashCount4 = true;
prehashCount4 = *(char *)siteTypeScore;
// but DATE_FORMAT is off
if ( siteType == SiteType::DATE_FORMAT )
// panic
if ( tagType >= ST_LAST_TAG ) {
log("db: got bad tagtype %li for sitedb rec.",
// add to new rec
newgr.addTag ( tagType , // should be 1-1
now ,
"conv" ,
0 , // ip
siteTypeScore ,
siteTypeScoreSize );
// add in the clock stuff
if ( gotPrehash1 && gotPrehashCount1 ) {
// make a 5 byte thingy
// (1 byte count followed by the 4 byte hash)
char tmp[5];
tmp[0] = prehashCount1;
memcpy ( tmp+1 , &prehash1, 4 );
newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5);
if ( gotPrehash2 && gotPrehashCount2 ) {
// make a 5 byte thingy
char tmp[5];
tmp[0] = prehashCount2;
memcpy ( tmp+1 , &prehash2, 4 );
newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5);
if ( gotPrehash3 && gotPrehashCount3 ) {
// make a 5 byte thingy
char tmp[5];
tmp[0] = prehashCount3;
memcpy ( tmp+1 , &prehash3, 4 );
newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5);
if ( gotPrehash4 && gotPrehashCount4 ) {
// make a 5 byte thingy
char tmp[5];
tmp[0] = prehashCount4;
memcpy ( tmp+1 , &prehash4, 4 );
newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5);
// now the langs
uint8_t numLangs = *p;
p += 1;
for ( long i = 0 ; i < numLangs ; i++ ) {
uint8_t langId = *p;
p += 1;
long score = (long)*(uint8_t *)p;
p += 1;
// add to new rec
// NOTE(review): passes the first byte of a long as the 1 byte score,
// which presumably assumes a little-endian host -- confirm
newgr.addTag ( langId , // should be 1-1
now ,
"conv" ,
0 , // ip
(char *)&score ,
1 );
// print it out
SafeBuf sb;
logf(LOG_INFO,"tagdb: %s",sb.getBufStart());
Rdb *r = &g_tagdb.m_rdb;
// . add the new site rec back as a TagRec
// . it should overwrite the old one since the key is the same
// . this should not block
// . it should do a dump if tree is full
if ( ! r->addRecord ( collnum ,
newgr.getKey () ,
newgr.getData () ,
newgr.getDataSize() ,
log("tagdb: convert: %s",mstrerror(g_errno));
char *xx=NULL;*xx=0;
// do a blocking dump of tree if it's 90% full now
if (r->m_mem.is90PercentFull() || r->m_tree.is90PercentFull()){
log("tagdb: convert: dumping tree to disk.");
if ( ! r->dumpTree ( 0 ) ) // niceness
return log("tagdb: convert: dump failed.");
// if list not empty, get more
if ( list.isEmpty() ) { g_threads.enableThreads(); return true; }
// advance startKey
startKey = k;
startKey += 1;
// watch for wrap, that means done, too
if ( startKey < k ) { g_threads.enableThreads(); return true; }
// otherwise, do more
goto loop;
// . build the old style key for url "u"
// . n1 is the 32-bit hash of the url's full hostname
// . n0 is the 64-bit hash of the whole url, except that its low bit is
//   forced to 0 for a delete (negative) key and to 1 otherwise
key_t Tagdb::makeKey ( Url *u , bool isDelete ) {
	key_t key;
	// upper part: hostname hash
	key.n1 = hash32 ( u->getHost() , u->getHostLen() );
	// lower part: url hash
	key.n0 = hash64 ( u->getUrl () , u->getUrlLen () );
	// the low bit tells a positive key from a negative one
	if ( ! isDelete ) key.n0 |= 0x0000000000000001LL;
	else              key.n0 &= 0xfffffffffffffffeLL;
	return key;
}
// . key layout:
// . ssssssss ssssssss ssssssss ssssssss  hash of site/url
// . xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx  tagType OR hash of that+user+data
// . xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
// . returns the smallest key for "site": n1 is the hash of the
//   NULL-terminated site string and n0 is all zero bits
key128_t Tagdb::makeStartKey ( char *site ) {
	key128_t first;
	first.n0 = 0;
	first.n1 = hash64n ( site );
	return first;
}
// . returns the largest key for "site": n1 is the hash of the
//   NULL-terminated site string and n0 is all one bits
key128_t Tagdb::makeEndKey ( char *site ) {
	key128_t last;
	last.n0 = 0xffffffffffffffffLL;
	last.n1 = hash64n ( site );
	return last;
}
// . returns the smallest key for the domain of "u": n1 is the hash of the
//   url's domain (not its full hostname) and n0 is all zero bits
key128_t Tagdb::makeDomainStartKey ( Url *u ) {
	key128_t first;
	first.n0 = 0;
	first.n1 = hash64 ( u->getDomain() , u->getDomainLen() );
	return first;
}
// . returns the largest key for the domain of "u": n1 is the hash of the
//   url's domain (not its full hostname) and n0 is all one bits
key128_t Tagdb::makeDomainEndKey ( Url *u ) {
	key128_t last;
	last.n0 = 0xffffffffffffffffLL;
	last.n1 = hash64 ( u->getDomain() , u->getDomainLen() );
	return last;
}
// . score how well the record url "recUrl" matches "url"
// . returns 0 (no match) when the domains differ, when the ports differ
//   and url's port is not url's default port, or when recUrl's path is not
//   a prefix of url's path
// . otherwise returns 1000000 if the ports are equal, plus 1000 per char
//   of recUrl's hostname, plus 1 per char of recUrl's path, so the more
//   specific record wins
long Tagdb::getMatchPoints ( Url *recUrl , Url *url ) {
	// used for the hostname bonus at the very end
	long recHostLen = recUrl->getHostLen ();
	// the domains must be identical
	char *urlDom    = url   ->getDomain    ();
	long  urlDomLen = url   ->getDomainLen ();
	char *recDom    = recUrl->getDomain    ();
	long  recDomLen = recUrl->getDomainLen ();
	if ( urlDomLen != recDomLen ) return 0;
	if ( strncmp ( urlDom , recDom , urlDomLen ) != 0 ) return 0;
	// equal ports earn the big bonus. unequal ports are only tolerated
	// when url is on its default port
	long score   = 0;
	long recPort = recUrl->getPort ();
	long urlPort = url   ->getPort ();
	if ( recPort == urlPort ) score += 1000000;
	else if ( urlPort != url->getDefaultPort() ) return 0;
	// recUrl's path has to be a prefix of url's path
	long  recPathLen = recUrl->getPathLen ();
	char *recPath    = recUrl->getPath    ();
	long  urlPathLen = url   ->getPathLen ();
	char *urlPath    = url   ->getPath    ();
	if ( recPathLen > urlPathLen ) return 0;
	if ( strncmp ( urlPath , recPath , recPathLen ) != 0 ) return 0;
	// solid match: reward longer record paths and longer record hosts
	score += recPathLen;
	score += recHostLen*1000;
	return score;
}
// . Msg8a is for getting the final TagRec for a url
// . start out with no requests launched and no replies received
Msg8a::Msg8a() {
	m_requests = 0;
	m_replies  = 0;
}
// destructor
Msg8a::~Msg8a ( ) {
// . return the request/reply counters to zero
// . deliberately cores if replies are still outstanding and the process
//   is not exiting, since a late reply could otherwise corrupt memory
void Msg8a::reset() {
	bool inProgress = ( m_replies != m_requests );
	if ( inProgress && ! g_process.m_exiting ) { char *xx=NULL;*xx=0; }
	m_requests = 0;
	m_replies  = 0;
}
// . get records from multiple subdomains of url
// . calls g_udpServer.sendRequest() on each subdomain of url
// . all matching records are merged into a final record
// i.e. site tags are also propagated accordingly
// . closest matching "site" is used as the "site" (the site url)
// . "url" is the url to get tags for; the pointer is stored in m_url
// . "site" may be NULL, in which case it is guessed with SiteGetter and
//   falls back to the url's hostname
// . "skipDomainLookup" is only referenced by commented-out code below
// . "callback"/"state" are saved in m_callback/m_state
// . "tagRec" is reset and then filled by the lookups
// . "rdbId" must be RDB_TAGDB or we deliberately crash
// . returns true if done (including on error, see g_errno) and false if
//   launchGetRequests() returned false
bool Msg8a::getTagRec ( Url *url ,
// site of the url
char *site ,
//char *coll ,
collnum_t collnum,
bool skipDomainLookup , // useCanonicalName ,
long niceness ,
void *state ,
void (* callback)(void *state ),
TagRec *tagRec ,
bool doInheritance ,
char rdbId ) {
// no such collection? set g_errno and return without blocking
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) {
g_errno = ENOCOLLREC;
return true;
// reset tag rec
tagRec->reset();//m_numListPtrs = 0;
// sanity check
if ( rdbId != RDB_TAGDB ) {char *xx=NULL;*xx=0;}
// save it
m_rdbId = rdbId;
// in use? need to wait before reusing
if ( m_replies != m_requests ) {char *xx=NULL;*xx=0; }
// then we gotta free the lists if any
m_niceness = niceness;
//m_coll = coll;
m_collnum = collnum;
m_tagRec = tagRec;
m_callback = callback;
m_state = state;
//m_url = url;
// reset
m_errno = 0;
m_requests = 0;
m_replies = 0;
m_doneLaunching = false;
//m_doFullUrl = true;
//m_skipDomainLookup = skipDomainLookup;
// set siteLen to the provided site if it is non-NULL
long siteLen = 0;
if ( site ) siteLen = gbstrlen(site);
// . get the site
// . msge0 passes this in as NULL and expects us to figure it out
// . if site was NULL that means we guess it. default to hostname
// unless in a recognized form like /~mwells/
if ( ! site ) {
SiteGetter sg;
// called with a NULL state and NULL callback
sg.getSite ( url->getUrl() ,
NULL , // tagrec
0 , // timestamp
collnum, // coll
NULL, // state
NULL); // callback
// if it set it to a recognized site, like ~mwells
// then set "site"
if ( sg.m_siteLen ) {
site = sg.m_site;
siteLen = sg.m_siteLen;
// if provided site was NULL and not of a ~mwells type of form
// then default it to hostname
if ( ! site ) {
site = url->getHost();
siteLen = url->getHostLen();
// temp null terminate it (writes into the buffer "site" points to)
char c = site[siteLen];
site[siteLen] = '\0';
// use that to make the key range for the site lookup
m_siteStartKey = g_tagdb.makeStartKey ( site );//url );
m_siteEndKey = g_tagdb.makeEndKey ( site ); // url );
// un NULL terminate it
site[siteLen] = c;
// ignore this part of url is already root like
//if ( m_url->isRoot() ) m_doFullUrl = false;
// makeStartKey only works on the hostname of the url, so doing the
// full url has no effect right now
//m_doFullUrl = false;
// sendPageInject keeps "url" on the stack!
//m_url.set ( url->getUrl() , url->getUrlLen() );
// NOTE(review): only the pointer is kept, so "url" presumably must outlive
// this lookup -- confirm against callers
m_url = url;
// save this
m_doInheritance = doInheritance;
// . launch a request for each subdomain of the url
// . the request format is
// . <url>\0<niceness><coll>\0
// . that way we can use a small request buffer and have different
// pointers to the different subdomains
//char *p = m_request;
// point to url
char *u = url->getUrl();
long ulen = url->getUrlLen();
// point to the TLD of the url
char *tld = url->getTLD();
// . if NULL, that is bad... TLD is unsupported
// . no! it could be an ip address!
// . anyway, if the tld does not exist, just return an empty tagrec
// do not set g_errno
if ( ! tld && ! url->isIp() ) return true;
//if ( ! tld ) { g_errno = EBADURL; return true; }
// url cannot have NULLs in it because handleRequest8a() uses
// gbstrlen() on it to get its size
for ( long i = 0 ; i < ulen ; i++ ) {
if ( u[i] ) continue;
log("TagRec: got bad url with NULL in it %s",u);
m_errno = EBADURL;
g_errno = EBADURL;
return true;
// skip over http:// (scheme length plus 3 chars for "://")
// NOTE: "u" and "ulen" are not read again in the code visible below
long plen = url->getSchemeLen() + 3;
u += plen;
ulen -= plen;
// copy over url without the protocol thingy (http://)
//memcpy ( p , u , ulen );
// get the domain
m_dom = url->getDomain();
// if none, bad! return an empty tagrec without setting g_errno
if ( ! m_dom && ! url->isIp() ) return true;
// save this
//m_host = url->getHost();
// get its delta
//long delta = dom - u;
// . save ptr for launchGetRequests()
// . move this BACKWARDS for subdomains that have a ton of .'s
// . no, now move towards domain
m_p = m_url->getHost();
// and save this too
m_hostEnd = m_url->getHost() + m_url->getHostLen();
// if ip just use the full "hostname" which is the full ip address
//if ( url->isIp() ) m_p = m_host;
// launch the requests
if ( ! launchGetRequests() ) return false;
// . they did it without blocking
// . this sets g_errno on error
// did not block
return true;
// . returns false if blocked, true otherwise
// . sets g_errno and returns true on error
// . launches the Msg0 list reads that fetch the tagdb lists for m_url:
//   one using the site key range (m_siteStartKey/m_siteEndKey) and one using
//   the domain key range once "tryDomain" has been set
// . every return value is (m_requests == m_replies), i.e. true only when no
//   request is still outstanding
// NOTE(review): this function references a "loop" label, and the variables
// "subdom", "subdomLen" and "siteHash32", none of which are declared in the
// code visible here (their declarations are commented out) -- part of this
// body is presumably disabled or missing; confirm before editing
bool Msg8a::launchGetRequests ( ) {
// clear it
g_errno = 0;
// false on the first (site) lookup, set to true for the domain lookup
bool tryDomain = false;
// return true if nothing to launch
if ( m_doneLaunching ) return (m_requests == m_replies);
// don't bother if already got an error
if ( m_errno ) return (m_requests == m_replies);
// limit max to 5ish
if (m_requests >=MAX_TAGDB_REQUESTS) return (m_requests==m_replies);
// take a breath
// . first, try it by canonical domain name
// . if that finds no matches, then try it by ip domain
// get host
//char *subdom = m_p;
//long subdomLen = m_hostEnd - m_p;
key128_t startKey ;
key128_t endKey ;
//long siteHash32;
// . if our first time, do the full url!
// . need to do this because the turking process (XmlDoc::getTurkForm()
// and PageReindex.cpp:processTurkForm()) add tags to tagdb based on
// the full url.
if ( m_doFullUrl ) {
startKey = g_tagdb.makeStartKey ( m_url );
endKey = g_tagdb.makeEndKey ( m_url );
// . like the "norm" url above
// . we'll get back a list of tags for this hostname,
// but they could all be from different sites, some sites
// would be the hostname, other tags might be from sites
// that are a subsite of the hostname, so we have to make
// sure the tag's key.n0 matches this siteHash32
siteHash32 = hash32 ( m_url->getUrl() , m_url->getUrlLen());
else {
// make into a url
Url u;
u.set ( subdom , subdomLen );
// set key range now
startKey = g_tagdb.makeStartKey ( &u );
endKey = g_tagdb.makeEndKey ( &u );
// . like the "norm" url above
// . we'll get back a list of tags for this hostname,
// but they could all be from different sites, some sites
// would be the hostname, other tags might be from sites
// that are a subsite of the hostname, so we have to make
// sure the tag's key.n0 matches this siteHash32
siteHash32 = hash32 ( u.getUrl() , u.getUrlLen() );
// domain lookup: key range covers the whole domain of m_url
if ( tryDomain ) {
startKey = g_tagdb.makeDomainStartKey ( m_url );
endKey = g_tagdb.makeDomainEndKey ( m_url );
if ( g_conf.m_logDebugTagdb )
log("tagdb: looking up domain tags for %s",
else {
// usually the site is the hostname but sometimes it is like
// ""
//startKey = g_tagdb.makeStartKey ( m_site );//url );
//endKey = g_tagdb.makeEndKey ( m_site ); // url );
// site lookup: use the key range precomputed in getTagRec()
startKey = m_siteStartKey;
endKey = m_siteEndKey;
if ( g_conf.m_logDebugTagdb )
log("tagdb: looking up site tags for %s",
// get the groupid
//unsigned long groupId = g_tagdb.getGroupId ( startKey );
// get the next mcast
Msg0 *m = &m_msg0s[m_requests];
// and the list this request will fill, indexed by request number
RdbList *listPtr = &m_tagRec->m_lists[m_requests];
// bias based on the top 64 bits which is the hash of the "site" now
//uint32_t gid = g_hostdb.getGroupId ( m_rdbId , &startKey , true );
//Host *group = g_hostdb.getGroup ( gid );
long shardNum = getShardNum ( m_rdbId , &startKey );//, true );
Host *group = g_hostdb.getShard ( shardNum );
//long numTwins = g_hostdb.getNumHostsPerShard();
// use top byte! (last byte of the key's in-memory representation)
uint8_t *sks = (uint8_t *)&startKey;
uint8_t top = sks[sizeof(TAGDB_KEY)-1];
//long hostNum = 0;
//if ( numTwins == 2 && (top & 0x80) ) hostNum = 1;
// TODO: fix this!
//if ( numTwins >= 3 ) { char *xx=NULL;*xx=0; }
// support more than 2 stripes now...
// pick one host within the shard from that byte
long hostNum = top % g_hostdb.getNumHostsPerShard();
long hostId = group[hostNum].m_hostId;
// . launch this request, even if to ourselves
// . TODO: just use msg0!!
bool status = m->getList ( hostId , // hostId
0 , // ip
0 , // port
0 , // maxCacheAge
false , // addToCache
m_rdbId, //RDB_TAGDB ,
m_collnum ,
listPtr ,
(char *) &startKey ,
(char *) &endKey ,
10000000 , // minRecSizes
this , // state
gotMsg0ReplyWrapper ,
m_niceness ,
true , // error correction?
true , // include tree?
true , // doMerge?
-1 , // firstHostId
0 , // startFileNum
-1 , // numFiles
3600*24*365 );// timeout
// all done?
//if ( m_p == m_url->getDomain() ) m_doneLaunching = true;
// error? (getList returned true, i.e. did not block, with g_errno set)
if ( status && g_errno ) {
// g_errno should be set, we had an error
m_errno = g_errno;
return (m_requests == m_replies);
// successfully launched
// NOTE(review): no increment of m_requests is visible here -- confirm
// if we got a reply instantly
if ( status ) m_replies++;
// after the site lookup, go around once more for the domain lookup
if ( ! tryDomain ) { //&&
//! m_skipDomainLookup &&
//m_url->getHostLen() != m_url->getDomainLen() ) {
tryDomain = true;
goto loop;
// no more looping!
// i don't think we need to loop any more because we got all the
// tags for this hostname. then the lower bits of the Tag key
// corresponds to the actual SITE hash. so we gotta filter those
// out i guess after we read the whole list.
return (m_requests == m_replies);
// NOTE(review): everything below follows an unconditional return in the
// visible code and so looks unreachable -- confirm
//m_doneLaunching = true;
//goto loop;
// do not advance m_p if doing the full url first
if ( m_doFullUrl ) {
m_doFullUrl = false;
goto loop;
// . advance m_p
// . we go backwards to better support subdomains that have a ton
// of periods in them...
for ( ; m_p < m_dom && *m_p != '.' ; m_p++ );
// advance over .
if ( m_p != m_dom ) m_p++;
// if another dot that is bad!
if ( *m_p == '.' ) m_errno = EBADURL;
// launch another
goto loop;
// . callback passed to Msg0::getList() by Msg8a::launchGetRequests()
// . "state" is the Msg8a that launched the request
// . records any error, tries to launch more requests and, once
//   launchGetRequests() returns true, calls the caller's callback
void gotMsg0ReplyWrapper ( void *state ) {
Msg8a *THIS = (Msg8a *)state;
// we got one
// NOTE(review): no increment of THIS->m_replies is visible here -- confirm
// error?
if ( g_errno ) THIS->m_errno = g_errno;
// launchGetRequests() returns false if still waiting for replies...
if ( ! THIS->launchGetRequests() ) return;
// get all the replies
// set g_errno for the callback
if ( THIS->m_errno ) g_errno = THIS->m_errno;
// otherwise, call callback
THIS->m_callback ( THIS->m_state );
// . get the TagRec from the replies
// . collects every non-empty reply list into m_tagRec->m_listPtrs[] and then
//   walks all tags, marking a tag as TT_DUP when the upper 32 bits of its
//   key.n0 were already seen on an earlier tag
// . does nothing if m_errno is already set
void Msg8a::gotAllReplies ( ) {
// if any had an error, don't do anything
if ( m_errno ) return;
// scan the lists
for ( long i = 0 ; i < m_replies ; i++ ) {
// breathe
// get list
RdbList *list = &m_tagRec->m_lists[i];
// skip if empty
if ( list->m_listSize <= 0 ) continue;
// panic msg: the list hit the 10000000 byte minRecSizes passed to
// Msg0::getList() in launchGetRequests(), so deliberately crash
if ( list->m_listSize >= 10000000 ) {
log("tagdb: CAUTION!!! cutoff tagdb list!");
log("tagdb: CAUTION!!! will lost useful info!!");
char *xx=NULL;*xx=0;
// otherwise, add to array
m_tagRec->m_listPtrs[m_tagRec->m_numListPtrs] = list;
// advance
// NOTE(review): no increment of m_numListPtrs is visible here -- confirm
// . now scan all the tags for this HOSTNAME
// . filter out tags that are not for a supersite of our url
// . i.e. if our url is
// then hash
// and skip over any tag whose lower 32 bits does not match
// one of those hashes...
// . see where we set Tag::m_key.n0 in Tag::set() above:
// m_key.n0 |= (uint32_t) hash32 ( norm.getUrl(),norm.getUrlLen() );
// where "norm" is the provided site but with a http:// in front
// and a / at the end since Url::set() normalized it
// . m_url is the url we want to get the tags for
// . HACK: right now just restrict to the hostname!
Url norm;
norm.set ( m_url->getHost() , m_url->getHostLen() );
unsigned long siteHash32 = hash32 ( norm.getUrl(),norm.getUrlLen() );
// . and the domain too so we can ban domains
// . this is messed up because we can't just hash the domain, we have
// to hash it like a complete url because that is what Tag::set()
// does when it makes the key's top 32 bits.
unsigned long siteHash32d = 0;
long conti = 0;
siteHash32d = hash32_cont ( "http://",7,siteHash32d,&conti);
// NOTE(review): the rest of this call's argument list is not visible
siteHash32d = hash32_cont ( norm.getDomain(),
siteHash32d = hash32_cont ( "/",1,siteHash32d,&conti);
// the non-del bit i guess. we forgot to shift up when we made
// the key above!
siteHash32 |= 0x01;
siteHash32d |= 0x01;
// NOTE: siteHash32 and siteHash32d are only read by the commented-out
// filter further down
// scan tags in list and set Tag::m_type to TT_DUP if its a dup
Tag *tag = m_tagRec->getFirstTag();
// table of the 32-bit hashes seen so far, backed by a stack buffer
HashTableX cx;
char cbuf[2048];
cx.set ( 4,0,64,cbuf,2048,false,m_niceness,"tagtypetab");
// . loop over all tags in all lists in order by key
// . each list should be from a different suburl?
// . the first list should be the narrowest/longest?
for ( ; tag ; tag = m_tagRec->getNextTag ( tag ) ) {
// breathe
// skip tag if it is not from the proper site. we are
// only guaranteed that all tags in this list are for the
// same HOSTNAME not SITE! site is in the lower bits
// of the tagdb key.
// should fix bug where we were reading
// sitenuminlinks from that tag and was always 0!! even
// when we'd add a count of 2k to the site...
// now filter out's tags!
// TODO: allow multiple different siteHash32 values to match
// here, use one siteHash32 for each possible suburl of "m_url"
// so if m_url is "" then we also
// can match hash32("" not just
// "" which is how it is now.
//unsigned long th32 = tag->m_key.n0 & 0xffffffff;
//if ( th32 != siteHash32 && th32 != siteHash32d ) {
// // maybe use TT_DIFFSITE instead of this! TODO!
// tag->m_type = TT_DUP;
// continue;
// form the hash! (upper 32 bits of the tag key's n0)
uint32_t h32 = (unsigned long)((tag->m_key.n0) >> 32);
// skip if not unique
//if ( ! isTagTypeUnique ( tag->m_type ) ) continue;
// otherwise, record it: seen before means dup, else remember it
if ( cx.isInTable(&h32 ) ) // tag->m_type) )
tag->m_type = TT_DUP;
else if ( ! cx.addKey(&h32) ) {
// addKey failed, save the error
m_errno = g_errno;
// . get the TagRec from the replies
// . scores every TagRec in the reply lists against m_url with
//   Tagdb::getMatchPoints(), keeps up to 128 matching recs, sorts them by
//   score (highest first) and then copies their tags into m_tagRec, letting
//   a tag type be contributed only by the first (best) rec that has it
// NOTE(review): this body references the labels "bubble" and "tagLoop" and
// the member "m_coll" (commented out in Msg8a::getTagRec); the labels are
// not visible here, so this code is presumably disabled or partly missing
void TagRec::gotAllReplies ( ) {
// if any had an error, don't do anything
if ( m_errno ) return;
// time how long this takes and log it
long long startTime = gettimeofdayInMilliseconds();
// how many TagRecs we matched
long n = 0;
// arrays for pointing to best matching TagRecs
//char *data [128];
//long dataSizes [128];
//long dataScores [128];
char *recs [128];
long recScores [128];
// . each reply is a list of TagRecs
// . each TagRec is a standard Rdb record
// . key|dataSize|data...
// . go through all TagRecs and sort our list of ptrs to the
// best TagRecs
// . some TagRecs will not even match, so do not include those in
// our list of pointers
// . the closest matching TagRecs will be on top
// . inherit Tags from lesser matching TagRecs provided there
// is no such Tag::m_type from a closer matching TagRec
// . if is banned and has a 0 score for the
// ST_BANNED Tag, then it is effectively "unbanned" and should
// not inherit the score from for ST_BANNED.
// . so by scanning each TagRec in order, we compose our own
// final merged TagRec that may have a lot more Tags in it
// than any one matching TagRec
for ( long i = 0 ; i < m_replies ; i++ ) {
// get the list from this reply
RdbList *list = &m_lists[i];
// scan list
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
// break if overflow (recs[] and recScores[] hold 128 entries)
if ( n >= 128 ) break;
// get next rec
//char *d = list->getCurrentData ();
//long dsize = list->getCurrentDataSize();
char *rec = list->getCurrentRec();
// set TagRec to it
TagRec *gr = (TagRec *)rec;
// get the site
//char *site = gr->getString(ST_SITE,NULL);
char *site = gr->getString("site",NULL);
// sanity check: every rec must carry a "site" tag, else crash
if ( ! site ) { char *xx=NULL;*xx=0; }
// make it a url
Url u;
u.set ( site , gbstrlen(site) );
// score it
long s = g_tagdb.getMatchPoints ( &u , m_url );
// skip it if not a match
if ( s <= 0 ) continue;
// save it
//data [n] = d;
//dataSize [n] = dsize;
recs [n] = rec;
recScores [n] = s;
// NOTE(review): no increment of "n" is visible here -- confirm
// if no recs, we did not match anything
if ( n == 0 ) return;
// or on error
if ( m_errno ) return;
// bubble sort the recs by their scores, highest score first
bool swapped = false;
for ( long i = 1 ; i < n ; i++ ) {
// keep going if in correct order
if ( recScores[i-1] >= recScores[i] ) continue;
// swap
char *t1 = recs [i-1];
long t2 = recScores [i-1];
recs [i-1] = recs [i];
recs [i ] = t1;
recScores [i-1] = recScores [i];
recScores [i ] = t2;
swapped = true;
// repeat the pass until no swap happens
if ( swapped ) goto bubble;
// parse the best matching SiteData
//TagRec gr ; gr.set ( data[0] , dataSizes[0] );
// use the site from the best matching TagRec as our site
//m_siteUrl.set ( gr.getSite() , gr.getSiteLen() );
// reset the inheritance array
//char array[ST_LAST_TAG];
//memset ( array , -1 , 256 );
// maps a tag type to the index of the rec that first supplied it
HashTable ia;
char ibuf [ 1024 * 8 ];
ia.set ( 1024 , ibuf , 1024 * 8 );
// we just store the tags, ptrs into the tags in the m_lists
//Tag *tags[MAX_TAGS];
// assume we got no tags
//long numTags = 0;
// size of all tags
//long size = 0;
// set our new tag rec
// . only get tags from the first matching tag rec if we should not
// do the inheritance loop
// . if they click "get rec" on PageTagdb, then do not do inheritance,
// but if they click "get tags", then do it!
if ( ! m_doInheritance && n > 0 ) n = 1;
// . DO NOT INHERIT ANYTHING FROM TAG RECS that have a sitePathDepth
// tag in them UNLESS the sitePathDepth does not work on us
// . i.e. if has a sitePathDepth of 2 in its TagRec and the
// url we are looking at is then we must assume that
// out site is we are an independent subsite of
// and inherit nothing from it
SiteGetter siteGetter;
// site getter sometimes adds recs to tagdb to add in a new subsite
// it finds... i'd imagine this will create a parsing inconsistency
// when injecting docs into the "test" coll... but oh well!
long timestamp = getTimeGlobal();
// . begin the "inheritance loop"
// . fill our m_tags[] array with the Tags that apply to us
for ( long i = 0 ; i < n ; i++ ) {
// breathe
// parse the TagRec (very fast)
TagRec *gr = (TagRec *)recs[i];
// is "url" an independent subsite of gr's site?
char *us = m_url->getUrl();
bool st=siteGetter.getSite(us,gr,timestamp,m_coll,m_niceness );
// sanity check, not allowed to block since state is NULL!
if ( ! st ) { char *xx=NULL;*xx=0; }
// are we independent subsite? if so, do not inherit
// from that. this is used to prevent
// from gaining the benefits of being on the
// site. TODO later: we should make another tag to indicate
// a subsite is explicitly independent. but for now we rely
// on the "sitepathdepth" tag automatically computed by
// SiteGetter.cpp.
//if ( siteGetter.isIndependentSubsite() ) continue;
// TODO:
// NONO, just do not inherit sitenumlinks or any tag
// that is marked as such!!! add a new flag to the tags!!!!!!
// always add the ST_SITE tag first from each tag so we know
// what site the other tags belong to
//Tag *stag = gr->getTag ( ST_SITE );
Tag *stag = gr->getTag ( "site" );
// only add if non null
if ( stag ) m_tagRec->addTag ( stag );
// last tag
Tag *last = NULL;
// loop over all tags in TagRec #i
// get the tag id of current tag
Tag *tag = gr->getNextTag ( last );
// assign
last = tag;
// was that the end of the tags? if so, go to next TagRec
if ( ! tag ) continue;
// get tag id
long tagType = tag->m_type;
// skip all ST_SITE tags, we added those first above
//if ( tagType == ST_SITE ) goto tagLoop;
if ( tag->isType("site") ) goto tagLoop;
// sanity check
//if ( tagType >= ST_LAST_TAG ) { char *xx=NULL;*xx=0;}
// for getting the next tag, remember this
last = tag;
// . have we added this yet?
// . if tagType added from a prev TagRec do not "inherit" it
//if(array[tagType] != -1 && array[tagType] != i) goto tagLoop;
long slot = ia.getSlot ( tagType );
if ( slot >= 0 && ia.getValueFromSlot(slot) != i) goto tagLoop;
// if tag type is "eventtag" then only add it if the site of this
// tagrec EQUALS our url. exact match... that way we make sure to only
// tag a single url, otherwise we might accidentally tag an entire site.
if ( tag->isType("eventtag") ) {
// must be in tagRec that matches us the closest
if ( i != 0 ) goto tagLoop;
// if no site, skip it
if ( ! stag ) goto tagLoop;
// and even then must match site exactly
char *site = stag->m_data;
// as string
char *url = m_url->getUrl();
long ulen = m_url->getUrlLen();
// skip our proto (http://)
url += m_url->getSchemeLen() + 3;
ulen -= m_url->getSchemeLen() + 3;
// remove trailing /
if ( ulen > 0 && url[ulen-1] == '/' ) ulen--;
// likewise for site
long slen = gbstrlen(site);
if ( slen > 0 && site[slen-1] == '/' ) slen--;
// skip if not exact
if ( slen != ulen ) goto tagLoop;
// compare, must match exactly, if not, do not add tag
if ( strncmp(url,site,slen) != 0 ) goto tagLoop;
// ok, add/inherit it
//tags[numTags++] = tag;
// add it directly to m_tagRec
if ( ! m_tagRec->addTag ( tag ) ) {
log("tagdb: addTag failed: %s",mstrerror(g_errno));
m_errno = g_errno;
// add in size
//size += tag->getSize();
// note it, so we do not add/inherit it from another TagRec
//array[tagType] = i;
ia.addKey ( tagType , i );
// add more tags
goto tagLoop;
// sanity!
//if ( size > 32000 ) { char *xx=NULL;*xx=0; }
//if ( size + 2 + 2 > MAX_TAGREC_SIZE ) { char *xx=NULL;*xx=0; }
// then copy the tags into the buffer
//for ( long i = 0 ; i < numTags ; i++ )
// m_tagRec->addTag ( tags[i] );
// sanity check
//if ( p - m_tagRec > MAX_TAGREC_SIZE ) { char *xx=NULL;*xx=0;}
// free the mem
// time it, and log only if it took more than 10ms
long long took = gettimeofdayInMilliseconds() - startTime;
if(took>10) log(LOG_INFO, "admin: gotreply for msg8a took %lli",took);
// . Msg9a : for modifying TagRecs in Tagdb
// . the constructor starts with no request buffer and zeroed counters so
//   that reset() sees the object as idle
Msg9a::Msg9a () {
m_requestBuf = NULL;
m_requests = 0;
m_replies = 0;
// the destructor just hands off to reset(), which frees m_requestBuf if set
Msg9a::~Msg9a ( ) {
	reset ( );
}
// . frees the request buffer, if any, and NULLs the pointer
// . deliberately crashes (NULL write) if replies are still outstanding and
//   the process is not exiting
void Msg9a::reset() {
// guard against not waiting for all replies to come in
if ( m_requests != m_replies && ! g_process.m_exiting ) {
char *xx=NULL;*xx=0; }
// nothing allocated, nothing to free
if ( ! m_requestBuf ) return;
mfree ( m_requestBuf , m_requestBufSize , "msg9a" );
m_requestBuf = NULL;
// . returns false if blocked, true otherwise
// . sets errno on error
// . "sites" is a NULL-terminated list of whitespace-separated urls
// . if "addTags" is true, then the tags in "tagRec" will be added to
// the TagRecs specified by the sites in "sites". if a TagRec
// does not exist for a given "site" then it will be added just
// so we can add the Tags to it. If it does exist, we will
// just append the given Tags to it.
// . to "delete" a tag, just assign it a dataSize of 0!
// . Tags added with the same user name and tag type of an existing tag
// will overwrite it.
// . you can now optionally supply an array of ptrs to sites, sitePtrs.
// . you can call this with your "tagRec" on the stack because we copy
// its contents into our own buffer here
// . exactly one of "sites" / "sitePtrs" may be given, and "ipVector" is
//   only allowed together with "sitePtrs"; "tagRec" must hold at least one
//   tag. violating any of these deliberately crashes.
// . each site string is temporarily NUL-terminated in place below, so the
//   site buffers must be writable
// . builds one request per site in m_requestBuf, laid out as
//   <4-byte request size><niceness byte><coll>\0<flag byte><TagRec>
//   and then hands them to launchAddRequests()
bool Msg9a::addTags ( char *sites ,
char **sitePtrs ,
long numSitePtrs ,
char *coll ,
void *state ,
void (*callback)(void *state) ,
long niceness ,
TagRec *tagRec ,
bool nukeTagRecs ,
long *ipVector ) {
// in case we are being re-used!
g_errno = 0;
// sanity check, one or the other
if ( sites && sitePtrs ) { char *xx=NULL;*xx=0; }
// ipVector only used with sitePtrs for now
if ( ! sitePtrs && ipVector ) { char *xx=NULL;*xx=0; }
// when we add the "site" tag to it use the timestamp from one
// of the tags we are adding... therefore we must require there be
// some tags! we do this to insure injection consistency into the
// "test" collection.
if ( ! tagRec || tagRec->getNumTags() <= 0 ) { char *xx=NULL;*xx=0; }
// use the first timestamp
long timestamp = tagRec->getFirstTag()->m_timestamp;
// . up to 20 outstanding Msg0 getting the exact TagRec for each site
// . when we get it we immediately modify it and then add it back
// using Msg4.
// . to resolve collisions we could assign a particular hostid
// to handle adding each site... yeah, how about the local host.
// . so forward the Msg9a add/del/rpl request to the responsible
// host. then it can lock the "site" until the add completes.
// . it should use Msg1 to add it.
// reset
m_errno = 0;
m_requests = 0;
m_replies = 0;
m_niceness = niceness;
m_state = state;
m_callback = callback;
long collLen = gbstrlen(coll);
// how many urls in the sites do we have?
long numUrls = 0;
// point to buf
char *s = sites;
// count each one
while ( sites && *s ) {
// skip whitespace
while ( *s && is_wspace_a(*s) ) s++;
// alnum?
if ( *s ) numUrls++;
// skip url
while ( *s && ! is_wspace_a(*s) ) s++;
// when given ptrs instead, the count is simply the number of ptrs
if ( sitePtrs )
numUrls = numSitePtrs;
// how much buf do we need to hold all the requests for all the sites
long need = 0;
// just a buffer of sites
if ( sites )
need += 2 * (gbstrlen(sites) + 1);
// otherwise, use the site ptrs
for ( long i = 0 ; i < numSitePtrs ; i++ )
need += 2 * (gbstrlen(sitePtrs[i]) + 1);
// how big is each request's header?
long header = 0;
// request size
header += 4;
// niceness
header += 1;
// collection
header += collLen + 1;
// flag
header += 1;
// the tag rec
header += tagRec->getSize();
// . add ST_SITE to each tagRec
// . we already accounted for the sites in the gbstrlen() above
header += sizeof(Tag);
// one header per url
need += header * numUrls;
// make a request buffer for all the requests
m_requestBuf = (char *)mmalloc ( need , "msg9a-add");
// alloc failed, return without blocking
if ( ! m_requestBuf ) return true;
m_requestBufSize = need;
// carve it up
char *p = m_requestBuf;
// loop over sites
s = sites;
// reset sitePtr counter in case we are using those
long si = 0;
//long now = getTimeGlobal();
// loop it
for ( ; ; si++ ) {
// stop if all done
if ( sites && ! *s ) break;
// or this
if ( sitePtrs && si >= numSitePtrs ) break;
// make "s" point to the site if we are using ptrs
if ( sitePtrs ) s = sitePtrs[si];
// skip whitespace
while ( *s && is_wspace_a(*s) ) s++;
// skip over http:// (wastes space); the compare is case-sensitive
if ( strncmp(s,"http://",7)==0 ) s += 7;
// find end of url
char *send = s;
while ( *send && ! is_wspace_a(*send)) send++;
// get the length
long len = send - s;
// done? make sure we are using the site buffer and not ptrs
if ( sites && ! *s ) break;
// a place holder for the request size
long *rsizePtr = (long *)p; p += 4;
// track the size
char *start = p;
// first niceness
*p = niceness; p++;
// then coll
memcpy ( p , coll , collLen ); p += collLen;
// NULL term
*p++ = '\0';
// add flag first
*p = 0x00;
//if ( deleteTags ) *p = 0x01;
if ( nukeTagRecs ) *p = 0x02; // delete entire TagRec?
// NOTE(review): no advance of "p" past the flag byte is visible before the
// TagRec is copied to "p" below -- confirm
// now make the Tag!
//TagRec *tagRec = (TagRec *)p;
// sets its ip special if we should
long ip = 0;
if ( ipVector ) ip = ipVector[si];
// . copy it over
// . get the size
long size = tagRec->getSize();
// add in tagRec
memcpy ( p , tagRec , size );
// cat it to p
TagRec *newgr = (TagRec *)p;
// NULL terminate it temporarily
char c = s[len];
s[len] = 0;
// . remove the old site so the new one can replace it
// . we already contain a SITE_TAG and addTag() will NEVER
// replace that particular tag...
// . this is now removed above
//newgr->removeTag ( "site" , NULL );
// add the site
//newgr->addTag ( ST_SITE, now,"tagdb",0,s, len+1 );
newgr->addTag ( "site", timestamp,"tagdb",ip,s, len+1 );
// undo the NULL termination
s[len] = c;
// update the size
size = newgr->getSize();
// advance
p += size;
// how big was the request (excluding the 4-byte size itself), store that
*rsizePtr = (p - start);
// advance s
s = send;
// reset ptr to request to launch
m_p = m_requestBuf;
// sanity check: we must not have written past the buffer
if ( p - m_requestBuf > need ) { char *xx=NULL;*xx=0; }
// all done
m_pend = p;
// launch them
if ( ! launchAddRequests () ) return false;
// hey that should always block!
if ( ! g_errno ) { char *xx=NULL; *xx=0; }
// show error
log("tagdb: msg9a: %s",mstrerror(g_errno));
// free the allocated mem
// did not block...
return true;
// . "dumpFile" format contains one tag record per line as
// dumped from './gb dump S main 0 -1 1' cmd line cmd.
// . it is the format given by the TagRec::printToBuf() cmd
// . "dumpFile" is the NUL-terminated text itself (it is scanned in memory
//   up to its gbstrlen()), not a filename
// . makes two passes over the lines: the first adds up how many bytes the
//   requests need, the second builds the requests in m_requestBuf
// . each request is laid out as
//   <4-byte request size><niceness byte><coll>\0<flag byte><TagRec>
// . returns whatever launchAddRequests() returns, or true if the buffer
//   could not be allocated
bool Msg9a::addTags ( char *dumpFile ,
char *coll ,
void *state ,
void (*callback)(void *state) ,
long niceness ) {
g_errno = 0;
// reset
m_errno = 0;
m_requests = 0;
m_replies = 0;
m_niceness = niceness;
m_state = state;
m_callback = callback;
long collLen = gbstrlen(coll);
// scan the dump file
char *p = dumpFile;
// the end of it
char *pend = p + gbstrlen(p);
// add up total sizes
long sum = 0;
// end of line ptr
char *eol;
// count
long count = 1;
// debug
//HashTable ht;
// do the scan (pass 1: size everything up)
for ( ; p < pend ; p = eol + 1 ) {
// point to next line
eol = p; while ( eol < pend && *eol != '\n' ) eol++;
// a fake tag rec
TagRec gr;
// . scan it into "gr"
// . returns size of the tag rec stored into "buf"
long bytesScanned = gr.setFromBuf ( p , eol );
// error? skip this line
if ( bytesScanned <= 0 ) {count++; continue;}
// get size
long size = gr.getSize();
// error? skip this line
if ( size <= 0 ) {count++; continue;}
//logf(LOG_DEBUG,"tagdb: tag %li size=%li",count++,size);
// hash it for debug
//ht.addKey ( count , size );
// sanity check
if ( size > MAX_TAGREC_SIZE ) { char *xx=NULL;*xx=0;}
// sanity check: every rec must carry a "site" tag, else crash
char *site = gr.getString("site",NULL);
if ( ! site ) { char *xx=NULL;*xx=0;}
// then request header size:
// request size + niceness + coll + its NUL + flag
size += 4 + 1 + collLen + 1 + 1;
// increment total size
sum += size;
// make the buf
m_requestBuf = (char *)mmalloc ( sum , "msg9adbuf");
m_requestBufSize = sum;
// store tags here
char *t = m_requestBuf;
// return true on error with g_errno set
if ( ! t ) return true;
// reset to beginning of file
p = dumpFile;
// reset
count = 1;
// do the scan (pass 2: build the requests)
for ( ; p < pend ; p = eol + 1 ) {
// point to next line
eol = p; while ( eol < pend && *eol != '\n' ) eol++;
// first is the request size
long *requestSizePtr = (long *)t; t += 4;
// see how big the request is
char *a = t;
// then niceness (always MAX_NICENESS here, not the "niceness" parm)
*t++ = (char)MAX_NICENESS;
// then coll
memcpy ( t , coll , collLen ); t += collLen;
// null terminate
*t++ = '\0';
// then the 1 byte flag (0 means add?)
*t++ = 0;
// store TagRec into the request buffer
TagRec *gr = (TagRec *)t;
// . scan it into "t"
// . returns size of the tag rec stored into "buf"
long bytesScanned = gr->setFromBuf ( p , eol );
// error? back "t" up over the header we just wrote
if ( bytesScanned <= 0 ) {
log("tagdb: skipping tag rec #%li.",count++);
t -= (4+1+collLen+1+1);
// get size
long size = gr->getSize();
// error? back "t" up over the header we just wrote
if ( size <= 0 ) {
log("tagdb: skipping tag rec #%li.",count++);
t -= (4+1+collLen+1+1);
// test it
//long slot = ht.getSlot ( count );
//if ( slot < 0 ) { char *xx=NULL;*xx=0; }
//long shouldbe = ht.getValueFromSlot ( slot );
//if ( size != shouldbe ) { char *xx=NULL;*xx=0; }
//logf(LOG_DEBUG,"tagdb: tag %li size=%li",count++,size);
// increment storage ptr
t += size;
// store the size of the WHOLE REQUEST, does not
// include the request size itself. see
// launchRequests() below.
*requestSizePtr = (t - a);
// sanity check
if ( *requestSizePtr > 10000 ) { char*xx=NULL;*xx=0;}
// sanity check: pass 2 must have used exactly what pass 1 counted
if ( t - m_requestBuf != sum ) { char *xx=NULL;*xx=0; }
// use their ptrs for adding these tag recs
m_p = m_requestBuf;
m_pend = m_requestBuf + m_requestBufSize ;
// now add those tags
return launchAddRequests ( );
// . returns false if blocked, true otherwise
// . sets g_errno and returns true on error
bool Msg9a::launchAddRequests ( ) {
// clear it
g_errno = 0;
// return true if nothing to launch
if ( m_p >= m_pend ) return (m_requests == m_replies);
// don't bother if already got an error
if ( m_errno ) return (m_requests == m_replies);
// limit max oustanding to 20
if (m_requests - m_replies >= 20 ) return (m_requests==m_replies);
// take a breath
// parse our request
char *p = m_p;
// first is the request size
p += 4;
// then niceness
p += 1;
// then coll
p += gbstrlen(p) + 1;
// then the 1 byte flag
// then the tag rec
TagRec *tagRec = (TagRec *)p;
// . get the groupid
// . tagRec's key should already be valid because when you add
// a ST_SITE to a TagRec it sets TagRec::m_key (special thing)
//unsigned long groupId = g_tagdb.getGroupId ( &tagRec->m_key );
uint32_t shardNum = getShardNum ( RDB_TAGDB , &tagRec->m_key );
// get the host to send to
Host *hosts = g_hostdb.getGroup ( groupId );
// select a host in the group
long hostNum = tagRec->m_key.n1 % g_hostdb.getNumHostsPerShard();
// and his ptr
Host *h = &hosts[hostNum];
// get the next mcast
//Multicast *m = &m_casts[m_requests];
// reqeust size
long requestSize = *(long *)m_p; m_p += 4;
char *request = m_p; m_p += requestSize;
// . send to just one very specific host so he is the only one that
// controls modification to this particular tagdb rec. that way if
// we are changing its Tags we do not collide with another.
// . this returns false and sets g_errno on error
UdpServer *us = &g_udpServer;
bool status = us->sendRequest ( request ,
requestSize ,
0x9a ,
h->m_ip , // bestIp
h->m_port , // destPort
h->m_hostId , // hostId
NULL , // slotPtr
this , // state
gotReplyWrapper9a , // callback
365*24*3600 , // timeout
-1 , // backoff
-1 , // max wait in ms
NULL , // replybuf
0 , // replybufMaxSize
m_niceness );
// error?
if ( ! status ) {
// g_errno should be set, we had an error
m_errno = g_errno;
return (m_requests == m_replies);
// successfully launched
// launch another
goto loop;
// . callback passed to UdpServer::sendRequest() by
//   Msg9a::launchAddRequests()
// . "state" is the Msg9a that sent the request
// . saves the first error seen, tries to launch more requests and, once
//   launchAddRequests() returns true, calls the caller's callback
void gotReplyWrapper9a ( void *state , UdpSlot *slot ) {
Msg9a *THIS = (Msg9a *) state;
// don't let him free our send buf, it is m_requestBuf
// which we allocated above
slot->m_sendBufAlloc = NULL;
// NOTE(review): no increment of THIS->m_replies is visible here -- confirm
// error? if so, save it (only the first error is kept)
if ( g_errno && ! THIS->m_errno ) THIS->m_errno = g_errno;
// returns false if still waiting on replies
if ( ! THIS->launchAddRequests() ) return;
// free the allocated mem
THIS->m_callback ( THIS->m_state );
// . per-request state for handleRequest9a(): one is allocated for each
//   incoming 0x9a request and it is freed in sendReply9a()
// . NOTE(review): no access specifier or class terminator is visible in this
//   listing even though the handlers below touch the members directly --
//   confirm against the full file
class State9a {
// udp slot the request arrived on; the reply goes back on it
UdpSlot *m_slot;
// used to read the existing record list from RDB_TAGDB
Msg5 m_msg5;
// request type byte parsed from the request (0x02 is a delete request)
char m_requestType;
// used to add the resulting list back
Msg1 m_msg1;
// list filled by m_msg5 and later re-set to the list handed to m_msg1
RdbList m_list;
// . this has all the tags we need to add/remove/replace
// . points into the request buffer (slot->m_readBuf)
TagRec *m_tagRec;
// this has the original tagRec and we modify it with "m_tagRec"
// to get the final TagRec we add back to Tagdb. it is the
// "accumulator" tagdb record.
TagRec m_accRec;
// enough mem to store a key_t and a 0 dataSize (long)
char m_tmp[12+4];
// niceness byte parsed from the request
char m_niceness;
// collection name; points into the request buffer
char *m_coll;
// linked list of ppl waiting in line to make mods
class State9a *m_next;
//class State9a *m_tail;
// . handler for incoming 0x9a requests (add/modify/delete a TagRec)
// . request layout as parsed below: 1 byte niceness, NUL-terminated
//   collection name, 1 byte request type, then the TagRec itself
// . a request for a TagRec that someone else is already modifying is chained
//   onto the State9a that holds the lock in s_lockTable2
// . the "niceness" parameter is not referenced by the visible statements; the
//   niceness byte parsed from the request is used instead
// . NOTE(review): this listing has unbalanced braces and several branches
//   with no visible terminating statement, so the control flow between the
//   steps below should be confirmed against the full file
void handleRequest9a ( UdpSlot *slot , long niceness ) {
// get the request
char *request = slot->m_readBuf;
long requestSize = slot->m_readBufSize;
// overflow protection for corrupt requests
// NOTE(review): g_errno is not visibly set before it is sent as the error
// code here
if ( requestSize < 4 ) {
g_udpServer.sendErrorReply ( slot , g_errno );
// make a new State9a to hold everything for this request
State9a *st ;
try { st = new (State9a); }
catch ( ... ) {
g_errno = ENOMEM;
// NOTE(review): %i is paired with a size_t argument here
log("msg9a: new(%i): %s", sizeof(State9a), mstrerror(g_errno));
return g_udpServer.sendErrorReply ( slot, g_errno );
// register the allocation (label here is "Msg10", the matching mdelete()
// calls use "msg9afr")
mnew ( st , sizeof(State9a) , "Msg10" );
// parse the request
char *p = request;
// save slot for sending reply
st->m_slot = slot;
// get niceness
st->m_niceness = *(char *)p; p++;
// get coll
st->m_coll = p; p += gbstrlen(p) + 1;
// save this
st->m_requestType = *p; p++;
// the "tagRec" is the record
TagRec *tagRec = (TagRec *)p; p += tagRec->getSize();
// store ptr
st->m_tagRec = tagRec;
// reset this, we are the head/tail of the linked list so far
st->m_next = NULL;
// sanity check
//char *site = tagRec->getString(ST_SITE,NULL);
char *site = tagRec->getString("site",NULL);
// this is a no-no: a TagRec without a "site" tag forces a crash
if ( ! site ) { char *xx=NULL;*xx=0;}
// no tail after us
//st->m_tail = NULL;
// . get the lock on this site
// . the lower 64 bits of the key should be the url hash
long slotNum = s_lockTable2.getSlot ( &st->m_tagRec->m_key.n0 );
// if already in there, we have to wait because someone is already
// making mods to this TagRec
if ( slotNum >= 0 ) {
// log this for now?
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"tagdb: TAGDB handleRequest9a "
"waiting for lock st=0x%lx key.n0=%llu",(long)st,
State9a *p ;
p = *(State9a **)s_lockTable2.getValueFromSlot(slotNum);
// put us right after him in the linked list
st->m_next = p->m_next;
p->m_next = st;
// we could be the next in line
//if ( ! p->m_next ) p->m_next = st;
// we wait...
// nobody holds the lock: add our key to the lock table, with a pointer to
// our state as the value
if ( ! s_lockTable2.addKey ( &st->m_tagRec->m_key.n0 , &st ) ) {
log("tagdb: failed to get lock : %s",mstrerror(g_errno));
// free our state since we are bailing out with an error reply
mdelete ( st , sizeof(State9a),"msg9afr");
delete (st);
return g_udpServer.sendErrorReply ( slot, g_errno );
// make a startKey and endKey from the tagRec's key
key_t startKey = tagRec->m_key;
key_t endKey = tagRec->m_key;
// startkey gets its low bit cleared though
startKey.n0 &= 0xfffffffffffffffeLL;
// delete record request, no need to look it up
if ( st->m_requestType == 0x02 ) {
// note it
SafeBuf sb; tagRec->printToBuf ( &sb );
log("tagdb: deleting TagRec for site %s",sb.getBufStart());
// use tmp buf in st
char *p = st->m_tmp;
// store key in the tmp buf (startKey, i.e. with the low bit cleared)
*(key_t *)p = startKey;
// advance
p += sizeof(key_t);
// and store the data size
*(long *)p = 0;
// advance
p += 4;
// set the list (just a negative rec in it)
st->m_list.set ( st->m_tmp , // list
4+sizeof(key_t) , // listSize
st->m_tmp , // alloc
4+sizeof(key_t) , // allocSize
(char *)&startKey , // startKey
(char *)&endKey , // endKey
-1 , // fixedDataSize
false , // ownData?
false , // useHalfKeys?
sizeof(key_t) );// keySize
if ( ! st->m_msg1.addList( &st->m_list ,
st->m_coll ,
st ,
sendReply9a ,
false , // forceLocal?
st->m_niceness ))
// return if blocked
sendReply9a( st );
// . get from msg5, return if it blocked
// . will probably not block since in the disk page cache a lot
if ( ! st->m_msg5.getList ( RDB_TAGDB ,
st->m_coll ,
&st->m_list ,
startKey ,
endKey ,
100000 , // minRecSizes
true , // include tree?
false , // addtocache?
0 , // maxcacheage
0 , // startfilenum
-1 , // numFiles
st ,
gotList ,
st->m_niceness ,
true ))// do err correction?
// log that for debug
//log("tagdb: msg5 call did not block. st=%lu",(long)st);
// sanity check - why not block if it had corruption?
if ( st->m_msg5.m_msg3.m_hadCorruption ) { char *xx=NULL;*xx=0; }
// it did not block...
gotList( st , NULL , NULL );
// . callback for the Msg5 read started in handleRequest9a(); also called
//   directly with NULL list/msg5 pointers when that read did not block
// . copies the record that was read into the accumulator TagRec, applies the
//   tags from the request and hands the result to Msg1
// . "xxx" and "yyy" are not referenced by the visible statements; the list is
//   taken from the state instead
// . NOTE(review): "goto loop" below has no visible label in this listing, so
//   where the queue walk re-enters should be confirmed against the full file
void gotList ( void *state , RdbList *xxx , Msg5 *yyy ) {
// cast our state class
State9a *st = (State9a *)state;
// return right away if error getting the rec
if ( g_errno ) { sendReply9a ( st ); return; }
// note it
//log("tagdb: in gotlist st=%lu",(long)st);
// this is the TagRec rdb record
char *rec = st->m_list.getList ();
long recSize = st->m_list.getListSize();
// the accumulator TagRec lives in the state
TagRec *accRec = &st->m_accRec;
// reset in case not in tagdb and rec/recSize is NULL/0
// copy it to our accumulator rec which has room to grow, the list
// does not
// NOTE(review): no check of recSize against the size of m_accRec is visible
memcpy ( (char *)accRec , rec , recSize );
// free that list buffer now, we copied it into a larger buffer
// clear it
g_errno = 0;
// . add/remove the tags from the tagRec
// . add will replace tags with the same tag id and username
// . should deal with "negative" tags (addDelTag())
//if ( st->m_requestType == 0x00 ) accRec->addTags ( st->m_tagRec );
//else accRec->removeTags ( st->m_tagRec );
accRec->addTags ( st->m_tagRec );
// was there an error? abandon all operations on this TagRec if so
if ( g_errno ) { sendReply9a ( st ); return; }
// perform operations on others in the queue
st = st->m_next;
// debug for now
if ( st && g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"tagdb: calling lock for st=0x%lx",(long)st);
// if there was one, do it
if ( st ) goto loop;
// reset to original parent
st = (State9a *)state;
// debug msg
SafeBuf sb; accRec->printToBuf ( &sb );
log(LOG_DEBUG,"tagdb: adding to tagdb: %s",sb.getBufStart());
// point the list at the accumulator rec; "ownData?" is passed as false
st->m_list.set ( (char *)accRec , // list
accRec->getSize() , // listSize
(char *)accRec , // alloc
accRec->getSize() , // allocSize
(char *)&accRec->m_key , // startKey
(char *)&accRec->m_key , // endKey
-1 , // fixedDataSize
false , // ownData?
false , // useHalfKeys?
sizeof(key_t) );// keySize
// add it back after the mods
if ( ! st->m_msg1.addList( &st->m_list ,
st->m_coll ,
st ,
sendReply9a ,
false , // forceLocal?
MAX_NICENESS ))// niceness
// i guess we did not block! send back the reply...
sendReply9a ( st );
// . final step of a 0x9a request; also the callback given to Msg1::addList()
// . releases the lock in s_lockTable2, sends an error reply if g_errno is set
//   (an empty reply otherwise), frees the State9a and moves on to the next
//   State9a chained behind it
// . NOTE(review): "goto loop" below has no visible label in this listing;
//   confirm where the per-waiter loop starts in the full file
void sendReply9a ( void *state ) {
// cast our state class
State9a *st = (State9a *)state;
// delete our slot from the lock table
s_lockTable2.removeKey ( &st->m_tagRec->m_key.n0 );
// log it
if (g_errno) log("tagdb: msg9a failed to add: %s",mstrerror(g_errno));
// save it, in case a function below clears g_errno
long saved = g_errno;
if ( saved ) g_udpServer.sendErrorReply( st->m_slot,saved);
// send empty reply
else g_udpServer.sendReply_ass(NULL,0,NULL,0,st->m_slot);
// save old guy
State9a *next = st->m_next;
// free him, we sent his reply
mdelete ( st , sizeof(State9a),"msg9afr");
delete (st);
// repeat for each guy waiting in line
st = next;
// if there was one, do it
if ( st ) goto loop;
// reset to original parent
// NOTE(review): "state" was deleted above, so this pointer must not be
// dereferenced after this point
st = (State9a *)state;
// OTHER functions
// . piecewise-linear lookup: given the n points (x[i],y[i]) return the y
//   value that corresponds to "X"
// . "x" is expected to be in ascending order
// . if X is at or before the first point we return y[0], if it is past the
//   last point we return y[n-1], otherwise we interpolate linearly between
//   the two points that bracket X
// . returns 0 if there are no points at all (n <= 0)
// . the math is done in long long with integer division and the result is
//   narrowed to "long" on return
long getY ( long long X , long long *x , long long *y , long n ) {
	// no points? then there is nothing we may read from x[] or y[]
	if ( n <= 0 ) return 0;
	// if we only have one point then there'll be no interpolation
	if ( n == 1 ) return y[0];
	// find the first point whose x is at or after our "X"
	long j = 0;
	while ( j < n && x[j] < X ) j++;
	// before/after first/last point means we don't have to interpolate
	if ( j == 0 ) return y[0];
	if ( j == n ) return y[n-1];
	// . linear interpolate between our 2 points (x0,y0) and (x1,y1)
	// . the scan above stopped at the first x[j] >= X, so we know that
	//   x[j-1] < X <= x[j]. that makes x1 - x0 strictly positive, so the
	//   division below can never be by zero
	long long x0 = x[j-1];
	long long x1 = x[j];
	long long y0 = y[j-1];
	long long y1 = y[j];
	// we have a sloping line
	return y0 + ( (X - x0) * (y1 - y0) ) / (x1 - x0);
}
// sendPageTagdb() is the HTML interface to tagdb
// forward declarations for the page handler's callbacks and helpers, all of
// which take the State12 of the request (as a void * for the callbacks)
static void sendReplyWrapper ( void *state ) ;
static void sendReplyWrapper2 ( void *state ) ;
static bool sendReply ( void *state ) ;
static bool sendReply2 ( void *state ) ;
static bool getTagRec ( class State12 *st );
// don't change name to "State" cuz that might conflict with another
// . per-request state for sendPageTagdb(); allocated there and freed in
//   sendReply2() (or on the early error paths)
// . NOTE(review): no access specifier or class terminator is visible in this
//   listing -- confirm against the full file
class State12 {
//Msg9a m_msg9a;
// socket the http request came in on; the page is sent back on it
TcpSocket *m_socket;
// true while this request is an add/update rather than a plain get
bool m_adding;
//char *m_coll;
// collection number taken from the CollectionRec of the request
collnum_t m_collnum;
//long m_collLen;
//char *m_buf;
//long m_bufLen;
// result of HttpRequest::isLocal() for this request
bool m_isLocal;
//long m_fileNum;
//bool m_isAdmin;
//bool m_isAssassin;
// . Commented by Gourav
// . Reason:user perm no longer used
//char m_userType;
// our own copy of the http request
HttpRequest m_r;
//char *m_username;
// the tag rec filled in by m_msg8a
TagRec m_tagRec;
// the tag rec built from the http request when adding
TagRec m_newtr;
Msg8a m_msg8a;
// url handed to Msg8a::getTagRec()
Url m_url;
// the url(s) supplied by the user and their length
char *m_urls;
long m_urlsLen;
// used to add m_list when adding tags
Msg1 m_msg1;
RdbList m_list;
//Msg1 m_msg1;
long m_niceness;
// set when the "tags" cgi parm is present; passed on as doInheritance
bool m_mergeTags;
//char m_tmp[16];
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . make a web page displaying the tagdb interface
// . call g_httpServer.sendDynamicPage() to send it
// . show a textarea for sites, then list all the different site tags
// and have an option to add/delete them
// . entry point for the tagdb page: validates the collection, allocates a
//   State12, copies the request into it, works out the list of urls ("u"
//   parm or the file named by "x") and whether this is a get or an add, then
//   continues in getTagRec()
// . cgi parms read below: c, u, uenc, x, ufu, get, tags
// . NOTE(review): several names used below (isAdmin, st->m_buf, st->m_bufLen,
//   st->m_msg9a, st->m_coll) only have commented-out declarations in this
//   listing, and some statements are cut off, so parts of this body may
//   originally have been commented out -- confirm against the full file
bool sendPageTagdb ( TcpSocket *s , HttpRequest *req ) {
// are we the admin?
//bool isAdmin = g_collectiondb.isAdmin ( req , s );
// get the collection record
CollectionRec *cr = g_collectiondb.getRec ( req );
if ( ! cr ) {
g_errno = ENOCOLLREC;
log("admin: No collection record found "
"for specified collection name. Could not add sites to "
"tagdb. Returning HTTP status of 500.");
return g_httpServer.sendErrorReply ( s , 500 ,
"collection does not exist");
bool isAssassin = cr->isAssassin ( s->m_ip );
if ( isAdmin ) isAssassin = true;
// bail if permission denied
if ( ! isAssassin ){
//&& ! cr->hasPermission ( req , s ) ) {
log("admin: Bad collection name or password. Could not add "
"sites to tagdb. Permission denied.");
return sendPagexxxx( s , req ,
"Collection name or "
"password is incorrect");
// make a state
State12 *st ;
try { st = new (State12); }
catch ( ... ) {
g_errno = ENOMEM;
log("PageTagdb: new(%i): %s",
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));}
mnew ( st , sizeof(State12) , "PageTagdb" );
//st->m_isAdmin = isAdmin;
//st->m_isAssassin = isAssassin;
// . Commented by Gourav
// . Reason:user perm no longer used
//st->m_userType = g_pages.getUserType ( s , req );
// assume we've nothing to add
st->m_adding = false;
// save the socket
st->m_socket = s;
// i guess this is nuked, so copy it
st->m_r.copy ( req );
// make it high priority
st->m_niceness = 0;
// point to it
HttpRequest *r = &st->m_r;
// get the collection
long collLen = 0;
char *coll = r->getString ( "c" , &collLen , NULL /*default*/);
// get collection rec
CollectionRec *cr2 = g_collectiondb.getRec ( coll );
// complain if no collection rec found or the name is missing/too long
if ( ! cr2 || ! coll || collLen+1 > MAX_COLL_LEN ) {
g_errno = ENOCOLLREC;
log("admin: No collection record found "
"for specified collection name. Could not add sites to "
"tagdb. Returning HTTP status of 500.");
mdelete ( st , sizeof(State12) , "PageTagdb" );
delete (st);
return g_httpServer.sendErrorReply ( s , 500 ,
"collection does not exist");
// . get fields from cgi field of the requested url
// . get the null-terminated, space-separated lists of sites to add
long urlsLen = 0;
char *urls = r->getString ( "u" , &urlsLen , NULL /*default*/);
//a quick hack so we can put multiple sites in a link: with "uenc" set
//every '+' in the url list becomes a newline
if(r->getLong("uenc", 0))
for(long i = 0; i < urlsLen; i++)
if(urls[i] == '+') urls[i] = '\n';
// get the file # of the tagdb file these sites should use
//long fileNum = r->getLong ("f",-1);
// get the archive filename of sites to add
long xlen;
char *x = r->getString("x",&xlen,NULL);
// trim off any spaces
while ( xlen > 0 && is_wspace_a(x[xlen-1]) ) x[--xlen]='\0';
// . get the username
// . just get from cookie so it is not broadcast over the web via a
// referral url
//st->m_username = r->getStringFromCookie("username");
//st->m_username = g_users.getUsername(r);
// are we coming from a local machine?
st->m_isLocal = r->isLocal();
// don't set this unless we have to free it
st->m_buf = NULL;
st->m_bufLen = 0;
// . set our archive filename of sites to add with this fileNum
// . "x" will be NULL if none supplied
if ( xlen ) {
File file;
file.set ( x );
// add 1 to bufLen for terminating \0
long bufLen = file.getFileSize() + 1 ;
char *buf = (char *) mmalloc ( bufLen , "PageTagdb");
if ( ! buf ) {
log("admin: File of sites is too big to add to tagdb."
" Allocation of %li bytes failed.",bufLen);
mdelete ( st , sizeof(State12) , "PageTagdb" );
delete (st);
return g_httpServer.sendErrorReply(s,500,
}; ( buf , bufLen - 1 , 0 );
// NULL terminate the list of urls
buf [ bufLen - 1 ] = '\0';
st->m_buf = buf;
st->m_bufLen = bufLen ;
urls = buf;
urlsLen = bufLen;
// it references into the request, should be ok
//st->m_coll = coll;
st->m_collnum = cr->m_collnum;
//st->m_collLen = collLen;
//strcpy ( st->m_coll , coll );
// do not print "(null)" in the textarea
if ( ! urls ) urls = "";
// the url buffer
st->m_urls = urls;
st->m_urlsLen = urlsLen;
// sanity check
//bool delOp = r->getLong ("delop",0 );
//char *nuke = r->getString ("nuke" ,NULL );
//if ( nuke && ! delOp ) {
// g_errno = EBADENGINEER;
// log("tagdb: delete operation checkbox not checked.");
// mdelete ( st , sizeof(State12) , "PageTagdb" );
// delete (st);
// return g_httpServer.sendErrorReply(s,500,
// mstrerror(g_errno));
long ufuLen;
char *ufu = r->getString("ufu",&ufuLen);
// no urls and no "ufu" file given: just show the page
if ( urls[0] == '\0' && ! ufu ) return sendReply ( st );
char *get = r->getString ("get",NULL );
// this is also a get operation but merges the tags from all TagRecs
char *merge = r->getString("tags",NULL);
// is this an add/update operation? or just get?
if ( get || merge ) st->m_adding = false;
else st->m_adding = true;
// if each line in the file is the output of a tagdb dump
// operation on the cmd line like this:
// k.n1=0x892f9 k.n0=0xac2ff39f8112b71f version=0 TAG=ruleset,
// "mwells",1,Jan-02-2009-18:26:04,333333333,,3735437892,36
// THEN we should just call msg9a directly and it should create
// a tag rec for each line and add that
bool isDumpFile = false;
if ( urls && strncmp(urls,"k.n1=",5)==0 ) isDumpFile = true;
if ( isDumpFile ) {
if ( ! st->m_msg9a.addTags ( st->m_urls , // dumpFile
st->m_coll ,
st ,
sendReplyWrapper2 ,
0 ))// niceness
return false;
return sendReply2 ( st );
// get/merge operations can skip the tag rec lookup
//if ( ! st->m_adding ) return sendReply ( st );
// regardless, we have to get the tagrec for all operations
//Url site;
// remember whether the "tags" (merge) parm was given
st->m_mergeTags = merge;
return getTagRec ( st );
// . looks up the TagRec for st->m_url into st->m_tagRec using Msg8a, with
//   sendReplyWrapper() as the callback, then continues in sendReply()
// . returns false if the lookup blocked, otherwise whatever sendReply()
//   returns
// . NOTE(review): two Msg8a::getTagRec() calls are visible below, both cut
//   off, and the second uses "site" which has no visible declaration; one of
//   them was probably commented out originally -- confirm against the full
//   file. "rdbId" is not referenced by the visible statements.
bool getTagRec ( State12 *st ) {
// inherit/merge tags only if the request asked for it
bool doInheritance = st->m_mergeTags;//(bool)merge;
char rdbId = RDB_TAGDB;
// then use facebookdb
//char *host = site.getHost();
//if ( strncmp(host,"fbid",4)==0 && is_digit(host[4]) )
// this replaces msg8a
if ( ! st->m_msg8a.getTagRec ( &st->m_url,//&site ,
// tell msg8a to try to guess the site
st->m_collnum ,
false, // skip dom lookup?
st->m_niceness ,
st ,
sendReplyWrapper ,
&st->m_tagRec ,
doInheritance ,
return false;
if ( ! st->m_msg8a.getTagRec ( &site , // &st->m_url,
true, //usecanonicalName
0, //niceness
sendReplyWrapper ,
&st->m_tagRec ,
doInheritance )){
return false;
return sendReply ( st );
// callback adapter: forwards to sendReply() and drops its bool return value
// so it fits the void (*)(void *) callback signature
void sendReplyWrapper ( void *state ) {
sendReply ( state );
// . callback given to the add operations (Msg1::addList() / msg9a addTags())
// . NOTE(review): only the cast is visible in this listing; going by the
//   comment below a tag rec re-lookup presumably follows -- confirm against
//   the full file
static void sendReplyWrapper2 ( void *state ) {
State12 *st = (State12 *)state;
// re-get the tags from msg8a since we changed them
//sendReply2 ( state );
// . called once the tag rec lookup is done (or directly when there is nothing
//   to look up)
// . if we are not adding, goes straight to sendReply2() to print the page
// . otherwise builds st->m_newtr from the http request, adds its buffer to
//   tagdb through Msg1 and re-fetches the tag rec with getTagRec()
// . returns false if it blocked
// . NOTE(review): the msg9a addTags() call below uses "nuke" (its declaration
//   is commented out), "st->m_msg9a" / "st->m_coll" (commented out in
//   State12) and takes the address of "newtr", which is already a pointer, so
//   that call was probably commented out originally -- confirm against the
//   full file
bool sendReply ( void *state ) {
// get our state class
State12 *st = (State12 *) state;
// get the request
HttpRequest *r = &st->m_r;
// and socket
TcpSocket *s = st->m_socket;
// the tagrec
//TagRec *gr = &st->m_tagRec;
// reset "gr" so it won't show the old tags of the first rec
// in the text area box on the tagdb page after the add is completed
//if ( st->m_adding ) gr->reset();
// . if urlsLen <= 0 or fileNum < 0 and we're not deleting
// . then we've nothing to add
//if ( urlsLen <= 0 ) return sendReply ( st );
// need a valid username
//if ( ! st->m_username || st->m_username[0] == '\0' ) {
// log("tagdb: bad username.");
// mdelete ( st , sizeof(State12) , "PageTagdb" );
// delete (st);
// return g_httpServer.sendErrorReply(s,500,
// mstrerror(g_errno));
// nothing to add? then just print the page
if ( ! st->m_adding ) return sendReply2 ( st );
//char *nuke = r->getString ("nuke" ,NULL );
TagRec *newtr = &st->m_newtr;
// update it from the http request
newtr->setFromHttpRequest ( r , s );
// but remove the site tag
//newtr.removeTags ( "site" , NULL );
// add it into gr
//gr->addTags ( &newtr );
// copy it over to our state
//memcpy ( gr , &newtr , newtr.getSize() );
// debug
// this doesn't work because we do not set TagRec::m_listPtrs[0]
// to point to the list we make below (MDW 4/29/13)
//SafeBuf tmp;
//newtr->printToBuf ( &tmp );
//log(LOG_DEBUG,"tagdb: converted from http: %s",
// tmp.getBufStart() );
// make a startKey and endKey from the tagRec's key
//key_t startKey = gr->m_key;
//key_t endKey = gr->m_key;
// startkey gets its low bit cleared though
//startKey.n0 &= 0xfffffffffffffffeLL;
// add using msg9a
if ( ! st->m_msg9a.addTags ( st->m_urls ,
NULL , // sitePtrs
0 , // numSitePtrs
st->m_coll ,
st ,
sendReplyWrapper2 ,
0 , // niceness
&newtr , // gr
nuke ,
NULL )) // ipvec
return false;
// shortcut
SafeBuf *sbuf = &newtr->m_sbuf;
// use the list we got
RdbList *list = &st->m_list;
// NOTE(review): no assignment to startKey/endKey is visible before they
// are handed to list->set() below
key128_t startKey;
key128_t endKey;
// set it from safe buf
list->set ( sbuf->getBufStart() ,
sbuf->length() ,
0 ,
(char *)&startKey ,
(char *)&endKey ,
-1 ,
false ,
false ,
sizeof(key128_t) );
// no longer adding
st->m_adding = false;
// . just use TagRec::m_msg1 now
// . no, can't use that because tags are added using SafeBuf::addTag()
// which first pushes the rdbid, so we gotta use msg4
if ( ! st->m_msg1.addList ( list ,
st->m_collnum ,
st ,
sendReplyWrapper2 ,
false ,
st->m_niceness ) )
return false;
// . if addTagRecs() doesn't block then sendReply right away
// . this returns false if blocks, true otherwise
//return sendReply2 ( st );
return getTagRec ( st );
// . prints the tagdb page into a SafeBuf and sends it, then frees the State12
// . when the "xml" cgi parm is non-zero an xml reply is started instead of
//   the html form
// . the html page is a form with a textarea of urls, a few options, and one
//   table row per tag in st->m_tagRec; while editing is allowed, 3 empty rows
//   follow for adding new tags
// . g_errno is cleared before the page is sent
// . returns whatever HttpServer::sendDynamicPage() returns
// . NOTE(review): many of the multi-line safePrintf() calls below are cut off
//   in this listing and the braces are unbalanced, so the exact html and the
//   nesting of the branches should be confirmed against the full file
bool sendReply2 ( void *state ) {
// get our state class
State12 *st = (State12 *) state;
// get the request
HttpRequest *r = &st->m_r;
// and socket
TcpSocket *s = st->m_socket;
// page is not more than 32k
char buf[1024*32];
SafeBuf sb(buf, 1024*32);
// do they want an xml reply?
if( r->getLong("xml",0) ) { // was "raw"
sb.safePrintf("<?xml version=\"1.0\" "
log ( LOG_INFO,"sending raw page###\n");
// clear g_errno, if any, so our reply send goes through
g_errno = 0;
// extract the socket
TcpSocket *s = st->m_socket;
// . nuke the state
// . first free the buffer, if non-NULL
//if (st->m_buf) mfree (st->m_buf, st->m_bufLen, "PageTagdb");
mdelete(st, sizeof(State12), "PageTagdb");
delete (st);
// . send this page
// . encapsulates in html header and tail
// . make a Mime
return g_httpServer.sendDynamicPage(s, sb.getBufStart(),
0, false, "text/xml",
-1, NULL, "ISO-8859-1");
// . print standard header
// . do not print big links if only an assassin, just print host ids
g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r );
// did we add some sites???
if ( st->m_adding ) {
// if there was an error let them know
if ( g_errno )
sb.safePrintf("<center>Error adding site(s): <b>"
mstrerror(g_errno) , g_errno );
else sb.safePrintf ("<center><b><font color=red>"
"Sites added successfully"
//char *c = st->m_coll;
char bb [ MAX_COLL_LEN + 60 ];
".poo { background-color:#%s;}\n"
"</style>\n" ,
// print interface to add sites
sb.safePrintf (
"<table %s>"
"<tr><td colspan=2>"
"</td></tr>", TABLE_STYLE , bb );
// sometimes we add a huge # of urls, so don't display them because
// it like freezes the silly browser
char *uu = st->m_urls;
if ( st->m_urlsLen > 100000 ) uu = "";
//sb.safePrintf ( "<tr bgcolor=#%s><td colspan=2>"
// "<center>"
// "</center>"
// "</td></tr>",
sb.safePrintf ( "<tr class=poo><td>"
"<font size=-2>"
"Enter a single URL and then click <i>Get Tags</i> to "
"get back its tags. Enter multiple URLs and select "
"the tags names and values in the other table "
"below in order to tag "
"them all with those tags when you click "
"<i>Add Tags</i>. "
"On the command line you can also issue a "
"<i>./gb 0 dump S main 0 -1 1</i>"
"command, for instance, to dump out the tagdb "
"contents for the <i>main</i> collection on "
"<i>host #0</i>. "
// text area for adding space separated sites/urls
//char *pp = "put sites here";
//char *pp = "";
//if ( st->m_bufLen > 0 ) pp = st->m_buf; // no, print out "urls"
sb.safePrintf (""
"<td width=70%%>"
"<textarea rows=16 cols=64 name=u>"
"%s</textarea></td></tr>" , uu );
// spam assassins should not use this much power, too risky
//if ( st->m_isAdmin ) {
// sb.safePrintf ("<i><font size=-1>Note: use 1.2.3.<b>0</b> to "
// "specify ip domain.</i><br>");
// allow filename to load them from
//if ( st->m_isAdmin ) {
sb.safePrintf("<tr class=poo>"
"<b>file of urls to tag</b>"
"<font size=-2>"
"If provided, Gigablast will read the URLs from "
"this file as if you pasted them into the text "
"area above. The text area will also be ignored."
"<td><input name=ufu "
"type=text size=40>"//<br>"
//"<i>file can also be dumped output of "
//"tagdb from the <b>gb dump S ...</b> "
//"<br><br>" );
// this is applied to every tag that is added for accountability
sb.safePrintf("<tr class=poo><td>"
"<br><font size=-2>"
"Stored with each tag you add for accountability."
"<input name=username type=text size=6 "
"value=\"admin\"> "
// as a safety, this must be checked for any delete operation
sb.safePrintf ("<tr class=poo><td><b>delete operation</b>"
"<font size=-2>"
"If checked "
"then the tag names you specify below will be "
"deleted for the URLs you provide in the text area "
"when you click <i>Add Tags</i>."
"</td><td><input type=\"checkbox\" "
"value=\"1\" name=\"delop\"></td></tr>");
// close up
sb.safePrintf ("<tr bgcolor=#%s><td colspan=2>"
// this is merge all by default right now but since
// zak is really only using we
// should be ok
"<input type=submit name=get "
"value=\"Get Tags\" border=0>"
//"<input type=submit name=get "
//"value=\"get best rec\" border=0>"
//"<input type=submit name=tags "
//"value=\"merge all matching recs\" border=0>"
//"<input type=submit name=nuke "
//"value=\"delete recs\" border=0>"
// "</form>"
// . show all tags we got values for
// . put a delete checkbox next to each one
// . show 5-10 dropdowns for adding new tags
// for some reason the "selected" option tags do not show up below
// on firefox unless i have this line.
sb.safePrintf (
"<table %s>"
"<tr><td colspan=20>"
"<center><b>Add Tag</b></center>"
"</td></tr>", TABLE_STYLE );
// count how many "tagRecs" we are taking tags from: each contributing
// TagRec brings one "site" tag
Tag *jtag = st->m_tagRec.getFirstTag();
long numTagRecs = 0;
for ( ; jtag ; jtag = st->m_tagRec.getNextTag(jtag) ) {
// skip dups
if ( jtag->m_type == TT_DUP ) continue;
// count # of TagRecs contributing to the tags
//if ( tag && tag->m_type == ST_SITE ) numTagRecs++;
if ( jtag && jtag->isType("site") ) numTagRecs++;
// if we are displaying a COMBINATION of TagRecs merged together in
// the inheritance loop (above) then you can not edit that! you can
// only edit individual tag recs
bool canEdit = (numTagRecs <= 1);
if ( ! canEdit )
sb.safePrintf("<tr class=poo>"
"<td colspan=10><center><font color=red>"
"<b>Can not edit because more than one "
"TagRecs were merged</b></font></center>"
"</td></tr>\n" );
// headers
sb.safePrintf("<tr bgcolor=#%s>"
"<td><b>tag name</b></td>"
"<td><b>tag value</b></td>"
"<td><b>datasize (with NULL)</b></td>"
"<td><b>user ip</b></td>"
// set up the loop
Tag *itag = st->m_tagRec.getFirstTag();
//last = NULL;
// "count" numbers the rows (it is the suffix of the form field names),
// "empty" counts the blank rows printed after the real tags run out
long count = 0;
long empty = 0;
// loop over all tags in TagRec
for ( ; empty < 3 ; count++ ) {
// use this tag to print from
Tag *ctag = itag;
// advance
if ( itag ) itag = st->m_tagRec.getNextTag(itag);
// make it NULL, do not start over at the beginning
if ( empty > 0 ) ctag = NULL;
// skip dups
if ( ctag && ctag->m_type == TT_DUP ) continue;
// if ctag NULL and we are getting all tags, break
if ( ! canEdit && ! ctag ) break;
// assign for looping
//last = tag;
// if we are NULL, print out 3 empty tags
if ( ! ctag ) empty++;
// start the section
sb.safePrintf("<tr class=poo>");
// the delete tag checkbox
//sb.safePrintf("<tr bgcolor=#%s><td>",DARK_BLUE);
if ( ctag && canEdit ) // && tag->m_type != ST_SITE )
sb.safePrintf("<input name=deltag%li "
// start the next cell
// . skip ST_SITE, do not show dropdown for that
// . no, because for looking up tagRecs i like to see
// the site tag value, to see what subdomain is matched
//if ( ctag && ctag->m_type == ST_SITE ) continue;
// print drop down
if ( ! ctag ) sb.safePrintf("<select name=tagtype%li>",count);
// how many tags do we have?
long n = (long)sizeof(s_tagDesc)/(long)sizeof(TagDesc);
// the options (only printed for the empty rows, i.e. when ctag is NULL)
for ( long i = 0 ; ! ctag && i < n ; i++ ) {
TagDesc *td = &s_tagDesc[i];
// get tag name
char *tagName = td->m_name;
// skip if a reserved tag
//if ( strncasecmp ( tagName , "reserved" ,8)==0 )
// continue;
// select the item in the dropdown
char *selected = "";
// was it selected?
if ( ctag && td->m_type == ctag->m_type )
selected = " selected";
// show it in the drop down list
sb.safePrintf("<option value=\"%s\"%s>%s",
// close up the drop down list
if ( ! ctag ) sb.safePrintf("</select>");
else {
char *tagName = getTagStrFromType ( ctag->m_type );
sb.safePrintf("<input type=hidden name=tagtype%li "
// the score field for the drop down list, whatever tag id
// was selected will have this score
if ( canEdit )
sb.safePrintf("<input type=text name=tagdata%li "
"size=50 value=\"",count);
// show the value
if ( ctag ) ctag->printDataToBuf ( &sb );
// close up the input tag
if ( canEdit ) sb.safePrintf("\">");
// close up table cell
// if no tag, just placeholders
if ( ! ctag ) {
// data size
// username, timestamp only for non-empty tags
char *username = ctag->getUser();
long timestamp = ctag->m_timestamp;
long ip = 0;
char *ips = "&nbsp;";
if ( ctag->m_ip ) { ip=ctag->m_ip; ips=iptoa(ctag->m_ip);}
// convert timestamp to string
char tmp[64];
time_t ts = timestamp;
struct tm *timeStruct = localtime ( &ts );
if ( timestamp )
sb.safePrintf("<td><input type=hidden name=taguser%li "
sb.safePrintf("<td><input type=hidden name=tagtime%li "
sb.safePrintf("<td><input type=hidden name=tagip%li "
sb.safePrintf("<input type=hidden name=tagn1key%li "
sb.safePrintf("<input type=hidden name=tagn0key%li "
sb.safePrintf("<td>0x%lx</td>", (long)(ctag->m_key.n0>>32) );
// order 1 in since we always do that because
// we forgot to shift up one for the delbit
// above in Tag::set() when it sets m_key.n0
(long)(ctag->m_key.n0&0xffffffff) | 0x01);
// username,tmp,ips);
// do not print add or del tags buttons if we got tags from more
// than one TagRec!
if ( canEdit )
sb.safePrintf ("<tr bgcolor=#%s><td colspan=10><center>"
"<input type=submit name=add "
"value=\"Add Tags\" border=0>"
sb.safePrintf ( "</center></table>" );
sb.safePrintf ("</form>");
sb.safePrintf ("</html>");
// clear g_errno, if any, so our reply send goes through
g_errno = 0;
// calculate buffer length
// extract the socket
//TcpSocket *s = st->m_socket;
// . nuke the state
// . first free the buffer, if non-NULL
//if ( st->m_buf ) mfree ( st->m_buf , st->m_bufLen , "PageTagdb" );
mdelete ( st , sizeof(State12) , "PageTagdb" );
delete (st);
// print it out
//logf(LOG_DEBUG,"tagdb: %s",sb.getBufStart()+sb.length()-256);
// . send this page
// . encapsulates in html header and tail
// . make a Mime
return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length());
//void classifierDoneWrapper ( void *state ) {
// g_tagdbClassifier.m_running = false;
// . we can have multiple tags of this type per tag for a single username
// . by default, there can be multiple tags of the same type in the Tag as
// long as the usernames are all different. see addTag()'s deduping below.
bool isTagTypeUnique ( long tt ) {
// a dup?
if ( tt == TT_DUP ) return false; // TT_DUP = 123456
// make sure table is valid
if ( ! s_initialized ) g_tagdb.setHashTable();
// look up in hash table
TagDesc *td = (TagDesc *)s_ht.getValue ( tt );
// if none, that is crazy
if ( ! td ) { char *xx=NULL;*xx=0; }
// return
if ( td->m_flags & TDF_ARRAY) return false;
return true;
bool isTagTypeIndexable ( long tt ) {
// a dup?
if ( tt == TT_DUP ) return false; // TT_DUP = 123456
// make sure table is valid
if ( ! s_initialized ) g_tagdb.setHashTable();
// look up in hash table
TagDesc *td = (TagDesc *)s_ht.getValue ( tt );
// if none, that is crazy
if ( ! td ) { char *xx=NULL;*xx=0; }
// return false if we should not index it
if ( td->m_flags & TDF_NOINDEX ) return false;
// otherwise, index it
return true;
// . when displaying a tag we need to know if it is a string or not
// . that and the dataSize determine how we display it
bool isTagTypeString ( long tt ) {
// look up in hash table
TagDesc *td = (TagDesc *)s_ht.getValue ( tt );
// if none, that is crazy
if ( ! td ) { char *xx=NULL;*xx=0; }
// return
return (td->m_flags & TDF_STRING);
// used to determine if one Tag should overwrite the other! if they
// have the same dedup hash... then yes...
// . for unique tag types the dedup hash is just the tag type
// . for all other types it is a 32-bit hash of the bytes from m_type up to
//   the end of m_buf (username and tag data), so a tag only replaces another
//   one with the same type, username and data
// . m_bufSize is zeroed while hashing so it does not affect the hash, then it
//   is put back (presumably it lies inside the hashed range -- confirm
//   against the Tag layout)
long Tag::getDedupHash ( ) {
	// unique types dedup on the type alone
	if ( isTagTypeUnique ( m_type ) ) return m_type;
	// byte range to hash. the end must be computed while m_bufSize still
	// holds its real value.
	char *hashStart = (char *)&m_type;
	char *hashEnd = m_buf + m_bufSize;
	long numBytes = hashEnd - hashStart;
	// keep the buffer size out of the hash
	long origBufSize = m_bufSize;
	m_bufSize = 0;
	long dedupHash = hash32 ( hashStart , numBytes );
	// put the buffer size back
	m_bufSize = origBufSize;
	return dedupHash;
}