open-source-search-engine/Tagdb.cpp
Matt 09de59f026 do not store cblock, etc. tags into tagdb to save
disk space. added tagdb file cache for better performance,
less disk accesses. will help reduce disk load.
put file cache sizes in master controls and if they change
then update the cache size dynamically.
2015-09-10 12:46:00 -06:00

5248 lines
154 KiB
C++

#include "gb-include.h"
#include <sys/stat.h>
#include "Titledb.h"
#include "Tagdb.h"
#include "Categories.h"
#include "Unicode.h"
#include "Threads.h"
#include "Msg1.h"
#include "HttpServer.h"
#include "Pages.h"
#include "SiteGetter.h"
#include "HashTableX.h"
#include "Users.h"
#include "Process.h"
#include "Rebalance.h"
// forward declaration; the definition is not in this part of the file
static void gotMsg0ReplyWrapper ( void *state );
//static void gotReplyWrapper9a ( void *state , UdpSlot *slot ) ;
//static void gotList ( void *state , RdbList *xxx , Msg5 *yyy ) ;
//static void sendReply9a ( void *state ) ;
// file-local hash table plus a flag saying whether it has been set up yet
// NOTE(review): presumably the tag-name lookup table behind the tag type
// helpers -- confirm against the code that fills s_ht
static HashTableX s_ht;
static bool s_initialized = false;
// to stdout
int32_t Tag::print ( ) {
SafeBuf sb;
printToBuf ( &sb );
// dump that
return fprintf(stderr,"%s\n",sb.getBufStart());
}
// . append a one-line, human readable rendering of this tag to "sb":
//   the three key hashes followed by
//   TAG=<type>,"<user>",<date>,(<epoch seconds>),<ip>,"<data>"
// . returns false only if printDataToBuf() fails, true otherwise
bool Tag::printToBuf ( SafeBuf *sb ) {
	// key: n1 first, then the upper and lower 32 bits of n0
	int32_t upperHalf = (int32_t)(m_key.n0>>32);
	int32_t lowerHalf = (int32_t)(m_key.n0&0xffffffff);
	sb->safePrintf("k.hsthash=%016"XINT64" "
		       "k.duphash=%08"XINT32" "
		       "k.sitehash=%08"XINT32" ",
		       m_key.n1,
		       upperHalf,
		       lowerHalf);
	// tag type name and the user that added it
	sb->safePrintf ( "TAG=%s,\"%s\",",
			 getTagStrFromType(m_type),
			 getUser() );
	// when the tag was added: local time text, then the raw value
	time_t when = m_timestamp;
	char dateBuf[100];
	strftime ( dateBuf , sizeof(dateBuf) , "%b-%d-%Y-%H:%M:%S," ,
		   localtime ( &when ) );
	sb->safePrintf("%s(%"UINT32"),",dateBuf,m_timestamp);
	// ip the tag was added from
	sb->safePrintf("%s,",iptoa(m_ip));
	// finally the tag data, wrapped in double quotes
	sb->safePrintf("\"");
	if ( ! printDataToBuf ( sb ) ) return false;
	sb->safePrintf("\"");
	return true;
}
// . fill in this Tag in place: type, timestamp, ip, user, data and rdb key
// . "site" can also be a specific url, but it must be normalized
// . i.e. of the form http://xyz.com/
// . "tagname" is converted to m_type with getTagTypeFromStr()
// . "user" may be NULL; it is truncated to 126 bytes
// . m_buf layout written here:
//   [1 byte: user size incl. \0][user][\0][data][\0 if data lacked one]
// . NOTE(review): nothing here checks the capacity of m_buf; presumably the
//   caller provides room for userLen + dataSize + 3 bytes -- confirm
void Tag::set ( char *site ,
char *tagname ,
int32_t timestamp ,
char *user ,
int32_t ip ,
char *data ,
int32_t dataSize ) {
// get type from name
m_type = getTagTypeFromStr ( tagname , strlen(tagname) );
// sanity
//isTagTypeIndexable ( m_type );
m_timestamp = timestamp;
m_ip = ip;
int32_t userLen = 0;
if ( user ) userLen = gbstrlen(user);
// truncate the user name to 126 bytes so that the size byte stored
// below (userLen + 1) is at most 127
if ( userLen > 126 ) userLen = 126;
// first byte is size of user, then user plus \0 then data
//m_bufSize = 1 + userLen + 1 + dataSize;
// "site" must skip http://
//int32_t slen = gbstrlen(site);
//if ( slen > 8 && strncasecmp(site,"http://",7)==0 )
// site += 7;
//else if ( slen > 8 && strncasecmp(site,"https://",8)==0 )
// site += 8;
// normalize
// NOTE(review): "norm" is set but never read below, because every line
// that used it is commented out and the key is hashed straight from "site"
Url norm;
norm.set ( site );
// store user into special buffer
//int32_t ulen = 0;
//if ( user ) {
// ulen = gbstrlen(user);
// if ( ulen > 7 ) ulen = 7;
//}
//memset ( m_user , 0 , 8 );
//gbmemcpy ( m_user , user , ulen );
char *p = m_buf;
// store size (includes \0)
*p++ = userLen + 1;
// then user name (userLen is 0 when user is NULL)
gbmemcpy ( p , user , userLen );
p += userLen;
// then \0
*p++ = '\0';
// store data now too
// NOTE(review): this copy runs even when "data" is NULL; presumably
// callers always pass dataSize 0 in that case -- confirm
gbmemcpy ( p , data , dataSize );
p += dataSize;
// NULL terminate if they did not! now all tag are strings and must
// be NULL terminated.
// note: when dataSize is 0, p[-1] is the user's \0 written above, so no
// terminator is added and the data part stays empty
if ( data && p[-1] ) { // data && m_data[dataSize-1] ) {
//m_data[dataSize] = '\0';
*p++ = '\0';
//dataSize++;
//m_dataSize++;
}
// set it
m_bufSize = p - m_buf;
// top X bits should be hash of the domain only so all recs are on the
// same host near each other
//m_key.n1 = hash32 ( norm.getDomain() , norm.getDomainLen());
//
// too many tags were being read when k.n1 was the domain hash for
// sites like az.com that had hundreds of subdomains. so go based on
// host instead.
//
// CRAP: using 32 bit hash we get collisions for crap like
// thedietsolutionprogramscam.com and
// 2witchdoctors.a-livejasmin.com
// so let's move to 64bit keys
//m_key.n1 = hash64 ( norm.getHost() , norm.getHostLen());
// i had to make this the hash of the site, not host,
// because www.last.fm/user/xxxxx/
// was making the rdblist a few megabytes big!!
m_key.n1 = hash64n ( site );
// assume we are unique tag, that many of this type can exist
uint32_t upper32 = getDedupHash(); // m_type;
/*
// if we are NOT unique... then hash username and data. thus we only
// replace a key if its the same tagtype, username and data. that
// way it will just update the timestamp and/or ip.
if ( ! isTagTypeUnique ( m_type ) ) {
// start hashing here
char *startHashing = (char *)&m_type;
// end here. include username (and tag data!)
char *endHashing = m_buf + m_bufSize;
// hash this many bytes
int32_t hashSize = endHashing - startHashing;
// . set key
upper32 = hash32 ( startHashing , hashSize );
}
*/
// put in upper 32
m_key.n0 = upper32;
// shift it up
m_key.n0 <<= 32;
// . then or in url hash
// . for the site "www.paypal.com:1234" this included the port!
// but for the most part if the site is just a hostname then
// this is basically just a hostname, too, but the hash will
// include the http:// and the ending /
// . www.paypal.com:1234 was added as a site. so it has the
// same m_key.n1 as www.paypal.com, but this part is different
// here. this is the full site hash really. so during the lookup
// i'd say filter out such tags if they don't match the site you
// are looking up.
// . NOTE: the url hash is no longer OR'ed in (the line below is commented
// out), so the low 32 bits of n0 hold only the positive bit set next
//m_key.n0 |= (uint32_t) hash32 ( norm.getUrl() , norm.getUrlLen() );
// set positive bit so its not a delete record
m_key.n0 |= 0x01;
// the size of this class as an Rdb record
// NOTE(review): presumably the key and the 4-byte size field itself are
// what gets subtracted here -- confirm against the Tag declaration
m_recDataSize = m_bufSize + sizeof(Tag) - sizeof(key128_t) - 4;
}
// . parse one tag from the text in [p,pend) and fill in m_type, m_timestamp,
//   m_ip and m_buf/m_bufSize (user name followed by the tag data)
// . return # of ascii chars scanned in "p"
// . return 0 on error
// . parses output of printToBuf() above, i.e. text of the form
//   " TAG=<type>,"<user>",<date>,<timestamp>,<ip>,<data>"
//   where <timestamp> may be wrapped in parentheses as printToBuf() prints it
// . older example of the dump format:
//   k.n1=0x695b3 k.n0=0xa4118684fa4edf93 version=0 TAG=ruleset,"mwells",Jan-02-2009-18:26:04,<timestamp>,67.16.94.2,3735437892,36 TAG=blog,"mwells",Jan-02-2009-18:26:04,67.16.94.2,2207516434,1 TAG=site,"tagdb",Jan-02-2009-18:26:04,0.0.0.0,833534375,mini-j-gaidin.livejournal.com/
// . m_key is not touched here
int32_t Tag::setFromBuf ( char *p , char *pend ) {
	// save our place
	char *start = p;
	// tags always start with " TAG=". make sure that many bytes exist
	// before comparing them.
	if ( pend - p < 5 || strncmp(p," TAG=",5) ) {
		log("tagdb: error processing tag in setFromBuf().");
		return 0;
	}
	// skip that
	p += 5;
	// the tag type name runs up to the next comma
	char *type = p;
	while ( p < pend && *p != ',' ) p++;
	// error?
	if ( p >= pend ) return 0;
	int32_t typeLen = p - type;
	// convert to number
	m_type = getTagTypeFromStr ( type , typeLen );
	// a type of -1 is treated as fatal (deliberate crash)
	if ( m_type == -1 ) { char *xx=NULL;*xx=0;}
	// now the user, skip comma and opening quote
	p += 2;
	// we build m_buf as we go
	char *dst = m_buf;
	// the user name runs up to the closing quote
	char *user = p;
	while ( p < pend && *p != '\"' ) p++;
	// error? use ">=" and not "==" because the "p += 2" above can step
	// past pend on truncated input, and then "p == pend" never fires
	if ( p >= pend ) return 0;
	int32_t userLen = p - user;
	// cap it so the size byte below (userLen + 1) is at most 127
	if ( userLen > 126 ) userLen = 126;
	// first byte is username size, including its \0
	*dst++ = userLen+1;
	// then the username
	gbmemcpy ( dst , user , userLen );
	dst += userLen;
	// and finally null termination
	*dst++ = '\0';
	// skip closing quote and comma
	p += 2;
	// next field is the time stamp in canonical (human readable) form.
	// we do not parse it, just skip till the comma.
	while ( p < pend && *p != ',' ) p++;
	// error? (">=" for the same overshoot reason as above)
	if ( p >= pend ) return 0;
	// skip comma
	p++;
	// this field is the timestamp in seconds since epoch
	char *ts = p;
	while ( p < pend && *p != ',' ) p++;
	// error?
	if ( p >= pend ) return 0;
	// printToBuf() prints it as "(<seconds>)". step over the '(' or
	// atoi() would stop right there and always yield 0.
	if ( *ts == '(' ) ts++;
	m_timestamp = atoi(ts);
	// skip comma
	p++;
	// ip address as text, up to the next comma
	char *ips = p;
	while ( p < pend && *p != ',' ) p++;
	// error?
	if ( p >= pend ) return 0;
	// convert it to binary
	m_ip = atoip ( ips , p - ips );
	// skip comma
	p++;
	// . now is the data, everything that is left
	// . setDataFromBuf() returns # of chars scanned in "p"
	p += setDataFromBuf ( p , pend );
	// . sanity check
	// . all tags must be NULL terminated now
	if ( m_buf[m_bufSize-1] != '\0' ) {char *xx=NULL; *xx=0; }
	// return how many bytes we read
	return p - start;
}
// . copy the tag data text in [p,pend) into m_buf, right behind the user
//   name that is already stored there, and set m_bufSize
// . the last byte of [p,pend) must be \0 or whitespace; it is overwritten
//   with \0 so the stored data is always a NULL terminated string
// . return # of chars scanned in "p", NOT counting that final delimiter,
//   so "p + <return value>" points at the delimiter
// . return 0 if there is nothing to scan
int32_t Tag::setDataFromBuf ( char *p , char *pend ) {
	// m_buf[0] is the size of the user name (including its \0), so this
	// skips the size byte and the user name to where the data goes
	char *dst = m_buf + *m_buf + 1;
	// nothing to copy (or a bogus range): leave the data part empty
	// rather than handing a zero or negative size to gbmemcpy()
	if ( pend <= p ) {
		m_bufSize = dst - m_buf;
		return 0;
	}
	// all that is left in the input is the data
	int32_t textLen = pend - p;
	gbmemcpy ( dst , p , textLen );
	// update
	m_bufSize = ( dst + textLen ) - m_buf;
	// the last byte copied should be the end delimiter
	char c = m_buf[m_bufSize-1];
	// sanity check. cast first because isspace() is undefined for
	// negative char values.
	if ( c && ! isspace((unsigned char)c) ) { char *xx=NULL;*xx=0; }
	// strings are always NULL terminated, the stored size includes
	// the NULL termination
	m_buf[m_bufSize-1]='\0';
	// . report only what was consumed from "p"
	// . this used to return m_bufSize - 1, which also counted the size
	//   byte and user name at the front of m_buf and so overstated how
	//   far the caller should advance
	return textLen - 1;
}
// value of one ascii hex digit (either case), or -1 if "c" is not one
static int tagdbHexDigitValue ( unsigned char c ) {
	if ( c >= '0' && c <= '9' ) return c - '0';
	if ( c >= 'a' && c <= 'f' ) return c - 'a' + 10;
	if ( c >= 'A' && c <= 'F' ) return c - 'A' + 10;
	return -1;
}
// . convert the ascii hex digits in [src,srcEnd) into bytes at "dst",
//   two digits per byte, high nibble first
// . stops at the first char that is not a hex digit
// . if "decrement" is true "dst" walks backwards after each byte,
//   otherwise forwards
// . returns final dst minus initial dst, i.e. the number of bytes stored
//   (negative when decrementing)
// . a first digit that is the very last char of the input is treated as a
//   fatal error (deliberate crash)
int32_t hexToBinary ( char *src , char *srcEnd , char *dst , bool decrement ) {
	// remember where we started so we can report how far we moved
	char *const origin = dst;
	const int step = decrement ? -1 : 1;
	for ( ; src < srcEnd ; dst += step ) {
		// high nibble of this byte
		int hi = tagdbHexDigitValue ( *(unsigned char *)src );
		if ( hi < 0 ) break;
		src++;
		// park it in the destination right away
		*dst = hi;
		// sanity check, need one more char FOR SURE!
		if ( src >= srcEnd ) { char*xx=NULL;*xx=0;}
		// low nibble of this byte
		int lo = tagdbHexDigitValue ( *(unsigned char *)src );
		if ( lo < 0 ) break;
		src++;
		// combine both nibbles into the finished byte
		*dst = (char)( ( hi << 4 ) | lo );
	}
	return dst - origin;
}
// . append this tag's data to "sb" as text, one char at a time
// . stops at the first \0 or after getTagDataSize() bytes, whichever
//   comes first
// . always returns true
bool Tag::printDataToBuf ( SafeBuf *sb ) {
	char *data = getTagData();
	int32_t dataSize = getTagDataSize();
	// . because of a bug of not appending the \0 and incrementing the data
	//   size when we should have, some tags are not NULL terminated, so
	//   we cannot just print data as a %s string
	// . test the index BEFORE looking at data[i]; the old order read one
	//   byte past the end of such unterminated data
	for ( int32_t i = 0 ; i < dataSize && data[i] ; i++ )
		sb->safePrintf ( "%c" , data[i] );
	return true;
}
// . append to "sb" an /admin/tagdb url that carries this tag: its key,
//   user, tag type and (url-encoded) tag data
// . the site is not printed; the 128-bit key is sent instead of a "u=" parm
//   because we don't know the "site" for all tags ("site" is a tag itself)
// . example of a full add request:
//   /admin/tagdb?c=mdw&u=www.mdw123.com&ufu=&username=admin&tagtype0=sitenuminlinks&tagdata0=10&tagtype1=rootlang&tagdata1=&tagtype2=rootlang&tagdata2=&add=Add+Tags
// . returns false if printDataToBuf() fails, true otherwise
// . NOTE(review): TagRec::setFromHttpRequest() in this file looks up
//   "tagn1key<N>b"/"tagn0key<N>b" (digit before the 'b') while the names
//   printed here put the 'b' before the digit -- confirm which is intended
bool Tag::printToBufAsAddRequest ( SafeBuf *sb ) {
	// the tag type, like "sitenuminlinks" or "rootlang"
	char *typeName = getTagStrFromType ( m_type );
	sb->safePrintf("/admin/tagdb?");
	// both halves of the tag key, in decimal
	sb->safePrintf("&tagn0keyb0=%"INT64"",m_key.n0);
	sb->safePrintf("&tagn1keyb0=%"INT64"",m_key.n1);
	// the user that added this tag
	sb->safePrintf ( "&username=%s" , getUser() );
	sb->safePrintf("&tagtype0=%s",typeName);
	sb->safePrintf("&tagdata0=");
	// render the data into a scratch buffer first so that it can be
	// url-encoded as one NULL terminated string
	SafeBuf scratch;
	if ( ! printDataToBuf ( &scratch ) ) return false;
	scratch.nullTerm();
	sb->urlEncode(scratch.getBufStart());
	sb->nullTerm();
	return true;
}
// . append this tag to "sb" as a <tag> xml element with <name>, <user>,
//   <timestamp>, <ip> and <score> children; <score> holds the tag data
// . nothing is xml-escaped here
// . returns false if printDataToBuf() fails, true otherwise
bool Tag::printToBufAsXml ( SafeBuf *sb ) {
	// name of the tag type
	char *typeName = getTagStrFromType ( m_type );
	// open the element; print the name and the user that added the tag
	sb->safePrintf ("\t\t<tag>\n\t\t\t<name>%s</name>\n\t\t\t<user>%s",
			typeName,getUser());
	// close the user, then when the tag was added
	sb->safePrintf("</user>\n\t\t\t<timestamp>%"INT32"</timestamp>\n",
		       m_timestamp);
	// the ip it was added from
	sb->safePrintf("\t\t\t<ip>%s</ip>\n",iptoa(m_ip));
	// the tag data goes out as the "score"
	sb->safePrintf("\t\t\t<score>");
	if ( ! printDataToBuf ( sb ) ) return false;
	sb->safePrintf("</score>\n\t\t</tag>");
	return true;
}
//if ( ! sb->safePrintf("\t\t<eventTagFromTagdb>"
// "<![CDATA[") )
// . append this tag to "sb" as an <eventTagdbTag> xml element
// . user, ip, tag name and tag data are each wrapped in a CDATA section
// . returns false if printDataToBuf() fails, true otherwise
bool Tag::printToBufAsXml2 ( SafeBuf *sb ) {
	// name of the tag type
	char *typeName = getTagStrFromType ( m_type );
	// who added the tag
	char *addedBy = getUser();
	// everything up to and including the opening of the data CDATA
	sb->safePrintf ("\t\t<eventTagdbTag>\n"
			"\t\t\t<addedBy><![CDATA[%s]]></addedBy>\n"
			"\t\t\t<addedTimestamp>%"UINT32"</addedTimestamp>\n"
			"\t\t\t<addedFromIP><![CDATA[%s]]></addedFromIP>\n"
			"\t\t\t<name><![CDATA[%s]]></name>\n"
			"\t\t\t<data><![CDATA[",
			addedBy,
			m_timestamp,
			iptoa(m_ip),
			typeName);
	// the tag data itself
	if ( ! printDataToBuf ( sb ) ) return false;
	// close the CDATA, the data element and the tag element
	sb->safePrintf("]]></data>\n"
		       "\t\t</eventTagdbTag>\n");
	return true;
}
// . append this tag to "sb" as one html table row: "prefix" in the first
//   cell, then tag name, value, user, time and ip in the second
// . nothing is html-escaped here
// . returns false if printDataToBuf() fails, true otherwise
bool Tag::printToBufAsHtml ( SafeBuf *sb , char *prefix ) {
	// name of the tag type
	char *typeName = getTagStrFromType ( m_type );
	// first cell and the bold tag name
	sb->safePrintf ("<tr><td>%s</td><td><b>%s</b>", prefix, typeName);
	// the tag data, also in bold
	sb->safePrintf(" value=<b>");
	if ( ! printDataToBuf ( sb ) ) return false;
	// who added it
	sb->safePrintf("</b> user=%s time=",getUser());
	// when it was added: local time text, then the raw value
	time_t when = m_timestamp;
	char dateBuf[100];
	strftime ( dateBuf , sizeof(dateBuf) , "%b-%d-%Y-%H:%M:%S" ,
		   localtime ( &when ) );
	sb->safePrintf("%s(%"UINT32")",dateBuf,m_timestamp);
	// the ip it was added from
	sb->safePrintf(" ip=%s",iptoa(m_ip));
	// end of row
	sb->safePrintf("</td></tr>\n");
	return true;
}
// . append this tag to "sb" as "<tagname>:<data> " (note the trailing space)
// . every tag is treated as a string now
// . returns false if printDataToBuf() fails, true otherwise
bool Tag::printToBufAsTagVector ( SafeBuf *sb ) {
	// tag name followed by a colon
	char *typeName = getTagStrFromType ( m_type );
	sb->safePrintf("%s:",typeName);
	// then the data
	if ( ! printDataToBuf ( sb ) ) return false;
	// a space separates us from whatever is printed next
	sb->safePrintf(" ");
	return true;
}
// . true if hash32n() of the tag type name "t" equals this tag's m_type
bool Tag::isType ( char *t ) {
	return ( m_type == (int32_t) hash32n ( t ) );
}
// start out with no list pointers
TagRec::TagRec ( ) : m_numListPtrs ( 0 ) {
}
// . explicit re-initialization: zero the list pointer count and call
//   constructor() on each of the MAX_TAGDB_REQUESTS lists
void TagRec::constructor ( ) {
	m_numListPtrs = 0;
	int32_t n = 0;
	while ( n < MAX_TAGDB_REQUESTS ) {
		m_lists[n].constructor();
		n++;
	}
}
// reset() frees every list we hold
TagRec::~TagRec ( ) {
	this->reset();
}
// . forget all list pointers and call freeList() on each of the
//   MAX_TAGDB_REQUESTS lists
void TagRec::reset ( ) {
	m_numListPtrs = 0;
	int32_t n = 0;
	while ( n < MAX_TAGDB_REQUESTS ) {
		m_lists[n].freeList();
		n++;
	}
}
// . look up a tag by type name; see getTag2()
// . returns NULL if there is no such tag
Tag *TagRec::getTag ( char *tagTypeStr ) {
	const int32_t wanted = getTagTypeFromStr ( tagTypeStr );
	return getTag2 ( wanted );
}
// . return the first tag whose m_type is "tagType" and is not TT_DUP
// . returns NULL if there is no such tag
Tag *TagRec::getTag2 ( int32_t tagType ) {
	// walk every tag we hold
	for ( Tag *cur = getFirstTag() ; cur ; cur = getNextTag ( cur ) ) {
		// right type, and not marked as a dup?
		if ( cur->m_type == tagType && cur->m_type != TT_DUP )
			return cur;
	}
	// no match
	return NULL;
}
// . functions to act on a site "tag buf", like that in Msg16::m_tagRec
// . first 2 bytes is size, 2nd two bytes is # of tags, then the tags
// . same as the getLong() overload that takes a numeric tag type, but
//   takes the tag type by name
int32_t TagRec::getLong ( char *tagTypeStr,
			  int32_t defalt ,
			  Tag **bookmark ,
			  int32_t *timestamp ,
			  char **user ) {
	// translate the name, then let the numeric overload do the work
	const int32_t wanted = getTagTypeFromStr ( tagTypeStr );
	return getLong ( wanted , defalt , bookmark , timestamp , user );
}
// . return the data of the first matching tag, parsed as a signed int32_t
// . a tag matches if its m_type is "tagType" and it is not TT_DUP
// . if "bookmark" is NULL the scan starts at the first tag, otherwise it
//   resumes with the tag after *bookmark
// . on a match *bookmark, *timestamp and *user are filled in from that tag
//   (each only if the pointer is non-NULL)
// . returns "defalt", and leaves the out parms alone, if nothing matches
int32_t TagRec::getLong ( int32_t tagType ,
			  int32_t defalt ,
			  Tag **bookmark ,
			  int32_t *timestamp ,
			  char **user ) {
	// start here
	Tag *tag ;
	if ( ! bookmark ) tag = getFirstTag();
	else tag = getNextTag ( *bookmark );
	// loop over all tags in the buf
	for ( ; tag ; tag = getNextTag ( tag ) ) {
		// skip if not a match
		if ( tag->m_type != tagType ) continue;
		// skip dups
		if ( tag->m_type == TT_DUP ) continue;
		// everything is a string now
		char *data = tag->getTagData();
		int32_t dataSize = tag->getTagDataSize();
		// . if it ends in a \0 do not count it
		// . only look at the last byte if there is one: for empty
		//   data, data[-1] is not tag data and the size would drop
		//   to -1
		if ( dataSize > 0 && data[dataSize-1] == '\0' ) dataSize--;
		// convert string to value, MUST be signed!!!
		int32_t score = atol2(data,dataSize);
		// bookmark, et al
		if ( bookmark ) *bookmark = tag;
		if ( timestamp ) *timestamp = tag->m_timestamp;
		if ( user ) *user = tag->getUser();
		return score;
	}
	// not found
	return defalt;
}
// . return the data of the first matching tag, parsed as an int64_t
// . a tag matches if its m_type is the type named "tagTypeStr" and it is
//   not TT_DUP
// . if "bookmark" is NULL the scan starts at the first tag, otherwise it
//   resumes with the tag after *bookmark
// . on a match *bookmark, *timestamp and *user are filled in from that tag
//   (each only if the pointer is non-NULL)
// . returns "defalt", and leaves the out parms alone, if nothing matches
int64_t TagRec::getLongLong ( char *tagTypeStr,
			      int64_t defalt ,
			      Tag **bookmark ,
			      int32_t *timestamp ,
			      char **user ) {
	int32_t tagType = getTagTypeFromStr ( tagTypeStr );
	// start here
	Tag *tag ;
	if ( ! bookmark ) tag = getFirstTag();
	else tag = getNextTag ( *bookmark );
	// loop over all tags in the buf
	for ( ; tag ; tag = getNextTag ( tag ) ) {
		// skip if not a match
		if ( tag->m_type != tagType ) continue;
		// skip dups
		if ( tag->m_type == TT_DUP ) continue;
		// now everything is a string
		char *data = tag->getTagData();
		int32_t dataSize = tag->getTagDataSize();
		// . if it ends in a \0 do not count it
		// . only look at the last byte if there is one: for empty
		//   data, data[-1] is not tag data and the size would drop
		//   to -1
		if ( dataSize > 0 && data[dataSize-1] == '\0' ) dataSize--;
		// convert string to value
		int64_t score = atoll2(data,dataSize);
		// bookmark, et al
		if ( bookmark ) *bookmark = tag;
		if ( timestamp ) *timestamp = tag->m_timestamp;
		if ( user ) *user = tag->getUser();
		return score;
	}
	// not found
	return defalt;
}
char *TagRec::getString ( char *tagTypeStr,
char *defalt ,
int32_t *size ,
Tag **bookmark ,
int32_t *timestamp ,
char **user ) {
int32_t tagType = getTagTypeFromStr ( tagTypeStr );
// start here
Tag *tag ;
if ( ! bookmark ) tag = getFirstTag();
else tag = getNextTag ( *bookmark );
// loop over all tags in the buf
for ( ; tag ; tag = getNextTag ( tag ) ) {
// skip if not a match
if ( tag->m_type != tagType ) continue;
// skip dups
if ( tag->m_type == TT_DUP ) continue;
// want size? includes \0 probably
if ( size ) *size = tag->getTagDataSize();//m_dataSize;
// bookmark, et al
if ( bookmark ) *bookmark = tag;
if ( timestamp ) *timestamp = tag->m_timestamp;
if ( user ) *user = tag->getUser();
// return it
return tag->getTagData();//m_data;
}
// not found
return defalt;
}
/*
// add a special tag with null m_data. this tells Msg9a to delete
// all tags of this tag type before adding any other tags of this type
// that we might have. it is basically a "negative" tag.
bool TagRec::addDelTag ( char *tagTypeStr ) {
return addTag ( tagTypeStr ,
0 , // timestamp
NULL , // user
0 , // ip
NULL , // data
0 );// dataSize
}
// returns false and sets g_errno on error
bool TagRec::addTag ( char *tagTypeStr,
int32_t timestamp ,
char *user ,
int32_t ip ,
char *data ,
int32_t dataSize ) {
// get the tagType
int32_t tagType = getTagTypeFromStr ( tagTypeStr );
// breach check
if ( dataSize + sizeof(Tag) > MAX_TAGREC_SIZE ) {
g_errno = EBUFTOOSMALL;
return log("tagdb: no room to add tag");
}
// the Tag::m_dataSize is only 2 bytes... NOT ANYMORE, MDW
if ( dataSize < 0 ) { // >= 65536 ) {
g_errno = EBADENGINEER;
return log("tagdb: tag dataSize of %"INT32" is >= 65536. "
"Bad value.", dataSize);
}
// sanity check -- no binary chars allowed, must all be strings!
// BUT they can have an empty string (i.e. just \0)
if ( dataSize == 1 && data[0] < 9 && data[0] >= 0 && data[0] ) {
char *xx=NULL;*xx=0; }
// make a tag
char buf[MAX_TAGREC_SIZE];
Tag *tag = (Tag *)buf;
// fill it in
tag->m_type = tagType;
tag->m_timestamp = timestamp;
tag->m_ip = ip;
tag->m_dataSize = dataSize;
// dummy value for now
tag->m_tagId = 0;
// careful!
if ( sizeof(Tag) + dataSize + 10 > MAX_TAGREC_SIZE ) {
g_errno = EBUFTOOSMALL;
return log("tagdb: no room to add tag data");
}
// store user into special buffer
int32_t ulen = 0;
if ( user ) {
ulen = gbstrlen(user);
if ( ulen > 7 ) ulen = 7;
}
memset ( tag->m_user , 0 , 8 );
gbmemcpy ( tag->m_user , user , ulen );
// store data now too
gbmemcpy ( tag->m_data , data , dataSize );
// NULL terminate if they did not! now all tag are strings and must
// be NULL terminated.
if ( data && tag->m_data[dataSize-1] ) {
tag->m_data[dataSize] = '\0';
dataSize++;
tag->m_dataSize++;
}
// the id is the hash for now (MDW)
tag->m_tagId = hash32 ( (char *)tag,(int32_t)sizeof(tag)+dataSize , 0 );
// 0 is not valid
if ( tag->m_tagId == 0 ) tag->m_tagId = 1;
// now add that tag
return addTag ( tag );
}
// returns false and sets g_errno on error
bool TagRec::addTag ( Tag *TAG ) {
// . do not allow empty user
// . but "del tags" i.e. "negative tags" can have no user
if ( TAG->m_dataSize>0 && (!TAG->m_user || TAG->m_user[0] == '\0') ) {
char *xx=NULL;*xx=0;}
// sanity check
if ( TAG->m_tagId == 0 ) { char *xx=NULL;*xx=0;}
// come back up here if we did a remove operation
loop:
// start at the first tag
Tag *tag = getFirstTag();
// loop over all tags in the buf, see if we got a dup
for ( ; tag ; tag = getNextTag ( tag ) ) {
// skip if not matching id
if ( tag->m_type != TAG->m_type ) continue;
// skip if does not match user
if ( memcmp(tag->m_user,TAG->m_user,7) ) continue;
// data now has to match too, so we will allow tags of the
// same type from the same user to be added if they have
// different data now. i would only do this for strings,
// but for int32_ts and chars i would skip this check...
// so only replace "unique" tags of the same type.
// mostly strings and embedded tag recs will be non-unquie
if ( ! isTagTypeUnique ( tag->m_type ) ) {
if ( tag->m_dataSize != TAG->m_dataSize ) continue;
if ( memcmp(tag->m_data,TAG->m_data,tag->m_dataSize))
continue;
}
// Msg8a allows multiple ST_SITE tags in order to indicate
// what sites the other tags came from (i.e. used by the
// inheritance loop below)
// MDW: This is now covered by isTagTypeUnique() above.
//if ( tag->m_type == ST_SITE ) continue;
// it does match, so replace it!
//removeTags ( tag->m_type , tag->m_user );
removeTag ( tag );
// start from the top
goto loop;
}
// . ok, we "deduped" the tag
// . point to the end of the buf
char *p = getRecEnd();
// get the max end
char *pend = getMaxEnd();
// how much do we need?
int32_t need = TAG->getSize();
// breach?
if ( p + need > pend ) {
char *site = getString("site","unknown");
g_errno = EBUFTOOSMALL;
log("tagdb: no room to add tag to buf. tagtype=%s "
"tagsize=%"INT32" site=%s",
getTagStrFromType ( TAG->m_type ) , need , site );
//char *xx=NULL;*xx=0;
return false;
}
// store it
gbmemcpy ( p , TAG , need );
// update our counters
m_numTags++;
m_dataSize += need;
// SPECIAL: if it was ST_SITE, set our m_key, we are an Rdb record
//if ( TAG->m_type != ST_SITE ) return true;
if ( ! TAG->isType ("site") ) return true;
// set the key
Url u;
// convenience
char *site = TAG->m_data;
int32_t size = TAG->m_dataSize;
// sanity check
if ( site[size-1] != '\0' ) { char *xx=NULL;*xx=0; }
// do not start with http:// ! wastes space!!
if (size>=8 && strncmp(site,"http://",7)==0 ) {
log("tagdb: don't sotre http:// in tags!");
char *xx=NULL;*xx=0;
}
// do not include the NULL
u.set ( site , size - 1 );
// set our key, the endKey is our "startKey"
m_key = g_tagdb.makeKey ( &u , false ); // isDelete?
// success, return true
return true;
}
bool TagRec::removeTags ( char *tagTypeStr , char *user , int32_t tagId ) {
int32_t tagType = getTagTypeFromStr ( tagTypeStr );
return removeTags ( tagType , user , tagId );
}
bool TagRec::removeTags ( int32_t tagType , char *user , int32_t tagId ) {
loop:
// start at the first tag
Tag *tag = getFirstTag();
// loop over all tags in the rec, see if we got a dup
for ( ; tag ; tag = getNextTag ( tag ) ) {
// id if matches, that is good enough
if ( tagId && tag->m_tagId != tagId ) continue;
// skip if not matching id
if ( tagId == 0 && tag->m_type != tagType ) continue;
// skip if does not match user
if ( tagId == 0 && user && memcmp(tag->m_user,user,7))continue;
// remove that tag
removeTag ( tag );
// re do loop
goto loop;
}
// success
return true;
}
bool TagRec::removeTag ( Tag *rmTag ) {
// save this
int32_t oldn = m_numTags;
// start at the first tag
Tag *tag = getFirstTag();
// loop over all tags in the rec, see if we got a dup
for ( ; tag ; tag = getNextTag ( tag ) ) {
// must be it
if ( tag != rmTag ) continue;
// copy to here
char *dst = (char *)tag;
// size of tag we are removing
int32_t size = tag->getSize();
// from here
char *src = dst + size;
// end of tag buffer
char *pend = getRecEnd();
// byte to move
int32_t move = pend - src;
// it does match, so replace it!
gbmemcpy ( dst , src , move );
// decrement counts
m_numTags--;
m_dataSize -= size;
}
// sanity check
if ( m_numTags != oldn - 1 ) { char *xx=NULL;*xx=0; }
// success, return true
return true;
}
// add all the tags from "tagRec" to our list of tags
bool TagRec::addTags ( TagRec *tagRec ) {
// start at the first tag
Tag *tag = tagRec->getFirstTag();
// . remove any tag of any of the tag types we got in "tagRec" ?
// . deal with "negative" tags
// . used by TagRec::addDelTag() above
for ( ; tag ; tag = tagRec->getNextTag ( tag ) ) {
// if tag has m_data, skip.
if ( tag->m_data && tag->m_dataSize > 0 ) continue;
// otherwise, it is a signal to nuke all tags of this type
removeTags ( tag->m_type , NULL );
}
// start at the first tag again
tag = tagRec->getFirstTag();
// loop over all tags in the buf, see if we got a dup
for ( ; tag ; tag = tagRec->getNextTag ( tag ) ) {
// skip if it was a delete tag
if ( tag->m_dataSize <= 0 ) continue;
// do not transfer over ST_SITE tags if we already got one
//if ( tag->m_type == ST_SITE && getTag ( ST_SITE ) ) continue;
if ( tag->isType("site") && getTag("site") ) continue;
// add it, return false on error, g_errno should be set
if ( ! addTag ( tag ) ) return false;
}
return true;
}
// add all the tags from "tagRec" to our list of tags
bool TagRec::removeTags ( TagRec *tagRec ) {
// start at the first tag
Tag *tag = tagRec->getFirstTag();
// loop over all tags in the buf, see if we got a dup
for ( ; tag ; tag = tagRec->getNextTag ( tag ) ) {
// do not remove ST_SITE tags
//if ( tag->m_type == ST_SITE ) continue;
if ( tag->isType("site") ) continue;
// add it, return false on error, g_errno should be set
if ( ! removeTags ( tag->m_type , tag->m_user ) ) return false;
}
return true;
}
Tag *TagRec::getNextTag ( Tag *tag ) {
if ( m_numTags == 0 ) return NULL;
if ( ! tag ) return (Tag *)m_buf;
char *tagEnd = getRecEnd();
int32_t size = tag->getSize();
char *ret = ((char *)tag) + size;
// overboard?
if ( ret >= tagEnd ) return NULL;
return (Tag *)ret;
}
*/
// return the number of tags having the particular TagType
int32_t TagRec::getNumTagTypes ( char *tagTypeStr ) {
int32_t tagType = getTagTypeFromStr ( tagTypeStr );
int32_t numTagType = 0;
// start at the first tag
Tag *tag = getFirstTag();
// loop over all tags in the buf, see if we got a dup
for ( ; tag ; tag = getNextTag ( tag ) ) {
// skip dups
if ( tag->m_type == TT_DUP ) continue;
// if there is tagType match then increment the count
if ( tag->m_type == tagType ) numTagType++;
}
return numTagType;
}
int32_t TagRec::getNumTags ( ) {
int32_t numTags = 0;
// start at the first tag
Tag *tag = getFirstTag();
// loop over all tags in the buf, see if we got a dup
for ( ; tag ; tag = getNextTag ( tag ) )
// skip dups
if ( tag->m_type != TT_DUP ) numTags++;
return numTags;
}
// . &tagtype<N>=<tagtype> (N = 0, 1, 2, ...)
// . &tagdata<N>=<data>
// . &deltag<N>=1 (to delete it)
// . set &user=mwells, etc. in cookie of HttpRequest, "r" for user
// . "this" TagRec's user, ip and timestamp will be carried over to "newtr"
// . returns false and sets g_errno on error
// . summary of the request fields read below:
//   . "u"   - a single url/site the tags apply to
//   . "ufu" - name of a file of urls, only looked at when "u" is empty
//   . then for i = 0,1,2,... until "tagtype<i>" is missing:
//     tagtype<i>, deltag<i>, taguser<i>, tagtime<i>, tagip<i>, tagdata<i>,
//     tagn1key<i>, tagn0key<i> and the overriding tagn1key<i>b/tagn0key<i>b
// . the tag records (and negative keys for deletes) are appended to m_sbuf
// . returns true without adding anything if neither "u" nor "ufu" is given
bool TagRec::setFromHttpRequest ( HttpRequest *r, TcpSocket *s ) {
	// ip of the requesting socket, the default for "tagip<i>"
	int32_t ip = 0;
	if ( s ) ip = s->m_ip;
	// current time, the default for "tagtime<i>"
	int32_t now = getTimeGlobal();
	// put all urls in this buffer
	SafeBuf fou;
	// a single url given directly in the request
	int32_t uslen;
	char *us = r->getString("u",&uslen);
	if ( uslen <= 0 ) us = NULL;
	// . bail if we could not copy the url
	// . otherwise we would scan an empty/short buffer below and still
	//   report success
	if ( us && ! fou.safeMemcpy ( us , uslen ) ) return false;
	// read in file, file of urls
	int32_t ufuLen;
	char *ufu = r->getString("ufu",&ufuLen);
	if ( ufuLen <= 0 ) ufu = NULL;
	if ( us ) ufu = NULL; // exclusive
	// NOTE(review): the result of fillFromFile() is not checked here;
	// confirm what it returns on a missing/unreadable file
	if ( ufu ) fou.fillFromFile ( ufu );
	// if st->m_urls has multiple urls, this "u" is not given in the
	// http request! but a filename is... and Msg9::addTags() should add
	// the ST_SITE field anyway...
	if ( ! ufu && ! us ) return true;
	// . make it null terminated since we no longer do this automatically
	// . the urls are handed out as C strings below, so an unterminated
	//   buffer must not be used
	if ( ! fou.pushChar('\0') ) return false;
	// loop over all tags in the request
	for ( int32_t i = 0 ; ; i++ ) {
		char buf[32];
		sprintf ( buf , "tagtype%"INT32"",i );
		char *tagTypeStr = r->getString(buf,NULL,NULL);
		// if not there we are done
		if ( ! tagTypeStr ) break;
		// should we delete it?
		sprintf ( buf , "deltag%"INT32"",i);
		char *deltag = r->getString(buf,NULL,NULL);
		// user defaults to "admin"
		sprintf ( buf , "taguser%"INT32"",i);
		char *tagUser = r->getString( buf,NULL,"admin");
		// timestamp defaults to now
		sprintf ( buf , "tagtime%"INT32"",i);
		int32_t tagTime = r->getLong(buf,now);
		// ip defaults to the ip of the requesting socket
		sprintf ( buf , "tagip%"INT32"",i);
		int32_t tagIp = r->getLong(buf,ip);
		// get the value of this tag
		sprintf ( buf , "tagdata%"INT32"" , i );
		char *dataPtr = r->getString ( buf , NULL );
		// get the tag original key
		key128_t key;
		sprintf ( buf , "tagn1key%"INT32"" , i );
		key.n1 = r->getLongLong ( buf, 0 );
		sprintf ( buf , "tagn0key%"INT32"" , i );
		key.n0 = r->getLongLong ( buf, 0LL );
		// . for supporting dumping/adding of tagdb using wget
		// . the "b" variants override the key and default to the
		//   values read just above
		sprintf ( buf , "tagn1key%"INT32"b" , i );
		int64_t v1 = r->getLongLong ( buf, key.n1 );
		sprintf ( buf , "tagn0key%"INT32"b" , i );
		int64_t v0 = r->getLongLong ( buf, key.n0 );
		// a non-zero key was supplied, so force it onto the new tag
		bool hackKey = ( v1 || v0 );
		key.n1 = v1;
		key.n0 = v0;
		// if empty skip it
		if ( ! dataPtr ) continue;
		if ( ! dataPtr[0] ) continue;
		// everything is now a string, so include the \0
		int32_t dataSize = gbstrlen(dataPtr) + 1;
		// . loop over all urls in the url buffer
		// . NOTE(review): each line terminator is overwritten with \0
		//   in place below, so on later passes of the outer tag loop
		//   the separators are gone. confirm getNextLine() still
		//   walks every url for tags after the first one
		char *up = fou.getBufStart();
		for ( ; ; ) {
			// set url
			char *urlPtr = up;
			// stop if EOF or processed the one url
			if ( ! urlPtr ) break;
			// advance it or NULL it out
			up = fou.getNextLine ( up );
			// null term the url ptr
			if ( up ) up[-1] = '\0';
			// save buffer spot in case we have to rewind
			int32_t saved = m_sbuf.length();
			// . add the tag as a tagdb rec to our safebuf
			// . the last arg says not to push the rdbid
			Tag *tag = m_sbuf.addTag ( urlPtr,
						   tagTypeStr ,
						   tagTime ,
						   tagUser ,
						   tagIp ,
						   dataPtr,
						   dataSize ,
						   RDB_TAGDB,
						   // do not push rdbid into safebuf
						   false ) ;
			// error?
			if ( ! tag )
				return false;
			// hack the key
			if ( hackKey )
				tag->m_key = key;
			bool deleteOldKey = false;
			// if tag has different key, delete the old one
			if ( key.n1 && tag->m_key != key ) deleteOldKey = true;
			// if del was marked, delete old one and do not add new
			if ( deltag && deltag[0] ) {
				// rewind over the tag we were about to add
				m_sbuf.setLength ( saved );
				// and add as a delete
				deleteOldKey = true;
			}
			if ( ! deleteOldKey ) continue;
			// make it negative by clearing the low bit
			key128_t delKey = key;
			delKey.n0 &= 0xfffffffffffffffeLL;
			if (! m_sbuf.safeMemcpy((char *)&delKey,
						sizeof(key128_t)))
				return false;
		}
	}
	return true;
}
// . dump all of our tags to stderr, one tag per line
// . returns whatever fprintf() returns
int32_t TagRec::print ( ) {
	SafeBuf sb;
	printToBuf ( &sb );
	// . printToBuf() ends each tag with pushChar('\n') and the buffer is
	//   not \0 terminated for us, so print by explicit length
	// . also covers the case of no tags at all (no buffer)
	const char *start = sb.getBufStart();
	int32_t len = sb.length();
	if ( ! start ) { start = ""; len = 0; }
	// dump that
	return fprintf(stderr,"%.*s\n",(int)len,start);
}
// . print every tag except the TT_DUP ones into "sb", one per line
// . always returns true
bool TagRec::printToBuf ( SafeBuf *sb ) {
	Tag *cur = getFirstTag();
	while ( cur ) {
		// TT_DUP tags are not printed
		if ( cur->m_type != TT_DUP ) {
			cur->printToBuf ( sb );
			sb->pushChar('\n');
		}
		cur = getNextTag ( cur );
	}
	return true;
}
// . return size of characters scanned from "p"
// . returns 0 on error
/*
int32_t TagRec::setFromBuf ( char *p , char *pend ) {
// remember the start
char *start = p;
// scan in the key
//if ( strncmp(p,"k.n1=0x",7) != 0 ) return 0;
// skip key stuff
//p += 7;
// clear our key
//m_key.setToMin();
// read in the key
//key_t k;
//sscanf(p,"k.n1=0x%08"XINT32" k.n0=0x%016"XINT64" ",&k.n1,&k.n0);
// now do it the fast way and compare the results!
//p += 7 ;
//hexToBinary ( p , pend , ((char *)&m_key.n1)+3 , true );
//p += 8 + 8;
//hexToBinary ( p , pend , ((char *)&m_key.n0)+7 , true );
// test it
//if ( m_key.n1 != k.n1 || m_key.n0 != k.n0 ) { char *xx=NULL; *xx=0; }
//p = strstr ( p , " version=");
// error?
//if ( ! p ) return 0;
// skip " version="
//p += 9;
// get version
//m_version = atoi(p);
// skip p until space
//while ( p < pend && *p != ' ' ) p++;
// error?
//if ( p >= pend ) return 0;
// skip the space -- NO! tag parser wants the space
//p++;
// point to the where we should serialize the tags into
//char *tagPtr = m_buf;
char tbuf[5000];
while ( p < pend ) {
// now we should be pointing to the tag
Tag *tag = (Tag *)tbuf;
// serialize the tag from the buf
int32_t asciiBytesRead = tag->setFromBuf ( p , pend );
// if bad this is 0
if ( asciiBytesRead == 0 ) return 0;
// store tag into our safebuf. return 0 with g_errno set on err
// . mdwmdwmdw
if ( ! m_sbuf.addTag ( tag ) ) return 0;
// point to next tag to read into our binary buffer
//p += asciiBytesRead;
// inc our ptr to point to next tag if it exists
//tagPtr += tag->getSize();
// inc our count in the TagRec
//m_numTags++;
// adjust our tag buffer size, TagRec::m_dataSize
//m_dataSize = tagPtr - m_buf;
// hey, it includes the other crap too!
// it includes m_numTags + m_version, see Tagdb.h
//m_dataSize += 2 + 1;
}
// clear all lists
//resetLists();
// now make list point to that
//m_lists[0].m_list = m_sbuf.getBufStart();
//m_lists[0].m_listSize = m_sbuf.length();
//m_lists[0].m_listAllocSize = 0; // do not free it!
//m_numLists = 0;
//return getSize();
return p - start;
}
*/
bool TagRec::setFromBuf ( char *p , int32_t bufSize ) {
// assign to list! but do not free i guess
m_lists[0].m_list = p;
m_lists[0].m_listSize = bufSize;
m_lists[0].m_listEnd = p + bufSize;
m_lists[0].m_ownData = false;
m_lists[0].m_lastKeyIsValid = false;
m_lists[0].m_fixedDataSize = -1;
m_lists[0].m_useHalfKeys = false;
m_lists[0].m_ks = sizeof(key128_t);
m_listPtrs[0] = &m_lists[0];
m_numListPtrs = 1;
return true;
}
// . add each of our tags, except the TT_DUP ones, to "dst"
// . returns false as soon as dst.addTag() fails
bool TagRec::serialize ( SafeBuf &dst ) {
	Tag *cur = getFirstTag();
	while ( cur ) {
		bool isDup = ( cur->m_type == TT_DUP );
		if ( ! isDup && ! dst.addTag ( cur ) ) return false;
		cur = getNextTag ( cur );
	}
	return true;
}
// . have each non-TT_DUP tag print itself into "sb" as an add request
// . always returns true
bool TagRec::printToBufAsAddRequest ( SafeBuf *sb ) {
	for ( Tag *t = getFirstTag() ; t ; t = getNextTag ( t ) ) {
		// skip dups
		if ( t->m_type == TT_DUP ) continue;
		t->printToBufAsAddRequest ( sb );
	}
	return true;
}
// . have each non-TT_DUP tag print itself into "sb" as xml
// . always returns true
bool TagRec::printToBufAsXml ( SafeBuf *sb ) {
	for ( Tag *t = getFirstTag() ; t ; t = getNextTag ( t ) ) {
		// skip dups
		if ( t->m_type == TT_DUP ) continue;
		t->printToBufAsXml ( sb );
	}
	return true;
}
// . have each non-TT_DUP tag print itself into "sb" as html
// . "prefix" is handed through to each tag unchanged
// . always returns true
bool TagRec::printToBufAsHtml ( SafeBuf *sb , char *prefix ) {
	for ( Tag *t = getFirstTag() ; t ; t = getNextTag ( t ) ) {
		// skip dups
		if ( t->m_type == TT_DUP ) continue;
		t->printToBufAsHtml ( sb , prefix );
	}
	return true;
}
// . have each non-TT_DUP tag print itself into "sb" in tag vector form
// . always returns true
bool TagRec::printToBufAsTagVector ( SafeBuf *sb ) {
	for ( Tag *t = getFirstTag() ; t ; t = getNextTag ( t ) ) {
		// skip dups
		if ( t->m_type == TT_DUP ) continue;
		t->printToBufAsTagVector ( sb );
	}
	return true;
}
// . find the first tag of type "tagTypeStr" whose data is byte-for-byte
//   the "dataSize" bytes at "dataPtr"
// . TT_DUP tags never match
// . returns NULL if there is no such tag
Tag *TagRec::getTag ( char *tagTypeStr , char *dataPtr , int32_t dataSize ) {
	// the numeric tag type we are looking for
	int32_t wanted = getTagTypeFromStr ( tagTypeStr );
	Tag *cur = getFirstTag();
	while ( cur ) {
		// type, then dup check, then size, then the data itself
		bool match =
			cur->m_type == wanted &&
			cur->m_type != TT_DUP &&
			cur->getTagDataSize() == dataSize &&
			! memcmp ( cur->getTagData() , dataPtr , dataSize );
		if ( match ) return cur;
		cur = getNextTag ( cur );
	}
	// no match
	return NULL;
}
//
// flags for a TagDescriptor
//
// . these are bit flags, OR'd together into TagDesc::m_flags
// . set if the tag's data is a string
#define TDF_STRING 0x01
// set if we can have multiple tags of this type from the same user in the
// same TagRec
#define TDF_ARRAY 0x02
// . set if the tag should NOT be indexed
// . tags without this flag are meant to be indexed like so:
// . index gbtagjapanese:<score>
// . also index "gbtagjapanese" if score != 0
// . TODO: actually use this
#define TDF_NOINDEX 0x04
// describes one supported tag type, see the s_tagDesc[] table
class TagDesc {
public:
// the name of the tag, like "site" or "manualban"
char *m_name;
// the TDF_* flags OR'd together
char m_flags;
// . we compute the m_type of each TD on init
// . Tagdb::setHashTable() sets it to the hash of m_name
int32_t m_type;
};
// . map the tags to names
// . each entry is { tag name , TDF_* flags , m_type }
// . m_type is 0 here, Tagdb::setHashTable() fills it in
// . Tagdb::setHashTable() refuses two names that hash to the same type
static TagDesc s_tagDesc[] = {
// data for the "lang" tag is 2 char language id followed by
// a comma then a score from 1 to 100 to indicate percentage.
// Allow multiple "lang" tags in one tagrec.
// NOTE(review): the entry below is "rootlang" and does not set TDF_ARRAY,
// confirm whether the comment above still applies
{"rootlang" ,TDF_STRING,0},
// title tag and incoming link text of the root page is stored here
// for determining default venue addresses
{"roottitles" ,TDF_STRING|TDF_NOINDEX,0},
//{"rootlangid" ,TDF_STRING|TDF_NOINDEX,0},
// for addresses of the website, can be multiple
{"venueaddress" ,TDF_STRING|TDF_ARRAY|TDF_NOINDEX,0},
/*
{"langunknown" ,0x00,0},
{"english" ,0x00,0},
{"french" ,0x00,0},
{"spanish" ,0x00,0},
{"russian" ,0x00,0},
{"turkish" ,0x00,0},
{"japanese" ,0x00,0},
{"chinesetraditional" ,0x00,0},
{"chinesesimplified" ,0x00,0},
{"korean" ,0x00,0},
{"german" ,0x00,0},
{"dutch" ,0x00,0},
{"italian" ,0x00,0},
{"finnish" ,0x00,0},
{"swedish" ,0x00,0},
{"norwegian" ,0x00,0},
{"portuguese" ,0x00,0},
{"vietnamese" ,0x00,0},
{"arabic" ,0x00,0},
{"hebrew" ,0x00,0},
{"indonesian" ,0x00,0},
{"greek" ,0x00,0},
{"thai" ,0x00,0},
{"hindi" ,0x00,0},
{"bengala" ,0x00,0},
{"polish" ,0x00,0},
{"tagalog" ,0x00,0},
*/
/*
{"spam" ,0x00,0},
{"retail" ,0x00,0},
{"business" ,0x00,0},
{"adult" ,0x00,0},
{"forum" ,0x00,0},
{"blog" ,0x00,0},
{"news" ,0x00,0},
{"reference" ,0x00,0},
{"directory" ,0x00,0},
{"searchengine" ,0x00,0},
{"domainsquatter" ,0x00,0},
{"platform" ,0x00,0},
{"travel" ,0x00,0},
{"audio" ,0x00,0},
{"video" ,0x00,0},
{"socialnetworking" ,0x00,0},
*/
{"manualban" ,0x00,0},
{"manualfilter" ,0x00,0},
// clock hashes are now stored in indexdb
//{"clock" ,0x00,0},
{"dateformat" ,0x00,0}, // 1 = american, 2 = european
{"ruleset" ,0x00,0},
//{"filtered" ,0x00,0},
//{"compromised" ,0x00,0},
//{"good" ,0x00,0},
{"deep" ,0x00,0},
//{"quality" ,0x00,0},
//{"dmozcatid" ,TDF_NOINDEX,0},
{"comment" ,TDF_STRING|TDF_NOINDEX,0},
// we now index this. really we need it for storing into title rec.
{"site" ,TDF_STRING|TDF_ARRAY,0},
//{"meta" ,TDF_STRING,0},
// . website contact info
// . used by ContactInfo.cpp
// . TDF_ARRAY means not to "overwrite" even if username is the same
// . a website can have multiple street addresses, etc.
// . the "lines" of a single street address are separated by ';'
// instead of \n to maintain tagdb dump output readability
//{"streetaddress" ,TDF_ARRAY,0},
//{"phonenumber" ,TDF_ARRAY,0},
//{"faxnumber" ,TDF_ARRAY,0},
//{"emailaddress" ,TDF_ARRAY,0},
// . this tag can contain multiple zipcodes, separated by ' '
// . we do index these for local search
//{"zipcodes" ,0x00,0},
// . similar to zip codes, separated by ' '
// . TODO: we need to fix Places.cpp to label the places for these tags
// but for now we can do gbtagstreetaddress:munich and hope for
// the best, although we will get websites on "munich st.!", but
// maybe you can combine that with gbtagstreetaddress:germany
//{"countries", ,0x00,0},
//{"cities", ,0x00,0},
// this is "0" or "1". if it is "0" then the date lets XmlDoc.cpp know
// when we last tried to get the contact info for the site
{"hascontactinfo" ,0x00,0},
// street address using ; as delimiter
{"contactaddress" ,TDF_ARRAY|TDF_NOINDEX,0},
{"contactemails" ,TDF_ARRAY|TDF_NOINDEX,0},
//{"emailaddressonsite" ,TDF_ARRAY|TDF_NOINDEX,0},
//{"emailaddressoffsite" ,TDF_ARRAY|TDF_NOINDEX,0},
{"hascontactform" ,0x00,0},
// subscribe to google's blacklist and mark the sites as this
//{"malware" ,0x00,0},
// . this is used to define INDEPENDENT subsites
// . such INDEPENDENT subsites should never inherit from this tag rec
// . it is used to handle "homesteading" sites like geocities.com
// and the like, and is automatically set by SiteGetter.cpp
// . if this is 1 then xyz.com/yyyyy/ is considered a subsite
// . if this is 2 then xyz.com/yyyyy/zzzzz/ is considered a subsite
// . if this is -1 then no subsite is found
// . this should never be 0 either
{"sitepathdepth" ,0x00,0},
// . used by XmlDoc::updateTagdb() and also used to determine
// if we should index a site in XmlDoc.cpp. to be indexed a site
// must be in google, or must have this tag type in its tag rec,
// or have some other, soon to be invented, tag
// . really this is all controlled by url filters table
// . allow multiple tags of this type from same "user"
{"authorityinlink" ,TDF_STRING|TDF_ARRAY,0},
{"pagerank" ,0x00,0},
{"ingoogle" ,0x00,0},
{"ingoogleblogs" ,0x00,0},
{"ingooglenews" ,0x00,0},
// geo location from this news site directory
{"abyznewslinks.address",0x00,0},
// we now store site pop, etc. in tagdb
{"sitenuminlinks" ,0x00,0},
{"sitenuminlinksuniqueip" ,0x00,0},
{"sitenuminlinksuniquecblock" ,0x00,0},
{"sitenuminlinkstotal" ,0x00,0},
// keep these although no longer used
{"sitepop" ,0x00,0},
{"sitenuminlinksfresh" ,0x00,0},
// . the first ip we lookup for this domain
// . this is permanent and should never change
// . it is used by Spider.cpp to assign a host for throttling
// all urls/SpiderRequests from that ip
// . so if we did change it then that would result in two hosts
// doing the throttling, really messing things up
{"firstip" ,0x00,0}
/*
{"user.id" ,0x00,0},
{"user.xml" ,TDF_STRING,0},
{"user.login" ,TDF_STRING,0},
{"user.password" ,TDF_STRING,0},
{"user.securityquestion",TDF_STRING,0},
{"user.securityanswer" ,TDF_STRING,0},
{"user.email" ,TDF_STRING,0},
{"user.firstname" ,TDF_STRING,0},
{"user.lastname" ,TDF_STRING,0},
{"user.cookie" ,TDF_STRING,0},
{"user.zipcode" ,TDF_STRING,0},
{"user.city" ,TDF_STRING,0},
{"user.state" ,TDF_STRING,0},
{"user.imageurl" ,TDF_STRING,0},
{"user.dob" ,TDF_STRING,0},
{"user.language" ,TDF_STRING,0},
{"user.creditcardname" ,TDF_STRING,0},
{"user.creditcardnum" ,TDF_STRING,0},
{"user.creditcardexp" ,TDF_STRING,0},
{"user.creditcardcode" ,TDF_STRING,0},
{"user.lastlogin" ,0x00,0},
{"user.numlogins" ,0x00,0},
{"user.openlinksnewwin" ,0x00,0},
{"user.usehttps" ,0x00,0},
{"user.maxreadhist" ,0x00,0},
{"user.maxsearchhist" ,0x00,0},
{"user.format" ,0x00,0},
{"user.acctbalance" ,0x00,0},
{"user.acctlimit" ,0x00,0},
{"user.acctsuspended" ,0x00,0},
{"user.acctbillemails" ,TDF_STRING,0},
{"user.adstopicid" ,0x00,0},
{"user.adsdailybudget" ,0x00,0},
{"user.adsdisabled" ,0x00,0},
{"user.feednumqueries" ,0x00,0},
{"user.feedcpq" ,0x00,0},
{"user.feeddailybudget" ,0x00,0},
{"user.feeddisabled" ,0x00,0},
{"user.feedpassword" ,TDF_STRING,0},
{"user.feeddailycount" ,TDF_ARRAY,0},
{"user.usertransrec" ,TDF_ARRAY,0},
{"user.userhistoryrec" ,TDF_ARRAY,0},
{"user.userpanelrec" ,TDF_ARRAY,0},
{"trans.amount" ,0x00,0},
{"trans.desc" ,TDF_STRING,0},
{"hist.wasread" ,0x00,0},
{"hist.url" ,TDF_STRING,0},
{"hist.gigabits" ,TDF_STRING,0},
{"hist.timespent" ,0x00,0},
{"panel.topcid" ,0x00,0},
{"panel.showmainstream" ,0x00,0},
{"panel.showblogs" ,0x00,0},
{"panel.showforum" ,0x00,0},
{"panel.showweb" ,0x00,0},
{"panel.showsearchbox" ,0x00,0},
{"panel.showimages" ,0x00,0},
{"panel.showvideo" ,0x00,0},
{"panel.showchatbox" ,0x00,0},
{"panel.showchatpics" ,0x00,0},
{"panel.chatboxnumlines",0x00,0},
{"panel.popsliderval" ,0x00,0},
{"panel.agesliderval" ,0x00,0},
{"panel.windowxpos" ,0x00,0},
{"panel.windowypos" ,0x00,0},
{"panel.numstories" ,0x00,0},
{"panel.storylang" ,TDF_STRING,0},
{"panel.translatelang" ,TDF_STRING,0},
{"panel.displaylang" ,TDF_STRING,0},
{"panel.filterquery" ,TDF_STRING,0},
{"panel.sendemailalerts",TDF_STRING,0},
{"chat.comment" ,TDF_STRING,0},
{"ad.topicid" ,0x00,0},
{"ad.userid" ,0x00,0},
{"ad.adid" ,0x00,0},
{"ad.title" ,TDF_STRING,0},
{"ad.text" ,TDF_STRING,0},
{"ad.url" ,TDF_STRING,0},
{"ad.keywordstring" ,TDF_STRING,0},
{"ad.dailypledge" ,0x00,0},
{"ad.disabled" ,0x00,0},
{"ad.dailyimpresscount" ,TDF_ARRAY,0},
{"ad.dailyclickcount" ,TDF_ARRAY,0}
*/
};
// . convert "domain_squatter" to ST_DOMAIN_SQUATTER
// . used by CollectionRec::getRegExpNum()
// . tagnameLen is -1 if unknown
int32_t getTagTypeFromStr( char *tagname , int32_t tagnameLen ) {
// this is now the hash
int32_t tagType;
if ( tagnameLen == -1 ) tagType = hash32n ( tagname );
else tagType = hash32 ( tagname , tagnameLen );
// make sure table is valid
if ( ! s_initialized ) g_tagdb.setHashTable();
// sanity check, make sure it is a supported tag!
if ( ! s_ht.getValue ( &tagType ) ) {
log("tagdb: unsupported tagname \"%s\"",tagname);
char *xx=NULL;*xx=0;
return -1;
}
return tagType;
}
// . convert ST_DOMAIN_SQUATTER to "domain_squatter"
char *getTagStrFromType ( int32_t tagType ) {
// make sure table is valid
if ( ! s_initialized ) g_tagdb.setHashTable();
TagDesc **ptd = (TagDesc **)s_ht.getValue ( &tagType );
// sanity check
if ( ! ptd ) { char *xx=NULL;*xx=0; }
// return it
return (*ptd)->m_name;
}
// . a global class extern'd in .h file
// . g_tagdb is the instance the tag type lookups in this file use to
//   build the hash table of tag descriptors
Tagdb g_tagdb;
// NOTE(review): presumably the secondary instance set up via
// Tagdb::init2() for rebuilds, confirm against the callers
Tagdb g_tagdb2;
// a fake site for Tagdb::convert()
//Tagdb g_sitedb;
//static HashTableT<int64_t,int32_t> s_lockTable;
//static HashTableX s_lockTable2;
// reset our rdb and purge both site buffers
void Tagdb::reset() {
	// the underlying rdb first
	m_rdb.reset();
	// then free the two site buffers
	m_siteBuf1.purge();
	m_siteBuf2.purge();
}
// . build s_ht, which maps a tag type (the hash of the tag name) to a ptr
//   to its TagDesc in s_tagDesc[]
// . also sets TagDesc::m_type of every entry
// . returns true right away if already built
// . returns false if the table could not be set up or two tag names hash
//   to the same type
// . s_initialized is only set once the table is complete, so a failed
//   attempt is not reported as success by the next call
bool Tagdb::setHashTable ( ) {
	if ( s_initialized ) return true;
	// the hashtable of TagDescriptors, 4 byte keys and ptr sized values
	if ( ! s_ht.set ( 4,sizeof(TagDesc *),1024,NULL,0,false,0,"tgdbtb" ) )
		return log("tagdb: Tagdb hash init failed.");
	// stock it
	int32_t n = (int32_t)sizeof(s_tagDesc)/(int32_t)sizeof(TagDesc);
	for ( int32_t i = 0 ; i < n ; i++ ) {
		TagDesc *td = &s_tagDesc[i];
		char *s = td->m_name;
		int32_t slen = gbstrlen(s);
		// use the same algo that Words.cpp computeWordIds does
		int32_t h = hash64Lower_a ( s , slen );
		// call it a bad name if already in there
		TagDesc **petd = (TagDesc **)s_ht.getValue ( &h );
		if ( petd )
			return log("tagdb: Tag %s collides with old tag %s",
				   td->m_name,(*petd)->m_name);
		// set the type
		td->m_type = h;
		// add it, the value stored is the ptr to the descriptor
		if ( ! s_ht.addKey ( &h , &td ) )
			return log("tagdb: Tagdb hash init failed.");
	}
	// only now is the table usable
	s_initialized = true;
	return true;
}
// . initialize the rdb for "tagdb" in g_hostdb.m_dir
// . overrides g_conf.m_tagdbMaxTreeMem with a hard-coded value
// . returns whatever m_rdb.init() returns
bool Tagdb::init ( ) {
	// force it now
	g_conf.m_tagdbMaxTreeMem = 101028000;
	// . what's max # of tree nodes?
	// . assume avg tagdb rec size (siteUrl) is about 82 bytes we get:
	// . NOTE: 32 bytes of the 82 are overhead
	int32_t maxTreeNodes = g_conf.m_tagdbMaxTreeMem / 82;
	// . no disk page cache is handed to the rdb any more (NULL below),
	//   so no page cache size is computed here
	// . initialize our own internal rdb
	// . i no longer use cache so changes to tagdb are instant
	return m_rdb.init ( g_hostdb.m_dir ,
			    "tagdb" ,
			    true , // dedup same keys?
			    -1 , // fixed record size
			    -1,//g_conf.m_tagdbMinFilesToMerge ,
			    g_conf.m_tagdbMaxTreeMem ,
			    maxTreeNodes ,
			    // now we balance so Sync.cpp can ordered huge list
			    true , // balance tree?
			    0 , //g_conf.m_tagdbMaxCacheMem ,
			    0 , //maxCacheNodes ,
			    false , // half keys?
			    false , //m_tagdbSaveCache
			    NULL,//&m_pc ,
			    false, // is titledb
			    true , // preload disk page cache
			    sizeof(key128_t), // key size
			    true ); // bias disk page cache?
}
// . initialize the rdb for "tagdbRebuild" in g_hostdb.m_dir
// . "treeMem" is the max memory for the tree, in bytes
// . returns whatever m_rdb.init() returns
bool Tagdb::init2 ( int32_t treeMem ) {
	// . assumed avg size of a tagdb rec (siteUrl) in the tree
	// . NOTE: 32 bytes of the 82 are overhead
	const int32_t avgRecSize = 82;
	// that gives us the max # of tree nodes
	int32_t maxTreeNodes = treeMem / avgRecSize;
	// . set up our own internal rdb
	// . no cache is used so changes to tagdb are instant
	return m_rdb.init ( g_hostdb.m_dir ,
			    "tagdbRebuild" ,
			    true , // dedup same keys?
			    -1 , // fixed record size
			    50 , // min files to merge
			    treeMem ,
			    maxTreeNodes ,
			    true , // balance tree?
			    0 , // max cache mem
			    0 , // max cache nodes
			    false , // half keys?
			    false , // save cache?
			    NULL , // pc
			    false , // is titledb
			    false , // preload disk page cache
			    sizeof(key128_t) , // key size
			    false ); // bias disk page cache?
}
/*
bool Tagdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;//false;
// verify
if ( verify(coll) ) return true;
// if not allowing scale, return false
//if ( ! g_conf.m_allowScale ) return false;
// otherwise let it go
//log ( "tagdb: Verify failed, but scaling is allowed, passing." );
//return true;
return false;
}
*/
// . read the first chunk of tagdb recs for collection "coll" and check
//   that every positive key in it maps to our own shard
// . threads are disabled while we read and re-enabled before returning
// . recs that map to another shard are added to
//   g_rebalance.m_numForeignRecs
// . returns true if all recs are ours
// . returns g_conf.m_bypassValidation if some are not
// . returns false if the collection is unknown or the read would block
bool Tagdb::verify ( char *coll ) {
	const char *rdbName = "Tagdb";
	log ( LOG_DEBUG, "db: Verifying %s for coll %s...", rdbName, coll );
	g_threads.disableThreads();
	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	// scan from the very first key
	key128_t startKey;
	key128_t endKey;
	startKey.setMin();
	endKey.setMax();
	CollectionRec *cr = g_collectiondb.getRec(coll);
	// do not deref a NULL rec if the collection does not exist
	if ( ! cr ) {
		g_threads.enableThreads();
		return log("tagdb: verify: no collection rec for coll %s",
			   coll);
	}
	if ( ! msg5.getList ( RDB_TAGDB ,
			      cr->m_collnum ,
			      &list ,
			      (char *)&startKey ,
			      (char *)&endKey ,
			      64000 , // minRecSizes ,
			      true , // includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      0 , // startFileNum ,
			      -1 , // numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false , // err correction?
			      NULL ,
			      0 ,
			      -1 ,
			      true ,
			      -1LL ,
			      &msg5b ,
			      true )) {
		g_threads.enableThreads();
		return log("tagdb: HEY! it did not block");
	}
	// "count" is # of positive recs, "got" is how many belong to us
	int32_t count = 0;
	int32_t got = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key128_t k;
		list.getCurrentKey ( &k );
		// skip negative keys, they have the low bit clear
		if ( (k.n0 & 0x01) == 0x00 ) continue;
		count++;
		uint32_t shardNum = getShardNum ( RDB_TAGDB , &k );
		if ( shardNum == getMyShardNum() ) got++;
	}
	if ( got != count ) {
		// tally it up
		g_rebalance.m_numForeignRecs += count - got;
		log ("tagdb: Out of first %"INT32" records in %s, only %"INT32" belong "
		     "to our group.",count,rdbName,got);
		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) log("tagdb: Are you sure you have the "
				    "right "
				    "data in the right directory? "
				    "Exiting.");
		log ( "tagdb: Exiting due to %s inconsistency.", rdbName );
		g_threads.enableThreads();
		return g_conf.m_bypassValidation;
	}
	log ( LOG_DEBUG, "db: %s passed verification successfully for %"INT32" "
	      "recs.",rdbName, count );
	// turn threads back on, just once
	g_threads.enableThreads();
	// DONE
	return true;
}
/////////////
//
// past blast -- for Tagdb::convert()
//
////////////
/*
struct SiteType {
SiteType() : m_score(0) {}
SiteType& operator=(SiteType& o)
{m_type=o.m_type;m_score=o.m_score; return *this;}
// get this type's size
int32_t getStoredSize() {
if (isType4Bytes(m_type)) return sizeof(m_type)+4;
else return sizeof(m_type)+1;
};
enum {
FIRST_TYPE = 0,
SPAM = FIRST_TYPE, //probablitity that it is spam
RETAIL, //selling something
BUSINESS, //a corporate storefront eg ibm.com
ADULT, //not safe for kids, higher score = more hardcore
FORUM, //message board
BLOG, //or personal home page
NEWS, //articles, opinions magazines
REFERENCE, //all special interest sites
DIRECTORY, //links organized categorically
SEARCH_ENGINE, //indexed info
DOMAIN_SQUATTER,
PLATFORM, //political candidate, or org
TRAVEL, //Travel sites
AUDIO, //podcast, streaming radio
VIDEO, //flash video
SOCIAL_NETWORKING,//dating, myspace, facebook
MANUAL_BAN, //a human hates this site
PAGE_RANK, //google's page rank
CLOCK1_PREHASH, //hash of unique preceeding 1st clock
CLOCK1_PREHASH_CNT, // count of tags to make 1st clock hash
DATE_FORMAT, //format of dates on page
CLOCK2_PREHASH, //hash of unique tags preceeding 2nd clock
CLOCK2_PREHASH_CNT, // count of tags to make 2nd clock hash
CLOCK3_PREHASH, //hash of unique tags preceeding 3rd clock
CLOCK3_PREHASH_CNT, // count of tags to make 3rd clock hash
CLOCK4_PREHASH, //hash of unique tags preceeding 4th clock
CLOCK4_PREHASH_CNT, // count of tags to make 4th clock hash
// ....ADD ALL NEW TYPES HERE... corruption upon ye if not
LAST_TYPE,
BAD_TYPE = LAST_TYPE,
TOTAL_TYPE_COUNT = (LAST_TYPE-FIRST_TYPE)
};
// . types can be 1 byte or 4 bytes. if they are 4 bytes, they must be
// added to this function
static bool isType4Bytes(int type) {
if ( type == CLOCK1_PREHASH ) return true;
if ( type == CLOCK2_PREHASH ) return true;
if ( type == CLOCK3_PREHASH ) return true;
if ( type == CLOCK4_PREHASH ) return true;
return false;
}
static int32_t getScoreSize(uint8_t type) {
if ( type == CLOCK1_PREHASH ) return 4;
if ( type == CLOCK2_PREHASH ) return 4;
if ( type == CLOCK3_PREHASH ) return 4;
if ( type == CLOCK4_PREHASH ) return 4;
return 1;
};
bool isNormScore() {return m_type <= PAGE_RANK;}
uint8_t m_type;
uint32_t m_score;
};
// . convert the old Tagdb format into the new format
bool Tagdb::convert ( char *coll ) {
g_threads.disableThreads();
log("db: Trying to convert sitedb for coll %s into tagdb",coll);
collnum_t collnum = g_collectiondb.getCollnum ( coll );
// open up old sitedb files
int32_t mem = 100000000;
int32_t maxTreeNodes = mem / 82;
//Rdb sitedb;
g_sitedb.m_rdb.init ( g_hostdb.m_dir ,
"sitedb" ,
true , // dedup same keys?
-1 , // fixed record size
9999 , // MinFilesToMerge
100000000 , // g_conf.m_tagdbMaxTreeMem
maxTreeNodes ,
true , // balance tree?
0 , // g_conf.m_tagdbMaxCacheMem
0 , // maxCacheNodes
false , // half keys?
false , // m_tagdbSaveCache
NULL , // DiskPageCache *, &m_pc
false , // is titledb
false , // preload disk page cache
12 , // key size
false );// bias disk page cache?
//g_collectiondb.init(true);
g_sitedb.addColl ( coll, false );
Msg5 msg5;
Msg5 msg5b;
RdbList list;
key_t startKey;
key_t endKey;
startKey.setMin();
endKey.setMax();
key_t k;
bool threadsWereEnabled = !g_threads.areThreadsDisabled();
g_threads.disableThreads();
loop:
// loop over all tagdb recs in tagdb
if ( ! msg5.getList ( RDB_SITEDB ,
coll ,
&list ,
startKey ,
endKey ,
64000 , // minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL ,
0 ,
-1 ,
true ,
-1LL ,
&msg5b ,
true )) {
if(threadsWereEnabled) g_threads.enableThreads();
return log("db: HEY! it did not block");
}
int32_t count = 0;
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
k = list.getCurrentKey();
count++;
char *data = list.getCurrentData();
//int32_t dataSize = list.getCurrentDataSize();
// point to end of it
//char *pend = data + dataSize;
// parse the old site rec
char *p = data;
int32_t old_sfn = (*(int32_t *)p) & 0x00ffffff;
//char old_version = p[3];
p += 4;
char *old_site = p;
int32_t old_siteLen = gbstrlen(p);
p += old_siteLen + 1;
int32_t old_time = *(int32_t *)p;
p += 4;
char *old_comment = p;
p += gbstrlen(p) + 1;
//char *old_username = p;
p += gbstrlen(p) + 1;
//unsigned char siteFlags = *p;
p += 1;
//char siteQuality = *p;
p += 1;
//char incHere = *(int32_t *)p;
uint8_t numTypes = *(uint8_t *)p;
p += 1;
// do not start with http:// ! wastes space!!
if (old_siteLen>=8 && strncmp(old_site,"http://",7)==0 ) {
old_site += 7;
old_siteLen -= 7;
}
// sanity check
//Url s; s.set ( old_site, old_siteLen );
//key_t newk = g_tagdb.makeKey ( &s , false );
//if ( k != newk ) { char *xx=NULL;*xx=0; }
// . without any tags, what is our dataSize?
// . version(1 byte)+site(X bytes)+NULLTerm(1 byte)+
// #Tags(2 bytes)
//int32_t dataSize2 = 1 + old_siteLen + 1 + 2;
// set the new rec with this stuff
TagRec newgr;
//newgr.set ( k ,
// dataSize2 ,
// TAGREC_CURRENT_VERSION ,
// old_site );
int32_t now = getTimeGlobal();
// add the "site" name as a tag (include NULL)
newgr.addTag ( ST_SITE , old_time , "conv" , 0,
old_site, gbstrlen(old_site)+1);
// the banned tag
if ( old_sfn == 30 ) {
char data = 1;
newgr.addTag ( ST_MANUAL_BAN ,now, "conv", 0,&data,1);
}
if ( old_sfn == 50 ) {
char data = 1;
newgr.addTag ( ST_DEEP,now, "conv", 0,&data,1);
}
// just for historical reasons, keep this too
newgr.addTag ( ST_RULESET , now , "conv",0,(char *)&old_sfn,1);
// . add in comment tag
// . this will increase newgr::m_dataEnd/m_dataSize
// . include NULL
if ( old_comment[0] )
newgr.addTag ( ST_COMMENT ,now, "conv", 0,
old_comment , gbstrlen(old_comment)+1);
// reset these
bool gotPrehash1 = false;
bool gotPrehash2 = false;
bool gotPrehash3 = false;
bool gotPrehash4 = false;
bool gotPrehashCount1 = false;
bool gotPrehashCount2 = false;
bool gotPrehashCount3 = false;
bool gotPrehashCount4 = false;
int32_t prehash1;
int32_t prehash2;
int32_t prehash3;
int32_t prehash4;
char prehashCount1;
char prehashCount2;
char prehashCount3;
char prehashCount4;
// now for the old SiteTypes
for ( int32_t i = 0 ; i < numTypes ; i++ ) {
//while ( p < pend ) {
//SiteType *ost = (SiteType *)p;
// get the type
char siteType = *p; p++;
// and the score
char *siteTypeScore = p;
int32_t siteTypeScoreSize =
SiteType::getScoreSize(siteType);
p += siteTypeScoreSize;
// a 0 score in the old sitedb meant to ignore
if ( *siteTypeScore == 0 && siteTypeScoreSize == 1 )
continue;
// map the siteType 1-1 for the most part
int32_t tagType = siteType + ST_SPAM;
// if the type is SiteType::CLOCK2-4_ re-map it
if ( siteType == SiteType::CLOCK1_PREHASH ) {
gotPrehash1 = true;
prehash1 = *(int32_t *)siteTypeScore;
continue;
}
if ( siteType == SiteType::CLOCK1_PREHASH_CNT ) {
gotPrehashCount1 = true;
prehashCount1 = *(char *)siteTypeScore;
continue;
}
if ( siteType == SiteType::CLOCK2_PREHASH ) {
gotPrehash2 = true;
prehash2 = *(int32_t *)siteTypeScore;
continue;
}
if ( siteType == SiteType::CLOCK2_PREHASH_CNT ) {
gotPrehashCount2 = true;
prehashCount2 = *(char *)siteTypeScore;
continue;
}
if ( siteType == SiteType::CLOCK3_PREHASH ) {
gotPrehash3 = true;
prehash3 = *(int32_t *)siteTypeScore;
continue;
}
if ( siteType == SiteType::CLOCK3_PREHASH_CNT ) {
gotPrehashCount3 = true;
prehashCount3 = *(char *)siteTypeScore;
continue;
}
if ( siteType == SiteType::CLOCK4_PREHASH ) {
gotPrehash4 = true;
prehash4 = *(int32_t *)siteTypeScore;
continue;
}
if ( siteType == SiteType::CLOCK4_PREHASH_CNT ) {
gotPrehashCount4 = true;
prehashCount4 = *(char *)siteTypeScore;
continue;
}
// but DATE_FORMAT is off
if ( siteType == SiteType::DATE_FORMAT )
tagType = ST_DATE_FORMAT;
// panic
if ( tagType >= ST_LAST_TAG ) {
log("db: got bad tagtype %"INT32" for sitedb rec.",
(int32_t)tagType);
continue;
}
// add to new rec
newgr.addTag ( tagType , // should be 1-1
now ,
"conv" ,
0 , // ip
siteTypeScore ,
siteTypeScoreSize );
}
// add in the clock stuff
if ( gotPrehash1 && gotPrehashCount1 ) {
// make a 5 byte thingy
char tmp[5];
tmp[0] = prehashCount1;
gbmemcpy ( tmp+1 , &prehash1, 4 );
newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5);
}
if ( gotPrehash2 && gotPrehashCount2 ) {
// make a 5 byte thingy
char tmp[5];
tmp[0] = prehashCount2;
gbmemcpy ( tmp+1 , &prehash2, 4 );
newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5);
}
if ( gotPrehash3 && gotPrehashCount3 ) {
// make a 5 byte thingy
char tmp[5];
tmp[0] = prehashCount3;
gbmemcpy ( tmp+1 , &prehash3, 4 );
newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5);
}
if ( gotPrehash4 && gotPrehashCount4 ) {
// make a 5 byte thingy
char tmp[5];
tmp[0] = prehashCount4;
gbmemcpy ( tmp+1 , &prehash4, 4 );
newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5);
}
// now the langs
uint8_t numLangs = *p;
p += 1;
for ( int32_t i = 0 ; i < numLangs ; i++ ) {
uint8_t langId = *p;
p += 1;
int32_t score = (int32_t)*(uint8_t *)p;
p += 1;
// add to new rec
newgr.addTag ( langId , // should be 1-1
now ,
"conv" ,
0 , // ip
(char *)&score ,
1 );
}
// print it out
SafeBuf sb;
newgr.printToBuf(&sb);
logf(LOG_INFO,"tagdb: %s",sb.getBufStart());
Rdb *r = &g_tagdb.m_rdb;
// . add the new site rec back as a TagRec
// . it should overwrite the old one since the key is the same
// . this should not block
// . it should do a dump if tree is full
if ( ! r->addRecord ( collnum ,
newgr.getKey () ,
newgr.getData () ,
newgr.getDataSize() ,
MAX_NICENESS )) {
log("tagdb: convert: %s",mstrerror(g_errno));
char *xx=NULL;*xx=0;
}
// do a blocking dump of tree if it's 90% full now
if (r->m_mem.is90PercentFull() || r->m_tree.is90PercentFull()){
log("tagdb: convert: dumping tree to disk.");
if ( ! r->dumpTree ( 0 ) ) // niceness
return log("tagdb: convert: dump failed.");
}
}
// if list not empty, get more
if ( list.isEmpty() ) { g_threads.enableThreads(); return true; }
// advance startKey
startKey = k;
startKey += 1;
// watch for wrap, that means done, too
if ( startKey < k ) { g_threads.enableThreads(); return true; }
// otherwise, do more
goto loop;
}
*/
/*
// . dddddddd dddddddd dddddddd dddddddd d = domain hash w/o collection
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu u = url hash
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu
key_t Tagdb::makeKey ( Url *u , bool isDelete ) {
key_t k;
// hash full hostname
k.n1 = hash32 ( u->getHost() , u->getHostLen() );
// set lower 64 bits of key to hash of this url
k.n0 = hash64 ( u->getUrl() , u->getUrlLen() );
// clear low bit if we're a delete, otherwise set it
if ( isDelete ) k.n0 &= 0xfffffffffffffffeLL;
else k.n0 |= 0x0000000000000001LL;
return k;
}
*/
// . ssssssss ssssssss ssssssss ssssssss hash of site/url
// . xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx tagType OR hash of that+user+data
// . xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
// . returns the first key of the tagdb key range for "site"
// . upper 64 bits (n1) are hash64n(site); lower 64 bits (n0) are all zero
// . "site" must be NUL-terminated, see how Msg8a::getTagRec() calls this
key128_t Tagdb::makeStartKey ( char *site ) {
	key128_t startKey;
	// lowest possible value for the lower 64 bits
	startKey.n0 = 0;
	// the hash of the site string selects the range
	startKey.n1 = hash64n ( site );
	return startKey;
}
// . returns the last key of the tagdb key range for "site"
// . upper 64 bits (n1) are hash64n(site); lower 64 bits (n0) are all ones
// . "site" must be NUL-terminated, see how Msg8a::getTagRec() calls this
key128_t Tagdb::makeEndKey ( char *site ) {
	key128_t endKey;
	// highest possible value for the lower 64 bits
	endKey.n0 = 0xffffffffffffffffLL;
	// the hash of the site string selects the range
	endKey.n1 = hash64n ( site );
	return endKey;
}
// . returns the first key of the tagdb key range for the DOMAIN of "u"
// . upper 64 bits (n1) are hash64() of the domain; lower 64 bits (n0) are 0
key128_t Tagdb::makeDomainStartKey ( Url *u ) {
	key128_t startKey;
	// lowest possible value for the lower 64 bits
	startKey.n0 = 0;
	// hash just the domain part of the url, not the full hostname
	startKey.n1 = hash64 ( u->getDomain() , u->getDomainLen() );
	return startKey;
}
// . returns the last key of the tagdb key range for the DOMAIN of "u"
// . upper 64 bits (n1) are hash64() of the domain; lower 64 bits (n0) are
//   all ones
key128_t Tagdb::makeDomainEndKey ( Url *u ) {
	key128_t endKey;
	// highest possible value for the lower 64 bits
	endKey.n0 = 0xffffffffffffffffLL;
	// hash just the domain part of the url, not the full hostname
	endKey.n1 = hash64 ( u->getDomain() , u->getDomainLen() );
	return endKey;
}
/*
// . returns 0 if "url" is not a suburl of "site"
// . otherwise, returns "percent" of "url" that matches "site"
int32_t Tagdb::getMatchPoints ( Url *recUrl , Url *url ) {
// reset pts to 0
int32_t pts = 0;
// temporary fix to the hostname key collision problem is Tagdb Rdb
int32_t rhlen = recUrl->getHostLen ();
char *uhost = url ->getDomain ();
int32_t uhlen = url ->getDomainLen ();
char *shost = recUrl->getDomain ();
int32_t shlen = recUrl->getDomainLen ();
//int32_t uip = url->getIp ();
//int32_t sip = site->getIp ();
// MDW: we are not really doing ips like this now
if ( uhlen != shlen || strncmp( uhost, shost, uhlen ) != 0 )
// if ( ! uip || uip != sip ) return 0;
return 0;
// compare ports for bonus points
// but return 0 if site's port is not default
int32_t rport = recUrl->getPort ();
int32_t uport = url->getPort ();
if ( rport == uport ) pts += 1000000;
else if ( uport != url->getDefaultPort() ) return 0;
// now ensure url's path is a subpath of recUrl's
int32_t rplen = recUrl->getPathLen();
char *rpath = recUrl->getPath();
int32_t uplen = url->getPathLen();
char *upath = url->getPath();
if ( rplen > uplen ) return 0;
if ( strncmp ( upath , rpath , rplen ) != 0 ) return 0;
// . now we got a solid match
// . add 1 pt for each char in recUrl's path
// . so the longer recUrl's path the better the match (more specific)
// . this allows us to override TagRecs for deeper sub urls
pts += rplen;
// add in host size of the matching recUrl
pts += rhlen*1000;
// all done
return pts;
}
*/
///////////////////////////////////////////////
//
// for getting the final TagRec for a url
//
///////////////////////////////////////////////
// start with nothing outstanding. reset() cores if these two counters
// ever disagree, so they must both begin at zero.
Msg8a::Msg8a() {
	m_requests = 0;
	m_replies  = 0;
}
// destruction goes through reset() so the "still in progress" sanity
// check there also guards against destroying a Msg8a that has lookups
// outstanding
Msg8a::~Msg8a ( ) {
	this->reset();
}
// . puts the request/reply counters back to zero
// . must not be called while lookups are in progress: a late reply could
//   come in and corrupt the memory. we deliberately core in that case,
//   unless the whole process is already exiting.
void Msg8a::reset() {
	bool inProgress = ( m_replies != m_requests );
	if ( inProgress && ! g_process.m_exiting ) {
		// intentional crash via NULL write
		char *xx=NULL;*xx=0;
	}
	m_requests = 0;
	m_replies  = 0;
}
// . get records from multiple subdomains of url
// . calls g_udpServer.sendRequest() on each subdomain of url
// . all matching records are merged into a final record
// i.e. site tags are also propagated accordingly
// . closest matching "site" is used as the "site" (the site url)
// . look up the tagdb tags for "url" and store the resulting lists in
//   "tagRec"
// . "site" is the site of the url. it may be NULL, in which case we ask
//   SiteGetter for it and fall back to the url's hostname.
// . "skipDomainLookup" and "doInheritance": the former is currently unused
//   here, the latter is only saved in m_doInheritance
// . "rdbId" must be RDB_TAGDB, we core otherwise
// . returns false if blocked, in which case "callback(state)" is called
//   once all replies are in. returns true otherwise.
// . sets g_errno on error (ENOCOLLREC, EBADURL, or whatever the lookups set)
// . if the url has no TLD/domain and is not an ip we return true with an
//   empty tagRec and do NOT set g_errno
bool Msg8a::getTagRec ( Url *url ,
			// site of the url
			char *site ,
			collnum_t collnum,
			bool skipDomainLookup ,
			int32_t niceness ,
			void *state ,
			void (* callback)(void *state ),
			TagRec *tagRec ,
			bool doInheritance ,
			char rdbId ) {
	// the collection has to exist
	CollectionRec *cr = g_collectiondb.getRec ( collnum );
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		return true;
	}
	// reset tag rec
	tagRec->reset();
	// sanity check
	if ( rdbId != RDB_TAGDB ) {char *xx=NULL;*xx=0;}
	// save it
	m_rdbId = rdbId;
	// in use? need to wait before reusing
	if ( m_replies != m_requests ) {char *xx=NULL;*xx=0; }
	// zero out the counters
	reset();
	m_niceness = niceness;
	m_collnum = collnum;
	m_tagRec = tagRec;
	m_callback = callback;
	m_state = state;
	// reset
	m_errno = 0;
	m_requests = 0;
	m_replies = 0;
	m_doneLaunching = false;
	// set siteLen to the provided site if it is non-NULL
	int32_t siteLen = 0;
	if ( site ) siteLen = gbstrlen(site);
	// . keep the SiteGetter alive until we are done with "site"
	// . "site" may end up pointing at sg.m_site below, and that looks
	//   like storage owned by sg itself (TODO confirm in SiteGetter.h).
	//   it used to be declared inside the if-block, so "site" was read
	//   after sg had already gone out of scope.
	SiteGetter sg;
	// . get the site
	// . msge0 passes this in as NULL and expects us to figure it out
	// . if site was NULL that means we guess it. default to hostname
	//   unless in a recognized form like /~mwells/
	if ( ! site ) {
		sg.getSite ( url->getUrl() ,
			     NULL , // tagrec
			     0 , // timestamp
			     collnum, // coll
			     m_niceness,
			     NULL, // state
			     NULL); // callback
		// if it set it to a recognized site, like ~mwells
		// then set "site"
		if ( sg.m_siteLen ) {
			site = sg.m_site;
			siteLen = sg.m_siteLen;
		}
	}
	// if provided site was NULL and not of a ~mwells type of form
	// then default it to hostname
	if ( ! site ) {
		site = url->getHost();
		siteLen = url->getHostLen();
	}
	// . temp null terminate it, makeStartKey()/makeEndKey() need that
	// . only write to the buffer if it is not terminated there already,
	//   so we never touch a caller-provided string that is already a
	//   proper C string (it might live in read-only memory)
	char c = site[siteLen];
	if ( c ) site[siteLen] = '\0';
	// use that
	m_siteStartKey = g_tagdb.makeStartKey ( site );
	m_siteEndKey = g_tagdb.makeEndKey ( site );
	// un NULL terminate it
	if ( c ) site[siteLen] = c;
	// . we only keep a pointer to the url, not a copy, so the caller
	//   must keep "url" valid until we are done
	m_url = url;
	// save this
	m_doInheritance = doInheritance;
	// point to url
	char *u = url->getUrl();
	int32_t ulen = url->getUrlLen();
	// point to the TLD of the url
	char *tld = url->getTLD();
	// . a NULL tld means the TLD is unsupported, unless it is an ip
	// . if the tld does not exist, just return an empty tagrec
	//   do not set g_errno
	if ( ! tld && ! url->isIp() ) return true;
	// url cannot have NULs in it
	for ( int32_t i = 0 ; i < ulen ; i++ ) {
		if ( u[i] ) continue;
		log("TagRec: got bad url with NULL in it %s",u);
		m_errno = EBADURL;
		g_errno = EBADURL;
		return true;
	}
	// get the domain
	m_dom = url->getDomain();
	// if none, bad! return the empty tagrec without setting g_errno
	if ( ! m_dom && ! url->isIp() ) return true;
	// save the start and end of the hostname
	m_p = m_url->getHost();
	m_hostEnd = m_url->getHost() + m_url->getHostLen();
	// launch the requests
	if ( ! launchGetRequests() ) return false;
	// . they did it without blocking
	// . this sets m_errno on error
	gotAllReplies();
	// did not block
	return true;
}
// . returns false if blocked, true otherwise
// . sets g_errno and returns true on error
bool Msg8a::launchGetRequests ( ) {
// clear it
g_errno = 0;
bool tryDomain = false;
loop:
// return true if nothing to launch
if ( m_doneLaunching ) return (m_requests == m_replies);
// don't bother if already got an error
if ( m_errno ) return (m_requests == m_replies);
// limit max to 5ish
if (m_requests >=MAX_TAGDB_REQUESTS) return (m_requests==m_replies);
// take a breath
QUICKPOLL(m_niceness);
// . first, try it by canonical domain name
// . if that finds no matches, then try it by ip domain
// get host
//char *subdom = m_p;
//int32_t subdomLen = m_hostEnd - m_p;
key128_t startKey ;
key128_t endKey ;
//int32_t siteHash32;
// . if our first time, do the full url!
// . need to do this because the turking process (XmlDoc::getTurkForm()
// and PageReindex.cpp:processTurkForm()) add tags to tagdb based on
// the full url.
/*
if ( m_doFullUrl ) {
startKey = g_tagdb.makeStartKey ( m_url );
endKey = g_tagdb.makeEndKey ( m_url );
// . like the "norm" url above
// . we'll get back a list of tags for this hostname,
// but they could all be from different sites, some sites
// would be the hostname, other tags might be from sites
// that are a subsite of the hostname, so we have to make
// sure the tag's key.n0 matches this siteHash32
siteHash32 = hash32 ( m_url->getUrl() , m_url->getUrlLen());
}
else {
// make into a url
Url u;
u.set ( subdom , subdomLen );
// set key range now
startKey = g_tagdb.makeStartKey ( &u );
endKey = g_tagdb.makeEndKey ( &u );
// . like the "norm" url above
// . we'll get back a list of tags for this hostname,
// but they could all be from different sites, some sites
// would be the hostname, other tags might be from sites
// that are a subsite of the hostname, so we have to make
// sure the tag's key.n0 matches this siteHash32
siteHash32 = hash32 ( u.getUrl() , u.getUrlLen() );
}
*/
if ( tryDomain ) {
startKey = g_tagdb.makeDomainStartKey ( m_url );
endKey = g_tagdb.makeDomainEndKey ( m_url );
if ( g_conf.m_logDebugTagdb )
log("tagdb: looking up domain tags for %s",
m_url->getUrl());
}
else {
// usually the site is the hostname but sometimes it is like
// "www.last.fm/user/breendaxx/"
//startKey = g_tagdb.makeStartKey ( m_site );//url );
//endKey = g_tagdb.makeEndKey ( m_site ); // url );
startKey = m_siteStartKey;
endKey = m_siteEndKey;
if ( g_conf.m_logDebugTagdb )
log("tagdb: looking up site tags for %s",
m_url->getUrl());
}
// get the groupid
//uint32_t groupId = g_tagdb.getGroupId ( startKey );
// get the next mcast
Msg0 *m = &m_msg0s[m_requests];
// and the list
RdbList *listPtr = &m_tagRec->m_lists[m_requests];
// bias based on the top 64 bits which is the hash of the "site" now
//uint32_t gid = g_hostdb.getGroupId ( m_rdbId , &startKey , true );
//Host *group = g_hostdb.getGroup ( gid );
int32_t shardNum = getShardNum ( m_rdbId , &startKey );//, true );
Host *group = g_hostdb.getShard ( shardNum );
//int32_t numTwins = g_hostdb.getNumHostsPerShard();
// use top byte!
uint8_t *sks = (uint8_t *)&startKey;
uint8_t top = sks[sizeof(TAGDB_KEY)-1];
//int32_t hostNum = 0;
//if ( numTwins == 2 && (top & 0x80) ) hostNum = 1;
// TODO: fix this!
//if ( numTwins >= 3 ) { char *xx=NULL;*xx=0; }
// support more than 2 stripes now...
int32_t hostNum = top % g_hostdb.getNumHostsPerShard();
int32_t hostId = group[hostNum].m_hostId;
// . launch this request, even if to ourselves
// . TODO: just use msg0!!
bool status = m->getList ( hostId , // hostId
0 , // ip
0 , // port
0 , // maxCacheAge
false , // addToCache
m_rdbId, //RDB_TAGDB ,
m_collnum ,
listPtr ,
(char *) &startKey ,
(char *) &endKey ,
10000000 , // minRecSizes
this , // state
gotMsg0ReplyWrapper ,
m_niceness ,
true , // error correction?
true , // include tree?
true , // doMerge?
-1 , // firstHostId
0 , // startFileNum
-1 , // numFiles
3600*24*365 );// timeout
// all done?
//if ( m_p == m_url->getDomain() ) m_doneLaunching = true;
// error?
if ( status && g_errno ) {
// g_errno should be set, we had an error
m_errno = g_errno;
return (m_requests == m_replies);
}
// successfully launched
m_requests++;
// if we got a reply instantly
if ( status ) m_replies++;
if ( ! tryDomain ) { //&&
//! m_skipDomainLookup &&
//m_url->getHostLen() != m_url->getDomainLen() ) {
tryDomain = true;
goto loop;
}
//
// no more looping!
//
// i don't think we need to loop any more because we got all the
// tags for this hostname. then the lower bits of the Tag key
// corresponds to the actual SITE hash. so we gotta filter those
// out i guess after we read the whole list.
//
return (m_requests == m_replies);
//m_doneLaunching = true;
//goto loop;
/*
// do not advance m_p if doing the full url first
if ( m_doFullUrl ) {
m_doFullUrl = false;
goto loop;
}
// . advance m_p
// . we go backwards to better support subdomains that have a ton
// of periods in them...
for ( ; m_p < m_dom && *m_p != '.' ; m_p++ );
// advance over .
if ( m_p != m_dom ) m_p++;
// if another dot that is bad!
if ( *m_p == '.' ) m_errno = EBADURL;
// launch another
goto loop;
*/
}
void gotMsg0ReplyWrapper ( void *state ) {
Msg8a *THIS = (Msg8a *)state;
// we got one
THIS->m_replies++;
// error?
if ( g_errno ) THIS->m_errno = g_errno;
// launchGetRequests() returns false if still waiting for replies...
if ( ! THIS->launchGetRequests() ) return;
// get all the replies
THIS->gotAllReplies();
// set g_errno for the callback
if ( THIS->m_errno ) g_errno = THIS->m_errno;
// otherwise, call callback
THIS->m_callback ( THIS->m_state );
}
// get the TagRec from the reply
void Msg8a::gotAllReplies ( ) {
// if any had an error, don't do anything
if ( m_errno ) return;
// scan the lists
for ( int32_t i = 0 ; i < m_replies ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// get list
RdbList *list = &m_tagRec->m_lists[i];
// skip if empty
if ( list->m_listSize <= 0 ) continue;
// panic msg
if ( list->m_listSize >= 10000000 ) {
log("tagdb: CAUTION!!! cutoff tagdb list!");
log("tagdb: CAUTION!!! will lost useful info!!");
char *xx=NULL;*xx=0;
}
// otherwise, add to array
m_tagRec->m_listPtrs[m_tagRec->m_numListPtrs] = list;
// advance
m_tagRec->m_numListPtrs++;
}
// . now scan all the tags for this HOSTNAME
// . filter out tags that are not for a supersite of our url
// . i.e. if our url is www.xyz.com/tim/bob/file.html
// then hash
// http://www.xyz.com/
// http://www.xyz.com/tim/
// http://www.xyz.com/tim/bob/
// and skip over any tag whose lower 32 bits does not match
// one of those hashes...
// . see where we set Tag::m_key.n0 in Tag::set() above:
// m_key.n0 |= (uint32_t) hash32 ( norm.getUrl(),norm.getUrlLen() );
// where "norm" is the provided site but with a http:// in front
// and a / at the end since Url::set() normalized it
// . m_url is the url we want to get the tags for
// . HACK: right now just restrict to the hostname!
/*
Url norm;
norm.set ( m_url->getHost() , m_url->getHostLen() );
uint32_t siteHash32 = hash32 ( norm.getUrl(),norm.getUrlLen() );
// . and the domain too so we can ban domains
// . this is messed up because we can't just hash the domain, we have
// to hash it like a complete url because that is what Tag::set()
// does when it makes the key's top 32 bits.
uint32_t siteHash32d = 0;
int32_t conti = 0;
siteHash32d = hash32_cont ( "http://",7,siteHash32d,&conti);
siteHash32d = hash32_cont ( norm.getDomain(),
norm.getDomainLen(),
siteHash32d,
&conti);
siteHash32d = hash32_cont ( "/",1,siteHash32d,&conti);
// the non-del bit i guess. we forgot to shift up when we made
// the key above!
siteHash32 |= 0x01;
siteHash32d |= 0x01;
*/
// scan tags in list and set Tag::m_type to TT_DUP if its a dup
Tag *tag = m_tagRec->getFirstTag();
HashTableX cx;
char cbuf[2048];
cx.set ( 4,0,64,cbuf,2048,false,m_niceness,"tagtypetab");
// . loop over all tags in all lists in order by key
// . each list should be from a different suburl?
// . the first list should be the narrowest/longest?
for ( ; tag ; tag = m_tagRec->getNextTag ( tag ) ) {
// breathe
QUICKPOLL(m_niceness);
// skip tag if it is not from the proper site. we are
// only guarenteed that all tags in this list are for the
// same HOSTNAME not SITE! site is in the lower bits
// of the tagdb key.
// should fix www.paypal.com:1234 bug where we were reading
// sitenuminlinks from that tag and was always 0!! even
// when we'd add a count of 2k to the www.paypal.com site...
// now filter out www.paypal.com:1234's tags!
// TODO: allow multiple different siteHash32 values to match
// here, use one siteHash32 for each possible suburl of "m_url"
// so if m_url is "http://www.xyz.com/tim/" then we also
// can match hash32("http://www.xyz.com/tim/" not just
// "http://www.xyz.com/" which is how it is now.
//uint32_t th32 = tag->m_key.n0 & 0xffffffff;
//if ( th32 != siteHash32 && th32 != siteHash32d ) {
// // maybe use TT_DIFFSITE instead of this! TODO!
// tag->m_type = TT_DUP;
// continue;
//}
// form the hash!
uint32_t h32 = (uint32_t)((tag->m_key.n0) >> 32);
// skip if not unique
//if ( ! isTagTypeUnique ( tag->m_type ) ) continue;
// otherwise, record it
if ( cx.isInTable(&h32 ) ) // tag->m_type) )
tag->m_type = TT_DUP;
else if ( ! cx.addKey(&h32) ) {
m_errno = g_errno;
return;
}
}
}
/*
// get the TagRec from the reply
void TagRec::gotAllReplies ( ) {
// if any had an error, don't do anything
if ( m_errno ) return;
// time how long this takes and log it
int64_t startTime = gettimeofdayInMilliseconds();
// how many TagRecs we matched
int32_t n = 0;
// arrays for pointing to best matching TagRecs
//char *data [128];
//int32_t dataSizes [128];
//int32_t dataScores [128];
char *recs [128];
int32_t recScores [128];
// . each reply is a list of TagRecs
// . each TagRec is a standard Rdb record
// . key|dataSize|data...
// . go through all TagRecs and sort our list of ptrs to the
// best TagRecs
// . some TagRecs will not even match, so do not include those in
// our list of pointers
// . the closest matching TagRecs will be on top
// . inherit Tags from lesser matching TagRecs provided there
// is no such Tag::m_type from a closer matching TagRec
// . if xyz.com is banned and abc.xyz.com has a 0 score for the
// ST_BANNED Tag, then it is effectively "unbanned" and should
// not inherit the score from xyz.com for ST_BANNED.
// . so by scanning each TagRec in order, we compose our own
// final merged TagRec that may have a lot more Tags in it
// than any one matching TagRec
for ( int32_t i = 0 ; i < m_replies ; i++ ) {
// get the list from this reply
RdbList *list = &m_lists[i];
// scan list
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
// break if overflow
if ( n >= 128 ) break;
// get next rec
//char *d = list->getCurrentData ();
//int32_t dsize = list->getCurrentDataSize();
char *rec = list->getCurrentRec();
// set TagRec to it
TagRec *gr = (TagRec *)rec;
// get the site
//char *site = gr->getString(ST_SITE,NULL);
char *site = gr->getString("site",NULL);
// sanity check
if ( ! site ) { char *xx=NULL;*xx=0; }
// make it a url
Url u;
u.set ( site , gbstrlen(site) );
// score it
int32_t s = g_tagdb.getMatchPoints ( &u , m_url );
// skip it if not a match
if ( s <= 0 ) continue;
// save it
//data [n] = d;
//dataSize [n] = dsize;
recs [n] = rec;
recScores [n] = s;
n++;
}
}
// if no recs, we did not match anything
if ( n == 0 ) return;
// or on error
if ( m_errno ) return;
// bubble sort the recs by their scores, highest score first
bubble:
bool swapped = false;
for ( int32_t i = 1 ; i < n ; i++ ) {
// keep going if in correct order
if ( recScores[i-1] >= recScores[i] ) continue;
// swap
char *t1 = recs [i-1];
int32_t t2 = recScores [i-1];
recs [i-1] = recs [i];
recs [i ] = t1;
recScores [i-1] = recScores [i];
recScores [i ] = t2;
swapped = true;
}
if ( swapped ) goto bubble;
// parse the best matching SiteData
//TagRec gr ; gr.set ( data[0] , dataSizes[0] );
// use the site from the best matching TagRec as our site
//m_siteUrl.set ( gr.getSite() , gr.getSiteLen() );
// reset the inheritance array
//char array[ST_LAST_TAG];
//memset ( array , -1 , 256 );
HashTable ia;
char ibuf [ 1024 * 8 ];
ia.set ( 1024 , ibuf , 1024 * 8 );
// we just store the tags, ptrs into the tags in the m_lists
//Tag *tags[MAX_TAGS];
// assume we got no tags
//int32_t numTags = 0;
// size of all tags
//int32_t size = 0;
// set our new tag rec
m_tagRec->reset();
// . only get tags from the first matching tag rec if we should not
// do the inheritance loop
// . if they click "get rec" on PageTagdb, then do not do inheritance,
// but if they click "get tags", then do it!
if ( ! m_doInheritance && n > 0 ) n = 1;
// . DO NOT INHERIT ANYTHING FROM TAG RECS that have a sitePathDepth
// tag in them UNLESS the sitePathDepth does not work on us
// . i.e. if xyz.com has a sitePathDepth of 2 in its TagRec and the
// url we are looking at is xyz.com/a/b/c/d then we must assume that
// out site is xyz.com/a/b/ we are an independent subsite of
// xyz.com and inherit nothing from it
SiteGetter siteGetter;
// site getter sometimes adds recs to tagdb to add in a new subsite
// it finds... i'd imagine this will create a parsing inconsistency
// when injecting docs into the "qatest123" coll... but oh well!
int32_t timestamp = getTimeGlobal();
// . begin the "inheritance loop"
// . fill our m_tags[] array with the Tags that apply to us
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// parse the TagRec (very fast)
TagRec *gr = (TagRec *)recs[i];
// is "url" an independent subsite of gr's site?
char *us = m_url->getUrl();
bool st=siteGetter.getSite(us,gr,timestamp,m_coll,m_niceness );
// sanity check, not allowed to block since state is NULL!
if ( ! st ) { char *xx=NULL;*xx=0; }
// are we independent subsite? if so, do not inherit
// from that. this is used to prevent www.geocities.com/~mark/
// from gaining the benefits of being on the www.geocities.com
// site. TODO later: we should make another tag to indicate
// a subsite is explicitly independent. but for now we rely
// on the "sitepathdepth" tag automatically computed by
// SiteGetter.cpp.
//if ( siteGetter.isIndependentSubsite() ) continue;
//
// TODO:
// NONO, just do not inherit sitenumlinks or any tag
// that is marked as such!!! add a new flag to the tags!!!!!!
//
// always add the ST_SITE tag first from each tag so we know
// what site the other tags belong to
//Tag *stag = gr->getTag ( ST_SITE );
Tag *stag = gr->getTag ( "site" );
// only add if non null
if ( stag ) m_tagRec->addTag ( stag );
// last tag
Tag *last = NULL;
// loop over all tags in TagRec #i
tagLoop:
// get the tag id of current tag
Tag *tag = gr->getNextTag ( last );
// assign
last = tag;
// was that the end of the tags? if so, go to next TagRec
if ( ! tag ) continue;
// get tag id
int32_t tagType = tag->m_type;
// skip all ST_SITE tags, we added those first above
//if ( tagType == ST_SITE ) goto tagLoop;
if ( tag->isType("site") ) goto tagLoop;
// sanity check
//if ( tagType >= ST_LAST_TAG ) { char *xx=NULL;*xx=0;}
// for getting the next tag, remember this
last = tag;
// . have we added this yet?
// . if tagType added from a prev TagRec do not "inherit" it
//if(array[tagType] != -1 && array[tagType] != i) goto tagLoop;
int32_t slot = ia.getSlot ( tagType );
if ( slot >= 0 && ia.getValueFromSlot(slot) != i) goto tagLoop;
// if tag type is "eventtag" then only add it if the site of this
// tagrec EQUALS our url. exact match... that way we make sure to only
// tag a single url, otherwise we might accidentally tag an entire site.
if ( tag->isType("eventtag") ) {
// must be in tagRec that matches us the closest
if ( i != 0 ) goto tagLoop;
// if no site, skip it
if ( ! stag ) goto tagLoop;
// and even then must match site exactly
char *site = stag->m_data;
// as string
char *url = m_url->getUrl();
int32_t ulen = m_url->getUrlLen();
// skip our proto (http://)
url += m_url->getSchemeLen() + 3;
ulen -= m_url->getSchemeLen() + 3;
// remove trailing /
if ( ulen > 0 && url[ulen-1] == '/' ) ulen--;
// likewise for site
int32_t slen = gbstrlen(site);
if ( slen > 0 && site[slen-1] == '/' ) slen--;
// skip if not exact
if ( slen != ulen ) goto tagLoop;
// compare, must match exactly, if not, do not add tag
if ( strncmp(url,site,slen) != 0 ) goto tagLoop;
}
// ok, add/inherit it
//tags[numTags++] = tag;
// add it directly to m_tagRec
if ( ! m_tagRec->addTag ( tag ) ) {
log("tagdb: addTag failed: %s",mstrerror(g_errno));
m_errno = g_errno;
break;
}
// add in size
//size += tag->getSize();
// note it, so we do not add/inherit it from another TagRec
//array[tagType] = i;
ia.addKey ( tagType , i );
// add more tags
goto tagLoop;
}
// sanity!
//if ( size > 32000 ) { char *xx=NULL;*xx=0; }
//if ( size + 2 + 2 > MAX_TAGREC_SIZE ) { char *xx=NULL;*xx=0; }
// then copy the tags into the buffer
//for ( int32_t i = 0 ; i < numTags ; i++ )
// m_tagRec->addTag ( tags[i] );
// sanity check
//if ( p - m_tagRec > MAX_TAGREC_SIZE ) { char *xx=NULL;*xx=0;}
// free the mem
reset();
// time it
int64_t took = gettimeofdayInMilliseconds() - startTime;
if(took>10) log(LOG_INFO, "admin: gotreply for msg8a took %"INT64"",took);
}
*/
/*
///////////////////////////////////////////////
//
// Msg9a : for modifying TagRecs in Tagdb
//
///////////////////////////////////////////////
Msg9a::Msg9a () {
m_requestBuf = NULL;
m_requests = 0;
m_replies = 0;
}
Msg9a::~Msg9a() { reset(); }
void Msg9a::reset() {
// guard against not waiting for all replies to come in
if ( m_requests != m_replies && ! g_process.m_exiting ) {
char *xx=NULL;*xx=0; }
if ( ! m_requestBuf ) return;
mfree ( m_requestBuf , m_requestBufSize , "msg9a" );
m_requestBuf = NULL;
}
// . returns false if blocked, true otherwise
// . sets errno on error
// . "urls" is a NULL-terminated list of space-separated urls
// . if "addTags" is true, then the tags in "tagRec" will be added to the
/// the TagRecs specified by the sites in "sites". if a TagRec
// does not exist for a given "site" then it will be added just
// so we can add the Tags to it. If it does exist, we will
// just append the given Tags to it.
// . to "delete" a tag, just assign it a dataSize of 0!
// . Tags added with the same user name and tag type of an existing tag
// will overwrite it.
// . you can now optionally supply an array of ptrs to sites, sitePtrs.
// . you can call this with your "tagRec" on the stack because we copy
// its contents into our own buffer here
bool Msg9a::addTags ( char *sites ,
char **sitePtrs ,
int32_t numSitePtrs ,
char *coll ,
void *state ,
void (*callback)(void *state) ,
int32_t niceness ,
TagRec *tagRec ,
bool nukeTagRecs ,
int32_t *ipVector ) {
// incase we are being re-used!
reset();
g_errno = 0;
// sanity check, one or the other
if ( sites && sitePtrs ) { char *xx=NULL;*xx=0; }
// ipVector only used with sitePtrs for now
if ( ! sitePtrs && ipVector ) { char *xx=NULL;*xx=0; }
// when we add the "site" tag to it use the timestamp from one
// of the tags we are adding... therefore we must require there be
// some tags! we do this to insure injection consistency into the
// "qatest123" collection.
if ( ! tagRec || tagRec->getNumTags() <= 0 ) { char *xx=NULL;*xx=0; }
// use the first timestamp
int32_t timestamp = tagRec->getFirstTag()->m_timestamp;
// . up to 20 outstanding Msg0 getting the exact TagRec for each site
// . when we get it we immediately modify it and then add it back
// using Msg4.
// . to resolve collisions we could assign a particular hostid
// to handle adding each site... yeah, how about the local host.
// . so forward the Msg9a add/del/rpl request to the responsible
// host. then it can lock the "site" until the add completes.
// . it should use Msg1 to add it.
// reset
m_errno = 0;
m_requests = 0;
m_replies = 0;
m_niceness = niceness;
m_state = state;
m_callback = callback;
int32_t collLen = gbstrlen(coll);
// how many urls in the sites do we have?
int32_t numUrls = 0;
// point to buf
char *s = sites;
// count each one
while ( sites && *s ) {
// skip whitespace
while ( *s && is_wspace_a(*s) ) s++;
// alnum?
if ( *s ) numUrls++;
// skip url
while ( *s && ! is_wspace_a(*s) ) s++;
}
if ( sitePtrs )
numUrls = numSitePtrs;
// how much buf do we need to hold all the requests for all the sites
int32_t need = 0;
// just a buffer of sites
if ( sites )
need += 2 * (gbstrlen(sites) + 1);
// otherwise, use the site ptrs
for ( int32_t i = 0 ; i < numSitePtrs ; i++ )
need += 2 * (gbstrlen(sitePtrs[i]) + 1);
// how big is each request's header?
int32_t header = 0;
// request size
header += 4;
// niceness
header += 1;
// collection
header += collLen + 1;
// flag
header += 1;
// the tag rec
header += tagRec->getSize();
// . add ST_SITE to each tagRec
// . we already accounted for the sites in the gbstrlen() above
header += sizeof(Tag);
// one header per url
need += header * numUrls;
// make a request buffer for all the requests
m_requestBuf = (char *)mmalloc ( need , "msg9a-add");
if ( ! m_requestBuf ) return true;
m_requestBufSize = need;
// carve it up
char *p = m_requestBuf;
// loop over sites
s = sites;
// reset sitePtr counter in case we are using those
int32_t si = 0;
//int32_t now = getTimeGlobal();
// loop it
for ( ; ; si++ ) {
// stop if all done
if ( sites && ! *s ) break;
// or this
if ( sitePtrs && si >= numSitePtrs ) break;
// make "s" point to the site if we are using ptrs
if ( sitePtrs ) s = sitePtrs[si];
// skip whitespace
while ( *s && is_wspace_a(*s) ) s++;
// skip over http:// (wastes space)
if ( strncmp(s,"http://",7)==0 ) s += 7;
// find end of url
char *send = s;
while ( *send && ! is_wspace_a(*send)) send++;
// get the length
int32_t len = send - s;
// done? make sure we are using the site buffer and not ptrs
if ( sites && ! *s ) break;
// a place holder for the request size
int32_t *rsizePtr = (int32_t *)p; p += 4;
// track the size
char *start = p;
// first niceness
*p = niceness; p++;
// then coll
gbmemcpy ( p , coll , collLen ); p += collLen;
// NULL term
*p++ = '\0';
// add flag first
*p = 0x00;
//if ( deleteTags ) *p = 0x01;
if ( nukeTagRecs ) *p = 0x02; // delete entire TagRec?
p++;
// now make the Tag!
//TagRec *tagRec = (TagRec *)p;
// sets its ip special if we should
int32_t ip = 0;
if ( ipVector ) ip = ipVector[si];
// . copy it over
// . get the size
int32_t size = tagRec->getSize();
// add in tagRec
gbmemcpy ( p , tagRec , size );
// cat it to p
TagRec *newgr = (TagRec *)p;
// NULL terminate it temporarily
char c = s[len];
s[len] = 0;
// . remove the old site so the new one can replace it
// . we already contain a SITE_TAG and addTag() will NEVER
// replace that particular tag...
// . this is now removed above
//newgr->removeTag ( "site" , NULL );
// add the site
//newgr->addTag ( ST_SITE, now,"tagdb",0,s, len+1 );
newgr->addTag ( "site", timestamp,"tagdb",ip,s, len+1 );
// undo the NULL termination
s[len] = c;
// update the size
size = newgr->getSize();
// advance
p += size;
// how big was the request, store that
*rsizePtr = (p - start);
// advance s
s = send;
}
// reset ptr to request to launch
m_p = m_requestBuf;
// sanity check
if ( p - m_requestBuf > need ) { char *xx=NULL;*xx=0; }
// all done
m_pend = p;
// launch them
if ( ! launchAddRequests () ) return false;
// hey that should always block!
if ( ! g_errno ) { char *xx=NULL; *xx=0; }
// show error
log("tagdb: msg9a: %s",mstrerror(g_errno));
// free the allocated mem
reset();
// did not block...
return true;
}
// . "dumpFile" format contains one tag record per line as
// dumped from './gb dump S main 0 -1 1' cmd line cmd.
// . it is the format given by the TagRec::printToBuf() cmd
bool Msg9a::addTags ( char *dumpFile ,
char *coll ,
void *state ,
void (*callback)(void *state) ,
int32_t niceness ) {
g_errno = 0;
// reset
m_errno = 0;
m_requests = 0;
m_replies = 0;
m_niceness = niceness;
m_state = state;
m_callback = callback;
int32_t collLen = gbstrlen(coll);
// scan the dump file
char *p = dumpFile;
// the end of it
char *pend = p + gbstrlen(p);
// add up total sizes
int32_t sum = 0;
// end of line ptr
char *eol;
// count
int32_t count = 1;
// debug
//HashTable ht;
// do the scan
for ( ; p < pend ; p = eol + 1 ) {
// point to next line
eol = p; while ( eol < pend && *eol != '\n' ) eol++;
// a fake tag rec
TagRec gr;
// . scan it into "gr"
// . returns size of the tag rec stored into "buf"
int32_t bytesScanned = gr.setFromBuf ( p , eol );
// error?
if ( bytesScanned <= 0 ) {count++; continue;}
// get size
int32_t size = gr.getSize();
// error?
if ( size <= 0 ) {count++; continue;}
//logf(LOG_DEBUG,"tagdb: tag %"INT32" size=%"INT32"",count++,size);
// hash it for debug
//ht.addKey ( count , size );
count++;
// sanity check
if ( size > MAX_TAGREC_SIZE ) { char *xx=NULL;*xx=0;}
// sanity check
char *site = gr.getString("site",NULL);
if ( ! site ) { char *xx=NULL;*xx=0;}
// then request header size
size += 4 + 1 + collLen + 1 + 1;
// increment total size
sum += size;
}
// make the buf
m_requestBuf = (char *)mmalloc ( sum , "msg9adbuf");
m_requestBufSize = sum;
// store tags here
char *t = m_requestBuf;
// return true on error with g_errno set
if ( ! t ) return true;
// reset to beginning of file
p = dumpFile;
// reset
count = 1;
// do the scan
for ( ; p < pend ; p = eol + 1 ) {
// point to next line
eol = p; while ( eol < pend && *eol != '\n' ) eol++;
// first is the request size
int32_t *requestSizePtr = (int32_t *)t; t += 4;
// see how big the request is
char *a = t;
// then niceness
*t++ = (char)MAX_NICENESS;
// then coll
gbmemcpy ( t , coll , collLen ); t += collLen;
// null terminate
*t++ = '\0';
// then the 1 byte flag (0 means add?)
*t++ = 0;
// store TagRec into the request buffer
TagRec *gr = (TagRec *)t;
// . scan it into "t"
// . returns size of the tag rec stored into "buf"
int32_t bytesScanned = gr->setFromBuf ( p , eol );
// error?
if ( bytesScanned <= 0 ) {
log("tagdb: skipping tag rec #%"INT32".",count++);
t -= (4+1+collLen+1+1);
continue;
}
// get size
int32_t size = gr->getSize();
// error?
if ( size <= 0 ) {
log("tagdb: skipping tag rec #%"INT32".",count++);
t -= (4+1+collLen+1+1);
continue;
}
// test it
//int32_t slot = ht.getSlot ( count );
//if ( slot < 0 ) { char *xx=NULL;*xx=0; }
//int32_t shouldbe = ht.getValueFromSlot ( slot );
//if ( size != shouldbe ) { char *xx=NULL;*xx=0; }
count++;
//logf(LOG_DEBUG,"tagdb: tag %"INT32" size=%"INT32"",count++,size);
// increment storage ptr
t += size;
// store the size of the WHOLE REQUEST, does not
// include the request size itself. see
// launchRequests() below.
*requestSizePtr = (t - a);
// sanity check
if ( *requestSizePtr > 10000 ) { char*xx=NULL;*xx=0;}
}
// sanity check
if ( t - m_requestBuf != sum ) { char *xx=NULL;*xx=0; }
// use their ptrs for adding these tag recs
m_p = m_requestBuf;
m_pend = m_requestBuf + m_requestBufSize ;
// now add those tags
return launchAddRequests ( );
}
// . returns false if blocked, true otherwise
// . sets g_errno and returns true on error
bool Msg9a::launchAddRequests ( ) {
// clear it
g_errno = 0;
loop:
// return true if nothing to launch
if ( m_p >= m_pend ) return (m_requests == m_replies);
// don't bother if already got an error
if ( m_errno ) return (m_requests == m_replies);
// limit max outstanding to 20
if (m_requests - m_replies >= 20 ) return (m_requests==m_replies);
// take a breath
QUICKPOLL(m_niceness);
// parse our request
char *p = m_p;
// first is the request size
p += 4;
// then niceness
p += 1;
// then coll
p += gbstrlen(p) + 1;
// then the 1 byte flag
p++;
// then the tag rec
TagRec *tagRec = (TagRec *)p;
// . get the groupid
// . tagRec's key should already be valid because when you add
// a ST_SITE to a TagRec it sets TagRec::m_key (special thing)
//uint32_t groupId = g_tagdb.getGroupId ( &tagRec->m_key );
uint32_t shardNum = getShardNum ( RDB_TAGDB , &tagRec->m_key );
// get the host to send to
Host *hosts = g_hostdb.getGroup ( groupId );
// select a host in the group
int32_t hostNum = tagRec->m_key.n1 % g_hostdb.getNumHostsPerShard();
// and his ptr
Host *h = &hosts[hostNum];
// get the next mcast
//Multicast *m = &m_casts[m_requests];
// request size
int32_t requestSize = *(int32_t *)m_p; m_p += 4;
char *request = m_p; m_p += requestSize;
// . send to just one very specific host so he is the only one that
// controls modification to this particular tagdb rec. that way if
// we are changing its Tags we do not collide with another.
// . this returns false and sets g_errno on error
UdpServer *us = &g_udpServer;
bool status = us->sendRequest ( request ,
requestSize ,
0x9a ,
h->m_ip , // bestIp
h->m_port , // destPort
h->m_hostId , // hostId
NULL , // slotPtr
this , // state
gotReplyWrapper9a , // callback
365*24*3600 , // timeout
-1 , // backoff
-1 , // max wait in ms
NULL , // replybuf
0 , // replybufMaxSize
m_niceness );
// error?
if ( ! status ) {
// g_errno should be set, we had an error
m_errno = g_errno;
return (m_requests == m_replies);
}
// successfully launched
m_requests++;
// launch another
goto loop;
}
void gotReplyWrapper9a ( void *state , UdpSlot *slot ) {
Msg9a *THIS = (Msg9a *) state;
THIS->m_replies++;
// don't let him free our send buf, it is m_requestBuf
// which we allocated above
slot->m_sendBufAlloc = NULL;
// error? if so, save it
if ( g_errno && ! THIS->m_errno ) THIS->m_errno = g_errno;
if ( ! THIS->launchAddRequests() ) return;
// free the allocated mem
THIS->reset();
THIS->m_callback ( THIS->m_state );
}
class State9a {
public:
UdpSlot *m_slot;
Msg5 m_msg5;
char m_requestType;
Msg1 m_msg1;
RdbList m_list;
// this has all the tags we need to add/remove/replace
TagRec *m_tagRec;
// this has the original tagRec and we modify it with "m_tagRec"
// to get the final TagRec we add back to Tagdb. it is the
// "accumulator" tagdb record.
TagRec m_accRec;
// enough mem to store a key_t and a 0 dataSize (int32_t)
char m_tmp[12+4];
char m_niceness;
char *m_coll;
// linked list of ppl waiting in line to make mods
class State9a *m_next;
//class State9a *m_tail;
};
void handleRequest9a ( UdpSlot *slot , int32_t niceness ) {
// get the request
char *request = slot->m_readBuf;
int32_t requestSize = slot->m_readBufSize;
// overflow protection for corrupt requests
if ( requestSize < 4 ) {
g_errno = EBUFTOOSMALL;
g_udpServer.sendErrorReply ( slot , g_errno );
return;
}
// make a new Msg9a
State9a *st ;
try { st = new (State9a); }
catch ( ... ) {
g_errno = ENOMEM;
log("msg9a: new(%i): %s", sizeof(State9a), mstrerror(g_errno));
return g_udpServer.sendErrorReply ( slot, g_errno );
}
mnew ( st , sizeof(State9a) , "Msg10" );
// parse the request
char *p = request;
// save slot for sending reply
st->m_slot = slot;
// get niceness
st->m_niceness = *(char *)p; p++;
// get coll
st->m_coll = p; p += gbstrlen(p) + 1;
// save this
st->m_requestType = *p; p++;
// the "tagRec" is the record
TagRec *tagRec = (TagRec *)p; p += tagRec->getSize();
// store ptr
st->m_tagRec = tagRec;
// reset this, we are the head/tail of the linked list so far
st->m_next = NULL;
// sanity check
//char *site = tagRec->getString(ST_SITE,NULL);
char *site = tagRec->getString("site",NULL);
// this is a no-no
if ( ! site ) { char *xx=NULL;*xx=0;}
// no tail after us
//st->m_tail = NULL;
// . get the lock on this site
// . the lower 64 bits of the key should be the url hash
int32_t slotNum = s_lockTable2.getSlot ( &st->m_tagRec->m_key.n0 );
// if already in there, we have to wait because someone is already
// making mods to this TagRec
if ( slotNum >= 0 ) {
// log this for now?
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"tagdb: TAGDB handleRequest9a "
"waiting for lock st=0x%"XINT32" key.n0=%"UINT64"",(int32_t)st,
st->m_tagRec->m_key.n0);
State9a *p ;
p = *(State9a **)s_lockTable2.getValueFromSlot(slotNum);
// put us right after him in the linked list
st->m_next = p->m_next;
p->m_next = st;
// we could be the next in line
//if ( ! p->m_next ) p->m_next = st;
// we wait...
return;
}
// add ourselves to the lock table so later requests for this key wait in line behind us
if ( ! s_lockTable2.addKey ( &st->m_tagRec->m_key.n0 , &st ) ) {
log("tagdb: failed to get lock : %s",mstrerror(g_errno));
// free him, we sent his reply
mdelete ( st , sizeof(State9a),"msg9afr");
delete (st);
return g_udpServer.sendErrorReply ( slot, g_errno );
}
// make a startKey and endKey from the tagRec's key
key_t startKey = tagRec->m_key;
key_t endKey = tagRec->m_key;
// startkey gets is low bit cleared though
startKey.n0 &= 0xfffffffffffffffeLL;
// delete record request, no need to look it up
if ( st->m_requestType == 0x02 ) {
// note it
SafeBuf sb; tagRec->printToBuf ( &sb );
log("tagdb: deleting TagRec for site %s",sb.getBufStart());
// use tmp buf in st
char *p = st->m_tmp;
// store key in the tmp buf
*(key_t *)p = startKey;
// advance
p += sizeof(key_t);
// and store the data size
*(int32_t *)p = 0;
// advance
p += 4;
// set the list (just a negative rec in it)
st->m_list.set ( st->m_tmp , // list
4+sizeof(key_t) , // listSize
st->m_tmp , // alloc
4+sizeof(key_t) , // allocSize
(char *)&startKey , // startKey
(char *)&endKey , // endKey
-1 , // fixeDataSize
false , // ownData?
false , // useHalfKeys?
sizeof(key_t) );// keySize
if ( ! st->m_msg1.addList( &st->m_list ,
RDB_TAGDB ,
st->m_coll ,
st ,
sendReply9a ,
false , // forceLocal?
st->m_niceness ))
// return if blocked
return;
sendReply9a( st );
return;
}
// . get from msg5, return if it blocked
// . will probably not block since in the disk page cache a lot
if ( ! st->m_msg5.getList ( RDB_TAGDB ,
st->m_coll ,
&st->m_list ,
startKey ,
endKey ,
100000 , // minRecSizes
true , // include tree?
false , // addtocache?
0 , // maxcacheage
0 , // startfilenum
-1 , // numFiles
st ,
gotList ,
st->m_niceness ,
true ))// do err correction?
return;
// log that for debug
//log("tagdb: msg5 call did not block. st=%"UINT32"",(int32_t)st);
// sanity check - why not block if it had corruption?
if ( st->m_msg5.m_msg3.m_hadCorruption ) { char *xx=NULL;*xx=0; }
// it did not block...
gotList( st , NULL , NULL );
}
void gotList ( void *state , RdbList *xxx , Msg5 *yyy ) {
// cast our state class
State9a *st = (State9a *)state;
// return right away if error getting the rec
if ( g_errno ) { sendReply9a ( st ); return; }
// note it
//log("tagdb: in gotlist st=%"UINT32"",(int32_t)st);
// this is the TagRec rdb record
char *rec = st->m_list.getList ();
int32_t recSize = st->m_list.getListSize();
// cast it as a TagRec
TagRec *accRec = &st->m_accRec;
// reset in case not in tagdb and rec/recSize is NULL/0
accRec->reset();
// copy it to our accumulator rec which has room to grow, the list
// does not
gbmemcpy ( (char *)accRec , rec , recSize );
// free that list buffer now, we copied it into a larger buffer
st->m_list.reset();
loop:
// clear it
g_errno = 0;
// . add/remove the tags from the tagRec
// . add will replace tags with the same tag id and username
// . should deal with "negative" tags (addDelTag())
//if ( st->m_requestType == 0x00 ) accRec->addTags ( st->m_tagRec );
//else accRec->removeTags ( st->m_tagRec );
accRec->addTags ( st->m_tagRec );
// was there an error? abandon all operations on this TagRec if so
if ( g_errno ) { sendReply9a ( st ); return; }
// perform operations on others in the queue
st = st->m_next;
// debug for now
if ( st && g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"tagdb: calling lock for st=0x%"XINT32"",(int32_t)st);
// if there was one, do it
if ( st ) goto loop;
// reset to original parent
st = (State9a *)state;
// debug msg
SafeBuf sb; accRec->printToBuf ( &sb );
log(LOG_DEBUG,"tagdb: adding to tagdb: %s",sb.getBufStart());
// set the list, it should free itself
st->m_list.set ( (char *)accRec , // list
accRec->getSize() , // allocSize
(char *)accRec , // alloc
accRec->getSize() , // allocSize
(char *)&accRec->m_key , // startKey
(char *)&accRec->m_key , // endKey
-1 , // fixeDataSize
false , // ownData?
false , // useHalfKeys?
sizeof(key_t) );// keySize
// add it back after the mods
if ( ! st->m_msg1.addList( &st->m_list ,
RDB_TAGDB ,
st->m_coll ,
st ,
sendReply9a ,
false , // forceLocal?
MAX_NICENESS ))// niceness
return;
// i guess we did not block! send back the reply...
sendReply9a ( st );
}
void sendReply9a ( void *state ) {
// cast our state class
State9a *st = (State9a *)state;
// delete our slot from the lock table
s_lockTable2.removeKey ( &st->m_tagRec->m_key.n0 );
// log it
if (g_errno) log("tagdb: msg9a failed to add: %s",mstrerror(g_errno));
// save it, in case a function below clears g_errno
int32_t saved = g_errno;
loop:
if ( saved ) g_udpServer.sendErrorReply( st->m_slot,saved);
// send empty reply
else g_udpServer.sendReply_ass(NULL,0,NULL,0,st->m_slot);
// save old guy
State9a *next = st->m_next;
// free him, we sent his reply
mdelete ( st , sizeof(State9a),"msg9afr");
delete (st);
// repeat for each guy waiting in line
st = next;
// if there was one, do it
if ( st ) goto loop;
// reset to original parent
st = (State9a *)state;
}
*/
///////////////////////////////////////////////
//
// OTHER functions
//
///////////////////////////////////////////////
// . piecewise-linear lookup of "X" in the map given by the "n" sample
//   points (x[0],y[0]) ... (x[n-1],y[n-1])
// . "x" is expected to be in ascending order
// . returns y[0] if X is at or before the first point and y[n-1] if X is
//   after the last point, i.e. the map is clamped, not extrapolated
// . returns 0 if the map is empty (n <= 0) rather than reading y[0]
// . the result is computed in 64 bits and then narrowed to int32_t
int32_t getY ( int64_t X , int64_t *x , int64_t *y , int32_t n ) {
	// an empty map has no y[0] to fall back on
	if ( n <= 0 ) return 0;
	// if we only have one point then there'll be no interpolation
	if ( n == 1 ) return y[0];
	// find the first x at or after our "X"
	int32_t j = 0;
	while ( j < n && x[j] < X ) j++;
	// before/after first/last point means we don't have to interpolate
	if ( j == 0 ) return y[0  ];
	if ( j >= n ) return y[n-1];
	// linear interpolate between our 2 points (x0,y0) and (x1,y1)
	int64_t x0 = x[j-1];
	int64_t x1 = x[j  ];
	int64_t y0 = y[j-1];
	int64_t y1 = y[j  ];
	// . the scan above stopped at the first x[j] >= X, so x0 < X <= x1
	//   and the divisor below is always positive
	// . note that this means an out-of-order "x" array is NOT detected
	//   here; it just yields a value interpolated from whichever pair
	//   of neighbors the scan stopped on
	return y0 + ( (X - x0) * (y1 - y0) ) / (x1 - x0);
}
///////////////////////////////////////////////
//
// sendPageTagdb() is the HTML interface to tagdb
//
///////////////////////////////////////////////
// forward declarations for the tagdb page handlers defined below

// callback handed to Msg8a::getTagRec(); just calls sendReply()
static void sendReplyWrapper ( void *state ) ;
// callback handed to Msg1::addList(); re-fetches the tag rec via getTagRec()
static void sendReplyWrapper2 ( void *state ) ;
// performs the add (if one was requested) or falls through to sendReply2()
static bool sendReply ( void *state ) ;
// builds the xml or html reply for the page
static bool sendReply2 ( void *state ) ;
// launches the Msg8a tag rec lookup for State12::m_url
static bool getTagRec ( class State12 *st );
// don't change name to "State" cuz that might conflict with another
// . per-request state for the tagdb admin page
// . allocated in sendPageTagdb() and passed as the "state" pointer through
//   the getTagRec()/sendReply()/sendReply2() callback chain
class State12 {
public:
//Msg9a m_msg9a;
// socket the http reply is sent on
TcpSocket *m_socket;
// . true while an add/update still has to be performed
// . set by sendPageTagdb() when neither "get" nor "tags" was given and
//   cleared by sendReply() right before the add is launched
bool m_adding;
//char *m_coll;
// collection the lookup/add applies to
collnum_t m_collnum;
//int32_t m_collLen;
//char *m_buf;
//int32_t m_bufLen;
// result of HttpRequest::isLocal() for this request
bool m_isLocal;
//int32_t m_fileNum;
//bool m_isMasterAdmin;
//bool m_isAssassin;
// . Commented by Gourav
// . Reason:user perm no longer used
//char m_userType;
// our own copy of the http request
HttpRequest m_r;
//char *m_username;
// tag rec filled in by the Msg8a lookup; this is what the page displays
TagRec m_tagRec;
// tag rec built from the http request parms; its m_sbuf is what gets added
TagRec m_newtr;
// does the tag rec lookup
Msg8a m_msg8a;
// url the tag rec is looked up for, set from m_urls
Url m_url;
// the "u" parm as returned by m_r.getString(), or "" if it was not given
char *m_urls;
int32_t m_urlsLen;
// adds m_list to tagdb
Msg1 m_msg1;
// list wrapped around m_newtr.m_sbuf for the add
RdbList m_list;
//Msg1 m_msg1;
// niceness used for the lookup and the add (sendPageTagdb() sets it to 0)
int32_t m_niceness;
// true if the "tags" parm was given; passed to Msg8a as "doInheritance"
bool m_mergeTags;
//char m_tmp[16];
};
// . HTTP handler for the tagdb page: parses the request and starts either
//   a tag lookup or a tag add for the url(s) given in "u"
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . the reply itself is built by sendReply2(), reached through sendReply()
// . cgi parms read here:
//   "c"    collection name, required
//   "u"    space-separated list of sites/urls
//   "uenc" if non-zero every '+' in "u" becomes a '\n' so that multiple
//          sites can be put in one link
//   "ufu"  file of urls to tag; only its presence is checked here
//   "get"  lookup only
//   "tags" lookup only, and merge the tags from all matching TagRecs
// . a request with urls (or "ufu") but neither "get" nor "tags" is treated
//   as an add/update; the permission check for that is in sendReply()
// . the old path that fed "gb dump S ..." output straight to Msg9a is
//   disabled, so such input is handled like any other url list
bool sendPageTagdb ( TcpSocket *s , HttpRequest *req ) {
	// get the collection record
	CollectionRec *cr = g_collectiondb.getRec ( req );
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		log("admin: No collection record found "
		    "for specified collection name. Could not add sites to "
		    "tagdb. Returning HTTP status of 500.");
		return g_httpServer.sendErrorReply ( s , 500 ,
						  "collection does not exist");
	}
	// make a state
	State12 *st ;
	try { st = new (State12); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("PageTagdb: new(%"INT32"): %s",
		    (int32_t)sizeof(State12),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));}
	mnew ( st , sizeof(State12) , "PageTagdb" );
	// assume we've nothing to add
	st->m_adding    = false;
	// give this a defined value for the paths that return before the
	// "tags" parm is looked at
	st->m_mergeTags = false;
	// save the socket
	st->m_socket    = s;
	// the caller's request does not outlive this call, so copy it
	st->m_r.copy ( req );
	// make it high priority
	st->m_niceness  = 0;
	// from here on only use our copy
	HttpRequest *r = &st->m_r;
	// get the collection name
	int32_t collLen = 0;
	char *coll = r->getString ( "c" , &collLen , NULL /*default*/);
	// . get collection rec
	// . only do the lookup if we actually got a name so we never hand
	//   a NULL string to getRec(). a missing "c" is an error either way
	CollectionRec *cr2 = NULL;
	if ( coll ) cr2 = g_collectiondb.getRec ( coll );
	// complain if no collection rec found or the name is too long
	if ( ! cr2 || collLen+1 > MAX_COLL_LEN ) {
		g_errno = ENOCOLLREC;
		log("admin: No collection record found "
		    "for specified collection name. Could not add sites to "
		    "tagdb. Returning HTTP status of 500.");
		mdelete ( st , sizeof(State12) , "PageTagdb" );
		delete (st);
		return g_httpServer.sendErrorReply ( s , 500 ,
						  "collection does not exist");
	}
	// . get fields from cgi field of the requested url
	// . get the null-terminated, space-separated list of sites to add
	int32_t urlsLen = 0;
	char *urls = r->getString ( "u" , &urlsLen , NULL /*default*/);
	// a quick hack so we can put multiple sites in a link
	if ( r->getLong("uenc", 0) && urls )
		for ( int32_t i = 0 ; i < urlsLen ; i++ )
			if ( urls[i] == '+' ) urls[i] = '\n';
	// are we coming from a local machine?
	st->m_isLocal = r->isLocal();
	// NOTE: this is the rec found via the original request, not cr2
	st->m_collnum = cr->m_collnum;
	// . do not print "(null)" in the textarea
	// . the empty string is never written to, the cast only drops the
	//   const of the literal
	if ( ! urls ) urls = const_cast<char *>("");
	// the url buffer
	st->m_urls    = urls;
	st->m_urlsLen = urlsLen;
	// nothing to look up or add? then just show the page
	int32_t ufuLen;
	char *ufu = r->getString("ufu",&ufuLen);
	if ( urls[0] == '\0' && ! ufu ) return sendReply ( st );
	char *get = r->getString ("get",NULL );
	// this is also a get operation but merges the tags from all TagRecs
	char *merge = r->getString("tags",NULL);
	// is this an add/update operation? or just get?
	st->m_adding = ( ! get && ! merge );
	// regardless, we have to get the tagrec for all operations
	st->m_url.set(urls,gbstrlen(urls));
	st->m_mergeTags = ( merge != NULL );
	return getTagRec ( st );
}
// . look up the tag rec for st->m_url into st->m_tagRec using Msg8a
// . returns false if the lookup blocked, in which case sendReplyWrapper()
//   is the callback that continues the request
// . otherwise continues with sendReply() right away and returns its result
bool getTagRec ( State12 *st ) {
	// a "tags" request wants the tags of all matching recs merged
	const bool inheritTags = st->m_mergeTags;
	// read from tagdb itself
	const char whichRdb = RDB_TAGDB;
	// this replaces the old msg8a interface
	const bool done =
		st->m_msg8a.getTagRec ( &st->m_url ,
					// tell msg8a to try to guess the site
					NULL ,
					st->m_collnum ,
					false , // skip dom lookup?
					st->m_niceness ,
					st ,
					sendReplyWrapper ,
					&st->m_tagRec ,
					inheritTags ,
					whichRdb );
	// sendReplyWrapper() will be called when it is done
	if ( ! done ) return false;
	// did not block, keep going
	return sendReply ( st );
}
// callback given to Msg8a::getTagRec(): the lookup finished so move on
// to sendReply(). its bool result has nowhere to go from a void callback.
void sendReplyWrapper ( void *state ) {
	static_cast<void>( sendReply ( state ) );
}
static void sendReplyWrapper2 ( void *state ) {
State12 *st = (State12 *)state;
// re-get the tags from msg8a since we changed them
getTagRec(st);
//sendReply2 ( state );
}
bool sendReply ( void *state ) {
// get our state class
State12 *st = (State12 *) state;
// get the request
HttpRequest *r = &st->m_r;
// and socket
TcpSocket *s = st->m_socket;
// the tagrec
//TagRec *gr = &st->m_tagRec;
// reset "gr" so it won't show the old tags of the first rec
// in the text area box on the tagdb page after the add is completed
//if ( st->m_adding ) gr->reset();
// . if urlsLen <= 0 or fileNum < 0 and we're not deleting
// . then we've nothing to add
//if ( urlsLen <= 0 ) return sendReply ( st );
// need a valid username
//if ( ! st->m_username || st->m_username[0] == '\0' ) {
// log("tagdb: bad username.");
// mdelete ( st , sizeof(State12) , "PageTagdb" );
// delete (st);
// return g_httpServer.sendErrorReply(s,500,
// mstrerror(g_errno));
//}
if ( ! st->m_adding ) return sendReply2 ( st );
// no permmission?
bool isMasterAdmin = g_conf.isMasterAdmin ( s , r );
bool isCollAdmin = g_conf.isCollAdmin ( s , r );
if ( ! isMasterAdmin &&
! isCollAdmin ) {
g_errno = ENOPERM;
return sendReply2 ( st );
}
//char *nuke = r->getString ("nuke" ,NULL );
TagRec *newtr = &st->m_newtr;
// update it from the http request
newtr->setFromHttpRequest ( r , s );
// but remove the site tag
//newtr.removeTags ( "site" , NULL );
// add it into gr
//gr->addTags ( &newtr );
// copy it over to our state
//gbmemcpy ( gr , &newtr , newtr.getSize() );
// debug
// this doesn't work because we do not set TagRec::m_listPtrs[0]
// to point to the list we make below (MDW 4/29/13)
//SafeBuf tmp;
//newtr->printToBuf ( &tmp );
//log(LOG_DEBUG,"tagdb: converted from http: %s",
// tmp.getBufStart() );
// make a startKey and endKey from the tagRec's key
//key_t startKey = gr->m_key;
//key_t endKey = gr->m_key;
// startkey gets is low bit cleared though
//startKey.n0 &= 0xfffffffffffffffeLL;
/*
// add using msg9a
if ( ! st->m_msg9a.addTags ( st->m_urls ,
NULL , // sitePtrs
0 , // numSitePtrs
st->m_coll ,
st ,
sendReplyWrapper2 ,
0 , // niceness
&newtr , // gr
nuke ,
NULL )) // ipvec
return false;
*/
// shrotcut
SafeBuf *sbuf = &newtr->m_sbuf;
// use the list we got
RdbList *list = &st->m_list;
key128_t startKey;
key128_t endKey;
startKey.setMin();
endKey.setMax();
// set it from safe buf
list->set ( sbuf->getBufStart() ,
sbuf->length() ,
NULL ,
0 ,
(char *)&startKey ,
(char *)&endKey ,
-1 ,
false ,
false ,
sizeof(key128_t) );
// no longer adding
st->m_adding = false;
// . just use TagRec::m_msg1 now
// . no, can't use that because tags are added using SafeBuf::addTag()
// which first pushes the rdbid, so we gotta use msg4
if ( ! st->m_msg1.addList ( list ,
RDB_TAGDB ,
st->m_collnum ,
st ,
sendReplyWrapper2 ,
false ,
st->m_niceness ) )
return false;
// . if addTagRecs() doesn't block then sendReply right away
// . this returns false if blocks, true otherwise
//return sendReply2 ( st );
return getTagRec ( st );
}
bool sendReply2 ( void *state ) {
// get our state class
State12 *st = (State12 *) state;
// get the request
HttpRequest *r = &st->m_r;
// and socket
TcpSocket *s = st->m_socket;
// page is not more than 32k
char buf[1024*32];
SafeBuf sb(buf, 1024*32);
// do they want an xml reply?
if( r->getLong("xml",0) ) { // was "raw"
sb.safePrintf("<?xml version=\"1.0\" "
"encoding=\"ISO-8859-1\"?>\n"
"<response>\n");
st->m_tagRec.printToBufAsXml(&sb);
sb.safePrintf("</response>");
log ( LOG_INFO,"sending raw page###\n");
// clear g_errno, if any, so our reply send goes through
g_errno = 0;
// extract the socket
TcpSocket *s = st->m_socket;
// . nuke the state
// . first free the buffer, if non-NULL
//if (st->m_buf) mfree (st->m_buf, st->m_bufLen, "PageTagdb");
mdelete(st, sizeof(State12), "PageTagdb");
delete (st);
// . send this page
// . encapsulates in html header and tail
// . make a Mime
return g_httpServer.sendDynamicPage(s, sb.getBufStart(),
sb.length(),
0, false, "text/xml",
-1, NULL, "ISO-8859-1");
}
// . print standard header
// . do not print big links if only an assassin, just print host ids
g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r );
// did we add some sites???
if ( st->m_adding ) {
// if there was an error let them know
if ( g_errno )
sb.safePrintf("<center>Error adding site(s): <b>"
"%s[%i]</b><br><br></center>\n",
mstrerror(g_errno) , g_errno );
else sb.safePrintf ("<center><b><font color=red>"
"Sites added successfully"
"</font></b><br><br></center>\n");
}
//char *c = st->m_coll;
char bb [ MAX_COLL_LEN + 60 ];
bb[0]='\0';
sb.safePrintf(
"<style>"
".poo { background-color:#%s;}\n"
"</style>\n" ,
LIGHT_BLUE );
// print interface to add sites
sb.safePrintf (
"<table %s>"
"<tr><td colspan=2>"
"<center><b>Tagdb</b>%s</center>"
"</td></tr>", TABLE_STYLE , bb );
// sometimes we add a huge # of urls, so don't display them because
// it like freezes the silly browser
char *uu = st->m_urls;
if ( st->m_urlsLen > 100000 ) uu = "";
//sb.safePrintf ( "<tr bgcolor=#%s><td colspan=2>"
// "<center>"
// "</center>"
// "</td></tr>",
// DARK_BLUE);
sb.safePrintf ( "<tr class=poo><td>"
"<b>urls</b>"
"<br>"
"<font size=-2>"
"Enter a single URL and then click <i>Get Tags</i> to "
"get back its tags. Enter multiple URLs and select "
"the tags names and values in the other table "
"below in order to tag "
"them all with those tags when you click "
"<i>Add Tags</i>. "
"On the command line you can also issue a "
"<i>./gb 0 dump S main 0 -1 1</i>"
"command, for instance, to dump out the tagdb "
"contents for the <i>main</i> collection on "
"<i>host #0</i>. "
"</font>"
"</td>");
// text area for adding space separated sites/urls
//char *pp = "put sites here";
//char *pp = "";
//if ( st->m_bufLen > 0 ) pp = st->m_buf; // no, print out "urls"
sb.safePrintf (""
"<td width=70%%>"
"<br>"
"<textarea rows=16 cols=64 name=u>"
"%s</textarea></td></tr>" , uu );
// spam assassins should not use this much power, too risky
//if ( st->m_isMasterAdmin ) {
// sb.safePrintf ("<i><font size=-1>Note: use 1.2.3.<b>0</b> to "
// "specify ip domain.</i><br>");
//}
// allow filename to load them from
//if ( st->m_isMasterAdmin ) {
sb.safePrintf("<tr class=poo>"
"<td>"
"<b>file of urls to tag</b>"
"<br>"
"<font size=-2>"
"If provided, Gigablast will read the URLs from "
"this file as if you pasted them into the text "
"area above. The text area will also be ignored."
"</font>"
"</td>"
"<td><input name=ufu "
"type=text size=40>"//<br>"
//"<i>file can also be dumped output of "
//"tagdb from the <b>gb dump S ...</b> "
//"command.</i>"
//"<br><br>" );
"</td></tr>"
);
//}
// this is applied to every tag that is added for accountability
sb.safePrintf("<tr class=poo><td>"
"<b>username</b>"
"<br><font size=-2>"
"Stored with each tag you add for accountability."
"</font>"
"</td><td>"
"<input name=username type=text size=6 "
"value=\"admin\"> "
"</td></tr>"
);//,st->m_username);
// as a safety, this must be checked for any delete operation
sb.safePrintf ("<tr class=poo><td><b>delete operation</b>"
"<br>"
"<font size=-2>"
"If checked "
"then the tag names you specify below will be "
"deleted for the URLs you provide in the text area "
"when you click <i>Add Tags</i>."
"</font>"
"</td><td><input type=\"checkbox\" "
"value=\"1\" name=\"delop\"></td></tr>");
// close up
sb.safePrintf ("<tr bgcolor=#%s><td colspan=2>"
"<center>"
// this is merge all by default right now but since
// zak is really only using eventtaghashxxxx.com we
// should be ok
"<input type=submit name=get "
"value=\"Get Tags\" border=0>"
//"<input type=submit name=get "
//"value=\"get best rec\" border=0>"
//"<input type=submit name=tags "
//"value=\"merge all matching recs\" border=0>"
//"<input type=submit name=nuke "
//"value=\"delete recs\" border=0>"
// "</form>"
"</center>"
"</td></tr></table>"
"<br><br>"
, DARK_BLUE
);
// . show all tags we got values for
// . put a delete checkbox next to each one
// . show 5-10 dropdowns for adding new tags
// for some reason the "selected" option tags do not show up below
// on firefox unless i have this line.
sb.safePrintf (
"<table %s>"
"<tr><td colspan=20>"
"<center><b>Add Tag</b></center>"
"</td></tr>", TABLE_STYLE );
// count how many "tagRecs" we are taking tags from
Tag *jtag = st->m_tagRec.getFirstTag();
int32_t numTagRecs = 0;
for ( ; jtag ; jtag = st->m_tagRec.getNextTag(jtag) ) {
// skip dups
if ( jtag->m_type == TT_DUP ) continue;
// count # of TagRecs contributing to the tags
//if ( tag && tag->m_type == ST_SITE ) numTagRecs++;
if ( jtag && jtag->isType("site") ) numTagRecs++;
}
// if we are displaying a COMBINATION of TagRecs merged together in
// the inheritance loop (above) then you can not edit that! you can
// only edit individual tag recs
bool canEdit = (numTagRecs <= 1);
if ( ! canEdit )
sb.safePrintf("<tr class=poo>"
"<td colspan=10><center><font color=red>"
"<b>Can not edit because more than one "
"TagRecs were merged</b></font></center>"
"</td></tr>\n" );
// headers
sb.safePrintf("<tr bgcolor=#%s>"
//"<td><b>delete?</b></td>"
"<td><b>del?</b></td>"
"<td><b>tag name</b></td>"
"<td><b>tag value</b></td>"
"<td><b>datasize (with NULL)</b></td>"
"<td><b>username</b></td>"
"<td><b>timestamp</b></td>"
"<td><b>user ip</b></td>"
"<td><b>deduphash32</b></td>"
"<td><b>sitehash32</b></td>"
"</tr>\n",
DARK_BLUE);
// set up the loop
Tag *itag = st->m_tagRec.getFirstTag();
//last = NULL;
int32_t count = 0;
int32_t empty = 0;
// loop over all tags in TagRec
for ( ; empty < 3 ; count++ ) {
// use this tag to print from
Tag *ctag = itag;
// advance
if ( itag ) itag = st->m_tagRec.getNextTag(itag);
// make it NULL, do not start over at the beginning
if ( empty > 0 ) ctag = NULL;
// skip dups
if ( ctag && ctag->m_type == TT_DUP ) continue;
// if ctag NULL and we are getting all tags, break
if ( ! canEdit && ! ctag ) break;
// assign for looping
//last = tag;
// if we are NULL, print out 3 empty tags
if ( ! ctag ) empty++;
// start the section
sb.safePrintf("<tr class=poo>");
// the delete tag checkbox
//sb.safePrintf("<tr bgcolor=#%s><td>",DARK_BLUE);
sb.safePrintf("<td>");
if ( ctag && canEdit ) // && tag->m_type != ST_SITE )
sb.safePrintf("<input name=deltag%"INT32" "
"type=checkbox>",count);
else
sb.safePrintf("&nbsp;");
sb.safePrintf("</td>");
// start the next cell
sb.safePrintf("<td>");
// . skip ST_SITE, do not show dropdown for that
// . no, because for looking up tagRecs i like to see
// the site tag value, to see what subdomain is matched
//if ( ctag && ctag->m_type == ST_SITE ) continue;
// print drop down
if ( ! ctag ) sb.safePrintf("<select name=tagtype%"INT32">",count);
// how many tags do we have?
int32_t n = (int32_t)sizeof(s_tagDesc)/(int32_t)sizeof(TagDesc);
// the options
for ( int32_t i = 0 ; ! ctag && i < n ; i++ ) {
TagDesc *td = &s_tagDesc[i];
// get tag name
char *tagName = td->m_name;
// skip if a reserved tag
//if ( strncasecmp ( tagName , "reserved" ,8)==0 )
// continue;
// select the item in the dropdown
char *selected = "";
// was it selected?
if ( ctag && td->m_type == ctag->m_type )
selected = " selected";
// show it in the drop down list
sb.safePrintf("<option value=\"%s\"%s>%s",
tagName,selected,tagName);
}
// close up the drop down list
if ( ! ctag ) sb.safePrintf("</select>");
else {
char *tagName = getTagStrFromType ( ctag->m_type );
sb.safePrintf("<input type=hidden name=tagtype%"INT32" "
"value=\"%s\">%s",
count,tagName,tagName);
}
sb.safePrintf("</td><td>");
// the score field for the drop down list, whatever tag id
// was selected will have this score
if ( canEdit )
sb.safePrintf("<input type=text name=tagdata%"INT32" "
"size=50 value=\"",count);
// show the value
if ( ctag ) ctag->printDataToBuf ( &sb );
// close up the input tag
if ( canEdit ) sb.safePrintf("\">");
// close up table cell
sb.safePrintf("\n</td>");
// if no tag, just placeholders
if ( ! ctag ) {
sb.safePrintf("<td>&nbsp;</td>"
"<td>&nbsp;</td>"
"<td>&nbsp;</td>"
"<td>&nbsp;</td>"
"<td>&nbsp;</td>"
"<td>&nbsp;</td></tr>");
continue;
}
// data size
sb.safePrintf("<td>%"INT32"</td>",(int32_t)ctag->getTagDataSize());
// username, timestamp only for non-empty tags
char *username = ctag->getUser();
int32_t timestamp = ctag->m_timestamp;
int32_t ip = 0;
char *ips = "&nbsp;";
if ( ctag->m_ip ) { ip=ctag->m_ip; ips=iptoa(ctag->m_ip);}
// convert timestamp to string
char tmp[64];
sprintf(tmp,"&nbsp;");
time_t ts = timestamp;
struct tm *timeStruct = localtime ( &ts );
if ( timestamp )
strftime(tmp,64,"%b-%d-%Y-%H:%M:%S",timeStruct);
sb.safePrintf("<td><input type=hidden name=taguser%"INT32" "
"value=%s>%s</td>",
count,username,username);
sb.safePrintf("<td><input type=hidden name=tagtime%"INT32" "
"value=%"INT32">%s</td>",
count,timestamp,tmp);
sb.safePrintf("<td><input type=hidden name=tagip%"INT32" "
"value=%"INT32">%s",
count,ip,ips);
sb.safePrintf("<input type=hidden name=tagn1key%"INT32" "
"value=%"UINT64">",
count,ctag->m_key.n1);
sb.safePrintf("<input type=hidden name=tagn0key%"INT32" "
"value=%"UINT64">",
count,ctag->m_key.n0);
sb.safePrintf("</td>");
sb.safePrintf("<td>0x%"XINT32"</td>", (int32_t)(ctag->m_key.n0>>32) );
sb.safePrintf("<td>0x%"XINT32"</td>",
// order 1 in since we always do that because
// we forgot to shift up one for the delbit
// above in Tag::set() when it sets m_key.n0
(int32_t)(ctag->m_key.n0&0xffffffff) | 0x01);
//sb.safePrintf("<td>%s</td><td>%s</td><td>%s</td>",
// username,tmp,ips);
sb.safePrintf("</tr>");
}
// do not print add or del tags buttons if we got tags from more
// than one TagRec!
if ( canEdit )
sb.safePrintf ("<tr bgcolor=#%s><td colspan=10><center>"
"<input type=submit name=add "
"value=\"Add Tags\" border=0>"
"</center></td>"
"</tr>\n",DARK_BLUE);
sb.safePrintf ( "</center></table>" );
sb.safePrintf ("</form>");
sb.safePrintf ("</html>");
// clear g_errno, if any, so our reply send goes through
g_errno = 0;
// calculate buffer length
// extract the socket
//TcpSocket *s = st->m_socket;
// . nuke the state
// . first free the buffer, if non-NULL
//if ( st->m_buf ) mfree ( st->m_buf , st->m_bufLen , "PageTagdb" );
mdelete ( st , sizeof(State12) , "PageTagdb" );
delete (st);
// print it out
//logf(LOG_DEBUG,"tagdb: %s",sb.getBufStart()+sb.length()-256);
// . send this page
// . encapsulates in html header and tail
// . make a Mime
return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length());
}
//void classifierDoneWrapper ( void *state ) {
// g_tagdbClassifier.m_running = false;
//}
// . we can have multiple tags of this type per tag for a single username
// . by default, there can be multiple tags of the same type in the Tag as
// long as the usernames are all different. see addTag()'s deduping below.
// . returns false if tag type "tt" is TT_DUP, is missing from the s_ht tag
//   type table, or has the TDF_ARRAY flag set in its TagDesc
// . returns true otherwise, including when the table slot holds a NULL
//   TagDesc pointer
// . Tag::getDedupHash() uses this to decide whether the tag type alone
//   serves as the dedup hash
bool isTagTypeUnique ( int32_t tt ) {
	// a dup?
	if ( tt == TT_DUP ) return false; // TT_DUP = 123456
	// make sure table is valid
	if ( ! s_initialized ) g_tagdb.setHashTable();
	// look up in hash table
	TagDesc **tdp = (TagDesc **)s_ht.getValue ( &tt );
	// do not core for now. no slot means the type is not in the table.
	if ( ! tdp ) {
		log("tagdb: got unknown tag type %"INT32" assuming "
		    "not unique",tt);
		return false;
	}
	// the slot exists but carries no descriptor
	TagDesc *td = *tdp;
	if ( ! td ) {
		log("tagdb: tag desc is NULL for tag type %"INT32" assuming "
		    "unique",tt);
		return true;
	}
	// types flagged as arrays are the non-unique ones
	if ( td->m_flags & TDF_ARRAY) return false;
	return true;
}
bool isTagTypeIndexable ( int32_t tt ) {
// a dup?
if ( tt == TT_DUP ) return false; // TT_DUP = 123456
// make sure table is valid
if ( ! s_initialized ) g_tagdb.setHashTable();
// look up in hash table
TagDesc **tdp = (TagDesc **)s_ht.getValue ( &tt );
// do not core for now
if ( ! tdp ) {
log("tagdb: got unknown tag type %"INT32" assuming "
"not indexable",tt);
return false;
}
TagDesc *td = *tdp;
if ( ! td ) {
log("tagdb: tag desc is NULL for tag type %"INT32" assuming "
"not indexable",tt);
return false;
}
// if none, that is crazy MDW coring here:
if ( ! td ) { char *xx=NULL;*xx=0; }
// return false if we should not index it
if ( td->m_flags & TDF_NOINDEX ) return false;
// otherwise, index it
return true;
}
// . when displaying a tag we need to know if it is a string or not
// . that and the dataSize determine how we display it
/*
bool isTagTypeString ( int32_t tt ) {
// look up in hash table
TagDesc *td = (TagDesc **)s_ht.getValue ( tt );
// if none, that is crazy
if ( ! td ) { char *xx=NULL;*xx=0; }
// return
return (td->m_flags & TDF_STRING);
}
*/
// . two Tags with the same dedup hash are candidates for one overwriting
//   the other
// . for a unique tag type (see isTagTypeUnique()) the hash is simply m_type
// . otherwise it is hash32() over the bytes from m_type through the end of
//   m_buf, so only a tag with the same type, username and data replaces us
//   and the update just refreshes the timestamp and/or ip
int32_t Tag::getDedupHash ( ) {
	// unique types dedup on the type alone
	if ( isTagTypeUnique ( m_type ) ) return m_type;
	// remember the true size, it gets zeroed while we hash
	int32_t realBufSize = m_bufSize;
	// span to hash: from m_type up to the end of the data in m_buf
	// (username and tag data included)
	char *hashStart = (char *)&m_type;
	char *hashEnd   = m_buf + realBufSize;
	int32_t numBytes = hashEnd - hashStart;
	// keep the size field itself out of the hash.
	// NOTE(review): presumably m_bufSize lies inside the hashed span,
	// confirm against the Tag layout in Tagdb.h
	m_bufSize = 0;
	int32_t dedupHash = hash32 ( hashStart , numBytes );
	// put the real size back
	m_bufSize = realBufSize;
	return dedupHash;
}
// . fixed-size records stored back to back in m_siteBuf1 (Entry1) and
//   m_siteBuf2 (Entry2)
// . byte packing is needed so that sizeof(Entry2) is 5 and not 8
// . NOTE: nothing in the code that follows here restores the default
//   packing, so pack(1) stays in effect for later declarations too
#pragma pack(1)
// record for hosts whose inlink count needs a full 32 bits
struct Entry1 {
	// 32-bit hash of the hostname
	uint32_t m_hostHash32;
	// inlink count for that host
	uint32_t m_siteNumInlinksUniqueCBlock;
};
// compact record for hosts whose inlink count fits in a single byte
struct Entry2 {
	// 32-bit hash of the hostname
	uint32_t m_hostHash32;
	// inlink count for that host
	uint8_t  m_siteNumInlinksUniqueCBlock;
};
static int linkSort1Cmp ( const void *a, const void *b ) {
Entry1 *ea = (Entry1 *)a;
Entry1 *eb = (Entry1 *)b;
if ( ea->m_hostHash32 > eb->m_hostHash32 ) return 1;
if ( ea->m_hostHash32 < eb->m_hostHash32 ) return -1;
return 0;
}
static int linkSort2Cmp ( const void *a, const void *b ) {
Entry2 *ea = (Entry2 *)a;
Entry2 *eb = (Entry2 *)b;
if ( ea->m_hostHash32 > eb->m_hostHash32 ) return 1;
if ( ea->m_hostHash32 < eb->m_hostHash32 ) return -1;
return 0;
}
bool Tagdb::loadMinSiteInlinksBuffer ( ) {
if ( ! loadMinSiteInlinksBuffer2() ) return false;
// sanity testing
uint32_t hostHash32 = hash32n("www.imdb.com");
int32_t msi = getMinSiteInlinks ( hostHash32 );
if ( msi < 10 ) {
log("tagdb: bad siteinlinks. linkedin.com not found.");
//return false;
}
hostHash32 = hash32n("0009.org" );
msi = getMinSiteInlinks ( hostHash32 );
if ( msi < 0 ) {
log("tagdb: bad siteinlinks. 0009.org not found.");
//return false;
}
// slot #1 in the buffer. make sure b-stepping doesn't lose it between
// the roundoff error cracks.
hostHash32 = hash32n("www.hindu.com");
msi = getMinSiteInlinks ( hostHash32 );
if ( msi < 3 ) {
log("tagdb: bad siteinlinks. www.hindu.com not found "
"(%"INT32").",
hostHash32);
//return false;
}
Url tmp;
tmp.set("gnu.org");
hostHash32 = tmp.getHash32WithWWW();
msi = getMinSiteInlinks ( hostHash32 );
if ( msi < 0 ) {
log("tagdb: bad siteinlinks. www.gnu.org not found.");
//return false;
}
return true;
}
// . fill m_siteBuf1 (Entry1 records) and m_siteBuf2 (Entry2 records), each
//   sorted by host hash, for getMinSiteInlinks() to search
// . first try the saved sitelinks1.dat and sitelinks2.dat. if either one
//   comes back empty, rebuild both from sitelinks.txt and save them
// . each sitelinks.txt line is parsed as a number, spaces, then a hostname
// . returns false if sitelinks.txt is needed but could not be loaded
bool Tagdb::loadMinSiteInlinksBuffer2 ( ) {
	// use 4 bytes for the first 130,000 entries or so to hold
	// # of site inlinks. then we only need 1 byte since the remaining
	// 25M are <256 sitenuminlinksuniquecblocks
	m_siteBuf1.load(g_hostdb.m_dir,"sitelinks1.dat","stelnks1");
	m_siteBuf2.load(g_hostdb.m_dir,"sitelinks2.dat","stelnks2");
	m_siteBuf1.setLabel("sitelnks");
	m_siteBuf2.setLabel("sitelnks");
	if ( m_siteBuf1.length() > 0 &&
	     m_siteBuf2.length() > 0 )
		return true;
	// one of the two files may have loaded. drop it, otherwise the
	// rebuilt records below get appended after the loaded ones and the
	// buffer is no longer a single sorted run.
	m_siteBuf1.reset();
	m_siteBuf2.reset();
	log("gb: loading %ssitelinks.txt",g_hostdb.m_dir);
	// ok, make it
	SafeBuf tmp;
	tmp.load(g_hostdb.m_dir,"sitelinks.txt");
	if ( tmp.length() <= 0 ) {
		log("gb: fatal error. could not find required file "
		    "./sitelinks.txt");
		return false;
	}
	log("gb: starting initial creation of sitelinks1.dat and "
	    "sitelinks2.dat files");
	// now parse each line in that
	char *p    = tmp.getBufStart();
	char *pend = p + tmp.length();
	char *newp = NULL;
	SafeBuf buf1;
	SafeBuf buf2;
	int32_t count = 0;
	for ( ; p < pend ; p = newp ) {
		if ( ++count % 1000000 == 0 )
			log("gb: parsing line # %"INT32,count);
		// advance to next line
		newp = p;
		for ( ; newp < pend && *newp != '\n' ; newp++ );
		if ( newp < pend ) newp++;
		// parse this line.
		// NOTE(review): atoi() stops at the first non-digit. this
		// assumes the loaded text is terminated, confirm in SafeBuf.
		int32_t numLinks = atoi(p);
		// skip number. never scan beyond the loaded text.
		for ( ; p < pend && *p && *p != ' ' && *p != '\n' ; p++ );
		// strange, no hostname on this line
		if ( p >= pend || ! *p || *p == '\n' ) continue;
		// skip spaces
		for ( ; p < pend && *p == ' ' ; p++ );
		// get hostname
		char *host = p;
		// find end of it
		for ( ; p < pend && *p && *p != '\n' &&
			      *p != ' ' && *p != '\t' ; p++ );
		// hash it
		uint32_t hostHash32 = hash32 ( host , p - host );
		// store in buffer. counts of 256 and up need the wide record.
		if ( numLinks >= 256 ) {
			Entry1 e1;
			e1.m_siteNumInlinksUniqueCBlock = numLinks;
			e1.m_hostHash32 = hostHash32;
			buf1.safeMemcpy ( &e1 , sizeof(Entry1) );
		}
		else {
			Entry2 e2;
			e2.m_siteNumInlinksUniqueCBlock = numLinks;
			e2.m_hostHash32 = hostHash32;
			buf2.safeMemcpy ( &e2 , sizeof(Entry2) );
		}
	}
	log("gb: sorting sitelink data");
	// now sort each one by host hash
	qsort ( buf1.getBufStart() ,
		buf1.length()/sizeof(Entry1),
		sizeof(Entry1),
		linkSort1Cmp );
	qsort ( buf2.getBufStart() ,
		buf2.length()/sizeof(Entry2),
		sizeof(Entry2),
		linkSort2Cmp );
	// now copy to the official buffer so we only alloc what we need
	m_siteBuf1.safeMemcpy ( &buf1 );
	m_siteBuf2.safeMemcpy ( &buf2 );
	log("gb: saving sitelinks1.dat and sitelinks2.dat");
	m_siteBuf1.save(g_hostdb.m_dir,"sitelinks1.dat");
	m_siteBuf2.save(g_hostdb.m_dir,"sitelinks2.dat");
	return true;
}
int32_t Tagdb::getMinSiteInlinks ( uint32_t hostHash32 ) {
if ( m_siteBuf1.length() <= 0 ) {
log("tagdb: load not called");
char *xx=NULL;*xx=0;
}
// first check buf1 doing bstep
int32_t ne = m_siteBuf1.length() / sizeof(Entry1);
Entry1 *ep = (Entry1 *)m_siteBuf1.getBufStart();
Entry2 *fp = NULL;
int32_t i = ne / 2;
int32_t step = ne / 2;
int32_t count = 0;
int32_t divs = 0;
int32_t dir = 0;
loop1:
if ( i < 0 ) i = 0;
if ( i >= ne ) i = ne-1;
step /= 2;
if ( step == 1 )
goto linearScan1;
if ( hostHash32 < ep[i].m_hostHash32 ) {
i -= step;
goto loop1;
}
if ( hostHash32 > ep[i].m_hostHash32 ) {
i += step;
goto loop1;
}
return ep[i].m_siteNumInlinksUniqueCBlock;
linearScan1:
if ( hostHash32 < ep[i].m_hostHash32 ) {
if ( i == 0 ) goto tryNextBuf;
if ( dir == +1 ) goto tryNextBuf;
i--;
dir = -1;
goto linearScan1;
}
if ( hostHash32 > ep[i].m_hostHash32 ) {
if ( i == ne-1 ) goto tryNextBuf;
if ( dir == -1 ) goto tryNextBuf;
i++;
dir = +1;
goto linearScan1;
}
return ep[i].m_siteNumInlinksUniqueCBlock;
tryNextBuf:
// reset parms
ne = m_siteBuf2.length() / sizeof(Entry2);
fp = (Entry2 *)m_siteBuf2.getBufStart();
i = ne / 2;
step = ne / 2;
count = 0;
divs = 0;
dir = 0;
loop2:
if ( i < 0 ) i = 0;
if ( i >= ne ) i = ne-1;
step /= 2;
if ( step == 1 )
goto linearScan2;
if ( hostHash32 < fp[i].m_hostHash32 ) {
i -= step;
goto loop2;
}
if ( hostHash32 > fp[i].m_hostHash32 ) {
i += step;
goto loop2;
}
return fp[i].m_siteNumInlinksUniqueCBlock;
linearScan2:
if ( hostHash32 < fp[i].m_hostHash32 ) {
if ( i == 0 ) return -1;
if ( dir == +1 ) return -1;
i--;
dir = -1;
goto linearScan2;
}
if ( hostHash32 > fp[i].m_hostHash32 ) {
if ( i == ne-1 ) return -1;
if ( dir == -1 ) return -1;
i++;
dir = +1;
goto linearScan2;
}
return fp[i].m_siteNumInlinksUniqueCBlock;
}