mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 20:27:43 +03:00
8a49e87a61
now we store a "sharded by termid" bit in posdb key for checksums, etc keys that are not sharded by docid. save having to do disk seeks on every host in the cluster to do a dup check, etc.
4815 lines
143 KiB
C++
4815 lines
143 KiB
C++
#include "gb-include.h"
|
|
|
|
#include <sys/stat.h>
|
|
#include "Titledb.h"
|
|
#include "Tagdb.h"
|
|
#include "Categories.h"
|
|
#include "Unicode.h"
|
|
#include "Threads.h"
|
|
#include "Msg1.h"
|
|
#include "HttpServer.h"
|
|
#include "Pages.h"
|
|
#include "SiteGetter.h"
|
|
#include "HashTableX.h"
|
|
#include "Users.h"
|
|
#include "Process.h"
|
|
#include "Rebalance.h"
|
|
|
|
static void gotMsg0ReplyWrapper ( void *state );
|
|
//static void gotReplyWrapper9a ( void *state , UdpSlot *slot ) ;
|
|
|
|
//static void gotList ( void *state , RdbList *xxx , Msg5 *yyy ) ;
|
|
//static void sendReply9a ( void *state ) ;
|
|
|
|
static HashTable s_ht;
|
|
|
|
static bool s_initialized = false;
|
|
|
|
// to stdout
|
|
long Tag::print ( ) {
|
|
SafeBuf sb;
|
|
printToBuf ( &sb );
|
|
// dump that
|
|
return fprintf(stderr,"%s\n",sb.getBufStart());
|
|
}
|
|
|
|
// . append a one-line text description of this tag to "sb"
// . output is the three key fields followed by
//   TAG=<type>,"<user>",<date>,(<timestamp>),<ip>,"<data>"
// . returns false only if printDataToBuf() returns false
bool Tag::printToBuf ( SafeBuf *sb ) {
	// the key: n1 in full, then n0 split into its upper and lower 32 bits
	sb->safePrintf("k.hsthash=%016llx k.duphash=%08lx k.sitehash=%08lx ",
		       m_key.n1,
		       (long)(m_key.n0>>32),
		       (long)(m_key.n0&0xffffffff));

	// tag type name and the user that added it
	sb->safePrintf ( "TAG=%s,\"%s\"," ,
			 getTagStrFromType(m_type) ,
			 getUser() );

	// when the tag was added: local time in readable form, then the raw
	// seconds-since-epoch value in parentheses
	time_t addedTime = m_timestamp;
	char   dateStr[100];
	strftime ( dateStr , 100 , "%b-%d-%Y-%H:%M:%S," ,
		   localtime ( &addedTime ) );
	sb->safePrintf("%s(%lu),",dateStr,m_timestamp);

	// ip the tag was added from
	sb->safePrintf("%s,",iptoa(m_ip));

	// finally the tag data wrapped in double quotes
	sb->safePrintf("\"");
	if ( ! printDataToBuf ( sb ) ) return false;
	sb->safePrintf("\"");
	return true;
}
|
|
|
|
// . "site" can also be a specific url, but it must be normalized
|
|
// . i.e. of the form http://xyz.com/
|
|
void Tag::set ( char *site ,
|
|
char *tagname ,
|
|
long timestamp ,
|
|
char *user ,
|
|
long ip ,
|
|
char *data ,
|
|
long dataSize ) {
|
|
// get type from name
|
|
m_type = getTagTypeFromStr ( tagname , strlen(tagname) );
|
|
// sanity
|
|
//isTagTypeIndexable ( m_type );
|
|
m_timestamp = timestamp;
|
|
m_ip = ip;
|
|
long userLen = 0;
|
|
if ( user ) userLen = gbstrlen(user);
|
|
// truncate to 127 byte long
|
|
if ( userLen > 126 ) userLen = 126;
|
|
// first byte is size of user, then user plus \0 then data
|
|
//m_bufSize = 1 + userLen + 1 + dataSize;
|
|
// "site" must skip http://
|
|
//long slen = gbstrlen(site);
|
|
//if ( slen > 8 && strncasecmp(site,"http://",7)==0 )
|
|
// site += 7;
|
|
//else if ( slen > 8 && strncasecmp(site,"https://",8)==0 )
|
|
// site += 8;
|
|
|
|
// normalize
|
|
Url norm;
|
|
norm.set ( site );
|
|
|
|
// store user into special buffer
|
|
//long ulen = 0;
|
|
//if ( user ) {
|
|
// ulen = gbstrlen(user);
|
|
// if ( ulen > 7 ) ulen = 7;
|
|
//}
|
|
//memset ( m_user , 0 , 8 );
|
|
//memcpy ( m_user , user , ulen );
|
|
char *p = m_buf;
|
|
// store size (includes \0)
|
|
*p++ = userLen + 1;
|
|
// then user name
|
|
memcpy ( p , user , userLen );
|
|
p += userLen;
|
|
// then \0
|
|
*p++ = '\0';
|
|
// store data now too
|
|
memcpy ( p , data , dataSize );
|
|
p += dataSize;
|
|
// NULL terminate if they did not! now all tag are strings and must
|
|
// be NULL terminated.
|
|
if ( data && p[-1] ) { // data && m_data[dataSize-1] ) {
|
|
//m_data[dataSize] = '\0';
|
|
*p++ = '\0';
|
|
//dataSize++;
|
|
//m_dataSize++;
|
|
}
|
|
// set it
|
|
m_bufSize = p - m_buf;
|
|
|
|
// top X bits should be hash of the domain only so all recs are on the
|
|
// same host near each other
|
|
//m_key.n1 = hash32 ( norm.getDomain() , norm.getDomainLen());
|
|
//
|
|
// too many tags were being read when k.n1 was the domain hash for
|
|
// sites like az.com that had hundreds of subdomains. so go based on
|
|
// host instead.
|
|
//
|
|
// CRAP: using 32 bit hash we get collisions for crap like
|
|
// thedietsolutionprogramscam.com and
|
|
// 2witchdoctors.a-livejasmin.com
|
|
// so let's move to 64bit keys
|
|
//m_key.n1 = hash64 ( norm.getHost() , norm.getHostLen());
|
|
// i had to make this the hash of the site, not host,
|
|
// because www.last.fm/user/xxxxx/
|
|
// was making the rdblist a few megabytes big!!
|
|
m_key.n1 = hash64n ( site );
|
|
// assume we are unique tag, that many of this type can exist
|
|
uint32_t upper32 = getDedupHash(); // m_type;
|
|
/*
|
|
// if we are NOT unique... then hash username and data. thus we only
|
|
// replace a key if its the same tagtype, username and data. that
|
|
// way it will just update the timestamp and/or ip.
|
|
if ( ! isTagTypeUnique ( m_type ) ) {
|
|
// start hashing here
|
|
char *startHashing = (char *)&m_type;
|
|
// end here. include username (and tag data!)
|
|
char *endHashing = m_buf + m_bufSize;
|
|
// hash this many bytes
|
|
long hashSize = endHashing - startHashing;
|
|
// . set key
|
|
upper32 = hash32 ( startHashing , hashSize );
|
|
}
|
|
*/
|
|
|
|
// put in upper 32
|
|
m_key.n0 = upper32;
|
|
// shift it up
|
|
m_key.n0 <<= 32;
|
|
// . then or in url hash
|
|
// . for the site "www.paypal.com:1234" this included the port!
|
|
// but for the most part if the site is just a hostname then
|
|
// this is basically just a hostname, too, but the hash will
|
|
// include the http:// and the ending /
|
|
// . www.paypal.com:1234 was added as a site. so it has the
|
|
// same m_key.n1 as www.paypal.com, but this part is different
|
|
// here. this is the full site hash really. so during the lookup
|
|
// i'd say filter out such tags if they don't match the site you
|
|
// are looking up.
|
|
//m_key.n0 |= (uint32_t) hash32 ( norm.getUrl() , norm.getUrlLen() );
|
|
// set positive bit so its not a delete record
|
|
m_key.n0 |= 0x01;
|
|
|
|
// the size of this class as an Rdb record
|
|
m_recDataSize = m_bufSize + sizeof(Tag) - sizeof(key128_t) - 4;
|
|
}
|
|
|
|
// . return # of ascii chars scanned in "p"
|
|
// . return 0 on error
|
|
// . parses output of printToBuf() above
|
|
// . k.n1=0x695b3 k.n0=0xa4118684fa4edf93 version=0 TAG=ruleset,"mwells",Jan-02-2009-18:26:04,<timestamp>,67.16.94.2,3735437892,36 TAG=blog,"mwells",Jan-02-2009-18:26:04,67.16.94.2,2207516434,1 TAG=site,"tagdb",Jan-02-2009-18:26:04,0.0.0.0,833534375,mini-j-gaidin.livejournal.com/
|
|
long Tag::setFromBuf ( char *p , char *pend ) {
|
|
// save our place
|
|
char *start = p;
|
|
// tags always start with " TAG="
|
|
if ( strncmp(p," TAG=",5) ) {
|
|
log("tagdb: error processing tag in setFromBuf().");
|
|
return 0;
|
|
}
|
|
// skip that
|
|
p += 5;
|
|
|
|
// get the type
|
|
char *type = p;
|
|
// get type length
|
|
while ( p < pend && *p != ',' ) p++;
|
|
// error?
|
|
if ( p == pend ) return 0;
|
|
// that is the length
|
|
long typeLen = p - type;
|
|
// convert to number
|
|
m_type = getTagTypeFromStr ( type , typeLen );
|
|
// panic?
|
|
if ( m_type == -1 ) { char *xx=NULL;*xx=0;}
|
|
// now the user, skip comma and quote
|
|
p+=2;
|
|
|
|
// data buffer
|
|
char *dst = m_buf;
|
|
// point to it
|
|
char *user = p;
|
|
// get end of it
|
|
while ( p < pend && *p != '\"' ) p++;
|
|
// error?
|
|
if ( p == pend ) return 0;
|
|
// set length
|
|
long userLen = p - user;
|
|
// sanity. username total buf space including \0 <= 8
|
|
if ( userLen > 126 ) userLen = 126;
|
|
// copy it over into us
|
|
//memcpy ( m_user , user , userLen );
|
|
// NULL terminate
|
|
//m_user[userLen] = '\0';
|
|
// first byte is username size
|
|
*dst++ = userLen+1;
|
|
// then the username
|
|
memcpy ( dst , user , userLen );
|
|
dst += userLen;
|
|
// and finall null termination
|
|
*dst++ = '\0';
|
|
// skip quote and comma
|
|
p+=2;
|
|
|
|
// now the datasize
|
|
//long m_dataSize = atoi(p);
|
|
// skip till comma
|
|
//while ( p < pend && *p != ',' ) p++;
|
|
// error?
|
|
//if ( p == pend ) return 0;
|
|
// skip comma
|
|
//p++;
|
|
|
|
// that is the time stamp in canonical form
|
|
// skip till comma
|
|
while ( p < pend && *p != ',' ) p++;
|
|
// error?
|
|
if ( p == pend ) return 0;
|
|
// skip comma
|
|
p++;
|
|
|
|
// save start
|
|
char *ts = p;
|
|
// skip until comma again
|
|
while ( p < pend && *p != ',' ) p++;
|
|
// error?
|
|
if ( p == pend ) return 0;
|
|
// this is the timestamp in seconds since epoch
|
|
m_timestamp = atoi(ts);
|
|
// skip comma
|
|
p++;
|
|
|
|
// ip address as text
|
|
char *ips = p;
|
|
// skip until comma again
|
|
while ( p < pend && *p != ',' ) p++;
|
|
// error?
|
|
if ( p == pend ) return 0;
|
|
// convert it to binary
|
|
m_ip = atoip ( ips , p - ips );
|
|
// skip comma
|
|
p++;
|
|
|
|
// get the tag identifier
|
|
//m_tagId = atol(p);
|
|
//sscanf ( p , "%lu,",&m_tagId);
|
|
//long long big = atoll(p);
|
|
//m_tagId = (long)big;
|
|
// skip until comma again
|
|
//while ( p < pend && *p != ',' ) p++;
|
|
// error?
|
|
//if ( p == pend ) return 0;
|
|
// skip comma
|
|
//p++;
|
|
|
|
//
|
|
// BEGIN HACK
|
|
//
|
|
// as a hack for now, override this, because before we were not 100%
|
|
// strings as tags, we had single byte values being printed out as
|
|
// strings of 3 bytes
|
|
//char *e = p;
|
|
//while ( e < pend && ! is_wspace_a(*e) ) e++;
|
|
//if ( e > pend ) return 0;
|
|
//m_dataSize = e - p;
|
|
// add in a \0
|
|
//m_dataSize++;
|
|
|
|
//
|
|
// END HACK
|
|
//
|
|
|
|
// . now is the data
|
|
// . return # of chars scanned in "p"
|
|
p += setDataFromBuf ( p , pend );
|
|
|
|
// . sanity check
|
|
// . all tags must be NULL terminated now
|
|
if ( m_buf[m_bufSize-1] != '\0' ) {char *xx=NULL; *xx=0; }
|
|
|
|
// we reset this since we now require that all tags are NULL terminated
|
|
// strings
|
|
//m_tagId = hash32 ( (char *)this,(long)sizeof(Tag)+m_dataSize , 0 );
|
|
// 0 is not valid
|
|
//if ( m_tagId == 0 ) m_tagId = 1;
|
|
|
|
// return how many bytes we read
|
|
return p - start;
|
|
}
|
|
|
|
// . return # of chars scanned in "p"
|
|
// . return 0 on error
|
|
long Tag::setDataFromBuf ( char *p , char *pend ) {
|
|
// string are special
|
|
//if ( isTagTypeString ( m_type ) ) {
|
|
// skip over username in the buffer to point to where to put tag data
|
|
char *dst = m_buf + *m_buf + 1;
|
|
// stop at space of
|
|
memcpy(dst,p,pend-p);
|
|
// advance
|
|
dst += (pend-p);
|
|
// update
|
|
m_bufSize = dst - m_buf;
|
|
// should be end delimter
|
|
char c = m_buf[m_bufSize-1];
|
|
// sanity check
|
|
if ( c && ! isspace(c) ) { char *xx=NULL;*xx=0; }
|
|
// strings are always NULL terminated, the datasize should
|
|
// include the NULL termination
|
|
m_buf[m_bufSize-1]='\0';
|
|
// we basically insert the \0, and *p should point to the space
|
|
// right after the string...! so return m_dataSize - 1
|
|
return m_bufSize - 1;
|
|
/*
|
|
}
|
|
// save it to count
|
|
char *start = p;
|
|
// print as decimal if just 1 byte
|
|
if ( m_dataSize == 1 ) {
|
|
long v = atoi(p);
|
|
if ( v > 256 ) { char *xx=NULL;*xx=0; }
|
|
m_data[0] = v;
|
|
// skip till whitespace or end
|
|
while ( p < pend && isdigit(*p) ) p++;
|
|
return p - start;
|
|
}
|
|
// skip 0x
|
|
if ( *p!='0' || *(p+1)!='x' ) { char *xx=NULL;*xx=0; }
|
|
p += 2;
|
|
// convert hexadecimal string into binary
|
|
long bytesStored = hexToBinary ( p , pend , m_data , false );
|
|
// sanity check
|
|
if ( bytesStored != m_dataSize ) { char*xx=NULL;*xx=0;}
|
|
// advance p, each byte is two characters
|
|
p += bytesStored * 2;
|
|
// return # of bytes in "p" we scanned
|
|
return p - start;
|
|
*/
|
|
}
|
|
|
|
// . map one ascii hex digit (0-9, a-f, A-F) to its value 0-15
// . returns -1 if "c" is not a hex digit
static int hexDigitValue ( unsigned char c ) {
	if ( c >= '0' && c <= '9' ) return c - '0';
	if ( c >= 'a' && c <= 'f' ) return c - 'a' + 10;
	if ( c >= 'A' && c <= 'F' ) return c - 'A' + 10;
	return -1;
}

// . convert the hex text in [src,srcEnd) into bytes stored at "dst"
// . each pair of hex digits makes one byte, first digit is the high nibble
// . if "decrement" is true the bytes are stored at dst, dst-1, dst-2, ...
//   otherwise at dst, dst+1, dst+2, ...
// . stops at the first char that is not a hex digit. if that char is the
//   second digit of a pair, the lone first digit's value is left in *dst
//   but that byte is not counted.
// . if the text ends right after the first digit of a pair we core, just
//   like the other sanity checks in this file
// . returns the # of complete bytes stored. this is never negative now;
//   it used to come back negative when "decrement" was true.
long hexToBinary ( char *src , char *srcEnd , char *dst , bool decrement ) {
	// keep tabs on how many bytes we store into "dst"
	long numStored = 0;
	while ( src < srcEnd ) {
		// get FIRST hex digit, the high nibble
		int hi = hexDigitValue ( *(unsigned char *)src );
		if ( hi < 0 ) break;
		src++;
		// store it in the destination right away, as before
		*dst = (char)hi;
		// sanity check, need one more char FOR SURE!
		if ( src >= srcEnd ) { char*xx=NULL;*xx=0;}
		// get the SECOND hex digit of this byte, the low nibble
		int lo = hexDigitValue ( *(unsigned char *)src );
		if ( lo < 0 ) break;
		src++;
		// shift the first guy up 4 bits and or in the new guy
		*dst = (char)( ( hi << 4 ) | lo );
		// point to next byte now
		if ( decrement ) dst--;
		else             dst++;
		numStored++;
	}
	return numStored;
}
|
|
|
|
|
|
// . append this tag's data to "sb", one character at a time
// . stops at the first \0 or after getTagDataSize() bytes, whichever
//   comes first
// . always returns true
bool Tag::printDataToBuf ( SafeBuf *sb ) {
	char *data     = getTagData();
	long  dataSize = getTagDataSize();
	// . because of a bug of not appending the \0 and incrementing the
	//   data size when we should have, the data may not be terminated,
	//   so we can not just print it as a string
	// . check the index BEFORE reading data[i] so we never read one
	//   byte past the end of such unterminated data
	for ( long i = 0 ; i < dataSize && data[i] ; i++ )
		sb->safePrintf ( "%c" , data[i] );
	return true;
}
|
|
|
|
// . append this tag to "sb" as a run of cgi parameters:
//   <type>.user=<user>&<type>.time=<timestamp>&<type>.ip=<ip>&<type>.data=<data>
// . returns what printDataToBuf() returns
bool Tag::printToBufAsAddRequest ( SafeBuf *sb ) {
	// every parameter is prefixed with the tag's type name
	char *name = getTagStrFromType ( m_type );
	// who added it
	sb->safePrintf ( "%s.user=%s" , name , getUser() );
	// when it was added
	sb->safePrintf ("&%s.time=%li", name, m_timestamp );
	// from which ip
	sb->safePrintf("&%s.ip=%s",name,iptoa(m_ip));
	// and the data itself goes last
	sb->safePrintf("&%s.data=",name);
	return printDataToBuf ( sb );
}
|
|
|
|
// . append this tag to "sb" as a <tag> xml element with <name>, <user>,
//   <timestamp>, <ip> and <score> children
// . the tag data is what goes inside <score>
// . returns false only if printDataToBuf() returns false
bool Tag::printToBufAsXml ( SafeBuf *sb ) {
	char *name = getTagStrFromType ( m_type );
	// open the element, print the tag name and start the user
	sb->safePrintf ("\t\t<tag>\n\t\t\t<name>%s</name>\n\t\t\t<user>%s",
			name,getUser());
	// close the user, then the time the tag was added
	sb->safePrintf("</user>\n\t\t\t<timestamp>%li</timestamp>\n",
		       m_timestamp);
	// the ip it was added from
	sb->safePrintf("\t\t\t<ip>%s</ip>\n",iptoa(m_ip));
	// the data
	sb->safePrintf("\t\t\t<score>");
	bool ok = printDataToBuf ( sb );
	if ( ! ok ) return false;
	sb->safePrintf("</score>\n\t\t</tag>");
	return true;
}
|
|
|
|
//if ( ! sb->safePrintf("\t\t<eventTagFromTagdb>"
|
|
// "<![CDATA[") )
|
|
// . append this tag to "sb" as an <eventTagdbTag> xml element
// . user, ip, tag name and data are each wrapped in a CDATA section
// . returns false only if printDataToBuf() returns false
bool Tag::printToBufAsXml2 ( SafeBuf *sb ) {
	char *name = getTagStrFromType ( m_type );
	// everything up to and including the opening of the data CDATA
	sb->safePrintf ("\t\t<eventTagdbTag>\n"
			// who added the tag:
			"\t\t\t<addedBy><![CDATA[%s]]></addedBy>\n"
			// when tag was added:
			"\t\t\t<addedTimestamp>%lu</addedTimestamp>\n"
			// ip added from
			"\t\t\t<addedFromIP><![CDATA[%s]]></addedFromIP>\n"
			// name of the tag:
			"\t\t\t<name><![CDATA[%s]]></name>\n"
			// the tag data
			"\t\t\t<data><![CDATA[",
			getUser(),
			m_timestamp,
			iptoa(m_ip),
			name);
	// the data itself
	bool ok = printDataToBuf ( sb );
	if ( ! ok ) return false;
	// close the CDATA and the element
	sb->safePrintf("]]></data>\n"
		       "\t\t</eventTagdbTag>\n");
	return true;
}
|
|
|
|
// . append this tag to "sb" as one html table row of two cells
// . first cell is "prefix", second holds the tag name, value, user,
//   time (readable local time plus raw timestamp) and ip
// . returns false only if printDataToBuf() returns false
bool Tag::printToBufAsHtml ( SafeBuf *sb , char *prefix ) {
	char *name = getTagStrFromType ( m_type );
	// open the row: prefix cell, then the tag name in bold
	sb->safePrintf ("<tr><td>%s</td><td><b>%s</b>", prefix, name);
	// the value, also in bold
	sb->safePrintf(" value=<b>");
	if ( ! printDataToBuf ( sb ) ) return false;
	// who added it
	sb->safePrintf("</b> user=%s time=",getUser());
	// when it was added
	time_t addedTime = m_timestamp;
	char   dateStr[100];
	strftime ( dateStr , 100 , "%b-%d-%Y-%H:%M:%S" ,
		   localtime ( &addedTime ) );
	sb->safePrintf("%s(%lu)",dateStr,m_timestamp);
	// the ip it was added from
	sb->safePrintf(" ip=%s",iptoa(m_ip));
	// close the row
	sb->safePrintf("</td></tr>\n");
	return true;
}
|
|
|
|
// . append this tag to "sb" in the form "<type>:<data> "
// . note the single trailing space
// . returns false only if printDataToBuf() returns false
bool Tag::printToBufAsTagVector ( SafeBuf *sb ) {
	// tag name and the colon
	char *name = getTagStrFromType ( m_type );
	sb->safePrintf("%s:",name);
	// then the data
	bool ok = printDataToBuf ( sb );
	if ( ! ok ) return false;
	// then the separator
	sb->safePrintf(" ");
	return true;
}
|
|
|
|
bool Tag::isType ( char *t ) {
|
|
long h = hash32n ( t );
|
|
return (m_type == h);
|
|
}
|
|
|
|
|
|
|
|
// . a new TagRec starts out with no list ptrs
TagRec::TagRec ( ) {
	this->m_numListPtrs = 0;
}
|
|
|
|
void TagRec::constructor ( ) {
|
|
m_numListPtrs = 0;
|
|
// run a constructor on the lists
|
|
for ( long i = 0 ; i < MAX_TAGDB_REQUESTS ; i++ ) {
|
|
m_lists[i].constructor();//m_alloc = NULL;
|
|
//m_lists[i].m_allocSize = 0;
|
|
}
|
|
}
|
|
|
|
// . the destructor just hands off to reset()
TagRec::~TagRec ( ) {
	this->reset();
}
|
|
|
|
void TagRec::reset ( ) {
|
|
m_numListPtrs = 0;
|
|
for ( long i = 0 ; i < MAX_TAGDB_REQUESTS ; i++ )
|
|
m_lists[i].freeList();
|
|
}
|
|
|
|
// . look up a tag by its type name
// . converts the name to a numeric type and defers to getTag2()
Tag *TagRec::getTag ( char *tagTypeStr ) {
	return getTag2 ( getTagTypeFromStr ( tagTypeStr ) );
}
|
|
|
|
// . return the first tag whose m_type is "tagType"
// . tags of type TT_DUP are never returned
// . returns NULL if there is no such tag
Tag *TagRec::getTag2 ( long tagType ) {
	// walk every tag, stop at the first acceptable match
	for ( Tag *t = getFirstTag() ; t ; t = getNextTag ( t ) ) {
		bool isMatch = ( t->m_type == tagType );
		if ( isMatch && t->m_type != TT_DUP ) return t;
	}
	// no luck
	return NULL;
}
|
|
|
|
// . functions to act on a site "tag buf", like that in Msg16::m_tagRec
|
|
// . first 2 bytes is size, 2nd two bytes is # of tags, then the tags
|
|
long TagRec::getLong ( char *tagTypeStr,
|
|
long defalt ,
|
|
Tag **bookmark ,
|
|
long *timestamp ,
|
|
char **user ) {
|
|
long tagType = getTagTypeFromStr ( tagTypeStr );
|
|
return getLong ( tagType ,
|
|
defalt ,
|
|
bookmark ,
|
|
timestamp ,
|
|
user );
|
|
}
|
|
|
|
long TagRec::getLong ( long tagType ,
|
|
long defalt ,
|
|
Tag **bookmark ,
|
|
long *timestamp ,
|
|
char **user ) {
|
|
// start here
|
|
Tag *tag ;
|
|
if ( ! bookmark ) tag = getFirstTag();
|
|
else tag = getNextTag ( *bookmark );
|
|
// loop over all tags in the buf
|
|
for ( ; tag ; tag = getNextTag ( tag ) ) {
|
|
// skip if not a match
|
|
if ( tag->m_type != tagType ) continue;
|
|
// skip dups
|
|
if ( tag->m_type == TT_DUP ) continue;
|
|
// get the value as a long
|
|
long score = 0;
|
|
// the size
|
|
char *data = tag->getTagData();
|
|
long dataSize = tag->getTagDataSize();
|
|
//long size = m_dataSize;
|
|
// if ends in NULL trunc it
|
|
if ( data[dataSize-1] == '\0' ) dataSize--;
|
|
// trunc it
|
|
//if ( size > 4 ) size = 4;
|
|
// convert string to value, MUST be signed!!! the data
|
|
// should inclue a \0
|
|
score = atol2(data,dataSize);
|
|
// if only a single byte.need to preserve negatives (twos comp)
|
|
//if ( size == 1 ) score = (long)tag->m_data[0];
|
|
//else if ( size == 2 ) score = (long)*((short *)tag->m_data);
|
|
//else memcpy ( &score , tag->m_data , size );
|
|
// bookmark, et al
|
|
if ( bookmark ) *bookmark = tag;
|
|
if ( timestamp ) *timestamp = tag->m_timestamp;
|
|
if ( user ) *user = tag->getUser();
|
|
return score;
|
|
}
|
|
// not found
|
|
return defalt;
|
|
}
|
|
|
|
long long TagRec::getLongLong ( char *tagTypeStr,
|
|
long long defalt ,
|
|
Tag **bookmark ,
|
|
long *timestamp ,
|
|
char **user ) {
|
|
long tagType = getTagTypeFromStr ( tagTypeStr );
|
|
// start here
|
|
Tag *tag ;
|
|
if ( ! bookmark ) tag = getFirstTag();
|
|
else tag = getNextTag ( *bookmark );
|
|
// loop over all tags in the buf
|
|
for ( ; tag ; tag = getNextTag ( tag ) ) {
|
|
// skip if not a match
|
|
if ( tag->m_type != tagType ) continue;
|
|
// skip dups
|
|
if ( tag->m_type == TT_DUP ) continue;
|
|
// get the value as a long
|
|
long long score = 0;
|
|
// the size
|
|
char *data = tag->getTagData();
|
|
long dataSize = tag->getTagDataSize();
|
|
// if ends in NULL trunc it
|
|
if ( data[dataSize-1] == '\0' ) dataSize--;
|
|
// trunc it
|
|
//if ( size > 8 ) size = 8;
|
|
// now everything is a string
|
|
score = atoll2(data,dataSize);
|
|
// store it
|
|
//memcpy ( &score , tag->m_data , size );
|
|
// bookmark, et al
|
|
if ( bookmark ) *bookmark = tag;
|
|
if ( timestamp ) *timestamp = tag->m_timestamp;
|
|
if ( user ) *user = tag->getUser();
|
|
return score;
|
|
}
|
|
// not found
|
|
return defalt;
|
|
}
|
|
|
|
char *TagRec::getString ( char *tagTypeStr,
|
|
char *defalt ,
|
|
long *size ,
|
|
Tag **bookmark ,
|
|
long *timestamp ,
|
|
char **user ) {
|
|
long tagType = getTagTypeFromStr ( tagTypeStr );
|
|
// start here
|
|
Tag *tag ;
|
|
if ( ! bookmark ) tag = getFirstTag();
|
|
else tag = getNextTag ( *bookmark );
|
|
// loop over all tags in the buf
|
|
for ( ; tag ; tag = getNextTag ( tag ) ) {
|
|
// skip if not a match
|
|
if ( tag->m_type != tagType ) continue;
|
|
// skip dups
|
|
if ( tag->m_type == TT_DUP ) continue;
|
|
// want size? includes \0 probably
|
|
if ( size ) *size = tag->getTagDataSize();//m_dataSize;
|
|
// bookmark, et al
|
|
if ( bookmark ) *bookmark = tag;
|
|
if ( timestamp ) *timestamp = tag->m_timestamp;
|
|
if ( user ) *user = tag->getUser();
|
|
// return it
|
|
return tag->getTagData();//m_data;
|
|
}
|
|
// not found
|
|
return defalt;
|
|
}
|
|
|
|
/*
|
|
// add a special tag with null m_data. this tells Msg9a to delete
|
|
// all tags of this tag type before adding any other tags of this type
|
|
// that we might have. it is basically a "negative" tag.
|
|
bool TagRec::addDelTag ( char *tagTypeStr ) {
|
|
return addTag ( tagTypeStr ,
|
|
0 , // timestamp
|
|
NULL , // user
|
|
0 , // ip
|
|
NULL , // data
|
|
0 );// dataSize
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool TagRec::addTag ( char *tagTypeStr,
|
|
long timestamp ,
|
|
char *user ,
|
|
long ip ,
|
|
char *data ,
|
|
long dataSize ) {
|
|
// get the tagType
|
|
long tagType = getTagTypeFromStr ( tagTypeStr );
|
|
// breach check
|
|
if ( dataSize + sizeof(Tag) > MAX_TAGREC_SIZE ) {
|
|
g_errno = EBUFTOOSMALL;
|
|
return log("tagdb: no room to add tag");
|
|
}
|
|
// the Tag::m_dataSize is only 2 bytes... NOT ANYMORE, MDW
|
|
if ( dataSize < 0 ) { // >= 65536 ) {
|
|
g_errno = EBADENGINEER;
|
|
return log("tagdb: tag dataSize of %li is >= 65536. "
|
|
"Bad value.", dataSize);
|
|
}
|
|
// sanity check -- no binary chars allowed, must all be strings!
|
|
// BUT they can have an empty string (i.e. just \0)
|
|
if ( dataSize == 1 && data[0] < 9 && data[0] >= 0 && data[0] ) {
|
|
char *xx=NULL;*xx=0; }
|
|
// make a tag
|
|
char buf[MAX_TAGREC_SIZE];
|
|
Tag *tag = (Tag *)buf;
|
|
// fill it in
|
|
tag->m_type = tagType;
|
|
tag->m_timestamp = timestamp;
|
|
tag->m_ip = ip;
|
|
tag->m_dataSize = dataSize;
|
|
// dummy value for now
|
|
tag->m_tagId = 0;
|
|
// careful!
|
|
if ( sizeof(Tag) + dataSize + 10 > MAX_TAGREC_SIZE ) {
|
|
g_errno = EBUFTOOSMALL;
|
|
return log("tagdb: no room to add tag data");
|
|
}
|
|
// store user into special buffer
|
|
long ulen = 0;
|
|
if ( user ) {
|
|
ulen = gbstrlen(user);
|
|
if ( ulen > 7 ) ulen = 7;
|
|
}
|
|
memset ( tag->m_user , 0 , 8 );
|
|
memcpy ( tag->m_user , user , ulen );
|
|
// store data now too
|
|
memcpy ( tag->m_data , data , dataSize );
|
|
// NULL terminate if they did not! now all tag are strings and must
|
|
// be NULL terminated.
|
|
if ( data && tag->m_data[dataSize-1] ) {
|
|
tag->m_data[dataSize] = '\0';
|
|
dataSize++;
|
|
tag->m_dataSize++;
|
|
}
|
|
// the id is the hash for now (MDW)
|
|
tag->m_tagId = hash32 ( (char *)tag,(long)sizeof(tag)+dataSize , 0 );
|
|
// 0 is not valid
|
|
if ( tag->m_tagId == 0 ) tag->m_tagId = 1;
|
|
// now add that tag
|
|
return addTag ( tag );
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool TagRec::addTag ( Tag *TAG ) {
|
|
// . do not allow empty user
|
|
// . but "del tags" i.e. "negative tags" can have no user
|
|
if ( TAG->m_dataSize>0 && (!TAG->m_user || TAG->m_user[0] == '\0') ) {
|
|
char *xx=NULL;*xx=0;}
|
|
// sanity check
|
|
if ( TAG->m_tagId == 0 ) { char *xx=NULL;*xx=0;}
|
|
// come back up here if we did a remove operation
|
|
loop:
|
|
// start at the first tag
|
|
Tag *tag = getFirstTag();
|
|
// loop over all tags in the buf, see if we got a dup
|
|
for ( ; tag ; tag = getNextTag ( tag ) ) {
|
|
// skip if not matching id
|
|
if ( tag->m_type != TAG->m_type ) continue;
|
|
// skip if does not match user
|
|
if ( memcmp(tag->m_user,TAG->m_user,7) ) continue;
|
|
// data now has to match too, so we will allow tags of the
|
|
// same type from the same user to be added if they have
|
|
// different data now. i would only do this for strings,
|
|
// but for longs and chars i would skip this check...
|
|
// so only replace "unique" tags of the same type.
|
|
// mostly strings and embedded tag recs will be non-unquie
|
|
if ( ! isTagTypeUnique ( tag->m_type ) ) {
|
|
if ( tag->m_dataSize != TAG->m_dataSize ) continue;
|
|
if ( memcmp(tag->m_data,TAG->m_data,tag->m_dataSize))
|
|
continue;
|
|
}
|
|
// Msg8a allows multiple ST_SITE tags in order to indicate
|
|
// what sites the other tags came from (i.e. used by the
|
|
// inheritance loop below)
|
|
// MDW: This is now covered by isTagTypeUnique() above.
|
|
//if ( tag->m_type == ST_SITE ) continue;
|
|
// it does match, so replace it!
|
|
//removeTags ( tag->m_type , tag->m_user );
|
|
removeTag ( tag );
|
|
// start from the top
|
|
goto loop;
|
|
}
|
|
// . ok, we "deduped" the tag
|
|
// . point to the end of the buf
|
|
char *p = getRecEnd();
|
|
// get the max end
|
|
char *pend = getMaxEnd();
|
|
// how much do we need?
|
|
long need = TAG->getSize();
|
|
// breach?
|
|
if ( p + need > pend ) {
|
|
char *site = getString("site","unknown");
|
|
g_errno = EBUFTOOSMALL;
|
|
log("tagdb: no room to add tag to buf. tagtype=%s "
|
|
"tagsize=%li site=%s",
|
|
getTagStrFromType ( TAG->m_type ) , need , site );
|
|
//char *xx=NULL;*xx=0;
|
|
return false;
|
|
}
|
|
// store it
|
|
memcpy ( p , TAG , need );
|
|
// update our counters
|
|
m_numTags++;
|
|
m_dataSize += need;
|
|
|
|
// SPECIAL: if it was ST_SITE, set our m_key, we are an Rdb record
|
|
//if ( TAG->m_type != ST_SITE ) return true;
|
|
if ( ! TAG->isType ("site") ) return true;
|
|
|
|
// set the key
|
|
Url u;
|
|
// convenience
|
|
char *site = TAG->m_data;
|
|
long size = TAG->m_dataSize;
|
|
// sanity check
|
|
if ( site[size-1] != '\0' ) { char *xx=NULL;*xx=0; }
|
|
// do not start with http:// ! wastes space!!
|
|
if (size>=8 && strncmp(site,"http://",7)==0 ) {
|
|
log("tagdb: don't sotre http:// in tags!");
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
// do not include the NULL
|
|
u.set ( site , size - 1 );
|
|
// set our key, the endKey is our "startKey"
|
|
m_key = g_tagdb.makeKey ( &u , false ); // isDelete?
|
|
|
|
// success, return true
|
|
return true;
|
|
}
|
|
|
|
bool TagRec::removeTags ( char *tagTypeStr , char *user , long tagId ) {
|
|
long tagType = getTagTypeFromStr ( tagTypeStr );
|
|
return removeTags ( tagType , user , tagId );
|
|
}
|
|
|
|
bool TagRec::removeTags ( long tagType , char *user , long tagId ) {
|
|
loop:
|
|
// start at the first tag
|
|
Tag *tag = getFirstTag();
|
|
// loop over all tags in the rec, see if we got a dup
|
|
for ( ; tag ; tag = getNextTag ( tag ) ) {
|
|
// id if matches, that is good enough
|
|
if ( tagId && tag->m_tagId != tagId ) continue;
|
|
// skip if not matching id
|
|
if ( tagId == 0 && tag->m_type != tagType ) continue;
|
|
// skip if does not match user
|
|
if ( tagId == 0 && user && memcmp(tag->m_user,user,7))continue;
|
|
// remove that tag
|
|
removeTag ( tag );
|
|
// re do loop
|
|
goto loop;
|
|
}
|
|
// success
|
|
return true;
|
|
}
|
|
|
|
bool TagRec::removeTag ( Tag *rmTag ) {
|
|
// save this
|
|
long oldn = m_numTags;
|
|
// start at the first tag
|
|
Tag *tag = getFirstTag();
|
|
// loop over all tags in the rec, see if we got a dup
|
|
for ( ; tag ; tag = getNextTag ( tag ) ) {
|
|
// must be it
|
|
if ( tag != rmTag ) continue;
|
|
// copy to here
|
|
char *dst = (char *)tag;
|
|
// size of tag we are removing
|
|
long size = tag->getSize();
|
|
// from here
|
|
char *src = dst + size;
|
|
// end of tag buffer
|
|
char *pend = getRecEnd();
|
|
// byte to move
|
|
long move = pend - src;
|
|
// it does match, so replace it!
|
|
memcpy ( dst , src , move );
|
|
// decrement counts
|
|
m_numTags--;
|
|
m_dataSize -= size;
|
|
}
|
|
// sanity check
|
|
if ( m_numTags != oldn - 1 ) { char *xx=NULL;*xx=0; }
|
|
// success, return true
|
|
return true;
|
|
}
|
|
|
|
// add all the tags from "tagRec" to our list of tags
|
|
bool TagRec::addTags ( TagRec *tagRec ) {
|
|
|
|
// start at the first tag
|
|
Tag *tag = tagRec->getFirstTag();
|
|
// . remove any tag of any of the tag types we got in "tagRec" ?
|
|
// . deal with "negative" tags
|
|
// . used by TagRec::addDelTag() above
|
|
for ( ; tag ; tag = tagRec->getNextTag ( tag ) ) {
|
|
// if tag has m_data, skip.
|
|
if ( tag->m_data && tag->m_dataSize > 0 ) continue;
|
|
// otherwise, it is a signal to nuke all tags of this type
|
|
removeTags ( tag->m_type , NULL );
|
|
}
|
|
|
|
// start at the first tag again
|
|
tag = tagRec->getFirstTag();
|
|
// loop over all tags in the buf, see if we got a dup
|
|
for ( ; tag ; tag = tagRec->getNextTag ( tag ) ) {
|
|
// skip if it was a delete tag
|
|
if ( tag->m_dataSize <= 0 ) continue;
|
|
// do not transfer over ST_SITE tags if we already got one
|
|
//if ( tag->m_type == ST_SITE && getTag ( ST_SITE ) ) continue;
|
|
if ( tag->isType("site") && getTag("site") ) continue;
|
|
// add it, return false on error, g_errno should be set
|
|
if ( ! addTag ( tag ) ) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// add all the tags from "tagRec" to our list of tags
|
|
bool TagRec::removeTags ( TagRec *tagRec ) {
|
|
// start at the first tag
|
|
Tag *tag = tagRec->getFirstTag();
|
|
// loop over all tags in the buf, see if we got a dup
|
|
for ( ; tag ; tag = tagRec->getNextTag ( tag ) ) {
|
|
// do not remove ST_SITE tags
|
|
//if ( tag->m_type == ST_SITE ) continue;
|
|
if ( tag->isType("site") ) continue;
|
|
// add it, return false on error, g_errno should be set
|
|
if ( ! removeTags ( tag->m_type , tag->m_user ) ) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
Tag *TagRec::getNextTag ( Tag *tag ) {
|
|
if ( m_numTags == 0 ) return NULL;
|
|
if ( ! tag ) return (Tag *)m_buf;
|
|
char *tagEnd = getRecEnd();
|
|
long size = tag->getSize();
|
|
char *ret = ((char *)tag) + size;
|
|
// overboard?
|
|
if ( ret >= tagEnd ) return NULL;
|
|
return (Tag *)ret;
|
|
}
|
|
*/
|
|
|
|
// return the number of tags having the particular TagType
|
|
long TagRec::getNumTagTypes ( char *tagTypeStr ) {
|
|
long tagType = getTagTypeFromStr ( tagTypeStr );
|
|
long numTagType = 0;
|
|
// start at the first tag
|
|
Tag *tag = getFirstTag();
|
|
// loop over all tags in the buf, see if we got a dup
|
|
for ( ; tag ; tag = getNextTag ( tag ) ) {
|
|
// skip dups
|
|
if ( tag->m_type == TT_DUP ) continue;
|
|
// if there is tagType match then increment the count
|
|
if ( tag->m_type == tagType ) numTagType++;
|
|
}
|
|
return numTagType;
|
|
}
|
|
|
|
long TagRec::getNumTags ( ) {
|
|
long numTags = 0;
|
|
// start at the first tag
|
|
Tag *tag = getFirstTag();
|
|
// loop over all tags in the buf, see if we got a dup
|
|
for ( ; tag ; tag = getNextTag ( tag ) )
|
|
// skip dups
|
|
if ( tag->m_type != TT_DUP ) numTags++;
|
|
return numTags;
|
|
}
|
|
|
|
// . &tagtype%li=<tagtype>
|
|
// . &tagdata%li=<data>
|
|
// . &deltag%li=1 (to delete it)
|
|
// . set &user=mwells, etc. in cookie of HttpReqest, "r" for user
|
|
// . "this" TagRec's user, ip and timestamp will be carried over to "newtr"
|
|
// . returns false and sets g_errno on error
|
|
bool TagRec::setFromHttpRequest ( HttpRequest *r, TcpSocket *s ) {
|
|
// clear it
|
|
//reset();
|
|
// get the username from the cookie
|
|
//char *user = r->getStringFromCookie ( "username" , NULL );
|
|
//char *user = g_users.getUsername ( r );
|
|
// try from form
|
|
//if ( ! user ) user = r->getString ("username",NULL);
|
|
// if no user, don't bother!
|
|
//if ( ! user ) {
|
|
// g_errno = EBADENGINEER;
|
|
// return log("tagdb: no username supplied for modifying tagdb.");
|
|
//}
|
|
// get the user ip address
|
|
long ip = 0;
|
|
if ( s ) ip = s->m_ip;
|
|
// get the time stamp
|
|
long now = getTimeGlobal();
|
|
|
|
// . loop over all urls/sites in text area
|
|
// . no! just use single url for now
|
|
|
|
// put all urls in this buffer
|
|
SafeBuf fou;
|
|
|
|
// try from textarea if the ST_SITE was not in the tag section
|
|
long uslen;
|
|
char *us = r->getString("u",&uslen);
|
|
if ( uslen <= 0 ) us = NULL;
|
|
if ( us ) fou.safeMemcpy ( us , uslen );
|
|
|
|
// read in file, file of urls
|
|
long ufuLen;
|
|
char *ufu = r->getString("ufu",&ufuLen);
|
|
if ( ufuLen <= 0 ) ufu = NULL;
|
|
if ( us ) ufu = NULL; // exclusive
|
|
if ( ufu ) fou.fillFromFile ( ufu );
|
|
|
|
// if st->m_urls has multiple urls, this "u" is not given in the
|
|
// http request! but a filename is... and Msg9::addTags() should add
|
|
// the ST_SITE field anyway...
|
|
if ( ! ufu && ! us ) return true;
|
|
|
|
// make it null terminated since we no longer do this automatically
|
|
fou.pushChar('\0');
|
|
|
|
// normalize it
|
|
//Url u; u.set ( us , uslen );
|
|
// point to it
|
|
//char *site = u.getUrl();
|
|
// skip http + ://
|
|
//site += u.getSchemeLen() + 3;
|
|
// include the \0
|
|
//long psize = gbstrlen(p) + 1;
|
|
|
|
// loop over all tags in the TagRec to mod them
|
|
for ( long i = 0 ; ; i++ ) {
|
|
|
|
char buf[32];
|
|
sprintf ( buf , "tagtype%li",i );
|
|
char *tagTypeStr = r->getString(buf,NULL,NULL);
|
|
// if not there we are done
|
|
if ( ! tagTypeStr ) break;
|
|
|
|
// should we delete it?
|
|
sprintf ( buf , "deltag%li",i);
|
|
char *deltag = r->getString(buf,NULL,NULL);
|
|
//if ( deltag && deltag[0] ) continue;
|
|
|
|
sprintf ( buf , "taguser%li",i);
|
|
char *tagUser = r->getString( buf,NULL,"admin");//user);
|
|
//if ( tagUser && tagUser[0]==0 ) tagUser = user;
|
|
|
|
sprintf ( buf , "tagtime%li",i);
|
|
long tagTime = r->getLong(buf,now);
|
|
|
|
sprintf ( buf , "tagip%li",i);
|
|
long tagIp = r->getLong(buf,ip);
|
|
|
|
// get the value of this tag
|
|
sprintf ( buf , "tagdata%li" , i );
|
|
char *dataPtr = r->getString ( buf , NULL );
|
|
|
|
// get the tag original key
|
|
key128_t key;
|
|
sprintf ( buf , "tagn1key%li" , i );
|
|
key.n1 = r->getLongLong ( buf, 0 );
|
|
sprintf ( buf , "tagn0key%li" , i );
|
|
key.n0 = r->getLongLong ( buf, 0LL );
|
|
|
|
// if empty skip it
|
|
if ( ! dataPtr ) continue;
|
|
if ( ! dataPtr[0] ) continue;
|
|
// is it numeric? i think only ST_COMMENT is not
|
|
//char isNum = true;
|
|
// get the numeric
|
|
//long tagType = getTagTypeFromStr ( tagTypeStr );
|
|
// set "isNum" to false if not numeric
|
|
//if ( tagType == ST_COMMENT ) isNum = false;
|
|
//if ( tagType == ST_SITE ) isNum = false;
|
|
//if ( tagType == ST_META ) isNum = false;
|
|
//if ( isTagTypeString ( tagType ) ) isNum = false;
|
|
//long dataSize = 0;
|
|
// . if it is a string, like ST_COMMENT
|
|
// . include the \0
|
|
//if ( ! isNum ) dataSize = gbstrlen(dataPtr) + 1;
|
|
// everything is now a string
|
|
long dataSize = gbstrlen(dataPtr) + 1;
|
|
// if numeric store in tag buf
|
|
/*
|
|
long long data;
|
|
if ( isNum ) {
|
|
data = atoll ( dataPtr );//r->getLongLong(val,-1);
|
|
dataSize = 1;
|
|
if ( data >= 0xffLL ) dataSize = 2;
|
|
if ( data >= 0xffffLL ) dataSize = 3;
|
|
if ( data >= 0xffffffLL ) dataSize = 4;
|
|
if ( data >= 0xffffffffLL ) dataSize = 5;
|
|
if ( data >= 0xffffffffffLL ) dataSize = 6;
|
|
if ( data >= 0xffffffffffffLL ) dataSize = 7;
|
|
dataPtr = (char *)&data;
|
|
}
|
|
*/
|
|
// add to tag buf
|
|
//addTag ( tagTypeStr ,
|
|
// tagTime ,
|
|
// tagUser ,
|
|
// tagIp ,
|
|
// dataPtr ,
|
|
// dataSize );
|
|
|
|
|
|
// loop over all urls in the url file if provided
|
|
char *up = fou.getBufStart();
|
|
|
|
for ( ; ; ) {
|
|
// set url
|
|
char *urlPtr = up;
|
|
// stop if EOF or processed the one url
|
|
if ( ! urlPtr ) break;
|
|
// advance it or NULL it out
|
|
up = fou.getNextLine ( up );
|
|
// null term the url ptr
|
|
if ( up ) up[-1] = '\0';
|
|
|
|
// save buffer spot in case we have to rewind
|
|
long saved = m_sbuf.length();
|
|
|
|
// . add to tag rdb recs in safebuf
|
|
// . this pushes the rdbid as first byte
|
|
// . mdwmdwmdw
|
|
Tag *tag = m_sbuf.addTag ( urlPtr, // us, // site ,
|
|
tagTypeStr ,
|
|
tagTime ,
|
|
tagUser ,
|
|
tagIp ,
|
|
dataPtr,
|
|
dataSize ,
|
|
RDB_TAGDB,
|
|
// do not push rdbid into safebuf
|
|
false ) ;
|
|
// error?
|
|
if ( ! tag )
|
|
return false;
|
|
|
|
bool deleteOldKey = false;
|
|
|
|
// if tag has different key, delete the old one
|
|
if ( key.n1 && tag->m_key != key ) deleteOldKey = true;
|
|
|
|
// if del was marked, delete old one and do not add new one
|
|
if ( deltag && deltag[0] ) {
|
|
// rewind over the tag we were about to add
|
|
m_sbuf.setLength ( saved );
|
|
// and add as a delete
|
|
deleteOldKey = true;
|
|
}
|
|
|
|
if ( deleteOldKey ) {
|
|
// make it negative
|
|
key128_t delKey = key;
|
|
delKey.n0 &= 0xfffffffffffffffeLL;
|
|
if (! m_sbuf.safeMemcpy((char *)&delKey,
|
|
sizeof(key128_t)))
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// all done
|
|
//if ( getTag ( ST_SITE ) ) return ;
|
|
//if ( getTag("site") ) return;
|
|
|
|
// add the special ST_SITE tag
|
|
//addTag ( "site" , // ST_SITE ,
|
|
// now ,
|
|
// user ,
|
|
// ip ,
|
|
// p ,
|
|
// psize );
|
|
return true;
|
|
}
|
|
|
|
// to stdout
|
|
long TagRec::print ( ) {
|
|
SafeBuf sb;
|
|
printToBuf ( &sb );
|
|
// dump that
|
|
return fprintf(stderr,"%s\n",sb.getBufStart());
|
|
}
|
|
|
|
// . print each tag into "sb", one per line, skipping tags marked TT_DUP
// . always returns true
bool TagRec::printToBuf ( SafeBuf *sb ) {
	Tag *t = getFirstTag();
	while ( t ) {
		if ( t->m_type != TT_DUP ) {
			t->printToBuf ( sb );
			sb->pushChar('\n');
		}
		t = getNextTag ( t );
	}
	return true;
}
|
|
|
|
// . return size of characters scanned from "p"
|
|
// . returns 0 on error
|
|
/*
|
|
long TagRec::setFromBuf ( char *p , char *pend ) {
|
|
// remember the start
|
|
char *start = p;
|
|
// scan in the key
|
|
//if ( strncmp(p,"k.n1=0x",7) != 0 ) return 0;
|
|
// skip key stuff
|
|
//p += 7;
|
|
// clear our key
|
|
//m_key.setToMin();
|
|
// read in the key
|
|
//key_t k;
|
|
//sscanf(p,"k.n1=0x%08lx k.n0=0x%016llx ",&k.n1,&k.n0);
|
|
|
|
// now do it the fast way and compare the results!
|
|
//p += 7 ;
|
|
//hexToBinary ( p , pend , ((char *)&m_key.n1)+3 , true );
|
|
//p += 8 + 8;
|
|
//hexToBinary ( p , pend , ((char *)&m_key.n0)+7 , true );
|
|
// test it
|
|
//if ( m_key.n1 != k.n1 || m_key.n0 != k.n0 ) { char *xx=NULL; *xx=0; }
|
|
|
|
//p = strstr ( p , " version=");
|
|
// error?
|
|
//if ( ! p ) return 0;
|
|
// skip " version="
|
|
//p += 9;
|
|
// get version
|
|
//m_version = atoi(p);
|
|
|
|
// skip p until space
|
|
//while ( p < pend && *p != ' ' ) p++;
|
|
// error?
|
|
//if ( p >= pend ) return 0;
|
|
// skip the space -- NO! tag parser wants the space
|
|
//p++;
|
|
|
|
// point to the where we should serialize the tags into
|
|
//char *tagPtr = m_buf;
|
|
|
|
char tbuf[5000];
|
|
|
|
while ( p < pend ) {
|
|
// now we should be pointing to the tag
|
|
Tag *tag = (Tag *)tbuf;
|
|
// serialize the tag from the buf
|
|
long asciiBytesRead = tag->setFromBuf ( p , pend );
|
|
// if bad this is 0
|
|
if ( asciiBytesRead == 0 ) return 0;
|
|
// store tag into our safebuf. return 0 with g_errno set on err
|
|
// . mdwmdwmdw
|
|
if ( ! m_sbuf.addTag ( tag ) ) return 0;
|
|
// point to next tag to read into our binary buffer
|
|
//p += asciiBytesRead;
|
|
// inc our ptr to point to next tag if it exists
|
|
//tagPtr += tag->getSize();
|
|
// inc our count in the TagRec
|
|
//m_numTags++;
|
|
// adjust our tag buffer size, TagRec::m_dataSize
|
|
//m_dataSize = tagPtr - m_buf;
|
|
// hey, it includes the other crap too!
|
|
// it includes m_numTags + m_version, see Tagdb.h
|
|
//m_dataSize += 2 + 1;
|
|
|
|
}
|
|
|
|
// clear all lists
|
|
//resetLists();
|
|
// now make list point to that
|
|
//m_lists[0].m_list = m_sbuf.getBufStart();
|
|
//m_lists[0].m_listSize = m_sbuf.length();
|
|
//m_lists[0].m_listAllocSize = 0; // do not free it!
|
|
//m_numLists = 0;
|
|
|
|
//return getSize();
|
|
return p - start;
|
|
}
|
|
*/
|
|
|
|
// . point our first list at the "bufSize" bytes of tag records in "p"
// . the list is marked as not owning the data (m_ownData is false)
// . always returns true
bool TagRec::setFromBuf ( char *p , long bufSize ) {

	// we have exactly one list, our first one
	m_numListPtrs = 1;
	m_listPtrs[0] = &m_lists[0];

	// where the data is
	m_listPtrs[0]->m_list     = p;
	m_listPtrs[0]->m_listSize = bufSize;
	m_listPtrs[0]->m_listEnd  = p + bufSize;
	// do not free it i guess
	m_listPtrs[0]->m_ownData  = false;

	// what the data looks like: full 128-bit keys, data size not fixed
	m_listPtrs[0]->m_ks            = sizeof(key128_t);
	m_listPtrs[0]->m_useHalfKeys   = false;
	m_listPtrs[0]->m_fixedDataSize = -1;
	m_listPtrs[0]->m_lastKeyIsValid = false;

	return true;
}
|
|
|
|
// . copy every tag not marked TT_DUP into "dst"
// . returns false as soon as dst.addTag() fails, true otherwise
bool TagRec::serialize ( SafeBuf &dst ) {
	Tag *t = getFirstTag();
	while ( t ) {
		if ( t->m_type != TT_DUP && ! dst.addTag ( t ) ) return false;
		t = getNextTag ( t );
	}
	return true;
}
|
|
|
|
// . have every tag not marked TT_DUP print itself into "sb" as an
//   add request
// . always returns true
bool TagRec::printToBufAsAddRequest ( SafeBuf *sb ) {
	Tag *t = getFirstTag();
	while ( t ) {
		if ( t->m_type != TT_DUP ) t->printToBufAsAddRequest ( sb );
		t = getNextTag ( t );
	}
	return true;
}
|
|
|
|
// . have every tag not marked TT_DUP print itself into "sb" as xml
// . always returns true
bool TagRec::printToBufAsXml ( SafeBuf *sb ) {
	Tag *t = getFirstTag();
	while ( t ) {
		if ( t->m_type != TT_DUP ) t->printToBufAsXml ( sb );
		t = getNextTag ( t );
	}
	return true;
}
|
|
|
|
// . have every tag not marked TT_DUP print itself into "sb" as html
// . "prefix" is handed through to each tag unchanged
// . always returns true
bool TagRec::printToBufAsHtml ( SafeBuf *sb , char *prefix ) {
	Tag *t = getFirstTag();
	while ( t ) {
		if ( t->m_type != TT_DUP ) t->printToBufAsHtml (sb,prefix);
		t = getNextTag ( t );
	}
	return true;
}
|
|
|
|
// . have every tag not marked TT_DUP print itself into "sb" as a
//   tag vector
// . always returns true
bool TagRec::printToBufAsTagVector ( SafeBuf *sb ) {
	Tag *t = getFirstTag();
	while ( t ) {
		if ( t->m_type != TT_DUP ) t->printToBufAsTagVector ( sb );
		t = getNextTag ( t );
	}
	return true;
}
|
|
|
|
// . find the first tag of the type named "tagTypeStr" whose data is
//   exactly the "dataSize" bytes at "dataPtr"
// . tags marked TT_DUP are ignored
// . returns NULL if there is no such tag
Tag *TagRec::getTag ( char *tagTypeStr , char *dataPtr , long dataSize ) {
	// the tag type numerically
	long wanted = getTagTypeFromStr ( tagTypeStr );
	for ( Tag *t = getFirstTag() ; t ; t = getNextTag ( t ) ) {
		// wrong type, or a dup
		if ( t->m_type != wanted || t->m_type == TT_DUP ) continue;
		// data must be the same length...
		if ( t->getTagDataSize() != dataSize ) continue;
		// ...and the same bytes
		if ( memcmp ( t->getTagData() , dataPtr , dataSize ) == 0 )
			return t;
	}
	// no match
	return NULL;
}
|
|
|
|
//
// bits for TagDesc::m_flags
//

// the data of the tag is a string
#define TDF_STRING  (1<<0)

// more than one tag of this type from the same user may live in the
// same TagRec
#define TDF_ARRAY   (1<<1)

// . do not index tags of this type
// . the original note says an indexed tag is indexed as
//   gbtagjapanese:<score>, and also as "gbtagjapanese" if score != 0
// . TODO: actually use this
#define TDF_NOINDEX (1<<2)
|
|
|
|
// . describes one supported tag type
// . member order matters: s_tagDesc[] is filled in with { name, flags, type }
struct TagDesc {
	// name of the tag type, like "sitepathdepth"
	char *m_name;
	// TDF_* bits
	char  m_flags;
	// numeric tag type, computed from m_name by Tagdb::setHashTable()
	long  m_type;
};
|
|
|
|
// map the tags to names
|
|
static TagDesc s_tagDesc[] = {
|
|
|
|
// data for the "lang" tag is 2 char language id followed by
|
|
// a comma then a score from 1 to 100 to indicate percentage.
|
|
// Allow multiple "lang" tags in one tagrec.
|
|
{"rootlang" ,TDF_STRING,0},
|
|
|
|
// title tag and incoming link text of the root page is stored here
|
|
// for determining default venue addresses
|
|
{"roottitles" ,TDF_STRING|TDF_NOINDEX,0},
|
|
//{"rootlangid" ,TDF_STRING|TDF_NOINDEX,0},
|
|
|
|
// for addresses of the website, can be multiple
|
|
{"venueaddress" ,TDF_STRING|TDF_ARRAY|TDF_NOINDEX,0},
|
|
|
|
/*
|
|
{"langunknown" ,0x00,0},
|
|
{"english" ,0x00,0},
|
|
{"french" ,0x00,0},
|
|
{"spanish" ,0x00,0},
|
|
{"russian" ,0x00,0},
|
|
{"turkish" ,0x00,0},
|
|
{"japanese" ,0x00,0},
|
|
{"chinesetraditional" ,0x00,0},
|
|
{"chinesesimplified" ,0x00,0},
|
|
{"korean" ,0x00,0},
|
|
{"german" ,0x00,0},
|
|
{"dutch" ,0x00,0},
|
|
{"italian" ,0x00,0},
|
|
{"finnish" ,0x00,0},
|
|
{"swedish" ,0x00,0},
|
|
{"norwegian" ,0x00,0},
|
|
{"portuguese" ,0x00,0},
|
|
{"vietnamese" ,0x00,0},
|
|
{"arabic" ,0x00,0},
|
|
{"hebrew" ,0x00,0},
|
|
{"indonesian" ,0x00,0},
|
|
{"greek" ,0x00,0},
|
|
{"thai" ,0x00,0},
|
|
{"hindi" ,0x00,0},
|
|
{"bengala" ,0x00,0},
|
|
{"polish" ,0x00,0},
|
|
{"tagalog" ,0x00,0},
|
|
*/
|
|
|
|
/*
|
|
{"spam" ,0x00,0},
|
|
{"retail" ,0x00,0},
|
|
{"business" ,0x00,0},
|
|
{"adult" ,0x00,0},
|
|
{"forum" ,0x00,0},
|
|
{"blog" ,0x00,0},
|
|
{"news" ,0x00,0},
|
|
{"reference" ,0x00,0},
|
|
{"directory" ,0x00,0},
|
|
{"searchengine" ,0x00,0},
|
|
{"domainsquatter" ,0x00,0},
|
|
{"platform" ,0x00,0},
|
|
{"travel" ,0x00,0},
|
|
{"audio" ,0x00,0},
|
|
{"video" ,0x00,0},
|
|
{"socialnetworking" ,0x00,0},
|
|
*/
|
|
|
|
{"manualban" ,0x00,0},
|
|
{"manualfilter" ,0x00,0},
|
|
// clock hashes are now stored in indexdb
|
|
//{"clock" ,0x00,0},
|
|
{"dateformat" ,0x00,0}, // 1 = american, 2 = european
|
|
|
|
{"ruleset" ,0x00,0},
|
|
//{"filtered" ,0x00,0},
|
|
//{"compromised" ,0x00,0},
|
|
//{"good" ,0x00,0},
|
|
{"deep" ,0x00,0},
|
|
//{"quality" ,0x00,0},
|
|
//{"dmozcatid" ,TDF_NOINDEX,0},
|
|
{"comment" ,TDF_STRING|TDF_NOINDEX,0},
|
|
// we now index this. really we need it for storing into title rec.
|
|
{"site" ,TDF_STRING|TDF_ARRAY,0},
|
|
|
|
//{"meta" ,TDF_STRING,0},
|
|
|
|
// . website contact info
|
|
// . used by ContactInfo.cpp
|
|
// . TDB_ARRAY means not to "overwrite" even if username is the same
|
|
// . a website can have multiple street addresses, etc.
|
|
// . the "lines" of an single street address are separated by ';'
|
|
// instead of \n to maintain tagdb dump output readability
|
|
//{"streetaddress" ,TDF_ARRAY,0},
|
|
//{"phonenumber" ,TDF_ARRAY,0},
|
|
//{"faxnumber" ,TDF_ARRAY,0},
|
|
//{"emailaddress" ,TDF_ARRAY,0},
|
|
// . this tag can contain multiple zipcodes, separated by ' '
|
|
// . we do index these for local search
|
|
//{"zipcodes" ,0x00,0},
|
|
// . similar to zip codes, separated by ' '
|
|
// . TODO: we need to fix Places.cpp to label the places for these tags
|
|
// but for now we can do gbtagstreetaddress:munich and hope for
|
|
// the best, although we will get websites on "munich st.!", but
|
|
// maybe you can combine that with gbtagstreetaddress:germany
|
|
//{"countries", ,0x00,0},
|
|
//{"cities", ,0x00,0},
|
|
// this is "0" or "1". if it is "0" then the date lets XmlDoc.cpp know
|
|
// when we last tried to get the contact info for the site
|
|
{"hascontactinfo" ,0x00,0},
|
|
// street address using ; as delimeter
|
|
{"contactaddress" ,TDF_ARRAY|TDF_NOINDEX,0},
|
|
{"contactemails" ,TDF_ARRAY|TDF_NOINDEX,0},
|
|
//{"emailaddressonsite" ,TDF_ARRAY|TDF_NOINDEX,0},
|
|
//{"emailaddressoffsite" ,TDF_ARRAY|TDF_NOINDEX,0},
|
|
{"hascontactform" ,0x00,0},
|
|
|
|
// subscribe to google's blacklist and mark the sites as this
|
|
//{"malware" ,0x00,0},
|
|
|
|
// . this is used to define INDEPENDENT subsites
|
|
// . such INDEPENDENT subsites should never inherit from this tag rec
|
|
// . it is used to handle "homesteading" sites like geocities.com
|
|
// and the like, and is automatically set by SiteGetter.cpp
|
|
// . if this is 1 then xyz.com/yyyyy/ is considered a subsite
|
|
// . if this is 2 then xyz.com/yyyyy/zzzzz/ is considered a subsite
|
|
// . if this is -1 then no subsite is found
|
|
// . this should never be 0 either
|
|
{"sitepathdepth" ,0x00,0},
|
|
|
|
// . used by XmlDoc::updateTagdb() and also used to determine
|
|
// if we should index a site in XmlDoc.cpp. to be indexed a site
|
|
// must be in google, or must have this tag type in its tag rec,
|
|
// or have some other, soon to be invented, tag
|
|
// . really this is all controlled by url filters table
|
|
// . allow multiple tags of this type from same "user"
|
|
{"authorityinlink" ,TDF_STRING|TDF_ARRAY,0},
|
|
|
|
{"pagerank" ,0x00,0},
|
|
{"ingoogle" ,0x00,0},
|
|
{"ingoogleblogs" ,0x00,0},
|
|
{"ingooglenews" ,0x00,0},
|
|
|
|
// geo location from this news site directory
|
|
{"abyznewslinks.address",0x00,0},
|
|
|
|
// we now store site pop, etc. in tagdb
|
|
{"sitenuminlinks" ,0x00,0},
|
|
{"sitenuminlinksuniqueip" ,0x00,0},
|
|
{"sitenuminlinksuniquecblock" ,0x00,0},
|
|
{"sitenuminlinkstotal" ,0x00,0},
|
|
|
|
// keep these although no longer used
|
|
{"sitepop" ,0x00,0},
|
|
{"sitenuminlinksfresh" ,0x00,0},
|
|
|
|
|
|
// . the first ip we lookup for this domain
|
|
// . this is permanent and should never change
|
|
// . it is used by Spider.cpp to assign a host for throttling
|
|
// all urls/SpiderRequests from that ip
|
|
// . so if we did change it then that would result in two hosts
|
|
// doing the throttling, really messing things up
|
|
{"firstip" ,0x00,0}
|
|
|
|
|
|
|
|
/*
|
|
{"user.id" ,0x00,0},
|
|
{"user.xml" ,TDF_STRING,0},
|
|
{"user.login" ,TDF_STRING,0},
|
|
{"user.password" ,TDF_STRING,0},
|
|
{"user.securityquestion",TDF_STRING,0},
|
|
{"user.securityanswer" ,TDF_STRING,0},
|
|
{"user.email" ,TDF_STRING,0},
|
|
{"user.firstname" ,TDF_STRING,0},
|
|
{"user.lastname" ,TDF_STRING,0},
|
|
{"user.cookie" ,TDF_STRING,0},
|
|
{"user.zipcode" ,TDF_STRING,0},
|
|
{"user.city" ,TDF_STRING,0},
|
|
{"user.state" ,TDF_STRING,0},
|
|
{"user.imageurl" ,TDF_STRING,0},
|
|
|
|
{"user.dob" ,TDF_STRING,0},
|
|
{"user.language" ,TDF_STRING,0},
|
|
{"user.creditcardname" ,TDF_STRING,0},
|
|
{"user.creditcardnum" ,TDF_STRING,0},
|
|
{"user.creditcardexp" ,TDF_STRING,0},
|
|
{"user.creditcardcode" ,TDF_STRING,0},
|
|
{"user.lastlogin" ,0x00,0},
|
|
{"user.numlogins" ,0x00,0},
|
|
{"user.openlinksnewwin" ,0x00,0},
|
|
{"user.usehttps" ,0x00,0},
|
|
{"user.maxreadhist" ,0x00,0},
|
|
{"user.maxsearchhist" ,0x00,0},
|
|
{"user.format" ,0x00,0},
|
|
{"user.acctbalance" ,0x00,0},
|
|
{"user.acctlimit" ,0x00,0},
|
|
{"user.acctsuspended" ,0x00,0},
|
|
{"user.acctbillemails" ,TDF_STRING,0},
|
|
{"user.adstopicid" ,0x00,0},
|
|
{"user.adsdailybudget" ,0x00,0},
|
|
{"user.adsdisabled" ,0x00,0},
|
|
{"user.feednumqueries" ,0x00,0},
|
|
{"user.feedcpq" ,0x00,0},
|
|
{"user.feeddailybudget" ,0x00,0},
|
|
{"user.feeddisabled" ,0x00,0},
|
|
{"user.feedpassword" ,TDF_STRING,0},
|
|
{"user.feeddailycount" ,TDF_ARRAY,0},
|
|
{"user.usertransrec" ,TDF_ARRAY,0},
|
|
{"user.userhistoryrec" ,TDF_ARRAY,0},
|
|
{"user.userpanelrec" ,TDF_ARRAY,0},
|
|
{"trans.amount" ,0x00,0},
|
|
{"trans.desc" ,TDF_STRING,0},
|
|
{"hist.wasread" ,0x00,0},
|
|
{"hist.url" ,TDF_STRING,0},
|
|
{"hist.gigabits" ,TDF_STRING,0},
|
|
{"hist.timespent" ,0x00,0},
|
|
{"panel.topcid" ,0x00,0},
|
|
{"panel.showmainstream" ,0x00,0},
|
|
{"panel.showblogs" ,0x00,0},
|
|
{"panel.showforum" ,0x00,0},
|
|
{"panel.showweb" ,0x00,0},
|
|
{"panel.showsearchbox" ,0x00,0},
|
|
{"panel.showimages" ,0x00,0},
|
|
{"panel.showvideo" ,0x00,0},
|
|
{"panel.showchatbox" ,0x00,0},
|
|
{"panel.showchatpics" ,0x00,0},
|
|
{"panel.chatboxnumlines",0x00,0},
|
|
{"panel.popsliderval" ,0x00,0},
|
|
{"panel.agesliderval" ,0x00,0},
|
|
{"panel.windowxpos" ,0x00,0},
|
|
{"panel.windowypos" ,0x00,0},
|
|
{"panel.numstories" ,0x00,0},
|
|
{"panel.storylang" ,TDF_STRING,0},
|
|
{"panel.translatelang" ,TDF_STRING,0},
|
|
{"panel.displaylang" ,TDF_STRING,0},
|
|
{"panel.filterquery" ,TDF_STRING,0},
|
|
{"panel.sendemailalerts",TDF_STRING,0},
|
|
{"chat.comment" ,TDF_STRING,0},
|
|
|
|
{"ad.topicid" ,0x00,0},
|
|
{"ad.userid" ,0x00,0},
|
|
{"ad.adid" ,0x00,0},
|
|
{"ad.title" ,TDF_STRING,0},
|
|
{"ad.text" ,TDF_STRING,0},
|
|
{"ad.url" ,TDF_STRING,0},
|
|
{"ad.keywordstring" ,TDF_STRING,0},
|
|
{"ad.dailypledge" ,0x00,0},
|
|
{"ad.disabled" ,0x00,0},
|
|
{"ad.dailyimpresscount" ,TDF_ARRAY,0},
|
|
{"ad.dailyclickcount" ,TDF_ARRAY,0}
|
|
*/
|
|
};
|
|
|
|
// . convert "domain_squatter" to ST_DOMAIN_SQUATTER
|
|
// . used by CollectionRec::getRegExpNum()
|
|
// . tagnameLen is -1 if unknown
|
|
long getTagTypeFromStr( char *tagname , long tagnameLen ) {
|
|
// this is now the hash
|
|
long tagType;
|
|
if ( tagnameLen == -1 ) tagType = hash32n ( tagname );
|
|
else tagType = hash32 ( tagname , tagnameLen );
|
|
// make sure table is valid
|
|
if ( ! s_initialized ) g_tagdb.setHashTable();
|
|
// sanity check, make sure it is a supported tag!
|
|
if ( ! s_ht.getValue ( tagType ) ) {
|
|
log("tagdb: unsupported tagname \"%s\"",tagname);
|
|
char *xx=NULL;*xx=0;
|
|
return -1;
|
|
}
|
|
return tagType;
|
|
}
|
|
|
|
// . convert ST_DOMAIN_SQUATTER to "domain_squatter"
|
|
char *getTagStrFromType ( long tagType ) {
|
|
// make sure table is valid
|
|
if ( ! s_initialized ) g_tagdb.setHashTable();
|
|
TagDesc *td = (TagDesc *)s_ht.getValue ( tagType );
|
|
// sanity check
|
|
if ( ! td ) { char *xx=NULL;*xx=0; }
|
|
// return it
|
|
return td->m_name;
|
|
}
|
|
|
|
// a global class extern'd in .h file
|
|
Tagdb g_tagdb;
|
|
Tagdb g_tagdb2;
|
|
|
|
// a fake site for Tagdb::convert()
|
|
//Tagdb g_sitedb;
|
|
|
|
//static HashTableT<long long,long> s_lockTable;
|
|
//static HashTableX s_lockTable2;
|
|
|
|
// reset rdb and Xmls
|
|
void Tagdb::reset() {
|
|
m_rdb.reset();
|
|
//s_lockTable2.reset();
|
|
}
|
|
|
|
bool Tagdb::setHashTable ( ) {
|
|
if ( s_initialized ) return true;
|
|
s_initialized = true;
|
|
// the hashtable of TagDescriptors
|
|
if ( ! s_ht.set ( 1024 ) )
|
|
return log("tagdb: Tagdb hash init failed.");
|
|
// stock it
|
|
long n = (long)sizeof(s_tagDesc)/(long)sizeof(TagDesc);
|
|
for ( long i = 0 ; i < n ; i++ ) {
|
|
TagDesc *td = &s_tagDesc[i];
|
|
char *s = td->m_name;
|
|
long slen = gbstrlen(s);
|
|
// use the same algo that Words.cpp computeWordIds does
|
|
long h = hash64Lower_a ( s , slen );
|
|
// call it a bad name if already in there
|
|
TagDesc *etd = (TagDesc *)s_ht.getValue ( h );
|
|
if ( etd )
|
|
return log("tagdb: Tag %s collides with old tag %s",
|
|
td->m_name,etd->m_name);
|
|
// set the type
|
|
td->m_type = h;
|
|
// add it
|
|
s_ht.addKey ( h , (long)td );
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool Tagdb::init ( ) {
|
|
// snity test
|
|
//if ( TAGREC_CURRENT_VERSION >= 30 ) {
|
|
// log("tagdb: fix call to convert()");
|
|
// char *xx = NULL; *xx = 0;
|
|
//}
|
|
// . what's max # of tree nodes?
|
|
// . assume avg tagdb rec size (siteUrl) is about 82 bytes we get:
|
|
// . NOTE: 32 bytes of the 82 are overhead
|
|
long maxTreeNodes = g_conf.m_tagdbMaxTreeMem / 82;
|
|
|
|
//long long pcmem = 250000000; // 250MB
|
|
// TODO: make it a biased disk page cache!
|
|
long long pcmem = 160000000; // 160MB
|
|
// turn it off for rebuilding posdb, to 10MB anyway
|
|
pcmem = 10000000;
|
|
//long pcmem = 100000000;
|
|
// each entry in the cache is usually just a single record, no lists,
|
|
// unless a hostname has multiple sites in it. has 24 bytes more
|
|
// overhead in cache.
|
|
//long maxCacheNodes = g_conf.m_tagdbMaxCacheMem / 106;
|
|
// we now use a page cache
|
|
if ( ! m_pc.init ("tagdb",RDB_TAGDB,pcmem,GB_TFNDB_PAGE_SIZE))
|
|
return log("tagdb: Tagdb init failed.");
|
|
|
|
// init this
|
|
//if ( ! s_lockTable2.set(8,4,32,NULL,0,false,0,"taglocktbl") )
|
|
// return log("tagdb: lock table init failed.");
|
|
|
|
// . initialize our own internal rdb
|
|
// . i no longer use cache so changes to tagdb are instant
|
|
// . we still use page cache however, which is good enough!
|
|
return m_rdb.init ( g_hostdb.m_dir ,
|
|
"tagdb" ,
|
|
true , // dedup same keys?
|
|
-1 , // fixed record size
|
|
2,//g_conf.m_tagdbMinFilesToMerge ,
|
|
g_conf.m_tagdbMaxTreeMem ,
|
|
maxTreeNodes ,
|
|
// now we balance so Sync.cpp can ordered huge list
|
|
true , // balance tree?
|
|
0 , //g_conf.m_tagdbMaxCacheMem ,
|
|
0 , //maxCacheNodes ,
|
|
false , // half keys?
|
|
false , //m_tagdbSaveCache
|
|
&m_pc ,
|
|
false, // is titledb
|
|
true , // preload disk page cache
|
|
sizeof(key128_t), // key size
|
|
true ); // bias disk page cache?
|
|
}
|
|
|
|
bool Tagdb::init2 ( long treeMem ) {
|
|
// . what's max # of tree nodes?
|
|
// . assume avg tagdb rec size (siteUrl) is about 82 bytes we get:
|
|
// . NOTE: 32 bytes of the 82 are overhead
|
|
long maxTreeNodes = treeMem / 82;
|
|
// . initialize our own internal rdb
|
|
// . i no longer use cache so changes to tagdb are instant
|
|
// . we still use page cache however, which is good enough!
|
|
return m_rdb.init ( g_hostdb.m_dir ,
|
|
"tagdbRebuild" ,
|
|
true , // dedup same keys?
|
|
-1 , // fixed record size
|
|
50,//g_conf.m_tagdbMinFilesToMerge ,
|
|
treeMem ,
|
|
maxTreeNodes ,
|
|
// now we balance so Sync.cpp can ordered huge list
|
|
true , // balance tree?
|
|
0 , //g_conf.m_tagdbMaxCacheMem ,
|
|
0 , //maxCacheNodes ,
|
|
false , // half keys?
|
|
false , //m_tagdbSaveCache
|
|
NULL , // pc
|
|
false, // is titledb
|
|
false , // preload disk page cache
|
|
sizeof(key128_t), // key size
|
|
false ); // bias disk page cache?
|
|
}
|
|
|
|
|
|
bool Tagdb::addColl ( char *coll, bool doVerify ) {
|
|
if ( ! m_rdb.addColl ( coll ) ) return false;
|
|
if ( ! doVerify ) return true;//false;
|
|
// verify
|
|
if ( verify(coll) ) return true;
|
|
// if not allowing scale, return false
|
|
//if ( ! g_conf.m_allowScale ) return false;
|
|
// otherwise let it go
|
|
//log ( "tagdb: Verify failed, but scaling is allowed, passing." );
|
|
//return true;
|
|
return false;
|
|
}
|
|
|
|
|
|
|
|
bool Tagdb::verify ( char *coll ) {
|
|
char *rdbName = NULL;
|
|
rdbName = "Tagdb";
|
|
|
|
log ( LOG_DEBUG, "db: Verifying %s for coll %s...", rdbName, coll );
|
|
|
|
g_threads.disableThreads();
|
|
|
|
Msg5 msg5;
|
|
Msg5 msg5b;
|
|
RdbList list;
|
|
key128_t startKey;
|
|
key128_t endKey;
|
|
startKey.setMin();
|
|
endKey.setMax();
|
|
|
|
if ( ! msg5.getList ( RDB_TAGDB ,
|
|
coll ,
|
|
&list ,
|
|
(char *)&startKey ,
|
|
(char *)&endKey ,
|
|
64000 , // minRecSizes ,
|
|
true , // includeTree ,
|
|
false , // add to cache?
|
|
0 , // max cache age
|
|
0 , // startFileNum ,
|
|
-1 , // numFiles ,
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
false , // err correction?
|
|
NULL ,
|
|
0 ,
|
|
-1 ,
|
|
true ,
|
|
-1LL ,
|
|
&msg5b ,
|
|
true )) {
|
|
g_threads.enableThreads();
|
|
return log("tagdb: HEY! it did not block");
|
|
}
|
|
|
|
long count = 0;
|
|
long got = 0;
|
|
//long numOld = 0;
|
|
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
|
list.skipCurrentRecord() ) {
|
|
//key128_t k = list.getCurrentKey();
|
|
key128_t k;
|
|
list.getCurrentKey ( &k );
|
|
count++;
|
|
// see if it is the "old" school tagdb rec
|
|
//char *data = list.getCurrentData();
|
|
//long dataSize = list.getCurrentDataSize();
|
|
// this is the file number in the old school tagdb recs
|
|
// and it is the version number in the new school style recs.
|
|
// just make sure the new school version number stays below 30!
|
|
//char version = *data;
|
|
// lower 3 bytes are the file number. >= 30 on gk
|
|
//if ( version >= 30 ) numOld++;
|
|
//unsigned long groupId = g_tagdb.getGroupId ( &k );
|
|
unsigned long shardNum = getShardNum ( RDB_TAGDB , &k );
|
|
if ( shardNum == getMyShardNum() ) got++;
|
|
}
|
|
if ( got != count ) {
|
|
// tally it up
|
|
g_rebalance.m_numForeignRecs += count - got;
|
|
log ("tagdb: Out of first %li records in %s, only %li belong "
|
|
"to our group.",count,rdbName,got);
|
|
// exit if NONE, we probably got the wrong data
|
|
if ( got == 0 ) log("tagdb: Are you sure you have the "
|
|
"right "
|
|
"data in the right directory? "
|
|
"Exiting.");
|
|
log ( "tagdb: Exiting due to %s inconsistency.", rdbName );
|
|
g_threads.enableThreads();
|
|
return g_conf.m_bypassValidation;
|
|
}
|
|
log ( LOG_DEBUG, "db: %s passed verification successfully for %li "
|
|
"recs.",rdbName, count );
|
|
|
|
// turn threads back on
|
|
g_threads.enableThreads();
|
|
|
|
// if no recs in tagdb, but sitedb exists, convert it
|
|
if ( count > 0 ) return true;
|
|
|
|
// . convert them
|
|
// . returns false and sets g_errno on error
|
|
//if ( ! convert ( coll ) ) return false;
|
|
|
|
// DONE
|
|
g_threads.enableThreads();
|
|
return true;
|
|
}
|
|
|
|
/////////////
|
|
//
|
|
// past blast -- for Tagdb::convert()
|
|
//
|
|
////////////
|
|
/*
|
|
struct SiteType {
|
|
SiteType() : m_score(0) {}
|
|
SiteType& operator=(SiteType& o)
|
|
{m_type=o.m_type;m_score=o.m_score; return *this;}
|
|
// get this type's size
|
|
long getStoredSize() {
|
|
if (isType4Bytes(m_type)) return sizeof(m_type)+4;
|
|
else return sizeof(m_type)+1;
|
|
};
|
|
enum {
|
|
FIRST_TYPE = 0,
|
|
SPAM = FIRST_TYPE, //probablitity that it is spam
|
|
RETAIL, //selling something
|
|
BUSINESS, //a corporate storefront eg ibm.com
|
|
ADULT, //not safe for kids, higher score = more hardcore
|
|
FORUM, //message board
|
|
BLOG, //or personal home page
|
|
NEWS, //articles, opinions magazines
|
|
REFERENCE, //all special interest sites
|
|
DIRECTORY, //links organized categorically
|
|
SEARCH_ENGINE, //indexed info
|
|
DOMAIN_SQUATTER,
|
|
PLATFORM, //political candidate, or org
|
|
TRAVEL, //Travel sites
|
|
AUDIO, //podcast, streaming radio
|
|
VIDEO, //flash video
|
|
SOCIAL_NETWORKING,//dating, myspace, facebook
|
|
MANUAL_BAN, //a human hates this site
|
|
PAGE_RANK, //google's page rank
|
|
CLOCK1_PREHASH, //hash of unique preceeding 1st clock
|
|
CLOCK1_PREHASH_CNT, // count of tags to make 1st clock hash
|
|
DATE_FORMAT, //format of dates on page
|
|
CLOCK2_PREHASH, //hash of unique tags preceeding 2nd clock
|
|
CLOCK2_PREHASH_CNT, // count of tags to make 2nd clock hash
|
|
CLOCK3_PREHASH, //hash of unique tags preceeding 3rd clock
|
|
CLOCK3_PREHASH_CNT, // count of tags to make 3rd clock hash
|
|
CLOCK4_PREHASH, //hash of unique tags preceeding 4th clock
|
|
CLOCK4_PREHASH_CNT, // count of tags to make 4th clock hash
|
|
|
|
// ....ADD ALL NEW TYPES HERE... corruption upon ye if not
|
|
|
|
LAST_TYPE,
|
|
BAD_TYPE = LAST_TYPE,
|
|
|
|
TOTAL_TYPE_COUNT = (LAST_TYPE-FIRST_TYPE)
|
|
};
|
|
// . types can be 1 byte or 4 bytes. if they are 4 bytes, they must be
|
|
// added to this function
|
|
static bool isType4Bytes(int type) {
|
|
if ( type == CLOCK1_PREHASH ) return true;
|
|
if ( type == CLOCK2_PREHASH ) return true;
|
|
if ( type == CLOCK3_PREHASH ) return true;
|
|
if ( type == CLOCK4_PREHASH ) return true;
|
|
return false;
|
|
}
|
|
|
|
static long getScoreSize(uint8_t type) {
|
|
if ( type == CLOCK1_PREHASH ) return 4;
|
|
if ( type == CLOCK2_PREHASH ) return 4;
|
|
if ( type == CLOCK3_PREHASH ) return 4;
|
|
if ( type == CLOCK4_PREHASH ) return 4;
|
|
return 1;
|
|
};
|
|
bool isNormScore() {return m_type <= PAGE_RANK;}
|
|
uint8_t m_type;
|
|
uint32_t m_score;
|
|
};
|
|
|
|
// . convert the old Tagdb format into the new format
|
|
bool Tagdb::convert ( char *coll ) {
|
|
|
|
g_threads.disableThreads();
|
|
|
|
log("db: Trying to convert sitedb for coll %s into tagdb",coll);
|
|
collnum_t collnum = g_collectiondb.getCollnum ( coll );
|
|
// open up old sitedb files
|
|
long mem = 100000000;
|
|
long maxTreeNodes = mem / 82;
|
|
//Rdb sitedb;
|
|
g_sitedb.m_rdb.init ( g_hostdb.m_dir ,
|
|
"sitedb" ,
|
|
true , // dedup same keys?
|
|
-1 , // fixed record size
|
|
9999 , // MinFilesToMerge
|
|
100000000 , // g_conf.m_tagdbMaxTreeMem
|
|
maxTreeNodes ,
|
|
true , // balance tree?
|
|
0 , // g_conf.m_tagdbMaxCacheMem
|
|
0 , // maxCacheNodes
|
|
false , // half keys?
|
|
false , // m_tagdbSaveCache
|
|
NULL , // DiskPageCache *, &m_pc
|
|
false , // is titledb
|
|
false , // preload disk page cache
|
|
12 , // key size
|
|
false );// bias disk page cache?
|
|
//g_collectiondb.init(true);
|
|
g_sitedb.addColl ( coll, false );
|
|
|
|
Msg5 msg5;
|
|
Msg5 msg5b;
|
|
RdbList list;
|
|
key_t startKey;
|
|
key_t endKey;
|
|
startKey.setMin();
|
|
endKey.setMax();
|
|
key_t k;
|
|
bool threadsWereEnabled = !g_threads.areThreadsDisabled();
|
|
g_threads.disableThreads();
|
|
|
|
loop:
|
|
// loop over all tagdb recs in tagdb
|
|
if ( ! msg5.getList ( RDB_SITEDB ,
|
|
coll ,
|
|
&list ,
|
|
startKey ,
|
|
endKey ,
|
|
64000 , // minRecSizes ,
|
|
true , // includeTree ,
|
|
false , // add to cache?
|
|
0 , // max cache age
|
|
0 , // startFileNum ,
|
|
-1 , // numFiles ,
|
|
NULL , // state
|
|
NULL , // callback
|
|
0 , // niceness
|
|
false , // err correction?
|
|
NULL ,
|
|
0 ,
|
|
-1 ,
|
|
true ,
|
|
-1LL ,
|
|
&msg5b ,
|
|
true )) {
|
|
if(threadsWereEnabled) g_threads.enableThreads();
|
|
return log("db: HEY! it did not block");
|
|
}
|
|
|
|
long count = 0;
|
|
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
|
list.skipCurrentRecord() ) {
|
|
k = list.getCurrentKey();
|
|
count++;
|
|
char *data = list.getCurrentData();
|
|
//long dataSize = list.getCurrentDataSize();
|
|
// point to end of it
|
|
//char *pend = data + dataSize;
|
|
// parse the old site rec
|
|
char *p = data;
|
|
long old_sfn = (*(long *)p) & 0x00ffffff;
|
|
//char old_version = p[3];
|
|
p += 4;
|
|
char *old_site = p;
|
|
long old_siteLen = gbstrlen(p);
|
|
p += old_siteLen + 1;
|
|
long old_time = *(long *)p;
|
|
p += 4;
|
|
char *old_comment = p;
|
|
p += gbstrlen(p) + 1;
|
|
//char *old_username = p;
|
|
p += gbstrlen(p) + 1;
|
|
//unsigned char siteFlags = *p;
|
|
p += 1;
|
|
//char siteQuality = *p;
|
|
p += 1;
|
|
//char incHere = *(long *)p;
|
|
uint8_t numTypes = *(uint8_t *)p;
|
|
p += 1;
|
|
|
|
// do not start with http:// ! wastes space!!
|
|
if (old_siteLen>=8 && strncmp(old_site,"http://",7)==0 ) {
|
|
old_site += 7;
|
|
old_siteLen -= 7;
|
|
}
|
|
// sanity check
|
|
//Url s; s.set ( old_site, old_siteLen );
|
|
//key_t newk = g_tagdb.makeKey ( &s , false );
|
|
//if ( k != newk ) { char *xx=NULL;*xx=0; }
|
|
// . without any tags, what is our dataSize?
|
|
// . version(1 byte)+site(X bytes)+NULLTerm(1 byte)+
|
|
// #Tags(2 bytes)
|
|
//long dataSize2 = 1 + old_siteLen + 1 + 2;
|
|
// set the new rec with this stuff
|
|
TagRec newgr;
|
|
//newgr.set ( k ,
|
|
// dataSize2 ,
|
|
// TAGREC_CURRENT_VERSION ,
|
|
// old_site );
|
|
long now = getTimeGlobal();
|
|
// add the "site" name as a tag (include NULL)
|
|
newgr.addTag ( ST_SITE , old_time , "conv" , 0,
|
|
old_site, gbstrlen(old_site)+1);
|
|
// the banned tag
|
|
if ( old_sfn == 30 ) {
|
|
char data = 1;
|
|
newgr.addTag ( ST_MANUAL_BAN ,now, "conv", 0,&data,1);
|
|
}
|
|
if ( old_sfn == 50 ) {
|
|
char data = 1;
|
|
newgr.addTag ( ST_DEEP,now, "conv", 0,&data,1);
|
|
}
|
|
// just for historical reasons, keep this too
|
|
newgr.addTag ( ST_RULESET , now , "conv",0,(char *)&old_sfn,1);
|
|
// . add in comment tag
|
|
// . this will increase newgr::m_dataEnd/m_dataSize
|
|
// . include NULL
|
|
if ( old_comment[0] )
|
|
newgr.addTag ( ST_COMMENT ,now, "conv", 0,
|
|
old_comment , gbstrlen(old_comment)+1);
|
|
// reset these
|
|
bool gotPrehash1 = false;
|
|
bool gotPrehash2 = false;
|
|
bool gotPrehash3 = false;
|
|
bool gotPrehash4 = false;
|
|
bool gotPrehashCount1 = false;
|
|
bool gotPrehashCount2 = false;
|
|
bool gotPrehashCount3 = false;
|
|
bool gotPrehashCount4 = false;
|
|
long prehash1;
|
|
long prehash2;
|
|
long prehash3;
|
|
long prehash4;
|
|
char prehashCount1;
|
|
char prehashCount2;
|
|
char prehashCount3;
|
|
char prehashCount4;
|
|
// now for the old SiteTypes
|
|
for ( long i = 0 ; i < numTypes ; i++ ) {
|
|
//while ( p < pend ) {
|
|
//SiteType *ost = (SiteType *)p;
|
|
// get the type
|
|
char siteType = *p; p++;
|
|
// and the score
|
|
char *siteTypeScore = p;
|
|
long siteTypeScoreSize =
|
|
SiteType::getScoreSize(siteType);
|
|
p += siteTypeScoreSize;
|
|
// a 0 score in the old sitedb meant to ignore
|
|
if ( *siteTypeScore == 0 && siteTypeScoreSize == 1 )
|
|
continue;
|
|
// map the siteType 1-1 for the most part
|
|
long tagType = siteType + ST_SPAM;
|
|
// if the type is SiteType::CLOCK2-4_ re-map it
|
|
if ( siteType == SiteType::CLOCK1_PREHASH ) {
|
|
gotPrehash1 = true;
|
|
prehash1 = *(long *)siteTypeScore;
|
|
continue;
|
|
}
|
|
if ( siteType == SiteType::CLOCK1_PREHASH_CNT ) {
|
|
gotPrehashCount1 = true;
|
|
prehashCount1 = *(char *)siteTypeScore;
|
|
continue;
|
|
}
|
|
if ( siteType == SiteType::CLOCK2_PREHASH ) {
|
|
gotPrehash2 = true;
|
|
prehash2 = *(long *)siteTypeScore;
|
|
continue;
|
|
}
|
|
if ( siteType == SiteType::CLOCK2_PREHASH_CNT ) {
|
|
gotPrehashCount2 = true;
|
|
prehashCount2 = *(char *)siteTypeScore;
|
|
continue;
|
|
}
|
|
if ( siteType == SiteType::CLOCK3_PREHASH ) {
|
|
gotPrehash3 = true;
|
|
prehash3 = *(long *)siteTypeScore;
|
|
continue;
|
|
}
|
|
if ( siteType == SiteType::CLOCK3_PREHASH_CNT ) {
|
|
gotPrehashCount3 = true;
|
|
prehashCount3 = *(char *)siteTypeScore;
|
|
continue;
|
|
}
|
|
if ( siteType == SiteType::CLOCK4_PREHASH ) {
|
|
gotPrehash4 = true;
|
|
prehash4 = *(long *)siteTypeScore;
|
|
continue;
|
|
}
|
|
if ( siteType == SiteType::CLOCK4_PREHASH_CNT ) {
|
|
gotPrehashCount4 = true;
|
|
prehashCount4 = *(char *)siteTypeScore;
|
|
continue;
|
|
}
|
|
// but DATE_FORMAT is off
|
|
if ( siteType == SiteType::DATE_FORMAT )
|
|
tagType = ST_DATE_FORMAT;
|
|
|
|
// panic
|
|
if ( tagType >= ST_LAST_TAG ) {
|
|
log("db: got bad tagtype %li for sitedb rec.",
|
|
(long)tagType);
|
|
continue;
|
|
}
|
|
// add to new rec
|
|
newgr.addTag ( tagType , // should be 1-1
|
|
now ,
|
|
"conv" ,
|
|
0 , // ip
|
|
siteTypeScore ,
|
|
siteTypeScoreSize );
|
|
}
|
|
// add in the clock stuff
|
|
if ( gotPrehash1 && gotPrehashCount1 ) {
|
|
// make a 5 byte thingy
|
|
char tmp[5];
|
|
tmp[0] = prehashCount1;
|
|
memcpy ( tmp+1 , &prehash1, 4 );
|
|
newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5);
|
|
}
|
|
if ( gotPrehash2 && gotPrehashCount2 ) {
|
|
// make a 5 byte thingy
|
|
char tmp[5];
|
|
tmp[0] = prehashCount2;
|
|
memcpy ( tmp+1 , &prehash2, 4 );
|
|
newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5);
|
|
}
|
|
if ( gotPrehash3 && gotPrehashCount3 ) {
|
|
// make a 5 byte thingy
|
|
char tmp[5];
|
|
tmp[0] = prehashCount3;
|
|
memcpy ( tmp+1 , &prehash3, 4 );
|
|
newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5);
|
|
}
|
|
if ( gotPrehash4 && gotPrehashCount4 ) {
|
|
// make a 5 byte thingy
|
|
char tmp[5];
|
|
tmp[0] = prehashCount4;
|
|
memcpy ( tmp+1 , &prehash4, 4 );
|
|
newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5);
|
|
}
|
|
|
|
// now the langs
|
|
uint8_t numLangs = *p;
|
|
p += 1;
|
|
for ( long i = 0 ; i < numLangs ; i++ ) {
|
|
uint8_t langId = *p;
|
|
p += 1;
|
|
long score = (long)*(uint8_t *)p;
|
|
p += 1;
|
|
// add to new rec
|
|
newgr.addTag ( langId , // should be 1-1
|
|
now ,
|
|
"conv" ,
|
|
0 , // ip
|
|
(char *)&score ,
|
|
1 );
|
|
}
|
|
|
|
// print it out
|
|
SafeBuf sb;
|
|
newgr.printToBuf(&sb);
|
|
logf(LOG_INFO,"tagdb: %s",sb.getBufStart());
|
|
|
|
Rdb *r = &g_tagdb.m_rdb;
|
|
|
|
// . add the new site rec back as a TagRec
|
|
// . it should overwrite the old one since the key is the same
|
|
// . this should not block
|
|
// . it should do a dump if tree is full
|
|
if ( ! r->addRecord ( collnum ,
|
|
newgr.getKey () ,
|
|
newgr.getData () ,
|
|
newgr.getDataSize() ,
|
|
MAX_NICENESS )) {
|
|
log("tagdb: convert: %s",mstrerror(g_errno));
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
|
|
// do a blocking dump of tree if it's 90% full now
|
|
if (r->m_mem.is90PercentFull() || r->m_tree.is90PercentFull()){
|
|
log("tagdb: convert: dumping tree to disk.");
|
|
if ( ! r->dumpTree ( 0 ) ) // niceness
|
|
return log("tagdb: convert: dump failed.");
|
|
}
|
|
}
|
|
|
|
// if list not empty, get more
|
|
if ( list.isEmpty() ) { g_threads.enableThreads(); return true; }
|
|
// advance startKey
|
|
startKey = k;
|
|
startKey += 1;
|
|
// watch for wrap, that means done, too
|
|
if ( startKey < k ) { g_threads.enableThreads(); return true; }
|
|
// otherwise, do more
|
|
goto loop;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
// . dddddddd dddddddd dddddddd dddddddd d = domain hash w/o collection
|
|
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu u = url hash
|
|
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu
|
|
key_t Tagdb::makeKey ( Url *u , bool isDelete ) {
|
|
key_t k;
|
|
// hash full hostname
|
|
k.n1 = hash32 ( u->getHost() , u->getHostLen() );
|
|
// set lower 64 bits of key to hash of this url
|
|
k.n0 = hash64 ( u->getUrl() , u->getUrlLen() );
|
|
// clear low bit if we're a delete, otherwise set it
|
|
if ( isDelete ) k.n0 &= 0xfffffffffffffffeLL;
|
|
else k.n0 |= 0x0000000000000001LL;
|
|
return k;
|
|
}
|
|
*/
|
|
|
|
// . ssssssss ssssssss ssssssss ssssssss hash of site/url
|
|
// . xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx tagType OR hash of that+user+data
|
|
// . xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
|
|
// . make the lowest possible tagdb key for "site"
// . "site" must be NULL-terminated
// . the upper 64 bits (n1) are the hash of the site string and the lower
//   64 bits (n0) are all zeroes, so this key sorts before every tag whose
//   n1 is that same site hash
key128_t Tagdb::makeStartKey ( char *site ) {
	key128_t startKey;
	// lowest value for the low 64 bits
	startKey.n0 = 0;
	// the site hash goes in the high 64 bits
	startKey.n1 = hash64n ( site );
	return startKey;
}
|
|
|
|
// . make the highest possible tagdb key for "site"
// . "site" must be NULL-terminated
// . the upper 64 bits (n1) are the hash of the site string and the lower
//   64 bits (n0) are all ones, so this key sorts after every tag whose
//   n1 is that same site hash
key128_t Tagdb::makeEndKey ( char *site ) {
	key128_t endKey;
	// highest value for the low 64 bits
	endKey.n0 = 0xffffffffffffffffLL;
	// the site hash goes in the high 64 bits
	endKey.n1 = hash64n ( site );
	return endKey;
}
|
|
|
|
// . like makeStartKey() but keyed on the DOMAIN of url "u"
// . the upper 64 bits (n1) are the hash of the domain and the lower
//   64 bits (n0) are all zeroes
key128_t Tagdb::makeDomainStartKey ( Url *u ) {
	// the domain as parsed out by the Url class
	char *dom    = u->getDomain   ();
	long  domLen = u->getDomainLen();
	key128_t startKey;
	// domain hash goes in the high 64 bits
	startKey.n1 = hash64 ( dom , domLen );
	// lowest value for the low 64 bits
	startKey.n0 = 0;
	return startKey;
}
|
|
|
|
// . like makeEndKey() but keyed on the DOMAIN of url "u"
// . the upper 64 bits (n1) are the hash of the domain and the lower
//   64 bits (n0) are all ones
key128_t Tagdb::makeDomainEndKey ( Url *u ) {
	// the domain as parsed out by the Url class
	char *dom    = u->getDomain   ();
	long  domLen = u->getDomainLen();
	key128_t endKey;
	// domain hash goes in the high 64 bits
	endKey.n1 = hash64 ( dom , domLen );
	// highest value for the low 64 bits
	endKey.n0 = 0xffffffffffffffffLL;
	return endKey;
}
|
|
|
|
|
|
/*
|
|
// . returns 0 if "url" is not a suburl of "site"
|
|
// . otherwise, returns "percent" of "url" that matches "site"
|
|
long Tagdb::getMatchPoints ( Url *recUrl , Url *url ) {
|
|
// reset pts to 0
|
|
long pts = 0;
|
|
|
|
// temporary fix to the hostname key collision problem is Tagdb Rdb
|
|
long rhlen = recUrl->getHostLen ();
|
|
|
|
char *uhost = url ->getDomain ();
|
|
long uhlen = url ->getDomainLen ();
|
|
char *shost = recUrl->getDomain ();
|
|
long shlen = recUrl->getDomainLen ();
|
|
//long uip = url->getIp ();
|
|
//long sip = site->getIp ();
|
|
|
|
// MDW: we are not really doing ips like this now
|
|
if ( uhlen != shlen || strncmp( uhost, shost, uhlen ) != 0 )
|
|
// if ( ! uip || uip != sip ) return 0;
|
|
return 0;
|
|
|
|
// compare ports for bonus points
|
|
// but return 0 if site's port is not default
|
|
long rport = recUrl->getPort ();
|
|
long uport = url->getPort ();
|
|
if ( rport == uport ) pts += 1000000;
|
|
else if ( uport != url->getDefaultPort() ) return 0;
|
|
|
|
// now ensure url's path is a subpath of recUrl's
|
|
long rplen = recUrl->getPathLen();
|
|
char *rpath = recUrl->getPath();
|
|
long uplen = url->getPathLen();
|
|
char *upath = url->getPath();
|
|
if ( rplen > uplen ) return 0;
|
|
if ( strncmp ( upath , rpath , rplen ) != 0 ) return 0;
|
|
// . now we got a solid match
|
|
// . add 1 pt for each char in recUrl's path
|
|
// . so the longer recUrl's path the better the match (more specific)
|
|
// . this allows us to override TagRecs for deeper sub urls
|
|
pts += rplen;
|
|
// add in host size of the matching recUrl
|
|
pts += rhlen*1000;
|
|
// all done
|
|
return pts;
|
|
}
|
|
*/
|
|
|
|
///////////////////////////////////////////////
|
|
//
|
|
// for getting the final TagRec for a url
|
|
//
|
|
///////////////////////////////////////////////
|
|
|
|
// . start out idle: nothing launched and nothing received
// . reset() and getTagRec() treat m_requests != m_replies as "in progress"
//   so these two counters must agree before the object is first used
Msg8a::Msg8a() {
	m_requests = 0;
	m_replies  = 0;
}
|
|
|
|
// . reset() cores if we still have outstanding requests (unless the
//   process is exiting) so a Msg8a must not be destroyed mid-lookup
Msg8a::~Msg8a ( ) {
	this->reset();
}
|
|
|
|
void Msg8a::reset() {
|
|
// do no free if in progress, reply may come in and corrupt the mem
|
|
if ( m_replies != m_requests && ! g_process.m_exiting ) {
|
|
char *xx=NULL;*xx=0; }
|
|
//for ( long i = 0 ; i < m_replies ; i++ )
|
|
// m_lists[i].reset();
|
|
m_replies = 0;
|
|
m_requests = 0;
|
|
}
|
|
|
|
// . get records from multiple subdomains of url
|
|
// . calls g_udpServer.sendRequest() on each subdomain of url
|
|
// . all matching records are merge into a final record
|
|
// i.e. site tags are also propagated accordingly
|
|
// . closest matching "site" is used as the "site" (the site url)
|
|
bool Msg8a::getTagRec ( Url *url ,
|
|
// site of the url
|
|
char *site ,
|
|
char *coll ,
|
|
bool skipDomainLookup , // useCanonicalName ,
|
|
long niceness ,
|
|
void *state ,
|
|
void (* callback)(void *state ),
|
|
TagRec *tagRec ,
|
|
bool doInheritance ,
|
|
char rdbId ) {
|
|
|
|
|
|
CollectionRec *cr = g_collectiondb.getRec ( coll );
|
|
if ( ! cr ) {
|
|
g_errno = ENOCOLLREC;
|
|
return true;
|
|
}
|
|
|
|
// reset tag rec
|
|
tagRec->reset();//m_numListPtrs = 0;
|
|
|
|
// sanity check
|
|
if ( rdbId != RDB_TAGDB ) {char *xx=NULL;*xx=0;}
|
|
// save it
|
|
m_rdbId = rdbId;
|
|
|
|
// in use? need to wait before reusing
|
|
if ( m_replies != m_requests ) {char *xx=NULL;*xx=0; }
|
|
// then we gotta free the lists if any
|
|
reset();
|
|
|
|
m_niceness = niceness;
|
|
m_coll = coll;
|
|
m_tagRec = tagRec;
|
|
m_callback = callback;
|
|
m_state = state;
|
|
//m_url = url;
|
|
// reset
|
|
m_errno = 0;
|
|
m_requests = 0;
|
|
m_replies = 0;
|
|
m_doneLaunching = false;
|
|
//m_doFullUrl = true;
|
|
//m_skipDomainLookup = skipDomainLookup;
|
|
|
|
// set siteLen to the provided site if it is non-NULL
|
|
long siteLen = 0;
|
|
if ( site ) siteLen = gbstrlen(site);
|
|
|
|
// . get the site
|
|
// . msge0 passes this in as NULL an expects us to figure it out
|
|
// . if site was NULL that means we guess it. default to hostname
|
|
// unless in a recognized for like /~mwells/
|
|
if ( ! site ) {
|
|
SiteGetter sg;
|
|
sg.getSite ( url->getUrl() ,
|
|
NULL , // tagrec
|
|
0 , // timestamp
|
|
NULL, // coll
|
|
m_niceness,
|
|
NULL, // state
|
|
NULL); // callback
|
|
// if it set it to a recognized site, like ~mwells
|
|
// then set "site"
|
|
if ( sg.m_siteLen ) {
|
|
site = sg.m_site;
|
|
siteLen = sg.m_siteLen;
|
|
}
|
|
}
|
|
|
|
// if provided site was NULL and not of a ~mwells type of form
|
|
// then default it to hostname
|
|
if ( ! site ) {
|
|
site = url->getHost();
|
|
siteLen = url->getHostLen();
|
|
}
|
|
|
|
// temp null terminate it
|
|
char c = site[siteLen];
|
|
site[siteLen] = '\0';
|
|
|
|
// use that
|
|
m_siteStartKey = g_tagdb.makeStartKey ( site );//url );
|
|
m_siteEndKey = g_tagdb.makeEndKey ( site ); // url );
|
|
|
|
// un NULL terminate it
|
|
site[siteLen] = c;
|
|
|
|
|
|
|
|
|
|
// ignore this part of url is already root like
|
|
//if ( m_url->isRoot() ) m_doFullUrl = false;
|
|
|
|
// makeStartKey only works on the hostname of the url, so doing the
|
|
// full url has no effect right now
|
|
//m_doFullUrl = false;
|
|
|
|
// sendPageInject keeps "url" on the stack!
|
|
//m_url.set ( url->getUrl() , url->getUrlLen() );
|
|
m_url = url;
|
|
|
|
|
|
// save this
|
|
m_doInheritance = doInheritance;
|
|
// . launch a request for each subdomain of the url
|
|
// . the request format is
|
|
// . <url>\0<niceness><coll>\0
|
|
// . that way we can use a small request buffer and have different
|
|
// pointers to the different subdomains
|
|
//char *p = m_request;
|
|
// point to url
|
|
char *u = url->getUrl();
|
|
long ulen = url->getUrlLen();
|
|
// point to the TLD of the url
|
|
char *tld = url->getTLD();
|
|
// . if NULL, that is bad... TLD is unsupported
|
|
// . no! it could be an ip address!
|
|
// . anyway, if the tld does not exist, just return an empty tagrec
|
|
// do not set g_errno
|
|
if ( ! tld && ! url->isIp() ) return true;
|
|
//if ( ! tld ) { g_errno = EBADURL; return true; }
|
|
// url cannot have NULLs in it because handleRequest8a() uses
|
|
// gbstrlen() on it to get its size
|
|
for ( long i = 0 ; i < ulen ; i++ ) {
|
|
if ( u[i] ) continue;
|
|
log("TagRec: got bad url with NULL in it %s",u);
|
|
m_errno = EBADURL;
|
|
g_errno = EBADURL;
|
|
return true;
|
|
}
|
|
// skip over http://
|
|
long plen = url->getSchemeLen() + 3;
|
|
u += plen;
|
|
ulen -= plen;
|
|
// copy over url without the protocol thingy (http://)
|
|
//memcpy ( p , u , ulen );
|
|
// get the domain
|
|
m_dom = url->getDomain();
|
|
// if none, bad!
|
|
if ( ! m_dom && ! url->isIp() ) return true;
|
|
// save this
|
|
//m_host = url->getHost();
|
|
// get its delta
|
|
//long delta = dom - u;
|
|
// . save ptr for launchGetRequests()
|
|
// . move this BACKWARDS for subdomains that have a ton of .'s
|
|
// . no, now move towards domain
|
|
m_p = m_url->getHost();
|
|
// and save this too
|
|
m_hostEnd = m_url->getHost() + m_url->getHostLen();
|
|
// if ip just use the full "hostname" which is the full ip address
|
|
//if ( url->isIp() ) m_p = m_host;
|
|
|
|
// launch the requests
|
|
if ( ! launchGetRequests() ) return false;
|
|
// . they did it without blocking
|
|
// . this sets g_errno on error
|
|
gotAllReplies();
|
|
// did not block
|
|
return true;
|
|
}
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets g_errno and returns true on error
|
|
bool Msg8a::launchGetRequests ( ) {
|
|
// clear it
|
|
g_errno = 0;
|
|
bool tryDomain = false;
|
|
loop:
|
|
// return true if nothing to launch
|
|
if ( m_doneLaunching ) return (m_requests == m_replies);
|
|
// don't bother if already got an error
|
|
if ( m_errno ) return (m_requests == m_replies);
|
|
// limit max to 5ish
|
|
if (m_requests >=MAX_TAGDB_REQUESTS) return (m_requests==m_replies);
|
|
// take a breath
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// . first, try it by canonical domain name
|
|
// . if that finds no matches, then try it by ip domain
|
|
// get host
|
|
//char *subdom = m_p;
|
|
//long subdomLen = m_hostEnd - m_p;
|
|
|
|
key128_t startKey ;
|
|
key128_t endKey ;
|
|
//long siteHash32;
|
|
// . if our first time, do the full url!
|
|
// . need to do this because the turking process (XmlDoc::getTurkForm()
|
|
// and PageReindex.cpp:processTurkForm()) add tags to tagdb based on
|
|
// the full url.
|
|
/*
|
|
if ( m_doFullUrl ) {
|
|
startKey = g_tagdb.makeStartKey ( m_url );
|
|
endKey = g_tagdb.makeEndKey ( m_url );
|
|
// . like the "norm" url above
|
|
// . we'll get back a list of tags for this hostname,
|
|
// but they could all be from different sites, some sites
|
|
// would be the hostname, other tags might be from sites
|
|
// that are a subsite of the hostname, so we have to make
|
|
// sure the tag's key.n0 matches this siteHash32
|
|
siteHash32 = hash32 ( m_url->getUrl() , m_url->getUrlLen());
|
|
}
|
|
else {
|
|
// make into a url
|
|
Url u;
|
|
u.set ( subdom , subdomLen );
|
|
// set key range now
|
|
startKey = g_tagdb.makeStartKey ( &u );
|
|
endKey = g_tagdb.makeEndKey ( &u );
|
|
// . like the "norm" url above
|
|
// . we'll get back a list of tags for this hostname,
|
|
// but they could all be from different sites, some sites
|
|
// would be the hostname, other tags might be from sites
|
|
// that are a subsite of the hostname, so we have to make
|
|
// sure the tag's key.n0 matches this siteHash32
|
|
siteHash32 = hash32 ( u.getUrl() , u.getUrlLen() );
|
|
}
|
|
*/
|
|
|
|
if ( tryDomain ) {
|
|
startKey = g_tagdb.makeDomainStartKey ( m_url );
|
|
endKey = g_tagdb.makeDomainEndKey ( m_url );
|
|
if ( g_conf.m_logDebugTagdb )
|
|
log("tagdb: looking up domain tags for %s",
|
|
m_url->getUrl());
|
|
}
|
|
else {
|
|
// usually the site is the hostname but sometimes it is like
|
|
// "www.last.fm/user/breendaxx/"
|
|
//startKey = g_tagdb.makeStartKey ( m_site );//url );
|
|
//endKey = g_tagdb.makeEndKey ( m_site ); // url );
|
|
startKey = m_siteStartKey;
|
|
endKey = m_siteEndKey;
|
|
if ( g_conf.m_logDebugTagdb )
|
|
log("tagdb: looking up site tags for %s",
|
|
m_url->getUrl());
|
|
}
|
|
|
|
|
|
// get the groupid
|
|
//unsigned long groupId = g_tagdb.getGroupId ( startKey );
|
|
|
|
// get the next mcast
|
|
Msg0 *m = &m_msg0s[m_requests];
|
|
// and the list
|
|
RdbList *listPtr = &m_tagRec->m_lists[m_requests];
|
|
|
|
// bias based on the top 64 bits which is the hash of the "site" now
|
|
//uint32_t gid = g_hostdb.getGroupId ( m_rdbId , &startKey , true );
|
|
//Host *group = g_hostdb.getGroup ( gid );
|
|
long shardNum = getShardNum ( m_rdbId , &startKey );//, true );
|
|
Host *group = g_hostdb.getShard ( shardNum );
|
|
|
|
long numTwins = g_hostdb.getNumHostsPerShard();
|
|
// use top byte!
|
|
uint8_t *sks = (uint8_t *)&startKey;
|
|
uint8_t top = sks[sizeof(TAGDB_KEY)-1];
|
|
long hostNum = 0;
|
|
if ( numTwins == 2 && (top & 0x80) ) hostNum = 1;
|
|
// TODO: fix this!
|
|
if ( numTwins >= 3 ) { char *xx=NULL;*xx=0; }
|
|
long hostId = group[hostNum].m_hostId;
|
|
|
|
|
|
// . launch this request, even if to ourselves
|
|
// . TODO: just use msg0!!
|
|
bool status = m->getList ( hostId , // hostId
|
|
0 , // ip
|
|
0 , // port
|
|
0 , // maxCacheAge
|
|
false , // addToCache
|
|
m_rdbId, //RDB_TAGDB ,
|
|
m_coll ,
|
|
listPtr ,
|
|
(char *) &startKey ,
|
|
(char *) &endKey ,
|
|
10000000 , // minRecSizes
|
|
this , // state
|
|
gotMsg0ReplyWrapper ,
|
|
m_niceness ,
|
|
true , // error correction?
|
|
true , // include tree?
|
|
true , // doMerge?
|
|
-1 , // firstHostId
|
|
0 , // startFileNum
|
|
-1 , // numFiles
|
|
3600*24*365 );// timeout
|
|
// all done?
|
|
//if ( m_p == m_url->getDomain() ) m_doneLaunching = true;
|
|
// error?
|
|
if ( status && g_errno ) {
|
|
// g_errno should be set, we had an error
|
|
m_errno = g_errno;
|
|
return (m_requests == m_replies);
|
|
}
|
|
// successfully launched
|
|
m_requests++;
|
|
// if we got a reply instantly
|
|
if ( status ) m_replies++;
|
|
|
|
if ( ! tryDomain ) { //&&
|
|
//! m_skipDomainLookup &&
|
|
//m_url->getHostLen() != m_url->getDomainLen() ) {
|
|
tryDomain = true;
|
|
goto loop;
|
|
}
|
|
|
|
//
|
|
// no more looping!
|
|
//
|
|
// i don't think we need to loop any more because we got all the
|
|
// tags for this hostname. then the lower bits of the Tag key
|
|
// corresponds to the actual SITE hash. so we gotta filter those
|
|
// out i guess after we read the whole list.
|
|
//
|
|
return (m_requests == m_replies);
|
|
//m_doneLaunching = true;
|
|
//goto loop;
|
|
|
|
/*
|
|
// do not advance m_p if doing the full url first
|
|
if ( m_doFullUrl ) {
|
|
m_doFullUrl = false;
|
|
goto loop;
|
|
}
|
|
// . advance m_p
|
|
// . we go backwards to better support subdomains that have a ton
|
|
// of periods in them...
|
|
for ( ; m_p < m_dom && *m_p != '.' ; m_p++ );
|
|
// advance over .
|
|
if ( m_p != m_dom ) m_p++;
|
|
// if another dot that is bad!
|
|
if ( *m_p == '.' ) m_errno = EBADURL;
|
|
// launch another
|
|
goto loop;
|
|
*/
|
|
}
|
|
|
|
void gotMsg0ReplyWrapper ( void *state ) {
|
|
Msg8a *THIS = (Msg8a *)state;
|
|
// we got one
|
|
THIS->m_replies++;
|
|
// error?
|
|
if ( g_errno ) THIS->m_errno = g_errno;
|
|
// launchGetRequests() returns false if still waiting for replies...
|
|
if ( ! THIS->launchGetRequests() ) return;
|
|
// get all the replies
|
|
THIS->gotAllReplies();
|
|
// set g_errno for the callback
|
|
if ( THIS->m_errno ) g_errno = THIS->m_errno;
|
|
// otherwise, call callback
|
|
THIS->m_callback ( THIS->m_state );
|
|
}
|
|
|
|
// get the TagRec from the reply
|
|
void Msg8a::gotAllReplies ( ) {
|
|
// if any had an error, don't do anything
|
|
if ( m_errno ) return;
|
|
// scan the lists
|
|
for ( long i = 0 ; i < m_replies ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// get list
|
|
RdbList *list = &m_tagRec->m_lists[i];
|
|
// skip if empty
|
|
if ( list->m_listSize <= 0 ) continue;
|
|
// panic msg
|
|
if ( list->m_listSize >= 10000000 ) {
|
|
log("tagdb: CAUTION!!! cutoff tagdb list!");
|
|
log("tagdb: CAUTION!!! will lost useful info!!");
|
|
char *xx=NULL;*xx=0;
|
|
}
|
|
// otherwise, add to array
|
|
m_tagRec->m_listPtrs[m_tagRec->m_numListPtrs] = list;
|
|
// advance
|
|
m_tagRec->m_numListPtrs++;
|
|
}
|
|
|
|
// . now scan all the tags for this HOSTNAME
|
|
// . filter out tags that are not for a supersite of our url
|
|
// . i.e. if our url is www.xyz.com/tim/bob/file.html
|
|
// then hash
|
|
// http://www.xyz.com/
|
|
// http://www.xyz.com/tim/
|
|
// http://www.xyz.com/tim/bob/
|
|
// and skip over any tag whose lower 32 bits does not match
|
|
// one of those hashes...
|
|
// . see where we set Tag::m_key.n0 in Tag::set() above:
|
|
// m_key.n0 |= (uint32_t) hash32 ( norm.getUrl(),norm.getUrlLen() );
|
|
// where "norm" is the provided site but with a http:// in front
|
|
// and a / at the end since Url::set() normalized it
|
|
// . m_url is the url we want to get the tags for
|
|
// . HACK: right now just restrict to the hostname!
|
|
/*
|
|
Url norm;
|
|
norm.set ( m_url->getHost() , m_url->getHostLen() );
|
|
unsigned long siteHash32 = hash32 ( norm.getUrl(),norm.getUrlLen() );
|
|
// . and the domain too so we can ban domains
|
|
// . this is messed up because we can't just hash the domain, we have
|
|
// to hash it like a complete url because that is what Tag::set()
|
|
// does when it makes the key's top 32 bits.
|
|
unsigned long siteHash32d = 0;
|
|
long conti = 0;
|
|
siteHash32d = hash32_cont ( "http://",7,siteHash32d,&conti);
|
|
siteHash32d = hash32_cont ( norm.getDomain(),
|
|
norm.getDomainLen(),
|
|
siteHash32d,
|
|
&conti);
|
|
siteHash32d = hash32_cont ( "/",1,siteHash32d,&conti);
|
|
// the non-del bit i guess. we forgot to shift up when we made
|
|
// the key above!
|
|
siteHash32 |= 0x01;
|
|
siteHash32d |= 0x01;
|
|
*/
|
|
|
|
// scan tags in list and set Tag::m_type to TT_DUP if its a dup
|
|
Tag *tag = m_tagRec->getFirstTag();
|
|
HashTableX cx;
|
|
char cbuf[2048];
|
|
cx.set ( 4,0,64,cbuf,2048,false,m_niceness,"tagtypetab");
|
|
// . loop over all tags in all lists in order by key
|
|
// . each list should be from a different suburl?
|
|
// . the first list should be the narrowest/longest?
|
|
for ( ; tag ; tag = m_tagRec->getNextTag ( tag ) ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// skip tag if it is not from the proper site. we are
|
|
// only guarenteed that all tags in this list are for the
|
|
// same HOSTNAME not SITE! site is in the lower bits
|
|
// of the tagdb key.
|
|
// should fix www.paypal.com:1234 bug where we were reading
|
|
// sitenuminlinks from that tag and was always 0!! even
|
|
// when we'd add a count of 2k to the www.paypal.com site...
|
|
// now filter out www.paypal.com:1234's tags!
|
|
// TODO: allow multiple different siteHash32 values to match
|
|
// here, use one siteHash32 for each possible suburl of "m_url"
|
|
// so if m_url is "http://www.xyz.com/tim/" then we also
|
|
// can match hash32("http://www.xyz.com/tim/" not just
|
|
// "http://www.xyz.com/" which is how it is now.
|
|
//unsigned long th32 = tag->m_key.n0 & 0xffffffff;
|
|
//if ( th32 != siteHash32 && th32 != siteHash32d ) {
|
|
// // maybe use TT_DIFFSITE instead of this! TODO!
|
|
// tag->m_type = TT_DUP;
|
|
// continue;
|
|
//}
|
|
|
|
// form the hash!
|
|
uint32_t h32 = (unsigned long)((tag->m_key.n0) >> 32);
|
|
// skip if not unique
|
|
//if ( ! isTagTypeUnique ( tag->m_type ) ) continue;
|
|
// otherwise, record it
|
|
if ( cx.isInTable(&h32 ) ) // tag->m_type) )
|
|
tag->m_type = TT_DUP;
|
|
else if ( ! cx.addKey(&h32) ) {
|
|
m_errno = g_errno;
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
// get the TagRec from the reply
|
|
void TagRec::gotAllReplies ( ) {
|
|
// if any had an error, don't do anything
|
|
if ( m_errno ) return;
|
|
// time how long this takes and log it
|
|
long long startTime = gettimeofdayInMilliseconds();
|
|
// how many TagRecs we matched
|
|
long n = 0;
|
|
// arrays for pointing to best matching TagRecs
|
|
//char *data [128];
|
|
//long dataSizes [128];
|
|
//long dataScores [128];
|
|
char *recs [128];
|
|
long recScores [128];
|
|
|
|
// . each reply is a list of TagRecs
|
|
// . each TagRec is a standard Rdb record
|
|
// . key|dataSize|data...
|
|
// . go through all TagRecs and sort our list of ptrs to the
|
|
// best TagRecs
|
|
// . some TagRecs will not even match, so do not include those in
|
|
// our list of pointers
|
|
// . the closest matching TagRecs will be on top
|
|
// . inherit Tags from lesser matching TagRecs provided there
|
|
// is no such Tag::m_type from a closer matching TagRec
|
|
// . if xyz.com is banned and abc.xyz.com has a 0 score for the
|
|
// ST_BANNED Tag, then it is effectively "unbanned" and should
|
|
// not inherit the score from xyz.com for ST_BANNED.
|
|
// . so by scanning each TagRec in order, we compose our own
|
|
// final merged TagRec that may have a lot more Tags in it
|
|
// than any one matching TagRec
|
|
for ( long i = 0 ; i < m_replies ; i++ ) {
|
|
// get the list from this reply
|
|
RdbList *list = &m_lists[i];
|
|
// scan list
|
|
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
|
|
// break if overflow
|
|
if ( n >= 128 ) break;
|
|
// get next rec
|
|
//char *d = list->getCurrentData ();
|
|
//long dsize = list->getCurrentDataSize();
|
|
char *rec = list->getCurrentRec();
|
|
// set TagRec to it
|
|
TagRec *gr = (TagRec *)rec;
|
|
// get the site
|
|
//char *site = gr->getString(ST_SITE,NULL);
|
|
char *site = gr->getString("site",NULL);
|
|
// sanity check
|
|
if ( ! site ) { char *xx=NULL;*xx=0; }
|
|
// make it a url
|
|
Url u;
|
|
u.set ( site , gbstrlen(site) );
|
|
// score it
|
|
long s = g_tagdb.getMatchPoints ( &u , m_url );
|
|
// skip it if not a match
|
|
if ( s <= 0 ) continue;
|
|
// save it
|
|
//data [n] = d;
|
|
//dataSize [n] = dsize;
|
|
recs [n] = rec;
|
|
recScores [n] = s;
|
|
n++;
|
|
}
|
|
}
|
|
|
|
// if no recs, we did not match anything
|
|
if ( n == 0 ) return;
|
|
// or on error
|
|
if ( m_errno ) return;
|
|
|
|
// bubble sort the recs by their scores, highest score first
|
|
bubble:
|
|
bool swapped = false;
|
|
for ( long i = 1 ; i < n ; i++ ) {
|
|
// keep going if in correct order
|
|
if ( recScores[i-1] >= recScores[i] ) continue;
|
|
// swap
|
|
char *t1 = recs [i-1];
|
|
long t2 = recScores [i-1];
|
|
recs [i-1] = recs [i];
|
|
recs [i ] = t1;
|
|
recScores [i-1] = recScores [i];
|
|
recScores [i ] = t2;
|
|
swapped = true;
|
|
}
|
|
if ( swapped ) goto bubble;
|
|
|
|
// parse the best matching SiteData
|
|
//TagRec gr ; gr.set ( data[0] , dataSizes[0] );
|
|
// use the site from the best matching TagRec as our site
|
|
//m_siteUrl.set ( gr.getSite() , gr.getSiteLen() );
|
|
|
|
// reset the inheritance array
|
|
//char array[ST_LAST_TAG];
|
|
//memset ( array , -1 , 256 );
|
|
HashTable ia;
|
|
char ibuf [ 1024 * 8 ];
|
|
ia.set ( 1024 , ibuf , 1024 * 8 );
|
|
|
|
// we just store the tags, ptrs into the tags in the m_lists
|
|
//Tag *tags[MAX_TAGS];
|
|
// assume we got no tags
|
|
//long numTags = 0;
|
|
// size of all tags
|
|
//long size = 0;
|
|
|
|
// set our new tag rec
|
|
m_tagRec->reset();
|
|
|
|
// . only get tags from the first matching tag rec if we should not
|
|
// do the inheritance loop
|
|
// . if they click "get rec" on PageTagdb, then do not do inheritance,
|
|
// but if they click "get tags", then do it!
|
|
if ( ! m_doInheritance && n > 0 ) n = 1;
|
|
|
|
|
|
// . DO NOT INHERIT ANYTHING FROM TAG RECS that have a sitePathDepth
|
|
// tag in them UNLESS the sitePathDepth does not work on us
|
|
// . i.e. if xyz.com has a sitePathDepth of 2 in its TagRec and the
|
|
// url we are looking at is xyz.com/a/b/c/d then we must assume that
|
|
// our site is xyz.com/a/b/ and we are an independent subsite of
|
|
// xyz.com and inherit nothing from it
|
|
SiteGetter siteGetter;
|
|
|
|
// site getter sometimes adds recs to tagdb to add in a new subsite
|
|
// it finds... i'd imagine this will create a parsing inconsistency
|
|
// when injecting docs into the "test" coll... but oh well!
|
|
long timestamp = getTimeGlobal();
|
|
|
|
// . begin the "inheritance loop"
|
|
// . fill our m_tags[] array with the Tags that apply to us
|
|
for ( long i = 0 ; i < n ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(m_niceness);
|
|
// parse the TagRec (very fast)
|
|
TagRec *gr = (TagRec *)recs[i];
|
|
// is "url" an independent subsite of gr's site?
|
|
char *us = m_url->getUrl();
|
|
bool st=siteGetter.getSite(us,gr,timestamp,m_coll,m_niceness );
|
|
// sanity check, not allowed to block since state is NULL!
|
|
if ( ! st ) { char *xx=NULL;*xx=0; }
|
|
// are we independent subsite? if so, do not inherit
|
|
// from that. this is used to prevent www.geocities.com/~mark/
|
|
// from gaining the benefits of being on the www.geocities.com
|
|
// site. TODO later: we should make another tag to indicate
|
|
// a subsite is explicitly independent. but for now we rely
|
|
// on the "sitepathdepth" tag automatically computed by
|
|
// SiteGetter.cpp.
|
|
//if ( siteGetter.isIndependentSubsite() ) continue;
|
|
|
|
//
|
|
// TODO:
|
|
// NONO, just do not inherit sitenumlinks or any tag
|
|
// that is marked as such!!! add a new flag to the tags!!!!!!
|
|
//
|
|
|
|
// always add the ST_SITE tag first from each tag so we know
|
|
// what site the other tags belong to
|
|
//Tag *stag = gr->getTag ( ST_SITE );
|
|
Tag *stag = gr->getTag ( "site" );
|
|
// only add if non null
|
|
if ( stag ) m_tagRec->addTag ( stag );
|
|
// last tag
|
|
Tag *last = NULL;
|
|
// loop over all tags in TagRec #i
|
|
tagLoop:
|
|
// get the tag id of current tag
|
|
Tag *tag = gr->getNextTag ( last );
|
|
// assign
|
|
last = tag;
|
|
// was that the end of the tags? if so, go to next TagRec
|
|
if ( ! tag ) continue;
|
|
// get tag id
|
|
long tagType = tag->m_type;
|
|
// skip all ST_SITE tags, we added those first above
|
|
//if ( tagType == ST_SITE ) goto tagLoop;
|
|
if ( tag->isType("site") ) goto tagLoop;
|
|
// sanity check
|
|
//if ( tagType >= ST_LAST_TAG ) { char *xx=NULL;*xx=0;}
|
|
// for getting the next tag, remember this
|
|
last = tag;
|
|
// . have we added this yet?
|
|
// . if tagType added from a prev TagRec do not "inherit" it
|
|
//if(array[tagType] != -1 && array[tagType] != i) goto tagLoop;
|
|
long slot = ia.getSlot ( tagType );
|
|
if ( slot >= 0 && ia.getValueFromSlot(slot) != i) goto tagLoop;
|
|
|
|
// if tag type is "eventtag" then only add it if the site of this
|
|
// tagrec EQUALS our url. exact match... that way we make sure to only
|
|
// tag a single url, otherwise we might accidentally tag an entire site.
|
|
if ( tag->isType("eventtag") ) {
|
|
// must be in tagRec that matches us the closest
|
|
if ( i != 0 ) goto tagLoop;
|
|
// if no site, skip it
|
|
if ( ! stag ) goto tagLoop;
|
|
// and even then must match site exactly
|
|
char *site = stag->m_data;
|
|
// as string
|
|
char *url = m_url->getUrl();
|
|
long ulen = m_url->getUrlLen();
|
|
// skip our proto (http://)
|
|
url += m_url->getSchemeLen() + 3;
|
|
ulen -= m_url->getSchemeLen() + 3;
|
|
// remove trailing /
|
|
if ( ulen > 0 && url[ulen-1] == '/' ) ulen--;
|
|
// likewise for site
|
|
long slen = gbstrlen(site);
|
|
if ( slen > 0 && site[slen-1] == '/' ) slen--;
|
|
// skip if not exact
|
|
if ( slen != ulen ) goto tagLoop;
|
|
// compare, must match exactly, if not, do not add tag
|
|
if ( strncmp(url,site,slen) != 0 ) goto tagLoop;
|
|
}
|
|
|
|
// ok, add/inherit it
|
|
//tags[numTags++] = tag;
|
|
// add it directly to m_tagRec
|
|
if ( ! m_tagRec->addTag ( tag ) ) {
|
|
log("tagdb: addTag failed: %s",mstrerror(g_errno));
|
|
m_errno = g_errno;
|
|
break;
|
|
}
|
|
// add in size
|
|
//size += tag->getSize();
|
|
// note it, so we do not add/inherit it from another TagRec
|
|
//array[tagType] = i;
|
|
ia.addKey ( tagType , i );
|
|
// add more tags
|
|
goto tagLoop;
|
|
}
|
|
|
|
// sanity!
|
|
//if ( size > 32000 ) { char *xx=NULL;*xx=0; }
|
|
//if ( size + 2 + 2 > MAX_TAGREC_SIZE ) { char *xx=NULL;*xx=0; }
|
|
// then copy the tags into the buffer
|
|
//for ( long i = 0 ; i < numTags ; i++ )
|
|
// m_tagRec->addTag ( tags[i] );
|
|
|
|
// sanity check
|
|
//if ( p - m_tagRec > MAX_TAGREC_SIZE ) { char *xx=NULL;*xx=0;}
|
|
|
|
// free the mem
|
|
reset();
|
|
|
|
// time it
|
|
long long took = gettimeofdayInMilliseconds() - startTime;
|
|
if(took>10) log(LOG_INFO, "admin: gotreply for msg8a took %lli",took);
|
|
}
|
|
*/
|
|
/*
|
|
///////////////////////////////////////////////
|
|
//
|
|
// Msg9a : for modifying TagRecs in Tagdb
|
|
//
|
|
///////////////////////////////////////////////
|
|
|
|
Msg9a::Msg9a () {
|
|
m_requestBuf = NULL;
|
|
m_requests = 0;
|
|
m_replies = 0;
|
|
}
|
|
Msg9a::~Msg9a() { reset(); }
|
|
|
|
void Msg9a::reset() {
|
|
// guard against not waiting for all replies to come in
|
|
if ( m_requests != m_replies && ! g_process.m_exiting ) {
|
|
char *xx=NULL;*xx=0; }
|
|
if ( ! m_requestBuf ) return;
|
|
mfree ( m_requestBuf , m_requestBufSize , "msg9a" );
|
|
m_requestBuf = NULL;
|
|
}
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets errno on error
|
|
// . "urls" is a NULL-terminated list of space-separated urls
|
|
// . if "addTags" is true, then the tags in "tagRec" will be added to the
|
|
// the TagRecs specified by the sites in "sites". if a TagRec
|
|
// does not exist for a given "site" then it will be added just
|
|
// so we can add the Tags to it. If it does exist, we will
|
|
// just append the given Tags to it.
|
|
// . to "delete" a tag, just assign it a dataSize of 0!
|
|
// . Tags added with the same user name and tag type of an existing tag
|
|
// will overwrite it.
|
|
// . you can now optionally supply an array of ptrs to sites, sitePtrs.
|
|
// . you can call this with your "tagRec" on the stack because we copy
|
|
// its contents into our own buffer here
|
|
bool Msg9a::addTags ( char *sites ,
|
|
char **sitePtrs ,
|
|
long numSitePtrs ,
|
|
char *coll ,
|
|
void *state ,
|
|
void (*callback)(void *state) ,
|
|
long niceness ,
|
|
TagRec *tagRec ,
|
|
bool nukeTagRecs ,
|
|
long *ipVector ) {
|
|
|
|
// in case we are being re-used!
|
|
reset();
|
|
|
|
g_errno = 0;
|
|
|
|
// sanity check, one or the other
|
|
if ( sites && sitePtrs ) { char *xx=NULL;*xx=0; }
|
|
|
|
// ipVector only used with sitePtrs for now
|
|
if ( ! sitePtrs && ipVector ) { char *xx=NULL;*xx=0; }
|
|
|
|
// when we add the "site" tag to it use the timestamp from one
|
|
// of the tags we are adding... therefore we must require there be
|
|
// some tags! we do this to insure injection consistency into the
|
|
// "test" collection.
|
|
if ( ! tagRec || tagRec->getNumTags() <= 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// use the first timestamp
|
|
long timestamp = tagRec->getFirstTag()->m_timestamp;
|
|
|
|
// . up to 20 outstanding Msg0 getting the exact TagRec for each site
|
|
// . when we get it we immediately modify it and then add it back
|
|
// using Msg4.
|
|
// . to resolve collisions we could assign a particular hostid
|
|
// to handle adding each site... yeah, how about the local host.
|
|
// . so forward the Msg9a add/del/rpl request to the responsible
|
|
// host. then it can lock the "site" until the add completes.
|
|
// . it should use Msg1 to add it.
|
|
|
|
// reset
|
|
m_errno = 0;
|
|
m_requests = 0;
|
|
m_replies = 0;
|
|
m_niceness = niceness;
|
|
m_state = state;
|
|
m_callback = callback;
|
|
|
|
long collLen = gbstrlen(coll);
|
|
|
|
// how many urls in the sites do we have?
|
|
long numUrls = 0;
|
|
// point to buf
|
|
char *s = sites;
|
|
// count each one
|
|
while ( sites && *s ) {
|
|
// skip whitespace
|
|
while ( *s && is_wspace_a(*s) ) s++;
|
|
// alnum?
|
|
if ( *s ) numUrls++;
|
|
// skip url
|
|
while ( *s && ! is_wspace_a(*s) ) s++;
|
|
}
|
|
if ( sitePtrs )
|
|
numUrls = numSitePtrs;
|
|
|
|
|
|
// how much buf do we need to hold all the requests for all the sites
|
|
long need = 0;
|
|
|
|
|
|
// just a buffer of sites
|
|
if ( sites )
|
|
need += 2 * (gbstrlen(sites) + 1);
|
|
// otherwise, use the site ptrs
|
|
for ( long i = 0 ; i < numSitePtrs ; i++ )
|
|
need += 2 * (gbstrlen(sitePtrs[i]) + 1);
|
|
|
|
// how big is each request's header?
|
|
long header = 0;
|
|
// request size
|
|
header += 4;
|
|
// niceness
|
|
header += 1;
|
|
// collection
|
|
header += collLen + 1;
|
|
// flag
|
|
header += 1;
|
|
// the tag rec
|
|
header += tagRec->getSize();
|
|
// . add ST_SITE to each tagRec
|
|
// . we already accounted for the sites in the gbstrlen() above
|
|
header += sizeof(Tag);
|
|
// one header per url
|
|
need += header * numUrls;
|
|
|
|
// make a request buffer for all the requests
|
|
m_requestBuf = (char *)mmalloc ( need , "msg9a-add");
|
|
if ( ! m_requestBuf ) return true;
|
|
m_requestBufSize = need;
|
|
|
|
// carve it up
|
|
char *p = m_requestBuf;
|
|
// loop over sites
|
|
s = sites;
|
|
// reset sitePtr counter in case we are using those
|
|
long si = 0;
|
|
|
|
//long now = getTimeGlobal();
|
|
|
|
// loop it
|
|
for ( ; ; si++ ) {
|
|
// stop if all done
|
|
if ( sites && ! *s ) break;
|
|
|
|
// or this
|
|
if ( sitePtrs && si >= numSitePtrs ) break;
|
|
// make "s" point to the site if we are using ptrs
|
|
if ( sitePtrs ) s = sitePtrs[si];
|
|
|
|
// skip whitespace
|
|
while ( *s && is_wspace_a(*s) ) s++;
|
|
// skip over http:// (wastes space)
|
|
if ( strncmp(s,"http://",7)==0 ) s += 7;
|
|
// find end of url
|
|
char *send = s;
|
|
while ( *send && ! is_wspace_a(*send)) send++;
|
|
// get the length
|
|
long len = send - s;
|
|
// done? make sure we are using the site buffer and not ptrs
|
|
if ( sites && ! *s ) break;
|
|
// a place holder for the request size
|
|
long *rsizePtr = (long *)p; p += 4;
|
|
// track the size
|
|
char *start = p;
|
|
// first niceness
|
|
*p = niceness; p++;
|
|
// then coll
|
|
memcpy ( p , coll , collLen ); p += collLen;
|
|
// NULL term
|
|
*p++ = '\0';
|
|
// add flag first
|
|
*p = 0x00;
|
|
//if ( deleteTags ) *p = 0x01;
|
|
if ( nukeTagRecs ) *p = 0x02; // delete entire TagRec?
|
|
p++;
|
|
// now make the Tag!
|
|
//TagRec *tagRec = (TagRec *)p;
|
|
// sets its ip special if we should
|
|
long ip = 0;
|
|
if ( ipVector ) ip = ipVector[si];
|
|
// . copy it over
|
|
// . get the size
|
|
long size = tagRec->getSize();
|
|
// add in tagRec
|
|
memcpy ( p , tagRec , size );
|
|
// cat it to p
|
|
TagRec *newgr = (TagRec *)p;
|
|
// NULL terminate it temporarily
|
|
char c = s[len];
|
|
s[len] = 0;
|
|
// . remove the old site so the new one can replace it
|
|
// . we already contain a SITE_TAG and addTag() will NEVER
|
|
// replace that particular tag...
|
|
// . this is now removed above
|
|
//newgr->removeTag ( "site" , NULL );
|
|
// add the site
|
|
//newgr->addTag ( ST_SITE, now,"tagdb",0,s, len+1 );
|
|
newgr->addTag ( "site", timestamp,"tagdb",ip,s, len+1 );
|
|
// undo the NULL termination
|
|
s[len] = c;
|
|
// update the size
|
|
size = newgr->getSize();
|
|
// advance
|
|
p += size;
|
|
// how big was the request, store that
|
|
*rsizePtr = (p - start);
|
|
// advance s
|
|
s = send;
|
|
}
|
|
|
|
|
|
// reset ptr to request to launch
|
|
m_p = m_requestBuf;
|
|
// sanity check
|
|
if ( p - m_requestBuf > need ) { char *xx=NULL;*xx=0; }
|
|
// all done
|
|
m_pend = p;
|
|
// launch them
|
|
if ( ! launchAddRequests () ) return false;
|
|
// hey that should always block!
|
|
if ( ! g_errno ) { char *xx=NULL; *xx=0; }
|
|
// show error
|
|
log("tagdb: msg9a: %s",mstrerror(g_errno));
|
|
// free the allocated mem
|
|
reset();
|
|
// did not block...
|
|
return true;
|
|
}
|
|
|
|
// . "dumpFile" format contains one tag record per line as
|
|
// dumped from './gb dump S main 0 -1 1' cmd line cmd.
|
|
// . it is the format given by the TagRec::printToBuf() cmd
|
|
bool Msg9a::addTags ( char *dumpFile ,
|
|
char *coll ,
|
|
void *state ,
|
|
void (*callback)(void *state) ,
|
|
long niceness ) {
|
|
|
|
g_errno = 0;
|
|
|
|
// reset
|
|
m_errno = 0;
|
|
m_requests = 0;
|
|
m_replies = 0;
|
|
m_niceness = niceness;
|
|
m_state = state;
|
|
m_callback = callback;
|
|
|
|
long collLen = gbstrlen(coll);
|
|
// scan the dump file
|
|
char *p = dumpFile;
|
|
// the end of it
|
|
char *pend = p + gbstrlen(p);
|
|
// add up total sizes
|
|
long sum = 0;
|
|
// end of line ptr
|
|
char *eol;
|
|
// count
|
|
long count = 1;
|
|
// debug
|
|
//HashTable ht;
|
|
// do the scan
|
|
for ( ; p < pend ; p = eol + 1 ) {
|
|
// point to next line
|
|
eol = p; while ( eol < pend && *eol != '\n' ) eol++;
|
|
// a fake tag rec
|
|
TagRec gr;
|
|
// . scan it into "gr"
|
|
// . returns size of the tag rec stored into "buf"
|
|
long bytesScanned = gr.setFromBuf ( p , eol );
|
|
// error?
|
|
if ( bytesScanned <= 0 ) {count++; continue;}
|
|
// get size
|
|
long size = gr.getSize();
|
|
// error?
|
|
if ( size <= 0 ) {count++; continue;}
|
|
//logf(LOG_DEBUG,"tagdb: tag %li size=%li",count++,size);
|
|
// hash it for debug
|
|
//ht.addKey ( count , size );
|
|
count++;
|
|
// sanity check
|
|
if ( size > MAX_TAGREC_SIZE ) { char *xx=NULL;*xx=0;}
|
|
// sanity check
|
|
char *site = gr.getString("site",NULL);
|
|
if ( ! site ) { char *xx=NULL;*xx=0;}
|
|
// then request header size
|
|
size += 4 + 1 + collLen + 1 + 1;
|
|
// increment total size
|
|
sum += size;
|
|
}
|
|
|
|
// make the buf
|
|
m_requestBuf = (char *)mmalloc ( sum , "msg9adbuf");
|
|
m_requestBufSize = sum;
|
|
// store tags here
|
|
char *t = m_requestBuf;
|
|
// return true on error with g_errno set
|
|
if ( ! t ) return true;
|
|
// reset to beginning of file
|
|
p = dumpFile;
|
|
// reset
|
|
count = 1;
|
|
// do the scan
|
|
for ( ; p < pend ; p = eol + 1 ) {
|
|
// point to next line
|
|
eol = p; while ( eol < pend && *eol != '\n' ) eol++;
|
|
// first is the request size
|
|
long *requestSizePtr = (long *)t; t += 4;
|
|
// see how big the request is
|
|
char *a = t;
|
|
// then niceness
|
|
*t++ = (char)MAX_NICENESS;
|
|
// then coll
|
|
memcpy ( t , coll , collLen ); t += collLen;
|
|
// null terminate
|
|
*t++ = '\0';
|
|
// then the 1 byte flag (0 means add?)
|
|
*t++ = 0;
|
|
// store TagRec into the request buffer
|
|
TagRec *gr = (TagRec *)t;
|
|
// . scan it into "t"
|
|
// . returns size of the tag rec stored into "buf"
|
|
long bytesScanned = gr->setFromBuf ( p , eol );
|
|
// error?
|
|
if ( bytesScanned <= 0 ) {
|
|
log("tagdb: skipping tag rec #%li.",count++);
|
|
t -= (4+1+collLen+1+1);
|
|
continue;
|
|
}
|
|
// get size
|
|
long size = gr->getSize();
|
|
// error?
|
|
if ( size <= 0 ) {
|
|
log("tagdb: skipping tag rec #%li.",count++);
|
|
t -= (4+1+collLen+1+1);
|
|
continue;
|
|
}
|
|
// test it
|
|
//long slot = ht.getSlot ( count );
|
|
//if ( slot < 0 ) { char *xx=NULL;*xx=0; }
|
|
//long shouldbe = ht.getValueFromSlot ( slot );
|
|
//if ( size != shouldbe ) { char *xx=NULL;*xx=0; }
|
|
count++;
|
|
//logf(LOG_DEBUG,"tagdb: tag %li size=%li",count++,size);
|
|
// increment storage ptr
|
|
t += size;
|
|
// store the size of the WHOLE REQUEST, does not
|
|
// include the request size itself. see
|
|
// launchRequests() below.
|
|
*requestSizePtr = (t - a);
|
|
// sanity check
|
|
if ( *requestSizePtr > 10000 ) { char*xx=NULL;*xx=0;}
|
|
}
|
|
// sanity check
|
|
if ( t - m_requestBuf != sum ) { char *xx=NULL;*xx=0; }
|
|
// use their ptrs for adding these tag recs
|
|
m_p = m_requestBuf;
|
|
m_pend = m_requestBuf + m_requestBufSize ;
|
|
// now add those tags
|
|
return launchAddRequests ( );
|
|
}
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets g_errno and returns true on error
|
|
bool Msg9a::launchAddRequests ( ) {
|
|
// clear it
|
|
g_errno = 0;
|
|
loop:
|
|
// return true if nothing to launch
|
|
if ( m_p >= m_pend ) return (m_requests == m_replies);
|
|
// don't bother if already got an error
|
|
if ( m_errno ) return (m_requests == m_replies);
|
|
// limit max outstanding to 20
|
|
if (m_requests - m_replies >= 20 ) return (m_requests==m_replies);
|
|
// take a breath
|
|
QUICKPOLL(m_niceness);
|
|
|
|
// parse our request
|
|
char *p = m_p;
|
|
// first is the request size
|
|
p += 4;
|
|
// then niceness
|
|
p += 1;
|
|
// then coll
|
|
p += gbstrlen(p) + 1;
|
|
// then the 1 byte flag
|
|
p++;
|
|
// then the tag rec
|
|
TagRec *tagRec = (TagRec *)p;
|
|
// . get the groupid
|
|
// . tagRec's key should already be valid because when you add
|
|
// a ST_SITE to a TagRec it sets TagRec::m_key (special thing)
|
|
//unsigned long groupId = g_tagdb.getGroupId ( &tagRec->m_key );
|
|
uint32_t shardNum = getShardNum ( RDB_TAGDB , &tagRec->m_key );
|
|
// get the host to send to
|
|
Host *hosts = g_hostdb.getGroup ( groupId );
|
|
// select a host in the group
|
|
long hostNum = tagRec->m_key.n1 % g_hostdb.getNumHostsPerShard();
|
|
// and his ptr
|
|
Host *h = &hosts[hostNum];
|
|
|
|
// get the next mcast
|
|
//Multicast *m = &m_casts[m_requests];
|
|
// request size
|
|
long requestSize = *(long *)m_p; m_p += 4;
|
|
char *request = m_p; m_p += requestSize;
|
|
|
|
// . send to just one very specific host so he is the only one that
|
|
// controls modification to this particular tagdb rec. that way if
|
|
// we are changing its Tags we do not collide with another.
|
|
// . this returns false and sets g_errno on error
|
|
UdpServer *us = &g_udpServer;
|
|
bool status = us->sendRequest ( request ,
|
|
requestSize ,
|
|
0x9a ,
|
|
h->m_ip , // bestIp
|
|
h->m_port , // destPort
|
|
h->m_hostId , // hostId
|
|
NULL , // slotPtr
|
|
this , // state
|
|
gotReplyWrapper9a , // callback
|
|
365*24*3600 , // timeout
|
|
-1 , // backoff
|
|
-1 , // max wait in ms
|
|
NULL , // replybuf
|
|
0 , // replybufMaxSize
|
|
m_niceness );
|
|
// error?
|
|
if ( ! status ) {
|
|
// g_errno should be set, we had an error
|
|
m_errno = g_errno;
|
|
return (m_requests == m_replies);
|
|
}
|
|
// successfully launched
|
|
m_requests++;
|
|
// launch another
|
|
goto loop;
|
|
}
|
|
|
|
void gotReplyWrapper9a ( void *state , UdpSlot *slot ) {
|
|
Msg9a *THIS = (Msg9a *) state;
|
|
THIS->m_replies++;
|
|
// don't let him free our send buf, it is m_requestBuf
|
|
// which we allocated above
|
|
slot->m_sendBufAlloc = NULL;
|
|
// error? if so, save it
|
|
if ( g_errno && ! THIS->m_errno ) THIS->m_errno = g_errno;
|
|
if ( ! THIS->launchAddRequests() ) return;
|
|
// free the allocated mem
|
|
THIS->reset();
|
|
THIS->m_callback ( THIS->m_state );
|
|
}
|
|
|
|
class State9a {
|
|
public:
|
|
UdpSlot *m_slot;
|
|
Msg5 m_msg5;
|
|
char m_requestType;
|
|
Msg1 m_msg1;
|
|
RdbList m_list;
|
|
// this has all the tags we need to add/remove/replace
|
|
TagRec *m_tagRec;
|
|
// this has the original tagRec and we modify it with "m_tagRec"
|
|
// to get the final TagRec we add back to Tagdb. it is the
|
|
// "accumulator" tagdb record.
|
|
TagRec m_accRec;
|
|
// enough mem to store a key_t and a 0 dataSize (long)
|
|
char m_tmp[12+4];
|
|
|
|
char m_niceness;
|
|
char *m_coll;
|
|
|
|
// linked list of ppl waiting in line to make mods
|
|
class State9a *m_next;
|
|
//class State9a *m_tail;
|
|
};
|
|
|
|
void handleRequest9a ( UdpSlot *slot , long niceness ) {
|
|
// get the request
|
|
char *request = slot->m_readBuf;
|
|
long requestSize = slot->m_readBufSize;
|
|
// overflow protection for corrupt requests
|
|
if ( requestSize < 4 ) {
|
|
g_errno = EBUFTOOSMALL;
|
|
g_udpServer.sendErrorReply ( slot , g_errno );
|
|
return;
|
|
}
|
|
// make a new Msg9a
|
|
State9a *st ;
|
|
try { st = new (State9a); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
log("msg9a: new(%i): %s", sizeof(State9a), mstrerror(g_errno));
|
|
return g_udpServer.sendErrorReply ( slot, g_errno );
|
|
}
|
|
mnew ( st , sizeof(State9a) , "Msg10" );
|
|
|
|
// parse the request
|
|
char *p = request;
|
|
// save slot for sending reply
|
|
st->m_slot = slot;
|
|
// get niceness
|
|
st->m_niceness = *(char *)p; p++;
|
|
// get coll
|
|
st->m_coll = p; p += gbstrlen(p) + 1;
|
|
// save this
|
|
st->m_requestType = *p; p++;
|
|
// the "tagRec" is the record
|
|
TagRec *tagRec = (TagRec *)p; p += tagRec->getSize();
|
|
// store ptr
|
|
st->m_tagRec = tagRec;
|
|
// reset this, we are the head/tail of the linked list so far
|
|
st->m_next = NULL;
|
|
|
|
// sanity check
|
|
//char *site = tagRec->getString(ST_SITE,NULL);
|
|
char *site = tagRec->getString("site",NULL);
|
|
// this is a no-no
|
|
if ( ! site ) { char *xx=NULL;*xx=0;}
|
|
|
|
// no tail after us
|
|
//st->m_tail = NULL;
|
|
|
|
// . get the lock on this site
|
|
// . the lower 64 bits of the key should be the url hash
|
|
long slotNum = s_lockTable2.getSlot ( &st->m_tagRec->m_key.n0 );
|
|
// if already in there, we have to wait because someone is already
|
|
// making mods to this TagRec
|
|
if ( slotNum >= 0 ) {
|
|
// log this for now?
|
|
if ( g_conf.m_logDebugSpider )
|
|
logf(LOG_DEBUG,"tagdb: TAGDB handleRequest9a "
|
|
"waiting for lock st=0x%lx key.n0=%llu",(long)st,
|
|
st->m_tagRec->m_key.n0);
|
|
State9a *p ;
|
|
p = *(State9a **)s_lockTable2.getValueFromSlot(slotNum);
|
|
// put us right after him in the linked list
|
|
st->m_next = p->m_next;
|
|
p->m_next = st;
|
|
// we could be the next in line
|
|
//if ( ! p->m_next ) p->m_next = st;
|
|
// we wait...
|
|
return;
|
|
}
|
|
|
|
// add ourselves to the lock table to take the lock on this site
|
|
if ( ! s_lockTable2.addKey ( &st->m_tagRec->m_key.n0 , &st ) ) {
|
|
log("tagdb: failed to get lock : %s",mstrerror(g_errno));
|
|
// free him, we sent his reply
|
|
mdelete ( st , sizeof(State9a),"msg9afr");
|
|
delete (st);
|
|
return g_udpServer.sendErrorReply ( slot, g_errno );
|
|
}
|
|
|
|
// make a startKey and endKey from the tagRec's key
|
|
key_t startKey = tagRec->m_key;
|
|
key_t endKey = tagRec->m_key;
|
|
// startkey gets its low bit cleared though
|
|
startKey.n0 &= 0xfffffffffffffffeLL;
|
|
|
|
// delete record request, no need to look it up
|
|
if ( st->m_requestType == 0x02 ) {
|
|
// note it
|
|
SafeBuf sb; tagRec->printToBuf ( &sb );
|
|
log("tagdb: deleting TagRec for site %s",sb.getBufStart());
|
|
// use tmp buf in st
|
|
char *p = st->m_tmp;
|
|
// store key in the tmp buf
|
|
*(key_t *)p = startKey;
|
|
// advance
|
|
p += sizeof(key_t);
|
|
// and store the data size
|
|
*(long *)p = 0;
|
|
// advance
|
|
p += 4;
|
|
// set the list (just a negative rec in it)
|
|
st->m_list.set ( st->m_tmp , // list
|
|
4+sizeof(key_t) , // listSize
|
|
st->m_tmp , // alloc
|
|
4+sizeof(key_t) , // allocSize
|
|
(char *)&startKey , // startKey
|
|
(char *)&endKey , // endKey
|
|
-1 , // fixeDataSize
|
|
false , // ownData?
|
|
false , // useHalfKeys?
|
|
sizeof(key_t) );// keySize
|
|
|
|
if ( ! st->m_msg1.addList( &st->m_list ,
|
|
RDB_TAGDB ,
|
|
st->m_coll ,
|
|
st ,
|
|
sendReply9a ,
|
|
false , // forceLocal?
|
|
st->m_niceness ))
|
|
// return if blocked
|
|
return;
|
|
sendReply9a( st );
|
|
return;
|
|
}
|
|
|
|
// . get from msg5, return if it blocked
|
|
// . will probably not block since in the disk page cache a lot
|
|
if ( ! st->m_msg5.getList ( RDB_TAGDB ,
|
|
st->m_coll ,
|
|
&st->m_list ,
|
|
startKey ,
|
|
endKey ,
|
|
100000 , // minRecSizes
|
|
true , // include tree?
|
|
false , // addtocache?
|
|
0 , // maxcacheage
|
|
0 , // startfilenum
|
|
-1 , // numFiles
|
|
st ,
|
|
gotList ,
|
|
st->m_niceness ,
|
|
true ))// do err correction?
|
|
return;
|
|
// log that for debug
|
|
//log("tagdb: msg5 call did not block. st=%lu",(long)st);
|
|
// sanity check - why not block if it had corruption?
|
|
if ( st->m_msg5.m_msg3.m_hadCorruption ) { char *xx=NULL;*xx=0; }
|
|
// it did not block...
|
|
gotList( st , NULL , NULL );
|
|
}
|
|
|
|
void gotList ( void *state , RdbList *xxx , Msg5 *yyy ) {
|
|
// cast our state class
|
|
State9a *st = (State9a *)state;
|
|
// return right away if error getting the rec
|
|
if ( g_errno ) { sendReply9a ( st ); return; }
|
|
// note it
|
|
//log("tagdb: in gotlist st=%lu",(long)st);
|
|
// this is the TagRec rdb record
|
|
char *rec = st->m_list.getList ();
|
|
long recSize = st->m_list.getListSize();
|
|
// cast it as a TagRec
|
|
TagRec *accRec = &st->m_accRec;
|
|
// reset in case not in tagdb and rec/recSize is NULL/0
|
|
accRec->reset();
|
|
// copy it to our accumulator rec which has room to grow, the list
|
|
// does not
|
|
memcpy ( (char *)accRec , rec , recSize );
|
|
// free that list buffer now, we copied it into a larger buffer
|
|
st->m_list.reset();
|
|
|
|
loop:
|
|
// clear it
|
|
g_errno = 0;
|
|
// . add/remove the tags from the tagRec
|
|
// . add will replace tags with the same tag id and username
|
|
// . should deal with "negative" tags (addDelTag())
|
|
//if ( st->m_requestType == 0x00 ) accRec->addTags ( st->m_tagRec );
|
|
//else accRec->removeTags ( st->m_tagRec );
|
|
accRec->addTags ( st->m_tagRec );
|
|
// was there an error? abandon all operations on this TagRec if so
|
|
if ( g_errno ) { sendReply9a ( st ); return; }
|
|
// perform operations on others in the queue
|
|
st = st->m_next;
|
|
// debug for now
|
|
if ( st && g_conf.m_logDebugSpider )
|
|
logf(LOG_DEBUG,"tagdb: calling lock for st=0x%lx",(long)st);
|
|
// if there was one, do it
|
|
if ( st ) goto loop;
|
|
// reset to original parent
|
|
st = (State9a *)state;
|
|
// debug msg
|
|
SafeBuf sb; accRec->printToBuf ( &sb );
|
|
log(LOG_DEBUG,"tagdb: adding to tagdb: %s",sb.getBufStart());
|
|
|
|
// set the list, it should free itself
|
|
st->m_list.set ( (char *)accRec , // list
|
|
accRec->getSize() , // listSize
|
|
(char *)accRec , // alloc
|
|
accRec->getSize() , // allocSize
|
|
(char *)&accRec->m_key , // startKey
|
|
(char *)&accRec->m_key , // endKey
|
|
-1 , // fixeDataSize
|
|
false , // ownData?
|
|
false , // useHalfKeys?
|
|
sizeof(key_t) );// keySize
|
|
|
|
// add it back after the mods
|
|
if ( ! st->m_msg1.addList( &st->m_list ,
|
|
RDB_TAGDB ,
|
|
st->m_coll ,
|
|
st ,
|
|
sendReply9a ,
|
|
false , // forceLocal?
|
|
MAX_NICENESS ))// niceness
|
|
return;
|
|
// i guess we did not block! send back the reply...
|
|
sendReply9a ( st );
|
|
}
|
|
|
|
void sendReply9a ( void *state ) {
|
|
// cast our state class
|
|
State9a *st = (State9a *)state;
|
|
// delete our slot from the lock table
|
|
s_lockTable2.removeKey ( &st->m_tagRec->m_key.n0 );
|
|
// log it
|
|
if (g_errno) log("tagdb: msg9a failed to add: %s",mstrerror(g_errno));
|
|
// save it, in case a function below clears g_errno
|
|
long saved = g_errno;
|
|
|
|
loop:
|
|
if ( saved ) g_udpServer.sendErrorReply( st->m_slot,saved);
|
|
// send empty reply
|
|
else g_udpServer.sendReply_ass(NULL,0,NULL,0,st->m_slot);
|
|
// save old guy
|
|
State9a *next = st->m_next;
|
|
// free him, we sent his reply
|
|
mdelete ( st , sizeof(State9a),"msg9afr");
|
|
delete (st);
|
|
// repeat for each guy waiting in line
|
|
st = next;
|
|
// if there was one, do it
|
|
if ( st ) goto loop;
|
|
// reset to original parent
|
|
st = (State9a *)state;
|
|
}
|
|
*/
|
|
|
|
///////////////////////////////////////////////
|
|
//
|
|
// OTHER functions
|
|
//
|
|
///////////////////////////////////////////////
|
|
|
|
// . piecewise-linear lookup: map "X" to a Y value using the "n" control
//   points (x[i],y[i])
// . the x[] values are expected to be in ascending order
// . if X is at or before the first point we return y[0], if X is after
//   the last point we return y[n-1], no extrapolation is done
// . otherwise we linearly interpolate between the two points that
//   bracket X using integer math (the division truncates toward zero)
// . returns 0 if there are no points at all (n <= 0)
long getY ( long long X , long long *x , long long *y , long n ) {
	// nothing to look up if we have no points, do not touch y[0]
	if ( n <= 0 ) return 0;
	// if we only have one point then there'll be no interpolation
	if ( n == 1 ) return y[0];
	// find the first x at or after our "X"
	long j;
	for ( j = 0 ; j < n ; j++ ) if ( x[j] >= X ) break;
	// before/after first/last point means we don't have to interpolate
	if ( j <= 0 ) return y[0  ];
	if ( j >= n ) return y[n-1];
	// . linear interpolate between our 2 points (x0,y0) and (x1,y1)
	// . "j" is the FIRST index with x[j] >= X and j > 0, so we know
	//   x[j-1] < X <= x[j]. that means x1 - x0 is always positive here
	//   and the divide below can never be a divide by zero.
	long long x0 = x[j-1];
	long long x1 = x[j  ];
	long long y0 = y[j-1];
	long long y1 = y[j  ];
	// we have a sloping (or flat) line, walk along it from (x0,y0)
	return y0 + ( (X - x0) * (y1 - y0) ) / (x1 - x0) ;
}
|
|
|
|
///////////////////////////////////////////////
|
|
//
|
|
// sendPageTagdb() is the HTML interface to tagdb
|
|
//
|
|
///////////////////////////////////////////////
|
|
|
|
static void sendReplyWrapper ( void *state ) ;
|
|
static void sendReplyWrapper2 ( void *state ) ;
|
|
static bool sendReply ( void *state ) ;
|
|
static bool sendReply2 ( void *state ) ;
|
|
static bool getTagRec ( class State12 *st );
|
|
|
|
// don't change name to "State" cuz that might conflict with another
|
|
class State12 {
|
|
public:
|
|
//Msg9a m_msg9a;
|
|
TcpSocket *m_socket;
|
|
bool m_adding;
|
|
char *m_coll;
|
|
//long m_collLen;
|
|
//char *m_buf;
|
|
//long m_bufLen;
|
|
bool m_isLocal;
|
|
//long m_fileNum;
|
|
//bool m_isAdmin;
|
|
//bool m_isAssassin;
|
|
// . Commented by Gourav
|
|
// . Reason:user perm no longer used
|
|
//char m_userType;
|
|
HttpRequest m_r;
|
|
//char *m_username;
|
|
TagRec m_tagRec;
|
|
TagRec m_newtr;
|
|
Msg8a m_msg8a;
|
|
Url m_url;
|
|
char *m_urls;
|
|
long m_urlsLen;
|
|
Msg1 m_msg1;
|
|
RdbList m_list;
|
|
//Msg1 m_msg1;
|
|
long m_niceness;
|
|
bool m_mergeTags;
|
|
//char m_tmp[16];
|
|
};
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets g_errno on error
|
|
// . make a web page displaying the tagdb interface
|
|
// . call g_httpServer.sendDynamicPage() to send it
|
|
// . show a textarea for sites, then list all the different site tags
|
|
// and have an option to add/delete them
|
|
bool sendPageTagdb ( TcpSocket *s , HttpRequest *req ) {
|
|
// are we the admin?
|
|
//bool isAdmin = g_collectiondb.isAdmin ( req , s );
|
|
// get the collection record
|
|
CollectionRec *cr = g_collectiondb.getRec ( req );
|
|
if ( ! cr ) {
|
|
g_errno = ENOCOLLREC;
|
|
log("admin: No collection record found "
|
|
"for specified collection name. Could not add sites to "
|
|
"tagdb. Returning HTTP status of 500.");
|
|
return g_httpServer.sendErrorReply ( s , 500 ,
|
|
"collection does not exist");
|
|
}
|
|
/*
|
|
bool isAssassin = cr->isAssassin ( s->m_ip );
|
|
if ( isAdmin ) isAssassin = true;
|
|
// bail if permission denied
|
|
if ( ! isAssassin ){
|
|
//&& ! cr->hasPermission ( req , s ) ) {
|
|
log("admin: Bad collection name or password. Could not add "
|
|
"sites to tagdb. Permission denied.");
|
|
return sendPageLogin ( s , req ,
|
|
"Collection name or "
|
|
"password is incorrect");
|
|
}
|
|
*/
|
|
// make a state
|
|
State12 *st ;
|
|
try { st = new (State12); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
log("PageTagdb: new(%i): %s",
|
|
sizeof(State12),mstrerror(g_errno));
|
|
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));}
|
|
mnew ( st , sizeof(State12) , "PageTagdb" );
|
|
//st->m_isAdmin = isAdmin;
|
|
//st->m_isAssassin = isAssassin;
|
|
// . Commented by Gourav
|
|
// . Reason:user perm no longer used
|
|
//st->m_userType = g_pages.getUserType ( s , req );
|
|
// assume we've nothing to add
|
|
st->m_adding = false;
|
|
// save the socket
|
|
st->m_socket = s;
|
|
// i guess this is nuked, so copy it
|
|
st->m_r.copy ( req );
|
|
// make it high priority
|
|
st->m_niceness = 0;
|
|
// point to it
|
|
HttpRequest *r = &st->m_r;
|
|
|
|
// get the collection
|
|
long collLen = 0;
|
|
char *coll = r->getString ( "c" , &collLen , NULL /*default*/);
|
|
// get collection rec
|
|
CollectionRec *cr2 = g_collectiondb.getRec ( coll );
|
|
// bitch if no collection rec found
|
|
if ( ! cr2 || ! coll || collLen+1 > MAX_COLL_LEN ) {
|
|
g_errno = ENOCOLLREC;
|
|
log("admin: No collection record found "
|
|
"for specified collection name. Could not add sites to "
|
|
"tagdb. Returning HTTP status of 500.");
|
|
mdelete ( st , sizeof(State12) , "PageTagdb" );
|
|
delete (st);
|
|
return g_httpServer.sendErrorReply ( s , 500 ,
|
|
"collection does not exist");
|
|
}
|
|
|
|
// . get fields from cgi field of the requested url
|
|
// . get the null-terminated, space-separated lists of sites to add
|
|
long urlsLen = 0;
|
|
char *urls = r->getString ( "u" , &urlsLen , NULL /*default*/);
|
|
|
|
//a quick hack so we can put multiple sites in a link
|
|
if(r->getLong("uenc", 0))
|
|
for(long i = 0; i < urlsLen; i++)
|
|
if(urls[i] == '+') urls[i] = '\n';
|
|
// get the file # of the tagdb file these sites should use
|
|
//long fileNum = r->getLong ("f",-1);
|
|
// get the archive filename of sites to add
|
|
/*
|
|
long xlen;
|
|
char *x = r->getString("x",&xlen,NULL);
|
|
// trim off any spaces
|
|
while ( xlen > 0 && is_wspace_a(x[xlen-1]) ) x[--xlen]='\0';
|
|
*/
|
|
// . get the username
|
|
// . just get from cookie so it is not broadcast over the web via a
|
|
// referral url
|
|
//st->m_username = r->getStringFromCookie("username");
|
|
//st->m_username = g_users.getUsername(r);
|
|
|
|
// are we coming from a local machine?
|
|
st->m_isLocal = r->isLocal();
|
|
/*
|
|
// don't set this unless we have to free it
|
|
st->m_buf = NULL;
|
|
st->m_bufLen = 0;
|
|
// . set our archive filename of sites to add with this fileNum
|
|
// . "a" will be NULL if none supplied
|
|
if ( xlen ) {
|
|
File file;
|
|
file.set ( x );
|
|
// add 1 to bufLen for terminating \0
|
|
long bufLen = file.getFileSize() + 1 ;
|
|
char *buf = (char *) mmalloc ( bufLen , "PageTagdb");
|
|
if ( ! buf ) {
|
|
log("admin: File of sites is too big to add to tagdb."
|
|
" Allocation of %li bytes failed.",bufLen);
|
|
mdelete ( st , sizeof(State12) , "PageTagdb" );
|
|
delete (st);
|
|
return g_httpServer.sendErrorReply(s,500,
|
|
mstrerror(g_errno));
|
|
}
|
|
file.open(O_RDONLY);
|
|
file.read ( buf , bufLen - 1 , 0 );
|
|
// NULL terminate the list of urls
|
|
buf [ bufLen - 1 ] = '\0';
|
|
st->m_buf = buf;
|
|
st->m_bufLen = bufLen ;
|
|
urls = buf;
|
|
urlsLen = bufLen;
|
|
}
|
|
*/
|
|
// it references into the request, should be ok
|
|
st->m_coll = coll;
|
|
//st->m_collLen = collLen;
|
|
//strcpy ( st->m_coll , coll );
|
|
// do not print "(null)" in the textarea
|
|
if ( ! urls ) urls = "";
|
|
|
|
// the url buffer
|
|
st->m_urls = urls;
|
|
st->m_urlsLen = urlsLen;
|
|
|
|
// sanity check
|
|
//bool delOp = r->getLong ("delop",0 );
|
|
//char *nuke = r->getString ("nuke" ,NULL );
|
|
//if ( nuke && ! delOp ) {
|
|
// g_errno = EBADENGINEER;
|
|
// log("tagdb: delete operation checkbox not checked.");
|
|
// mdelete ( st , sizeof(State12) , "PageTagdb" );
|
|
// delete (st);
|
|
// return g_httpServer.sendErrorReply(s,500,
|
|
// mstrerror(g_errno));
|
|
//}
|
|
|
|
long ufuLen;
|
|
char *ufu = r->getString("ufu",&ufuLen);
|
|
|
|
if ( urls[0] == '\0' && ! ufu ) return sendReply ( st );
|
|
|
|
char *get = r->getString ("get",NULL );
|
|
// this is also a get operation but merges the tags from all TagRecs
|
|
char *merge = r->getString("tags",NULL);
|
|
|
|
// is this an add/update operation? or just get?
|
|
if ( get || merge ) st->m_adding = false;
|
|
else st->m_adding = true;
|
|
|
|
|
|
// if each line in the file is the output of a tagdb dump
|
|
// operation on the cmd line like this:
|
|
// k.n1=0x892f9 k.n0=0xac2ff39f8112b71f version=0 TAG=ruleset,
|
|
// "mwells",1,Jan-02-2009-18:26:04,333333333,67.16.94.2,3735437892,36
|
|
// THEN we should just call msg9a directly and it should create
|
|
// a tag rec for each line and add that
|
|
/*
|
|
bool isDumpFile = false;
|
|
if ( urls && strncmp(urls,"k.n1=",5)==0 ) isDumpFile = true;
|
|
if ( isDumpFile ) {
|
|
if ( ! st->m_msg9a.addTags ( st->m_urls , // dumpFile
|
|
st->m_coll ,
|
|
st ,
|
|
sendReplyWrapper2 ,
|
|
0 ))// niceness
|
|
return false;
|
|
return sendReply2 ( st );
|
|
}
|
|
*/
|
|
|
|
// get/merge operations can skip the tag rec lookup
|
|
//if ( ! st->m_adding ) return sendReply ( st );
|
|
|
|
// regardless, we have to get the tagrec for all operations
|
|
//Url site;
|
|
//site.set(urls,gbstrlen(urls));
|
|
st->m_url.set(urls,gbstrlen(urls));
|
|
st->m_mergeTags = merge;
|
|
|
|
return getTagRec ( st );
|
|
}
|
|
|
|
// . looks up the TagRec for st->m_url in st->m_coll and stores it in
//   st->m_tagRec, using st->m_msg8a
// . returns false if msg8a returned false; sendReplyWrapper is the
//   callback handed to it for that case
// . otherwise continues with sendReply() and returns its result
bool getTagRec ( State12 *st ) {
	// the merge ("tags") request doubles as the inheritance switch
	bool inherit  = st->m_mergeTags;
	// always read from tagdb
	char whichRdb = RDB_TAGDB;

	bool done = st->m_msg8a.getTagRec ( &st->m_url     ,
					    // tell msg8a to try to guess
					    // the site
					    NULL           ,
					    st->m_coll     ,
					    false          , // skip dom lookup?
					    st->m_niceness ,
					    st             ,
					    sendReplyWrapper ,
					    &st->m_tagRec  ,
					    inherit        ,
					    whichRdb       );
	// got it without blocking, carry on right away
	if ( done ) return sendReply ( st );
	// otherwise wait for the callback
	return false;
}
|
|
|
|
// . callback given to msg8a in getTagRec()
// . just forwards the opaque state to sendReply(); its return value is
//   of no use to the caller of a callback, so it is dropped
void sendReplyWrapper ( void *state ) {
	(void) sendReply ( state );
}
|
|
|
|
// . callback given to msg1 in sendReply()
// . the tags were just changed, so re-get them from msg8a; getTagRec()
//   then leads back into sendReply()
static void sendReplyWrapper2 ( void *state ) {
	getTagRec ( (State12 *)state );
}
|
|
|
|
bool sendReply ( void *state ) {
|
|
|
|
// get our state class
|
|
State12 *st = (State12 *) state;
|
|
// get the request
|
|
HttpRequest *r = &st->m_r;
|
|
// and socket
|
|
TcpSocket *s = st->m_socket;
|
|
// the tagrec
|
|
//TagRec *gr = &st->m_tagRec;
|
|
// reset "gr" so it won't show the old tags of the first rec
|
|
// in the text area box on the tagdb page after the add is completed
|
|
//if ( st->m_adding ) gr->reset();
|
|
|
|
// . if urlsLen <= 0 or fileNum < 0 and we're not deleting
|
|
// . then we've nothing to add
|
|
//if ( urlsLen <= 0 ) return sendReply ( st );
|
|
|
|
// need a valid username
|
|
//if ( ! st->m_username || st->m_username[0] == '\0' ) {
|
|
// log("tagdb: bad username.");
|
|
// mdelete ( st , sizeof(State12) , "PageTagdb" );
|
|
// delete (st);
|
|
// return g_httpServer.sendErrorReply(s,500,
|
|
// mstrerror(g_errno));
|
|
//}
|
|
|
|
if ( ! st->m_adding ) return sendReply2 ( st );
|
|
|
|
//char *nuke = r->getString ("nuke" ,NULL );
|
|
|
|
TagRec *newtr = &st->m_newtr;
|
|
// update it from the http request
|
|
newtr->setFromHttpRequest ( r , s );
|
|
// but remove the site tag
|
|
//newtr.removeTags ( "site" , NULL );
|
|
// add it into gr
|
|
//gr->addTags ( &newtr );
|
|
// copy it over to our state
|
|
//memcpy ( gr , &newtr , newtr.getSize() );
|
|
|
|
// debug
|
|
// this doesn't work because we do not set TagRec::m_listPtrs[0]
|
|
// to point to the list we make below (MDW 4/29/13)
|
|
//SafeBuf tmp;
|
|
//newtr->printToBuf ( &tmp );
|
|
//log(LOG_DEBUG,"tagdb: converted from http: %s",
|
|
// tmp.getBufStart() );
|
|
|
|
// make a startKey and endKey from the tagRec's key
|
|
//key_t startKey = gr->m_key;
|
|
//key_t endKey = gr->m_key;
|
|
// startkey gets is low bit cleared though
|
|
//startKey.n0 &= 0xfffffffffffffffeLL;
|
|
|
|
/*
|
|
// add using msg9a
|
|
if ( ! st->m_msg9a.addTags ( st->m_urls ,
|
|
NULL , // sitePtrs
|
|
0 , // numSitePtrs
|
|
st->m_coll ,
|
|
st ,
|
|
sendReplyWrapper2 ,
|
|
0 , // niceness
|
|
&newtr , // gr
|
|
nuke ,
|
|
NULL )) // ipvec
|
|
return false;
|
|
*/
|
|
|
|
// shrotcut
|
|
SafeBuf *sbuf = &newtr->m_sbuf;
|
|
// use the list we got
|
|
RdbList *list = &st->m_list;
|
|
key128_t startKey;
|
|
key128_t endKey;
|
|
startKey.setMin();
|
|
endKey.setMax();
|
|
// set it from safe buf
|
|
list->set ( sbuf->getBufStart() ,
|
|
sbuf->length() ,
|
|
NULL ,
|
|
0 ,
|
|
(char *)&startKey ,
|
|
(char *)&endKey ,
|
|
-1 ,
|
|
false ,
|
|
false ,
|
|
sizeof(key128_t) );
|
|
|
|
// no longer adding
|
|
st->m_adding = false;
|
|
|
|
// . just use TagRec::m_msg1 now
|
|
// . no, can't use that because tags are added using SafeBuf::addTag()
|
|
// which first pushes the rdbid, so we gotta use msg4
|
|
if ( ! st->m_msg1.addList ( list ,
|
|
RDB_TAGDB ,
|
|
st->m_coll ,
|
|
st ,
|
|
sendReplyWrapper2 ,
|
|
false ,
|
|
st->m_niceness ) )
|
|
return false;
|
|
|
|
// . if addTagRecs() doesn't block then sendReply right away
|
|
// . this returns false if blocks, true otherwise
|
|
//return sendReply2 ( st );
|
|
return getTagRec ( st );
|
|
}
|
|
|
|
bool sendReply2 ( void *state ) {
|
|
|
|
// get our state class
|
|
State12 *st = (State12 *) state;
|
|
// get the request
|
|
HttpRequest *r = &st->m_r;
|
|
// and socket
|
|
TcpSocket *s = st->m_socket;
|
|
|
|
// page is not more than 32k
|
|
char buf[1024*32];
|
|
SafeBuf sb(buf, 1024*32);
|
|
// do they want an xml reply?
|
|
if( r->getLong("xml",0) ) { // was "raw"
|
|
sb.safePrintf("<?xml version=\"1.0\" "
|
|
"encoding=\"ISO-8859-1\"?>\n"
|
|
"<response>\n");
|
|
|
|
st->m_tagRec.printToBufAsXml(&sb);
|
|
|
|
sb.safePrintf("</response>");
|
|
log ( LOG_INFO,"sending raw page###\n");
|
|
// clear g_errno, if any, so our reply send goes through
|
|
g_errno = 0;
|
|
// extract the socket
|
|
TcpSocket *s = st->m_socket;
|
|
// . nuke the state
|
|
// . first free the buffer, if non-NULL
|
|
//if (st->m_buf) mfree (st->m_buf, st->m_bufLen, "PageTagdb");
|
|
mdelete(st, sizeof(State12), "PageTagdb");
|
|
delete (st);
|
|
// . send this page
|
|
// . encapsulates in html header and tail
|
|
// . make a Mime
|
|
return g_httpServer.sendDynamicPage(s, sb.getBufStart(),
|
|
sb.length(),
|
|
0, false, "text/xml",
|
|
-1, NULL, "ISO-8859-1");
|
|
}
|
|
// . print standard header
|
|
// . do not print big links if only an assassin, just print host ids
|
|
g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r );
|
|
// did we add some sites???
|
|
if ( st->m_adding ) {
|
|
// if there was an error let them know
|
|
if ( g_errno )
|
|
sb.safePrintf("<center>Error adding site(s): <b>"
|
|
"%s[%i]</b><br><br></center>\n",
|
|
mstrerror(g_errno) , g_errno );
|
|
else sb.safePrintf ("<center><b><font color=red>"
|
|
"Sites added successfully"
|
|
"</font></b><br><br></center>\n");
|
|
}
|
|
|
|
//char *c = st->m_coll;
|
|
char bb [ MAX_COLL_LEN + 60 ];
|
|
bb[0]='\0';
|
|
|
|
// print interface to add sites
|
|
sb.safePrintf (
|
|
"<table width=100%% bgcolor=#%s border=1 cellpadding=4>"
|
|
"<tr><td bgcolor=#%s colspan=21>"
|
|
"<center><font size=+1><b>Tagdb</b>%s</font></center>"
|
|
"</td></tr>", LIGHT_BLUE , DARK_BLUE , bb );
|
|
|
|
// sometimes we add a huge # of urls, so don't display them because
|
|
// it like freezes the silly browser
|
|
char *uu = st->m_urls;
|
|
if ( st->m_urlsLen > 100000 ) uu = "";
|
|
|
|
sb.safePrintf ( "<tr><td colspan=21>");
|
|
// text area for adding space separated sites/urls
|
|
//char *pp = "put sites here";
|
|
//char *pp = "";
|
|
//if ( st->m_bufLen > 0 ) pp = st->m_buf; // no, print out "urls"
|
|
sb.safePrintf ("<center>"
|
|
"<br>"
|
|
"<textarea rows=16 cols=64 name=u>"
|
|
"%s</textarea><br><br>" , uu );
|
|
|
|
// spam assassins should not use this much power, too risky
|
|
//if ( st->m_isAdmin ) {
|
|
// sb.safePrintf ("<i><font size=-1>Note: use 1.2.3.<b>0</b> to "
|
|
// "specify ip domain.</i><br>");
|
|
//}
|
|
|
|
// allow filename to load them from
|
|
//if ( st->m_isAdmin ) {
|
|
sb.safePrintf("or specify a file of them: <input name=ufu "
|
|
"type=text size=40><br>"
|
|
"<i>file can also be dumped output of "
|
|
"tagdb from the <b>gb dump S ...</b> "
|
|
"command.</i>"
|
|
"<br><br>" );
|
|
//}
|
|
|
|
// this is applied to every tag that is added for accountability
|
|
sb.safePrintf("<br>Username: <input name=username type=text size=6 "
|
|
"value=\"admin\"> " );//,st->m_username);
|
|
|
|
// as a safety, this must be checked for any delete operation
|
|
sb.safePrintf (" delete operation<input type=\"checkbox\" "
|
|
"value=\"1\" name=\"delop\"><br>");
|
|
|
|
// close up
|
|
sb.safePrintf ("<br><center>"
|
|
|
|
// this is merge all by default right now but since
|
|
// zak is really only using eventtaghashxxxx.com we
|
|
// should be ok
|
|
"<input type=submit name=get "
|
|
"value=\"get tags\" border=0>"
|
|
|
|
//"<input type=submit name=get "
|
|
//"value=\"get best rec\" border=0>"
|
|
|
|
//"<input type=submit name=tags "
|
|
//"value=\"merge all matching recs\" border=0>"
|
|
|
|
//"<input type=submit name=nuke "
|
|
//"value=\"delete recs\" border=0>"
|
|
|
|
// "</form>"
|
|
"</center>"
|
|
"</tr>\n");
|
|
|
|
// . show all tags we got values for
|
|
// . put a delete checkbox next to each one
|
|
// . show 5-10 dropdowns for adding new tags
|
|
|
|
// for some reason the "selected" option tags do not show up below
|
|
// on firefox unless i have this line.
|
|
|
|
// count how many "tagRecs" we are taking tags from
|
|
Tag *jtag = st->m_tagRec.getFirstTag();
|
|
long numTagRecs = 0;
|
|
for ( ; jtag ; jtag = st->m_tagRec.getNextTag(jtag) ) {
|
|
// skip dups
|
|
if ( jtag->m_type == TT_DUP ) continue;
|
|
// count # of TagRecs contributing to the tags
|
|
//if ( tag && tag->m_type == ST_SITE ) numTagRecs++;
|
|
if ( jtag && jtag->isType("site") ) numTagRecs++;
|
|
}
|
|
|
|
// if we are displaying a COMBINATION of TagRecs merged together in
|
|
// the inheritance loop (above) then you can not edit that! you can
|
|
// only edit individual tag recs
|
|
bool canEdit = (numTagRecs <= 1);
|
|
|
|
if ( ! canEdit )
|
|
sb.safePrintf("<tr><td colspan=20><center><font color=red>"
|
|
"<b>Can not edit because more than one "
|
|
"TagRecs were merged</b></font></center>"
|
|
"</td></tr>\n" );
|
|
|
|
// headers
|
|
sb.safePrintf("<tr bgcolor=%s>"
|
|
//"<td><b>delete?</b></td>"
|
|
"<td><b>del?</b></td>"
|
|
"<td><b>tag name</b></td>"
|
|
"<td><b>tag value</b></td>"
|
|
"<td><b>datasize (with NULL)</b></td>"
|
|
"<td><b>username</b></td>"
|
|
"<td><b>timestamp</b></td>"
|
|
"<td><b>user ip</b></td>"
|
|
"<td><b>deduphash32</b></td>"
|
|
"<td><b>sitehash32</b></td>"
|
|
"</tr>\n",
|
|
DARK_BLUE);
|
|
|
|
// set up the loop
|
|
Tag *itag = st->m_tagRec.getFirstTag();
|
|
//last = NULL;
|
|
long count = 0;
|
|
long empty = 0;
|
|
// loop over all tags in TagRec
|
|
for ( ; empty < 3 ; count++ ) {
|
|
// use this tag to print from
|
|
Tag *ctag = itag;
|
|
// advance
|
|
if ( itag ) itag = st->m_tagRec.getNextTag(itag);
|
|
// make it NULL, do not start over at the beginning
|
|
if ( empty > 0 ) ctag = NULL;
|
|
// skip dups
|
|
if ( ctag && ctag->m_type == TT_DUP ) continue;
|
|
// if ctag NULL and we are getting all tags, break
|
|
if ( ! canEdit && ! ctag ) break;
|
|
// assign for looping
|
|
//last = tag;
|
|
// if we are NULL, print out 3 empty tags
|
|
if ( ! ctag ) empty++;
|
|
// start the section
|
|
sb.safePrintf("<tr bgcolor=%s>",DARK_BLUE);
|
|
// the delete tag checkbox
|
|
//sb.safePrintf("<tr bgcolor=%s><td>",DARK_BLUE);
|
|
sb.safePrintf("<td>");
|
|
if ( ctag && canEdit ) // && tag->m_type != ST_SITE )
|
|
sb.safePrintf("<input name=deltag%li "
|
|
"type=checkbox>",count);
|
|
else
|
|
sb.safePrintf(" ");
|
|
sb.safePrintf("</td>");
|
|
// start the next cell
|
|
sb.safePrintf("<td>");
|
|
// . skip ST_SITE, do not show dropdown for that
|
|
// . no, because for looking up tagRecs i like to see
|
|
// the site tag value, to see what subdomain is matched
|
|
//if ( ctag && ctag->m_type == ST_SITE ) continue;
|
|
// print drop down
|
|
if ( ! ctag ) sb.safePrintf("<select name=tagtype%li>",count);
|
|
// how many tags do we have?
|
|
long n = (long)sizeof(s_tagDesc)/(long)sizeof(TagDesc);
|
|
// the options
|
|
for ( long i = 0 ; ! ctag && i < n ; i++ ) {
|
|
TagDesc *td = &s_tagDesc[i];
|
|
// get tag name
|
|
char *tagName = td->m_name;
|
|
// skip if a reserved tag
|
|
//if ( strncasecmp ( tagName , "reserved" ,8)==0 )
|
|
// continue;
|
|
// select the item in the dropdown
|
|
char *selected = "";
|
|
// was it selected?
|
|
if ( ctag && td->m_type == ctag->m_type )
|
|
selected = " selected";
|
|
// show it in the drop down list
|
|
sb.safePrintf("<option value=\"%s\"%s>%s",
|
|
tagName,selected,tagName);
|
|
}
|
|
// close up the drop down list
|
|
if ( ! ctag ) sb.safePrintf("</select>");
|
|
else {
|
|
char *tagName = getTagStrFromType ( ctag->m_type );
|
|
sb.safePrintf("<input type=hidden name=tagtype%li "
|
|
"value=\"%s\">%s",
|
|
count,tagName,tagName);
|
|
}
|
|
sb.safePrintf("</td><td>");
|
|
// the score field for the drop down list, whatever tag id
|
|
// was selected will have this score
|
|
if ( canEdit )
|
|
sb.safePrintf("<input type=text name=tagdata%li "
|
|
"size=70 value=\"",count);
|
|
// show the value
|
|
if ( ctag ) ctag->printDataToBuf ( &sb );
|
|
// close up the input tag
|
|
if ( canEdit ) sb.safePrintf("\">");
|
|
// close up table cell
|
|
sb.safePrintf("\n</td>");
|
|
|
|
// if no tag, just placeholders
|
|
if ( ! ctag ) {
|
|
sb.safePrintf("<td> </td>"
|
|
"<td> </td>"
|
|
"<td> </td>"
|
|
"<td> </td>"
|
|
"<td> </td>"
|
|
"<td> </td></tr>");
|
|
continue;
|
|
}
|
|
// data size
|
|
sb.safePrintf("<td>%li</td>",(long)ctag->getTagDataSize());
|
|
// username, timestamp only for non-empty tags
|
|
char *username = ctag->getUser();
|
|
long timestamp = ctag->m_timestamp;
|
|
long ip = 0;
|
|
char *ips = " ";
|
|
if ( ctag->m_ip ) { ip=ctag->m_ip; ips=iptoa(ctag->m_ip);}
|
|
// convert timestamp to string
|
|
char tmp[64];
|
|
sprintf(tmp," ");
|
|
time_t ts = timestamp;
|
|
struct tm *timeStruct = localtime ( &ts );
|
|
if ( timestamp )
|
|
strftime(tmp,64,"%b-%d-%Y-%H:%M:%S",timeStruct);
|
|
sb.safePrintf("<td><input type=hidden name=taguser%li "
|
|
"value=%s>%s</td>",
|
|
count,username,username);
|
|
sb.safePrintf("<td><input type=hidden name=tagtime%li "
|
|
"value=%li>%s</td>",
|
|
count,timestamp,tmp);
|
|
|
|
sb.safePrintf("<td><input type=hidden name=tagip%li "
|
|
"value=%li>%s",
|
|
count,ip,ips);
|
|
|
|
sb.safePrintf("<input type=hidden name=tagn1key%li "
|
|
"value=%llu>",
|
|
count,ctag->m_key.n1);
|
|
sb.safePrintf("<input type=hidden name=tagn0key%li "
|
|
"value=%llu>",
|
|
count,ctag->m_key.n0);
|
|
|
|
sb.safePrintf("</td>");
|
|
|
|
sb.safePrintf("<td>0x%lx</td>", (long)(ctag->m_key.n0>>32) );
|
|
|
|
sb.safePrintf("<td>0x%lx</td>",
|
|
// order 1 in since we always do that because
|
|
// we forgot to shift up one for the delbit
|
|
// above in Tag::set() when it sets m_key.n0
|
|
(long)(ctag->m_key.n0&0xffffffff) | 0x01);
|
|
|
|
//sb.safePrintf("<td>%s</td><td>%s</td><td>%s</td>",
|
|
// username,tmp,ips);
|
|
sb.safePrintf("</tr>");
|
|
}
|
|
|
|
// do not print add or del tags buttons if we got tags from more
|
|
// than one TagRec!
|
|
if ( canEdit )
|
|
sb.safePrintf ("<tr bgcolor=%s><td colspan=21><center>"
|
|
|
|
"<input type=submit name=add "
|
|
"value=\"add tags\" border=0>"
|
|
|
|
"</center></td>"
|
|
"</tr>\n",DARK_BLUE);
|
|
|
|
sb.safePrintf ( "</center></table>" );
|
|
|
|
sb.safePrintf ("</form>");
|
|
|
|
sb.safePrintf ("</html>");
|
|
|
|
// clear g_errno, if any, so our reply send goes through
|
|
g_errno = 0;
|
|
// calculate buffer length
|
|
// extract the socket
|
|
//TcpSocket *s = st->m_socket;
|
|
// . nuke the state
|
|
// . first free the buffer, if non-NULL
|
|
//if ( st->m_buf ) mfree ( st->m_buf , st->m_bufLen , "PageTagdb" );
|
|
mdelete ( st , sizeof(State12) , "PageTagdb" );
|
|
delete (st);
|
|
// print it out
|
|
//logf(LOG_DEBUG,"tagdb: %s",sb.getBufStart()+sb.length()-256);
|
|
// . send this page
|
|
// . encapsulates in html header and tail
|
|
// . make a Mime
|
|
return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length());
|
|
}
|
|
|
|
//void classifierDoneWrapper ( void *state ) {
|
|
// g_tagdbClassifier.m_running = false;
|
|
//}
|
|
|
|
// . we can have multiple tags of this type per tag for a single username
|
|
// . by default, there can be multiple tags of the same type in the Tag as
|
|
// long as the usernames are all different. see addTag()'s deduping below.
|
|
bool isTagTypeUnique ( long tt ) {
|
|
// a dup?
|
|
if ( tt == TT_DUP ) return false; // TT_DUP = 123456
|
|
// make sure table is valid
|
|
if ( ! s_initialized ) g_tagdb.setHashTable();
|
|
// look up in hash table
|
|
TagDesc *td = (TagDesc *)s_ht.getValue ( tt );
|
|
// if none, that is crazy
|
|
if ( ! td ) { char *xx=NULL;*xx=0; }
|
|
// return
|
|
if ( td->m_flags & TDF_ARRAY) return false;
|
|
return true;
|
|
}
|
|
|
|
bool isTagTypeIndexable ( long tt ) {
|
|
// a dup?
|
|
if ( tt == TT_DUP ) return false; // TT_DUP = 123456
|
|
// make sure table is valid
|
|
if ( ! s_initialized ) g_tagdb.setHashTable();
|
|
// look up in hash table
|
|
TagDesc *td = (TagDesc *)s_ht.getValue ( tt );
|
|
// if none, that is crazy
|
|
if ( ! td ) { char *xx=NULL;*xx=0; }
|
|
// return false if we should not index it
|
|
if ( td->m_flags & TDF_NOINDEX ) return false;
|
|
// otherwise, index it
|
|
return true;
|
|
}
|
|
|
|
// . when displaying a tag we need to know if it is a string or not
|
|
// . that and the dataSize determine how we display it
|
|
/*
|
|
bool isTagTypeString ( long tt ) {
|
|
// look up in hash table
|
|
TagDesc *td = (TagDesc *)s_ht.getValue ( tt );
|
|
// if none, that is crazy
|
|
if ( ! td ) { char *xx=NULL;*xx=0; }
|
|
// return
|
|
return (td->m_flags & TDF_STRING);
|
|
}
|
|
*/
|
|
|
|
// used to determine if one Tag should overwrite the other! if they
|
|
// have the same dedup hash... then yes...
|
|
long Tag::getDedupHash ( ) {
|
|
|
|
// if unique use that!
|
|
if ( isTagTypeUnique ( m_type ) ) return m_type;
|
|
|
|
// if we are NOT unique... then hash username and data. thus we only
|
|
// replace a key if its the same tagtype, username and data. that
|
|
// way it will just update the timestamp and/or ip.
|
|
|
|
// start hashing here
|
|
char *startHashing = (char *)&m_type;
|
|
// end here. include username (and tag data!)
|
|
char *endHashing = m_buf + m_bufSize;
|
|
|
|
// if we are an event tag then PageEvents.cpp added us in the form of
|
|
// user%llutag%sval%li ... so ignore value (FACEBOOKDB)
|
|
//if ( m_type == s_eventTag ) {
|
|
// endHashing--;
|
|
// for (;endHashing-1>m_buf&&is_digit(endHashing[-1]);
|
|
// endHashing--);
|
|
//}
|
|
|
|
// do not include bufsize in hash
|
|
long saved = m_bufSize;
|
|
m_bufSize = 0;
|
|
|
|
// hash this many bytes
|
|
long hashSize = endHashing - startHashing;
|
|
// set key
|
|
long dh = hash32 ( startHashing , hashSize );
|
|
|
|
// revert bufsize
|
|
m_bufSize = saved;
|
|
|
|
return dh;
|
|
}
|