#include "gb-include.h" #include #include "Titledb.h" #include "Tagdb.h" #include "Categories.h" #include "Unicode.h" #include "Threads.h" #include "Msg1.h" #include "HttpServer.h" #include "Pages.h" #include "SiteGetter.h" #include "HashTableX.h" #include "Users.h" #include "Process.h" static void gotMsg0ReplyWrapper ( void *state ); //static void gotReplyWrapper9a ( void *state , UdpSlot *slot ) ; //static void gotList ( void *state , RdbList *xxx , Msg5 *yyy ) ; //static void sendReply9a ( void *state ) ; static HashTable s_ht; static bool s_initialized = false; // to stdout long Tag::print ( ) { SafeBuf sb; printToBuf ( &sb ); // dump that return fprintf(stderr,"%s\n",sb.getBufStart()); } bool Tag::printToBuf ( SafeBuf *sb ) { sb->safePrintf("k.hsthash=%016llx k.duphash=%08lx k.sitehash=%08lx ", m_key.n1, (long)(m_key.n0>>32), (long)(m_key.n0&0xffffffff)); // print the tagname sb->safePrintf ( "TAG=%s,\"%s\",", getTagStrFromType(m_type), getUser() ); // data size //sb->safePrintf( "%li,", (long)getTagDataSize()); // print the date when this tag was added time_t ts = m_timestamp; struct tm *timeStruct = localtime ( &ts ); char tmp[100]; strftime(tmp,100,"%b-%d-%Y-%H:%M:%S,",timeStruct); sb->safePrintf("%s(%lu),",tmp,m_timestamp); // print the time as a long, seconds since epoch //sb->safePrintf("%lu,",m_timestamp); // print the ip added from sb->safePrintf("%s,",iptoa(m_ip)); // print the tag id //sb->safePrintf("%lu,\"",(long)m_tagId); // key.n1 is hash of the subdomain i think //sb->safePrintf("%lu,\"",m_key.n1); sb->safePrintf("\""); if ( ! printDataToBuf ( sb ) ) return false; // final quote sb->safePrintf("\""); return true; } // . "site" can also be a specific url, but it must be normalized // . i.e. of the form http://xyz.com/ void Tag::set ( char *site , char *tagname , long timestamp , char *user , long ip , char *data , long dataSize ) { // get type from name m_type = getTagTypeFromStr ( tagname , strlen(tagname) ); // sanity //isTagTypeIndexable ( m_type ); m_timestamp = timestamp; m_ip = ip; long userLen = 0; if ( user ) userLen = gbstrlen(user); // truncate to 127 byte long if ( userLen > 126 ) userLen = 126; // first byte is size of user, then user plus \0 then data //m_bufSize = 1 + userLen + 1 + dataSize; // "site" must skip http:// //long slen = gbstrlen(site); //if ( slen > 8 && strncasecmp(site,"http://",7)==0 ) // site += 7; //else if ( slen > 8 && strncasecmp(site,"https://",8)==0 ) // site += 8; // normalize Url norm; norm.set ( site ); // store user into special buffer //long ulen = 0; //if ( user ) { // ulen = gbstrlen(user); // if ( ulen > 7 ) ulen = 7; //} //memset ( m_user , 0 , 8 ); //memcpy ( m_user , user , ulen ); char *p = m_buf; // store size (includes \0) *p++ = userLen + 1; // then user name memcpy ( p , user , userLen ); p += userLen; // then \0 *p++ = '\0'; // store data now too memcpy ( p , data , dataSize ); p += dataSize; // NULL terminate if they did not! now all tag are strings and must // be NULL terminated. if ( data && p[-1] ) { // data && m_data[dataSize-1] ) { //m_data[dataSize] = '\0'; *p++ = '\0'; //dataSize++; //m_dataSize++; } // set it m_bufSize = p - m_buf; // top X bits should be hash of the domain only so all recs are on the // same host near each other //m_key.n1 = hash32 ( norm.getDomain() , norm.getDomainLen()); // // too many tags were being read when k.n1 was the domain hash for // sites like az.com that had hundreds of subdomains. so go based on // host instead. 
// . "site" can also be a specific url, but it must be normalized
// . i.e. of the form http://xyz.com/
void Tag::set ( char *site ,
		char *tagname ,
		long  timestamp ,
		char *user ,
		long  ip ,
		char *data ,
		long  dataSize ) {
	// get type from name
	m_type = getTagTypeFromStr ( tagname , strlen(tagname) );
	// sanity
	//isTagTypeIndexable ( m_type );
	m_timestamp = timestamp;
	m_ip        = ip;
	long userLen = 0;
	if ( user ) userLen = gbstrlen(user);
	// truncate to 127 bytes long
	if ( userLen > 126 ) userLen = 126;
	// first byte is size of user, then user plus \0 then data
	//m_bufSize = 1 + userLen + 1 + dataSize;
	// "site" must skip http://
	//long slen = gbstrlen(site);
	//if ( slen > 8 && strncasecmp(site,"http://",7)==0 )
	//	site += 7;
	//else if ( slen > 8 && strncasecmp(site,"https://",8)==0 )
	//	site += 8;
	// normalize
	Url norm;
	norm.set ( site );
	// store user into special buffer
	//long ulen = 0;
	//if ( user ) {
	//	ulen = gbstrlen(user);
	//	if ( ulen > 7 ) ulen = 7;
	//}
	//memset ( m_user , 0 , 8 );
	//memcpy ( m_user , user , ulen );
	char *p = m_buf;
	// store size (includes \0)
	*p++ = userLen + 1;
	// then user name
	memcpy ( p , user , userLen );
	p += userLen;
	// then \0
	*p++ = '\0';
	// store data now too
	memcpy ( p , data , dataSize );
	p += dataSize;
	// NULL terminate if they did not! now all tags are strings and must
	// be NULL terminated.
	if ( data && p[-1] ) { // data && m_data[dataSize-1] ) {
		//m_data[dataSize] = '\0';
		*p++ = '\0';
		//dataSize++;
		//m_dataSize++;
	}
	// set it
	m_bufSize = p - m_buf;
	// top X bits should be hash of the domain only so all recs are on the
	// same host near each other
	//m_key.n1 = hash32 ( norm.getDomain() , norm.getDomainLen());
	//
	// too many tags were being read when k.n1 was the domain hash for
	// sites like az.com that had hundreds of subdomains. so go based on
	// host instead.
	//
	// CRAP: using 32 bit hash we get collisions for crap like
	// thedietsolutionprogramscam.com and
	// 2witchdoctors.a-livejasmin.com
	// so let's move to 64bit keys
	//m_key.n1 = hash64 ( norm.getHost() , norm.getHostLen());
	// i had to make this the hash of the site, not host,
	// because www.last.fm/user/xxxxx/
	// was making the rdblist a few megabytes big!!
	m_key.n1 = hash64n ( site );
	// assume we are a unique tag, that many of this type can exist
	uint32_t upper32 = getDedupHash(); // m_type;
	/*
	// if we are NOT unique... then hash username and data. thus we only
	// replace a key if its the same tagtype, username and data. that
	// way it will just update the timestamp and/or ip.
	if ( ! isTagTypeUnique ( m_type ) ) {
		// start hashing here
		char *startHashing = (char *)&m_type;
		// end here. include username (and tag data!)
		char *endHashing = m_buf + m_bufSize;
		// hash this many bytes
		long hashSize = endHashing - startHashing;
		// . set key
		upper32 = hash32 ( startHashing , hashSize );
	}
	*/
	// put in upper 32
	m_key.n0 = upper32;
	// shift it up
	m_key.n0 <<= 32;
	// . then or in url hash
	// . for the site "www.paypal.com:1234" this included the port!
	//   but for the most part if the site is just a hostname then
	//   this is basically just a hostname, too, but the hash will
	//   include the http:// and the ending /
	// . www.paypal.com:1234 was added as a site. so it has the
	//   same m_key.n1 as www.paypal.com, but this part is different
	//   here. this is the full site hash really. so during the lookup
	//   i'd say filter out such tags if they don't match the site you
	//   are looking up.
	//m_key.n0 |= (uint32_t) hash32 ( norm.getUrl() , norm.getUrlLen() );
	// set positive bit so it's not a delete record
	m_key.n0 |= 0x01;
	// the size of this class as an Rdb record
	m_recDataSize = m_bufSize + sizeof(Tag) - sizeof(key128_t) - 4;
}
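// A minimal usage sketch of Tag::set() (illustrative only; the buffer sizing
// and the tag values below are our assumptions, mirroring the cast-into-a-
// buffer pattern used elsewhere in this file). set() packs
// [userLen+1][user]['\0'][data]['\0'] into m_buf, keys the record on
// hash64n(site) in k.n1 plus the dedup hash in the upper 32 bits of k.n0,
// and computes m_recDataSize for storage as an Rdb record:
//
//	char  buf [ MAX_TAGREC_SIZE ];
//	Tag  *tag = (Tag *)buf;
//	tag->set ( "http://example.com/" , // normalized site url
//		   "sitenuminlinks"      , // tagname, hashed into m_type
//		   getTimeGlobal()       , // timestamp
//		   "admin"               , // user
//		   0                     , // ip
//		   "25"                  , // data is always a string now
//		   3                     ); // dataSize includes the \0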
// . return # of ascii chars scanned in "p"
// . return 0 on error
// . parses output of printToBuf() above
// . k.n1=0x695b3 k.n0=0xa4118684fa4edf93 version=0
//   TAG=ruleset,"mwells",Jan-02-2009-18:26:04,,67.16.94.2,3735437892,36
//   TAG=blog,"mwells",Jan-02-2009-18:26:04,67.16.94.2,2207516434,1
//   TAG=site,"tagdb",Jan-02-2009-18:26:04,0.0.0.0,833534375,mini-j-gaidin.livejournal.com/
long Tag::setFromBuf ( char *p , char *pend ) {
	// save our place
	char *start = p;
	// tags always start with " TAG="
	if ( strncmp(p," TAG=",5) ) {
		log("tagdb: error processing tag in setFromBuf().");
		return 0;
	}
	// skip that
	p += 5;
	// get the type
	char *type = p;
	// get type length
	while ( p < pend && *p != ',' ) p++;
	// error?
	if ( p == pend ) return 0;
	// that is the length
	long typeLen = p - type;
	// convert to number
	m_type = getTagTypeFromStr ( type , typeLen );
	// panic?
	if ( m_type == -1 ) { char *xx=NULL;*xx=0;}
	// now the user, skip comma and quote
	p+=2;
	// data buffer
	char *dst = m_buf;
	// point to it
	char *user = p;
	// get end of it
	while ( p < pend && *p != '\"' ) p++;
	// error?
	if ( p == pend ) return 0;
	// set length
	long userLen = p - user;
	// sanity. username total buf space including \0 <= 8
	if ( userLen > 126 ) userLen = 126;
	// copy it over into us
	//memcpy ( m_user , user , userLen );
	// NULL terminate
	//m_user[userLen] = '\0';
	// first byte is username size
	*dst++ = userLen+1;
	// then the username
	memcpy ( dst , user , userLen );
	dst += userLen;
	// and finally the null termination
	*dst++ = '\0';
	// skip quote and comma
	p+=2;
	// now the datasize
	//long m_dataSize = atoi(p);
	// skip till comma
	//while ( p < pend && *p != ',' ) p++;
	// error?
	//if ( p == pend ) return 0;
	// skip comma
	//p++;
	// that is the time stamp in canonical form
	// skip till comma
	while ( p < pend && *p != ',' ) p++;
	// error?
	if ( p == pend ) return 0;
	// skip comma
	p++;
	// save start
	char *ts = p;
	// skip until comma again
	while ( p < pend && *p != ',' ) p++;
	// error?
	if ( p == pend ) return 0;
	// this is the timestamp in seconds since epoch
	m_timestamp = atoi(ts);
	// skip comma
	p++;
	// ip address as text
	char *ips = p;
	// skip until comma again
	while ( p < pend && *p != ',' ) p++;
	// error?
	if ( p == pend ) return 0;
	// convert it to binary
	m_ip = atoip ( ips , p - ips );
	// skip comma
	p++;
	// get the tag identifier
	//m_tagId = atol(p);
	//sscanf ( p , "%lu,",&m_tagId);
	//long long big = atoll(p);
	//m_tagId = (long)big;
	// skip until comma again
	//while ( p < pend && *p != ',' ) p++;
	// error?
	//if ( p == pend ) return 0;
	// skip comma
	//p++;
	//
	// BEGIN HACK
	//
	// as a hack for now, override this, because before we were not 100%
	// strings as tags, we had single byte values being printed out as
	// strings of 3 bytes
	//char *e = p;
	//while ( e < pend && ! is_wspace_a(*e) ) e++;
	//if ( e > pend ) return 0;
	//m_dataSize = e - p;
	// add in a \0
	//m_dataSize++;
	//
	// END HACK
	//
	// . now is the data
	// . return # of chars scanned in "p"
	p += setDataFromBuf ( p , pend );
	// . sanity check
	// . all tags must be NULL terminated now
	if ( m_buf[m_bufSize-1] != '\0' ) {char *xx=NULL; *xx=0; }
	// we reset this since we now require that all tags are NULL terminated
	// strings
	//m_tagId = hash32 ( (char *)this,(long)sizeof(Tag)+m_dataSize , 0 );
	// 0 is not valid
	//if ( m_tagId == 0 ) m_tagId = 1;
	// return how many bytes we read
	return p - start;
}

// . return # of chars scanned in "p"
// . return 0 on error
long Tag::setDataFromBuf ( char *p , char *pend ) {
	// strings are special
	//if ( isTagTypeString ( m_type ) ) {
	// skip over username in the buffer to point to where to put tag data
	char *dst = m_buf + *m_buf + 1;
	// stop at space of
	memcpy(dst,p,pend-p);
	// advance
	dst += (pend-p);
	// update
	m_bufSize = dst - m_buf;
	// should be end delimiter
	char c = m_buf[m_bufSize-1];
	// sanity check
	if ( c && ! isspace(c) ) { char *xx=NULL;*xx=0; }
	// strings are always NULL terminated, the datasize should
	// include the NULL termination
	m_buf[m_bufSize-1]='\0';
	// we basically insert the \0, and *p should point to the space
	// right after the string...! so return m_dataSize - 1
	return m_bufSize - 1;
	/*
	}
	// save it to count
	char *start = p;
	// print as decimal if just 1 byte
	if ( m_dataSize == 1 ) {
		long v = atoi(p);
		if ( v > 256 ) { char *xx=NULL;*xx=0; }
		m_data[0] = v;
		// skip till whitespace or end
		while ( p < pend && isdigit(*p) ) p++;
		return p - start;
	}
	// skip 0x
	if ( *p!='0' || *(p+1)!='x' ) { char *xx=NULL;*xx=0; }
	p += 2;
	// convert hexadecimal string into binary
	long bytesStored = hexToBinary ( p , pend , m_data , false );
	// sanity check
	if ( bytesStored != m_dataSize ) { char*xx=NULL;*xx=0;}
	// advance p, each byte is two characters
	p += bytesStored * 2;
	// return # of bytes in "p" we scanned
	return p - start;
	*/
}

long hexToBinary ( char *src , char *srcEnd , char *dst , bool decrement ) {
	// keep tabs on how many bytes we store into "dst"
	char *start = dst;
	// read in hex values
	while ( src < srcEnd ) {
		// get FIRST hex digit
		unsigned char v;
		v = *(unsigned char *)src;
		if      ( v >= 'a' && v <= 'f' ) v = v - 'a' + 10;
		else if ( v >= 'A' && v <= 'F' ) v = v - 'A' + 10;
		else if ( v >= '0' && v <= '9' ) v = v - '0';
		else break;
		// sanity check
		if ( v >= 16 ) { char *xx=NULL;*xx=0;}
		// next character
		src++;
		// store it in the destination
		*dst = v;
		// sanity check, need one more char FOR SURE!
		if ( src >= srcEnd ) { char*xx=NULL;*xx=0;}
		// get the SECOND hex digit of this byte
		v = *(unsigned char *)src;
		if      ( v >= 'a' && v <= 'f' ) v = v - 'a' + 10;
		else if ( v >= 'A' && v <= 'F' ) v = v - 'A' + 10;
		else if ( v >= '0' && v <= '9' ) v = v - '0';
		else break;
		// sanity check
		if ( v >= 16 ) { char *xx=NULL;*xx=0;}
		// next character
		src++;
		// shift last guy up 4 bits
		*dst = *dst << 4;
		// or in the new guy
		*dst |= v;
		// point to next byte now
		if ( decrement ) dst--;
		else             dst++;
	}
	return dst - start;
}

bool Tag::printDataToBuf ( SafeBuf *sb ) {
	// strings are special
	//if ( isTagTypeString ( m_type ) ) {
	char *data = getTagData();
	long dataSize = getTagDataSize();
	// because of a bug of not appending the \0 and incrementing
	// Tag::m_dataSize when we should have, we must deal with this!
	//sb->safePrintf("%s",m_data);
	for ( long i = 0 ; data[i] && i < dataSize ; i++ )
		sb->safePrintf ( "%c" , data[i] );
	return true;
	/*
	}
	// print as decimal if just 1 byte
	if ( m_dataSize == 1 ) {
		sb->safePrintf("%li",(long)m_data[0]);
		return true;
	}
	// the "score"
	sb->safePrintf("0x");
	//for ( long i = 0 ; i < m_dataSize ; i++ )
	//	sb->safePrintf ( "%02hhx" , m_data[m_dataSize-i-1] );
	// i guess just print it first byte first now
	for ( long i = 0 ; i < m_dataSize ; i++ )
		sb->safePrintf ( "%02hhx" , m_data[i] );
	*/
	return true;
}
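// Illustrative example of hexToBinary() above (the inputs and expected
// outputs here are ours, shown only to document the behavior): it consumes
// pairs of hex digits until it hits a non-hex character or srcEnd, and
// returns the number of bytes written to "dst".
//
//	char src[] = "1aFF";
//	char dst[4];
//	long n = hexToBinary ( src , src + 4 , dst , false );
//	// n == 2 , dst[0] == 0x1a , dst[1] == 0xff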
bool Tag::printToBufAsAddRequest ( SafeBuf *sb ) {
	// print the tagname
	char *str = getTagStrFromType ( m_type );
	// print the user that added this tag
	sb->safePrintf ( "%s.user=%s" , str , getUser() );
	// print the date when this tag was added
	sb->safePrintf ("&%s.time=%li", str, m_timestamp );
	// print the ip added from
	sb->safePrintf("&%s.ip=%s",str,iptoa(m_ip));
	// print the tag id
	//sb->safePrintf("&%s.id=%lu",str,(long)m_tagId);
	// the "score"
	sb->safePrintf("&%s.data=",str);
	// print the m_data
	if ( ! printDataToBuf ( sb ) ) return false;
	return true;
}

bool Tag::printToBufAsXml ( SafeBuf *sb ) {
	// print the tagname
	char *str = getTagStrFromType ( m_type );
	// print the user that added this tag
	sb->safePrintf ("\t\t\n\t\t\t%s\n\t\t\t%s", str,getUser());
	// print the date when this tag was added
	sb->safePrintf("\n\t\t\t%li\n", m_timestamp);
	// print the ip added from
	sb->safePrintf("\t\t\t%s\n",iptoa(m_ip));
	// print the tag id
	//sb->safePrintf("\t\t\t%lu\n",(long)m_tagId);
	// the "score"
	sb->safePrintf("\t\t\t");
	// print the m_data
	if ( ! printDataToBuf ( sb ) ) return false;
	sb->safePrintf("\n\t\t");
	return true;
}

//if ( ! sb->safePrintf("\t\t"
//	"safePrintf ("\t\t\n"
//	// who added the tag:
//	"\t\t\t\n"
//	// when tag was added:
//	"\t\t\t%lu\n"
//	// ip added from
//	"\t\t\t\n"
//	// name of the tag:
//	"\t\t\t\n"
//	// the tag data
//	"\t\t\tsafePrintf("]]>\n"
//	"\t\t\n");
//	return true;
//}

bool Tag::printToBufAsHtml ( SafeBuf *sb , char *prefix ) {
	// print the tagname
	char *str = getTagStrFromType ( m_type );
	// print the user that added this tag
	sb->safePrintf ("%s%s", prefix, str);
	// the "score"
	sb->safePrintf(" value=");
	// print the m_data
	if ( ! printDataToBuf ( sb ) ) return false;
	// print the date when this tag was added
	sb->safePrintf(" user=%s time=",getUser());
	time_t ts = m_timestamp;
	struct tm *timeStruct = localtime ( &ts );
	char tmp[100];
	strftime(tmp,100,"%b-%d-%Y-%H:%M:%S",timeStruct);
	sb->safePrintf("%s(%lu)",tmp,m_timestamp);
	// print the ip added from
	sb->safePrintf(" ip=%s",iptoa(m_ip));
	//sb->safePrintf(" id=%lu",(long)m_tagId);
	sb->safePrintf("\n");
	return true;
}

bool Tag::printToBufAsTagVector ( SafeBuf *sb ) {
	// print the tagname
	char *str = getTagStrFromType ( m_type );
	// print strings data types special
	//if ( isTagTypeString ( m_type ) ) {
	//sb->safePrintf("%s:%s ",str,m_data);
	sb->safePrintf("%s:",str);
	// print the m_data
	if ( ! printDataToBuf ( sb ) ) return false;
	sb->safePrintf(" ");
	return true;
	/*
	}
	// print the user that added this tag
	sb->safePrintf ("%s:", str );
	if ( ! printDataToBuf ( sb ) ) return false;
	sb->safePrintf(" ");
	return true;
	*/
}

bool Tag::isType ( char *t ) {
	long h = hash32n ( t );
	return (m_type == h);
}

TagRec::TagRec ( ) {
	m_numListPtrs = 0;
}

void TagRec::constructor ( ) {
	m_numListPtrs = 0;
	// run a constructor on the lists
	for ( long i = 0 ; i < MAX_TAGDB_REQUESTS ; i++ ) {
		m_lists[i].constructor();//m_alloc = NULL;
		//m_lists[i].m_allocSize = 0;
	}
}

TagRec::~TagRec ( ) {
	reset();
}

void TagRec::reset ( ) {
	m_numListPtrs = 0;
	for ( long i = 0 ; i < MAX_TAGDB_REQUESTS ; i++ )
		m_lists[i].freeList();
}

Tag *TagRec::getTag ( char *tagTypeStr ) {
	long tagType = getTagTypeFromStr ( tagTypeStr );
	return getTag2 ( tagType );
}

Tag *TagRec::getTag2 ( long tagType ) {
	Tag *tag = getFirstTag();
	// loop over all tags in the buf
	for ( ; tag ; tag = getNextTag ( tag ) ) {
		// skip if not a match
		if ( tag->m_type != tagType ) continue;
		// skip dups
		if ( tag->m_type == TT_DUP ) continue;
		// got it
		return tag;
	}
	// if not found return NULL
	return NULL;
}
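// Illustrative read-side sketch for the accessors that follow (the tag names
// and default values here are our own picks from s_tagDesc, and "gr" stands
// for any TagRec already populated by a tagdb lookup): each accessor scans
// the flattened tag lists and falls back to the supplied default when the
// tag type is absent.
//
//	// given a TagRec *gr filled in by a prior lookup:
//	long  sni  = gr->getLong   ( "sitenuminlinks" , -1 , NULL,NULL,NULL );
//	char *site = gr->getString ( "site" , NULL , NULL , NULL,NULL,NULL );
//	Tag  *tag  = gr->getTag    ( "ruleset" );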
// . functions to act on a site "tag buf", like that in Msg16::m_tagRec
// . first 2 bytes is size, 2nd two bytes is # of tags, then the tags
long TagRec::getLong ( char *tagTypeStr, long defalt ,
		       Tag **bookmark ,
		       long *timestamp ,
		       char **user ) {
	long tagType = getTagTypeFromStr ( tagTypeStr );
	return getLong ( tagType , defalt , bookmark , timestamp , user );
}

long TagRec::getLong ( long tagType , long defalt ,
		       Tag **bookmark ,
		       long *timestamp ,
		       char **user ) {
	// start here
	Tag *tag ;
	if ( ! bookmark ) tag = getFirstTag();
	else              tag = getNextTag ( *bookmark );
	// loop over all tags in the buf
	for ( ; tag ; tag = getNextTag ( tag ) ) {
		// skip if not a match
		if ( tag->m_type != tagType ) continue;
		// skip dups
		if ( tag->m_type == TT_DUP ) continue;
		// get the value as a long
		long score = 0;
		// the size
		char *data = tag->getTagData();
		long dataSize = tag->getTagDataSize();
		//long size = m_dataSize;
		// if ends in NULL trunc it
		if ( data[dataSize-1] == '\0' ) dataSize--;
		// trunc it
		//if ( size > 4 ) size = 4;
		// convert string to value, MUST be signed!!! the data
		// should include a \0
		score = atol2(data,dataSize);
		// if only a single byte.need to preserve negatives (twos comp)
		//if ( size == 1 ) score = (long)tag->m_data[0];
		//else if ( size == 2 ) score = (long)*((short *)tag->m_data);
		//else memcpy ( &score , tag->m_data , size );
		// bookmark, et al
		if ( bookmark  ) *bookmark  = tag;
		if ( timestamp ) *timestamp = tag->m_timestamp;
		if ( user      ) *user      = tag->getUser();
		return score;
	}
	// not found
	return defalt;
}

long long TagRec::getLongLong ( char *tagTypeStr, long long defalt ,
				Tag **bookmark ,
				long *timestamp ,
				char **user ) {
	long tagType = getTagTypeFromStr ( tagTypeStr );
	// start here
	Tag *tag ;
	if ( ! bookmark ) tag = getFirstTag();
	else              tag = getNextTag ( *bookmark );
	// loop over all tags in the buf
	for ( ; tag ; tag = getNextTag ( tag ) ) {
		// skip if not a match
		if ( tag->m_type != tagType ) continue;
		// skip dups
		if ( tag->m_type == TT_DUP ) continue;
		// get the value as a long long
		long long score = 0;
		// the size
		char *data = tag->getTagData();
		long dataSize = tag->getTagDataSize();
		// if ends in NULL trunc it
		if ( data[dataSize-1] == '\0' ) dataSize--;
		// trunc it
		//if ( size > 8 ) size = 8;
		// now everything is a string
		score = atoll2(data,dataSize);
		// store it
		//memcpy ( &score , tag->m_data , size );
		// bookmark, et al
		if ( bookmark  ) *bookmark  = tag;
		if ( timestamp ) *timestamp = tag->m_timestamp;
		if ( user      ) *user      = tag->getUser();
		return score;
	}
	// not found
	return defalt;
}

char *TagRec::getString ( char *tagTypeStr, char *defalt , long *size ,
			  Tag **bookmark ,
			  long *timestamp ,
			  char **user ) {
	long tagType = getTagTypeFromStr ( tagTypeStr );
	// start here
	Tag *tag ;
	if ( ! bookmark ) tag = getFirstTag();
	else              tag = getNextTag ( *bookmark );
	// loop over all tags in the buf
	for ( ; tag ; tag = getNextTag ( tag ) ) {
		// skip if not a match
		if ( tag->m_type != tagType ) continue;
		// skip dups
		if ( tag->m_type == TT_DUP ) continue;
		// want size? includes \0 probably
		if ( size ) *size = tag->getTagDataSize();//m_dataSize;
		// bookmark, et al
		if ( bookmark  ) *bookmark  = tag;
		if ( timestamp ) *timestamp = tag->m_timestamp;
		if ( user      ) *user      = tag->getUser();
		// return it
		return tag->getTagData();//m_data;
	}
	// not found
	return defalt;
}

/*
// add a special tag with null m_data. this tells Msg9a to delete
// all tags of this tag type before adding any other tags of this type
// that we might have. it is basically a "negative" tag.
bool TagRec::addDelTag ( char *tagTypeStr ) {
	return addTag ( tagTypeStr ,
			0 ,    // timestamp
			NULL , // user
			0 ,    // ip
			NULL , // data
			0 );   // dataSize
}

// returns false and sets g_errno on error
bool TagRec::addTag ( char *tagTypeStr,
		      long timestamp ,
		      char *user ,
		      long ip ,
		      char *data ,
		      long dataSize ) {
	// get the tagType
	long tagType = getTagTypeFromStr ( tagTypeStr );
	// breach check
	if ( dataSize + sizeof(Tag) > MAX_TAGREC_SIZE ) {
		g_errno = EBUFTOOSMALL;
		return log("tagdb: no room to add tag");
	}
	// the Tag::m_dataSize is only 2 bytes...
NOT ANYMORE, MDW if ( dataSize < 0 ) { // >= 65536 ) { g_errno = EBADENGINEER; return log("tagdb: tag dataSize of %li is >= 65536. " "Bad value.", dataSize); } // sanity check -- no binary chars allowed, must all be strings! // BUT they can have an empty string (i.e. just \0) if ( dataSize == 1 && data[0] < 9 && data[0] >= 0 && data[0] ) { char *xx=NULL;*xx=0; } // make a tag char buf[MAX_TAGREC_SIZE]; Tag *tag = (Tag *)buf; // fill it in tag->m_type = tagType; tag->m_timestamp = timestamp; tag->m_ip = ip; tag->m_dataSize = dataSize; // dummy value for now tag->m_tagId = 0; // careful! if ( sizeof(Tag) + dataSize + 10 > MAX_TAGREC_SIZE ) { g_errno = EBUFTOOSMALL; return log("tagdb: no room to add tag data"); } // store user into special buffer long ulen = 0; if ( user ) { ulen = gbstrlen(user); if ( ulen > 7 ) ulen = 7; } memset ( tag->m_user , 0 , 8 ); memcpy ( tag->m_user , user , ulen ); // store data now too memcpy ( tag->m_data , data , dataSize ); // NULL terminate if they did not! now all tag are strings and must // be NULL terminated. if ( data && tag->m_data[dataSize-1] ) { tag->m_data[dataSize] = '\0'; dataSize++; tag->m_dataSize++; } // the id is the hash for now (MDW) tag->m_tagId = hash32 ( (char *)tag,(long)sizeof(tag)+dataSize , 0 ); // 0 is not valid if ( tag->m_tagId == 0 ) tag->m_tagId = 1; // now add that tag return addTag ( tag ); } // returns false and sets g_errno on error bool TagRec::addTag ( Tag *TAG ) { // . do not allow empty user // . but "del tags" i.e. "negative tags" can have no user if ( TAG->m_dataSize>0 && (!TAG->m_user || TAG->m_user[0] == '\0') ) { char *xx=NULL;*xx=0;} // sanity check if ( TAG->m_tagId == 0 ) { char *xx=NULL;*xx=0;} // come back up here if we did a remove operation loop: // start at the first tag Tag *tag = getFirstTag(); // loop over all tags in the buf, see if we got a dup for ( ; tag ; tag = getNextTag ( tag ) ) { // skip if not matching id if ( tag->m_type != TAG->m_type ) continue; // skip if does not match user if ( memcmp(tag->m_user,TAG->m_user,7) ) continue; // data now has to match too, so we will allow tags of the // same type from the same user to be added if they have // different data now. i would only do this for strings, // but for longs and chars i would skip this check... // so only replace "unique" tags of the same type. // mostly strings and embedded tag recs will be non-unquie if ( ! isTagTypeUnique ( tag->m_type ) ) { if ( tag->m_dataSize != TAG->m_dataSize ) continue; if ( memcmp(tag->m_data,TAG->m_data,tag->m_dataSize)) continue; } // Msg8a allows multiple ST_SITE tags in order to indicate // what sites the other tags came from (i.e. used by the // inheritance loop below) // MDW: This is now covered by isTagTypeUnique() above. //if ( tag->m_type == ST_SITE ) continue; // it does match, so replace it! //removeTags ( tag->m_type , tag->m_user ); removeTag ( tag ); // start from the top goto loop; } // . ok, we "deduped" the tag // . point to the end of the buf char *p = getRecEnd(); // get the max end char *pend = getMaxEnd(); // how much do we need? long need = TAG->getSize(); // breach? if ( p + need > pend ) { char *site = getString("site","unknown"); g_errno = EBUFTOOSMALL; log("tagdb: no room to add tag to buf. 
tagtype=%s " "tagsize=%li site=%s", getTagStrFromType ( TAG->m_type ) , need , site ); //char *xx=NULL;*xx=0; return false; } // store it memcpy ( p , TAG , need ); // update our counters m_numTags++; m_dataSize += need; // SPECIAL: if it was ST_SITE, set our m_key, we are an Rdb record //if ( TAG->m_type != ST_SITE ) return true; if ( ! TAG->isType ("site") ) return true; // set the key Url u; // convenience char *site = TAG->m_data; long size = TAG->m_dataSize; // sanity check if ( site[size-1] != '\0' ) { char *xx=NULL;*xx=0; } // do not start with http:// ! wastes space!! if (size>=8 && strncmp(site,"http://",7)==0 ) { log("tagdb: don't sotre http:// in tags!"); char *xx=NULL;*xx=0; } // do not include the NULL u.set ( site , size - 1 ); // set our key, the endKey is our "startKey" m_key = g_tagdb.makeKey ( &u , false ); // isDelete? // success, return true return true; } bool TagRec::removeTags ( char *tagTypeStr , char *user , long tagId ) { long tagType = getTagTypeFromStr ( tagTypeStr ); return removeTags ( tagType , user , tagId ); } bool TagRec::removeTags ( long tagType , char *user , long tagId ) { loop: // start at the first tag Tag *tag = getFirstTag(); // loop over all tags in the rec, see if we got a dup for ( ; tag ; tag = getNextTag ( tag ) ) { // id if matches, that is good enough if ( tagId && tag->m_tagId != tagId ) continue; // skip if not matching id if ( tagId == 0 && tag->m_type != tagType ) continue; // skip if does not match user if ( tagId == 0 && user && memcmp(tag->m_user,user,7))continue; // remove that tag removeTag ( tag ); // re do loop goto loop; } // success return true; } bool TagRec::removeTag ( Tag *rmTag ) { // save this long oldn = m_numTags; // start at the first tag Tag *tag = getFirstTag(); // loop over all tags in the rec, see if we got a dup for ( ; tag ; tag = getNextTag ( tag ) ) { // must be it if ( tag != rmTag ) continue; // copy to here char *dst = (char *)tag; // size of tag we are removing long size = tag->getSize(); // from here char *src = dst + size; // end of tag buffer char *pend = getRecEnd(); // byte to move long move = pend - src; // it does match, so replace it! memcpy ( dst , src , move ); // decrement counts m_numTags--; m_dataSize -= size; } // sanity check if ( m_numTags != oldn - 1 ) { char *xx=NULL;*xx=0; } // success, return true return true; } // add all the tags from "tagRec" to our list of tags bool TagRec::addTags ( TagRec *tagRec ) { // start at the first tag Tag *tag = tagRec->getFirstTag(); // . remove any tag of any of the tag types we got in "tagRec" ? // . deal with "negative" tags // . used by TagRec::addDelTag() above for ( ; tag ; tag = tagRec->getNextTag ( tag ) ) { // if tag has m_data, skip. if ( tag->m_data && tag->m_dataSize > 0 ) continue; // otherwise, it is a signal to nuke all tags of this type removeTags ( tag->m_type , NULL ); } // start at the first tag again tag = tagRec->getFirstTag(); // loop over all tags in the buf, see if we got a dup for ( ; tag ; tag = tagRec->getNextTag ( tag ) ) { // skip if it was a delete tag if ( tag->m_dataSize <= 0 ) continue; // do not transfer over ST_SITE tags if we already got one //if ( tag->m_type == ST_SITE && getTag ( ST_SITE ) ) continue; if ( tag->isType("site") && getTag("site") ) continue; // add it, return false on error, g_errno should be set if ( ! 
addTag ( tag ) ) return false; } return true; } // add all the tags from "tagRec" to our list of tags bool TagRec::removeTags ( TagRec *tagRec ) { // start at the first tag Tag *tag = tagRec->getFirstTag(); // loop over all tags in the buf, see if we got a dup for ( ; tag ; tag = tagRec->getNextTag ( tag ) ) { // do not remove ST_SITE tags //if ( tag->m_type == ST_SITE ) continue; if ( tag->isType("site") ) continue; // add it, return false on error, g_errno should be set if ( ! removeTags ( tag->m_type , tag->m_user ) ) return false; } return true; } Tag *TagRec::getNextTag ( Tag *tag ) { if ( m_numTags == 0 ) return NULL; if ( ! tag ) return (Tag *)m_buf; char *tagEnd = getRecEnd(); long size = tag->getSize(); char *ret = ((char *)tag) + size; // overboard? if ( ret >= tagEnd ) return NULL; return (Tag *)ret; } */ // return the number of tags having the particular TagType long TagRec::getNumTagTypes ( char *tagTypeStr ) { long tagType = getTagTypeFromStr ( tagTypeStr ); long numTagType = 0; // start at the first tag Tag *tag = getFirstTag(); // loop over all tags in the buf, see if we got a dup for ( ; tag ; tag = getNextTag ( tag ) ) { // skip dups if ( tag->m_type == TT_DUP ) continue; // if there is tagType match then increment the count if ( tag->m_type == tagType ) numTagType++; } return numTagType; } long TagRec::getNumTags ( ) { long numTags = 0; // start at the first tag Tag *tag = getFirstTag(); // loop over all tags in the buf, see if we got a dup for ( ; tag ; tag = getNextTag ( tag ) ) // skip dups if ( tag->m_type != TT_DUP ) numTags++; return numTags; } // . &tagtype%li= // . &tagdata%li= // . &deltag%li=1 (to delete it) // . set &user=mwells, etc. in cookie of HttpReqest, "r" for user // . "this" TagRec's user, ip and timestamp will be carried over to "newtr" // . returns false and sets g_errno on error bool TagRec::setFromHttpRequest ( HttpRequest *r, TcpSocket *s ) { // clear it //reset(); // get the username from the cookie //char *user = r->getStringFromCookie ( "username" , NULL ); //char *user = g_users.getUsername ( r ); // try from form //if ( ! user ) user = r->getString ("username",NULL); // if no user, don't bother! //if ( ! user ) { // g_errno = EBADENGINEER; // return log("tagdb: no username supplied for modifying tagdb."); //} // get the user ip address long ip = 0; if ( s ) ip = s->m_ip; // get the time stamp long now = getTimeGlobal(); // . loop over all urls/sites in text area // . no! just use single url for now // put all urls in this buffer SafeBuf fou; // try from textarea if the ST_SITE was not in the tag section long uslen; char *us = r->getString("u",&uslen); if ( uslen <= 0 ) us = NULL; if ( us ) fou.safeMemcpy ( us , uslen ); // read in file, file of urls long ufuLen; char *ufu = r->getString("ufu",&ufuLen); if ( ufuLen <= 0 ) ufu = NULL; if ( us ) ufu = NULL; // exclusive if ( ufu ) fou.fillFromFile ( ufu ); // if st->m_urls has multiple urls, this "u" is not given in the // http request! but a filename is... and Msg9::addTags() should add // the ST_SITE field anyway... if ( ! ufu && ! 
us ) return true; // make it null terminated since we no longer do this automatically fou.pushChar('\0'); // normalize it //Url u; u.set ( us , uslen ); // point to it //char *site = u.getUrl(); // skip http + :// //site += u.getSchemeLen() + 3; // include the \0 //long psize = gbstrlen(p) + 1; // loop over all tags in the TagRec to mod them for ( long i = 0 ; ; i++ ) { char buf[32]; sprintf ( buf , "tagtype%li",i ); char *tagTypeStr = r->getString(buf,NULL,NULL); // if not there we are done if ( ! tagTypeStr ) break; // should we delete it? sprintf ( buf , "deltag%li",i); char *deltag = r->getString(buf,NULL,NULL); //if ( deltag && deltag[0] ) continue; sprintf ( buf , "taguser%li",i); char *tagUser = r->getString( buf,NULL,"admin");//user); //if ( tagUser && tagUser[0]==0 ) tagUser = user; sprintf ( buf , "tagtime%li",i); long tagTime = r->getLong(buf,now); sprintf ( buf , "tagip%li",i); long tagIp = r->getLong(buf,ip); // get the value of this tag sprintf ( buf , "tagdata%li" , i ); char *dataPtr = r->getString ( buf , NULL ); // get the tag original key key128_t key; sprintf ( buf , "tagn1key%li" , i ); key.n1 = r->getLongLong ( buf, 0 ); sprintf ( buf , "tagn0key%li" , i ); key.n0 = r->getLongLong ( buf, 0LL ); // if empty skip it if ( ! dataPtr ) continue; if ( ! dataPtr[0] ) continue; // is it numeric? i think only ST_COMMENT is not //char isNum = true; // get the numeric //long tagType = getTagTypeFromStr ( tagTypeStr ); // set "isNum" to false if not numeric //if ( tagType == ST_COMMENT ) isNum = false; //if ( tagType == ST_SITE ) isNum = false; //if ( tagType == ST_META ) isNum = false; //if ( isTagTypeString ( tagType ) ) isNum = false; //long dataSize = 0; // . if it is a string, like ST_COMMENT // . include the \0 //if ( ! isNum ) dataSize = gbstrlen(dataPtr) + 1; // everything is now a string long dataSize = gbstrlen(dataPtr) + 1; // if numeric store in tag buf /* long long data; if ( isNum ) { data = atoll ( dataPtr );//r->getLongLong(val,-1); dataSize = 1; if ( data >= 0xffLL ) dataSize = 2; if ( data >= 0xffffLL ) dataSize = 3; if ( data >= 0xffffffLL ) dataSize = 4; if ( data >= 0xffffffffLL ) dataSize = 5; if ( data >= 0xffffffffffLL ) dataSize = 6; if ( data >= 0xffffffffffffLL ) dataSize = 7; dataPtr = (char *)&data; } */ // add to tag buf //addTag ( tagTypeStr , // tagTime , // tagUser , // tagIp , // dataPtr , // dataSize ); // loop over all urls in the url file if provided char *up = fou.getBufStart(); for ( ; ; ) { // set url char *urlPtr = up; // stop if EOF or processed the one url if ( ! urlPtr ) break; // advance it or NULL it out up = fou.getNextLine ( up ); // null term the url ptr if ( up ) up[-1] = '\0'; // save buffer spot in case we have to rewind long saved = m_sbuf.length(); // . add to tag rdb recs in safebuf // . this pushes the rdbid as first byte // . mdwmdwmdw Tag *tag = m_sbuf.addTag ( urlPtr, // us, // site , tagTypeStr , tagTime , tagUser , tagIp , dataPtr, dataSize , RDB_TAGDB, // do not push rdbid into safebuf false ) ; // error? if ( ! tag ) return false; bool deleteOldKey = false; // if tag has different key, delete the old one if ( key.n1 && tag->m_key != key ) deleteOldKey = true; // if del was marked, delete old one and do not add new one if ( deltag && deltag[0] ) { // rewind over the tag we were about to add m_sbuf.setLength ( saved ); // and add as a delete deleteOldKey = true; } if ( deleteOldKey ) { // make it negative key128_t delKey = key; delKey.n0 &= 0xfffffffffffffffeLL; if (! 
m_sbuf.safeMemcpy((char *)&delKey, sizeof(key128_t))) return false; } } } // all done //if ( getTag ( ST_SITE ) ) return ; //if ( getTag("site") ) return; // add the special ST_SITE tag //addTag ( "site" , // ST_SITE , // now , // user , // ip , // p , // psize ); return true; } // to stdout long TagRec::print ( ) { SafeBuf sb; printToBuf ( &sb ); // dump that return fprintf(stderr,"%s\n",sb.getBufStart()); } bool TagRec::printToBuf ( SafeBuf *sb ) { Tag *tag = getFirstTag(); //sb->safePrintf("k.n1=0x%08lx k.n0=0x%016llx version=%li", // m_key.n1,m_key.n0,(long)m_version); for ( ; tag ; tag = getNextTag ( tag ) ) { if ( tag->m_type == TT_DUP ) continue; tag->printToBuf ( sb ); sb->pushChar('\n'); } return true; } // . return size of characters scanned from "p" // . returns 0 on error /* long TagRec::setFromBuf ( char *p , char *pend ) { // remember the start char *start = p; // scan in the key //if ( strncmp(p,"k.n1=0x",7) != 0 ) return 0; // skip key stuff //p += 7; // clear our key //m_key.setToMin(); // read in the key //key_t k; //sscanf(p,"k.n1=0x%08lx k.n0=0x%016llx ",&k.n1,&k.n0); // now do it the fast way and compare the results! //p += 7 ; //hexToBinary ( p , pend , ((char *)&m_key.n1)+3 , true ); //p += 8 + 8; //hexToBinary ( p , pend , ((char *)&m_key.n0)+7 , true ); // test it //if ( m_key.n1 != k.n1 || m_key.n0 != k.n0 ) { char *xx=NULL; *xx=0; } //p = strstr ( p , " version="); // error? //if ( ! p ) return 0; // skip " version=" //p += 9; // get version //m_version = atoi(p); // skip p until space //while ( p < pend && *p != ' ' ) p++; // error? //if ( p >= pend ) return 0; // skip the space -- NO! tag parser wants the space //p++; // point to the where we should serialize the tags into //char *tagPtr = m_buf; char tbuf[5000]; while ( p < pend ) { // now we should be pointing to the tag Tag *tag = (Tag *)tbuf; // serialize the tag from the buf long asciiBytesRead = tag->setFromBuf ( p , pend ); // if bad this is 0 if ( asciiBytesRead == 0 ) return 0; // store tag into our safebuf. return 0 with g_errno set on err // . mdwmdwmdw if ( ! m_sbuf.addTag ( tag ) ) return 0; // point to next tag to read into our binary buffer //p += asciiBytesRead; // inc our ptr to point to next tag if it exists //tagPtr += tag->getSize(); // inc our count in the TagRec //m_numTags++; // adjust our tag buffer size, TagRec::m_dataSize //m_dataSize = tagPtr - m_buf; // hey, it includes the other crap too! // it includes m_numTags + m_version, see Tagdb.h //m_dataSize += 2 + 1; } // clear all lists //resetLists(); // now make list point to that //m_lists[0].m_list = m_sbuf.getBufStart(); //m_lists[0].m_listSize = m_sbuf.length(); //m_lists[0].m_listAllocSize = 0; // do not free it! //m_numLists = 0; //return getSize(); return p - start; } */ bool TagRec::setFromBuf ( char *p , long bufSize ) { // assign to list! but do not free i guess m_lists[0].m_list = p; m_lists[0].m_listSize = bufSize; m_lists[0].m_listEnd = p + bufSize; m_lists[0].m_ownData = false; m_lists[0].m_lastKeyIsValid = false; m_lists[0].m_fixedDataSize = -1; m_lists[0].m_useHalfKeys = false; m_lists[0].m_ks = sizeof(key128_t); m_listPtrs[0] = &m_lists[0]; m_numListPtrs = 1; return true; } bool TagRec::serialize ( SafeBuf &dst ) { Tag *tag = getFirstTag(); for ( ; tag ; tag = getNextTag ( tag ) ) { if ( tag->m_type == TT_DUP ) continue; if ( ! 
dst.addTag ( tag ) ) return false; } return true; } bool TagRec::printToBufAsAddRequest ( SafeBuf *sb ) { Tag *tag = getFirstTag(); for ( ; tag ; tag = getNextTag ( tag ) ) if ( tag->m_type != TT_DUP ) tag->printToBufAsAddRequest ( sb); return true; } bool TagRec::printToBufAsXml ( SafeBuf *sb ) { Tag *tag = getFirstTag(); for ( ; tag ; tag = getNextTag ( tag ) ) if ( tag->m_type != TT_DUP ) tag->printToBufAsXml ( sb ); return true; } bool TagRec::printToBufAsHtml ( SafeBuf *sb , char *prefix ) { Tag *tag = getFirstTag(); for ( ; tag ; tag = getNextTag ( tag ) ) if ( tag->m_type != TT_DUP ) tag->printToBufAsHtml (sb,prefix); return true; } bool TagRec::printToBufAsTagVector ( SafeBuf *sb ) { Tag *tag = getFirstTag(); for ( ; tag ; tag = getNextTag ( tag ) ) if ( tag->m_type != TT_DUP ) tag->printToBufAsTagVector ( sb ); return true; } Tag *TagRec::getTag ( char *tagTypeStr , char *dataPtr , long dataSize ) { // get the tag type numerically long tagType = getTagTypeFromStr ( tagTypeStr ); Tag *tag = getFirstTag(); for ( ; tag ; tag = getNextTag ( tag ) ) { // skip if tag does not match "tagType" if ( tag->m_type != tagType ) continue; // skip dup tags if ( tag->m_type == TT_DUP ) continue; // skip if dataSize does not match if ( tag->getTagDataSize() != dataSize ) continue; // skip if data does not match if ( memcmp ( tag->getTagData() , dataPtr , dataSize ) ) continue; // we got a match return tag; } return NULL; } // // flags for a TagDescriptor // // is the tag a string type? #define TDF_STRING 0x01 // can we have multiple tags of this type from the same user in the // same TagRec? #define TDF_ARRAY 0x02 // . should we index it? // . index gbtagjapanese: // . also index "gbtagjapanese" if score != 0 // . TODO: actually use this #define TDF_NOINDEX 0x04 class TagDesc { public: char *m_name; char m_flags; // we compute the m_type of each TD on init long m_type; }; // map the tags to names static TagDesc s_tagDesc[] = { // data for the "lang" tag is 2 char language id followed by // a comma then a score from 1 to 100 to indicate percentage. // Allow multiple "lang" tags in one tagrec. 
{"rootlang" ,TDF_STRING,0}, // title tag and incoming link text of the root page is stored here // for determining default venue addresses {"roottitles" ,TDF_STRING|TDF_NOINDEX,0}, //{"rootlangid" ,TDF_STRING|TDF_NOINDEX,0}, // for addresses of the website, can be multiple {"venueaddress" ,TDF_STRING|TDF_ARRAY|TDF_NOINDEX,0}, /* {"langunknown" ,0x00,0}, {"english" ,0x00,0}, {"french" ,0x00,0}, {"spanish" ,0x00,0}, {"russian" ,0x00,0}, {"turkish" ,0x00,0}, {"japanese" ,0x00,0}, {"chinesetraditional" ,0x00,0}, {"chinesesimplified" ,0x00,0}, {"korean" ,0x00,0}, {"german" ,0x00,0}, {"dutch" ,0x00,0}, {"italian" ,0x00,0}, {"finnish" ,0x00,0}, {"swedish" ,0x00,0}, {"norwegian" ,0x00,0}, {"portuguese" ,0x00,0}, {"vietnamese" ,0x00,0}, {"arabic" ,0x00,0}, {"hebrew" ,0x00,0}, {"indonesian" ,0x00,0}, {"greek" ,0x00,0}, {"thai" ,0x00,0}, {"hindi" ,0x00,0}, {"bengala" ,0x00,0}, {"polish" ,0x00,0}, {"tagalog" ,0x00,0}, */ /* {"spam" ,0x00,0}, {"retail" ,0x00,0}, {"business" ,0x00,0}, {"adult" ,0x00,0}, {"forum" ,0x00,0}, {"blog" ,0x00,0}, {"news" ,0x00,0}, {"reference" ,0x00,0}, {"directory" ,0x00,0}, {"searchengine" ,0x00,0}, {"domainsquatter" ,0x00,0}, {"platform" ,0x00,0}, {"travel" ,0x00,0}, {"audio" ,0x00,0}, {"video" ,0x00,0}, {"socialnetworking" ,0x00,0}, */ {"manualban" ,0x00,0}, {"manualfilter" ,0x00,0}, // clock hashes are now stored in indexdb //{"clock" ,0x00,0}, {"dateformat" ,0x00,0}, // 1 = american, 2 = european {"ruleset" ,0x00,0}, //{"filtered" ,0x00,0}, //{"compromised" ,0x00,0}, //{"good" ,0x00,0}, {"deep" ,0x00,0}, //{"quality" ,0x00,0}, //{"dmozcatid" ,TDF_NOINDEX,0}, {"comment" ,TDF_STRING|TDF_NOINDEX,0}, // we now index this. really we need it for storing into title rec. {"site" ,TDF_STRING|TDF_ARRAY,0}, //{"meta" ,TDF_STRING,0}, // . website contact info // . used by ContactInfo.cpp // . TDB_ARRAY means not to "overwrite" even if username is the same // . a website can have multiple street addresses, etc. // . the "lines" of an single street address are separated by ';' // instead of \n to maintain tagdb dump output readability //{"streetaddress" ,TDF_ARRAY,0}, //{"phonenumber" ,TDF_ARRAY,0}, //{"faxnumber" ,TDF_ARRAY,0}, //{"emailaddress" ,TDF_ARRAY,0}, // . this tag can contain multiple zipcodes, separated by ' ' // . we do index these for local search //{"zipcodes" ,0x00,0}, // . similar to zip codes, separated by ' ' // . TODO: we need to fix Places.cpp to label the places for these tags // but for now we can do gbtagstreetaddress:munich and hope for // the best, although we will get websites on "munich st.!", but // maybe you can combine that with gbtagstreetaddress:germany //{"countries", ,0x00,0}, //{"cities", ,0x00,0}, // this is "0" or "1". if it is "0" then the date lets XmlDoc.cpp know // when we last tried to get the contact info for the site {"hascontactinfo" ,0x00,0}, // street address using ; as delimeter {"contactaddress" ,TDF_ARRAY|TDF_NOINDEX,0}, {"contactemails" ,TDF_ARRAY|TDF_NOINDEX,0}, //{"emailaddressonsite" ,TDF_ARRAY|TDF_NOINDEX,0}, //{"emailaddressoffsite" ,TDF_ARRAY|TDF_NOINDEX,0}, {"hascontactform" ,0x00,0}, // subscribe to google's blacklist and mark the sites as this //{"malware" ,0x00,0}, // . this is used to define INDEPENDENT subsites // . such INDEPENDENT subsites should never inherit from this tag rec // . it is used to handle "homesteading" sites like geocities.com // and the like, and is automatically set by SiteGetter.cpp // . if this is 1 then xyz.com/yyyyy/ is considered a subsite // . 
if this is 2 then xyz.com/yyyyy/zzzzz/ is considered a subsite // . if this is -1 then no subsite is found // . this should never be 0 either {"sitepathdepth" ,0x00,0}, // . used by XmlDoc::updateTagdb() and also used to determine // if we should index a site in XmlDoc.cpp. to be indexed a site // must be in google, or must have this tag type in its tag rec, // or have some other, soon to be invented, tag // . really this is all controlled by url filters table // . allow multiple tags of this type from same "user" {"authorityinlink" ,TDF_STRING|TDF_ARRAY,0}, {"pagerank" ,0x00,0}, {"ingoogle" ,0x00,0}, {"ingoogleblogs" ,0x00,0}, {"ingooglenews" ,0x00,0}, // geo location from this news site directory {"abyznewslinks.address",0x00,0}, // we now store site pop, etc. in tagdb {"sitenuminlinks" ,0x00,0}, {"sitenuminlinksuniqueip" ,0x00,0}, {"sitenuminlinksuniquecblock" ,0x00,0}, {"sitenuminlinkstotal" ,0x00,0}, // keep these although no longer used {"sitepop" ,0x00,0}, {"sitenuminlinksfresh" ,0x00,0}, // . the first ip we lookup for this domain // . this is permanent and should never change // . it is used by Spider.cpp to assign a host for throttling // all urls/SpiderRequests from that ip // . so if we did change it then that would result in two hosts // doing the throttling, really messing things up {"firstip" ,0x00,0} /* {"user.id" ,0x00,0}, {"user.xml" ,TDF_STRING,0}, {"user.login" ,TDF_STRING,0}, {"user.password" ,TDF_STRING,0}, {"user.securityquestion",TDF_STRING,0}, {"user.securityanswer" ,TDF_STRING,0}, {"user.email" ,TDF_STRING,0}, {"user.firstname" ,TDF_STRING,0}, {"user.lastname" ,TDF_STRING,0}, {"user.cookie" ,TDF_STRING,0}, {"user.zipcode" ,TDF_STRING,0}, {"user.city" ,TDF_STRING,0}, {"user.state" ,TDF_STRING,0}, {"user.imageurl" ,TDF_STRING,0}, {"user.dob" ,TDF_STRING,0}, {"user.language" ,TDF_STRING,0}, {"user.creditcardname" ,TDF_STRING,0}, {"user.creditcardnum" ,TDF_STRING,0}, {"user.creditcardexp" ,TDF_STRING,0}, {"user.creditcardcode" ,TDF_STRING,0}, {"user.lastlogin" ,0x00,0}, {"user.numlogins" ,0x00,0}, {"user.openlinksnewwin" ,0x00,0}, {"user.usehttps" ,0x00,0}, {"user.maxreadhist" ,0x00,0}, {"user.maxsearchhist" ,0x00,0}, {"user.format" ,0x00,0}, {"user.acctbalance" ,0x00,0}, {"user.acctlimit" ,0x00,0}, {"user.acctsuspended" ,0x00,0}, {"user.acctbillemails" ,TDF_STRING,0}, {"user.adstopicid" ,0x00,0}, {"user.adsdailybudget" ,0x00,0}, {"user.adsdisabled" ,0x00,0}, {"user.feednumqueries" ,0x00,0}, {"user.feedcpq" ,0x00,0}, {"user.feeddailybudget" ,0x00,0}, {"user.feeddisabled" ,0x00,0}, {"user.feedpassword" ,TDF_STRING,0}, {"user.feeddailycount" ,TDF_ARRAY,0}, {"user.usertransrec" ,TDF_ARRAY,0}, {"user.userhistoryrec" ,TDF_ARRAY,0}, {"user.userpanelrec" ,TDF_ARRAY,0}, {"trans.amount" ,0x00,0}, {"trans.desc" ,TDF_STRING,0}, {"hist.wasread" ,0x00,0}, {"hist.url" ,TDF_STRING,0}, {"hist.gigabits" ,TDF_STRING,0}, {"hist.timespent" ,0x00,0}, {"panel.topcid" ,0x00,0}, {"panel.showmainstream" ,0x00,0}, {"panel.showblogs" ,0x00,0}, {"panel.showforum" ,0x00,0}, {"panel.showweb" ,0x00,0}, {"panel.showsearchbox" ,0x00,0}, {"panel.showimages" ,0x00,0}, {"panel.showvideo" ,0x00,0}, {"panel.showchatbox" ,0x00,0}, {"panel.showchatpics" ,0x00,0}, {"panel.chatboxnumlines",0x00,0}, {"panel.popsliderval" ,0x00,0}, {"panel.agesliderval" ,0x00,0}, {"panel.windowxpos" ,0x00,0}, {"panel.windowypos" ,0x00,0}, {"panel.numstories" ,0x00,0}, {"panel.storylang" ,TDF_STRING,0}, {"panel.translatelang" ,TDF_STRING,0}, {"panel.displaylang" ,TDF_STRING,0}, {"panel.filterquery" ,TDF_STRING,0}, 
{"panel.sendemailalerts",TDF_STRING,0}, {"chat.comment" ,TDF_STRING,0}, {"ad.topicid" ,0x00,0}, {"ad.userid" ,0x00,0}, {"ad.adid" ,0x00,0}, {"ad.title" ,TDF_STRING,0}, {"ad.text" ,TDF_STRING,0}, {"ad.url" ,TDF_STRING,0}, {"ad.keywordstring" ,TDF_STRING,0}, {"ad.dailypledge" ,0x00,0}, {"ad.disabled" ,0x00,0}, {"ad.dailyimpresscount" ,TDF_ARRAY,0}, {"ad.dailyclickcount" ,TDF_ARRAY,0} */ }; // . convert "domain_squatter" to ST_DOMAIN_SQUATTER // . used by CollectionRec::getRegExpNum() // . tagnameLen is -1 if unknown long getTagTypeFromStr( char *tagname , long tagnameLen ) { // this is now the hash long tagType; if ( tagnameLen == -1 ) tagType = hash32n ( tagname ); else tagType = hash32 ( tagname , tagnameLen ); // make sure table is valid if ( ! s_initialized ) g_tagdb.setHashTable(); // sanity check, make sure it is a supported tag! if ( ! s_ht.getValue ( tagType ) ) { log("tagdb: unsupported tagname \"%s\"",tagname); char *xx=NULL;*xx=0; return -1; } return tagType; } // . convert ST_DOMAIN_SQUATTER to "domain_squatter" char *getTagStrFromType ( long tagType ) { // make sure table is valid if ( ! s_initialized ) g_tagdb.setHashTable(); TagDesc *td = (TagDesc *)s_ht.getValue ( tagType ); // sanity check if ( ! td ) { char *xx=NULL;*xx=0; } // return it return td->m_name; } // a global class extern'd in .h file Tagdb g_tagdb; Tagdb g_tagdb2; // a fake site for Tagdb::convert() //Tagdb g_sitedb; //static HashTableT s_lockTable; //static HashTableX s_lockTable2; // reset rdb and Xmls void Tagdb::reset() { m_rdb.reset(); //s_lockTable2.reset(); } bool Tagdb::setHashTable ( ) { if ( s_initialized ) return true; s_initialized = true; // the hashtable of TagDescriptors if ( ! s_ht.set ( 1024 ) ) return log("tagdb: Tagdb hash init failed."); // stock it long n = (long)sizeof(s_tagDesc)/(long)sizeof(TagDesc); for ( long i = 0 ; i < n ; i++ ) { TagDesc *td = &s_tagDesc[i]; char *s = td->m_name; long slen = gbstrlen(s); // use the same algo that Words.cpp computeWordIds does long h = hash64Lower_a ( s , slen ); // call it a bad name if already in there TagDesc *etd = (TagDesc *)s_ht.getValue ( h ); if ( etd ) return log("tagdb: Tag %s collides with old tag %s", td->m_name,etd->m_name); // set the type td->m_type = h; // add it s_ht.addKey ( h , (long)td ); } return true; } bool Tagdb::init ( ) { // snity test //if ( TAGREC_CURRENT_VERSION >= 30 ) { // log("tagdb: fix call to convert()"); // char *xx = NULL; *xx = 0; //} // . what's max # of tree nodes? // . assume avg tagdb rec size (siteUrl) is about 82 bytes we get: // . NOTE: 32 bytes of the 82 are overhead long maxTreeNodes = g_conf.m_tagdbMaxTreeMem / 82; //long long pcmem = 250000000; // 250MB // TODO: make it a biased disk page cache! long long pcmem = 160000000; // 160MB // turn it off for rebuilding posdb, to 10MB anyway pcmem = 10000000; //long pcmem = 100000000; // each entry in the cache is usually just a single record, no lists, // unless a hostname has multiple sites in it. has 24 bytes more // overhead in cache. //long maxCacheNodes = g_conf.m_tagdbMaxCacheMem / 106; // we now use a page cache if ( ! m_pc.init ("tagdb",RDB_TAGDB,pcmem,GB_TFNDB_PAGE_SIZE)) return log("tagdb: Tagdb init failed."); // init this //if ( ! s_lockTable2.set(8,4,32,NULL,0,false,0,"taglocktbl") ) // return log("tagdb: lock table init failed."); // . initialize our own internal rdb // . i no longer use cache so changes to tagdb are instant // . we still use page cache however, which is good enough! 
return m_rdb.init ( g_hostdb.m_dir , "tagdb" , true , // dedup same keys? -1 , // fixed record size 2,//g_conf.m_tagdbMinFilesToMerge , g_conf.m_tagdbMaxTreeMem , maxTreeNodes , // now we balance so Sync.cpp can ordered huge list true , // balance tree? 0 , //g_conf.m_tagdbMaxCacheMem , 0 , //maxCacheNodes , false , // half keys? false , //m_tagdbSaveCache &m_pc , false, // is titledb true , // preload disk page cache sizeof(key128_t), // key size true ); // bias disk page cache? } bool Tagdb::init2 ( long treeMem ) { // . what's max # of tree nodes? // . assume avg tagdb rec size (siteUrl) is about 82 bytes we get: // . NOTE: 32 bytes of the 82 are overhead long maxTreeNodes = treeMem / 82; // . initialize our own internal rdb // . i no longer use cache so changes to tagdb are instant // . we still use page cache however, which is good enough! return m_rdb.init ( g_hostdb.m_dir , "tagdbRebuild" , true , // dedup same keys? -1 , // fixed record size 50,//g_conf.m_tagdbMinFilesToMerge , treeMem , maxTreeNodes , // now we balance so Sync.cpp can ordered huge list true , // balance tree? 0 , //g_conf.m_tagdbMaxCacheMem , 0 , //maxCacheNodes , false , // half keys? false , //m_tagdbSaveCache NULL , // pc false, // is titledb false , // preload disk page cache sizeof(key128_t), // key size false ); // bias disk page cache? } bool Tagdb::addColl ( char *coll, bool doVerify ) { if ( ! m_rdb.addColl ( coll ) ) return false; if ( ! doVerify ) return true;//false; // verify if ( verify(coll) ) return true; // if not allowing scale, return false //if ( ! g_conf.m_allowScale ) return false; // otherwise let it go //log ( "tagdb: Verify failed, but scaling is allowed, passing." ); //return true; return false; } bool Tagdb::verify ( char *coll ) { char *rdbName = NULL; rdbName = "Tagdb"; log ( LOG_DEBUG, "db: Verifying %s for coll %s...", rdbName, coll ); g_threads.disableThreads(); Msg5 msg5; Msg5 msg5b; RdbList list; key128_t startKey; key128_t endKey; startKey.setMin(); endKey.setMax(); if ( ! msg5.getList ( RDB_TAGDB , coll , &list , (char *)&startKey , (char *)&endKey , 64000 , // minRecSizes , true , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , 0 , -1 , true , -1LL , &msg5b , true )) { g_threads.enableThreads(); return log("tagdb: HEY! it did not block"); } long count = 0; long got = 0; //long numOld = 0; for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { //key128_t k = list.getCurrentKey(); key128_t k; list.getCurrentKey ( &k ); count++; // see if it is the "old" school tagdb rec //char *data = list.getCurrentData(); //long dataSize = list.getCurrentDataSize(); // this is the file number in the old school tagdb recs // and it is the version number in the new school style recs. // just make sure the new school version number stays below 30! //char version = *data; // lower 3 bytes are the file number. >= 30 on gk //if ( version >= 30 ) numOld++; //unsigned long groupId = g_tagdb.getGroupId ( &k ); unsigned long shardNum = getShardNum ( RDB_TAGDB , &k ); if ( shardNum == getMyShardNum() ) got++; } if ( got != count ) { // tally it up g_rebalance.m_foreignRecs += count - got; log ("tagdb: Out of first %li records in %s, only %li belong " "to our group.",count,rdbName,got); // exit if NONE, we probably got the wrong data if ( got == 0 ) log("tagdb: Are you sure you have the " "right " "data in the right directory? 
" "Exiting."); log ( "tagdb: Exiting due to %s inconsistency.", rdbName ); g_threads.enableThreads(); return g_conf.m_bypassValidation; } log ( LOG_DEBUG, "db: %s passed verification successfully for %li " "recs.",rdbName, count ); // turn threads back on g_threads.enableThreads(); // if no recs in tagdb, but sitedb exists, convert it if ( count > 0 ) return true; // . convert them // . returns false and sets g_errno on error //if ( ! convert ( coll ) ) return false; // DONE g_threads.enableThreads(); return true; } ///////////// // // past blast -- for Tagdb::convert() // //////////// /* struct SiteType { SiteType() : m_score(0) {} SiteType& operator=(SiteType& o) {m_type=o.m_type;m_score=o.m_score; return *this;} // get this type's size long getStoredSize() { if (isType4Bytes(m_type)) return sizeof(m_type)+4; else return sizeof(m_type)+1; }; enum { FIRST_TYPE = 0, SPAM = FIRST_TYPE, //probablitity that it is spam RETAIL, //selling something BUSINESS, //a corporate storefront eg ibm.com ADULT, //not safe for kids, higher score = more hardcore FORUM, //message board BLOG, //or personal home page NEWS, //articles, opinions magazines REFERENCE, //all special interest sites DIRECTORY, //links organized categorically SEARCH_ENGINE, //indexed info DOMAIN_SQUATTER, PLATFORM, //political candidate, or org TRAVEL, //Travel sites AUDIO, //podcast, streaming radio VIDEO, //flash video SOCIAL_NETWORKING,//dating, myspace, facebook MANUAL_BAN, //a human hates this site PAGE_RANK, //google's page rank CLOCK1_PREHASH, //hash of unique preceeding 1st clock CLOCK1_PREHASH_CNT, // count of tags to make 1st clock hash DATE_FORMAT, //format of dates on page CLOCK2_PREHASH, //hash of unique tags preceeding 2nd clock CLOCK2_PREHASH_CNT, // count of tags to make 2nd clock hash CLOCK3_PREHASH, //hash of unique tags preceeding 3rd clock CLOCK3_PREHASH_CNT, // count of tags to make 3rd clock hash CLOCK4_PREHASH, //hash of unique tags preceeding 4th clock CLOCK4_PREHASH_CNT, // count of tags to make 4th clock hash // ....ADD ALL NEW TYPES HERE... corruption upon ye if not LAST_TYPE, BAD_TYPE = LAST_TYPE, TOTAL_TYPE_COUNT = (LAST_TYPE-FIRST_TYPE) }; // . types can be 1 byte or 4 bytes. if they are 4 bytes, they must be // added to this function static bool isType4Bytes(int type) { if ( type == CLOCK1_PREHASH ) return true; if ( type == CLOCK2_PREHASH ) return true; if ( type == CLOCK3_PREHASH ) return true; if ( type == CLOCK4_PREHASH ) return true; return false; } static long getScoreSize(uint8_t type) { if ( type == CLOCK1_PREHASH ) return 4; if ( type == CLOCK2_PREHASH ) return 4; if ( type == CLOCK3_PREHASH ) return 4; if ( type == CLOCK4_PREHASH ) return 4; return 1; }; bool isNormScore() {return m_type <= PAGE_RANK;} uint8_t m_type; uint32_t m_score; }; // . convert the old Tagdb format into the new format bool Tagdb::convert ( char *coll ) { g_threads.disableThreads(); log("db: Trying to convert sitedb for coll %s into tagdb",coll); collnum_t collnum = g_collectiondb.getCollnum ( coll ); // open up old sitedb files long mem = 100000000; long maxTreeNodes = mem / 82; //Rdb sitedb; g_sitedb.m_rdb.init ( g_hostdb.m_dir , "sitedb" , true , // dedup same keys? -1 , // fixed record size 9999 , // MinFilesToMerge 100000000 , // g_conf.m_tagdbMaxTreeMem maxTreeNodes , true , // balance tree? 0 , // g_conf.m_tagdbMaxCacheMem 0 , // maxCacheNodes false , // half keys? 
false , // m_tagdbSaveCache NULL , // DiskPageCache *, &m_pc false , // is titledb false , // preload disk page cache 12 , // key size false );// bias disk page cache? //g_collectiondb.init(true); g_sitedb.addColl ( coll, false ); Msg5 msg5; Msg5 msg5b; RdbList list; key_t startKey; key_t endKey; startKey.setMin(); endKey.setMax(); key_t k; bool threadsWereEnabled = !g_threads.areThreadsDisabled(); g_threads.disableThreads(); loop: // loop over all tagdb recs in tagdb if ( ! msg5.getList ( RDB_SITEDB , coll , &list , startKey , endKey , 64000 , // minRecSizes , true , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , 0 , -1 , true , -1LL , &msg5b , true )) { if(threadsWereEnabled) g_threads.enableThreads(); return log("db: HEY! it did not block"); } long count = 0; for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { k = list.getCurrentKey(); count++; char *data = list.getCurrentData(); //long dataSize = list.getCurrentDataSize(); // point to end of it //char *pend = data + dataSize; // parse the old site rec char *p = data; long old_sfn = (*(long *)p) & 0x00ffffff; //char old_version = p[3]; p += 4; char *old_site = p; long old_siteLen = gbstrlen(p); p += old_siteLen + 1; long old_time = *(long *)p; p += 4; char *old_comment = p; p += gbstrlen(p) + 1; //char *old_username = p; p += gbstrlen(p) + 1; //unsigned char siteFlags = *p; p += 1; //char siteQuality = *p; p += 1; //char incHere = *(long *)p; uint8_t numTypes = *(uint8_t *)p; p += 1; // do not start with http:// ! wastes space!! if (old_siteLen>=8 && strncmp(old_site,"http://",7)==0 ) { old_site += 7; old_siteLen -= 7; } // sanity check //Url s; s.set ( old_site, old_siteLen ); //key_t newk = g_tagdb.makeKey ( &s , false ); //if ( k != newk ) { char *xx=NULL;*xx=0; } // . without any tags, what is our dataSize? // . version(1 byte)+site(X bytes)+NULLTerm(1 byte)+ // #Tags(2 bytes) //long dataSize2 = 1 + old_siteLen + 1 + 2; // set the new rec with this stuff TagRec newgr; //newgr.set ( k , // dataSize2 , // TAGREC_CURRENT_VERSION , // old_site ); long now = getTimeGlobal(); // add the "site" name as a tag (include NULL) newgr.addTag ( ST_SITE , old_time , "conv" , 0, old_site, gbstrlen(old_site)+1); // the banned tag if ( old_sfn == 30 ) { char data = 1; newgr.addTag ( ST_MANUAL_BAN ,now, "conv", 0,&data,1); } if ( old_sfn == 50 ) { char data = 1; newgr.addTag ( ST_DEEP,now, "conv", 0,&data,1); } // just for historical reasons, keep this too newgr.addTag ( ST_RULESET , now , "conv",0,(char *)&old_sfn,1); // . add in comment tag // . this will increase newgr::m_dataEnd/m_dataSize // . 
include NULL if ( old_comment[0] ) newgr.addTag ( ST_COMMENT ,now, "conv", 0, old_comment , gbstrlen(old_comment)+1); // reset these bool gotPrehash1 = false; bool gotPrehash2 = false; bool gotPrehash3 = false; bool gotPrehash4 = false; bool gotPrehashCount1 = false; bool gotPrehashCount2 = false; bool gotPrehashCount3 = false; bool gotPrehashCount4 = false; long prehash1; long prehash2; long prehash3; long prehash4; char prehashCount1; char prehashCount2; char prehashCount3; char prehashCount4; // now for the old SiteTypes for ( long i = 0 ; i < numTypes ; i++ ) { //while ( p < pend ) { //SiteType *ost = (SiteType *)p; // get the type char siteType = *p; p++; // and the score char *siteTypeScore = p; long siteTypeScoreSize = SiteType::getScoreSize(siteType); p += siteTypeScoreSize; // a 0 score in the old sitedb meant to ignore if ( *siteTypeScore == 0 && siteTypeScoreSize == 1 ) continue; // map the siteType 1-1 for the most part long tagType = siteType + ST_SPAM; // if the type is SiteType::CLOCK2-4_ re-map it if ( siteType == SiteType::CLOCK1_PREHASH ) { gotPrehash1 = true; prehash1 = *(long *)siteTypeScore; continue; } if ( siteType == SiteType::CLOCK1_PREHASH_CNT ) { gotPrehashCount1 = true; prehashCount1 = *(char *)siteTypeScore; continue; } if ( siteType == SiteType::CLOCK2_PREHASH ) { gotPrehash2 = true; prehash2 = *(long *)siteTypeScore; continue; } if ( siteType == SiteType::CLOCK2_PREHASH_CNT ) { gotPrehashCount2 = true; prehashCount2 = *(char *)siteTypeScore; continue; } if ( siteType == SiteType::CLOCK3_PREHASH ) { gotPrehash3 = true; prehash3 = *(long *)siteTypeScore; continue; } if ( siteType == SiteType::CLOCK3_PREHASH_CNT ) { gotPrehashCount3 = true; prehashCount3 = *(char *)siteTypeScore; continue; } if ( siteType == SiteType::CLOCK4_PREHASH ) { gotPrehash4 = true; prehash4 = *(long *)siteTypeScore; continue; } if ( siteType == SiteType::CLOCK4_PREHASH_CNT ) { gotPrehashCount4 = true; prehashCount4 = *(char *)siteTypeScore; continue; } // but DATE_FORMAT is off if ( siteType == SiteType::DATE_FORMAT ) tagType = ST_DATE_FORMAT; // panic if ( tagType >= ST_LAST_TAG ) { log("db: got bad tagtype %li for sitedb rec.", (long)tagType); continue; } // add to new rec newgr.addTag ( tagType , // should be 1-1 now , "conv" , 0 , // ip siteTypeScore , siteTypeScoreSize ); } // add in the clock stuff if ( gotPrehash1 && gotPrehashCount1 ) { // make a 5 byte thingy char tmp[5]; tmp[0] = prehashCount1; memcpy ( tmp+1 , &prehash1, 4 ); newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5); } if ( gotPrehash2 && gotPrehashCount2 ) { // make a 5 byte thingy char tmp[5]; tmp[0] = prehashCount2; memcpy ( tmp+1 , &prehash2, 4 ); newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5); } if ( gotPrehash3 && gotPrehashCount3 ) { // make a 5 byte thingy char tmp[5]; tmp[0] = prehashCount3; memcpy ( tmp+1 , &prehash3, 4 ); newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5); } if ( gotPrehash4 && gotPrehashCount4 ) { // make a 5 byte thingy char tmp[5]; tmp[0] = prehashCount4; memcpy ( tmp+1 , &prehash4, 4 ); newgr.addTag ( ST_CLOCK,now,"conv",0,tmp,5); } // now the langs uint8_t numLangs = *p; p += 1; for ( long i = 0 ; i < numLangs ; i++ ) { uint8_t langId = *p; p += 1; long score = (long)*(uint8_t *)p; p += 1; // add to new rec newgr.addTag ( langId , // should be 1-1 now , "conv" , 0 , // ip (char *)&score , 1 ); } // print it out SafeBuf sb; newgr.printToBuf(&sb); logf(LOG_INFO,"tagdb: %s",sb.getBufStart()); Rdb *r = &g_tagdb.m_rdb; // . add the new site rec back as a TagRec // . 
it should overwrite the old one since the key is the same // . this should not block // . it should do a dump if tree is full if ( ! r->addRecord ( collnum , newgr.getKey () , newgr.getData () , newgr.getDataSize() , MAX_NICENESS )) { log("tagdb: convert: %s",mstrerror(g_errno)); char *xx=NULL;*xx=0; } // do a blocking dump of tree if it's 90% full now if (r->m_mem.is90PercentFull() || r->m_tree.is90PercentFull()){ log("tagdb: convert: dumping tree to disk."); if ( ! r->dumpTree ( 0 ) ) // niceness return log("tagdb: convert: dump failed."); } } // if list not empty, get more if ( list.isEmpty() ) { g_threads.enableThreads(); return true; } // advance startKey startKey = k; startKey += 1; // watch for wrap, that means done, too if ( startKey < k ) { g_threads.enableThreads(); return true; } // otherwise, do more goto loop; } */ /* // . dddddddd dddddddd dddddddd dddddddd d = domain hash w/o collection // . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu u = url hash // . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu key_t Tagdb::makeKey ( Url *u , bool isDelete ) { key_t k; // hash full hostname k.n1 = hash32 ( u->getHost() , u->getHostLen() ); // set lower 64 bits of key to hash of this url k.n0 = hash64 ( u->getUrl() , u->getUrlLen() ); // clear low bit if we're a delete, otherwise set it if ( isDelete ) k.n0 &= 0xfffffffffffffffeLL; else k.n0 |= 0x0000000000000001LL; return k; } */ // . ssssssss ssssssss ssssssss ssssssss hash of site/url // . xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx tagType OR hash of that+user+data // . xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx key128_t Tagdb::makeStartKey ( char *site ) { // Url *u ) { key128_t k; // hash full hostname //k.n1 = hash64 ( u->getHost() , u->getHostLen() ); k.n1 = hash64n ( site ); //k.n1 = hash32 ( u->getUrl(), u->getUrlLen() ); //k.n1 = hash32 ( u->getDomain(), u->getDomainLen() ); // set lower 64 bits of key to hash of this url k.n0 = 0; return k; } key128_t Tagdb::makeEndKey ( char *site ) { // Url *u ) { key128_t k; // hash full hostname //k.n1 = hash64 ( u->getHost() , u->getHostLen() ); k.n1 = hash64n ( site ); //k.n1 = hash32 ( u->getUrl(), u->getUrlLen() ); //k.n1 = hash32 ( u->getDomain(), u->getDomainLen() ); // set lower 64 bits of key to hash of this url k.n0 = 0xffffffffffffffffLL; return k; } key128_t Tagdb::makeDomainStartKey ( Url *u ) { key128_t k; // hash full hostname k.n1 = hash64 ( u->getDomain() , u->getDomainLen() ); //k.n1 = hash32 ( u->getUrl(), u->getUrlLen() ); //k.n1 = hash32 ( u->getDomain(), u->getDomainLen() ); // set lower 64 bits of key to hash of this url k.n0 = 0; return k; } key128_t Tagdb::makeDomainEndKey ( Url *u ) { key128_t k; // hash full hostname k.n1 = hash64 ( u->getDomain() , u->getDomainLen() ); //k.n1 = hash32 ( u->getUrl(), u->getUrlLen() ); //k.n1 = hash32 ( u->getDomain(), u->getDomainLen() ); // set lower 64 bits of key to hash of this url k.n0 = 0xffffffffffffffffLL; return k; } /* // . returns 0 if "url" is not a suburl of "site" // . otherwise, returns "percent" of "url" that matches "site" long Tagdb::getMatchPoints ( Url *recUrl , Url *url ) { // reset pts to 0 long pts = 0; // temporary fix to the hostname key collision problem is Tagdb Rdb long rhlen = recUrl->getHostLen (); char *uhost = url ->getDomain (); long uhlen = url ->getDomainLen (); char *shost = recUrl->getDomain (); long shlen = recUrl->getDomainLen (); //long uip = url->getIp (); //long sip = site->getIp (); // MDW: we are not really doing ips like this now if ( uhlen != shlen || strncmp( uhost, shost, uhlen ) != 0 ) // if ( ! 
uip || uip != sip ) return 0; return 0; // compare ports for bonus points // but return 0 if site's port is not default long rport = recUrl->getPort (); long uport = url->getPort (); if ( rport == uport ) pts += 1000000; else if ( uport != url->getDefaultPort() ) return 0; // now ensure url's path is a subpath of recUrl's long rplen = recUrl->getPathLen(); char *rpath = recUrl->getPath(); long uplen = url->getPathLen(); char *upath = url->getPath(); if ( rplen > uplen ) return 0; if ( strncmp ( upath , rpath , rplen ) != 0 ) return 0; // . now we got a solid match // . add 1 pt for each char in recUrl's path // . so the longer recUrl's path the better the match (more specific) // . this allows us to override TagRecs for deeper sub urls pts += rplen; // add in host size of the matching recUrl pts += rhlen*1000; // all done return pts; } */ /////////////////////////////////////////////// // // for getting the final TagRec for a url // /////////////////////////////////////////////// Msg8a::Msg8a() { m_replies = 0; m_requests = 0; } Msg8a::~Msg8a ( ) { reset(); } void Msg8a::reset() { // do no free if in progress, reply may come in and corrupt the mem if ( m_replies != m_requests && ! g_process.m_exiting ) { char *xx=NULL;*xx=0; } //for ( long i = 0 ; i < m_replies ; i++ ) // m_lists[i].reset(); m_replies = 0; m_requests = 0; } // . get records from multiple subdomains of url // . calls g_udpServer.sendRequest() on each subdomain of url // . all matching records are merge into a final record // i.e. site tags are also propagated accordingly // . closest matching "site" is used as the "site" (the site url) bool Msg8a::getTagRec ( Url *url , // site of the url char *site , char *coll , bool skipDomainLookup , // useCanonicalName , long niceness , void *state , void (* callback)(void *state ), TagRec *tagRec , bool doInheritance , char rdbId ) { CollectionRec *cr = g_collectiondb.getRec ( coll ); if ( ! cr ) { g_errno = ENOCOLLREC; return true; } // reset tag rec tagRec->reset();//m_numListPtrs = 0; // sanity check if ( rdbId != RDB_TAGDB ) {char *xx=NULL;*xx=0;} // save it m_rdbId = rdbId; // in use? need to wait before reusing if ( m_replies != m_requests ) {char *xx=NULL;*xx=0; } // then we gotta free the lists if any reset(); m_niceness = niceness; m_coll = coll; m_tagRec = tagRec; m_callback = callback; m_state = state; //m_url = url; // reset m_errno = 0; m_requests = 0; m_replies = 0; m_doneLaunching = false; //m_doFullUrl = true; //m_skipDomainLookup = skipDomainLookup; // set siteLen to the provided site if it is non-NULL long siteLen = 0; if ( site ) siteLen = gbstrlen(site); // . get the site // . msge0 passes this in as NULL an expects us to figure it out // . if site was NULL that means we guess it. default to hostname // unless in a recognized for like /~mwells/ if ( ! site ) { SiteGetter sg; sg.getSite ( url->getUrl() , NULL , // tagrec 0 , // timestamp NULL, // coll m_niceness, NULL, // state NULL); // callback // if it set it to a recognized site, like ~mwells // then set "site" if ( sg.m_siteLen ) { site = sg.m_site; siteLen = sg.m_siteLen; } } // if provided site was NULL and not of a ~mwells type of form // then default it to hostname if ( ! 
site ) { site = url->getHost(); siteLen = url->getHostLen(); } // temp null terminate it char c = site[siteLen]; site[siteLen] = '\0'; // use that m_siteStartKey = g_tagdb.makeStartKey ( site );//url ); m_siteEndKey = g_tagdb.makeEndKey ( site ); // url ); // un NULL terminate it site[siteLen] = c; // ignore this part of url is already root like //if ( m_url->isRoot() ) m_doFullUrl = false; // makeStartKey only works on the hostname of the url, so doing the // full url has no effect right now //m_doFullUrl = false; // sendPageInject keeps "url" on the stack! //m_url.set ( url->getUrl() , url->getUrlLen() ); m_url = url; // save this m_doInheritance = doInheritance; // . launch a request for each subdomain of the url // . the request format is // . \0\0 // . that way we can use a small request buffer and have different // pointers to the different subdomains //char *p = m_request; // point to url char *u = url->getUrl(); long ulen = url->getUrlLen(); // point to the TLD of the url char *tld = url->getTLD(); // . if NULL, that is bad... TLD is unsupported // . no! it could be an ip address! // . anyway, if the tld does not exist, just return an empty tagrec // do not set g_errno if ( ! tld && ! url->isIp() ) return true; //if ( ! tld ) { g_errno = EBADURL; return true; } // url cannot have NULLs in it because handleRequest8a() uses // gbstrlen() on it to get its size for ( long i = 0 ; i < ulen ; i++ ) { if ( u[i] ) continue; log("TagRec: got bad url with NULL in it %s",u); m_errno = EBADURL; g_errno = EBADURL; return true; } // skip over http:// long plen = url->getSchemeLen() + 3; u += plen; ulen -= plen; // copy over url without the protocol thingy (http://) //memcpy ( p , u , ulen ); // get the domain m_dom = url->getDomain(); // if none, bad! if ( ! m_dom && ! url->isIp() ) return true; // save this //m_host = url->getHost(); // get its delta //long delta = dom - u; // . save ptr for launchGetRequests() // . move this BACKWARDS for subdomains that have a ton of .'s // . no, now move towards domain m_p = m_url->getHost(); // and save this too m_hostEnd = m_url->getHost() + m_url->getHostLen(); // if ip just use the full "hostname" which is the full ip address //if ( url->isIp() ) m_p = m_host; // launch the requests if ( ! launchGetRequests() ) return false; // . they did it without blocking // . this sets g_errno on error gotAllReplies(); // did not block return true; } // . returns false if blocked, true otherwise // . sets g_errno and returns true on error bool Msg8a::launchGetRequests ( ) { // clear it g_errno = 0; bool tryDomain = false; loop: // return true if nothing to launch if ( m_doneLaunching ) return (m_requests == m_replies); // don't bother if already got an error if ( m_errno ) return (m_requests == m_replies); // limit max to 5ish if (m_requests >=MAX_TAGDB_REQUESTS) return (m_requests==m_replies); // take a breath QUICKPOLL(m_niceness); // . first, try it by canonical domain name // . if that finds no matches, then try it by ip domain // get host //char *subdom = m_p; //long subdomLen = m_hostEnd - m_p; key128_t startKey ; key128_t endKey ; //long siteHash32; // . if our first time, do the full url! // . need to do this because the turking process (XmlDoc::getTurkForm() // and PageReindex.cpp:processTurkForm()) add tags to tagdb based on // the full url. /* if ( m_doFullUrl ) { startKey = g_tagdb.makeStartKey ( m_url ); endKey = g_tagdb.makeEndKey ( m_url ); // . like the "norm" url above // . 
we'll get back a list of tags for this hostname, // but they could all be from different sites, some sites // would be the hostname, other tags might be from sites // that are a subsite of the hostname, so we have to make // sure the tag's key.n0 matches this siteHash32 siteHash32 = hash32 ( m_url->getUrl() , m_url->getUrlLen()); } else { // make into a url Url u; u.set ( subdom , subdomLen ); // set key range now startKey = g_tagdb.makeStartKey ( &u ); endKey = g_tagdb.makeEndKey ( &u ); // . like the "norm" url above // . we'll get back a list of tags for this hostname, // but they could all be from different sites, some sites // would be the hostname, other tags might be from sites // that are a subsite of the hostname, so we have to make // sure the tag's key.n0 matches this siteHash32 siteHash32 = hash32 ( u.getUrl() , u.getUrlLen() ); } */ if ( tryDomain ) { startKey = g_tagdb.makeDomainStartKey ( m_url ); endKey = g_tagdb.makeDomainEndKey ( m_url ); if ( g_conf.m_logDebugTagdb ) log("tagdb: looking up domain tags for %s", m_url->getUrl()); } else { // usually the site is the hostname but sometimes it is like // "www.last.fm/user/breendaxx/" //startKey = g_tagdb.makeStartKey ( m_site );//url ); //endKey = g_tagdb.makeEndKey ( m_site ); // url ); startKey = m_siteStartKey; endKey = m_siteEndKey; if ( g_conf.m_logDebugTagdb ) log("tagdb: looking up site tags for %s", m_url->getUrl()); } // get the groupid //unsigned long groupId = g_tagdb.getGroupId ( startKey ); // get the next mcast Msg0 *m = &m_msg0s[m_requests]; // and the list RdbList *listPtr = &m_tagRec->m_lists[m_requests]; // bias based on the top 64 bits which is the hash of the "site" now //uint32_t gid = g_hostdb.getGroupId ( m_rdbId , &startKey , true ); //Host *group = g_hostdb.getGroup ( gid ); unsigned long shardNum = getShardNum ( m_rdbId , &startKey , true ); Host *group = g_hostdb.getShard ( shardNum ); long numTwins = g_hostdb.getNumHostsPerShard(); // use top byte! uint8_t *sks = (uint8_t *)&startKey; uint8_t top = sks[sizeof(TAGDB_KEY)-1]; long hostNum = 0; if ( numTwins == 2 && (top & 0x80) ) hostNum = 1; // TODO: fix this! if ( numTwins >= 3 ) { char *xx=NULL;*xx=0; } long hostId = group[hostNum].m_hostId; // . launch this request, even if to ourselves // . TODO: just use msg0!! bool status = m->getList ( hostId , // hostId 0 , // ip 0 , // port 0 , // maxCacheAge false , // addToCache m_rdbId, //RDB_TAGDB , m_coll , listPtr , (char *) &startKey , (char *) &endKey , 10000000 , // minRecSizes this , // state gotMsg0ReplyWrapper , m_niceness , true , // error correction? true , // include tree? true , // doMerge? -1 , // firstHostId 0 , // startFileNum -1 , // numFiles 3600*24*365 );// timeout // all done? //if ( m_p == m_url->getDomain() ) m_doneLaunching = true; // error? if ( status && g_errno ) { // g_errno should be set, we had an error m_errno = g_errno; return (m_requests == m_replies); } // successfully launched m_requests++; // if we got a reply instantly if ( status ) m_replies++; if ( ! tryDomain ) { //&& //! m_skipDomainLookup && //m_url->getHostLen() != m_url->getDomainLen() ) { tryDomain = true; goto loop; } // // no more looping! // // i don't think we need to loop any more because we got all the // tags for this hostname. then the lower bits of the Tag key // corresponds to the actual SITE hash. so we gotta filter those // out i guess after we read the whole list. 
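	//
	// A minimal sketch of the two probes launched above, assuming the
	// key and host helpers behave exactly as they are called in this
	// function; the site string and the helper name pickTagdbTwin()
	// below are made up for illustration only.
	//
	//   // each probe is a plain 128-bit range read: key.n1 carries the
	//   // 64-bit hash of the site (or domain) and key.n0 sweeps the
	//   // full 64-bit range
	//   key128_t sk = g_tagdb.makeStartKey ( "example.com" ); // n0 = 0
	//   key128_t ek = g_tagdb.makeEndKey   ( "example.com" ); // n0 = ~0
	//
	//   // with two hosts per shard, the high bit of the key's top byte
	//   // picks which twin serves the read, exactly as done above
	//   long pickTagdbTwin ( key128_t *k , long numTwins ) {
	//           uint8_t *kb  = (uint8_t *)k;
	//           uint8_t  top = kb[sizeof(key128_t)-1];
	//           if ( numTwins == 2 && (top & 0x80) ) return 1;
	//           return 0;
	//   }
	//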
// return (m_requests == m_replies); //m_doneLaunching = true; //goto loop; /* // do not advance m_p if doing the full url first if ( m_doFullUrl ) { m_doFullUrl = false; goto loop; } // . advance m_p // . we go backwards to better support subdomains that have a ton // of periods in them... for ( ; m_p < m_dom && *m_p != '.' ; m_p++ ); // advance over . if ( m_p != m_dom ) m_p++; // if another dot that is bad! if ( *m_p == '.' ) m_errno = EBADURL; // launch another goto loop; */ } void gotMsg0ReplyWrapper ( void *state ) { Msg8a *THIS = (Msg8a *)state; // we got one THIS->m_replies++; // error? if ( g_errno ) THIS->m_errno = g_errno; // launchGetRequests() returns false if still waiting for replies... if ( ! THIS->launchGetRequests() ) return; // get all the replies THIS->gotAllReplies(); // set g_errno for the callback if ( THIS->m_errno ) g_errno = THIS->m_errno; // otherwise, call callback THIS->m_callback ( THIS->m_state ); } // get the TagRec from the reply void Msg8a::gotAllReplies ( ) { // if any had an error, don't do anything if ( m_errno ) return; // scan the lists for ( long i = 0 ; i < m_replies ; i++ ) { // breathe QUICKPOLL(m_niceness); // get list RdbList *list = &m_tagRec->m_lists[i]; // skip if empty if ( list->m_listSize <= 0 ) continue; // panic msg if ( list->m_listSize >= 10000000 ) { log("tagdb: CAUTION!!! cutoff tagdb list!"); log("tagdb: CAUTION!!! will lost useful info!!"); char *xx=NULL;*xx=0; } // otherwise, add to array m_tagRec->m_listPtrs[m_tagRec->m_numListPtrs] = list; // advance m_tagRec->m_numListPtrs++; } // . now scan all the tags for this HOSTNAME // . filter out tags that are not for a supersite of our url // . i.e. if our url is www.xyz.com/tim/bob/file.html // then hash // http://www.xyz.com/ // http://www.xyz.com/tim/ // http://www.xyz.com/tim/bob/ // and skip over any tag whose lower 32 bits does not match // one of those hashes... // . see where we set Tag::m_key.n0 in Tag::set() above: // m_key.n0 |= (uint32_t) hash32 ( norm.getUrl(),norm.getUrlLen() ); // where "norm" is the provided site but with a http:// in front // and a / at the end since Url::set() normalized it // . m_url is the url we want to get the tags for // . HACK: right now just restrict to the hostname! /* Url norm; norm.set ( m_url->getHost() , m_url->getHostLen() ); unsigned long siteHash32 = hash32 ( norm.getUrl(),norm.getUrlLen() ); // . and the domain too so we can ban domains // . this is messed up because we can't just hash the domain, we have // to hash it like a complete url because that is what Tag::set() // does when it makes the key's top 32 bits. unsigned long siteHash32d = 0; long conti = 0; siteHash32d = hash32_cont ( "http://",7,siteHash32d,&conti); siteHash32d = hash32_cont ( norm.getDomain(), norm.getDomainLen(), siteHash32d, &conti); siteHash32d = hash32_cont ( "/",1,siteHash32d,&conti); // the non-del bit i guess. we forgot to shift up when we made // the key above! siteHash32 |= 0x01; siteHash32d |= 0x01; */ // scan tags in list and set Tag::m_type to TT_DUP if its a dup Tag *tag = m_tagRec->getFirstTag(); HashTableX cx; char cbuf[2048]; cx.set ( 4,0,64,cbuf,2048,false,m_niceness,"tagtypetab"); // . loop over all tags in all lists in order by key // . each list should be from a different suburl? // . the first list should be the narrowest/longest? for ( ; tag ; tag = m_tagRec->getNextTag ( tag ) ) { // breathe QUICKPOLL(m_niceness); // skip tag if it is not from the proper site. 
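	// (note: as written, this loop only dedups tags whose upper 32 bits
	//  of key.n0 -- the per-type dedup hash -- repeat, marking them
	//  TT_DUP in the small "cx" table set up above; the site-hash
	//  filtering described in this comment is still commented out below)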
we are // only guarenteed that all tags in this list are for the // same HOSTNAME not SITE! site is in the lower bits // of the tagdb key. // should fix www.paypal.com:1234 bug where we were reading // sitenuminlinks from that tag and was always 0!! even // when we'd add a count of 2k to the www.paypal.com site... // now filter out www.paypal.com:1234's tags! // TODO: allow multiple different siteHash32 values to match // here, use one siteHash32 for each possible suburl of "m_url" // so if m_url is "http://www.xyz.com/tim/" then we also // can match hash32("http://www.xyz.com/tim/" not just // "http://www.xyz.com/" which is how it is now. //unsigned long th32 = tag->m_key.n0 & 0xffffffff; //if ( th32 != siteHash32 && th32 != siteHash32d ) { // // maybe use TT_DIFFSITE instead of this! TODO! // tag->m_type = TT_DUP; // continue; //} // form the hash! uint32_t h32 = (unsigned long)((tag->m_key.n0) >> 32); // skip if not unique //if ( ! isTagTypeUnique ( tag->m_type ) ) continue; // otherwise, record it if ( cx.isInTable(&h32 ) ) // tag->m_type) ) tag->m_type = TT_DUP; else if ( ! cx.addKey(&h32) ) { m_errno = g_errno; return; } } } /* // get the TagRec from the reply void TagRec::gotAllReplies ( ) { // if any had an error, don't do anything if ( m_errno ) return; // time how long this takes and log it long long startTime = gettimeofdayInMilliseconds(); // how many TagRecs we matched long n = 0; // arrays for pointing to best matching TagRecs //char *data [128]; //long dataSizes [128]; //long dataScores [128]; char *recs [128]; long recScores [128]; // . each reply is a list of TagRecs // . each TagRec is a standard Rdb record // . key|dataSize|data... // . go through all TagRecs and sort our list of ptrs to the // best TagRecs // . some TagRecs will not even match, so do not include those in // our list of pointers // . the closest matching TagRecs will be on top // . inherit Tags from lesser matching TagRecs provided there // is no such Tag::m_type from a closer matching TagRec // . if xyz.com is banned and abc.xyz.com has a 0 score for the // ST_BANNED Tag, then it is effectively "unbanned" and should // not inherit the score from xyz.com for ST_BANNED. // . so by scanning each TagRec in order, we compose our own // final merged TagRec that may have a lot more Tags in it // than any one matching TagRec for ( long i = 0 ; i < m_replies ; i++ ) { // get the list from this reply RdbList *list = &m_lists[i]; // scan list for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) { // break if overflow if ( n >= 128 ) break; // get next rec //char *d = list->getCurrentData (); //long dsize = list->getCurrentDataSize(); char *rec = list->getCurrentRec(); // set TagRec to it TagRec *gr = (TagRec *)rec; // get the site //char *site = gr->getString(ST_SITE,NULL); char *site = gr->getString("site",NULL); // sanity check if ( ! 
site ) { char *xx=NULL;*xx=0; } // make it a url Url u; u.set ( site , gbstrlen(site) ); // score it long s = g_tagdb.getMatchPoints ( &u , m_url ); // skip it if not a match if ( s <= 0 ) continue; // save it //data [n] = d; //dataSize [n] = dsize; recs [n] = rec; recScores [n] = s; n++; } } // if no recs, we did not match anything if ( n == 0 ) return; // or on error if ( m_errno ) return; // bubble sort the recs by their scores, highest score first bubble: bool swapped = false; for ( long i = 1 ; i < n ; i++ ) { // keep going if in correct order if ( recScores[i-1] >= recScores[i] ) continue; // swap char *t1 = recs [i-1]; long t2 = recScores [i-1]; recs [i-1] = recs [i]; recs [i ] = t1; recScores [i-1] = recScores [i]; recScores [i ] = t2; swapped = true; } if ( swapped ) goto bubble; // parse the best matching SiteData //TagRec gr ; gr.set ( data[0] , dataSizes[0] ); // use the site from the best matching TagRec as our site //m_siteUrl.set ( gr.getSite() , gr.getSiteLen() ); // reset the inheritance array //char array[ST_LAST_TAG]; //memset ( array , -1 , 256 ); HashTable ia; char ibuf [ 1024 * 8 ]; ia.set ( 1024 , ibuf , 1024 * 8 ); // we just store the tags, ptrs into the tags in the m_lists //Tag *tags[MAX_TAGS]; // assume we got no tags //long numTags = 0; // size of all tags //long size = 0; // set our new tag rec m_tagRec->reset(); // . only get tags from the first matching tag rec if we should not // do the inheritance loop // . if they click "get rec" on PageTagdb, then do not do inheritance, // but if they click "get tags", then do it! if ( ! m_doInheritance && n > 0 ) n = 1; // . DO NOT INHERIT ANYTHING FROM TAG RECS that have a sitePathDepth // tag in them UNLESS the sitePathDepth does not work on us // . i.e. if xyz.com has a sitePathDepth of 2 in its TagRec and the // url we are looking at is xyz.com/a/b/c/d then we must assume that // out site is xyz.com/a/b/ we are an independent subsite of // xyz.com and inherit nothing from it SiteGetter siteGetter; // site getter sometimes adds recs to tagdb to add in a new subsite // it finds... i'd imagine this will create a parsing inconsistency // when injecting docs into the "test" coll... but oh well! long timestamp = getTimeGlobal(); // . begin the "inheritance loop" // . fill our m_tags[] array with the Tags that apply to us for ( long i = 0 ; i < n ; i++ ) { // breathe QUICKPOLL(m_niceness); // parse the TagRec (very fast) TagRec *gr = (TagRec *)recs[i]; // is "url" an independent subsite of gr's site? char *us = m_url->getUrl(); bool st=siteGetter.getSite(us,gr,timestamp,m_coll,m_niceness ); // sanity check, not allowed to block since state is NULL! if ( ! st ) { char *xx=NULL;*xx=0; } // are we independent subsite? if so, do not inherit // from that. this is used to prevent www.geocities.com/~mark/ // from gaining the benefits of being on the www.geocities.com // site. TODO later: we should make another tag to indicate // a subsite is expicitly independent. but for now we rely // on the "sitepathdepth" tag automatically computed by // SiteGetter.cpp. //if ( siteGetter.isIndependentSubsite() ) continue; // // TODO: // NONO, just do not inherit sitenumlinks or any tag // that is marked as such!!! add a new flag to the tags!!!!!! 
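	// (sketch of the inheritance rule in this old path: the recs[] array
	//  is sorted best match first, and the "ia" table set up above maps
	//  a tagType to the index of the TagRec it was first inherited from,
	//  so a lesser-matching TagRec can never override it)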
// // always add the ST_SITE tag first from each tag so we know // what site the other tags belong to //Tag *stag = gr->getTag ( ST_SITE ); Tag *stag = gr->getTag ( "site" ); // only add if non null if ( stag ) m_tagRec->addTag ( stag ); // last tag Tag *last = NULL; // loop over all tags in TagRec #i tagLoop: // get the tag id of current tag Tag *tag = gr->getNextTag ( last ); // assign last = tag; // was that the end of the tags? if so, go to next TagRec if ( ! tag ) continue; // get tag id long tagType = tag->m_type; // skip all ST_SITE tags, we added those first above //if ( tagType == ST_SITE ) goto tagLoop; if ( tag->isType("site") ) goto tagLoop; // sanity check //if ( tagType >= ST_LAST_TAG ) { char *xx=NULL;*xx=0;} // for getting the next tag, remember this last = tag; // . have we added this yet? // . if tagType added from a prev TagRec do not "inherit" it //if(array[tagType] != -1 && array[tagType] != i) goto tagLoop; long slot = ia.getSlot ( tagType ); if ( slot >= 0 && ia.getValueFromSlot(slot) != i) goto tagLoop; // if tag type is "eventtag" then only add it if the site of this // tagrec EQUALS our url. exact match... that way we make sure to only // tag a single url, otherwise we might accidentally tag an entire site. if ( tag->isType("eventtag") ) { // must be in tagRec that matches us the closest if ( i != 0 ) goto tagLoop; // if no site, skip it if ( ! stag ) goto tagLoop; // and even then must match site exactly char *site = stag->m_data; // as string char *url = m_url->getUrl(); long ulen = m_url->getUrlLen(); // skip our proto (http://) url += m_url->getSchemeLen() + 3; ulen -= m_url->getSchemeLen() + 3; // remove trailing / if ( ulen > 0 && url[ulen-1] == '/' ) ulen--; // likewise for site long slen = gbstrlen(site); if ( slen > 0 && site[slen-1] == '/' ) slen--; // skip if not exact if ( slen != ulen ) goto tagLoop; // compare, must match exactly, if not, do not add tag if ( strncmp(url,site,slen) != 0 ) goto tagLoop; } // ok, add/inherit it //tags[numTags++] = tag; // add it directly to m_tagRec if ( ! m_tagRec->addTag ( tag ) ) { log("tagdb: addTag failed: %s",mstrerror(g_errno)); m_errno = g_errno; break; } // add in size //size += tag->getSize(); // note it, so we do not add/inherit it from another TagRec //array[tagType] = i; ia.addKey ( tagType , i ); // add more tags goto tagLoop; } // sanity! //if ( size > 32000 ) { char *xx=NULL;*xx=0; } //if ( size + 2 + 2 > MAX_TAGREC_SIZE ) { char *xx=NULL;*xx=0; } // then copy the tags into the buffer //for ( long i = 0 ; i < numTags ; i++ ) // m_tagRec->addTag ( tags[i] ); // sanity check //if ( p - m_tagRec > MAX_TAGREC_SIZE ) { char *xx=NULL;*xx=0;} // free the mem reset(); // time it long long took = gettimeofdayInMilliseconds() - startTime; if(took>10) log(LOG_INFO, "admin: gotreply for msg8a took %lli",took); } */ /* /////////////////////////////////////////////// // // Msg9a : for modifying TagRecs in Tagdb // /////////////////////////////////////////////// Msg9a::Msg9a () { m_requestBuf = NULL; m_requests = 0; m_replies = 0; } Msg9a::~Msg9a() { reset(); } void Msg9a::reset() { // guard against not waiting for all replies to come in if ( m_requests != m_replies && ! g_process.m_exiting ) { char *xx=NULL;*xx=0; } if ( ! m_requestBuf ) return; mfree ( m_requestBuf , m_requestBufSize , "msg9a" ); m_requestBuf = NULL; } // . returns false if blocked, true otherwise // . sets errno on error // . "urls" is a NULL-terminated list of space-separated urls // . 
if "addTags" is true, then the tags in "tagRec" will be added to the /// the TagRecs specified by the sites in "sites". if a TagRec // does not exist for a given "site" then it will be added just // so we can add the Tags to it. If it does exist, we will // just append the given Tags to it. // . to "delete" a tag, just assign it a dataSize of 0! // . Tags added with the same user name and tag type of an existing tag // will overwrite it. // . you can now optionally supply an array of ptrs to sites, sitePtrs. // . you can call this with your "tagRec" on the stack because we copy // its contents into our own buffer here bool Msg9a::addTags ( char *sites , char **sitePtrs , long numSitePtrs , char *coll , void *state , void (*callback)(void *state) , long niceness , TagRec *tagRec , bool nukeTagRecs , long *ipVector ) { // incase we are being re-used! reset(); g_errno = 0; // sanity check, one or the other if ( sites && sitePtrs ) { char *xx=NULL;*xx=0; } // ipVector only used with sitePtrs for now if ( ! sitePtrs && ipVector ) { char *xx=NULL;*xx=0; } // when we add the "site" tag to it use the timestamp from one // of the tags we are adding... therefore we must require there be // some tags! we do this to insure injection consistency into the // "test" collection. if ( ! tagRec || tagRec->getNumTags() <= 0 ) { char *xx=NULL;*xx=0; } // use the first timestamp long timestamp = tagRec->getFirstTag()->m_timestamp; // . up to 20 oustanding Msg0 getting the exact TagRec for each site // . when we get it we immediately modify it and then add it back // using Msg4. // . to resolve collisions we could assign a particular hostid // to handle adding each site... yeah, how about the local host. // . so forward the Msg9a add/del/rpl request to the responsible // host. then it can lock the "site" until the add completes. // . it should use Msg1 to add it. // reset m_errno = 0; m_requests = 0; m_replies = 0; m_niceness = niceness; m_state = state; m_callback = callback; long collLen = gbstrlen(coll); // how many urls in the sites do we have? long numUrls = 0; // point to buf char *s = sites; // count each one while ( sites && *s ) { // skip whitespace while ( *s && is_wspace_a(*s) ) s++; // alnum? if ( *s ) numUrls++; // skip url while ( *s && ! is_wspace_a(*s) ) s++; } if ( sitePtrs ) numUrls = numSitePtrs; // how much buf do we need to hold all the requests for all the sites long need = 0; // just a buffer of sites if ( sites ) need += 2 * (gbstrlen(sites) + 1); // otherwise, use the site ptrs for ( long i = 0 ; i < numSitePtrs ; i++ ) need += 2 * (gbstrlen(sitePtrs[i]) + 1); // how big is each request's header? long header = 0; // request size header += 4; // niceness header += 1; // collection header += collLen + 1; // flag header += 1; // the tag rec header += tagRec->getSize(); // . add ST_SITE to each tagRec // . we already accounted for the sites in the gbstrlen() above header += sizeof(Tag); // one header per url need += header * numUrls; // make a request buffer for all the requests m_requestBuf = (char *)mmalloc ( need , "msg9a-add"); if ( ! m_requestBuf ) return true; m_requestBufSize = need; // carve it up char *p = m_requestBuf; // loop over sites s = sites; // reset sitePtr counter in case we are using those long si = 0; //long now = getTimeGlobal(); // loop it for ( ; ; si++ ) { // stop if all done if ( sites && ! 
*s ) break; // or this if ( sitePtrs && si >= numSitePtrs ) break; // make "s" point to the site if we are using ptrs if ( sitePtrs ) s = sitePtrs[si]; // skip whitespace while ( *s && is_wspace_a(*s) ) s++; // skip over http:// (wastes space) if ( strncmp(s,"http://",7)==0 ) s += 7; // find end of url char *send = s; while ( *send && ! is_wspace_a(*send)) send++; // get the length long len = send - s; // done? make sure we are using the site buffer and not ptrs if ( sites && ! *s ) break; // a place holder for the request size long *rsizePtr = (long *)p; p += 4; // track the size char *start = p; // first niceness *p = niceness; p++; // then coll memcpy ( p , coll , collLen ); p += collLen; // NULL term *p++ = '\0'; // add flag first *p = 0x00; //if ( deleteTags ) *p = 0x01; if ( nukeTagRecs ) *p = 0x02; // delete entire TagRec? p++; // now make the Tag! //TagRec *tagRec = (TagRec *)p; // sets its ip special if we should long ip = 0; if ( ipVector ) ip = ipVector[si]; // . copy it over // . get the size long size = tagRec->getSize(); // add in tagRec memcpy ( p , tagRec , size ); // cat it to p TagRec *newgr = (TagRec *)p; // NULL terminate it temporarily char c = s[len]; s[len] = 0; // . remove the old site so the new one can replace it // . we already contain a SITE_TAG and addTag() will NEVER // replace that particular tag... // . this is now removed above //newgr->removeTag ( "site" , NULL ); // add the site //newgr->addTag ( ST_SITE, now,"tagdb",0,s, len+1 ); newgr->addTag ( "site", timestamp,"tagdb",ip,s, len+1 ); // undo the NULL termination s[len] = c; // update the size size = newgr->getSize(); // advance p += size; // how big was the request, store that *rsizePtr = (p - start); // advance s s = send; } // reset ptr to request to launch m_p = m_requestBuf; // sanity check if ( p - m_requestBuf > need ) { char *xx=NULL;*xx=0; } // all done m_pend = p; // launch them if ( ! launchAddRequests () ) return false; // hey that should always block! if ( ! g_errno ) { char *xx=NULL; *xx=0; } // show erroer log("tagdb: msg9a: %s",mstrerror(g_errno)); // free the allocated mem reset(); // did not block... return true; } // . "dumpFile" format contains one tag record per line as // dumped from './gb dump S main 0 -1 1' cmd line cmd. // . it is the format given by the TagRec::printToBuf() cmd bool Msg9a::addTags ( char *dumpFile , char *coll , void *state , void (*callback)(void *state) , long niceness ) { g_errno = 0; // reset m_errno = 0; m_requests = 0; m_replies = 0; m_niceness = niceness; m_state = state; m_callback = callback; long collLen = gbstrlen(coll); // scan the dump file char *p = dumpFile; // the end of it char *pend = p + gbstrlen(p); // add up total sizes long sum = 0; // end of line ptr char *eol; // count long count = 1; // debug //HashTable ht; // do the scan for ( ; p < pend ; p = eol + 1 ) { // point to next line eol = p; while ( eol < pend && *eol != '\n' ) eol++; // a fake tag rec TagRec gr; // . scan it into "gr" // . returns size of the tag rec stored into "buf" long bytesScanned = gr.setFromBuf ( p , eol ); // error? if ( bytesScanned <= 0 ) {count++; continue;} // get size long size = gr.getSize(); // error? if ( size <= 0 ) {count++; continue;} //logf(LOG_DEBUG,"tagdb: tag %li size=%li",count++,size); // hash it for debug //ht.addKey ( count , size ); count++; // sanity check if ( size > MAX_TAGREC_SIZE ) { char *xx=NULL;*xx=0;} // sanity check char *site = gr.getString("site",NULL); if ( ! 
site ) { char *xx=NULL;*xx=0;} // then request header size size += 4 + 1 + collLen + 1 + 1; // increment total size sum += size; } // make the buf m_requestBuf = (char *)mmalloc ( sum , "msg9adbuf"); m_requestBufSize = sum; // store tags here char *t = m_requestBuf; // return true on error with g_errno set if ( ! t ) return true; // reset to beginning of file p = dumpFile; // reset count = 1; // do the scan for ( ; p < pend ; p = eol + 1 ) { // point to next line eol = p; while ( eol < pend && *eol != '\n' ) eol++; // first is the request size long *requestSizePtr = (long *)t; t += 4; // see how big the request is char *a = t; // then niceness *t++ = (char)MAX_NICENESS; // then coll memcpy ( t , coll , collLen ); t += collLen; // null temrinate *t++ = '\0'; // then the 1 byte flag (0 means add?) *t++ = 0; // store TagRec into the request buffer TagRec *gr = (TagRec *)t; // . scan it into "t" // . returns size of the tag rec stored into "buf" long bytesScanned = gr->setFromBuf ( p , eol ); // error? if ( bytesScanned <= 0 ) { log("tagdb: skipping tag rec #%li.",count++); t -= (4+1+collLen+1+1); continue; } // get size long size = gr->getSize(); // error? if ( size <= 0 ) { log("tagdb: skipping tag rec #%li.",count++); t -= (4+1+collLen+1+1); continue; } // test it //long slot = ht.getSlot ( count ); //if ( slot < 0 ) { char *xx=NULL;*xx=0; } //long shouldbe = ht.getValueFromSlot ( slot ); //if ( size != shouldbe ) { char *xx=NULL;*xx=0; } count++; //logf(LOG_DEBUG,"tagdb: tag %li size=%li",count++,size); // increment storage ptr t += size; // store the size of the WHOLE REQUEST, does not // include the request size itself. see // launchRequests() below. *requestSizePtr = (t - a); // sanity check if ( *requestSizePtr > 10000 ) { char*xx=NULL;*xx=0;} } // sanity check if ( t - m_requestBuf != sum ) { char *xx=NULL;*xx=0; } // use their ptrs for adding these tag recs m_p = m_requestBuf; m_pend = m_requestBuf + m_requestBufSize ; // now add those tags return launchAddRequests ( ); } // . returns false if blocked, true otherwise // . sets g_errno and returns true on error bool Msg9a::launchAddRequests ( ) { // clear it g_errno = 0; loop: // return true if nothing to launch if ( m_p >= m_pend ) return (m_requests == m_replies); // don't bother if already got an error if ( m_errno ) return (m_requests == m_replies); // limit max oustanding to 20 if (m_requests - m_replies >= 20 ) return (m_requests==m_replies); // take a breath QUICKPOLL(m_niceness); // parse our request char *p = m_p; // first is the request size p += 4; // then niceness p += 1; // then coll p += gbstrlen(p) + 1; // then the 1 byte flag p++; // then the tag rec TagRec *tagRec = (TagRec *)p; // . get the groupid // . tagRec's key should already be valid because when you add // a ST_SITE to a TagRec it sets TagRec::m_key (special thing) //unsigned long groupId = g_tagdb.getGroupId ( &tagRec->m_key ); uint32_t shardNum = getShardNum ( RDB_TAGDB , &tagRec->m_key ); // get the host to send to Host *hosts = g_hostdb.getGroup ( groupId ); // select a host in the group long hostNum = tagRec->m_key.n1 % g_hostdb.getNumHostsPerShard(); // and his ptr Host *h = &hosts[hostNum]; // get the next mcast //Multicast *m = &m_casts[m_requests]; // reqeust size long requestSize = *(long *)m_p; m_p += 4; char *request = m_p; m_p += requestSize; // . send to just one very specific host so he is the only one that // controls modification to this particular tagdb rec. that way if // we are changing its Tags we do not collide with another. // . 
this returns false and sets g_errno on error UdpServer *us = &g_udpServer; bool status = us->sendRequest ( request , requestSize , 0x9a , h->m_ip , // bestIp h->m_port , // destPort h->m_hostId , // hostId NULL , // slotPtr this , // state gotReplyWrapper9a , // callback 365*24*3600 , // timeout -1 , // backoff -1 , // max wait in ms NULL , // replybuf 0 , // replybufMaxSize m_niceness ); // error? if ( ! status ) { // g_errno should be set, we had an error m_errno = g_errno; return (m_requests == m_replies); } // successfully launched m_requests++; // launch another goto loop; } void gotReplyWrapper9a ( void *state , UdpSlot *slot ) { Msg9a *THIS = (Msg9a *) state; THIS->m_replies++; // don't let him free our send buf, it is m_requestBuf // which we allocated above slot->m_sendBufAlloc = NULL; // error? if so, save it if ( g_errno && ! THIS->m_errno ) THIS->m_errno = g_errno; if ( ! THIS->launchAddRequests() ) return; // free the allocated mem THIS->reset(); THIS->m_callback ( THIS->m_state ); } class State9a { public: UdpSlot *m_slot; Msg5 m_msg5; char m_requestType; Msg1 m_msg1; RdbList m_list; // this has all the tags we need to add/remove/replace TagRec *m_tagRec; // this has the original tagRec and we modify it with "m_tagRec" // to get the final TagRec we add back to Tagdb. it is the // "accumulator" tagdb record. TagRec m_accRec; // enough mem to store a key_t and a 0 dataSize (long) char m_tmp[12+4]; char m_niceness; char *m_coll; // linked list of ppl waiting in line to make mods class State9a *m_next; //class State9a *m_tail; }; void handleRequest9a ( UdpSlot *slot , long niceness ) { // get the request char *request = slot->m_readBuf; long requestSize = slot->m_readBufSize; // overflow protection for corrupt requests if ( requestSize < 4 ) { g_errno = EBUFTOOSMALL; g_udpServer.sendErrorReply ( slot , g_errno ); return; } // make a new Msg9a State9a *st ; try { st = new (State9a); } catch ( ... ) { g_errno = ENOMEM; log("msg9a: new(%i): %s", sizeof(State9a), mstrerror(g_errno)); return g_udpServer.sendErrorReply ( slot, g_errno ); } mnew ( st , sizeof(State9a) , "Msg10" ); // parse the request char *p = request; // save slot for sending reply st->m_slot = slot; // get niceness st->m_niceness = *(char *)p; p++; // get coll st->m_coll = p; p += gbstrlen(p) + 1; // save this st->m_requestType = *p; p++; // the "tagRec" is the record TagRec *tagRec = (TagRec *)p; p += tagRec->getSize(); // store ptr st->m_tagRec = tagRec; // reset this, we are the head/tail of the linked list so far st->m_next = NULL; // sanity check //char *site = tagRec->getString(ST_SITE,NULL); char *site = tagRec->getString("site",NULL); // this is a no-no if ( ! site ) { char *xx=NULL;*xx=0;} // no tail after us //st->m_tail = NULL; // . get the lock on this site // . the lower 64 bits of the key should be the url hash long slotNum = s_lockTable2.getSlot ( &st->m_tagRec->m_key.n0 ); // if already in there, we have to wait because someone is already // making mods to this TagRec if ( slotNum >= 0 ) { // log this for now? if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"tagdb: TAGDB handleRequest9a " "waiting for lock st=0x%lx key.n0=%llu",(long)st, st->m_tagRec->m_key.n0); State9a *p ; p = *(State9a **)s_lockTable2.getValueFromSlot(slotNum); // put us right after him in the linked list st->m_next = p->m_next; p->m_next = st; // we could be the next in line //if ( ! p->m_next ) p->m_next = st; // we wait... return; } // delete our slot from the lock table if ( ! 
s_lockTable2.addKey ( &st->m_tagRec->m_key.n0 , &st ) ) { log("tagdb: failed to get lock : %s",mstrerror(g_errno)); // free him, we sent his reply mdelete ( st , sizeof(State9a),"msg9afr"); delete (st); return g_udpServer.sendErrorReply ( slot, g_errno ); } // make a startKey and endKey from the tagRec's key key_t startKey = tagRec->m_key; key_t endKey = tagRec->m_key; // startkey gets is low bit cleared though startKey.n0 &= 0xfffffffffffffffeLL; // delete record request, no need to look it up if ( st->m_requestType == 0x02 ) { // note it SafeBuf sb; tagRec->printToBuf ( &sb ); log("tagdb: deleting TagRec for site %s",sb.getBufStart()); // use tmp buf in st char *p = st->m_tmp; // store key in the tmp buf *(key_t *)p = startKey; // advance p += sizeof(key_t); // and store the data size *(long *)p = 0; // advance p += 4; // set the list (just a negative rec in it) st->m_list.set ( st->m_tmp , // list 4+sizeof(key_t) , // listSize st->m_tmp , // alloc 4+sizeof(key_t) , // allocSize (char *)&startKey , // startKey (char *)&endKey , // endKey -1 , // fixeDataSize false , // ownData? false , // useHalfKeys? sizeof(key_t) );// keySize if ( ! st->m_msg1.addList( &st->m_list , RDB_TAGDB , st->m_coll , st , sendReply9a , false , // forceLocal? st->m_niceness )) // return if blocked return; sendReply9a( st ); return; } // . get from msg5, return if it blocked // . will probably not block since in the disk page cache a lot if ( ! st->m_msg5.getList ( RDB_TAGDB , st->m_coll , &st->m_list , startKey , endKey , 100000 , // minRecSizes true , // include tree? false , // addtocache? 0 , // maxcacheage 0 , // startfilenum -1 , // numFiles st , gotList , st->m_niceness , true ))// do err correction? return; // log that for debug //log("tagdb: msg5 call did not block. st=%lu",(long)st); // sanity check - why not block if it had corruption? if ( st->m_msg5.m_msg3.m_hadCorruption ) { char *xx=NULL;*xx=0; } // it did not block... gotList( st , NULL , NULL ); } void gotList ( void *state , RdbList *xxx , Msg5 *yyy ) { // cast our state class State9a *st = (State9a *)state; // return right away if error getting the rec if ( g_errno ) { sendReply9a ( st ); return; } // note it //log("tagdb: in gotlist st=%lu",(long)st); // this is the TagRec rdb record char *rec = st->m_list.getList (); long recSize = st->m_list.getListSize(); // cast it as a TagRec TagRec *accRec = &st->m_accRec; // reset in case not in tagdb and rec/recSize is NULL/0 accRec->reset(); // copy it to our accumulator rec which has room to grow, the list // does not memcpy ( (char *)accRec , rec , recSize ); // free that list buffer now, we copied it into a larger buffer st->m_list.reset(); loop: // clear it g_errno = 0; // . add/remove the tags from the tagRec // . add will replace tags with the same tag id and username // . should deal with "negative" tags (addDelTag()) //if ( st->m_requestType == 0x00 ) accRec->addTags ( st->m_tagRec ); //else accRec->removeTags ( st->m_tagRec ); accRec->addTags ( st->m_tagRec ); // was there an error? 
abandon all operations on this TagRec if so if ( g_errno ) { sendReply9a ( st ); return; } // perform operations on others in the queue st = st->m_next; // debug for now if ( st && g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"tagdb: calling lock for st=0x%lx",(long)st); // if there was one, do it if ( st ) goto loop; // reset to original parent st = (State9a *)state; // debug msg SafeBuf sb; accRec->printToBuf ( &sb ); log(LOG_DEBUG,"tagdb: adding to tagdb: %s",sb.getBufStart()); // set the list, it should free itself st->m_list.set ( (char *)accRec , // list accRec->getSize() , // allocSize (char *)accRec , // alloc accRec->getSize() , // allocSize (char *)&accRec->m_key , // startKey (char *)&accRec->m_key , // endKey -1 , // fixeDataSize false , // ownData? false , // useHalfKeys? sizeof(key_t) );// keySize // add it back after the mods if ( ! st->m_msg1.addList( &st->m_list , RDB_TAGDB , st->m_coll , st , sendReply9a , false , // forceLocal? MAX_NICENESS ))// niceness return; // i giess we did not block! send back the reply... sendReply9a ( st ); } void sendReply9a ( void *state ) { // cast our state class State9a *st = (State9a *)state; // delete our slot from the lock table s_lockTable2.removeKey ( &st->m_tagRec->m_key.n0 ); // log it if (g_errno) log("tagdb: msg9a failed to add: %s",mstrerror(g_errno)); // save it, in case a function below clears g_errno long saved = g_errno; loop: if ( saved ) g_udpServer.sendErrorReply( st->m_slot,saved); // send empty reply else g_udpServer.sendReply_ass(NULL,0,NULL,0,st->m_slot); // save old guy State9a *next = st->m_next; // free him, we sent his reply mdelete ( st , sizeof(State9a),"msg9afr"); delete (st); // repeat for each guy waiting in line st = next; // if there was one, do it if ( st ) goto loop; // reset to original parent st = (State9a *)state; } */ /////////////////////////////////////////////// // // OTHER functions // /////////////////////////////////////////////// long getY ( long long X , long long *x , long long *y , long n ) { // if we only have one point then there'll be no interpolation if ( n == 1 ) return y[0]; // find the first x after our "X" long j; for ( j = 0 ; j < n; j++ ) if ( x[j] >= X ) break; // before/after first/last point means we don't have to interpolate if ( j <= 0 ) return y[0 ]; if ( j >= n ) return y[n-1]; // linear interpolate between our 2 points (x0,y0) and (x1,y1) long long x0 = x[j-1]; long long x1 = x[j ]; long long y0 = y[j-1]; long long y1 = y[j ]; // error if x1 less than x0 if ( x1 <= x0 ) { log("tagdb: X coordinates are not in ascending order for map"); char *xx=NULL;*xx=0; } // otherwise we have a sloping line return y0 + ( ((long long)X - x0) * (y1-y0) ) /(x1-x0) ; } /////////////////////////////////////////////// // // sendPageTagdb() is the HTML interface to tagdb // /////////////////////////////////////////////// static void sendReplyWrapper ( void *state ) ; static void sendReplyWrapper2 ( void *state ) ; static bool sendReply ( void *state ) ; static bool sendReply2 ( void *state ) ; static bool getTagRec ( class State12 *st ); // don't change name to "State" cuz that might conflict with another class State12 { public: //Msg9a m_msg9a; TcpSocket *m_socket; bool m_adding; char *m_coll; //long m_collLen; //char *m_buf; //long m_bufLen; bool m_isLocal; //long m_fileNum; //bool m_isAdmin; //bool m_isAssassin; // . Commented by Gourav // . 
Reason:user perm no longer used //char m_userType; HttpRequest m_r; //char *m_username; TagRec m_tagRec; TagRec m_newtr; Msg8a m_msg8a; Url m_url; char *m_urls; long m_urlsLen; Msg1 m_msg1; RdbList m_list; //Msg1 m_msg1; long m_niceness; bool m_mergeTags; //char m_tmp[16]; }; // . returns false if blocked, true otherwise // . sets g_errno on error // . make a web page displaying the tagdb interface // . call g_httpServer.sendDynamicPage() to send it // . show a textarea for sites, then list all the different site tags // and have an option to add/delete them bool sendPageTagdb ( TcpSocket *s , HttpRequest *req ) { // are we the admin? //bool isAdmin = g_collectiondb.isAdmin ( req , s ); // get the collection record CollectionRec *cr = g_collectiondb.getRec ( req ); if ( ! cr ) { g_errno = ENOCOLLREC; log("admin: No collection record found " "for specified collection name. Could not add sites to " "tagdb. Returning HTTP status of 500."); return g_httpServer.sendErrorReply ( s , 500 , "collection does not exist"); } /* bool isAssassin = cr->isAssassin ( s->m_ip ); if ( isAdmin ) isAssassin = true; // bail if permission denied if ( ! isAssassin ){ //&& ! cr->hasPermission ( req , s ) ) { log("admin: Bad collection name or password. Could not add " "sites to tagdb. Permission denied."); return sendPageLogin ( s , req , "Collection name or " "password is incorrect"); } */ // make a state State12 *st ; try { st = new (State12); } catch ( ... ) { g_errno = ENOMEM; log("PageTagdb: new(%i): %s", sizeof(State12),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));} mnew ( st , sizeof(State12) , "PageTagdb" ); //st->m_isAdmin = isAdmin; //st->m_isAssassin = isAssassin; // . Commented by Gourav // . Reason:user perm no longer used //st->m_userType = g_pages.getUserType ( s , req ); // assume we've nothing to add st->m_adding = false; // save the socket st->m_socket = s; // i guess this is nuked, so copy it st->m_r.copy ( req ); // make it high priority st->m_niceness = 0; // point to it HttpRequest *r = &st->m_r; // get the collection long collLen = 0; char *coll = r->getString ( "c" , &collLen , NULL /*default*/); // get collection rec CollectionRec *cr2 = g_collectiondb.getRec ( coll ); // bitch if no collection rec found if ( ! cr2 || ! coll || collLen+1 > MAX_COLL_LEN ) { g_errno = ENOCOLLREC; log("admin: No collection record found " "for specified collection name. Could not add sites to " "tagdb. Returning HTTP status of 500."); mdelete ( st , sizeof(State12) , "PageTagdb" ); delete (st); return g_httpServer.sendErrorReply ( s , 500 , "collection does not exist"); } // . get fields from cgi field of the requested url // . get the null-terminated, space-separated lists of sites to add long urlsLen = 0; char *urls = r->getString ( "u" , &urlsLen , NULL /*default*/); //a quick hack so we can put multiple sites in a link if(r->getLong("uenc", 0)) for(long i = 0; i < urlsLen; i++) if(urls[i] == '+') urls[i] = '\n'; // get the file # of the tagdb file these sites should use //long fileNum = r->getLong ("f",-1); // get the archive filename of sites to add /* long xlen; char *x = r->getString("x",&xlen,NULL); // trim off any spaces while ( xlen > 0 && is_wspace_a(x[xlen-1]) ) x[--xlen]='\0'; */ // . get the username // . just get from cookie so it is not broadcast over the web via a // referral url //st->m_username = r->getStringFromCookie("username"); //st->m_username = g_users.getUsername(r); // are we coming from a local machine? 
st->m_isLocal = r->isLocal(); /* // don't set this unless we have to free it st->m_buf = NULL; st->m_bufLen = 0; // . set our archive filename of sites to add with this fileNum // . "a" will be NULL if none supplied if ( xlen ) { File file; file.set ( x ); // add 1 to bufLen for terminating \0 long bufLen = file.getFileSize() + 1 ; char *buf = (char *) mmalloc ( bufLen , "PageTagdb"); if ( ! buf ) { log("admin: File of sites is too big to add to tagdb." " Allocation of %li bytes failed.",bufLen); mdelete ( st , sizeof(State12) , "PageTagdb" ); delete (st); return g_httpServer.sendErrorReply(s,500, mstrerror(g_errno)); } file.open(O_RDONLY); file.read ( buf , bufLen - 1 , 0 ); // NULL terminate the list of urls buf [ bufLen - 1 ] = '\0'; st->m_buf = buf; st->m_bufLen = bufLen ; urls = buf; urlsLen = bufLen; } */ // it references into the request, should be ok st->m_coll = coll; //st->m_collLen = collLen; //strcpy ( st->m_coll , coll ); // do not print "(null)" in the textarea if ( ! urls ) urls = ""; // the url buffer st->m_urls = urls; st->m_urlsLen = urlsLen; // sanity check //bool delOp = r->getLong ("delop",0 ); //char *nuke = r->getString ("nuke" ,NULL ); //if ( nuke && ! delOp ) { // g_errno = EBADENGINEER; // log("tagdb: delete operation checkbox not checked."); // mdelete ( st , sizeof(State12) , "PageTagdb" ); // delete (st); // return g_httpServer.sendErrorReply(s,500, // mstrerror(g_errno)); //} long ufuLen; char *ufu = r->getString("ufu",&ufuLen); if ( urls[0] == '\0' && ! ufu ) return sendReply ( st ); char *get = r->getString ("get",NULL ); // this is also a get operation but merges the tags from all TagRecs char *merge = r->getString("tags",NULL); // is this an add/update operation? or just get? if ( get || merge ) st->m_adding = false; else st->m_adding = true; // if each line in the file is the output of a tagdb dump // operation on the cmd line like this: // k.n1=0x892f9 k.n0=0xac2ff39f8112b71f version=0 TAG=ruleset, // "mwells",1,Jan-02-2009-18:26:04,333333333,67.16.94.2,3735437892,36 // THEN we should just call msg9a directly and it should create // a tag rec for each line and add that /* bool isDumpFile = false; if ( urls && strncmp(urls,"k.n1=",5)==0 ) isDumpFile = true; if ( isDumpFile ) { if ( ! st->m_msg9a.addTags ( st->m_urls , // dumpFile st->m_coll , st , sendReplyWrapper2 , 0 ))// niceness return false; return sendReply2 ( st ); } */ // get/merge operations can skip the tag rec lookup //if ( ! st->m_adding ) return sendReply ( st ); // regardless, we have to get the tagrec for all operations //Url site; //site.set(urls,gbstrlen(urls)); st->m_url.set(urls,gbstrlen(urls)); st->m_mergeTags = merge; return getTagRec ( st ); } bool getTagRec ( State12 *st ) { bool doInheritance = st->m_mergeTags;//(bool)merge; char rdbId = RDB_TAGDB; // fbid09729034234.com then use facebookdb //char *host = site.getHost(); //if ( strncmp(host,"fbid",4)==0 && is_digit(host[4]) ) // rdbId = RDB_FACEBOOKDB; // this replaces msg8a if ( ! st->m_msg8a.getTagRec ( &st->m_url,//&site , // tell msg8a to try to guess the site NULL, st->m_coll , false, // skip dom lookup? st->m_niceness , st , sendReplyWrapper , &st->m_tagRec , doInheritance , rdbId)) return false; /* if ( ! 
bool sendReply ( void *state ) {
	// get our state class
	State12 *st = (State12 *) state;
	// get the request
	HttpRequest *r = &st->m_r;
	// and socket
	TcpSocket *s = st->m_socket;

	// the tagrec
	//TagRec *gr = &st->m_tagRec;

	// reset "gr" so it won't show the old tags of the first rec
	// in the text area box on the tagdb page after the add is completed
	//if ( st->m_adding ) gr->reset();

	// . if urlsLen <= 0 or fileNum < 0 and we're not deleting
	// . then we've nothing to add
	//if ( urlsLen <= 0 ) return sendReply ( st );

	// need a valid username
	//if ( ! st->m_username || st->m_username[0] == '\0' ) {
	//	log("tagdb: bad username.");
	//	mdelete ( st , sizeof(State12) , "PageTagdb" );
	//	delete (st);
	//	return g_httpServer.sendErrorReply(s,500,
	//					   mstrerror(g_errno));
	//}

	if ( ! st->m_adding ) return sendReply2 ( st );

	//char *nuke = r->getString ("nuke" ,NULL );

	TagRec *newtr = &st->m_newtr;
	// update it from the http request
	newtr->setFromHttpRequest ( r , s );

	// but remove the site tag
	//newtr.removeTags ( "site" , NULL );
	// add it into gr
	//gr->addTags ( &newtr );
	// copy it over to our state
	//memcpy ( gr , &newtr , newtr.getSize() );

	// debug
	// this doesn't work because we do not set TagRec::m_listPtrs[0]
	// to point to the list we make below (MDW 4/29/13)
	//SafeBuf tmp;
	//newtr->printToBuf ( &tmp );
	//log(LOG_DEBUG,"tagdb: converted from http: %s",
	//    tmp.getBufStart() );

	// make a startKey and endKey from the tagRec's key
	//key_t startKey = gr->m_key;
	//key_t endKey   = gr->m_key;
	// startkey gets its low bit cleared though
	//startKey.n0 &= 0xfffffffffffffffeLL;

	/*
	// add using msg9a
	if ( ! st->m_msg9a.addTags ( st->m_urls ,
				     NULL , // sitePtrs
				     0 ,    // numSitePtrs
				     st->m_coll ,
				     st ,
				     sendReplyWrapper2 ,
				     0 ,      // niceness
				     &newtr , // gr
				     nuke ,
				     NULL ))  // ipvec
		return false;
	*/

	// shortcut
	SafeBuf *sbuf = &newtr->m_sbuf;

	// use the list we got
	RdbList *list = &st->m_list;

	key128_t startKey;
	key128_t endKey;
	startKey.setMin();
	endKey.setMax();

	// set it from safe buf
	list->set ( sbuf->getBufStart() ,
		    sbuf->length() ,
		    NULL ,
		    0 ,
		    (char *)&startKey ,
		    (char *)&endKey ,
		    -1 ,
		    false ,
		    false ,
		    sizeof(key128_t) );

	// no longer adding
	st->m_adding = false;

	// . just use TagRec::m_msg1 now
	// . no, can't use that because tags are added using SafeBuf::addTag()
	//   which first pushes the rdbid, so we gotta use msg4
	if ( ! st->m_msg1.addList ( list ,
				    RDB_TAGDB ,
				    st->m_coll ,
				    st ,
				    sendReplyWrapper2 ,
				    false ,
				    st->m_niceness ) )
		return false;

	// . if addTagRecs() doesn't block then sendReply right away
	// . this returns false if blocks, true otherwise
	//return sendReply2 ( st );
	return getTagRec ( st );
}
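// sendReply() above serializes the tags gathered from the http request into
// TagRec::m_sbuf and wraps that buffer in an RdbList spanning the full
// 128-bit key range before handing it to Msg1::addList(). A minimal sketch
// of just that wrapping step, with the argument order copied from the call
// above (the trailing arguments are taken to be fixed data size, data
// ownership, half-key mode and key size); makeTagListExample is a
// hypothetical helper, not used by the original code.
static void makeTagListExample ( SafeBuf *sbuf , RdbList *list ) {
	key128_t startKey; startKey.setMin();
	key128_t endKey;   endKey.setMax();
	list->set ( sbuf->getBufStart() , // serialized tagdb records
		    sbuf->length()      ,
		    NULL                , // no separate alloc buffer
		    0                   ,
		    (char *)&startKey   ,
		    (char *)&endKey     ,
		    -1                  , // variable-size data
		    false               , // list does not own the buffer
		    false               , // no half keys
		    sizeof(key128_t) );   // 16-byte tagdb keys
}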
bool sendReply2 ( void *state ) {
	// get our state class
	State12 *st = (State12 *) state;
	// get the request
	HttpRequest *r = &st->m_r;
	// and socket
	TcpSocket *s = st->m_socket;

	// page is not more than 32k
	char buf[1024*32];
	SafeBuf sb(buf, 1024*32);

	// do they want an xml reply?
	if ( r->getLong("xml",0) ) { // was "raw"
		sb.safePrintf("\n"
			      "\n");
		st->m_tagRec.printToBufAsXml(&sb);
		sb.safePrintf("");
		log ( LOG_INFO,"sending raw page###\n");
		// clear g_errno, if any, so our reply send goes through
		g_errno = 0;
		// extract the socket
		TcpSocket *s = st->m_socket;
		// . nuke the state
		// . first free the buffer, if non-NULL
		//if (st->m_buf) mfree (st->m_buf, st->m_bufLen, "PageTagdb");
		mdelete(st, sizeof(State12), "PageTagdb");
		delete (st);
		// . send this page
		// . encapsulates in html header and tail
		// . make a Mime
		return g_httpServer.sendDynamicPage(s,
						    sb.getBufStart(),
						    sb.length(),
						    0,
						    false,
						    "text/xml",
						    -1,
						    NULL,
						    "ISO-8859-1");
	}

	// . print standard header
	// . do not print big links if only an assassin, just print host ids
	g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r );

	// did we add some sites???
	if ( st->m_adding ) {
		// if there was an error let them know
		if ( g_errno )
			sb.safePrintf(""
				      "Error adding site(s): %s[%i]"
				      ""
				      "\n",
				      mstrerror(g_errno) , g_errno );
		else
			sb.safePrintf (""
				       "Sites added successfully"
				       ""
				       "\n");
	}

	//char *c = st->m_coll;
	char bb [ MAX_COLL_LEN + 60 ];
	bb[0]='\0';

	// print interface to add sites
	sb.safePrintf ( ""
			""
			"Tagdb%s"
			"" ,
			LIGHT_BLUE ,
			DARK_BLUE ,
			bb );
	sb.safePrintf ( "\n");

	// sometimes we add a huge # of urls, so don't display them because
	// it like freezes the silly browser
	char *uu = st->m_urls;
	if ( st->m_urlsLen > 100000 ) uu = "";

	// text area for adding space separated sites/urls
	//char *pp = "put sites here";
	//char *pp = "";
	//if ( st->m_bufLen > 0 ) pp = st->m_buf;
	// no, print out "urls"
	sb.safePrintf (""
		       ""
		       ""
		       "\n"
		       , uu );

	// spam assassins should not use this much power, too risky
	//if ( st->m_isAdmin ) {
	//	sb.safePrintf ("Note: use 1.2.3.0 to "
	//		       "specify ip domain.");
	//}

	// allow filename to load them from
	//if ( st->m_isAdmin ) {
	sb.safePrintf("or specify a file of them: "
		      "file can also be dumped output of "
		      "tagdb from the gb dump S ... "
		      "command."
		      "" );
	//}

	// this is applied to every tag that is added for accountability
	sb.safePrintf("Username: " );//,st->m_username);

	// as a safety, this must be checked for any delete operation
	sb.safePrintf ("  delete operation");

	// close up
	sb.safePrintf (""
		       // this is merge all by default right now but since
		       // zak is really only using eventtaghashxxxx.com we
		       // should be ok
		       ""
		       //""
		       //""
		       //""
		       //""
		       ""
		       "");

	// . show all tags we got values for
	// . put a delete checkbox next to each one
	// . show 5-10 dropdowns for adding new tags
	// for some reason the "selected" option tags do not show up below
	// on firefox unless i have this line.

	// count how many "tagRecs" we are taking tags from
	Tag *jtag = st->m_tagRec.getFirstTag();
	long numTagRecs = 0;
	for ( ; jtag ; jtag = st->m_tagRec.getNextTag(jtag) ) {
		// skip dups
		if ( jtag->m_type == TT_DUP ) continue;
		// count # of TagRecs contributing to the tags
		//if ( tag && tag->m_type == ST_SITE ) numTagRecs++;
		if ( jtag && jtag->isType("site") ) numTagRecs++;
	}

	// if we are displaying a COMBINATION of TagRecs merged together in
	// the inheritance loop (above) then you can not edit that! you can
	// only edit individual tag recs
	bool canEdit = (numTagRecs <= 1);

	if ( ! canEdit )
		sb.safePrintf(""
			      "Can not edit because more than one "
			      "TagRecs were merged"
			      ""
			      "\n" );

	// headers
	sb.safePrintf(""
		      //"delete?"
		      "del?"
		      "tag name"
		      "tag value"
		      "datasize (with NULL)"
		      "username"
		      "timestamp"
		      "user ip"
		      "deduphash32"
		      "sitehash32"
		      "\n",
		      DARK_BLUE);

	// set up the loop
	Tag *itag = st->m_tagRec.getFirstTag();
	//last = NULL;
	long count = 0;
	long empty = 0;

	// loop over all tags in TagRec
	for ( ; empty < 3 ; count++ ) {
		// use this tag to print from
		Tag *ctag = itag;
		// advance
		if ( itag ) itag = st->m_tagRec.getNextTag(itag);
		// make it NULL, do not start over at the beginning
		if ( empty > 0 ) ctag = NULL;
		// skip dups
		if ( ctag && ctag->m_type == TT_DUP ) continue;
		// if ctag NULL and we are getting all tags, break
		if ( ! canEdit && ! ctag ) break;
		// assign for looping
		//last = tag;
		// if we are NULL, print out 3 empty tags
		if ( ! ctag ) empty++;

		// start the section
		sb.safePrintf("",DARK_BLUE);

		// the delete tag checkbox
		//sb.safePrintf("");
		sb.safePrintf("");
		if ( ctag && canEdit ) // && tag->m_type != ST_SITE )
			sb.safePrintf("",count);
		else
			sb.safePrintf(" ");
		sb.safePrintf("");

		// start the next cell
		sb.safePrintf("");

		// . skip ST_SITE, do not show dropdown for that
		// . no, because for looking up tagRecs i like to see
		//   the site tag value, to see what subdomain is matched
		//if ( ctag && ctag->m_type == ST_SITE ) continue;

		// print drop down
		if ( ! ctag )
			sb.safePrintf("");
		else {
			char *tagName = getTagStrFromType ( ctag->m_type );
			sb.safePrintf("%s",
				      count,tagName,tagName);
		}
		sb.safePrintf("");

		// the score field for the drop down list, whatever tag id
		// was selected will have this score
		if ( canEdit ) sb.safePrintf("");

		// if no tag, just placeholders for the remaining cells
		if ( ! ctag ) {
			// close up the input tag
			if ( canEdit ) sb.safePrintf("\">");
			sb.safePrintf("" "" "" "" "" "");
			continue;
		}

		// print the tag value itself
		ctag->printDataToBuf ( &sb );
		// close up the input tag
		if ( canEdit ) sb.safePrintf("\">");
		// close up table cell
		sb.safePrintf("\n      ");

		// data size
		sb.safePrintf("%li",(long)ctag->getTagDataSize());

		// username, timestamp only for non-empty tags
		char *username  = ctag->getUser();
		long  timestamp = ctag->m_timestamp;
		long  ip  = 0;
		char *ips = " ";
		if ( ctag->m_ip ) { ip = ctag->m_ip; ips = iptoa(ctag->m_ip); }

		// convert timestamp to string
		char tmp[64];
		sprintf(tmp," ");
		time_t ts = timestamp;
		struct tm *timeStruct = localtime ( &ts );
		if ( timestamp )
			strftime(tmp,64,"%b-%d-%Y-%H:%M:%S",timeStruct);

		sb.safePrintf("", count,username,username);
		sb.safePrintf("", count,timestamp,tmp);
		sb.safePrintf("", count,ip,ips);
		sb.safePrintf("", count,ctag->m_key.n1);
		sb.safePrintf("", count,ctag->m_key.n0);
		sb.safePrintf("");
		sb.safePrintf("0x%lx",
			      (long)(ctag->m_key.n0>>32) );
		sb.safePrintf("0x%lx",
			      // or a 1 in since we always do that because
			      // we forgot to shift up one for the delbit
			      // above in Tag::set() when it sets m_key.n0
			      (long)(ctag->m_key.n0&0xffffffff) | 0x01);
		//sb.safePrintf("%s%s%s",
		//	      username,tmp,ips);
		sb.safePrintf("");
	}

	// do not print add or del tags buttons if we got tags from more
	// than one TagRec!
	if ( canEdit )
		sb.safePrintf (""
			       "\n",DARK_BLUE);

	sb.safePrintf ( ""
			"" );
	sb.safePrintf ("");
	sb.safePrintf ("");

	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;

	// calculate buffer length
	// extract the socket
	//TcpSocket *s = st->m_socket;

	// . nuke the state
	// . first free the buffer, if non-NULL
	//if ( st->m_buf ) mfree ( st->m_buf , st->m_bufLen , "PageTagdb" );
	mdelete ( st , sizeof(State12) , "PageTagdb" );
	delete (st);

	// print it out
	//logf(LOG_DEBUG,"tagdb: %s",sb.getBufStart()+sb.length()-256);

	// . send this page
	// . encapsulates in html header and tail
	// . make a Mime
	return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length());
}

//void classifierDoneWrapper ( void *state ) {
//	g_tagdbClassifier.m_running = false;
//}

// . we can have multiple tags of this type per tag for a single username
// . by default, there can be multiple tags of the same type in the Tag as
//   long as the usernames are all different. see addTag()'s deduping below.
bool isTagTypeUnique ( long tt ) {
	// a dup?
	if ( tt == TT_DUP ) return false; // TT_DUP = 123456
	// make sure table is valid
	if ( ! s_initialized ) g_tagdb.setHashTable();
	// look up in hash table
	TagDesc *td = (TagDesc *)s_ht.getValue ( tt );
	// if none, that is crazy
	if ( ! td ) { char *xx=NULL;*xx=0; }
	// return
	if ( td->m_flags & TDF_ARRAY ) return false;
	return true;
}

bool isTagTypeIndexable ( long tt ) {
	// a dup?
	if ( tt == TT_DUP ) return false; // TT_DUP = 123456
	// make sure table is valid
	if ( ! s_initialized ) g_tagdb.setHashTable();
	// look up in hash table
	TagDesc *td = (TagDesc *)s_ht.getValue ( tt );
	// if none, that is crazy
	if ( ! td ) { char *xx=NULL;*xx=0; }
	// return false if we should not index it
	if ( td->m_flags & TDF_NOINDEX ) return false;
	// otherwise, index it
	return true;
}

// . when displaying a tag we need to know if it is a string or not
// . that and the dataSize determine how we display it
/*
bool isTagTypeString ( long tt ) {
	// look up in hash table
	TagDesc *td = (TagDesc *)s_ht.getValue ( tt );
	// if none, that is crazy
	if ( ! td ) { char *xx=NULL;*xx=0; }
	// return
	return (td->m_flags & TDF_STRING);
}
*/

// used to determine if one Tag should overwrite the other! if they
// have the same dedup hash... then yes...
long Tag::getDedupHash ( ) {
	// if unique use that!
	if ( isTagTypeUnique ( m_type ) ) return m_type;
	// if we are NOT unique... then hash username and data. thus we only
	// replace a key if its the same tagtype, username and data. that
	// way it will just update the timestamp and/or ip.
	// start hashing here
	char *startHashing = (char *)&m_type;
	// end here. include username (and tag data!)
	char *endHashing = m_buf + m_bufSize;
	// if we are an event tag then PageEvents.cpp added us in the form of
	// user%llutag%sval%li ... so ignore value (FACEBOOKDB)
	//if ( m_type == s_eventTag ) {
	//	endHashing--;
	//	for (;endHashing-1>m_buf&&is_digit(endHashing[-1]);
	//	     endHashing--);
	//}
	// do not include bufsize in hash
	long saved = m_bufSize;
	m_bufSize = 0;
	// hash this many bytes
	long hashSize = endHashing - startHashing;
	// set key
	long dh = hash32 ( startHashing , hashSize );
	// revert bufsize
	m_bufSize = saved;
	return dh;
}