#include "gb-include.h" #include #include "Catdb.h" #include "Categories.h" #include "CatRec.h" #include "Unicode.h" #include "Threads.h" // use this to query delete all the banned sites in tagdb // dsh -a '/a/gb -c /a/hosts.conf dump S main 0 -1 1 | grep =19' | awk '{print $6}' | sort | uniq | grep "\." > banned // cat banned | awk -F'http://' '{print $2}' | grep "[A-z]" | awk '{print "http://64.62.168.52:8000/admin/reindex?c=main&cast=0&sq=site%3A"$1"&delbox=1&f=-1&srn=0&ern=2000000&sto=0&sp=7&action=OK"}' > URLS // cat banned | awk -F'/' '{print $3}' | grep -v "[A-z]" | awk '{print "http://64.62.168.52:8000/admin/reindex?c=main&cast=0&sq=ip%3A"$1"&delbox=1&f=-1&srn=0&ern=2000000&sto=0&sp=7&action=OK"}' >> URLS // nohup wget -i /a/URLS -O /dev/null & // a global class extern'd in .h file Catdb g_catdb; // reset rdb and Xmls void Catdb::reset() { m_rdb.reset(); } bool Catdb::init ( ) { // clear our m_keys/m_bufs arrays // memset ( m_xml , 0 , MAXNUMSITEFILES * sizeof(Xml *) ); //memset ( m_keys, 0 , MAXNUMSITEFILES * sizeof(int64_t) ); // . what's max # of tree nodes? // . assume avg tagdb rec size (siteUrl) is about 82 bytes we get: // . NOTE: 32 bytes of the 82 are overhead //int32_t treeMem = g_conf.m_catdbMaxTreeMem; // speed up gen catdb, use 15MB. later maybe once gen is complete // we can free this tree or something... // TODO! int32_t treeMem = 15000000; //int32_t treeMem = 100000000; //int32_t maxTreeNodes = g_conf.m_catdbMaxTreeMem / 82; int32_t maxTreeNodes = treeMem / 82; // do not use any page cache if doing tmp cluster in order to // prevent swapping int32_t pcmem = g_conf.m_catdbMaxDiskPageCacheMem; if ( g_hostdb.m_useTmpCluster ) pcmem = 0; pcmem = 0; // each entry in the cache is usually just a single record, no lists, // unless a hostname has multiple sites in it. has 24 bytes more // overhead in cache. //int32_t maxCacheNodes = g_conf.m_tagdbMaxCacheMem / 106; // we now use a page cache if ( ! m_pc.init ("catdb",RDB_CATDB,pcmem, GB_TFNDB_PAGE_SIZE) ) return log("db: Catdb init failed."); // . initialize our own internal rdb // . i no longer use cache so changes to tagdb are instant // . we still use page cache however, which is good enough! //if ( this == &g_catdb ) if ( ! m_rdb.init ( g_hostdb.m_dir , "catdb" , true , // dedup same keys? -1 , // fixed record size //g_hostdb.m_groupMask , //g_hostdb.m_groupId , 2,//g_conf.m_catdbMinFilesToMerge , treeMem ,//g_conf.m_catdbMaxTreeMem , maxTreeNodes , // now we balance so Sync.cpp can ordered huge list true , // balance tree? 0 , //g_conf.m_tagdbMaxCacheMem , 0 , //maxCacheNodes , false , // half keys? false , //m_tagdbSaveCache &m_pc , false, false, 12, // keysize false, true )) // is collectionless? return false; // normally Collectiondb.addColl() will call Rdb::addColl() which // will init the CollectionRec::m_rdbBase, which is what // Rdb::getBase(collnum_t) will return. however, for collectionless // rdb databases we set Rdb::m_collectionlessBase special here. // This was in Rdb.cpp::init(). return m_rdb.addRdbBase1 ( NULL ); } bool Catdb::init2 ( int32_t treeMem ) { // . what's max # of tree nodes? // . assume avg tagdb rec size (siteUrl) is about 82 bytes we get: // . NOTE: 32 bytes of the 82 are overhead int32_t maxTreeNodes = 0; return m_rdb.init ( g_hostdb.m_dir , "tagdbRebuild" , true , // dedup same keys? -1 , // fixed record size 10 , // min to merge treeMem , maxTreeNodes , true , // balance tree? 0 , //g_conf.m_tagdbMaxCacheMem , 0 , //maxCacheNodes , false , // half keys? 
bool Catdb::init2 ( int32_t treeMem ) {
	// . what's the max # of tree nodes?
	// . if we assume the avg tagdb rec size (siteUrl) is about 82 bytes:
	// . NOTE: 32 bytes of the 82 are overhead
	int32_t maxTreeNodes = 0;
	return m_rdb.init ( g_hostdb.m_dir ,
			    "tagdbRebuild" ,
			    true    , // dedup same keys?
			    -1      , // fixed record size
			    10      , // min to merge
			    treeMem ,
			    maxTreeNodes ,
			    true    , // balance tree?
			    0       , //g_conf.m_tagdbMaxCacheMem ,
			    0       , //maxCacheNodes ,
			    false   , // half keys?
			    false   , //m_tagdbSaveCache
			    NULL    ); //&m_pc
}

//
// end support for "cache recs"
//

/*
bool Catdb::addColl ( char *coll, bool doVerify ) {
	if ( ! m_rdb.addColl ( coll ) ) return false;
	// verify
	return true;
	if ( verify(coll) ) return true;
	// if not allowing scale, return false
	if ( ! g_conf.m_allowScale ) return false;
	// otherwise let it go
	log ( "db: Verify failed, but scaling is allowed, passing." );
	return true;
}
*/

bool Catdb::verify ( char *coll ) {
	char *rdbName = "Catdb";
	log ( LOG_INFO, "db: Verifying %s for coll %s...", rdbName, coll );
	g_threads.disableThreads();

	Msg5 msg5;
	//Msg5 msg5b;
	RdbList list;
	key_t startKey;
	key_t endKey;
	startKey.setMin();
	endKey.setMax();
	//int32_t minRecSizes = 64000;

	if ( ! msg5.getList ( RDB_CATDB ,
			      0     , //collnum ,
			      &list ,
			      startKey ,
			      endKey ,
			      64000 , // minRecSizes ,
			      true  , // includeTree ,
			      false , // add to cache?
			      0     , // max cache age
			      0     , // startFileNum ,
			      -1    , // numFiles ,
			      NULL  , // state
			      NULL  , // callback
			      0     , // niceness
			      false , // err correction?
			      NULL  ,
			      0     ,
			      -1    ,
			      true  ,
			      -1LL  ,
			      NULL  , //&msg5b ,
			      true  ) ) {
		g_threads.enableThreads();
		return log("db: HEY! it did not block");
	}

	int32_t count = 0;
	int32_t got   = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		count++;
		//uint32_t groupId = g_catdb.getGroupId ( &k );
		//if ( groupId == g_hostdb.m_groupId ) got++;
		uint32_t shardNum = getShardNum ( RDB_CATDB , &k );
		if ( shardNum == getMyShardNum() ) got++;
	}
	if ( got != count ) {
		log ("db: Out of the first %"INT32" records in %s, only "
		     "%"INT32" belong to our shard.",count,rdbName,got);
		// exit if we got NONE, we probably have the wrong data
		if ( got == 0 ) log("db: Are you sure you have the right "
				    "data in the right directory? "
				    "Exiting.");
		log ( "db: Exiting due to %s inconsistency.", rdbName );
		g_threads.enableThreads();
		return g_conf.m_bypassValidation;
	}

	log ( LOG_INFO, "db: %s passed verification successfully for "
	      "%"INT32" recs.", rdbName, count );
	// DONE
	g_threads.enableThreads();
	return true;
}

void Catdb::normalizeUrl ( Url *srcUrl, Url *dstUrl ) {
	char urlStr[MAX_URL_LEN];
	int32_t urlStrLen = srcUrl->getUrlLen();
	memcpy(urlStr, srcUrl->getUrl(), urlStrLen);
	// fix the url
	urlStrLen = g_categories->fixUrl(urlStr, urlStrLen);
	// create the normalized url
	dstUrl->set(urlStr, urlStrLen, true, false, false, true);
}

// . dddddddd dddddddd dddddddd dddddddd  d = domain hash w/o collection
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu  u = url hash
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu
key_t Catdb::makeKey ( Url *site, bool isDelete ) {
	key_t k;
	// . get the startKey based on "site"'s domain
	// . if "site"'s domain is an ip address (non-canonical) then use
	//   the ip
	getKeyRange ( site->isIp() , site , &k , NULL );
	// set the lower 64 bits of the key to the hash of this url
	k.n0 = hash64 ( site->getUrl() , site->getUrlLen() );
	// clear the low bit if we're a delete, otherwise set it
	if ( isDelete ) k.n0 &= 0xfffffffffffffffeLL;
	else            k.n0 |= 0x0000000000000001LL;
	return k;
}

// . get startKey,endKey for all SiteRecs from "url"'s domain
// . the key has the following format:
// . dddddddd dddddddd dddddddd dddddddd  d = domain hash w/o collection
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu  u = url hash
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu
// . putting the domain in the first 32 bits will cluster all SiteRecs
//   from the same domain together on the same machine
void Catdb::getKeyRange ( bool useIp , Url *url ,
			  key_t *startKey , key_t *endKey ) {
	// log a warning msg if we need to
	if ( useIp && ! url->hasIp() )
		log(LOG_LOGIC,"db: tagdb: getKeyRange: useIp is true, "
		    "but url has no ip");
	// . the upper 32 bits of the key are basically a hash of the domain
	// . mask out the low-order byte (hi byte in little endian order)
	uint32_t h;
	// . make sure we use htonl() on the ip domain so the top byte is
	//   not zero!
	// . this made all our ip-based sites get stored in group #0 before
	//if ( useIp ) h = htonl ( url->getIpDomain() ) ;
	// . only hash the first 3 bytes of the ip domain to keep them
	//   together w/ the ip
	// . if the rdbid is tagdb then use the hostname as the key, else
	//   use the domain
	if ( useIp ) {
		// do htonl so the most significant byte is first
		int32_t ipdom = htonl(url->getIpDomain());
		h = hash32 ( (char *)&ipdom , 3 );
	}
	else
		h = hash32 ( url->getDomain(), url->getDomainLen() );
	// incorporate the collection into "h"
	//h = hash32 ( coll , collLen , h );
	// now make the keys
	key_t k;
	// the top 4 bytes are always the domain hash (ip or canonical
	// domain)
	k.n1 = h;
	// don't set the low del bit for the startKey
	k.n0 = 0x0000000000000000LL;
	// assign the startKey
	if ( startKey ) *startKey = k;
	// set all the low bits, including the del bit, for the endKey
	k.n0 = 0xffffffffffffffffLL;
	// the endKey is just as simple
	if ( endKey ) *endKey = k;
}
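// A hypothetical usage sketch (nothing below is called in this file, and
// the url is made up): makeKey() builds the exact probe key for a single
// url, while getKeyRange() brackets every rec that shares the url's
// domain, since the domain hash occupies the top 32 bits of the 96-bit
// (12-byte) key:
//
//   Url u;
//   u.set ( "http://www.example.com/a/b.html" , 31 );
//   key_t exact = g_catdb.makeKey ( &u , false ); // low bit set: an add
//   key_t sk , ek;
//   g_catdb.getKeyRange ( u.isIp() , &u , &sk , &ek );
//   // sk.n1 == ek.n1 == exact.n1 (the domain hash); sk.n0 is all zeroes
//   // and ek.n0 is all ones, so [sk,ek] spans every url on the domain.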
// move the current list pointer back until we hit the start of
// a valid key
char *Catdb::moveToCorrectKey ( char *listPtr, RdbList *list,
				uint32_t domainHash ) {
	char *listEnd   = list->getListEnd();
	char *listStart = list->getList();
	char *p = listPtr;
	// move back from the end
	if (listEnd - p < (int32_t)sizeof(key_t)) p -= sizeof(key_t);
	// loop until we get it
	for ( ; p > listStart; p-- ) {
		// check for the domain hash in the key
		if ( ((key_t*)p)->n1 == domainHash ) {
			// . verify the match
			// . get the current rec size and check
			//   the next rec for correct data
			int32_t recSize = list->getRecSize(p);
			char *checkp = p + recSize;
			// step 1, verify the start of the next rec is good
			if ( recSize >= 0 &&
			     ( checkp == listEnd ||
			       ( checkp < listEnd &&
				 ((key_t*)checkp)->n1 == domainHash ) ) ) {
				// return if we're at end
				if ( checkp == listEnd )
					return p;
				// step 2, verify good rec on next rec
				recSize = list->getRecSize(checkp);
				checkp  = checkp + recSize;
				if ( recSize >= 0 &&
				     ( checkp == listEnd ||
				       ( checkp < listEnd &&
					 ((key_t*)checkp)->n1 ==
					 domainHash ) ) )
					// good match, return it
					return p;
			}
		}
		// otherwise backup a byte
	}
	// we'll get here if p == listStart
	return p;
}

// binary search on the given list for the given key
void Catdb::listSearch ( RdbList *list, key_t exactKey,
			 char **data, int32_t *dataSize ) {
	// init the data
	*data = NULL;
	*dataSize = 0;
	list->resetListPtr();
	// for small lists, just loop through the list
	if ( list->getListSize() < 16*1024 ) {
		while ( ! list->isExhausted() ) {
			// for debug!
			/*
			CatRec crec;
			crec.set ( NULL, list->getCurrentData(),
				   list->getCurrentDataSize(), false );
			log("catdb: caturl=%s #catid=%"INT32" "
			    "version=%"INT32""
			    ,crec.m_url
			    ,(int32_t)crec.m_numCatids
			    ,(int32_t)crec.m_version );
			*/
			// check the current key
			if ( list->getCurrentKey() != exactKey ) {
				// miss, next
				list->skipCurrentRecord();
				continue;
			}
			// get site from this rec
			*data     = list->getCurrentData();
			*dataSize = list->getCurrentDataSize();
			break;
		}
	}
	// otherwise do a binary search on the large lists
	else {
		// init the low and high
		char *low  = list->getList();
		char *high = list->getListEnd();
		// move the high ptr to the start of the last rec
		high = moveToCorrectKey(high, list, exactKey.n1);
		// binary search
		char *currRec;
		while ( low <= high ) {
			// next check spot
			int32_t delta = high - low;
			currRec = low + (delta / 2);
			//currRec = (char*)(((uint64_t)low +
			//		    (uint64_t)high)/2);
			// do correction
			currRec = moveToCorrectKey ( currRec, list,
						     exactKey.n1 );
			// check for hit
			if ( list->getKey(currRec) == exactKey ) {
				// hit, save it and get out
				*data     = list->getData(currRec);
				*dataSize = list->getDataSize(currRec);
				break;
			}
			else if ( list->getKey(currRec) > exactKey ) {
				// move high to currRec - one rec
				high = moveToCorrectKey ( currRec - 1, list,
							  exactKey.n1 );
			}
			else {
				// move low to currRec + one rec
				low = currRec + list->getRecSize(currRec);
			}
		}
	}
}
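// A worked trace of the binary search above (the offsets are made up for
// illustration): catdb recs are variable-length, so a midpoint guess of
// low + delta/2 usually lands in the middle of a rec, and
// moveToCorrectKey() must back it up to a rec boundary. Suppose three
// recs for domain hash D start at offsets 0, 30 and 75:
//
//   [key D|u1][data....][key D|u2][data...........][key D|u3][data]
//   ^0                  ^30                        ^75
//
// A guess of offset 37 falls inside the rec starting at 30, so the
// pointer walks back byte by byte to offset 30, the nearest position
// whose top 4 key bytes equal D and whose next two recs also parse to
// keys with domain hash D. The two look-ahead checks make a false
// positive (the hash bytes occurring inside record data) unlikely,
// though not impossible.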
// now, given an RdbList of SiteRecs, can we find the best matching rec
// for our site?
char *Catdb::getRec ( RdbList *list , Url *url , int32_t *recSize,
		      char *coll, int32_t collLen ) {
	key_t exactKey;
	int64_t startTime = gettimeofdayInMilliseconds();
	int64_t took;
	char *data;
	int32_t dataSize;
	// for now, only get exact hits for catdb, so
	// check for an exact key/url match
	exactKey = makeKey(url, false);
	// go through the list looking for the exact key
	//list->resetListPtr();
	data = NULL;
	dataSize = 0;
	// call the search
	listSearch ( list, exactKey, &data, &dataSize );
	// make sure the url matches
	if ( data && dataSize > 0 ) {
		// get the url
		/*
		char *x;
		int32_t xlen;
		// hit, check the url
		// for catdb, skip over the catids
		if (m_rdbid == RDB_CATDB) {
			unsigned char numCatids = *data;
			// . point to stored url/site
			// . skip dataSize/fileNum
			int32_t skip = 1 + (4 * numCatids) + 4;
			x = data + skip;
			xlen = dataSize - skip;
		}
		else {
			// . point to stored url/site
			// . skip dataSize/fileNum
			x = data + 4;
			xlen = dataSize - 4;
		}
		// set the site
		Url site;
		site.set ( x , xlen , false );
		*/
		CatRec site;
		site.set ( url, data, dataSize, false );
		// check for an exact match against the full url
		int32_t uflen = url->getUrlLen();
		char *ufull   = url->getUrl();
		//int32_t sflen = site.getUrlLen();
		//char *sfull = site.getUrl();
		int32_t sflen = site.m_urlLen;
		char *sfull   = site.m_url;
		// if we match, return this rec
		if ( sflen == uflen &&
		     strncmp ( sfull, ufull, sflen ) == 0 ) {
			*recSize = dataSize;
		}
		else {
			*recSize = 0;
			data = NULL;
		}
	}
	else {
		*recSize = 0;
		data = NULL;
	}
	took = gettimeofdayInMilliseconds() - startTime;
	if ( took > 10 )
		log(LOG_INFO, "catdb: catdb lookup took %"INT64" ms, "
		    "listSize=%"INT32"", took, list->getListSize() );
	return data;
}
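// A note on the strncmp() check above (an observation about the code, not
// new behavior): the lower 64 bits of the key are only a hash of the url,
// and hashes can collide, so a key match alone does not prove the stored
// rec belongs to this url. Comparing the full url stored in the CatRec
// against url->getUrl() makes the exact-match lookup collision-safe; on a
// mismatch, getRec() returns NULL as if no rec existed.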
// . find the indirect matches in the list which match a sub path
//   of the url
int32_t Catdb::getIndirectMatches ( RdbList *list ,
				    Url *url ,
				    char **matchRecs ,
				    int32_t *matchRecSizes ,
				    int32_t maxMatches ,
				    char *coll, int32_t collLen ) {
	char path[MAX_URL_LEN+1];
	int32_t pathLen;
	Url partialUrl;
	key_t partialUrlKey;
	// start with the whole url...include real catid in indirect
	memcpy(path, url->getUrl(), url->getUrlLen());
	pathLen = url->getUrlLen();
	// loop looking for partial matches
	char *data = NULL;
	int32_t dataSize = 0;
	int32_t numMatches = 0;
	while ( numMatches < maxMatches ) {
		// make the partial url
		partialUrl.set(path, pathLen, true);
		normalizeUrl(&partialUrl, &partialUrl);
		// make the next key
		partialUrlKey = makeKey ( &partialUrl, false );
		// search for it
		listSearch ( list, partialUrlKey, &data, &dataSize );
		// store a hit
		if ( data && dataSize > 0 ) {
			// get the url
			char *x;
			int32_t xlen;
			// hit, check the url
			// for catdb, skip over the catids
			/*
			unsigned char numCatids = *data;
			// . point to stored url/site
			// . skip dataSize/fileNum
			int32_t skip = 1 + (4 * numCatids) + 4;
			x = data + skip;
			xlen = dataSize - skip;
			*/
			CatRec sr;
			sr.set ( url, data, dataSize, false );
			x    = sr.m_url;
			xlen = sr.m_urlLen;
			// ensure it's a sub-path
			if ( xlen <= url->getUrlLen() &&
			     strncasecmp(x, url->getUrl(), xlen) == 0 ) {
				//char msg[4096];
				//char *mp = msg;
				//mp += sprintf(mp, "For ");
				//memcpy(mp, url->getUrl(),
				//       url->getUrlLen());
				//mp += url->getUrlLen();
				//mp += sprintf(mp, " , got Indirect: ");
				//memcpy(mp, x, xlen);
				//mp += xlen;
				//*mp = '\0';
				//log ( LOG_INFO, "tagdb: %s", msg );
				matchRecs    [numMatches] = data;
				matchRecSizes[numMatches] = dataSize;
				numMatches++;
			}
		}
		// make the next partial url
		pathLen--;
		while ( pathLen > 3 && path[pathLen-1] != '/' )
			pathLen--;
		// check for end
		if ( pathLen <= 3 ||
		     strncmp(&path[pathLen-3], "://", 3) == 0 )
			break;
		// chop off the trailing /
		pathLen--;
	}
	return numMatches;
}
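// A worked trace of the path-chopping loop above (hypothetical url, and
// ignoring whatever canonicalization normalizeUrl() applies): for
//   http://example.com/a/b/page.html
// the loop probes the list for these progressively shorter urls:
//   http://example.com/a/b/page.html   (the full url)
//   http://example.com/a/b
//   http://example.com/a
//   http://example.com
// stopping once the path is chopped back to the "://" of the scheme.
// Each hit whose stored url is a case-insensitive prefix of the full url
// is kept as an indirect match, up to maxMatches of them.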