open-source-search-engine/Catdb.cpp

543 lines
17 KiB
C++

#include "gb-include.h"
#include <sys/stat.h>
#include "Catdb.h"
#include "Categories.h"
#include "CatRec.h"
#include "Unicode.h"
#include "Threads.h"
// use this to query delete all the banned sites in tagdb
// dsh -a '/a/gb -c /a/hosts.conf dump S main 0 -1 1 | grep =19' | awk '{print $6}' | sort | uniq | grep "\." > banned
// cat banned | awk -F'http://' '{print $2}' | grep "[A-z]" | awk '{print "http://64.62.168.52:8000/admin/reindex?c=main&cast=0&sq=site%3A"$1"&delbox=1&f=-1&srn=0&ern=2000000&sto=0&sp=7&action=OK"}' > URLS
// cat banned | awk -F'/' '{print $3}' | grep -v "[A-z]" | awk '{print "http://64.62.168.52:8000/admin/reindex?c=main&cast=0&sq=ip%3A"$1"&delbox=1&f=-1&srn=0&ern=2000000&sto=0&sp=7&action=OK"}' >> URLS
// nohup wget -i /a/URLS -O /dev/null &
// a global class extern'd in .h file
Catdb g_catdb;
// reset rdb and Xmls
void Catdb::reset() {
m_rdb.reset();
}
bool Catdb::init ( ) {
// clear our m_keys/m_bufs arrays
// memset ( m_xml , 0 , MAXNUMSITEFILES * sizeof(Xml *) );
//memset ( m_keys, 0 , MAXNUMSITEFILES * sizeof(int64_t) );
// . what's max # of tree nodes?
// . assume avg tagdb rec size (siteUrl) is about 82 bytes we get:
// . NOTE: 32 bytes of the 82 are overhead
//int32_t treeMem = g_conf.m_catdbMaxTreeMem;
// speed up gen catdb, use 15MB. later maybe once gen is complete
// we can free this tree or something...
// TODO!
int32_t treeMem = 15000000;
//int32_t treeMem = 100000000;
//int32_t maxTreeNodes = g_conf.m_catdbMaxTreeMem / 82;
int32_t maxTreeNodes = treeMem / 82;
// do not use any page cache if doing tmp cluster in order to
// prevent swapping
int32_t pcmem = g_conf.m_catdbMaxDiskPageCacheMem;
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
pcmem = 0;
// each entry in the cache is usually just a single record, no lists,
// unless a hostname has multiple sites in it. has 24 bytes more
// overhead in cache.
//int32_t maxCacheNodes = g_conf.m_tagdbMaxCacheMem / 106;
// we now use a page cache
if ( ! m_pc.init ("catdb",RDB_CATDB,pcmem,
GB_TFNDB_PAGE_SIZE) )
return log("db: Catdb init failed.");
// . initialize our own internal rdb
// . i no longer use cache so changes to tagdb are instant
// . we still use page cache however, which is good enough!
//if ( this == &g_catdb )
if ( ! m_rdb.init ( g_hostdb.m_dir ,
"catdb" ,
true , // dedup same keys?
-1 , // fixed record size
//g_hostdb.m_groupMask ,
//g_hostdb.m_groupId ,
2,//g_conf.m_catdbMinFilesToMerge ,
treeMem ,//g_conf.m_catdbMaxTreeMem ,
maxTreeNodes ,
// now we balance so Sync.cpp can ordered huge list
true , // balance tree?
0 , //g_conf.m_tagdbMaxCacheMem ,
0 , //maxCacheNodes ,
false , // half keys?
false , //m_tagdbSaveCache
&m_pc ,
false,
false,
12, // keysize
false,
true )) // is collectionless?
return false;
// normally Collectiondb.addColl() will call Rdb::addColl() which
// will init the CollectionRec::m_rdbBase, which is what
// Rdb::getBase(collnum_t) will return. however, for collectionless
// rdb databases we set Rdb::m_collectionlessBase special here.
// This was in Rdb.cpp::init().
return m_rdb.addRdbBase1 ( NULL );
}
bool Catdb::init2 ( int32_t treeMem ) {
// . what's max # of tree nodes?
// . assume avg tagdb rec size (siteUrl) is about 82 bytes we get:
// . NOTE: 32 bytes of the 82 are overhead
int32_t maxTreeNodes = 0;
return m_rdb.init ( g_hostdb.m_dir ,
"tagdbRebuild" ,
true , // dedup same keys?
-1 , // fixed record size
10 , // min to merge
treeMem ,
maxTreeNodes ,
true , // balance tree?
0 , //g_conf.m_tagdbMaxCacheMem ,
0 , //maxCacheNodes ,
false , // half keys?
false , //m_tagdbSaveCache
NULL ); //&m_pc
}
//
// end support for "cache recs"
//
/*
bool Catdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
// verify
return true;
if ( verify(coll) ) return true;
// if not allowing scale, return false
if ( ! g_conf.m_allowScale ) return false;
// otherwise let it go
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
bool Catdb::verify ( char *coll ) {
char *rdbName = "Catdb";
log ( LOG_INFO, "db: Verifying %s for coll %s...", rdbName, coll );
g_threads.disableThreads();
Msg5 msg5;
//Msg5 msg5b;
RdbList list;
key_t startKey;
key_t endKey;
startKey.setMin();
endKey.setMax();
//int32_t minRecSizes = 64000;
if ( ! msg5.getList ( RDB_CATDB ,
0,//collnum ,
&list ,
startKey ,
endKey ,
64000 , // minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL ,
0 ,
-1 ,
true ,
-1LL ,
NULL,//&msg5b ,
true )) {
g_threads.enableThreads();
return log("db: HEY! it did not block");
}
int32_t count = 0;
int32_t got = 0;
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key_t k = list.getCurrentKey();
count++;
//uint32_t groupId = g_catdb.getGroupId ( &k );
//uint32_t shardNum = getShardNum ( RDB_CATDB , &k );
//if ( groupId == g_hostdb.m_groupId ) got++;
uint32_t shardNum = getShardNum( RDB_CATDB , &k );
if ( shardNum == getMyShardNum() ) got++;
}
if ( got != count ) {
log ("db: Out of first %"INT32" records in %s, only %"INT32" belong "
"to our group.",count,rdbName,got);
// exit if NONE, we probably got the wrong data
if ( got == 0 ) log("db: Are you sure you have the "
"right "
"data in the right directory? "
"Exiting.");
log ( "db: Exiting due to %s inconsistency.", rdbName );
g_threads.enableThreads();
return g_conf.m_bypassValidation;
}
log ( LOG_INFO, "db: %s passed verification successfully for %"INT32" recs.",
rdbName, count );
// DONE
g_threads.enableThreads();
return true;
}
void Catdb::normalizeUrl ( Url *srcUrl, Url *dstUrl ) {
char urlStr[MAX_URL_LEN];
int32_t urlStrLen = srcUrl->getUrlLen();
memcpy(urlStr, srcUrl->getUrl(), urlStrLen);
// fix the url
urlStrLen = g_categories->fixUrl(urlStr, urlStrLen);
// create the normalized url
dstUrl->set(urlStr, urlStrLen, true, false, false, true);
}
// . dddddddd dddddddd dddddddd dddddddd d = domain hash w/o collection
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu u = url hash
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu
key_t Catdb::makeKey ( Url *site, bool isDelete ) {
key_t k;
// . get startKey based on "site"'s domain
// . if "site"'s domain is an ip address (non-canonical) then use ip
getKeyRange ( site->isIp() , site , &k , NULL);
// set lower 64 bits of key to hash of this url
k.n0 = hash64 ( site->getUrl() , site->getUrlLen() );
// clear low bit if we're a delete, otherwise set it
if ( isDelete ) k.n0 &= 0xfffffffffffffffeLL;
else k.n0 |= 0x0000000000000001LL;
return k;
}
// . get startKey,endKey for all SiteRecs from "url"'s domain
// . key has the following format:
// . dddddddd dddddddd dddddddd dddddddd d = domain hash w/ collection
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu u = url hash
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu
// . putting domain as first 32bits will cluster all SiteRecs from the
// same domain together on the same machine
void Catdb::getKeyRange ( bool useIp , Url *url,
key_t *startKey , key_t *endKey ) {
// log warning msg if we need to
if ( useIp && ! url->hasIp() )
log(LOG_LOGIC,"db: tagdb: getKeyRange: useIp is true, "
"but url has no ip");
// . the upper 32 bits of the key is basically hash of the domain
// . mask out the low-order byte (hi byte in little endian order)
uint32_t h;
// . make sure we use htonl() on ip domain so top byte is not zero!
// . this made all our ip-based sites stored in group #0 before
// if ( useIp ) h = htonl ( url->getIpDomain() ) ;
// . only hash first 3 bytes of ip domain to keep together w/ ip
// . if rdbid is tagdb then use hostname as key else use domain
if ( useIp ) {
// do htonl so most significant byte is first
int32_t ipdom = htonl(url->getIpDomain());
h = hash32 ( (char *)&ipdom , 3 ) ;
}
else
h = hash32 (url->getDomain(), url->getDomainLen());
// incorporate collection into "h"
//h = hash32 ( coll , collLen , h );
// now make the keys
key_t k;
// top 4 bytes is always the domain hash (ip or canonical domain)
k.n1 = h;
// don't set the low del bit for startKey
k.n0 = 0x0000000000000000LL;
// assign the startKey
if ( startKey ) *startKey = k;
// set the low del bit for startKey
k.n0 = 0xffffffffffffffffLL;
// endkey is just as simple
if ( endKey ) *endKey = k;
}
// move the current list pointer back until we hit the start of
// a valid key
char *Catdb::moveToCorrectKey ( char *listPtr,
RdbList *list,
uint32_t domainHash ) {
char *listEnd = list->getListEnd();
char *listStart = list->getList();
char *p = listPtr;
// move back from the end
if (listEnd - p < (int32_t)sizeof(key_t))
p -= sizeof(key_t);
// loop until we get it
for ( ; p > listStart; p--){
// check for the domain hash in the key
if ( ((key_t*)p)->n1 == domainHash ) {
// . verify the match
// get the current rec size and check
// the next rec for correct data
int32_t recSize = list->getRecSize(p);
char *checkp = p + recSize;
// step 1, verify the start of the next rec is good
if ( recSize >= 0 && ( checkp == listEnd ||
( checkp < listEnd &&
((key_t*)checkp)->n1 == domainHash ) ) ) {
// return if we're at end
if ( checkp == listEnd )
return p;
// step 2, verify good rec on next rec
recSize = list->getRecSize(checkp);
checkp = checkp + recSize;
if ( recSize >= 0 && ( checkp == listEnd ||
( checkp < listEnd &&
((key_t*)checkp)->n1 == domainHash )))
// good match, return it
return p;
}
}
// otherwise backup a byte
}
// we'll get here if p == listStart
return p;
}
// binary search on the given list for the given key
void Catdb::listSearch ( RdbList *list,
key_t exactKey,
char **data,
int32_t *dataSize ) {
// init the data
*data = NULL;
*dataSize = 0;
list->resetListPtr();
// for small lists, just loop through the list
if (list->getListSize() < 16*1024) {
while ( ! list->isExhausted() ) {
// for debug!
/*
CatRec crec;
crec.set ( NULL,
list->getCurrentData(),
list->getCurrentDataSize(),
false);
log("catdb: caturl=%s #catid=%"INT32" version=%"INT32""
,crec.m_url
,(int32_t)crec.m_numCatids
,(int32_t)crec.m_version
);
*/
// check the current key
if ( list->getCurrentKey() != exactKey ) {
// miss, next
list->skipCurrentRecord();
continue;
}
// get site from this rec
*data = list->getCurrentData();
*dataSize = list->getCurrentDataSize();
break;
}
}
// otherwise do a binary search on the large lists
else {
// init the low and high
char *low = list->getList();
char *high = list->getListEnd();
// move the high ptr to the start of last rec
high = moveToCorrectKey(high, list, exactKey.n1);
// binary search
char *currRec;
while ( low <= high ) {
// next check spot
int32_t delta = high - low;
currRec = low + (delta / 2);
//currRec = (char*)(((uint64_t)low +
// (uint64_t)high)/2);
// do correction
currRec = moveToCorrectKey( currRec,
list,
exactKey.n1 );
// check for hit
if (list->getKey(currRec) == exactKey) {
// hit, save it and get out
*data = list->getData(currRec);
*dataSize = list->getDataSize(currRec);
break;
}
else if (list->getKey(currRec) > exactKey) {
// move high to currRec - one rec
high = moveToCorrectKey ( currRec - 1,
list,
exactKey.n1 );
}
else {
// move low to currRec + one rec
low = currRec + list->getRecSize(currRec);
}
}
}
}
// now given an RdbList of SiteRecs can we find the best matching rec
// for our site?
char *Catdb::getRec ( RdbList *list , Url *url , int32_t *recSize,
char* coll, int32_t collLen ) {
key_t exactKey;
int64_t startTime = gettimeofdayInMilliseconds();
int64_t took;
char *data;
int32_t dataSize;
// for now, only get exact hits for catdb
// check for an exact key/url match
exactKey = makeKey(url, false);
// go throught the list looking for the exact key
//list->resetListPtr();
data = NULL;
dataSize = 0;
// call the search
listSearch ( list, exactKey, &data, &dataSize );
// make sure the url matches
if (data && dataSize > 0) {
// get the url
/*
char *x;
int32_t xlen;
// hit, check the url
// for catdb, skip over the catids
if (m_rdbid == RDB_CATDB) {
unsigned char numCatids = *data;
// . point to stored url/site
// . skip dataSize/fileNum
int32_t skip = 1 + (4 * numCatids) + 4;
x = data + skip;
xlen = dataSize - skip;
}
else {
// . point to stored url/site
// . skip dataSize/fileNum
x = data + 4;
xlen = dataSize - 4;
}
// set the site
Url site;
site.set ( x , xlen , false );
*/
CatRec site;
site.set (url, data, dataSize, false);
// check for an exact match against the full url
int32_t uflen = url->getUrlLen();
char *ufull = url->getUrl();
//int32_t sflen = site.getUrlLen();
//char *sfull = site.getUrl();
int32_t sflen = site.m_urlLen;
char *sfull = site.m_url;
// if we match, return this rec
if ( sflen == uflen &&
strncmp ( sfull, ufull, sflen ) == 0 ) {
*recSize = dataSize;
}
else {
*recSize = 0;
data = NULL;
}
}
else {
*recSize = 0;
data = NULL;
}
took = gettimeofdayInMilliseconds() - startTime;
if ( took > 10 )
log(LOG_INFO, "catdb: catdb lookup took %"INT64" ms, "
"listSize=%"INT32"", took, list->getListSize() );
return data;
}
// . find the indirect matches in the list which match a sub path
// of the url
int32_t Catdb::getIndirectMatches ( RdbList *list ,
Url *url ,
char **matchRecs ,
int32_t *matchRecSizes ,
int32_t maxMatches ,
char *coll,
int32_t collLen) {
char path[MAX_URL_LEN+1];
int32_t pathLen;
Url partialUrl;
key_t partialUrlKey;
// start with the whole url...include real catid in indirect
memcpy(path, url->getUrl(), url->getUrlLen());
pathLen = url->getUrlLen();
// loop looking for partial matches
char *data = NULL;
int32_t dataSize = 0;
int32_t numMatches = 0;
while ( numMatches < maxMatches ) {
// make the partial url
partialUrl.set(path, pathLen, true);
normalizeUrl(&partialUrl, &partialUrl);
// make the next key
partialUrlKey = makeKey ( &partialUrl, false );
// search for it
listSearch ( list, partialUrlKey, &data, &dataSize );
// store a hit
if ( data && dataSize > 0 ) {
// get the url
char *x;
int32_t xlen;
// hit, check the url
// for catdb, skip over the catids
/*
unsigned char numCatids = *data;
// . point to stored url/site
// . skip dataSize/fileNum
int32_t skip = 1 + (4 * numCatids) + 4;
x = data + skip;
xlen = dataSize - skip;
*/
CatRec sr;
sr.set ( url, data, dataSize, false);
x = sr.m_url;
xlen = sr.m_urlLen;
// ensure it's a sub-path
if ( xlen <= url->getUrlLen() &&
strncasecmp(x, url->getUrl(), xlen) == 0 ) {
//char msg[4096];
//char *mp = msg;
//mp += sprintf(mp, "For ");
//memcpy(mp, url->getUrl(), url->getUrlLen());
//mp += url->getUrlLen();
//mp += sprintf(mp, " , got Indirect: ");
//memcpy(mp, x, xlen);
//mp += xlen;
//*mp = '\0';
//log ( LOG_INFO, "tagdb: %s", msg );
matchRecs [numMatches] = data;
matchRecSizes[numMatches] = dataSize;
numMatches++;
}
}
// make the next partial url
pathLen--;
while ( pathLen > 3 && path[pathLen-1] != '/' )
pathLen--;
// check for end
if ( pathLen <= 3 || strncmp(&path[pathLen-3], "://", 3) == 0 )
break;
// chop off the trailing /
pathLen--;
}
return numMatches;
}