open-source-search-engine/Collectiondb.cpp
Matt Wells 6370054bd5 fix problem of adding too many collections
and not wrapping the collnum_t id
2015-09-13 14:21:52 -07:00

#include "gb-include.h"
#include "Collectiondb.h"
//#include "CollectionRec.h"
#include "Xml.h"
#include "Url.h"
#include "Loop.h"
#include "Spider.h" // for calling SpiderLoop::collectionsUpdated()
#include "Posdb.h"
//#include "Indexdb.h"
#include "Datedb.h"
#include "Titledb.h"
//#include "Revdb.h"
//#include "Sections.h"
#include "Placedb.h"
#include "Tagdb.h"
#include "Catdb.h"
#include "Tfndb.h"
#include "Spider.h"
//#include "Checksumdb.h"
#include "Clusterdb.h"
#include "Spider.h"
#include "Repair.h"
#include "Users.h"
#include "Parms.h"
void testRegex ( ) ;
HashTableX g_collTable;
// a global class extern'd in .h file
Collectiondb g_collectiondb;
Collectiondb::Collectiondb ( ) {
m_wrapped = 0;
m_numRecs = 0;
m_numRecsUsed = 0;
m_numCollsSwappedOut = 0;
m_initializing = false;
//m_lastUpdateTime = 0LL;
m_needsSave = false;
// sanity
if ( RDB_END2 >= RDB_END ) return;
log("db: increase RDB_END2 to at least %"INT32" in "
"Collectiondb.h",(int32_t)RDB_END);
char *xx=NULL;*xx=0;
}
// reset rdb
void Collectiondb::reset() {
log(LOG_INFO,"db: resetting collectiondb.");
for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
if ( ! m_recs[i] ) continue;
mdelete ( m_recs[i], sizeof(CollectionRec), "CollectionRec" );
delete ( m_recs[i] );
m_recs[i] = NULL;
}
m_numRecs = 0;
m_numRecsUsed = 0;
g_collTable.reset();
}
/*
bool Collectiondb::init ( bool isDump ) {
reset();
if ( g_isYippy ) return true;
// reset # of recs
//m_numRecs = 0;
//m_numRecsUsed = 0;
// . now load ALL recs
// . returns false and sets g_errno on error
if ( ! load ( isDump ) ) return false;
// update time
updateTime();
// so we don't save again
m_needsSave = false;
// sanity
if ( RDB_END2 < RDB_END ) {
log("db: increase RDB_END2 to at least %"INT32" in "
"Collectiondb.h",(int32_t)RDB_END);
char *xx=NULL;*xx=0;
}
// if it set g_errno, return false
//if ( g_errno ) return log("admin: Had init error: %s.",
// mstrerror(g_errno));
g_errno = 0;
// otherwise, true, even if reloadList() blocked
return true;
}
*/
extern bool g_inAutoSave;
// . save to disk
// . returns false if blocked, true otherwise
bool Collectiondb::save ( ) {
if ( g_conf.m_readOnlyMode ) return true;
if ( g_inAutoSave && m_numRecsUsed > 20 && g_hostdb.m_hostId != 0 )
return true;
// which collection rec needs a save
for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
if ( ! m_recs[i] ) continue;
// temp debug message
//logf(LOG_DEBUG,"admin: SAVING collection #%"INT32" ANYWAY",i);
if ( ! m_recs[i]->m_needsSave ) continue;
// if we core in malloc we won't be able to save the
// coll.conf files
if ( m_recs[i]->m_isCustomCrawl &&
g_inMemFunction &&
g_hostdb.m_hostId != 0 )
continue;
//log(LOG_INFO,"admin: Saving collection #%"INT32".",i);
m_recs[i]->save ( );
}
// oh well
return true;
}
///////////
//
// fill up our m_recs[] array based on the coll.*.*/coll.conf files
//
///////////
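// . on disk each collection lives in a subdir of the working dir named
//   "coll.<collname>.<collnum>" (e.g. "coll.main.0") which holds its
//   coll.conf plus the collection's rdb files
// . the scan below parses the name and collnum back out of each such
//   dirname and calls addExistingColl() on it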
bool Collectiondb::loadAllCollRecs ( ) {
m_initializing = true;
char dname[1024];
// MDW: sprintf ( dname , "%s/collections/" , g_hostdb.m_dir );
sprintf ( dname , "%s" , g_hostdb.m_dir );
Dir d;
d.set ( dname );
if ( ! d.open ()) return log("admin: Could not load collection config "
"files.");
int32_t count = 0;
char *f;
while ( ( f = d.getNextFilename ( "*" ) ) ) {
// skip if first char not "coll."
if ( strncmp ( f , "coll." , 5 ) != 0 ) continue;
// must end on a digit (i.e. coll.main.0)
if ( ! is_digit (f[gbstrlen(f)-1]) ) continue;
// count them
count++;
}
// reset directory for another scan
d.set ( dname );
if ( ! d.open ()) return log("admin: Could not load collection config "
"files.");
// note it
//log(LOG_INFO,"db: loading collection config files.");
// . scan through all subdirs in the collections dir
// . they should be like, "coll.main/" and "coll.mycollection/"
while ( ( f = d.getNextFilename ( "*" ) ) ) {
// skip if first char not "coll."
if ( strncmp ( f , "coll." , 5 ) != 0 ) continue;
// must end on a digit (i.e. coll.main.0)
if ( ! is_digit (f[gbstrlen(f)-1]) ) continue;
// point to collection
char *coll = f + 5;
// NULL terminate at .
char *pp = strchr ( coll , '.' );
if ( ! pp ) continue;
*pp = '\0';
// get collnum
collnum_t collnum = atol ( pp + 1 );
// add it
if ( ! addExistingColl ( coll , collnum ) )
return false;
// swap it out if we got 100+ collections
// if ( count < 100 ) continue;
// CollectionRec *cr = getRec ( collnum );
// if ( cr ) cr->swapOut();
}
// if no existing recs added... add coll.main.0 always at startup
if ( m_numRecs == 0 ) {
log("admin: adding main collection.");
addNewColl ( "main",
0 , // customCrawl ,
NULL,
0 ,
true , // bool saveIt ,
// Parms.cpp reserves this so it can be sure
// to add the same collnum to every shard
0 );
}
m_initializing = false;
// note it
//log(LOG_INFO,"db: Loaded data for %"INT32" collections. Ranging from "
// "collection #0 to #%"INT32".",m_numRecsUsed,m_numRecs-1);
// update the time
//updateTime();
// don't clean the tree if just dumpin
//if ( isDump ) return true;
return true;
}
// after we've initialized all rdbs in main.cpp call this to clean out
// our rdb trees
bool Collectiondb::cleanTrees ( ) {
// remove any nodes with illegal collnums
Rdb *r;
//r = g_indexdb.getRdb();
//r->m_tree.cleanTree ((char **)r->m_bases);
r = g_posdb.getRdb();
//r->m_tree.cleanTree ();//(char **)r->m_bases);
r->m_buckets.cleanBuckets();
//r = g_datedb.getRdb();
//r->m_tree.cleanTree ((char **)r->m_bases);
r = g_titledb.getRdb();
r->m_tree.cleanTree ();//(char **)r->m_bases);
//r = g_revdb.getRdb();
//r->m_tree.cleanTree ((char **)r->m_bases);
//r = g_sectiondb.getRdb();
//r->m_tree.cleanTree ((char **)r->m_bases);
//r = g_checksumdb.getRdb();
//r->m_tree.cleanTree ((char **)r->m_bases);
//r = g_tfndb.getRdb();
//r->m_tree.cleanTree ((char **)r->m_bases);
r = g_spiderdb.getRdb();
r->m_tree.cleanTree ();//(char **)r->m_bases);
r = g_doledb.getRdb();
r->m_tree.cleanTree ();//(char **)r->m_bases);
// success
return true;
}
/*
void Collectiondb::updateTime() {
// get time now in milliseconds
int64_t newTime = gettimeofdayInMilliseconds();
// change it
if ( m_lastUpdateTime == newTime ) newTime++;
// update it
m_lastUpdateTime = newTime;
// we need a save
m_needsSave = true;
}
*/
#include "Statsdb.h"
#include "Cachedb.h"
#include "Syncdb.h"
// same as addOldColl()
bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
int32_t i = collnum;
// ensure does not already exist in memory
collnum_t oldCollnum = getCollnum(coll);
if ( oldCollnum >= 0 ) {
g_errno = EEXIST;
log("admin: Trying to create collection \"%s\" but "
"already exists in memory. Do an ls on "
"the working dir to see if there are two "
"collection dirs with the same coll name",coll);
char *xx=NULL;*xx=0;
}
// also try by #, i've seen this happen too
CollectionRec *ocr = getRec ( i );
if ( ocr ) {
g_errno = EEXIST;
log("admin: Collection id %i is in use already by "
"%s, so we can not add %s. moving %s to trash."
,(int)i,ocr->m_coll,coll,coll);
SafeBuf cmd;
int64_t now = gettimeofdayInMilliseconds();
cmd.safePrintf ( "mv coll.%s.%i trash/coll.%s.%i.%"UINT64
, coll
,(int)i
, coll
,(int)i
, now );
//log("admin: %s",cmd.getBufStart());
gbsystem ( cmd.getBufStart() );
return true;
}
// create the record in memory
CollectionRec *cr = new (CollectionRec);
if ( ! cr )
return log("admin: Failed to allocated %"INT32" bytes for new "
"collection record for \"%s\".",
(int32_t)sizeof(CollectionRec),coll);
mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );
// set collnum right for g_parms.setToDefault() call just in case
// because before it was calling CollectionRec::reset() which
// was resetting the RdbBases for the m_collnum which was garbage
// and ended up resetting random collections' rdb. but now
// CollectionRec::CollectionRec() sets m_collnum to -1 so we should
// not need this!
//cr->m_collnum = oldCollnum;
// get the default.conf from working dir if there
g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );
strcpy ( cr->m_coll , coll );
cr->m_collLen = gbstrlen ( coll );
cr->m_collnum = i;
// point to this, so Rdb and RdbBase can reference it
coll = cr->m_coll;
//log("admin: loaded old coll \"%s\"",coll);
// load coll.conf file
if ( ! cr->load ( coll , i ) ) {
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
log("admin: Failed to load coll.%s.%"INT32"/coll.conf",coll,i);
delete ( cr );
if ( m_recs ) m_recs[i] = NULL;
return false;
}
if ( ! registerCollRec ( cr , false ) ) return false;
// always index spider status docs now for custom crawls
if ( cr->m_isCustomCrawl )
cr->m_indexSpiderReplies = true;
// and don't do link voting, will help speed up
if ( cr->m_isCustomCrawl ) {
cr->m_getLinkInfo = false;
cr->m_computeSiteNumInlinks = false;
}
// we need to compile the regular expressions or update the url
// filters with new logic that maps crawlbot parms to url filters
return cr->rebuildUrlFilters ( );
}
// . add a new rec
// . returns false and sets g_errno on error
// . was addRec()
// . "isDump" is true if we don't need to initialize all the rdbs etc
// because we are doing a './gb dump ...' cmd to dump out data from
// one Rdb which we will custom initialize in main.cpp where the dump
// code is. like for instance, posdb.
// . "customCrawl" is 0 for a regular collection, 1 for a simple crawl
// 2 for a bulk job. diffbot terminology.
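// . illustrative call, same form loadAllCollRecs() uses at startup when
//   no collections exist yet:
//     g_collectiondb.addNewColl ( "main" , // coll name
//                                 0      , // customCrawl (regular coll)
//                                 NULL   , // cpc (copy-config coll name)
//                                 0      , // cpclen
//                                 true   , // saveIt
//                                 0      );// collnum reserved by Parms.cpp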
bool Collectiondb::addNewColl ( char *coll ,
char customCrawl ,
char *cpc ,
int32_t cpclen ,
bool saveIt ,
// Parms.cpp reserves this so it can be sure
// to add the same collnum to every shard
collnum_t newCollnum ) {
//do not send add/del coll request until we are in sync with shard!!
// just return ETRYAGAIN for the parmlist...
// ensure coll name is legit
char *p = coll;
for ( ; *p ; p++ ) {
if ( is_alnum_a(*p) ) continue;
if ( *p == '-' ) continue;
if ( *p == '_' ) continue; // underscore now allowed
break;
}
if ( *p ) {
g_errno = EBADENGINEER;
log("admin: \"%s\" is a malformed collection name because it "
"contains the '%c' character.",coll,*p);
return false;
}
// . scan for holes
// . i is also known as the collection id
//int32_t i = (int32_t)newCollnum;
// no longer fill empty slots because if they do a reset then
// a new rec right away it will be filled with msg4 recs not
// destined for it. Later we will have to recycle some how!!
//else for ( i = 0 ; i < m_numRecs ; i++ ) if ( ! m_recs[i] ) break;
// right now we #define collnum_t int16_t. so do not breach that!
//if ( m_numRecs < 0x7fff ) {
// // set it
// i = m_numRecs;
// // claim it
// // we don't do it here, because we check i below and
// // increment m_numRecs below.
// //m_numRecs++;
//}
// TODO: scan for holes here...
//else {
if ( newCollnum < 0 ) { char *xx=NULL;*xx=0; }
// ceiling?
//int64_t maxColls = 1LL<<(sizeof(collnum_t)*8);
//if ( i >= maxColls ) {
// g_errno = ENOBUFS;
// return log("admin: Limit of %"INT64" collection reached. "
// "Collection not created.",maxColls);
//}
// if empty... bail, no longer accepted, use "main"
if ( ! coll || !coll[0] ) {
g_errno = EBADENGINEER;
return log("admin: Trying to create a new collection "
"but no collection name provided. Use the \"c\" "
"cgi parameter to specify it.");
}
// or if too big
if ( gbstrlen(coll) > MAX_COLL_LEN ) {
g_errno = ENOBUFS;
return log("admin: Trying to create a new collection "
"whose name \"%s\" of %i chars is longer than the "
"max of %"INT32" chars.",coll,gbstrlen(coll),
(int32_t)MAX_COLL_LEN);
}
// ensure does not already exist in memory
if ( getCollnum ( coll ) >= 0 ) {
g_errno = EEXIST;
log("admin: Trying to create collection \"%s\" but "
"already exists in memory.",coll);
// just let it pass...
g_errno = 0 ;
return true;
}
// MDW: ensure not created on disk since time of last load
char dname[512];
sprintf(dname, "%scoll.%s.%"INT32"/",g_hostdb.m_dir,coll,(int32_t)newCollnum);
DIR *dir = opendir ( dname );
if ( dir ) closedir ( dir );
if ( dir ) {
g_errno = EEXIST;
return log("admin: Trying to create collection %s but "
"directory %s already exists on disk.",coll,dname);
}
// create the record in memory
CollectionRec *cr = new (CollectionRec);
if ( ! cr )
return log("admin: Failed to allocated %"INT32" bytes for new "
"collection record for \"%s\".",
(int32_t)sizeof(CollectionRec),coll);
// register the mem
mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );
// get copy collection
//CollectionRec *cpcrec = NULL;
//if ( cpc && cpc[0] ) cpcrec = getRec ( cpc , cpclen );
//if ( cpc && cpc[0] && ! cpcrec )
// log("admin: Collection \"%s\" to copy config from does not "
// "exist.",cpc);
// set collnum right for g_parms.setToDefault() call
//cr->m_collnum = newCollnum;
// . get the default.conf from working dir if there
// . i think this calls CollectionRec::reset() which resets all of its
// rdbbase classes for its collnum so m_collnum needs to be right
//g_parms.setToDefault( (char *)cr );
// get the default.conf from working dir if there
//g_parms.setToDefault( (char *)cr , OBJ_COLL );
g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );
// keep the old sameLangWeight for qatest123 so its qa search results don't change
if ( strcmp(coll,"qatest123") == 0 )
cr->m_sameLangWeight = 20.0;
/*
// the default conf file
char tmp1[1024];
sprintf ( tmp1 , "%sdefault.conf" , g_hostdb.m_dir );
// . set our parms from the file.
// . accepts OBJ_COLLECTIONREC or OBJ_CONF
g_parms.setFromFile ( cr , NULL , tmp1 );
*/
// this will override all
// if ( cpcrec ) {
// // copy it, but not the timedb hashtable, etc.
// int32_t size = (char *)&(cpcrec->m_END_COPY) - (char *)cpcrec;
// // JAB: bad gbmemcpy - no donut!
// // this is not how objects are supposed to be copied!!!
// gbmemcpy ( cr , cpcrec , size);
// }
// set coll id and coll name for coll id #i
strcpy ( cr->m_coll , coll );
cr->m_collLen = gbstrlen ( coll );
cr->m_collnum = newCollnum;
// point to this, so Rdb and RdbBase can reference it
coll = cr->m_coll;
//
// BEGIN NEW CODE
//
//
// get token and crawlname if customCrawl is 1 or 2
//
char *token = NULL;
char *crawl = NULL;
SafeBuf tmp;
// . return true with g_errno set on error
// . if we fail to set a parm right we should force ourselves
// out sync
if ( customCrawl ) {
if ( ! tmp.safeStrcpy ( coll ) ) return true;
token = tmp.getBufStart();
// diffbot coll name format is <token>-<crawlname>
char *h = strchr ( tmp.getBufStart() , '-' );
if ( ! h ) {
log("crawlbot: bad custom collname");
g_errno = EBADENGINEER;
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
return true;
}
*h = '\0';
crawl = h + 1;
if ( ! crawl[0] ) {
log("crawlbot: bad custom crawl name");
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
g_errno = EBADENGINEER;
return true;
}
// or if too big!
if ( gbstrlen(crawl) > 30 ) {
log("crawlbot: crawlbot crawl NAME is over 30 chars");
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
g_errno = EBADENGINEER;
return true;
}
}
//log("parms: added new collection \"%s\"", collName );
cr->m_maxToCrawl = -1;
cr->m_maxToProcess = -1;
if ( customCrawl ) {
// always index spider status docs now
cr->m_indexSpiderReplies = true;
// remember the token
cr->m_diffbotToken.set ( token );
cr->m_diffbotCrawlName.set ( crawl );
// bring this back
cr->m_diffbotApiUrl.set ( "" );
cr->m_diffbotUrlCrawlPattern.set ( "" );
cr->m_diffbotUrlProcessPattern.set ( "" );
cr->m_diffbotPageProcessPattern.set ( "" );
cr->m_diffbotUrlCrawlRegEx.set ( "" );
cr->m_diffbotUrlProcessRegEx.set ( "" );
cr->m_diffbotMaxHops = -1;
cr->m_spiderStatus = SP_INITIALIZING;
// do not spider more than this many urls total.
// -1 means no max.
cr->m_maxToCrawl = 100000;
// do not process more than this. -1 means no max.
cr->m_maxToProcess = 100000;
// -1 means no max
cr->m_maxCrawlRounds = -1;
// john wants deduping on by default to avoid
// processing similar pgs
cr->m_dedupingEnabled = true;
// show the ban links in the search results. the
// collection name is cryptographic enough to show that
cr->m_isCustomCrawl = customCrawl;
cr->m_diffbotOnlyProcessIfNewUrl = true;
// default respider to off
cr->m_collectiveRespiderFrequency = 0.0;
//cr->m_restrictDomain = true;
// reset the crawl stats
// always turn off gigabits so &s=1000 can do summary skipping
cr->m_docsToScanForTopics = 0;
}
// . this will core if a host was dead and then when it came
// back up host #0's parms.cpp told it to add a new coll
cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
cr->m_diffbotCrawlEndTime = 0;
// . just the basics on these for now
// . if certain parms are changed then the url filters
// must be rebuilt, as well as possibly the waiting tree!!!
// . need to set m_urlFiltersHavePageCounts etc.
cr->rebuildUrlFilters ( );
cr->m_useRobotsTxt = true;
// reset crawler stats. they should be loaded from crawlinfo.txt
memset ( &cr->m_localCrawlInfo , 0 , sizeof(CrawlInfo) );
memset ( &cr->m_globalCrawlInfo , 0 , sizeof(CrawlInfo) );
// note that
log("colldb: initial revival for %s",cr->m_coll);
// . assume we got some urls ready to spider
// . Spider.cpp will wait SPIDER_DONE_TIME seconds and if it has no
// urls it spidered in that time these will get set to 0 and it
// will send out an email alert if m_sentCrawlDoneAlert is not true.
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = 1;
// set some defaults. max spiders for all priorities in this
// collection. NO, default is in Parms.cpp.
//cr->m_maxNumSpiders = 10;
//cr->m_needsSave = 1;
// start the spiders!
cr->m_spideringEnabled = true;
// override this?
saveIt = true;
//
// END NEW CODE
//
//log("admin: adding coll \"%s\" (new=%"INT32")",coll,(int32_t)isNew);
// MDW: create the new directory
retry22:
if ( ::mkdir ( dname ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ) {
// valgrind?
if ( errno == EINTR ) goto retry22;
g_errno = errno;
mdelete ( cr , sizeof(CollectionRec) , "CollectionRec" );
delete ( cr );
return log("admin: Creating directory %s had error: "
"%s.", dname,mstrerror(g_errno));
}
// save it into this dir... might fail!
if ( saveIt && ! cr->save() ) {
mdelete ( cr , sizeof(CollectionRec) , "CollectionRec" );
delete ( cr );
return log("admin: Failed to save file %s: %s",
dname,mstrerror(g_errno));
}
if ( ! registerCollRec ( cr , true ) )
return false;
// add the rdbbases for this coll, CollectionRec::m_bases[]
if ( ! addRdbBasesForCollRec ( cr ) )
return false;
return true;
}
void CollectionRec::setBasePtr ( char rdbId , class RdbBase *base ) {
// if in the process of swapping in, this will be false...
//if ( m_swappedOut ) { char *xx=NULL;*xx=0; }
if ( rdbId < 0 || rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
// Rdb::deleteColl() will call this even though we are swapped in
// but it calls it with "base" set to NULL after it nukes the RdbBase
// so check if base is null here.
if ( base && m_bases[ (unsigned char)rdbId ]){ char *xx=NULL;*xx=0; }
m_bases [ (unsigned char)rdbId ] = base;
}
RdbBase *CollectionRec::getBasePtr ( char rdbId ) {
if ( rdbId < 0 || rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
return m_bases [ (unsigned char)rdbId ];
}
static bool s_inside = false;
// . returns NULL w/ g_errno set on error.
// . TODO: ensure not called from in thread, not thread safe
RdbBase *CollectionRec::getBase ( char rdbId ) {
if ( s_inside ) { char *xx=NULL;*xx=0; }
if ( ! m_swappedOut ) return m_bases[(unsigned char)rdbId];
log("cdb: swapin collnum=%"INT32"",(int32_t)m_collnum);
// sanity!
if ( g_threads.amThread() ) { char *xx=NULL;*xx=0; }
s_inside = true;
// turn off quickpoll to avoid getbase() being re-called and
// coring from s_inside being true
int32_t saved = g_conf.m_useQuickpoll;
g_conf.m_useQuickpoll = false;
// load them back in. return NULL w/ g_errno set on error.
if ( ! g_collectiondb.addRdbBasesForCollRec ( this ) ) {
log("coll: error swapin: %s",mstrerror(g_errno));
g_conf.m_useQuickpoll = saved;
s_inside = false;
return NULL;
}
g_conf.m_useQuickpoll = saved;
s_inside = false;
g_collectiondb.m_numCollsSwappedOut--;
m_swappedOut = false;
log("coll: swapin was successful for collnum=%"INT32"",(int32_t)m_collnum);
return m_bases[(unsigned char)rdbId];
}
bool CollectionRec::swapOut ( ) {
if ( m_swappedOut ) return true;
log("cdb: swapout collnum=%"INT32"",(int32_t)m_collnum);
// free all RdbBases in each rdb
for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
Rdb *rdb = g_process.m_rdbs[i];
// this frees all the RdbBase::m_files and m_maps for the base
rdb->resetBase ( m_collnum );
}
// now free each base itself
for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
RdbBase *base = m_bases[i];
if ( ! base ) continue;
mdelete (base, sizeof(RdbBase), "Rdb Coll");
delete (base);
m_bases[i] = NULL;
}
m_swappedOut = true;
g_collectiondb.m_numCollsSwappedOut++;
return true;
}
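// . swap summary: swapOut() frees a collection's RdbBases to save memory
//   when lots of collections are loaded, and getBase() above lazily
//   rebuilds them via Collectiondb::addRdbBasesForCollRec() on next access
// . illustrative use:
//     RdbBase *base = cr->getBase ( RDB_POSDB ); // swaps back in if needed
//     cr->swapOut ( );                           // frees cr->m_bases[] again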
// . called only by addNewColl() and by addExistingColl()
bool Collectiondb::registerCollRec ( CollectionRec *cr , bool isNew ) {
// add m_recs[] and to hashtable
if ( ! setRecPtr ( cr->m_collnum , cr ) )
return false;
return true;
}
// swap it in
bool Collectiondb::addRdbBaseToAllRdbsForEachCollRec ( ) {
for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
CollectionRec *cr = m_recs[i];
if ( ! cr ) continue;
// skip if swapped out
if ( cr->m_swappedOut ) continue;
// add rdb base files etc. for it
addRdbBasesForCollRec ( cr );
}
// now clean the trees. moved this into here from
// addRdbBasesForCollRec() since we call addRdbBasesForCollRec()
// now from getBase() to load on-demand for saving memory
cleanTrees();
return true;
}
bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {
char *coll = cr->m_coll;
//////
//
// if we are doing a dump from the command line, skip this stuff
//
//////
if ( g_dumpMode ) return true;
// tell rdbs to add one, too
//if ( ! g_indexdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_posdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_datedb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_titledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_revdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_sectiondb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_tagdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_catdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_checksumdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_tfndb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_clusterdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_linkdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_spiderdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_doledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
// now clean the trees
//cleanTrees();
// debug message
//log ( LOG_INFO, "db: verified collection \"%s\" (%"INT32").",
// coll,(int32_t)cr->m_collnum);
// tell SpiderCache about this collection, it will create a
// SpiderCollection class for it.
//g_spiderCache.reset1();
// success
return true;
hadError:
log("db: error registering coll: %s",mstrerror(g_errno));
return false;
}
/*
bool Collectiondb::isAdmin ( HttpRequest *r , TcpSocket *s ) {
if ( r->getLong("admin",1) == 0 ) return false;
if ( g_conf.isMasterAdmin ( s , r ) ) return true;
char *c = r->getString ( "c" );
CollectionRec *cr = getRec ( c );
if ( ! cr ) return false;
return g_users.hasPermission ( r , PAGE_SEARCH );
//return cr->hasPermission ( r , s );
}
void savingCheckWrapper1 ( int fd , void *state ) {
WaitEntry *we = (WaitEntry *)state;
// no state?
if ( ! we ) { log("colldb: we1 is null"); return; }
// unregister too
g_loop.unregisterSleepCallback ( state,savingCheckWrapper1 );
// if it blocked again i guess tree is still saving
if ( ! g_collectiondb.resetColl ( we->m_coll ,
we ,
we->m_purgeSeeds))
return;
// all done
we->m_callback ( we->m_state );
}
void savingCheckWrapper2 ( int fd , void *state ) {
WaitEntry *we = (WaitEntry *)state;
// no state?
if ( ! we ) { log("colldb: we2 is null"); return; }
// unregister too
g_loop.unregisterSleepCallback ( state,savingCheckWrapper2 );
// if it blocked again i guess tree is still saving
if ( ! g_collectiondb.deleteRec ( we->m_coll , we ) ) return;
// all done
we->m_callback ( we->m_state );
}
*/
/*
// delete all records checked in the list
bool Collectiondb::deleteRecs ( HttpRequest *r ) {
for ( int32_t i = 0 ; i < r->getNumFields() ; i++ ) {
char *f = r->getField ( i );
if ( strncmp ( f , "del" , 3 ) != 0 ) continue;
char *coll = f + 3;
//if ( ! is_digit ( f[3] ) ) continue;
//int32_t h = atol ( f + 3 );
deleteRec ( coll , NULL );
}
return true;
}
*/
/*
// . delete a collection
// . this uses blocking unlinks, may make non-blocking later
// . returns false if blocked, true otherwise
bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
// force on for now
//deleteTurkdb = true;
// no spiders can be out. they may be referencing the CollectionRec
// in XmlDoc.cpp... quite likely.
//if ( g_conf.m_spideringEnabled ||
// g_spiderLoop.m_numSpidersOut > 0 ) {
// log("admin: Can not delete collection while "
// "spiders are enabled or active.");
// return false;
//}
// ensure it's not NULL
if ( ! coll ) {
log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
g_errno = ENOTFOUND;
return true;
}
// find the rec for this collection
collnum_t collnum = getCollnum ( coll );
return deleteRec2 ( collnum , we );
}
*/
// if there is an outstanding disk read thread or merge thread then
// Spider.cpp will handle the delete in the callback.
// this is now tryToDeleteSpiderColl in Spider.cpp
/*
void Collectiondb::deleteSpiderColl ( SpiderColl *sc ) {
sc->m_deleteMyself = true;
// if not currently being accessed nuke it now
if ( ! sc->m_msg5.m_waitingForList &&
! sc->m_msg5b.m_waitingForList &&
! sc->m_msg1.m_mcast.m_inUse ) {
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
delete ( sc );
return;
}
}
*/
/// this deletes the collection, not just part of a reset.
bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
// do not allow this if in repair mode
if ( g_repair.isRepairActive() && g_repair.m_collnum == collnum ) {
log("admin: Can not delete collection while in repair mode.");
g_errno = EBADENGINEER;
return true;
}
// bitch if not found
if ( collnum < 0 ) {
g_errno = ENOTFOUND;
log(LOG_LOGIC,"admin: Collection #%"INT32" is bad, "
"delete failed.",(int32_t)collnum);
return true;
}
CollectionRec *cr = m_recs [ collnum ];
if ( ! cr ) {
log("admin: Collection id problem. Delete failed.");
g_errno = ENOTFOUND;
return true;
}
if ( g_process.isAnyTreeSaving() ) {
// note it
log("admin: tree is saving. waiting2.");
// all done
return false;
}
// spiders off
//if ( cr->m_spiderColl &&
// cr->m_spiderColl->getTotalOutstandingSpiders() > 0 ) {
// log("admin: Can not delete collection while "
// "spiders are oustanding for collection. Turn off "
// "spiders and wait for them to exit.");
// return false;
//}
char *coll = cr->m_coll;
// note it
log(LOG_INFO,"db: deleting coll \"%s\" (%"INT32")",coll,
(int32_t)cr->m_collnum);
// we need a save
m_needsSave = true;
// nuke doleiptable and waintree and waitingtable
/*
SpiderColl *sc = g_spiderCache.getSpiderColl ( collnum );
sc->m_waitingTree.clear();
sc->m_waitingTable.clear();
sc->m_doleIpTable.clear();
g_spiderLoop.m_lockTable.clear();
g_spiderLoop.m_lockCache.clear(0);
sc->m_lastDownloadCache.clear(collnum);
*/
// CAUTION: tree might be in the middle of saving
// we deal with this in Process.cpp now
// remove from spider cache, tell it to sync up with collectiondb
//g_spiderCache.reset1();
// . TODO: remove from g_sync
// . remove from all rdbs
//g_indexdb.getRdb()->delColl ( coll );
g_posdb.getRdb()->delColl ( coll );
//g_datedb.getRdb()->delColl ( coll );
g_titledb.getRdb()->delColl ( coll );
//g_revdb.getRdb()->delColl ( coll );
//g_sectiondb.getRdb()->delColl ( coll );
g_tagdb.getRdb()->delColl ( coll );
// let's preserve the tags... they have all the turk votes in them
//if ( deleteTurkdb ) {
//}
//g_catdb.getRdb()->delColl ( coll );
//g_checksumdb.getRdb()->delColl ( coll );
g_spiderdb.getRdb()->delColl ( coll );
g_doledb.getRdb()->delColl ( coll );
//g_tfndb.getRdb()->delColl ( coll );
g_clusterdb.getRdb()->delColl ( coll );
g_linkdb.getRdb()->delColl ( coll );
// reset spider info
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(collnum);
if ( sc ) {
// remove locks from lock table:
sc->clearLocks();
//sc->m_collnum = newCollnum;
//sc->reset();
// you have to set this for tryToDeleteSpiderColl to
// actually have a shot at deleting it
sc->m_deleteMyself = true;
// cr will be invalid shortly after this
// MDW: this is causing the core...
// use fake ptrs for easier debugging
//sc->m_cr = (CollectionRec *)0x99999;//NULL;
//sc->m_cr = NULL;
sc->setCollectionRec ( NULL );
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
tryToDeleteSpiderColl ( sc ,"10");
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//delete ( sc );
// don't let cr reference us anymore, sc is on deathrow
// and "cr" is deleted below!
//cr->m_spiderColl = (SpiderColl *)0x8888;//NULL;
cr->m_spiderColl = NULL;
}
// the bulk urls file too i guess
if ( cr->m_isCustomCrawl == 2 && g_hostdb.m_hostId == 0 ) {
SafeBuf bu;
bu.safePrintf("%sbulkurls-%s.txt",
g_hostdb.m_dir , cr->m_coll );
File bf;
bf.set ( bu.getBufStart() );
if ( bf.doesExist() ) bf.unlink();
}
// now remove from list of collections that might need a disk merge
removeFromMergeLinkedList ( cr );
//////
//
// remove from m_recs[]
//
//////
setRecPtr ( cr->m_collnum , NULL );
// free it
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
// do not do this here in case spiders were outstanding
// and they added a new coll right away and it ended up getting
// recs from the deleted coll!!
//while ( ! m_recs[m_numRecs-1] ) m_numRecs--;
// update the time
//updateTime();
// done
return true;
}
//#include "PageTurk.h"
/*
// . reset a collection
// . returns false if blocked and will call callback
bool Collectiondb::resetColl ( char *coll , bool purgeSeeds) {
// ensure it's not NULL
if ( ! coll ) {
log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
g_errno = ENOCOLLREC;
return true;
}
// get the CollectionRec for "qatest123"
CollectionRec *cr = getRec ( coll ); // "qatest123" );
// must be there. if not, we create test i guess
if ( ! cr ) {
log("db: could not get coll rec \"%s\" to reset", coll);
char *xx=NULL;*xx=0;
}
return resetColl2 ( cr->m_collnum, purgeSeeds);
}
*/
// ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
bool Collectiondb::growRecPtrBuf ( collnum_t collnum ) {
// an add, make sure big enough
int32_t need = ((int32_t)collnum+1)*sizeof(CollectionRec *);
int32_t have = m_recPtrBuf.getLength();
int32_t need2 = need - have;
// if already big enough
if ( need2 <= 0 ) {
m_recs [ collnum ] = NULL;
return true;
}
m_recPtrBuf.setLabel ("crecptrb");
// . true here means to clear the new space to zeroes
// . this shit works based on m_length not m_capacity
if ( ! m_recPtrBuf.reserve ( need2 ,NULL, true ) ) {
log("admin: error growing rec ptr buf2.");
return false;
}
// sanity
if ( m_recPtrBuf.getCapacity() < need ) { char *xx=NULL;*xx=0; }
// set it
m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();
// update length of used bytes in case we re-alloc
m_recPtrBuf.setLength ( need );
// re-max
int32_t max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);
// sanity
if ( collnum >= max ) { char *xx=NULL;*xx=0; }
// initialize slot
m_recs [ collnum ] = NULL;
return true;
}
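// . sizing note: m_recs[] is just a cast of m_recPtrBuf's memory, so for
//   example growing to hold collnum 4 reserves (4+1)*sizeof(CollectionRec *)
//   = 40 bytes on a 64-bit build and zeroes the newly added slots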
bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
// first time init hashtable that maps coll to collnum
if ( g_collTable.m_numSlots == 0 &&
! g_collTable.set(8,sizeof(collnum_t), 256,NULL,0,
false,0,"nhshtbl"))
return false;
// sanity
if ( collnum < 0 ) { char *xx=NULL;*xx=0; }
// sanity
int32_t max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);
// set it
m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();
// tell spiders to re-update the active list
g_spiderLoop.m_activeListValid = false;
g_spiderLoop.m_activeListModified = true;
// a delete?
if ( ! cr ) {
// sanity
if ( collnum >= max ) { char *xx=NULL;*xx=0; }
// get what's there
CollectionRec *oc = m_recs[collnum];
// let it go
m_recs[collnum] = NULL;
// if nothing already, done
if ( ! oc ) return true;
// tally it up
m_numRecsUsed--;
// delete key
int64_t h64 = hash64n(oc->m_coll);
// if in the hashtable UNDER OUR COLLNUM then nuke it
// otherwise, we might be called from resetColl2()
void *vp = g_collTable.getValue ( &h64 );
if ( ! vp ) return true;
collnum_t ct = *(collnum_t *)vp;
if ( ct != collnum ) return true;
g_collTable.removeKey ( &h64 );
return true;
}
// ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
if ( ! growRecPtrBuf ( collnum ) )
return false;
// sanity
if ( cr->m_collnum != collnum ) { char *xx=NULL;*xx=0; }
// add to hash table to map name to collnum_t
int64_t h64 = hash64n(cr->m_coll);
// debug
//log("coll: adding key %"INT64" for %s",h64,cr->m_coll);
if ( ! g_collTable.addKey ( &h64 , &collnum ) )
return false;
// store the rec ptr for this collnum
m_recs[collnum] = cr;
// count it
m_numRecsUsed++;
//log("coll: adding key4 %"UINT64" for coll \"%s\" (%"INT32")",h64,cr->m_coll,
// (int32_t)i);
// reserve it
if ( collnum >= m_numRecs ) m_numRecs = collnum + 1;
// sanity to make sure collectionrec ptrs are legit
for ( int32_t j = 0 ; j < m_numRecs ; j++ ) {
if ( ! m_recs[j] ) continue;
if ( m_recs[j]->m_collnum == 1 ) continue;
}
// update the time
//updateTime();
return true;
}
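// . setRecPtr() keeps two structures in sync: the m_recs[] ptr array
//   (indexed by collnum) and g_collTable, which hashes the coll NAME to
//   its collnum for getCollnum()
// . on a delete it only removes the name from g_collTable if that entry
//   still maps to OUR collnum, because resetColl2() re-registers the same
//   name under a new collnum before NULLing out the old one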
// moves a file by first trying rename, then copying since cross device renaming doesn't work
// returns 0 on success
int mv(char* src, char* dest) {
int status = rename( src , dest );
if (status == 0)
return 0;
FILE *fsrc, *fdest;
fsrc = fopen(src, "r");
if (fsrc == NULL)
return -1;
fdest = fopen(dest, "w");
if (fdest == NULL) {
fclose(fsrc);
return -1;
}
const int BUF_SIZE = 1024;
char buf[BUF_SIZE];
while (!ferror(fdest) && !ferror(fsrc) && !feof(fsrc)) {
int read = fread(buf, 1, BUF_SIZE, fsrc);
fwrite(buf, 1, read, fdest);
}
// check for i/o errors before closing; ferror() on a closed FILE* is undefined
int hadError = ferror(fdest) || ferror(fsrc);
fclose(fsrc);
fclose(fdest);
if (hadError)
return -1;
remove(src);
return 0;
}
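// . mv() is a plain file-move helper: try rename() first, then fall back
//   to a read/write copy plus remove() for cross-device moves where
//   rename() fails
// . it is referenced by the (now commented-out) bulkurls.txt preservation
//   code in resetColl2() below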
// . returns false if we need a re-call, true if we completed
// . returns true with g_errno set on error
bool Collectiondb::resetColl2( collnum_t oldCollnum,
collnum_t newCollnum,
//WaitEntry *we,
bool purgeSeeds){
// save parms in case we block
//we->m_purgeSeeds = purgeSeeds;
// now must be "qatest123" only for now
//if ( strcmp(coll,"qatest123") ) { char *xx=NULL;*xx=0; }
// no spiders can be out. they may be referencing the CollectionRec
// in XmlDoc.cpp... quite likely.
//if ( g_conf.m_spideringEnabled ||
// g_spiderLoop.m_numSpidersOut > 0 ) {
// log("admin: Can not delete collection while "
// "spiders are enabled or active.");
// return false;
//}
// do not allow this if in repair mode
if ( g_repair.isRepairActive() && g_repair.m_collnum == oldCollnum ) {
log("admin: Can not delete collection while in repair mode.");
g_errno = EBADENGINEER;
return true;
}
//log("admin: resetting collnum %"INT32"",(int32_t)oldCollnum);
// CAUTION: tree might be in the middle of saving
// we deal with this in Process.cpp now
if ( g_process.isAnyTreeSaving() ) {
// we could not complete...
return false;
}
CollectionRec *cr = m_recs [ oldCollnum ];
// let's reset crawlinfo crap
cr->m_globalCrawlInfo.reset();
cr->m_localCrawlInfo.reset();
//collnum_t oldCollnum = cr->m_collnum;
//collnum_t newCollnum = m_numRecs;
// in case of bulk job, be sure to save list of spots
// copy existing list to a /tmp, where they will later be transferred back to the new folder
// now i just store in the root working dir... MDW
/*
char oldbulkurlsname[1036];
snprintf(oldbulkurlsname, 1036, "%scoll.%s.%"INT32"/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(int32_t)oldCollnum);
char newbulkurlsname[1036];
snprintf(newbulkurlsname, 1036, "%scoll.%s.%"INT32"/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(int32_t)newCollnum);
char tmpbulkurlsname[1036];
snprintf(tmpbulkurlsname, 1036, "/tmp/coll.%s.%"INT32".bulkurls.txt",cr->m_coll,(int32_t)oldCollnum);
if (cr->m_isCustomCrawl == 2)
mv( oldbulkurlsname , tmpbulkurlsname );
*/
// reset spider info
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
if ( sc ) {
// remove locks from lock table:
sc->clearLocks();
// don't do this anymore, just nuke it in case
// m_populatingDoledb was true etc. there are too many
// flags to worry about
//sc->m_collnum = newCollnum;
//sc->reset();
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
tryToDeleteSpiderColl ( sc,"11" );
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//delete ( sc );
cr->m_spiderColl = NULL;
}
// reset spider round
cr->m_spiderRoundNum = 0;
cr->m_spiderRoundStartTime = 0;
cr->m_spiderStatus = SP_INITIALIZING; // this is 0
//cr->m_spiderStatusMsg = NULL;
// reset seed buf
if ( purgeSeeds ) {
// free the buffer of seed urls
cr->m_diffbotSeeds.purge();
// reset seed dedup table
HashTableX *ht = &cr->m_seedHashTable;
ht->reset();
}
// so XmlDoc.cpp can detect if the collection was reset since it
// launched its spider:
cr->m_lastResetCount++;
if ( newCollnum >= m_numRecs ) m_numRecs = (int32_t)newCollnum + 1;
// advance sanity check. did we wrap around?
// right now we #define collnum_t int16_t
if ( m_numRecs > 0x7fff ) { char *xx=NULL;*xx=0; }
// make a new collnum so records in transit will not be added
// to any rdb...
cr->m_collnum = newCollnum;
// update the timestamps since we are restarting/resetting
cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
cr->m_diffbotCrawlEndTime = 0;
////////
//
// ALTER m_recs[] array
//
////////
// Rdb::resetColl() needs to know the new cr so it can move
// the RdbBase into cr->m_bases[rdbId] array. recycling.
setRecPtr ( newCollnum , cr );
// a new directory then since we changed the collnum
char dname[512];
sprintf(dname, "%scoll.%s.%"INT32"/",
g_hostdb.m_dir,
cr->m_coll,
(int32_t)newCollnum);
DIR *dir = opendir ( dname );
if ( dir )
closedir ( dir );
if ( dir ) {
//g_errno = EEXIST;
log("admin: Trying to create collection %s but "
"directory %s already exists on disk.",cr->m_coll,dname);
}
if ( ::mkdir ( dname ,
S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IXOTH ) ) {
// valgrind?
//if ( errno == EINTR ) goto retry22;
//g_errno = errno;
log("admin: Creating directory %s had error: "
"%s.", dname,mstrerror(g_errno));
}
// be sure to copy back the bulk urls for bulk jobs
// MDW: now i just store that file in the root working dir
//if (cr->m_isCustomCrawl == 2)
// mv( tmpbulkurlsname, newbulkurlsname );
// . unlink all the *.dat and *.map files for this coll in its subdir
// . remove all recs from this collnum from m_tree/m_buckets
// . updates RdbBase::m_collnum
// . so for the tree it just needs to mark the old collnum recs
// with a collnum -1 in case it is saving...
g_posdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_titledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_tagdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_spiderdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_doledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_clusterdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_linkdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
// reset crawl status too!
cr->m_spiderStatus = SP_INITIALIZING;
// . set m_recs[oldCollnum] to NULL and remove from hash table
// . do after calls to deleteColl() above so it wont crash
setRecPtr ( oldCollnum , NULL );
// save coll.conf to new directory
cr->save();
// and clear the robots.txt cache in case we recently spidered a
// robots.txt, we don't want to use it, we want to use the one we
// have in the test-parser subdir so we are consistent
//RdbCache *robots = Msg13::getHttpCacheRobots();
//RdbCache *others = Msg13::getHttpCacheOthers();
// clear() was removed do to possible corruption
//robots->clear ( oldCollnum );
//others->clear ( oldCollnum );
//g_templateTable.reset();
//g_templateTable.save( g_hostdb.m_dir , "turkedtemplates.dat" );
// repopulate CollectionRec::m_sortByDateTable. should be empty
// since we are resetting here.
//initSortByDateTable ( coll );
// done
return true;
}
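// . reset strategy recap: resetColl2() keeps the same CollectionRec but
//   moves it to a brand new collnum, so records still in transit (msg4
//   adds, merges, etc.) tagged with the OLD collnum can never land in the
//   freshly reset collection
// . the old collnum's data is then dropped via the deleteColl() calls and
//   setRecPtr ( oldCollnum , NULL )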
// a hack function
bool addCollToTable ( char *coll , collnum_t collnum ) {
// readd it to the hashtable that maps name to collnum too
int64_t h64 = hash64n(coll);
g_collTable.set(8,sizeof(collnum_t), 256,NULL,0,
false,0,"nhshtbl");
return g_collTable.addKey ( &h64 , &collnum );
}
// get coll rec specified in the HTTP request
CollectionRec *Collectiondb::getRec ( HttpRequest *r , bool useDefaultRec ) {
char *coll = r->getString ( "c" );
if ( coll && ! coll[0] ) coll = NULL;
// maybe it is crawlbot?
char *name = NULL;
char *token = NULL;
if ( ! coll ) {
name = r->getString("name");
token = r->getString("token");
}
char tmp[MAX_COLL_LEN+1];
if ( ! coll && token && name ) {
snprintf(tmp,MAX_COLL_LEN,"%s-%s",token,name);
coll = tmp;
}
// default to main first
if ( ! coll && useDefaultRec ) {
CollectionRec *cr = g_collectiondb.getRec("main");
if ( cr ) return cr;
}
// try next in line
if ( ! coll && useDefaultRec ) {
return getFirstRec ();
}
// give up?
if ( ! coll ) return NULL;
//if ( ! coll || ! coll[0] ) coll = g_conf.m_defaultColl;
return g_collectiondb.getRec ( coll );
}
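// . collection resolution order for a request (sketch): the "c" cgi parm
//   wins; otherwise crawlbot's "token" and "name" parms are joined as
//   "<token>-<name>"; otherwise fall back to "main", then to the first
//   loaded rec, then give up and return NULL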
char *Collectiondb::getDefaultColl ( HttpRequest *r ) {
char *coll = r->getString ( "c" );
if ( coll && ! coll[0] ) coll = NULL;
if ( coll ) return coll;
CollectionRec *cr = NULL;
// default to main first
if ( ! coll ) {
cr = g_collectiondb.getRec("main");
// CAUTION: cr could be deleted so don't trust this ptr
// if you give up control of the cpu
if ( cr ) return cr->m_coll;
}
// try next in line
if ( ! coll ) {
cr = getFirstRec ();
if ( cr ) return cr->m_coll;
}
// give up?
return NULL;
}
//CollectionRec *Collectiondb::getRec2 ( HttpRequest *r , bool useDefaultRec) {
// char *coll = getDefaultColl();
// return g_collectiondb.getRec(coll);
//}
// . get collectionRec from name
// . returns NULL if not available
CollectionRec *Collectiondb::getRec ( char *coll ) {
if ( ! coll ) coll = "";
return getRec ( coll , gbstrlen(coll) );
}
CollectionRec *Collectiondb::getRec ( char *coll , int32_t collLen ) {
if ( ! coll ) coll = "";
collnum_t collnum = getCollnum ( coll , collLen );
if ( collnum < 0 ) return NULL;
return m_recs [ (int32_t)collnum ];
}
CollectionRec *Collectiondb::getRec ( collnum_t collnum) {
if ( collnum >= m_numRecs || collnum < 0 ) {
// Rdb::resetBase() gets here, so don't always log.
// it is called from CollectionRec::reset() which is called
// from the CollectionRec constructor and ::load() so
// it won't have anything in rdb at that time
//log("colldb: collnum %"INT32" > numrecs = %"INT32"",
// (int32_t)collnum,(int32_t)m_numRecs);
return NULL;
}
return m_recs[collnum];
}
//CollectionRec *Collectiondb::getDefaultRec ( ) {
// if ( ! g_conf.m_defaultColl[0] ) return NULL; // no default?
// collnum_t collnum = getCollnum ( g_conf.m_defaultColl );
// if ( collnum < (collnum_t)0 ) return NULL;
// return m_recs[(int32_t)collnum];
//}
CollectionRec *Collectiondb::getFirstRec ( ) {
for ( int32_t i = 0 ; i < m_numRecs ; i++ )
if ( m_recs[i] ) return m_recs[i];
return NULL;
}
collnum_t Collectiondb::getFirstCollnum ( ) {
for ( int32_t i = 0 ; i < m_numRecs ; i++ )
if ( m_recs[i] ) return i;
return (collnum_t)-1;
}
char *Collectiondb::getFirstCollName ( ) {
for ( int32_t i = 0 ; i < m_numRecs ; i++ )
if ( m_recs[i] ) return m_recs[i]->m_coll;
return NULL;
}
char *Collectiondb::getCollName ( collnum_t collnum ) {
if ( collnum < 0 || collnum >= m_numRecs ) return NULL;
if ( ! m_recs[(int32_t)collnum] ) return NULL;
return m_recs[collnum]->m_coll;
}
collnum_t Collectiondb::getCollnum ( char *coll ) {
int32_t clen = 0;
if ( coll ) clen = gbstrlen(coll );
return getCollnum ( coll , clen );
/*
//if ( ! coll ) coll = "";
// default empty collection names
if ( coll && ! coll[0] ) coll = NULL;
if ( ! coll ) coll = g_conf.m_defaultColl;
if ( ! coll || ! coll[0] ) coll = "main";
// This is necessary for Statsdb to work, as it is
// not associated with any collection. Is this
// necessary for Catdb?
if ( coll[0]=='s' && coll[1] =='t' &&
strcmp ( "statsdb\0", coll ) == 0)
return 0;
if ( coll[0]=='f' && coll[1]=='a' &&
strcmp ( "facebookdb\0", coll ) == 0)
return 0;
if ( coll[0]=='a' && coll[1]=='c' &&
strcmp ( "accessdb\0", coll ) == 0)
return 0;
// because diffbot may have thousands of crawls/collections
// let's improve the speed here. try hashing it...
int64_t h64 = hash64n(coll);
void *vp = g_collTable.getValue ( &h64 );
if ( ! vp ) return -1; // not found
return *(collnum_t *)vp;
*/
/*
for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
if ( ! m_recs[i] ) continue;
if ( m_recs[i]->m_coll[0] != coll[0] ) continue;
if ( strcmp ( m_recs[i]->m_coll , coll ) == 0 ) return i;
}
//if ( strcmp ( "catdb\0", coll ) == 0) return 0;
return (collnum_t)-1; // not found
*/
}
collnum_t Collectiondb::getCollnum ( char *coll , int32_t clen ) {
// default empty collection names
if ( coll && ! coll[0] ) coll = NULL;
if ( ! coll ) {
coll = g_conf.m_defaultColl;
if ( coll ) clen = gbstrlen(coll);
else clen = 0;
}
if ( ! coll || ! coll[0] ) {
coll = "main";
clen = gbstrlen(coll);
}
// This is necessary for Statsdb to work, as it is
//if ( ! coll ) coll = "";
// not associated with any collection. Is this
// necessary for Catdb?
if ( coll[0]=='s' && coll[1] =='t' &&
strcmp ( "statsdb\0", coll ) == 0)
return 0;
if ( coll[0]=='f' && coll[1]=='a' &&
strcmp ( "facebookdb\0", coll ) == 0)
return 0;
if ( coll[0]=='a' && coll[1]=='c' &&
strcmp ( "accessdb\0", coll ) == 0)
return 0;
// because diffbot may have thousands of crawls/collections
// let's improve the speed here. try hashing it...
int64_t h64 = hash64(coll,clen);
void *vp = g_collTable.getValue ( &h64 );
if ( ! vp ) return -1; // not found
return *(collnum_t *)vp;
/*
for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
if ( ! m_recs[i] ) continue;
if ( m_recs[i]->m_collLen != clen ) continue;
if ( strncmp(m_recs[i]->m_coll,coll,clen) == 0 ) return i;
}
//if ( strncmp ( "catdb\0", coll, clen ) == 0) return 0;
return (collnum_t)-1; // not found
*/
}
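// . name lookups go through the g_collTable hash, so a typical lookup is:
//     collnum_t cn = g_collectiondb.getCollnum ( "main" );
//     CollectionRec *cr = ( cn >= 0 ) ? g_collectiondb.getRec ( cn ) : NULL;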
//collnum_t Collectiondb::getNextCollnum ( collnum_t collnum ) {
// for ( int32_t i = (int32_t)collnum + 1 ; i < m_numRecs ; i++ )
// if ( m_recs[i] ) return i;
// // no next one, use -1
// return (collnum_t) -1;
//}
// what collnum will be used the next time a coll is added?
collnum_t Collectiondb::reserveCollNum ( ) {
if ( m_numRecs < 0x7fff ) {
collnum_t next = m_numRecs;
// make the ptr NULL at least to accommodate the
// loops that scan up to m_numRecs lest we core
growRecPtrBuf ( next );
m_numRecs++;
return next;
}
// collnum_t is signed right now because we use -1 to indicate a
// bad collnum.
int32_t scanned = 0;
// search for an empty slot
for ( int32_t i = m_wrapped ; ; i++ ) {
// because collnum_t is 2 bytes, signed, limit this here
if ( i > 0x7fff ) i = 0;
// how can this happen?
if ( i < 0 ) i = 0;
// if we scanned the max # of recs we could have, we are done
if ( ++scanned >= m_numRecs ) break;
// skip if this is in use
if ( m_recs[i] ) continue;
// start after this one next time
m_wrapped = i+1;
// note it
log("colldb: returning wrapped collnum "
"of %"INT32"",(int32_t)i);
return (collnum_t)i;
}
log("colldb: no new collnum available. consider upping collnum_t");
// none available!!
return -1;
}
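// . wrap logic: once m_numRecs hits the 0x7fff collnum_t ceiling we can no
//   longer just append, so reserveCollNum() scans from m_wrapped for a
//   collnum whose m_recs[] slot is NULL (a previously deleted coll) and
//   hands that out, or returns -1 if every slot is taken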
///////////////
//
// COLLECTIONREC
//
///////////////
#include "gb-include.h"
//#include "CollectionRec.h"
//#include "Collectiondb.h"
#include "HttpServer.h" // printColors2()
#include "Msg5.h"
#include "Threads.h"
#include "Datedb.h"
#include "Timedb.h"
#include "Spider.h"
#include "Process.h"
static CollectionRec g_default;
CollectionRec::CollectionRec() {
m_nextLink = NULL;
m_prevLink = NULL;
m_spiderCorruptCount = 0;
m_collnum = -1;
m_coll[0] = '\0';
m_updateRoundNum = 0;
m_swappedOut = false;
//m_numSearchPwds = 0;
//m_numBanIps = 0;
//m_numSearchIps = 0;
//m_numSpamIps = 0;
//m_numAdminPwds = 0;
//m_numAdminIps = 0;
memset ( m_bases , 0 , sizeof(RdbBase *)*RDB_END );
// how many keys in the tree of each rdb? we now store this stuff
// here and not in RdbTree.cpp because we no longer have a maximum
// # of collection recs... MAX_COLLS. each is a 32-bit "int32_t" so
// it is 4 * RDB_END...
memset ( m_numNegKeysInTree , 0 , 4*RDB_END );
memset ( m_numPosKeysInTree , 0 , 4*RDB_END );
m_spiderColl = NULL;
m_overflow = 0x12345678;
m_overflow2 = 0x12345678;
// the spiders are currently uninhibited i guess
m_spiderStatus = SP_INITIALIZING; // this is 0
//m_spiderStatusMsg = NULL;
// for Url::getSite()
m_updateSiteRulesTable = 1;
//m_lastUpdateTime = 0LL;
m_clickNScrollEnabled = false;
// inits for sortbydatetable
m_inProgress = false;
m_msg5 = NULL;
m_importState = NULL;
// JAB - track which regex parsers have been initialized
//log(LOG_DEBUG,"regex: %p initalizing empty parsers", m_pRegExParser);
// clear these out so Parms::calcChecksum can work:
memset( m_spiderFreqs, 0, MAX_FILTERS*sizeof(*m_spiderFreqs) );
//for ( int i = 0; i < MAX_FILTERS ; i++ )
// m_spiderQuotas[i] = -1;
memset( m_spiderPriorities, 0,
MAX_FILTERS*sizeof(*m_spiderPriorities) );
memset ( m_harvestLinks,0,MAX_FILTERS);
memset ( m_forceDelete,0,MAX_FILTERS);
//memset( m_rulesets, 0, MAX_FILTERS*sizeof(*m_rulesets) );
//for ( int i = 0; i < MAX_SEARCH_PASSWORDS; i++ ) {
// *(m_searchPwds[i]) = '\0';
//}
//for ( int i = 0; i < MAX_ADMIN_PASSWORDS; i++ ) {
// *(m_adminPwds[i]) = '\0';
//}
//memset( m_banIps, 0, MAX_BANNED_IPS*sizeof(*m_banIps) );
//memset( m_searchIps, 0, MAX_SEARCH_IPS*sizeof(*m_searchIps) );
//memset( m_spamIps, 0, MAX_SPAM_IPS*sizeof(*m_spamIps) );
//memset( m_adminIps, 0, MAX_ADMIN_IPS*sizeof(*m_adminIps) );
//for ( int i = 0; i < MAX_FILTERS; i++ ) {
// //m_pRegExParser[i] = NULL;
// *(m_regExs[i]) = '\0';
//}
m_numRegExs = 0;
//m_requests = 0;
//m_replies = 0;
//m_doingCallbacks = false;
m_lastResetCount = 0;
// regex_t types
m_hasucr = false;
m_hasupr = false;
// for diffbot caching the global spider stats
reset();
// add default reg ex if we do not have one
//setUrlFiltersToDefaults();
//rebuildUrlFilters();
}
CollectionRec::~CollectionRec() {
//invalidateRegEx ();
reset();
}
// new collection recs get this called on them
void CollectionRec::setToDefaults ( ) {
g_parms.setFromFile ( this , NULL , NULL , OBJ_COLL );
// add default reg ex
//setUrlFiltersToDefaults();
rebuildUrlFilters();
}
void CollectionRec::reset() {
//log("coll: resetting collnum=%"INT32"",(int32_t)m_collnum);
// . grows dynamically
// . setting to 0 buckets should never have error
//m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );
// regex_t types
if ( m_hasucr ) regfree ( &m_ucr );
if ( m_hasupr ) regfree ( &m_upr );
m_hasucr = false;
m_hasupr = false;
m_sendingAlertInProgress = false;
// make sure we do not leave spiders "hanging" waiting for their
// callback to be called... and it never gets called
//if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
//if ( m_doingCallbacks ) { char *xx=NULL;*xx=0; }
//if ( m_replies != m_requests ) { char *xx=NULL;*xx=0; }
m_localCrawlInfo.reset();
m_globalCrawlInfo.reset();
//m_requests = 0;
//m_replies = 0;
// free all RdbBases in each rdb
for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
Rdb *rdb = g_process.m_rdbs[i];
rdb->resetBase ( m_collnum );
}
for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
RdbBase *base = m_bases[i];
if ( ! base ) continue;
mdelete (base, sizeof(RdbBase), "Rdb Coll");
delete (base);
}
SpiderColl *sc = m_spiderColl;
// debug hack thing
//if ( sc == (SpiderColl *)0x8888 ) return;
// if never made one, we are done
if ( ! sc ) return;
// spider coll also!
sc->m_deleteMyself = true;
// if not currently being accessed nuke it now
tryToDeleteSpiderColl ( sc ,"12");
// if ( ! sc->m_msg5.m_waitingForList &&
// ! sc->m_msg5b.m_waitingForList &&
// ! sc->m_msg1.m_mcast.m_inUse ) {
// mdelete ( sc, sizeof(SpiderColl),"nukecr2");
// delete ( sc );
// }
}
CollectionRec *g_cr = NULL;
// . load this data from a conf file
// . values we do not explicitly have will be taken from "default",
// collection config file. if it does not have them then we use
// the value we received from call to setToDefaults()
// . returns false and sets g_errno on load error
bool CollectionRec::load ( char *coll , int32_t i ) {
// also reset some counts not included in parms list
reset();
// before we load, set to defaults in case some are not in xml file
g_parms.setToDefault ( (char *)this , OBJ_COLL , this );
// get the filename with that id
File f;
char tmp2[1024];
sprintf ( tmp2 , "%scoll.%s.%"INT32"/coll.conf", g_hostdb.m_dir , coll,i);
f.set ( tmp2 );
if ( ! f.doesExist () ) return log("admin: %s does not exist.",tmp2);
// set our collection number
m_collnum = i;
// set our collection name
m_collLen = gbstrlen ( coll );
strcpy ( m_coll , coll );
if ( ! g_conf.m_doingCommandLine )
log(LOG_INFO,"db: Loading conf for collection %s (%"INT32")",coll,
(int32_t)m_collnum);
// collection name HACK for backwards compatibility
//if ( strcmp ( coll , "main" ) == 0 ) {
// m_coll[0] = '\0';
// m_collLen = 0;
//}
// the default conf file
char tmp1[1024];
snprintf ( tmp1 , 1023, "%sdefault.conf" , g_hostdb.m_dir );
// . set our parms from the file.
// . accepts OBJ_COLLECTIONREC or OBJ_CONF
g_parms.setFromFile ( this , tmp2 , tmp1 , OBJ_COLL );
// add default reg ex IFF there are no url filters there now
//if(m_numRegExs == 0) rebuildUrlFilters();//setUrlFiltersToDefaults();
// this only rebuild them if necessary
rebuildUrlFilters();//setUrlFiltersToDefaults();
// temp check
//testRegex();
//
// LOAD the crawlinfo class in the collectionrec for diffbot
//
// LOAD LOCAL
snprintf ( tmp1 , 1023, "%scoll.%s.%"INT32"/localcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
log(LOG_DEBUG,"db: Loading %s",tmp1);
m_localCrawlInfo.reset();
SafeBuf sb;
// fillfromfile returns 0 if does not exist, -1 on read error
if ( sb.fillFromFile ( tmp1 ) > 0 )
//m_localCrawlInfo.setFromSafeBuf(&sb);
// it is binary now
gbmemcpy ( &m_localCrawlInfo , sb.getBufStart(),sb.length() );
if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
log("coll: Loaded %s (%"INT32") local hasurlsready=%"INT32"",
m_coll,
(int32_t)m_collnum,
(int32_t)m_localCrawlInfo.m_hasUrlsReadyToSpider);
// we introduced the ThisRound counts later, so don't start them at 0!!
if ( m_spiderRoundNum == 0 &&
m_localCrawlInfo.m_pageDownloadSuccessesThisRound <
m_localCrawlInfo.m_pageDownloadSuccesses ) {
log("coll: fixing process count this round for %s",m_coll);
m_localCrawlInfo.m_pageDownloadSuccessesThisRound =
m_localCrawlInfo.m_pageDownloadSuccesses;
}
// we introduced the ThisRound counts later, so don't start them at 0!!
if ( m_spiderRoundNum == 0 &&
m_localCrawlInfo.m_pageProcessSuccessesThisRound <
m_localCrawlInfo.m_pageProcessSuccesses ) {
log("coll: fixing process count this round for %s",m_coll);
m_localCrawlInfo.m_pageProcessSuccessesThisRound =
m_localCrawlInfo.m_pageProcessSuccesses;
}
// fix from old bug that was fixed
//if ( m_spiderRoundNum == 0 &&
// m_collectiveRespiderFrequency > 0.0 &&
// m_localCrawlInfo.m_sentCrawlDoneAlert ) {
// log("coll: bug fix: resending email alert for coll %s (%"INT32") "
// "of respider freq %f",m_coll,(int32_t)m_collnum,
// m_collectiveRespiderFrequency);
// m_localCrawlInfo.m_sentCrawlDoneAlert = false;
//}
// LOAD GLOBAL
snprintf ( tmp1 , 1023, "%scoll.%s.%"INT32"/globalcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
log(LOG_DEBUG,"db: Loading %s",tmp1);
m_globalCrawlInfo.reset();
sb.reset();
if ( sb.fillFromFile ( tmp1 ) > 0 )
//m_globalCrawlInfo.setFromSafeBuf(&sb);
// it is binary now
gbmemcpy ( &m_globalCrawlInfo , sb.getBufStart(),sb.length() );
if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
log("coll: Loaded %s (%"INT32") global hasurlsready=%"INT32"",
m_coll,
(int32_t)m_collnum,
(int32_t)m_globalCrawlInfo.m_hasUrlsReadyToSpider);
// the list of ip addresses that we have detected as being throttled
// and therefore backoff and use proxies for
if ( ! g_conf.m_doingCommandLine ) {
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.m_allocName = "twittbl";
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
}
////////////
//
// PAGE COUNT TABLE for doing quotas in url filters
//
/////////////
// log it up if there on disk
//snprintf ( tmp1 , 1023, "/coll.%s.%"INT32"/pagecounts.dat",
// m_coll , (int32_t)m_collnum );
//if ( ! m_pageCountTable.load ( g_hostdb.m_dir , tmp1 ) && g_errno )
// log("db: failed to load page count table: %s",
// mstrerror(g_errno));
// ignore errors i guess
g_errno = 0;
// fix for diffbot, spider time deduping
if ( m_isCustomCrawl ) m_dedupingEnabled = true;
// always turn off gigabits so &s=1000 can do summary skipping
if ( m_isCustomCrawl ) m_docsToScanForTopics = 0;
// make min to merge smaller than normal since most collections are
// small and we want to reduce the # of vfds (files) we have
if ( m_isCustomCrawl ) {
m_posdbMinFilesToMerge = 6;
m_titledbMinFilesToMerge = 4;
m_linkdbMinFilesToMerge = 3;
m_tagdbMinFilesToMerge = 2;
}
// always turn on distributed spider locking because otherwise
// we end up calling Msg50 which calls Msg25 for the same root url
// at the same time, thereby wasting massive resources. it is also
// dangerous to run without this because webmasters get pissed when
// we slam their servers.
// This is now deprecated...
//m_useSpiderLocks = false;
// and all pages downloaded from a particular ip should be done
// by the same host in our cluster to prevent webmaster rage
//m_distributeSpiderGet = true;
//initSortByDateTable(m_coll);
return true;
}
/*
bool CollectionRec::countEvents ( ) {
// set our m_numEventsOnHost value
log("coll: loading event count termlist gbeventcount");
// temporarily turn off threads
bool enabled = g_threads.areThreadsEnabled();
g_threads.disableThreads();
// count them
m_numEventsOnHost = 0;
// 1MB at a time
int32_t minRecSizes = 1000000;
// look up this termlist, gbeventcount which we index in XmlDoc.cpp
int64_t termId = hash64n("gbeventcount") & TERMID_MASK;
// make datedb key from it
key128_t startKey = g_datedb.makeStartKey ( termId , 0xffffffff );
key128_t endKey = g_datedb.makeEndKey ( termId , 0 );
Msg5 msg5;
RdbList list;
// . init m_numEventsOnHost by getting the exact length of that
// termlist on this host
// . send in the ping request packet so all hosts can total up
// . Rdb.cpp should be added to incrementally so we should have no
// double positives.
// . Rdb.cpp should inspect each datedb rec for this termid in
// a fast and efficient manner
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_DATEDB ,
m_coll ,
&list ,
(char *)&startKey ,
(char *)&endKey ,
minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
NULL )){// msg5b
// not allowed to block!
char *xx=NULL;*xx=0; }
// scan the list, score is how many valid events from that docid
uint32_t total = 0;
for ( ; ! list.isExhausted() ; list.skipCurrentRec() ) {
unsigned char *rec = (unsigned char *)list.getCurrentRec();
// in datedb score is byte #5
total += (255-rec[5]);
}
// declare
char *lastKeyPtr;
key128_t newStartKey;
// add to count. datedb uses half keys so subtract 6 bytes
// since the termids will be the same...
//m_numEventsOnHost += list.getListSize() / (sizeof(key128_t)-6);
m_numEventsOnHost += total;
// bail if under limit
if ( list.getListSize() < minRecSizes ) goto done;
// update key
lastKeyPtr = list.m_listEnd - 10;
// we make a new start key
list.getKey ( lastKeyPtr , (char *)&newStartKey );
// maxxed out?
if ( newStartKey.n0==0xffffffffffffffffLL &&
newStartKey.n1==0xffffffffffffffffLL )
goto done;
// sanity check
if ( newStartKey < startKey ) { char *xx=NULL;*xx=0; }
if ( newStartKey > endKey ) { char *xx=NULL;*xx=0; }
// inc it
newStartKey.n0++;
// inc the top if the bottom wrapped
if ( newStartKey.n0 == 0LL ) newStartKey.n1++;
// assign
startKey = newStartKey;
// and loop back up for more now
goto loop;
done:
// update all colls count
g_collectiondb.m_numEventsAllColls += m_numEventsOnHost;
if ( enabled ) g_threads.enableThreads();
log("coll: got %"INT32" local events in termlist",m_numEventsOnHost);
// set "m_hasDocQualityFiler"
//updateFilters();
return true;
}
*/
bool CollectionRec::rebuildUrlFilters2 ( ) {
// tell spider loop to update active list
g_spiderLoop.m_activeListValid = false;
bool rebuild = true;
if ( m_numRegExs == 0 )
rebuild = true;
// don't touch it if not supposed to, as long as we have some already
//if ( m_urlFiltersProfile != UFP_NONE )
// rebuild = true;
// never for custom crawls however
if ( m_isCustomCrawl )
rebuild = false;
char *s = m_urlFiltersProfile.getBufStart();
// support the old UFP_CUSTOM, etc. numeric values
if ( !strcmp(s,"0" ) )
s = "custom";
// UFP_WEB SUPPORT
if ( !strcmp(s,"1" ) )
s = "web";
// UFP_NEWS
if ( !strcmp(s,"2" ) )
s = "shallow";
// leave custom profiles alone
if ( !strcmp(s,"custom" ) )
rebuild = false;
//if ( m_numRegExs > 0 && strcmp(m_regExs[m_numRegExs-1],"default") )
// addDefault = true;
if ( ! rebuild ) return true;
if ( !strcmp(s,"shallow" ) )
return rebuildShallowRules();
//if ( strcmp(s,"web") )
// just fall through for that
if ( !strcmp(s,"english") )
return rebuildLangRules( "en","com,us,gov");
if ( !strcmp(s,"german") )
return rebuildLangRules( "de","de");
if ( !strcmp(s,"french") )
return rebuildLangRules( "fr","fr");
if ( !strcmp(s,"norwegian") )
return rebuildLangRules( "nl","nl");
if ( !strcmp(s,"spanish") )
return rebuildLangRules( "es","es");
//if ( m_urlFiltersProfile == UFP_EURO )
// return rebuildLangRules( "de,fr,nl,es,sv,no,it",
// "com,gov,org,de,fr,nl,es,sv,no,it");
if ( !strcmp(s,"romantic") )
return rebuildLangRules("en,de,fr,nl,es,sv,no,it,fi,pt",
"de,fr,nl,es,sv,no,it,fi,pt,"
"com,gov,org"
);
if ( !strcmp(s,"chinese") )
return rebuildLangRules( "zh_cn,zh_tw","cn");
int32_t n = 0;
/*
m_regExs[n].set("default");
m_regExs[n].nullTerm();
m_spiderFreqs [n] = 30; // 30 days default
m_spiderPriorities[n] = 0;
m_maxSpidersPerRule[n] = 99;
m_spiderIpWaits[n] = 1000;
m_spiderIpMaxSpiders[n] = 7;
m_harvestLinks[n] = 1;
*/
// max spiders per ip
int32_t ipms = 7;
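// each url filter rule is one row spread across these parallel arrays:
// m_regExs[n] is the pattern, m_spiderFreqs[n] is the respider wait in
// days (e.g. .00347 days is roughly 5 minutes), m_spiderIpWaits[n] is
// the same-ip wait in milliseconds, m_spiderPriorities[n] runs 0-100
// (100 together with m_forceDelete[n]=1 means delete the doc), and
// m_maxSpidersPerRule[n]/m_spiderIpMaxSpiders[n] cap concurrency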
m_regExs[n].set("isreindex");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 80;
n++;
m_regExs[n].set("ismedia");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100; // delete!
m_forceDelete [n] = 1;
n++;
// if not in the site list then nuke it
m_regExs[n].set("!ismanualadd && !insitelist");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100;
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=3 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100;
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
// a non-temporary error, like a 404? retry once per ~3 months (90 days) i guess
m_regExs[n].set("errorcount>=1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 90; // 90 day retry
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 2;
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
// more than 7 unique c block parent request urls means it is important!
m_regExs[n].set("numinlinks>7 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 52;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
// more than 7 unique c block parent request urls means it is important!
m_regExs[n].set("numinlinks>7");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 51;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("isparentrss && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("isparentsitemap && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 44;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("isparentrss");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 43;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("isparentsitemap");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 42;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==1 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .04166; // 60 minutes
n++;
m_regExs[n].set("hopcount==1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .04166; // 60 minutes
n++;
m_regExs[n].set("hopcount==2 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
// do not harvest links if we are spidering NEWS
if ( ! strcmp(s,"news") ) {
m_spiderFreqs [n] = 5.0;
m_harvestLinks [n] = 0;
}
n++;
m_regExs[n].set("hopcount==2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
// do not harvest links if we are spidering NEWS
if ( ! strcmp(s,"news") ) {
m_spiderFreqs [n] = 5.0;
m_harvestLinks [n] = 0;
}
n++;
m_regExs[n].set("hopcount>=3 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 20;
// turn off spidering if hopcount is too big and we are spidering NEWS
if ( ! strcmp(s,"news") ) {
m_maxSpidersPerRule [n] = 0;
m_harvestLinks [n] = 0;
}
else {
n++;
}
m_regExs[n].set("hopcount>=3");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 19;
// turn off spidering if hopcount is too big and we are spidering NEWS
if ( ! strcmp(s,"news") ) {
m_maxSpidersPerRule [n] = 0;
m_harvestLinks [n] = 0;
}
else {
n++;
}
/*
m_regExs[n].set("isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = resp4;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 2;
n++;
*/
m_regExs[n].set("default");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
if ( ! strcmp(s,"news") ) {
m_maxSpidersPerRule [n] = 0;
m_harvestLinks [n] = 0;
}
n++;
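// the per-column counts below presumably exist so the Parms code can
// track how many entries each parallel array (regexs, freqs, priorities,
// etc.) holds, so keep them all equal to n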
m_numRegExs = n;
m_numRegExs2 = n;
m_numRegExs3 = n;
m_numRegExs10 = n;
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
m_numRegExs7 = n;
// more rules
//m_spiderDiffbotApiNum[n] = 1;
//m_numRegExs11++;
//m_spiderDiffbotApiUrl[n].set("");
//m_spiderDiffbotApiUrl[n].nullTerm();
//m_numRegExs11++;
return true;
}
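// build language-targeted url filters. langStr is a comma-separated list
// of language codes used in "parentlang==<langStr>,xx" rules and tldStr
// is a comma-separated list of tlds used in "tld==<tldStr>" rules. for
// example rebuildLangRules("de","de") generates rules like
// "hopcount==0 && iswww && isnew && tld==de"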
bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
// max spiders per ip
int32_t ipms = 7;
int32_t n = 0;
m_regExs[n].set("isreindex");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 80;
n++;
m_regExs[n].set("ismedia");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100; // delete!
m_forceDelete [n] = 1;
n++;
// if not in the site list then nuke it
m_regExs[n].set("!ismanualadd && !insitelist");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100; // delete!
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=3 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100;
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
n++;
m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && iswww && isnew && tld==%s",
tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && iswww && isnew && "
"parentlang==%s,xx"
,langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;
// m_regExs[n].set("hopcount==0 && iswww && isnew");
// m_harvestLinks [n] = 1;
// m_spiderFreqs [n] = 7; // 30 days default
// m_maxSpidersPerRule [n] = 9; // max spiders
// m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
// m_spiderIpWaits [n] = 1000; // same ip wait
// m_spiderPriorities [n] = 20;
// n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && iswww && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && iswww && parentlang==%s,xx",
langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 19;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && isnew && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && isnew && parentlang==%s,xx",
langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].set("hopcount==0 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 18;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && parentlang==%s,xx",langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].set("hopcount==0");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 17;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==1 && isnew && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==1 && isnew && parentlang==%s,xx",
tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].set("hopcount==1 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 16;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==1 && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==1 && parentlang==%s,xx",langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].set("hopcount==1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 15;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==2 && isnew && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==2 && isnew && parentlang==%s,xx",
langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==2 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 14;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==2 && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==2 && parentlang==%s,xx",langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].set("hopcount==2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 13;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount>=3 && isnew && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount>=3 && isnew && parentlang==%s,xx",
langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;
m_regExs[n].set("hopcount>=3 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 12;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount>=3 && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount>=3 && parentlang==%s,xx",langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;
m_regExs[n].set("hopcount>=3");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 11;
n++;
m_regExs[n].set("default");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
n++;
m_numRegExs = n;
m_numRegExs2 = n;
m_numRegExs3 = n;
m_numRegExs10 = n;
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
m_numRegExs7 = n;
// done rebuilding the language-specific rules
return true;
}
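// build the "shallow" profile: same base rules as above, but sites
// tagged shallow in the site list stop getting spidered once
// hopcount>=2 or once 10+ pages from the site are indexed (sitepages>=10)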
bool CollectionRec::rebuildShallowRules ( ) {
// max spiders per ip
int32_t ipms = 7;
int32_t n = 0;
m_regExs[n].set("isreindex");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 80;
n++;
m_regExs[n].set("ismedia");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100; // delete!
m_forceDelete [n] = 1;
n++;
// if not in the site list then nuke it
m_regExs[n].set("!ismanualadd && !insitelist");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100; // delete!
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=3 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100;
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
n++;
m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
n++;
//
// stop if hopcount>=2 for things tagged shallow in sitelist
//
m_regExs[n].set("tag:shallow && hopcount>=2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 0; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
// if # of pages in this site indexed is >= 10 then stop as well...
m_regExs[n].set("tag:shallow && sitepages>=10");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 0; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].set("hopcount==0 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].set("hopcount==0");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].set("hopcount==1 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].set("hopcount==1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].set("hopcount==2 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].set("hopcount>=3 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;
m_regExs[n].set("hopcount>=3");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;
m_regExs[n].set("default");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
n++;
m_numRegExs = n;
m_numRegExs2 = n;
m_numRegExs3 = n;
m_numRegExs10 = n;
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
m_numRegExs7 = n;
// done rebuilding SHALLOW rules
return true;
}
/*
bool CrawlInfo::print (SafeBuf *sb ) {
return sb->safePrintf("objectsAdded:%"INT64"\n"
"objectsDeleted:%"INT64"\n"
"urlsConsidered:%"INT64"\n"
"downloadAttempts:%"INT64"\n"
"downloadSuccesses:%"INT64"\n"
"processAttempts:%"INT64"\n"
"processSuccesses:%"INT64"\n"
"lastupdate:%"UINT32"\n"
, m_objectsAdded
, m_objectsDeleted
, m_urlsConsidered
, m_pageDownloadAttempts
, m_pageDownloadSuccesses
, m_pageProcessAttempts
, m_pageProcessSuccesses
, m_lastUpdateTime
);
}
bool CrawlInfo::setFromSafeBuf (SafeBuf *sb ) {
return sscanf(sb->getBufStart(),
"objectsAdded:%"INT64"\n"
"objectsDeleted:%"INT64"\n"
"urlsConsidered:%"INT64"\n"
"downloadAttempts:%"INT64"\n"
"downloadSuccesses:%"INT64"\n"
"processAttempts:%"INT64"\n"
"processSuccesses:%"INT64"\n"
"lastupdate:%"UINT32"\n"
, &m_objectsAdded
, &m_objectsDeleted
, &m_urlsConsidered
, &m_pageDownloadAttempts
, &m_pageDownloadSuccesses
, &m_pageProcessAttempts
, &m_pageProcessSuccesses
, &m_lastUpdateTime
);
}
*/
// returns false on failure and sets g_errno, true otherwise
bool CollectionRec::save ( ) {
if ( g_conf.m_readOnlyMode ) return true;
//File f;
char tmp[1024];
//sprintf ( tmp , "%scollections/%"INT32".%s/c.conf",
// g_hostdb.m_dir,m_id,m_coll);
// collection name HACK for backwards compatibility
//if ( m_collLen == 0 )
// sprintf ( tmp , "%scoll.main/coll.conf", g_hostdb.m_dir);
//else
snprintf ( tmp , 1023, "%scoll.%s.%"INT32"/coll.conf",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
if ( ! g_parms.saveToXml ( (char *)this , tmp ,OBJ_COLL)) return false;
// log msg
//log (LOG_INFO,"db: Saved %s.",tmp);//f.getFilename());
//
// save the crawlinfo class in the collectionrec for diffbot
//
// SAVE LOCAL
snprintf ( tmp , 1023, "%scoll.%s.%"INT32"/localcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
//log("coll: saving %s",tmp);
// in case emergency save from malloc core, do not alloc
char stack[1024];
SafeBuf sb(stack,1024);
//m_localCrawlInfo.print ( &sb );
// binary now
sb.safeMemcpy ( &m_localCrawlInfo , sizeof(CrawlInfo) );
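// this raw struct snapshot is what the gbmemcpy in the load path above
// reads back, so both sides must agree on sizeof(CrawlInfo)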
if ( sb.safeSave ( tmp ) == -1 ) {
log("db: failed to save file %s : %s",
tmp,mstrerror(g_errno));
g_errno = 0;
}
// SAVE GLOBAL
snprintf ( tmp , 1023, "%scoll.%s.%"INT32"/globalcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
//log("coll: saving %s",tmp);
sb.reset();
//m_globalCrawlInfo.print ( &sb );
// binary now
sb.safeMemcpy ( &m_globalCrawlInfo , sizeof(CrawlInfo) );
if ( sb.safeSave ( tmp ) == -1 ) {
log("db: failed to save file %s : %s",
tmp,mstrerror(g_errno));
g_errno = 0;
}
// the list of ip addresses that we have detected as being throttled
// and therefore back off and use proxies for
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.save ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
// do not need a save now
m_needsSave = false;
// waiting tree is saved in SpiderCache::save() called by Process.cpp
//SpiderColl *sc = m_spiderColl;
//if ( ! sc ) return true;
// save page count table which has # of pages indexed per
// subdomain/site and firstip for doing quotas in url filters table
//snprintf ( tmp , 1023, "coll.%s.%"INT32"/pagecounts.dat",
// m_coll , (int32_t)m_collnum );
//if ( ! m_pageCountTable.save ( g_hostdb.m_dir , tmp ) ) {
// log("db: failed to save file %s : %s",tmp,mstrerror(g_errno));
// g_errno = 0;
//}
return true;
}
// calls hasPermission() below
bool CollectionRec::hasPermission ( HttpRequest *r , TcpSocket *s ) {
int32_t plen;
char *p = r->getString ( "pwd" , &plen );
int32_t ip = s->m_ip;
return hasPermission ( p , plen , ip );
}
// . is this ip allowed to act as a spam assassin for this collection?
bool CollectionRec::isAssassin ( int32_t ip ) {
// ok, make sure they came from an acceptable IP
//for ( int32_t i = 0 ; i < m_numSpamIps ; i++ )
// // they also have a matching IP, so they now have permission
// if ( m_spamIps[i] == ip ) return true;
return false;
}
// . does this password work for this collection?
bool CollectionRec::hasPermission ( char *p, int32_t plen , int32_t ip ) {
// just return true
// collection permission is checked from Users::verifyColl
// in User::getUserType for every request
return true;
// scan the passwords
// MDW: no longer, this is too vulnerable!!!
/*
for ( int32_t i = 0 ; i < m_numAdminPwds ; i++ ) {
int32_t len = gbstrlen ( m_adminPwds[i] );
if ( len != plen ) continue;
if ( strncmp ( m_adminPwds[i] , p , plen ) != 0 ) continue;
// otherwise it's a match!
//goto checkIp;
// . matching one password is good enough now, default OR
// . because just matching an IP is good enough security,
// there is really no need for both IP AND passwd match
return true;
}
*/
// . if had passwords but the provided one didn't match, return false
// . matching one password is good enough now, default OR
//if ( m_numPasswords > 0 ) return false;
// checkIp:
// ok, make sure they came from an acceptable IP
//for ( int32_t i = 0 ; i < m_numAdminIps ; i++ )
// // they also have a matching IP, so they now have permission
// if ( m_adminIps[i] == ip ) return true;
// if no security, allow all NONONONONONONONONO!!!!!!!!!!!!!!
//if ( m_numAdminPwds == 0 && m_numAdminIps == 0 ) return true;
// if they did not match an ip or password, even if both lists
// are empty, do not allow access... this prevents security breaches
// by accident
return false;
// if there were IPs then they failed to get in
//if ( m_numAdminIps > 0 ) return false;
// otherwise, they made it
//return true;
}
// can this ip perform a search or add url on this collection?
bool CollectionRec::hasSearchPermission ( TcpSocket *s , int32_t encapIp ) {
// get the ip
int32_t ip = 0; if ( s ) ip = s->m_ip;
// and the ip domain
int32_t ipd = 0; if ( s ) ipd = ipdom ( s->m_ip );
// and the top 2 bytes for the Israeli isp that has this huge block
int32_t ipt = 0; if ( s ) ipt = iptop ( s->m_ip );
// is it in the ban list?
/*
for ( int32_t i = 0 ; i < m_numBanIps ; i++ ) {
if ( isIpTop ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipt ) return false;
continue;
}
// check for ip domain match if this banned ip is an ip domain
if ( isIpDom ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipd ) return false;
continue;
}
// otherwise it's just a single banned ip
if ( m_banIps[i] == ip ) return false;
}
*/
// check the encapsulate ip if any
// 1091771468731 0 Aug 05 23:51:08 63.236.25.77 GET
// /search?code=mammaXbG&uip=65.87.190.39&n=15&raw=8&q=farm+insurance
// +nj+state HTTP/1.0
/*
if ( encapIp ) {
ipd = ipdom ( encapIp );
ip = encapIp;
for ( int32_t i = 0 ; i < m_numBanIps ; i++ ) {
if ( isIpDom ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipd ) return false;
continue;
}
if ( isIpTop ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipt ) return false;
continue;
}
if ( m_banIps[i] == ip ) return false;
}
}
*/
return true;
/*
// do we have an "only" list?
if ( m_numSearchIps == 0 ) return true;
// it must be in that list if we do
for ( int32_t i = 0 ; i < m_numSearchIps ; i++ ) {
// check for ip domain match if this banned ip is an ip domain
if ( isIpDom ( m_searchIps[i] ) ) {
if ( m_searchIps[i] == ipd ) return true;
continue;
}
// otherwise it's just a single ip
if ( m_searchIps[i] == ip ) return true;
}
*/
// otherwise no permission
return false;
}
bool expandRegExShortcuts ( SafeBuf *sb ) ;
void nukeDoledb ( collnum_t collnum );
// rebuild the regexes related to diffbot, such as the one for the URL pattern
bool CollectionRec::rebuildDiffbotRegexes() {
//logf(LOG_DEBUG,"db: rebuilding url filters");
char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
// get the regexes
if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
char *upp = m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
char *ppp = m_diffbotPageProcessPattern.getBufStart();
if ( ppp && ! ppp[0] ) ppp = NULL;
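// ucp/upp are the url crawl/process patterns (falling back to the
// corresponding regexes) and ppp is the page process pattern; an empty
// buffer is treated the same as unset (NULL)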
// recompiling regexes starts now
if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}
if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}
// copy into tmpbuf
SafeBuf tmp;
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
int32_t err;
if ( rx && ( err = regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
// error! regcomp() does not set errno, so use regerror()
char errbuf[1024];
regerror(err,&m_ucr,errbuf,1000);
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_ucr );
m_hasucr = false;
}
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && ( err = regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
// error! regcomp() does not set errno, so use regerror()
char errbuf[1024];
regerror(err,&m_upr,errbuf,1000);
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_upr );
m_hasupr = false;
}
return true;
}
bool CollectionRec::rebuildUrlFiltersDiffbot() {
//logf(LOG_DEBUG,"db: rebuilding url filters");
char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
// if we had a regex, that works for this purpose as well
if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
char *upp = m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
// if we had a regex, that works for this purpose as well
if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
char *ppp = m_diffbotPageProcessPattern.getBufStart();
if ( ppp && ! ppp[0] ) ppp = NULL;
///////
//
// recompile regular expressions
//
///////
if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}
if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}
// copy into tmpbuf
SafeBuf tmp;
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
int32_t err;
if ( rx && ( err = regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
// error!
char errbuf[1024];
regerror(err,&m_ucr,errbuf,1000);
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_ucr );
m_hasucr = false;
}
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && ( err = regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
char errbuf[1024];
regerror(err,&m_upr,errbuf,1000);
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_upr );
m_hasupr = false;
}
// what diffbot url to use for processing
char *api = m_diffbotApiUrl.getBufStart();
if ( api && ! api[0] ) api = NULL;
// convert from seconds to milliseconds. default is 250ms?
int32_t wait = (int32_t)(m_collectiveCrawlDelay * 1000.0);
// default to 250ms i guess. -1 means unset i think.
if ( m_collectiveCrawlDelay < 0.0 ) wait = 250;
bool isEthan = false;
if (m_coll)isEthan=strstr(m_coll,"2b44a0e0bb91bbec920f7efd29ce3d5b");
// it looks like we are assuming all crawls are repeating so that
// &roundStart=<currenttime> or &roundStart=0 which is the same
// thing, will trigger a re-crawl. so if collectiveRespiderFreq
// is 0 assume it is effectively never (3652.5 days, about 10 years).
// so that stuff works.
// also i had to make the "default" rule below always have a respider
// freq of 0.0 so it will respider right away if we make it past the
// "lastspidertime>={roundstart}" rule which we will if they
// set the roundstart time to the current time using &roundstart=0
float respiderFreq = m_collectiveRespiderFrequency;
if ( respiderFreq <= 0.0 ) respiderFreq = 3652.5;
// lower from 7 to 1 since we have so many collections now
// ok, now we have far fewer colls so raise back to 7
int32_t diffbotipms = 7;//1; // 7
// make the gigablast regex table just "default" so it does not
// filtering, but accepts all urls. we will add code to pass the urls
// through m_diffbotUrlCrawlPattern alternatively. if that itself
// is empty, we will just restrict to the seed urls subdomain.
for ( int32_t i = 0 ; i < MAX_FILTERS ; i++ ) {
m_regExs[i].purge();
m_spiderPriorities[i] = 0;
m_maxSpidersPerRule [i] = 100;
// when someone has a bulk job of thousands of different
// domains it slows diffbot back-end down, so lower this
// from 100 if doing a bulk job
if ( m_isCustomCrawl == 2 )
m_maxSpidersPerRule[i] = 2;// try 2 not 1 to be faster
m_spiderIpWaits [i] = wait;
m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
// ethan wants some speed
// if ( isEthan )
// m_spiderIpMaxSpiders[i] = 30;
//m_spidersEnabled [i] = 1;
m_spiderFreqs [i] = respiderFreq;
//m_spiderDiffbotApiUrl[i].purge();
m_harvestLinks[i] = true;
m_forceDelete [i] = false;
}
int32_t i = 0;
// 1st one! for query reindex/ query delete
m_regExs[i].set("isreindex");
m_spiderIpMaxSpiders [i] = 10;
m_spiderPriorities [i] = 70;
i++;
// 2nd default url
m_regExs[i].set("ismedia && !ismanualadd");
m_maxSpidersPerRule [i] = 0;
m_spiderPriorities [i] = 100; // delete!
m_forceDelete [i] = 1;
i++;
// de-prioritize fakefirstip urls so we don't give the impression our
// spiders are slow. like if someone adds a bulk job with 100,000 urls
// then we sit there processing them to look up their ips and add a real
// spider request (if it falls onto the same shard) before we actually
// do any real spidering. so keep the priority here low.
m_regExs[i].set("isfakeip");
m_maxSpidersPerRule [i] = 7;
m_spiderIpMaxSpiders [i] = 7;
m_spiderPriorities [i] = 20;
m_spiderIpWaits [i] = 0;
i++;
// hopcount filter if asked for
if( m_diffbotMaxHops >= 0 ) {
// transform long to string
char numstr[21]; // enough to hold all numbers up to 64-bits
sprintf(numstr, "%"INT32"", (int32_t)m_diffbotMaxHops);
// form regEx like: hopcount>3
char hopcountStr[30];
strcpy(hopcountStr, "hopcount>");
strcat(hopcountStr, numstr);
m_regExs[i].set(hopcountStr);
// means DELETE :
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
// just don't spider
m_maxSpidersPerRule[i] = 0;
// compatibility with m_spiderRoundStartTime:
m_spiderFreqs[i] = 0.0;
i++;
}
// 2nd default filter
// always turn this on for now. they need to add domains they want
// to crawl as seeds so they do not spider the web.
// no because FTB seeds with link pages that link to another
// domain. they just need to be sure to supply a crawl pattern
// to avoid spidering the whole web.
//
// if they did not EXPLICITLY provide a url crawl pattern or
// url crawl regex then restrict to seeds to prevent from spidering
// the entire internet.
//if ( ! ucp && ! m_hasucr ) { // m_restrictDomain ) {
// MDW: even if they supplied a crawl pattern let's restrict to seed
// domains 12/15/14
m_regExs[i].set("!isonsamedomain && !ismanualadd");
m_maxSpidersPerRule [i] = 0;
m_spiderPriorities [i] = 100; // delete!
m_forceDelete [i] = 1;
i++;
//}
bool ucpHasPositive = false;
// . scan them to see if all patterns start with '!' or not
// . if pattern starts with ! it is negative, otherwise positive
if ( ucp ) ucpHasPositive = hasPositivePattern ( ucp );
// if no crawl regex, and it has a crawl pattern consisting of
// only negative patterns then restrict to domains of seeds
if ( ucp && ! ucpHasPositive && ! m_hasucr ) {
m_regExs[i].set("!isonsamedomain && !ismanualadd");
m_maxSpidersPerRule [i] = 0;
m_spiderPriorities [i] = 100; // delete!
m_forceDelete [i] = 1;
i++;
}
m_regExs[i].set("errorcount>=1 && !hastmperror");
m_spiderPriorities [i] = 15;
m_spiderFreqs [i] = 0.0;
m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
i++;
// and for docs that have tmp errors, retry again quickly at first
m_regExs[i].set("errorcount==1 && hastmperror");
m_spiderPriorities [i] = 40;
m_spiderFreqs [i] = 0.001; // 86 seconds
i++;
// and for docs that still have tmp errors, back off a bit more
m_regExs[i].set("errorcount==2 && hastmperror");
m_spiderPriorities [i] = 40;
m_spiderFreqs [i] = 0.003; // 3*86 seconds (was 24 hrs)
i++;
// excessive errors? (tcp/dns timed out, etc.) retry once per month?
m_regExs[i].set("errorcount>=3 && hastmperror");
m_spiderPriorities [i] = 3;
m_spiderFreqs [i] = 30; // 30 days
// if bulk job, do not download a url more than 3 times
if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0;
i++;
// 3rd rule for respidering
if ( respiderFreq > 0.0 ) {
m_regExs[i].set("lastspidertime>={roundstart}");
// do not "remove" from index
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// if collectiverespiderfreq is 0 or less then do not RE-spider
// documents already indexed.
else {
// this does NOT work! error docs continuously respider
// because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
//m_regExs[i].set("isindexed");
m_regExs[i].set("hasreply");
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// url crawl and PAGE process pattern
if ( ucp && ! upp && ppp ) {
// if just matches ucp, just crawl it, do not process
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
i++;
// crawl everything else, but don't harvest links,
// we have to see if the page content matches the "ppp"
// to determine whether the page should be processed or not.
m_regExs[i].set("default");
m_spiderPriorities [i] = 52;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
m_harvestLinks [i] = false;
i++;
goto done;
}
// url crawl and process pattern
if ( ucp && upp ) {
m_regExs[i].set("matchesucp && matchesupp");
m_spiderPriorities [i] = 55;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// if just matches ucp, just crawl it, do not process
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
i++;
// just process, do not spider links if does not match ucp
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 54;
m_harvestLinks [i] = false;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else
m_regExs[i].set("default");
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
// don't spider
m_maxSpidersPerRule[i] = 0;
// this needs to be zero so &spiderRoundStart=0
// functionality which sets m_spiderRoundStartTime
// to the current time works
// otherwise Spider.cpp's getSpiderTimeMS() returns a time
// in the future and we can't force the round
m_spiderFreqs[i] = 0.0;
i++;
}
// harvest links if we should crawl it
if ( ucp && ! upp ) {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// process everything since upp is empty
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else
m_regExs[i].set("default");
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
// don't delete, just don't spider
m_maxSpidersPerRule[i] = 0;
// this needs to be zero so &spiderRoundStart=0
// functionality which sets m_spiderRoundStartTime
// to the current time works
// otherwise Spider.cpp's getSpiderTimeMS() returns a time
// in the future and we can't force the round
m_spiderFreqs[i] = 0.0;
i++;
}
// just process
if ( upp && ! ucp ) {
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 54;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
//m_harvestLinks [i] = false;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// crawl everything by default, no processing
m_regExs[i].set("default");
m_spiderPriorities [i] = 50;
// this needs to be zero so &spiderRoundStart=0
// functionality which sets m_spiderRoundStartTime
// to the current time works
// otherwise Spider.cpp's getSpiderTimeMS() returns a time
// in the future and we can't force the round
m_spiderFreqs[i] = 0.0;
i++;
}
// no restraints
if ( ! upp && ! ucp ) {
// crawl everything by default, no processing
m_regExs[i].set("default");
m_spiderPriorities [i] = 50;
// this needs to be zero so &spiderRoundStart=0
// functionality which sets m_spiderRoundStartTime
// to the current time works
// otherwise Spider.cpp's getSpiderTimeMS() returns a time
// in the future and we can't force the round
m_spiderFreqs[i] = 0.0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
done:
m_numRegExs = i;
m_numRegExs2 = i;
m_numRegExs3 = i;
m_numRegExs10 = i;
m_numRegExs5 = i;
m_numRegExs6 = i;
//m_numRegExs7 = i;
m_numRegExs8 = i;
m_numRegExs7 = i;
//m_numRegExs11 = i;
//char *x = "http://staticpages.diffbot.com/testCrawl/article1.html";
//if(m_hasupr && regexec(&m_upr,x,0,NULL,0) ) { char *xx=NULL;*xx=0; }
return true;
}
// . anytime the url filters are updated, this function is called
// . it is also called on load of the collection at startup
bool CollectionRec::rebuildUrlFilters ( ) {
if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
log("coll: Rebuilding url filters for %s ufp=%s",m_coll,
m_urlFiltersProfile.getBufStart());
// if not a custom crawl, and no expressions, add a default one
//if ( m_numRegExs == 0 && ! m_isCustomCrawl ) {
// setUrlFiltersToDefaults();
//}
// if not a custom crawl then set the url filters based on
// the url filter profile, if any
if ( ! m_isCustomCrawl )
rebuildUrlFilters2();
// set this so we know whether we have to keep track of page counts
// per subdomain/site and per domain. if the url filters have
// 'sitepages' 'domainpages' 'domainadds' or 'siteadds' we have to keep
// the count table SpiderColl::m_pageCountTable.
m_urlFiltersHavePageCounts = false;
for ( int32_t i = 0 ; i < m_numRegExs ; i++ ) {
// get the ith rule
SafeBuf *sb = &m_regExs[i];
char *p = sb->getBufStart();
if ( strstr(p,"sitepages") ||
strstr(p,"domainpages") ||
strstr(p,"siteadds") ||
strstr(p,"domainadds") ) {
m_urlFiltersHavePageCounts = true;
break;
}
}
// if collection is brand new being called from addNewColl()
// then sc will be NULL
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(m_collnum);
// . do not do this at startup
// . this essentially resets doledb
if ( g_doledb.m_rdb.m_initialized &&
// somehow this is initialized before we set m_recs[m_collnum]
// so we gotta do the two checks below...
sc &&
// must be a valid coll
m_collnum < g_collectiondb.m_numRecs &&
g_collectiondb.m_recs[m_collnum] ) {
log("coll: resetting doledb for %s (%li)",m_coll,
(long)m_collnum);
// clear doledb recs from tree
//g_doledb.getRdb()->deleteAllRecs ( m_collnum );
nukeDoledb ( m_collnum );
// add it back
//if ( ! g_doledb.getRdb()->addRdbBase2 ( m_collnum ) )
// log("coll: error re-adding doledb for %s",m_coll);
// just start this over...
// . MDW left off here
//tryToDelete ( sc );
// maybe this is good enough
//if ( sc ) sc->m_waitingTreeNeedsRebuild = true;
//CollectionRec *cr = sc->m_cr;
// . rebuild sitetable? in PageBasic.cpp.
// . re-adds seed spider requests using msg4
// . true = addSeeds
// . no, don't do this now because we call updateSiteList()
// when we have &sitelist=xxxx in the request which will
// handle updating those tables
//updateSiteListTables ( m_collnum ,
// true ,
// cr->m_siteListBuf.getBufStart() );
}
// If the crawl is not generated by crawlbot, then we will just update
// the regexes concerning the urls to process
rebuildDiffbotRegexes();
if ( ! m_isCustomCrawl ){
return true;
}
// on the other hand, if it is a crawlbot job, then by convention
// the url filters are all set to some default ones.
return rebuildUrlFiltersDiffbot();
}
// for some reason the libc we use doesn't support these shortcuts,
// so expand them to something it does support
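// e.g. a user regex like "article\d+\.html" becomes
// "article[0-9]+\.html" before being handed to regcomp()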
bool expandRegExShortcuts ( SafeBuf *sb ) {
if ( ! sb->safeReplace3 ( "\\d" , "[0-9]" ) ) return false;
if ( ! sb->safeReplace3 ( "\\D" , "[^0-9]" ) ) return false;
if ( ! sb->safeReplace3 ( "\\l" , "[a-z]" ) ) return false;
if ( ! sb->safeReplace3 ( "\\a" , "[A-Za-z]" ) ) return false;
if ( ! sb->safeReplace3 ( "\\u" , "[A-Z]" ) ) return false;
if ( ! sb->safeReplace3 ( "\\w" , "[A-Za-z0-9_]" ) ) return false;
if ( ! sb->safeReplace3 ( "\\W" , "[^A-Za-z0-9_]" ) ) return false;
return true;
}
void testRegex ( ) {
//
// TEST
//
char *rx;
rx = "(http://)?(www.)?vault.com/rankings-reviews/company-rankings/law/vault-law-100/\\.aspx\\?pg=\\d";
rx = "(http://)?(www.)?vault.com/rankings-reviews/company-rankings/law/vault-law-100/\\.aspx\\?pg=[0-9]";
rx = ".*?article[0-9]*?.html";
regex_t ucr;
int32_t err;
if ( ( err = regcomp ( &ucr , rx ,
REG_ICASE
|REG_EXTENDED
//|REG_NEWLINE
//|REG_NOSUB
) ) ) {
// error!
char errbuf[1024];
regerror(err,&ucr,errbuf,1000);
log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
}
logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);
//char *url = "http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2";
char *url = "http://staticpages.diffbot.com/testCrawl/regex/article1.html";
if ( regexec(&ucr,url,0,NULL,0) )
logf(LOG_DEBUG,"db: failed to match %s on %s",
url,rx);
else
logf(LOG_DEBUG,"db: MATCHED %s on %s",
url,rx);
exit(0);
}
int64_t CollectionRec::getNumDocsIndexed() {
RdbBase *base = getBase(RDB_TITLEDB);//m_bases[RDB_TITLEDB];
if ( ! base ) return 0LL;
return base->getNumGlobalRecs();
}
// messes with m_spiderColl->m_sendLocalCrawlInfoToHost[MAX_HOSTS]
// so we do not have to keep sending this huge msg!
bool CollectionRec::shouldSendLocalCrawlInfoToHost ( int32_t hostId ) {
if ( ! m_spiderColl ) return false;
if ( hostId < 0 ) { char *xx=NULL;*xx=0; }
if ( hostId >= g_hostdb.m_numHosts ) { char *xx=NULL;*xx=0; }
// sanity
return m_spiderColl->m_sendLocalCrawlInfoToHost[hostId];
}
void CollectionRec::localCrawlInfoUpdate() {
if ( ! m_spiderColl ) return;
// turn on all the flags
memset(m_spiderColl->m_sendLocalCrawlInfoToHost,1,g_hostdb.m_numHosts);
}
// right after we copy it for sending we set this so we do not send
// again unless localCrawlInfoUpdate() is called
void CollectionRec::sentLocalCrawlInfoToHost ( int32_t hostId ) {
if ( ! m_spiderColl ) return;
m_spiderColl->m_sendLocalCrawlInfoToHost[hostId] = 0;
}