open-source-search-engine/Collectiondb.cpp
Matt Wells 8a65d21371 fix the source of lots of corruption in spiderdb and titledb.
rdbmem.cpp was storing in secondary mem which got reset when
dump completed. also do not add keys that are in collnum and
key range of list currently being dumped, return ETRYAGAIN.
added verify writes parm. clean out tree of titledb and spiderdb
corruption on startup.
2016-03-15 15:54:12 -07:00

#include "gb-include.h"
#include "Collectiondb.h"
//#include "CollectionRec.h"
#include "Xml.h"
#include "Url.h"
#include "Loop.h"
#include "Spider.h" // for calling SpiderLoop::collectionsUpdated()
#include "Posdb.h"
//#include "Indexdb.h"
#include "Datedb.h"
#include "Titledb.h"
//#include "Revdb.h"
//#include "Sections.h"
#include "Placedb.h"
#include "Tagdb.h"
#include "Catdb.h"
#include "Tfndb.h"
#include "Spider.h"
//#include "Checksumdb.h"
#include "Clusterdb.h"
#include "Spider.h"
#include "Repair.h"
#include "Users.h"
#include "Parms.h"
void testRegex ( ) ;
HashTableX g_collTable;
// a global class extern'd in .h file
Collectiondb g_collectiondb;
Collectiondb::Collectiondb ( ) {
m_wrapped = 0;
m_numRecs = 0;
m_numRecsUsed = 0;
m_numCollsSwappedOut = 0;
m_initializing = false;
//m_lastUpdateTime = 0LL;
m_needsSave = false;
// sanity. if RDB_END2 is too small we fall through to the log and
// deliberate crash below.
if ( RDB_END2 >= RDB_END ) return;
log("db: increase RDB_END2 to at least %"INT32" in "
"Collectiondb.h",(int32_t)RDB_END);
char *xx=NULL;*xx=0;
}
// reset rdb
void Collectiondb::reset() {
log(LOG_INFO,"db: resetting collectiondb.");
for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
if ( ! m_recs[i] ) continue;
mdelete ( m_recs[i], sizeof(CollectionRec), "CollectionRec" );
delete ( m_recs[i] );
m_recs[i] = NULL;
}
m_numRecs = 0;
m_numRecsUsed = 0;
g_collTable.reset();
}
/*
bool Collectiondb::init ( bool isDump ) {
reset();
if ( g_isYippy ) return true;
// reset # of recs
//m_numRecs = 0;
//m_numRecsUsed = 0;
// . now load ALL recs
// . returns false and sets g_errno on error
if ( ! load ( isDump ) ) return false;
// update time
updateTime();
// so we don't save again
m_needsSave = false;
// sanity
if ( RDB_END2 < RDB_END ) {
log("db: increase RDB_END2 to at least %"INT32" in "
"Collectiondb.h",(int32_t)RDB_END);
char *xx=NULL;*xx=0;
}
// if it set g_errno, return false
//if ( g_errno ) return log("admin: Had init error: %s.",
// mstrerror(g_errno));
g_errno = 0;
// otherwise, true, even if reloadList() blocked
return true;
}
*/
extern bool g_inAutoSave;
// . save to disk
// . returns false if blocked, true otherwise
bool Collectiondb::save ( ) {
if ( g_conf.m_readOnlyMode ) return true;
if ( g_inAutoSave && m_numRecsUsed > 20 && g_hostdb.m_hostId != 0 )
return true;
// which collection rec needs a save
for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
if ( ! m_recs[i] ) continue;
// temp debug message
//logf(LOG_DEBUG,"admin: SAVING collection #%"INT32" ANYWAY",i);
if ( ! m_recs[i]->m_needsSave ) continue;
// if we core in malloc we won't be able to save the
// coll.conf files
if ( m_recs[i]->m_isCustomCrawl &&
g_inMemFunction &&
g_hostdb.m_hostId != 0 )
continue;
//log(LOG_INFO,"admin: Saving collection #%"INT32".",i);
m_recs[i]->save ( );
}
// oh well
return true;
}
///////////
//
// fill up our m_recs[] array based on the coll.*.*/coll.conf files
//
///////////
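// a sketch of the on-disk layout this scan expects (names hypothetical):
//   coll.main.0/coll.conf
//   coll.mycoll.3/coll.conf
// i.e. "coll.<name>.<collnum>" subdirs in the working dir, where the
// trailing digits are the collnum_t.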
bool Collectiondb::loadAllCollRecs ( ) {
m_initializing = true;
char dname[1024];
// MDW: sprintf ( dname , "%s/collections/" , g_hostdb.m_dir );
sprintf ( dname , "%s" , g_hostdb.m_dir );
Dir d;
d.set ( dname );
if ( ! d.open ()) return log("admin: Could not load collection config "
"files.");
int32_t count = 0;
char *f;
while ( ( f = d.getNextFilename ( "*" ) ) ) {
// skip if first char not "coll."
if ( strncmp ( f , "coll." , 5 ) != 0 ) continue;
// must end on a digit (i.e. coll.main.0)
if ( ! is_digit (f[gbstrlen(f)-1]) ) continue;
// count them
count++;
}
// reset directory for another scan
d.set ( dname );
if ( ! d.open ()) return log("admin: Could not load collection config "
"files.");
// note it
//log(LOG_INFO,"db: loading collection config files.");
// . scan through all subdirs in the collections dir
// . they should be like, "coll.main/" and "coll.mycollection/"
while ( ( f = d.getNextFilename ( "*" ) ) ) {
// skip if first char not "coll."
if ( strncmp ( f , "coll." , 5 ) != 0 ) continue;
// must end on a digit (i.e. coll.main.0)
if ( ! is_digit (f[gbstrlen(f)-1]) ) continue;
// point to collection
char *coll = f + 5;
// NULL terminate at .
char *pp = strchr ( coll , '.' );
if ( ! pp ) continue;
*pp = '\0';
// get collnum
collnum_t collnum = atol ( pp + 1 );
// add it
if ( ! addExistingColl ( coll , collnum ) )
return false;
// swap it out if we got 100+ collections
// if ( count < 100 ) continue;
// CollectionRec *cr = getRec ( collnum );
// if ( cr ) cr->swapOut();
}
// if no existing recs added... add coll.main.0 always at startup
if ( m_numRecs == 0 ) {
log("admin: adding main collection.");
addNewColl ( "main",
0 , // customCrawl ,
NULL,
0 ,
true , // bool saveIt ,
// Parms.cpp reserves this so it can be sure
// to add the same collnum to every shard
0 );
}
m_initializing = false;
// note it
//log(LOG_INFO,"db: Loaded data for %"INT32" collections. Ranging from "
// "collection #0 to #%"INT32".",m_numRecsUsed,m_numRecs-1);
// update the time
//updateTime();
// don't clean the tree if just dumpin
//if ( isDump ) return true;
return true;
}
// after we've initialized all rdbs in main.cpp call this to clean out
// our rdb trees
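// (per the commit note at the top of this file, this is the startup
// cleanout that purges titledb/spiderdb tree nodes with illegal collnums)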
bool Collectiondb::cleanTrees ( ) {
// remove any nodes with illegal collnums
Rdb *r;
//r = g_indexdb.getRdb();
//r->m_tree.cleanTree ((char **)r->m_bases);
r = g_posdb.getRdb();
//r->m_tree.cleanTree ();//(char **)r->m_bases);
r->m_buckets.cleanBuckets();
//r = g_datedb.getRdb();
//r->m_tree.cleanTree ((char **)r->m_bases);
r = g_titledb.getRdb();
r->m_tree.cleanTree ();//(char **)r->m_bases);
//r = g_revdb.getRdb();
//r->m_tree.cleanTree ((char **)r->m_bases);
//r = g_sectiondb.getRdb();
//r->m_tree.cleanTree ((char **)r->m_bases);
//r = g_checksumdb.getRdb();
//r->m_tree.cleanTree ((char **)r->m_bases);
//r = g_tfndb.getRdb();
//r->m_tree.cleanTree ((char **)r->m_bases);
r = g_spiderdb.getRdb();
r->m_tree.cleanTree ();//(char **)r->m_bases);
r = g_doledb.getRdb();
r->m_tree.cleanTree ();//(char **)r->m_bases);
// success
return true;
}
/*
void Collectiondb::updateTime() {
// get time now in milliseconds
int64_t newTime = gettimeofdayInMilliseconds();
// change it
if ( m_lastUpdateTime == newTime ) newTime++;
// update it
m_lastUpdateTime = newTime;
// we need a save
m_needsSave = true;
}
*/
#include "Statsdb.h"
#include "Cachedb.h"
#include "Syncdb.h"
// same as addOldColl()
bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
int32_t i = collnum;
// ensure does not already exist in memory
collnum_t oldCollnum = getCollnum(coll);
if ( oldCollnum >= 0 ) {
g_errno = EEXIST;
log("admin: Trying to create collection \"%s\" but "
"already exists in memory. Do an ls on "
"the working dir to see if there are two "
"collection dirs with the same coll name",coll);
char *xx=NULL;*xx=0;
}
// also try by #, i've seen this happen too
CollectionRec *ocr = getRec ( i );
if ( ocr ) {
g_errno = EEXIST;
log("admin: Collection id %i is in use already by "
"%s, so we can not add %s. moving %s to trash."
,(int)i,ocr->m_coll,coll,coll);
SafeBuf cmd;
int64_t now = gettimeofdayInMilliseconds();
cmd.safePrintf ( "mv coll.%s.%i trash/coll.%s.%i.%"UINT64
, coll
,(int)i
, coll
,(int)i
, now );
//log("admin: %s",cmd.getBufStart());
gbsystem ( cmd.getBufStart() );
return true;
}
// create the record in memory
CollectionRec *cr = new (CollectionRec);
if ( ! cr )
return log("admin: Failed to allocated %"INT32" bytes for new "
"collection record for \"%s\".",
(int32_t)sizeof(CollectionRec),coll);
mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );
// set collnum right for g_parms.setToDefault() call just in case
// because before it was calling CollectionRec::reset() which
// was resetting the RdbBases for the m_collnum which was garbage
// and ended up resetting random collections' rdb. but now
// CollectionRec::CollectionRec() sets m_collnum to -1 so we should
// not need this!
//cr->m_collnum = oldCollnum;
// get the default.conf from working dir if there
g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );
strcpy ( cr->m_coll , coll );
cr->m_collLen = gbstrlen ( coll );
cr->m_collnum = i;
// point to this, so Rdb and RdbBase can reference it
coll = cr->m_coll;
//log("admin: loaded old coll \"%s\"",coll);
// load coll.conf file
if ( ! cr->load ( coll , i ) ) {
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
log("admin: Failed to load coll.%s.%"INT32"/coll.conf",coll,i);
delete ( cr );
// only NULL the slot if m_recs[] has actually grown to cover it
if ( m_recs && i < m_numRecs ) m_recs[i] = NULL;
return false;
}
if ( ! registerCollRec ( cr , false ) ) return false;
// always index spider status docs now for custom crawls
if ( cr->m_isCustomCrawl )
cr->m_indexSpiderReplies = true;
// and don't do link voting, will help speed up
if ( cr->m_isCustomCrawl ) {
cr->m_getLinkInfo = false;
cr->m_computeSiteNumInlinks = false;
// limit each shard to 5 spiders per collection to prevent
// ppl from spidering the web and hogging up resources
cr->m_maxNumSpiders = 5;
// diffbot download docs up to 50MB so we don't truncate
// things like sitemap.xml. but keep regular html pages
// 1MB
cr->m_maxTextDocLen = 1024*1024;
// xml, pdf, etc can be this. 50MB
cr->m_maxOtherDocLen = 50000000;
}
// we need to compile the regular expressions or update the url
// filters with new logic that maps crawlbot parms to url filters
return cr->rebuildUrlFilters ( );
}
// . add a new rec
// . returns false and sets g_errno on error
// . was addRec()
// . "isDump" is true if we don't need to initialize all the rdbs etc
// because we are doing a './gb dump ...' cmd to dump out data from
// one Rdb which we will custom initialize in main.cpp where the dump
// code is. like for instance, posdb.
// . "customCrawl" is 0 for a regular collection, 1 for a simple crawl
// 2 for a bulk job. diffbot terminology.
bool Collectiondb::addNewColl ( char *coll ,
char customCrawl ,
char *cpc ,
int32_t cpclen ,
bool saveIt ,
// Parms.cpp reserves this so it can be sure
// to add the same collnum to every shard
collnum_t newCollnum ) {
//do not send add/del coll request until we are in sync with shard!!
// just return ETRYAGAIN for the parmlist...
// ensure coll name is legit
char *p = coll;
for ( ; *p ; p++ ) {
if ( is_alnum_a(*p) ) continue;
if ( *p == '-' ) continue;
if ( *p == '_' ) continue; // underscore now allowed
break;
}
if ( *p ) {
g_errno = EBADENGINEER;
log("admin: \"%s\" is a malformed collection name because it "
"contains the '%c' character.",coll,*p);
return false;
}
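	// e.g. "my-coll_2" passes, while "my coll" or "my/coll" is
	// rejected with the bad character logged above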
// . scan for holes
// . i is also known as the collection id
//int32_t i = (int32_t)newCollnum;
// no longer fill empty slots because if they do a reset then
// a new rec right away it will be filled with msg4 recs not
// destined for it. Later we will have to recycle some how!!
//else for ( i = 0 ; i < m_numRecs ; i++ ) if ( ! m_recs[i] ) break;
// right now we #define collnum_t int16_t. so do not breach that!
//if ( m_numRecs < 0x7fff ) {
// // set it
// i = m_numRecs;
// // claim it
// // we don't do it here, because we check i below and
// // increment m_numRecs below.
// //m_numRecs++;
//}
// TODO: scan for holes here...
//else {
if ( newCollnum < 0 ) { char *xx=NULL;*xx=0; }
// ceiling?
//int64_t maxColls = 1LL<<(sizeof(collnum_t)*8);
//if ( i >= maxColls ) {
// g_errno = ENOBUFS;
// return log("admin: Limit of %"INT64" collection reached. "
// "Collection not created.",maxColls);
//}
// if empty... bail, no longer accepted, use "main"
if ( ! coll || !coll[0] ) {
g_errno = EBADENGINEER;
return log("admin: Trying to create a new collection "
"but no collection name provided. Use the \"c\" "
"cgi parameter to specify it.");
}
// or if too big
if ( gbstrlen(coll) > MAX_COLL_LEN ) {
g_errno = ENOBUFS;
return log("admin: Trying to create a new collection "
"whose name \"%s\" of %i chars is longer than the "
"max of %"INT32" chars.",coll,gbstrlen(coll),
(int32_t)MAX_COLL_LEN);
}
// ensure does not already exist in memory
if ( getCollnum ( coll ) >= 0 ) {
g_errno = EEXIST;
log("admin: Trying to create collection \"%s\" but "
"already exists in memory.",coll);
// just let it pass...
g_errno = 0 ;
return true;
}
// MDW: ensure not created on disk since time of last load
char dname[512];
sprintf(dname, "%scoll.%s.%"INT32"/",g_hostdb.m_dir,coll,(int32_t)newCollnum);
DIR *dir = opendir ( dname );
if ( dir ) closedir ( dir );
if ( dir ) {
g_errno = EEXIST;
return log("admin: Trying to create collection %s but "
"directory %s already exists on disk.",coll,dname);
}
// create the record in memory
CollectionRec *cr = new (CollectionRec);
if ( ! cr )
return log("admin: Failed to allocated %"INT32" bytes for new "
"collection record for \"%s\".",
(int32_t)sizeof(CollectionRec),coll);
// register the mem
mnew ( cr , sizeof(CollectionRec) , "CollectionRec" );
// get copy collection
//CollectionRec *cpcrec = NULL;
//if ( cpc && cpc[0] ) cpcrec = getRec ( cpc , cpclen );
//if ( cpc && cpc[0] && ! cpcrec )
// log("admin: Collection \"%s\" to copy config from does not "
// "exist.",cpc);
// set collnum right for g_parms.setToDefault() call
//cr->m_collnum = newCollnum;
// . get the default.conf from working dir if there
// . i think this calls CollectionRec::reset() which resets all of its
// rdbbase classes for its collnum so m_collnum needs to be right
//g_parms.setToDefault( (char *)cr );
// get the default.conf from working dir if there
//g_parms.setToDefault( (char *)cr , OBJ_COLL );
g_parms.setToDefault( (char *)cr , OBJ_COLL , cr );
// pin the old same-language weight for qatest123 so its search
// results stay reproducible for QA comparisons
if ( strcmp(coll,"qatest123") == 0 )
cr->m_sameLangWeight = 20.0;
/*
// the default conf file
char tmp1[1024];
sprintf ( tmp1 , "%sdefault.conf" , g_hostdb.m_dir );
// . set our parms from the file.
// . accepts OBJ_COLLECTIONREC or OBJ_CONF
g_parms.setFromFile ( cr , NULL , tmp1 );
*/
// this will override all
// if ( cpcrec ) {
// // copy it, but not the timedb hashtable, etc.
// int32_t size = (char *)&(cpcrec->m_END_COPY) - (char *)cpcrec;
// // JAB: bad gbmemcpy - no donut!
// // this is not how objects are supposed to be copied!!!
// gbmemcpy ( cr , cpcrec , size);
// }
// set coll id and coll name for coll id #i
strcpy ( cr->m_coll , coll );
cr->m_collLen = gbstrlen ( coll );
cr->m_collnum = newCollnum;
// point to this, so Rdb and RdbBase can reference it
coll = cr->m_coll;
//
// BEGIN NEW CODE
//
//
// get token and crawlname if customCrawl is 1 or 2
//
char *token = NULL;
char *crawl = NULL;
SafeBuf tmp;
// . return true with g_errno set on error
// . if we fail to set a parm right we should force ourselves
// out of sync
if ( customCrawl ) {
if ( ! tmp.safeStrcpy ( coll ) ) return true;
token = tmp.getBufStart();
// diffbot coll name format is <token>-<crawlname>
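		// e.g. a hypothetical coll "ab12cd-news" splits into
		// token="ab12cd" and crawl="news" once the first '-'
		// is NULL-terminated below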
char *h = strchr ( tmp.getBufStart() , '-' );
if ( ! h ) {
log("crawlbot: bad custom collname");
g_errno = EBADENGINEER;
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
return true;
}
*h = '\0';
crawl = h + 1;
if ( ! crawl[0] ) {
log("crawlbot: bad custom crawl name");
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
g_errno = EBADENGINEER;
return true;
}
// or if too big!
if ( gbstrlen(crawl) > 30 ) {
log("crawlbot: crawlbot crawl NAME is over 30 chars");
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
g_errno = EBADENGINEER;
return true;
}
}
//log("parms: added new collection \"%s\"", collName );
cr->m_maxToCrawl = -1;
cr->m_maxToProcess = -1;
if ( customCrawl ) {
// always index spider status docs now
cr->m_indexSpiderReplies = true;
// remember the token
cr->m_diffbotToken.set ( token );
cr->m_diffbotCrawlName.set ( crawl );
// bring this back
cr->m_diffbotApiUrl.set ( "" );
cr->m_diffbotUrlCrawlPattern.set ( "" );
cr->m_diffbotUrlProcessPattern.set ( "" );
cr->m_diffbotPageProcessPattern.set ( "" );
cr->m_diffbotUrlCrawlRegEx.set ( "" );
cr->m_diffbotUrlProcessRegEx.set ( "" );
cr->m_diffbotMaxHops = -1;
cr->m_spiderStatus = SP_INITIALIZING;
// do not spider more than this many urls total.
// -1 means no max.
cr->m_maxToCrawl = 100000;
// do not process more than this. -1 means no max.
cr->m_maxToProcess = 100000;
// -1 means no max
cr->m_maxCrawlRounds = -1;
// diffbot download docs up to 10MB so we don't truncate
// things like sitemap.xml
cr->m_maxTextDocLen = 10000000;
cr->m_maxOtherDocLen = 10000000;
// john wants deduping on by default to avoid
// processing similar pgs
cr->m_dedupingEnabled = true;
// show the ban links in the search results. the
// collection name is cryptographic enough to show that
cr->m_isCustomCrawl = customCrawl;
cr->m_diffbotOnlyProcessIfNewUrl = true;
// default respider to off
cr->m_collectiveRespiderFrequency = 0.0;
//cr->m_restrictDomain = true;
// reset the crawl stats
// always turn off gigabits so &s=1000 can do summary skipping
cr->m_docsToScanForTopics = 0;
// turn off link voting, etc. to speed up
cr->m_getLinkInfo = false;
cr->m_computeSiteNumInlinks = false;
}
// . use the NoCore time variant here: getTimeGlobal() would core if a
//   host was dead and then, when it came back up, host #0's parms.cpp
//   told it to add a new coll
cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
cr->m_diffbotCrawlEndTime = 0;
// . just the basics on these for now
// . if certain parms are changed then the url filters
// must be rebuilt, as well as possibly the waiting tree!!!
// . need to set m_urlFiltersHavePageCounts etc.
cr->rebuildUrlFilters ( );
cr->m_useRobotsTxt = true;
// reset crawler stats.they should be loaded from crawlinfo.txt
memset ( &cr->m_localCrawlInfo , 0 , sizeof(CrawlInfo) );
memset ( &cr->m_globalCrawlInfo , 0 , sizeof(CrawlInfo) );
// note that
log("colldb: initial revival for %s",cr->m_coll);
// . assume we got some urls ready to spider
// . Spider.cpp will wait SPIDER_DONE_TIME seconds and if it has no
// urls it spidered in that time these will get set to 0 and it
// will send out an email alert if m_sentCrawlDoneAlert is not true.
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = 1;
// set some defaults. max spiders for all priorities in this
// collection. NO, default is in Parms.cpp.
//cr->m_maxNumSpiders = 10;
//cr->m_needsSave = 1;
// start the spiders!
cr->m_spideringEnabled = true;
// override this?
saveIt = true;
//
// END NEW CODE
//
//log("admin: adding coll \"%s\" (new=%"INT32")",coll,(int32_t)isNew);
// MDW: create the new directory
retry22:
if ( ::mkdir ( dname ,
getDirCreationFlags() ) ) {
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) ) {
// valgrind?
if ( errno == EINTR ) goto retry22;
g_errno = errno;
mdelete ( cr , sizeof(CollectionRec) , "CollectionRec" );
delete ( cr );
return log("admin: Creating directory %s had error: "
"%s.", dname,mstrerror(g_errno));
}
// save it into this dir... might fail!
if ( saveIt && ! cr->save() ) {
mdelete ( cr , sizeof(CollectionRec) , "CollectionRec" );
delete ( cr );
return log("admin: Failed to save file %s: %s",
dname,mstrerror(g_errno));
}
if ( ! registerCollRec ( cr , true ) )
return false;
// add the rdbbases for this coll, CollectionRec::m_bases[]
if ( ! addRdbBasesForCollRec ( cr ) )
return false;
return true;
}
void CollectionRec::setBasePtr ( char rdbId , class RdbBase *base ) {
// if in the process of swapping in, this will be false...
//if ( m_swappedOut ) { char *xx=NULL;*xx=0; }
if ( rdbId < 0 || rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
// Rdb::deleteColl() will call this even though we are swapped in
// but it calls it with "base" set to NULL after it nukes the RdbBase
// so check if base is null here.
if ( base && m_bases[ (unsigned char)rdbId ]){ char *xx=NULL;*xx=0; }
m_bases [ (unsigned char)rdbId ] = base;
}
RdbBase *CollectionRec::getBasePtr ( char rdbId ) {
if ( rdbId < 0 || rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
return m_bases [ (unsigned char)rdbId ];
}
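// m_bases[] is indexed by rdbId, e.g. m_bases[RDB_POSDB] holds this
// collection's posdb RdbBase (NULL while swapped out or being nuked)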
static bool s_inside = false;
// . returns NULL w/ g_errno set on error.
// . TODO: ensure not called from in thread, not thread safe
RdbBase *CollectionRec::getBase ( char rdbId ) {
if ( s_inside ) { char *xx=NULL;*xx=0; }
if ( ! m_swappedOut ) return m_bases[(unsigned char)rdbId];
log("cdb: swapin collnum=%"INT32"",(int32_t)m_collnum);
// sanity!
if ( g_threads.amThread() ) { char *xx=NULL;*xx=0; }
s_inside = true;
// turn off quickpoll to avoid getbase() being re-called and
// coring from s_inside being true
int32_t saved = g_conf.m_useQuickpoll;
g_conf.m_useQuickpoll = false;
// load them back in. return NULL w/ g_errno set on error.
if ( ! g_collectiondb.addRdbBasesForCollRec ( this ) ) {
log("coll: error swapin: %s",mstrerror(g_errno));
g_conf.m_useQuickpoll = saved;
s_inside = false;
return NULL;
}
g_conf.m_useQuickpoll = saved;
s_inside = false;
g_collectiondb.m_numCollsSwappedOut--;
m_swappedOut = false;
log("coll: swapin was successful for collnum=%"INT32"",(int32_t)m_collnum);
return m_bases[(unsigned char)rdbId];
}
bool CollectionRec::swapOut ( ) {
if ( m_swappedOut ) return true;
log("cdb: swapout collnum=%"INT32"",(int32_t)m_collnum);
// free all RdbBases in each rdb
for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
Rdb *rdb = g_process.m_rdbs[i];
// this frees all the RdbBase::m_files and m_maps for the base
rdb->resetBase ( m_collnum );
}
// now free each base itself
for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
RdbBase *base = m_bases[i];
if ( ! base ) continue;
mdelete (base, sizeof(RdbBase), "Rdb Coll");
delete (base);
m_bases[i] = NULL;
}
m_swappedOut = true;
g_collectiondb.m_numCollsSwappedOut++;
return true;
}
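// minimal swap lifecycle sketch (the collnum value is hypothetical):
//   CollectionRec *cr = g_collectiondb.getRec ( (collnum_t)5 );
//   cr->swapOut();                     // frees m_bases[] to save memory
//   RdbBase *b = cr->getBase ( RDB_POSDB ); // swaps back in on demand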
// . called only by addNewColl() and by addExistingColl()
bool Collectiondb::registerCollRec ( CollectionRec *cr , bool isNew ) {
// add m_recs[] and to hashtable
if ( ! setRecPtr ( cr->m_collnum , cr ) )
return false;
return true;
}
// called at startup: add RdbBases for each loaded CollectionRec that is
// not swapped out
bool Collectiondb::addRdbBaseToAllRdbsForEachCollRec ( ) {
for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
CollectionRec *cr = m_recs[i];
if ( ! cr ) continue;
// skip if swapped out
if ( cr->m_swappedOut ) continue;
// add rdb base files etc. for it
addRdbBasesForCollRec ( cr );
}
// now clean the trees. moved this into here from
// addRdbBasesForCollRec() since we call addRdbBasesForCollRec()
// now from getBase() to load on-demand for saving memory
cleanTrees();
return true;
}
bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {
char *coll = cr->m_coll;
//////
//
// if we are doing a dump from the command line, skip this stuff
//
//////
if ( g_dumpMode ) return true;
// tell rdbs to add one, too
//if ( ! g_indexdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_posdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_datedb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_titledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_revdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_sectiondb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_tagdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_catdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_checksumdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
//if ( ! g_tfndb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_clusterdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_linkdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_spiderdb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
if ( ! g_doledb.getRdb()->addRdbBase1 ( coll ) ) goto hadError;
// now clean the trees
//cleanTrees();
// debug message
//log ( LOG_INFO, "db: verified collection \"%s\" (%"INT32").",
// coll,(int32_t)cr->m_collnum);
// tell SpiderCache about this collection, it will create a
// SpiderCollection class for it.
//g_spiderCache.reset1();
// success
return true;
hadError:
log("db: error registering coll: %s",mstrerror(g_errno));
return false;
}
/*
bool Collectiondb::isAdmin ( HttpRequest *r , TcpSocket *s ) {
if ( r->getLong("admin",1) == 0 ) return false;
if ( g_conf.isMasterAdmin ( s , r ) ) return true;
char *c = r->getString ( "c" );
CollectionRec *cr = getRec ( c );
if ( ! cr ) return false;
return g_users.hasPermission ( r , PAGE_SEARCH );
//return cr->hasPermission ( r , s );
}
void savingCheckWrapper1 ( int fd , void *state ) {
WaitEntry *we = (WaitEntry *)state;
// no state?
if ( ! we ) { log("colldb: we1 is null"); return; }
// unregister too
g_loop.unregisterSleepCallback ( state,savingCheckWrapper1 );
// if it blocked again i guess tree is still saving
if ( ! g_collectiondb.resetColl ( we->m_coll ,
we ,
we->m_purgeSeeds))
return;
// all done
we->m_callback ( we->m_state );
}
void savingCheckWrapper2 ( int fd , void *state ) {
WaitEntry *we = (WaitEntry *)state;
// no state?
if ( ! we ) { log("colldb: we2 is null"); return; }
// unregister too
g_loop.unregisterSleepCallback ( state,savingCheckWrapper2 );
// if it blocked again i guess tree is still saving
if ( ! g_collectiondb.deleteRec ( we->m_coll , we ) ) return;
// all done
we->m_callback ( we->m_state );
}
*/
/*
// delete all records checked in the list
bool Collectiondb::deleteRecs ( HttpRequest *r ) {
for ( int32_t i = 0 ; i < r->getNumFields() ; i++ ) {
char *f = r->getField ( i );
if ( strncmp ( f , "del" , 3 ) != 0 ) continue;
char *coll = f + 3;
//if ( ! is_digit ( f[3] ) ) continue;
//int32_t h = atol ( f + 3 );
deleteRec ( coll , NULL );
}
return true;
}
*/
/*
// . delete a collection
// . this uses blocking unlinks, may make non-blocking later
// . returns false if blocked, true otherwise
bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
// force on for now
//deleteTurkdb = true;
// no spiders can be out. they may be referencing the CollectionRec
// in XmlDoc.cpp... quite likely.
//if ( g_conf.m_spideringEnabled ||
// g_spiderLoop.m_numSpidersOut > 0 ) {
// log("admin: Can not delete collection while "
// "spiders are enabled or active.");
// return false;
//}
// ensure it's not NULL
if ( ! coll ) {
log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
g_errno = ENOTFOUND;
return true;
}
// find the rec for this collection
collnum_t collnum = getCollnum ( coll );
return deleteRec2 ( collnum , we );
}
*/
// if there is an outstanding disk read thread or merge thread then
// Spider.cpp will handle the delete in the callback.
// this is now tryToDeleteSpiderColl in Spider.cpp
/*
void Collectiondb::deleteSpiderColl ( SpiderColl *sc ) {
sc->m_deleteMyself = true;
// if not currently being accessed nuke it now
if ( ! sc->m_msg5.m_waitingForList &&
! sc->m_msg5b.m_waitingForList &&
! sc->m_msg1.m_mcast.m_inUse ) {
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
delete ( sc );
return;
}
}
*/
/// this deletes the collection, not just part of a reset.
bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
// do not allow this if in repair mode
if ( g_repair.isRepairActive() && g_repair.m_collnum == collnum ) {
log("admin: Can not delete collection while in repair mode.");
g_errno = EBADENGINEER;
return true;
}
// bitch if not found
if ( collnum < 0 ) {
g_errno = ENOTFOUND;
log(LOG_LOGIC,"admin: Collection #%"INT32" is bad, "
"delete failed.",(int32_t)collnum);
return true;
}
CollectionRec *cr = m_recs [ collnum ];
if ( ! cr ) {
log("admin: Collection id problem. Delete failed.");
g_errno = ENOTFOUND;
return true;
}
if ( g_process.isAnyTreeSaving() ) {
// note it
log("admin: tree is saving. waiting2.");
// all done
return false;
}
// spiders off
//if ( cr->m_spiderColl &&
// cr->m_spiderColl->getTotalOutstandingSpiders() > 0 ) {
// log("admin: Can not delete collection while "
// "spiders are oustanding for collection. Turn off "
// "spiders and wait for them to exit.");
// return false;
//}
char *coll = cr->m_coll;
// note it
log(LOG_INFO,"db: deleting coll \"%s\" (%"INT32")",coll,
(int32_t)cr->m_collnum);
// we need a save
m_needsSave = true;
// nuke doleiptable and waitingtree and waitingtable
/*
SpiderColl *sc = g_spiderCache.getSpiderColl ( collnum );
sc->m_waitingTree.clear();
sc->m_waitingTable.clear();
sc->m_doleIpTable.clear();
g_spiderLoop.m_lockTable.clear();
g_spiderLoop.m_lockCache.clear(0);
sc->m_lastDownloadCache.clear(collnum);
*/
// CAUTION: tree might be in the middle of saving
// we deal with this in Process.cpp now
// remove from spider cache, tell it to sync up with collectiondb
//g_spiderCache.reset1();
// . TODO: remove from g_sync
// . remove from all rdbs
//g_indexdb.getRdb()->delColl ( coll );
g_posdb.getRdb()->delColl ( coll );
//g_datedb.getRdb()->delColl ( coll );
g_titledb.getRdb()->delColl ( coll );
//g_revdb.getRdb()->delColl ( coll );
//g_sectiondb.getRdb()->delColl ( coll );
g_tagdb.getRdb()->delColl ( coll );
// let's preserve the tags... they have all the turk votes in them
//if ( deleteTurkdb ) {
//}
//g_catdb.getRdb()->delColl ( coll );
//g_checksumdb.getRdb()->delColl ( coll );
g_spiderdb.getRdb()->delColl ( coll );
g_doledb.getRdb()->delColl ( coll );
//g_tfndb.getRdb()->delColl ( coll );
g_clusterdb.getRdb()->delColl ( coll );
g_linkdb.getRdb()->delColl ( coll );
// reset spider info
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(collnum);
if ( sc ) {
// remove locks from lock table:
sc->clearLocks();
//sc->m_collnum = newCollnum;
//sc->reset();
// you have to set this for tryToDeleteSpiderColl to
// actually have a shot at deleting it
sc->m_deleteMyself = true;
// cr will be invalid shortly after this
// MDW: this is causing the core...
// use fake ptrs for easier debugging
//sc->m_cr = (CollectionRec *)0x99999;//NULL;
//sc->m_cr = NULL;
sc->setCollectionRec ( NULL );
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
tryToDeleteSpiderColl ( sc ,"10");
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//delete ( sc );
// don't let cr reference us anymore, sc is on deathrow
// and "cr" is delete below!
//cr->m_spiderColl = (SpiderColl *)0x8888;//NULL;
cr->m_spiderColl = NULL;
}
// the bulk urls file too i guess
if ( cr->m_isCustomCrawl == 2 && g_hostdb.m_hostId == 0 ) {
SafeBuf bu;
bu.safePrintf("%sbulkurls-%s.txt",
g_hostdb.m_dir , cr->m_coll );
File bf;
bf.set ( bu.getBufStart() );
if ( bf.doesExist() ) bf.unlink();
}
// now remove from list of collections that might need a disk merge
removeFromMergeLinkedList ( cr );
//////
//
// remove from m_recs[]
//
//////
setRecPtr ( cr->m_collnum , NULL );
// free it
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
delete ( cr );
// do not do this here in case spiders were outstanding
// and they added a new coll right away and it ended up getting
// recs from the deleted coll!!
//while ( ! m_recs[m_numRecs-1] ) m_numRecs--;
// update the time
//updateTime();
// done
return true;
}
//#include "PageTurk.h"
/*
// . reset a collection
// . returns false if blocked and will call callback
bool Collectiondb::resetColl ( char *coll , bool purgeSeeds) {
// ensure it's not NULL
if ( ! coll ) {
log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
g_errno = ENOCOLLREC;
return true;
}
// get the CollectionRec for "qatest123"
CollectionRec *cr = getRec ( coll ); // "qatest123" );
// must be there. if not, we create test i guess
if ( ! cr ) {
log("db: could not get coll rec \"%s\" to reset", coll);
char *xx=NULL;*xx=0;
}
return resetColl2 ( cr->m_collnum, purgeSeeds);
}
*/
// ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
bool Collectiondb::growRecPtrBuf ( collnum_t collnum ) {
// an add, make sure big enough
int32_t need = ((int32_t)collnum+1)*sizeof(CollectionRec *);
int32_t have = m_recPtrBuf.getLength();
int32_t need2 = need - have;
// if already big enough
if ( need2 <= 0 ) {
m_recs [ collnum ] = NULL;
return true;
}
m_recPtrBuf.setLabel ("crecptrb");
// . true here means to clear the new space to zeroes
// . this shit works based on m_length not m_capacity
if ( ! m_recPtrBuf.reserve ( need2 ,NULL, true ) ) {
log("admin: error growing rec ptr buf2.");
return false;
}
// sanity
if ( m_recPtrBuf.getCapacity() < need ) { char *xx=NULL;*xx=0; }
// set it
m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();
// update length of used bytes in case we re-alloc
m_recPtrBuf.setLength ( need );
// re-max
int32_t max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);
// sanity
if ( collnum >= max ) { char *xx=NULL;*xx=0; }
// initialize slot
m_recs [ collnum ] = NULL;
return true;
}
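// usage sketch (the collnum value is hypothetical):
//   growRecPtrBuf ( (collnum_t)7 );
// guarantees m_recs[0..7] are addressable, NULLing any new slots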
bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
// first time init hashtable that maps coll to collnum
if ( g_collTable.m_numSlots == 0 &&
! g_collTable.set(8,sizeof(collnum_t), 256,NULL,0,
false,0,"nhshtbl"))
return false;
// sanity
if ( collnum < 0 ) { char *xx=NULL;*xx=0; }
// sanity
int32_t max = m_recPtrBuf.getCapacity() / sizeof(CollectionRec *);
// set it
m_recs = (CollectionRec **)m_recPtrBuf.getBufStart();
// tell spiders to re-update the active list
g_spiderLoop.m_activeListValid = false;
g_spiderLoop.m_activeListModified = true;
// a delete?
if ( ! cr ) {
// sanity
if ( collnum >= max ) { char *xx=NULL;*xx=0; }
// get what's there
CollectionRec *oc = m_recs[collnum];
// let it go
m_recs[collnum] = NULL;
// if nothing already, done
if ( ! oc ) return true;
// tally it up
m_numRecsUsed--;
// delete key
int64_t h64 = hash64n(oc->m_coll);
// if in the hashtable UNDER OUR COLLNUM then nuke it
// otherwise, we might be called from resetColl2()
void *vp = g_collTable.getValue ( &h64 );
if ( ! vp ) return true;
collnum_t ct = *(collnum_t *)vp;
if ( ct != collnum ) return true;
g_collTable.removeKey ( &h64 );
return true;
}
// ensure m_recs[] is big enough for m_recs[collnum] to be a ptr
if ( ! growRecPtrBuf ( collnum ) )
return false;
// sanity
if ( cr->m_collnum != collnum ) { char *xx=NULL;*xx=0; }
// add to hash table to map name to collnum_t
int64_t h64 = hash64n(cr->m_coll);
// debug
//log("coll: adding key %"INT64" for %s",h64,cr->m_coll);
if ( ! g_collTable.addKey ( &h64 , &collnum ) )
return false;
// install the ptr
m_recs[collnum] = cr;
// count it
m_numRecsUsed++;
//log("coll: adding key4 %"UINT64" for coll \"%s\" (%"INT32")",h64,cr->m_coll,
// (int32_t)i);
// reserve it
if ( collnum >= m_numRecs ) m_numRecs = collnum + 1;
// sanity to make sure collectionrec ptrs are legit: dereference each
// one so a garbage ptr faults here; the comparison itself is unused
for ( int32_t j = 0 ; j < m_numRecs ; j++ ) {
	if ( ! m_recs[j] ) continue;
	if ( m_recs[j]->m_collnum == 1 ) continue;
}
// update the time
//updateTime();
return true;
}
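// note: getRec(name)/getCollnum(name) resolve through g_collTable, so a
// collection is only findable by name once setRecPtr() has added its
// hash64n(name) -> collnum mapping above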
// moves a file by first trying rename(), then copying, since rename()
// fails across devices (EXDEV)
// returns 0 on success
int mv(char* src, char* dest) {
int status = rename( src , dest );
if (status == 0)
return 0;
FILE *fsrc, *fdest;
fsrc = fopen(src, "r");
if (fsrc == NULL)
return -1;
fdest = fopen(dest, "w");
if (fdest == NULL) {
fclose(fsrc);
return -1;
}
const int BUF_SIZE = 1024;
char buf[BUF_SIZE];
while (!ferror(fdest) && !ferror(fsrc) && !feof(fsrc)) {
	size_t nread = fread(buf, 1, BUF_SIZE, fsrc);
	fwrite(buf, 1, nread, fdest);
}
// check stream errors BEFORE fclose(); calling ferror() on a closed
// FILE* is undefined behavior
int hadError = ferror(fdest) || ferror(fsrc);
fclose(fsrc);
fclose(fdest);
if (hadError)
	return -1;
remove(src);
return 0;
}
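// usage sketch (paths hypothetical):
//   if ( mv ( "/a/coll.x.0/bulkurls.txt" ,
//             "/b/coll.x.1/bulkurls.txt" ) != 0 )
//           log("admin: mv of bulkurls failed");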
// . returns false if we need a re-call, true if we completed
// . returns true with g_errno set on error
bool Collectiondb::resetColl2( collnum_t oldCollnum,
collnum_t newCollnum,
//WaitEntry *we,
bool purgeSeeds){
// save parms in case we block
//we->m_purgeSeeds = purgeSeeds;
// now must be "qatest123" only for now
//if ( strcmp(coll,"qatest123") ) { char *xx=NULL;*xx=0; }
// no spiders can be out. they may be referencing the CollectionRec
// in XmlDoc.cpp... quite likely.
//if ( g_conf.m_spideringEnabled ||
// g_spiderLoop.m_numSpidersOut > 0 ) {
// log("admin: Can not delete collection while "
// "spiders are enabled or active.");
// return false;
//}
// do not allow this if in repair mode
if ( g_repair.isRepairActive() && g_repair.m_collnum == oldCollnum ) {
log("admin: Can not delete collection while in repair mode.");
g_errno = EBADENGINEER;
return true;
}
//log("admin: resetting collnum %"INT32"",(int32_t)oldCollnum);
// CAUTION: tree might be in the middle of saving
// we deal with this in Process.cpp now
if ( g_process.isAnyTreeSaving() ) {
// we could not complete...
return false;
}
CollectionRec *cr = m_recs [ oldCollnum ];
// let's reset crawlinfo crap
cr->m_globalCrawlInfo.reset();
cr->m_localCrawlInfo.reset();
//collnum_t oldCollnum = cr->m_collnum;
//collnum_t newCollnum = m_numRecs;
// in case of bulk job, be sure to save list of spots
// copy existing list to a /tmp, where they will later be transferred back to the new folder
// now i just store in the root working dir... MDW
/*
char oldbulkurlsname[1036];
snprintf(oldbulkurlsname, 1036, "%scoll.%s.%"INT32"/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(int32_t)oldCollnum);
char newbulkurlsname[1036];
snprintf(newbulkurlsname, 1036, "%scoll.%s.%"INT32"/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(int32_t)newCollnum);
char tmpbulkurlsname[1036];
snprintf(tmpbulkurlsname, 1036, "/tmp/coll.%s.%"INT32".bulkurls.txt",cr->m_coll,(int32_t)oldCollnum);
if (cr->m_isCustomCrawl == 2)
mv( oldbulkurlsname , tmpbulkurlsname );
*/
// reset spider info
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
if ( sc ) {
// remove locks from lock table:
sc->clearLocks();
// don't do this anymore, just nuke it in case
// m_populatingDoledb was true etc. there are too many
// flags to worry about
//sc->m_collnum = newCollnum;
//sc->reset();
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
tryToDeleteSpiderColl ( sc,"11" );
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//delete ( sc );
cr->m_spiderColl = NULL;
}
// reset spider round
cr->m_spiderRoundNum = 0;
cr->m_spiderRoundStartTime = 0;
cr->m_spiderStatus = SP_INITIALIZING; // this is 0
//cr->m_spiderStatusMsg = NULL;
// reset seed buf
if ( purgeSeeds ) {
// free the buffer of seed urls
cr->m_diffbotSeeds.purge();
// reset seed dedup table
HashTableX *ht = &cr->m_seedHashTable;
ht->reset();
}
// so XmlDoc.cpp can detect if the collection was reset since it
// launched its spider:
cr->m_lastResetCount++;
if ( newCollnum >= m_numRecs ) m_numRecs = (int32_t)newCollnum + 1;
// advance sanity check. did we wrap around?
// right now we #define collnum_t int16_t
if ( m_numRecs > 0x7fff ) { char *xx=NULL;*xx=0; }
// make a new collnum so records in transit will not be added
// to any rdb...
cr->m_collnum = newCollnum;
// update the timestamps since we are restarting/resetting
cr->m_diffbotCrawlStartTime = getTimeGlobalNoCore();
cr->m_diffbotCrawlEndTime = 0;
////////
//
// ALTER m_recs[] array
//
////////
// Rdb::resetColl() needs to know the new cr so it can move
// the RdbBase into cr->m_bases[rdbId] array. recycling.
setRecPtr ( newCollnum , cr );
// a new directory then since we changed the collnum
char dname[512];
sprintf(dname, "%scoll.%s.%"INT32"/",
g_hostdb.m_dir,
cr->m_coll,
(int32_t)newCollnum);
DIR *dir = opendir ( dname );
if ( dir )
closedir ( dir );
if ( dir ) {
//g_errno = EEXIST;
log("admin: Trying to create collection %s but "
"directory %s already exists on disk.",cr->m_coll,dname);
}
if ( ::mkdir ( dname ,
getDirCreationFlags() ) ) {
// S_IRUSR | S_IWUSR | S_IXUSR |
// S_IRGRP | S_IWGRP | S_IXGRP |
// S_IROTH | S_IXOTH ) ) {
// valgrind?
//if ( errno == EINTR ) goto retry22;
//g_errno = errno;
log("admin: Creating directory %s had error: "
"%s.", dname,mstrerror(g_errno));
}
// be sure to copy back the bulk urls for bulk jobs
// MDW: now i just store that file in the root working dir
//if (cr->m_isCustomCrawl == 2)
// mv( tmpbulkurlsname, newbulkurlsname );
// . unlink all the *.dat and *.map files for this coll in its subdir
// . remove all recs from this collnum from m_tree/m_buckets
// . updates RdbBase::m_collnum
// . so for the tree it just needs to mark the old collnum recs
// with a collnum -1 in case it is saving...
g_posdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_titledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_tagdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_spiderdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_doledb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_clusterdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
g_linkdb.getRdb()->deleteColl ( oldCollnum , newCollnum );
// reset crawl status too!
cr->m_spiderStatus = SP_INITIALIZING;
// . set m_recs[oldCollnum] to NULL and remove from hash table
// . do after calls to deleteColl() above so it wont crash
setRecPtr ( oldCollnum , NULL );
// save coll.conf to new directory
cr->save();
// and clear the robots.txt cache in case we recently spidered a
// robots.txt, we don't want to use it, we want to use the one we
// have in the test-parser subdir so we are consistent
//RdbCache *robots = Msg13::getHttpCacheRobots();
//RdbCache *others = Msg13::getHttpCacheOthers();
// clear() was removed do to possible corruption
//robots->clear ( oldCollnum );
//others->clear ( oldCollnum );
//g_templateTable.reset();
//g_templateTable.save( g_hostdb.m_dir , "turkedtemplates.dat" );
// repopulate CollectionRec::m_sortByDateTable. should be empty
// since we are resetting here.
//initSortByDateTable ( coll );
// done
return true;
}
// a hack function
bool addCollToTable ( char *coll , collnum_t collnum ) {
// readd it to the hashtable that maps name to collnum too
int64_t h64 = hash64n(coll);
g_collTable.set(8,sizeof(collnum_t), 256,NULL,0,
false,0,"nhshtbl");
return g_collTable.addKey ( &h64 , &collnum );
}
// get coll rec specified in the HTTP request
CollectionRec *Collectiondb::getRec ( HttpRequest *r , bool useDefaultRec ) {
char *coll = r->getString ( "c" );
if ( coll && ! coll[0] ) coll = NULL;
// maybe it is crawlbot?
char *name = NULL;
char *token = NULL;
if ( ! coll ) {
name = r->getString("name");
token = r->getString("token");
}
char tmp[MAX_COLL_LEN+1];
if ( ! coll && token && name ) {
snprintf(tmp,MAX_COLL_LEN,"%s-%s",token,name);
coll = tmp;
}
// default to main first
if ( ! coll && useDefaultRec ) {
CollectionRec *cr = g_collectiondb.getRec("main");
if ( cr ) return cr;
}
// try next in line
if ( ! coll && useDefaultRec ) {
return getFirstRec ();
}
// give up?
if ( ! coll ) return NULL;
//if ( ! coll || ! coll[0] ) coll = g_conf.m_defaultColl;
return g_collectiondb.getRec ( coll );
}
char *Collectiondb::getDefaultColl ( HttpRequest *r ) {
char *coll = r->getString ( "c" );
if ( coll && ! coll[0] ) coll = NULL;
if ( coll ) return coll;
CollectionRec *cr = NULL;
// default to main first
if ( ! coll ) {
cr = g_collectiondb.getRec("main");
// CAUTION: cr could be deleted so don't trust this ptr
// if you give up control of the cpu
if ( cr ) return cr->m_coll;
}
// try next in line
if ( ! coll ) {
cr = getFirstRec ();
if ( cr ) return cr->m_coll;
}
// give up?
return NULL;
}
//CollectionRec *Collectiondb::getRec2 ( HttpRequest *r , bool useDefaultRec) {
// char *coll = getDefaultColl();
// return g_collectiondb.getRec(coll);
//}
// . get collectionRec from name
// . returns NULL if not available
CollectionRec *Collectiondb::getRec ( char *coll ) {
if ( ! coll ) coll = "";
return getRec ( coll , gbstrlen(coll) );
}
CollectionRec *Collectiondb::getRec ( char *coll , int32_t collLen ) {
if ( ! coll ) coll = "";
collnum_t collnum = getCollnum ( coll , collLen );
if ( collnum < 0 ) return NULL;
return m_recs [ (int32_t)collnum ];
}
CollectionRec *Collectiondb::getRec ( collnum_t collnum) {
if ( collnum >= m_numRecs || collnum < 0 ) {
// Rdb::resetBase() gets here, so don't always log.
// it is called from CollectionRec::reset() which is called
// from the CollectionRec constructor and ::load() so
// it won't have anything in rdb at that time
//log("colldb: collnum %"INT32" > numrecs = %"INT32"",
// (int32_t)collnum,(int32_t)m_numRecs);
return NULL;
}
return m_recs[collnum];
}
//CollectionRec *Collectiondb::getDefaultRec ( ) {
// if ( ! g_conf.m_defaultColl[0] ) return NULL; // no default?
// collnum_t collnum = getCollnum ( g_conf.m_defaultColl );
// if ( collnum < (collnum_t)0 ) return NULL;
// return m_recs[(int32_t)collnum];
//}
CollectionRec *Collectiondb::getFirstRec ( ) {
for ( int32_t i = 0 ; i < m_numRecs ; i++ )
if ( m_recs[i] ) return m_recs[i];
return NULL;
}
collnum_t Collectiondb::getFirstCollnum ( ) {
for ( int32_t i = 0 ; i < m_numRecs ; i++ )
if ( m_recs[i] ) return i;
return (collnum_t)-1;
}
char *Collectiondb::getFirstCollName ( ) {
for ( int32_t i = 0 ; i < m_numRecs ; i++ )
if ( m_recs[i] ) return m_recs[i]->m_coll;
return NULL;
}
char *Collectiondb::getCollName ( collnum_t collnum ) {
if ( collnum < 0 || collnum >= m_numRecs ) return NULL;
if ( ! m_recs[(int32_t)collnum] ) return NULL;
return m_recs[collnum]->m_coll;
}
collnum_t Collectiondb::getCollnum ( char *coll ) {
int32_t clen = 0;
if ( coll ) clen = gbstrlen(coll );
return getCollnum ( coll , clen );
/*
//if ( ! coll ) coll = "";
// default empty collection names
if ( coll && ! coll[0] ) coll = NULL;
if ( ! coll ) coll = g_conf.m_defaultColl;
if ( ! coll || ! coll[0] ) coll = "main";
// This is necessary for Statsdb to work, as it is
// not associated with any collection. Is this
// necessary for Catdb?
if ( coll[0]=='s' && coll[1] =='t' &&
strcmp ( "statsdb\0", coll ) == 0)
return 0;
if ( coll[0]=='f' && coll[1]=='a' &&
strcmp ( "facebookdb\0", coll ) == 0)
return 0;
if ( coll[0]=='a' && coll[1]=='c' &&
strcmp ( "accessdb\0", coll ) == 0)
return 0;
// because diffbot may have thousands of crawls/collections
// let's improve the speed here. try hashing it...
int64_t h64 = hash64n(coll);
void *vp = g_collTable.getValue ( &h64 );
if ( ! vp ) return -1; // not found
return *(collnum_t *)vp;
*/
/*
for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
if ( ! m_recs[i] ) continue;
if ( m_recs[i]->m_coll[0] != coll[0] ) continue;
if ( strcmp ( m_recs[i]->m_coll , coll ) == 0 ) return i;
}
//if ( strcmp ( "catdb\0", coll ) == 0) return 0;
return (collnum_t)-1; // not found
*/
}
collnum_t Collectiondb::getCollnum ( char *coll , int32_t clen ) {
// default empty collection names
if ( coll && ! coll[0] ) coll = NULL;
if ( ! coll ) {
coll = g_conf.m_defaultColl;
if ( coll ) clen = gbstrlen(coll);
else clen = 0;
}
if ( ! coll || ! coll[0] ) {
coll = "main";
clen = gbstrlen(coll);
}
//if ( ! coll ) coll = "";
// This is necessary for Statsdb to work, as it is
// not associated with any collection. Is this
// necessary for Catdb?
if ( coll[0]=='s' && coll[1] =='t' &&
strcmp ( "statsdb\0", coll ) == 0)
return 0;
if ( coll[0]=='f' && coll[1]=='a' &&
strcmp ( "facebookdb\0", coll ) == 0)
return 0;
if ( coll[0]=='a' && coll[1]=='c' &&
strcmp ( "accessdb\0", coll ) == 0)
return 0;
// because diffbot may have thousands of crawls/collections
// let's improve the speed here. try hashing it...
int64_t h64 = hash64(coll,clen);
void *vp = g_collTable.getValue ( &h64 );
if ( ! vp ) return -1; // not found
return *(collnum_t *)vp;
/*
for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
if ( ! m_recs[i] ) continue;
if ( m_recs[i]->m_collLen != clen ) continue;
if ( strncmp(m_recs[i]->m_coll,coll,clen) == 0 ) return i;
}
//if ( strncmp ( "catdb\0", coll, clen ) == 0) return 0;
return (collnum_t)-1; // not found
*/
}
//collnum_t Collectiondb::getNextCollnum ( collnum_t collnum ) {
// for ( int32_t i = (int32_t)collnum + 1 ; i < m_numRecs ; i++ )
// if ( m_recs[i] ) return i;
// // no next one, use -1
// return (collnum_t) -1;
//}
// what collnum will be used the next time a coll is added?
collnum_t Collectiondb::reserveCollNum ( ) {
if ( m_numRecs < 0x7fff ) {
collnum_t next = m_numRecs;
// make the ptr NULL at least to accommodate the
// loops that scan up to m_numRecs lest we core
growRecPtrBuf ( next );
m_numRecs++;
return next;
}
// collnum_t is signed right now because we use -1 to indicate a
// bad collnum.
int32_t scanned = 0;
// search for an empty slot
for ( int32_t i = m_wrapped ; ; i++ ) {
// because collnum_t is 2 bytes, signed, limit this here
if ( i > 0x7fff ) i = 0;
// how can this happen?
if ( i < 0 ) i = 0;
// if we scanned the max # of recs we could have, we are done
if ( ++scanned >= m_numRecs ) break;
// skip if this is in use
if ( m_recs[i] ) continue;
// start after this one next time
m_wrapped = i+1;
// note it
log("colldb: returning wrapped collnum "
"of %"INT32"",(int32_t)i);
return (collnum_t)i;
}
log("colldb: no new collnum available. consider upping collnum_t");
// none available!!
return -1;
}
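// sketch of the reserve-then-add flow noted above (Parms.cpp uses this
// so every shard assigns the same collnum; the name is hypothetical):
//   collnum_t cn = g_collectiondb.reserveCollNum();
//   if ( cn >= 0 )
//           g_collectiondb.addNewColl ( "mycoll" , 0 , NULL , 0 , true , cn );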
///////////////
//
// COLLECTIONREC
//
///////////////
#include "gb-include.h"
//#include "CollectionRec.h"
//#include "Collectiondb.h"
#include "HttpServer.h" // printColors2()
#include "Msg5.h"
#include "Threads.h"
#include "Datedb.h"
#include "Timedb.h"
#include "Spider.h"
#include "Process.h"
static CollectionRec g_default;
CollectionRec::CollectionRec() {
m_nextLink = NULL;
m_prevLink = NULL;
m_spiderCorruptCount = 0;
m_collnum = -1;
m_coll[0] = '\0';
m_updateRoundNum = 0;
m_swappedOut = false;
//m_numSearchPwds = 0;
//m_numBanIps = 0;
//m_numSearchIps = 0;
//m_numSpamIps = 0;
//m_numAdminPwds = 0;
//m_numAdminIps = 0;
memset ( m_bases , 0 , sizeof(RdbBase *)*RDB_END );
// how many keys in the tree of each rdb? we now store this stuff
// here and not in RdbTree.cpp because we no longer have a maximum
// # of collection recs... MAX_COLLS. each is a 32-bit "int32_t" so
// it is 4 * RDB_END...
memset ( m_numNegKeysInTree , 0 , 4*RDB_END );
memset ( m_numPosKeysInTree , 0 , 4*RDB_END );
m_spiderColl = NULL;
m_overflow = 0x12345678;
m_overflow2 = 0x12345678;
// the spiders are currently uninhibited i guess
m_spiderStatus = SP_INITIALIZING; // this is 0
//m_spiderStatusMsg = NULL;
// for Url::getSite()
m_updateSiteRulesTable = 1;
//m_lastUpdateTime = 0LL;
m_clickNScrollEnabled = false;
// inits for sortbydatetable
m_inProgress = false;
m_msg5 = NULL;
m_importState = NULL;
// JAB - track which regex parsers have been initialized
//log(LOG_DEBUG,"regex: %p initalizing empty parsers", m_pRegExParser);
// clear these out so Parms::calcChecksum can work:
memset( m_spiderFreqs, 0, MAX_FILTERS*sizeof(*m_spiderFreqs) );
//for ( int i = 0; i < MAX_FILTERS ; i++ )
// m_spiderQuotas[i] = -1;
memset( m_spiderPriorities, 0,
MAX_FILTERS*sizeof(*m_spiderPriorities) );
memset ( m_harvestLinks,0,MAX_FILTERS);
memset ( m_forceDelete,0,MAX_FILTERS);
//memset( m_rulesets, 0, MAX_FILTERS*sizeof(*m_rulesets) );
//for ( int i = 0; i < MAX_SEARCH_PASSWORDS; i++ ) {
// *(m_searchPwds[i]) = '\0';
//}
//for ( int i = 0; i < MAX_ADMIN_PASSWORDS; i++ ) {
// *(m_adminPwds[i]) = '\0';
//}
//memset( m_banIps, 0, MAX_BANNED_IPS*sizeof(*m_banIps) );
//memset( m_searchIps, 0, MAX_SEARCH_IPS*sizeof(*m_searchIps) );
//memset( m_spamIps, 0, MAX_SPAM_IPS*sizeof(*m_spamIps) );
//memset( m_adminIps, 0, MAX_ADMIN_IPS*sizeof(*m_adminIps) );
//for ( int i = 0; i < MAX_FILTERS; i++ ) {
// //m_pRegExParser[i] = NULL;
// *(m_regExs[i]) = '\0';
//}
m_numRegExs = 0;
//m_requests = 0;
//m_replies = 0;
//m_doingCallbacks = false;
m_lastResetCount = 0;
// regex_t types
m_hasucr = false;
m_hasupr = false;
// for diffbot caching the global spider stats
reset();
// add default reg ex if we do not have one
//setUrlFiltersToDefaults();
//rebuildUrlFilters();
}
CollectionRec::~CollectionRec() {
//invalidateRegEx ();
reset();
}
// new collection recs get this called on them
void CollectionRec::setToDefaults ( ) {
g_parms.setFromFile ( this , NULL , NULL , OBJ_COLL );
// add default reg ex
//setUrlFiltersToDefaults();
rebuildUrlFilters();
}
void CollectionRec::reset() {
//log("coll: resetting collnum=%"INT32"",(int32_t)m_collnum);
// . grows dynamically
// . setting to 0 buckets should never have error
//m_pageCountTable.set ( 4,4,0,NULL,0,false,MAX_NICENESS,"pctbl" );
// regex_t types
if ( m_hasucr ) regfree ( &m_ucr );
if ( m_hasupr ) regfree ( &m_upr );
m_hasucr = false;
m_hasupr = false;
m_sendingAlertInProgress = false;
// make sure we do not leave spiders "hanging" waiting for their
// callback to be called... and it never gets called
//if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
//if ( m_doingCallbacks ) { char *xx=NULL;*xx=0; }
//if ( m_replies != m_requests ) { char *xx=NULL;*xx=0; }
m_localCrawlInfo.reset();
m_globalCrawlInfo.reset();
//m_requests = 0;
//m_replies = 0;
// free all RdbBases in each rdb
for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
Rdb *rdb = g_process.m_rdbs[i];
rdb->resetBase ( m_collnum );
}
for ( int32_t i = 0 ; i < g_process.m_numRdbs ; i++ ) {
RdbBase *base = m_bases[i];
if ( ! base ) continue;
mdelete (base, sizeof(RdbBase), "Rdb Coll");
delete (base);
}
SpiderColl *sc = m_spiderColl;
// debug hack thing
//if ( sc == (SpiderColl *)0x8888 ) return;
// if never made one, we are done
if ( ! sc ) return;
// spider coll also!
sc->m_deleteMyself = true;
// if not currently being accessed nuke it now
tryToDeleteSpiderColl ( sc ,"12");
// if ( ! sc->m_msg5.m_waitingForList &&
// ! sc->m_msg5b.m_waitingForList &&
// ! sc->m_msg1.m_mcast.m_inUse ) {
// mdelete ( sc, sizeof(SpiderColl),"nukecr2");
// delete ( sc );
// }
}
CollectionRec *g_cr = NULL;
// . load this data from a conf file
// . values we do not explicitly have will be taken from "default",
// collection config file. if it does not have them then we use
// the value we received from call to setToDefaults()
// . returns false and sets g_errno on load error
bool CollectionRec::load ( char *coll , int32_t i ) {
// also reset some counts not included in parms list
reset();
// before we load, set to defaults in case some are not in xml file
g_parms.setToDefault ( (char *)this , OBJ_COLL , this );
// get the filename with that id
File f;
char tmp2[1024];
sprintf ( tmp2 , "%scoll.%s.%"INT32"/coll.conf", g_hostdb.m_dir , coll,i);
f.set ( tmp2 );
if ( ! f.doesExist () ) return log("admin: %s does not exist.",tmp2);
// set our collection number
m_collnum = i;
// set our collection name
m_collLen = gbstrlen ( coll );
strcpy ( m_coll , coll );
if ( ! g_conf.m_doingCommandLine )
log(LOG_INFO,"db: Loading conf for collection %s (%"INT32")",coll,
(int32_t)m_collnum);
// collection name HACK for backwards compatibility
//if ( strcmp ( coll , "main" ) == 0 ) {
// m_coll[0] = '\0';
// m_collLen = 0;
//}
// the default conf file
char tmp1[1024];
snprintf ( tmp1 , 1023, "%sdefault.conf" , g_hostdb.m_dir );
// . set our parms from the file.
// . accepts OBJ_COLLECTIONREC or OBJ_CONF
g_parms.setFromFile ( this , tmp2 , tmp1 , OBJ_COLL );
// add default reg ex IFF there are no url filters there now
//if(m_numRegExs == 0) rebuildUrlFilters();//setUrlFiltersToDefaults();
// this only rebuild them if necessary
rebuildUrlFilters();//setUrlFiltersToDefaults();
// temp check
//testRegex();
//
// LOAD the crawlinfo class in the collectionrec for diffbot
//
// LOAD LOCAL
snprintf ( tmp1 , 1023, "%scoll.%s.%"INT32"/localcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
log(LOG_DEBUG,"db: Loading %s",tmp1);
m_localCrawlInfo.reset();
SafeBuf sb;
// fillfromfile returns 0 if does not exist, -1 on read error
if ( sb.fillFromFile ( tmp1 ) > 0 ) {
	//m_localCrawlInfo.setFromSafeBuf(&sb);
	// it is binary now. clamp the copy so a corrupt/oversized file
	// can not overflow the CrawlInfo struct.
	int32_t clen = sb.length();
	if ( clen > (int32_t)sizeof(CrawlInfo) )
		clen = (int32_t)sizeof(CrawlInfo);
	gbmemcpy ( &m_localCrawlInfo , sb.getBufStart() , clen );
}
// if it had corrupted data from saving corrupted mem zero it out
CrawlInfo *stats = &m_localCrawlInfo;
// point to the stats for that host
int64_t *ss = (int64_t *)stats;
// are stats crazy?
bool crazy = false;
for ( int32_t j = 0 ; j < NUMCRAWLSTATS ; j++ ) {
// crazy stat?
if ( *ss > 1000000000LL ||
*ss < -1000000000LL ) {
crazy = true;
break;
}
ss++;
}
if ( m_localCrawlInfo.m_collnum != m_collnum )
crazy = true;
if ( crazy ) {
log("coll: had crazy spider stats for coll %s. zeroing out.",
m_coll);
m_localCrawlInfo.reset();
}
if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
log("coll: Loaded %s (%"INT32") local hasurlsready=%"INT32"",
m_coll,
(int32_t)m_collnum,
(int32_t)m_localCrawlInfo.m_hasUrlsReadyToSpider);
// we introduced the this-round counts, so don't start them at 0!!
if ( m_spiderRoundNum == 0 &&
     m_localCrawlInfo.m_pageDownloadSuccessesThisRound <
     m_localCrawlInfo.m_pageDownloadSuccesses ) {
	log("coll: fixing download count this round for %s",m_coll);
	m_localCrawlInfo.m_pageDownloadSuccessesThisRound =
		m_localCrawlInfo.m_pageDownloadSuccesses;
}
// we introduced the this-round counts, so don't start them at 0!!
if ( m_spiderRoundNum == 0 &&
m_localCrawlInfo.m_pageProcessSuccessesThisRound <
m_localCrawlInfo.m_pageProcessSuccesses ) {
log("coll: fixing process count this round for %s",m_coll);
m_localCrawlInfo.m_pageProcessSuccessesThisRound =
m_localCrawlInfo.m_pageProcessSuccesses;
}
// fix from old bug that was fixed
//if ( m_spiderRoundNum == 0 &&
// m_collectiveRespiderFrequency > 0.0 &&
// m_localCrawlInfo.m_sentCrawlDoneAlert ) {
// log("coll: bug fix: resending email alert for coll %s (%"INT32") "
// "of respider freq %f",m_coll,(int32_t)m_collnum,
// m_collectiveRespiderFrequency);
// m_localCrawlInfo.m_sentCrawlDoneAlert = false;
//}
// LOAD GLOBAL
snprintf ( tmp1 , 1023, "%scoll.%s.%"INT32"/globalcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
log(LOG_DEBUG,"db: Loading %s",tmp1);
m_globalCrawlInfo.reset();
sb.reset();
if ( sb.fillFromFile ( tmp1 ) > 0 )
//m_globalCrawlInfo.setFromSafeBuf(&sb);
// it is binary now. same deal as the local load above: cap the
// copy at sizeof(CrawlInfo) so a bad file cannot overflow the struct
gbmemcpy ( &m_globalCrawlInfo , sb.getBufStart(),
sb.length() <= (int32_t)sizeof(CrawlInfo) ?
sb.length() : (int32_t)sizeof(CrawlInfo) );
if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
log("coll: Loaded %s (%"INT32") global hasurlsready=%"INT32"",
m_coll,
(int32_t)m_collnum,
(int32_t)m_globalCrawlInfo.m_hasUrlsReadyToSpider);
// the list of ip addresses that we have detected as being throttled
// and therefore backoff and use proxies for
if ( ! g_conf.m_doingCommandLine ) {
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.m_allocName = "twittbl";
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
}
////////////
//
// PAGE COUNT TABLE for doing quotas in url filters
//
/////////////
// load it up if it's there on disk
//snprintf ( tmp1 , 1023, "/coll.%s.%"INT32"/pagecounts.dat",
// m_coll , (int32_t)m_collnum );
//if ( ! m_pageCountTable.load ( g_hostdb.m_dir , tmp1 ) && g_errno )
// log("db: failed to load page count table: %s",
// mstrerror(g_errno));
// ignore errors i guess
g_errno = 0;
// fix for diffbot, spider time deduping
if ( m_isCustomCrawl ) m_dedupingEnabled = true;
// always turn off gigabits so &s=1000 can do summary skipping
if ( m_isCustomCrawl ) m_docsToScanForTopics = 0;
// make min files to merge smaller than normal since most collections
// are small and we want to reduce the # of vfds (files) we have
if ( m_isCustomCrawl ) {
m_posdbMinFilesToMerge = 6;
m_titledbMinFilesToMerge = 4;
m_linkdbMinFilesToMerge = 3;
m_tagdbMinFilesToMerge = 2;
}
// always turn on distributed spider locking because otherwise
// we end up calling Msg50 which calls Msg25 for the same root url
// at the same time, thereby wasting massive resources. it is also
// dangerous to run without this because webmasters get pissed when
// we slam their servers.
// This is now deprecated...
//m_useSpiderLocks = false;
// and all pages downloaded from a particular ip should be done
// by the same host in our cluster to prevent webmaster rage
//m_distributeSpiderGet = true;
//initSortByDateTable(m_coll);
return true;
}
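// A sketch of the per-collection on-disk layout the load above expects
// (paths relative to the working dir, built by the snprintf calls in
// this function):
//
//   coll.<name>.<collnum>/coll.conf               xml parms
//   coll.<name>.<collnum>/localcrawlinfo.dat      raw CrawlInfo image
//   coll.<name>.<collnum>/globalcrawlinfo.dat     raw CrawlInfo image
//   coll.<name>.<collnum>/ipstouseproxiesfor.dat  twitchy ip table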
/*
bool CollectionRec::countEvents ( ) {
// set our m_numEventsOnHost value
log("coll: loading event count termlist gbeventcount");
// temporarily turn off threads
bool enabled = g_threads.areThreadsEnabled();
g_threads.disableThreads();
// count them
m_numEventsOnHost = 0;
// 1MB at a time
int32_t minRecSizes = 1000000;
// look up this termlist, gbeventcount which we index in XmlDoc.cpp
int64_t termId = hash64n("gbeventcount") & TERMID_MASK;
// make datedb key from it
key128_t startKey = g_datedb.makeStartKey ( termId , 0xffffffff );
key128_t endKey = g_datedb.makeEndKey ( termId , 0 );
Msg5 msg5;
RdbList list;
// . init m_numEventsOnHost by getting the exact length of that
// termlist on this host
// . send in the ping request packet so all hosts can total up
// . Rdb.cpp should be added to incrementally so we should have no
// double counting.
// . Rdb.cpp should inspect each datedb rec for this termid in
// a fast and efficient manner
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_DATEDB ,
m_coll ,
&list ,
(char *)&startKey ,
(char *)&endKey ,
minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
NULL )){// msg5b
// not allowed to block!
char *xx=NULL;*xx=0; }
// scan the list, score is how many valid events from that docid
uint32_t total = 0;
for ( ; ! list.isExhausted() ; list.skipCurrentRec() ) {
unsigned char *rec = (unsigned char *)list.getCurrentRec();
// in datedb score is byte #5
total += (255-rec[5]);
}
// declare
char *lastKeyPtr;
key128_t newStartKey;
// add to count. datedb uses half keys so subtract 6 bytes
// since the termids will be the same...
//m_numEventsOnHost += list.getListSize() / (sizeof(key128_t)-6);
m_numEventsOnHost += total;
// bail if under limit
if ( list.getListSize() < minRecSizes ) goto done;
// update key
lastKeyPtr = list.m_listEnd - 10;
// we make a new start key
list.getKey ( lastKeyPtr , (char *)&newStartKey );
// maxxed out?
if ( newStartKey.n0==0xffffffffffffffffLL &&
newStartKey.n1==0xffffffffffffffffLL )
goto done;
// sanity check
if ( newStartKey < startKey ) { char *xx=NULL;*xx=0; }
if ( newStartKey > endKey ) { char *xx=NULL;*xx=0; }
// inc it
newStartKey.n0++;
// inc the top if the bottom wrapped
if ( newStartKey.n0 == 0LL ) newStartKey.n1++;
// assign
startKey = newStartKey;
// and loop back up for more now
goto loop;
done:
// update all colls count
g_collectiondb.m_numEventsAllColls += m_numEventsOnHost;
if ( enabled ) g_threads.enableThreads();
log("coll: got %"INT32" local events in termlist",m_numEventsOnHost);
// set "m_hasDocQualityFiler"
//updateFilters();
return true;
}
*/
bool CollectionRec::rebuildUrlFilters2 ( ) {
// tell spider loop to update active list
g_spiderLoop.m_activeListValid = false;
bool rebuild = true;
if ( m_numRegExs == 0 )
rebuild = true;
// don't touch it if not supposed to as long as we have some already
//if ( m_urlFiltersProfile != UFP_NONE )
// rebuild = true;
// never for custom crawls however
if ( m_isCustomCrawl )
rebuild = false;
char *s = m_urlFiltersProfile.getBufStart();
// support the old UFP_CUSTOM, etc. numeric values
if ( !strcmp(s,"0" ) )
s = "custom";
// UFP_WEB SUPPORT
if ( !strcmp(s,"1" ) )
s = "web";
// UFP_NEWS, which is now the "shallow" profile
if ( !strcmp(s,"2" ) )
s = "shallow";
// leave custom profiles alone
if ( !strcmp(s,"custom" ) )
rebuild = false;
//if ( m_numRegExs > 0 && strcmp(m_regExs[m_numRegExs-1],"default") )
// addDefault = true;
if ( ! rebuild ) return true;
if ( !strcmp(s,"shallow" ) )
return rebuildShallowRules();
//if ( strcmp(s,"web") )
// just fall through for that
if ( !strcmp(s,"english") )
return rebuildLangRules( "en","com,us,gov");
if ( !strcmp(s,"german") )
return rebuildLangRules( "de","de");
if ( !strcmp(s,"french") )
return rebuildLangRules( "fr","fr");
if ( !strcmp(s,"norwegian") )
return rebuildLangRules( "nl","nl");
if ( !strcmp(s,"spanish") )
return rebuildLangRules( "es","es");
//if ( m_urlFiltersProfile == UFP_EURO )
// return rebuildLangRules( "de,fr,nl,es,sv,no,it",
// "com,gov,org,de,fr,nl,es,sv,no,it");
if ( !strcmp(s,"romantic") )
return rebuildLangRules("en,de,fr,nl,es,sv,no,it,fi,pt",
"de,fr,nl,es,sv,no,it,fi,pt,"
"com,gov,org"
);
if ( !strcmp(s,"chinese") )
return rebuildLangRules( "zh_cn,zh_tw","cn");
int32_t n = 0;
/*
m_regExs[n].set("default");
m_regExs[n].nullTerm();
m_spiderFreqs [n] = 30; // 30 days default
m_spiderPriorities[n] = 0;
m_maxSpidersPerRule[n] = 99;
m_spiderIpWaits[n] = 1000;
m_spiderIpMaxSpiders[n] = 7;
m_harvestLinks[n] = 1;
*/
// max spiders per ip
int32_t ipms = 7;
m_regExs[n].set("isreindex");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // no respider wait
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 80;
n++;
m_regExs[n].set("ismedia");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // no respider wait
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100; // delete!
m_forceDelete [n] = 1;
n++;
// if not in the site list then nuke it
m_regExs[n].set("!ismanualadd && !insitelist");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // no respider wait
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100;
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=3 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // retry daily
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100;
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // retry daily
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
// a non temporary error, like a 404? retry once per 5 days i guess
m_regExs[n].set("errorcount>=1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 5; // 5 day retry
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 2;
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // respider weekly
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
// 7+ unique c block parent request urls means it is important!
m_regExs[n].set("numinlinks>7 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // respider weekly
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 52;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
// 7+ unique c block parent request urls means it is important!
m_regExs[n].set("numinlinks>7");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // respider weekly
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 51;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // respider weekly
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("isparentrss && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // respider weekly
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("isparentsitemap && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // respider weekly
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 44;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("isparentrss");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0; // respider every 20 days
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 43;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("isparentsitemap");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0; // respider every 20 days
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 42;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==1 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .04166; // 60 minutes
n++;
m_regExs[n].set("hopcount==1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
if ( ! strcmp(s,"news") )
m_spiderFreqs [n] = .04166; // 60 minutes
n++;
m_regExs[n].set("hopcount==2 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
// do not harvest links if we are spidering news
if ( ! strcmp(s,"news") ) {
m_spiderFreqs [n] = 5.0;
m_harvestLinks [n] = 0;
}
n++;
m_regExs[n].set("hopcount==2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
// do not harvest links if we are spidering news
if ( ! strcmp(s,"news") ) {
m_spiderFreqs [n] = 5.0;
m_harvestLinks [n] = 0;
}
n++;
m_regExs[n].set("hopcount>=3 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 20;
// turn off spidering if hopcount is too big and we are spidering news.
// note: for news we do NOT inc n here, so the next rule overwrites
// this slot and only one hopcount>=3 row survives
if ( ! strcmp(s,"news") ) {
m_maxSpidersPerRule [n] = 0;
m_harvestLinks [n] = 0;
}
else {
n++;
}
m_regExs[n].set("hopcount>=3");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 19;
// turn off spidering if hopcount is too big and we are spidering news.
// note: for news we do NOT inc n here, so the "default" rule below
// overwrites this slot
if ( ! strcmp(s,"news") ) {
m_maxSpidersPerRule [n] = 0;
m_harvestLinks [n] = 0;
}
else {
n++;
}
/*
m_regExs[n].set("isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = resp4;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 2;
n++;
*/
m_regExs[n].set("default");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
if ( ! strcmp(s,"news") ) {
m_maxSpidersPerRule [n] = 0;
m_harvestLinks [n] = 0;
}
n++;
m_numRegExs = n;
m_numRegExs2 = n;
m_numRegExs3 = n;
m_numRegExs10 = n;
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
m_numRegExs7 = n;
// more rules
//m_spiderDiffbotApiNum[n] = 1;
//m_numRegExs11++;
//m_spiderDiffbotApiUrl[n].set("");
//m_spiderDiffbotApiUrl[n].nullTerm();
//m_numRegExs11++;
return true;
}
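// The generic table built above, sketched as
// (expression, freq in days, max spiders, priority):
//   isreindex                     0  99  80
//   ismedia                       0  99 100  delete
//   !ismanualadd && !insitelist   0  99 100  delete
//   errorcount>=3 && hastmperror  1   1 100  delete
//   errorcount>=1 && hastmperror  1   1  45
//   errorcount>=1                 5   1   2  delete
//   isaddurl                      7  99  85
//   ... hopcount tiers 0,1,2,>=3 at descending priority ...
//   default                      60   9   1
// lower hopcounts get higher priorities so the crawl stays
// shallow-first.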
bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
// max spiders per ip
int32_t ipms = 7;
int32_t n = 0;
m_regExs[n].set("isreindex");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // no respider wait
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 80;
n++;
m_regExs[n].set("ismedia");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // no respider wait
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100; // delete!
m_forceDelete [n] = 1;
n++;
// if not in the site list then nuke it
m_regExs[n].set("!ismanualadd && !insitelist");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // no respider wait
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100; // delete!
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=3 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // retry daily
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100;
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // retry daily
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
n++;
m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // respider weekly
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && iswww && isnew && tld==%s",
tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // respider weekly
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && iswww && isnew && "
"parentlang==%s,xx"
,langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // respider weekly
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;
// m_regExs[n].set("hopcount==0 && iswww && isnew");
// m_harvestLinks [n] = 1;
// m_spiderFreqs [n] = 7; // 30 days default
// m_maxSpidersPerRule [n] = 9; // max spiders
// m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
// m_spiderIpWaits [n] = 1000; // same ip wait
// m_spiderPriorities [n] = 20;
// n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && iswww && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && iswww && parentlang==%s,xx",
langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 19;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && isnew && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && isnew && parentlang==%s,xx",
langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].set("hopcount==0 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 18;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==0 && parentlang==%s,xx",langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].set("hopcount==0");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 17;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==1 && isnew && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==1 && isnew && parentlang==%s,xx",
langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].set("hopcount==1 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 16;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==1 && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==1 && parentlang==%s,xx",langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].set("hopcount==1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 15;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==2 && isnew && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==2 && isnew && parentlang==%s,xx",
langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==2 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 14;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==2 && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount==2 && parentlang==%s,xx",langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].set("hopcount==2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 13;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount>=3 && isnew && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount>=3 && isnew && parentlang==%s,xx",
langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;
m_regExs[n].set("hopcount>=3 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 12;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount>=3 && tld==%s",tldStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;
m_regExs[n].reset();
m_regExs[n].safePrintf("hopcount>=3 && parentlang==%s,xx",langStr);
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;
m_regExs[n].set("hopcount>=3");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 11;
n++;
m_regExs[n].set("default");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
n++;
m_numRegExs = n;
m_numRegExs2 = n;
m_numRegExs3 = n;
m_numRegExs10 = n;
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
m_numRegExs7 = n;
// done rebuilding language-specific rules
return true;
}
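// pattern of the language table above (a sketch): each hopcount tier
// emits three rows in descending priority -- a tld==<tldStr> match, a
// parentlang==<langStr>,xx match, then a low-priority catch-all -- so
// in-country and in-language pages get spidered ahead of everything
// else at the same depth.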
bool CollectionRec::rebuildShallowRules ( ) {
// max spiders per ip
int32_t ipms = 7;
int32_t n = 0;
m_regExs[n].set("isreindex");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // no respider wait
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 80;
n++;
m_regExs[n].set("ismedia");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // no respider wait
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100; // delete!
m_forceDelete [n] = 1;
n++;
// if not in the site list then nuke it
m_regExs[n].set("!ismanualadd && !insitelist");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // no respider wait
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100; // delete!
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=3 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // retry daily
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 100;
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // retry daily
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
n++;
m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // respider weekly
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
n++;
//
// stop if hopcount>=2 for things tagged shallow in sitelist
//
m_regExs[n].set("tag:shallow && hopcount>=2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 0; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
// if # of pages in this site indexed is >= 10 then stop as well...
m_regExs[n].set("tag:shallow && sitepages>=10");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 0; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // respider weekly
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days b4 respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].set("hopcount==0 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].set("hopcount==0");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].set("hopcount==1 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].set("hopcount==1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].set("hopcount==2 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].set("hopcount>=3 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 22;
n++;
m_regExs[n].set("hopcount>=3");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 21;
n++;
m_regExs[n].set("default");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = ipms; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
n++;
m_numRegExs = n;
m_numRegExs2 = n;
m_numRegExs3 = n;
m_numRegExs10 = n;
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
m_numRegExs7 = n;
// done rebuilding SHALLOW rules
return true;
}
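// the "shallow" profile is the generic table plus the two guard rows
// above: "tag:shallow && hopcount>=2" and "tag:shallow &&
// sitepages>=10" both zero out maxSpidersPerRule, so a site tagged
// shallow in the site list stops at depth 1 or at 10 indexed pages,
// whichever it hits first.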
/*
bool CrawlInfo::print (SafeBuf *sb ) {
return sb->safePrintf("objectsAdded:%"INT64"\n"
"objectsDeleted:%"INT64"\n"
"urlsConsidered:%"INT64"\n"
"downloadAttempts:%"INT64"\n"
"downloadSuccesses:%"INT64"\n"
"processAttempts:%"INT64"\n"
"processSuccesses:%"INT64"\n"
"lastupdate:%"UINT32"\n"
, m_objectsAdded
, m_objectsDeleted
, m_urlsConsidered
, m_pageDownloadAttempts
, m_pageDownloadSuccesses
, m_pageProcessAttempts
, m_pageProcessSuccesses
, m_lastUpdateTime
);
}
bool CrawlInfo::setFromSafeBuf (SafeBuf *sb ) {
return sscanf(sb->getBufStart(),
"objectsAdded:%"INT64"\n"
"objectsDeleted:%"INT64"\n"
"urlsConsidered:%"INT64"\n"
"downloadAttempts:%"INT64"\n"
"downloadSuccesses:%"INT64"\n"
"processAttempts:%"INT64"\n"
"processSuccesses:%"INT64"\n"
"lastupdate:%"UINT32"\n"
, &m_objectsAdded
, &m_objectsDeleted
, &m_urlsConsidered
, &m_pageDownloadAttempts
, &m_pageDownloadSuccesses
, &m_pageProcessAttempts
, &m_pageProcessSuccesses
, &m_lastUpdateTime
);
}
*/
// returns false on failure and sets g_errno, true otherwise
bool CollectionRec::save ( ) {
if ( g_conf.m_readOnlyMode ) return true;
//File f;
char tmp[1024];
//sprintf ( tmp , "%scollections/%"INT32".%s/c.conf",
// g_hostdb.m_dir,m_id,m_coll);
// collection name HACK for backwards compatibility
//if ( m_collLen == 0 )
// sprintf ( tmp , "%scoll.main/coll.conf", g_hostdb.m_dir);
//else
snprintf ( tmp , 1023, "%scoll.%s.%"INT32"/coll.conf",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
if ( ! g_parms.saveToXml ( (char *)this , tmp ,OBJ_COLL)) return false;
// log msg
//log (LOG_INFO,"db: Saved %s.",tmp);//f.getFilename());
//
// save the crawlinfo class in the collectionrec for diffbot
//
// SAVE LOCAL
snprintf ( tmp , 1023, "%scoll.%s.%"INT32"/localcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
//log("coll: saving %s",tmp);
// in case emergency save from malloc core, do not alloc
char stack[1024];
SafeBuf sb(stack,1024);
//m_localCrawlInfo.print ( &sb );
// binary now
sb.safeMemcpy ( &m_localCrawlInfo , sizeof(CrawlInfo) );
if ( sb.safeSave ( tmp ) == -1 ) {
log("db: failed to save file %s : %s",
tmp,mstrerror(g_errno));
g_errno = 0;
}
// SAVE GLOBAL
snprintf ( tmp , 1023, "%scoll.%s.%"INT32"/globalcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
//log("coll: saving %s",tmp);
sb.reset();
//m_globalCrawlInfo.print ( &sb );
// binary now
sb.safeMemcpy ( &m_globalCrawlInfo , sizeof(CrawlInfo) );
if ( sb.safeSave ( tmp ) == -1 ) {
log("db: failed to save file %s : %s",
tmp,mstrerror(g_errno));
g_errno = 0;
}
// the list of ip addresses that we have detected as being throttled
// and therefore backoff and use proxies for
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.save ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
// do not need a save now
m_needsSave = false;
// waiting tree is saved in SpiderCache::save() called by Process.cpp
//SpiderColl *sc = m_spiderColl;
//if ( ! sc ) return true;
// save page count table which has # of pages indexed per
// subdomain/site and firstip for doing quotas in url filters table
//snprintf ( tmp , 1023, "coll.%s.%"INT32"/pagecounts.dat",
// m_coll , (int32_t)m_collnum );
//if ( ! m_pageCountTable.save ( g_hostdb.m_dir , tmp ) ) {
// log("db: failed to save file %s : %s",tmp,mstrerror(g_errno));
// g_errno = 0;
//}
return true;
}
// calls hasPermission() below
bool CollectionRec::hasPermission ( HttpRequest *r , TcpSocket *s ) {
int32_t plen;
char *p = r->getString ( "pwd" , &plen );
int32_t ip = s->m_ip;
return hasPermission ( p , plen , ip );
}
// . does this password work for this collection?
bool CollectionRec::isAssassin ( int32_t ip ) {
// ok, make sure they came from an acceptable IP
//for ( int32_t i = 0 ; i < m_numSpamIps ; i++ )
// // they also have a matching IP, so they now have permission
// if ( m_spamIps[i] == ip ) return true;
return false;
}
// . does this password work for this collection?
bool CollectionRec::hasPermission ( char *p, int32_t plen , int32_t ip ) {
// just return true
// collection permission is checked from Users::verifyColl
// in User::getUserType for every request
return true;
// scan the passwords
// MDW: no longer, this is too vulnerable!!!
/*
for ( int32_t i = 0 ; i < m_numAdminPwds ; i++ ) {
int32_t len = gbstrlen ( m_adminPwds[i] );
if ( len != plen ) continue;
if ( strncmp ( m_adminPwds[i] , p , plen ) != 0 ) continue;
// otherwise it's a match!
//goto checkIp;
// . matching one password is good enough now, default OR
// . because just matching an IP is good enough security,
// there is really no need for both IP AND passwd match
return true;
}
*/
// . if had passwords but the provided one didn't match, return false
// . matching one password is good enough now, default OR
//if ( m_numPasswords > 0 ) return false;
// checkIp:
// ok, make sure they came from an acceptable IP
//for ( int32_t i = 0 ; i < m_numAdminIps ; i++ )
// // they also have a matching IP, so they now have permission
// if ( m_adminIps[i] == ip ) return true;
// if no security, allow all NONONONONONONONONO!!!!!!!!!!!!!!
//if ( m_numAdminPwds == 0 && m_numAdminIps == 0 ) return true;
// if they did not match an ip or password, even if both lists
// are empty, do not allow access... this prevents security breaches
// by accident
return false;
// if there were IPs then they failed to get in
//if ( m_numAdminIps > 0 ) return false;
// otherwise, they made it
//return true;
}
// can this ip perform a search or add url on this collection?
bool CollectionRec::hasSearchPermission ( TcpSocket *s , int32_t encapIp ) {
// get the ip
int32_t ip = 0; if ( s ) ip = s->m_ip;
// and the ip domain
int32_t ipd = 0; if ( s ) ipd = ipdom ( s->m_ip );
// and top 2 bytes for the Israeli isp that has this huge block
int32_t ipt = 0; if ( s ) ipt = iptop ( s->m_ip );
// is it in the ban list?
/*
for ( int32_t i = 0 ; i < m_numBanIps ; i++ ) {
if ( isIpTop ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipt ) return false;
continue;
}
// check for ip domain match if this banned ip is an ip domain
if ( isIpDom ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipd ) return false;
continue;
}
// otherwise it's just a single banned ip
if ( m_banIps[i] == ip ) return false;
}
*/
// check the encapsulate ip if any
// 1091771468731 0 Aug 05 23:51:08 63.236.25.77 GET
// /search?code=mammaXbG&uip=65.87.190.39&n=15&raw=8&q=farm+insurance
// +nj+state HTTP/1.0
/*
if ( encapIp ) {
ipd = ipdom ( encapIp );
ip = encapIp;
for ( int32_t i = 0 ; i < m_numBanIps ; i++ ) {
if ( isIpDom ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipd ) return false;
continue;
}
if ( isIpTop ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipt ) return false;
continue;
}
if ( m_banIps[i] == ip ) return false;
}
}
*/
return true;
/*
// do we have an "only" list?
if ( m_numSearchIps == 0 ) return true;
// it must be in that list if we do
for ( int32_t i = 0 ; i < m_numSearchIps ; i++ ) {
// check for ip domain match if this banned ip is an ip domain
if ( isIpDom ( m_searchIps[i] ) ) {
if ( m_searchIps[i] == ipd ) return true;
continue;
}
// otherwise it's just a single ip
if ( m_searchIps[i] == ip ) return true;
}
*/
// otherwise no permission
return false;
}
bool expandRegExShortcuts ( SafeBuf *sb ) ;
void nukeDoledb ( collnum_t collnum );
// rebuild the regexes related to diffbot, such as the one for the URL pattern
bool CollectionRec::rebuildDiffbotRegexes() {
//logf(LOG_DEBUG,"db: rebuilding url filters");
char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
// get the regexes
if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
char *upp = m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
char *ppp = m_diffbotPageProcessPattern.getBufStart();
if ( ppp && ! ppp[0] ) ppp = NULL;
// recompiling regexes starts now
if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}
if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}
// copy into tmpbuf
SafeBuf tmp;
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_ucr );
m_hasucr = false;
}
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_upr );
m_hasupr = false;
}
return true;
}
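// usage sketch: once compiled, a url can be tested against the crawl
// regex with regexec(), which returns 0 on a match:
//
//   if ( cr->m_hasucr &&
//        regexec ( &cr->m_ucr , url , 0 , NULL , 0 ) == 0 )
//           // url matches the diffbot url crawl regex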
bool CollectionRec::rebuildUrlFiltersDiffbot() {
//logf(LOG_DEBUG,"db: rebuilding url filters");
char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
// if we had a regex, that works for this purpose as well
if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
char *upp = m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
// if we had a regex, that works for this purpose as well
if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
char *ppp = m_diffbotPageProcessPattern.getBufStart();
if ( ppp && ! ppp[0] ) ppp = NULL;
///////
//
// recompile regular expressions
//
///////
if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}
if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}
// copy into tmpbuf
SafeBuf tmp;
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
int32_t err;
if ( rx && ( err = regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
// error!
char errbuf[1024];
regerror(err,&m_ucr,errbuf,1000);
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_ucr );
m_hasucr = false;
}
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && ( err = regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) ) { // |REG_NOSUB) ) {
char errbuf[1024];
regerror(err,&m_upr,errbuf,1000);
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
regfree ( &m_upr );
m_hasupr = false;
}
// what diffbot url to use for processing
char *api = m_diffbotApiUrl.getBufStart();
if ( api && ! api[0] ) api = NULL;
// convert from seconds to milliseconds. default is 250ms?
int32_t wait = (int32_t)(m_collectiveCrawlDelay * 1000.0);
// default to 250ms i guess. -1 means unset i think.
if ( m_collectiveCrawlDelay < 0.0 ) wait = 250;
bool isEthan = false;
if (m_coll)isEthan=strstr(m_coll,"2b44a0e0bb91bbec920f7efd29ce3d5b");
// it looks like we are assuming all crawls are repeating so that
// &roundStart=<currenttime> or &roundStart=0, which is the same
// thing, will trigger a re-crawl. so if collectiveRespiderFreq
// is 0 assume it is like 3652.5 days (ten years) so that stuff works.
// also i had to make the "default" rule below always have a respider
// freq of 0.0 so it will respider right away if we make it past the
// "lastspidertime>={roundstart}" rule which we will if they
// set the roundstart time to the current time using &roundstart=0
float respiderFreq = m_collectiveRespiderFrequency;
if ( respiderFreq <= 0.0 ) respiderFreq = 3652.5;
// lower from 7 to 1 since we have so many collections now
// ok, now we have much less colls so raise back to 7
int32_t diffbotipms = 7;//1; // 7
// make the gigablast regex table just "default" so it does not
// filtering, but accepts all urls. we will add code to pass the urls
// through m_diffbotUrlCrawlPattern alternatively. if that itself
// is empty, we will just restrict to the seed urls subdomain.
for ( int32_t i = 0 ; i < MAX_FILTERS ; i++ ) {
m_regExs[i].purge();
m_spiderPriorities[i] = 0;
m_maxSpidersPerRule [i] = 100;
// when someone has a bulk job of thousands of different
// domains it slows diffbot back-end down, so change this
// from 100 to 2 if doing a bulk job
if ( m_isCustomCrawl == 2 )
m_maxSpidersPerRule[i] = 2;// try 2 not 1 to be faster
m_spiderIpWaits [i] = wait;
m_spiderIpMaxSpiders[i] = diffbotipms; // keep it respectful
// ethan wants some speed
// if ( isEthan )
// m_spiderIpMaxSpiders[i] = 30;
//m_spidersEnabled [i] = 1;
m_spiderFreqs [i] = respiderFreq;
//m_spiderDiffbotApiUrl[i].purge();
m_harvestLinks[i] = true;
m_forceDelete [i] = false;
}
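// at this point every one of the MAX_FILTERS rows holds the defaults
// set in the loop above: priority 0, 100 max spiders (2 for bulk
// jobs), "wait" ms between same-ip hits, 7 spiders per ip, respider
// every respiderFreq days, harvest links on, force delete off. the
// rows built below overwrite the first i of them.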
int32_t i = 0;
// 1st one! for query reindex/ query delete
m_regExs[i].set("isreindex");
m_spiderIpMaxSpiders [i] = 10;
m_spiderPriorities [i] = 70;
i++;
// 2nd default url
m_regExs[i].set("ismedia && !ismanualadd");
m_maxSpidersPerRule [i] = 0;
m_spiderPriorities [i] = 100; // delete!
m_forceDelete [i] = 1;
i++;
// de-prioritize fakefirstip urls so we don't give the impression our
// spiders are slow. like if someone adds a bulk job with 100,000 urls
// then we sit there and process to lookup their ips and add a real
// spider request (if it falls onto the same shard) before we actually
// do any real spidering. so keep the priority here low.
m_regExs[i].set("isfakeip");
m_maxSpidersPerRule [i] = 7;
m_spiderIpMaxSpiders [i] = 7;
m_spiderPriorities [i] = 20;
m_spiderIpWaits [i] = 0;
i++;
// hopcount filter if asked for
if( m_diffbotMaxHops >= 0 ) {
// transform long to string
char numstr[21]; // enough to hold all numbers up to 64-bits
sprintf(numstr, "%"INT32"", (int32_t)m_diffbotMaxHops);
// form regEx like: hopcount>3
char hopcountStr[30];
strcpy(hopcountStr, "hopcount>");
strcat(hopcountStr, numstr);
m_regExs[i].set(hopcountStr);
// means DELETE :
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
// just don't spider
m_maxSpidersPerRule[i] = 0;
// compatibility with m_spiderRoundStartTime:
m_spiderFreqs[i] = 0.0;
i++;
}
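// e.g. with m_diffbotMaxHops == 3 the row built above is "hopcount>3"
// with a 0.0 respider freq and 0 max spiders, so anything deeper than
// 3 hops is never downloaded.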
// 2nd default filter
// always turn this on for now. they need to add domains they want
// to crawl as seeds so they do not spider the web.
// no because FTB seeds with link pages that link to another
// domain. they just need to be sure to supply a crawl pattern
// to avoid spidering the whole web.
//
// if they did not EXPLICITLY provide a url crawl pattern or
// url crawl regex then restrict to seeds to prevent from spidering
// the entire internet.
//if ( ! ucp && ! m_hasucr ) { // m_restrictDomain ) {
// MDW: even if they supplied a crawl pattern let's restrict to seed
// domains 12/15/14
m_regExs[i].set("!isonsamedomain && !ismanualadd");
m_maxSpidersPerRule [i] = 0;
m_spiderPriorities [i] = 100; // delete!
m_forceDelete [i] = 1;
i++;
//}
bool ucpHasPositive = false;
// . scan them to see if all patterns start with '!' or not
// . if pattern starts with ! it is negative, otherwise positve
if ( ucp ) ucpHasPositive = hasPositivePattern ( ucp );
// if no crawl regex, and it has a crawl pattern consisting of
// only negative patterns then restrict to domains of seeds
if ( ucp && ! ucpHasPositive && ! m_hasucr ) {
m_regExs[i].set("!isonsamedomain && !ismanualadd");
m_maxSpidersPerRule [i] = 0;
m_spiderPriorities [i] = 100; // delete!
m_forceDelete [i] = 1;
i++;
}
// don't bother re-spidering old pages if hopcount == maxhopcount
// and only process new urls is true. because we don't need to
// harvest outlinks from them.
if ( m_diffbotOnlyProcessIfNewUrl && m_diffbotMaxHops > 0 &&
// only crawls, not bulk jobs
m_isCustomCrawl == 1 ) {
m_regExs[i].purge();
m_regExs[i].safePrintf("isindexed && hopcount==%"INT32,
m_diffbotMaxHops );
m_spiderPriorities [i] = 14;
m_spiderFreqs [i] = 0.0;
m_maxSpidersPerRule [i] = 0; // turn off spiders
m_harvestLinks [i] = false;
i++;
}
// 3rd rule for respidering
// put this above the errocount>= rules below otherwise the crawl
// may never advance its round because it keeps retrying a ton of
// errored urls.
if ( respiderFreq > 0.0 ) {
m_regExs[i].set("lastspidertime>={roundstart}");
// do not "remove" from index
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// if doing a one-shot crawl limit error retries to 3 times or
// if no urls currently available to spider, whichever comes first.
else {
m_regExs[i].set("errorcount>=3");
m_spiderPriorities [i] = 11;
m_spiderFreqs [i] = 0.0416;
m_maxSpidersPerRule [i] = 0; // turn off spiders
i++;
}
// diffbot needs to retry even on 500 or 404 errors since sometimes
// a seed url gets a 500 error mistakenly and it halts the crawl.
// so take out "!hastmperror".
m_regExs[i].set("errorcount>=1 && !hastmperror");
m_spiderPriorities [i] = 14;
m_spiderFreqs [i] = 0.0416; // every hour
//m_maxSpidersPerRule [i] = 0; // turn off spiders if not tmp error
i++;
// and for docs that have errors respider once every 5 hours
m_regExs[i].set("errorcount==1 && hastmperror");
m_spiderPriorities [i] = 40;
m_spiderFreqs [i] = 0.001; // 86 seconds
i++;
// and for docs that have errors respider once every 5 hours
m_regExs[i].set("errorcount==2 && hastmperror");
m_spiderPriorities [i] = 40;
m_spiderFreqs [i] = 0.003; // 3*86 seconds (was 24 hrs)
i++;
// excessive errors? (tcp/dns timed out, etc.) retry once per month?
m_regExs[i].set("errorcount>=3 && hastmperror");
m_spiderPriorities [i] = 39;
m_spiderFreqs [i] = .25; // 1/4 day
// if bulk job, do not download a url more than 3 times
if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0;
i++;
// if collectiverespiderfreq is 0 or less then do not RE-spider
// documents already indexed.
if ( respiderFreq <= 0.0 ) { // else {
// this does NOT work! error docs continuously respider
// because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
//m_regExs[i].set("isindexed");
m_regExs[i].set("hasreply");
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// url crawl and PAGE process pattern
if ( ucp && ! upp && ppp ) {
// if just matches ucp, just crawl it, do not process
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
i++;
// crawl everything else, but don't harvest links,
// we have to see if the page content matches the "ppp"
// to determine whether the page should be processed or not.
m_regExs[i].set("default");
m_spiderPriorities [i] = 52;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
m_harvestLinks [i] = false;
i++;
goto done;
}
// url crawl and process pattern
if ( ucp && upp ) {
m_regExs[i].set("matchesucp && matchesupp");
m_spiderPriorities [i] = 55;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// if just matches ucp, just crawl it, do not process
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
i++;
// just process, do not spider links if does not match ucp
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 54;
m_harvestLinks [i] = false;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else
m_regExs[i].set("default");
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
// don't spider
m_maxSpidersPerRule[i] = 0;
// this needs to be zero so &spiderRoundStart=0
// functionality which sets m_spiderRoundStartTime
// to the current time works
// otherwise Spider.cpp's getSpiderTimeMS() returns a time
// in the future and we can't force the round
m_spiderFreqs[i] = 0.0;
i++;
}
// harvest links if we should crawl it
if ( ucp && ! upp ) {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away.
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
// process everything since upp is empty
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else
m_regExs[i].set("default");
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
// don't delete, just don't spider
m_maxSpidersPerRule[i] = 0;
// this needs to be zero so &spiderRoundStart=0
// functionality which sets m_spiderRoundStartTime
// to the current time works
// otherwise Spider.cpp's getSpiderTimeMS() returns a time
// in the future and we can't force the round
m_spiderFreqs[i] = 0.0;
i++;
}
// just process
if ( upp && ! ucp ) {
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 54;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_harvestLinks [i] = false;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// crawl everything by default, no processing
m_regExs[i].set("default");
m_spiderPriorities [i] = 50;
// this needs to be zero so &spiderRoundStart=0
// functionality which sets m_spiderRoundStartTime
// to the current time works
// otherwise Spider.cpp's getSpiderTimeMS() returns a time
// in the future and we can't force the round
m_spiderFreqs[i] = 0.0;
i++;
}
// no restraints
if ( ! upp && ! ucp ) {
// crawl everything by default, no processing
m_regExs[i].set("default");
m_spiderPriorities [i] = 50;
// this needs to be zero so &spiderRoundStart=0
// functionality which sets m_spiderRoundStartTime
// to the current time works
// otherwise Spider.cpp's getSpiderTimeMS() returns a time
// in the future and we can't force the round
m_spiderFreqs[i] = 0.0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
done:
m_numRegExs = i;
m_numRegExs2 = i;
m_numRegExs3 = i;
m_numRegExs10 = i;
m_numRegExs5 = i;
m_numRegExs6 = i;
//m_numRegExs7 = i;
m_numRegExs8 = i;
m_numRegExs7 = i;
//m_numRegExs11 = i;
//char *x = "http://staticpages.diffbot.com/testCrawl/article1.html";
//if(m_hasupr && regexec(&m_upr,x,0,NULL,0) ) { char *xx=NULL;*xx=0; }
return true;
}
// . anytime the url filters are updated, this function is called
// . it is also called on load of the collection at startup
bool CollectionRec::rebuildUrlFilters ( ) {
if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
log("coll: Rebuilding url filters for %s ufp=%s",m_coll,
m_urlFiltersProfile.getBufStart());
// if not a custom crawl, and no expressions, add a default one
//if ( m_numRegExs == 0 && ! m_isCustomCrawl ) {
// setUrlFiltersToDefaults();
//}
// if not a custom crawl then set the url filters based on
// the url filter profile, if any
if ( ! m_isCustomCrawl )
rebuildUrlFilters2();
// set this so we know whether we have to keep track of page counts
// per subdomain/site and per domain. if the url filters have
// 'sitepages' 'domainpages' 'domainadds' or 'siteadds' we have to keep
// the count table SpiderColl::m_pageCountTable.
m_urlFiltersHavePageCounts = false;
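	// scan every rule; a simple substring hit on any of these four
	// keywords (e.g. a rule containing "sitepages>100") is enough to
	// turn the flag on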
for ( int32_t i = 0 ; i < m_numRegExs ; i++ ) {
// get the ith rule
SafeBuf *sb = &m_regExs[i];
char *p = sb->getBufStart();
if ( strstr(p,"sitepages") ||
strstr(p,"domainpages") ||
strstr(p,"siteadds") ||
strstr(p,"domainadds") ) {
m_urlFiltersHavePageCounts = true;
break;
}
}
	// if the collection is brand new and we are being called from
	// addNewColl() then sc will be NULL
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(m_collnum);
// . do not do this at startup
// . this essentially resets doledb
if ( g_doledb.m_rdb.m_initialized &&
// somehow this is initialized before we set m_recs[m_collnum]
// so we gotta do the two checks below...
sc &&
// must be a valid coll
m_collnum < g_collectiondb.m_numRecs &&
g_collectiondb.m_recs[m_collnum] ) {
log("coll: resetting doledb for %s (%li)",m_coll,
(long)m_collnum);
// clear doledb recs from tree
//g_doledb.getRdb()->deleteAllRecs ( m_collnum );
nukeDoledb ( m_collnum );
// add it back
//if ( ! g_doledb.getRdb()->addRdbBase2 ( m_collnum ) )
// log("coll: error re-adding doledb for %s",m_coll);
// just start this over...
// . MDW left off here
//tryToDelete ( sc );
// maybe this is good enough
//if ( sc ) sc->m_waitingTreeNeedsRebuild = true;
//CollectionRec *cr = sc->m_cr;
// . rebuild sitetable? in PageBasic.cpp.
		// . re-adds seed spider requests using msg4
// . true = addSeeds
// . no, don't do this now because we call updateSiteList()
// when we have &sitelist=xxxx in the request which will
// handle updating those tables
//updateSiteListTables ( m_collnum ,
// true ,
// cr->m_siteListBuf.getBufStart() );
}
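	// (doledb keys encode each url's spider priority, which comes from
	// the url filters, so entries doled out under the old rules would
	// carry stale priorities; nuking doledb forces a clean rebuild)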
	// always rebuild the regexes controlling which urls to crawl and
	// which to process, even if the crawl was not created by crawlbot
	rebuildDiffbotRegexes();
if ( ! m_isCustomCrawl ){
return true;
}
	// on the other hand, if it is a crawlbot job, then by convention
	// the url filters are all set to some default ones.
return rebuildUrlFiltersDiffbot();
}
// for some reason the libc we use doesn't support these shortcuts,
// so expand them to something it does support
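// e.g. a filter like "article\d+\.html" becomes "article[0-9]+\.html"
// in place, so regcomp() never sees the unsupported escapes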
bool expandRegExShortcuts ( SafeBuf *sb ) {
if ( ! sb->safeReplace3 ( "\\d" , "[0-9]" ) ) return false;
if ( ! sb->safeReplace3 ( "\\D" , "[^0-9]" ) ) return false;
if ( ! sb->safeReplace3 ( "\\l" , "[a-z]" ) ) return false;
if ( ! sb->safeReplace3 ( "\\a" , "[A-Za-z]" ) ) return false;
if ( ! sb->safeReplace3 ( "\\u" , "[A-Z]" ) ) return false;
if ( ! sb->safeReplace3 ( "\\w" , "[A-Za-z0-9_]" ) ) return false;
if ( ! sb->safeReplace3 ( "\\W" , "[^A-Za-z0-9_]" ) ) return false;
return true;
}
void testRegex ( ) {
//
// TEST
//
char *rx;
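	// three candidate test patterns; only the last assignment wins,
	// so the pattern actually compiled below is the article*.html one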
rx = "(http://)?(www.)?vault.com/rankings-reviews/company-rankings/law/vault-law-100/\\.aspx\\?pg=\\d";
rx = "(http://)?(www.)?vault.com/rankings-reviews/company-rankings/law/vault-law-100/\\.aspx\\?pg=[0-9]";
rx = ".*?article[0-9]*?.html";
regex_t ucr;
int32_t err;
if ( ( err = regcomp ( &ucr , rx ,
REG_ICASE
|REG_EXTENDED
//|REG_NEWLINE
//|REG_NOSUB
) ) ) {
// error!
char errbuf[1024];
regerror(err,&ucr,errbuf,1000);
log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,errbuf);
}
logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);
//char *url = "http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2";
char *url = "http://staticpages.diffbot.com/testCrawl/regex/article1.html";
if ( regexec(&ucr,url,0,NULL,0) )
logf(LOG_DEBUG,"db: failed to match %s on %s",
url,rx);
else
logf(LOG_DEBUG,"db: MATCHED %s on %s",
url,rx);
exit(0);
}
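// how many docs does this collection have cluster-wide? titledb holds one
// record per indexed document, so its global rec count is our answer.
// returns 0 if the titledb base is not available yet.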
int64_t CollectionRec::getNumDocsIndexed() {
RdbBase *base = getBase(RDB_TITLEDB);//m_bases[RDB_TITLEDB];
if ( ! base ) return 0LL;
return base->getNumGlobalRecs();
}
// messes with m_spiderColl->m_sendLocalCrawlInfoToHost[MAX_HOSTS]
// so we do not have to keep sending this huge msg!
bool CollectionRec::shouldSendLocalCrawlInfoToHost ( int32_t hostId ) {
if ( ! m_spiderColl ) return false;
if ( hostId < 0 ) { char *xx=NULL;*xx=0; }
if ( hostId >= g_hostdb.m_numHosts ) { char *xx=NULL;*xx=0; }
// sanity
return m_spiderColl->m_sendLocalCrawlInfoToHost[hostId];
}
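// mark our local crawl info as changed so it gets re-sent to every host.
// the per-host flags are cleared again, one host at a time, by
// sentLocalCrawlInfoToHost() below.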
void CollectionRec::localCrawlInfoUpdate() {
if ( ! m_spiderColl ) return;
// turn on all the flags
memset(m_spiderColl->m_sendLocalCrawlInfoToHost,1,g_hostdb.m_numHosts);
}
// right after we copy it for sending we set this so we do not send
// again unless localCrawlInfoUpdate() is called
void CollectionRec::sentLocalCrawlInfoToHost ( int32_t hostId ) {
if ( ! m_spiderColl ) return;
m_spiderColl->m_sendLocalCrawlInfoToHost[hostId] = 0;
}