tags into this file
sprintf(fn,"%s/%s/parse-shortdisplay.%llu.%lu.html",
g_hostdb.m_dir,testDir,h,g_test.m_runId);
// output to a special file
SafeBuf tmp;
// insert this
tmp.safeStrcpy("
\n");
// header stuff
tmp.safePrintf("\n");
// put the onclick script in there
tmp.safeStrcpy ( xd->getCheckboxScript() );
// concatenate just these sections in "sb" to "tmp"
tmp.cat2 ( sb , "" , "" );
// header stuff
tmp.safePrintf("\n\n");
// then dump
tmp.dumpToFile ( fn );
// if it had critical errors from XmlDoc::validateOutput()
// then create that file!
//if ( xd->m_validateMisses > 0 || xd->m_validateFlagged ) {
// make the critical file filename
char cf[1024];
sprintf (cf,"%s/%s/critical.%llu.%lu.txt",
g_hostdb.m_dir,testDir,h,g_test.m_runId);
// save to that
ttt.dumpToFile ( cf );
//char cmd[256];
//sprintf(cmd,"touch %s/test/critical.%llu.%lu.txt",
// g_hostdb.m_dir,h,g_test.m_runId);
//system(cmd);
// note it
//log("crazyin: %s",u->m_url );
// note it
//g_test.m_urlsAdded--;
g_test.m_urlsIndexed++;
// now in PingServer.cpp for hostid 0 it checks
// the urlsindexed from each host if g_conf.m_testParserEnabled
// is true to see if we should call g_test.stopIt()
// if that is zero we are done
//if ( g_test.m_urlsAdded == 0 && ! g_test.m_isAdding &&
// // only stop if not spidering links
// //! g_test.m_spiderLinks )
// g_conf.m_testParserEnabled )
// // wrap things up
// g_test.stopIt();
}
*/
// note it
// this should not happen any more since indexDoc() will take
// care of g_errno now by clearing it and adding an error spider
// reply to release the lock!!
if ( g_errno ) {
log("spider: ----CRITICAL CRITICAL CRITICAL----");
log("spider: ----CRITICAL CRITICAL CRITICAL----");
log("spider: ------ *** LOCAL ERROR *** ------");
log("spider: ------ *** LOCAL ERROR *** ------");
log("spider: ------ *** LOCAL ERROR *** ------");
log("spider: spidering %s has error: %s. uh48=%lli. "
"Respidering "
"in %li seconds. MAX_LOCK_AGE when lock expires.",
xd->m_firstUrl.m_url,
mstrerror(g_errno),
xd->getFirstUrlHash48(),
(long)MAX_LOCK_AGE);
log("spider: ------ *** LOCAL ERROR *** ------");
log("spider: ------ *** LOCAL ERROR *** ------");
log("spider: ------ *** LOCAL ERROR *** ------");
log("spider: ----CRITICAL CRITICAL CRITICAL----");
log("spider: ----CRITICAL CRITICAL CRITICAL----");
// don't release the lock on it right now. just let the
// lock expire on it after MAX_LOCK_AGE seconds. then it will
// be retried. we need to debug gb so these things never
// happen...
}
// breathe
QUICKPOLL ( xd->m_niceness );
// . call the final callback used for injecting urls
// . this may send a reply back so the caller knows the url
// was fully injected into the index
// . Msg7.cpp uses a callback that returns a void, so use m_callback1!
//if ( xd->m_injectionCallback && injecting ) {
// g_errno = saved;
// // use the index code as the error for PageInject.cpp
// if ( ! g_errno && xd->m_indexCode ) g_errno = xd->m_indexCode;
// xd->m_injectionCallback ( xd->m_injectionState );
//}
// we don't need this g_errno past this point
g_errno = 0;
// breathe
QUICKPOLL ( xd->m_niceness );
// did this doc get a chance to add its meta list to msg4 bufs?
//bool addedMetaList = m_docs[i]->m_listAdded;
// set this in case we need to call removeAllLocks
//m_uh48 = 0LL;
//if ( xd->m_sreqValid ) m_uh48 = xd->m_sreq.getUrlHash48();
// we are responsible for deleting doc now
mdelete ( m_docs[i] , sizeof(XmlDoc) , "Doc" );
delete (m_docs[i]);
m_docs[i] = NULL;
// we remove the spider lock from g_spiderLoop.m_lockTable in Rdb.cpp
// when it receives the negative doledb key. but if this does not
// happen, we have a problem then!
//if ( addedMetaList ) return true;
// sanity
//if ( ! m_uh48 ) { char *xx=NULL; *xx=0; }
// the lock we had in g_spiderLoop.m_lockTable for the doleKey
// is now removed in Rdb.cpp when it receives a negative dole key to
// add to doledb... assuming we added that meta list!!
// m_uh48 should be set from above
//if ( ! removeAllLocks () ) return false;
// we did not block, so return true
return true;
}
void gotLockReplyWrapper ( void *state , UdpSlot *slot ) {
// cast it
Msg12 *msg12 = (Msg12 *)state;
// . call handler
// . returns false if waiting for more replies to come in
if ( ! msg12->gotLockReply ( slot ) ) return;
// if had callback, maybe from PageReindex.cpp
if ( msg12->m_callback ) msg12->m_callback ( msg12->m_state );
// ok, try to get another url to spider
else g_spiderLoop.spiderDoledUrls();
}
Msg12::Msg12 () {
m_numRequests = 0;
m_numReplies = 0;
}
// . returns false if blocked, true otherwise.
// . returns true and sets g_errno on error
// . before we can spider for a SpiderRequest we must be granted the lock
// . each group shares the same doledb and each host in the group competes
// for spidering all those urls.
// . that way if a host goes down its load is taken over
bool Msg12::getLocks ( long long uh48, // probDocId ,
char *url ,
DOLEDBKEY *doledbKey,
collnum_t collnum,
long sameIpWaitTime,
long maxSpidersOutPerIp,
long firstIp,
void *state ,
void (* callback)(void *state) ) {
// ensure not in use. no msg12 replies outstanding.
if ( m_numRequests != m_numReplies ) { char *xx=NULL;*xx=0; }
// do not use locks for injections
//if ( m_sreq->m_isInjecting ) return true;
// get # of hosts in each mirror group
long hpg = g_hostdb.getNumHostsPerShard();
// reset
m_numRequests = 0;
m_numReplies = 0;
m_grants = 0;
m_removing = false;
m_confirming = false;
// make sure is really docid
//if ( probDocId & ~DOCID_MASK ) { char *xx=NULL;*xx=0; }
// . mask out the lower bits that may change if there is a collision
// . in this way a url has the same m_probDocId as the same url
// in the index. i.e. if we add a new spider request for url X and
// url X is already indexed, then they will share the same lock
// even though the indexed url X may have a different actual docid
// than its probable docid.
// . we now use probable docids instead of uh48 because query reindex
// in PageReindex.cpp adds docid based spider requests and we
// only know the docid, not the uh48 because it is creating
// SpiderRequests from docid-only search results. having to look
// up the msg20 summary for like 1M search results is too painful!
//m_lockKey = g_titledb.getFirstProbableDocId(probDocId);
// . use this for locking now, and let the docid-only requests just use
// the docid
m_lockKeyUh48 = makeLockTableKey ( uh48 , firstIp );
m_url = url;
m_callback = callback;
m_state = state;
m_hasLock = false;
m_origUh48 = uh48;
// support ability to spider multiple urls from same ip
m_doledbKey = *doledbKey;
m_collnum = collnum;
m_sameIpWaitTime = sameIpWaitTime;
m_maxSpidersOutPerIp = maxSpidersOutPerIp;
m_firstIp = firstIp;
// sanity check, just 6 bytes! (48 bits)
if ( uh48 & 0xffff000000000000LL ) { char *xx=NULL;*xx=0; }
if ( m_lockKeyUh48 & 0xffff000000000000LL ) { char *xx=NULL;*xx=0; }
// cache time
long ct = 120;
// if docid based assume it was a query reindex and keep it short!
// otherwise we end up waiting 120 seconds for a query reindex to
// go through on a docid we just spidered. TODO: use m_urlIsDocId
if ( url && is_digit(url[0]) ) ct = 2;
// . this seems to be messing us up and preventing us from adding new
// requests into doledb when only spidering a few IPs.
// . make it random in the case of twin contention
ct = rand() % 10;
// . check our cache to avoid repetitive asking
// . use -1 for maxAge to indicate no max age
// . returns -1 if not in cache
// . use a maxage of "ct" seconds (randomized below)
long lockTime ;
lockTime = g_spiderLoop.m_lockCache.getLong(0,m_lockKeyUh48,ct,true);
// if it was in the cache and less than "ct" seconds old then return
// true now with m_hasLock set to false.
if ( lockTime >= 0 ) {
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: cached missed lock for %s "
"lockkey=%llu", m_url,m_lockKeyUh48);
return true;
}
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: sending lock request for %s "
"lockkey=%llu", m_url,m_lockKeyUh48);
// now the locking group is based on the probable docid
//m_lockGroupId = g_hostdb.getGroupIdFromDocId(m_lockKey);
// ptr to list of hosts in the group
//Host *hosts = g_hostdb.getGroup ( m_lockGroupId );
// the same group (shard) that has the spiderRequest/Reply is
// the one responsible for locking.
Host *hosts = g_hostdb.getMyShard();
// short cut
UdpServer *us = &g_udpServer;
static long s_lockSequence = 0;
// remember the lock sequence # in case we have to call remove locks
m_lockSequence = s_lockSequence++;
LockRequest *lr = &m_lockRequest;
lr->m_lockKeyUh48 = m_lockKeyUh48;
lr->m_firstIp = m_firstIp;
lr->m_removeLock = 0;
lr->m_lockSequence = m_lockSequence;
lr->m_collnum = collnum;
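// for reference, a sketch of what goes over the wire. LockRequest is
// declared elsewhere (likely Spider.h); the field order, exact types and
// padding below are assumptions based only on the members touched in this
// file, not the real declaration:
//
//   class LockRequest {
//   public:
//           long long m_lockKeyUh48;  // makeLockTableKey(uh48,firstIp)
//           long      m_firstIp;
//           long      m_lockSequence;
//           char      m_removeLock;   // 0 = acquire, 1 = remove
//           collnum_t m_collnum;
//   };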
// reset counts
m_numRequests = 0;
m_numReplies = 0;
// point to start of the 12 byte request buffer
char *request = (char *)lr;//m_lockKey;
long requestSize = sizeof(LockRequest);//12;
// loop over hosts in that shard
for ( long i = 0 ; i < hpg ; i++ ) {
// get a host
Host *h = &hosts[i];
// skip if dead! no need to get a reply from dead guys
if ( g_hostdb.isDead (h) ) continue;
// note it
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: sent lock "
"request #%li for lockkey=%llu %s to "
"hid=%li",m_numRequests,m_lockKeyUh48,
m_url,h->m_hostId);
// send request to him
if ( ! us->sendRequest ( request ,
requestSize ,
0x12 , // msgType
h->m_ip ,
h->m_port ,
h->m_hostId ,
NULL , // retSlotPtrPtr
this , // state data
gotLockReplyWrapper ,
60*60*24*365 ) )
// udpserver returns false and sets g_errno on error
return true;
// count them
m_numRequests++;
}
// block?
if ( m_numRequests > 0 ) return false;
// i guess nothing... hmmm... all dead?
//char *xx=NULL; *xx=0;
// m_hasLock should be false... all lock hosts seem dead... wait
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: all lock hosts seem dead for %s "
"lockkey=%llu", m_url,m_lockKeyUh48);
return true;
}
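// Hypothetical caller sketch (not part of the build) showing the
// blocked/unblocked convention documented above getLocks(). The names
// "MyState", "ms" and "doneWrapper" are illustrative assumptions only;
// the real callers are the spider loop and PageReindex.cpp mentioned in
// the comments above.
//
//   static void doneWrapper ( void *state ) {
//           MyState *ms = (MyState *)state;
//           if ( ms->m_msg12.m_hasLock ) { /* got the lock, spider it */ }
//           else                         { /* missed it, try another url */ }
//   }
//   ...
//   if ( ! ms->m_msg12.getLocks ( uh48 , url , &doledbKey , collnum ,
//                                 sameIpWaitTime , maxSpidersOutPerIp ,
//                                 firstIp , ms , doneWrapper ) )
//           return false; // blocked, doneWrapper will be called later
//   // returned true: check g_errno and m_hasLock before proceeding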
// after adding the negative doledb recs to remove the url we are spidering
// from doledb, and adding the fake titledb rec to add a new entry into
// waiting tree so that our ip can have more than one outstanding spider,
// call the callback. usually msg4::addMetaList() will not block i'd guess.
void rejuvenateIPWrapper ( void *state ) {
Msg12 *THIS = (Msg12 *)state;
THIS->m_callback ( THIS->m_state );
}
// returns true if all done, false if waiting for more replies
bool Msg12::gotLockReply ( UdpSlot *slot ) {
// got reply
m_numReplies++;
// don't let udpserver free the request, it's our m_request[]
slot->m_sendBufAlloc = NULL;
// check for a hammer reply
char *reply = slot->m_readBuf;
long replySize = slot->m_readBufSize;
// if error, treat as a not grant
if ( g_errno ) {
bool logIt = true;
// note it
if ( g_conf.m_logDebugSpider )
log("spider: got msg12 reply error = %s",
mstrerror(g_errno));
// if we got an ETRYAGAIN when trying to confirm our lock
// that means doledb was saving/dumping to disk and we
// could not remove the record from doledb and add an
// entry to the waiting tree, so we need to keep trying
if ( g_errno == ETRYAGAIN && m_confirming ) {
// count it again
m_numRequests++;
// use what we were using
char *request = (char *)&m_confirmRequest;
long requestSize = sizeof(ConfirmRequest);
Host *h = g_hostdb.getHost(slot->m_hostId);
// send request to him
UdpServer *us = &g_udpServer;
if ( ! us->sendRequest ( request ,
requestSize ,
0x12 , // msgType
h->m_ip ,
h->m_port ,
h->m_hostId ,
NULL , // retSlotPtrPtr
this , // state data
gotLockReplyWrapper ,
60*60*24*365 ) )
return false;
// error?
// don't spam the log!
static long s_last = 0;
long now = getTimeLocal();
if ( now - s_last >= 1 ) {
s_last = now;
log("spider: error re-sending confirm "
"request: %s", mstrerror(g_errno));
}
}
// only log every 10 seconds for ETRYAGAIN
if ( g_errno == ETRYAGAIN ) {
static time_t s_lastTime = 0;
time_t now = getTimeLocal();
logIt = false;
if ( now - s_lastTime >= 3 ) {
logIt = true;
s_lastTime = now;
}
}
if ( logIt )
log ( "sploop: host had error getting lock url=%s"
": %s" ,
m_url,mstrerror(g_errno) );
}
// grant or not
if ( replySize == 1 && ! g_errno && *reply == 1 ) m_grants++;
// wait for all to get back
if ( m_numReplies < m_numRequests ) return false;
// all done if we were removing
if ( m_removing ) {
// note it
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: done removing all locks "
"(replies=%li) for %s",
m_numReplies,m_url);//m_sreq->m_url);
// we are done
m_gettingLocks = false;
return true;
}
// all done if we were confirming
if ( m_confirming ) {
// note it
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: done confirming all locks "
"for %s uh48=%lli",m_url,m_origUh48);//m_sreq->m_url);
// we are done
m_gettingLocks = false;
// . keep processing
// . if the collection was nuked from under us the spiderUrl2
// will return true and set g_errno
if ( ! m_callback ) return g_spiderLoop.spiderUrl2();
// if we had a callback let our parent call it
return true;
}
// if got ALL locks, spider it
if ( m_grants == m_numReplies ) {
// note it
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: got lock for docid=lockkey=%llu",
m_lockKeyUh48);
// flag this
m_hasLock = true;
// we are done
//m_gettingLocks = false;
///////
//
// now tell our group (shard) to remove from doledb
// and re-add to waiting tree. the evalIpLoop() function
// should skip this probable docid because it is in the
// LOCK TABLE!
//
// This logic should allow us to spider multiple urls
// from the same IP at the same time.
//
///////
// returns false if would block
if ( ! confirmLockAcquisition ( ) ) return false;
// . we did it without blocking, maybe cuz we are a single node
// . ok, they are all back, resume loop
// . if the collection was nuked from under us the spiderUrl2
// will return true and set g_errno
if ( ! m_callback ) g_spiderLoop.spiderUrl2 ( );
// all done
return true;
}
// note it
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: missed lock for %s lockkey=%llu "
"(grants=%li)", m_url,m_lockKeyUh48,m_grants);
// . if it was locked by another then add to our lock cache so we do
// not try to lock it again
// . if grants is not 0 then one host granted us the lock, but not
// all hosts, so we should probably keep trying on it until it is
// locked up by one host
if ( m_grants == 0 ) {
long now = getTimeGlobal();
g_spiderLoop.m_lockCache.addLong(0,m_lockKeyUh48,now,NULL);
}
// reset again
m_numRequests = 0;
m_numReplies = 0;
// no need to remove them if none were granted because another
// host in our group might have it 100% locked.
if ( m_grants == 0 ) {
// no longer in locks operation mode
m_gettingLocks = false;
// ok, they are all back, resume loop
//if ( ! m_callback ) g_spiderLoop.spiderUrl2 ( );
// all done
return true;
}
// note that
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: sending request to all in shard to "
"remove lock uh48=%llu. grants=%li",
m_lockKeyUh48,(long)m_grants);
// remove all locks we tried to get, BUT only if from our hostid!
// no no! that doesn't quite work right... we might be the ones
// locking it! i.e. another one of our spiders has it locked...
if ( ! removeAllLocks ( ) ) return false; // true;
// if did not block, how'd that happen?
log("sploop: did not block in removeAllLocks: %s",mstrerror(g_errno));
return true;
}
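// Recap of the locking protocol implemented above: getLocks() fires a
// LockRequest at every live host in our shard and gotLockReply() tallies the
// grants. If every reply granted, confirmLockAcquisition() asks the shard to
// remove the doledb rec and re-seed the waiting tree, then spidering resumes.
// A partial grant calls removeAllLocks() so a half-held lock never lingers,
// and zero grants just record the miss in g_spiderLoop.m_lockCache so we do
// not immediately re-ask for the same lock.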
bool Msg12::removeAllLocks ( ) {
// ensure not in use. no msg12 replies outstanding.
if ( m_numRequests != m_numReplies ) { char *xx=NULL;*xx=0; }
// skip if injecting
//if ( m_sreq->m_isInjecting ) return true;
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: removing all locks for %s %llu",
m_url,m_lockKeyUh48);
// we are now removing
m_removing = true;
LockRequest *lr = &m_lockRequest;
lr->m_lockKeyUh48 = m_lockKeyUh48;
lr->m_lockSequence = m_lockSequence;
lr->m_firstIp = m_firstIp;
lr->m_removeLock = 1;
// reset counts
m_numRequests = 0;
m_numReplies = 0;
// make that the request
// . point to start of the 12 byte request buffer
// . m_lockSequence should still be valid
char *request = (char *)lr;//m_lockKey;
long requestSize = sizeof(LockRequest);//12;
// now the locking group is based on the probable docid
//unsigned long groupId = g_hostdb.getGroupIdFromDocId(m_lockKeyUh48);
// ptr to list of hosts in the group
//Host *hosts = g_hostdb.getGroup ( groupId );
Host *hosts = g_hostdb.getMyShard();
// this must select the same group that is going to spider it!
// i.e. our group! because we check our local lock table to see
// if a doled url is locked before spidering it ourselves.
//Host *hosts = g_hostdb.getMyGroup();
// short cut
UdpServer *us = &g_udpServer;
// set the hi bit though for this one
//m_lockKey |= 0x8000000000000000LL;
// get # of hosts in each mirror group
long hpg = g_hostdb.getNumHostsPerShard();
// loop over hosts in that shard
for ( long i = 0 ; i < hpg ; i++ ) {
// get a host
Host *h = &hosts[i];
// skip if dead! no need to get a reply from dead guys
if ( g_hostdb.isDead ( h ) ) continue;
// send request to him
if ( ! us->sendRequest ( request ,
requestSize ,
0x12 , // msgType
h->m_ip ,
h->m_port ,
h->m_hostId ,
NULL , // retSlotPtrPtr
this , // state data
gotLockReplyWrapper ,
60*60*24*365 ) )
// udpserver returns false and sets g_errno on error
return true;
// count them
m_numRequests++;
}
// block?
if ( m_numRequests > 0 ) return false;
// did not block
return true;
}
bool Msg12::confirmLockAcquisition ( ) {
// ensure not in use. no msg12 replies outstanding.
if ( m_numRequests != m_numReplies ) { char *xx=NULL;*xx=0; }
// we are now removing
m_confirming = true;
// make that the request
// . point to start of the 12 byte request buffer
// . m_lockSequence should still be valid
ConfirmRequest *cq = &m_confirmRequest;
char *request = (char *)cq;
long requestSize = sizeof(ConfirmRequest);
// sanity. handleRequest12() dispatches on request size, so ConfirmRequest
// must not be the same size as LockRequest
if ( requestSize == sizeof(LockRequest)){ char *xx=NULL;*xx=0; }
// set it
cq->m_collnum = m_collnum;
cq->m_doledbKey = m_doledbKey;
cq->m_firstIp = m_firstIp;
cq->m_lockKeyUh48 = m_lockKeyUh48;
cq->m_maxSpidersOutPerIp = m_maxSpidersOutPerIp;
// . use the locking group from when we sent the lock request
// . get ptr to list of hosts in the group
//Host *hosts = g_hostdb.getGroup ( m_lockGroupId );
// the same group (shard) that has the spiderRequest/Reply is
// the one responsible for locking.
Host *hosts = g_hostdb.getMyShard();
// this must select the same shard that is going to spider it!
// i.e. our shard! because we check our local lock table to see
// if a doled url is locked before spidering it ourselves.
//Host *hosts = g_hostdb.getMyShard();
// short cut
UdpServer *us = &g_udpServer;
// get # of hosts in each mirror group
long hpg = g_hostdb.getNumHostsPerShard();
// reset counts
m_numRequests = 0;
m_numReplies = 0;
// note it
if ( g_conf.m_logDebugSpider )
log("spider: confirming lock for uh48=%llu firstip=%s",
m_lockKeyUh48,iptoa(m_firstIp));
// loop over hosts in that shard
for ( long i = 0 ; i < hpg ; i++ ) {
// get a host
Host *h = &hosts[i];
// skip if dead! no need to get a reply from dead guys
if ( g_hostdb.isDead ( h ) ) continue;
// send request to him
if ( ! us->sendRequest ( request ,
// a request size of sizeof(ConfirmRequest) means confirm
requestSize ,
0x12 , // msgType
h->m_ip ,
h->m_port ,
h->m_hostId ,
NULL , // retSlotPtrPtr
this , // state data
gotLockReplyWrapper ,
60*60*24*365 ) )
// udpserver returns false and sets g_errno on error
return true;
// count them
m_numRequests++;
}
// block?
if ( m_numRequests > 0 ) return false;
// did not block
return true;
}
// use -1 for any collnum
long SpiderLoop::getNumSpidersOutPerIp ( long firstIp , collnum_t collnum ) {
long count = 0;
// count locks
HashTableX *ht = &g_spiderLoop.m_lockTable;
// scan the slots
long ns = ht->m_numSlots;
for ( long i = 0 ; i < ns ; i++ ) {
// breathe
//QUICKPOLL(niceness);
// skip if empty
if ( ! ht->m_flags[i] ) continue;
// cast lock
UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i);
// skip if not outstanding, just a 5-second expiration wait
// when the spiderReply returns, so that in case a lock
// request for the same url was in progress, it will be denied.
if ( ! lock->m_spiderOutstanding ) continue;
// must be confirmed too
if ( ! lock->m_confirmed ) continue;
// correct collnum?
if ( lock->m_collnum != collnum && collnum != -1 ) continue;
// count it if it is for the ip we are interested in
if ( lock->m_firstIp == firstIp ) count++;
}
/*
for ( long i = 0 ; i <= m_maxUsed ; i++ ) {
// get it
XmlDoc *xd = m_docs[i];
// skip if empty
if ( ! xd ) continue;
// check it
if ( xd->m_firstIp == firstIp ) count++;
}
*/
return count;
}
void handleRequest12 ( UdpSlot *udpSlot , long niceness ) {
// get request
char *request = udpSlot->m_readBuf;
long reqSize = udpSlot->m_readBufSize;
// short cut
UdpServer *us = &g_udpServer;
// breathe
QUICKPOLL ( niceness );
// shortcut
char *reply = udpSlot->m_tmpBuf;
//
// . is it confirming that he got all the locks?
// . if so, remove the doledb record and dock the doleiptable count
// before adding a waiting tree entry to re-pop the doledb record
//
if ( reqSize == sizeof(ConfirmRequest) ) {
char *msg = NULL;
ConfirmRequest *cq = (ConfirmRequest *)request;
// confirm the lock
HashTableX *ht = &g_spiderLoop.m_lockTable;
long slot = ht->getSlot ( &cq->m_lockKeyUh48 );
if ( slot < 0 ) {
log("spider: got a confirm request for a key not "
"in the table! coll must have been deleted "
" or reset "
"while lock request was outstanding.");
g_errno = EBADENGINEER;
us->sendErrorReply ( udpSlot , g_errno );
return;
//char *xx=NULL;*xx=0; }
}
UrlLock *lock = (UrlLock *)ht->getValueFromSlot ( slot );
lock->m_confirmed = true;
// note that
if ( g_conf.m_logDebugSpider ) // Wait )
log("spider: got confirm lock request for ip=%s",
iptoa(lock->m_firstIp));
// get it
SpiderColl *sc = g_spiderCache.getSpiderColl(cq->m_collnum);
// make it negative
cq->m_doledbKey.n0 &= 0xfffffffffffffffeLL;
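// (doledb keys follow the rdb key convention seen in printList() below:
//  low bit set = positive record, low bit clear = negative/delete record,
//  so clearing bit 0 here turns the doled key into its own delete key)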
// and add the negative rec to doledb (deletion operation)
Rdb *rdb = &g_doledb.m_rdb;
if ( ! rdb->addRecord ( cq->m_collnum,
(char *)&cq->m_doledbKey,
NULL , // data
0 , //dataSize
1 )){ // niceness
// tree is dumping or something, probably ETRYAGAIN
if ( g_errno != ETRYAGAIN ) {
msg = "error adding neg rec to doledb";
log("spider: %s %s",msg,mstrerror(g_errno));
}
//char *xx=NULL;*xx=0;
us->sendErrorReply ( udpSlot , g_errno );
return;
}
// now remove from doleiptable since we removed from doledb
if ( sc ) sc->removeFromDoledbTable ( cq->m_firstIp );
// how many spiders outstanding for this coll and IP?
//long out=g_spiderLoop.getNumSpidersOutPerIp ( cq->m_firstIp);
// DO NOT add back to waiting tree if max spiders
// out per ip was 1 OR there was a crawldelay. but better
// yet, take care of that in the winReq code above.
// . now add to waiting tree so we add another spiderdb
// record for this firstip to doledb
// . true = callForScan
// . do not add to waiting tree if we have enough outstanding
// spiders for this ip. we will add to waiting tree when
// we receive a SpiderReply in addSpiderReply()
if ( sc && //out < cq->m_maxSpidersOutPerIp &&
// this will just return true if we are not the
// responsible host for this firstip
// DO NOT populate from this!!! say "false" here...
! sc->addToWaitingTree ( 0 , cq->m_firstIp, false ) &&
// must be an error...
g_errno ) {
msg = "FAILED TO ADD TO WAITING TREE";
log("spider: %s %s",msg,mstrerror(g_errno));
us->sendErrorReply ( udpSlot , g_errno );
return;
}
// success!!
reply[0] = 1;
us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
return;
}
// sanity check
if ( reqSize != sizeof(LockRequest) ) {
log("spider: bad msg12 request size of %li",reqSize);
us->sendErrorReply ( udpSlot , EBADREQUEST );
return;
}
// deny it if we are not synced yet! otherwise we core in
// getTimeGlobal() below
if ( ! isClockInSync() ) {
// log it so we can debug it
//log("spider: clock not in sync with host #0. so "
// "returning etryagain for lock reply");
// let admin know why we are not spidering
us->sendErrorReply ( udpSlot , ETRYAGAIN );
return;
}
LockRequest *lr = (LockRequest *)request;
//unsigned long long lockKey = *(long long *)request;
//long lockSequence = *(long *)(request+8);
// is this a remove operation? assume not
//bool remove = false;
// get top bit
//if ( lockKey & 0x8000000000000000LL ) remove = true;
// mask it out
//lockKey &= 0x7fffffffffffffffLL;
// sanity check, just 6 bytes! (48 bits)
if ( lr->m_lockKeyUh48 &0xffff000000000000LL ) { char *xx=NULL;*xx=0; }
// note it
if ( g_conf.m_logDebugSpider )
log("spider: got msg12 request uh48=%lli remove=%li",
lr->m_lockKeyUh48, (long)lr->m_removeLock);
// get time
long nowGlobal = getTimeGlobal();
// shortcut
HashTableX *ht = &g_spiderLoop.m_lockTable;
long hostId = g_hostdb.getHostId ( udpSlot->m_ip , udpSlot->m_port );
// this must be legit - sanity check
if ( hostId < 0 ) { char *xx=NULL;*xx=0; }
// remove expired locks from locktable
removeExpiredLocks ( hostId );
long long lockKey = lr->m_lockKeyUh48;
// check tree
long slot = ht->getSlot ( &lockKey ); // lr->m_lockKeyUh48 );
// put it here
UrlLock *lock = NULL;
// if there say no no
if ( slot >= 0 ) lock = (UrlLock *)ht->getValueFromSlot ( slot );
// if doing a remove operation and that was our hostid then unlock it
if ( lr->m_removeLock &&
lock &&
lock->m_hostId == hostId &&
lock->m_lockSequence == lr->m_lockSequence ) {
// note it for now
if ( g_conf.m_logDebugSpider )
log("spider: removing lock for lockkey=%llu hid=%li",
lr->m_lockKeyUh48,hostId);
// unlock it
ht->removeSlot ( slot );
// it is gone
lock = NULL;
}
// ok, at this point all remove ops return
if ( lr->m_removeLock ) {
reply[0] = 1;
us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
return;
}
/////////
//
// add new lock
//
/////////
// if lock > 1 hour old then remove it automatically!!
if ( lock && nowGlobal - lock->m_timestamp > MAX_LOCK_AGE ) {
// note it for now
log("spider: removing lock after %li seconds "
"for lockKey=%llu hid=%li",
(nowGlobal - lock->m_timestamp),
lr->m_lockKeyUh48,hostId);
// unlock it
ht->removeSlot ( slot );
// it is gone
lock = NULL;
}
// if lock still there, do not grant another lock
if ( lock ) {
// note it for now
if ( g_conf.m_logDebugSpider )
log("spider: refusing lock for lockkey=%llu hid=%li",
lr->m_lockKeyUh48,hostId);
reply[0] = 0;
us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
return;
}
// make the new lock
UrlLock tmp;
tmp.m_hostId = hostId;
tmp.m_lockSequence = lr->m_lockSequence;
tmp.m_timestamp = nowGlobal;
tmp.m_expires = 0;
tmp.m_firstIp = lr->m_firstIp;
tmp.m_collnum = lr->m_collnum;
// when the spider returns we remove its lock on reception of the
// spiderReply, however, we actually just set the m_expires time
// to 5 seconds into the future in case there is a current request
// to get a lock for that url in progress. but, we do need to
// indicate that the spider has indeed completed by setting
// m_spiderOutstanding to true. this way, addToWaitingTree() will
// not count it towards a "max spiders per IP" quota when deciding
// on if it should add a new entry for this IP.
tmp.m_spiderOutstanding = true;
// this is set when all hosts in the group (shard) have granted the
// lock and the host sends out a confirmLockAcquisition() request.
// until then we do not know if the lock will be granted by all hosts
// in the group (shard)
tmp.m_confirmed = false;
// put it into the table
if ( ! ht->addKey ( &lockKey , &tmp ) ) {
// return error if that failed!
us->sendErrorReply ( udpSlot , g_errno );
return;
}
// note it for now
if ( g_conf.m_logDebugSpider )
log("spider: granting lock for lockKey=%llu hid=%li",
lr->m_lockKeyUh48,hostId);
// grant the lock
reply[0] = 1;
us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
return;
}
// hostId is the remote hostid sending us the lock request
void removeExpiredLocks ( long hostId ) {
// when we last cleaned them out
static time_t s_lastTime = 0;
long nowGlobal = getTimeGlobalNoCore();
long niceness = MAX_NICENESS;
// only do this once per second at the most
if ( nowGlobal <= s_lastTime ) return;
// shortcut
HashTableX *ht = &g_spiderLoop.m_lockTable;
restart:
// scan the slots
long ns = ht->m_numSlots;
// . clean out expired locks...
// . if lock was there and m_expires is up, then nuke it!
// . when Rdb.cpp receives the "fake" title rec it removes the
// lock, only it just sets m_expires to a few seconds in the
// future to give the negative doledb key time to be absorbed.
// that way we don't repeat the same url we just got done spidering.
// . this happens when we launch our lock request on a url that we
// or a twin is spidering or has just finished spidering, and
// we get the lock, but we avoided the negative doledb key.
for ( long i = 0 ; i < ns ; i++ ) {
// breathe
QUICKPOLL(niceness);
// skip if empty
if ( ! ht->m_flags[i] ) continue;
// cast lock
UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i);
long long lockKey = *(long long *)ht->getKeyFromSlot(i);
// if collnum got deleted or reset
collnum_t collnum = lock->m_collnum;
if ( collnum >= g_collectiondb.m_numRecs ||
! g_collectiondb.m_recs[collnum] ) {
log("spider: removing lock from missing collnum "
"%li",(long)collnum);
goto nuke;
}
// skip if not yet expired
if ( lock->m_expires == 0 ) continue;
if ( lock->m_expires >= nowGlobal ) continue;
// note it for now
if ( g_conf.m_logDebugSpider )
log("spider: removing lock after waiting. elapsed=%li."
" lockKey=%llu hid=%li expires=%lu nowGlobal=%lu",
(nowGlobal - lock->m_timestamp),
lockKey,hostId,lock->m_expires,nowGlobal);
nuke:
// nuke the slot and possibly re-chain
ht->removeSlot ( i );
// gotta restart from the top since table may have shrunk
goto restart;
}
// store it
s_lastTime = nowGlobal;
}
/////////////////////////
///////////////////////// PAGESPIDER
/////////////////////////
// don't change name to "State" cuz that might conflict with another
class State11 {
public:
long m_numRecs;
Msg5 m_msg5;
RdbList m_list;
TcpSocket *m_socket;
HttpRequest m_r;
collnum_t m_collnum;
char *m_coll;
long m_count;
key_t m_startKey;
key_t m_endKey;
long m_minRecSizes;
bool m_done;
SafeBuf m_safeBuf;
long m_priority;
};
static bool loadLoop ( class State11 *st ) ;
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . make a web page displaying the urls we got in doledb
// . doledb is sorted by priority complement then spider time
// . do not show urls in doledb whose spider time has not yet been reached,
// so only show the urls spiderable now
// . call g_httpServer.sendDynamicPage() to send it
bool sendPageSpiderdb ( TcpSocket *s , HttpRequest *r ) {
// set up a msg5 and RdbLists to get the urls from spider queue
State11 *st ;
try { st = new (State11); }
catch ( ... ) {
g_errno = ENOMEM;
log("PageSpiderdb: new(%i): %s",
sizeof(State11),mstrerror(g_errno));
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));}
mnew ( st , sizeof(State11) , "PageSpiderdb" );
// get the priority/#ofRecs from the cgi vars
st->m_numRecs = r->getLong ("n", 20 );
st->m_r.copy ( r );
// get collection name
char *coll = st->m_r.getString ( "c" , NULL , NULL );
// get the collection record to see if they have permission
//CollectionRec *cr = g_collectiondb.getRec ( coll );
// the socket read buffer will remain until the socket is destroyed
// and "coll" points into that
st->m_coll = coll;
CollectionRec *cr = g_collectiondb.getRec(coll);
if ( cr ) st->m_collnum = cr->m_collnum;
else st->m_collnum = -1;
// set socket for replying in case we block
st->m_socket = s;
st->m_count = 0;
st->m_priority = MAX_SPIDER_PRIORITIES - 1;
// get startKeys/endKeys/minRecSizes
st->m_startKey = g_doledb.makeFirstKey2 (st->m_priority);
st->m_endKey = g_doledb.makeLastKey2 (st->m_priority);
st->m_minRecSizes = 20000;
st->m_done = false;
// returns false if blocked, true otherwise
return loadLoop ( st ) ;
}
static void gotListWrapper3 ( void *state , RdbList *list , Msg5 *msg5 ) ;
static bool sendPage ( State11 *st );
static bool printList ( State11 *st );
bool loadLoop ( State11 *st ) {
loop:
// let's get the local list for THIS machine (use msg5)
if ( ! st->m_msg5.getList ( RDB_DOLEDB ,
st->m_collnum ,
&st->m_list ,
st->m_startKey ,
st->m_endKey ,
st->m_minRecSizes ,
true , // include tree
false , // add to cache
0 , // max age
0 , // start file #
-1 , // # files
st , // callback state
gotListWrapper3 ,
0 , // niceness
true )) // do err correction
return false;
// print it. returns false on error
if ( ! printList ( st ) ) st->m_done = true;
// check if done
if ( st->m_done ) {
// send the page back
sendPage ( st );
// bail
return true;
}
// otherwise, load more
goto loop;
}
void gotListWrapper3 ( void *state , RdbList *list , Msg5 *msg5 ) {
// cast it
State11 *st = (State11 *)state;
// print it. returns false on error
if ( ! printList ( st ) ) st->m_done = true;
// check if done
if ( st->m_done ) {
// send the page back
sendPage ( st );
// bail
return;
}
// otherwise, load more
loadLoop( (State11 *)state );
}
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool printList ( State11 *st ) {
// useful
time_t nowGlobal ;
if ( isClockInSync() ) nowGlobal = getTimeGlobal();
else nowGlobal = getTimeLocal();
// print the spider recs we got
SafeBuf *sbTable = &st->m_safeBuf;
// shortcuts
RdbList *list = &st->m_list;
// row count
long j = 0;
// put it in there
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
// stop if we got enough
if ( st->m_count >= st->m_numRecs ) break;
// get the doledb key
key_t dk = list->getCurrentKey();
// update to that
st->m_startKey = dk;
// inc by one
st->m_startKey += 1;
// get spider time from that
long spiderTime = g_doledb.getSpiderTime ( &dk );
// skip if in future
if ( spiderTime > nowGlobal ) continue;
// point to the spider request *RECORD*
char *rec = list->getCurrentData();
// skip negatives
if ( (dk.n0 & 0x01) == 0 ) continue;
// count it
st->m_count++;
// what is this?
if ( list->getCurrentRecSize() <= 16 ) { char *xx=NULL;*xx=0;}
// sanity check. requests ONLY in doledb
if ( ! g_spiderdb.isSpiderRequest ( (key128_t *)rec )) {
char*xx=NULL;*xx=0;}
// get the spider rec, encapsed in the data of the doledb rec
SpiderRequest *sreq = (SpiderRequest *)rec;
// print it into sbTable
if ( ! sreq->printToTable ( sbTable,"ready",NULL,j))
return false;
// count row
j++;
}
// need to load more?
if ( st->m_count >= st->m_numRecs ||
// if the list came back short, this priority is exhausted
list->getListSize() < st->m_minRecSizes ) {
// . try next priority
// . if below 0 we are done
if ( --st->m_priority < 0 ) st->m_done = true;
// get startKeys/endKeys/minRecSizes
st->m_startKey = g_doledb.makeFirstKey2 (st->m_priority);
st->m_endKey = g_doledb.makeLastKey2 (st->m_priority);
// if we printed something, print a blank line after it
if ( st->m_count > 0 )
sbTable->safePrintf("
..."
" |
\n");
// reset for each priority
st->m_count = 0;
}
return true;
}
bool sendPage ( State11 *st ) {
// sanity check
//if ( ! g_errno ) { char *xx=NULL;*xx=0; }
//SafeBuf sb; sb.safePrintf("Error = %s",mstrerror(g_errno));
// shortcut
SafeBuf *sbTable = &st->m_safeBuf;
// generate a query string to pass to host bar
char qs[64]; sprintf ( qs , "&n=%li", st->m_numRecs );
// store the page in here!
SafeBuf sb;
sb.reserve ( 64*1024 );
g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r , qs );
// get spider coll
collnum_t collnum = g_collectiondb.getCollnum ( st->m_coll );
// and coll rec
CollectionRec *cr = g_collectiondb.getRec ( collnum );
// print reason why spiders are not active for this collection
long tmp2;
SafeBuf mb;
if ( cr ) getSpiderStatusMsg ( cr , &mb , &tmp2 );
if ( mb.length() && tmp2 != SP_INITIALIZING )
sb.safePrintf("<table cellpadding=5>"
"<tr>"
"<td>"
"<b>%s</b>"
"</td>"
"</tr>"
"</table>"
"<br>"
"\n"
, mb.getBufStart() );
// begin the table
sb.safePrintf ( "<table %s>\n"
"<tr><td colspan=50>"
"<b>Currently Spidering on This Host</b>"
" (%li spiders)"
//" (%li locks)"
"</td></tr>"
"\n"
, TABLE_STYLE
, (long)g_spiderLoop.m_numSpidersOut
//, g_spiderLoop.m_lockTable.m_numSlotsUsed
);
// the table headers so SpiderRequest::printToTable() works
if ( ! SpiderRequest::printTableHeader ( &sb , true ) ) return false;
// shortcut
XmlDoc **docs = g_spiderLoop.m_docs;
// count # of spiders out
long j = 0;
// first print the spider recs we are spidering
for ( long i = 0 ; i < (long)MAX_SPIDERS ; i++ ) {
// get it
XmlDoc *xd = docs[i];
// skip if empty
if ( ! xd ) continue;
// sanity check
if ( ! xd->m_sreqValid ) { char *xx=NULL;*xx=0; }
// grab it
SpiderRequest *oldsr = &xd->m_sreq;
// get status
char *status = xd->m_statusMsg;
// show that
if ( ! oldsr->printToTable ( &sb , status,xd,j) ) return false;
// inc count
j++;
}
// end the table
sb.safePrintf ( "</table>\n" );
sb.safePrintf ( "<br>\n" );
/*
if ( g_spiderCache.m_numMsgSamples > 0 ) {
sb.safePrintf (
""
""
""
"Proportion of Spider Time Spent in "
"Section."
" | "
"
\n",
LIGHT_BLUE ,
DARK_BLUE );
HashTableT* m = &g_spiderCache.m_spiderMsgs;
for(long i = 0; i < m->getNumSlots();i++) {
if(m->getKey(i) == 0) continue;
sb.safePrintf (
""
"%.2f%% | "
"%.0f | "
"%s | "
"
\n",
100*m->getValueFromSlot(i)/
g_spiderCache.m_numMsgSamples,
m->getValueFromSlot(i),
(char*)m->getKey(i));
}
sb.safePrintf ("
\n");
}
*/
/*
// try to put these in tool tips
// describe the various parms
sb.safePrintf (
""
""
""
"Status descriptions"
" | "
"
\n"
""
//"getting link info | performing "
" | getting site title buf | getting "
"the title and all inlinker text of the root page."
" |
"
""
"getting outlink ip vector | getting "
"ips of the outlinks. Gets from tagdb firstip "
"tag if it exists."
" |
"
""
"getting robots.txt | downloading the "
"robots.txt file for this url."
" |
"
""
"checking for dup | looking up the url's "
"docid in checksumdb to see if its content checksum "
"is in use by another indexed document from the same "
"site. Will index even if it is a dup if it has a "
"higher quality."
" |
"
""
"getting web page | downloading the web "
"page."
" |
"
""
"getting cached web page | "
"looking up the "
"old record for this url in titledb to see how the "
"content changed."
" |
"
""
"adding links | adding links from the page "
"to spiderdb. Links are distributed to the host that "
"stores them based on the hash of the link. Make sure "
"<tfndbMaxPageCacheMem> is high enough to keep "
"tfndb disk seeks down. A tfndb access is done for "
"every link added."
" |
"
"
\n\n",
LIGHT_BLUE ,
DARK_BLUE );
*/
// then spider collection
//SpiderColl *sc = g_spiderCache.m_spiderColls[collnum];
SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
//
// spiderdb rec stats, from scanning spiderdb
//
// if not there, forget about it
if ( sc ) sc->printStats ( sb );
//
// Spiders Table
//
long long totalPoints = g_stats.m_totalSpiderSuccessNew +
g_stats.m_totalSpiderErrorsNew +
g_stats.m_totalSpiderSuccessOld +
g_stats.m_totalSpiderErrorsOld;
long long totalNew = g_stats.m_totalSpiderSuccessNew +
g_stats.m_totalSpiderErrorsNew;
long long totalOld = g_stats.m_totalSpiderSuccessOld +
g_stats.m_totalSpiderErrorsOld;
double tsr = 100.00;
double nsr = 100.00;
double osr = 100.00;
if ( totalPoints > 0 ) {
tsr = 100.00*
(double)(g_stats.m_totalSpiderSuccessNew +
g_stats.m_totalSpiderSuccessOld) /
(double)totalPoints;
if ( totalNew > 0 )
nsr= 100.00*(double)(g_stats.m_totalSpiderSuccessNew) /
(double)(totalNew);
if ( totalOld > 0 )
osr= 100.00*(double)(g_stats.m_totalSpiderSuccessOld) /
(double)(totalOld);
}
long points = g_stats.m_spiderSample;
if ( points > 1000 ) points = 1000;
long sampleNew = g_stats.m_spiderNew;
long sampleOld = points - g_stats.m_spiderNew;
double tssr = 100.00;
double nssr = 100.00;
double ossr = 100.00;
if ( points > 0 ) {
tssr = 100.00*
(double)(points -
g_stats.m_spiderErrors) / (double)points ;
if ( sampleNew > 0 )
nssr = 100.00*(double)(sampleNew -
g_stats.m_spiderErrorsNew) /
(double)(sampleNew);
if ( sampleOld > 0 )
ossr = 100.00*(double)(sampleOld -
(g_stats.m_spiderErrors -
g_stats.m_spiderErrorsNew)) /
(double)(sampleOld);
}
sb.safePrintf(
"\n" ,
LIGHT_BLUE );
sb.safePrintf (
"<table %s>"
"<tr>"
"<td colspan=7 bgcolor=#%s>"
"<center><b>Spider Stats</b></center>"
"</td></tr>\n"
"<tr>"
"<td>&nbsp;</td>"
"<td><b>Total</b></td>"
"<td><b>Total New</b></td>"
"<td><b>Total Old</b></td>"
"<td><b>Sample</b></td>"
"<td><b>Sample New</b></td>"
"<td><b>Sample Old</b></td>"
"</tr>"
"<tr><td><b>Total Spiders</b></td>"
"<td>%lli</td><td>%lli</td><td>%lli</td>\n"
"<td>%li</td><td>%li</td><td>%li</td></tr>"
"\n"
//"<tr><td><b>Successful Spiders</b></td>"
//"<td>%lli</td><td>%lli</td><td>%lli</td>\n"
//"<td>%li</td><td>%li</td><td>%li</td></tr>\n"
//"<tr><td><b>Failed Spiders</b></td>"
//"<td>%lli</td><td>%lli</td><td>%lli</td>\n"
//"<td>%li</td><td>%li</td><td>%li</td></tr>\n"
"<tr><td><b>Success Rate</b></td>"
"<td>%.02f%%</td><td>%.02f%%</td>"
"<td>%.02f%%</td><td>%.02f%%</td>"
"<td>%.02f%%</td><td>%.02f%%</td></tr>"
"\n",
TABLE_STYLE,
DARK_BLUE,
totalPoints,
totalNew,
totalOld,
points,
sampleNew,
sampleOld,
//g_stats.m_totalSpiderSuccessNew +
//g_stats.m_totalSpiderSuccessOld,
//g_stats.m_totalSpiderSuccessNew,
//g_stats.m_totalSpiderSuccessOld,
//g_stats.m_spiderSuccessNew +
//g_stats.m_spiderSuccessOld,
//g_stats.m_spiderSuccessNew,
//g_stats.m_spiderSuccessOld,
//g_stats.m_totalSpiderErrorsNew +
//g_stats.m_totalSpiderErrorsOld,
//g_stats.m_totalSpiderErrorsNew,
//g_stats.m_totalSpiderErrorsOld,
//g_stats.m_spiderErrorsNew +
//g_stats.m_spiderErrorsOld,
//g_stats.m_spiderErrorsNew,
//g_stats.m_spiderErrorsOld,
tsr, nsr, osr, tssr, nssr, ossr );
long bucketsNew[65536];
long bucketsOld[65536];
memset ( bucketsNew , 0 , sizeof(bucketsNew) );
memset ( bucketsOld , 0 , sizeof(bucketsOld) );
for ( long i = 0 ; i < points; i++ ) {
long n = g_stats.m_errCodes[i];
if ( n < 0 || n > 65535 ) {
log("admin: Bad spider error code.");
continue;
}
if ( g_stats.m_isSampleNew[i] )
bucketsNew[n]++;
else
bucketsOld[n]++;
}
for ( long i = 0 ; i < 65536 ; i++ ) {
if ( g_stats.m_allErrorsNew[i] == 0 &&
g_stats.m_allErrorsOld[i] == 0 &&
bucketsNew[i] == 0 && bucketsOld[i] == 0 ) continue;
sb.safePrintf (
"<tr bgcolor=#%s>"
"<td><b>%s</b></td>"
"<td>%lli</td>"
"<td>%lli</td>"
"<td>%lli</td>"
"<td>%li</td>"
"<td>%li</td>"
"<td>%li</td>"
"</tr>\n" ,
LIGHT_BLUE,
mstrerror(i),
g_stats.m_allErrorsNew[i] +
g_stats.m_allErrorsOld[i],
g_stats.m_allErrorsNew[i],
g_stats.m_allErrorsOld[i],
bucketsNew[i] + bucketsOld[i] ,
bucketsNew[i] ,
bucketsOld[i] );
}
sb.safePrintf ( "
\n" );
// describe the various parms
/*
sb.safePrintf (
""
""
""
"Field descriptions"
" | "
"
\n"
""
"hits | The number of attempts that were "
"made by the spider to read a url from the spider "
"queue cache. | "
"
\n"
""
"misses | The number of those attempts that "
"failed to get a url to spider. | "
"
\n"
""
"cached | The number of urls that are "
"currently in the spider queue cache. | "
"
\n"
""
"water | The number of urls that were in the "
"spider queue cache at any one time, since the start "
"of the last disk scan. | "
"
\n"
""
"kicked | The number of urls that were "
"replaced in the spider queue cache with urls loaded "
"from disk, since the start of the last disk scan. | "
"
\n"
""
"added | The number of urls that were added "
"to the spider queue cache since the start of the last "
"disk scan. After a document is spidered its url "
"if often added again to the spider queue cache. | "
"
\n"
""
"attempted | The number of urls that "
"Gigablast attempted to add to the spider queue cache "
"since the start of the last disk scan. In "
"a distributed environment, urls are distributed "
"between twins so not all urls read will "
"make it into the spider queue cache. Also includes "
"spider recs attempted to be re-added to spiderdb "
"after being spidering, but usually with a different "
"spider time. | "
"
\n"
""
"nl | This is 1 iff Gigablast currently "
"needs to reload the spider queue cache from disk. | "
"
\n"
""
"rnl | This is 1 iff Gigablast currently "
"really needs to reload the spider queue cache from "
"disk. | "
"
\n"
""
"more | This is 1 iff there are urls on "
"the disk that are not in the spider queue cache. | "
"
\n"
""
"loading | This is 1 iff Gigablast is "
"currently loading this spider cache queue from "
"disk. | "
"
\n"
""
"scanned | The number of bytes that were "
"read from disk since the start of the last disk "
"scan. | "
"
\n"
""
"reads | The number of disk read "
"operations since the start of the last disk "
"scan. | "
"
\n"
""
"elapsed | The time in seconds that has "
"elapsed since the start or end of the last disk "
"scan, depending on if a scan is currently in "
"progress. | "
"
\n"
"
\n",
LIGHT_BLUE ,
DARK_BLUE );
*/
// done if no sc
if ( ! sc ) {
// get the socket
TcpSocket *s = st->m_socket;
// then we can nuke the state
mdelete ( st , sizeof(State11) , "PageSpiderdb" );
delete (st);
// erase g_errno for sending
g_errno = 0;
// now encapsulate it in html head/tail and send it off
return g_httpServer.sendDynamicPage (s, sb.getBufStart(),
sb.length() );
}
/////
//
// READY TO SPIDER table
//
/////
long ns = 0;
if ( sc ) ns = sc->m_doleIpTable.getNumSlotsUsed();
// begin the table
sb.safePrintf ( "<table %s>\n"
"<tr><td colspan=50>"
"<b>URLs Ready to Spider for collection "
"%s"
"</b>"
" (%li ips in doleiptable)"
,
TABLE_STYLE,
st->m_coll ,
ns );
// print time format: 7/23/1971 10:45:32
time_t nowUTC = getTimeGlobal();
struct tm *timeStruct ;
char time[256];
timeStruct = gmtime ( &nowUTC );
strftime ( time , 256 , "%b %e %T %Y UTC", timeStruct );
sb.safePrintf("" // (current time = %s = %lu) "
" |
\n"
//,time,nowUTC
);
// the table headers so SpiderRequest::printToTable() works
if ( ! SpiderRequest::printTableHeader ( &sb ,false ) ) return false;
// then the doledb spider recs
char *bs = sbTable->getBufStart();
if ( bs && ! sb.safePrintf("%s",bs) ) return false;
// end the table
sb.safePrintf ( "
\n" );
sb.safePrintf ( "
\n" );
/////////////////
//
// PRINT WAITING TREE
//
// each row is an ip. print the next url to spider for that ip.
//
/////////////////
sb.safePrintf ( "<table %s>\n"
"<tr><td colspan=2>"
"<b>IPs Waiting for Selection Scan for collection "
"%s"
"</b>"
,
TABLE_STYLE,
st->m_coll );
// print time format: 7/23/1971 10:45:32
long long timems = gettimeofdayInMillisecondsGlobal();
sb.safePrintf(" (current time = %llu)(totalcount=%li)"
"(waittablecount=%li) |
\n",
timems,
sc->m_waitingTree.getNumUsedNodes(),
sc->m_waitingTable.getNumUsedSlots());
sb.safePrintf("",DARK_BLUE);
sb.safePrintf("spidertime (MS) | \n");
sb.safePrintf("firstip | \n");
sb.safePrintf("
\n");
// then the waiting tree
long node = sc->m_waitingTree.getFirstNode();
long count = 0;
for ( ; node >= 0 ; node = sc->m_waitingTree.getNextNode(node) ) {
// breathe
QUICKPOLL(MAX_NICENESS);
// get key
key_t *key = (key_t *)sc->m_waitingTree.getKey(node);
// get ip from that
long firstIp = (key->n0) & 0xffffffff;
// get the time
unsigned long long spiderTimeMS = key->n1;
// shift up
spiderTimeMS <<= 32;
// or in
spiderTimeMS |= (key->n0 >> 32);
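// (so the waiting tree key packs firstIp in the low 32 bits of n0, the
//  low 32 bits of spiderTimeMS in the high 32 bits of n0, and the high
//  32 bits of spiderTimeMS in n1)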
// get the rest of the data
sb.safePrintf(""
"%llu | "
"%s | "
"
\n",
LIGHT_BLUE,
spiderTimeMS,
iptoa(firstIp));
// stop after 20
if ( ++count == 20 ) break;
}
// ...
if ( count )
sb.safePrintf(""
"... |
\n",
LIGHT_BLUE);
// end the table
sb.safePrintf ( "
\n" );
sb.safePrintf ( "
\n" );
// get the socket
TcpSocket *s = st->m_socket;
// then we can nuke the state
mdelete ( st , sizeof(State11) , "PageSpiderdb" );
delete (st);
// erase g_errno for sending
g_errno = 0;
// now encapsulate it in html head/tail and send it off
return g_httpServer.sendDynamicPage (s, sb.getBufStart(),sb.length() );
}
///////////////////////////////////
//
// URLFILTERS
//
///////////////////////////////////
/*
// assign these a value of 1 in s_table hashtable
static char *s_ypSites[] = {
"www.yellow.com",
"www.yellowpages.com",
"www.dexknows.com",
"yellowpages.aol.com",
"www.superpages.com",
"citysearch.com",
"www.yellowbook.com",
"www.magicyellow.com",
"home.digitalcity.com",
"www.switchboard.com",
"cityguide.aol.com",
"www.bizrate.com",
"www.restaurantica.com",
"www.insiderpages.com",
"local.yahoo.com"
};
// . assign these a value of 2 in s_table hashtable
// . mwells@g0:/y$ cat gobyout | awk '{print $4}' | grep -v goby.com | grep -vi goby | grep -v google.com | grep -v mappoint | urlinfo | grep "host: " | awk '{print $2}' | sort | uniq > foo
// . then take the top linked to sites on goby and print out for direct
// insertion into this file:
// then get the popular domains from THAT list:
// mwells@g0:/y$ cat foo | awk '{print $2}' | urlinfo | grep "dom: " | awk '{print $2}' | sort | uniq -c | sort > foodom
static char *s_aggSites[] = {
"isuwmsrugby.tripod.com",
"meyerlemon.eventbrite.com",
"miami.tourcorp.com",
"valentinesdaydatenightcoupleschi.eventbrite.com",
"volcano.si.edu",
"webpages.csus.edu",
"weddingextravaganza.eventbrite.com",
"www.alliancerugby.org",
"www.asuwrfc.com",
"www.btpd.org",
"www.chicagodragons.org",
"www.chsgeorgia.org",
"www.derugbyfoundation.org",
"www.foxborosportscenter.com",
"www.lynn.edu",
"www.owensboroparks.org",
"www.scitrek.org",
"www.southcarolinaparks.com",
"www.usbr.gov",
"dummil.eventbrite.com",
"jacksonvilleantiqueshow.eventbrite.com",
"kidsfest.eventbrite.com",
"piuvalentine.eventbrite.com",
"www.anytimefitness.com",
"www.dumbartonhouse.org",
"www.lsurugby.com",
"www.maliburugby.com",
"www.pitsrugby.com",
"www.renegaderugby.org",
"www.rotor.com",
"www.rugbyrats.com",
"www.sanjoserugby.com",
"www.seattleartists.com",
"www.sixflags.com",
"www.vacavillesports.com",
"atlcomedyfest.eventbrite.com",
"easyweekdaycooking.eventbrite.com",
"hartford.citysearch.com",
"healthythaicooking.eventbrite.com",
"hicaregiversconference.eventbrite.com",
"skiing.alpinezone.com",
"spirit.lib.uconn.edu",
"springfield.ettractions.com",
"tomatofest2011.eventbrite.com",
"www.abc-of-meditation.com",
"www.amf.com",
"www.atlantaharlequins.com",
"www.chicagoparkdistrict.com",
"www.denverwildfirerfc.org",
"www.gowaterfalling.com",
"www.harlequins.org",
"www.ignatius.org",
"www.masmacon.com",
"www.palmbeachrugby.org",
"www.riversiderugby.com",
"www.rmne.org",
"www.thehilliard.org",
"www.woodsmenrugby.com",
"devildoll.eventbrite.com",
"iexpectcrabfeedfundraiser.eventbrite.com",
"sports.groups.yahoo.com",
"valentinesdaycookingwithlove.eventbrite.com",
"www.agisamazing.com",
"www.ascendinglotus.com",
"www.auduboninstitute.org",
"www.azrugbyref.com",
"www.blackicerugby.com",
"www.bluegrassmuseum.org",
"www.krewerugby.com",
"www.lamorugby.com",
"www.lsue.edu",
"www.norwichrink.com",
"www.ombac.org",
"www.sdarmada.org",
"www.sirensrugby.com",
"www.tampabarbarians.org",
"www.travellanecounty.org",
"www.visit-newhampshire.com",
"hawaii.tourcorp.com",
"tasteofkorea.eventbrite.com",
"www.ballyfitness.com",
"www.calpolyrugby.com",
"www.destateparks.com",
"www.eaa.org",
"www.goldsgym.com",
"www.gonzagarugby.com",
"www.greatexplorations.org",
"www.heparks.org",
"www.imagisphere.org",
"www.jeffdavis.org",
"www.park.granitecity.com",
"www.poets.org",
"www.regis.edu",
"www.verizoncenter.com",
"mybridalsale.eventbrite.com",
"pigandsausagetoo.eventbrite.com",
"www.gaelrugby.com",
"www.independent.com",
"www.kohlchildrensmuseum.org",
"www.operaamerica.org",
"www.recration.du.edu",
"www.symmetricalskatingschool.org",
"www.telcomhistory.org",
"www.texasoutside.com",
"reagan.eureka.edu",
"stampede2011.eventbrite.com",
"synergy2011.eventbrite.com",
"theexperience2011.eventbrite.com",
"www.24hourfitness.com",
"www.dematha.org",
"www.facebook.com",
"www.iaapa.org",
"www.icelandrestoration.com",
"www.louisvillewomensrugby.com",
"www.manchesterrunningcompany.com",
"www.moaonline.org",
"www.pvicechalet.com",
"www.rendlake.com",
"attinuptown.eventbrite.com",
"chocolateanddessertfantasy.eventbrite.com",
"colorado.ettractions.com",
"longbeachstaterugby.webs.com",
"volcano.oregonstate.edu",
"www.columbiaspacescience.org",
"www.eventful.com",
"eventful.com",
"www.newmexico.org",
"www.rmparks.org",
"www.sbyouthrugby.org",
"www.venturacountyrugbyclub.com",
"www.wheatonicearena.com",
"faithorigins.eventbrite.com",
"jerseyshore.metromix.com",
"stlouis.citysearch.com",
"valentinesdaydatenightcooking.eventbrite.com",
"www.floridarugbyunion.com",
"www.rugbyatucf.com",
"www.stingrayrugby.com",
"www.usfbullsrugby.com",
"atlanta.going.com",
"klsnzwineday.eventbrite.com",
"losangeles.citysearch.com",
"sourdough.eventbrite.com",
"valentinesdaygourmetdating.eventbrite.com",
"web.mit.edu",
"www.airmuseum.org",
"www.eparugby.org",
"www.navicache.com",
"www.siliconvalleyrugby.org",
"www.yale.edu",
"rhodeisland.ettractions.com",
"studentorgs.vanderbilt.edu",
"www.jaxrugby.org",
"www.orlandomagazine.com",
"www.plnurugby.com",
"www.recreation.du.edu",
"www.riversideraptors.com",
"www.usarchery.org",
"cacspringfling.eventbrite.com",
"dallas.going.com",
"groups.northwestern.edu",
"hpualumniiphonelaunchparty.eventbrite.com",
"juliachild.eventbrite.com",
"southbaysciencesymposium2011.eventbrite.com",
"www.curugby.com",
"www.everyoneruns.net",
"www.glendalerugby.com",
"www.phantomsyouthrugby.org",
"www.usdrugby.com",
"10000expo-sponsoship-nec.eventbrite.com",
"greenville.metromix.com",
"spssan.eventbrite.com",
"www.cmaathletics.org",
"www.csulb.edu",
"www.doralrugby.com",
"www.neworleansrugbyclub.com",
"www.sos.louisiana.gov",
"www.southbayrugby.org",
"www.travelnevada.com",
"www.uicrugbyclub.org",
"www.atlantabucksrugby.org",
"www.dinodatabase.com",
"www.fest21.com",
"www.georgiatechrugby.com",
"www.gsuwomensrugby.com",
"www.siuwomensrugby.com",
"www.snowtracks.com",
"www.trainweb.com",
"www.visitnebraska.gov",
"www.visitsanantonio.com",
"hometown.aol.com",
"next2normal.eventbrite.com",
"sixmonthpassatlanta2011.eventbrite.com",
"winejazz2.eventbrite.com",
"www.amityrugby.org",
"www.meetandplay.com",
"www.miami.edu",
"www.miamirugby.com",
"www.phillipscollection.org",
"www.tridentsrugby.com",
"wwwbloggybootcampsandiego.eventbrite.com",
"whale-watching.gordonsguide.com",
"www.culturemob.com",
"www.denver-rugby.com",
"www.hillwoodmuseum.org",
"www.peabody.yale.edu",
"www.yoursciencecenter.com",
"newyorkcity.ettractions.com",
"rawfoodcert.eventbrite.com",
"www.discoverydepot.org",
"www.dukecityrugbyclub.com",
"www.jazztimes.com",
"www.kissimmeeairmuseum.com",
"www.southstreetseaportmuseum.org",
"www.wsbarbariansrugby.com",
"beerunch2011.eventbrite.com",
"milwaukee.ettractions.com",
"seminoletampa.casinocity.com",
"silveroak.eventbrite.com",
"tsunamifitclub.eventbrite.com",
"walking-tours.gordonsguide.com",
"www.alamedarugby.com",
"www.atshelicopters.com",
"www.camelbackrugby.com",
"www.dlshs.org",
"www.eteamz.com",
"newyork.ettractions.com",
"www.allaboutrivers.com",
"www.childrensmuseumatl.org",
"www.hartfordroses.org",
"www.nationalparks.org",
"www.seahawkyouthrugby.com",
"www.skiingthebackcountry.com",
"epcontinental.eventbrite.com",
"healthandwellnessshow.eventbrite.com",
"www.apopkamuseum.org",
"www.condorsrugby.com",
"www.dcr.virginia.gov",
"www.diabloyouthrugby.org",
"www.rockandice.com",
"honolulu.metromix.com",
"mowcrabfeed2011.eventbrite.com",
"ptt-superbowl.eventbrite.com",
"whitewater-rafting.gordonsguide.com",
"winearomatraining.eventbrite.com",
"www.broadway.com",
"www.usc.edu",
"www.gatorrugby.com",
"www.iumudsharks.net",
"www.scrrs.net",
"www.sfggrugby.com",
"www.unco.edu",
"hctmspring2011conference.eventbrite.com",
"sandiego.going.com",
"www.crt.state.la.us",
"www.foodhistorynews.com",
"www.lancerrugbyclub.org",
"www.littlerockrugby.com",
"www.sharksrugbyclub.com",
"www.channelislandsice.com",
"www.idealist.org",
"www.mbtykesrugby.com",
"katahdicon.eventbrite.com",
"foodwineloversfestival.eventbrite.com",
"maristeveningseries2011.eventbrite.com",
"philadelphia.ettractions.com",
"sugarrushla.eventbrite.com",
"www.chicagolions.com",
"www.skatingsafe.com",
"www.themeparkinsider.com",
"fremdcraftfairspring2011.eventbrite.com",
"gorptravel.away.com",
"minnesota.ettractions.com",
"www.chicagohopeacademy.org",
"www.fmcicesports.com",
"www.kitebeaches.com",
"www.mixedmartialarts.com",
"www.slatermill.org",
"www.sunnysideoflouisville.org",
"www.visitrochester.com",
"careshow.eventbrite.com",
"massachusetts.ettractions.com",
"edwardianla2011.eventbrite.com",
"indianapolis.metromix.com",
"www.pasadenamarathon.org",
"washington.going.com",
"www.sjquiltmuseum.org",
"www.wannakitesurf.com",
"fauwomensrugby.sports.officelive.com",
"newhampshire.ettractions.com",
"www.vcmha.org",
"milwaukee.going.com",
"phoenix.going.com",
"www.anrdoezrs.net",
"www.temperugby.com",
"pampermefabulous2011.eventbrite.com",
"www.napavalleyvineyards.org",
"r4k11.eventbrite.com",
"ramonamusicfest.eventbrite.com",
"www.abc-of-rockclimbing.com",
"www.geocities.com",
"jackson.metromix.com",
"www.santamonicarugby.com",
"cleveland.metromix.com",
"lancaster.ettractions.com",
"www.fortnet.org",
"www.horseandtravel.com",
"www.pubcrawler.com",
"kdwp.state.ks.us",
"www.berkeleyallblues.com",
"www.liferugby.com",
"www.socalmedicalmuseum.org",
"www.dcsm.org",
"www.sutler.net",
"desmoines.metromix.com",
"www.cavern.com",
"www.dotoledo.org",
"www.fws.gov",
"www.ghosttowngallery.com",
"www.museumamericas.org",
"www.museumsofboston.org",
"www.northshorerugby.com",
"geocaching.gpsgames.org",
"www.americaeast.com",
"www.cwrfc.org",
"www.jewelryshowguide.com",
"www.livelytimes.com",
"www.pascorugbyclub.com",
"www.westminsterice.com",
"www.claremontrugby.org",
"www.jugglingdb.com",
"www.metalblade.com",
"www.preservationnation.org",
"sofla2011.eventbrite.com",
"www.belmonticeland.com",
"www.dropzone.com",
"www.smecc.org",
"www.studentgroups.ucla.edu",
"www.visitdetroit.com",
"honolulu.going.com",
"sippingandsaving5.eventbrite.com",
"www.connecticutsar.org",
"www.guestranches.com",
"www.nvtrailmaps.com",
"www.visitnh.gov",
"illinois.ettractions.com",
"www.spymuseum.org",
"www.ci.riverside.ca.us",
"www.hbnews.us",
"www.santaclarayouthrugby.com",
"www.thestranger.com",
"www.freewebs.com",
"www.miamirugbykids.com",
"www.mtwashingtonvalley.org",
"www.ocbucksrugby.com",
"bridalpaloozala.eventbrite.com",
"maps.yahoo.com",
"www.azstateparks.com",
"www.paywindowpro.com",
"www.rowadventures.com",
"parksandrecreation.idaho.gov",
"www.artsmemphis.org",
"www.lasvegasweekly.com",
"www.redmountainrugby.org",
"san-francisco.tourcorp.com",
"www.khsice.com",
"www.vansenusauto.com",
"quinceanerasmagazineoc.eventbrite.com",
"www.mvc-sports.com",
"www.tbsa.com",
"www.travelportland.com",
"rtnpilgrim.eventbrite.com",
"www.bigfishtackle.com",
"www.centralmass.org",
"cpca2011.eventbrite.com",
"www.matadorrecords.com",
"www.sebabluegrass.org",
"prescott.showup.com",
"vintagevoltage2011.eventbrite.com",
"www.seattleperforms.com",
"www.valleyskating.com",
"resetbootcamp.eventbrite.com",
"www.abc-of-mountaineering.com",
"www.snocountry.com",
"events.nytimes.com",
"www.icecenter.net",
"www.livefrommemphis.com",
"www.pasadenarfc.com",
"www.ucsdrugby.com",
"uclaccim.eventbrite.com",
"www.visitchesapeake.com",
"www.natureali.org",
"www.nordicskiracer.com",
"www.nowplayingva.org",
"www.sbcounty.gov",
"www.seedesmoines.com",
"www.world-waterfalls.com",
"denver.going.com",
"hearstmuseum.berkeley.edu",
"www.lmurugby.com",
"www.ftlrugby.com",
"www.pelicanrugby.com",
"rtnharthighschool.eventbrite.com",
"www.visitri.com",
"www.aba.org",
"www.americaonice.us",
"www.thecontemporary.org",
"www.wherigo.com",
"www.drtopo.com",
"www.visitseattle.org",
"calendar.dancemedia.com",
"trips.outdoors.org",
"www.chs.org",
"www.myneworleans.com",
"www.oaklandice.com",
"nashville.metromix.com",
"www.americangolf.com",
"www.fossilmuseum.net",
"www.oakparkparks.com",
"www.visit-maine.com",
"www.oregonlive.com",
"www.allwashingtondctours.com",
"www.wannadive.net",
"www.sportsheritage.org",
"hudsonvalley.metromix.com",
"www.scificonventions.com",
"www.wildernessvolunteers.org",
"essencemusicfestival.eventbrite.com",
"www.kitesurfatlas.com",
"www.ndtourism.com",
"valentinesgourmetdatingchicago.eventbrite.com",
"www.fingerlakeswinecountry.com",
"www.dmnh.org",
"www.ticketnetwork.com",
"partystroll.eventbrite.com",
"www.bedandbreakfastnetwork.com",
"www.sternmass.org",
"www.visitnh.com",
"www.places2ride.com",
"www.hawaiieventsonline.com",
"www.ucirugby.com",
"www.gohawaii.com",
"www.writersforum.org",
"www.roadracingworld.com",
"www.bigisland.org",
"www.boatbookings.com",
"www.lhs.berkeley.edu",
"www.dnr.state.mn.us",
"www.mostateparks.com",
"www.historicnewengland.org",
"www.waza.org",
"www.backbayrfc.com",
"newyork.metromix.com",
"www.larebellion.org",
"teetimes.golfhub.com",
"10000expo-sponsoship-ceg.eventbrite.com",
"10000expo-sponsor-bjm.eventbrite.com",
"parks.ky.gov",
"www.bostonusa.com",
"www.visitbuffaloniagara.com",
"www.sharksice.com",
"2011burbankapprentice.eventbrite.com",
"kansascity.ettractions.com",
"www.bicycling.com",
"www.cityofchino.org",
"www.ridingworld.com",
"www.whittierrugby.com",
"10000bestjobsam.eventbrite.com",
"www.adventurecentral.com",
"www.earlymusic.org",
"www.upcomingevents.com",
"www.sleddogcentral.com",
"www.capecodkidz.com",
"www.collectorsguide.com",
"www.cougarrugby.org",
"www.sfvrugby.com",
"strivetothrivepabcconf.eventbrite.com",
"www.visithoustontexas.com",
"www.authorstrack.com",
"www.aboutgolfschools.org",
"www.huntingspotz.com",
"www.lib.az.us",
"members.aol.com",
"www.fs.fed.us",
"www.ncarts.org",
"www.vermonttravelplanner.org",
//"www.scubadiving.com",
"www.waterfallsnorthwest.com",
"www.philadelphiausa.travel",
"www.usgolfschoolguide.com",
"njgin.state.nj.us",
"www.artcards.cc",
"www.rimonthly.com",
"www.atlanta.net",
"www.glacialgardens.com",
"2011superbowlcruise.eventbrite.com",
"swimming-with-dolphins.gordonsguide.com",
"www.trackpedia.com",
// why was this in there?
//"www.dailyherald.com",
"www.nhm.org",
"boston.ettractions.com",
"www.geneseefun.com",
"www.travelsd.com",
"www.golfbuzz.com",
"www.in.gov",
"cincinnati.metromix.com",
"www.sanjose.com",
"brevard.metromix.com",
"www.dogsledrides.com",
"www.orvis.com",
"philadelphia.going.com",
"twincities.metromix.com",
"www.orlandorugby.com",
"www.csufrugby.com",
"www.larugby.com",
"www.washingtonwine.org",
"calendar.gardenweb.com",
"gulfcoast.metromix.com",
"florida.ettractions.com",
"www.northeastwaterfalls.com",
"www.computerhistory.org",
"www.ct.gov",
"www.hosteltraveler.com",
"www.thinkrentals.com",
"www.4x4trailhunters.com",
"www.cityweekly.net",
"www.yourrunning.com",
"www.spasofamerica.com",
"www.indoorclimbing.com",
"www.utah.com",
"boston.going.com",
"minneapolisstpaul.ettractions.com",
"www.coolrunning.com",
"www.greensboronc.org",
"www.michigan.org",
"www.artfestival.com",
"www.divespots.com",
"www.oregonstateparks.org",
"www.virginiawine.org",
"www.morebeach.com",
"www.minnesotamonthly.com",
"www.texasescapes.com",
"www.usatf.org",
"www.findrentals.com",
"www.hachettebookgroup.com",
"www.racesonline.com",
"www.usace.army.mil",
"web.georgia.org",
"detroit.metromix.com",
"www.homebrewersassociation.org",
"www.baltimore.org",
"www.gastateparks.org",
"www.arkansasstateparks.com",
"www.visitlasvegas.com",
"www.whenwerv.com",
"www.chilicookoff.com",
"www.bikeride.com",
"www.eaglerockrugby.com",
"www.pickwickgardens.com",
"flagstaff.showup.com",
"miami.going.com",
"www.anchorage.net",
"www.wlra.us",
"www.thetrustees.org",
"www.artnet.com",
"www.mthoodterritory.com",
"www.hihostels.com",
"www.bfa.net",
"167.102.232.26",
"www.flyins.com",
"www.stepintohistory.com",
"www.festing.com",
"www.pursuetheoutdoors.com",
"newyork.going.com",
"www.fishingguidenetwork.com",
"www.visit-massachusetts.com",
"www.visitindy.com",
"www.washingtonpost.com",
"www.greatamericandays.com",
"www.washingtonian.com",
"national.citysearch.com",
"www.infohub.com",
"www.productionhub.com",
"www.events.org",
"www.traveliowa.com",
"www.findmyadventure.com",
"delaware.metromix.com",
"www.marinmagazine.com",
"us.penguingroup.com",
"www.bicycletour.com",
"www.travelok.com",
"www.scububble.com",
"www.childrensmuseums.org",
"www.conventionscene.com",
"www.scubaspots.com",
"www.tnvacation.com",
"stlouis.ettractions.com",
"www.mxparks.com",
"florida.greatestdivesites.com",
"www.nowplayingaustin.com",
"www.skinnyski.com",
"www.sportoften.com",
"www.zvents.com",
"www.visitphoenix.com",
"palmsprings.metromix.com",
"upcoming.yahoo.com",
"www.washington.org",
"www.balloonridesacrossamerica.com",
"www.playbill.com",
"palmbeach.ettractions.com",
"louisville.metromix.com",
"www.animecons.com",
"www.findanartshow.com",
"www.usef.org",
"www.villagevoice.com",
"www.discovergold.org",
"www.georgiaoffroad.com",
"www.memphistravel.com",
"dc.metromix.com",
"www.aplf-planetariums.info",
"www.skateisi.com",
"www.usacycling.org",
"www.wine-compass.com",
"www.visitdelaware.com",
"tucson.metromix.com",
"www.happycow.net",
"www.indiecraftshows.com",
"www.gethep.net",
"www.agritourismworld.com",
"stlouis.metromix.com",
"phoenix.metromix.com",
"stream-flow.allaboutrivers.com",
"www.festivalsandevents.com",
"www.winemcgee.com",
"www.aurcade.com",
"www.visitjacksonville.com",
"www.nashvillescene.com",
"www.4x4trails.net",
"www.americancraftmag.org",
"blog.danceruniverse.com",
"www.vacationrealty.com",
"www.californiasciencecenter.org",
"www.rollerhome.com",
"www.atvsource.com",
"www.hotairballooning.com",
"www.freeskateparks.com",
"www.ruralbounty.com",
"connecticut.ettractions.com",
"www.localattractions.com",
"www.skategroove.com",
"www.hawaiitours.com",
"www.visitrhodeisland.com",
"www.swac.org",
"www.swimmingholes.org",
"www.roadfood.com",
"www.gotriadscene.com",
"www.runnersworld.com",
"www.outerquest.com",
"www.seattleweekly.com",
"www.onlyinsanfrancisco.com",
"www.bikereg.com",
"www.artslant.com",
"www.louisianatravel.com",
"www.operabase.com",
"www.stepintoplaces.com",
"www.vinarium-usa.com",
"www.visitconnecticut.com",
"www.abc-of-mountainbiking.com",
"www.wannask8.com",
"www.xcski.org",
"www.active-days.org",
"www.hawaiiactivities.com",
"www.massvacation.com",
"www.uspa.org",
"miami.ettractions.com",
"www.abc-of-hiking.com",
"www.bestofneworleans.com",
"www.phillyfunguide.com",
"www.beermonthclub.com",
"www.newenglandwaterfalls.com",
"www.lake-link.com",
"www.festivalfinder.com",
"www.visitmississippi.org",
"www.lanierbb.com",
"www.thepmga.com",
"www.skitown.com",
"www.fairsandfestivals.net",
"sanfrancisco.going.com",
"www.koa.com",
"www.wildlifeviewingareas.com",
"www.boatrenting.com",
"www.nowplayingutah.com",
"www.ultimaterollercoaster.com",
"www.findacraftfair.com",
"www.ababmx.com",
"www.abc-of-skiing.com",
"www.pw.org",
"tampabay.metromix.com",
"www.onthesnow.com",
"www.sunny.org",
"www.visitnewengland.com",
"atlanta.metromix.com",
"www.allaboutapples.com",
"www.monsterjam.com",
"www.bnbfinder.com",
"www.sandiego.org",
"www.worldcasinodirectory.com",
"www.yoga.com",
"www.1-800-volunteer.org",
"www.visitkc.com",
"www.theskichannel.com",
"www.thephoenix.com",
"www.virginia.org",
"www.avclub.com",
"www.orlandoinfo.com",
"www.trustedtours.com",
"www.peakradar.com",
"web.minorleaguebaseball.com",
"www.artshound.com",
"www.daytonabeach.com",
"chicago.going.com",
"www.cetaceanwatching.com",
"www.citypages.com",
"www.nowplayingnashville.com",
"www.discoverlosangeles.com",
"www.ratebeer.com",
"www.harpercollins.com",
"www.seenewengland.com",
"www.visitmt.com",
"www.goldstar.com",
"www.caverbob.com",
"www.sanjose.org",
"www.backcountrysecrets.com",
"authors.simonandschuster.com",
"rafting.allaboutrivers.com",
"chicago.ettractions.com",
"iweb.aam-us.org",
"www.theputtingpenguin.com",
"www.festivals.com",
"www.artsboston.org",
"www.aboutskischools.com",
"tucson.showup.com",
"www.thiswaytothe.net",
"www.rei.com",
"www.magicseaweed.com",
"www.waterfallswest.com",
"fortlauderdale.ettractions.com",
"www.foodreference.com",
"www.californiawineryadvisor.com",
"www.teamap.com",
"www.neworleanscvb.com",
"www.skatetheory.com",
"www.visitmaine.com",
"www.rollerskating.org",
"www.culturecapital.com",
"www.delawarescene.com",
"www.nyc-arts.org",
"www.huntingoutfitters.net",
"www.showcaves.com",
"www.soccerbars.com",
"www.visitnewportbeach.com",
"www.beerme.com",
"www.pitch.com",
"www.museum.com",
"www.hauntworld.com",
"www.forestcamping.com",
"www.dogpark.com",
"www.critterplaces.com",
"www.visitnj.org",
"www.findagrave.com",
"www.arcadefly.com",
"www.winerybound.com",
"www.usms.org",
"www.zipscene.com",
"www.horsetraildirectory.com",
"www.coaster-net.com",
"www.anaheimoc.org",
"www.visitpa.com",
"www.antiquetrader.com",
"www.dallasobserver.com",
"www.eventsetter.com",
"www.goingoutside.com",
"www.sightseeingworld.com",
"www.artlog.com",
"www.bnbstar.com",
"www.hostels.com",
"www.theartnewspaper.com",
"consumer.discoverohio.com",
"www.nssio.org",
"www.wingshootingusa.org",
"www.shootata.com",
"www.randomhouse.com",
"www.artforum.com",
"www.bachtrack.com",
"www.wayspa.com",
"www.visitidaho.org",
"www.exploreminnesota.com",
"chicago.metromix.com",
"www.worldgolf.com",
"nysparks.state.ny.us",
"www.meetup.com",
"www.skateboardparks.com",
"www.downtownjacksonville.org",
"www.lighthousefriends.com",
"www.strikespots.com",
"ww2.americancanoe.org",
"www.inlandarts.com",
"www.horseshowcentral.com",
"www.ridingresource.com",
"www.experiencewa.com",
"database.thrillnetwork.com",
"denver.metromix.com",
"www.bostoncentral.com",
"www.segwayguidedtours.com",
"www.colorado.com",
"www.artandseek.org",
"www.floridastateparks.org",
"www.sparkoc.com",
"losangeles.going.com",
"www.motorcycleevents.com",
"www.destination-store.com",
"www.scubadviser.com",
"www.booktour.com",
"www.cloud9living.com",
"www.allaboutjazz.com",
"www.sacramento365.com",
"www.discoversouthcarolina.com",
"www.riverfronttimes.com",
"www.hauntedhouses.com",
"www.arenamaps.com",
"www.artsnwct.org",
"www.eventbrite.com",
"animal.discovery.com",
"www.eatfeats.com",
"www.1001seafoods.com",
"www.malletin.com",
"www.yelp.com",
"www.wannasurf.com",
"www.clubplanet.com",
"www.dupagecvb.com",
"www.smartdestinations.com",
"www.artfaircalendar.com",
"www.excitations.com",
"www.balloonrideus.com",
"www.extravagift.com",
"www.skisite.com",
"www.orlandoweekly.com",
"www.iloveny.com",
"www.sandiegoreader.com",
"web.usarugby.org",
"www.artscalendar.com",
"www.sfweekly.com",
"store-locator.barnesandnoble.com",
"www.realhaunts.com",
"trails.mtbr.com",
"www.bbonline.com",
"www.pickyourownchristmastree.org",
"events.myspace.com",
"www.alabama.travel",
"www.ctvisit.com",
"freepages.history.rootsweb.com",
"www.waterparks.com",
"www.flavorpill.com",
"www.marinasdirectory.org",
"www.publicgardens.org",
"www.alwaysonvacation.com",
"www.infosports.com",
"www.summitpost.org",
"www.exploregeorgia.org",
"www.brewerysearch.com",
"www.phoenixnewtimes.com",
"www.marinas.com",
"www.arestravel.com",
"www.gamebirdhunts.com",
"www.cbssports.com",
"tutsan.forest.net",
"www.azcentral.com",
"www.tennispulse.org",
"www.westword.com",
"www.factorytoursusa.com",
"www.americanwhitewater.org",
"www.spamagazine.com",
"www.dogparkusa.com",
"tps.cr.nps.gov",
"www.sfstation.com",
"www.abc-of-yoga.com",
"www.worldeventsguide.com",
"www.active.com",
"www.beerexpedition.com",
"www.iloveinns.com",
"www.warpig.com",
"www.artsopolis.com",
"www.skatepark.com",
"www.offroadnorthamerica.com",
"www.visitflorida.com",
"www.last.fm",
"www.pbplanet.com",
"www.traveltex.com",
"phoenix.showup.com",
"www.travelandleisure.com",
"www.kentuckytourism.com",
"www.gospelgigs.com",
"www.whenwegetthere.com",
"www.surfline.com",
"www.stubhub.com",
"www.centerstagechicago.com",
"www.sunshineartist.com",
"www.reserveamerica.com",
"www.clubzone.com",
"www.paddling.net",
"www.xperiencedays.com",
"www.razorgator.com",
"www.dalejtravis.com",
"www.pickyourown.org",
"www.localhikes.com",
"www.parks.ca.gov",
"www.casinocity.com",
"www.nofouls.com",
"www.laweekly.com",
"www.denver.org",
"www.enjoyillinois.com",
"www.livenation.com",
"www.viator.com",
"members.bikeleague.org",
"www.skatespotter.com",
"family.go.com",
"www.myspace.com",
"www.takemefishing.org",
"www.localwineevents.com",
"www.rinkdirectory.com",
"www.walkjogrun.net",
"www.nps.gov",
"www.ghosttowns.com",
"www.theatermania.com",
"www.skateboardpark.com",
"www.miaminewtimes.com",
"www.explorechicago.org",
"www.ocweekly.com",
"www.ustasearch.com",
"www.rateclubs.com",
"www.tennismetro.com",
"www.motorcyclemonster.com",
"www.hauntedhouse.com",
"www.pumpkinpatchesandmore.org",
"www.courtsoftheworld.com",
"www.ecoanimal.com",
"www.yogafinder.com",
"www.traillink.com",
"www.equinenow.com",
"www.jambase.com",
"www.spaemergency.com",
//"www.vacationhomerentals.com",
"www.ava.org",
"affiliate.isango.com",
"www.museumland.net",
"www.dirtworld.com",
"www.rockclimbing.com",
"www.kijubi.com",
"www.outdoortrips.info",
"www.visitcalifornia.com",
"www.heritagesites.com",
"www.bedandbreakfast.com",
"www.discoveramerica.com",
"www.singletracks.com",
"www.museumstuff.com",
"www.opentable.com",
"www.homeaway.com",
"www.thegolfcourses.net",
"www.golflink.com",
"www.trekaroo.com",
"gocitykids.parentsconnect.com",
"www.wildernet.com",
"www.10best.com",
"swim.isport.com",
"www.wheretoshoot.org",
"www.hostelworld.com",
"www.landbigfish.com",
"www.recreation.gov",
"www.healthclubdirectory.com",
"www.spafinder.com",
"www.nationalregisterofhistoricplaces.com",
"www.americantowns.com",
"www.hmdb.org",
"www.golfnow.com",
"www.grandparents.com",
"www.swimmersguide.com",
"www.luxergy.com",
"activities.wildernet.com",
"events.mapchannels.com",
"www.museumsusa.org",
"www.rinktime.com",
"www.rentandorbuy.com",
"www.mytravelguide.com",
"playspacefinder.kaboom.org",
"www.famplosion.com",
"www.eviesays.com",
"www.anglerweb.com",
"www.trails.com",
"www.waymarking.com",
"www.priceline.com",
"local.yahoo.com",
"ticketmaster.com",
// rss feeds
"trumba.com",
// movie times:
"cinemark.com",
// domains (hand selected from above list filtered with urlinfo)
"patch.com",
"gordonsguide.com",
"tourcorp.com",
"americangolf.com",
"casinocity.com",
"going.com",
"metromix.com",
"ettractions.com",
"citysearch.com",
"eventbrite.com"
};
*/
/*
static HashTableX s_table;
static bool s_init = false;
static char s_buf[25000];
static long s_craigsList;
bool initAggregatorTable ( ) {
// this hashtable is used for "isyellowpages" and "iseventaggregator"
if ( s_init ) return true;
// use niceness 0
s_table.set(4,1,4096,s_buf,25000,false,0,"spsitetbl");
// now stock it with yellow pages sites
long n = (long)sizeof(s_ypSites)/ sizeof(char *);
for ( long i = 0 ; i < n ; i++ ) {
char *s = s_ypSites[i];
long slen = gbstrlen ( s );
long h32 = hash32 ( s , slen );
char val = 1;
if ( ! s_table.addKey(&h32,&val)) {char*xx=NULL;*xx=0;}
}
// then stock with event aggregator sites
n = (long)sizeof(s_aggSites)/ sizeof(char *);
for ( long i = 0 ; i < n ; i++ ) {
char *s = s_aggSites[i];
long slen = gbstrlen ( s );
long h32 = hash32 ( s , slen );
char val = 2;
if ( ! s_table.addKey(&h32,&val)) {char*xx=NULL;*xx=0;}
}
// do not repeat this
s_init = true;
s_craigsList = hash32n("craigslist.org");
return true;
}
bool isAggregator ( long siteHash32,long domHash32,char *url,long urlLen ) {
// make sure its stocked
initAggregatorTable();
// is site a hit?
char *v = (char *)s_table.getValue ( &siteHash32 );
// hit?
if ( v && *v ) return true;
// try domain?
v = (char *)s_table.getValue ( &domHash32 );
// hit?
if ( v && *v ) return true;
// these guys mirror eventful.com's db so let's grab it...
// abcd.com
if ( urlLen>30 &&
url[11]=='t' &&
url[18]=='o' &&
strncmp(url,"http://www.thingstodoin",23) == 0 )
return true;
// craigslist
if ( domHash32 == s_craigsList && strstr(url,".com/cal/") )
return true;
// otherwise, no
return false;
}
*/
#define SIGN_EQ 1
#define SIGN_NE 2
#define SIGN_GT 3
#define SIGN_LT 4
#define SIGN_GE 5
#define SIGN_LE 6
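// comparison operators parsed out of numeric url-filter constraints, e.g.
// "hopcount<=2" or "sitenuminlinks>=300" (illustrative rule fragments, not
// taken from any particular config), map to one of the SIGN_* values above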
// from PageBasic.cpp
char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) ;
// . this is called by SpiderCache.cpp for every url it scans in spiderdb
// . we must skip certain rules in getUrlFilterNum() when doing this for Msg20
// because things like "parentIsRSS" can be either true or false since a url
// can have multiple spider recs associated with it!
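// . each rule line in cr->m_regExs is a list of "&&"-separated constraints.
// some made-up examples, for illustration only:
// "isaddurl && hopcount<=1"
// "tld==cn,ru && !ismanualadd"
// "lastspidertime<{roundstart}"
// "default"
// . a leading '!' negates a boolean constraint, and the last rule should
// always be "default" so that every url matches some row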
long getUrlFilterNum2 ( SpiderRequest *sreq ,
SpiderReply *srep ,
long nowGlobal ,
bool isForMsg20 ,
long niceness ,
CollectionRec *cr ,
bool isOutlink ,
HashTableX *quotaTable ) {
// convert lang to string
char *lang = NULL;
long langLen = 0;
if ( srep ) {
// getLanguageAbbr() can return NULL on corruption
lang = getLanguageAbbr ( srep->m_langId );
if ( lang ) langLen = gbstrlen(lang);
}
char *tld = (char *)-1;
long tldLen;
long urlLen = sreq->getUrlLen();
char *url = sreq->m_url;
char *row;
bool checkedRow = false;
//SpiderColl *sc = cr->m_spiderColl;
SpiderColl *sc = g_spiderCache.getSpiderColl(cr->m_collnum);
//if ( strstr(url,"http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2" ))
// log("hey");
//initAggregatorTable();
//long tldlen2;
//char *tld2 = getTLDFast ( sreq->m_url , &tldlen2);
//bool bad = true;
//if ( tld2[0] == 'c' && tld2[1] == 'o' && tld2[2]=='m' ) bad = false;
//if ( tld2[0] == 'o' && tld2[1] == 'r' && tld2[2]=='g' ) bad = false;
//if ( tld2[0] == 'u' && tld2[1] == 's' ) bad = false;
//if ( tld2[0] == 'g' && tld2[1] == 'o' && tld2[2]=='v' ) bad = false;
//if ( tld2[0] == 'e' && tld2[1] == 'd' && tld2[2]=='u' ) bad = false;
//if ( tld2[0] == 'i' && tld2[1] == 'n' && tld2[2]=='f' ) bad = false;
//if ( bad )
// log("hey");
// shortcut
char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
char *upp = cr->m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
if ( ucp && ! ucp[0] ) ucp = NULL;
// get the compiled regular expressions
regex_t *ucr = &cr->m_ucr;
regex_t *upr = &cr->m_upr;
if ( ! cr->m_hasucr ) ucr = NULL;
if ( ! cr->m_hasupr ) upr = NULL;
char *ext;
char *special;
// CONSIDER COMPILING FOR SPEED:
// 1) each command can be combined into a bitmask on the spiderRequest
// bits, or an access to m_siteNumInlinks, or a substring match
// 2) put all the strings we got into the list of Needles
// 3) then generate the list of needles the SpiderRequest/url matches
// 4) then reduce each line to a list of needles to have, a
// min/max/equal siteNumInlinks, min/max/equal hopCount,
// and a bitMask to match the bit flags in the SpiderRequest
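// a possible compiled form per the sketch above (hypothetical only, NOT
// implemented here):
// struct CompiledRule {
// long m_flagMask , m_flagValues ; // SpiderRequest bit flags to match
// long m_minSiteInlinks , m_maxSiteInlinks ;
// long m_minHopCount , m_maxHopCount ;
// long m_firstNeedle , m_numNeedles ; // substrings the url must contain
// };
// each line in cr->m_regExs would reduce to one such record so matching
// becomes a few integer compares instead of re-parsing the rule text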
// stop at first regular expression it matches
for ( long i = 0 ; i < cr->m_numRegExs ; i++ ) {
// breathe
QUICKPOLL ( niceness );
// get the ith rule
SafeBuf *sb = &cr->m_regExs[i];
//char *p = cr->m_regExs[i];
char *p = sb->getBufStart();
checkNextRule:
// skip leading whitespace
while ( *p && isspace(*p) ) p++;
// do we have a leading '!'
bool val = 0;
if ( *p == '!' ) { val = 1; p++; }
// skip whitespace after the '!'
while ( *p && isspace(*p) ) p++;
// new rules for when to download (diffbot) page
if ( *p == 'm' &&
p[1]== 'a' &&
p[2]== 't' &&
p[3]== 'c' &&
p[4]== 'h' &&
p[5]== 'e' &&
p[6]== 's' &&
p[7]== 'u' &&
p[8]== 'c' &&
p[9]== 'p' ) {
// . skip this expression row if the url does not match
// . url must match one of the patterns in there.
// . inline this for speed
// . "ucp" is a ||-separated list of substrings
// . "ucr" is a regex
// . regexec returns 0 for a match
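// . e.g. a ucp of "/product/||/review/" (made-up example) lets any url
// containing either substring through this constraint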
if ( ucr && regexec(ucr,url,0,NULL,0) &&
// seed or other manual addition always matches
! sreq->m_isAddUrl &&
! sreq->m_isInjecting )
continue;
// do not require a match on ucp if ucr is given
if ( ucp && ! ucr &&
! doesStringContainPattern(url,ucp) &&
// seed or other manual addition always matches
! sreq->m_isAddUrl &&
! sreq->m_isInjecting )
continue;
p += 10;
p = strstr(p,"&&");
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// new rules for when to "process" (diffbot) page
if ( *p == 'm' &&
p[1]== 'a' &&
p[2]== 't' &&
p[3]== 'c' &&
p[4]== 'h' &&
p[5]== 'e' &&
p[6]== 's' &&
p[7]== 'u' &&
p[8]== 'p' &&
p[9]== 'p' ) {
// . skip this expression row if the url does not match
// . url must match one of the patterns in there.
// . inline this for speed
// . "upp" is a ||-separated list of substrings
// . "upr" is a regex
// . regexec returns 0 for a match
if ( upr && regexec(upr,url,0,NULL,0) )
continue;
if ( upp && ! upr && ! doesStringContainPattern(url,upp) )
continue;
p += 10;
p = strstr(p,"&&");
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( *p=='h' && strncmp(p,"hasauthorityinlink",18) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// skip if not valid (pageaddurl? injection?)
if ( ! sreq->m_hasAuthorityInlinkValid ) continue;
// if no match continue
if ( (bool)sreq->m_hasAuthorityInlink==val)continue;
// skip
p += 18;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( *p=='h' && strncmp(p,"hascontactinfo",14) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// skip if not valid (pageaddurl? injection?)
if ( ! sreq->m_hasContactInfoValid ) continue;
// if no match continue
if ( (bool)sreq->m_hasContactInfo==val ) continue;
// skip
p += 14;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( *p=='h' && strncmp(p,"hasaddress",10) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// skip if not valid (pageaddurl? injection?)
if ( ! srep->m_hasAddressValid ) continue;
// if no match continue
if ( (bool)srep->m_hasAddress==val ) continue;
// skip
p += 10;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( *p=='h' && strncmp(p,"hastod",6) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// skip if not valid (pageaddurl? injection?)
if ( ! srep->m_hasTODValid ) continue;
// if no match continue
if ( (bool)srep->m_hasTOD==val ) continue;
// skip
p += 6;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( *p=='h' && strncmp(p,"hasreply",8) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// if we got a reply, we are not new!!
//if ( (bool)srep == (bool)val ) continue;
if ( (bool)(sreq->m_hadReply) == (bool)val ) continue;
// skip it for speed
p += 8;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// hastmperror: matches if the last spider reply had an error
// like EDNSTIMEDOUT or ETCPTIMEDOUT or some other
// usually-temporary condition that warrants a retry
if ( *p=='h' && strncmp(p,"hastmperror",11) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// get our error code
long errCode = srep->m_errCode;
// . make it zero if not tmp error
// . now have EDOCUNCHANGED and EDOCNOGOODDATE from
// Msg13.cpp, so don't count those here...
if ( errCode != EDNSTIMEDOUT &&
errCode != ETCPTIMEDOUT &&
errCode != EDNSDEAD &&
// assume diffbot is temporarily experiencing errs
errCode != EDIFFBOTINTERNALERROR &&
// out of memory while crawling?
errCode != ENOMEM &&
errCode != ENETUNREACH &&
errCode != EHOSTUNREACH )
errCode = 0;
// if no match continue
if ( (bool)errCode == val ) continue;
// skip
p += 11;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
/*
if ( *p=='h' && strncmp(p,"hassitevenue",12) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// skip if not valid (pageaddurl? injection?)
if ( ! sreq->m_hasSiteVenueValid ) continue;
// if no match continue
if ( (bool)sreq->m_hasSiteVenue==val ) continue;
// allow "!isindexed" if no SpiderReply at all
//if ( ! srep && val == 0 ) continue;
// skip
p += 12;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
*/
if ( *p != 'i' ) goto skipi;
if ( strncmp(p,"isinjected",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_isInjecting==val ) continue;
// skip
p += 10;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp(p,"isdocidbased",12) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_urlIsDocId==val ) continue;
// skip
p += 12;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp(p,"iscontacty",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// skip if not valid
if ( ! sreq->m_isContactyValid ) continue;
// if no match continue
if ( (bool)sreq->m_isContacty==val ) continue;
// skip
p += 10;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// is it in the big list of sites?
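// e.g. a "!insitelist" constraint (illustrative) matches urls that hit
// no pattern in the collection's site list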
if ( strncmp(p,"insitelist",10) == 0 ) {
// skip for msg20
//if ( isForMsg20 ) continue;
if ( ! checkedRow ) {
// only do once for speed
checkedRow = true;
// this function is in PageBasic.cpp
row = getMatchingUrlPattern ( sc, sreq );
}
// if presence in the site list does not match, skip
if ( (bool)row == val ) continue;
// skip
p += 10;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// . was it submitted from PageAddUrl.cpp?
// . replaces the "add url priority" parm
if ( strncmp(p,"isaddurl",8) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if we are not submitted from the add url api, skip
if ( (bool)sreq->m_isAddUrl == val ) continue;
// skip
p += 8;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( p[0]=='i' && strncmp(p,"ismanualadd",11) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// . if we were not manually added (add url, injection, page parser), skip
// . if we have '!' then val is 1
if ( sreq->m_isAddUrl ||
sreq->m_isInjecting ||
sreq->m_isPageParser ) {
if ( val ) continue;
}
else {
if ( ! val ) continue;
}
// skip
p += 11;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// does it have an rss inlink? we want to expedite indexing
// of such pages, i.e. pages that we gather from an rss feed
// that we got from a pingserver...
if ( strncmp(p,"isparentrss",11) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if we have no such inlink
if ( (bool)sreq->m_parentIsRSS == val ) continue;
// skip
p += 11;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
/*
if ( strncmp(p,"isparentindexed",16) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if we have no such inlink
if ( (bool)sreq->m_wasParentIndexed == val ) continue;
// skip
p += 16;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
*/
// we can now handle this guy since we have the latest
// SpiderReply, pretty much guaranteed
if ( strncmp(p,"isindexed",9) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// skip if reply does not KNOW because of an error
// since XmlDoc::indexDoc() called
// XmlDoc::getNewSpiderReply() and did not have this
// info...
if ( srep && (bool)srep->m_isIndexedINValid ) continue;
// if no match continue
if ( srep && (bool)srep->m_isIndexed==val ) continue;
// allow "!isindexed" if no SpiderReply at all
if ( ! srep && val == 0 ) continue;
// skip
p += 9;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp(p,"ingoogle",8) == 0 ) {
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// skip if not valid (pageaddurl? injection?)
if ( ! sreq->m_inGoogleValid ) continue;
// if no match continue
if ( (bool)sreq->m_inGoogle == val ) continue;
// allow "!isindexed" if no SpiderReply at all
if ( ! srep && val == 0 ) continue;
// skip
p += 8;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// . check to see if a page is linked to by
// www.weblogs.com/shortChanges.xml and if it is we put
// it into a queue that has a respider rate no faster than
// 30 days, because we don't need to spider it quick since
// it is in the ping server!
if ( strncmp(p,"isparentpingserver",18) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_parentIsPingServer == val) continue;
// skip
p += 18;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp(p,"ispingserver",12) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
if ( (bool)sreq->m_isPingServer == val ) continue;
// skip
p += 12;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp ( p , "isonsamesubdomain",17 ) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
if ( val == 0 &&
sreq->m_parentHostHash32 != sreq->m_hostHash32 )
continue;
if ( val == 1 &&
sreq->m_parentHostHash32 == sreq->m_hostHash32 )
continue;
p += 17;
p = strstr(p, "&&");
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp ( p , "isonsamedomain",14 ) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
if ( val == 0 &&
sreq->m_parentDomHash32 != sreq->m_domHash32 )
continue;
if ( val == 1 &&
sreq->m_parentDomHash32 == sreq->m_domHash32 )
continue;
p += 14;
p = strstr(p, "&&");
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// jpg JPG gif GIF wmv mpg css etc.
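// e.g. a rule of just "ismedia" (illustrative) matches style/image/media
// urls by extension, plus anything containing ".css?" or "/print/"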
if ( strncmp ( p , "ismedia",7 ) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// check the extension
if ( urlLen<=5 ) continue;
ext = url + urlLen - 4;
if ( ext[0] == '.' ) {
if ( to_lower_a(ext[1]) == 'c' &&
to_lower_a(ext[2]) == 's' &&
to_lower_a(ext[3]) == 's' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'm' &&
to_lower_a(ext[2]) == 'p' &&
to_lower_a(ext[3]) == 'g' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'p' &&
to_lower_a(ext[2]) == 'n' &&
to_lower_a(ext[3]) == 'g' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'w' &&
to_lower_a(ext[2]) == 'm' &&
to_lower_a(ext[3]) == 'v' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'w' &&
to_lower_a(ext[2]) == 'a' &&
to_lower_a(ext[3]) == 'v' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'j' &&
to_lower_a(ext[2]) == 'p' &&
to_lower_a(ext[3]) == 'g' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'g' &&
to_lower_a(ext[2]) == 'i' &&
to_lower_a(ext[3]) == 'f' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'i' &&
to_lower_a(ext[2]) == 'c' &&
to_lower_a(ext[3]) == 'o' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'm' &&
to_lower_a(ext[2]) == 'p' &&
to_lower_a(ext[3]) == '3' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'm' &&
to_lower_a(ext[2]) == 'p' &&
to_lower_a(ext[3]) == '4' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'm' &&
to_lower_a(ext[2]) == 'o' &&
to_lower_a(ext[3]) == 'v' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'a' &&
to_lower_a(ext[2]) == 'v' &&
to_lower_a(ext[3]) == 'i' )
goto gotOne;
}
else if ( ext[-1] == '.' ) {
if ( to_lower_a(ext[0]) == 'm' &&
to_lower_a(ext[1]) == 'p' &&
to_lower_a(ext[2]) == 'e' &&
to_lower_a(ext[3]) == 'g' )
goto gotOne;
if ( to_lower_a(ext[0]) == 'j' &&
to_lower_a(ext[1]) == 'p' &&
to_lower_a(ext[2]) == 'e' &&
to_lower_a(ext[3]) == 'g' )
goto gotOne;
}
// two letter extensions
else if ( ext[1] == '.' ) {
if ( to_lower_a(ext[2]) == 'g' &&
to_lower_a(ext[3]) == 'z' )
goto gotOne;
}
// check for ".css?" substring
special = strstr(url,".css?");
if ( special ) goto gotOne;
special = strstr(url,"/print/");
if ( special ) goto gotOne;
// no match, try the next rule
continue;
gotOne:
p += 7;
p = strstr(p, "&&");
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
// check for "isrss" aka "rss"
if ( strncmp(p,"isrss",5) == 0 ) {
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// if we are not rss, we do not match this rule
if ( (bool)srep->m_isRSS == val ) continue;
// skip it
p += 5;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// check for permalinks. for new outlinks we *guess* if it's
// a permalink by calling the isPermalink() function.
if (!strncmp(p,"ispermalink",11) ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// if we are not a permalink, we do not match this rule
if ( (bool)srep->m_isPermalink == val ) continue;
// skip it
p += 11;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// supports LF_ISPERMALINK bit for outlinks that *seem* to
// be permalinks but might not
if (!strncmp(p,"ispermalinkformat",17) ) {
// if the url is not in permalink format, we do not match this rule
if ( (bool)sreq->m_isUrlPermalinkFormat ==val)continue;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// check for this
if ( strncmp(p,"isnewoutlink",12) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// skip if we do not match this rule
if ( (bool)sreq->m_isNewOutlink == val ) continue;
// skip it
p += 12;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// check for this
if ( strncmp(p,"isnewrequest",12) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// skip if we are a new request and val is 1 (has '!')
if ( ! srep && val ) continue;
// skip if we are a new request and val is 1 (has '!')
if ( srep && sreq->m_addedTime >  srep->m_spideredTime && val )
continue;
// skip if we are old and val is 0 (does not have '!')
if ( srep && sreq->m_addedTime <= srep->m_spideredTime && ! val )
continue;
// skip it for speed
p += 12;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// kinda like isnewrequest, but has no reply. use hasreply?
if ( strncmp(p,"isnew",5) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// if we got a reply, we are not new!!
if ( (bool)sreq->m_hadReply != (bool)val ) continue;
// skip it for speed
p += 5;
// check for &&
p = strstr(p, "&&");
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// iswww, means url is like www.xyz.com/...
if ( strncmp(p,"iswww", 5) == 0 ) {
// now this is a bit flag on the SpiderRequest
if ( (bool)sreq->m_isWWWSubdomain == (bool)val )
continue;
/*
// skip "iswww"
p += 5;
// skip over http:// or https://
char *u = sreq->m_url;
if ( u[4] == ':' ) u += 7;
if ( u[5] == ':' ) u += 8;
// url MUST be a www url
char isWWW = 0;
if( u[0] == 'w' &&
u[1] == 'w' &&
u[2] == 'w' ) isWWW = 1;
// skip if no match
if ( isWWW == val ) continue;
*/
// TODO: fix www.knightstown.skepter.com
// maybe just have a bit in the spider request
// another rule?
p = strstr(p,"&&");
if ( ! p ) return i;
// skip the '&&'
p += 2;
goto checkNextRule;
}
// non-boolean junk
skipi:
// . we always match the "default" reg ex
// . this line must ALWAYS exist!
if ( *p=='d' && ! strcmp(p,"default" ) )
return i;
// set the sign
char *s = p;
// skip s to after
while ( *s && is_alpha_a(*s) ) s++;
// skip white space before the operator
//char *saved = s;
while ( *s && is_wspace_a(*s) ) s++;
char sign = 0;
if ( *s == '=' ) {
s++;
if ( *s == '=' ) s++;
sign = SIGN_EQ;
}
else if ( *s == '!' && s[1] == '=' ) {
s += 2;
sign = SIGN_NE;
}
else if ( *s == '<' ) {
s++;
if ( *s == '=' ) { sign = SIGN_LE; s++; }
else sign = SIGN_LT;
}
else if ( *s == '>' ) {
s++;
if ( *s == '=' ) { sign = SIGN_GE; s++; }
else sign = SIGN_GT;
}
// skip whitespace after the operator
while ( *s && is_wspace_a(*s) ) s++;
// seed counts. how many seeds this subdomain has. 'siteadds'
if ( *p == 's' &&
p[1] == 'i' &&
p[2] == 't' &&
p[3] == 'e' &&
p[4] == 'a' &&
p[5] == 'd' &&
p[6] == 'd' &&
p[7] == 's' ) {
// need a quota table for this
if ( ! quotaTable ) continue;
// a special hack: this key counts seeds so we can use the same tbl
long h32 = sreq->m_siteHash32 ^ 0x123456;
long *valPtr =(long *)quotaTable->getValue(&h32);
long a;
// if no count in table, that is strange, i guess
// skip for now???
// this happens if INJECTING a url from the
// "add url" function on homepage
if ( ! valPtr ) a=0;//continue;//{char *xx=NULL;*xx=0;}
// shortcut
else a = *valPtr;
//log("siteadds=%li for %s",a,sreq->m_url);
// what is the provided value in the url filter rule?
long b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// domain seeds. 'domainadds'
if ( *p == 'd' &&
p[1] == 'o' &&
p[2] == 'm' &&
p[3] == 'a' &&
p[4] == 'i' &&
p[5] == 'n' &&
p[6] == 'a' &&
p[7] == 'd' &&
p[8] == 'd' &&
p[9] == 's' ) {
// need a quota table for this
if ( ! quotaTable ) continue;
// a special hack: this key counts seeds so we can use the same tbl
long h32 = sreq->m_domHash32 ^ 0x123456;
long *valPtr ;
valPtr = (long *)quotaTable->getValue(&h32);
// if no count in table, that is strange, i guess
// skip for now???
long a;
if ( ! valPtr ) a = 0;//{ char *xx=NULL;*xx=0; }
else a = *valPtr;
// what is the provided value in the url filter rule?
long b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// new quotas. 'sitepages' = pages from site.
// 'sitepages > 20 && siteadds <= 1 --> FILTERED'
if ( *p == 's' &&
p[1] == 'i' &&
p[2] == 't' &&
p[3] == 'e' &&
p[4] == 'p' &&
p[5] == 'a' &&
p[6] == 'g' &&
p[7] == 'e' &&
p[8] == 's' ) {
// need a quota table for this
if ( ! quotaTable ) continue;
long *valPtr = (long *)quotaTable->
getValue(&sreq->m_siteHash32);
// if no count in table, that is strange, i guess
// skip for now???
long a;
if ( ! valPtr ) a = 0;//{ char *xx=NULL;*xx=0; }
else a = *valPtr;
// shortcut
//log("sitepgs=%li for %s",a,sreq->m_url);
// what is the provided value in the url filter rule?
long b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// domain quotas. 'domainpages > 10 && hopcount >= 1 --> FILTERED'
if ( *p == 'd' &&
p[1] == 'o' &&
p[2] == 'm' &&
p[3] == 'a' &&
p[4] == 'i' &&
p[5] == 'n' &&
p[6] == 'p' &&
p[7] == 'a' &&
p[8] == 'g' &&
p[9] == 'e' &&
p[10] == 's' ) {
// need a quota table for this
if ( ! quotaTable ) continue;
long *valPtr ;
valPtr=(long*)quotaTable->getValue(&sreq->m_domHash32);
// if no count in table, that is strange, i guess
// skip for now???
long a;
if ( ! valPtr ) a = 0;//{ char *xx=NULL;*xx=0; }
else a = *valPtr;
// what is the provided value in the url filter rule?
long b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// tld==cn (comma-separated lists like tld==us,uk also work)
if ( *p=='t' && strncmp(p,"tld",3)==0){
// set it on demand
if ( tld == (char *)-1 )
tld = getTLDFast ( sreq->m_url , &tldLen );
// no match if we have no tld. might be an IP only url,
// or not in our list in Domains.cpp::isTLD()
if ( ! tld || tldLen == 0 ) continue;
// set these up
//char *a = tld;
//long alen = tldLen;
char *b = s;
// loop for the comma-separated list of tlds
// like tld==us,uk,fr,it,de
subloop1:
// get length of it in the regular expression box
char *start = b;
while ( *b && !is_wspace_a(*b) && *b!=',' ) b++;
long blen = b - start;
//char sm;
// if we had tld==com,org,...
if ( sign == SIGN_EQ &&
blen == tldLen &&
strncasecmp(start,tld,tldLen)==0 )
// if we matched any, that's great
goto matched1;
// if it's tld!=com,org,...
// and we equal the string, then we do not match this
// particular rule!!!
if ( sign == SIGN_NE &&
blen == tldLen &&
strncasecmp(start,tld,tldLen)==0 )
// we do not match this rule if we matched
// any of the tlds in the != list
continue;
// might have another tld in a comma-separated list
if ( *b != ',' ) {
// if that was the end of the list and the
// sign was == then skip this rule
if ( sign == SIGN_EQ ) continue;
// otherwise, if the sign was != then we win!
if ( sign == SIGN_NE ) goto matched1;
// otherwise, bad sign?
continue;
}
// advance to next tld if there was a comma after us
b++;
// and try again
goto subloop1;
// otherwise
// do we match, if not, try next regex
//sm = strncasecmp(a,b,blen);
//if ( sm != 0 && sign == SIGN_EQ ) goto miss1;
//if ( sm == 0 && sign == SIGN_NE ) goto miss1;
// come here on a match
matched1:
// we matched, now look for &&
p = strstr ( b , "&&" );
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
// come here if we did not match the tld
}
// lang==en,zh_cn
if ( *p=='l' && strncmp(p,"lang",4)==0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// skip if unknown? no, we support "xx" as unknown now
//if ( srep->m_langId == 0 ) continue;
// set these up
char *b = s;
// loop for the comma-separated list of langids
// like lang==en,es,...
subloop2:
// get length of it in the regular expression box
char *start = b;
while ( *b && !is_wspace_a(*b) && *b!=',' ) b++;
long blen = b - start;
//char sm;
// if we had lang==en,es,...
if ( sign == SIGN_EQ &&
blen == langLen &&
strncasecmp(start,lang,langLen)==0 )
// if we matched any, that's great
goto matched2;
// if it's lang!=en,es,...
// and we equal the string, then we do not match this
// particular rule!!!
if ( sign == SIGN_NE &&
blen == langLen &&
strncasecmp(start,lang,langLen)==0 )
// we do not match this rule if we matched
// any of the langs in the != list
continue;
// might have another in the comma-separated list
if ( *b != ',' ) {
// if that was the end of the list and the
// sign was == then skip this rule
if ( sign == SIGN_EQ ) continue;
// otherwise, if the sign was != then we win!
if ( sign == SIGN_NE ) goto matched2;
// otherwise, bad sign?
continue;
}
// advance to next list item if was a comma after us
b++;
// and try again
goto subloop2;
// come here on a match
matched2:
// we matched, now look for &&
p = strstr ( b , "&&" );
// if nothing, else then it is a match
if ( ! p ) return i;
// skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
// come here if we did not match the lang
}
// hopcount == 20 [&&]
if ( *p=='h' && strncmp(p, "hopcount", 8) == 0){
// skip if not valid
if ( ! sreq->m_hopCountValid ) continue;
// shortcut
long a = sreq->m_hopCount;
// make it point to the priority
long b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// the last time it was spidered
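// e.g. "lastspidertime<{roundstart}" (illustrative) matches docs whose
// last spider attempt happened before the current spider round started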
if ( *p=='l' && strncmp(p,"lastspidertime",14) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
long a = 0;
// if no spider reply we can't match this rule!
if ( ! srep ) continue;
// shortcut
if ( srep ) a = srep->m_spideredTime;
// make it point to the retry count
long b ;
// now "s" can be "{roundstart}"
if ( s[0]=='{' && strncmp(s,"{roundstart}",12)==0)
b = cr->m_spiderRoundStartTime;//Num;
else
b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
if ( *p=='e' && strncmp(p,"errorcount",10) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// skip for msg20
if ( isForMsg20 ) continue;
// reply based
if ( ! srep ) continue;
// shortcut
long a = srep->m_errCount;
// make it point to the retry count
long b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// siteNumInlinks >= 300 [&&]
if ( *p=='s' && strncmp(p, "sitenuminlinks", 14) == 0){
// these are -1 if they are NOT valid
long a1 = sreq->m_siteNumInlinks;
// only assign if valid
long a2 = -1; if ( srep ) a2 = srep->m_siteNumInlinks;
// stays -1 if neither value below is valid
long a = -1;
// assign to the first valid one
if ( a1 != -1 ) a = a1;
else if ( a2 != -1 ) a = a2;
// swap if both are valid, but srep is more recent
if ( a1 != -1 && a2 != -1 &&
srep->m_spideredTime > sreq->m_addedTime )
a = a2;
// skip if nothing valid
if ( a == -1 ) continue;
// make it point to the priority
long b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// skip fast
p += 14;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
/*
// retryNum >= 2 [&&] ...
if ( *p=='r' && strncmp(p, "retrynum", 8) == 0){
// shortcut
long a = sr->m_retryNum;
// make it point to the priority
long b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
*/
// how many days have passed since it was last attempted
// to be spidered? used in conjunction with percentchanged
// to assign when to re-spider it next
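// e.g. "spiderwaited>=30" (illustrative) matches docs whose last spider
// attempt was at least ~30 days ago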
if ( *p=='s' && strncmp(p, "spiderwaited", 12) == 0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// skip for msg20
if ( isForMsg20 ) continue;
// sanity check: a reply should always have a valid spider time
if ( srep->m_spideredTime == 0 ) {char*xx=NULL;*xx=0;}
if ( srep->m_spideredTime == -1 ) {char*xx=NULL;*xx=0;}
// shortcut: seconds since the last spider attempt
float af = (nowGlobal - srep->m_spideredTime);
// make into days
af /= (3600.0*24.0);
// back to a long, round it
long a = (long)(af + 0.5);
// make it point to the priority
long b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// percentchanged >= 50 [&&] ...
if ( *p=='p' && strncmp(p, "percentchangedperday", 20) == 0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// skip for msg20
if ( isForMsg20 ) continue;
// shortcut
float a = srep->m_percentChangedPerDay;
// make it point to the priority
float b = atof(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// httpStatus == 400
if ( *p=='h' && strncmp(p, "httpstatus", 10) == 0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// shortcut (errCode doubles as g_errno)
long a = srep->m_errCode;
// make it point to the priority
long b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// how old is the doc in seconds? age is the pubDate age
if ( *p =='a' && strncmp(p, "age", 3) == 0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// shortcut
long age;
if ( srep->m_pubDate <= 0 ) age = -1;
else age = nowGlobal - srep->m_pubDate;
// we can not match if invalid
if ( age <= 0 ) continue;
// make it point to the priority
long b = atoi(s);
// compare
if ( sign == SIGN_EQ && age != b ) continue;
if ( sign == SIGN_NE && age == b ) continue;
if ( sign == SIGN_GT && age <= b ) continue;
if ( sign == SIGN_LT && age >= b ) continue;
if ( sign == SIGN_GE && age < b ) continue;
if ( sign == SIGN_LE && age > b ) continue;
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
/*
MDW: i replaced this with
m_contentHash32 to make spiders faster/smarter so let's
take this out for now
// how many new inlinkers we got since last spidered time?
if ( *p =='n' && strncmp(p, "newinlinks", 10) == 0){
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
// must have a reply
if ( ! srep ) continue;
// . make it point to the newinlinks.
// . # of new SpiderRequests added since
// srep->m_spideredTime
// . m_dupCache insures that the same ip/hostHash
// does not add more than 1 SpiderRequest for the
// same url/outlink
long a = srep->m_newRequests;
long b = atoi(s);
// compare
if ( sign == SIGN_EQ && a != b ) continue;
if ( sign == SIGN_NE && a == b ) continue;
if ( sign == SIGN_GT && a <= b ) continue;
if ( sign == SIGN_LT && a >= b ) continue;
if ( sign == SIGN_GE && a < b ) continue;
if ( sign == SIGN_LE && a > b ) continue;
// quick
p += 10;
// look for more
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
*/
// our own regex thing (match front of url)
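// e.g. "^http://www.example.com/" (hypothetical url prefix) matches only
// urls that begin with that exact string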
if ( *p=='^' ) {
// advance over caret
p++;
// now pstart pts to the string we will match
char *pstart = p;
// make "p" point to one past the last char in string
while ( *p && ! is_wspace_a(*p) ) p++;
// how long is the string to match?
long plen = p - pstart;
// empty? that's kinda an error
if ( plen == 0 )
continue;
long m = 1;
// check to see if we matched if url was long enough
if ( urlLen >= plen )
m = strncmp(pstart,url,plen);
if ( ( m == 0 && val == 0 ) ||
// if they used the '!' operator and we
// did not match the string, that's a
// row match
( m && val == 1 ) ) {
// another expression follows?
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// no match
continue;
}
// our own regex thing (match end of url)
if ( *p=='$' ) {
// advance over dollar sign
p++;
// a hack for $\.css, skip over the backslash too
if ( *p=='\\' && *(p+1)=='.' ) p++;
// now pstart pts to the string we will match
char *pstart = p;
// make "p" point to one past the last char in string
while ( *p && ! is_wspace_a(*p) ) p++;
// how long is the string to match?
long plen = p - pstart;
// empty? that's kinda an error
if ( plen == 0 )
continue;
// . do we match it?
// . url has to be at least as big
// . match our tail
long m = 1;
// check to see if we matched if url was long enough
if ( urlLen >= plen )
m = strncmp(pstart,url+urlLen-plen,plen);
if ( ( m == 0 && val == 0 ) ||
// if they used the '!' operator and we
// did not match the string, that's a
// row match
( m && val == 1 ) ) {
// another expression follows?
p = strstr(s, "&&");
//if nothing, else then it is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
// no match
continue;
}
// . by default a substring match
// . action=edit
// . action=history
// now pstart pts to the string we will match
char *pstart = p;
// make "p" point to one past the last char in string
while ( *p && ! is_wspace_a(*p) ) p++;
// how long is the string to match?
long plen = p - pstart;
// need something...
if ( plen <= 0 ) continue;
// must be at least as big
//if ( urlLen < plen ) continue;
// nullify it temporarily
char c = *p;
*p = '\0';
// does url contain it? haystack=u needle=p
char *found = strstr ( url , pstart );
// put char back
*p = c;
// kind of a hack fix. if they inject a filtered url
// into the test coll, do not filter it! fixes the fact that
// we filtered facebook but still want to add it when injecting
// the test collection's urls.txt
if ( found &&
sreq->m_isInjecting &&
cr->m_coll[0]=='t' &&
cr->m_coll[1]=='e' &&
cr->m_coll[2]=='s' &&
cr->m_coll[3]=='t' &&
cr->m_coll[4]=='\0' &&
cr->m_spiderPriorities[i] < 0 )
continue;
// support "!company" meaning if it does NOT match
// then do this ...
if ( ( found && val == 0 ) ||
// if they used the '!' operator and we
// did not match the string, that's a
// row match
( ! found && val == 1 ) ) {
// another expression follows?
p = strstr(s, "&&");
// if no more expressions follow then this rule is a match
if ( ! p ) return i;
//skip the '&&' and go to next rule
p += 2;
goto checkNextRule;
}
}
// sanity check ... must be a default rule!
//char *xx=NULL;*xx=0;
// return -1 if no match, caller should use a default
return -1;
}
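// The expression matcher above chains conditions with "&&" and supports
// three string operators: "^prefix" (match the front of the url),
// "$suffix" (match the end of the url) and a bare substring (match
// anywhere), each negatable via the '!' operator (val==1). The filter
// strings below are invented examples for illustration only, not
// shipped defaults:
/*
// rule i: "^http://www. && $.html"
//   http://www.example.com/index.html  -> both exprs match, return i
//   http://example.com/index.html      -> front fails, fall to next rule
// rule j: "!action=edit"
//   any url NOT containing "action=edit" is a row match -> return j
*/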
//static bool s_ufnInit = false;
//static HashTableX s_ufnTable;
//void clearUfnTable ( ) {
// s_ufnTable.clear();
// s_ufnTree.clear();
//}
long getUrlFilterNum ( SpiderRequest *sreq ,
SpiderReply *srep ,
long nowGlobal ,
bool isForMsg20 ,
long niceness ,
CollectionRec *cr ,
bool isOutlink ,
HashTableX *quotaTable ) {
/*
turn this off for now to save memory on the g0 cluster.
we should nuke this anyway with rankdb
// init table?
if ( ! s_ufnInit ) {
s_ufnInit = true;
if ( ! s_ufnTable.set(8,
1,
1024*1024*5,
NULL,0,
false,
MAX_NICENESS,
"ufntab") ) { char *xx=NULL;*xx=0; }
}
// check in cache using date of request and reply and uh48 as the key
long long key64 = sreq->getUrlHash48();
key64 ^= (long long)sreq->m_addedTime;
if ( srep ) key64 ^= ((long long)srep->m_spideredTime)<<32;
char *uv = (char *)s_ufnTable.getValue(&key64);
if ( uv )
return *uv;
*/
char ufn = getUrlFilterNum2 ( sreq,
srep,
nowGlobal,
isForMsg20,
niceness,
cr,
isOutlink,
quotaTable );
/*
// is table full? clear it if so
if ( s_ufnTable.getNumSlotsUsed() > 2000000 ) {
log("spider: resetting ufn table");
s_ufnTable.clear();
}
// cache it
s_ufnTable.addKey ( &key64 , &ufn );
*/
return (long)ufn;
}
bool SpiderColl::printStats ( SafeBuf &sb ) {
return true;
}
// . dedup for spiderdb
// . TODO: we can still have spider request dups in this if they are
// sandwiched together just right because we only compare to the previous
// SpiderRequest we added when looking for dups. just need to hash the
// relevant input bits and use that for deduping.
// . TODO: we can store ufn/priority/spiderTime in the SpiderRequest along
// with the date now, so if url filters do not change then
// gotSpiderdbList() can assume those to be valid and save time. BUT it does
// have siteNumInlinks...
void dedupSpiderdbList ( RdbList *list , long niceness , bool removeNegRecs ) {
//long need = list->m_listSize;
char *newList = list->m_list;//(char *)mmalloc (need,"dslist");
//if ( ! newList ) {
// log("spider: could not dedup spiderdb list: %s",
// mstrerror(g_errno));
// return;
//}
char *dst = newList;
char *restorePoint = newList;
long long reqUh48 = 0LL;
long long repUh48 = 0LL;
SpiderReply *oldRep = NULL;
SpiderRequest *oldReq = NULL;
char *lastKey = NULL;
char *prevLastKey = NULL;
// save list ptr in case of re-read?
//char *saved = list->m_listPtr;
// reset it
list->resetListPtr();
for ( ; ! list->isExhausted() ; ) {
// breathe. NO! assume in thread!!
//QUICKPOLL(niceness);
// get rec
char *rec = list->getCurrentRec();
// pre skip it
list->skipCurrentRec();
// skip if negative, just copy over
if ( ( rec[0] & 0x01 ) == 0x00 ) {
// should not be in here if this was true...
if ( removeNegRecs ) {
log("spider: filter got negative key");
char *xx=NULL;*xx=0;
}
// save this
prevLastKey = lastKey;
lastKey = dst;
// otherwise, keep it
memmove ( dst , rec , sizeof(key128_t) );
dst += sizeof(key128_t);
continue;
}
// is it a reply?
if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) {
// cast it
SpiderReply *srep = (SpiderReply *)rec;
// shortcut
long long uh48 = srep->getUrlHash48();
// crazy?
if ( ! uh48 ) {
//uh48 = hash64b ( srep->m_url );
uh48 = 12345678;
log("spider: got uh48 of zero for spider req. "
"computing now.");
}
// does match last reply?
if ( repUh48 == uh48 ) {
// if he's a later date than us, skip us!
if ( oldRep->m_spideredTime >=
srep->m_spideredTime )
// skip us!
continue;
// otherwise, erase him
dst = restorePoint;
lastKey = prevLastKey;
}
// save in case we get erased
restorePoint = dst;
prevLastKey = lastKey;
lastKey = dst;
// get our size
long recSize = srep->getRecSize();
// and add us
memmove ( dst , rec , recSize );
// advance
dst += recSize;
// update this crap for comparing to next reply
repUh48 = uh48;
oldRep = srep;
// get next spiderdb record
continue;
}
// shortcut
SpiderRequest *sreq = (SpiderRequest *)rec;
// shortcut
long long uh48 = sreq->getUrlHash48();
// crazy?
if ( ! uh48 ) {
//uh48 = hash64b ( sreq->m_url );
uh48 = 12345678;
log("spider: got uh48 of zero for spider req. "
"computing now.");
}
// update request with SpiderReply if newer, because ultimately
// ::getUrlFilterNum() will just look at SpiderRequest's
// version of these bits!
if ( oldRep && repUh48 == uh48 &&
oldRep->m_spideredTime > sreq->m_addedTime ) {
// if request was a page reindex docid based request
// and url has since been spidered, nuke it!
if ( sreq->m_urlIsDocId ) continue;
// same if indexcode was EFAKEFIRSTIP which XmlDoc.cpp
// re-adds to spiderdb with the right firstip. once
// those guys have a reply we can ignore them.
// TODO: what about diffbotxyz spider requests? those
// have a fakefirstip... they should not have requests
// though, since their parent url has that.
if ( sreq->m_fakeFirstIp ) continue;
SpiderReply *old = oldRep;
sreq->m_inGoogle = old->m_inGoogle;
sreq->m_hasAuthorityInlink = old->m_hasAuthorityInlink;
sreq->m_hasContactInfo = old->m_hasContactInfo;
//sreq->m_hasSiteVenue = old->m_hasSiteVenue;
}
// if we are not the same url as last request, add it
if ( uh48 != reqUh48 ) {
// a nice hook in
addIt:
// save in case we get erased
restorePoint = dst;
prevLastKey = lastKey;
// get our size
long recSize = sreq->getRecSize();
// save this
lastKey = dst;
// and add us
memmove ( dst , rec , recSize );
// advance
dst += recSize;
// update this crap for comparing to the next request
reqUh48 = uh48;
oldReq = sreq;
// get next spiderdb record
continue;
}
// propagate the minimum hop count between the duplicate requests
if ( sreq->m_hopCountValid && oldReq->m_hopCountValid ) {
if ( oldReq->m_hopCount < sreq->m_hopCount )
sreq->m_hopCount = oldReq->m_hopCount;
else
oldReq->m_hopCount = sreq->m_hopCount;
}
// if he has essentially different input parms for the
// same url, we want to keep him because he might map the
// url to a different url priority!
if ( oldReq->m_siteHash32 != sreq->m_siteHash32 ||
oldReq->m_isNewOutlink != sreq->m_isNewOutlink ||
// makes a difference as far a m_minPubDate goes, because
// we want to make sure not to delete that request that
// has m_parentPrevSpiderTime
// no no, we prefer the most recent spider request
// from this site in the logic above, so this is not
// necessary. mdw commented out.
//oldReq->m_wasParentIndexed != sreq->m_wasParentIndexed||
oldReq->m_isInjecting != sreq->m_isInjecting ||
oldReq->m_hasContent != sreq->m_hasContent ||
oldReq->m_isAddUrl != sreq->m_isAddUrl ||
oldReq->m_isPageReindex != sreq->m_isPageReindex ||
oldReq->m_forceDelete != sreq->m_forceDelete )
// we are different enough to coexist
goto addIt;
// . if otherwise the same, check who has the most recent added time
// . if we are not the most recent, just do not add us
if ( sreq->m_addedTime <= oldReq->m_addedTime ) continue;
// otherwise, erase over him
dst = restorePoint;
lastKey = prevLastKey;
// and add us over top of him
goto addIt;
}
// free the old list
//char *oldbuf = list->m_alloc;
//long oldSize = list->m_allocSize;
// sanity check
if ( dst < list->m_list || dst > list->m_list + list->m_listSize ) {
char *xx=NULL;*xx=0; }
// and stick our newly filtered list in there
//list->m_list = newList;
list->m_listSize = dst - newList;
// set to end i guess
list->m_listPtr = dst;
//list->m_allocSize = need;
//list->m_alloc = newList;
list->m_listEnd = list->m_list + list->m_listSize;
list->m_listPtrHi = NULL;
//KEYSET(list->m_lastKey,lastKey,list->m_ks);
if ( lastKey ) KEYSET(list->m_lastKey,lastKey,list->m_ks);
//mfree ( oldbuf , oldSize, "oldspbuf");
}
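// A rough sketch of what the single pass above does (the uh48 values and
// times are invented for illustration):
/*
// two replies with the same uh48:
//   REP(uh48=A, spideredTime=15) , REP(uh48=A, spideredTime=30)
//   -> only the reply with the newer m_spideredTime survives
// two requests with the same uh48 and the same "input parms":
//   REQ(uh48=A, addedTime=10) , REQ(uh48=A, addedTime=20)
//   -> only the request with the newer m_addedTime survives, and the
//      smaller hop count of the two is kept on the survivor
// two requests with the same uh48 but different m_siteHash32,
// m_isAddUrl, m_isPageReindex, etc. both survive via the addIt: path
*/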
///////
//
// diffbot uses these for limiting crawls in a collection
//
///////
void gotCrawlInfoReply ( void *state , UdpSlot *slot);
static long s_requests = 0;
static long s_replies = 0;
static long s_validReplies = 0;
static bool s_inUse = false;
// . just call this once per second for all collections
// . figure out how to backoff on collections that don't need it so much
// . ask every host for their crawl infos for each collection rec
void updateAllCrawlInfosSleepWrapper ( int fd , void *state ) {
// debug test
//long mr = g_collectiondb.m_recs[0]->m_maxCrawlRounds;
//log("mcr: %li",mr);
// i don't know why we have locks in the lock table that are not
// getting removed... so log when we remove an expired lock and see.
// piggyback on this sleep wrapper call i guess...
// perhaps the collection was deleted or reset before the spider
// reply could be generated. in that case we'd have a dangling lock.
removeExpiredLocks ( -1 );
if ( s_inUse ) return;
char *request = "";
long requestSize = 0;
s_inUse = true;
// reset tmp crawlinfo classes to hold the ones returned to us
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
cr->m_tmpCrawlInfo.reset();
}
// send out the msg request
for ( long i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = g_hostdb.getHost(i);
// skip if dead. no! we need replies from all hosts
// otherwise our counts could be short and we might end up
// re-spidering stuff even though we've really hit maxToCrawl
//if ( g_hostdb.isDead(i) ) {
// if ( g_conf.m_logDebugSpider )
// log("spider: skipping dead host #%li "
// "when getting "
// "crawl info",i);
// continue;
//}
// count it as launched
s_requests++;
// launch it
if ( ! g_udpServer.sendRequest ( request,
requestSize,
0xc1 , // msgtype
h->m_ip ,
h->m_port ,
h->m_hostId ,
NULL, // retslot
NULL, // state
gotCrawlInfoReply ) ) {
log("spider: error sending c1 request: %s",
mstrerror(g_errno));
s_replies++;
}
}
// if we are still awaiting replies, return and wait for them
if ( s_replies < s_requests )
return;
// how did this happen? every send must have failed instantly
log("spider: got bogus crawl info replies!");
s_inUse = false;
return;
// somehow we did not block... hmmmm...
//char *xx=NULL;*xx=0;
//gotCrawlInfoReply( cr , NULL );
// we did not block...
//return true;
}
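// Summary of the 0xc1 round trip above (a sketch, not authoritative):
// once per second every host broadcasts an empty msg 0xc1 to all hosts,
// dead or alive; each receiver replies with one CrawlInfo per non-null
// collection (see handleRequestc1() below); gotCrawlInfoReply() sums
// those local stats into CollectionRec::m_tmpCrawlInfo and, once
// s_replies catches up to s_requests, copies the totals into
// m_globalCrawlInfo for use by the crawl limit checks.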
// . Parms.cpp calls this when it receives our "spiderRoundNum" increment above
// . all hosts should get it at *about* the same time
void spiderRoundIncremented ( CollectionRec *cr ) {
log("spider: incrementing spider round for coll %s to %li (%lu)",
cr->m_coll,cr->m_spiderRoundNum,cr->m_spiderRoundStartTime);
// . need to send a notification for this round
// . we are only here because the round was incremented and
// Parms.cpp just called us... and that only happens in
// doneSending... so do not send again!!!
//cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
// . if we set sentCrawlDoneALert to 0 it will immediately
// trigger another round increment !! so we have to set these
// to true to prevent that.
// . if we learnt that there really are no more urls ready to spider
// then we'll go to the next round. but that can take like
// SPIDER_DONE_TIMER seconds of getting nothing.
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = true;
cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
cr->m_localCrawlInfo.m_pageDownloadSuccessesThisRound = 0;
cr->m_localCrawlInfo.m_pageProcessSuccessesThisRound = 0;
cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound = 0;
cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound = 0;
cr->m_needsSave = true;
}
void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// loop over each LOCAL crawlinfo we received from this host
CrawlInfo *ptr = (CrawlInfo *)(slot->m_readBuf);
CrawlInfo *end = (CrawlInfo *)(slot->m_readBuf+ slot->m_readBufSize);
long allocSize = slot->m_readBufMaxSize;
// host sending us this reply
Host *h = slot->m_host;
// assume it is a valid reply, not an error, like a udptimedout
s_validReplies++;
// reply is error? then use the last known good reply we had from him
if ( ! slot->m_readBuf || g_errno ) {
log("spider: got crawlinfo reply error: %s",
mstrerror(g_errno));
// just clear it
g_errno = 0;
// just use his last known good reply
ptr = (CrawlInfo *)h->m_lastKnownGoodCrawlInfoReply;
end = (CrawlInfo *)h->m_lastKnownGoodCrawlInfoReplyEnd;
// if never had any reply... can't be valid then
if ( ! ptr ) s_validReplies--;
}
// otherwise, if reply was good it is the last known good now!
else {
// free the old good one and replace it with the new one
if ( h->m_lastKnownGoodCrawlInfoReply ) {
//log("spider: skiipping possible bad free!!!! until we fix");
mfree ( h->m_lastKnownGoodCrawlInfoReply ,
h->m_replyAllocSize ,
"lknown" );
}
// add in the new good in case he goes down in the future
h->m_lastKnownGoodCrawlInfoReply = (char *)ptr;
h->m_lastKnownGoodCrawlInfoReplyEnd = (char *)end;
// set new alloc size
h->m_replyAllocSize = allocSize;
// if valid, don't let him free it now!
slot->m_readBuf = NULL;
}
// inc it
s_replies++;
if ( s_replies > s_requests ) { char *xx=NULL;*xx=0; }
// crap, if any host is dead and not reporting its number then
// that seriously fucks us up because our global count will drop
// and something that had hit a max limit, like maxToCrawl, will
// now be under the limit and the crawl will resume.
// what's the best way to fix this?
//
// perhaps, let's just keep the dead host's counts the same
// as the last time we got them. or maybe the simplest way is to
// just not allow spidering if a host is dead
// the sendbuf should never be freed! it points into collrec
slot->m_sendBufAlloc = NULL;
/////
// SCAN the list of CrawlInfos we received from this host,
// one for each non-null collection
/////
// . add the LOCAL stats we got from the remote into the GLOBAL stats
// . readBuf is null on an error, so check for that...
// . TODO: do not update on error???
for ( ; ptr < end ; ptr++ ) {
// get collnum
collnum_t collnum = (collnum_t)(ptr->m_collnum);
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) {
log("spider: updatecrawlinfo collnum %li "
"not found",(long)collnum);
continue;
}
CrawlInfo *stats = ptr;
long long *gs = (long long *)&cr->m_tmpCrawlInfo;
long long *ss = (long long *)stats;
for ( long i = 0 ; i < NUMCRAWLSTATS ; i++ ) {
*gs = *gs + *ss;
gs++;
ss++;
}
// . special counts
// . assume round #'s match!
//if ( ss->m_spiderRoundNum ==
// cr->m_localCrawlInfo.m_spiderRoundNum ) {
cr->m_tmpCrawlInfo.m_pageDownloadSuccessesThisRound +=
stats->m_pageDownloadSuccessesThisRound;
cr->m_tmpCrawlInfo.m_pageProcessSuccessesThisRound +=
stats->m_pageProcessSuccessesThisRound;
//}
if ( stats->m_hasUrlsReadyToSpider ) {
// inc the count otherwise
cr->m_tmpCrawlInfo.m_hasUrlsReadyToSpider++;
// . no longer initializing?
// . sometimes other shards get the spider requests
// and not us!!!
if ( cr->m_spiderStatus == SP_INITIALIZING )
cr->m_spiderStatus = SP_INPROGRESS;
// i guess we are back in business even if
// m_spiderStatus was SP_MAXTOCRAWL or
// SP_ROUNDDONE...
cr->m_spiderStatus = SP_INPROGRESS;
// unflag the sent flag if we had sent an alert
// but only if it was a crawl round done alert,
// not a maxToCrawl or maxToProcess or
// maxRounds alert.
// we can't do this because on startup we end
// up setting hasUrlsReadyToSpider to true and
// we may have already sent an email, and it
// gets RESET here when it shouldn't be
//if(cr->m_localCrawlInfo.m_sentCrawlDoneAlert
//== SP_ROUNDDONE )
//cr->m_localCrawlInfo.m_sentCrawlDoneAlert=0;
// revival?
if ( ! cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
log("spider: reviving crawl %s from host %li",
cr->m_coll,slot->m_host->m_hostId);
}
// if not the last reply, skip this part
if ( s_replies < s_requests ) continue;
// if it's the last reply we are to receive, and 1 or more
// hosts did not have a valid reply, and not even a
// "last known good reply" then then we can't do
// much, so do not spider then because our counts could be
// way off and cause us to start spidering again even though
// we hit a maxtocrawl limit!!!!!
if ( s_validReplies < s_replies ) {
// this will tell us to halt all spidering
// because a host is essentially down!
s_countsAreValid = false;
// might as well stop the loop here since we are
// not updating our crawlinfo states.
break;
}
// revival?
//if ( cr->m_tmpCrawlInfo.m_hasUrlsReadyToSpider &&
// ! cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
// log("spider: reviving crawl %s (%li)",cr->m_coll,
// cr->m_tmpCrawlInfo.m_hasUrlsReadyToSpider);
//}
//bool has = cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider;
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider &&
! cr->m_tmpCrawlInfo.m_hasUrlsReadyToSpider )
log("spider: all %li hosts report %s (%li) has no "
"more urls ready to spider",
s_replies,cr->m_coll,(long)cr->m_collnum);
// now copy over to global crawl info so things are not
// half-updated should we try to read globalcrawlinfo
// in between packets received.
memcpy ( &cr->m_globalCrawlInfo ,
&cr->m_tmpCrawlInfo ,
sizeof(CrawlInfo) );
// do not assume we are out of urls just yet if a host
// in the network has not reported...
//if ( g_hostdb.hasDeadHost() && has )
// cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider = true;
// should we reset our "sent email" flag?
bool reset = false;
// can't reset if we've never sent an email out yet
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ) reset = true;
// must have some urls ready to spider now so we can send
// another email after another round of spidering
if (!cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider) reset=false;
// . if we have urls ready to be spidered then prepare to send
// another email/webhook notification.
// . do not reset this flag if SP_MAXTOCRAWL etc otherwise we
// end up sending multiple notifications, so this logic here
// is only for when we are done spidering a round, which
// happens when hasUrlsReadyToSpider goes false for all
// shards.
if ( reset ) {
log("spider: resetting sent crawl done alert to 0 "
"for coll %s",cr->m_coll);
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
}
// update cache time
cr->m_globalCrawlInfo.m_lastUpdateTime = getTime();
// make it save to disk i guess
cr->m_needsSave = true;
// and we've examined at least one url. to prevent us from
// sending a notification if we haven't spidered anything
// because no seed urls have been added/injected.
//if ( cr->m_globalCrawlInfo.m_urlsConsidered == 0 ) return;
if ( cr->m_globalCrawlInfo.m_pageDownloadAttempts == 0 )
continue;
// if urls were considered and roundstarttime is still 0 then
// set it to the current time...
//if ( cr->m_spiderRoundStartTime == 0 )
// // all hosts in the network should sync with host #0
// // on this
// cr->m_spiderRoundStartTime = getTimeGlobal();
// but of course if it has urls ready to spider, do not send
// alert... or if this is -1, indicating "unknown".
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
continue;
// update status if not already SP_MAXTOCRAWL, etc. we might
// just be flat out of urls
if ( ! cr->m_spiderStatus ||
cr->m_spiderStatus == SP_INPROGRESS ||
cr->m_spiderStatus == SP_INITIALIZING )
cr->m_spiderStatus = SP_ROUNDDONE;
//
// TODO: set the spiderstatus outright here...
// maxtocrawl, maxtoprocess, etc. based on the counts.
//
// only host #0 sends emails
if ( g_hostdb.m_myHost->m_hostId != 0 )
continue;
// . if already sent email for this, skip
// . localCrawlInfo stores this value on disk so persistent
// . we do it this way so SP_ROUNDDONE can be emailed and then
// we'd email SP_MAXROUNDS to indicate we've hit the maximum
// round count.
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert )
continue;
// do email and web hook...
sendNotificationForCollRec ( cr );
// deal with next collection rec
}
// wait for more replies to come in
if ( s_replies < s_requests ) return;
// initialize
s_replies = 0;
s_requests = 0;
s_validReplies = 0;
s_inUse = false;
}
void handleRequestc1 ( UdpSlot *slot , long niceness ) {
//char *request = slot->m_readBuf;
// just a single collnum
if ( slot->m_readBufSize != 0 ) { char *xx=NULL;*xx=0;}
//if ( ! isClockSynced() ) {
//}
//collnum_t collnum = *(collnum_t *)request;
//CollectionRec *cr = g_collectiondb.getRec(collnum);
// deleted from under us? i've seen this happen
//if ( ! cr ) {
// log("spider: c1: coll deleted returning empty reply");
// g_udpServer.sendReply_ass ( "", // reply
// 0,
// 0 , // alloc
// 0 , //alloc size
// slot );
// return;
//}
// while we are here update CrawlInfo::m_nextSpiderTime
// to the time of the next spider request to spider.
// if doledb is empty and the next rec in the waiting tree
// does not have a time of zero, but rather, in the future, then
// return that future time. so if a crawl is enabled we should
// actively call updateCrawlInfo on a collection every minute or
// so.
//cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
//long long nowGlobalMS = gettimeofdayInMillisecondsGlobal();
//long long nextSpiderTimeMS;
// this will be 0 for ip's which have not had their SpiderRequests
// in spiderdb scanned yet to get the best SpiderRequest, so we
// just have to wait for that.
/*
nextSpiderTimeMS = sc->getEarliestSpiderTimeFromWaitingTree(0);
if ( ! sc->m_waitingTreeNeedsRebuild &&
sc->m_lastDoledbReadEmpty &&
cr->m_spideringEnabled &&
g_conf.m_spideringEnabled &&
nextSpiderTimeMS > nowGlobalMS +10*60*1000 )
// turn off this flag, "ready queue" is empty
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 0;
// but send back a -1 if we do not know yet because we haven't
// read the doledblists from disk from all priorities for this coll
if ( sc->m_numRoundsDone == 0 )
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = -1;
*/
//long now = getTimeGlobal();
SafeBuf replyBuf;
long now = getTimeGlobalNoCore();
//SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;
// this is now needed for alignment by the receiver
ci->m_collnum = i;
SpiderColl *sc = cr->m_spiderColl;
/////////
//
// ARE WE DONE SPIDERING?????
//
/////////
// if we haven't spidered anything in 1 min assume the
// queue is basically empty...
if ( ci->m_lastSpiderAttempt &&
ci->m_lastSpiderCouldLaunch &&
ci->m_hasUrlsReadyToSpider &&
// the next round we are waiting for, if any, must
// have had some time to get urls! otherwise we
// will increment the round # and wait just
// SPIDER_DONE_TIMER seconds and end up setting
// hasUrlsReadyToSpider to false!
now > cr->m_spiderRoundStartTime + SPIDER_DONE_TIMER &&
// no spiders currently out. i've seen a couple out
// waiting for a diffbot reply. wait for them to
// return before ending the round...
sc && sc->m_spidersOut == 0 &&
// it must have launched at least one url! this should
// prevent us from incrementing the round # at the gb
// process startup
//ci->m_numUrlsLaunched > 0 &&
//cr->m_spideringEnabled &&
//g_conf.m_spideringEnabled &&
ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch >
(long) SPIDER_DONE_TIMER ) {
// this is the MOST IMPORTANT variable so note it
log("spider: coll %s has no more urls to spider",
cr->m_coll);
// assume our crawl on this host is completed i guess
ci->m_hasUrlsReadyToSpider = 0;
// save that!
cr->m_needsSave = true;
// set the time that this happens
cr->m_diffbotCrawlEndTime = getTimeGlobalNoCore();
}
// save it
replyBuf.safeMemcpy ( ci , sizeof(CrawlInfo) );
}
g_udpServer.sendReply_ass ( replyBuf.getBufStart() ,
replyBuf.length() ,
replyBuf.getBufStart() , // alloc
replyBuf.getCapacity() , //alloc size
slot );
// udp server will free this
replyBuf.detachBuf();
}
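// Reply layout for msg 0xc1, per the loop above: a flat array of
// CrawlInfo structs, one per non-null collection, each stamped with its
// m_collnum so gotCrawlInfoReply() can map it back to the right
// CollectionRec. Roughly:
/*
// replyBuf = [ CrawlInfo(m_collnum=0) ][ CrawlInfo(m_collnum=1) ] ...
// readBufSize = (# of non-null collections) * sizeof(CrawlInfo)
*/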
bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
if ( ! g_conf.m_spideringEnabled && ! cx->m_isCustomCrawl )
return msg->safePrintf("Spidering disabled in "
"master controls. You can turn it "
"back on there.");
if ( g_conf.m_readOnlyMode )
return msg->safePrintf("In read-only mode. Spidering off.");
if ( g_dailyMerge.m_mergeMode )
return msg->safePrintf("Daily merge engaged, spidering "
"paused.");
if ( g_udpServer.getNumUsedSlots() >= 1300 )
return msg->safePrintf("Too many UDP slots in use, "
"spidering paused.");
if ( g_repairMode )
return msg->safePrintf("In repair mode, spidering paused.");
// do not spider until collections/parms in sync with host #0
if ( ! g_parms.m_inSyncWithHost0 )
return msg->safePrintf("Parms not in sync with host #0, "
"spidering paused");
// don't spider if not all hosts are up, or they do not all
// have the same hosts.conf.
if ( g_pingServer.m_hostsConfInDisagreement )
return msg->safePrintf("Hosts.conf discrepancy, "
"spidering paused.");
long now = getTimeGlobal();
// . 0 means not to RE-crawl
// . indicate if we are WAITING for next round...
if ( cx->m_spiderStatus == SP_MAXTOCRAWL &&
cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
return msg->safePrintf("Jobs has reached maxToCrawl limit. "
"Next crawl round to start "
"in %li seconds.",
cx->m_spiderRoundStartTime-now );
}
if ( cx->m_spiderStatus == SP_MAXTOPROCESS &&
cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
return msg->safePrintf("Jobs has reached maxToProcess limit. "
"Next crawl round to start "
"in %li seconds.",
cx->m_spiderRoundStartTime-now );
}
if ( cx->m_spiderStatus == SP_MAXTOCRAWL ) {
*status = SP_MAXTOCRAWL;
return msg->safePrintf ( "Job has reached maxToCrawl "
"limit." );
}
if ( cx->m_spiderStatus == SP_MAXTOPROCESS ) {
*status = SP_MAXTOPROCESS;
return msg->safePrintf ( "Job has reached maxToProcess "
"limit." );
}
if ( cx->m_spiderStatus == SP_MAXROUNDS ) {
*status = SP_MAXROUNDS;
return msg->safePrintf ( "Job has reached maxRounds "
"limit." );
}
// . 0 means not to RE-crawl
// . indicate if we are WAITING for next round...
if ( cx->m_collectiveRespiderFrequency > 0.0 &&
now < cx->m_spiderRoundStartTime ) {
*status = SP_ROUNDDONE;
return msg->safePrintf("Next crawl round to start "
"in %li seconds.",
cx->m_spiderRoundStartTime-now );
}
if ( ! cx->m_spideringEnabled ) {
*status = SP_PAUSED;
if ( cx->m_isCustomCrawl )
return msg->safePrintf("Job paused.");
else
return msg->safePrintf("Spidering disabled "
"in spider controls.");
}
if ( ! g_conf.m_spideringEnabled ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("All crawling temporarily paused "
"by root administrator for "
"maintenance.");
}
// our CollectionRec::m_globalCrawlInfo counts do not have a dead
// host's counts tallied into them, which could make a difference on
// whether we have exceeded a maxtocrawl limit or some such, so wait...
if ( ! s_countsAreValid ) {
*status = SP_ADMIN_PAUSED;
return msg->safePrintf("All crawling temporarily paused "
"because a shard is down.");
}
// if spiderdb is empty for this coll, then no url
// has been added to spiderdb yet.. either seed or spot
//CrawlInfo *cg = &cx->m_globalCrawlInfo;
//if ( cg->m_pageDownloadAttempts == 0 ) {
// *status = SP_NOURLS;
// return msg->safePrintf("Crawl is waiting for urls.");
//}
if ( cx->m_spiderStatus == SP_INITIALIZING ) {
*status = SP_INITIALIZING;
return msg->safePrintf("Job is initializing.");
}
// if we sent an email simply because no urls
// were left and we are not recrawling!
if ( cx->m_collectiveRespiderFrequency <= 0.0 &&
cx->m_isCustomCrawl &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
*status = SP_COMPLETED;
return msg->safePrintf("Job has completed and no "
"repeat is scheduled.");
}
if ( cx->m_spiderStatus == SP_ROUNDDONE && ! cx->m_isCustomCrawl ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Nothing currently "
"available to spider. "
"Change your url filters, try "
"adding new urls, or wait for "
"existing urls to be respidered.");
}
if ( cx->m_spiderStatus == SP_ROUNDDONE ) {
*status = SP_ROUNDDONE;
return msg->safePrintf ( "Job round completed.");
}
// otherwise in progress?
*status = SP_INPROGRESS;
if ( cx->m_isCustomCrawl )
return msg->safePrintf("Job is in progress.");
else
return msg->safePrintf("Spider is in progress.");
}
// pattern is a "||"-separated list of substrings
bool doesStringContainPattern ( char *content , char *pattern ) {
//bool checkForNegatives ) {
char *p = pattern;
long matchedOne = 0;
bool hadPositive = false;
long count = 0;
// scan the " || " separated substrings
for ( ; *p ; ) {
// get beginning of this string
char *start = p;
// skip white space
while ( *start && is_wspace_a(*start) ) start++;
// done?
if ( ! *start ) break;
// find end of it
char *end = start;
while ( *end && end[0] != '|' )
end++;
// advance p for next guy
p = end;
// should be two |'s
if ( *p ) p++;
if ( *p ) p++;
// temp null this
char c = *end;
*end = '\0';
// count it as an attempt
count++;
bool matchFront = false;
if ( start[0] == '^' ) { start++; matchFront = true; }
// if pattern is NOT/NEGATIVE...
bool negative = false;
if ( start[0] == '!' && start[1] && start[1]!='|' ) {
start++;
negative = true;
}
else
hadPositive = true;
// . is this substring anywhere in the document
// . check the rawest content before converting to utf8 i guess
// . support the ^ operator
char *foundPtr = NULL;
if ( matchFront ) {
// if we match the front, set to bogus 0x01
if ( strncmp(content,start,end-start)==0 )
foundPtr =(char *)0x01;
}
else {
foundPtr = strstr ( content , start ) ;
}
// debug log statement
//if ( foundPtr )
// log("build: page %s matches ppp of \"%s\"",
// m_firstUrl.m_url,start);
// revert \0
*end = c;
// negative means we should NOT match it
if ( negative ) {
// so if its matched, that is bad
if ( foundPtr ) return false;
continue;
}
// skip if not found
if ( ! foundPtr ) continue;
// did we find it?
matchedOne++;
// if no negatives, done
//if ( ! checkForNegatives )
//return true;
}
// if we had no attempts, it is ok
if ( count == 0 ) return true;
// must have matched one at least
if ( matchedOne ) return true;
// if all negative? i.e. !category||!author
if ( ! hadPositive ) return true;
// if we had an unfound substring...
return false;
}
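// Illustrative calls (the content/pattern strings are made up, not taken
// from any real config):
/*
// doesStringContainPattern ( html , "price||cart" )
//   -> true if "price" OR "cart" occurs anywhere in html
// doesStringContainPattern ( html , "!category||!author" )
//   -> true only if NEITHER substring occurs (all-negative pattern)
// doesStringContainPattern ( html , "^<!DOCTYPE" )
//   -> true only if html starts with "<!DOCTYPE"
// doesStringContainPattern ( html , "" )
//   -> true, an empty pattern places no restriction
*/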
long getFakeIpForUrl1 ( char *url1 ) {
// make the probable docid
long long probDocId = g_titledb.getProbableDocId ( url1 );
// make one up, like we do in PageReindex.cpp
long firstIp = (probDocId & 0xffffffff);
return firstIp;
}
long getFakeIpForUrl2 ( Url *url2 ) {
// make the probable docid
long long probDocId = g_titledb.getProbableDocId ( url2 );
// make one up, like we do in PageReindex.cpp
long firstIp = (probDocId & 0xffffffff);
return firstIp;
}
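// The "fake" firstIp here is just the low 32 bits of the probable docid,
// the same trick PageReindex.cpp uses, so add-url and injected requests
// get a deterministic, usually-nonzero ip before the real one is looked
// up. A minimal sketch of the idea (see setFromAddUrl() below for the
// 0/-1 guard):
/*
// long long probDocId = g_titledb.getProbableDocId ( url );
// long fakeIp = (long)(probDocId & 0xffffffff);
// if ( fakeIp == 0 || fakeIp == -1 ) fakeIp = 1;
*/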
// returns false and sets g_errno on error
bool SpiderRequest::setFromAddUrl ( char *url ) {
// reset it
reset();
// make the probable docid
long long probDocId = g_titledb.getProbableDocId ( url );
// make one up, like we do in PageReindex.cpp
long firstIp = (probDocId & 0xffffffff);
//long firstIp = getFakeIpForUrl1 ( url );
// ensure not crazy
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
// . now fill it up
// . TODO: calculate the other values... lazy!!! (m_isRSSExt,
// m_siteNumInlinks,...)
m_isNewOutlink = 1;
m_isAddUrl = 1;
m_addedTime = getTimeGlobal();//now;
m_fakeFirstIp = 1;
m_probDocId = probDocId;
m_firstIp = firstIp;
m_hopCount = 0;
// new: validate it?
m_hopCountValid = 1;
// it's valid if the url is a root url
Url uu; uu.set ( url );
if ( uu.isRoot() ) m_hopCountValid = true;
// too big?
if ( gbstrlen(url) > MAX_URL_LEN ) {
g_errno = EURLTOOLONG;
return false;
}
// the url! includes \0
strcpy ( m_url , url );
// call this to set m_dataSize now
setDataSize();
// make the key dude -- after setting url
setKey ( firstIp , 0LL, false );
// need a fake first ip lest we core!
//m_firstIp = (pdocId & 0xffffffff);
// how to set m_firstIp? i guess addurl can be throttled independently
// of the other urls??? use the hash of the domain for it!
long dlen;
char *dom = getDomFast ( url , &dlen );
// fake it for this...
//m_firstIp = hash32 ( dom , dlen );
// sanity
if ( ! dom ) {
g_errno = EBADURL;
return false;
//return sendReply ( st1 , true );
}
m_domHash32 = hash32 ( dom , dlen );
// and "site"
long hlen = 0;
char *host = getHostFast ( url , &hlen );
m_siteHash32 = hash32 ( host , hlen );
m_hostHash32 = m_siteHash32;
return true;
}
bool SpiderRequest::setFromInject ( char *url ) {
// just like add url
if ( ! setFromAddUrl ( url ) ) return false;
// but fix this
m_isAddUrl = 0;
m_isInjecting = 1;
return true;
}